Diffstat (limited to 'src/gallium')
-rw-r--r--  src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c  1
-rw-r--r--  src/gallium/auxiliary/target-helpers/drm_helper.h  20
-rw-r--r--  src/gallium/auxiliary/target-helpers/drm_helper_public.h  1
-rw-r--r--  src/gallium/drivers/crocus/crocus_batch.c  1047
-rw-r--r--  src/gallium/drivers/crocus/crocus_batch.h  325
-rw-r--r--  src/gallium/drivers/crocus/crocus_blit.c  836
-rw-r--r--  src/gallium/drivers/crocus/crocus_blorp.c  399
-rw-r--r--  src/gallium/drivers/crocus/crocus_blt.c  337
-rw-r--r--  src/gallium/drivers/crocus/crocus_bufmgr.c  1689
-rw-r--r--  src/gallium/drivers/crocus/crocus_bufmgr.h  331
-rw-r--r--  src/gallium/drivers/crocus/crocus_clear.c  859
-rw-r--r--  src/gallium/drivers/crocus/crocus_context.c  336
-rw-r--r--  src/gallium/drivers/crocus/crocus_context.h  955
-rw-r--r--  src/gallium/drivers/crocus/crocus_defines.h  58
-rw-r--r--  src/gallium/drivers/crocus/crocus_disk_cache.c  263
-rw-r--r--  src/gallium/drivers/crocus/crocus_draw.c  511
-rw-r--r--  src/gallium/drivers/crocus/crocus_fence.c  571
-rw-r--r--  src/gallium/drivers/crocus/crocus_fence.h  60
-rw-r--r--  src/gallium/drivers/crocus/crocus_fine_fence.c  85
-rw-r--r--  src/gallium/drivers/crocus/crocus_fine_fence.h  109
-rw-r--r--  src/gallium/drivers/crocus/crocus_formats.c  576
-rw-r--r--  src/gallium/drivers/crocus/crocus_genx_macros.h  164
-rw-r--r--  src/gallium/drivers/crocus/crocus_genx_protos.h  56
-rw-r--r--  src/gallium/drivers/crocus/crocus_monitor.c  484
-rw-r--r--  src/gallium/drivers/crocus/crocus_monitor.h  72
-rw-r--r--  src/gallium/drivers/crocus/crocus_pipe.h  74
-rw-r--r--  src/gallium/drivers/crocus/crocus_pipe_control.c  368
-rw-r--r--  src/gallium/drivers/crocus/crocus_program.c  3171
-rw-r--r--  src/gallium/drivers/crocus/crocus_program_cache.c  347
-rw-r--r--  src/gallium/drivers/crocus/crocus_query.c  996
-rw-r--r--  src/gallium/drivers/crocus/crocus_resolve.c  1061
-rw-r--r--  src/gallium/drivers/crocus/crocus_resource.c  1946
-rw-r--r--  src/gallium/drivers/crocus/crocus_resource.h  501
-rw-r--r--  src/gallium/drivers/crocus/crocus_screen.c  829
-rw-r--r--  src/gallium/drivers/crocus/crocus_screen.h  253
-rw-r--r--  src/gallium/drivers/crocus/crocus_state.c  8382
-rw-r--r--  src/gallium/drivers/crocus/crocus_todo.txt  16
-rw-r--r--  src/gallium/drivers/crocus/driinfo_crocus.h  11
-rw-r--r--  src/gallium/drivers/crocus/gen4_blorp_exec.h  190
-rw-r--r--  src/gallium/drivers/crocus/meson.build  90
-rw-r--r--  src/gallium/meson.build  6
-rw-r--r--  src/gallium/targets/d3dadapter9/meson.build  2
-rw-r--r--  src/gallium/targets/dri/meson.build  3
-rw-r--r--  src/gallium/targets/dri/target.c  4
-rw-r--r--  src/gallium/winsys/crocus/drm/crocus_drm_public.h  33
-rw-r--r--  src/gallium/winsys/crocus/drm/crocus_drm_winsys.c  39
-rw-r--r--  src/gallium/winsys/crocus/drm/meson.build  29
47 files changed, 28494 insertions, 2 deletions
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index 8147c3ca346..ca5bf121a88 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -70,6 +70,7 @@ static const struct pipe_loader_ops pipe_loader_drm_ops;
static const struct drm_driver_descriptor *driver_descriptors[] = {
&i915_driver_descriptor,
&iris_driver_descriptor,
+ &crocus_driver_descriptor,
&nouveau_driver_descriptor,
&r300_driver_descriptor,
&r600_driver_descriptor,
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h
index 6bab07a40e7..ff4621e1a88 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -112,6 +112,26 @@ DRM_DRIVER_DESCRIPTOR(iris, iris_driconf, ARRAY_SIZE(iris_driconf))
DRM_DRIVER_DESCRIPTOR_STUB(iris)
#endif
+#ifdef GALLIUM_CROCUS
+#include "crocus/drm/crocus_drm_public.h"
+
+static struct pipe_screen *
+pipe_crocus_create_screen(int fd, const struct pipe_screen_config *config)
+{
+ struct pipe_screen *screen;
+
+ screen = crocus_drm_screen_create(fd, config);
+ return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+const driOptionDescription crocus_driconf[] = {
+ #include "crocus/driinfo_crocus.h"
+};
+DRM_DRIVER_DESCRIPTOR(crocus, crocus_driconf, ARRAY_SIZE(crocus_driconf))
+#else
+DRM_DRIVER_DESCRIPTOR_STUB(crocus)
+#endif
+
#ifdef GALLIUM_NOUVEAU
#include "nouveau/drm/nouveau_drm_public.h"
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper_public.h b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
index 5fd3084dfdb..478e72b8525 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper_public.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
@@ -6,6 +6,7 @@ struct pipe_screen_config;
extern const struct drm_driver_descriptor i915_driver_descriptor;
extern const struct drm_driver_descriptor iris_driver_descriptor;
+extern const struct drm_driver_descriptor crocus_driver_descriptor;
extern const struct drm_driver_descriptor nouveau_driver_descriptor;
extern const struct drm_driver_descriptor r300_driver_descriptor;
extern const struct drm_driver_descriptor r600_driver_descriptor;
diff --git a/src/gallium/drivers/crocus/crocus_batch.c b/src/gallium/drivers/crocus/crocus_batch.c
new file mode 100644
index 00000000000..63cfe282de4
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_batch.c
@@ -0,0 +1,1047 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_batch.c
+ *
+ * Batchbuffer and command submission module.
+ *
+ * Every API draw call results in a number of GPU commands, which we
+ * collect into a "batch buffer". Typically, many draw calls are grouped
+ * into a single batch to amortize command submission overhead.
+ *
+ * We submit batches to the kernel using the I915_GEM_EXECBUFFER2 ioctl.
+ * One critical piece of data is the "validation list", which contains a
+ * list of the buffer objects (BOs) which the commands in the batch need.
+ * The kernel will make sure these are resident and pinned at the correct
+ * virtual memory address before executing our batch. If a BO is not in
+ * the validation list, it effectively does not exist, so take care.
+ */
+
+#include "crocus_batch.h"
+#include "crocus_bufmgr.h"
+#include "crocus_context.h"
+#include "crocus_fence.h"
+
+#include "drm-uapi/i915_drm.h"
+
+#include "intel/common/intel_gem.h"
+#include "main/macros.h"
+#include "util/hash_table.h"
+#include "util/set.h"
+#include "util/u_upload_mgr.h"
+
+#include <errno.h>
+#include <xf86drm.h>
+
+#if HAVE_VALGRIND
+#include <memcheck.h>
+#include <valgrind.h>
+#define VG(x) x
+#else
+#define VG(x)
+#endif
+
+#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+
+/* Terminating the batch takes either 4 bytes for MI_BATCH_BUFFER_END
+ * or 12 bytes for MI_BATCH_BUFFER_START (when chaining). Plus, we may
+ * need an extra 4 bytes to pad out to the nearest QWord. So reserve 16 (32 on Haswell).
+ */
+#define BATCH_RESERVED(devinfo) ((devinfo)->is_haswell ? 32 : 16)
+
+static void crocus_batch_reset(struct crocus_batch *batch);
+
+static unsigned
+num_fences(struct crocus_batch *batch)
+{
+ return util_dynarray_num_elements(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence);
+}
+
+/**
+ * Debugging code to dump the fence list, used by INTEL_DEBUG=submit.
+ */
+static void
+dump_fence_list(struct crocus_batch *batch)
+{
+ fprintf(stderr, "Fence list (length %u): ", num_fences(batch));
+
+ util_dynarray_foreach(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence, f) {
+ fprintf(stderr, "%s%u%s ",
+ (f->flags & I915_EXEC_FENCE_WAIT) ? "..." : "",
+ f->handle,
+ (f->flags & I915_EXEC_FENCE_SIGNAL) ? "!" : "");
+ }
+
+ fprintf(stderr, "\n");
+}
+
+/**
+ * Debugging code to dump the validation list, used by INTEL_DEBUG=submit.
+ */
+static void
+dump_validation_list(struct crocus_batch *batch)
+{
+ fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ uint64_t flags = batch->validation_list[i].flags;
+ assert(batch->validation_list[i].handle ==
+ batch->exec_bos[i]->gem_handle);
+ fprintf(stderr,
+ "[%2d]: %2d %-14s @ 0x%016llx (%" PRIu64 "B)\t %2d refs %s\n", i,
+ batch->validation_list[i].handle, batch->exec_bos[i]->name,
+ batch->validation_list[i].offset, batch->exec_bos[i]->size,
+ batch->exec_bos[i]->refcount,
+ (flags & EXEC_OBJECT_WRITE) ? " (write)" : "");
+ }
+}
+
+/**
+ * Return BO information to the batch decoder (for debugging).
+ */
+static struct intel_batch_decode_bo
+decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
+{
+ struct crocus_batch *batch = v_batch;
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ struct crocus_bo *bo = batch->exec_bos[i];
+ /* The decoder zeroes out the top 16 bits, so we need to as well */
+ uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);
+
+ if (address >= bo_address && address < bo_address + bo->size) {
+ return (struct intel_batch_decode_bo){
+ .addr = address,
+ .size = bo->size,
+ .map = crocus_bo_map(batch->dbg, bo, MAP_READ) +
+ (address - bo_address),
+ };
+ }
+ }
+
+ return (struct intel_batch_decode_bo) { };
+}
+
+static unsigned
+decode_get_state_size(void *v_batch, uint64_t address,
+ uint64_t base_address)
+{
+ struct crocus_batch *batch = v_batch;
+
+ /* The decoder gives us offsets from a base address, which is not great.
+ * Binding tables are relative to surface state base address, and other
+ * state is relative to dynamic state base address. These could alias,
+ * but in practice it's unlikely because surface offsets are always in
+ * the [0, 64K) range, and we assign dynamic state addresses starting at
+ * the top of the 4GB range. We should fix this but it's likely good
+ * enough for now.
+ */
+ unsigned size = (uintptr_t)
+ _mesa_hash_table_u64_search(batch->state_sizes, address - base_address);
+
+ return size;
+}
+
+/**
+ * Decode the current batch.
+ */
+static void
+decode_batch(struct crocus_batch *batch)
+{
+ void *map = crocus_bo_map(batch->dbg, batch->exec_bos[0], MAP_READ);
+ intel_print_batch(&batch->decoder, map, batch->primary_batch_size,
+ batch->exec_bos[0]->gtt_offset, false);
+}
+
+static void
+init_reloc_list(struct crocus_reloc_list *rlist, int count)
+{
+ rlist->reloc_count = 0;
+ rlist->reloc_array_size = count;
+ rlist->relocs = malloc(rlist->reloc_array_size *
+ sizeof(struct drm_i915_gem_relocation_entry));
+}
+
+void
+crocus_init_batch(struct crocus_context *ice,
+ enum crocus_batch_name name,
+ int priority)
+{
+ struct crocus_batch *batch = &ice->batches[name];
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+
+ batch->ice = ice;
+ batch->screen = screen;
+ batch->dbg = &ice->dbg;
+ batch->reset = &ice->reset;
+ batch->name = name;
+ batch->contains_fence_signal = false;
+
+ if (devinfo->ver >= 7) {
+ batch->fine_fences.uploader =
+ u_upload_create(&ice->ctx, 4096, PIPE_BIND_CUSTOM,
+ PIPE_USAGE_STAGING, 0);
+ }
+ crocus_fine_fence_init(batch);
+
+ batch->hw_ctx_id = crocus_create_hw_context(screen->bufmgr);
+ assert(batch->hw_ctx_id);
+
+ crocus_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);
+
+ batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
+ if (devinfo->ver == 6)
+ batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;
+
+ if (INTEL_DEBUG & DEBUG_BATCH) {
+ /* The shadow doesn't get relocs written so state decode fails. */
+ batch->use_shadow_copy = false;
+ } else
+ batch->use_shadow_copy = !devinfo->has_llc;
+
+ util_dynarray_init(&batch->exec_fences, ralloc_context(NULL));
+ util_dynarray_init(&batch->syncobjs, ralloc_context(NULL));
+
+ init_reloc_list(&batch->command.relocs, 250);
+ init_reloc_list(&batch->state.relocs, 250);
+
+ batch->exec_count = 0;
+ batch->exec_array_size = 100;
+ batch->exec_bos =
+ malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
+ batch->validation_list =
+ malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
+
+ batch->cache.render = _mesa_hash_table_create(NULL, NULL,
+ _mesa_key_pointer_equal);
+ batch->cache.depth = _mesa_set_create(NULL, NULL,
+ _mesa_key_pointer_equal);
+
+ memset(batch->other_batches, 0, sizeof(batch->other_batches));
+
+ for (int i = 0, j = 0; i < ice->batch_count; i++) {
+ if (i != name)
+ batch->other_batches[j++] = &ice->batches[i];
+ }
+
+ if (INTEL_DEBUG & DEBUG_BATCH) {
+
+ batch->state_sizes = _mesa_hash_table_u64_create(NULL);
+ const unsigned decode_flags =
+ INTEL_BATCH_DECODE_FULL |
+ ((INTEL_DEBUG & DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
+ INTEL_BATCH_DECODE_OFFSETS | INTEL_BATCH_DECODE_FLOATS;
+
+ intel_batch_decode_ctx_init(&batch->decoder, &screen->devinfo, stderr,
+ decode_flags, NULL, decode_get_bo,
+ decode_get_state_size, batch);
+ batch->decoder.max_vbo_decoded_lines = 32;
+ }
+
+ crocus_batch_reset(batch);
+}
+
+static struct drm_i915_gem_exec_object2 *
+find_validation_entry(struct crocus_batch *batch, struct crocus_bo *bo)
+{
+ unsigned index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return &batch->validation_list[index];
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
+ return &batch->validation_list[index];
+ }
+
+ return NULL;
+}
+
+static void
+ensure_exec_obj_space(struct crocus_batch *batch, uint32_t count)
+{
+ while (batch->exec_count + count > batch->exec_array_size) {
+ batch->exec_array_size *= 2;
+ batch->exec_bos = realloc(
+ batch->exec_bos, batch->exec_array_size * sizeof(batch->exec_bos[0]));
+ batch->validation_list =
+ realloc(batch->validation_list,
+ batch->exec_array_size * sizeof(batch->validation_list[0]));
+ }
+}
+
+static struct drm_i915_gem_exec_object2 *
+crocus_use_bo(struct crocus_batch *batch, struct crocus_bo *bo, bool writable)
+{
+ assert(bo->bufmgr == batch->command.bo->bufmgr);
+
+ if (bo == batch->ice->workaround_bo)
+ writable = false;
+
+ struct drm_i915_gem_exec_object2 *existing_entry =
+ find_validation_entry(batch, bo);
+
+ if (existing_entry) {
+ /* The BO is already in the validation list; mark it writable */
+ if (writable)
+ existing_entry->flags |= EXEC_OBJECT_WRITE;
+ return existing_entry;
+ }
+
+ if (bo != batch->command.bo && bo != batch->state.bo) {
+ /* This is the first time our batch has seen this BO. Before we use it,
+ * we may need to flush and synchronize with other batches.
+ */
+ for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) {
+
+ if (!batch->other_batches[b])
+ continue;
+ struct drm_i915_gem_exec_object2 *other_entry =
+ find_validation_entry(batch->other_batches[b], bo);
+
+ /* If the buffer is referenced by another batch, and either batch
+ * intends to write it, then flush the other batch and synchronize.
+ *
+ * Consider these cases:
+ *
+ * 1. They read, we read => No synchronization required.
+ * 2. They read, we write => Synchronize (they need the old value)
+ * 3. They write, we read => Synchronize (we need their new value)
+ * 4. They write, we write => Synchronize (order writes)
+ *
+ * The read/read case is very common, as multiple batches usually
+ * share a streaming state buffer or shader assembly buffer, and
+ * we want to avoid synchronizing in this case.
+ */
+ if (other_entry &&
+ ((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) {
+ crocus_batch_flush(batch->other_batches[b]);
+ crocus_batch_add_syncobj(batch,
+ batch->other_batches[b]->last_fence->syncobj,
+ I915_EXEC_FENCE_WAIT);
+ }
+ }
+ }
+
+ /* Bump the ref count since the batch is now using this bo. */
+ crocus_bo_reference(bo);
+
+ ensure_exec_obj_space(batch, 1);
+
+ batch->validation_list[batch->exec_count] =
+ (struct drm_i915_gem_exec_object2) {
+ .handle = bo->gem_handle,
+ .offset = bo->gtt_offset,
+ .flags = bo->kflags | (writable ? EXEC_OBJECT_WRITE : 0),
+ };
+
+ bo->index = batch->exec_count;
+ batch->exec_bos[batch->exec_count] = bo;
+ batch->aperture_space += bo->size;
+
+ batch->exec_count++;
+
+ return &batch->validation_list[batch->exec_count - 1];
+}
+
+static uint64_t
+emit_reloc(struct crocus_batch *batch,
+ struct crocus_reloc_list *rlist, uint32_t offset,
+ struct crocus_bo *target, int32_t target_offset,
+ unsigned int reloc_flags)
+{
+ assert(target != NULL);
+
+ bool writable = reloc_flags & RELOC_WRITE;
+
+ struct drm_i915_gem_exec_object2 *entry =
+ crocus_use_bo(batch, target, writable);
+
+ if (rlist->reloc_count == rlist->reloc_array_size) {
+ rlist->reloc_array_size *= 2;
+ rlist->relocs = realloc(rlist->relocs,
+ rlist->reloc_array_size *
+ sizeof(struct drm_i915_gem_relocation_entry));
+ }
+
+ if (reloc_flags & RELOC_32BIT) {
+ /* Restrict this buffer to the low 32 bits of the address space.
+ *
+ * Altering the validation list flags restricts it for this batch,
+ * but we also alter the BO's kflags to restrict it permanently
+ * (until the BO is destroyed and put back in the cache). Buffers
+ * may stay bound across batches, and we want to keep it constrained.
+ */
+ target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+ entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+ /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
+ reloc_flags &= ~RELOC_32BIT;
+ }
+
+ if (reloc_flags)
+ entry->flags |= reloc_flags & batch->valid_reloc_flags;
+
+ rlist->relocs[rlist->reloc_count++] =
+ (struct drm_i915_gem_relocation_entry) {
+ .offset = offset,
+ .delta = target_offset,
+ .target_handle = target->index,
+ .presumed_offset = entry->offset,
+ };
+
+ /* Using the old buffer offset, write in what the right data would be, in
+ * case the buffer doesn't move and we can short-circuit the relocation
+ * processing in the kernel
+ */
+ return entry->offset + target_offset;
+}
+
+uint64_t
+crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset,
+ struct crocus_bo *target, uint32_t target_offset,
+ unsigned int reloc_flags)
+{
+ assert(batch_offset <= batch->command.bo->size - sizeof(uint32_t));
+
+ return emit_reloc(batch, &batch->command.relocs, batch_offset,
+ target, target_offset, reloc_flags);
+}
+
+uint64_t
+crocus_state_reloc(struct crocus_batch *batch, uint32_t state_offset,
+ struct crocus_bo *target, uint32_t target_offset,
+ unsigned int reloc_flags)
+{
+ assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
+
+ return emit_reloc(batch, &batch->state.relocs, state_offset,
+ target, target_offset, reloc_flags);
+}
+
+static void
+recreate_growing_buffer(struct crocus_batch *batch,
+ struct crocus_growing_bo *grow,
+ const char *name, unsigned size)
+{
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+ grow->bo = crocus_bo_alloc(bufmgr, name, size);
+ grow->bo->kflags |= EXEC_OBJECT_CAPTURE;
+ grow->partial_bo = NULL;
+ grow->partial_bo_map = NULL;
+ grow->partial_bytes = 0;
+ if (batch->use_shadow_copy)
+ grow->map = realloc(grow->map, grow->bo->size);
+ else
+ grow->map = crocus_bo_map(NULL, grow->bo, MAP_READ | MAP_WRITE);
+ grow->map_next = grow->map;
+}
+
+static void
+create_batch(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ recreate_growing_buffer(batch, &batch->command,
+ "command buffer",
+ BATCH_SZ + BATCH_RESERVED(&screen->devinfo));
+
+ crocus_use_bo(batch, batch->command.bo, false);
+
+ recreate_growing_buffer(batch, &batch->state,
+ "state buffer",
+ STATE_SZ);
+
+ batch->state.used = 1;
+ crocus_use_bo(batch, batch->state.bo, false);
+}
+
+static void
+crocus_batch_maybe_noop(struct crocus_batch *batch)
+{
+ /* We only insert the NOOP at the beginning of the batch. */
+ assert(crocus_batch_bytes_used(batch) == 0);
+
+ if (batch->noop_enabled) {
+ /* Emit MI_BATCH_BUFFER_END to prevent any further commands from being
+ * executed.
+ */
+ uint32_t *map = batch->command.map_next;
+
+ map[0] = (0xA << 23);
+
+ batch->command.map_next += 4;
+ }
+}
+
+static void
+crocus_batch_reset(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ crocus_bo_unreference(batch->command.bo);
+ crocus_bo_unreference(batch->state.bo);
+ batch->primary_batch_size = 0;
+ batch->contains_draw = false;
+ batch->contains_fence_signal = false;
+ batch->state_base_address_emitted = false;
+ batch->screen->vtbl.batch_reset_dirty(batch);
+
+ create_batch(batch);
+ assert(batch->command.bo->index == 0);
+
+ if (batch->state_sizes)
+ _mesa_hash_table_u64_clear(batch->state_sizes);
+ struct crocus_syncobj *syncobj = crocus_create_syncobj(screen);
+ crocus_batch_add_syncobj(batch, syncobj, I915_EXEC_FENCE_SIGNAL);
+ crocus_syncobj_reference(screen, &syncobj, NULL);
+
+ crocus_cache_sets_clear(batch);
+}
+
+void
+crocus_batch_free(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+
+ if (batch->use_shadow_copy) {
+ free(batch->command.map);
+ free(batch->state.map);
+ }
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ crocus_bo_unreference(batch->exec_bos[i]);
+ }
+
+ pipe_resource_reference(&batch->fine_fences.ref.res, NULL);
+
+ free(batch->command.relocs.relocs);
+ free(batch->state.relocs.relocs);
+ free(batch->exec_bos);
+ free(batch->validation_list);
+
+ ralloc_free(batch->exec_fences.mem_ctx);
+
+ util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
+ crocus_syncobj_reference(screen, s, NULL);
+ ralloc_free(batch->syncobjs.mem_ctx);
+
+ crocus_fine_fence_reference(batch->screen, &batch->last_fence, NULL);
+ if (batch_has_fine_fence(batch))
+ u_upload_destroy(batch->fine_fences.uploader);
+
+ crocus_bo_unreference(batch->command.bo);
+ batch->command.bo = NULL;
+ batch->command.map = NULL;
+ batch->command.map_next = NULL;
+
+ crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);
+
+ _mesa_hash_table_destroy(batch->cache.render, NULL);
+ _mesa_set_destroy(batch->cache.depth, NULL);
+
+ if (batch->state_sizes) {
+ _mesa_hash_table_u64_destroy(batch->state_sizes);
+ intel_batch_decode_ctx_finish(&batch->decoder);
+ }
+}
+
+/**
+ * If we've chained to a secondary batch, or are getting near to the end,
+ * then flush. This should only be called between draws.
+ */
+void
+crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate)
+{
+ if (batch->command.bo != batch->exec_bos[0] ||
+ crocus_batch_bytes_used(batch) + estimate >= BATCH_SZ) {
+ crocus_batch_flush(batch);
+ }
+}
+
+/**
+ * Finish copying the old batch/state buffer's contents to the new one
+ * after we tried to "grow" the buffer in an earlier operation.
+ */
+static void
+finish_growing_bos(struct crocus_growing_bo *grow)
+{
+ struct crocus_bo *old_bo = grow->partial_bo;
+ if (!old_bo)
+ return;
+
+ memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
+
+ grow->partial_bo = NULL;
+ grow->partial_bo_map = NULL;
+ grow->partial_bytes = 0;
+
+ crocus_bo_unreference(old_bo);
+}
+
+void
+crocus_grow_buffer(struct crocus_batch *batch, bool grow_state,
+ unsigned used,
+ unsigned new_size)
+{
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+ struct crocus_growing_bo *grow = grow_state ? &batch->state : &batch->command;
+ struct crocus_bo *bo = grow->bo;
+
+ if (grow->partial_bo) {
+ /* We've already grown once, and now we need to do it again.
+ * Finish our last grow operation so we can start a new one.
+ * This should basically never happen.
+ */
+ finish_growing_bos(grow);
+ }
+
+ struct crocus_bo *new_bo = crocus_bo_alloc(bufmgr, bo->name, new_size);
+
+ /* Copy existing data to the new larger buffer */
+ grow->partial_bo_map = grow->map;
+
+ if (batch->use_shadow_copy) {
+ /* We can't safely use realloc, as it may move the existing buffer,
+ * breaking existing pointers the caller may still be using. Just
+ * malloc a new copy and memcpy it like the normal BO path.
+ *
+ * Use bo->size rather than new_size because the bufmgr may have
+ * rounded up the size, and we want the shadow size to match.
+ */
+ grow->map = malloc(new_bo->size);
+ } else {
+ grow->map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE);
+ }
+ /* Try to put the new BO at the same GTT offset as the old BO (which
+ * we're throwing away, so it doesn't need to be there).
+ *
+ * This guarantees that our relocations continue to work: values we've
+ * already written into the buffer, values we're going to write into the
+ * buffer, and the validation/relocation lists all will match.
+ *
+ * Also preserve kflags for EXEC_OBJECT_CAPTURE.
+ */
+ new_bo->gtt_offset = bo->gtt_offset;
+ new_bo->index = bo->index;
+ new_bo->kflags = bo->kflags;
+
+ /* Batch/state buffers are per-context, and if we've run out of space,
+ * we must have actually used them before, so...they will be in the list.
+ */
+ assert(bo->index < batch->exec_count);
+ assert(batch->exec_bos[bo->index] == bo);
+
+ /* Update the validation list to use the new BO. */
+ batch->validation_list[bo->index].handle = new_bo->gem_handle;
+ /* Exchange the two BOs...without breaking pointers to the old BO.
+ *
+ * Consider this scenario:
+ *
+ * 1. Somebody calls brw_state_batch() to get a region of memory, and
+ * then creates a brw_address pointing to brw->batch.state.bo.
+ * 2. They then call brw_state_batch() a second time, which happens to
+ * grow and replace the state buffer. They then try to emit a
+ * relocation to their first section of memory.
+ *
+ * If we replace the brw->batch.state.bo pointer at step 2, we would
+ * break the address created in step 1. They'd have a pointer to the
+ * old destroyed BO. Emitting a relocation would add this dead BO to
+ * the validation list...causing /both/ statebuffers to be in the list,
+ * and all kinds of disasters.
+ *
+ * This is not a contrived case - BLORP vertex data upload hits this.
+ *
+ * There are worse scenarios too. Fences for GL sync objects reference
+ * brw->batch.batch.bo. If we replaced the batch pointer when growing,
+ * we'd need to chase down every fence and update it to point to the
+ * new BO. Otherwise, it would refer to a "batch" that never actually
+ * gets submitted, and would fail to trigger.
+ *
+ * To work around both of these issues, we transmutate the buffers in
+ * place, making the existing struct brw_bo represent the new buffer,
+ * and "new_bo" represent the old BO. This is highly unusual, but it
+ * seems like a necessary evil.
+ *
+ * We also defer the memcpy of the existing batch's contents. Callers
+ * may make multiple brw_state_batch calls, and retain pointers to the
+ * old BO's map. We'll perform the memcpy in finish_growing_bos() when
+ * we finally submit the batch, at which point we've finished uploading
+ * state, and nobody should have any old references anymore.
+ *
+ * To do that, we keep a reference to the old BO in grow->partial_bo,
+ * and store the number of bytes to copy in grow->partial_bytes. We
+ * can monkey with the refcounts directly without atomics because these
+ * are per-context BOs and they can only be touched by this thread.
+ */
+ assert(new_bo->refcount == 1);
+ new_bo->refcount = bo->refcount;
+ bo->refcount = 1;
+
+ struct crocus_bo tmp;
+ memcpy(&tmp, bo, sizeof(struct crocus_bo));
+ memcpy(bo, new_bo, sizeof(struct crocus_bo));
+ memcpy(new_bo, &tmp, sizeof(struct crocus_bo));
+
+ grow->partial_bo = new_bo; /* the one reference of the OLD bo */
+ grow->partial_bytes = used;
+}
+
+static void
+finish_seqno(struct crocus_batch *batch)
+{
+ struct crocus_fine_fence *sq = crocus_fine_fence_new(batch, CROCUS_FENCE_END);
+ if (!sq)
+ return;
+
+ crocus_fine_fence_reference(batch->screen, &batch->last_fence, sq);
+ crocus_fine_fence_reference(batch->screen, &sq, NULL);
+}
+
+/**
+ * Terminate a batch with MI_BATCH_BUFFER_END.
+ */
+static void
+crocus_finish_batch(struct crocus_batch *batch)
+{
+
+ batch->no_wrap = true;
+ if (batch->screen->vtbl.finish_batch)
+ batch->screen->vtbl.finish_batch(batch);
+
+ finish_seqno(batch);
+
+ /* Emit MI_BATCH_BUFFER_END to finish our batch. */
+ uint32_t *map = batch->command.map_next;
+
+ map[0] = (0xA << 23);
+
+ batch->command.map_next += 4;
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->command.map, crocus_batch_bytes_used(batch)));
+
+ if (batch->command.bo == batch->exec_bos[0])
+ batch->primary_batch_size = crocus_batch_bytes_used(batch);
+ batch->no_wrap = false;
+}
+
+/**
+ * Replace our current GEM context with a new one (in case it got banned).
+ */
+static bool
+replace_hw_ctx(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+
+ uint32_t new_ctx = crocus_clone_hw_context(bufmgr, batch->hw_ctx_id);
+ if (!new_ctx)
+ return false;
+
+ crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);
+ batch->hw_ctx_id = new_ctx;
+
+ /* Notify the context that state must be re-initialized. */
+ crocus_lost_context_state(batch);
+
+ return true;
+}
+
+enum pipe_reset_status
+crocus_batch_check_for_reset(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+ enum pipe_reset_status status = PIPE_NO_RESET;
+ struct drm_i915_reset_stats stats = { .ctx_id = batch->hw_ctx_id };
+
+ if (drmIoctl(screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
+ DBG("DRM_IOCTL_I915_GET_RESET_STATS failed: %s\n", strerror(errno));
+
+ if (stats.batch_active != 0) {
+ /* A reset was observed while a batch from this hardware context was
+ * executing. Assume that this context was at fault.
+ */
+ status = PIPE_GUILTY_CONTEXT_RESET;
+ } else if (stats.batch_pending != 0) {
+ /* A reset was observed while a batch from this context was in progress,
+ * but the batch was not executing. In this case, assume that the
+ * context was not at fault.
+ */
+ status = PIPE_INNOCENT_CONTEXT_RESET;
+ }
+
+ if (status != PIPE_NO_RESET) {
+ /* Our context is likely banned, or at least in an unknown state.
+ * Throw it away and start with a fresh context. Ideally this may
+ * catch the problem before our next execbuf fails with -EIO.
+ */
+ replace_hw_ctx(batch);
+ }
+
+ return status;
+}
+
+/**
+ * Submit the batch to the GPU via execbuffer2.
+ */
+static int
+submit_batch(struct crocus_batch *batch)
+{
+
+ if (batch->use_shadow_copy) {
+ void *bo_map = crocus_bo_map(batch->dbg, batch->command.bo, MAP_WRITE);
+ memcpy(bo_map, batch->command.map, crocus_batch_bytes_used(batch));
+
+ bo_map = crocus_bo_map(batch->dbg, batch->state.bo, MAP_WRITE);
+ memcpy(bo_map, batch->state.map, batch->state.used);
+ }
+
+ crocus_bo_unmap(batch->command.bo);
+ crocus_bo_unmap(batch->state.bo);
+
+ /* The requirements for using I915_EXEC_NO_RELOC are:
+ *
+ * The addresses written in the objects must match the corresponding
+ * reloc.gtt_offset which in turn must match the corresponding
+ * execobject.offset.
+ *
+ * Any render targets written to in the batch must be flagged with
+ * EXEC_OBJECT_WRITE.
+ *
+ * To avoid stalling, execobject.offset should match the current
+ * address of that object within the active context.
+ */
+ /* Set statebuffer relocations */
+ const unsigned state_index = batch->state.bo->index;
+ if (state_index < batch->exec_count &&
+ batch->exec_bos[state_index] == batch->state.bo) {
+ struct drm_i915_gem_exec_object2 *entry =
+ &batch->validation_list[state_index];
+ assert(entry->handle == batch->state.bo->gem_handle);
+ entry->relocation_count = batch->state.relocs.reloc_count;
+ entry->relocs_ptr = (uintptr_t)batch->state.relocs.relocs;
+ }
+
+ /* Set batchbuffer relocations */
+ struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
+ assert(entry->handle == batch->command.bo->gem_handle);
+ entry->relocation_count = batch->command.relocs.reloc_count;
+ entry->relocs_ptr = (uintptr_t)batch->command.relocs.relocs;
+
+ struct drm_i915_gem_execbuffer2 execbuf = {
+ .buffers_ptr = (uintptr_t)batch->validation_list,
+ .buffer_count = batch->exec_count,
+ .batch_start_offset = 0,
+ /* This must be QWord aligned. */
+ .batch_len = ALIGN(batch->primary_batch_size, 8),
+ .flags = I915_EXEC_RENDER |
+ I915_EXEC_NO_RELOC |
+ I915_EXEC_BATCH_FIRST |
+ I915_EXEC_HANDLE_LUT,
+ .rsvd1 = batch->hw_ctx_id, /* rsvd1 is actually the context ID */
+ };
+
+ if (num_fences(batch)) {
+ execbuf.flags |= I915_EXEC_FENCE_ARRAY;
+ execbuf.num_cliprects = num_fences(batch);
+ execbuf.cliprects_ptr =
+ (uintptr_t)util_dynarray_begin(&batch->exec_fences);
+ }
+
+ int ret = 0;
+ if (!batch->screen->no_hw &&
+ intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
+ ret = -errno;
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ struct crocus_bo *bo = batch->exec_bos[i];
+
+ bo->idle = false;
+ bo->index = -1;
+
+ /* Update brw_bo::gtt_offset */
+ if (batch->validation_list[i].offset != bo->gtt_offset) {
+ DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
+ bo->gem_handle, bo->gtt_offset,
+ batch->validation_list[i].offset);
+ assert(!(bo->kflags & EXEC_OBJECT_PINNED));
+ bo->gtt_offset = batch->validation_list[i].offset;
+ }
+ }
+
+ return ret;
+}
+
+static const char *
+batch_name_to_string(enum crocus_batch_name name)
+{
+ const char *names[CROCUS_BATCH_COUNT] = {
+ [CROCUS_BATCH_RENDER] = "render",
+ [CROCUS_BATCH_COMPUTE] = "compute",
+ };
+ return names[name];
+}
+
+/**
+ * Flush the batch buffer, submitting it to the GPU and resetting it so
+ * we're ready to emit the next batch.
+ */
+void
+_crocus_batch_flush(struct crocus_batch *batch, const char *file, int line)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ /* An otherwise empty batch must still be flushed if it signals a fence. */
+ if (crocus_batch_bytes_used(batch) == 0 && !batch->contains_fence_signal)
+ return;
+
+ assert(!batch->no_wrap);
+ crocus_finish_batch(batch);
+
+ finish_growing_bos(&batch->command);
+ finish_growing_bos(&batch->state);
+ int ret = submit_batch(batch);
+
+ if (unlikely(INTEL_DEBUG &
+ (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL))) {
+ int bytes_for_commands = crocus_batch_bytes_used(batch);
+ int second_bytes = 0;
+ if (batch->command.bo != batch->exec_bos[0]) {
+ second_bytes = bytes_for_commands;
+ bytes_for_commands += batch->primary_batch_size;
+ }
+ fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5d+%5db (%0.1f%%) "
+ "(cmds), %4d BOs (%0.1fMb aperture),"
+ " %4d command relocs, %4d state relocs\n",
+ file, line, batch_name_to_string(batch->name), batch->hw_ctx_id,
+ batch->primary_batch_size, second_bytes,
+ 100.0f * bytes_for_commands / BATCH_SZ,
+ batch->exec_count,
+ (float) batch->aperture_space / (1024 * 1024),
+ batch->command.relocs.reloc_count,
+ batch->state.relocs.reloc_count);
+
+ if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT)) {
+ dump_fence_list(batch);
+ dump_validation_list(batch);
+ }
+
+ if (INTEL_DEBUG & DEBUG_BATCH) {
+ decode_batch(batch);
+ }
+ }
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ struct crocus_bo *bo = batch->exec_bos[i];
+ crocus_bo_unreference(bo);
+ }
+
+ batch->command.relocs.reloc_count = 0;
+ batch->state.relocs.reloc_count = 0;
+ batch->exec_count = 0;
+ batch->aperture_space = 0;
+
+ util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
+ crocus_syncobj_reference(screen, s, NULL);
+ util_dynarray_clear(&batch->syncobjs);
+
+ util_dynarray_clear(&batch->exec_fences);
+
+ if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
+ dbg_printf("waiting for idle\n");
+ crocus_bo_wait_rendering(batch->command.bo); /* if execbuf failed; this is a nop */
+ }
+
+ /* Start a new batch buffer. */
+ crocus_batch_reset(batch);
+
+ /* EIO means our context is banned. In this case, try and replace it
+ * with a new logical context, and inform crocus_context that all state
+ * has been lost and needs to be re-initialized. If this succeeds,
+ * dubiously claim success...
+ */
+ if (ret == -EIO && replace_hw_ctx(batch)) {
+ if (batch->reset->reset) {
+ /* Tell the state tracker the device is lost and it was our fault. */
+ batch->reset->reset(batch->reset->data, PIPE_GUILTY_CONTEXT_RESET);
+ }
+
+ ret = 0;
+ }
+
+ if (ret < 0) {
+#ifdef DEBUG
+ const bool color = INTEL_DEBUG & DEBUG_COLOR;
+ fprintf(stderr, "%scrocus: Failed to submit batchbuffer: %-80s%s\n",
+ color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : "");
+#endif
+ abort();
+ }
+}
+
+/**
+ * Does the current batch refer to the given BO?
+ *
+ * (In other words, is the BO in the current batch's validation list?)
+ */
+bool
+crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo)
+{
+ return find_validation_entry(batch, bo) != NULL;
+}
+
+/**
+ * Updates the state of the noop feature. Returns true if there was a noop
+ * transition that led to state invalidation.
+ */
+bool
+crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable)
+{
+ if (batch->noop_enabled == noop_enable)
+ return 0;
+
+ batch->noop_enabled = noop_enable;
+
+ crocus_batch_flush(batch);
+
+ /* If the batch was empty, flush had no effect, so insert our noop. */
+ if (crocus_batch_bytes_used(batch) == 0)
+ crocus_batch_maybe_noop(batch);
+
+ /* We only need to update the entire state if we transition from noop ->
+ * not-noop.
+ */
+ return !batch->noop_enabled;
+}
diff --git a/src/gallium/drivers/crocus/crocus_batch.h b/src/gallium/drivers/crocus/crocus_batch.h
new file mode 100644
index 00000000000..fe6857d83ed
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_batch.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_BATCH_DOT_H
+#define CROCUS_BATCH_DOT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "util/u_dynarray.h"
+
+#include "common/intel_decoder.h"
+#include "drm-uapi/i915_drm.h"
+
+#include "crocus_fence.h"
+#include "crocus_fine_fence.h"
+
+#include "crocus_bufmgr.h"
+/* The kernel assumes batchbuffers are smaller than 256kB. */
+#define MAX_BATCH_SIZE (256 * 1024)
+
+/* 3DSTATE_BINDING_TABLE_POINTERS has a U16 offset from Surface State Base
+ * Address, which means that we can't put binding tables beyond 64kB. This
+ * effectively limits the maximum statebuffer size to 64kB.
+ */
+#define MAX_STATE_SIZE (64 * 1024)
+
+/* Our target batch size - flush approximately at this point. */
+#define BATCH_SZ (20 * 1024)
+#define STATE_SZ (16 * 1024)
+
+enum crocus_batch_name {
+ CROCUS_BATCH_RENDER,
+ CROCUS_BATCH_COMPUTE,
+};
+
+#define CROCUS_BATCH_COUNT 2
+
+struct crocus_address {
+ struct crocus_bo *bo;
+ int32_t offset;
+ uint32_t reloc_flags;
+};
+
+struct crocus_reloc_list {
+ struct drm_i915_gem_relocation_entry *relocs;
+ int reloc_count;
+ int reloc_array_size;
+};
+
+struct crocus_growing_bo {
+ struct crocus_bo *bo;
+ void *map;
+ void *map_next;
+ struct crocus_bo *partial_bo;
+ void *partial_bo_map;
+ unsigned partial_bytes;
+ struct crocus_reloc_list relocs;
+ unsigned used;
+};
+
+struct crocus_batch {
+ struct crocus_context *ice;
+ struct crocus_screen *screen;
+ struct pipe_debug_callback *dbg;
+ struct pipe_device_reset_callback *reset;
+
+ /** What batch is this? (e.g. CROCUS_BATCH_RENDER/COMPUTE) */
+ enum crocus_batch_name name;
+
+ /** buffers: command, state */
+ struct crocus_growing_bo command, state;
+
+ /** Size of the primary batch if we've moved on to a secondary. */
+ unsigned primary_batch_size;
+
+ bool state_base_address_emitted;
+ uint8_t pipe_controls_since_last_cs_stall;
+
+ uint32_t hw_ctx_id;
+
+ uint32_t valid_reloc_flags;
+
+ bool use_shadow_copy;
+ bool no_wrap;
+
+ /** The validation list */
+ struct drm_i915_gem_exec_object2 *validation_list;
+ struct crocus_bo **exec_bos;
+ int exec_count;
+ int exec_array_size;
+
+ /** Whether INTEL_BLACKHOLE_RENDER is enabled in the batch (aka first
+ * instruction is a MI_BATCH_BUFFER_END).
+ */
+ bool noop_enabled;
+
+ /**
+ * A list of crocus_syncobjs associated with this batch.
+ *
+ * The first list entry will always be a signalling sync-point, indicating
+ * that this batch has completed. The others are likely to be sync-points
+ * to wait on before executing the batch.
+ */
+ struct util_dynarray syncobjs;
+
+ /** A list of drm_i915_exec_fences to have execbuf signal or wait on */
+ struct util_dynarray exec_fences;
+
+ /** The amount of aperture space (in bytes) used by all exec_bos */
+ int aperture_space;
+
+ struct {
+ /** Uploader to use for sequence numbers */
+ struct u_upload_mgr *uploader;
+
+ /** GPU buffer and CPU map where our seqno's will be written. */
+ struct crocus_state_ref ref;
+ uint32_t *map;
+
+ /** The sequence number to write the next time we add a fence. */
+ uint32_t next;
+ } fine_fences;
+
+ /** A seqno (and syncobj) for the last batch that was submitted. */
+ struct crocus_fine_fence *last_fence;
+
+ /** List of other batches which we might need to flush to use a BO */
+ struct crocus_batch *other_batches[CROCUS_BATCH_COUNT - 1];
+
+ struct {
+ /**
+ * Set of struct brw_bo * that have been rendered to within this
+ * batchbuffer and would need flushing before being used from another
+ * cache domain that isn't coherent with it (i.e. the sampler).
+ */
+ struct hash_table *render;
+
+ /**
+ * Set of struct brw_bo * that have been used as a depth buffer within
+ * this batchbuffer and would need flushing before being used from
+ * another cache domain that isn't coherent with it (i.e. the sampler).
+ */
+ struct set *depth;
+ } cache;
+
+ struct intel_batch_decode_ctx decoder;
+ struct hash_table_u64 *state_sizes;
+
+ /** Have we emitted any draw calls to this batch? */
+ bool contains_draw;
+
+ /** Batch contains fence signal operation. */
+ bool contains_fence_signal;
+};
+
+static inline bool
+batch_has_fine_fence(struct crocus_batch *batch)
+{
+ return !!batch->fine_fences.uploader;
+}
+
+#define BATCH_HAS_FINE_FENCES(batch) (!!(batch)->fine_fences.uploader)
+void crocus_init_batch(struct crocus_context *ctx,
+ enum crocus_batch_name name,
+ int priority);
+void crocus_batch_free(struct crocus_batch *batch);
+void crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate);
+
+void _crocus_batch_flush(struct crocus_batch *batch, const char *file, int line);
+#define crocus_batch_flush(batch) _crocus_batch_flush((batch), __FILE__, __LINE__)
+
+bool crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo);
+
+bool crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable);
+
+#define RELOC_WRITE EXEC_OBJECT_WRITE
+#define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT
+/* Inverted meaning, but using the same bit...emit_reloc will flip it. */
+#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS
+
+void crocus_use_pinned_bo(struct crocus_batch *batch, struct crocus_bo *bo,
+ bool writable);
+uint64_t crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset,
+ struct crocus_bo *target, uint32_t target_offset,
+ unsigned int reloc_flags);
+uint64_t crocus_state_reloc(struct crocus_batch *batch, uint32_t batch_offset,
+ struct crocus_bo *target, uint32_t target_offset,
+ unsigned int reloc_flags);
+
+enum pipe_reset_status crocus_batch_check_for_reset(struct crocus_batch *batch);
+
+void crocus_grow_buffer(struct crocus_batch *batch, bool grow_state,
+ unsigned used, unsigned new_size);
+
+static inline unsigned
+crocus_batch_bytes_used(struct crocus_batch *batch)
+{
+ return batch->command.map_next - batch->command.map;
+}
+
+/**
+ * Ensure the current command buffer has \param size bytes of space
+ * remaining. If not, this flushes the batch, or grows the command buffer
+ * when flushing isn't allowed (batch->no_wrap).
+ *
+ * Most callers want crocus_get_command_space() instead.
+ */
+static inline void
+crocus_require_command_space(struct crocus_batch *batch, unsigned size)
+{
+ const unsigned required_bytes = crocus_batch_bytes_used(batch) + size;
+ unsigned used = crocus_batch_bytes_used(batch);
+ if (required_bytes >= BATCH_SZ && !batch->no_wrap) {
+ crocus_batch_flush(batch);
+ } else if (used + size >= batch->command.bo->size) {
+ const unsigned new_size =
+ MIN2(batch->command.bo->size + batch->command.bo->size / 2,
+ MAX_BATCH_SIZE);
+
+ crocus_grow_buffer(batch, false, used, new_size);
+ batch->command.map_next = (void *)batch->command.map + used;
+ assert(crocus_batch_bytes_used(batch) + size < batch->command.bo->size);
+ }
+}
+
+/**
+ * Allocate space in the current command buffer, and return a pointer
+ * to the mapped area so the caller can write commands there.
+ *
+ * This should be called whenever emitting commands.
+ */
+static inline void *
+crocus_get_command_space(struct crocus_batch *batch, unsigned bytes)
+{
+ crocus_require_command_space(batch, bytes);
+ void *map = batch->command.map_next;
+ batch->command.map_next += bytes;
+ return map;
+}
+
+/**
+ * Helper to emit GPU commands - allocates space, copies them there.
+ */
+static inline void
+crocus_batch_emit(struct crocus_batch *batch, const void *data, unsigned size)
+{
+ void *map = crocus_get_command_space(batch, size);
+ memcpy(map, data, size);
+}
+
+/**
+ * Get a pointer to the batch's signalling syncobj. Does not refcount.
+ */
+static inline struct crocus_syncobj *
+crocus_batch_get_signal_syncobj(struct crocus_batch *batch)
+{
+ /* The signalling syncobj is the first one in the list. */
+ struct crocus_syncobj *syncobj =
+ ((struct crocus_syncobj **)util_dynarray_begin(&batch->syncobjs))[0];
+ return syncobj;
+}
+
+/**
+ * Take a reference to the batch's signalling syncobj.
+ *
+ * Callers can use this to wait for the current batch under construction
+ * to complete (after flushing it).
+ */
+static inline void
+crocus_batch_reference_signal_syncobj(struct crocus_batch *batch,
+ struct crocus_syncobj **out_syncobj)
+{
+ struct crocus_syncobj *syncobj = crocus_batch_get_signal_syncobj(batch);
+ crocus_syncobj_reference(batch->screen, out_syncobj, syncobj);
+}
+
+/**
+ * Record the size of a piece of state for use in INTEL_DEBUG=bat printing.
+ */
+static inline void
+crocus_record_state_size(struct hash_table_u64 *ht, uint32_t offset_from_base,
+ uint32_t size)
+{
+ if (ht) {
+ _mesa_hash_table_u64_insert(ht, offset_from_base,
+ (void *)(uintptr_t)size);
+ }
+}
+
+static inline bool
+crocus_ptr_in_state_buffer(struct crocus_batch *batch, void *p)
+{
+ return (char *)p >= (char *)batch->state.map &&
+ (char *)p < (char *)batch->state.map + batch->state.bo->size;
+}
+
+static inline void
+crocus_require_statebuffer_space(struct crocus_batch *batch, int size)
+{
+ if (batch->state.used + size >= STATE_SZ)
+ crocus_batch_flush(batch);
+}
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_blit.c b/src/gallium/drivers/crocus/crocus_blit.c
new file mode 100644
index 00000000000..9cae82e3e2d
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_blit.c
@@ -0,0 +1,836 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/format/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/ralloc.h"
+#include "intel/blorp/blorp.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+
+void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond)
+{
+ util_blitter_save_vertex_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_VERTEX]);
+ util_blitter_save_tessctrl_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL]);
+ util_blitter_save_tesseval_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]);
+ util_blitter_save_geometry_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]);
+ util_blitter_save_so_targets(ice->blitter, ice->state.so_targets,
+ (struct pipe_stream_output_target**)ice->state.so_target);
+ util_blitter_save_vertex_buffer_slot(ice->blitter, ice->state.vertex_buffers);
+ util_blitter_save_vertex_elements(ice->blitter, (void *)ice->state.cso_vertex_elements);
+ if (op & CROCUS_SAVE_FRAGMENT_STATE) {
+ util_blitter_save_blend(ice->blitter, ice->state.cso_blend);
+ util_blitter_save_depth_stencil_alpha(ice->blitter, ice->state.cso_zsa);
+ util_blitter_save_stencil_ref(ice->blitter, &ice->state.stencil_ref);
+ util_blitter_save_fragment_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]);
+ util_blitter_save_sample_mask(ice->blitter, ice->state.sample_mask);
+ util_blitter_save_rasterizer(ice->blitter, ice->state.cso_rast);
+ util_blitter_save_scissor(ice->blitter, &ice->state.scissors[0]);
+ util_blitter_save_viewport(ice->blitter, &ice->state.viewports[0]);
+ util_blitter_save_fragment_constant_buffer_slot(ice->blitter, &ice->state.shaders[MESA_SHADER_FRAGMENT].constbufs[0]);
+ }
+
+ if (!render_cond)
+ util_blitter_save_render_condition(ice->blitter,
+ (struct pipe_query *)ice->condition.query,
+ ice->condition.condition,
+ ice->condition.mode);
+
+// util_blitter_save_scissor(ice->blitter, &ice->scissors[0]);
+ if (op & CROCUS_SAVE_FRAMEBUFFER)
+ util_blitter_save_framebuffer(ice->blitter, &ice->state.framebuffer);
+
+ if (op & CROCUS_SAVE_TEXTURES) {
+ util_blitter_save_fragment_sampler_states(ice->blitter, 1, (void **)ice->state.shaders[MESA_SHADER_FRAGMENT].samplers);
+ util_blitter_save_fragment_sampler_views(ice->blitter, 1, (struct pipe_sampler_view **)ice->state.shaders[MESA_SHADER_FRAGMENT].textures);
+ }
+}
+
+/**
+ * Helper function for handling mirror image blits.
+ *
+ * If coord0 > coord1, swap them and return "true" (mirrored).
+ */
+static bool
+apply_mirror(float *coord0, float *coord1)
+{
+ if (*coord0 > *coord1) {
+ float tmp = *coord0;
+ *coord0 = *coord1;
+ *coord1 = tmp;
+ return true;
+ }
+ return false;
+}
+
+/**
+ * Compute the number of pixels to clip for each side of a rect
+ *
+ * \param x0 The rect's left coordinate
+ * \param y0 The rect's bottom coordinate
+ * \param x1 The rect's right coordinate
+ * \param y1 The rect's top coordinate
+ * \param min_x The clipping region's left coordinate
+ * \param min_y The clipping region's bottom coordinate
+ * \param max_x The clipping region's right coordinate
+ * \param max_y The clipping region's top coordinate
+ * \param clipped_x0 The number of pixels to clip from the left side
+ * \param clipped_y0 The number of pixels to clip from the bottom side
+ * \param clipped_x1 The number of pixels to clip from the right side
+ * \param clipped_y1 The number of pixels to clip from the top side
+ *
+ * \return false if we clip everything away, true otherwise
+ */
+static inline bool
+compute_pixels_clipped(float x0, float y0, float x1, float y1,
+ float min_x, float min_y, float max_x, float max_y,
+ float *clipped_x0, float *clipped_y0,
+ float *clipped_x1, float *clipped_y1)
+{
+ /* If we are going to clip everything away, stop. */
+ if (!(min_x <= max_x &&
+ min_y <= max_y &&
+ x0 <= max_x &&
+ y0 <= max_y &&
+ min_x <= x1 &&
+ min_y <= y1 &&
+ x0 <= x1 &&
+ y0 <= y1)) {
+ return false;
+ }
+
+ if (x0 < min_x)
+ *clipped_x0 = min_x - x0;
+ else
+ *clipped_x0 = 0;
+ if (max_x < x1)
+ *clipped_x1 = x1 - max_x;
+ else
+ *clipped_x1 = 0;
+
+ if (y0 < min_y)
+ *clipped_y0 = min_y - y0;
+ else
+ *clipped_y0 = 0;
+ if (max_y < y1)
+ *clipped_y1 = y1 - max_y;
+ else
+ *clipped_y1 = 0;
+
+ return true;
+}
+
+/**
+ * Clips a coordinate (left, right, top or bottom) for the src or dst rect
+ * (whichever requires the largest clip) and adjusts the coordinate
+ * for the other rect accordingly.
+ *
+ * \param mirror true if mirroring is required
+ * \param src the source rect coordinate (for example src_x0)
+ * \param dst0 the dst rect coordinate (for example dst_x0)
+ * \param dst1 the opposite dst rect coordinate (for example dst_x1)
+ * \param clipped_dst0 number of pixels to clip from the dst coordinate
+ * \param clipped_dst1 number of pixels to clip from the opposite dst coordinate
+ * \param scale the src vs dst scale involved for that coordinate
+ * \param is_left_or_bottom true if we are clipping the left or bottom sides
+ * of the rect.
+ */
+static void
+clip_coordinates(bool mirror,
+ float *src, float *dst0, float *dst1,
+ float clipped_dst0,
+ float clipped_dst1,
+ float scale,
+ bool is_left_or_bottom)
+{
+ /* When clipping we need to add or subtract pixels from the original
+ * coordinates depending on whether we are acting on the left/bottom
+ * or right/top sides of the rect respectively. We assume we have to
+ * add them in the code below, and multiply by -1 when we should
+ * subtract.
+ */
+ int mult = is_left_or_bottom ? 1 : -1;
+
+ if (!mirror) {
+ *dst0 += clipped_dst0 * mult;
+ *src += clipped_dst0 * scale * mult;
+ } else {
+ *dst1 -= clipped_dst1 * mult;
+ *src += clipped_dst1 * scale * mult;
+ }
+}
+
+/**
+ * Apply a scissor rectangle to blit coordinates.
+ *
+ * Returns true if the blit was entirely scissored away.
+ */
+static bool
+apply_blit_scissor(const struct pipe_scissor_state *scissor,
+ float *src_x0, float *src_y0,
+ float *src_x1, float *src_y1,
+ float *dst_x0, float *dst_y0,
+ float *dst_x1, float *dst_y1,
+ bool mirror_x, bool mirror_y)
+{
+ float clip_dst_x0, clip_dst_x1, clip_dst_y0, clip_dst_y1;
+
+ /* Compute number of pixels to scissor away. */
+ if (!compute_pixels_clipped(*dst_x0, *dst_y0, *dst_x1, *dst_y1,
+ scissor->minx, scissor->miny,
+ scissor->maxx, scissor->maxy,
+ &clip_dst_x0, &clip_dst_y0,
+ &clip_dst_x1, &clip_dst_y1))
+ return true;
+
+ // XXX: comments assume source clipping, which we don't do
+
+ /* When clipping any of the two rects we need to adjust the coordinates
+ * in the other rect considering the scaling factor involved. To obtain
+ * the best precision we want to make sure that we only clip once per
+ * side to avoid accumulating errors due to the scaling adjustment.
+ *
+ * For example, if src_x0 and dst_x0 need both to be clipped we want to
+ * avoid the situation where we clip src_x0 first, then adjust dst_x0
+ * accordingly but then we realize that the resulting dst_x0 still needs
+ * to be clipped, so we clip dst_x0 and adjust src_x0 again. Because we are
+ * applying scaling factors to adjust the coordinates in each clipping
+ * pass we lose some precision and that can affect the results of the
+ * blorp blit operation slightly. What we want to do here is detect the
+ * rect that we should clip first for each side so that when we adjust
+ * the other rect we ensure the resulting coordinate does not need to be
+ * clipped again.
+ *
+ * The code below implements this by comparing the number of pixels that
+ * we need to clip for each side of both rects considering the scales
+ * involved. For example, clip_src_x0 represents the number of pixels
+ * to be clipped for the src rect's left side, so if clip_src_x0 = 5,
+ * clip_dst_x0 = 4 and scale_x = 2 it means that we are clipping more
+ * from the dst rect so we should clip dst_x0 only and adjust src_x0.
+ * This is because clipping 4 pixels in the dst is equivalent to
+ * clipping 4 * 2 = 8 > 5 in the src.
+ */
+
+ if (*src_x0 == *src_x1 || *src_y0 == *src_y1
+ || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1)
+ return true;
+
+ float scale_x = (float) (*src_x1 - *src_x0) / (*dst_x1 - *dst_x0);
+ float scale_y = (float) (*src_y1 - *src_y0) / (*dst_y1 - *dst_y0);
+
+ /* Clip left side */
+ clip_coordinates(mirror_x, src_x0, dst_x0, dst_x1,
+ clip_dst_x0, clip_dst_x1, scale_x, true);
+
+ /* Clip right side */
+ clip_coordinates(mirror_x, src_x1, dst_x1, dst_x0,
+ clip_dst_x1, clip_dst_x0, scale_x, false);
+
+ /* Clip bottom side */
+ clip_coordinates(mirror_y, src_y0, dst_y0, dst_y1,
+ clip_dst_y0, clip_dst_y1, scale_y, true);
+
+ /* Clip top side */
+ clip_coordinates(mirror_y, src_y1, dst_y1, dst_y0,
+ clip_dst_y1, clip_dst_y0, scale_y, false);
+
+   /* Check for invalid bounds: we can't blit zero-dimension rectangles. */
+ return *src_x0 == *src_x1 || *src_y0 == *src_y1
+ || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1;
+}
+
+void
+crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl,
+ struct isl_device *isl_dev,
+ struct blorp_surf *surf,
+ struct pipe_resource *p_res,
+ enum isl_aux_usage aux_usage,
+ unsigned level,
+ bool is_render_target)
+{
+ struct crocus_resource *res = (void *) p_res;
+
+ assert(!crocus_resource_unfinished_aux_import(res));
+
+ if (isl_aux_usage_has_hiz(aux_usage) &&
+ !crocus_resource_level_has_hiz(res, level))
+ aux_usage = ISL_AUX_USAGE_NONE;
+
+ *surf = (struct blorp_surf) {
+ .surf = &res->surf,
+ .addr = (struct blorp_address) {
+ .buffer = res->bo,
+ .offset = res->offset,
+ .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0,
+ .mocs = crocus_mocs(res->bo, isl_dev),
+ },
+ .aux_usage = aux_usage,
+ };
+
+ if (aux_usage != ISL_AUX_USAGE_NONE) {
+ surf->aux_surf = &res->aux.surf;
+ surf->aux_addr = (struct blorp_address) {
+ .buffer = res->aux.bo,
+ .offset = res->aux.offset,
+ .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0,
+ .mocs = crocus_mocs(res->bo, isl_dev),
+ };
+ surf->clear_color =
+ crocus_resource_get_clear_color(res);
+ }
+}
+
+static void
+tex_cache_flush_hack(struct crocus_batch *batch,
+ enum isl_format view_format,
+ enum isl_format surf_format)
+{
+ /* The WaSamplerCacheFlushBetweenRedescribedSurfaceReads workaround says:
+ *
+ * "Currently Sampler assumes that a surface would not have two
+ * different format associate with it. It will not properly cache
+ * the different views in the MT cache, causing a data corruption."
+ *
+ * We may need to handle this for texture views in general someday, but
+ * for now we handle it here, as it hurts copies and blits particularly
+    * badly because they often reinterpret formats.
+ *
+ * If the BO hasn't been referenced yet this batch, we assume that the
+ * texture cache doesn't contain any relevant data nor need flushing.
+ *
+ * Icelake (Gen11+) claims to fix this issue, but seems to still have
+ * issues with ASTC formats.
+ */
+ bool need_flush = view_format != surf_format;
+ if (!need_flush)
+ return;
+
+ const char *reason =
+ "workaround: WaSamplerCacheFlushBetweenRedescribedSurfaceReads";
+
+ crocus_emit_pipe_control_flush(batch, reason, PIPE_CONTROL_CS_STALL);
+ crocus_emit_pipe_control_flush(batch, reason,
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+}
+
+static struct crocus_resource *
+crocus_resource_for_aspect(const struct intel_device_info *devinfo,
+ struct pipe_resource *p_res, unsigned pipe_mask)
+{
+ if (pipe_mask == PIPE_MASK_S) {
+ struct crocus_resource *junk, *s_res;
+ crocus_get_depth_stencil_resources(devinfo, p_res, &junk, &s_res);
+ return s_res;
+ } else {
+ return (struct crocus_resource *)p_res;
+ }
+}
+
+static enum pipe_format
+pipe_format_for_aspect(enum pipe_format format, unsigned pipe_mask)
+{
+ if (pipe_mask == PIPE_MASK_S) {
+ return util_format_stencil_only(format);
+ } else if (pipe_mask == PIPE_MASK_Z) {
+ return util_format_get_depth_only(format);
+ } else {
+ return format;
+ }
+}
+
+static void
+crocus_u_blitter(struct crocus_context *ice,
+ const struct pipe_blit_info *info)
+{
+ struct pipe_blit_info dinfo = *info;
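+   /* Don't ask u_blitter to write alpha if the destination format has none. */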
+ if (!util_format_has_alpha(dinfo.dst.resource->format))
+ dinfo.mask &= ~PIPE_MASK_A;
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+ util_blitter_blit(ice->blitter, &dinfo);
+}
+
+/**
+ * The pipe->blit() driver hook.
+ *
+ * This performs a blit between two surfaces, which copies data but may
+ * also perform format conversion, scaling, flipping, and so on.
+ */
+static void
+crocus_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ enum blorp_batch_flags blorp_flags = 0;
+
+ /* We don't support color masking. */
+ assert((info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA ||
+ (info->mask & PIPE_MASK_RGBA) == 0);
+
+ if (info->render_condition_enable)
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (devinfo->ver <= 5) {
+ if (!screen->vtbl.blit_blt(batch, info)) {
+
+ if (!util_format_is_depth_or_stencil(info->src.resource->format) &&
+ info->dst.resource->target != PIPE_TEXTURE_3D)
+ goto use_blorp;
+
+ if (!util_blitter_is_blit_supported(ice->blitter, info)) {
+ if (util_format_is_depth_or_stencil(info->src.resource->format)) {
+
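+            /* Blit the depth aspect with u_blitter first; stencil is handled
+             * separately through the stencil fallback path below.
+             */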
+ struct pipe_blit_info depth_blit = *info;
+ depth_blit.mask = PIPE_MASK_Z;
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+ util_blitter_blit(ice->blitter, &depth_blit);
+
+ struct pipe_surface *dst_view, dst_templ;
+ util_blitter_default_dst_texture(&dst_templ, info->dst.resource, info->dst.level, info->dst.box.z);
+ dst_view = ctx->create_surface(ctx, info->dst.resource, &dst_templ);
+
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+
+ util_blitter_clear_depth_stencil(ice->blitter, dst_view, PIPE_CLEAR_STENCIL,
+ 0, 0, info->dst.box.x, info->dst.box.y,
+ info->dst.box.width, info->dst.box.height);
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+ util_blitter_stencil_fallback(ice->blitter,
+ info->dst.resource,
+ info->dst.level,
+ &info->dst.box,
+ info->src.resource,
+ info->src.level,
+ &info->src.box, NULL);
+
+ }
+ return;
+ }
+
+ crocus_u_blitter(ice, info);
+ }
+ return;
+ }
+
+ if (devinfo->ver == 6) {
+ if (info->src.resource->target == PIPE_TEXTURE_3D &&
+ info->dst.resource->target == PIPE_TEXTURE_3D) {
+ crocus_u_blitter(ice, info);
+ return;
+ }
+ }
+
+use_blorp:
+ if (info->render_condition_enable) {
+ if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT)
+ blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
+ }
+
+ float src_x0 = info->src.box.x;
+ float src_x1 = info->src.box.x + info->src.box.width;
+ float src_y0 = info->src.box.y;
+ float src_y1 = info->src.box.y + info->src.box.height;
+ float dst_x0 = info->dst.box.x;
+ float dst_x1 = info->dst.box.x + info->dst.box.width;
+ float dst_y0 = info->dst.box.y;
+ float dst_y1 = info->dst.box.y + info->dst.box.height;
+ bool mirror_x = apply_mirror(&src_x0, &src_x1);
+ bool mirror_y = apply_mirror(&src_y0, &src_y1);
+ enum blorp_filter filter;
+
+ if (info->scissor_enable) {
+ bool noop = apply_blit_scissor(&info->scissor,
+ &src_x0, &src_y0, &src_x1, &src_y1,
+ &dst_x0, &dst_y0, &dst_x1, &dst_y1,
+ mirror_x, mirror_y);
+ if (noop)
+ return;
+ }
+
+ if (abs(info->dst.box.width) == abs(info->src.box.width) &&
+ abs(info->dst.box.height) == abs(info->src.box.height)) {
+ if (info->src.resource->nr_samples > 1 &&
+ info->dst.resource->nr_samples <= 1) {
+ /* The OpenGL ES 3.2 specification, section 16.2.1, says:
+ *
+ * "If the read framebuffer is multisampled (its effective
+ * value of SAMPLE_BUFFERS is one) and the draw framebuffer
+ * is not (its value of SAMPLE_BUFFERS is zero), the samples
+ * corresponding to each pixel location in the source are
+ * converted to a single sample before being written to the
+ * destination. The filter parameter is ignored. If the
+ * source formats are integer types or stencil values, a
+ * single sample’s value is selected for each pixel. If the
+ * source formats are floating-point or normalized types,
+ * the sample values for each pixel are resolved in an
+ * implementation-dependent manner. If the source formats
+ * are depth values, sample values are resolved in an
+ * implementation-dependent manner where the result will be
+ * between the minimum and maximum depth values in the pixel."
+ *
+ * When selecting a single sample, we always choose sample 0.
+ */
+ if (util_format_is_depth_or_stencil(info->src.format) ||
+ util_format_is_pure_integer(info->src.format)) {
+ filter = BLORP_FILTER_SAMPLE_0;
+ } else {
+ filter = BLORP_FILTER_AVERAGE;
+ }
+ } else {
+ /* The OpenGL 4.6 specification, section 18.3.1, says:
+ *
+ * "If the source and destination dimensions are identical,
+ * no filtering is applied."
+ *
+ * Using BLORP_FILTER_NONE will also handle the upsample case by
+ * replicating the one value in the source to all values in the
+ * destination.
+ */
+ filter = BLORP_FILTER_NONE;
+ }
+ } else if (info->filter == PIPE_TEX_FILTER_LINEAR) {
+ filter = BLORP_FILTER_BILINEAR;
+ } else {
+ filter = BLORP_FILTER_NEAREST;
+ }
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+ float src_z_step = (float)info->src.box.depth / (float)info->dst.box.depth;
+
+ /* There is no interpolation to the pixel center during rendering, so
+ * add the 0.5 offset ourselves here.
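+    *
+    * As an illustration, blitting a 4-deep 3D source into an 8-deep
+    * destination gives src_z_step = 0.5 and a depth_center_offset of 0.25,
+    * so source layers are sampled at z = 0.25, 0.75, 1.25, and so on.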
+ */
+ float depth_center_offset = 0;
+ if (info->src.resource->target == PIPE_TEXTURE_3D)
+ depth_center_offset = 0.5 / info->dst.box.depth * info->src.box.depth;
+
+ /* Perform a blit for each aspect requested by the caller. PIPE_MASK_R is
+ * used to represent the color aspect. */
+ unsigned aspect_mask = info->mask & (PIPE_MASK_R | PIPE_MASK_ZS);
+ while (aspect_mask) {
+ unsigned aspect = 1 << u_bit_scan(&aspect_mask);
+
+ struct crocus_resource *src_res =
+ crocus_resource_for_aspect(devinfo, info->src.resource, aspect);
+ struct crocus_resource *dst_res =
+ crocus_resource_for_aspect(devinfo, info->dst.resource, aspect);
+
+ enum pipe_format src_pfmt =
+ pipe_format_for_aspect(info->src.format, aspect);
+ enum pipe_format dst_pfmt =
+ pipe_format_for_aspect(info->dst.format, aspect);
+
+ if (crocus_resource_unfinished_aux_import(src_res))
+ crocus_resource_finish_aux_import(ctx->screen, src_res);
+ if (crocus_resource_unfinished_aux_import(dst_res))
+ crocus_resource_finish_aux_import(ctx->screen, dst_res);
+
+ struct crocus_format_info src_fmt =
+ crocus_format_for_usage(devinfo, src_pfmt, ISL_SURF_USAGE_TEXTURE_BIT);
+ enum isl_aux_usage src_aux_usage =
+ crocus_resource_texture_aux_usage(src_res);
+
+ crocus_resource_prepare_texture(ice, src_res, src_fmt.fmt,
+ info->src.level, 1, info->src.box.z,
+ info->src.box.depth);
+ // crocus_emit_buffer_barrier_for(batch, src_res->bo,
+ // CROCUS_DOMAIN_OTHER_READ);
+
+ struct crocus_format_info dst_fmt =
+ crocus_format_for_usage(devinfo, dst_pfmt,
+ ISL_SURF_USAGE_RENDER_TARGET_BIT);
+ enum isl_aux_usage dst_aux_usage =
+ crocus_resource_render_aux_usage(ice, dst_res, info->dst.level,
+ dst_fmt.fmt, false);
+
+ struct blorp_surf src_surf, dst_surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf,
+ &src_res->base, src_aux_usage,
+ info->src.level, false);
+ crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf,
+ &dst_res->base, dst_aux_usage,
+ info->dst.level, true);
+
+ crocus_resource_prepare_render(ice, dst_res, info->dst.level,
+ info->dst.box.z, info->dst.box.depth,
+ dst_aux_usage);
+ // crocus_emit_buffer_barrier_for(batch, dst_res->bo,
+ // CROCUS_DOMAIN_RENDER_WRITE);
+
+ if (crocus_batch_references(batch, src_res->bo))
+ tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format);
+
+ if (dst_res->base.target == PIPE_BUFFER) {
+ util_range_add(&dst_res->base, &dst_res->valid_buffer_range,
+ dst_x0, dst_x1);
+ }
+
+ struct isl_swizzle src_swiz = pipe_to_isl_swizzles(src_fmt.swizzles);
+ struct isl_swizzle dst_swiz = pipe_to_isl_swizzles(dst_fmt.swizzles);
+
+ for (int slice = 0; slice < info->dst.box.depth; slice++) {
+ unsigned dst_z = info->dst.box.z + slice;
+ float src_z = info->src.box.z + slice * src_z_step +
+ depth_center_offset;
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ blorp_blit(&blorp_batch,
+ &src_surf, info->src.level, src_z,
+ src_fmt.fmt, src_swiz,
+ &dst_surf, info->dst.level, dst_z,
+ dst_fmt.fmt, dst_swiz,
+ src_x0, src_y0, src_x1, src_y1,
+ dst_x0, dst_y0, dst_x1, dst_y1,
+ filter, mirror_x, mirror_y);
+
+ }
+
+ tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format);
+
+ crocus_resource_finish_render(ice, dst_res, info->dst.level,
+ info->dst.box.z, info->dst.box.depth,
+ dst_aux_usage);
+ }
+
+ blorp_batch_finish(&blorp_batch);
+
+ crocus_flush_and_dirty_for_history(ice, batch, (struct crocus_resource *)
+ info->dst.resource,
+ PIPE_CONTROL_RENDER_TARGET_FLUSH,
+ "cache history: post-blit");
+}
+
+static void
+get_copy_region_aux_settings(struct crocus_resource *res,
+ enum isl_aux_usage *out_aux_usage,
+ bool is_render_target)
+{
+ switch (res->aux.usage) {
+ case ISL_AUX_USAGE_MCS:
+ /* A stencil resolve operation must be performed prior to doing resource
+       * copies or use by the CPU.
+ * (see HSD 1209978162)
+ */
+ if (is_render_target && isl_surf_usage_is_stencil(res->surf.usage)) {
+ *out_aux_usage = ISL_AUX_USAGE_NONE;
+ } else {
+ *out_aux_usage = res->aux.usage;
+ }
+ break;
+ default:
+ *out_aux_usage = ISL_AUX_USAGE_NONE;
+ break;
+ }
+}
+
+/**
+ * Perform a GPU-based raw memory copy between compatible view classes.
+ *
+ * Does not perform any flushing - the new data may still be left in the
+ * render cache, and old data may remain in other caches.
+ *
+ * Wraps blorp_copy() and blorp_buffer_copy().
+ */
+void
+crocus_copy_region(struct blorp_context *blorp,
+ struct crocus_batch *batch,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct blorp_batch blorp_batch;
+ struct crocus_context *ice = blorp->driver_ctx;
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_resource *src_res = (void *) src;
+ struct crocus_resource *dst_res = (void *) dst;
+
+ if (devinfo->ver <= 5) {
+ if (screen->vtbl.copy_region_blt(batch, dst_res,
+ dst_level, dstx, dsty, dstz,
+ src_res, src_level, src_box))
+ return;
+ }
+ enum isl_aux_usage src_aux_usage, dst_aux_usage;
+ get_copy_region_aux_settings(src_res, &src_aux_usage,
+ false);
+ get_copy_region_aux_settings(dst_res, &dst_aux_usage,
+ true);
+
+ if (crocus_batch_references(batch, src_res->bo))
+ tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format);
+
+ if (dst->target == PIPE_BUFFER)
+ util_range_add(&dst_res->base, &dst_res->valid_buffer_range, dstx, dstx + src_box->width);
+
+ if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+ struct blorp_address src_addr = {
+ .buffer = crocus_resource_bo(src), .offset = src_box->x,
+ };
+ struct blorp_address dst_addr = {
+ .buffer = crocus_resource_bo(dst), .offset = dstx,
+ .reloc_flags = EXEC_OBJECT_WRITE,
+ };
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+ blorp_buffer_copy(&blorp_batch, src_addr, dst_addr, src_box->width);
+ blorp_batch_finish(&blorp_batch);
+ } else {
+ // XXX: what about one surface being a buffer and not the other?
+
+ struct blorp_surf src_surf, dst_surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf,
+ src, src_aux_usage, src_level, false);
+ crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf,
+ dst, dst_aux_usage, dst_level, true);
+
+ crocus_resource_prepare_access(ice, src_res, src_level, 1,
+ src_box->z, src_box->depth,
+ src_aux_usage, false);
+ crocus_resource_prepare_access(ice, dst_res, dst_level, 1,
+ dstz, src_box->depth,
+ dst_aux_usage, false);
+
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+
+ for (int slice = 0; slice < src_box->depth; slice++) {
+ crocus_batch_maybe_flush(batch, 1500);
+
+ blorp_copy(&blorp_batch, &src_surf, src_level, src_box->z + slice,
+ &dst_surf, dst_level, dstz + slice,
+ src_box->x, src_box->y, dstx, dsty,
+ src_box->width, src_box->height);
+ }
+ blorp_batch_finish(&blorp_batch);
+
+ crocus_resource_finish_write(ice, dst_res, dst_level, dstz,
+ src_box->depth, dst_aux_usage);
+ }
+
+ tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format);
+}
+
+static struct crocus_batch *
+get_preferred_batch(struct crocus_context *ice, struct crocus_bo *bo)
+{
+ /* If the compute batch is already using this buffer, we'd prefer to
+ * continue queueing in the compute batch.
+ */
+ if (crocus_batch_references(&ice->batches[CROCUS_BATCH_COMPUTE], bo))
+ return &ice->batches[CROCUS_BATCH_COMPUTE];
+
+ /* Otherwise default to the render batch. */
+ return &ice->batches[CROCUS_BATCH_RENDER];
+}
+
+
+/**
+ * The pipe->resource_copy_region() driver hook.
+ *
+ * This implements ARB_copy_image semantics - a raw memory copy between
+ * compatible view classes.
+ */
+static void
+crocus_resource_copy_region(struct pipe_context *ctx,
+ struct pipe_resource *p_dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *p_src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_resource *src = (void *) p_src;
+ struct crocus_resource *dst = (void *) p_dst;
+
+ if (crocus_resource_unfinished_aux_import(src))
+ crocus_resource_finish_aux_import(ctx->screen, src);
+ if (crocus_resource_unfinished_aux_import(dst))
+ crocus_resource_finish_aux_import(ctx->screen, dst);
+
+ /* Use MI_COPY_MEM_MEM for tiny (<= 16 byte, % 4) buffer copies. */
+ if (p_src->target == PIPE_BUFFER && p_dst->target == PIPE_BUFFER &&
+ (src_box->width % 4 == 0) && src_box->width <= 16 &&
+ screen->vtbl.copy_mem_mem) {
+ struct crocus_bo *dst_bo = crocus_resource_bo(p_dst);
+ batch = get_preferred_batch(ice, dst_bo);
+ crocus_batch_maybe_flush(batch, 24 + 5 * (src_box->width / 4));
+ crocus_emit_pipe_control_flush(batch,
+ "stall for MI_COPY_MEM_MEM copy_region",
+ PIPE_CONTROL_CS_STALL);
+ screen->vtbl.copy_mem_mem(batch, dst_bo, dstx, crocus_resource_bo(p_src),
+ src_box->x, src_box->width);
+ return;
+ }
+
+ if (devinfo->ver < 6 && util_format_is_depth_or_stencil(p_dst->format)) {
+ util_resource_copy_region(ctx, p_dst, dst_level, dstx, dsty, dstz,
+ p_src, src_level, src_box);
+ return;
+ }
+ crocus_copy_region(&ice->blorp, batch, p_dst, dst_level, dstx, dsty, dstz,
+ p_src, src_level, src_box);
+
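+   /* When the destination format has both depth and stencil, the stencil
+    * data lives in a separate resource on these generations, so copy that
+    * aspect with a second blorp copy.
+    */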
+ if (util_format_is_depth_and_stencil(p_dst->format) &&
+ util_format_has_stencil(util_format_description(p_src->format)) &&
+ devinfo->ver >= 6) {
+ struct crocus_resource *junk, *s_src_res, *s_dst_res;
+ crocus_get_depth_stencil_resources(devinfo, p_src, &junk, &s_src_res);
+ crocus_get_depth_stencil_resources(devinfo, p_dst, &junk, &s_dst_res);
+
+ crocus_copy_region(&ice->blorp, batch, &s_dst_res->base, dst_level, dstx,
+ dsty, dstz, &s_src_res->base, src_level, src_box);
+ }
+
+ crocus_flush_and_dirty_for_history(ice, batch, dst,
+ PIPE_CONTROL_RENDER_TARGET_FLUSH,
+ "cache history: post copy_region");
+}
+
+void
+crocus_init_blit_functions(struct pipe_context *ctx)
+{
+ ctx->blit = crocus_blit;
+ ctx->resource_copy_region = crocus_resource_copy_region;
+}
diff --git a/src/gallium/drivers/crocus/crocus_blorp.c b/src/gallium/drivers/crocus/crocus_blorp.c
new file mode 100644
index 00000000000..75f0078d535
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_blorp.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_blorp.c
+ *
+ * ============================= GENXML CODE =============================
+ * [This file is compiled once per generation.]
+ * =======================================================================
+ *
+ * GenX specific code for working with BLORP (blitting, resolves, clears
+ * on the 3D engine). This provides the driver-specific hooks needed to
+ * implement the BLORP API.
+ *
+ * See crocus_blit.c, crocus_clear.c, and so on.
+ */
+
+#include <assert.h>
+
+#include "crocus_batch.h"
+#include "crocus_resource.h"
+#include "crocus_context.h"
+
+#include "util/u_upload_mgr.h"
+#include "intel/common/intel_l3_config.h"
+
+#include "blorp/blorp_genX_exec.h"
+
+#if GFX_VER <= 5
+#include "gen4_blorp_exec.h"
+#endif
+
+static uint32_t *
+stream_state(struct crocus_batch *batch,
+ unsigned size,
+ unsigned alignment,
+ uint32_t *out_offset,
+ struct crocus_bo **out_bo)
+{
+ uint32_t offset = ALIGN(batch->state.used, alignment);
+
+ if (offset + size >= STATE_SZ && !batch->no_wrap) {
+ crocus_batch_flush(batch);
+ offset = ALIGN(batch->state.used, alignment);
+ } else if (offset + size >= batch->state.bo->size) {
+ const unsigned new_size =
+ MIN2(batch->state.bo->size + batch->state.bo->size / 2,
+ MAX_STATE_SIZE);
+ crocus_grow_buffer(batch, true, batch->state.used, new_size);
+ assert(offset + size < batch->state.bo->size);
+ }
+
+ crocus_record_state_size(batch->state_sizes, offset, size);
+
+ batch->state.used = offset + size;
+ *out_offset = offset;
+
+ /* If the caller has asked for a BO, we leave them the responsibility of
+ * adding bo->gtt_offset (say, by handing an address to genxml). If not,
+ * we assume they want the offset from a base address.
+ */
+ if (out_bo)
+ *out_bo = batch->state.bo;
+
+ return (uint32_t *)batch->state.map + (offset >> 2);
+}
+
+static void *
+blorp_emit_dwords(struct blorp_batch *blorp_batch, unsigned n)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ return crocus_get_command_space(batch, n * sizeof(uint32_t));
+}
+
+static uint64_t
+blorp_emit_reloc(struct blorp_batch *blorp_batch, UNUSED void *location,
+ struct blorp_address addr, uint32_t delta)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ uint32_t offset;
+
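+   /* On Gen4/5, BLORP emits some packets into the state buffer (see
+    * gen4_blorp_exec.h), so a relocation may target either buffer.
+    */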
+ if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) {
+ offset = (char *)location - (char *)batch->state.map;
+ return crocus_state_reloc(batch, offset,
+ addr.buffer, addr.offset + delta,
+ addr.reloc_flags);
+ }
+
+ assert(!crocus_ptr_in_state_buffer(batch, location));
+
+ offset = (char *)location - (char *)batch->command.map;
+ return crocus_command_reloc(batch, offset,
+ addr.buffer, addr.offset + delta,
+ addr.reloc_flags);
+}
+
+static void
+blorp_surface_reloc(struct blorp_batch *blorp_batch, uint32_t ss_offset,
+ struct blorp_address addr, uint32_t delta)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ struct crocus_bo *bo = addr.buffer;
+
+ uint64_t reloc_val =
+ crocus_state_reloc(batch, ss_offset, bo, addr.offset + delta,
+ addr.reloc_flags);
+
+ void *reloc_ptr = (void *)batch->state.map + ss_offset;
+ *(uint32_t *)reloc_ptr = reloc_val;
+}
+
+static uint64_t
+blorp_get_surface_address(struct blorp_batch *blorp_batch,
+ struct blorp_address addr)
+{
+ /* We'll let blorp_surface_reloc write the address. */
+ return 0ull;
+}
+
+#if GFX_VER >= 7
+static struct blorp_address
+blorp_get_surface_base_address(struct blorp_batch *blorp_batch)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ return (struct blorp_address) {
+ .buffer = batch->state.bo,
+ .offset = 0
+ };
+}
+#endif
+
+static void *
+blorp_alloc_dynamic_state(struct blorp_batch *blorp_batch,
+ uint32_t size,
+ uint32_t alignment,
+ uint32_t *offset)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ return stream_state(batch, size, alignment, offset, NULL);
+}
+
+static void
+blorp_alloc_binding_table(struct blorp_batch *blorp_batch,
+ unsigned num_entries,
+ unsigned state_size,
+ unsigned state_alignment,
+ uint32_t *bt_offset,
+ uint32_t *surface_offsets,
+ void **surface_maps)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ uint32_t *bt_map = stream_state(batch, num_entries * sizeof(uint32_t), 32,
+ bt_offset, NULL);
+
+ for (unsigned i = 0; i < num_entries; i++) {
+ surface_maps[i] = stream_state(batch,
+ state_size, state_alignment,
+ &(surface_offsets)[i], NULL);
+ bt_map[i] = surface_offsets[i];
+ }
+}
+
+static void *
+blorp_alloc_vertex_buffer(struct blorp_batch *blorp_batch,
+ uint32_t size,
+ struct blorp_address *addr)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ struct crocus_bo *bo;
+ uint32_t offset;
+
+ void *map = stream_state(batch, size, 64,
+ &offset, &bo);
+
+ *addr = (struct blorp_address) {
+ .buffer = bo,
+ .offset = offset,
+ .reloc_flags = RELOC_32BIT,
+#if GFX_VER >= 7
+ .mocs = crocus_mocs(bo, &batch->screen->isl_dev),
+#endif
+ };
+
+ return map;
+}
+
+/**
+ * No-op: crocus targets pre-Gen8 hardware, which doesn't use 48-bit vertex
+ * buffer addresses, so there is no VF cache invalidation to do here.
+ */
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *blorp_batch,
+ const struct blorp_address *addrs,
+ UNUSED uint32_t *sizes,
+ unsigned num_vbs)
+{
+}
+
+static struct blorp_address
+blorp_get_workaround_address(struct blorp_batch *blorp_batch)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ return (struct blorp_address) {
+ .buffer = batch->ice->workaround_bo,
+ .offset = batch->ice->workaround_offset,
+ };
+}
+
+static void
+blorp_flush_range(UNUSED struct blorp_batch *blorp_batch,
+ UNUSED void *start,
+ UNUSED size_t size)
+{
+ /* All allocated states come from the batch which we will flush before we
+ * submit it. There's nothing for us to do here.
+ */
+}
+
+#if GFX_VER >= 7
+static const struct intel_l3_config *
+blorp_get_l3_config(struct blorp_batch *blorp_batch)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ return batch->screen->l3_config_3d;
+}
+#else /* GFX_VER < 7 */
+static void
+blorp_emit_urb_config(struct blorp_batch *blorp_batch,
+ unsigned vs_entry_size,
+ UNUSED unsigned sf_entry_size)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+#if GFX_VER <= 5
+ batch->screen->vtbl.calculate_urb_fence(batch, 0, vs_entry_size, sf_entry_size);
+#else
+ genX(upload_urb)(batch, vs_entry_size, false, vs_entry_size);
+#endif
+}
+#endif
+
+static void
+crocus_blorp_exec(struct blorp_batch *blorp_batch,
+ const struct blorp_params *params)
+{
+ struct crocus_context *ice = blorp_batch->blorp->driver_ctx;
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ /* Flush the sampler and render caches. We definitely need to flush the
+ * sampler cache so that we get updated contents from the render cache for
+ * the glBlitFramebuffer() source. Also, we are sometimes warned in the
+ * docs to flush the cache between reinterpretations of the same surface
+ * data with different formats, which blorp does for stencil and depth
+ * data.
+ */
+ if (params->src.enabled)
+ crocus_cache_flush_for_read(batch, params->src.addr.buffer);
+ if (params->dst.enabled) {
+ crocus_cache_flush_for_render(batch, params->dst.addr.buffer,
+ params->dst.view.format,
+ params->dst.aux_usage);
+ }
+ if (params->depth.enabled)
+ crocus_cache_flush_for_depth(batch, params->depth.addr.buffer);
+ if (params->stencil.enabled)
+ crocus_cache_flush_for_depth(batch, params->stencil.addr.buffer);
+
+ crocus_require_command_space(batch, 1400);
+ crocus_require_statebuffer_space(batch, 600);
+ batch->no_wrap = true;
+#if GFX_VER == 6
+ /* Emit workaround flushes when we switch from drawing to blorping. */
+ crocus_emit_post_sync_nonzero_flush(batch);
+#endif
+
+#if GFX_VER >= 6
+ crocus_emit_depth_stall_flushes(batch);
+#endif
+
+ blorp_emit(blorp_batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+ rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1;
+ rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1;
+ }
+
+ batch->screen->vtbl.update_surface_base_address(batch);
+ crocus_handle_always_flush_cache(batch);
+
+ batch->contains_draw = true;
+ blorp_exec(blorp_batch, params);
+
+ batch->no_wrap = false;
+ crocus_handle_always_flush_cache(batch);
+
+ /* We've smashed all state compared to what the normal 3D pipeline
+ * rendering tracks for GL.
+ */
+
+ uint64_t skip_bits = (CROCUS_DIRTY_POLYGON_STIPPLE |
+ CROCUS_DIRTY_GEN7_SO_BUFFERS |
+ CROCUS_DIRTY_SO_DECL_LIST |
+ CROCUS_DIRTY_LINE_STIPPLE |
+ CROCUS_ALL_DIRTY_FOR_COMPUTE |
+ CROCUS_DIRTY_GEN6_SCISSOR_RECT |
+ CROCUS_DIRTY_GEN75_VF |
+ CROCUS_DIRTY_SF_CL_VIEWPORT);
+
+ uint64_t skip_stage_bits = (CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_VS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_TCS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_TES |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_GS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_FS |
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS |
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS |
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES |
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS);
+
+ if (!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) {
+ /* BLORP disabled tessellation, that's fine for the next draw */
+ skip_stage_bits |= CROCUS_STAGE_DIRTY_TCS |
+ CROCUS_STAGE_DIRTY_TES |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TES |
+ CROCUS_STAGE_DIRTY_BINDINGS_TCS |
+ CROCUS_STAGE_DIRTY_BINDINGS_TES;
+ }
+
+ if (!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) {
+ /* BLORP disabled geometry shaders, that's fine for the next draw */
+ skip_stage_bits |= CROCUS_STAGE_DIRTY_GS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_GS |
+ CROCUS_STAGE_DIRTY_BINDINGS_GS;
+ }
+
+ /* we can skip flagging CROCUS_DIRTY_DEPTH_BUFFER, if
+ * BLORP_BATCH_NO_EMIT_DEPTH_STENCIL is set.
+ */
+ if (blorp_batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL)
+ skip_bits |= CROCUS_DIRTY_DEPTH_BUFFER;
+
+ if (!params->wm_prog_data)
+ skip_bits |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+
+ ice->state.dirty |= ~skip_bits;
+ ice->state.stage_dirty |= ~skip_stage_bits;
+
+ ice->urb.vsize = 0;
+ ice->urb.gs_present = false;
+ ice->urb.gsize = 0;
+ ice->urb.tess_present = false;
+ ice->urb.hsize = 0;
+ ice->urb.dsize = 0;
+
+ if (params->dst.enabled) {
+ crocus_render_cache_add_bo(batch, params->dst.addr.buffer,
+ params->dst.view.format,
+ params->dst.aux_usage);
+ }
+ if (params->depth.enabled)
+ crocus_depth_cache_add_bo(batch, params->depth.addr.buffer);
+ if (params->stencil.enabled)
+ crocus_depth_cache_add_bo(batch, params->stencil.addr.buffer);
+}
+
+static void
+blorp_measure_start(struct blorp_batch *blorp_batch,
+ const struct blorp_params *params)
+{
+}
+
+void
+genX(init_blorp)(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+
+ blorp_init(&ice->blorp, ice, &screen->isl_dev);
+ ice->blorp.compiler = screen->compiler;
+ ice->blorp.lookup_shader = crocus_blorp_lookup_shader;
+ ice->blorp.upload_shader = crocus_blorp_upload_shader;
+ ice->blorp.exec = crocus_blorp_exec;
+}
diff --git a/src/gallium/drivers/crocus/crocus_blt.c b/src/gallium/drivers/crocus/crocus_blt.c
new file mode 100644
index 00000000000..d27891352bd
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_blt.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* blt command encoding for gen4/5 */
+#include "crocus_context.h"
+
+#include "crocus_genx_macros.h"
+#include "crocus_genx_protos.h"
+#include "crocus_resource.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLIT
+
+#if GFX_VER <= 5
+
+static bool validate_blit_for_blt(struct crocus_batch *batch,
+ const struct pipe_blit_info *info)
+{
+ /* If the source and destination are the same size with no mirroring,
+ * the rectangles are within the size of the texture and there is no
+ * scissor, then we can probably use the blit engine.
+ */
+ if (info->dst.box.width != info->src.box.width ||
+ info->dst.box.height != info->src.box.height)
+ return false;
+
+ if (info->scissor_enable)
+ return false;
+
+ if (info->dst.box.height < 0 || info->src.box.height < 0)
+ return false;
+
+ if (info->dst.box.depth > 1 || info->src.box.depth > 1)
+ return false;
+
+ return true;
+}
+
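+/* The blitter takes the pitch in bytes for linear surfaces but in DWords for
+ * tiled surfaces, hence the divide-by-4 for tiled resources.
+ */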
+static inline int crocus_resource_blt_pitch(struct crocus_resource *res)
+{
+ int pitch = res->surf.row_pitch_B;
+ if (res->surf.tiling != ISL_TILING_LINEAR)
+ pitch /= 4;
+ return pitch;
+}
+
+static uint32_t
+color_depth_for_cpp(int cpp)
+{
+ switch (cpp) {
+ case 4: return COLOR_DEPTH__32bit;
+ case 2: return COLOR_DEPTH__565;
+ case 1: return COLOR_DEPTH__8bit;
+ default:
+ unreachable("not reached");
+ }
+}
+
+static bool emit_copy_blt(struct crocus_batch *batch,
+ struct crocus_resource *src,
+ struct crocus_resource *dst,
+ unsigned cpp,
+ int32_t src_pitch,
+ unsigned src_offset,
+ int32_t dst_pitch,
+ unsigned dst_offset,
+ uint16_t src_x, uint16_t src_y,
+ uint16_t dst_x, uint16_t dst_y,
+ uint16_t w, uint16_t h)
+{
+ uint32_t src_tile_w, src_tile_h;
+ uint32_t dst_tile_w, dst_tile_h;
+ int dst_y2 = dst_y + h;
+ int dst_x2 = dst_x + w;
+
+ DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+ __func__,
+ src, src_pitch, src_offset, src_x, src_y,
+ dst, dst_pitch, dst_offset, dst_x, dst_y, w, h);
+
+ isl_get_tile_dims(src->surf.tiling, cpp, &src_tile_w, &src_tile_h);
+ isl_get_tile_dims(dst->surf.tiling, cpp, &dst_tile_w, &dst_tile_h);
+
+ /* For Tiled surfaces, the pitch has to be a multiple of the Tile width
+ * (X direction width of the Tile). This is ensured while allocating the
+ * buffer object.
+ */
+ assert(src->surf.tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0);
+ assert(dst->surf.tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0);
+
+ /* For big formats (such as floating point), do the copy using 16 or
+ * 32bpp and multiply the coordinates.
+ */
+ if (cpp > 4) {
+ if (cpp % 4 == 2) {
+ dst_x *= cpp / 2;
+ dst_x2 *= cpp / 2;
+ src_x *= cpp / 2;
+ cpp = 2;
+ } else {
+ assert(cpp % 4 == 0);
+ dst_x *= cpp / 4;
+ dst_x2 *= cpp / 4;
+ src_x *= cpp / 4;
+ cpp = 4;
+ }
+ }
+
+ /* For tiled source and destination, pitch value should be specified
+ * as a number of Dwords.
+ */
+ if (dst->surf.tiling != ISL_TILING_LINEAR)
+ dst_pitch /= 4;
+
+ if (src->surf.tiling != ISL_TILING_LINEAR)
+ src_pitch /= 4;
+
+ assert(cpp <= 4);
+ crocus_emit_cmd(batch, GENX(XY_SRC_COPY_BLT), xyblt) {
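+      /* 0xCC is the SRCCOPY raster operation: destination = source. */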
+ xyblt.RasterOperation = 0xCC;
+ xyblt.DestinationTilingEnable = dst->surf.tiling != ISL_TILING_LINEAR;
+ xyblt.SourceTilingEnable = src->surf.tiling != ISL_TILING_LINEAR;
+ xyblt.SourceBaseAddress = ro_bo(src->bo, src_offset);
+ xyblt.DestinationBaseAddress = rw_bo(dst->bo, dst_offset);
+ xyblt.ColorDepth = color_depth_for_cpp(cpp);
+ xyblt._32bppByteMask = cpp == 4 ? 0x3 : 0x1;
+ xyblt.DestinationX1Coordinate = dst_x;
+ xyblt.DestinationY1Coordinate = dst_y;
+ xyblt.DestinationX2Coordinate = dst_x2;
+ xyblt.DestinationY2Coordinate = dst_y2;
+ xyblt.DestinationPitch = dst_pitch;
+ xyblt.SourceX1Coordinate = src_x;
+ xyblt.SourceY1Coordinate = src_y;
+ xyblt.SourcePitch = src_pitch;
+ };
+
+ crocus_emit_mi_flush(batch);
+ return true;
+}
+
+static bool crocus_emit_blt(struct crocus_batch *batch,
+ struct crocus_resource *src,
+ struct crocus_resource *dst,
+ unsigned dst_level,
+ unsigned dst_x, unsigned dst_y,
+ unsigned dst_z,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ const struct isl_format_layout *src_fmtl = isl_format_get_layout(src->surf.format);
+ unsigned src_cpp = src_fmtl->bpb / 8;
+ const struct isl_format_layout *dst_fmtl = isl_format_get_layout(dst->surf.format);
+ const unsigned dst_cpp = dst_fmtl->bpb / 8;
+ uint16_t src_x, src_y;
+ uint32_t src_image_x, src_image_y, dst_image_x, dst_image_y;
+ uint32_t src_width = src_box->width, src_height = src_box->height;
+
+ /* gen4/5 can't handle Y tiled blits. */
+ if (src->surf.tiling == ISL_TILING_Y0 || dst->surf.tiling == ISL_TILING_Y0)
+ return false;
+
+ if (src->surf.format != dst->surf.format)
+ return false;
+
+ if (src_cpp != dst_cpp)
+ return false;
+
+ src_x = src_box->x;
+ src_y = src_box->y;
+
+ assert(src_cpp == dst_cpp);
+
+ crocus_resource_get_image_offset(src, src_level, src_box->z, &src_image_x,
+ &src_image_y);
+ if (util_format_is_compressed(src->base.format)) {
+ int bw = util_format_get_blockwidth(src->base.format);
+ int bh = util_format_get_blockheight(src->base.format);
+ assert(src_x % bw == 0);
+ assert(src_y % bh == 0);
+ src_x /= (int)bw;
+ src_y /= (int)bh;
+ src_width = DIV_ROUND_UP(src_width, (int)bw);
+ src_height = DIV_ROUND_UP(src_height, (int)bh);
+ }
+
+ crocus_resource_get_image_offset(dst, dst_level, dst_z, &dst_image_x,
+ &dst_image_y);
+ if (util_format_is_compressed(dst->base.format)) {
+ int bw = util_format_get_blockwidth(dst->base.format);
+ int bh = util_format_get_blockheight(dst->base.format);
+ assert(dst_x % bw == 0);
+ assert(dst_y % bh == 0);
+ dst_x /= (int)bw;
+ dst_y /= (int)bh;
+ }
+ src_x += src_image_x;
+ src_y += src_image_y;
+ dst_x += dst_image_x;
+ dst_y += dst_image_y;
+
+ /* According to the Ivy Bridge PRM, Vol1 Part4, section 1.2.1.2 (Graphics
+ * Data Size Limitations):
+ *
+ * The BLT engine is capable of transferring very large quantities of
+ * graphics data. Any graphics data read from and written to the
+ * destination is permitted to represent a number of pixels that
+ * occupies up to 65,536 scan lines and up to 32,768 bytes per scan line
+ * at the destination. The maximum number of pixels that may be
+ * represented per scan line’s worth of graphics data depends on the
+ * color depth.
+ *
+ * The blitter's pitch is a signed 16-bit integer, but measured in bytes
+ * for linear surfaces and DWords for tiled surfaces. So the maximum
+ * pitch is 32k linear and 128k tiled.
+ */
+ if (crocus_resource_blt_pitch(src) >= 32768 ||
+ crocus_resource_blt_pitch(dst) >= 32768) {
+ return false;
+ }
+
+ /* We need to split the blit into chunks that each fit within the blitter's
+ * restrictions. We can't use a chunk size of 32768 because we need to
+ * ensure that src_tile_x + chunk_size fits. We choose 16384 because it's
+ * a nice round power of two, big enough that performance won't suffer, and
+ * small enough to guarantee everything fits.
+ */
+ const uint32_t max_chunk_size = 16384;
+
+ for (uint32_t chunk_x = 0; chunk_x < src_width; chunk_x += max_chunk_size) {
+ for (uint32_t chunk_y = 0; chunk_y < src_height; chunk_y += max_chunk_size) {
+ const uint32_t chunk_w = MIN2(max_chunk_size, src_width - chunk_x);
+ const uint32_t chunk_h = MIN2(max_chunk_size, src_height - chunk_y);
+
+ ASSERTED uint32_t z_offset_el, array_offset;
+ uint32_t src_offset, src_tile_x, src_tile_y;
+ isl_tiling_get_intratile_offset_el(src->surf.tiling,
+ src_cpp * 8, src->surf.row_pitch_B,
+ src->surf.array_pitch_el_rows,
+ src_x + chunk_x, src_y + chunk_y, 0, 0,
+ &src_offset,
+ &src_tile_x, &src_tile_y,
+ &z_offset_el, &array_offset);
+ assert(z_offset_el == 0);
+ assert(array_offset == 0);
+
+ uint32_t dst_offset, dst_tile_x, dst_tile_y;
+ isl_tiling_get_intratile_offset_el(dst->surf.tiling,
+ dst_cpp * 8, dst->surf.row_pitch_B,
+ dst->surf.array_pitch_el_rows,
+ dst_x + chunk_x, dst_y + chunk_y, 0, 0,
+ &dst_offset,
+ &dst_tile_x, &dst_tile_y,
+ &z_offset_el, &array_offset);
+ assert(z_offset_el == 0);
+ assert(array_offset == 0);
+ if (!emit_copy_blt(batch, src, dst,
+ src_cpp, src->surf.row_pitch_B,
+ src_offset,
+ dst->surf.row_pitch_B, dst_offset,
+ src_tile_x, src_tile_y,
+ dst_tile_x, dst_tile_y,
+ chunk_w, chunk_h)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool crocus_blit_blt(struct crocus_batch *batch,
+ const struct pipe_blit_info *info)
+{
+ if (!validate_blit_for_blt(batch, info))
+ return false;
+
+ return crocus_emit_blt(batch,
+ (struct crocus_resource *)info->src.resource,
+ (struct crocus_resource *)info->dst.resource,
+ info->dst.level,
+ info->dst.box.x,
+ info->dst.box.y,
+ info->dst.box.z,
+ info->src.level,
+ &info->src.box);
+}
+
+static bool crocus_copy_region_blt(struct crocus_batch *batch,
+ struct crocus_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct crocus_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ if (dst->base.target == PIPE_BUFFER || src->base.target == PIPE_BUFFER)
+ return false;
+ return crocus_emit_blt(batch,
+ src,
+ dst,
+ dst_level,
+ dstx, dsty, dstz,
+ src_level,
+ src_box);
+}
+#endif
+
+void
+genX(init_blt)(struct crocus_screen *screen)
+{
+#if GFX_VER <= 5
+ screen->vtbl.blit_blt = crocus_blit_blt;
+ screen->vtbl.copy_region_blt = crocus_copy_region_blt;
+#else
+ screen->vtbl.blit_blt = NULL;
+ screen->vtbl.copy_region_blt = NULL;
+#endif
+}
diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.c b/src/gallium/drivers/crocus/crocus_bufmgr.c
new file mode 100644
index 00000000000..caca821cd7e
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_bufmgr.c
@@ -0,0 +1,1689 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_bufmgr.c
+ *
+ * The crocus buffer manager.
+ *
+ * XXX: write better comments
+ * - BOs
+ * - Explain BO cache
+ * - main interface to GEM in the kernel
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xf86drm.h>
+#include <util/u_atomic.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "errno.h"
+#include "common/intel_clflush.h"
+#include "dev/intel_debug.h"
+#include "common/intel_gem.h"
+#include "dev/intel_device_info.h"
+#include "main/macros.h"
+#include "util/debug.h"
+#include "util/macros.h"
+#include "util/hash_table.h"
+#include "util/list.h"
+#include "util/os_file.h"
+#include "util/u_dynarray.h"
+#include "util/vma.h"
+#include "crocus_bufmgr.h"
+#include "crocus_context.h"
+#include "string.h"
+
+#include "drm-uapi/i915_drm.h"
+
+#ifdef HAVE_VALGRIND
+#include <valgrind.h>
+#include <memcheck.h>
+#define VG(x) x
+#else
+#define VG(x)
+#endif
+
+/**
+ * For debugging purposes, this returns a time in seconds.
+ */
+static double
+get_time(void)
+{
+ struct timespec tp;
+
+ clock_gettime(CLOCK_MONOTONIC, &tp);
+
+ return tp.tv_sec + tp.tv_nsec / 1000000000.0;
+}
+
+/* VALGRIND_FREELIKE_BLOCK unfortunately does not actually undo the earlier
+ * VALGRIND_MALLOCLIKE_BLOCK but instead leaves vg convinced the memory is
+ * leaked. All because it does not call VG(cli_free) from its
+ * VG_USERREQ__FREELIKE_BLOCK handler. Instead of treating the memory like
+ * and allocation, we mark it available for use upon mmapping and remove
+ * it upon unmapping.
+ */
+#define VG_DEFINED(ptr, size) VG(VALGRIND_MAKE_MEM_DEFINED(ptr, size))
+#define VG_NOACCESS(ptr, size) VG(VALGRIND_MAKE_MEM_NOACCESS(ptr, size))
+
+#define PAGE_SIZE 4096
+
+#define WARN_ONCE(cond, fmt...) do { \
+ if (unlikely(cond)) { \
+ static bool _warned = false; \
+ if (!_warned) { \
+ fprintf(stderr, "WARNING: "); \
+ fprintf(stderr, fmt); \
+ _warned = true; \
+ } \
+ } \
+} while (0)
+
+#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+
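+/* Atomically add 'add' to *v unless *v equals 'unless'; returns nonzero if
+ * *v already equalled 'unless' and was left untouched.
+ */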
+static inline int
+atomic_add_unless(int *v, int add, int unless)
+{
+ int c, old;
+ c = p_atomic_read(v);
+ while (c != unless && (old = p_atomic_cmpxchg(v, c, c + add)) != c)
+ c = old;
+ return c == unless;
+}
+
+struct bo_cache_bucket {
+ /** List of cached BOs. */
+ struct list_head head;
+
+ /** Size of this bucket, in bytes. */
+ uint64_t size;
+};
+
+struct bo_export {
+ /** File descriptor associated with a handle export. */
+ int drm_fd;
+
+ /** GEM handle in drm_fd */
+ uint32_t gem_handle;
+
+ struct list_head link;
+};
+
+struct crocus_bufmgr {
+ /**
+ * List into the list of bufmgr.
+ */
+ struct list_head link;
+
+ uint32_t refcount;
+
+ int fd;
+
+ mtx_t lock;
+
+ /** Array of lists of cached gem objects of power-of-two sizes */
+ struct bo_cache_bucket cache_bucket[14 * 4];
+ int num_buckets;
+ time_t time;
+
+ struct hash_table *name_table;
+ struct hash_table *handle_table;
+
+ /**
+ * List of BOs which we've effectively freed, but are hanging on to
+ * until they're idle before closing and returning the VMA.
+ */
+ struct list_head zombie_list;
+
+ bool has_llc:1;
+ bool has_mmap_offset:1;
+ bool has_tiling_uapi:1;
+ bool bo_reuse:1;
+};
+
+static mtx_t global_bufmgr_list_mutex = _MTX_INITIALIZER_NP;
+static struct list_head global_bufmgr_list = {
+ .next = &global_bufmgr_list,
+ .prev = &global_bufmgr_list,
+};
+
+static int bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode,
+ uint32_t stride);
+
+static void bo_free(struct crocus_bo *bo);
+
+static uint32_t
+key_hash_uint(const void *key)
+{
+ return _mesa_hash_data(key, 4);
+}
+
+static bool
+key_uint_equal(const void *a, const void *b)
+{
+ return *((unsigned *) a) == *((unsigned *) b);
+}
+
+static struct crocus_bo *
+find_and_ref_external_bo(struct hash_table *ht, unsigned int key)
+{
+ struct hash_entry *entry = _mesa_hash_table_search(ht, &key);
+ struct crocus_bo *bo = entry ? entry->data : NULL;
+
+ if (bo) {
+ assert(bo->external);
+ assert(!bo->reusable);
+
+ /* Being non-reusable, the BO cannot be in the cache lists, but it
+ * may be in the zombie list if it had reached zero references, but
+ * we hadn't yet closed it...and then reimported the same BO. If it
+ * is, then remove it since it's now been resurrected.
+ */
+ if (bo->head.prev || bo->head.next)
+ list_del(&bo->head);
+
+ crocus_bo_reference(bo);
+ }
+
+ return bo;
+}
+
+/**
+ * This function finds the bucket that best fits the input size.
+ * It computes the bucket index directly from the size in O(1) time,
+ * rather than iterating over all the buckets.
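+ *
+ * For example (illustrative), a 96KB request is 24 pages, which maps to the
+ * 24-page bucket: row 3, column 2 of the table below, i.e. cache_bucket[13].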
+ */
+static struct bo_cache_bucket *
+bucket_for_size(struct crocus_bufmgr *bufmgr, uint64_t size)
+{
+ /* Calculating the pages and rounding up to the page size. */
+ const unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ /* Row Bucket sizes clz((x-1) | 3) Row Column
+ * in pages stride size
+ * 0: 1 2 3 4 -> 30 30 30 30 4 1
+ * 1: 5 6 7 8 -> 29 29 29 29 4 1
+ * 2: 10 12 14 16 -> 28 28 28 28 8 2
+ * 3: 20 24 28 32 -> 27 27 27 27 16 4
+ */
+ const unsigned row = 30 - __builtin_clz((pages - 1) | 3);
+ const unsigned row_max_pages = 4 << row;
+
+ /* The '& ~2' is the special case for row 1. In row 1, max pages /
+ * 2 is 2, but the previous row maximum is zero (because there is
+ * no previous row). All row maximum sizes are power of 2, so that
+ * is the only case where that bit will be set.
+ */
+ const unsigned prev_row_max_pages = (row_max_pages / 2) & ~2;
+ int col_size_log2 = row - 1;
+ col_size_log2 += (col_size_log2 < 0);
+
+ const unsigned col = (pages - prev_row_max_pages +
+ ((1 << col_size_log2) - 1)) >> col_size_log2;
+
+ /* Calculating the index based on the row and column. */
+ const unsigned index = (row * 4) + (col - 1);
+
+ return (index < bufmgr->num_buckets) ?
+ &bufmgr->cache_bucket[index] : NULL;
+}
+
+
+int
+crocus_bo_busy(struct crocus_bo *bo)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ struct drm_i915_gem_busy busy = { .handle = bo->gem_handle };
+
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
+ if (ret == 0) {
+ bo->idle = !busy.busy;
+ return busy.busy;
+ }
+ return false;
+}
+
+int
+crocus_bo_madvise(struct crocus_bo *bo, int state)
+{
+ struct drm_i915_gem_madvise madv = {
+ .handle = bo->gem_handle,
+ .madv = state,
+ .retained = 1,
+ };
+
+ intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
+
+ return madv.retained;
+}
+
+static struct crocus_bo *
+bo_calloc(void)
+{
+ struct crocus_bo *bo = calloc(1, sizeof(*bo));
+ if (!bo)
+ return NULL;
+
+ list_inithead(&bo->exports);
+ bo->hash = _mesa_hash_pointer(bo);
+ return bo;
+}
+
+static struct crocus_bo *
+alloc_bo_from_cache(struct crocus_bufmgr *bufmgr,
+ struct bo_cache_bucket *bucket,
+ uint32_t alignment,
+ unsigned flags)
+{
+ if (!bucket)
+ return NULL;
+
+ struct crocus_bo *bo = NULL;
+
+ list_for_each_entry_safe(struct crocus_bo, cur, &bucket->head, head) {
+ /* If the last BO in the cache is busy, there are no idle BOs. Bail,
+ * either falling back to a non-matching memzone, or if that fails,
+ * allocating a fresh buffer.
+ */
+ if (crocus_bo_busy(cur))
+ return NULL;
+
+ list_del(&cur->head);
+
+ /* Tell the kernel we need this BO. If it still exists, we're done! */
+ if (crocus_bo_madvise(cur, I915_MADV_WILLNEED)) {
+ bo = cur;
+ break;
+ }
+
+ /* This BO was purged, throw it out and keep looking. */
+ bo_free(cur);
+ }
+
+ if (!bo)
+ return NULL;
+
+ /* Zero the contents if necessary. If this fails, fall back to
+ * allocating a fresh BO, which will always be zeroed by the kernel.
+ */
+ if (flags & BO_ALLOC_ZEROED) {
+ void *map = crocus_bo_map(NULL, bo, MAP_WRITE | MAP_RAW);
+ if (map) {
+ memset(map, 0, bo->size);
+ } else {
+ bo_free(bo);
+ return NULL;
+ }
+ }
+
+ return bo;
+}
+
+static struct crocus_bo *
+alloc_fresh_bo(struct crocus_bufmgr *bufmgr, uint64_t bo_size)
+{
+ struct crocus_bo *bo = bo_calloc();
+ if (!bo)
+ return NULL;
+
+ struct drm_i915_gem_create create = { .size = bo_size };
+
+ /* All new BOs we get from the kernel are zeroed, so we don't need to
+ * worry about that here.
+ */
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CREATE, &create) != 0) {
+ free(bo);
+ return NULL;
+ }
+
+ bo->gem_handle = create.handle;
+ bo->bufmgr = bufmgr;
+ bo->size = bo_size;
+ bo->idle = true;
+ bo->tiling_mode = I915_TILING_NONE;
+ bo->swizzle_mode = I915_BIT_6_SWIZZLE_NONE;
+ bo->stride = 0;
+
+ /* Calling set_domain() will allocate pages for the BO outside of the
+ * struct mutex lock in the kernel, which is more efficient than waiting
+ * to create them during the first execbuf that uses the BO.
+ */
+ struct drm_i915_gem_set_domain sd = {
+ .handle = bo->gem_handle,
+ .read_domains = I915_GEM_DOMAIN_CPU,
+ .write_domain = 0,
+ };
+
+ if (intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) {
+ bo_free(bo);
+ return NULL;
+ }
+
+ return bo;
+}
+
+static struct crocus_bo *
+bo_alloc_internal(struct crocus_bufmgr *bufmgr,
+ const char *name,
+ uint64_t size,
+ uint32_t alignment,
+ unsigned flags,
+ uint32_t tiling_mode,
+ uint32_t stride)
+{
+ struct crocus_bo *bo;
+ unsigned int page_size = getpagesize();
+ struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size);
+
+ /* Round the size up to the bucket size, or if we don't have caching
+ * at this size, a multiple of the page size.
+ */
+ uint64_t bo_size =
+ bucket ? bucket->size : MAX2(ALIGN(size, page_size), page_size);
+
+ mtx_lock(&bufmgr->lock);
+
+ /* Get a buffer out of the cache if available. First, we try to find
+ * one with a matching memory zone so we can avoid reallocating VMA.
+ */
+ bo = alloc_bo_from_cache(bufmgr, bucket, alignment, flags);
+
+ mtx_unlock(&bufmgr->lock);
+
+ if (!bo) {
+ bo = alloc_fresh_bo(bufmgr, bo_size);
+ if (!bo)
+ return NULL;
+ }
+
+ if (bo_set_tiling_internal(bo, tiling_mode, stride))
+ goto err_free;
+
+ bo->name = name;
+ p_atomic_set(&bo->refcount, 1);
+ bo->reusable = bucket && bufmgr->bo_reuse;
+ bo->cache_coherent = bufmgr->has_llc;
+ bo->index = -1;
+ bo->kflags = 0;
+
+ if ((flags & BO_ALLOC_COHERENT) && !bo->cache_coherent) {
+ struct drm_i915_gem_caching arg = {
+ .handle = bo->gem_handle,
+ .caching = 1,
+ };
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) == 0) {
+ bo->cache_coherent = true;
+ bo->reusable = false;
+ }
+ }
+
+ DBG("bo_create: buf %d (%s) %llub\n", bo->gem_handle,
+ bo->name, (unsigned long long) size);
+
+ return bo;
+
+err_free:
+ bo_free(bo);
+ return NULL;
+}
+
+struct crocus_bo *
+crocus_bo_alloc(struct crocus_bufmgr *bufmgr,
+ const char *name,
+ uint64_t size)
+{
+ return bo_alloc_internal(bufmgr, name, size, 1,
+ 0, I915_TILING_NONE, 0);
+}
+
+struct crocus_bo *
+crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr, const char *name,
+ uint64_t size, uint32_t alignment,
+ uint32_t tiling_mode, uint32_t pitch, unsigned flags)
+{
+ return bo_alloc_internal(bufmgr, name, size, alignment,
+ flags, tiling_mode, pitch);
+}
+
+struct crocus_bo *
+crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr, const char *name,
+ void *ptr, size_t size)
+{
+ struct crocus_bo *bo;
+
+ bo = bo_calloc();
+ if (!bo)
+ return NULL;
+
+ struct drm_i915_gem_userptr arg = {
+ .user_ptr = (uintptr_t)ptr,
+ .user_size = size,
+ };
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_USERPTR, &arg))
+ goto err_free;
+ bo->gem_handle = arg.handle;
+
+ /* Check the buffer for validity before we try and use it in a batch */
+ struct drm_i915_gem_set_domain sd = {
+ .handle = bo->gem_handle,
+ .read_domains = I915_GEM_DOMAIN_CPU,
+ };
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd))
+ goto err_close;
+
+ bo->name = name;
+ bo->size = size;
+ bo->map_cpu = ptr;
+
+ bo->bufmgr = bufmgr;
+ bo->kflags = 0;
+
+ p_atomic_set(&bo->refcount, 1);
+ bo->userptr = true;
+ bo->cache_coherent = true;
+ bo->index = -1;
+ bo->idle = true;
+
+ return bo;
+
+err_close:
+ intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &bo->gem_handle);
+err_free:
+ free(bo);
+ return NULL;
+}
+
+/**
+ * Returns a crocus_bo wrapping the given buffer object handle.
+ *
+ * This can be used when one application needs to pass a buffer object
+ * to another.
+ */
+struct crocus_bo *
+crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr,
+ const char *name, unsigned int handle)
+{
+ struct crocus_bo *bo;
+
+ /* At the moment most applications only have a few named bo.
+ * For instance, in a DRI client only the render buffers passed
+ * between X and the client are named. And since X returns the
+ * alternating names for the front/back buffer a linear search
+ * provides a sufficiently fast match.
+ */
+ mtx_lock(&bufmgr->lock);
+ bo = find_and_ref_external_bo(bufmgr->name_table, handle);
+ if (bo)
+ goto out;
+
+ struct drm_gem_open open_arg = { .name = handle };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_OPEN, &open_arg);
+ if (ret != 0) {
+ DBG("Couldn't reference %s handle 0x%08x: %s\n",
+ name, handle, strerror(errno));
+ bo = NULL;
+ goto out;
+ }
+ /* Now see if someone has used a prime handle to get this
+ * object from the kernel before by looking through the list
+ * again for a matching gem_handle
+ */
+ bo = find_and_ref_external_bo(bufmgr->handle_table, open_arg.handle);
+ if (bo)
+ goto out;
+
+ bo = bo_calloc();
+ if (!bo)
+ goto out;
+
+ p_atomic_set(&bo->refcount, 1);
+
+ bo->size = open_arg.size;
+ bo->gtt_offset = 0;
+ bo->bufmgr = bufmgr;
+ bo->gem_handle = open_arg.handle;
+ bo->name = name;
+ bo->global_name = handle;
+ bo->reusable = false;
+ bo->external = true;
+ bo->kflags = 0;
+
+ _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
+ _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
+
+ struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle };
+ ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling);
+ if (ret != 0)
+ goto err_unref;
+
+ bo->tiling_mode = get_tiling.tiling_mode;
+ bo->swizzle_mode = get_tiling.swizzle_mode;
+ /* XXX stride is unknown */
+ DBG("bo_create_from_handle: %d (%s)\n", handle, bo->name);
+
+out:
+ mtx_unlock(&bufmgr->lock);
+ return bo;
+
+err_unref:
+ bo_free(bo);
+ mtx_unlock(&bufmgr->lock);
+ return NULL;
+}
+
+static void
+bo_close(struct crocus_bo *bo)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (bo->external) {
+ struct hash_entry *entry;
+
+ if (bo->global_name) {
+ entry = _mesa_hash_table_search(bufmgr->name_table, &bo->global_name);
+ _mesa_hash_table_remove(bufmgr->name_table, entry);
+ }
+
+ entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle);
+ _mesa_hash_table_remove(bufmgr->handle_table, entry);
+ }
+
+ /* Close this object */
+ struct drm_gem_close close = { .handle = bo->gem_handle };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &close);
+ if (ret != 0) {
+ DBG("DRM_IOCTL_GEM_CLOSE %d failed (%s): %s\n",
+ bo->gem_handle, bo->name, strerror(errno));
+ }
+
+ free(bo);
+}
+
+static void
+bo_free(struct crocus_bo *bo)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (bo->map_cpu && !bo->userptr) {
+ VG_NOACCESS(bo->map_cpu, bo->size);
+ munmap(bo->map_cpu, bo->size);
+ }
+ if (bo->map_wc) {
+ VG_NOACCESS(bo->map_wc, bo->size);
+ munmap(bo->map_wc, bo->size);
+ }
+ if (bo->map_gtt) {
+ VG_NOACCESS(bo->map_gtt, bo->size);
+ munmap(bo->map_gtt, bo->size);
+ }
+
+ if (bo->idle) {
+ bo_close(bo);
+ } else {
+ /* Defer closing the GEM BO and returning the VMA for reuse until the
+ * BO is idle. Just move it to the dead list for now.
+ */
+ list_addtail(&bo->head, &bufmgr->zombie_list);
+ }
+}
+
+/** Frees all cached buffers significantly older than @time. */
+static void
+cleanup_bo_cache(struct crocus_bufmgr *bufmgr, time_t time)
+{
+ int i;
+
+ if (bufmgr->time == time)
+ return;
+
+ for (i = 0; i < bufmgr->num_buckets; i++) {
+ struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i];
+
+ list_for_each_entry_safe(struct crocus_bo, bo, &bucket->head, head) {
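+         /* Cached BOs are appended in free-time order, so once we hit one
+          * freed within the last second or so, everything after it is newer
+          * still and can be kept.
+          */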
+ if (time - bo->free_time <= 1)
+ break;
+
+ list_del(&bo->head);
+
+ bo_free(bo);
+ }
+ }
+
+ list_for_each_entry_safe(struct crocus_bo, bo, &bufmgr->zombie_list, head) {
+ /* Stop once we reach a busy BO - all others past this point were
+ * freed more recently so are likely also busy.
+ */
+ if (!bo->idle && crocus_bo_busy(bo))
+ break;
+
+ list_del(&bo->head);
+ bo_close(bo);
+ }
+
+ bufmgr->time = time;
+}
+
+static void
+bo_unreference_final(struct crocus_bo *bo, time_t time)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ struct bo_cache_bucket *bucket;
+
+ DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name);
+
+ bucket = NULL;
+ if (bo->reusable)
+ bucket = bucket_for_size(bufmgr, bo->size);
+ /* Put the buffer into our internal cache for reuse if we can. */
+ if (bucket && crocus_bo_madvise(bo, I915_MADV_DONTNEED)) {
+ bo->free_time = time;
+ bo->name = NULL;
+
+ list_addtail(&bo->head, &bucket->head);
+ } else {
+ bo_free(bo);
+ }
+}
+
+void
+crocus_bo_unreference(struct crocus_bo *bo)
+{
+ if (bo == NULL)
+ return;
+
+ assert(p_atomic_read(&bo->refcount) > 0);
+
+ if (atomic_add_unless(&bo->refcount, -1, 1)) {
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ struct timespec time;
+
+ clock_gettime(CLOCK_MONOTONIC, &time);
+
+ mtx_lock(&bufmgr->lock);
+
+ if (p_atomic_dec_zero(&bo->refcount)) {
+ bo_unreference_final(bo, time.tv_sec);
+ cleanup_bo_cache(bufmgr, time.tv_sec);
+ }
+
+ mtx_unlock(&bufmgr->lock);
+ }
+}
+
+static void
+bo_wait_with_stall_warning(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo,
+ const char *action)
+{
+ bool busy = dbg && !bo->idle;
+ double elapsed = unlikely(busy) ? -get_time() : 0.0;
+
+ crocus_bo_wait_rendering(bo);
+
+ if (unlikely(busy)) {
+ elapsed += get_time();
+ if (elapsed > 1e-5) /* 0.01ms */ {
+ perf_debug(dbg, "%s a busy \"%s\" BO stalled and took %.03f ms.\n",
+ action, bo->name, elapsed * 1000);
+ }
+ }
+}
+
+static void
+print_flags(unsigned flags)
+{
+ if (flags & MAP_READ)
+ DBG("READ ");
+ if (flags & MAP_WRITE)
+ DBG("WRITE ");
+ if (flags & MAP_ASYNC)
+ DBG("ASYNC ");
+ if (flags & MAP_PERSISTENT)
+ DBG("PERSISTENT ");
+ if (flags & MAP_COHERENT)
+ DBG("COHERENT ");
+ if (flags & MAP_RAW)
+ DBG("RAW ");
+ DBG("\n");
+}
+
+static void *
+crocus_bo_gem_mmap_legacy(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, bool wc)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ struct drm_i915_gem_mmap mmap_arg = {
+ .handle = bo->gem_handle,
+ .size = bo->size,
+ .flags = wc ? I915_MMAP_WC : 0,
+ };
+
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg);
+ if (ret != 0) {
+ DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+ void *map = (void *) (uintptr_t) mmap_arg.addr_ptr;
+
+ return map;
+}
+
+static void *
+crocus_bo_gem_mmap_offset(struct pipe_debug_callback *dbg, struct crocus_bo *bo,
+ bool wc)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ struct drm_i915_gem_mmap_offset mmap_arg = {
+ .handle = bo->gem_handle,
+ .flags = wc ? I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB,
+ };
+
+ /* Get the fake offset back */
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &mmap_arg);
+ if (ret != 0) {
+ DBG("%s:%d: Error preparing buffer %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+
+ /* And map it */
+ void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ bufmgr->fd, mmap_arg.offset);
+ if (map == MAP_FAILED) {
+ DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+
+ return map;
+}
+
+static void *
+crocus_bo_gem_mmap(struct pipe_debug_callback *dbg, struct crocus_bo *bo, bool wc)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (bufmgr->has_mmap_offset)
+ return crocus_bo_gem_mmap_offset(dbg, bo, wc);
+ else
+ return crocus_bo_gem_mmap_legacy(dbg, bo, wc);
+}
+
+static void *
+crocus_bo_map_cpu(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags)
+{
+ /* We disallow CPU maps for writing to non-coherent buffers, as the
+ * CPU map can become invalidated when a batch is flushed out, which
+ * can happen at unpredictable times. You should use WC maps instead.
+ */
+ assert(bo->cache_coherent || !(flags & MAP_WRITE));
+
+ if (!bo->map_cpu) {
+ DBG("crocus_bo_map_cpu: %d (%s)\n", bo->gem_handle, bo->name);
+
+ void *map = crocus_bo_gem_mmap(dbg, bo, false);
+ if (!map) {
+ return NULL;
+ }
+
+ VG_DEFINED(map, bo->size);
+
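+      /* Another thread may have raced to install a mapping; if so, discard
+       * ours and reuse the existing one.
+       */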
+ if (p_atomic_cmpxchg(&bo->map_cpu, NULL, map)) {
+ VG_NOACCESS(map, bo->size);
+ munmap(map, bo->size);
+ }
+ }
+ assert(bo->map_cpu);
+
+ DBG("crocus_bo_map_cpu: %d (%s) -> %p, ", bo->gem_handle, bo->name,
+ bo->map_cpu);
+ print_flags(flags);
+
+ if (!(flags & MAP_ASYNC)) {
+ bo_wait_with_stall_warning(dbg, bo, "CPU mapping");
+ }
+
+ if (!bo->cache_coherent && !bo->bufmgr->has_llc) {
+ /* If we're reusing an existing CPU mapping, the CPU caches may
+ * contain stale data from the last time we read from that mapping.
+ * (With the BO cache, it might even be data from a previous buffer!)
+ * Even if it's a brand new mapping, the kernel may have zeroed the
+ * buffer via CPU writes.
+ *
+ * We need to invalidate those cachelines so that we see the latest
+ * contents, and so long as we only read from the CPU mmap we do not
+ * need to write those cachelines back afterwards.
+ *
+ * On LLC, the empirical evidence suggests that writes from the GPU
+ * that bypass the LLC (i.e. for scanout) do *invalidate* the CPU
+ * cachelines. (Other reads, such as the display engine, bypass the
+ * LLC entirely requiring us to keep dirty pixels for the scanout
+ * out of any cache.)
+ */
+ intel_invalidate_range(bo->map_cpu, bo->size);
+ }
+
+ return bo->map_cpu;
+}
+
+static void *
+crocus_bo_map_wc(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags)
+{
+ if (!bo->map_wc) {
+ DBG("crocus_bo_map_wc: %d (%s)\n", bo->gem_handle, bo->name);
+
+ void *map = crocus_bo_gem_mmap(dbg, bo, true);
+ if (!map) {
+ return NULL;
+ }
+
+ VG_DEFINED(map, bo->size);
+
+ if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) {
+ VG_NOACCESS(map, bo->size);
+ munmap(map, bo->size);
+ }
+ }
+ assert(bo->map_wc);
+
+ DBG("crocus_bo_map_wc: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->map_wc);
+ print_flags(flags);
+
+ if (!(flags & MAP_ASYNC)) {
+ bo_wait_with_stall_warning(dbg, bo, "WC mapping");
+ }
+
+ return bo->map_wc;
+}
+
+/**
+ * Perform an uncached mapping via the GTT.
+ *
+ * Write access through the GTT is not quite fully coherent. On low power
+ * systems especially, like modern Atoms, we can observe reads from RAM before
+ * the write via GTT has landed. A write memory barrier that flushes the Write
+ * Combining Buffer (i.e. sfence/mfence) is not sufficient to order the later
+ * read after the write as the GTT write suffers a small delay through the GTT
+ * indirection. The kernel uses an uncached mmio read to ensure the GTT write
+ * is ordered with reads (either by the GPU, WB or WC) and unconditionally
+ * flushes prior to execbuf submission. However, if we are not informing the
+ * kernel about our GTT writes, it will not flush before earlier access, such
+ * as when using the cmdparser. Similarly, we need to be careful if we should
+ * ever issue a CPU read immediately following a GTT write.
+ *
+ * Telling the kernel about write access also has one more important
+ * side-effect. Upon receiving notification about the write, it cancels any
+ * scanout buffering for FBC/PSR and friends. Later FBC/PSR is then flushed by
+ * either SW_FINISH or DIRTYFB. The presumption is that we never write to the
+ * actual scanout via an mmapping, only to a backbuffer and so all the FBC/PSR
+ * tracking is handled on the buffer exchange instead.
+ */
+static void *
+crocus_bo_map_gtt(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ /* If we don't support get/set_tiling, there's no support for GTT mapping
+ * either (it won't do any de-tiling for us).
+ */
+ assert(bufmgr->has_tiling_uapi);
+
+ /* Get a mapping of the buffer if we haven't before. */
+ if (bo->map_gtt == NULL) {
+ DBG("bo_map_gtt: mmap %d (%s)\n", bo->gem_handle, bo->name);
+
+ struct drm_i915_gem_mmap_gtt mmap_arg = { .handle = bo->gem_handle };
+
+ /* Get the fake offset back... */
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg);
+ if (ret != 0) {
+ DBG("%s:%d: Error preparing buffer map %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+
+ /* and mmap it. */
+ void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, bufmgr->fd, mmap_arg.offset);
+ if (map == MAP_FAILED) {
+ DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+
+ /* We don't need to use VALGRIND_MALLOCLIKE_BLOCK because Valgrind will
+ * already intercept this mmap call. However, for consistency between
+ * all the mmap paths, we mark the pointer as defined now and mark it
+ * as inaccessible afterwards.
+ */
+ VG_DEFINED(map, bo->size);
+
+ if (p_atomic_cmpxchg(&bo->map_gtt, NULL, map)) {
+ VG_NOACCESS(map, bo->size);
+ munmap(map, bo->size);
+ }
+ }
+ assert(bo->map_gtt);
+
+ DBG("bo_map_gtt: %d (%s) -> %p, ", bo->gem_handle, bo->name, bo->map_gtt);
+ print_flags(flags);
+
+ if (!(flags & MAP_ASYNC)) {
+ bo_wait_with_stall_warning(dbg, bo, "GTT mapping");
+ }
+
+ return bo->map_gtt;
+}
+
+static bool
+can_map_cpu(struct crocus_bo *bo, unsigned flags)
+{
+ if (bo->cache_coherent)
+ return true;
+
+ /* Even if the buffer itself is not cache-coherent (such as a scanout), on
+ * an LLC platform reads are always coherent (as they are performed via the
+ * central system agent). It is only writes that we need to take special
+ * care with, to ensure they land in main memory rather than sticking in
+ * the CPU cache.
+ */
+ if (!(flags & MAP_WRITE) && bo->bufmgr->has_llc)
+ return true;
+
+ /* If PERSISTENT or COHERENT are set, the mmapping needs to remain valid
+ * across batch flushes where the kernel will change cache domains of the
+ * bo, invalidating continued access to the CPU mmap on non-LLC device.
+ *
+ * Similarly, ASYNC typically means that the buffer will be accessed via
+ * both the CPU and the GPU simultaneously. Batches may be executed that
+ * use the BO even while it is mapped. While OpenGL technically disallows
+ * most drawing while non-persistent mappings are active, we may still use
+ * the GPU for blits or other operations, causing batches to happen at
+ * inconvenient times.
+ *
+ * If RAW is set, we expect the caller to be able to handle a WC buffer
+ * more efficiently than the involuntary clflushes.
+ */
+ if (flags & (MAP_PERSISTENT | MAP_COHERENT | MAP_ASYNC | MAP_RAW))
+ return false;
+
+ return !(flags & MAP_WRITE);
+}
+
+void *
+crocus_bo_map(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags)
+{
+ if (bo->tiling_mode != I915_TILING_NONE && !(flags & MAP_RAW))
+ return crocus_bo_map_gtt(dbg, bo, flags);
+
+ void *map;
+
+ if (can_map_cpu(bo, flags))
+ map = crocus_bo_map_cpu(dbg, bo, flags);
+ else
+ map = crocus_bo_map_wc(dbg, bo, flags);
+
+ /* Allow the attempt to fail by falling back to the GTT where necessary.
+ *
+ * Not every buffer can be mmaped directly using the CPU (or WC), for
+ * example buffers that wrap stolen memory or are imported from other
+ * devices. For those, we have little choice but to use a GTT mmapping.
+ * However, if we use a slow GTT mmapping for reads where we expected fast
+ * access, that order of magnitude difference in throughput will be clearly
+ * expressed by angry users.
+ *
+ * We skip MAP_RAW because we want to avoid map_gtt's fence detiling.
+ */
+ if (!map && !(flags & MAP_RAW)) {
+ perf_debug(dbg, "Fallback GTT mapping for %s with access flags %x\n",
+ bo->name, flags);
+ map = crocus_bo_map_gtt(dbg, bo, flags);
+ }
+
+ return map;
+}
+
+/** Waits for all GPU rendering with the object to have completed. */
+void
+crocus_bo_wait_rendering(struct crocus_bo *bo)
+{
+ /* We require a kernel recent enough for WAIT_IOCTL support.
+ * See intel_init_bufmgr()
+ */
+ crocus_bo_wait(bo, -1);
+}
+
+/**
+ * Waits on a BO for the given amount of time.
+ *
+ * @bo: buffer object to wait for
+ * @timeout_ns: amount of time to wait, in nanoseconds.
+ * A negative value means an infinite wait.
+ *
+ * Returns 0 if the wait was successful, i.e. the last batch referencing the
+ * object has completed within the allotted time. Otherwise, a negative return
+ * value describes the error; of particular interest is -ETIME, meaning the
+ * wait expired before yielding the desired result.
+ *
+ * Similar to crocus_bo_wait_rendering, except that the timeout parameter
+ * allows the operation to give up after a certain amount of time. Another
+ * subtle difference is the locking semantics: this variant does not hold the
+ * lock for the duration of the wait, which makes it subject to a larger
+ * userspace race window.
+ *
+ * The implementation waits until the object is no longer actively referenced
+ * within a batch buffer at the time of the call. It does not guarantee that
+ * the buffer will not be re-submitted by another thread or via a flinked
+ * handle in the meantime; userspace must prevent that race itself if such
+ * precision is important.
+ *
+ * Note that some kernels have broken the promise of an infinite wait for
+ * negative values; upgrade to the latest stable kernel if that is the case.
+ */
+int
+crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ /* If we know it's idle, don't bother with the kernel round trip */
+ if (bo->idle && !bo->external)
+ return 0;
+
+ struct drm_i915_gem_wait wait = {
+ .bo_handle = bo->gem_handle,
+ .timeout_ns = timeout_ns,
+ };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
+ if (ret != 0)
+ return -errno;
+
+ bo->idle = true;
+
+ return ret;
+}
+
+static void
+crocus_bufmgr_destroy(struct crocus_bufmgr *bufmgr)
+{
+ mtx_destroy(&bufmgr->lock);
+
+ /* Free any cached buffer objects we were going to reuse */
+ for (int i = 0; i < bufmgr->num_buckets; i++) {
+ struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i];
+
+ list_for_each_entry_safe(struct crocus_bo, bo, &bucket->head, head) {
+ list_del(&bo->head);
+
+ bo_free(bo);
+ }
+ }
+
+ /* Close any buffer objects on the dead list. */
+ list_for_each_entry_safe(struct crocus_bo, bo, &bufmgr->zombie_list, head) {
+ list_del(&bo->head);
+ bo_close(bo);
+ }
+
+ _mesa_hash_table_destroy(bufmgr->name_table, NULL);
+ _mesa_hash_table_destroy(bufmgr->handle_table, NULL);
+
+ close(bufmgr->fd);
+
+ free(bufmgr);
+}
+
+static int
+bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode,
+ uint32_t stride)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ struct drm_i915_gem_set_tiling set_tiling;
+ int ret;
+
+ if (bo->global_name == 0 &&
+ tiling_mode == bo->tiling_mode && stride == bo->stride)
+ return 0;
+
+ memset(&set_tiling, 0, sizeof(set_tiling));
+ do {
+ /* set_tiling is slightly broken and overwrites the
+ * input on the error path, so we have to open code
+ * drm_ioctl.
+ */
+ set_tiling.handle = bo->gem_handle;
+ set_tiling.tiling_mode = tiling_mode;
+ set_tiling.stride = stride;
+
+ ret = ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
+ } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+ if (ret == -1)
+ return -errno;
+
+ bo->tiling_mode = set_tiling.tiling_mode;
+ bo->swizzle_mode = set_tiling.swizzle_mode;
+ bo->stride = set_tiling.stride;
+ return 0;
+}
+
+int
+crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode,
+ uint32_t *swizzle_mode)
+{
+ *tiling_mode = bo->tiling_mode;
+ *swizzle_mode = bo->swizzle_mode;
+ return 0;
+}
+
+struct crocus_bo *
+crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr, int prime_fd,
+ uint32_t tiling, uint32_t stride)
+{
+ uint32_t handle;
+ struct crocus_bo *bo;
+
+ mtx_lock(&bufmgr->lock);
+ int ret = drmPrimeFDToHandle(bufmgr->fd, prime_fd, &handle);
+ if (ret) {
+ DBG("import_dmabuf: failed to obtain handle from fd: %s\n",
+ strerror(errno));
+ mtx_unlock(&bufmgr->lock);
+ return NULL;
+ }
+
+ /*
+ * See if the kernel has already returned this buffer to us. Just as
+ * for named buffers, we must not create two BOs pointing at the same
+ * kernel object.
+ */
+ bo = find_and_ref_external_bo(bufmgr->handle_table, handle);
+ if (bo)
+ goto out;
+
+ bo = bo_calloc();
+ if (!bo)
+ goto out;
+
+ p_atomic_set(&bo->refcount, 1);
+
+ /* Determine size of bo. The fd-to-handle ioctl really should
+ * return the size, but it doesn't. If we have kernel 3.12 or
+ * later, we can lseek on the prime fd to get the size. Older
+ * kernels will just fail, in which case we fall back to the
+ * provided (estimated or guessed) size. */
+ ret = lseek(prime_fd, 0, SEEK_END);
+ if (ret != -1)
+ bo->size = ret;
+
+ bo->bufmgr = bufmgr;
+ bo->name = "prime";
+ bo->reusable = false;
+ bo->external = true;
+ bo->kflags = 0;
+ bo->gem_handle = handle;
+ _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
+
+ struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle };
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling))
+ goto err;
+
+ if (get_tiling.tiling_mode == tiling || tiling > I915_TILING_LAST) {
+ bo->tiling_mode = get_tiling.tiling_mode;
+ bo->swizzle_mode = get_tiling.swizzle_mode;
+ /* XXX stride is unknown */
+ } else {
+ if (bo_set_tiling_internal(bo, tiling, stride)) {
+ goto err;
+ }
+ }
+
+out:
+ mtx_unlock(&bufmgr->lock);
+ return bo;
+
+err:
+ bo_free(bo);
+ mtx_unlock(&bufmgr->lock);
+ return NULL;
+}
+
+static void
+crocus_bo_make_external_locked(struct crocus_bo *bo)
+{
+ if (!bo->external) {
+ _mesa_hash_table_insert(bo->bufmgr->handle_table, &bo->gem_handle, bo);
+ bo->external = true;
+ bo->reusable = false;
+ }
+}
+
+static void
+crocus_bo_make_external(struct crocus_bo *bo)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (bo->external) {
+ assert(!bo->reusable);
+ return;
+ }
+
+ mtx_lock(&bufmgr->lock);
+ crocus_bo_make_external_locked(bo);
+ mtx_unlock(&bufmgr->lock);
+}
+
+int
+crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ crocus_bo_make_external(bo);
+
+ if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle,
+ DRM_CLOEXEC, prime_fd) != 0)
+ return -errno;
+
+ return 0;
+}
+
+uint32_t
+crocus_bo_export_gem_handle(struct crocus_bo *bo)
+{
+ crocus_bo_make_external(bo);
+
+ return bo->gem_handle;
+}
+
+int
+crocus_bo_flink(struct crocus_bo *bo, uint32_t *name)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (!bo->global_name) {
+ struct drm_gem_flink flink = { .handle = bo->gem_handle };
+
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_FLINK, &flink))
+ return -errno;
+
+ mtx_lock(&bufmgr->lock);
+ if (!bo->global_name) {
+ crocus_bo_make_external_locked(bo);
+ bo->global_name = flink.name;
+ _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
+ }
+ mtx_unlock(&bufmgr->lock);
+ }
+
+ *name = bo->global_name;
+ return 0;
+}
+
+int
+crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd,
+ uint32_t *out_handle)
+{
+ /* Only add the new GEM handle to the list of exports if it belongs to a
+ * different GEM device. Otherwise we might close the same buffer multiple
+ * times.
+ */
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ int ret = os_same_file_description(drm_fd, bufmgr->fd);
+ WARN_ONCE(ret < 0,
+ "Kernel has no file descriptor comparison support: %s\n",
+ strerror(errno));
+ if (ret == 0) {
+ *out_handle = crocus_bo_export_gem_handle(bo);
+ return 0;
+ }
+
+ struct bo_export *export = calloc(1, sizeof(*export));
+ if (!export)
+ return -ENOMEM;
+
+ export->drm_fd = drm_fd;
+
+ int dmabuf_fd = -1;
+ int err = crocus_bo_export_dmabuf(bo, &dmabuf_fd);
+ if (err) {
+ free(export);
+ return err;
+ }
+
+ mtx_lock(&bufmgr->lock);
+ err = drmPrimeFDToHandle(drm_fd, dmabuf_fd, &export->gem_handle);
+ close(dmabuf_fd);
+ if (err) {
+ mtx_unlock(&bufmgr->lock);
+ free(export);
+ return err;
+ }
+
+ bool found = false;
+ list_for_each_entry(struct bo_export, iter, &bo->exports, link) {
+ if (iter->drm_fd != drm_fd)
+ continue;
+ /* Here we assume that for a given DRM fd, we'll always get back the
+ * same GEM handle for a given buffer.
+ */
+ assert(iter->gem_handle == export->gem_handle);
+ free(export);
+ export = iter;
+ found = true;
+ break;
+ }
+ if (!found)
+ list_addtail(&export->link, &bo->exports);
+
+ mtx_unlock(&bufmgr->lock);
+
+ *out_handle = export->gem_handle;
+
+ return 0;
+}
+
+static void
+add_bucket(struct crocus_bufmgr *bufmgr, int size)
+{
+ unsigned int i = bufmgr->num_buckets;
+
+ assert(i < ARRAY_SIZE(bufmgr->cache_bucket));
+
+ list_inithead(&bufmgr->cache_bucket[i].head);
+ bufmgr->cache_bucket[i].size = size;
+ bufmgr->num_buckets++;
+
+ assert(bucket_for_size(bufmgr, size) == &bufmgr->cache_bucket[i]);
+ assert(bucket_for_size(bufmgr, size - 2048) == &bufmgr->cache_bucket[i]);
+ assert(bucket_for_size(bufmgr, size + 1) != &bufmgr->cache_bucket[i]);
+}
+
+static void
+init_cache_buckets(struct crocus_bufmgr *bufmgr)
+{
+ uint64_t size, cache_max_size = 64 * 1024 * 1024;
+
+ /* OK, so power of two buckets was too wasteful of memory.
+ * Give 3 other sizes between each power of two, to hopefully
+ * cover things accurately enough. (The alternative is
+ * probably to just go for exact matching of sizes, and assume
+ * that for things like composited window resize the tiled
+ * width/height alignment and rounding of sizes to pages will
+ * get us useful cache hit rates anyway)
+ */
+ add_bucket(bufmgr, PAGE_SIZE);
+ add_bucket(bufmgr, PAGE_SIZE * 2);
+ add_bucket(bufmgr, PAGE_SIZE * 3);
+
+ /* Initialize the linked lists for BO reuse cache. */
+ for (size = 4 * PAGE_SIZE; size <= cache_max_size; size *= 2) {
+ add_bucket(bufmgr, size);
+
+ add_bucket(bufmgr, size + size * 1 / 4);
+ add_bucket(bufmgr, size + size * 2 / 4);
+ add_bucket(bufmgr, size + size * 3 / 4);
+ }
+}
+
+uint32_t
+crocus_create_hw_context(struct crocus_bufmgr *bufmgr)
+{
+ struct drm_i915_gem_context_create create = { };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
+ if (ret != 0) {
+ DBG("DRM_IOCTL_I915_GEM_CONTEXT_CREATE failed: %s\n", strerror(errno));
+ return 0;
+ }
+
+ /* Upon declaring a GPU hang, the kernel will zap the guilty context
+ * back to the default logical HW state and attempt to continue on to
+ * our next submitted batchbuffer. However, our render batches assume
+ * the previous GPU state is preserved, and only emit commands needed
+ * to incrementally change that state. In particular, we inherit the
+ * STATE_BASE_ADDRESS and PIPELINE_SELECT settings, which are critical.
+ * With default base addresses, our next batches will almost certainly
+ * cause more GPU hangs, leading to repeated hangs until we're banned
+ * or the machine is dead.
+ *
+ * Here we tell the kernel not to attempt to recover our context but
+ * immediately (on the next batchbuffer submission) report that the
+ * context is lost, and we will do the recovery ourselves. Ideally,
+ * we'll have two lost batches instead of a continual stream of hangs.
+ */
+ struct drm_i915_gem_context_param p = {
+ .ctx_id = create.ctx_id,
+ .param = I915_CONTEXT_PARAM_RECOVERABLE,
+ .value = false,
+ };
+ drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
+
+ return create.ctx_id;
+}
+
+static int
+crocus_hw_context_get_priority(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+ struct drm_i915_gem_context_param p = {
+ .ctx_id = ctx_id,
+ .param = I915_CONTEXT_PARAM_PRIORITY,
+ };
+ drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);
+ return p.value; /* on error, return 0 i.e. default priority */
+}
+
+int
+crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr,
+ uint32_t ctx_id,
+ int priority)
+{
+ struct drm_i915_gem_context_param p = {
+ .ctx_id = ctx_id,
+ .param = I915_CONTEXT_PARAM_PRIORITY,
+ .value = priority,
+ };
+ int err;
+
+ err = 0;
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p))
+ err = -errno;
+
+ return err;
+}
+
+uint32_t
+crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+ uint32_t new_ctx = crocus_create_hw_context(bufmgr);
+
+ if (new_ctx) {
+ int priority = crocus_hw_context_get_priority(bufmgr, ctx_id);
+ crocus_hw_context_set_priority(bufmgr, new_ctx, priority);
+ }
+
+ return new_ctx;
+}
+
+void
+crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+ struct drm_i915_gem_context_destroy d = { .ctx_id = ctx_id };
+
+ if (ctx_id != 0 &&
+ intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &d) != 0) {
+ fprintf(stderr, "DRM_IOCTL_I915_GEM_CONTEXT_DESTROY failed: %s\n",
+ strerror(errno));
+ }
+}
+
+int
+crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset, uint64_t *result)
+{
+ struct drm_i915_reg_read reg_read = { .offset = offset };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_REG_READ, &reg_read);
+
+ *result = reg_read.val;
+ return ret;
+}
+
+static int
+gem_param(int fd, int name)
+{
+ int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+ struct drm_i915_getparam gp = { .param = name, .value = &v };
+ if (intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+ return -1;
+
+ return v;
+}
+
+/**
+ * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
+ * and manage buffer objects.
+ *
+ * \param fd File descriptor of the opened DRM device.
+ */
+static struct crocus_bufmgr *
+crocus_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse)
+{
+ struct crocus_bufmgr *bufmgr = calloc(1, sizeof(*bufmgr));
+ if (bufmgr == NULL)
+ return NULL;
+
+ /* Handles to buffer objects belong to the device fd and are not
+ * reference counted by the kernel. If the same fd is used by
+ * multiple parties (threads sharing the same screen bufmgr, or
+ * even worse the same device fd passed to multiple libraries)
+ * ownership of those handles is shared by those independent parties.
+ *
+ * Don't do this! Ensure that each library/bufmgr has its own device
+ * fd so that its namespace does not clash with another.
+ */
+ bufmgr->fd = os_dupfd_cloexec(fd);
+
+ p_atomic_set(&bufmgr->refcount, 1);
+
+ if (mtx_init(&bufmgr->lock, mtx_plain) != 0) {
+ free(bufmgr);
+ return NULL;
+ }
+
+ list_inithead(&bufmgr->zombie_list);
+
+ bufmgr->has_llc = devinfo->has_llc;
+ bufmgr->has_tiling_uapi = devinfo->has_tiling_uapi;
+ bufmgr->bo_reuse = bo_reuse;
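+   /* The MMAP_OFFSET ioctl is only available with GTT mmap version 4+. */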
+ bufmgr->has_mmap_offset = gem_param(fd, I915_PARAM_MMAP_GTT_VERSION) >= 4;
+
+ init_cache_buckets(bufmgr);
+
+ bufmgr->name_table =
+ _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal);
+ bufmgr->handle_table =
+ _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal);
+
+ return bufmgr;
+}
+
+static struct crocus_bufmgr *
+crocus_bufmgr_ref(struct crocus_bufmgr *bufmgr)
+{
+ p_atomic_inc(&bufmgr->refcount);
+ return bufmgr;
+}
+
+void
+crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr)
+{
+ mtx_lock(&global_bufmgr_list_mutex);
+ if (p_atomic_dec_zero(&bufmgr->refcount)) {
+ list_del(&bufmgr->link);
+ crocus_bufmgr_destroy(bufmgr);
+ }
+ mtx_unlock(&global_bufmgr_list_mutex);
+}
+
+/**
+ * Gets an already existing GEM buffer manager or creates a new one.
+ *
+ * \param fd File descriptor of the opened DRM device.
+ */
+struct crocus_bufmgr *
+crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd, bool bo_reuse)
+{
+ struct stat st;
+
+ if (fstat(fd, &st))
+ return NULL;
+
+ struct crocus_bufmgr *bufmgr = NULL;
+
+ mtx_lock(&global_bufmgr_list_mutex);
+ list_for_each_entry(struct crocus_bufmgr, iter_bufmgr, &global_bufmgr_list, link) {
+ struct stat iter_st;
+ if (fstat(iter_bufmgr->fd, &iter_st))
+ continue;
+
+ if (st.st_rdev == iter_st.st_rdev) {
+ assert(iter_bufmgr->bo_reuse == bo_reuse);
+ bufmgr = crocus_bufmgr_ref(iter_bufmgr);
+ goto unlock;
+ }
+ }
+
+ bufmgr = crocus_bufmgr_create(devinfo, fd, bo_reuse);
+ if (bufmgr)
+ list_addtail(&bufmgr->link, &global_bufmgr_list);
+
+ unlock:
+ mtx_unlock(&global_bufmgr_list_mutex);
+
+ return bufmgr;
+}
+
+int
+crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr)
+{
+ return bufmgr->fd;
+}
diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.h b/src/gallium/drivers/crocus/crocus_bufmgr.h
new file mode 100644
index 00000000000..8bb328fdeae
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_bufmgr.h
@@ -0,0 +1,331 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_BUFMGR_H
+#define CROCUS_BUFMGR_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include "util/macros.h"
+#include "util/u_atomic.h"
+#include "util/list.h"
+#include "pipe/p_defines.h"
+
+struct crocus_batch;
+struct intel_device_info;
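+   /* Try to drop the reference without taking the bufmgr lock; only when
+    * this may be the final unreference do we lock and tear the BO down.
+    */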
+struct pipe_debug_callback;
+
+#define CROCUS_BINDER_SIZE (64 * 1024)
+#define CROCUS_MAX_BINDERS 100
+
+struct crocus_bo {
+ /**
+ * Size in bytes of the buffer object.
+ *
+ * The size may be larger than the size originally requested for the
+ * allocation, such as being aligned to page size.
+ */
+ uint64_t size;
+
+ /** Buffer manager context associated with this buffer object */
+ struct crocus_bufmgr *bufmgr;
+
+ /** The GEM handle for this buffer object. */
+ uint32_t gem_handle;
+
+ /**
+ * Virtual address of the buffer inside the PPGTT (Per-Process Graphics
+ * Translation Table).
+ *
+ * Although each hardware context has its own VMA, we assign BOs to the
+ * same address in all contexts, for simplicity.
+ */
+ uint64_t gtt_offset;
+
+ /**
+ * The validation list index for this buffer, or -1 when not in a batch.
+ * Note that a single buffer may be in multiple batches (contexts), and
+ * this is a global field, which refers to the last batch using the BO.
+ * It should not be considered authoritative, but can be used to avoid a
+ * linear walk of the validation list in the common case by guessing that
+ * exec_bos[bo->index] == bo and confirming whether that's the case.
+ *
+ * XXX: this is not ideal now that we have more than one batch per context,
+ * XXX: as the index will flop back and forth between the render index and
+ * XXX: compute index...
+ */
+ unsigned index;
+
+ /**
+ * Boolean of whether the GPU is definitely not accessing the buffer.
+ *
+ * This is only valid when reusable, since non-reusable
+ * buffers are those that have been shared with other
+ * processes, so we don't know their state.
+ */
+ bool idle;
+
+ int refcount;
+ const char *name;
+
+ uint64_t kflags;
+
+ /**
+ * Kernel-assigned global name for this object.
+ *
+ * The list contains both flink-named and prime-fd'd objects.
+ */
+ unsigned global_name;
+
+ /**
+ * Current tiling mode
+ */
+ uint32_t tiling_mode;
+ uint32_t swizzle_mode;
+ uint32_t stride;
+
+ time_t free_time;
+
+ /** Mapped address for the buffer, saved across map/unmap cycles */
+ void *map_cpu;
+ /** GTT virtual address for the buffer, saved across map/unmap cycles */
+ void *map_gtt;
+ /** WC CPU address for the buffer, saved across map/unmap cycles */
+ void *map_wc;
+
+ /** BO cache list */
+ struct list_head head;
+
+ /** List of GEM handle exports of this buffer (bo_export) */
+ struct list_head exports;
+
+ /**
+ * Boolean of whether this buffer can be re-used
+ */
+ bool reusable;
+
+ /**
+ * Boolean of whether this buffer has been shared with an external client.
+ */
+ bool external;
+
+ /**
+ * Boolean of whether this buffer is cache coherent
+ */
+ bool cache_coherent;
+
+ /**
+ * Boolean of whether this buffer points into user memory
+ */
+ bool userptr;
+
+ /** Pre-computed hash using _mesa_hash_pointer for cache tracking sets */
+ uint32_t hash;
+};
+
+#define BO_ALLOC_ZEROED (1 << 0)
+#define BO_ALLOC_COHERENT (1 << 1)
+
+/**
+ * Allocate a buffer object.
+ *
+ * Buffer objects are not necessarily initially mapped into CPU virtual
+ * address space or graphics device aperture. They must be mapped
+ * using crocus_bo_map() to be used by the CPU.
+ */
+struct crocus_bo *crocus_bo_alloc(struct crocus_bufmgr *bufmgr,
+ const char *name, uint64_t size);
+
+/**
+ * Allocate a tiled buffer object.
+ *
+ * Alignment for tiled objects is set automatically; the 'flags'
+ * argument provides a hint about how the object will be used initially.
+ *
+ * Valid tiling formats are:
+ * I915_TILING_NONE
+ * I915_TILING_X
+ * I915_TILING_Y
+ */
+struct crocus_bo *crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr,
+ const char *name, uint64_t size,
+ uint32_t alignment,
+ uint32_t tiling_mode, uint32_t pitch,
+ unsigned flags);
+
+struct crocus_bo *crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr,
+ const char *name, void *ptr,
+ size_t size);
+
+/** Takes a reference on a buffer object */
+static inline void
+crocus_bo_reference(struct crocus_bo *bo)
+{
+ p_atomic_inc(&bo->refcount);
+}
+
+/**
+ * Releases a reference on a buffer object, freeing the data if
+ * no references remain.
+ */
+void crocus_bo_unreference(struct crocus_bo *bo);
+
+#define MAP_READ PIPE_MAP_READ
+#define MAP_WRITE PIPE_MAP_WRITE
+#define MAP_ASYNC PIPE_MAP_UNSYNCHRONIZED
+#define MAP_PERSISTENT PIPE_MAP_PERSISTENT
+#define MAP_COHERENT PIPE_MAP_COHERENT
+/* internal */
+#define MAP_INTERNAL_MASK (0xff << 24)
+#define MAP_RAW (0x01 << 24)
+
+#define MAP_FLAGS (MAP_READ | MAP_WRITE | MAP_ASYNC | \
+ MAP_PERSISTENT | MAP_COHERENT | MAP_INTERNAL_MASK)
+
+/**
+ * Maps the buffer into userspace.
+ *
+ * This function will block waiting for any existing execution on the
+ * buffer to complete, first. The resulting mapping is returned.
+ */
+MUST_CHECK void *crocus_bo_map(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags);
+
+/**
+ * Unmapping is a no-op: mappings are cached for the lifetime of the buffer
+ * object and torn down when it is freed.
+ */
+static inline int crocus_bo_unmap(struct crocus_bo *bo) { return 0; }
+
+/**
+ * Waits for rendering to an object by the GPU to have completed.
+ *
+ * This is not required for any access to the BO by bo_map,
+ * bo_subdata, etc. It is merely a way for the driver to implement
+ * glFinish.
+ */
+void crocus_bo_wait_rendering(struct crocus_bo *bo);
+
+/**
+ * Unref a buffer manager instance.
+ */
+void crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr);
+
+/**
+ * Get the current tiling (and resulting swizzling) mode for the bo.
+ *
+ * \param buf Buffer to get tiling mode for
+ * \param tiling_mode returned tiling mode
+ * \param swizzle_mode returned swizzling mode
+ */
+int crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode,
+ uint32_t *swizzle_mode);
+
+/**
+ * Create a visible name for a buffer which can be used by other apps
+ *
+ * \param buf Buffer to create a name for
+ * \param name Returned name
+ */
+int crocus_bo_flink(struct crocus_bo *bo, uint32_t *name);
+
+/**
+ * Is this buffer shared with external clients (exported)?
+ */
+static inline bool
+crocus_bo_is_external(const struct crocus_bo *bo)
+{
+ return bo->external;
+}
+
+/**
+ * Returns 1 if mapping the buffer for write could cause the process
+ * to block, due to the object being active in the GPU.
+ */
+int crocus_bo_busy(struct crocus_bo *bo);
+
+/**
+ * Specify the volatility of the buffer.
+ * \param bo Buffer whose purgeable status to change
+ * \param madv The purgeable status
+ *
+ * Use I915_MADV_DONTNEED to mark the buffer as purgeable, and it will be
+ * reclaimed under memory pressure. If you subsequently require the buffer,
+ * then you must pass I915_MADV_WILLNEED to mark the buffer as required.
+ *
+ * Returns 1 if the buffer was retained, or 0 if it was discarded whilst
+ * marked as I915_MADV_DONTNEED.
+ */
+int crocus_bo_madvise(struct crocus_bo *bo, int madv);
+
+/* drm_bacon_bufmgr_gem.c */
+struct crocus_bufmgr *
+crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd,
+ bool bo_reuse);
+int crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr);
+
+struct crocus_bo *crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr,
+ const char *name,
+ unsigned handle);
+
+int crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns);
+
+uint32_t crocus_create_hw_context(struct crocus_bufmgr *bufmgr);
+uint32_t crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id);
+
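+/* Roughly halfway between the default priority (0) and the kernel's minimum
+ * or maximum user priority.
+ */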
+#define CROCUS_CONTEXT_LOW_PRIORITY ((I915_CONTEXT_MIN_USER_PRIORITY - 1) / 2)
+#define CROCUS_CONTEXT_MEDIUM_PRIORITY (I915_CONTEXT_DEFAULT_PRIORITY)
+#define CROCUS_CONTEXT_HIGH_PRIORITY ((I915_CONTEXT_MAX_USER_PRIORITY + 1) / 2)
+
+int crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr,
+ uint32_t ctx_id, int priority);
+
+void crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id);
+
+int crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd);
+struct crocus_bo *crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr,
+ int prime_fd, uint32_t tiling,
+ uint32_t stride);
+
+/**
+ * Exports a bo as a GEM handle into a given DRM file descriptor
+ * \param bo Buffer to export
+ * \param drm_fd File descriptor where the new handle is created
+ * \param out_handle Pointer to store the new handle
+ *
+ * Returns 0 if the buffer was successfully exported, a non-zero error code
+ * otherwise.
+ */
+int crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd,
+ uint32_t *out_handle);
+
+uint32_t crocus_bo_export_gem_handle(struct crocus_bo *bo);
+
+int crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset,
+ uint64_t *out);
+
+int drm_ioctl(int fd, unsigned long request, void *arg);
+
+#endif /* CROCUS_BUFMGR_H */
diff --git a/src/gallium/drivers/crocus/crocus_clear.c b/src/gallium/drivers/crocus/crocus_clear.c
new file mode 100644
index 00000000000..1c56e23f794
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_clear.c
@@ -0,0 +1,859 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/format/u_format.h"
+#include "util/u_upload_mgr.h"
+#include "util/ralloc.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "intel/compiler/brw_compiler.h"
+#include "util/format_srgb.h"
+
+static bool
+crocus_is_color_fast_clear_compatible(struct crocus_context *ice,
+ enum isl_format format,
+ const union isl_color_value color)
+{
+ if (isl_format_has_int_channel(format)) {
+ perf_debug(&ice->dbg, "Integer fast clear not enabled for %s",
+ isl_format_get_name(format));
+ return false;
+ }
+
+ for (int i = 0; i < 4; i++) {
+ if (!isl_format_has_color_component(format, i)) {
+ continue;
+ }
+
+ if (color.f32[i] != 0.0f && color.f32[i] != 1.0f) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool
+can_fast_clear_color(struct crocus_context *ice,
+ struct pipe_resource *p_res,
+ unsigned level,
+ const struct pipe_box *box,
+ bool render_condition_enabled,
+ enum isl_format format,
+ enum isl_format render_format,
+ union isl_color_value color)
+{
+ struct crocus_resource *res = (void *) p_res;
+
+ if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR)
+ return false;
+
+ if (!isl_aux_usage_has_fast_clears(res->aux.usage))
+ return false;
+
+ /* Check for partial clear */
+ if (box->x > 0 || box->y > 0 ||
+ box->width < minify(p_res->width0, level) ||
+ box->height < minify(p_res->height0, level)) {
+ return false;
+ }
+
+ /* Avoid conditional fast clears to maintain correct tracking of the aux
+ * state (see iris_resource_finish_write for more info). Note that partial
+ * fast clears (if they existed) would not pose a problem with conditional
+ * rendering.
+ */
+ if (render_condition_enabled &&
+ ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+ return false;
+ }
+
+ /* We store clear colors as floats or uints as needed. If there are
+ * texture views in play, the formats will not properly be respected
+ * during resolves because the resolve operations only know about the
+ * resource and not the renderbuffer.
+ */
+ if (isl_format_srgb_to_linear(render_format) !=
+ isl_format_srgb_to_linear(format)) {
+ return false;
+ }
+
+ /* XXX: if (irb->mt->supports_fast_clear)
+ * see intel_miptree_create_for_dri_image()
+ */
+
+ if (!crocus_is_color_fast_clear_compatible(ice, format, color))
+ return false;
+
+ return true;
+}
+
+static union isl_color_value
+convert_fast_clear_color(struct crocus_context *ice,
+ struct crocus_resource *res,
+ enum isl_format render_format,
+ const union isl_color_value color)
+{
+ union isl_color_value override_color = color;
+ struct pipe_resource *p_res = (void *) res;
+
+ const enum pipe_format format = p_res->format;
+ const struct util_format_description *desc =
+ util_format_description(format);
+ unsigned colormask = util_format_colormask(desc);
+
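+   /* Luminance and intensity formats are sampled with the red channel
+    * replicated, so replicate it into the other channels of the clear
+    * color as well (including alpha for intensity formats).
+    */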
+ if (util_format_is_intensity(format) ||
+ util_format_is_luminance(format) ||
+ util_format_is_luminance_alpha(format)) {
+ override_color.u32[1] = override_color.u32[0];
+ override_color.u32[2] = override_color.u32[0];
+ if (util_format_is_intensity(format))
+ override_color.u32[3] = override_color.u32[0];
+ } else {
+ for (int chan = 0; chan < 3; chan++) {
+ if (!(colormask & (1 << chan)))
+ override_color.u32[chan] = 0;
+ }
+ }
+
+ if (util_format_is_unorm(format)) {
+ for (int i = 0; i < 4; i++)
+ override_color.f32[i] = CLAMP(override_color.f32[i], 0.0f, 1.0f);
+ } else if (util_format_is_snorm(format)) {
+ for (int i = 0; i < 4; i++)
+ override_color.f32[i] = CLAMP(override_color.f32[i], -1.0f, 1.0f);
+ } else if (util_format_is_pure_uint(format)) {
+ for (int i = 0; i < 4; i++) {
+ unsigned bits = util_format_get_component_bits(
+ format, UTIL_FORMAT_COLORSPACE_RGB, i);
+ if (bits < 32) {
+ uint32_t max = (1u << bits) - 1;
+ override_color.u32[i] = MIN2(override_color.u32[i], max);
+ }
+ }
+ } else if (util_format_is_pure_sint(format)) {
+ for (int i = 0; i < 4; i++) {
+ unsigned bits = util_format_get_component_bits(
+ format, UTIL_FORMAT_COLORSPACE_RGB, i);
+ if (bits < 32) {
+ int32_t max = (1 << (bits - 1)) - 1;
+ int32_t min = -(1 << (bits - 1));
+ override_color.i32[i] = CLAMP(override_color.i32[i], min, max);
+ }
+ }
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+ format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ /* these packed float formats only store unsigned values */
+ for (int i = 0; i < 4; i++)
+ override_color.f32[i] = MAX2(override_color.f32[i], 0.0f);
+ }
+
+ if (!(colormask & 1 << 3)) {
+ if (util_format_is_pure_integer(format))
+ override_color.u32[3] = 1;
+ else
+ override_color.f32[3] = 1.0f;
+ }
+
+ /* Handle linear to SRGB conversion */
+ if (isl_format_is_srgb(render_format)) {
+ for (int i = 0; i < 3; i++) {
+ override_color.f32[i] =
+ util_format_linear_to_srgb_float(override_color.f32[i]);
+ }
+ }
+
+ return override_color;
+}
+
+static void
+fast_clear_color(struct crocus_context *ice,
+ struct crocus_resource *res,
+ unsigned level,
+ const struct pipe_box *box,
+ enum isl_format format,
+ union isl_color_value color,
+ enum blorp_batch_flags blorp_flags)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ struct pipe_resource *p_res = (void *) res;
+
+ color = convert_fast_clear_color(ice, res, format, color);
+
+ bool color_changed = !!memcmp(&res->aux.clear_color, &color,
+ sizeof(color));
+
+ if (color_changed) {
+ /* If we are clearing to a new clear value, we need to resolve fast
+ * clears from other levels/layers first, since we can't have different
+ * levels/layers with different fast clear colors.
+ */
+ for (unsigned res_lvl = 0; res_lvl < res->surf.levels; res_lvl++) {
+ const unsigned level_layers =
+ crocus_get_num_logical_layers(res, res_lvl);
+ for (unsigned layer = 0; layer < level_layers; layer++) {
+ if (res_lvl == level &&
+ layer >= box->z &&
+ layer < box->z + box->depth) {
+ /* We're going to clear this layer anyway. Leave it alone. */
+ continue;
+ }
+
+ enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, res_lvl, layer);
+
+ if (aux_state != ISL_AUX_STATE_CLEAR &&
+ aux_state != ISL_AUX_STATE_PARTIAL_CLEAR &&
+ aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) {
+ /* This slice doesn't have any fast-cleared bits. */
+ continue;
+ }
+
+ /* If we got here, then the level may have fast-clear bits that use
+ * the old clear value. We need to do a color resolve to get rid
+ * of their use of the clear color before we can change it.
+ * Fortunately, few applications ever change their clear color at
+ * different levels/layers, so this shouldn't happen often.
+ */
+ crocus_resource_prepare_access(ice, res,
+ res_lvl, 1, layer, 1,
+ res->aux.usage,
+ false);
+ perf_debug(&ice->dbg,
+ "Resolving resource (%p) level %d, layer %d: color changing from "
+ "(%0.2f, %0.2f, %0.2f, %0.2f) to "
+ "(%0.2f, %0.2f, %0.2f, %0.2f)\n",
+ res, res_lvl, layer,
+ res->aux.clear_color.f32[0],
+ res->aux.clear_color.f32[1],
+ res->aux.clear_color.f32[2],
+ res->aux.clear_color.f32[3],
+ color.f32[0], color.f32[1], color.f32[2], color.f32[3]);
+ }
+ }
+ }
+
+ crocus_resource_set_clear_color(ice, res, color);
+
+ /* If the buffer is already in ISL_AUX_STATE_CLEAR, and the color hasn't
+ * changed, the clear is redundant and can be skipped.
+ */
+ const enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, level, box->z);
+ if (!color_changed && box->depth == 1 && aux_state == ISL_AUX_STATE_CLEAR)
+ return;
+
+ /* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+ *
+ * "Any transition from any value in {Clear, Render, Resolve} to a
+ * different value in {Clear, Render, Resolve} requires end of pipe
+ * synchronization."
+ *
+ * In other words, fast clear ops are not properly synchronized with
+ * other drawing. We need to use a PIPE_CONTROL to ensure that the
+ * contents of the previous draw hit the render target before we resolve
+ * and again afterwards to ensure that the resolve is complete before we
+ * do any more regular drawing.
+ */
+ crocus_emit_end_of_pipe_sync(batch,
+ "fast clear: pre-flush",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+ /* If we reach this point, we need to fast clear to change the state to
+ * ISL_AUX_STATE_CLEAR, or to update the fast clear color (or both).
+ */
+ blorp_flags |= color_changed ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR;
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ p_res, res->aux.usage, level, true);
+
+ /* In newer gens (> 9), the hardware will do a linear -> sRGB conversion of
+ * the clear color during the fast clear, if the surface format is of sRGB
+ * type. We use the linear version of the surface format here to prevent
+ * that from happening, since we already do our own linear -> sRGB
+ * conversion in convert_fast_clear_color().
+ */
+ blorp_fast_clear(&blorp_batch, &surf, isl_format_srgb_to_linear(format),
+ ISL_SWIZZLE_IDENTITY,
+ level, box->z, box->depth,
+ box->x, box->y, box->x + box->width,
+ box->y + box->height);
+ blorp_batch_finish(&blorp_batch);
+ crocus_emit_end_of_pipe_sync(batch,
+ "fast clear: post flush",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+ crocus_resource_set_aux_state(ice, res, level, box->z,
+ box->depth, ISL_AUX_STATE_CLEAR);
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+ return;
+}
+
+static void
+clear_color(struct crocus_context *ice,
+ struct pipe_resource *p_res,
+ unsigned level,
+ const struct pipe_box *box,
+ bool render_condition_enabled,
+ enum isl_format format,
+ struct isl_swizzle swizzle,
+ union isl_color_value color)
+{
+ struct crocus_resource *res = (void *) p_res;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ enum blorp_batch_flags blorp_flags = 0;
+
+ if (render_condition_enabled) {
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT)
+ blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
+ }
+
+ if (p_res->target == PIPE_BUFFER)
+ util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width);
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ bool can_fast_clear = can_fast_clear_color(ice, p_res, level, box,
+ render_condition_enabled,
+ res->surf.format, format, color);
+ if (can_fast_clear) {
+ fast_clear_color(ice, res, level, box, format, color,
+ blorp_flags);
+ return;
+ }
+
+ bool color_write_disable[4] = { false, false, false, false };
+ enum isl_aux_usage aux_usage =
+ crocus_resource_render_aux_usage(ice, res, format,
+ false, false);
+
+ crocus_resource_prepare_render(ice, res, level,
+ box->z, box->depth, aux_usage);
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ p_res, aux_usage, level, true);
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+ if (!isl_format_supports_rendering(devinfo, format) &&
+ isl_format_is_rgbx(format))
+ format = isl_format_rgbx_to_rgba(format);
+
+ blorp_clear(&blorp_batch, &surf, format, swizzle,
+ level, box->z, box->depth, box->x, box->y,
+ box->x + box->width, box->y + box->height,
+ color, color_write_disable);
+
+ blorp_batch_finish(&blorp_batch);
+ crocus_flush_and_dirty_for_history(ice, batch, res,
+ PIPE_CONTROL_RENDER_TARGET_FLUSH,
+ "cache history: post color clear");
+
+ crocus_resource_finish_render(ice, res, level,
+ box->z, box->depth, aux_usage);
+}
+
+static bool
+can_fast_clear_depth(struct crocus_context *ice,
+ struct crocus_resource *res,
+ unsigned level,
+ const struct pipe_box *box,
+ bool render_condition_enabled,
+ float depth)
+{
+ struct pipe_resource *p_res = (void *) res;
+ struct pipe_context *ctx = (void *) ice;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (devinfo->ver < 6)
+ return false;
+
+ if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR)
+ return false;
+
+ /* Check for partial clears */
+ if (box->x > 0 || box->y > 0 ||
+ box->width < u_minify(p_res->width0, level) ||
+ box->height < u_minify(p_res->height0, level)) {
+ return false;
+ }
+
+   /* Avoid conditional fast clears to maintain correct tracking of the aux
+    * state (see crocus_resource_finish_write for more info). Note that
+    * partial fast clears would not pose a problem with conditional rendering.
+    */
+ if (render_condition_enabled &&
+ ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+ return false;
+ }
+
+ if (!crocus_resource_level_has_hiz(res, level))
+ return false;
+
+ if (res->base.format == PIPE_FORMAT_Z16_UNORM) {
+ /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
+ *
+ * "[DevSNB+]: Several cases exist where Depth Buffer Clear cannot be
+ * enabled (the legacy method of clearing must be performed):
+ *
+       *    - [DevSNB{W/A}]: When depth buffer format is D16_UNORM and the
+       *      width of the map (LOD0) is not multiple of 16, fast clear
+       *      optimization must be disabled."
+ */
+ if (devinfo->ver == 6 &&
+ (minify(res->surf.phys_level0_sa.width,
+ level) % 16) != 0)
+ return false;
+ }
+ return true;
+}
+
+static void
+fast_clear_depth(struct crocus_context *ice,
+ struct crocus_resource *res,
+ unsigned level,
+ const struct pipe_box *box,
+ float depth)
+{
+ struct pipe_resource *p_res = (void *) res;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+ /* Quantize the clear value to what can be stored in the actual depth
+ * buffer. This makes the following check more accurate because it now
+ * checks if the actual depth bits will match. It also prevents us from
+ * getting a too-accurate depth value during depth testing or when sampling
+ * with HiZ enabled.
+ */
+ const unsigned nbits = p_res->format == PIPE_FORMAT_Z16_UNORM ? 16 : 24;
+ const uint32_t depth_max = (1 << nbits) - 1;
+ depth = p_res->format == PIPE_FORMAT_Z32_FLOAT ? depth :
+ (unsigned)(depth * depth_max) / (float)depth_max;
+
+ bool update_clear_depth = false;
+
+ /* If we're clearing to a new clear value, then we need to resolve any clear
+ * flags out of the HiZ buffer into the real depth buffer.
+ */
+ if (res->aux.clear_color.f32[0] != depth) {
+ for (unsigned res_level = 0; res_level < res->surf.levels; res_level++) {
+ if (!crocus_resource_level_has_hiz(res, res_level))
+ continue;
+
+ const unsigned level_layers =
+ crocus_get_num_logical_layers(res, res_level);
+ for (unsigned layer = 0; layer < level_layers; layer++) {
+ if (res_level == level &&
+ layer >= box->z &&
+ layer < box->z + box->depth) {
+ /* We're going to clear this layer anyway. Leave it alone. */
+ continue;
+ }
+
+ enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, res_level, layer);
+
+ if (aux_state != ISL_AUX_STATE_CLEAR &&
+ aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) {
+ /* This slice doesn't have any fast-cleared bits. */
+ continue;
+ }
+
+ /* If we got here, then the level may have fast-clear bits that
+ * use the old clear value. We need to do a depth resolve to get
+ * rid of their use of the clear value before we can change it.
+ * Fortunately, few applications ever change their depth clear
+ * value so this shouldn't happen often.
+ */
+ crocus_hiz_exec(ice, batch, res, res_level, layer, 1,
+ ISL_AUX_OP_FULL_RESOLVE, false);
+ crocus_resource_set_aux_state(ice, res, res_level, layer, 1,
+ ISL_AUX_STATE_RESOLVED);
+ }
+ }
+ const union isl_color_value clear_value = { .f32 = {depth, } };
+ crocus_resource_set_clear_color(ice, res, clear_value);
+ update_clear_depth = true;
+ }
+
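+   /* Now fast clear the requested layers, skipping any that are already in
+    * the CLEAR state with the current clear value.
+    */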
+ for (unsigned l = 0; l < box->depth; l++) {
+ enum isl_aux_state aux_state =
+ crocus_resource_level_has_hiz(res, level) ?
+ crocus_resource_get_aux_state(res, level, box->z + l) :
+ ISL_AUX_STATE_AUX_INVALID;
+ if (update_clear_depth || aux_state != ISL_AUX_STATE_CLEAR) {
+ if (aux_state == ISL_AUX_STATE_CLEAR) {
+ perf_debug(&ice->dbg, "Performing HiZ clear just to update the "
+ "depth clear value\n");
+ }
+ crocus_hiz_exec(ice, batch, res, level,
+ box->z + l, 1, ISL_AUX_OP_FAST_CLEAR,
+ update_clear_depth);
+ }
+ }
+
+ crocus_resource_set_aux_state(ice, res, level, box->z, box->depth,
+ ISL_AUX_STATE_CLEAR);
+ ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
+}
+
+static void
+clear_depth_stencil(struct crocus_context *ice,
+ struct pipe_resource *p_res,
+ unsigned level,
+ const struct pipe_box *box,
+ bool render_condition_enabled,
+ bool clear_depth,
+ bool clear_stencil,
+ float depth,
+ uint8_t stencil)
+{
+ struct crocus_resource *res = (void *) p_res;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ enum blorp_batch_flags blorp_flags = 0;
+
+ if (render_condition_enabled) {
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT)
+ blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
+ }
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ struct crocus_resource *z_res;
+ struct crocus_resource *stencil_res;
+ struct blorp_surf z_surf;
+ struct blorp_surf stencil_surf;
+
+ crocus_get_depth_stencil_resources(&batch->screen->devinfo, p_res, &z_res, &stencil_res);
+ if (z_res && clear_depth &&
+ can_fast_clear_depth(ice, z_res, level, box, render_condition_enabled,
+ depth)) {
+ fast_clear_depth(ice, z_res, level, box, depth);
+ crocus_flush_and_dirty_for_history(ice, batch, res, 0,
+ "cache history: post fast Z clear");
+ clear_depth = false;
+ z_res = NULL;
+ }
+
+   /* At this point, we might have fast cleared the depth buffer. If there is
+    * no remaining (slow) depth clear and no stencil clear to do, return early.
+    */
+ if (!(clear_depth || (clear_stencil && stencil_res))) {
+ return;
+ }
+
+ if (clear_depth && z_res) {
+ const enum isl_aux_usage aux_usage =
+         crocus_resource_render_aux_usage(ice, z_res, z_res->surf.format,
+                                          false, false);
+ crocus_resource_prepare_render(ice, z_res, level, box->z, box->depth,
+ aux_usage);
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev,
+ &z_surf, &z_res->base, aux_usage,
+ level, true);
+ }
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+ uint8_t stencil_mask = clear_stencil && stencil_res ? 0xff : 0;
+ if (stencil_mask) {
+ crocus_resource_prepare_access(ice, stencil_res, level, 1, box->z,
+ box->depth, stencil_res->aux.usage, false);
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev,
+ &stencil_surf, &stencil_res->base,
+ stencil_res->aux.usage, level, true);
+ }
+
+ blorp_clear_depth_stencil(&blorp_batch, &z_surf, &stencil_surf,
+ level, box->z, box->depth,
+ box->x, box->y,
+ box->x + box->width,
+ box->y + box->height,
+ clear_depth && z_res, depth,
+ stencil_mask, stencil);
+
+ blorp_batch_finish(&blorp_batch);
+ crocus_flush_and_dirty_for_history(ice, batch, res, 0,
+ "cache history: post slow ZS clear");
+
+ if (clear_depth && z_res) {
+ crocus_resource_finish_render(ice, z_res, level,
+ box->z, box->depth, z_surf.aux_usage);
+ }
+
+ if (stencil_mask) {
+ crocus_resource_finish_write(ice, stencil_res, level, box->z, box->depth,
+ stencil_res->aux.usage);
+ }
+}
+
+/**
+ * The pipe->clear() driver hook.
+ *
+ * This clears buffers attached to the current draw framebuffer.
+ */
+static void
+crocus_clear(struct pipe_context *ctx,
+ unsigned buffers,
+ const struct pipe_scissor_state *scissor_state,
+ const union pipe_color_union *p_color,
+ double depth,
+ unsigned stencil)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ assert(buffers != 0);
+
+ struct pipe_box box = {
+ .width = cso_fb->width,
+ .height = cso_fb->height,
+ };
+
+ if (scissor_state) {
+ box.x = scissor_state->minx;
+ box.y = scissor_state->miny;
+ box.width = MIN2(box.width, scissor_state->maxx - scissor_state->minx);
+ box.height = MIN2(box.height, scissor_state->maxy - scissor_state->miny);
+ }
+
+ if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
+ if (devinfo->ver < 6) {
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE, true);
+ util_blitter_clear(ice->blitter, cso_fb->width, cso_fb->height,
+ util_framebuffer_get_num_layers(cso_fb),
+ buffers & PIPE_CLEAR_DEPTHSTENCIL, p_color, depth, stencil, false);
+ } else {
+ struct pipe_surface *psurf = cso_fb->zsbuf;
+ box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1;
+ box.z = psurf->u.tex.first_layer;
+
+ clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box, true,
+ buffers & PIPE_CLEAR_DEPTH,
+ buffers & PIPE_CLEAR_STENCIL,
+ depth, stencil);
+ }
+ buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
+ }
+
+ if (buffers & PIPE_CLEAR_COLOR) {
+ /* pipe_color_union and isl_color_value are interchangeable */
+ union isl_color_value *color = (void *) p_color;
+
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ if (buffers & (PIPE_CLEAR_COLOR0 << i)) {
+ struct pipe_surface *psurf = cso_fb->cbufs[i];
+ struct crocus_surface *isurf = (void *) psurf;
+         box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1;
+         box.z = psurf->u.tex.first_layer;
+
+ clear_color(ice, psurf->texture, psurf->u.tex.level, &box,
+ true, isurf->view.format, isurf->view.swizzle,
+ *color);
+ }
+ }
+ }
+}
+
+/**
+ * The pipe->clear_texture() driver hook.
+ *
+ * This clears the given texture resource.
+ */
+static void
+crocus_clear_texture(struct pipe_context *ctx,
+ struct pipe_resource *p_res,
+ unsigned level,
+ const struct pipe_box *box,
+ const void *data)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_resource *res = (void *) p_res;
+
+ if (devinfo->ver < 6) {
+ util_clear_texture(ctx, p_res,
+ level, box, data);
+ return;
+ }
+
+ if (crocus_resource_unfinished_aux_import(res))
+ crocus_resource_finish_aux_import(ctx->screen, res);
+
+ if (util_format_is_depth_or_stencil(p_res->format)) {
+ const struct util_format_unpack_description *fmt_unpack =
+ util_format_unpack_description(p_res->format);
+
+ float depth = 0.0;
+ uint8_t stencil = 0;
+
+ if (fmt_unpack->unpack_z_float)
+ fmt_unpack->unpack_z_float(&depth, 0, data, 0, 1, 1);
+
+ if (fmt_unpack->unpack_s_8uint)
+ fmt_unpack->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
+
+ clear_depth_stencil(ice, p_res, level, box, true, true, true,
+ depth, stencil);
+ } else {
+ union isl_color_value color;
+ struct crocus_resource *res = (void *) p_res;
+ enum isl_format format = res->surf.format;
+
+ if (!isl_format_supports_rendering(devinfo, format)) {
+ const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+ // XXX: actually just get_copy_format_for_bpb from BLORP
+ // XXX: don't cut and paste this
+ switch (fmtl->bpb) {
+ case 8: format = ISL_FORMAT_R8_UINT; break;
+ case 16: format = ISL_FORMAT_R8G8_UINT; break;
+ case 24: format = ISL_FORMAT_R8G8B8_UINT; break;
+ case 32: format = ISL_FORMAT_R8G8B8A8_UINT; break;
+ case 48: format = ISL_FORMAT_R16G16B16_UINT; break;
+ case 64: format = ISL_FORMAT_R16G16B16A16_UINT; break;
+ case 96: format = ISL_FORMAT_R32G32B32_UINT; break;
+ case 128: format = ISL_FORMAT_R32G32B32A32_UINT; break;
+ default:
+ unreachable("Unknown format bpb");
+ }
+
+ /* No aux surfaces for non-renderable surfaces */
+ assert(res->aux.usage == ISL_AUX_USAGE_NONE);
+ }
+
+ isl_color_value_unpack(&color, format, data);
+
+ clear_color(ice, p_res, level, box, true, format,
+ ISL_SWIZZLE_IDENTITY, color);
+ }
+}
+
+/**
+ * The pipe->clear_render_target() driver hook.
+ *
+ * This clears the given render target surface.
+ */
+static void
+crocus_clear_render_target(struct pipe_context *ctx,
+ struct pipe_surface *psurf,
+ const union pipe_color_union *p_color,
+ unsigned dst_x, unsigned dst_y,
+ unsigned width, unsigned height,
+ bool render_condition_enabled)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_surface *isurf = (void *) psurf;
+ struct pipe_box box = {
+ .x = dst_x,
+ .y = dst_y,
+ .z = psurf->u.tex.first_layer,
+ .width = width,
+ .height = height,
+ .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1
+ };
+
+ /* pipe_color_union and isl_color_value are interchangeable */
+ union isl_color_value *color = (void *) p_color;
+
+ clear_color(ice, psurf->texture, psurf->u.tex.level, &box,
+ render_condition_enabled,
+ isurf->view.format, isurf->view.swizzle, *color);
+}
+
+/**
+ * The pipe->clear_depth_stencil() driver hook.
+ *
+ * This clears the given depth/stencil surface.
+ */
+static void
+crocus_clear_depth_stencil(struct pipe_context *ctx,
+ struct pipe_surface *psurf,
+ unsigned flags,
+ double depth,
+ unsigned stencil,
+ unsigned dst_x, unsigned dst_y,
+ unsigned width, unsigned height,
+ bool render_condition_enabled)
+{
+ return;
+#if 0
+ struct crocus_context *ice = (void *) ctx;
+ struct pipe_box box = {
+ .x = dst_x,
+ .y = dst_y,
+ .z = psurf->u.tex.first_layer,
+ .width = width,
+ .height = height,
+ .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1
+ };
+ uint32_t blit_flags = 0;
+
+ assert(util_format_is_depth_or_stencil(psurf->texture->format));
+
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE);
+ util_blitter_clear(ice->blitter, width, height,
+ 1, flags, NULL, depth, stencil, render_condition_enabled);
+#if 0
+ clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box,
+ render_condition_enabled,
+ flags & PIPE_CLEAR_DEPTH, flags & PIPE_CLEAR_STENCIL,
+ depth, stencil);
+#endif
+#endif
+}
+
+void
+crocus_init_clear_functions(struct pipe_context *ctx)
+{
+ ctx->clear = crocus_clear;
+ ctx->clear_texture = crocus_clear_texture;
+ ctx->clear_render_target = crocus_clear_render_target;
+ ctx->clear_depth_stencil = crocus_clear_depth_stencil;
+}
diff --git a/src/gallium/drivers/crocus/crocus_context.c b/src/gallium/drivers/crocus/crocus_context.c
new file mode 100644
index 00000000000..cd8a54d6d34
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_context.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/ralloc.h"
+#include "util/u_inlines.h"
+#include "util/format/u_format.h"
+#include "util/u_upload_mgr.h"
+#include "drm-uapi/i915_drm.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "common/intel_defines.h"
+#include "common/intel_sample_positions.h"
+
+/**
+ * The pipe->set_debug_callback() driver hook.
+ */
+static void
+crocus_set_debug_callback(struct pipe_context *ctx,
+ const struct pipe_debug_callback *cb)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ if (cb)
+ ice->dbg = *cb;
+ else
+ memset(&ice->dbg, 0, sizeof(ice->dbg));
+}
+
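+/**
+ * Write the driver identifier block into the workaround BO so that it can
+ * be found in GPU error states (see the workaround_bo comment in
+ * crocus_context.h).
+ */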
+static bool
+crocus_init_identifier_bo(struct crocus_context *ice)
+{
+ void *bo_map;
+
+ bo_map = crocus_bo_map(NULL, ice->workaround_bo, MAP_READ | MAP_WRITE);
+ if (!bo_map)
+ return false;
+
+ ice->workaround_bo->kflags |= EXEC_OBJECT_CAPTURE;
+ ice->workaround_offset = ALIGN(
+ intel_debug_write_identifiers(bo_map, 4096, "Crocus") + 8, 8);
+
+ crocus_bo_unmap(ice->workaround_bo);
+
+ return true;
+}
+
+/**
+ * Called from the batch module when it detects a GPU hang.
+ *
+ * In this case, we've lost our GEM context, and can't rely on any existing
+ * state on the GPU. We must mark everything dirty and wipe away any saved
+ * assumptions about the last known state of the GPU.
+ */
+void
+crocus_lost_context_state(struct crocus_batch *batch)
+{
+   /* The batch module doesn't have a crocus_context, because we want to
+ * avoid introducing lots of layering violations. Unfortunately, here
+ * we do need to inform the context of batch catastrophe. We know the
+ * batch is one of our context's, so hackily claw our way back.
+ */
+ struct crocus_context *ice = batch->ice;
+ struct crocus_screen *screen = batch->screen;
+ if (batch->name == CROCUS_BATCH_RENDER) {
+ screen->vtbl.init_render_context(batch);
+ } else if (batch->name == CROCUS_BATCH_COMPUTE) {
+ screen->vtbl.init_compute_context(batch);
+ } else {
+ unreachable("unhandled batch reset");
+ }
+
+ ice->state.dirty = ~0ull;
+ memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
+ batch->state_base_address_emitted = false;
+ screen->vtbl.lost_genx_state(ice, batch);
+}
+
+static enum pipe_reset_status
+crocus_get_device_reset_status(struct pipe_context *ctx)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ enum pipe_reset_status worst_reset = PIPE_NO_RESET;
+
+ /* Check the reset status of each batch's hardware context, and take the
+ * worst status (if one was guilty, proclaim guilt).
+ */
+ for (int i = 0; i < ice->batch_count; i++) {
+ /* This will also recreate the hardware contexts as necessary, so any
+ * future queries will show no resets. We only want to report once.
+ */
+ enum pipe_reset_status batch_reset =
+ crocus_batch_check_for_reset(&ice->batches[i]);
+
+ if (batch_reset == PIPE_NO_RESET)
+ continue;
+
+ if (worst_reset == PIPE_NO_RESET) {
+ worst_reset = batch_reset;
+ } else {
+ /* GUILTY < INNOCENT < UNKNOWN */
+ worst_reset = MIN2(worst_reset, batch_reset);
+ }
+ }
+
+ if (worst_reset != PIPE_NO_RESET && ice->reset.reset)
+ ice->reset.reset(ice->reset.data, worst_reset);
+
+ return worst_reset;
+}
+
+static void
+crocus_set_device_reset_callback(struct pipe_context *ctx,
+ const struct pipe_device_reset_callback *cb)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ if (cb)
+ ice->reset = *cb;
+ else
+ memset(&ice->reset, 0, sizeof(ice->reset));
+}
+
+static void
+crocus_get_sample_position(struct pipe_context *ctx,
+ unsigned sample_count,
+ unsigned sample_index,
+ float *out_value)
+{
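+   /* The INTEL_SAMPLE_POS_*X macros fill in the individually named offset
+    * fields of 'v'; the union lets us read the same values back through the
+    * indexed x[]/y[] arrays in 'a'.
+    */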
+ union {
+ struct {
+ float x[16];
+ float y[16];
+ } a;
+ struct {
+ float _0XOffset, _1XOffset, _2XOffset, _3XOffset,
+ _4XOffset, _5XOffset, _6XOffset, _7XOffset,
+ _8XOffset, _9XOffset, _10XOffset, _11XOffset,
+ _12XOffset, _13XOffset, _14XOffset, _15XOffset;
+ float _0YOffset, _1YOffset, _2YOffset, _3YOffset,
+ _4YOffset, _5YOffset, _6YOffset, _7YOffset,
+ _8YOffset, _9YOffset, _10YOffset, _11YOffset,
+ _12YOffset, _13YOffset, _14YOffset, _15YOffset;
+ } v;
+ } u;
+ switch (sample_count) {
+ case 1: INTEL_SAMPLE_POS_1X(u.v._); break;
+ case 2: INTEL_SAMPLE_POS_2X(u.v._); break;
+ case 4: INTEL_SAMPLE_POS_4X(u.v._); break;
+ case 8: INTEL_SAMPLE_POS_8X(u.v._); break;
+ case 16: INTEL_SAMPLE_POS_16X(u.v._); break;
+ default: unreachable("invalid sample count");
+ }
+
+ out_value[0] = u.a.x[sample_index];
+ out_value[1] = u.a.y[sample_index];
+}
+
+/**
+ * Destroy a context, freeing any associated memory.
+ */
+static void
+crocus_destroy_context(struct pipe_context *ctx)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ if (ctx->stream_uploader)
+ u_upload_destroy(ctx->stream_uploader);
+
+ if (ice->blitter)
+ util_blitter_destroy(ice->blitter);
+ screen->vtbl.destroy_state(ice);
+ crocus_destroy_program_cache(ice);
+ u_upload_destroy(ice->query_buffer_uploader);
+
+ crocus_bo_unreference(ice->workaround_bo);
+
+ slab_destroy_child(&ice->transfer_pool);
+
+ crocus_batch_free(&ice->batches[CROCUS_BATCH_RENDER]);
+ if (ice->batches[CROCUS_BATCH_COMPUTE].ice)
+ crocus_batch_free(&ice->batches[CROCUS_BATCH_COMPUTE]);
+
+ ralloc_free(ice);
+}
+
+#define genX_call(devinfo, func, ...) \
+ switch ((devinfo)->verx10) { \
+ case 75: \
+ gfx75_##func(__VA_ARGS__); \
+ break; \
+ case 70: \
+ gfx7_##func(__VA_ARGS__); \
+ break; \
+ case 60: \
+ gfx6_##func(__VA_ARGS__); \
+ break; \
+ case 50: \
+ gfx5_##func(__VA_ARGS__); \
+ break; \
+ case 45: \
+ gfx45_##func(__VA_ARGS__); \
+ break; \
+ case 40: \
+ gfx4_##func(__VA_ARGS__); \
+ break; \
+ default: \
+ unreachable("Unknown hardware generation"); \
+ }
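+/* For example, on Sandybridge (verx10 == 60), genX_call(devinfo, init_state, ice)
+ * expands to gfx6_init_state(ice).
+ */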
+
+/**
+ * Create a context.
+ *
+ * This is where each context begins.
+ */
+struct pipe_context *
+crocus_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
+{
+ struct crocus_screen *screen = (struct crocus_screen*)pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_context *ice = rzalloc(NULL, struct crocus_context);
+
+ if (!ice)
+ return NULL;
+
+ struct pipe_context *ctx = &ice->ctx;
+
+ ctx->screen = pscreen;
+ ctx->priv = priv;
+
+ ctx->stream_uploader = u_upload_create_default(ctx);
+ if (!ctx->stream_uploader) {
+      ralloc_free(ice);
+ return NULL;
+ }
+ ctx->const_uploader = ctx->stream_uploader;
+
+ ctx->destroy = crocus_destroy_context;
+ ctx->set_debug_callback = crocus_set_debug_callback;
+ ctx->set_device_reset_callback = crocus_set_device_reset_callback;
+ ctx->get_device_reset_status = crocus_get_device_reset_status;
+ ctx->get_sample_position = crocus_get_sample_position;
+
+ ice->shaders.urb_size = devinfo->urb.size;
+
+ crocus_init_context_fence_functions(ctx);
+ crocus_init_blit_functions(ctx);
+ crocus_init_clear_functions(ctx);
+ crocus_init_program_functions(ctx);
+ crocus_init_resource_functions(ctx);
+ crocus_init_flush_functions(ctx);
+
+ crocus_init_program_cache(ice);
+
+ slab_create_child(&ice->transfer_pool, &screen->transfer_pool);
+
+ ice->query_buffer_uploader =
+ u_upload_create(ctx, 4096, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING,
+ 0);
+
+ ice->workaround_bo =
+ crocus_bo_alloc(screen->bufmgr, "workaround", 4096);
+ if (!ice->workaround_bo)
+ return NULL;
+
+ if (!crocus_init_identifier_bo(ice))
+ return NULL;
+
+ genX_call(devinfo, init_state, ice);
+ genX_call(devinfo, init_blorp, ice);
+ genX_call(devinfo, init_query, ice);
+
+ ice->blitter = util_blitter_create(&ice->ctx);
+ if (ice->blitter == NULL)
+ return NULL;
+ int priority = 0;
+ if (flags & PIPE_CONTEXT_HIGH_PRIORITY)
+ priority = INTEL_CONTEXT_HIGH_PRIORITY;
+ if (flags & PIPE_CONTEXT_LOW_PRIORITY)
+ priority = INTEL_CONTEXT_LOW_PRIORITY;
+
+ ice->batch_count = devinfo->ver >= 7 ? CROCUS_BATCH_COUNT : 1;
+ for (int i = 0; i < ice->batch_count; i++) {
+ crocus_init_batch(ice, (enum crocus_batch_name) i,
+ priority);
+ }
+
+ ice->urb.size = devinfo->urb.size;
+ screen->vtbl.init_render_context(&ice->batches[CROCUS_BATCH_RENDER]);
+ if (ice->batch_count > 1)
+ screen->vtbl.init_compute_context(&ice->batches[CROCUS_BATCH_COMPUTE]);
+
+ return ctx;
+}
+
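+/**
+ * Software conditional rendering check, used when the predicate state is
+ * CROCUS_PREDICATE_STATE_STALL_FOR_QUERY: read the query result back on the
+ * CPU (waiting if the render condition mode requires it) and decide whether
+ * to draw.
+ */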
+bool
+crocus_sw_check_cond_render(struct crocus_context *ice)
+{
+ struct crocus_query *q = ice->condition.query;
+ union pipe_query_result result;
+
+ bool wait = ice->condition.mode == PIPE_RENDER_COND_WAIT ||
+ ice->condition.mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+ if (!q)
+ return true;
+
+ bool ret = ice->ctx.get_query_result(&ice->ctx, (void *)q, wait, &result);
+ if (!ret)
+ return true;
+
+ return ice->condition.condition ? result.u64 == 0 : result.u64 != 0;
+}
diff --git a/src/gallium/drivers/crocus/crocus_context.h b/src/gallium/drivers/crocus/crocus_context.h
new file mode 100644
index 00000000000..8d6e43d80f6
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_context.h
@@ -0,0 +1,955 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_CONTEXT_H
+#define CROCUS_CONTEXT_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "intel/blorp/blorp.h"
+#include "intel/dev/intel_debug.h"
+#include "intel/compiler/brw_compiler.h"
+#include "crocus_batch.h"
+#include "crocus_fence.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "util/u_blitter.h"
+
+struct crocus_bo;
+struct crocus_context;
+struct blorp_batch;
+struct blorp_params;
+
+#define CROCUS_MAX_TEXTURE_BUFFER_SIZE (1 << 27)
+#define CROCUS_MAX_TEXTURE_SAMPLERS 32
+/* CROCUS_MAX_ABOS and CROCUS_MAX_SSBOS must be the same. */
+#define CROCUS_MAX_ABOS 16
+#define CROCUS_MAX_SSBOS 16
+#define CROCUS_MAX_VIEWPORTS 16
+#define CROCUS_MAX_CLIP_PLANES 8
+
+enum crocus_param_domain {
+ BRW_PARAM_DOMAIN_BUILTIN = 0,
+ BRW_PARAM_DOMAIN_IMAGE,
+};
+
+enum {
+ DRI_CONF_BO_REUSE_DISABLED,
+ DRI_CONF_BO_REUSE_ALL
+};
+
+#define BRW_PARAM(domain, val) (BRW_PARAM_DOMAIN_##domain << 24 | (val))
+#define BRW_PARAM_DOMAIN(param) ((uint32_t)(param) >> 24)
+#define BRW_PARAM_VALUE(param) ((uint32_t)(param) & 0x00ffffff)
+#define BRW_PARAM_IMAGE(idx, offset) BRW_PARAM(IMAGE, ((idx) << 8) | (offset))
+#define BRW_PARAM_IMAGE_IDX(value) (BRW_PARAM_VALUE(value) >> 8)
+#define BRW_PARAM_IMAGE_OFFSET(value)(BRW_PARAM_VALUE(value) & 0xf)
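+/* For example, BRW_PARAM_IMAGE(2, 1) encodes the IMAGE domain with image
+ * index 2 and offset 1, which BRW_PARAM_IMAGE_IDX() and
+ * BRW_PARAM_IMAGE_OFFSET() recover.
+ */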
+
+/**
+ * Dirty flags. When state changes, we flag some combination of these
+ * to indicate that particular GPU commands need to be re-emitted.
+ *
+ * Each bit typically corresponds to a single 3DSTATE_* command packet, but
+ * in rare cases they map to a group of related packets that need to be
+ * emitted together.
+ *
+ * See crocus_upload_render_state().
+ */
+#define CROCUS_DIRTY_COLOR_CALC_STATE (1ull << 0)
+#define CROCUS_DIRTY_POLYGON_STIPPLE (1ull << 1)
+#define CROCUS_DIRTY_CC_VIEWPORT (1ull << 2)
+#define CROCUS_DIRTY_SF_CL_VIEWPORT (1ull << 3)
+#define CROCUS_DIRTY_RASTER (1ull << 4)
+#define CROCUS_DIRTY_CLIP (1ull << 5)
+#define CROCUS_DIRTY_LINE_STIPPLE (1ull << 6)
+#define CROCUS_DIRTY_VERTEX_ELEMENTS (1ull << 7)
+#define CROCUS_DIRTY_VERTEX_BUFFERS (1ull << 8)
+#define CROCUS_DIRTY_DRAWING_RECTANGLE (1ull << 9)
+#define CROCUS_DIRTY_GEN6_URB (1ull << 10)
+#define CROCUS_DIRTY_DEPTH_BUFFER (1ull << 11)
+#define CROCUS_DIRTY_WM (1ull << 12)
+#define CROCUS_DIRTY_SO_DECL_LIST (1ull << 13)
+#define CROCUS_DIRTY_STREAMOUT (1ull << 14)
+#define CROCUS_DIRTY_GEN4_CONSTANT_COLOR (1ull << 15)
+#define CROCUS_DIRTY_GEN4_CURBE (1ull << 16)
+#define CROCUS_DIRTY_GEN4_URB_FENCE (1ull << 17)
+#define CROCUS_DIRTY_GEN5_PIPELINED_POINTERS (1ull << 18)
+#define CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS (1ull << 19)
+#define CROCUS_DIRTY_GEN6_BLEND_STATE (1ull << 20)
+#define CROCUS_DIRTY_GEN6_SCISSOR_RECT (1ull << 21)
+#define CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL (1ull << 22)
+#define CROCUS_DIRTY_GEN6_MULTISAMPLE (1ull << 23)
+#define CROCUS_DIRTY_GEN6_SAMPLE_MASK (1ull << 24)
+#define CROCUS_DIRTY_GEN7_SBE (1ull << 25)
+#define CROCUS_DIRTY_GEN7_L3_CONFIG (1ull << 26)
+#define CROCUS_DIRTY_GEN7_SO_BUFFERS (1ull << 27)
+#define CROCUS_DIRTY_GEN75_VF (1ull << 28)
+#define CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES (1ull << 29)
+#define CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES (1ull << 30)
+#define CROCUS_DIRTY_VF_STATISTICS (1ull << 31)
+#define CROCUS_DIRTY_GEN4_CLIP_PROG (1ull << 32)
+#define CROCUS_DIRTY_GEN4_SF_PROG (1ull << 33)
+#define CROCUS_DIRTY_GEN4_FF_GS_PROG (1ull << 34)
+#define CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS (1ull << 35)
+#define CROCUS_DIRTY_GEN6_SVBI (1ull << 36)
+
+#define CROCUS_ALL_DIRTY_FOR_COMPUTE (CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES)
+
+#define CROCUS_ALL_DIRTY_FOR_RENDER (~CROCUS_ALL_DIRTY_FOR_COMPUTE)
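+/* State changes OR these into ice->state.dirty, e.g.
+ * ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
+ */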
+
+/**
+ * Per-stage dirty flags. When state changes, we flag some combination of
+ * these to indicate that particular GPU commands need to be re-emitted.
+ * Unlike the CROCUS_DIRTY_* flags, these are shader stage-specific and can
+ * be indexed by shifting the mask by the shader stage index.
+ *
+ * See crocus_upload_render_state().
+ */
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS (1ull << 0)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS (1ull << 1)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES (1ull << 2)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS (1ull << 3)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS (1ull << 4)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS (1ull << 5)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_VS (1ull << 6)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_TCS (1ull << 7)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_TES (1ull << 8)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_GS (1ull << 9)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_FS (1ull << 10)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_CS (1ull << 11)
+#define CROCUS_STAGE_DIRTY_VS (1ull << 12)
+#define CROCUS_STAGE_DIRTY_TCS (1ull << 13)
+#define CROCUS_STAGE_DIRTY_TES (1ull << 14)
+#define CROCUS_STAGE_DIRTY_GS (1ull << 15)
+#define CROCUS_STAGE_DIRTY_FS (1ull << 16)
+#define CROCUS_STAGE_DIRTY_CS (1ull << 17)
+#define CROCUS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS 18
+#define CROCUS_STAGE_DIRTY_CONSTANTS_VS (1ull << 18)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_TCS (1ull << 19)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_TES (1ull << 20)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_GS (1ull << 21)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_FS (1ull << 22)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_CS (1ull << 23)
+#define CROCUS_STAGE_DIRTY_BINDINGS_VS (1ull << 24)
+#define CROCUS_STAGE_DIRTY_BINDINGS_TCS (1ull << 25)
+#define CROCUS_STAGE_DIRTY_BINDINGS_TES (1ull << 26)
+#define CROCUS_STAGE_DIRTY_BINDINGS_GS (1ull << 27)
+#define CROCUS_STAGE_DIRTY_BINDINGS_FS (1ull << 28)
+#define CROCUS_STAGE_DIRTY_BINDINGS_CS (1ull << 29)
+
+#define CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE (CROCUS_STAGE_DIRTY_CS | \
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS | \
+ CROCUS_STAGE_DIRTY_UNCOMPILED_CS | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_CS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_CS)
+
+#define CROCUS_ALL_STAGE_DIRTY_FOR_RENDER (~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE)
+
+#define CROCUS_ALL_STAGE_DIRTY_BINDINGS (CROCUS_STAGE_DIRTY_BINDINGS_VS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_TCS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_TES | \
+ CROCUS_STAGE_DIRTY_BINDINGS_GS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_FS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_CS)
+
+#define CROCUS_RENDER_STAGE_DIRTY_CONSTANTS (CROCUS_STAGE_DIRTY_CONSTANTS_VS | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_TCS | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_TES | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_GS | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_FS)
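+/* Flags for a particular stage can be obtained by shifting the corresponding
+ * _VS flag by the shader stage index, e.g.
+ * ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
+ */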
+
+/**
+ * Non-orthogonal state (NOS) dependency flags.
+ *
+ * Shader programs may depend on non-orthogonal state. These flags are
+ * used to indicate that a shader's key depends on the state provided by
+ * a certain Gallium CSO. Changing any CSOs marked as a dependency will
+ * cause the driver to re-compute the shader key, possibly triggering a
+ * shader recompile.
+ */
+enum crocus_nos_dep {
+ CROCUS_NOS_FRAMEBUFFER,
+ CROCUS_NOS_DEPTH_STENCIL_ALPHA,
+ CROCUS_NOS_RASTERIZER,
+ CROCUS_NOS_BLEND,
+ CROCUS_NOS_LAST_VUE_MAP,
+ CROCUS_NOS_TEXTURES,
+ CROCUS_NOS_VERTEX_ELEMENTS,
+ CROCUS_NOS_COUNT,
+};
+
+struct crocus_depth_stencil_alpha_state;
+
+/**
+ * Cache IDs for the in-memory program cache (ice->shaders.cache).
+ */
+enum crocus_program_cache_id {
+ CROCUS_CACHE_VS = MESA_SHADER_VERTEX,
+ CROCUS_CACHE_TCS = MESA_SHADER_TESS_CTRL,
+ CROCUS_CACHE_TES = MESA_SHADER_TESS_EVAL,
+ CROCUS_CACHE_GS = MESA_SHADER_GEOMETRY,
+ CROCUS_CACHE_FS = MESA_SHADER_FRAGMENT,
+ CROCUS_CACHE_CS = MESA_SHADER_COMPUTE,
+ CROCUS_CACHE_BLORP,
+ CROCUS_CACHE_SF,
+ CROCUS_CACHE_CLIP,
+ CROCUS_CACHE_FF_GS,
+};
+
+/** @{
+ *
+ * Defines for PIPE_CONTROL operations, which trigger cache flushes,
+ * synchronization, pipelined memory writes, and so on.
+ *
+ * The bits here are not the actual hardware values. The actual fields
+ * move between various generations, so we just have flags for each
+ * potential operation, and use genxml to encode the actual packet.
+ */
+enum pipe_control_flags
+{
+ PIPE_CONTROL_FLUSH_LLC = (1 << 1),
+ PIPE_CONTROL_LRI_POST_SYNC_OP = (1 << 2),
+ PIPE_CONTROL_STORE_DATA_INDEX = (1 << 3),
+ PIPE_CONTROL_CS_STALL = (1 << 4),
+ PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET = (1 << 5),
+ PIPE_CONTROL_SYNC_GFDT = (1 << 6),
+ PIPE_CONTROL_TLB_INVALIDATE = (1 << 7),
+ PIPE_CONTROL_MEDIA_STATE_CLEAR = (1 << 8),
+ PIPE_CONTROL_WRITE_IMMEDIATE = (1 << 9),
+ PIPE_CONTROL_WRITE_DEPTH_COUNT = (1 << 10),
+ PIPE_CONTROL_WRITE_TIMESTAMP = (1 << 11),
+ PIPE_CONTROL_DEPTH_STALL = (1 << 12),
+ PIPE_CONTROL_RENDER_TARGET_FLUSH = (1 << 13),
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE = (1 << 14),
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE = (1 << 15),
+ PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE = (1 << 16),
+ PIPE_CONTROL_NOTIFY_ENABLE = (1 << 17),
+ PIPE_CONTROL_FLUSH_ENABLE = (1 << 18),
+ PIPE_CONTROL_DATA_CACHE_FLUSH = (1 << 19),
+ PIPE_CONTROL_VF_CACHE_INVALIDATE = (1 << 20),
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE = (1 << 21),
+ PIPE_CONTROL_STATE_CACHE_INVALIDATE = (1 << 22),
+ PIPE_CONTROL_STALL_AT_SCOREBOARD = (1 << 23),
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH = (1 << 24),
+ PIPE_CONTROL_TILE_CACHE_FLUSH = (1 << 25),
+};
+
+#define PIPE_CONTROL_CACHE_FLUSH_BITS \
+ (PIPE_CONTROL_DEPTH_CACHE_FLUSH | \
+ PIPE_CONTROL_DATA_CACHE_FLUSH | \
+ PIPE_CONTROL_RENDER_TARGET_FLUSH)
+
+#define PIPE_CONTROL_CACHE_INVALIDATE_BITS \
+ (PIPE_CONTROL_STATE_CACHE_INVALIDATE | \
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE | \
+ PIPE_CONTROL_VF_CACHE_INVALIDATE | \
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE)
+
+enum crocus_predicate_state {
+ /* The first two states are used if we can determine whether to draw
+ * without having to look at the values in the query object buffer. This
+ * will happen if there is no conditional render in progress, if the query
+ * object is already completed or if something else has already added
+ * samples to the preliminary result.
+ */
+ CROCUS_PREDICATE_STATE_RENDER,
+ CROCUS_PREDICATE_STATE_DONT_RENDER,
+
+ /* In this case whether to draw or not depends on the result of an
+ * MI_PREDICATE command so the predicate enable bit needs to be checked.
+ */
+ CROCUS_PREDICATE_STATE_USE_BIT,
+ /* In this case, either MI_PREDICATE doesn't exist or we lack the
+ * necessary kernel features to use it. Stall for the query result.
+ */
+ CROCUS_PREDICATE_STATE_STALL_FOR_QUERY,
+};
+
+/** @} */
+
+/**
+ * An uncompiled, API-facing shader. This is the Gallium CSO for shaders.
+ * It primarily contains the NIR for the shader.
+ *
+ * Each API-facing shader can be compiled into multiple shader variants,
+ * based on non-orthogonal state dependencies, recorded in the shader key.
+ *
+ * See crocus_compiled_shader, which represents a compiled shader variant.
+ */
+struct crocus_uncompiled_shader {
+ struct nir_shader *nir;
+
+ struct pipe_stream_output_info stream_output;
+
+ /* A SHA1 of the serialized NIR for the disk cache. */
+ unsigned char nir_sha1[20];
+
+ unsigned program_id;
+
+ /** Bitfield of (1 << CROCUS_NOS_*) flags. */
+ unsigned nos;
+
+ /** Have any shader variants been compiled yet? */
+ bool compiled_once;
+
+ /** Should we use ALT mode for math? Useful for ARB programs. */
+ bool use_alt_mode;
+
+ bool needs_edge_flag;
+
+ /** Constant data scraped from the shader by nir_opt_large_constants */
+ struct pipe_resource *const_data;
+
+ /** Surface state for const_data */
+ struct crocus_state_ref const_data_state;
+};
+
+enum crocus_surface_group {
+ CROCUS_SURFACE_GROUP_RENDER_TARGET,
+ CROCUS_SURFACE_GROUP_RENDER_TARGET_READ,
+ CROCUS_SURFACE_GROUP_SOL,
+ CROCUS_SURFACE_GROUP_CS_WORK_GROUPS,
+ CROCUS_SURFACE_GROUP_TEXTURE,
+ CROCUS_SURFACE_GROUP_TEXTURE_GATHER,
+ CROCUS_SURFACE_GROUP_IMAGE,
+ CROCUS_SURFACE_GROUP_UBO,
+ CROCUS_SURFACE_GROUP_SSBO,
+
+ CROCUS_SURFACE_GROUP_COUNT,
+};
+
+enum {
+ /* Invalid value for a binding table index. */
+ CROCUS_SURFACE_NOT_USED = 0xa0a0a0a0,
+};
+
+struct crocus_binding_table {
+ uint32_t size_bytes;
+
+ /** Number of surfaces in each group, before compacting. */
+ uint32_t sizes[CROCUS_SURFACE_GROUP_COUNT];
+
+ /** Initial offset of each group. */
+ uint32_t offsets[CROCUS_SURFACE_GROUP_COUNT];
+
+ /** Mask of surfaces used in each group. */
+ uint64_t used_mask[CROCUS_SURFACE_GROUP_COUNT];
+};
+
+/**
+ * A compiled shader variant, containing a pointer to the GPU assembly,
+ * as well as program data and other packets needed by state upload.
+ *
+ * There can be several crocus_compiled_shader variants per API-level shader
+ * (crocus_uncompiled_shader), due to state-based recompiles (brw_*_prog_key).
+ */
+struct crocus_compiled_shader {
+ /** Reference to the uploaded assembly. */
+ uint32_t offset;
+
+ /* asm size in map */
+ uint32_t map_size;
+
+ /** The program data (owned by the program cache hash table) */
+ struct brw_stage_prog_data *prog_data;
+ uint32_t prog_data_size;
+
+ /** A list of system values to be uploaded as uniforms. */
+ enum brw_param_builtin *system_values;
+ unsigned num_system_values;
+
+ /** Number of constbufs expected by the shader. */
+ unsigned num_cbufs;
+
+ /**
+ * Derived 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets
+ * (the VUE-based information for transform feedback outputs).
+ */
+ uint32_t *streamout;
+
+ struct crocus_binding_table bt;
+
+ uint32_t bind_bo_offset;
+ uint32_t surf_offset[128];//TODO
+};
+
+/**
+ * API context state that is replicated per shader stage.
+ */
+struct crocus_shader_state {
+ /** Uniform Buffers */
+ struct pipe_constant_buffer constbufs[PIPE_MAX_CONSTANT_BUFFERS];
+
+ bool sysvals_need_upload;
+
+ /** Shader Storage Buffers */
+ struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS];
+
+ /** Shader Storage Images (image load store) */
+ struct crocus_image_view image[PIPE_MAX_SHADER_IMAGES];
+
+ struct crocus_sampler_state *samplers[CROCUS_MAX_TEXTURE_SAMPLERS];
+ struct crocus_sampler_view *textures[CROCUS_MAX_TEXTURE_SAMPLERS];
+
+ /** Bitfield of which constant buffers are bound (non-null). */
+ uint32_t bound_cbufs;
+
+ /** Bitfield of which image views are bound (non-null). */
+ uint32_t bound_image_views;
+
+ /** Bitfield of which sampler views are bound (non-null). */
+ uint32_t bound_sampler_views;
+
+ /** Bitfield of which shader storage buffers are bound (non-null). */
+ uint32_t bound_ssbos;
+
+ /** Bitfield of which shader storage buffers are writable. */
+ uint32_t writable_ssbos;
+
+ uint32_t sampler_offset;
+};
+
+/**
+ * The API context (derived from pipe_context).
+ *
+ * Most driver state is tracked here.
+ */
+struct crocus_context {
+ struct pipe_context ctx;
+
+ /** A debug callback for KHR_debug output. */
+ struct pipe_debug_callback dbg;
+
+ /** A device reset status callback for notifying that the GPU is hosed. */
+ struct pipe_device_reset_callback reset;
+
+ /** Slab allocator for crocus_transfer_map objects. */
+ struct slab_child_pool transfer_pool;
+
+ struct blorp_context blorp;
+
+ int batch_count;
+ struct crocus_batch batches[CROCUS_BATCH_COUNT];
+
+ struct u_upload_mgr *query_buffer_uploader;
+
+ struct blitter_context *blitter;
+
+ struct {
+ struct {
+ /**
+ * Either the value of BaseVertex for indexed draw calls or the value
+ * of the argument <first> for non-indexed draw calls.
+ */
+ int firstvertex;
+ int baseinstance;
+ } params;
+
+ /**
+ * Are the above values the ones stored in the draw_params buffer?
+ * If so, we can compare them against new values to see if anything
+ * changed. If not, we need to assume they changed.
+ */
+ bool params_valid;
+
+ /**
+       * Resource and offset that store draw_parameters from the indirect
+       * buffer, or the buffer that stores the previous values for
+       * non-indirect draws.
+ */
+ struct crocus_state_ref draw_params;
+
+ struct {
+ /**
+          * The value of DrawID. This always comes in from its own vertex
+ * buffer since it's not part of the indirect draw parameters.
+ */
+ int drawid;
+
+ /**
+          * Stores whether this is an indexed (~0) or non-indexed (0) draw.
+          * Useful to calculate BaseVertex as an AND of firstvertex and
+          * is_indexed_draw.
+ */
+ int is_indexed_draw;
+ } derived_params;
+
+ /**
+       * Resource and offset used for GL_ARB_shader_draw_parameters, which
+       * contains parameters that are not present in the indirect buffer,
+       * such as drawid and is_indexed_draw. They go in their own vertex
+       * element.
+ */
+ struct crocus_state_ref derived_draw_params;
+ } draw;
+
+ struct {
+ struct crocus_uncompiled_shader *uncompiled[MESA_SHADER_STAGES];
+ struct crocus_compiled_shader *prog[MESA_SHADER_STAGES];
+ struct brw_vue_map *last_vue_map;
+
+ struct crocus_bo *cache_bo;
+ uint32_t cache_next_offset;
+ void *cache_bo_map;
+ struct hash_table *cache;
+
+ unsigned urb_size;
+
+ /* gen 4/5 clip/sf progs */
+ struct crocus_compiled_shader *clip_prog;
+ struct crocus_compiled_shader *sf_prog;
+ /* gen4/5 prims, gen6 streamout */
+ struct crocus_compiled_shader *ff_gs_prog;
+ uint32_t clip_offset;
+ uint32_t sf_offset;
+ uint32_t wm_offset;
+ uint32_t vs_offset;
+ uint32_t gs_offset;
+ uint32_t cc_offset;
+
+ /** Is a GS or TES outputting points or lines? */
+ bool output_topology_is_points_or_lines;
+
+ /* Track last VS URB entry size */
+ unsigned last_vs_entry_size;
+
+ /**
+ * Scratch buffers for various sizes and stages.
+ *
+ * Indexed by the "Per-Thread Scratch Space" field's 4-bit encoding,
+ * and shader stage.
+ */
+ struct crocus_bo *scratch_bos[1 << 4][MESA_SHADER_STAGES];
+ } shaders;
+
+ struct {
+ struct crocus_query *query;
+ bool condition;
+ enum pipe_render_cond_flag mode;
+ } condition;
+
+ struct intel_perf_context *perf_ctx;
+
+ struct {
+ uint64_t dirty;
+ uint64_t stage_dirty;
+ uint64_t stage_dirty_for_nos[CROCUS_NOS_COUNT];
+
+ unsigned num_viewports;
+ unsigned sample_mask;
+ struct crocus_blend_state *cso_blend;
+ struct crocus_rasterizer_state *cso_rast;
+ struct crocus_depth_stencil_alpha_state *cso_zsa;
+ struct crocus_vertex_element_state *cso_vertex_elements;
+ struct pipe_blend_color blend_color;
+ struct pipe_poly_stipple poly_stipple;
+ struct pipe_viewport_state viewports[CROCUS_MAX_VIEWPORTS];
+ struct pipe_scissor_state scissors[CROCUS_MAX_VIEWPORTS];
+ struct pipe_stencil_ref stencil_ref;
+ struct pipe_framebuffer_state framebuffer;
+ struct pipe_clip_state clip_planes;
+
+ float default_outer_level[4];
+ float default_inner_level[2];
+
+ /** Bitfield of which vertex buffers are bound (non-null). */
+ uint32_t bound_vertex_buffers;
+ struct pipe_vertex_buffer vertex_buffers[16];
+ uint32_t vb_end[16];
+
+ bool primitive_restart;
+ unsigned cut_index;
+ enum pipe_prim_type prim_mode:8;
+ bool prim_is_points_or_lines;
+ uint8_t vertices_per_patch;
+
+ bool window_space_position;
+
+ /** The last compute group size */
+ uint32_t last_block[3];
+
+ /** The last compute grid size */
+ uint32_t last_grid[3];
+ /** Reference to the BO containing the compute grid size */
+ struct crocus_state_ref grid_size;
+
+ /**
+ * Array of aux usages for drawing, altered to account for any
+ * self-dependencies from resources bound for sampling and rendering.
+ */
+ enum isl_aux_usage draw_aux_usage[BRW_MAX_DRAW_BUFFERS];
+
+ /** Aux usage of the fb's depth buffer (which may or may not exist). */
+ enum isl_aux_usage hiz_usage;
+
+ /** Bitfield of whether color blending is enabled for RT[i] */
+ uint8_t blend_enables;
+
+ /** Are depth writes enabled? (Depth buffer may or may not exist.) */
+ bool depth_writes_enabled;
+
+ /** Are stencil writes enabled? (Stencil buffer may or may not exist.) */
+ bool stencil_writes_enabled;
+
+ /** GenX-specific current state */
+ struct crocus_genx_state *genx;
+
+ struct crocus_shader_state shaders[MESA_SHADER_STAGES];
+
+      /** Does the vertex shader use shader draw parameters? */
+ bool vs_uses_draw_params;
+ bool vs_uses_derived_draw_params;
+ bool vs_needs_sgvs_element;
+ bool vs_uses_vertexid;
+ bool vs_uses_instanceid;
+
+      /** Does the vertex shader use the edge flag? */
+ bool vs_needs_edge_flag;
+
+ struct pipe_stream_output_target *so_target[PIPE_MAX_SO_BUFFERS];
+ bool streamout_active;
+ int so_targets;
+
+ bool statistics_counters_enabled;
+
+ /** Current conditional rendering mode */
+ enum crocus_predicate_state predicate;
+ bool predicate_supported;
+
+ /**
+ * Query BO with a MI_PREDICATE_RESULT snapshot calculated on the
+ * render context that needs to be uploaded to the compute context.
+ */
+ struct crocus_bo *compute_predicate;
+
+ /** Is a PIPE_QUERY_PRIMITIVES_GENERATED query active? */
+ bool prims_generated_query_active;
+
+ /** 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets */
+ uint32_t *streamout;
+
+ /**
+ * Resources containing streamed state which our render context
+ * currently points to. Used to re-add these to the validation
+ * list when we start a new batch and haven't resubmitted commands.
+ */
+ struct {
+ struct pipe_resource *res;
+ uint32_t offset;
+ uint32_t size;
+ uint32_t index_size;
+ bool prim_restart;
+ } index_buffer;
+
+ uint32_t sf_vp_address;
+ uint32_t clip_vp_address;
+ uint32_t cc_vp_address;
+
+ uint32_t stats_wm;
+ float global_depth_offset_clamp;
+
+ uint32_t last_xfb_verts_per_prim;
+ uint64_t svbi;
+ } state;
+
+ /* BRW_NEW_URB_ALLOCATIONS:
+ */
+ struct {
+ uint32_t vsize; /* vertex size plus header in urb registers */
+ uint32_t gsize; /* GS output size in urb registers */
+ uint32_t hsize; /* Tessellation control output size in urb registers */
+ uint32_t dsize; /* Tessellation evaluation output size in urb registers */
+ uint32_t csize; /* constant buffer size in urb registers */
+ uint32_t sfsize; /* setup data size in urb registers */
+
+ bool constrained;
+
+ uint32_t nr_vs_entries;
+ uint32_t nr_hs_entries;
+ uint32_t nr_ds_entries;
+ uint32_t nr_gs_entries;
+ uint32_t nr_clip_entries;
+ uint32_t nr_sf_entries;
+ uint32_t nr_cs_entries;
+
+ uint32_t vs_start;
+ uint32_t hs_start;
+ uint32_t ds_start;
+ uint32_t gs_start;
+ uint32_t clip_start;
+ uint32_t sf_start;
+ uint32_t cs_start;
+ /**
+ * URB size in the current configuration. The units this is expressed
+ * in are somewhat inconsistent, see intel_device_info::urb::size.
+ *
+ * FINISHME: Represent the URB size consistently in KB on all platforms.
+ */
+ uint32_t size;
+
+ /* True if the most recently sent _3DSTATE_URB message allocated
+ * URB space for the GS.
+ */
+ bool gs_present;
+
+ /* True if the most recently sent _3DSTATE_URB message allocated
+ * URB space for the HS and DS.
+ */
+ bool tess_present;
+ } urb;
+
+ /* GEN4/5 curbe */
+ struct {
+ unsigned wm_start;
+ unsigned wm_size;
+ unsigned clip_start;
+ unsigned clip_size;
+ unsigned vs_start;
+ unsigned vs_size;
+ unsigned total_size;
+
+ struct crocus_resource *curbe_res;
+ unsigned curbe_offset;
+ } curbe;
+
+ /**
+ * A buffer containing a marker + description of the driver. This buffer is
+ * added to all execbufs syscalls so that we can identify the driver that
+ * generated a hang by looking at the content of the buffer in the error
+    * state. It is also used for hardware workarounds that require scratch
+    * writes or reads from some unimportant memory. To avoid overwriting the
+    * debug data, use the workaround_offset field for workarounds.
+ */
+ struct crocus_bo *workaround_bo;
+ unsigned workaround_offset;
+};
+
+#define perf_debug(dbg, ...) do { \
+ if (INTEL_DEBUG & DEBUG_PERF) \
+ dbg_printf(__VA_ARGS__); \
+ if (unlikely(dbg)) \
+ pipe_debug_message(dbg, PERF_INFO, __VA_ARGS__); \
+} while(0)
+
+
+struct pipe_context *
+crocus_create_context(struct pipe_screen *screen, void *priv, unsigned flags);
+
+void crocus_lost_context_state(struct crocus_batch *batch);
+
+void crocus_init_blit_functions(struct pipe_context *ctx);
+void crocus_init_clear_functions(struct pipe_context *ctx);
+void crocus_init_program_functions(struct pipe_context *ctx);
+void crocus_init_resource_functions(struct pipe_context *ctx);
+bool crocus_update_compiled_shaders(struct crocus_context *ice);
+void crocus_update_compiled_compute_shader(struct crocus_context *ice);
+void crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
+ unsigned threads, uint32_t *dst);
+
+
+/* crocus_blit.c */
+enum crocus_blitter_op
+{
+ CROCUS_SAVE_TEXTURES = 1,
+ CROCUS_SAVE_FRAMEBUFFER = 2,
+ CROCUS_SAVE_FRAGMENT_STATE = 4,
+ CROCUS_DISABLE_RENDER_COND = 8,
+};
+void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond);
+
+void crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl,
+ struct isl_device *isl_dev,
+ struct blorp_surf *surf,
+ struct pipe_resource *p_res,
+ enum isl_aux_usage aux_usage,
+ unsigned level,
+ bool is_render_target);
+void crocus_copy_region(struct blorp_context *blorp,
+ struct crocus_batch *batch,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box);
+
+/* crocus_draw.c */
+void crocus_draw_vbo(struct pipe_context *ctx,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws);
+void crocus_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
+
+/* crocus_pipe_control.c */
+
+void crocus_emit_pipe_control_flush(struct crocus_batch *batch,
+ const char *reason, uint32_t flags);
+void crocus_emit_pipe_control_write(struct crocus_batch *batch,
+ const char *reason, uint32_t flags,
+ struct crocus_bo *bo, uint32_t offset,
+ uint64_t imm);
+void crocus_emit_mi_flush(struct crocus_batch *batch);
+void crocus_emit_depth_stall_flushes(struct crocus_batch *batch);
+void crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch);
+void crocus_emit_end_of_pipe_sync(struct crocus_batch *batch,
+ const char *reason, uint32_t flags);
+void crocus_flush_all_caches(struct crocus_batch *batch);
+
+#define crocus_handle_always_flush_cache(batch) \
+ if (unlikely(batch->screen->driconf.always_flush_cache)) \
+ crocus_flush_all_caches(batch);
+
+void crocus_init_flush_functions(struct pipe_context *ctx);
+
+/* crocus_program.c */
+const struct shader_info *crocus_get_shader_info(const struct crocus_context *ice,
+ gl_shader_stage stage);
+struct crocus_bo *crocus_get_scratch_space(struct crocus_context *ice,
+ unsigned per_thread_scratch,
+ gl_shader_stage stage);
+uint32_t crocus_group_index_to_bti(const struct crocus_binding_table *bt,
+ enum crocus_surface_group group,
+ uint32_t index);
+uint32_t crocus_bti_to_group_index(const struct crocus_binding_table *bt,
+ enum crocus_surface_group group,
+ uint32_t bti);
+
+/* crocus_disk_cache.c */
+
+void crocus_disk_cache_store(struct disk_cache *cache,
+ const struct crocus_uncompiled_shader *ish,
+ const struct crocus_compiled_shader *shader,
+ void *map,
+ const void *prog_key,
+ uint32_t prog_key_size);
+struct crocus_compiled_shader *
+crocus_disk_cache_retrieve(struct crocus_context *ice,
+ const struct crocus_uncompiled_shader *ish,
+ const void *prog_key,
+ uint32_t prog_key_size);
+
+/* crocus_program_cache.c */
+
+void crocus_init_program_cache(struct crocus_context *ice);
+void crocus_destroy_program_cache(struct crocus_context *ice);
+void crocus_print_program_cache(struct crocus_context *ice);
+struct crocus_compiled_shader *crocus_find_cached_shader(struct crocus_context *ice,
+ enum crocus_program_cache_id,
+ uint32_t key_size,
+ const void *key);
+struct crocus_compiled_shader *crocus_upload_shader(struct crocus_context *ice,
+ enum crocus_program_cache_id,
+ uint32_t key_size,
+ const void *key,
+ const void *assembly,
+ uint32_t asm_size,
+ struct brw_stage_prog_data *,
+ uint32_t prog_data_size,
+ uint32_t *streamout,
+ enum brw_param_builtin *sysv,
+ unsigned num_system_values,
+ unsigned num_cbufs,
+ const struct crocus_binding_table *bt);
+const void *crocus_find_previous_compile(const struct crocus_context *ice,
+ enum crocus_program_cache_id cache_id,
+ unsigned program_string_id);
+bool crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch,
+ const void *key,
+ uint32_t key_size,
+ uint32_t *kernel_out,
+ void *prog_data_out);
+bool crocus_blorp_upload_shader(struct blorp_batch *blorp_batch,
+ uint32_t stage,
+ const void *key, uint32_t key_size,
+ const void *kernel, uint32_t kernel_size,
+ const struct brw_stage_prog_data *prog_data,
+ uint32_t prog_data_size,
+ uint32_t *kernel_out,
+ void *prog_data_out);
+
+/* crocus_resolve.c */
+
+void crocus_predraw_resolve_inputs(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ bool *draw_aux_buffer_disabled,
+ gl_shader_stage stage,
+ bool consider_framebuffer);
+void crocus_predraw_resolve_framebuffer(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ bool *draw_aux_buffer_disabled);
+void crocus_postdraw_update_resolve_tracking(struct crocus_context *ice,
+ struct crocus_batch *batch);
+void crocus_cache_sets_clear(struct crocus_batch *batch);
+void crocus_flush_depth_and_render_caches(struct crocus_batch *batch);
+void crocus_cache_flush_for_read(struct crocus_batch *batch, struct crocus_bo *bo);
+void crocus_cache_flush_for_render(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ enum isl_format format,
+ enum isl_aux_usage aux_usage);
+void crocus_render_cache_add_bo(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ enum isl_format format,
+ enum isl_aux_usage aux_usage);
+void crocus_cache_flush_for_depth(struct crocus_batch *batch, struct crocus_bo *bo);
+void crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo);
+int crocus_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
+ struct pipe_driver_query_info *info);
+int crocus_get_driver_query_group_info(struct pipe_screen *pscreen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info);
+
+struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ctx);
+
+bool crocus_sw_check_cond_render(struct crocus_context *ice);
+static inline bool crocus_check_conditional_render(struct crocus_context *ice)
+{
+ if (ice->state.predicate == CROCUS_PREDICATE_STATE_STALL_FOR_QUERY)
+ return crocus_sw_check_cond_render(ice);
+ return ice->state.predicate != CROCUS_PREDICATE_STATE_DONT_RENDER;
+}
+
+#ifdef genX
+# include "crocus_genx_protos.h"
+#else
+# define genX(x) gfx4_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx45_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx5_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx6_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx7_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx75_##x
+# include "crocus_genx_protos.h"
+# undef genX
+#endif
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_defines.h b/src/gallium/drivers/crocus/crocus_defines.h
new file mode 100644
index 00000000000..a634d0746b0
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_defines.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_DEFINES_H
+#define CROCUS_DEFINES_H
+
+/**
+ * @file crocus_defines.h
+ *
+ * Random hardware #defines that we're not using GENXML for.
+ */
+
+#define MI_PREDICATE (0xC << 23)
+# define MI_PREDICATE_LOADOP_KEEP (0 << 6)
+# define MI_PREDICATE_LOADOP_LOAD (2 << 6)
+# define MI_PREDICATE_LOADOP_LOADINV (3 << 6)
+# define MI_PREDICATE_COMBINEOP_SET (0 << 3)
+# define MI_PREDICATE_COMBINEOP_AND (1 << 3)
+# define MI_PREDICATE_COMBINEOP_OR (2 << 3)
+# define MI_PREDICATE_COMBINEOP_XOR (3 << 3)
+# define MI_PREDICATE_COMPAREOP_TRUE (0 << 0)
+# define MI_PREDICATE_COMPAREOP_FALSE (1 << 0)
+# define MI_PREDICATE_COMPAREOP_SRCS_EQUAL (2 << 0)
+# define MI_PREDICATE_COMPAREOP_DELTAS_EQUAL (3 << 0)
+
+/* Predicate registers */
+#define MI_PREDICATE_SRC0 0x2400
+#define MI_PREDICATE_SRC1 0x2408
+#define MI_PREDICATE_DATA 0x2410
+#define MI_PREDICATE_RESULT 0x2418
+#define MI_PREDICATE_RESULT_1 0x241C
+#define MI_PREDICATE_RESULT_2 0x2214
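+
+/* Illustrative sketch (not part of the driver itself): a full MI_PREDICATE
+ * command DWord is built by OR'ing one value from each field group above,
+ * for example:
+ *
+ *    uint32_t mi_predicate = MI_PREDICATE |
+ *                            MI_PREDICATE_LOADOP_LOADINV |
+ *                            MI_PREDICATE_COMBINEOP_SET |
+ *                            MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+ *
+ * which asks the hardware to compare MI_PREDICATE_SRC0 and MI_PREDICATE_SRC1
+ * and load the inverted result into MI_PREDICATE_RESULT.
+ */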
+
+#define CS_GPR(n) (0x2600 + (n) * 8)
+
+/* The number of bits in our TIMESTAMP queries. */
+#define TIMESTAMP_BITS 36
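+
+/* Illustrative sketch (not part of the driver itself): a raw timestamp read
+ * is typically masked down to this width before doing arithmetic on it,
+ *
+ *    uint64_t ticks = raw_timestamp & ((1ull << TIMESTAMP_BITS) - 1);
+ *
+ * and with a timestamp frequency of `freq` ticks per second (a placeholder
+ * name here; the real value comes from the device info at runtime), such a
+ * counter wraps roughly every (1ull << TIMESTAMP_BITS) / freq seconds.
+ */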
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_disk_cache.c b/src/gallium/drivers/crocus/crocus_disk_cache.c
new file mode 100644
index 00000000000..c84d043fbc8
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_disk_cache.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_disk_cache.c
+ *
+ * Functions for interacting with the on-disk shader cache.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+
+#include "compiler/nir/nir.h"
+#include "util/blob.h"
+#include "util/build_id.h"
+#include "util/disk_cache.h"
+#include "util/mesa-sha1.h"
+
+#include "crocus_context.h"
+
+static bool debug = false;
+
+/**
+ * Compute a disk cache key for the given uncompiled shader and NOS key.
+ */
+static void
+crocus_disk_cache_compute_key(struct disk_cache *cache,
+ const struct crocus_uncompiled_shader *ish,
+ const void *orig_prog_key,
+ uint32_t prog_key_size,
+ cache_key cache_key)
+{
+ /* Create a copy of the program key with program_string_id zeroed out.
+ * It's essentially random data which we don't want to include in our
+ * hashing and comparisons. We'll set a proper value on a cache hit.
+ */
+ union brw_any_prog_key prog_key;
+ memcpy(&prog_key, orig_prog_key, prog_key_size);
+ prog_key.base.program_string_id = 0;
+
+ uint8_t data[sizeof(prog_key) + sizeof(ish->nir_sha1)];
+ uint32_t data_size = prog_key_size + sizeof(ish->nir_sha1);
+
+ memcpy(data, ish->nir_sha1, sizeof(ish->nir_sha1));
+ memcpy(data + sizeof(ish->nir_sha1), &prog_key, prog_key_size);
+
+ disk_cache_compute_key(cache, data, data_size, cache_key);
+}
+
+/**
+ * Store the given compiled shader in the disk cache.
+ *
+ * This should only be called on newly compiled shaders. No checking is
+ * done to prevent repeated stores of the same shader.
+ */
+void
+crocus_disk_cache_store(struct disk_cache *cache,
+ const struct crocus_uncompiled_shader *ish,
+ const struct crocus_compiled_shader *shader,
+ void *map,
+ const void *prog_key,
+ uint32_t prog_key_size)
+{
+#ifdef ENABLE_SHADER_CACHE
+ if (!cache)
+ return;
+
+ gl_shader_stage stage = ish->nir->info.stage;
+ const struct brw_stage_prog_data *prog_data = shader->prog_data;
+
+ cache_key cache_key;
+ crocus_disk_cache_compute_key(cache, ish, prog_key, prog_key_size, cache_key);
+
+ if (debug) {
+ char sha1[41];
+ _mesa_sha1_format(sha1, cache_key);
+ fprintf(stderr, "[mesa disk cache] storing %s\n", sha1);
+ }
+
+ struct blob blob;
+ blob_init(&blob);
+
+ /* We write the following data to the cache blob:
+ *
+ * 1. Prog data (must come first because it has the assembly size)
+ * 2. Assembly code
+ * 3. Number of entries in the system value array
+ * 4. System value array
+ * 5. Legacy param array (only used for compute workgroup ID)
+ * 6. Binding table
+ */
+ blob_write_bytes(&blob, shader->prog_data, brw_prog_data_size(stage));
+ blob_write_bytes(&blob, map + shader->offset, shader->prog_data->program_size);
+ blob_write_bytes(&blob, &shader->num_system_values, sizeof(unsigned));
+ blob_write_bytes(&blob, shader->system_values,
+ shader->num_system_values * sizeof(enum brw_param_builtin));
+ blob_write_bytes(&blob, prog_data->param,
+ prog_data->nr_params * sizeof(uint32_t));
+ blob_write_bytes(&blob, &shader->bt, sizeof(shader->bt));
+
+ disk_cache_put(cache, cache_key, blob.data, blob.size, NULL);
+ blob_finish(&blob);
+#endif
+}
+
+/**
+ * Search for a compiled shader in the disk cache. If found, upload it
+ * to the in-memory program cache so we can use it.
+ */
+struct crocus_compiled_shader *
+crocus_disk_cache_retrieve(struct crocus_context *ice,
+ const struct crocus_uncompiled_shader *ish,
+ const void *prog_key,
+ uint32_t key_size)
+{
+#ifdef ENABLE_SHADER_CACHE
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ struct disk_cache *cache = screen->disk_cache;
+ gl_shader_stage stage = ish->nir->info.stage;
+
+ if (!cache)
+ return NULL;
+
+ cache_key cache_key;
+ crocus_disk_cache_compute_key(cache, ish, prog_key, key_size, cache_key);
+
+ if (debug) {
+ char sha1[41];
+ _mesa_sha1_format(sha1, cache_key);
+ fprintf(stderr, "[mesa disk cache] retrieving %s: ", sha1);
+ }
+
+ size_t size;
+ void *buffer = disk_cache_get(screen->disk_cache, cache_key, &size);
+
+ if (debug)
+ fprintf(stderr, "%s\n", buffer ? "found" : "missing");
+
+ if (!buffer)
+ return NULL;
+
+ const uint32_t prog_data_size = brw_prog_data_size(stage);
+
+ struct brw_stage_prog_data *prog_data = ralloc_size(NULL, prog_data_size);
+ const void *assembly;
+ uint32_t num_system_values;
+   enum brw_param_builtin *system_values = NULL;
+ uint32_t *so_decls = NULL;
+
+ struct blob_reader blob;
+ blob_reader_init(&blob, buffer, size);
+ blob_copy_bytes(&blob, prog_data, prog_data_size);
+ assembly = blob_read_bytes(&blob, prog_data->program_size);
+ num_system_values = blob_read_uint32(&blob);
+ if (num_system_values) {
+ system_values =
+ ralloc_array(NULL, enum brw_param_builtin, num_system_values);
+ blob_copy_bytes(&blob, system_values,
+ num_system_values * sizeof(enum brw_param_builtin));
+ }
+
+ prog_data->param = NULL;
+ prog_data->pull_param = NULL;
+ assert(prog_data->nr_pull_params == 0);
+
+ if (prog_data->nr_params) {
+ prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
+ blob_copy_bytes(&blob, prog_data->param,
+ prog_data->nr_params * sizeof(uint32_t));
+ }
+
+ struct crocus_binding_table bt;
+ blob_copy_bytes(&blob, &bt, sizeof(bt));
+
+ if ((stage == MESA_SHADER_VERTEX ||
+ stage == MESA_SHADER_TESS_EVAL ||
+ stage == MESA_SHADER_GEOMETRY) && screen->devinfo.ver > 6) {
+ struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+ so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+ &vue_prog_data->vue_map);
+ }
+
+   /* System values and uniforms are stored in constant buffer 0, and the
+    * user-facing UBOs start at index one. So if any constant buffer is
+    * needed, constant buffer 0 will be needed as well, so account for it.
+ */
+ unsigned num_cbufs = ish->nir->info.num_ubos;
+
+ if (num_cbufs || ish->nir->num_uniforms)
+ num_cbufs++;
+
+ if (num_system_values)
+ num_cbufs++;
+
+ /* Upload our newly read shader to the in-memory program cache and
+ * return it to the caller.
+ */
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, stage, key_size, prog_key, assembly,
+ prog_data->program_size,
+ prog_data, prog_data_size, so_decls, system_values,
+ num_system_values, num_cbufs, &bt);
+
+ free(buffer);
+
+ return shader;
+#else
+ return NULL;
+#endif
+}
+
+/**
+ * Initialize the on-disk shader cache.
+ */
+void
+crocus_disk_cache_init(struct crocus_screen *screen)
+{
+#ifdef ENABLE_SHADER_CACHE
+ if (INTEL_DEBUG & DEBUG_DISK_CACHE_DISABLE_MASK)
+ return;
+
+ /* array length = print length + nul char + 1 extra to verify it's unused */
+ char renderer[13];
+ UNUSED int len =
+ snprintf(renderer, sizeof(renderer), "crocus_%04x", screen->pci_id);
+ assert(len == sizeof(renderer) - 2);
+
+ const struct build_id_note *note =
+ build_id_find_nhdr_for_addr(crocus_disk_cache_init);
+ assert(note && build_id_length(note) == 20); /* sha1 */
+
+ const uint8_t *id_sha1 = build_id_data(note);
+ assert(id_sha1);
+
+ char timestamp[41];
+ _mesa_sha1_format(timestamp, id_sha1);
+
+ const uint64_t driver_flags =
+ brw_get_compiler_config_value(screen->compiler);
+ screen->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
+#endif
+}
diff --git a/src/gallium/drivers/crocus/crocus_draw.c b/src/gallium/drivers/crocus/crocus_draw.c
new file mode 100644
index 00000000000..119c5571ae1
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_draw.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_draw.c
+ *
+ * The main driver hooks for drawing and launching compute shaders.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_draw.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+#include "util/u_upload_mgr.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/compiler/brw_eu_defines.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "util/u_prim_restart.h"
+#include "indices/u_primconvert.h"
+#include "util/u_prim.h"
+
+static bool
+prim_is_points_or_lines(enum pipe_prim_type mode)
+{
+ /* We don't need to worry about adjacency - it can only be used with
+ * geometry shaders, and we don't care about this info when GS is on.
+ */
+ return mode == PIPE_PRIM_POINTS ||
+ mode == PIPE_PRIM_LINES ||
+ mode == PIPE_PRIM_LINE_LOOP ||
+ mode == PIPE_PRIM_LINE_STRIP;
+}
+
+static bool
+can_cut_index_handle_restart_index(struct crocus_context *ice,
+ const struct pipe_draw_info *draw)
+{
+ switch (draw->index_size) {
+ case 1:
+ return draw->restart_index == 0xff;
+ case 2:
+ return draw->restart_index == 0xffff;
+ case 4:
+ return draw->restart_index == 0xffffffff;
+ default:
+ unreachable("illegal index size\n");
+ }
+
+ return false;
+}
+
+static bool
+can_cut_index_handle_prim(struct crocus_context *ice,
+ const struct pipe_draw_info *draw)
+{
+ struct crocus_screen *screen = (struct crocus_screen*)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ /* Haswell can do it all. */
+ if (devinfo->is_haswell)
+ return true;
+
+ if (!can_cut_index_handle_restart_index(ice, draw))
+ return false;
+
+ switch (draw->mode) {
+ case PIPE_PRIM_POINTS:
+ case PIPE_PRIM_LINES:
+ case PIPE_PRIM_LINE_STRIP:
+ case PIPE_PRIM_TRIANGLES:
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ case PIPE_PRIM_LINES_ADJACENCY:
+ case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+ case PIPE_PRIM_TRIANGLES_ADJACENCY:
+ case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+/**
+ * Record the current primitive mode and restart information, flagging
+ * related packets as dirty if necessary.
+ *
+ * This must be called before updating compiled shaders, because the patch
+ * information informs the TCS key.
+ */
+static void
+crocus_update_draw_info(struct crocus_context *ice,
+ const struct pipe_draw_info *info,
+ const struct pipe_draw_start_count_bias *draw)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ enum pipe_prim_type mode = info->mode;
+
+ if (screen->devinfo.ver < 6) {
+      /* Slight optimization to avoid the GS program when not needed. */
+ struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+ if (mode == PIPE_PRIM_QUAD_STRIP && !rs_state->flatshade &&
+ rs_state->fill_front == PIPE_POLYGON_MODE_FILL &&
+ rs_state->fill_back == PIPE_POLYGON_MODE_FILL)
+ mode = PIPE_PRIM_TRIANGLE_STRIP;
+ if (mode == PIPE_PRIM_QUADS &&
+ draw->count == 4 &&
+ !rs_state->flatshade &&
+ rs_state->fill_front == PIPE_POLYGON_MODE_FILL &&
+ rs_state->fill_back == PIPE_POLYGON_MODE_FILL)
+ mode = PIPE_PRIM_TRIANGLE_FAN;
+ }
+
+ if (ice->state.prim_mode != mode) {
+ ice->state.prim_mode = mode;
+
+ if (screen->devinfo.ver < 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+ if (screen->devinfo.ver <= 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+
+ if (screen->devinfo.ver >= 7)
+ ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+
+ /* For XY Clip enables */
+ bool points_or_lines = prim_is_points_or_lines(mode);
+ if (points_or_lines != ice->state.prim_is_points_or_lines) {
+ ice->state.prim_is_points_or_lines = points_or_lines;
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ }
+ }
+
+ if (info->mode == PIPE_PRIM_PATCHES &&
+ ice->state.vertices_per_patch != info->vertices_per_patch) {
+ ice->state.vertices_per_patch = info->vertices_per_patch;
+
+ /* This is needed for key->input_vertices */
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_TCS;
+
+ /* Flag constants dirty for gl_PatchVerticesIn if needed. */
+ const struct shader_info *tcs_info =
+ crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
+ if (tcs_info &&
+ BITSET_TEST(tcs_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+ ice->state.shaders[MESA_SHADER_TESS_CTRL].sysvals_need_upload = true;
+ }
+ }
+
+ const unsigned cut_index = info->primitive_restart ? info->restart_index :
+ ice->state.cut_index;
+ if (ice->state.primitive_restart != info->primitive_restart ||
+ ice->state.cut_index != cut_index) {
+ if (screen->devinfo.is_haswell)
+ ice->state.dirty |= CROCUS_DIRTY_GEN75_VF;
+ ice->state.primitive_restart = info->primitive_restart;
+ ice->state.cut_index = info->restart_index;
+ }
+}
+
+/**
+ * Update shader draw parameters, flagging VF packets as dirty if necessary.
+ */
+static void
+crocus_update_draw_parameters(struct crocus_context *ice,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draw)
+{
+ bool changed = false;
+
+ if (ice->state.vs_uses_draw_params) {
+ struct crocus_state_ref *draw_params = &ice->draw.draw_params;
+
+ if (indirect && indirect->buffer) {
+ pipe_resource_reference(&draw_params->res, indirect->buffer);
+ draw_params->offset =
+ indirect->offset + (info->index_size ? 12 : 8);
+
+ changed = true;
+ ice->draw.params_valid = false;
+ } else {
+ int firstvertex = info->index_size ? draw->index_bias : draw->start;
+
+ if (!ice->draw.params_valid ||
+ ice->draw.params.firstvertex != firstvertex ||
+ ice->draw.params.baseinstance != info->start_instance) {
+
+ changed = true;
+ ice->draw.params.firstvertex = firstvertex;
+ ice->draw.params.baseinstance = info->start_instance;
+ ice->draw.params_valid = true;
+
+ u_upload_data(ice->ctx.stream_uploader, 0,
+ sizeof(ice->draw.params), 4, &ice->draw.params,
+ &draw_params->offset, &draw_params->res);
+ }
+ }
+ }
+
+ if (ice->state.vs_uses_derived_draw_params) {
+ struct crocus_state_ref *derived_params = &ice->draw.derived_draw_params;
+ int is_indexed_draw = info->index_size ? -1 : 0;
+
+ if (ice->draw.derived_params.drawid != drawid_offset ||
+ ice->draw.derived_params.is_indexed_draw != is_indexed_draw) {
+
+ changed = true;
+ ice->draw.derived_params.drawid = drawid_offset;
+ ice->draw.derived_params.is_indexed_draw = is_indexed_draw;
+
+ u_upload_data(ice->ctx.stream_uploader, 0,
+ sizeof(ice->draw.derived_params), 4,
+ &ice->draw.derived_params, &derived_params->offset,
+ &derived_params->res);
+ }
+ }
+
+ if (changed) {
+ ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS |
+ CROCUS_DIRTY_VERTEX_ELEMENTS;
+ }
+}
+
+static void
+crocus_indirect_draw_vbo(struct crocus_context *ice,
+ const struct pipe_draw_info *dinfo,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *dindirect,
+ const struct pipe_draw_start_count_bias *draws)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ struct pipe_draw_info info = *dinfo;
+ struct pipe_draw_indirect_info indirect = *dindirect;
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ if (devinfo->is_haswell && indirect.indirect_draw_count &&
+ ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+      /* Upload MI_PREDICATE_RESULT to GPR15. */
+ screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT);
+ }
+
+ uint64_t orig_dirty = ice->state.dirty;
+ uint64_t orig_stage_dirty = ice->state.stage_dirty;
+
+ for (int i = 0; i < indirect.draw_count; i++) {
+ crocus_batch_maybe_flush(batch, 1500);
+ crocus_require_statebuffer_space(batch, 2400);
+
+ crocus_update_draw_parameters(ice, &info, drawid_offset + i, &indirect, draws);
+
+ screen->vtbl.upload_render_state(ice, batch, &info, drawid_offset + i, &indirect, draws);
+
+ ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER;
+ ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+
+ indirect.offset += indirect.stride;
+ }
+
+ if (devinfo->is_haswell && indirect.indirect_draw_count &&
+ ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+ /* Restore MI_PREDICATE_RESULT. */
+ screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15));
+ }
+
+ /* Put this back for post-draw resolves, we'll clear it again after. */
+ ice->state.dirty = orig_dirty;
+ ice->state.stage_dirty = orig_stage_dirty;
+}
+
+static void
+crocus_simple_draw_vbo(struct crocus_context *ice,
+ const struct pipe_draw_info *draw,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *sc)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+
+ crocus_batch_maybe_flush(batch, 1500);
+ crocus_require_statebuffer_space(batch, 2400);
+
+ crocus_update_draw_parameters(ice, draw, drawid_offset, indirect, sc);
+
+ screen->vtbl.upload_render_state(ice, batch, draw, drawid_offset, indirect, sc);
+}
+
+static void
+crocus_draw_vbo_get_vertex_count(struct pipe_context *ctx,
+ const struct pipe_draw_info *info_in,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ struct pipe_draw_info info = *info_in;
+ struct pipe_draw_start_count_bias draw;
+
+ uint32_t val = screen->vtbl.get_so_offset(indirect->count_from_stream_output);
+
+ draw.start = 0;
+ draw.count = val;
+ ctx->draw_vbo(ctx, &info, drawid_offset, NULL, &draw, 1);
+}
+
+/**
+ * The pipe->draw_vbo() driver hook. Performs a draw on the GPU.
+ */
+void
+crocus_draw_vbo(struct pipe_context *ctx,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws)
+{
+ if (num_draws > 1) {
+ util_draw_multi(ctx, info, drawid_offset, indirect, draws, num_draws);
+ return;
+ }
+
+ if (!indirect && (!draws[0].count || !info->instance_count))
+ return;
+
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_screen *screen = (struct crocus_screen*)ice->ctx.screen;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (info->primitive_restart && !can_cut_index_handle_prim(ice, info)) {
+ util_draw_vbo_without_prim_restart(ctx, info, drawid_offset,
+ indirect, draws);
+ return;
+ }
+
+ if (indirect && indirect->count_from_stream_output &&
+ !screen->devinfo.is_haswell) {
+ crocus_draw_vbo_get_vertex_count(ctx, info, drawid_offset, indirect);
+ return;
+ }
+
+   /* The hardware is capable of removing dangling vertices on its own; however,
+    * prior to Gen6, we sometimes convert quads into trifans (and quad strips
+    * into tristrips), since pre-Gen6 hardware requires a GS to render quads.
+    * Manually trim dangling vertices from a draw call involving quads here,
+    * so that those dangling vertices won't get drawn when we convert to
+    * trifans/tristrips.
+    */
+ if (screen->devinfo.ver < 6) {
+ if (info->mode == PIPE_PRIM_QUADS || info->mode == PIPE_PRIM_QUAD_STRIP) {
+ bool trim = u_trim_pipe_prim(info->mode, (unsigned *)&draws[0].count);
+ if (!trim)
+ return;
+ }
+ }
+
+ /* We can't safely re-emit 3DSTATE_SO_BUFFERS because it may zero the
+ * write offsets, changing the behavior.
+ */
+ if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) {
+ ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER & ~CROCUS_DIRTY_GEN7_SO_BUFFERS;
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+ }
+
+ /* Emit Sandybridge workaround flushes on every primitive, for safety. */
+ if (screen->devinfo.ver == 6)
+ crocus_emit_post_sync_nonzero_flush(batch);
+
+ crocus_update_draw_info(ice, info, draws);
+
+ if (!crocus_update_compiled_shaders(ice))
+ return;
+
+ if (ice->state.dirty & CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES) {
+ bool draw_aux_buffer_disabled[BRW_MAX_DRAW_BUFFERS] = { };
+ for (gl_shader_stage stage = 0; stage < MESA_SHADER_COMPUTE; stage++) {
+ if (ice->shaders.prog[stage])
+ crocus_predraw_resolve_inputs(ice, batch, draw_aux_buffer_disabled,
+ stage, true);
+ }
+ crocus_predraw_resolve_framebuffer(ice, batch, draw_aux_buffer_disabled);
+ }
+
+ crocus_handle_always_flush_cache(batch);
+
+ if (indirect && indirect->buffer)
+ crocus_indirect_draw_vbo(ice, info, drawid_offset, indirect, draws);
+ else
+ crocus_simple_draw_vbo(ice, info, drawid_offset, indirect, draws);
+
+ crocus_handle_always_flush_cache(batch);
+
+ crocus_postdraw_update_resolve_tracking(ice, batch);
+
+ ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER;
+ ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+}
+
+static void
+crocus_update_grid_size_resource(struct crocus_context *ice,
+ const struct pipe_grid_info *grid)
+{
+ struct crocus_state_ref *grid_ref = &ice->state.grid_size;
+ const struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_COMPUTE];
+ bool grid_needs_surface = shader->bt.used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS];
+
+ if (grid->indirect) {
+ pipe_resource_reference(&grid_ref->res, grid->indirect);
+ grid_ref->offset = grid->indirect_offset;
+
+ /* Zero out the grid size so that the next non-indirect grid launch will
+ * re-upload it properly.
+ */
+ memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
+ } else if (memcmp(ice->state.last_grid, grid->grid, sizeof(grid->grid)) != 0) {
+ memcpy(ice->state.last_grid, grid->grid, sizeof(grid->grid));
+ u_upload_data(ice->ctx.const_uploader, 0, sizeof(grid->grid), 4,
+ grid->grid, &grid_ref->offset, &grid_ref->res);
+ }
+
+ /* Skip surface upload if we don't need it or we already have one */
+ if (!grid_needs_surface)
+ return;
+
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_CS;
+}
+
+
+void
+crocus_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_COMPUTE];
+ struct crocus_screen *screen = batch->screen;
+
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) {
+ ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
+ }
+
+ /* We can't do resolves on the compute engine, so awkwardly, we have to
+ * do them on the render batch...
+ */
+ if (ice->state.dirty & CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES) {
+ crocus_predraw_resolve_inputs(ice, &ice->batches[CROCUS_BATCH_RENDER], NULL,
+ MESA_SHADER_COMPUTE, false);
+ }
+
+ crocus_batch_maybe_flush(batch, 1500);
+ crocus_require_statebuffer_space(batch, 2500);
+ crocus_update_compiled_compute_shader(ice);
+
+ if (memcmp(ice->state.last_block, grid->block, sizeof(grid->block)) != 0) {
+ memcpy(ice->state.last_block, grid->block, sizeof(grid->block));
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
+ ice->state.shaders[MESA_SHADER_COMPUTE].sysvals_need_upload = true;
+ }
+
+ crocus_update_grid_size_resource(ice, grid);
+
+ if (ice->state.compute_predicate) {
+ screen->vtbl.emit_compute_predicate(batch);
+ ice->state.compute_predicate = NULL;
+ }
+
+ crocus_handle_always_flush_cache(batch);
+
+ screen->vtbl.upload_compute_state(ice, batch, grid);
+
+ crocus_handle_always_flush_cache(batch);
+
+ ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_COMPUTE;
+ ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
+
+ /* Note: since compute shaders can't access the framebuffer, there's
+ * no need to call crocus_postdraw_update_resolve_tracking.
+ */
+}
diff --git a/src/gallium/drivers/crocus/crocus_fence.c b/src/gallium/drivers/crocus/crocus_fence.c
new file mode 100644
index 00000000000..fdff24b2dd4
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fence.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_fence.c
+ *
+ * Fences for driver and IPC serialisation, scheduling and synchronisation.
+ */
+
+#include "util/u_inlines.h"
+#include "intel/common/intel_gem.h"
+
+#include "crocus_batch.h"
+#include "crocus_bufmgr.h"
+#include "crocus_context.h"
+#include "crocus_fence.h"
+#include "crocus_screen.h"
+
+static uint32_t
+gem_syncobj_create(int fd, uint32_t flags)
+{
+ struct drm_syncobj_create args = {
+ .flags = flags,
+ };
+
+ intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_CREATE, &args);
+
+ return args.handle;
+}
+
+static void
+gem_syncobj_destroy(int fd, uint32_t handle)
+{
+ struct drm_syncobj_destroy args = {
+ .handle = handle,
+ };
+
+ intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_DESTROY, &args);
+}
+
+/**
+ * Make a new sync-point.
+ */
+struct crocus_syncobj *
+crocus_create_syncobj(struct crocus_screen *screen)
+{
+ struct crocus_syncobj *syncobj = malloc(sizeof(*syncobj));
+
+ if (!syncobj)
+ return NULL;
+
+ syncobj->handle = gem_syncobj_create(screen->fd, 0);
+ assert(syncobj->handle);
+
+ pipe_reference_init(&syncobj->ref, 1);
+
+ return syncobj;
+}
+
+void
+crocus_syncobj_destroy(struct crocus_screen *screen,
+ struct crocus_syncobj *syncobj)
+{
+ gem_syncobj_destroy(screen->fd, syncobj->handle);
+ free(syncobj);
+}
+
+/**
+ * Add a sync-point to the batch, with the given flags.
+ *
+ * \p flags One of I915_EXEC_FENCE_WAIT or I915_EXEC_FENCE_SIGNAL.
+ */
+void
+crocus_batch_add_syncobj(struct crocus_batch *batch,
+ struct crocus_syncobj *syncobj, unsigned flags)
+{
+ struct drm_i915_gem_exec_fence *fence =
+ util_dynarray_grow(&batch->exec_fences, struct drm_i915_gem_exec_fence, 1);
+
+ *fence = (struct drm_i915_gem_exec_fence){
+ .handle = syncobj->handle,
+ .flags = flags,
+ };
+
+ struct crocus_syncobj **store =
+ util_dynarray_grow(&batch->syncobjs, struct crocus_syncobj *, 1);
+
+ *store = NULL;
+ crocus_syncobj_reference(batch->screen, store, syncobj);
+}
+
+/**
+ * Walk through a batch's dependencies (any I915_EXEC_FENCE_WAIT syncobjs)
+ * and unreference any which have already passed.
+ *
+ * The compute batch is often seldom used, and can accumulate references
+ * to stale render batches that are no longer of interest, so we can free
+ * those up.
+ */
+static void
+clear_stale_syncobjs(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ int n = util_dynarray_num_elements(&batch->syncobjs, struct crocus_syncobj *);
+
+ assert(n == util_dynarray_num_elements(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence));
+
+ /* Skip the first syncobj, as it's the signalling one. */
+ for (int i = n - 1; i > 1; i--) {
+ struct crocus_syncobj **syncobj =
+ util_dynarray_element(&batch->syncobjs, struct crocus_syncobj *, i);
+ struct drm_i915_gem_exec_fence *fence =
+ util_dynarray_element(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence, i);
+ assert(fence->flags & I915_EXEC_FENCE_WAIT);
+
+ if (crocus_wait_syncobj(&screen->base, *syncobj, 0))
+ continue;
+
+ /* This sync object has already passed, there's no need to continue
+ * marking it as a dependency; we can stop holding on to the reference.
+ */
+ crocus_syncobj_reference(screen, syncobj, NULL);
+
+ /* Remove it from the lists; move the last element here. */
+ struct crocus_syncobj **nth_syncobj =
+ util_dynarray_pop_ptr(&batch->syncobjs, struct crocus_syncobj *);
+ struct drm_i915_gem_exec_fence *nth_fence =
+ util_dynarray_pop_ptr(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence);
+
+ if (syncobj != nth_syncobj) {
+ *syncobj = *nth_syncobj;
+ memcpy(fence, nth_fence, sizeof(*fence));
+ }
+ }
+}
+
+/* ------------------------------------------------------------------- */
+
+struct pipe_fence_handle {
+ struct pipe_reference ref;
+
+ struct pipe_context *unflushed_ctx;
+
+ struct crocus_fine_fence *fine[CROCUS_BATCH_COUNT];
+};
+
+static void
+crocus_fence_destroy(struct pipe_screen *p_screen,
+ struct pipe_fence_handle *fence)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++)
+ crocus_fine_fence_reference(screen, &fence->fine[i], NULL);
+
+ free(fence);
+}
+
+static void
+crocus_fence_reference(struct pipe_screen *p_screen,
+ struct pipe_fence_handle **dst,
+ struct pipe_fence_handle *src)
+{
+ if (pipe_reference(&(*dst)->ref, &src->ref))
+ crocus_fence_destroy(p_screen, *dst);
+
+ *dst = src;
+}
+
+bool
+crocus_wait_syncobj(struct pipe_screen *p_screen,
+ struct crocus_syncobj *syncobj, int64_t timeout_nsec)
+{
+ if (!syncobj)
+ return false;
+
+ struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+ struct drm_syncobj_wait args = {
+ .handles = (uintptr_t)&syncobj->handle,
+ .count_handles = 1,
+ .timeout_nsec = timeout_nsec,
+ };
+ return intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args);
+}
+
+static void
+crocus_fence_flush(struct pipe_context *ctx,
+ struct pipe_fence_handle **out_fence, unsigned flags)
+{
+ struct crocus_screen *screen = (void *)ctx->screen;
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ const bool deferred = flags & PIPE_FLUSH_DEFERRED;
+
+ if (!deferred) {
+ for (unsigned i = 0; i < ice->batch_count; i++)
+ crocus_batch_flush(&ice->batches[i]);
+ }
+
+ if (!out_fence)
+ return;
+
+ struct pipe_fence_handle *fence = calloc(1, sizeof(*fence));
+ if (!fence)
+ return;
+
+ pipe_reference_init(&fence->ref, 1);
+
+ if (deferred)
+ fence->unflushed_ctx = ctx;
+
+ for (unsigned b = 0; b < ice->batch_count; b++) {
+ struct crocus_batch *batch = &ice->batches[b];
+
+ if (deferred && crocus_batch_bytes_used(batch) > 0) {
+ struct crocus_fine_fence *fine =
+ crocus_fine_fence_new(batch, CROCUS_FENCE_BOTTOM_OF_PIPE);
+ crocus_fine_fence_reference(screen, &fence->fine[b], fine);
+ crocus_fine_fence_reference(screen, &fine, NULL);
+ } else {
+ /* This batch has no commands queued up (perhaps we just flushed,
+ * or all the commands are on the other batch). Wait for the last
+ * syncobj on this engine - unless it's already finished by now.
+ */
+ if (crocus_fine_fence_signaled(batch->last_fence))
+ continue;
+
+ crocus_fine_fence_reference(screen, &fence->fine[b],
+ batch->last_fence);
+ }
+ }
+
+ crocus_fence_reference(ctx->screen, out_fence, NULL);
+ *out_fence = fence;
+}
+
+static void
+crocus_fence_await(struct pipe_context *ctx, struct pipe_fence_handle *fence)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ /* Unflushed fences from the same context are no-ops. */
+ if (ctx && ctx == fence->unflushed_ctx)
+ return;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ for (unsigned b = 0; b < ice->batch_count; b++) {
+ struct crocus_batch *batch = &ice->batches[b];
+
+ /* We're going to make any future work in this batch wait for our
+ * fence to have gone by. But any currently queued work doesn't
+ * need to wait. Flush the batch now, so it can happen sooner.
+ */
+ crocus_batch_flush(batch);
+
+ /* Before adding a new reference, clean out any stale ones. */
+ clear_stale_syncobjs(batch);
+
+ crocus_batch_add_syncobj(batch, fine->syncobj, I915_EXEC_FENCE_WAIT);
+ }
+ }
+}
+
+#define NSEC_PER_SEC (1000 * USEC_PER_SEC)
+#define USEC_PER_SEC (1000 * MSEC_PER_SEC)
+#define MSEC_PER_SEC (1000)
+
+static uint64_t
+gettime_ns(void)
+{
+ struct timespec current;
+ clock_gettime(CLOCK_MONOTONIC, &current);
+ return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
+}
+
+static uint64_t
+rel2abs(uint64_t timeout)
+{
+ if (timeout == 0)
+ return 0;
+
+ uint64_t current_time = gettime_ns();
+ uint64_t max_timeout = (uint64_t)INT64_MAX - current_time;
+
+ timeout = MIN2(max_timeout, timeout);
+
+ return current_time + timeout;
+}
+
+static bool
+crocus_fence_finish(struct pipe_screen *p_screen, struct pipe_context *ctx,
+ struct pipe_fence_handle *fence, uint64_t timeout)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+
+ /* If we created the fence with PIPE_FLUSH_DEFERRED, we may not have
+ * flushed yet. Check if our syncobj is the current batch's signalling
+ * syncobj - if so, we haven't flushed and need to now.
+ *
+ * The Gallium docs mention that a flush will occur if \p ctx matches
+ * the context the fence was created with. It may be NULL, so we check
+ * that it matches first.
+ */
+ if (ctx && ctx == fence->unflushed_ctx) {
+ for (unsigned i = 0; i < ice->batch_count; i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ if (fine->syncobj == crocus_batch_get_signal_syncobj(&ice->batches[i]))
+ crocus_batch_flush(&ice->batches[i]);
+ }
+
+ /* The fence is no longer deferred. */
+ fence->unflushed_ctx = NULL;
+ }
+
+ unsigned int handle_count = 0;
+ uint32_t handles[ARRAY_SIZE(fence->fine)];
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ handles[handle_count++] = fine->syncobj->handle;
+ }
+
+ if (handle_count == 0)
+ return true;
+
+ struct drm_syncobj_wait args = {
+ .handles = (uintptr_t)handles,
+ .count_handles = handle_count,
+ .timeout_nsec = rel2abs(timeout),
+ .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL
+ };
+ if (fence->unflushed_ctx) {
+ /* This fence had a deferred flush from another context. We can't
+ * safely flush it here, because the context might be bound to a
+ * different thread, and poking at its internals wouldn't be safe.
+ *
+ * Instead, use the WAIT_FOR_SUBMIT flag to block and hope that
+ * another thread submits the work.
+ */
+ args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
+ }
+ return intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args) == 0;
+}
+
+#ifndef SYNC_IOC_MAGIC
+/* Duplicated from linux/sync_file.h to avoid a build-time dependency
+ * on new (v4.7) kernel headers. Once distros are mostly using
+ * something newer than v4.7, drop this and #include <linux/sync_file.h>
+ * instead.
+ */
+struct sync_merge_data {
+ char name[32];
+ __s32 fd2;
+ __s32 fence;
+ __u32 flags;
+ __u32 pad;
+};
+
+#define SYNC_IOC_MAGIC '>'
+#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data)
+#endif
+
+static int
+sync_merge_fd(int sync_fd, int new_fd)
+{
+ if (sync_fd == -1)
+ return new_fd;
+
+ if (new_fd == -1)
+ return sync_fd;
+
+ struct sync_merge_data args = {
+ .name = "crocus fence",
+ .fd2 = new_fd,
+ .fence = -1,
+ };
+
+ intel_ioctl(sync_fd, SYNC_IOC_MERGE, &args);
+ close(new_fd);
+ close(sync_fd);
+
+ return args.fence;
+}
+
+static int
+crocus_fence_get_fd(struct pipe_screen *p_screen,
+ struct pipe_fence_handle *fence)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+ int fd = -1;
+
+ /* Deferred fences aren't supported. */
+ if (fence->unflushed_ctx)
+ return -1;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ struct drm_syncobj_handle args = {
+ .handle = fine->syncobj->handle,
+ .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE,
+ .fd = -1,
+ };
+
+ intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args);
+ fd = sync_merge_fd(fd, args.fd);
+ }
+
+ if (fd == -1) {
+      /* Our fence has no syncobjs recorded. This means that all of the
+       * batches had already completed, their syncobjs had been signalled,
+ * and so we didn't bother to record them. But we're being asked to
+ * export such a fence. So export a dummy already-signalled syncobj.
+ */
+ struct drm_syncobj_handle args = {
+ .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE,
+ .fd = -1,
+ };
+
+ args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED);
+ intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args);
+ gem_syncobj_destroy(screen->fd, args.handle);
+ return args.fd;
+ }
+
+ return fd;
+}
+
+static void
+crocus_fence_create_fd(struct pipe_context *ctx, struct pipe_fence_handle **out,
+ int fd, enum pipe_fd_type type)
+{
+ assert(type == PIPE_FD_TYPE_NATIVE_SYNC || type == PIPE_FD_TYPE_SYNCOBJ);
+
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ struct drm_syncobj_handle args = {
+ .fd = fd,
+ };
+
+ if (type == PIPE_FD_TYPE_NATIVE_SYNC) {
+ args.flags = DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE;
+ args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED);
+ }
+
+ if (intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args) == -1) {
+ fprintf(stderr, "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE failed: %s\n",
+ strerror(errno));
+ if (type == PIPE_FD_TYPE_NATIVE_SYNC)
+ gem_syncobj_destroy(screen->fd, args.handle);
+ *out = NULL;
+ return;
+ }
+
+ struct crocus_syncobj *syncobj = malloc(sizeof(*syncobj));
+ if (!syncobj) {
+ *out = NULL;
+ return;
+ }
+ syncobj->handle = args.handle;
+ pipe_reference_init(&syncobj->ref, 1);
+
+ struct crocus_fine_fence *fine = calloc(1, sizeof(*fine));
+ if (!fine) {
+ free(syncobj);
+ *out = NULL;
+ return;
+ }
+
+ static const uint32_t zero = 0;
+
+ /* Fences work in terms of crocus_fine_fence, but we don't actually have a
+ * seqno for an imported fence. So, create a fake one which always
+ * returns as 'not signaled' so we fall back to using the sync object.
+ */
+ fine->seqno = UINT32_MAX;
+ fine->map = &zero;
+ fine->syncobj = syncobj;
+ fine->flags = CROCUS_FENCE_END;
+ pipe_reference_init(&fine->reference, 1);
+
+ struct pipe_fence_handle *fence = calloc(1, sizeof(*fence));
+ if (!fence) {
+ free(fine);
+ free(syncobj);
+ *out = NULL;
+ return;
+ }
+ pipe_reference_init(&fence->ref, 1);
+ fence->fine[0] = fine;
+
+ *out = fence;
+}
+
+static void
+crocus_fence_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ if (ctx == fence->unflushed_ctx)
+ return;
+
+ for (unsigned b = 0; b < ice->batch_count; b++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+ /* already signaled fence skipped */
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ ice->batches[b].contains_fence_signal = true;
+ crocus_batch_add_syncobj(&ice->batches[b], fine->syncobj,
+ I915_EXEC_FENCE_SIGNAL);
+ }
+ }
+}
+
+void
+crocus_init_screen_fence_functions(struct pipe_screen *screen)
+{
+ screen->fence_reference = crocus_fence_reference;
+ screen->fence_finish = crocus_fence_finish;
+ screen->fence_get_fd = crocus_fence_get_fd;
+}
+
+void
+crocus_init_context_fence_functions(struct pipe_context *ctx)
+{
+ ctx->flush = crocus_fence_flush;
+ ctx->create_fence_fd = crocus_fence_create_fd;
+ ctx->fence_server_sync = crocus_fence_await;
+ ctx->fence_server_signal = crocus_fence_signal;
+}
diff --git a/src/gallium/drivers/crocus/crocus_fence.h b/src/gallium/drivers/crocus/crocus_fence.h
new file mode 100644
index 00000000000..ef2eff5259b
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fence.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_FENCE_H
+#define CROCUS_FENCE_H
+
+#include "util/u_inlines.h"
+
+struct pipe_screen;
+struct crocus_screen;
+struct crocus_batch;
+
+struct crocus_syncobj {
+ struct pipe_reference ref;
+ uint32_t handle;
+};
+
+void crocus_init_context_fence_functions(struct pipe_context *ctx);
+void crocus_init_screen_fence_functions(struct pipe_screen *screen);
+
+struct crocus_syncobj *crocus_create_syncobj(struct crocus_screen *screen);
+void crocus_syncobj_destroy(struct crocus_screen *, struct crocus_syncobj *);
+void crocus_batch_add_syncobj(struct crocus_batch *batch,
+ struct crocus_syncobj *syncobj,
+ unsigned flags);
+bool crocus_wait_syncobj(struct pipe_screen *screen,
+ struct crocus_syncobj *syncobj,
+ int64_t timeout_nsec);
+static inline void
+crocus_syncobj_reference(struct crocus_screen *screen,
+ struct crocus_syncobj **dst,
+ struct crocus_syncobj *src)
+{
+ if (pipe_reference(&(*dst)->ref, &src->ref))
+ crocus_syncobj_destroy(screen, *dst);
+
+ *dst = src;
+}
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.c b/src/gallium/drivers/crocus/crocus_fine_fence.c
new file mode 100644
index 00000000000..9bb8a9673e3
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fine_fence.c
@@ -0,0 +1,85 @@
+#include "crocus_context.h"
+#include "crocus_fine_fence.h"
+#include "util/u_upload_mgr.h"
+
+static void
+crocus_fine_fence_reset(struct crocus_batch *batch)
+{
+ u_upload_alloc(batch->fine_fences.uploader,
+ 0, sizeof(uint64_t), sizeof(uint64_t),
+ &batch->fine_fences.ref.offset, &batch->fine_fences.ref.res,
+ (void **)&batch->fine_fences.map);
+ WRITE_ONCE(*batch->fine_fences.map, 0);
+ batch->fine_fences.next++;
+}
+
+void
+crocus_fine_fence_init(struct crocus_batch *batch)
+{
+ batch->fine_fences.ref.res = NULL;
+ batch->fine_fences.next = 0;
+ if (batch_has_fine_fence(batch))
+ crocus_fine_fence_reset(batch);
+}
+
+static uint32_t
+crocus_fine_fence_next(struct crocus_batch *batch)
+{
+ if (!batch_has_fine_fence(batch))
+ return UINT32_MAX;
+
+ uint32_t seqno = batch->fine_fences.next++;
+
+ if (batch->fine_fences.next == 0)
+ crocus_fine_fence_reset(batch);
+
+ return seqno;
+}
+
+void
+crocus_fine_fence_destroy(struct crocus_screen *screen,
+ struct crocus_fine_fence *fine)
+{
+ crocus_syncobj_reference(screen, &fine->syncobj, NULL);
+ pipe_resource_reference(&fine->ref.res, NULL);
+ free(fine);
+}
+
+struct crocus_fine_fence *
+crocus_fine_fence_new(struct crocus_batch *batch, unsigned flags)
+{
+ struct crocus_fine_fence *fine = calloc(1, sizeof(*fine));
+ if (!fine)
+ return NULL;
+
+ pipe_reference_init(&fine->reference, 1);
+
+ fine->seqno = crocus_fine_fence_next(batch);
+
+ crocus_syncobj_reference(batch->screen, &fine->syncobj,
+ crocus_batch_get_signal_syncobj(batch));
+
+ if (!batch_has_fine_fence(batch))
+ return fine;
+ pipe_resource_reference(&fine->ref.res, batch->fine_fences.ref.res);
+ fine->ref.offset = batch->fine_fences.ref.offset;
+ fine->map = batch->fine_fences.map;
+ fine->flags = flags;
+
+ unsigned pc;
+ if (flags & CROCUS_FENCE_TOP_OF_PIPE) {
+ pc = PIPE_CONTROL_WRITE_IMMEDIATE | PIPE_CONTROL_CS_STALL;
+ } else {
+ pc = PIPE_CONTROL_WRITE_IMMEDIATE |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_TILE_CACHE_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_DATA_CACHE_FLUSH;
+ }
+ crocus_emit_pipe_control_write(batch, "fence: fine", pc,
+ crocus_resource_bo(fine->ref.res),
+ fine->ref.offset,
+ fine->seqno);
+
+ return fine;
+}
diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.h b/src/gallium/drivers/crocus/crocus_fine_fence.h
new file mode 100644
index 00000000000..ad6f02a945a
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fine_fence.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2020 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_FINE_FENCE_DOT_H
+#define CROCUS_FINE_FENCE_DOT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "crocus_screen.h"
+#include "crocus_resource.h"
+
+/**
+ * A lightweight sequence number fence.
+ *
+ * We emit PIPE_CONTROLs inside a batch (possibly in the middle)
+ * which update a monotonically increasing, 32-bit counter. We
+ * can then check if that moment has passed by either:
+ *
+ * 1. Checking on the CPU by snooping on the DWord via a coherent map
+ *
+ * 2. Blocking on the GPU with MI_SEMAPHORE_WAIT from a second batch
+ * (relying on mid-batch preemption to switch GPU execution to the
+ * batch that writes it).
+ */
+struct crocus_fine_fence {
+ struct pipe_reference reference;
+
+ /** Buffer where the seqno lives */
+ struct crocus_state_ref ref;
+
+ /** Coherent CPU map of the buffer containing the seqno DWord. */
+ const uint32_t *map;
+
+ /**
+    * A drm_syncobj which will be signaled at the end of the batch
+    * that writes this seqno. This can be used to block until
+ * the seqno has definitely passed (but may wait longer than necessary).
+ */
+ struct crocus_syncobj *syncobj;
+
+#define CROCUS_FENCE_BOTTOM_OF_PIPE 0x0 /**< Written by bottom-of-pipe flush */
+#define CROCUS_FENCE_TOP_OF_PIPE 0x1 /**< Written by top-of-pipe flush */
+#define CROCUS_FENCE_END 0x2 /**< Written at the end of a batch */
+
+ /** Information about the type of flush involved (see CROCUS_FENCE_*) */
+ uint32_t flags;
+
+ /**
+ * Sequence number expected to be written by the flush we inserted
+ * when creating this fence. The crocus_fine_fence is 'signaled' when *@map
+ * (written by the flush on the GPU) is greater-than-or-equal to @seqno.
+ */
+ uint32_t seqno;
+};
+
+void crocus_fine_fence_init(struct crocus_batch *batch);
+
+struct crocus_fine_fence *crocus_fine_fence_new(struct crocus_batch *batch,
+ unsigned flags);
+
+void crocus_fine_fence_destroy(struct crocus_screen *screen,
+ struct crocus_fine_fence *sq);
+
+static inline void
+crocus_fine_fence_reference(struct crocus_screen *screen,
+ struct crocus_fine_fence **dst,
+ struct crocus_fine_fence *src)
+{
+ if (pipe_reference(&(*dst)->reference, &src->reference))
+ crocus_fine_fence_destroy(screen, *dst);
+
+ *dst = src;
+}
+
+/**
+ * Return true if this seqno has passed.
+ *
+ * NULL is considered signaled.
+ */
+static inline bool
+crocus_fine_fence_signaled(const struct crocus_fine_fence *sq)
+{
+ if (sq && !sq->map)
+ return false;
+ return !sq || (READ_ONCE(*sq->map) >= sq->seqno);
+}
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_formats.c b/src/gallium/drivers/crocus/crocus_formats.c
new file mode 100644
index 00000000000..31762643bdc
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_formats.c
@@ -0,0 +1,576 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_formats.c
+ *
+ * Converts Gallium formats (PIPE_FORMAT_*) to hardware ones (ISL_FORMAT_*).
+ * Provides information about which formats support what features.
+ */
+
+#include "util/bitscan.h"
+#include "util/macros.h"
+#include "util/format/u_format.h"
+
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+
+static enum isl_format
+crocus_isl_format_for_pipe_format(enum pipe_format pf)
+{
+ static const enum isl_format table[PIPE_FORMAT_COUNT] = {
+ [0 ... PIPE_FORMAT_COUNT-1] = ISL_FORMAT_UNSUPPORTED,
+
+ [PIPE_FORMAT_B8G8R8A8_UNORM] = ISL_FORMAT_B8G8R8A8_UNORM,
+ [PIPE_FORMAT_B8G8R8X8_UNORM] = ISL_FORMAT_B8G8R8X8_UNORM,
+ [PIPE_FORMAT_B5G5R5A1_UNORM] = ISL_FORMAT_B5G5R5A1_UNORM,
+ [PIPE_FORMAT_B4G4R4A4_UNORM] = ISL_FORMAT_B4G4R4A4_UNORM,
+ [PIPE_FORMAT_B5G6R5_UNORM] = ISL_FORMAT_B5G6R5_UNORM,
+ [PIPE_FORMAT_R10G10B10A2_UNORM] = ISL_FORMAT_R10G10B10A2_UNORM,
+
+ [PIPE_FORMAT_Z16_UNORM] = ISL_FORMAT_R16_UNORM,
+ [PIPE_FORMAT_Z32_UNORM] = ISL_FORMAT_R32_UNORM,
+ [PIPE_FORMAT_Z32_FLOAT] = ISL_FORMAT_R32_FLOAT,
+
+ /* We translate the combined depth/stencil formats to depth only here */
+ [PIPE_FORMAT_Z24_UNORM_S8_UINT] = ISL_FORMAT_R24_UNORM_X8_TYPELESS,
+ [PIPE_FORMAT_Z24X8_UNORM] = ISL_FORMAT_R24_UNORM_X8_TYPELESS,
+ [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = ISL_FORMAT_R32_FLOAT,
+
+ [PIPE_FORMAT_S8_UINT] = ISL_FORMAT_R8_UINT,
+ [PIPE_FORMAT_X24S8_UINT] = ISL_FORMAT_R8_UINT,
+ [PIPE_FORMAT_X32_S8X24_UINT] = ISL_FORMAT_R8_UINT,
+
+ [PIPE_FORMAT_R64_FLOAT] = ISL_FORMAT_R64_FLOAT,
+ [PIPE_FORMAT_R64G64_FLOAT] = ISL_FORMAT_R64G64_FLOAT,
+ [PIPE_FORMAT_R64G64B64_FLOAT] = ISL_FORMAT_R64G64B64_FLOAT,
+ [PIPE_FORMAT_R64G64B64A64_FLOAT] = ISL_FORMAT_R64G64B64A64_FLOAT,
+ [PIPE_FORMAT_R32_FLOAT] = ISL_FORMAT_R32_FLOAT,
+ [PIPE_FORMAT_R32G32_FLOAT] = ISL_FORMAT_R32G32_FLOAT,
+ [PIPE_FORMAT_R32G32B32_FLOAT] = ISL_FORMAT_R32G32B32_FLOAT,
+ [PIPE_FORMAT_R32G32B32A32_FLOAT] = ISL_FORMAT_R32G32B32A32_FLOAT,
+ [PIPE_FORMAT_R32_UNORM] = ISL_FORMAT_R32_UNORM,
+ [PIPE_FORMAT_R32G32_UNORM] = ISL_FORMAT_R32G32_UNORM,
+ [PIPE_FORMAT_R32G32B32_UNORM] = ISL_FORMAT_R32G32B32_UNORM,
+ [PIPE_FORMAT_R32G32B32A32_UNORM] = ISL_FORMAT_R32G32B32A32_UNORM,
+ [PIPE_FORMAT_R32_USCALED] = ISL_FORMAT_R32_USCALED,
+ [PIPE_FORMAT_R32G32_USCALED] = ISL_FORMAT_R32G32_USCALED,
+ [PIPE_FORMAT_R32G32B32_USCALED] = ISL_FORMAT_R32G32B32_USCALED,
+ [PIPE_FORMAT_R32G32B32A32_USCALED] = ISL_FORMAT_R32G32B32A32_USCALED,
+ [PIPE_FORMAT_R32_SNORM] = ISL_FORMAT_R32_SNORM,
+ [PIPE_FORMAT_R32G32_SNORM] = ISL_FORMAT_R32G32_SNORM,
+ [PIPE_FORMAT_R32G32B32_SNORM] = ISL_FORMAT_R32G32B32_SNORM,
+ [PIPE_FORMAT_R32G32B32A32_SNORM] = ISL_FORMAT_R32G32B32A32_SNORM,
+ [PIPE_FORMAT_R32_SSCALED] = ISL_FORMAT_R32_SSCALED,
+ [PIPE_FORMAT_R32G32_SSCALED] = ISL_FORMAT_R32G32_SSCALED,
+ [PIPE_FORMAT_R32G32B32_SSCALED] = ISL_FORMAT_R32G32B32_SSCALED,
+ [PIPE_FORMAT_R32G32B32A32_SSCALED] = ISL_FORMAT_R32G32B32A32_SSCALED,
+ [PIPE_FORMAT_R16_UNORM] = ISL_FORMAT_R16_UNORM,
+ [PIPE_FORMAT_R16G16_UNORM] = ISL_FORMAT_R16G16_UNORM,
+ [PIPE_FORMAT_R16G16B16_UNORM] = ISL_FORMAT_R16G16B16_UNORM,
+ [PIPE_FORMAT_R16G16B16A16_UNORM] = ISL_FORMAT_R16G16B16A16_UNORM,
+ [PIPE_FORMAT_R16_USCALED] = ISL_FORMAT_R16_USCALED,
+ [PIPE_FORMAT_R16G16_USCALED] = ISL_FORMAT_R16G16_USCALED,
+ [PIPE_FORMAT_R16G16B16_USCALED] = ISL_FORMAT_R16G16B16_USCALED,
+ [PIPE_FORMAT_R16G16B16A16_USCALED] = ISL_FORMAT_R16G16B16A16_USCALED,
+ [PIPE_FORMAT_R16_SNORM] = ISL_FORMAT_R16_SNORM,
+ [PIPE_FORMAT_R16G16_SNORM] = ISL_FORMAT_R16G16_SNORM,
+ [PIPE_FORMAT_R16G16B16_SNORM] = ISL_FORMAT_R16G16B16_SNORM,
+ [PIPE_FORMAT_R16G16B16A16_SNORM] = ISL_FORMAT_R16G16B16A16_SNORM,
+ [PIPE_FORMAT_R16_SSCALED] = ISL_FORMAT_R16_SSCALED,
+ [PIPE_FORMAT_R16G16_SSCALED] = ISL_FORMAT_R16G16_SSCALED,
+ [PIPE_FORMAT_R16G16B16_SSCALED] = ISL_FORMAT_R16G16B16_SSCALED,
+ [PIPE_FORMAT_R16G16B16A16_SSCALED] = ISL_FORMAT_R16G16B16A16_SSCALED,
+ [PIPE_FORMAT_R8_UNORM] = ISL_FORMAT_R8_UNORM,
+ [PIPE_FORMAT_R8G8_UNORM] = ISL_FORMAT_R8G8_UNORM,
+ [PIPE_FORMAT_R8G8B8_UNORM] = ISL_FORMAT_R8G8B8_UNORM,
+ [PIPE_FORMAT_R8G8B8A8_UNORM] = ISL_FORMAT_R8G8B8A8_UNORM,
+ [PIPE_FORMAT_R8_USCALED] = ISL_FORMAT_R8_USCALED,
+ [PIPE_FORMAT_R8G8_USCALED] = ISL_FORMAT_R8G8_USCALED,
+ [PIPE_FORMAT_R8G8B8_USCALED] = ISL_FORMAT_R8G8B8_USCALED,
+ [PIPE_FORMAT_R8G8B8A8_USCALED] = ISL_FORMAT_R8G8B8A8_USCALED,
+ [PIPE_FORMAT_R8_SNORM] = ISL_FORMAT_R8_SNORM,
+ [PIPE_FORMAT_R8G8_SNORM] = ISL_FORMAT_R8G8_SNORM,
+ [PIPE_FORMAT_R8G8B8_SNORM] = ISL_FORMAT_R8G8B8_SNORM,
+ [PIPE_FORMAT_R8G8B8A8_SNORM] = ISL_FORMAT_R8G8B8A8_SNORM,
+ [PIPE_FORMAT_R8_SSCALED] = ISL_FORMAT_R8_SSCALED,
+ [PIPE_FORMAT_R8G8_SSCALED] = ISL_FORMAT_R8G8_SSCALED,
+ [PIPE_FORMAT_R8G8B8_SSCALED] = ISL_FORMAT_R8G8B8_SSCALED,
+ [PIPE_FORMAT_R8G8B8A8_SSCALED] = ISL_FORMAT_R8G8B8A8_SSCALED,
+ [PIPE_FORMAT_R32_FIXED] = ISL_FORMAT_R32_SFIXED,
+ [PIPE_FORMAT_R32G32_FIXED] = ISL_FORMAT_R32G32_SFIXED,
+ [PIPE_FORMAT_R32G32B32_FIXED] = ISL_FORMAT_R32G32B32_SFIXED,
+ [PIPE_FORMAT_R32G32B32A32_FIXED] = ISL_FORMAT_R32G32B32A32_SFIXED,
+ [PIPE_FORMAT_R16_FLOAT] = ISL_FORMAT_R16_FLOAT,
+ [PIPE_FORMAT_R16G16_FLOAT] = ISL_FORMAT_R16G16_FLOAT,
+ [PIPE_FORMAT_R16G16B16_FLOAT] = ISL_FORMAT_R16G16B16_FLOAT,
+ [PIPE_FORMAT_R16G16B16A16_FLOAT] = ISL_FORMAT_R16G16B16A16_FLOAT,
+
+ [PIPE_FORMAT_R8G8B8_SRGB] = ISL_FORMAT_R8G8B8_UNORM_SRGB,
+ [PIPE_FORMAT_B8G8R8A8_SRGB] = ISL_FORMAT_B8G8R8A8_UNORM_SRGB,
+ [PIPE_FORMAT_B8G8R8X8_SRGB] = ISL_FORMAT_B8G8R8X8_UNORM_SRGB,
+ [PIPE_FORMAT_R8G8B8A8_SRGB] = ISL_FORMAT_R8G8B8A8_UNORM_SRGB,
+
+ [PIPE_FORMAT_DXT1_RGB] = ISL_FORMAT_BC1_UNORM,
+ [PIPE_FORMAT_DXT1_RGBA] = ISL_FORMAT_BC1_UNORM,
+ [PIPE_FORMAT_DXT3_RGBA] = ISL_FORMAT_BC2_UNORM,
+ [PIPE_FORMAT_DXT5_RGBA] = ISL_FORMAT_BC3_UNORM,
+
+ [PIPE_FORMAT_DXT1_SRGB] = ISL_FORMAT_BC1_UNORM_SRGB,
+ [PIPE_FORMAT_DXT1_SRGBA] = ISL_FORMAT_BC1_UNORM_SRGB,
+ [PIPE_FORMAT_DXT3_SRGBA] = ISL_FORMAT_BC2_UNORM_SRGB,
+ [PIPE_FORMAT_DXT5_SRGBA] = ISL_FORMAT_BC3_UNORM_SRGB,
+
+ [PIPE_FORMAT_RGTC1_UNORM] = ISL_FORMAT_BC4_UNORM,
+ [PIPE_FORMAT_RGTC1_SNORM] = ISL_FORMAT_BC4_SNORM,
+ [PIPE_FORMAT_RGTC2_UNORM] = ISL_FORMAT_BC5_UNORM,
+ [PIPE_FORMAT_RGTC2_SNORM] = ISL_FORMAT_BC5_SNORM,
+
+ [PIPE_FORMAT_R10G10B10A2_USCALED] = ISL_FORMAT_R10G10B10A2_USCALED,
+ [PIPE_FORMAT_R11G11B10_FLOAT] = ISL_FORMAT_R11G11B10_FLOAT,
+ [PIPE_FORMAT_R9G9B9E5_FLOAT] = ISL_FORMAT_R9G9B9E5_SHAREDEXP,
+ [PIPE_FORMAT_R1_UNORM] = ISL_FORMAT_R1_UNORM,
+ [PIPE_FORMAT_R10G10B10X2_USCALED] = ISL_FORMAT_R10G10B10X2_USCALED,
+ [PIPE_FORMAT_B10G10R10A2_UNORM] = ISL_FORMAT_B10G10R10A2_UNORM,
+ [PIPE_FORMAT_R8G8B8X8_UNORM] = ISL_FORMAT_R8G8B8X8_UNORM,
+
+ [PIPE_FORMAT_I8_UNORM] = ISL_FORMAT_R8_UNORM,
+ [PIPE_FORMAT_I16_UNORM] = ISL_FORMAT_R16_UNORM,
+ [PIPE_FORMAT_I8_SNORM] = ISL_FORMAT_R8_SNORM,
+ [PIPE_FORMAT_I16_SNORM] = ISL_FORMAT_R16_SNORM,
+ [PIPE_FORMAT_I16_FLOAT] = ISL_FORMAT_R16_FLOAT,
+ [PIPE_FORMAT_I32_FLOAT] = ISL_FORMAT_R32_FLOAT,
+
+ [PIPE_FORMAT_L8_UINT] = ISL_FORMAT_L8_UINT,
+ [PIPE_FORMAT_L8_UNORM] = ISL_FORMAT_L8_UNORM,
+ [PIPE_FORMAT_L8_SNORM] = ISL_FORMAT_R8_SNORM,
+ [PIPE_FORMAT_L8_SINT] = ISL_FORMAT_L8_SINT,
+ [PIPE_FORMAT_L16_UNORM] = ISL_FORMAT_L16_UNORM,
+ [PIPE_FORMAT_L16_SNORM] = ISL_FORMAT_R16_SNORM,
+ [PIPE_FORMAT_L16_FLOAT] = ISL_FORMAT_L16_FLOAT,
+ [PIPE_FORMAT_L32_FLOAT] = ISL_FORMAT_L32_FLOAT,
+
+ [PIPE_FORMAT_A8_UNORM] = ISL_FORMAT_A8_UNORM,
+ [PIPE_FORMAT_A16_UNORM] = ISL_FORMAT_A16_UNORM,
+ [PIPE_FORMAT_A16_FLOAT] = ISL_FORMAT_A16_FLOAT,
+ [PIPE_FORMAT_A32_FLOAT] = ISL_FORMAT_A32_FLOAT,
+
+ [PIPE_FORMAT_L8A8_UNORM] = ISL_FORMAT_L8A8_UNORM,
+ [PIPE_FORMAT_L16A16_UNORM] = ISL_FORMAT_L16A16_UNORM,
+ [PIPE_FORMAT_L16A16_FLOAT] = ISL_FORMAT_L16A16_FLOAT,
+ [PIPE_FORMAT_L32A32_FLOAT] = ISL_FORMAT_L32A32_FLOAT,
+
+ /* Sadly, we have to use luminance[-alpha] formats for sRGB decoding. */
+ [PIPE_FORMAT_R8_SRGB] = ISL_FORMAT_L8_UNORM_SRGB,
+ [PIPE_FORMAT_L8_SRGB] = ISL_FORMAT_L8_UNORM_SRGB,
+ [PIPE_FORMAT_L8A8_SRGB] = ISL_FORMAT_L8A8_UNORM_SRGB,
+
+ [PIPE_FORMAT_R10G10B10A2_SSCALED] = ISL_FORMAT_R10G10B10A2_SSCALED,
+ [PIPE_FORMAT_R10G10B10A2_SNORM] = ISL_FORMAT_R10G10B10A2_SNORM,
+
+ [PIPE_FORMAT_B10G10R10A2_USCALED] = ISL_FORMAT_B10G10R10A2_USCALED,
+ [PIPE_FORMAT_B10G10R10A2_SSCALED] = ISL_FORMAT_B10G10R10A2_SSCALED,
+ [PIPE_FORMAT_B10G10R10A2_SNORM] = ISL_FORMAT_B10G10R10A2_SNORM,
+
+ [PIPE_FORMAT_R8_UINT] = ISL_FORMAT_R8_UINT,
+ [PIPE_FORMAT_R8G8_UINT] = ISL_FORMAT_R8G8_UINT,
+ [PIPE_FORMAT_R8G8B8_UINT] = ISL_FORMAT_R8G8B8_UINT,
+ [PIPE_FORMAT_R8G8B8A8_UINT] = ISL_FORMAT_R8G8B8A8_UINT,
+
+ [PIPE_FORMAT_R8_SINT] = ISL_FORMAT_R8_SINT,
+ [PIPE_FORMAT_R8G8_SINT] = ISL_FORMAT_R8G8_SINT,
+ [PIPE_FORMAT_R8G8B8_SINT] = ISL_FORMAT_R8G8B8_SINT,
+ [PIPE_FORMAT_R8G8B8A8_SINT] = ISL_FORMAT_R8G8B8A8_SINT,
+
+ [PIPE_FORMAT_R16_UINT] = ISL_FORMAT_R16_UINT,
+ [PIPE_FORMAT_R16G16_UINT] = ISL_FORMAT_R16G16_UINT,
+ [PIPE_FORMAT_R16G16B16_UINT] = ISL_FORMAT_R16G16B16_UINT,
+ [PIPE_FORMAT_R16G16B16A16_UINT] = ISL_FORMAT_R16G16B16A16_UINT,
+
+ [PIPE_FORMAT_R16_SINT] = ISL_FORMAT_R16_SINT,
+ [PIPE_FORMAT_R16G16_SINT] = ISL_FORMAT_R16G16_SINT,
+ [PIPE_FORMAT_R16G16B16_SINT] = ISL_FORMAT_R16G16B16_SINT,
+ [PIPE_FORMAT_R16G16B16A16_SINT] = ISL_FORMAT_R16G16B16A16_SINT,
+
+ [PIPE_FORMAT_R32_UINT] = ISL_FORMAT_R32_UINT,
+ [PIPE_FORMAT_R32G32_UINT] = ISL_FORMAT_R32G32_UINT,
+ [PIPE_FORMAT_R32G32B32_UINT] = ISL_FORMAT_R32G32B32_UINT,
+ [PIPE_FORMAT_R32G32B32A32_UINT] = ISL_FORMAT_R32G32B32A32_UINT,
+
+ [PIPE_FORMAT_R32_SINT] = ISL_FORMAT_R32_SINT,
+ [PIPE_FORMAT_R32G32_SINT] = ISL_FORMAT_R32G32_SINT,
+ [PIPE_FORMAT_R32G32B32_SINT] = ISL_FORMAT_R32G32B32_SINT,
+ [PIPE_FORMAT_R32G32B32A32_SINT] = ISL_FORMAT_R32G32B32A32_SINT,
+
+ [PIPE_FORMAT_B10G10R10A2_UINT] = ISL_FORMAT_B10G10R10A2_UINT,
+
+ [PIPE_FORMAT_ETC1_RGB8] = ISL_FORMAT_ETC1_RGB8,
+
+ [PIPE_FORMAT_R8G8B8X8_SRGB] = ISL_FORMAT_R8G8B8X8_UNORM_SRGB,
+ [PIPE_FORMAT_B10G10R10X2_UNORM] = ISL_FORMAT_B10G10R10X2_UNORM,
+ [PIPE_FORMAT_R16G16B16X16_UNORM] = ISL_FORMAT_R16G16B16X16_UNORM,
+ [PIPE_FORMAT_R16G16B16X16_FLOAT] = ISL_FORMAT_R16G16B16X16_FLOAT,
+ [PIPE_FORMAT_R32G32B32X32_FLOAT] = ISL_FORMAT_R32G32B32X32_FLOAT,
+
+ [PIPE_FORMAT_R10G10B10A2_UINT] = ISL_FORMAT_R10G10B10A2_UINT,
+
+ [PIPE_FORMAT_B5G6R5_SRGB] = ISL_FORMAT_B5G6R5_UNORM_SRGB,
+
+ [PIPE_FORMAT_BPTC_RGBA_UNORM] = ISL_FORMAT_BC7_UNORM,
+ [PIPE_FORMAT_BPTC_SRGBA] = ISL_FORMAT_BC7_UNORM_SRGB,
+ [PIPE_FORMAT_BPTC_RGB_FLOAT] = ISL_FORMAT_BC6H_SF16,
+ [PIPE_FORMAT_BPTC_RGB_UFLOAT] = ISL_FORMAT_BC6H_UF16,
+
+ [PIPE_FORMAT_ETC2_RGB8] = ISL_FORMAT_ETC2_RGB8,
+ [PIPE_FORMAT_ETC2_SRGB8] = ISL_FORMAT_ETC2_SRGB8,
+ [PIPE_FORMAT_ETC2_RGB8A1] = ISL_FORMAT_ETC2_RGB8_PTA,
+ [PIPE_FORMAT_ETC2_SRGB8A1] = ISL_FORMAT_ETC2_SRGB8_PTA,
+ [PIPE_FORMAT_ETC2_RGBA8] = ISL_FORMAT_ETC2_EAC_RGBA8,
+ [PIPE_FORMAT_ETC2_SRGBA8] = ISL_FORMAT_ETC2_EAC_SRGB8_A8,
+ [PIPE_FORMAT_ETC2_R11_UNORM] = ISL_FORMAT_EAC_R11,
+ [PIPE_FORMAT_ETC2_R11_SNORM] = ISL_FORMAT_EAC_SIGNED_R11,
+ [PIPE_FORMAT_ETC2_RG11_UNORM] = ISL_FORMAT_EAC_RG11,
+ [PIPE_FORMAT_ETC2_RG11_SNORM] = ISL_FORMAT_EAC_SIGNED_RG11,
+
+ [PIPE_FORMAT_FXT1_RGB] = ISL_FORMAT_FXT1,
+ [PIPE_FORMAT_FXT1_RGBA] = ISL_FORMAT_FXT1,
+
+ [PIPE_FORMAT_ASTC_4x4] = ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16,
+ [PIPE_FORMAT_ASTC_5x4] = ISL_FORMAT_ASTC_LDR_2D_5X4_FLT16,
+ [PIPE_FORMAT_ASTC_5x5] = ISL_FORMAT_ASTC_LDR_2D_5X5_FLT16,
+ [PIPE_FORMAT_ASTC_6x5] = ISL_FORMAT_ASTC_LDR_2D_6X5_FLT16,
+ [PIPE_FORMAT_ASTC_6x6] = ISL_FORMAT_ASTC_LDR_2D_6X6_FLT16,
+ [PIPE_FORMAT_ASTC_8x5] = ISL_FORMAT_ASTC_LDR_2D_8X5_FLT16,
+ [PIPE_FORMAT_ASTC_8x6] = ISL_FORMAT_ASTC_LDR_2D_8X6_FLT16,
+ [PIPE_FORMAT_ASTC_8x8] = ISL_FORMAT_ASTC_LDR_2D_8X8_FLT16,
+ [PIPE_FORMAT_ASTC_10x5] = ISL_FORMAT_ASTC_LDR_2D_10X5_FLT16,
+ [PIPE_FORMAT_ASTC_10x6] = ISL_FORMAT_ASTC_LDR_2D_10X6_FLT16,
+ [PIPE_FORMAT_ASTC_10x8] = ISL_FORMAT_ASTC_LDR_2D_10X8_FLT16,
+ [PIPE_FORMAT_ASTC_10x10] = ISL_FORMAT_ASTC_LDR_2D_10X10_FLT16,
+ [PIPE_FORMAT_ASTC_12x10] = ISL_FORMAT_ASTC_LDR_2D_12X10_FLT16,
+ [PIPE_FORMAT_ASTC_12x12] = ISL_FORMAT_ASTC_LDR_2D_12X12_FLT16,
+
+ [PIPE_FORMAT_ASTC_4x4_SRGB] = ISL_FORMAT_ASTC_LDR_2D_4X4_U8SRGB,
+ [PIPE_FORMAT_ASTC_5x4_SRGB] = ISL_FORMAT_ASTC_LDR_2D_5X4_U8SRGB,
+ [PIPE_FORMAT_ASTC_5x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_5X5_U8SRGB,
+ [PIPE_FORMAT_ASTC_6x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_6X5_U8SRGB,
+ [PIPE_FORMAT_ASTC_6x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_6X6_U8SRGB,
+ [PIPE_FORMAT_ASTC_8x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X5_U8SRGB,
+ [PIPE_FORMAT_ASTC_8x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X6_U8SRGB,
+ [PIPE_FORMAT_ASTC_8x8_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X8_U8SRGB,
+ [PIPE_FORMAT_ASTC_10x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X5_U8SRGB,
+ [PIPE_FORMAT_ASTC_10x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X6_U8SRGB,
+ [PIPE_FORMAT_ASTC_10x8_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X8_U8SRGB,
+ [PIPE_FORMAT_ASTC_10x10_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X10_U8SRGB,
+ [PIPE_FORMAT_ASTC_12x10_SRGB] = ISL_FORMAT_ASTC_LDR_2D_12X10_U8SRGB,
+ [PIPE_FORMAT_ASTC_12x12_SRGB] = ISL_FORMAT_ASTC_LDR_2D_12X12_U8SRGB,
+
+ [PIPE_FORMAT_A1B5G5R5_UNORM] = ISL_FORMAT_A1B5G5R5_UNORM,
+
+ /* We support these so that we know the API expects no alpha channel.
+ * Otherwise, the state tracker would just give us a format with alpha
+ * and we wouldn't know to override the swizzle to 1.
+ */
+ [PIPE_FORMAT_R16G16B16X16_UINT] = ISL_FORMAT_R16G16B16A16_UINT,
+ [PIPE_FORMAT_R16G16B16X16_SINT] = ISL_FORMAT_R16G16B16A16_SINT,
+ [PIPE_FORMAT_R32G32B32X32_UINT] = ISL_FORMAT_R32G32B32A32_UINT,
+ [PIPE_FORMAT_R32G32B32X32_SINT] = ISL_FORMAT_R32G32B32A32_SINT,
+ [PIPE_FORMAT_R10G10B10X2_SNORM] = ISL_FORMAT_R10G10B10A2_SNORM,
+ };
+ assert(pf < PIPE_FORMAT_COUNT);
+ return table[pf];
+}
+
+static enum isl_format
+get_render_format(enum pipe_format pformat, enum isl_format def_format)
+{
+ switch (pformat) {
+ case PIPE_FORMAT_A16_UNORM: return ISL_FORMAT_R16_UNORM;
+ case PIPE_FORMAT_A16_FLOAT: return ISL_FORMAT_R16_FLOAT;
+ case PIPE_FORMAT_A32_FLOAT: return ISL_FORMAT_R32_FLOAT;
+
+ case PIPE_FORMAT_I8_UNORM: return ISL_FORMAT_R8_UNORM;
+ case PIPE_FORMAT_I16_UNORM: return ISL_FORMAT_R16_UNORM;
+ case PIPE_FORMAT_I16_FLOAT: return ISL_FORMAT_R16_FLOAT;
+ case PIPE_FORMAT_I32_FLOAT: return ISL_FORMAT_R32_FLOAT;
+
+ case PIPE_FORMAT_L8_UNORM: return ISL_FORMAT_R8_UNORM;
+ case PIPE_FORMAT_L8_UINT: return ISL_FORMAT_R8_UINT;
+ case PIPE_FORMAT_L8_SINT: return ISL_FORMAT_R8_SINT;
+ case PIPE_FORMAT_L16_UNORM: return ISL_FORMAT_R16_UNORM;
+ case PIPE_FORMAT_L16_FLOAT: return ISL_FORMAT_R16_FLOAT;
+ case PIPE_FORMAT_L32_FLOAT: return ISL_FORMAT_R32_FLOAT;
+
+ case PIPE_FORMAT_L8A8_UNORM: return ISL_FORMAT_R8G8_UNORM;
+ case PIPE_FORMAT_L16A16_UNORM: return ISL_FORMAT_R16G16_UNORM;
+ case PIPE_FORMAT_L16A16_FLOAT: return ISL_FORMAT_R16G16_FLOAT;
+ case PIPE_FORMAT_L32A32_FLOAT: return ISL_FORMAT_R32G32_FLOAT;
+
+ default:
+ return def_format;
+ }
+}
+
+struct crocus_format_info
+crocus_format_for_usage(const struct intel_device_info *devinfo,
+ enum pipe_format pformat,
+ isl_surf_usage_flags_t usage)
+{
+ struct crocus_format_info info = { crocus_isl_format_for_pipe_format(pformat),
+ { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W } };
+
+ if (info.fmt == ISL_FORMAT_UNSUPPORTED)
+ return info;
+
+ if (pformat == PIPE_FORMAT_A8_UNORM) {
+ info.fmt = ISL_FORMAT_A8_UNORM;
+ }
+
+ if (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)
+ info.fmt = get_render_format(pformat, info.fmt);
+ if (devinfo->ver < 6) {
+ if (pformat == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+ info.fmt = ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS;
+ if (pformat == PIPE_FORMAT_X32_S8X24_UINT)
+ info.fmt = ISL_FORMAT_X32_TYPELESS_G8X24_UINT;
+ if (pformat == PIPE_FORMAT_X24S8_UINT)
+ info.fmt = ISL_FORMAT_X24_TYPELESS_G8_UINT;
+ }
+
+ const struct isl_format_layout *fmtl = isl_format_get_layout(info.fmt);
+
+ if (util_format_is_snorm(pformat)) {
+ if (util_format_is_intensity(pformat)) {
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_X;
+ info.swizzles[2] = PIPE_SWIZZLE_X;
+ info.swizzles[3] = PIPE_SWIZZLE_X;
+ } else if (util_format_is_luminance(pformat)) {
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_X;
+ info.swizzles[2] = PIPE_SWIZZLE_X;
+ info.swizzles[3] = PIPE_SWIZZLE_1;
+ } else if (util_format_is_luminance_alpha(pformat)) {
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_X;
+ info.swizzles[2] = PIPE_SWIZZLE_X;
+ info.swizzles[3] = PIPE_SWIZZLE_Y;
+ } else if (util_format_is_alpha(pformat)) {
+ info.swizzles[0] = PIPE_SWIZZLE_0;
+ info.swizzles[1] = PIPE_SWIZZLE_0;
+ info.swizzles[2] = PIPE_SWIZZLE_0;
+ info.swizzles[3] = PIPE_SWIZZLE_X;
+ }
+ }
+
+ /* When faking RGBX pipe formats with RGBA ISL formats, override alpha. */
+ if (!util_format_has_alpha(pformat) && fmtl->channels.a.type != ISL_VOID) {
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_Y;
+ info.swizzles[2] = PIPE_SWIZZLE_Z;
+ info.swizzles[3] = PIPE_SWIZZLE_1;
+ }
+
+   /* We choose RGBA over RGBX for rendering because the hardware doesn't
+    * support rendering to RGBX. However, when this internal override is used
+    * on Gen9+, fast clears don't work correctly.
+ *
+ * i965 fixes this by pretending to not support RGBX formats, and the higher
+ * layers of Mesa pick the RGBA format instead. Gallium doesn't work that
+ * way, and might choose a different format, like BGRX instead of RGBX,
+ * which will also cause problems when sampling from a surface fast cleared
+ * as RGBX. So we always choose RGBA instead of RGBX explicitly
+ * here.
+ */
+ if (isl_format_is_rgbx(info.fmt) &&
+ !isl_format_supports_rendering(devinfo, info.fmt) &&
+ (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)) {
+ info.fmt = isl_format_rgbx_to_rgba(info.fmt);
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_Y;
+ info.swizzles[2] = PIPE_SWIZZLE_Z;
+ info.swizzles[3] = PIPE_SWIZZLE_1;
+ }
+
+ return info;
+}
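A hedged usage sketch of the helper above (the call site and the pipe format are illustrative, not part of this patch): a caller asks for the hardware format and swizzle of a render target and checks the result before building surface state.

   /* Illustrative only: look up the HW format/swizzle for an RGBX render
    * target.  Per the logic above, RGBX may be promoted to RGBA, in which
    * case the alpha swizzle is forced to PIPE_SWIZZLE_1. */
   struct crocus_format_info info =
      crocus_format_for_usage(&screen->devinfo, PIPE_FORMAT_R8G8B8X8_UNORM,
                              ISL_SURF_USAGE_RENDER_TARGET_BIT);
   if (info.fmt == ISL_FORMAT_UNSUPPORTED)
      return;   /* not representable on this hardware */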
+
+/**
+ * The pscreen->is_format_supported() driver hook.
+ *
+ * Returns true if the given format is supported for the given usage
+ * (PIPE_BIND_*) and sample count.
+ */
+bool
+crocus_is_format_supported(struct pipe_screen *pscreen,
+ enum pipe_format pformat,
+ enum pipe_texture_target target,
+ unsigned sample_count, unsigned storage_sample_count,
+ unsigned usage)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (!util_is_power_of_two_or_zero(sample_count))
+ return false;
+ if (devinfo->ver >= 7) {
+ if (sample_count > 8 || sample_count == 2)
+ return false;
+ } else if (devinfo->ver == 6) {
+ if (sample_count > 4 || sample_count == 2)
+ return false;
+ } else if (sample_count > 1) {
+ return false;
+ }
+
+ if (pformat == PIPE_FORMAT_NONE)
+ return true;
+
+ enum isl_format format = crocus_isl_format_for_pipe_format(pformat);
+
+ if (format == ISL_FORMAT_UNSUPPORTED)
+ return false;
+
+ /* no stencil texturing prior to haswell */
+ if (!devinfo->is_haswell) {
+ if (pformat == PIPE_FORMAT_S8_UINT ||
+ pformat == PIPE_FORMAT_X24S8_UINT ||
+ pformat == PIPE_FORMAT_S8X24_UINT ||
+ pformat == PIPE_FORMAT_X32_S8X24_UINT)
+         return false;
+ }
+
+ const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+ const bool is_integer = isl_format_has_int_channel(format);
+ bool supported = true;
+
+ if (sample_count > 1)
+ supported &= isl_format_supports_multisampling(devinfo, format);
+
+ if (usage & PIPE_BIND_DEPTH_STENCIL) {
+ supported &= format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS ||
+ format == ISL_FORMAT_R32_FLOAT ||
+ format == ISL_FORMAT_R24_UNORM_X8_TYPELESS ||
+ format == ISL_FORMAT_R16_UNORM ||
+ format == ISL_FORMAT_R8_UINT;
+ }
+
+ if (usage & PIPE_BIND_RENDER_TARGET) {
+ /* Alpha and luminance-alpha formats other than A8_UNORM are not
+ * renderable.
+ *
+ * For BLORP, we can apply the swizzle in the shader. But for
+ * general rendering, this would mean recompiling the shader, which
+ * we'd like to avoid doing. So we mark these formats non-renderable.
+ *
+ * We do support A8_UNORM as it's required and is renderable.
+ */
+ if (pformat != PIPE_FORMAT_A8_UNORM &&
+ (util_format_is_alpha(pformat) ||
+ util_format_is_luminance_alpha(pformat)))
+ supported = false;
+
+ enum isl_format rt_format = format;
+
+ if (isl_format_is_rgbx(format) &&
+ !isl_format_supports_rendering(devinfo, format))
+ rt_format = isl_format_rgbx_to_rgba(format);
+
+ supported &= isl_format_supports_rendering(devinfo, rt_format);
+
+ if (!is_integer)
+ supported &= isl_format_supports_alpha_blending(devinfo, rt_format);
+ }
+
+ if (usage & PIPE_BIND_SHADER_IMAGE) {
+ /* Dataport doesn't support compression, and we can't resolve an MCS
+ * compressed surface. (Buffer images may have sample count of 0.)
+ */
+ supported &= sample_count == 0;
+
+ supported &= isl_format_supports_typed_writes(devinfo, format);
+ supported &= isl_has_matching_typed_storage_image_format(devinfo, format);
+ }
+
+ if (usage & PIPE_BIND_SAMPLER_VIEW) {
+ supported &= isl_format_supports_sampling(devinfo, format);
+ bool ignore_filtering = false;
+
+ if (is_integer)
+ ignore_filtering = true;
+
+      /* Pre-Gen5 can't actually filter these formats; skip the filtering
+       * check so they are still advertised for sampling. */
+ if (devinfo->ver < 5 && (format == ISL_FORMAT_R32G32B32A32_FLOAT ||
+ format == ISL_FORMAT_R24_UNORM_X8_TYPELESS ||
+ format == ISL_FORMAT_R32_FLOAT ||
+ format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS))
+ ignore_filtering = true;
+ if (!ignore_filtering)
+ supported &= isl_format_supports_filtering(devinfo, format);
+
+ /* Don't advertise 3-component RGB formats for non-buffer textures.
+ * This ensures that they are renderable from an API perspective since
+ * the state tracker will fall back to RGBA or RGBX, which are
+ * renderable. We want to render internally for copies and blits,
+ * even if the application doesn't.
+ *
+ * Buffer textures don't need to be renderable, so we support real RGB.
+ * This is useful for PBO upload, and 32-bit RGB support is mandatory.
+ */
+ if (target != PIPE_BUFFER)
+ supported &= fmtl->bpb != 24 && fmtl->bpb != 48 && fmtl->bpb != 96;
+ }
+
+ if (usage & PIPE_BIND_VERTEX_BUFFER) {
+ supported &= isl_format_supports_vertex_fetch(devinfo, format);
+
+ if (!devinfo->is_haswell) {
+ /* W/A: Pre-Haswell, the hardware doesn't really support the formats
+ * we'd like to use here, so upload everything as UINT and fix it in
+          * the shader.
+ */
+ if (format == ISL_FORMAT_R10G10B10A2_UNORM ||
+ format == ISL_FORMAT_B10G10R10A2_UNORM ||
+ format == ISL_FORMAT_R10G10B10A2_SNORM ||
+ format == ISL_FORMAT_B10G10R10A2_SNORM ||
+ format == ISL_FORMAT_R10G10B10A2_USCALED ||
+ format == ISL_FORMAT_B10G10R10A2_USCALED ||
+ format == ISL_FORMAT_R10G10B10A2_SSCALED ||
+ format == ISL_FORMAT_B10G10R10A2_SSCALED)
+ supported = true;
+
+ if (format == ISL_FORMAT_R8G8B8_SINT ||
+ format == ISL_FORMAT_R8G8B8_UINT ||
+ format == ISL_FORMAT_R16G16B16_SINT ||
+ format == ISL_FORMAT_R16G16B16_UINT)
+ supported = true;
+ }
+ }
+
+ if (usage & PIPE_BIND_INDEX_BUFFER) {
+ supported &= format == ISL_FORMAT_R8_UINT ||
+ format == ISL_FORMAT_R16_UINT ||
+ format == ISL_FORMAT_R32_UINT;
+ }
+
+ return supported;
+}
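For reference, a minimal sketch of how the state tracker exercises this hook through the pipe_screen vtable (the format, target, and sample counts are arbitrary example values):

   /* Example query: can this screen render to 4x-MSAA BGRA8888 2D surfaces? */
   bool ok =
      pscreen->is_format_supported(pscreen, PIPE_FORMAT_B8G8R8A8_UNORM,
                                   PIPE_TEXTURE_2D, 4, 4,
                                   PIPE_BIND_RENDER_TARGET);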
diff --git a/src/gallium/drivers/crocus/crocus_genx_macros.h b/src/gallium/drivers/crocus/crocus_genx_macros.h
new file mode 100644
index 00000000000..a0309513ed2
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_genx_macros.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Macro and function definitions needed in order to use genxml.
+ *
+ * This should only be included in sources compiled per-generation.
+ */
+
+#include "crocus_batch.h"
+
+#include "genxml/gen_macros.h"
+
+#define __gen_address_type struct crocus_address
+#define __gen_user_data struct crocus_batch
+#define __gen_combine_address crocus_combine_address
+
+static inline void *
+__gen_get_batch_dwords(struct crocus_batch *batch, unsigned dwords)
+{
+ return crocus_get_command_space(batch, dwords * sizeof(uint32_t));
+}
+
+static inline struct crocus_address
+__gen_address_offset(struct crocus_address addr, uint64_t offset)
+{
+ addr.offset += offset;
+ return addr;
+}
+
+static uint64_t
+__gen_combine_address(struct crocus_batch *batch, void *location,
+ struct crocus_address addr, uint32_t delta)
+{
+ uint32_t offset = (char *)location - (char *)batch->command.map;
+
+ if (addr.bo == NULL) {
+ return addr.offset + delta;
+ } else {
+ if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) {
+ offset = (char *) location - (char *) batch->state.map;
+ return crocus_state_reloc(batch, offset, addr.bo,
+ addr.offset + delta,
+ addr.reloc_flags);
+ }
+
+ assert(!crocus_ptr_in_state_buffer(batch, location));
+
+ offset = (char *) location - (char *) batch->command.map;
+ return crocus_command_reloc(batch, offset, addr.bo,
+ addr.offset + delta,
+ addr.reloc_flags);
+ }
+}
+
+#define __gen_address_type struct crocus_address
+#define __gen_user_data struct crocus_batch
+
+#define __genxml_cmd_length(cmd) cmd ## _length
+#define __genxml_cmd_length_bias(cmd) cmd ## _length_bias
+#define __genxml_cmd_header(cmd) cmd ## _header
+#define __genxml_cmd_pack(cmd) cmd ## _pack
+#define __genxml_reg_num(cmd) cmd ## _num
+
+#include "genxml/genX_pack.h"
+#include "genxml/gen_macros.h"
+#include "genxml/genX_bits.h"
+
+/* CS_GPR(15) is reserved for combining conditional rendering predicates
+ * with GL_ARB_indirect_parameters draw number predicates.
+ */
+#define MI_BUILDER_NUM_ALLOC_GPRS 15
+#include "common/mi_builder.h"
+
+#define _crocus_pack_command(batch, cmd, dst, name) \
+ for (struct cmd name = { __genxml_cmd_header(cmd) }, \
+ *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \
+ ({ __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name); \
+ _dst = NULL; \
+ }))
+
+#define crocus_pack_command(cmd, dst, name) \
+ _crocus_pack_command(NULL, cmd, dst, name)
+
+#define _crocus_pack_state(batch, cmd, dst, name) \
+ for (struct cmd name = {}, \
+ *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \
+ __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name), \
+ _dst = NULL)
+
+#define crocus_pack_state(cmd, dst, name) \
+ _crocus_pack_state(NULL, cmd, dst, name)
+
+#define crocus_emit_cmd(batch, cmd, name) \
+ _crocus_pack_command(batch, cmd, __gen_get_batch_dwords(batch, __genxml_cmd_length(cmd)), name)
+
+#define crocus_emit_merge(batch, dwords0, dwords1, num_dwords) \
+ do { \
+ uint32_t *dw = __gen_get_batch_dwords(batch, num_dwords); \
+ for (uint32_t i = 0; i < num_dwords; i++) \
+ dw[i] = (dwords0)[i] | (dwords1)[i]; \
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, num_dwords)); \
+ } while (0)
+
+#define crocus_emit_reg(batch, reg, name) \
+ for (struct reg name = {}, *_cont = (struct reg *)1; _cont != NULL; \
+ ({ \
+ uint32_t _dw[__genxml_cmd_length(reg)]; \
+ __genxml_cmd_pack(reg)(NULL, _dw, &name); \
+ for (unsigned i = 0; i < __genxml_cmd_length(reg); i++) { \
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { \
+ lri.RegisterOffset = __genxml_reg_num(reg); \
+ lri.DataDWord = _dw[i]; \
+ } \
+ } \
+ _cont = NULL; \
+ }))
+
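These pack/emit macros follow the usual genxml for-loop idiom: the named struct is filled in what looks like a statement body, and the loop's increment expression packs it into the batch and terminates the loop. A minimal sketch of how a per-generation file might use them (PIPE_CONTROL and its CommandStreamerStallEnable field are genxml names, shown purely for illustration):

   /* Sketch: emit a CS-stall PIPE_CONTROL into a batch. */
   crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
   }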
+
+/**
+ * crocus_address constructor helpers:
+ *
+ * When using these to construct a CSO, pass NULL for \p bo, and manually
+ * pin the BO later. Otherwise, genxml's address handling will add the
+ * BO to the current batch's validation list at CSO creation time, rather
+ * than at draw time as desired.
+ */
+
+UNUSED static struct crocus_address
+ro_bo(struct crocus_bo *bo, uint64_t offset)
+{
+ return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT };
+}
+
+UNUSED static struct crocus_address
+rw_bo(struct crocus_bo *bo, uint64_t offset)
+{
+ return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT | RELOC_WRITE };
+}
+
+UNUSED static struct crocus_address
+ggtt_bo(struct crocus_bo *bo, uint64_t offset)
+{
+ return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT };
+}
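A short sketch of the two intended uses described in the comment above (res and offset are hypothetical): pass a real BO when emitting directly, or pass NULL while baking a CSO and pin the BO later.

   /* Emit time: the relocation is recorded against the current batch. */
   struct crocus_address src = ro_bo(res->bo, offset);

   /* CSO creation time: record only the offset; the BO is pinned and added
    * to the validation list when the CSO is actually used in a batch. */
   struct crocus_address dst = rw_bo(NULL, offset);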
diff --git a/src/gallium/drivers/crocus/crocus_genx_protos.h b/src/gallium/drivers/crocus/crocus_genx_protos.h
new file mode 100644
index 00000000000..ba6798f991e
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_genx_protos.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* GenX-specific function declarations.
+ *
+ * Don't include this directly, it will be included by crocus_context.h.
+ *
+ * NOTE: This header can be included multiple times, from the same file.
+ */
+
+/* crocus_state.c */
+void genX(init_state)(struct crocus_context *ice);
+void genX(init_screen_state)(struct crocus_screen *screen);
+void genX(upload_urb)(struct crocus_batch *batch,
+ unsigned vs_size,
+ bool gs_present,
+ unsigned gs_size);
+void genX(emit_hashing_mode)(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ unsigned width, unsigned height,
+ unsigned scale);
+
+/* crocus_blorp.c */
+void genX(init_blorp)(struct crocus_context *ice);
+
+/* crocus_query.c */
+void genX(init_query)(struct crocus_context *ice);
+void genX(init_screen_query)(struct crocus_screen *screen);
+void genX(math_add32_gpr0)(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ uint32_t x);
+void genX(math_div32_gpr0)(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ uint32_t D);
+
+/* crocus_blt.c */
+void genX(init_blt)(struct crocus_screen *screen);
diff --git a/src/gallium/drivers/crocus/crocus_monitor.c b/src/gallium/drivers/crocus/crocus_monitor.c
new file mode 100644
index 00000000000..c0465f22875
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_monitor.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "crocus_monitor.h"
+
+#include <xf86drm.h>
+
+#include "crocus_screen.h"
+#include "crocus_context.h"
+
+#include "perf/intel_perf.h"
+#include "perf/intel_perf_query.h"
+#include "perf/intel_perf_regs.h"
+
+struct crocus_monitor_object {
+ int num_active_counters;
+ int *active_counters;
+
+ size_t result_size;
+ unsigned char *result_buffer;
+
+ struct intel_perf_query_object *query;
+};
+
+int
+crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index,
+ struct pipe_driver_query_info *info)
+{
+ const struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ assert(screen->monitor_cfg);
+ if (!screen->monitor_cfg)
+ return 0;
+
+ const struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+
+ if (!info) {
+ /* return the number of metrics */
+ return monitor_cfg->num_counters;
+ }
+
+ const struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+ const int group = monitor_cfg->counters[index].group;
+ const int counter_index = monitor_cfg->counters[index].counter;
+ struct intel_perf_query_counter *counter =
+ &perf_cfg->queries[group].counters[counter_index];
+
+ info->group_id = group;
+ info->name = counter->name;
+ info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index;
+
+ if (counter->type == INTEL_PERF_COUNTER_TYPE_THROUGHPUT)
+ info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
+ else
+ info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
+ switch (counter->data_type) {
+ case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
+ case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
+ info->type = PIPE_DRIVER_QUERY_TYPE_UINT;
+ info->max_value.u32 = 0;
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
+ info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
+ info->max_value.u64 = 0;
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
+ case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
+ info->type = PIPE_DRIVER_QUERY_TYPE_FLOAT;
+ info->max_value.u64 = -1;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+
+ /* indicates that this is an OA query, not a pipeline statistics query */
+ info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
+ return 1;
+}
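The hook above uses gallium's two-step convention: with info == NULL it returns the number of driver-specific queries, otherwise it fills in the entry for one index. A hedged sketch (note that crocus_get_monitor_group_info(), below, is what lazily initializes the metrics):

   int n = crocus_get_monitor_info(pscreen, 0, NULL);   /* counter count */
   for (int i = 0; i < n; i++) {
      struct pipe_driver_query_info qi;
      crocus_get_monitor_info(pscreen, i, &qi);
      /* qi.name, qi.query_type, qi.type now describe counter i */
   }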
+
+typedef void (*bo_unreference_t)(void *);
+typedef void *(*bo_map_t)(void *, void *, unsigned flags);
+typedef void (*bo_unmap_t)(void *);
+typedef void (*emit_mi_report_t)(void *, void *, uint32_t, uint32_t);
+typedef void (*emit_mi_flush_t)(void *);
+typedef void (*capture_frequency_stat_register_t)(void *, void *,
+ uint32_t );
+typedef void (*store_register_mem64_t)(void *ctx, void *bo,
+ uint32_t reg, uint32_t offset);
+typedef bool (*batch_references_t)(void *batch, void *bo);
+typedef void (*bo_wait_rendering_t)(void *bo);
+typedef int (*bo_busy_t)(void *bo);
+
+static void *
+crocus_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size)
+{
+ return crocus_bo_alloc(bufmgr, name, size);
+}
+
+#if 0
+static void
+crocus_monitor_emit_mi_flush(struct crocus_context *ice)
+{
+ const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_DATA_CACHE_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_VF_CACHE_INVALIDATE |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CS_STALL;
+ crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
+ "OA metrics", flags);
+}
+#endif
+
+static void
+crocus_monitor_emit_mi_report_perf_count(void *c,
+ void *bo,
+ uint32_t offset_in_bytes,
+ uint32_t report_id)
+{
+ struct crocus_context *ice = c;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ screen->vtbl.emit_mi_report_perf_count(batch, bo, offset_in_bytes, report_id);
+}
+
+static void
+crocus_monitor_batchbuffer_flush(void *c, const char *file, int line)
+{
+ struct crocus_context *ice = c;
+ _crocus_batch_flush(&ice->batches[CROCUS_BATCH_RENDER], __FILE__, __LINE__);
+}
+
+#if 0
+static void
+crocus_monitor_capture_frequency_stat_register(void *ctx,
+ void *bo,
+ uint32_t bo_offset)
+{
+ struct crocus_context *ice = ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ ice->vtbl.store_register_mem32(batch, GEN9_RPSTAT0, bo, bo_offset, false);
+}
+
+static void
+crocus_monitor_store_register_mem64(void *ctx, void *bo,
+ uint32_t reg, uint32_t offset)
+{
+ struct crocus_context *ice = ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
+}
+#endif
+
+static bool
+crocus_monitor_init_metrics(struct crocus_screen *screen)
+{
+ struct crocus_monitor_config *monitor_cfg =
+ rzalloc(screen, struct crocus_monitor_config);
+ struct intel_perf_config *perf_cfg = NULL;
+ if (unlikely(!monitor_cfg))
+ goto allocation_error;
+ perf_cfg = intel_perf_new(monitor_cfg);
+ if (unlikely(!perf_cfg))
+ goto allocation_error;
+
+ monitor_cfg->perf_cfg = perf_cfg;
+
+ perf_cfg->vtbl.bo_alloc = crocus_oa_bo_alloc;
+ perf_cfg->vtbl.bo_unreference = (bo_unreference_t)crocus_bo_unreference;
+ perf_cfg->vtbl.bo_map = (bo_map_t)crocus_bo_map;
+ perf_cfg->vtbl.bo_unmap = (bo_unmap_t)crocus_bo_unmap;
+
+ perf_cfg->vtbl.emit_mi_report_perf_count =
+ (emit_mi_report_t)crocus_monitor_emit_mi_report_perf_count;
+ perf_cfg->vtbl.batchbuffer_flush = crocus_monitor_batchbuffer_flush;
+ perf_cfg->vtbl.batch_references = (batch_references_t)crocus_batch_references;
+ perf_cfg->vtbl.bo_wait_rendering =
+ (bo_wait_rendering_t)crocus_bo_wait_rendering;
+ perf_cfg->vtbl.bo_busy = (bo_busy_t)crocus_bo_busy;
+
+ intel_perf_init_metrics(perf_cfg, &screen->devinfo, screen->fd, false, false);
+ screen->monitor_cfg = monitor_cfg;
+
+ /* a gallium "group" is equivalent to a gen "query"
+ * a gallium "query" is equivalent to a gen "query_counter"
+ *
+ * Each gen_query supports a specific number of query_counters. To
+ * allocate the array of crocus_monitor_counter, we need an upper bound
+ * (ignoring duplicate query_counters).
+ */
+ int gen_query_counters_count = 0;
+ for (int gen_query_id = 0;
+ gen_query_id < perf_cfg->n_queries;
+ ++gen_query_id) {
+ gen_query_counters_count += perf_cfg->queries[gen_query_id].n_counters;
+ }
+
+ monitor_cfg->counters = rzalloc_size(monitor_cfg,
+ sizeof(struct crocus_monitor_counter) *
+ gen_query_counters_count);
+ if (unlikely(!monitor_cfg->counters))
+ goto allocation_error;
+
+ int crocus_monitor_id = 0;
+ for (int group = 0; group < perf_cfg->n_queries; ++group) {
+ for (int counter = 0;
+ counter < perf_cfg->queries[group].n_counters;
+ ++counter) {
+ /* Check previously identified metrics to filter out duplicates. The
+ * user is not helped by having the same metric available in several
+ * groups. (n^2 algorithm).
+ */
+ bool duplicate = false;
+ for (int existing_group = 0;
+ existing_group < group && !duplicate;
+ ++existing_group) {
+ for (int existing_counter = 0;
+ existing_counter < perf_cfg->queries[existing_group].n_counters && !duplicate;
+ ++existing_counter) {
+ const char *current_name =
+ perf_cfg->queries[group].counters[counter].name;
+ const char *existing_name =
+ perf_cfg->queries[existing_group].counters[existing_counter].name;
+ if (strcmp(current_name, existing_name) == 0) {
+ duplicate = true;
+ }
+ }
+ }
+ if (duplicate)
+ continue;
+ monitor_cfg->counters[crocus_monitor_id].group = group;
+ monitor_cfg->counters[crocus_monitor_id].counter = counter;
+ ++crocus_monitor_id;
+ }
+ }
+ monitor_cfg->num_counters = crocus_monitor_id;
+ return monitor_cfg->num_counters;
+
+allocation_error:
+   /* monitor_cfg, perf_cfg, and the counters array are all ralloc'd, with
+    * monitor_cfg (itself a child of the screen) as the parent, so releasing
+    * monitor_cfg with ralloc_free() frees everything allocated so far.
+    */
+   ralloc_free(monitor_cfg);
+   return false;
+}
+
+int
+crocus_get_monitor_group_info(struct pipe_screen *pscreen,
+ unsigned group_index,
+ struct pipe_driver_query_group_info *info)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ if (!screen->monitor_cfg) {
+ if (!crocus_monitor_init_metrics(screen))
+ return 0;
+ }
+
+ const struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+ const struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+
+ if (!info) {
+ /* return the count that can be queried */
+ return perf_cfg->n_queries;
+ }
+
+ if (group_index >= perf_cfg->n_queries) {
+ /* out of range */
+ return 0;
+ }
+
+ struct intel_perf_query_info *query = &perf_cfg->queries[group_index];
+
+ info->name = query->name;
+ info->max_active_queries = query->n_counters;
+ info->num_queries = query->n_counters;
+
+ return 1;
+}
+
+static void
+crocus_init_monitor_ctx(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+ struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+
+ ice->perf_ctx = intel_perf_new_context(ice);
+ if (unlikely(!ice->perf_ctx))
+ return;
+
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+ struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+ intel_perf_init_context(perf_ctx,
+ perf_cfg,
+ ice,
+ ice,
+ screen->bufmgr,
+ &screen->devinfo,
+ ice->batches[CROCUS_BATCH_RENDER].hw_ctx_id,
+ screen->fd);
+}
+
+/* entry point for GenPerfMonitorsAMD */
+struct crocus_monitor_object *
+crocus_create_monitor_object(struct crocus_context *ice,
+ unsigned num_queries,
+ unsigned *query_types)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+ struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+ struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+ struct intel_perf_query_object *query_obj = NULL;
+
+ /* initialize perf context if this has not already been done. This
+ * function is the first entry point that carries the gl context.
+ */
+ if (ice->perf_ctx == NULL) {
+ crocus_init_monitor_ctx(ice);
+ }
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+
+ assert(num_queries > 0);
+ int query_index = query_types[0] - PIPE_QUERY_DRIVER_SPECIFIC;
+   assert(query_index < monitor_cfg->num_counters);
+ const int group = monitor_cfg->counters[query_index].group;
+
+ struct crocus_monitor_object *monitor =
+ calloc(1, sizeof(struct crocus_monitor_object));
+ if (unlikely(!monitor))
+ goto allocation_failure;
+
+ monitor->num_active_counters = num_queries;
+ monitor->active_counters = calloc(num_queries, sizeof(int));
+ if (unlikely(!monitor->active_counters))
+ goto allocation_failure;
+
+ for (int i = 0; i < num_queries; ++i) {
+ unsigned current_query = query_types[i];
+ unsigned current_query_index = current_query - PIPE_QUERY_DRIVER_SPECIFIC;
+
+ /* all queries must be in the same group */
+      assert(current_query_index < monitor_cfg->num_counters);
+ assert(monitor_cfg->counters[current_query_index].group == group);
+ monitor->active_counters[i] =
+ monitor_cfg->counters[current_query_index].counter;
+ }
+
+ /* create the intel_perf_query */
+ query_obj = intel_perf_new_query(perf_ctx, group);
+ if (unlikely(!query_obj))
+ goto allocation_failure;
+
+ monitor->query = query_obj;
+ monitor->result_size = perf_cfg->queries[group].data_size;
+ monitor->result_buffer = calloc(1, monitor->result_size);
+ if (unlikely(!monitor->result_buffer))
+ goto allocation_failure;
+
+ return monitor;
+
+allocation_failure:
+ if (monitor) {
+ free(monitor->active_counters);
+ free(monitor->result_buffer);
+ }
+ free(query_obj);
+ free(monitor);
+ return NULL;
+}
+
+void
+crocus_destroy_monitor_object(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ intel_perf_delete_query(ice->perf_ctx, monitor->query);
+ free(monitor->result_buffer);
+ monitor->result_buffer = NULL;
+ free(monitor->active_counters);
+ monitor->active_counters = NULL;
+ free(monitor);
+}
+
+bool
+crocus_begin_monitor(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+
+ return intel_perf_begin_query(perf_ctx, monitor->query);
+}
+
+bool
+crocus_end_monitor(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+
+ intel_perf_end_query(perf_ctx, monitor->query);
+ return true;
+}
+
+bool
+crocus_get_monitor_result(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor,
+ bool wait,
+ union pipe_numeric_type_union *result)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+ bool monitor_ready =
+ intel_perf_is_query_ready(perf_ctx, monitor->query, batch);
+
+ if (!monitor_ready) {
+ if (!wait)
+ return false;
+ intel_perf_wait_query(perf_ctx, monitor->query, batch);
+ }
+
+ assert(intel_perf_is_query_ready(perf_ctx, monitor->query, batch));
+
+ unsigned bytes_written;
+ intel_perf_get_query_data(perf_ctx, monitor->query, batch,
+ monitor->result_size,
+ (unsigned*) monitor->result_buffer,
+ &bytes_written);
+ if (bytes_written != monitor->result_size)
+ return false;
+
+ /* copy metrics into the batch result */
+ for (int i = 0; i < monitor->num_active_counters; ++i) {
+ int current_counter = monitor->active_counters[i];
+ const struct intel_perf_query_info *info =
+ intel_perf_query_info(monitor->query);
+ const struct intel_perf_query_counter *counter =
+ &info->counters[current_counter];
+ assert(intel_perf_query_counter_get_size(counter));
+ switch (counter->data_type) {
+ case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
+ result[i].u64 = *(uint64_t*)(monitor->result_buffer + counter->offset);
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
+ result[i].f = *(float*)(monitor->result_buffer + counter->offset);
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
+ case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
+ result[i].u64 = *(uint32_t*)(monitor->result_buffer + counter->offset);
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE: {
+ double v = *(double*)(monitor->result_buffer + counter->offset);
+ result[i].f = v;
+ break;
+ }
+ default:
+ unreachable("unexpected counter data type");
+ }
+ }
+ return true;
+}
diff --git a/src/gallium/drivers/crocus/crocus_monitor.h b/src/gallium/drivers/crocus/crocus_monitor.h
new file mode 100644
index 00000000000..3335c8860e2
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_monitor.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_MONITOR_H
+#define CROCUS_MONITOR_H
+
+#include "pipe/p_screen.h"
+
+struct crocus_monitor_counter {
+ int group;
+ int counter;
+};
+
+struct crocus_monitor_config {
+ struct intel_perf_config *perf_cfg;
+
+ /* gallium requires an index for each counter */
+ int num_counters;
+ struct crocus_monitor_counter *counters;
+};
+
+int crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index,
+ struct pipe_driver_query_info *info);
+int crocus_get_monitor_group_info(struct pipe_screen *pscreen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info);
+
+struct crocus_context;
+struct crocus_screen;
+
+struct crocus_monitor_object *
+crocus_create_monitor_object(struct crocus_context *ice,
+ unsigned num_queries,
+ unsigned *query_types);
+
+struct pipe_query;
+void crocus_destroy_monitor_object(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor);
+
+bool
+crocus_begin_monitor(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor);
+bool
+crocus_end_monitor(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor);
+
+bool
+crocus_get_monitor_result(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor,
+ bool wait,
+ union pipe_numeric_type_union *result);
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_pipe.h b/src/gallium/drivers/crocus/crocus_pipe.h
new file mode 100644
index 00000000000..71b12d08e16
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_pipe.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_PIPE_H
+#define CROCUS_PIPE_H
+
+#include "pipe/p_defines.h"
+#include "compiler/shader_enums.h"
+
+static inline gl_shader_stage
+stage_from_pipe(enum pipe_shader_type pstage)
+{
+ static const gl_shader_stage stages[PIPE_SHADER_TYPES] = {
+ [PIPE_SHADER_VERTEX] = MESA_SHADER_VERTEX,
+ [PIPE_SHADER_TESS_CTRL] = MESA_SHADER_TESS_CTRL,
+ [PIPE_SHADER_TESS_EVAL] = MESA_SHADER_TESS_EVAL,
+ [PIPE_SHADER_GEOMETRY] = MESA_SHADER_GEOMETRY,
+ [PIPE_SHADER_FRAGMENT] = MESA_SHADER_FRAGMENT,
+ [PIPE_SHADER_COMPUTE] = MESA_SHADER_COMPUTE,
+ };
+ return stages[pstage];
+}
+
+static inline enum pipe_shader_type
+stage_to_pipe(gl_shader_stage stage)
+{
+ static const enum pipe_shader_type pstages[MESA_SHADER_STAGES] = {
+ [MESA_SHADER_VERTEX] = PIPE_SHADER_VERTEX,
+ [MESA_SHADER_TESS_CTRL] = PIPE_SHADER_TESS_CTRL,
+ [MESA_SHADER_TESS_EVAL] = PIPE_SHADER_TESS_EVAL,
+ [MESA_SHADER_GEOMETRY] = PIPE_SHADER_GEOMETRY,
+ [MESA_SHADER_FRAGMENT] = PIPE_SHADER_FRAGMENT,
+ [MESA_SHADER_COMPUTE] = PIPE_SHADER_COMPUTE,
+ };
+ return pstages[stage];
+}
+
+/**
+ * Convert a swizzle enumeration (e.g. PIPE_SWIZZLE_X) to one of the HW's
+ * "Shader Channel Select" enumerations (i.e. SCS_RED). The mappings are
+ *
+ *  SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE
+ *          0          1          2          3             4            5
+ *          4          5          6          7             0            1
+ *    SCS_RED, SCS_GREEN,  SCS_BLUE, SCS_ALPHA,     SCS_ZERO,     SCS_ONE
+ *
+ * which is simply adding 4 then modding by 8 (or anding with 7).
+ */
+static inline enum isl_channel_select
+pipe_swizzle_to_isl_channel(enum pipe_swizzle swizzle)
+{
+ return (swizzle + 4) & 7;
+}
+
+#endif
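A quick sanity check of the arithmetic described above, using the values from the mapping table (PIPE_SWIZZLE_X is 0 and PIPE_SWIZZLE_ZERO is 4):

   assert(pipe_swizzle_to_isl_channel(PIPE_SWIZZLE_X)    == 4 /* SCS_RED  */);
   assert(pipe_swizzle_to_isl_channel(PIPE_SWIZZLE_ZERO) == 0 /* SCS_ZERO */);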
diff --git a/src/gallium/drivers/crocus/crocus_pipe_control.c b/src/gallium/drivers/crocus/crocus_pipe_control.c
new file mode 100644
index 00000000000..7a9625c61ed
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_pipe_control.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_pipe_control.c
+ *
+ * PIPE_CONTROL is the main flushing and synchronization primitive on Intel
+ * GPUs. It can invalidate caches, stall until rendering reaches various
+ * stages of completion, write to memory, and other things. In a way, it's
+ * a swiss army knife command - it has all kinds of capabilities, but some
+ * significant limitations as well.
+ *
+ * Unfortunately, it's notoriously complicated and difficult to use. Many
+ * sub-commands can't be used together. Some are meant to be used at the
+ * top of the pipeline (invalidating caches before drawing), while some are
+ * meant to be used at the end (stalling or flushing after drawing).
+ *
+ * Also, there's a list of restrictions a mile long, which vary by generation.
+ * Do this before doing that, or suffer the consequences (usually a GPU hang).
+ *
+ * This file contains helpers for emitting them safely. You can simply call
+ * crocus_emit_pipe_control_flush() with the desired operations (as logical
+ * PIPE_CONTROL_* bits), and it will take care of splitting it into multiple
+ * PIPE_CONTROL commands as necessary. The per-generation workarounds are
+ * applied in crocus_emit_raw_pipe_control() in crocus_state.c.
+ */
+
+#include "crocus_context.h"
+#include "util/hash_table.h"
+#include "util/set.h"
+
+/**
+ * Emit a PIPE_CONTROL with various flushing flags.
+ *
+ * The caller is responsible for deciding what flags are appropriate for the
+ * given generation.
+ */
+void
+crocus_emit_pipe_control_flush(struct crocus_batch *batch,
+ const char *reason,
+ uint32_t flags)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ if (devinfo->ver >= 6 &&
+ (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) &&
+ (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) {
+ /* A pipe control command with flush and invalidate bits set
+ * simultaneously is an inherently racy operation on Gen6+ if the
+ * contents of the flushed caches were intended to become visible from
+ * any of the invalidated caches. Split it in two PIPE_CONTROLs, the
+ * first one should stall the pipeline to make sure that the flushed R/W
+ * caches are coherent with memory once the specified R/O caches are
+ * invalidated. On pre-Gen6 hardware the (implicit) R/O cache
+ * invalidation seems to happen at the bottom of the pipeline together
+ * with any write cache flush, so this shouldn't be a concern. In order
+ * to ensure a full stall, we do an end-of-pipe sync.
+ */
+ crocus_emit_end_of_pipe_sync(batch, reason,
+ flags & PIPE_CONTROL_CACHE_FLUSH_BITS);
+ flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
+ }
+
+ batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, NULL, 0, 0);
+}
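A sketch of a combined request that the helper above would split on Gen6+ (the flag combination is illustrative; real callers choose flags for the specific hazard they are handling):

   /* Flush the render target cache and invalidate the texture cache before
    * sampling a freshly rendered surface.  On Gen6+ this becomes an
    * end-of-pipe sync for the flush bits followed by a second PIPE_CONTROL
    * carrying only the invalidate bits. */
   crocus_emit_pipe_control_flush(batch, "render-to-texture hazard",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);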
+
+/**
+ * Emit a PIPE_CONTROL that writes to a buffer object.
+ *
+ * \p flags should contain one of the following items:
+ * - PIPE_CONTROL_WRITE_IMMEDIATE
+ * - PIPE_CONTROL_WRITE_TIMESTAMP
+ * - PIPE_CONTROL_WRITE_DEPTH_COUNT
+ */
+void
+crocus_emit_pipe_control_write(struct crocus_batch *batch,
+ const char *reason, uint32_t flags,
+ struct crocus_bo *bo, uint32_t offset,
+ uint64_t imm)
+{
+ batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, bo, offset, imm);
+}
+
+/**
+ * Restriction [DevSNB, DevIVB]:
+ *
+ * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
+ * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
+ * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
+ * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
+ * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
+ * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
+ * unless SW can otherwise guarantee that the pipeline from WM onwards is
+ * already flushed (e.g., via a preceding MI_FLUSH).
+ */
+void
+crocus_emit_depth_stall_flushes(struct crocus_batch *batch)
+{
+ UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ assert(devinfo->ver >= 6);
+
+ crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_STALL);
+ crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_CACHE_FLUSH);
+ crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_STALL);
+}
+
+/*
+ * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
+ *
+ * Write synchronization is a special case of end-of-pipe
+ * synchronization that requires that the render cache and/or depth
+ * related caches are flushed to memory, where the data will become
+ * globally visible. This type of synchronization is required prior to
+ * SW (CPU) actually reading the result data from memory, or initiating
+ * an operation that will use as a read surface (such as a texture
+ * surface) a previous render target and/or depth/stencil buffer
+ *
+ * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
+ *
+ * Exercising the write cache flush bits (Render Target Cache Flush
+ * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
+ * ensures the write caches are flushed and doesn't guarantee the data
+ * is globally visible.
+ *
+ * SW can track the completion of the end-of-pipe-synchronization by
+ * using "Notify Enable" and "PostSync Operation - Write Immediate
+ * Data" in the PIPE_CONTROL command.
+ */
+void
+crocus_emit_end_of_pipe_sync(struct crocus_batch *batch,
+ const char *reason, uint32_t flags)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ if (devinfo->ver >= 6) {
+ /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
+ *
+ * "The most common action to perform upon reaching a synchronization
+ * point is to write a value out to memory. An immediate value
+ * (included with the synchronization command) may be written."
+ *
+ * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
+ *
+ * "In case the data flushed out by the render engine is to be read
+ * back in to the render engine in coherent manner, then the render
+ * engine has to wait for the fence completion before accessing the
+ * flushed data. This can be achieved by following means on various
+ * products: PIPE_CONTROL command with CS Stall and the required
+ * write caches flushed with Post-Sync-Operation as Write Immediate
+ * Data.
+ *
+ * Example:
+ * - Workload-1 (3D/GPGPU/MEDIA)
+ * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate
+ * Data, Required Write Cache Flush bits set)
+ * - Workload-2 (Can use the data produce or output by Workload-1)
+ */
+ crocus_emit_pipe_control_write(batch, reason,
+ flags | PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_WRITE_IMMEDIATE,
+ batch->ice->workaround_bo,
+ batch->ice->workaround_offset, 0);
+
+ if (batch->screen->devinfo.is_haswell) {
+#define GEN7_3DPRIM_START_INSTANCE 0x243C
+ batch->screen->vtbl.load_register_mem32(batch, GEN7_3DPRIM_START_INSTANCE,
+ batch->ice->workaround_bo,
+ batch->ice->workaround_offset);
+ }
+ } else {
+ /* On gen4-5, a regular pipe control seems to suffice. */
+ crocus_emit_pipe_control_flush(batch, reason, flags);
+ }
+}
+
+/* Emit a pipelined flush to either flush render and texture cache for
+ * reading from a FBO-drawn texture, or flush so that frontbuffer
+ * render appears on the screen in DRI1.
+ *
+ * This is also used for the always_flush_cache driconf debug option.
+ */
+void
+crocus_emit_mi_flush(struct crocus_batch *batch)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH;
+ if (devinfo->ver >= 6) {
+ flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_DATA_CACHE_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_VF_CACHE_INVALIDATE |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CS_STALL;
+ }
+ crocus_emit_pipe_control_flush(batch, "mi flush", flags);
+}
+
+/**
+ * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
+ * implementing two workarounds on gen6. From section 1.4.7.1
+ * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
+ *
+ * [DevSNB-C+{W/A}] Before any depth stall flush (including those
+ * produced by non-pipelined state commands), software needs to first
+ * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
+ * 0.
+ *
+ * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
+ * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
+ *
+ * And the workaround for these two requires this workaround first:
+ *
+ * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
+ * BEFORE the pipe-control with a post-sync op and no write-cache
+ * flushes.
+ *
+ * And this last workaround is tricky because of the requirements on
+ * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
+ * volume 2 part 1:
+ *
+ * "1 of the following must also be set:
+ * - Render Target Cache Flush Enable ([12] of DW1)
+ * - Depth Cache Flush Enable ([0] of DW1)
+ * - Stall at Pixel Scoreboard ([1] of DW1)
+ * - Depth Stall ([13] of DW1)
+ * - Post-Sync Operation ([13] of DW1)
+ * - Notify Enable ([8] of DW1)"
+ *
+ * The cache flushes require the workaround flush that triggered this
+ * one, so we can't use it. Depth stall would trigger the same.
+ * Post-sync nonzero is what triggered this second workaround, so we
+ * can't use that one either. Notify enable is IRQs, which aren't
+ * really our business. That leaves only stall at scoreboard.
+ */
+void
+crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch)
+{
+ crocus_emit_pipe_control_flush(batch, "nonzero",
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD);
+
+ crocus_emit_pipe_control_write(batch, "nonzero",
+ PIPE_CONTROL_WRITE_IMMEDIATE,
+ batch->ice->workaround_bo,
+ batch->ice->workaround_offset, 0);
+}
+
+/**
+ * Flush and invalidate all caches (for debugging purposes).
+ */
+void
+crocus_flush_all_caches(struct crocus_batch *batch)
+{
+ crocus_emit_pipe_control_flush(batch, "debug: flush all caches",
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_DATA_CACHE_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_VF_CACHE_INVALIDATE |
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+}
+
+static void
+crocus_texture_barrier(struct pipe_context *ctx, unsigned flags)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_batch *render_batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_batch *compute_batch = &ice->batches[CROCUS_BATCH_COMPUTE];
+ const struct intel_device_info *devinfo = &render_batch->screen->devinfo;
+
+ if (devinfo->ver < 6) {
+ crocus_emit_mi_flush(render_batch);
+ return;
+ }
+
+ if (render_batch->contains_draw) {
+ crocus_batch_maybe_flush(render_batch, 48);
+ crocus_emit_pipe_control_flush(render_batch,
+ "API: texture barrier (1/2)",
+ (flags == 1 ? PIPE_CONTROL_DEPTH_CACHE_FLUSH : 0) |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ crocus_emit_pipe_control_flush(render_batch,
+ "API: texture barrier (2/2)",
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+ }
+
+ if (compute_batch->contains_draw) {
+ crocus_batch_maybe_flush(compute_batch, 48);
+ crocus_emit_pipe_control_flush(compute_batch,
+ "API: texture barrier (1/2)",
+ PIPE_CONTROL_CS_STALL);
+ crocus_emit_pipe_control_flush(compute_batch,
+ "API: texture barrier (2/2)",
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+ }
+}
+
+static void
+crocus_memory_barrier(struct pipe_context *ctx, unsigned flags)
+{
+ struct crocus_context *ice = (void *) ctx;
+ unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
+ const struct intel_device_info *devinfo = &ice->batches[0].screen->devinfo;
+
+ assert(devinfo->ver == 7);
+
+ if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
+ PIPE_BARRIER_INDEX_BUFFER |
+ PIPE_BARRIER_INDIRECT_BUFFER)) {
+ bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+ }
+
+ if (flags & PIPE_BARRIER_CONSTANT_BUFFER) {
+ bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+ }
+
+ if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_FRAMEBUFFER)) {
+ bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH;
+ }
+
+ /* Typed surface messages are handled by the render cache on IVB, so we
+ * need to flush it too.
+ */
+ if (!devinfo->is_haswell)
+ bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
+
+ for (int i = 0; i < ice->batch_count; i++) {
+ if (ice->batches[i].contains_draw) {
+ crocus_batch_maybe_flush(&ice->batches[i], 24);
+ crocus_emit_pipe_control_flush(&ice->batches[i], "API: memory barrier",
+ bits);
+ }
+ }
+}
+
+void
+crocus_init_flush_functions(struct pipe_context *ctx)
+{
+ ctx->memory_barrier = crocus_memory_barrier;
+ ctx->texture_barrier = crocus_texture_barrier;
+}
diff --git a/src/gallium/drivers/crocus/crocus_program.c b/src/gallium/drivers/crocus/crocus_program.c
new file mode 100644
index 00000000000..fb8216b71ab
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_program.c
@@ -0,0 +1,3171 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_program.c
+ *
+ * This file contains the driver interface for compiling shaders.
+ *
+ * See crocus_program_cache.c for the in-memory program cache where the
+ * compiled shaders are stored.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_atomic.h"
+#include "util/u_upload_mgr.h"
+#include "util/debug.h"
+#include "util/u_prim.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_serialize.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/compiler/brw_nir.h"
+#include "crocus_context.h"
+#include "nir/tgsi_to_nir.h"
+
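+/* 0x688 is SWIZZLE_NOOP: the identity swizzle (X, Y, Z, W) packed into
+ * 3-bit fields.
+ */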
+#define KEY_INIT_NO_ID() \
+ .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \
+ .base.tex.swizzles[0 ... MAX_SAMPLERS - 1] = 0x688, \
+ .base.tex.compressed_multisample_layout_mask = ~0
+#define KEY_INIT() .base.program_string_id = ish->program_id, KEY_INIT_NO_ID()
+
+static void
+crocus_sanitize_tex_key(struct brw_sampler_prog_key_data *key)
+{
+ key->gather_channel_quirk_mask = 0;
+ for (unsigned s = 0; s < MAX_SAMPLERS; s++) {
+ key->swizzles[s] = SWIZZLE_NOOP;
+ key->gfx6_gather_wa[s] = 0;
+ }
+}
+
+static uint32_t
+crocus_get_texture_swizzle(const struct crocus_context *ice,
+ const struct crocus_sampler_view *t)
+{
+ uint32_t swiz = 0;
+
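+ /* Pack the four 3-bit swizzle selectors into a single 12-bit value. */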
+ for (int i = 0; i < 4; i++) {
+ swiz |= t->swizzle[i] << (i * 3);
+ }
+ return swiz;
+}
+
+static inline bool can_push_ubo(const struct intel_device_info *devinfo)
+{
+ /* push works for everyone except SNB at the moment */
+ return devinfo->ver != 6;
+}
+
+static uint8_t
+gfx6_gather_workaround(enum pipe_format pformat)
+{
+ switch (pformat) {
+ case PIPE_FORMAT_R8_SINT: return WA_SIGN | WA_8BIT;
+ case PIPE_FORMAT_R8_UINT: return WA_8BIT;
+ case PIPE_FORMAT_R16_SINT: return WA_SIGN | WA_16BIT;
+ case PIPE_FORMAT_R16_UINT: return WA_16BIT;
+ default:
+ /* Note that even though PIPE_FORMAT_R32_SINT and
+ * PIPE_FORMAT_R32_UINT have format overrides in
+ * the surface state, there is no shader w/a required.
+ */
+ return 0;
+ }
+}
+
+static const unsigned crocus_gfx6_swizzle_for_offset[4] = {
+ BRW_SWIZZLE4(0, 1, 2, 3),
+ BRW_SWIZZLE4(1, 2, 3, 3),
+ BRW_SWIZZLE4(2, 3, 3, 3),
+ BRW_SWIZZLE4(3, 3, 3, 3)
+};
+
+static void
+gfx6_gs_xfb_setup(const struct pipe_stream_output_info *so_info,
+ struct brw_gs_prog_data *gs_prog_data)
+{
+ /* Make sure that the VUE slots won't overflow the unsigned chars in
+ * prog_data->transform_feedback_bindings[].
+ */
+ STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
+
+ /* Make sure that we don't need more binding table entries than we've
+ * set aside for use in transform feedback. (We shouldn't, since we
+ * set aside enough binding table entries to have one per component).
+ */
+ assert(so_info->num_outputs <= BRW_MAX_SOL_BINDINGS);
+
+ gs_prog_data->num_transform_feedback_bindings = so_info->num_outputs;
+ for (unsigned i = 0; i < so_info->num_outputs; i++) {
+ gs_prog_data->transform_feedback_bindings[i] =
+ so_info->output[i].register_index;
+ gs_prog_data->transform_feedback_swizzles[i] =
+ crocus_gfx6_swizzle_for_offset[so_info->output[i].start_component];
+ }
+}
+
+static void
+gfx6_ff_gs_xfb_setup(const struct pipe_stream_output_info *so_info,
+ struct brw_ff_gs_prog_key *key)
+{
+ key->num_transform_feedback_bindings = so_info->num_outputs;
+ for (unsigned i = 0; i < so_info->num_outputs; i++) {
+ key->transform_feedback_bindings[i] =
+ so_info->output[i].register_index;
+ key->transform_feedback_swizzles[i] =
+ crocus_gfx6_swizzle_for_offset[so_info->output[i].start_component];
+ }
+}
+
+static void
+crocus_populate_sampler_prog_key_data(struct crocus_context *ice,
+ const struct intel_device_info *devinfo,
+ gl_shader_stage stage,
+ struct crocus_uncompiled_shader *ish,
+ bool uses_texture_gather,
+ struct brw_sampler_prog_key_data *key)
+{
+ uint32_t mask = ish->nir->info.textures_used[0];
+
+ while (mask) {
+ const int s = u_bit_scan(&mask);
+
+ struct crocus_sampler_view *texture = ice->state.shaders[stage].textures[s];
+ key->swizzles[s] = SWIZZLE_NOOP;
+ key->scale_factors[s] = 0.0f;
+
+ if (!texture)
+ continue;
+ if (texture->base.target == PIPE_BUFFER)
+ continue;
+ if (!devinfo->is_haswell) {
+ key->swizzles[s] = crocus_get_texture_swizzle(ice, texture);
+ }
+
+ /* gather4 for RG32* is broken in multiple ways on Gen7. */
+ if (devinfo->ver == 7 && uses_texture_gather) {
+ switch (texture->base.format) {
+ case PIPE_FORMAT_R32G32_UINT:
+ case PIPE_FORMAT_R32G32_SINT: {
+ /* We have to override the format to R32G32_FLOAT_LD.
+ * This means that SCS_ALPHA and SCS_ONE will return 0x3f800000
+ * (1.0) rather than integer 1. This needs shader hacks.
+ *
+ * On Ivybridge, we whack W (alpha) to ONE in our key's
+ * swizzle. On Haswell, we look at the original texture
+ * swizzle, and use XYZW with channels overridden to ONE,
+ * leaving normal texture swizzling to SCS.
+ */
+ unsigned src_swizzle = key->swizzles[s];
+ for (int i = 0; i < 4; i++) {
+ unsigned src_comp = GET_SWZ(src_swizzle, i);
+ if (src_comp == SWIZZLE_ONE || src_comp == SWIZZLE_W) {
+ key->swizzles[s] &= ~(0x7 << (3 * i));
+ key->swizzles[s] |= SWIZZLE_ONE << (3 * i);
+ }
+ }
+ }
+ FALLTHROUGH;
+ case PIPE_FORMAT_R32G32_FLOAT:
+ /* The channel select for green doesn't work - we have to
+ * request blue. Haswell can use SCS for this, but Ivybridge
+ * needs a shader workaround.
+ */
+ if (!devinfo->is_haswell)
+ key->gather_channel_quirk_mask |= 1 << s;
+ break;
+ default:
+ break;
+ }
+ }
+ if (devinfo->ver == 6 && uses_texture_gather) {
+ key->gfx6_gather_wa[s] = gfx6_gather_workaround(texture->base.format);
+ }
+ }
+}
+
+static void
+crocus_lower_swizzles(struct nir_shader *nir,
+ const struct brw_sampler_prog_key_data *key_tex)
+{
+ struct nir_lower_tex_options tex_options = { 0 };
+ uint32_t mask = nir->info.textures_used[0];
+
+ while (mask) {
+ const int s = u_bit_scan(&mask);
+
+ if (key_tex->swizzles[s] == SWIZZLE_NOOP)
+ continue;
+
+ tex_options.swizzle_result |= (1 << s);
+ for (unsigned c = 0; c < 4; c++)
+ tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c);
+ }
+ if (tex_options.swizzle_result)
+ nir_lower_tex(nir, &tex_options);
+}
+
+static unsigned
+get_new_program_id(struct crocus_screen *screen)
+{
+ return p_atomic_inc_return(&screen->program_id);
+}
+
+static nir_ssa_def *
+get_aoa_deref_offset(nir_builder *b,
+ nir_deref_instr *deref,
+ unsigned elem_size)
+{
+ unsigned array_size = elem_size;
+ nir_ssa_def *offset = nir_imm_int(b, 0);
+
+ while (deref->deref_type != nir_deref_type_var) {
+ assert(deref->deref_type == nir_deref_type_array);
+
+ /* This level's element size is the previous level's array size */
+ nir_ssa_def *index = nir_ssa_for_src(b, deref->arr.index, 1);
+ assert(deref->arr.index.ssa);
+ offset = nir_iadd(b, offset,
+ nir_imul(b, index, nir_imm_int(b, array_size)));
+
+ deref = nir_deref_instr_parent(deref);
+ assert(glsl_type_is_array(deref->type));
+ array_size *= glsl_get_length(deref->type);
+ }
+
+ /* Accessing an invalid surface index with the dataport can result in a
+ * hang. According to the spec "if the index used to select an individual
+ * element is negative or greater than or equal to the size of the array,
+ * the results of the operation are undefined but may not lead to
+ * termination" -- which is one of the possible outcomes of the hang.
+ * Clamp the index to prevent access outside of the array bounds.
+ */
+ return nir_umin(b, offset, nir_imm_int(b, array_size - elem_size));
+}
+
+static void
+crocus_lower_storage_image_derefs(nir_shader *nir)
+{
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_deref_store:
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_imin:
+ case nir_intrinsic_image_deref_atomic_umin:
+ case nir_intrinsic_image_deref_atomic_imax:
+ case nir_intrinsic_image_deref_atomic_umax:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ case nir_intrinsic_image_deref_size:
+ case nir_intrinsic_image_deref_samples:
+ case nir_intrinsic_image_deref_load_raw_intel:
+ case nir_intrinsic_image_deref_store_raw_intel: {
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+
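+ /* Flatten the (possibly array-of-arrays) image deref into a flat image
+ * index: the variable's driver_location plus the clamped AoA offset.
+ */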
+ b.cursor = nir_before_instr(&intrin->instr);
+ nir_ssa_def *index =
+ nir_iadd(&b, nir_imm_int(&b, var->data.driver_location),
+ get_aoa_deref_offset(&b, deref, 1));
+ nir_rewrite_image_intrinsic(intrin, index, false);
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+ }
+}
+
+// XXX: need unify_interfaces() at link time...
+
+/**
+ * Undo nir_lower_passthrough_edgeflags but keep the inputs_read flag.
+ */
+static bool
+crocus_fix_edge_flags(nir_shader *nir)
+{
+ if (nir->info.stage != MESA_SHADER_VERTEX) {
+ nir_shader_preserve_all_metadata(nir);
+ return false;
+ }
+
+ nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_out,
+ VARYING_SLOT_EDGE);
+ if (!var) {
+ nir_shader_preserve_all_metadata(nir);
+ return false;
+ }
+
+ var->data.mode = nir_var_shader_temp;
+ nir->info.outputs_written &= ~VARYING_BIT_EDGE;
+ nir->info.inputs_read &= ~VERT_BIT_EDGEFLAG;
+ nir_fixup_deref_modes(nir);
+
+ nir_foreach_function(f, nir) {
+ if (f->impl) {
+ nir_metadata_preserve(f->impl, nir_metadata_block_index |
+ nir_metadata_dominance |
+ nir_metadata_live_ssa_defs |
+ nir_metadata_loop_analysis);
+ } else {
+ nir_metadata_preserve(f->impl, nir_metadata_all);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Fix an uncompiled shader's stream output info.
+ *
+ * Core Gallium stores output->register_index as a "slot" number, where
+ * slots are assigned consecutively to all outputs in info->outputs_written.
+ * This naive packing of outputs doesn't work for us - we too have slots,
+ * but the layout is defined by the VUE map, which we won't have until we
+ * compile a specific shader variant. So, we remap these and simply store
+ * VARYING_SLOT_* in our copy's output->register_index fields.
+ *
+ * We also fix up VARYING_SLOT_{LAYER,VIEWPORT,PSIZ} to select the Y/Z/W
+ * components of our VUE header. See brw_vue_map.c for the layout.
+ */
+static void
+update_so_info(struct pipe_stream_output_info *so_info,
+ uint64_t outputs_written)
+{
+ uint8_t reverse_map[64] = {};
+ unsigned slot = 0;
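+ /* reverse_map[packed slot] gives the VARYING_SLOT_* assigned to it. */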
+ while (outputs_written) {
+ reverse_map[slot++] = u_bit_scan64(&outputs_written);
+ }
+
+ for (unsigned i = 0; i < so_info->num_outputs; i++) {
+ struct pipe_stream_output *output = &so_info->output[i];
+
+ /* Map Gallium's condensed "slots" back to real VARYING_SLOT_* enums */
+ output->register_index = reverse_map[output->register_index];
+
+ /* The VUE header contains three scalar fields packed together:
+ * - gl_PointSize is stored in VARYING_SLOT_PSIZ.w
+ * - gl_Layer is stored in VARYING_SLOT_PSIZ.y
+ * - gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
+ */
+ switch (output->register_index) {
+ case VARYING_SLOT_LAYER:
+ assert(output->num_components == 1);
+ output->register_index = VARYING_SLOT_PSIZ;
+ output->start_component = 1;
+ break;
+ case VARYING_SLOT_VIEWPORT:
+ assert(output->num_components == 1);
+ output->register_index = VARYING_SLOT_PSIZ;
+ output->start_component = 2;
+ break;
+ case VARYING_SLOT_PSIZ:
+ assert(output->num_components == 1);
+ output->start_component = 3;
+ break;
+ }
+
+ //info->outputs_written |= 1ull << output->register_index;
+ }
+}
+
+static void
+setup_vec4_image_sysval(uint32_t *sysvals, uint32_t idx,
+ unsigned offset, unsigned n)
+{
+ assert(offset % sizeof(uint32_t) == 0);
+
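+ /* Point n consecutive params at the dwords of the brw_image_param field
+ * at `offset`, and pad the rest of the vec4 with zeros.
+ */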
+ for (unsigned i = 0; i < n; ++i)
+ sysvals[i] = BRW_PARAM_IMAGE(idx, offset / sizeof(uint32_t) + i);
+
+ for (unsigned i = n; i < 4; ++i)
+ sysvals[i] = BRW_PARAM_BUILTIN_ZERO;
+}
+
+/**
+ * Associate NIR uniform variables with the prog_data->param[] mechanism
+ * used by the backend. Also, decide which UBOs we'd like to push in an
+ * ideal situation (though the backend can reduce this).
+ */
+static void
+crocus_setup_uniforms(const struct brw_compiler *compiler,
+ void *mem_ctx,
+ nir_shader *nir,
+ struct brw_stage_prog_data *prog_data,
+ enum brw_param_builtin **out_system_values,
+ unsigned *out_num_system_values,
+ unsigned *out_num_cbufs)
+{
+ UNUSED const struct intel_device_info *devinfo = compiler->devinfo;
+
+ const unsigned CROCUS_MAX_SYSTEM_VALUES =
+ PIPE_MAX_SHADER_IMAGES * BRW_IMAGE_PARAM_SIZE;
+ enum brw_param_builtin *system_values =
+ rzalloc_array(mem_ctx, enum brw_param_builtin, CROCUS_MAX_SYSTEM_VALUES);
+ unsigned num_system_values = 0;
+
+ unsigned patch_vert_idx = -1;
+ unsigned ucp_idx[CROCUS_MAX_CLIP_PLANES];
+ unsigned img_idx[PIPE_MAX_SHADER_IMAGES];
+ unsigned variable_group_size_idx = -1;
+ memset(ucp_idx, -1, sizeof(ucp_idx));
+ memset(img_idx, -1, sizeof(img_idx));
+
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ b.cursor = nir_before_block(nir_start_block(impl));
+ nir_ssa_def *temp_ubo_name = nir_ssa_undef(&b, 1, 32);
+ nir_ssa_def *temp_const_ubo_name = NULL;
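+ /* temp_ubo_name and temp_const_ubo_name are placeholder UBO indices;
+ * they get rewritten to the real system-value and constant-data cbuf
+ * indices once the final number of constant buffers is known below.
+ */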
+
+ /* Turn system value intrinsics into uniforms */
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ nir_ssa_def *offset;
+
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_load_constant: {
+ /* This one is special because it reads from the shader constant
+ * data and not cbuf0 which gallium uploads for us.
+ */
+ b.cursor = nir_before_instr(instr);
+ nir_ssa_def *offset =
+ nir_iadd_imm(&b, nir_ssa_for_src(&b, intrin->src[0], 1),
+ nir_intrinsic_base(intrin));
+
+ if (temp_const_ubo_name == NULL)
+ temp_const_ubo_name = nir_imm_int(&b, 0);
+
+ nir_intrinsic_instr *load_ubo =
+ nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ubo);
+ load_ubo->num_components = intrin->num_components;
+ load_ubo->src[0] = nir_src_for_ssa(temp_const_ubo_name);
+ load_ubo->src[1] = nir_src_for_ssa(offset);
+ nir_intrinsic_set_align(load_ubo, 4, 0);
+ nir_intrinsic_set_range_base(load_ubo, 0);
+ nir_intrinsic_set_range(load_ubo, ~0);
+ nir_ssa_dest_init(&load_ubo->instr, &load_ubo->dest,
+ intrin->dest.ssa.num_components,
+ intrin->dest.ssa.bit_size,
+ intrin->dest.ssa.name);
+ nir_builder_instr_insert(&b, &load_ubo->instr);
+
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+ &load_ubo->dest.ssa);
+ nir_instr_remove(&intrin->instr);
+ continue;
+ }
+ case nir_intrinsic_load_user_clip_plane: {
+ unsigned ucp = nir_intrinsic_ucp_id(intrin);
+
+ if (ucp_idx[ucp] == -1) {
+ ucp_idx[ucp] = num_system_values;
+ num_system_values += 4;
+ }
+
+ for (int i = 0; i < 4; i++) {
+ system_values[ucp_idx[ucp] + i] =
+ BRW_PARAM_BUILTIN_CLIP_PLANE(ucp, i);
+ }
+
+ b.cursor = nir_before_instr(instr);
+ offset = nir_imm_int(&b, ucp_idx[ucp] * sizeof(uint32_t));
+ break;
+ }
+ case nir_intrinsic_load_patch_vertices_in:
+ if (patch_vert_idx == -1)
+ patch_vert_idx = num_system_values++;
+
+ system_values[patch_vert_idx] =
+ BRW_PARAM_BUILTIN_PATCH_VERTICES_IN;
+
+ b.cursor = nir_before_instr(instr);
+ offset = nir_imm_int(&b, patch_vert_idx * sizeof(uint32_t));
+ break;
+ case nir_intrinsic_image_deref_load_param_intel: {
+ assert(devinfo->ver < 9);
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+
+ if (img_idx[var->data.binding] == -1) {
+ /* GL only allows arrays of arrays of images. */
+ assert(glsl_type_is_image(glsl_without_array(var->type)));
+ unsigned num_images = MAX2(1, glsl_get_aoa_size(var->type));
+
+ for (int i = 0; i < num_images; i++) {
+ const unsigned img = var->data.binding + i;
+
+ img_idx[img] = num_system_values;
+ num_system_values += BRW_IMAGE_PARAM_SIZE;
+
+ uint32_t *img_sv = &system_values[img_idx[img]];
+
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_OFFSET_OFFSET, img,
+ offsetof(struct brw_image_param, offset), 2);
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_SIZE_OFFSET, img,
+ offsetof(struct brw_image_param, size), 3);
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_STRIDE_OFFSET, img,
+ offsetof(struct brw_image_param, stride), 4);
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_TILING_OFFSET, img,
+ offsetof(struct brw_image_param, tiling), 3);
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_SWIZZLING_OFFSET, img,
+ offsetof(struct brw_image_param, swizzling), 2);
+ }
+ }
+
+ b.cursor = nir_before_instr(instr);
+ offset = nir_iadd(&b,
+ get_aoa_deref_offset(&b, deref, BRW_IMAGE_PARAM_SIZE * 4),
+ nir_imm_int(&b, img_idx[var->data.binding] * 4 +
+ nir_intrinsic_base(intrin) * 16));
+ break;
+ }
+ case nir_intrinsic_load_workgroup_size: {
+ assert(nir->info.workgroup_size_variable);
+ if (variable_group_size_idx == -1) {
+ variable_group_size_idx = num_system_values;
+ num_system_values += 3;
+ for (int i = 0; i < 3; i++) {
+ system_values[variable_group_size_idx + i] =
+ BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i;
+ }
+ }
+
+ b.cursor = nir_before_instr(instr);
+ offset = nir_imm_int(&b,
+ variable_group_size_idx * sizeof(uint32_t));
+ break;
+ }
+ default:
+ continue;
+ }
+
+ unsigned comps = nir_intrinsic_dest_components(intrin);
+
+ nir_intrinsic_instr *load =
+ nir_intrinsic_instr_create(nir, nir_intrinsic_load_ubo);
+ load->num_components = comps;
+ load->src[0] = nir_src_for_ssa(temp_ubo_name);
+ load->src[1] = nir_src_for_ssa(offset);
+ nir_intrinsic_set_align(load, 4, 0);
+ nir_intrinsic_set_range_base(load, 0);
+ nir_intrinsic_set_range(load, ~0);
+ nir_ssa_dest_init(&load->instr, &load->dest, comps, 32, NULL);
+ nir_builder_instr_insert(&b, &load->instr);
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+ &load->dest.ssa);
+ nir_instr_remove(instr);
+ }
+ }
+
+ nir_validate_shader(nir, "before remapping");
+
+ /* Uniforms are stored in constant buffer 0; the user-facing UBOs are
+ * shifted up by one. So if any constant buffer is needed, constant
+ * buffer 0 will be needed as well, so account for it.
+ */
+ unsigned num_cbufs = nir->info.num_ubos;
+ if (num_cbufs || nir->num_uniforms)
+ num_cbufs++;
+
+ /* Place the new params in a new cbuf. */
+ if (num_system_values > 0) {
+ unsigned sysval_cbuf_index = num_cbufs;
+ num_cbufs++;
+
+ system_values = reralloc(mem_ctx, system_values, enum brw_param_builtin,
+ num_system_values);
+
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+
+ if (load->intrinsic != nir_intrinsic_load_ubo)
+ continue;
+
+ b.cursor = nir_before_instr(instr);
+
+ assert(load->src[0].is_ssa);
+
+ if (load->src[0].ssa == temp_ubo_name) {
+ nir_ssa_def *imm = nir_imm_int(&b, sysval_cbuf_index);
+ nir_instr_rewrite_src(instr, &load->src[0],
+ nir_src_for_ssa(imm));
+ }
+ }
+ }
+
+ /* We need to fold the new iadds for brw_nir_analyze_ubo_ranges */
+ nir_opt_constant_folding(nir);
+ } else {
+ ralloc_free(system_values);
+ system_values = NULL;
+ }
+
+ assert(num_cbufs < PIPE_MAX_CONSTANT_BUFFERS);
+ nir_validate_shader(nir, "after remap");
+
+ /* We don't use params[], but gallium leaves num_uniforms set. We used it
+ * above to detect whether cbuf0 exists, and it isn't needed past this
+ * point. Zero it out so the back-end doesn't get confused when
+ * num_uniforms != nr_params * 4.
+ */
+ nir->num_uniforms = 0;
+
+ /* Constant loads (if any) need to go at the end of the constant buffers so
+ * we need to know num_cbufs before we can lower to them.
+ */
+ if (temp_const_ubo_name != NULL) {
+ nir_load_const_instr *const_ubo_index =
+ nir_instr_as_load_const(temp_const_ubo_name->parent_instr);
+ assert(const_ubo_index->def.bit_size == 32);
+ const_ubo_index->value[0].u32 = num_cbufs;
+ }
+
+ *out_system_values = system_values;
+ *out_num_system_values = num_system_values;
+ *out_num_cbufs = num_cbufs;
+}
+
+static const char *surface_group_names[] = {
+ [CROCUS_SURFACE_GROUP_RENDER_TARGET] = "render target",
+ [CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = "non-coherent render target read",
+ [CROCUS_SURFACE_GROUP_SOL] = "streamout",
+ [CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = "CS work groups",
+ [CROCUS_SURFACE_GROUP_TEXTURE] = "texture",
+ [CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = "texture gather",
+ [CROCUS_SURFACE_GROUP_UBO] = "ubo",
+ [CROCUS_SURFACE_GROUP_SSBO] = "ssbo",
+ [CROCUS_SURFACE_GROUP_IMAGE] = "image",
+};
+
+static void
+crocus_print_binding_table(FILE *fp, const char *name,
+ const struct crocus_binding_table *bt)
+{
+ STATIC_ASSERT(ARRAY_SIZE(surface_group_names) == CROCUS_SURFACE_GROUP_COUNT);
+
+ uint32_t total = 0;
+ uint32_t compacted = 0;
+
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) {
+ uint32_t size = bt->sizes[i];
+ total += size;
+ if (size)
+ compacted += util_bitcount64(bt->used_mask[i]);
+ }
+
+ if (total == 0) {
+ fprintf(fp, "Binding table for %s is empty\n\n", name);
+ return;
+ }
+
+ if (total != compacted) {
+ fprintf(fp, "Binding table for %s "
+ "(compacted to %u entries from %u entries)\n",
+ name, compacted, total);
+ } else {
+ fprintf(fp, "Binding table for %s (%u entries)\n", name, total);
+ }
+
+ uint32_t entry = 0;
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) {
+ uint64_t mask = bt->used_mask[i];
+ while (mask) {
+ int index = u_bit_scan64(&mask);
+ fprintf(fp, " [%u] %s #%d\n", entry++, surface_group_names[i], index);
+ }
+ }
+ fprintf(fp, "\n");
+}
+
+enum {
+ /* Max elements in a surface group. */
+ SURFACE_GROUP_MAX_ELEMENTS = 64,
+};
+
+/**
+ * Map a <group, index> pair to a binding table index.
+ *
+ * For example: <UBO, 5> => binding table index 12
+ */
+uint32_t
+crocus_group_index_to_bti(const struct crocus_binding_table *bt,
+ enum crocus_surface_group group, uint32_t index)
+{
+ assert(index < bt->sizes[group]);
+ uint64_t mask = bt->used_mask[group];
+ uint64_t bit = 1ull << index;
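+ /* The compacted BTI is the group's base offset plus the number of
+ * lower-indexed surfaces in the group that are actually used.
+ */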
+ if (bit & mask) {
+ return bt->offsets[group] + util_bitcount64((bit - 1) & mask);
+ } else {
+ return CROCUS_SURFACE_NOT_USED;
+ }
+}
+
+/**
+ * Map a binding table index back to a <group, index> pair.
+ *
+ * For example: binding table index 12 => <UBO, 5>
+ */
+uint32_t
+crocus_bti_to_group_index(const struct crocus_binding_table *bt,
+ enum crocus_surface_group group, uint32_t bti)
+{
+ uint64_t used_mask = bt->used_mask[group];
+ assert(bti >= bt->offsets[group]);
+
+ uint32_t c = bti - bt->offsets[group];
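+ /* The group-local index is the position of the c-th set bit in used_mask. */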
+ while (used_mask) {
+ int i = u_bit_scan64(&used_mask);
+ if (c == 0)
+ return i;
+ c--;
+ }
+
+ return CROCUS_SURFACE_NOT_USED;
+}
+
+static void
+rewrite_src_with_bti(nir_builder *b, struct crocus_binding_table *bt,
+ nir_instr *instr, nir_src *src,
+ enum crocus_surface_group group)
+{
+ assert(bt->sizes[group] > 0);
+
+ b->cursor = nir_before_instr(instr);
+ nir_ssa_def *bti;
+ if (nir_src_is_const(*src)) {
+ uint32_t index = nir_src_as_uint(*src);
+ bti = nir_imm_intN_t(b, crocus_group_index_to_bti(bt, group, index),
+ src->ssa->bit_size);
+ } else {
+ /* Indirect usage makes all the surfaces of the group available,
+ * so we can just add the base.
+ */
+ assert(bt->used_mask[group] == BITFIELD64_MASK(bt->sizes[group]));
+ bti = nir_iadd_imm(b, src->ssa, bt->offsets[group]);
+ }
+ nir_instr_rewrite_src(instr, src, nir_src_for_ssa(bti));
+}
+
+static void
+mark_used_with_src(struct crocus_binding_table *bt, nir_src *src,
+ enum crocus_surface_group group)
+{
+ assert(bt->sizes[group] > 0);
+
+ if (nir_src_is_const(*src)) {
+ uint64_t index = nir_src_as_uint(*src);
+ assert(index < bt->sizes[group]);
+ bt->used_mask[group] |= 1ull << index;
+ } else {
+ /* There's an indirect usage, we need all the surfaces. */
+ bt->used_mask[group] = BITFIELD64_MASK(bt->sizes[group]);
+ }
+}
+
+static bool
+skip_compacting_binding_tables(void)
+{
+ static int skip = -1;
+ if (skip < 0)
+ skip = env_var_as_boolean("INTEL_DISABLE_COMPACT_BINDING_TABLE", false);
+ return skip;
+}
+
+/**
+ * Set up the binding table indices and apply to the shader.
+ */
+static void
+crocus_setup_binding_table(const struct intel_device_info *devinfo,
+ struct nir_shader *nir,
+ struct crocus_binding_table *bt,
+ unsigned num_render_targets,
+ unsigned num_system_values,
+ unsigned num_cbufs,
+ const struct brw_sampler_prog_key_data *key)
+{
+ const struct shader_info *info = &nir->info;
+
+ memset(bt, 0, sizeof(*bt));
+
+ /* Set the sizes for each surface group. For some groups, we already know
+ * upfront how many will be used, so mark them.
+ */
+ if (info->stage == MESA_SHADER_FRAGMENT) {
+ bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET] = num_render_targets;
+ /* All render targets used. */
+ bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET] =
+ BITFIELD64_MASK(num_render_targets);
+
+ /* Set up the render target read surface group in order to support
+ * non-coherent framebuffer fetch on Gfx7.
+ */
+ if (devinfo->ver >= 6 && info->outputs_read) {
+ bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = num_render_targets;
+ bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] =
+ BITFIELD64_MASK(num_render_targets);
+ }
+ } else if (info->stage == MESA_SHADER_COMPUTE) {
+ bt->sizes[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1;
+ } else if (info->stage == MESA_SHADER_GEOMETRY) {
+ /* In gfx6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
+ * feedback surfaces.
+ */
+ if (devinfo->ver == 6) {
+ bt->sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS;
+ bt->used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1;
+ }
+ }
+
+ bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE] = BITSET_LAST_BIT(info->textures_used);
+ bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE] = info->textures_used[0];
+
+ if (info->uses_texture_gather) {
+ bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = BITSET_LAST_BIT(info->textures_used);
+ bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = info->textures_used[0];
+ }
+
+ bt->sizes[CROCUS_SURFACE_GROUP_IMAGE] = info->num_images;
+
+ /* Allocate an extra slot in the UBO section for NIR constants.
+ * Binding table compaction will remove it if unnecessary.
+ *
+ * We don't include them in crocus_compiled_shader::num_cbufs because
+ * they are uploaded separately from shs->constbufs[], but from a shader
+ * point of view, they're another UBO (at the end of the section).
+ */
+ bt->sizes[CROCUS_SURFACE_GROUP_UBO] = num_cbufs + 1;
+
+ bt->sizes[CROCUS_SURFACE_GROUP_SSBO] = info->num_ssbos;
+
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++)
+ assert(bt->sizes[i] <= SURFACE_GROUP_MAX_ELEMENTS);
+
+ /* Mark surfaces used for the cases we don't have the information available
+ * upfront.
+ */
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_foreach_block (block, impl) {
+ nir_foreach_instr (instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_load_num_workgroups:
+ bt->used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1;
+ break;
+
+ case nir_intrinsic_load_output:
+ if (devinfo->ver >= 6) {
+ mark_used_with_src(bt, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_RENDER_TARGET_READ);
+ }
+ break;
+
+ case nir_intrinsic_image_size:
+ case nir_intrinsic_image_load:
+ case nir_intrinsic_image_store:
+ case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_atomic_imin:
+ case nir_intrinsic_image_atomic_umin:
+ case nir_intrinsic_image_atomic_imax:
+ case nir_intrinsic_image_atomic_umax:
+ case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_image_load_raw_intel:
+ case nir_intrinsic_image_store_raw_intel:
+ mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_IMAGE);
+ break;
+
+ case nir_intrinsic_load_ubo:
+ mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_UBO);
+ break;
+
+ case nir_intrinsic_store_ssbo:
+ mark_used_with_src(bt, &intrin->src[1], CROCUS_SURFACE_GROUP_SSBO);
+ break;
+
+ case nir_intrinsic_get_ssbo_size:
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ case nir_intrinsic_ssbo_atomic_fmin:
+ case nir_intrinsic_ssbo_atomic_fmax:
+ case nir_intrinsic_ssbo_atomic_fcomp_swap:
+ case nir_intrinsic_load_ssbo:
+ mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_SSBO);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+
+ /* When disabled, we just mark everything as used. */
+ if (unlikely(skip_compacting_binding_tables())) {
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++)
+ bt->used_mask[i] = BITFIELD64_MASK(bt->sizes[i]);
+ }
+
+ /* Calculate the offsets and the binding table size based on the used
+ * surfaces. After this point, the functions to go between "group indices"
+ * and binding table indices can be used.
+ */
+ uint32_t next = 0;
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) {
+ if (bt->used_mask[i] != 0) {
+ bt->offsets[i] = next;
+ next += util_bitcount64(bt->used_mask[i]);
+ }
+ }
+ bt->size_bytes = next * 4;
+
+ if (unlikely(INTEL_DEBUG & DEBUG_BT)) {
+ crocus_print_binding_table(stderr, gl_shader_stage_name(info->stage), bt);
+ }
+
+ /* Apply the binding table indices. The backend compiler is not expected
+ * to change those, as we haven't set any of the *_start entries in brw
+ * binding_table.
+ */
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ nir_foreach_block (block, impl) {
+ nir_foreach_instr (instr, block) {
+ if (instr->type == nir_instr_type_tex) {
+ nir_tex_instr *tex = nir_instr_as_tex(instr);
+ bool is_gather = tex->op == nir_texop_tg4;
+
+ /* Rewrite the tg4 component from green to blue before replacing the
+ * texture index.
+ */
+ if (devinfo->ver == 7 && !devinfo->is_haswell) {
+ if (tex->component == 1)
+ if (key->gather_channel_quirk_mask & (1 << tex->texture_index))
+ tex->component = 2;
+ }
+
+ if (is_gather && devinfo->ver == 6 && key->gfx6_gather_wa[tex->texture_index]) {
+ b.cursor = nir_after_instr(instr);
+ enum gfx6_gather_sampler_wa wa = key->gfx6_gather_wa[tex->texture_index];
+ int width = (wa & WA_8BIT) ? 8 : 16;
+
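+ /* The gathered value comes back normalized here, so rescale it to the
+ * w-bit integer range and, for signed formats, sign-extend with a shift
+ * pair.
+ */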
+ nir_ssa_def *val = nir_fmul_imm(&b, &tex->dest.ssa, (1 << width) - 1);
+ val = nir_f2u32(&b, val);
+ if (wa & WA_SIGN) {
+ val = nir_ishl(&b, val, nir_imm_int(&b, 32 - width));
+ val = nir_ishr(&b, val, nir_imm_int(&b, 32 - width));
+ }
+ nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, val, val->parent_instr);
+ }
+
+ tex->texture_index =
+ crocus_group_index_to_bti(bt, is_gather ? CROCUS_SURFACE_GROUP_TEXTURE_GATHER : CROCUS_SURFACE_GROUP_TEXTURE,
+ tex->texture_index);
+ continue;
+ }
+
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_image_size:
+ case nir_intrinsic_image_load:
+ case nir_intrinsic_image_store:
+ case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_atomic_imin:
+ case nir_intrinsic_image_atomic_umin:
+ case nir_intrinsic_image_atomic_imax:
+ case nir_intrinsic_image_atomic_umax:
+ case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_image_load_raw_intel:
+ case nir_intrinsic_image_store_raw_intel:
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_IMAGE);
+ break;
+
+ case nir_intrinsic_load_ubo:
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_UBO);
+ break;
+
+ case nir_intrinsic_store_ssbo:
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[1],
+ CROCUS_SURFACE_GROUP_SSBO);
+ break;
+
+ case nir_intrinsic_load_output:
+ if (devinfo->ver >= 6) {
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_RENDER_TARGET_READ);
+ }
+ break;
+
+ case nir_intrinsic_get_ssbo_size:
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ case nir_intrinsic_ssbo_atomic_fmin:
+ case nir_intrinsic_ssbo_atomic_fmax:
+ case nir_intrinsic_ssbo_atomic_fcomp_swap:
+ case nir_intrinsic_load_ssbo:
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_SSBO);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+}
+
+static void
+crocus_debug_recompile(struct crocus_context *ice,
+ struct shader_info *info,
+ const struct brw_base_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+ const struct brw_compiler *c = screen->compiler;
+
+ if (!info)
+ return;
+
+ c->shader_perf_log(&ice->dbg, "Recompiling %s shader for program %s: %s\n",
+ _mesa_shader_stage_to_string(info->stage),
+ info->name ? info->name : "(no identifier)",
+ info->label ? info->label : "");
+
+ const void *old_key =
+ crocus_find_previous_compile(ice, info->stage, key->program_string_id);
+
+ brw_debug_key_recompile(c, &ice->dbg, info->stage, old_key, key);
+}
+
+/**
+ * Get the shader for the last enabled geometry stage.
+ *
+ * This stage is the one which will feed stream output and the rasterizer.
+ */
+static gl_shader_stage
+last_vue_stage(struct crocus_context *ice)
+{
+ if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
+ return MESA_SHADER_GEOMETRY;
+
+ if (ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL])
+ return MESA_SHADER_TESS_EVAL;
+
+ return MESA_SHADER_VERTEX;
+}
+
+static GLbitfield64
+crocus_vs_outputs_written(struct crocus_context *ice,
+ const struct brw_vs_prog_key *key,
+ GLbitfield64 user_varyings)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ GLbitfield64 outputs_written = user_varyings;
+
+ if (devinfo->ver < 6) {
+
+ if (key->copy_edgeflag)
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
+
+ /* Put dummy slots into the VUE for the SF to put the replaced
+ * point sprite coords in. We shouldn't need these dummy slots,
+ * which take up precious URB space, but it would mean that the SF
+ * doesn't get nice aligned pairs of input coords into output
+ * coords, which would be a pain to handle.
+ */
+ for (unsigned i = 0; i < 8; i++) {
+ if (key->point_coord_replace & (1 << i))
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
+ }
+
+ /* if back colors are written, allocate slots for front colors too */
+ if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0))
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0);
+ if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1))
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1);
+ }
+
+ /* In order for legacy clipping to work, we need to populate the clip
+ * distance varying slots whenever clipping is enabled, even if the vertex
+ * shader doesn't write to gl_ClipDistance.
+ */
+ if (key->nr_userclip_plane_consts > 0) {
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
+ }
+
+ return outputs_written;
+}
+
+/*
+ * If no edgeflags come from the user, gen4/5
+ * require giving the clip shader a default edgeflag.
+ *
+ * This will always be 1.0.
+ */
+static void
+crocus_lower_default_edgeflags(struct nir_shader *nir)
+{
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ b.cursor = nir_after_cf_list(&b.impl->body);
+ nir_variable *var = nir_variable_create(nir, nir_var_shader_out,
+ glsl_float_type(),
+ "edgeflag");
+ var->data.location = VARYING_SLOT_EDGE;
+ nir_store_var(&b, var, nir_imm_float(&b, 1.0), 0x1);
+}
+
+/**
+ * Compile a vertex shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_vs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_vs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_vs_prog_data *vs_prog_data =
+ rzalloc(mem_ctx, struct brw_vs_prog_data);
+ struct brw_vue_prog_data *vue_prog_data = &vs_prog_data->base;
+ struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+ enum brw_param_builtin *system_values;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ if (key->nr_userclip_plane_consts) {
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true,
+ false, NULL);
+ nir_lower_io_to_temporaries(nir, impl, true, false);
+ nir_lower_global_vars_to_local(nir);
+ nir_lower_vars_to_ssa(nir);
+ nir_shader_gather_info(nir, impl);
+ }
+
+ prog_data->use_alt_mode = ish->use_alt_mode;
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+
+ crocus_lower_swizzles(nir, &key->base.tex);
+
+ if (devinfo->ver <= 5 &&
+ !(nir->info.inputs_read & BITFIELD64_BIT(VERT_ATTRIB_EDGEFLAG)))
+ crocus_lower_default_edgeflags(nir);
+
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+ uint64_t outputs_written =
+ crocus_vs_outputs_written(ice, key, nir->info.outputs_written);
+ brw_compute_vue_map(devinfo,
+ &vue_prog_data->vue_map, outputs_written,
+ nir->info.separate_shader, /* pos slots */ 1);
+
+ /* Don't tell the backend about our clip plane constants, we've already
+ * lowered them in NIR and we don't want it doing it again.
+ */
+ struct brw_vs_prog_key key_no_ucp = *key;
+ key_no_ucp.nr_userclip_plane_consts = 0;
+ key_no_ucp.copy_edgeflag = false;
+ crocus_sanitize_tex_key(&key_no_ucp.base.tex);
+
+ struct brw_compile_vs_params params = {
+ .nir = nir,
+ .key = &key_no_ucp,
+ .prog_data = vs_prog_data,
+ .edgeflag_is_last = devinfo->ver < 6,
+ .log_data = &ice->dbg,
+ };
+ const unsigned *program =
+ brw_compile_vs(compiler, mem_ctx, &params);
+ if (program == NULL) {
+ dbg_printf("Failed to compile vertex shader: %s\n", params.error_str);
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ uint32_t *so_decls = NULL;
+ if (devinfo->ver > 6)
+ so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+ &vue_prog_data->vue_map);
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_VS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*vs_prog_data), so_decls,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current vertex shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_vs(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_VERTEX];
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_vs_prog_key key = { KEY_INIT() };
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_VERTEX, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ screen->vtbl.populate_vs_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_VS];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_VS, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_vs(ice, ish, &key);
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_VS] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS |
+ CROCUS_STAGE_DIRTY_BINDINGS_VS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_VS;
+ shs->sysvals_need_upload = true;
+
+ const struct brw_vs_prog_data *vs_prog_data =
+ (void *) shader->prog_data;
+ const bool uses_draw_params = vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance;
+ const bool uses_derived_draw_params = vs_prog_data->uses_drawid ||
+ vs_prog_data->uses_is_indexed_draw;
+ const bool needs_sgvs_element = uses_draw_params ||
+ vs_prog_data->uses_instanceid ||
+ vs_prog_data->uses_vertexid;
+
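+ /* Re-emit vertex buffers/elements if the new variant's use of draw
+ * parameters, edge flags, or vertex/instance IDs differs from before.
+ */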
+ if (ice->state.vs_uses_draw_params != uses_draw_params ||
+ ice->state.vs_uses_derived_draw_params != uses_derived_draw_params ||
+ ice->state.vs_needs_edge_flag != ish->needs_edge_flag ||
+ ice->state.vs_uses_vertexid != vs_prog_data->uses_vertexid ||
+ ice->state.vs_uses_instanceid != vs_prog_data->uses_instanceid) {
+ ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS |
+ CROCUS_DIRTY_VERTEX_ELEMENTS;
+ }
+ ice->state.vs_uses_draw_params = uses_draw_params;
+ ice->state.vs_uses_derived_draw_params = uses_derived_draw_params;
+ ice->state.vs_needs_sgvs_element = needs_sgvs_element;
+ ice->state.vs_needs_edge_flag = ish->needs_edge_flag;
+ ice->state.vs_uses_vertexid = vs_prog_data->uses_vertexid;
+ ice->state.vs_uses_instanceid = vs_prog_data->uses_instanceid;
+ }
+}
+
+/**
+ * Get the shader_info for a given stage, or NULL if the stage is disabled.
+ */
+const struct shader_info *
+crocus_get_shader_info(const struct crocus_context *ice, gl_shader_stage stage)
+{
+ const struct crocus_uncompiled_shader *ish = ice->shaders.uncompiled[stage];
+
+ if (!ish)
+ return NULL;
+
+ const nir_shader *nir = ish->nir;
+ return &nir->info;
+}
+
+/**
+ * Get the union of TCS output and TES input slots.
+ *
+ * TCS and TES need to agree on a common URB entry layout. In particular,
+ * the data for all patch vertices is stored in a single URB entry (unlike
+ * GS which has one entry per input vertex). This means that per-vertex
+ * array indexing needs a stride.
+ *
+ * SSO requires locations to match, but doesn't require the number of
+ * outputs/inputs to match (in fact, the TCS often has extra outputs).
+ * So, we need to take the extra step of unifying these on the fly.
+ */
+static void
+get_unified_tess_slots(const struct crocus_context *ice,
+ uint64_t *per_vertex_slots,
+ uint32_t *per_patch_slots)
+{
+ const struct shader_info *tcs =
+ crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
+ const struct shader_info *tes =
+ crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL);
+
+ *per_vertex_slots = tes->inputs_read;
+ *per_patch_slots = tes->patch_inputs_read;
+
+ if (tcs) {
+ *per_vertex_slots |= tcs->outputs_written;
+ *per_patch_slots |= tcs->patch_outputs_written;
+ }
+}
+
+/**
+ * Compile a tessellation control shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_tcs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_tcs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ const struct nir_shader_compiler_options *options =
+ compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].NirOptions;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_tcs_prog_data *tcs_prog_data =
+ rzalloc(mem_ctx, struct brw_tcs_prog_data);
+ struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
+ struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ enum brw_param_builtin *system_values = NULL;
+ unsigned num_system_values = 0;
+ unsigned num_cbufs = 0;
+
+ nir_shader *nir;
+
+ struct crocus_binding_table bt;
+
+ if (ish) {
+ nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+
+ crocus_lower_swizzles(nir, &key->base.tex);
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+ } else {
+ nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, options, key);
+
+ /* Reserve space for passing the default tess levels as constants. */
+ num_cbufs = 1;
+ num_system_values = 8;
+ system_values =
+ rzalloc_array(mem_ctx, enum brw_param_builtin, num_system_values);
+ prog_data->param = rzalloc_array(mem_ctx, uint32_t, num_system_values);
+ prog_data->nr_params = num_system_values;
+
+ if (key->tes_primitive_mode == GL_QUADS) {
+ for (int i = 0; i < 4; i++)
+ system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
+
+ system_values[3] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
+ system_values[2] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y;
+ } else if (key->tes_primitive_mode == GL_TRIANGLES) {
+ for (int i = 0; i < 3; i++)
+ system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
+
+ system_values[4] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
+ } else {
+ assert(key->tes_primitive_mode == GL_ISOLINES);
+ system_values[7] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y;
+ system_values[6] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
+ }
+
+ /* Manually setup the TCS binding table. */
+ memset(&bt, 0, sizeof(bt));
+ bt.sizes[CROCUS_SURFACE_GROUP_UBO] = 1;
+ bt.used_mask[CROCUS_SURFACE_GROUP_UBO] = 1;
+ bt.size_bytes = 4;
+
+ prog_data->ubo_ranges[0].length = 1;
+ }
+
+ struct brw_tcs_prog_key key_clean = *key;
+ crocus_sanitize_tex_key(&key_clean.base.tex);
+ char *error_str = NULL;
+ const unsigned *program =
+ brw_compile_tcs(compiler, &ice->dbg, mem_ctx, &key_clean, tcs_prog_data, nir,
+ -1, NULL, &error_str);
+ if (program == NULL) {
+ dbg_printf("Failed to compile control shader: %s\n", error_str);
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ if (ish) {
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+ }
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_TCS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*tcs_prog_data), NULL,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ if (ish)
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current tessellation control shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_tcs(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
+ struct crocus_uncompiled_shader *tcs =
+ ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL];
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ const struct shader_info *tes_info =
+ crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL);
+ struct brw_tcs_prog_key key = {
+ KEY_INIT_NO_ID(),
+ .base.program_string_id = tcs ? tcs->program_id : 0,
+ .tes_primitive_mode = tes_info->tess.primitive_mode,
+ .input_vertices = ice->state.vertices_per_patch,
+ .quads_workaround = tes_info->tess.primitive_mode == GL_QUADS &&
+ tes_info->tess.spacing == TESS_SPACING_EQUAL,
+ };
+
+ if (tcs && tcs->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_CTRL, tcs,
+ tcs->nir->info.uses_texture_gather, &key.base.tex);
+ get_unified_tess_slots(ice, &key.outputs_written,
+ &key.patch_outputs_written);
+ screen->vtbl.populate_tcs_key(ice, &key);
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TCS];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_TCS, sizeof(key), &key);
+
+ if (tcs && !shader)
+ shader = crocus_disk_cache_retrieve(ice, tcs, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_tcs(ice, tcs, &key);
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_TCS] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TCS |
+ CROCUS_STAGE_DIRTY_BINDINGS_TCS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+ shs->sysvals_need_upload = true;
+ }
+}
+
+/**
+ * Compile a tessellation evaluation shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_tes(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_tes_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_tes_prog_data *tes_prog_data =
+ rzalloc(mem_ctx, struct brw_tes_prog_data);
+ struct brw_vue_prog_data *vue_prog_data = &tes_prog_data->base;
+ struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+ enum brw_param_builtin *system_values;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ if (key->nr_userclip_plane_consts) {
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true,
+ false, NULL);
+ nir_lower_io_to_temporaries(nir, impl, true, false);
+ nir_lower_global_vars_to_local(nir);
+ nir_lower_vars_to_ssa(nir);
+ nir_shader_gather_info(nir, impl);
+ }
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+ crocus_lower_swizzles(nir, &key->base.tex);
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+ struct brw_vue_map input_vue_map;
+ brw_compute_tess_vue_map(&input_vue_map, key->inputs_read,
+ key->patch_inputs_read);
+
+ struct brw_tes_prog_key key_clean = *key;
+ crocus_sanitize_tex_key(&key_clean.base.tex);
+ char *error_str = NULL;
+ const unsigned *program =
+ brw_compile_tes(compiler, &ice->dbg, mem_ctx, &key_clean, &input_vue_map,
+ tes_prog_data, nir, -1, NULL, &error_str);
+ if (program == NULL) {
+ dbg_printf("Failed to compile evaluation shader: %s\n", error_str);
+ ralloc_free(mem_ctx);
+ return false;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ uint32_t *so_decls = NULL;
+ if (devinfo->ver > 6)
+ so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+ &vue_prog_data->vue_map);
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_TES, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*tes_prog_data), so_decls,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current tessellation evaluation shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_tes(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL];
+ struct brw_tes_prog_key key = { KEY_INIT() };
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_EVAL, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ get_unified_tess_slots(ice, &key.inputs_read, &key.patch_inputs_read);
+ screen->vtbl.populate_tes_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TES];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_TES, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_tes(ice, ish, &key);
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_TES] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TES |
+ CROCUS_STAGE_DIRTY_BINDINGS_TES |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+ shs->sysvals_need_upload = true;
+ }
+
+ /* TODO: Could compare and avoid flagging this. */
+ const struct shader_info *tes_info = &ish->nir->info;
+ if (BITSET_TEST(tes_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+ ice->state.shaders[MESA_SHADER_TESS_EVAL].sysvals_need_upload = true;
+ }
+}
+
+/**
+ * Compile a geometry shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_gs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_gs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_gs_prog_data *gs_prog_data =
+ rzalloc(mem_ctx, struct brw_gs_prog_data);
+ struct brw_vue_prog_data *vue_prog_data = &gs_prog_data->base;
+ struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+ enum brw_param_builtin *system_values;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ if (key->nr_userclip_plane_consts) {
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_lower_clip_gs(nir, (1 << key->nr_userclip_plane_consts) - 1, false,
+ NULL);
+ nir_lower_io_to_temporaries(nir, impl, true, false);
+ nir_lower_global_vars_to_local(nir);
+ nir_lower_vars_to_ssa(nir);
+ nir_shader_gather_info(nir, impl);
+ }
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+ crocus_lower_swizzles(nir, &key->base.tex);
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+ brw_compute_vue_map(devinfo,
+ &vue_prog_data->vue_map, nir->info.outputs_written,
+ nir->info.separate_shader, /* pos slots */ 1);
+
+ if (devinfo->ver == 6)
+ gfx6_gs_xfb_setup(&ish->stream_output, gs_prog_data);
+ struct brw_gs_prog_key key_clean = *key;
+ crocus_sanitize_tex_key(&key_clean.base.tex);
+
+ char *error_str = NULL;
+ const unsigned *program =
+ brw_compile_gs(compiler, &ice->dbg, mem_ctx, &key_clean, gs_prog_data, nir,
+ -1, NULL, &error_str);
+ if (program == NULL) {
+ dbg_printf("Failed to compile geometry shader: %s\n", error_str);
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ uint32_t *so_decls = NULL;
+ if (devinfo->ver > 6)
+ so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+ &vue_prog_data->vue_map);
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_GS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*gs_prog_data), so_decls,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current geometry shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_gs(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_GEOMETRY];
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_GS];
+ struct crocus_compiled_shader *shader = NULL;
+
+ if (ish) {
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_gs_prog_key key = { KEY_INIT() };
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_GEOMETRY, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ screen->vtbl.populate_gs_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
+
+ shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_GS, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_gs(ice, ish, &key);
+ }
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_GS] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS |
+ CROCUS_STAGE_DIRTY_BINDINGS_GS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_GS;
+ shs->sysvals_need_upload = true;
+ }
+}
+
+/**
+ * Compile a fragment (pixel) shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_fs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_wm_prog_key *key,
+ struct brw_vue_map *vue_map)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_wm_prog_data *fs_prog_data =
+ rzalloc(mem_ctx, struct brw_wm_prog_data);
+ struct brw_stage_prog_data *prog_data = &fs_prog_data->base;
+ enum brw_param_builtin *system_values;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ prog_data->use_alt_mode = ish->use_alt_mode;
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+
+ /* Lower output variables to load_output intrinsics before setting up
+ * binding tables, so crocus_setup_binding_table can map any load_output
+ * intrinsics to CROCUS_SURFACE_GROUP_RENDER_TARGET_READ on Gen8 for
+ * non-coherent framebuffer fetches.
+ */
+ brw_nir_lower_fs_outputs(nir);
+
+ /* lower swizzles before binding table */
+ crocus_lower_swizzles(nir, &key->base.tex);
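+ /* Always reserve at least one render-target binding so the FS has a
+ * (null) RT surface even when no color buffers are written.
+ */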
+ int null_rts = 1;
+
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt,
+ MAX2(key->nr_color_regions, null_rts),
+ num_system_values, num_cbufs,
+ &key->base.tex);
+
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+ struct brw_wm_prog_key key_clean = *key;
+ crocus_sanitize_tex_key(&key_clean.base.tex);
+
+ struct brw_compile_fs_params params = {
+ .nir = nir,
+ .key = &key_clean,
+ .prog_data = fs_prog_data,
+
+ .allow_spilling = true,
+ .vue_map = vue_map,
+
+ .log_data = &ice->dbg,
+ };
+ const unsigned *program =
+ brw_compile_fs(compiler, mem_ctx, &params);
+ if (program == NULL) {
+ dbg_printf("Failed to compile fragment shader: %s\n", params.error_str);
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_FS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*fs_prog_data), NULL,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current fragment shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_fs(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+ struct brw_wm_prog_key key = { KEY_INIT() };
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_FRAGMENT, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ screen->vtbl.populate_fs_key(ice, &ish->nir->info, &key);
+
+ if (ish->nos & (1ull << CROCUS_NOS_LAST_VUE_MAP))
+ key.input_slots_valid = ice->shaders.last_vue_map->slots_valid;
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_FS];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_FS, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_fs(ice, ish, &key, ice->shaders.last_vue_map);
+
+ if (old != shader) {
+ // XXX: only need to flag CLIP if barycentric has NONPERSPECTIVE
+ // toggles. might be able to avoid flagging SBE too.
+ ice->shaders.prog[CROCUS_CACHE_FS] = shader;
+ ice->state.dirty |= CROCUS_DIRTY_WM;
+ /* gen4 clip/sf rely on fs prog_data */
+ if (devinfo->ver < 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+ else
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ if (devinfo->ver == 6)
+ ice->state.dirty |= CROCUS_DIRTY_RASTER;
+ if (devinfo->ver >= 7)
+ ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS |
+ CROCUS_STAGE_DIRTY_BINDINGS_FS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_FS;
+ shs->sysvals_need_upload = true;
+ }
+}
+
+/**
+ * Update the last enabled stage's VUE map.
+ *
+ * When the shader feeding the rasterizer's output interface changes, we
+ * need to re-emit various packets.
+ */
+static void
+update_last_vue_map(struct crocus_context *ice,
+ struct brw_stage_prog_data *prog_data)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+ struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
+ struct brw_vue_map *old_map = ice->shaders.last_vue_map;
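+ /* Work out which varying slots differ between the old and new VUE maps. */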
+ const uint64_t changed_slots =
+ (old_map ? old_map->slots_valid : 0ull) ^ vue_map->slots_valid;
+
+ if (changed_slots & VARYING_BIT_VIEWPORT) {
+ ice->state.num_viewports =
+ (vue_map->slots_valid & VARYING_BIT_VIEWPORT) ? CROCUS_MAX_VIEWPORTS : 1;
+ ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT |
+ CROCUS_DIRTY_CC_VIEWPORT;
+ if (devinfo->ver < 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+
+ if (devinfo->ver <= 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+
+ if (devinfo->ver >= 6)
+ ice->state.dirty |= CROCUS_DIRTY_CLIP |
+ CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS |
+ ice->state.stage_dirty_for_nos[CROCUS_NOS_LAST_VUE_MAP];
+ }
+
+ if (changed_slots || (old_map && old_map->separate != vue_map->separate)) {
+ ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS;
+ }
+
+ ice->shaders.last_vue_map = &vue_prog_data->vue_map;
+}
+
+static void
+crocus_update_pull_constant_descriptors(struct crocus_context *ice,
+ gl_shader_stage stage)
+{
+ struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+
+ if (!shader || !shader->prog_data->has_ubo_pull)
+ return;
+
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ bool any_new_descriptors =
+ shader->num_system_values > 0 && shs->sysvals_need_upload;
+
+ unsigned bound_cbufs = shs->bound_cbufs;
+
+ while (bound_cbufs) {
+ const int i = u_bit_scan(&bound_cbufs);
+ struct pipe_constant_buffer *cbuf = &shs->constbufs[i];
+ if (cbuf->buffer) {
+ any_new_descriptors = true;
+ }
+ }
+
+ if (any_new_descriptors)
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
+}
+
+/**
+ * Get the prog_data for a given stage, or NULL if the stage is disabled.
+ */
+static struct brw_vue_prog_data *
+get_vue_prog_data(struct crocus_context *ice, gl_shader_stage stage)
+{
+ if (!ice->shaders.prog[stage])
+ return NULL;
+
+ return (void *) ice->shaders.prog[stage]->prog_data;
+}
+
+static struct crocus_compiled_shader *
+crocus_compile_clip(struct crocus_context *ice, struct brw_clip_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx;
+ unsigned program_size;
+ mem_ctx = ralloc_context(NULL);
+
+ struct brw_clip_prog_data *clip_prog_data =
+ rzalloc(mem_ctx, struct brw_clip_prog_data);
+
+ const unsigned *program = brw_compile_clip(compiler, mem_ctx, key, clip_prog_data,
+ ice->shaders.last_vue_map, &program_size);
+
+ if (program == NULL) {
+ dbg_printf("failed to compile clip shader\n");
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+ struct crocus_binding_table bt;
+ memset(&bt, 0, sizeof(bt));
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_CLIP, sizeof(*key), key, program,
+ program_size,
+ (struct brw_stage_prog_data *)clip_prog_data, sizeof(*clip_prog_data),
+ NULL, NULL, 0, 0, &bt);
+ ralloc_free(mem_ctx);
+ return shader;
+}
+static void
+crocus_update_compiled_clip(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct brw_clip_prog_key key;
+ struct crocus_compiled_shader *old = ice->shaders.clip_prog;
+ memset(&key, 0, sizeof(key));
+
+ const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
+ if (wm_prog_data) {
+ key.contains_flat_varying = wm_prog_data->contains_flat_varying;
+ key.contains_noperspective_varying =
+ wm_prog_data->contains_noperspective_varying;
+ memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode));
+ }
+
+ key.primitive = u_reduced_prim(ice->state.prim_mode);
+ key.attrs = ice->shaders.last_vue_map->slots_valid;
+
+ struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+ key.pv_first = rs_state->flatshade_first;
+
+ if (rs_state->clip_plane_enable)
+ key.nr_userclip = util_logbase2(rs_state->clip_plane_enable) + 1;
+
+ if (screen->devinfo.ver == 5)
+ key.clip_mode = BRW_CLIP_MODE_KERNEL_CLIP;
+ else
+ key.clip_mode = BRW_CLIP_MODE_NORMAL;
+
+ if (key.primitive == PIPE_PRIM_TRIANGLES) {
+ if (rs_state->cull_face == PIPE_FACE_FRONT_AND_BACK)
+ key.clip_mode = BRW_CLIP_MODE_REJECT_ALL;
+ else {
+ uint32_t fill_front = BRW_CLIP_FILL_MODE_CULL;
+ uint32_t fill_back = BRW_CLIP_FILL_MODE_CULL;
+ uint32_t offset_front = 0;
+ uint32_t offset_back = 0;
+
+ if (!(rs_state->cull_face & PIPE_FACE_FRONT)) {
+ switch (rs_state->fill_front) {
+ case PIPE_POLYGON_MODE_FILL:
+ fill_front = BRW_CLIP_FILL_MODE_FILL;
+ offset_front = 0;
+ break;
+ case PIPE_POLYGON_MODE_LINE:
+ fill_front = BRW_CLIP_FILL_MODE_LINE;
+ offset_front = rs_state->offset_line;
+ break;
+ case PIPE_POLYGON_MODE_POINT:
+ fill_front = BRW_CLIP_FILL_MODE_POINT;
+ offset_front = rs_state->offset_point;
+ break;
+ }
+ }
+
+ if (!(rs_state->cull_face & PIPE_FACE_BACK)) {
+ switch (rs_state->fill_back) {
+ case PIPE_POLYGON_MODE_FILL:
+ fill_back = BRW_CLIP_FILL_MODE_FILL;
+ offset_back = 0;
+ break;
+ case PIPE_POLYGON_MODE_LINE:
+ fill_back = BRW_CLIP_FILL_MODE_LINE;
+ offset_back = rs_state->offset_line;
+ break;
+ case PIPE_POLYGON_MODE_POINT:
+ fill_back = BRW_CLIP_FILL_MODE_POINT;
+ offset_back = rs_state->offset_point;
+ break;
+ }
+ }
+
+ if (rs_state->fill_back != PIPE_POLYGON_MODE_FILL ||
+ rs_state->fill_front != PIPE_POLYGON_MODE_FILL) {
+ key.do_unfilled = 1;
+
+ /* The fixed-function units handle most cases. Polygons with one or
+ * more unfilled faces need help from the clip program:
+ */
+ key.clip_mode = BRW_CLIP_MODE_CLIP_NON_REJECTED;
+
+ if (offset_back || offset_front) {
+ double mrd = 0.0;
+ if (ice->state.framebuffer.zsbuf)
+ mrd = util_get_depth_format_mrd(util_format_description(ice->state.framebuffer.zsbuf->format));
+ key.offset_units = rs_state->offset_units * mrd * 2;
+ key.offset_factor = rs_state->offset_scale * mrd;
+ key.offset_clamp = rs_state->offset_clamp * mrd;
+ }
+
+ if (!(rs_state->front_ccw ^ rs_state->bottom_edge_rule)) {
+ key.fill_ccw = fill_front;
+ key.fill_cw = fill_back;
+ key.offset_ccw = offset_front;
+ key.offset_cw = offset_back;
+ if (rs_state->light_twoside &&
+ key.fill_cw != BRW_CLIP_FILL_MODE_CULL)
+ key.copy_bfc_cw = 1;
+ } else {
+ key.fill_cw = fill_front;
+ key.fill_ccw = fill_back;
+ key.offset_cw = offset_front;
+ key.offset_ccw = offset_back;
+ if (rs_state->light_twoside &&
+ key.fill_ccw != BRW_CLIP_FILL_MODE_CULL)
+ key.copy_bfc_ccw = 1;
+ }
+ }
+ }
+ }
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_CLIP, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_compile_clip(ice, &key);
+
+ if (old != shader) {
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ ice->shaders.clip_prog = shader;
+ }
+}
+
+static struct crocus_compiled_shader *
+crocus_compile_sf(struct crocus_context *ice, struct brw_sf_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx;
+ unsigned program_size;
+ mem_ctx = ralloc_context(NULL);
+
+ struct brw_sf_prog_data *sf_prog_data =
+ rzalloc(mem_ctx, struct brw_sf_prog_data);
+
+ const unsigned *program = brw_compile_sf(compiler, mem_ctx, key, sf_prog_data,
+ ice->shaders.last_vue_map, &program_size);
+
+ if (program == NULL) {
+ dbg_printf("failed to compile sf shader\n");
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ struct crocus_binding_table bt;
+ memset(&bt, 0, sizeof(bt));
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_SF, sizeof(*key), key, program,
+ program_size,
+ (struct brw_stage_prog_data *)sf_prog_data, sizeof(*sf_prog_data),
+ NULL, NULL, 0, 0, &bt);
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+static void
+crocus_update_compiled_sf(struct crocus_context *ice)
+{
+ struct brw_sf_prog_key key;
+ struct crocus_compiled_shader *old = ice->shaders.sf_prog;
+ memset(&key, 0, sizeof(key));
+
+ key.attrs = ice->shaders.last_vue_map->slots_valid;
+
+ switch (u_reduced_prim(ice->state.prim_mode)) {
+ case GL_TRIANGLES:
+ default:
+ if (key.attrs & BITFIELD64_BIT(VARYING_SLOT_EDGE))
+ key.primitive = BRW_SF_PRIM_UNFILLED_TRIS;
+ else
+ key.primitive = BRW_SF_PRIM_TRIANGLES;
+ break;
+ case GL_LINES:
+ key.primitive = BRW_SF_PRIM_LINES;
+ break;
+ case GL_POINTS:
+ key.primitive = BRW_SF_PRIM_POINTS;
+ break;
+ }
+
+ struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+ key.userclip_active = rs_state->clip_plane_enable != 0;
+ const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
+ if (wm_prog_data) {
+ key.contains_flat_varying = wm_prog_data->contains_flat_varying;
+ memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode));
+ }
+
+ key.do_twoside_color = rs_state->light_twoside;
+
+ key.do_point_sprite = rs_state->point_quad_rasterization;
+ if (key.do_point_sprite) {
+ key.point_sprite_coord_replace = rs_state->sprite_coord_enable & 0xff;
+ if (rs_state->sprite_coord_enable & (1 << 8))
+ key.do_point_coord = 1;
+ if (wm_prog_data && wm_prog_data->urb_setup[VARYING_SLOT_PNTC] != -1)
+ key.do_point_coord = 1;
+ }
+
+ key.sprite_origin_lower_left = rs_state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT;
+
+ if (key.do_twoside_color) {
+ key.frontface_ccw = rs_state->front_ccw;
+ }
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_SF, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_compile_sf(ice, &key);
+
+ if (old != shader) {
+ ice->state.dirty |= CROCUS_DIRTY_RASTER;
+ ice->shaders.sf_prog = shader;
+ }
+}
+
+static struct crocus_compiled_shader *
+crocus_compile_ff_gs(struct crocus_context *ice, struct brw_ff_gs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx;
+ unsigned program_size;
+ mem_ctx = ralloc_context(NULL);
+
+ struct brw_ff_gs_prog_data *ff_gs_prog_data =
+ rzalloc(mem_ctx, struct brw_ff_gs_prog_data);
+
+ const unsigned *program = brw_compile_ff_gs_prog(compiler, mem_ctx, key, ff_gs_prog_data,
+ ice->shaders.last_vue_map, &program_size);
+
+ if (program == NULL) {
+ dbg_printf("failed to compile sf shader\n");
+ ralloc_free(mem_ctx);
+ return false;
+ }
+
+ struct crocus_binding_table bt;
+ memset(&bt, 0, sizeof(bt));
+
+ if (screen->devinfo.ver == 6) {
+ bt.sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS;
+ bt.used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1;
+
+ bt.size_bytes = BRW_MAX_SOL_BINDINGS * 4;
+ }
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_FF_GS, sizeof(*key), key, program,
+ program_size,
+ (struct brw_stage_prog_data *)ff_gs_prog_data, sizeof(*ff_gs_prog_data),
+ NULL, NULL, 0, 0, &bt);
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+static void
+crocus_update_compiled_ff_gs(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_ff_gs_prog_key key;
+ struct crocus_compiled_shader *old = ice->shaders.ff_gs_prog;
+ memset(&key, 0, sizeof(key));
+
+ assert(devinfo->ver < 7);
+
+ key.attrs = ice->shaders.last_vue_map->slots_valid;
+
+ key.primitive = screen->vtbl.translate_prim_type(ice->state.prim_mode, 0);
+
+ struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+ key.pv_first = rs_state->flatshade_first;
+
+ if (key.primitive == _3DPRIM_QUADLIST && !rs_state->flatshade) {
+ /* Provide a consistent primitive order with brw_set_prim's
+ * optimization of single quads to trifans.
+ */
+ key.pv_first = true;
+ }
+
+ if (devinfo->ver >= 6) {
+ key.need_gs_prog = ice->state.streamout_active;
+ if (key.need_gs_prog) {
+ struct crocus_uncompiled_shader *vs =
+ ice->shaders.uncompiled[MESA_SHADER_VERTEX];
+ gfx6_ff_gs_xfb_setup(&vs->stream_output,
+ &key);
+ }
+ } else {
+ key.need_gs_prog = (key.primitive == _3DPRIM_QUADLIST ||
+ key.primitive == _3DPRIM_QUADSTRIP ||
+ key.primitive == _3DPRIM_LINELOOP);
+ }
+
+ struct crocus_compiled_shader *shader = NULL;
+ if (key.need_gs_prog) {
+ shader = crocus_find_cached_shader(ice, CROCUS_CACHE_FF_GS,
+ sizeof(key), &key);
+ if (!shader)
+ shader = crocus_compile_ff_gs(ice, &key);
+ }
+ if (old != shader) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
+ if (!!old != !!shader)
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+ ice->shaders.ff_gs_prog = shader;
+ if (shader) {
+ const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
+ ice->state.last_xfb_verts_per_prim = gs_prog_data->svbi_postincrement_value;
+ }
+ }
+}
+
+// XXX: crocus_compiled_shaders are space-leaking :(
+// XXX: do remember to unbind them if deleting them.
+
+/**
+ * Update the current shader variants for the given state.
+ *
+ * This should be called on every draw call to ensure that the correct
+ * shaders are bound. It will also flag any dirty state triggered by
+ * swapping out those shaders.
+ */
+bool
+crocus_update_compiled_shaders(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ const uint64_t stage_dirty = ice->state.stage_dirty;
+
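+ /* Snapshot the old VUE prog_data so we can detect URB-relevant changes
+ * after updating the shader variants below.
+ */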
+ struct brw_vue_prog_data *old_prog_datas[4];
+ if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) {
+ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++)
+ old_prog_datas[i] = get_vue_prog_data(ice, i);
+ }
+
+ if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_TCS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) {
+ struct crocus_uncompiled_shader *tes =
+ ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL];
+ if (tes) {
+ crocus_update_compiled_tcs(ice);
+ crocus_update_compiled_tes(ice);
+ } else {
+ ice->shaders.prog[CROCUS_CACHE_TCS] = NULL;
+ ice->shaders.prog[CROCUS_CACHE_TES] = NULL;
+ ice->state.stage_dirty |=
+ CROCUS_STAGE_DIRTY_TCS | CROCUS_STAGE_DIRTY_TES |
+ CROCUS_STAGE_DIRTY_BINDINGS_TCS | CROCUS_STAGE_DIRTY_BINDINGS_TES |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TCS | CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+ }
+ }
+
+ if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_VS)
+ crocus_update_compiled_vs(ice);
+ if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_GS)
+ crocus_update_compiled_gs(ice);
+
+ if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_GS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) {
+ const struct crocus_compiled_shader *gs =
+ ice->shaders.prog[MESA_SHADER_GEOMETRY];
+ const struct crocus_compiled_shader *tes =
+ ice->shaders.prog[MESA_SHADER_TESS_EVAL];
+
+ bool points_or_lines = false;
+
+ if (gs) {
+ const struct brw_gs_prog_data *gs_prog_data = (void *) gs->prog_data;
+ points_or_lines =
+ gs_prog_data->output_topology == _3DPRIM_POINTLIST ||
+ gs_prog_data->output_topology == _3DPRIM_LINESTRIP;
+ } else if (tes) {
+ const struct brw_tes_prog_data *tes_data = (void *) tes->prog_data;
+ points_or_lines =
+ tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_LINE ||
+ tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+ }
+
+ if (ice->shaders.output_topology_is_points_or_lines != points_or_lines) {
+ /* Outbound to XY Clip enables */
+ ice->shaders.output_topology_is_points_or_lines = points_or_lines;
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ }
+ }
+
+ if (!ice->shaders.prog[MESA_SHADER_VERTEX])
+ return false;
+
+ gl_shader_stage last_stage = last_vue_stage(ice);
+ struct crocus_compiled_shader *shader = ice->shaders.prog[last_stage];
+ struct crocus_uncompiled_shader *ish = ice->shaders.uncompiled[last_stage];
+ update_last_vue_map(ice, shader->prog_data);
+ if (ice->state.streamout != shader->streamout) {
+ ice->state.streamout = shader->streamout;
+ ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST | CROCUS_DIRTY_STREAMOUT;
+ }
+
+ if (ice->state.streamout_active) {
+ screen->vtbl.update_so_strides(ice, ish->stream_output.stride);
+ }
+
+ /* use ice->state version as last_vue_map can dirty this bit */
+ if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_FS)
+ crocus_update_compiled_fs(ice);
+
+ if (screen->devinfo.ver <= 6) {
+ if (ice->state.dirty & CROCUS_DIRTY_GEN4_FF_GS_PROG &&
+ !ice->shaders.prog[MESA_SHADER_GEOMETRY])
+ crocus_update_compiled_ff_gs(ice);
+ }
+
+ if (screen->devinfo.ver < 6) {
+ if (ice->state.dirty & CROCUS_DIRTY_GEN4_CLIP_PROG)
+ crocus_update_compiled_clip(ice);
+ if (ice->state.dirty & CROCUS_DIRTY_GEN4_SF_PROG)
+ crocus_update_compiled_sf(ice);
+ }
+
+
+ /* Changing shader interfaces may require a URB reconfiguration. */
+ if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) {
+ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+ struct brw_vue_prog_data *old = old_prog_datas[i];
+ struct brw_vue_prog_data *new = get_vue_prog_data(ice, i);
+ if (!!old != !!new ||
+ (new && new->urb_entry_size != old->urb_entry_size)) {
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+ break;
+ }
+ }
+ }
+
+ if (ice->state.stage_dirty & CROCUS_RENDER_STAGE_DIRTY_CONSTANTS) {
+ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_FRAGMENT; i++) {
+ if (ice->state.stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << i))
+ crocus_update_pull_constant_descriptors(ice, i);
+ }
+ }
+ return true;
+}
+
+static struct crocus_compiled_shader *
+crocus_compile_cs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_cs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_cs_prog_data *cs_prog_data =
+ rzalloc(mem_ctx, struct brw_cs_prog_data);
+ struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
+ enum brw_param_builtin *system_values;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+ crocus_lower_swizzles(nir, &key->base.tex);
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+
+ struct brw_compile_cs_params params = {
+ .nir = nir,
+ .key = key,
+ .prog_data = cs_prog_data,
+ .log_data = &ice->dbg,
+ };
+
+ const unsigned *program =
+ brw_compile_cs(compiler, mem_ctx, &params);
+ if (program == NULL) {
+ dbg_printf("Failed to compile compute shader: %s\n", params.error_str);
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_CS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*cs_prog_data), NULL,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+static void
+crocus_update_compiled_cs(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_cs_prog_key key = { KEY_INIT() };
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_COMPUTE, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ screen->vtbl.populate_cs_key(ice, &key);
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_CS];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_CS, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_cs(ice, ish, &key);
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_CS] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS |
+ CROCUS_STAGE_DIRTY_BINDINGS_CS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_CS;
+ shs->sysvals_need_upload = true;
+ }
+}
+
+void
+crocus_update_compiled_compute_shader(struct crocus_context *ice)
+{
+ if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_CS)
+ crocus_update_compiled_cs(ice);
+
+ if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS)
+ crocus_update_pull_constant_descriptors(ice, MESA_SHADER_COMPUTE);
+}
+
+void
+crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
+ unsigned threads,
+ uint32_t *dst)
+{
+ assert(brw_cs_push_const_total_size(cs_prog_data, threads) > 0);
+ assert(cs_prog_data->push.cross_thread.size == 0);
+ assert(cs_prog_data->push.per_thread.dwords == 1);
+ assert(cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
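+ /* Per-thread push constants are laid out one GRF (8 dwords) apart;
+ * dword 0 of each block holds that thread's subgroup ID.
+ */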
+ for (unsigned t = 0; t < threads; t++)
+ dst[8 * t] = t;
+}
+
+/**
+ * Allocate scratch BOs as needed for the given per-thread size and stage.
+ */
+struct crocus_bo *
+crocus_get_scratch_space(struct crocus_context *ice,
+ unsigned per_thread_scratch,
+ gl_shader_stage stage)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
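+ /* per_thread_scratch is assumed to be a power of two of at least 1KB,
+ * so ffs() - 11 maps 1KB -> 0, 2KB -> 1, and so on, giving a compact
+ * index into scratch_bos.
+ */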
+ unsigned encoded_size = ffs(per_thread_scratch) - 11;
+ assert(encoded_size < (1 << 16));
+
+ struct crocus_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage];
+
+ unsigned subslice_total = screen->subslice_total;
+ subslice_total = 4 * devinfo->num_slices;
+ // assert(subslice_total >= screen->subslice_total);
+
+ if (!*bop) {
+ unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;
+
+ uint32_t max_threads[] = {
+ [MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
+ [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
+ [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
+ [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
+ [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
+ [MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslice_total,
+ };
+
+ uint32_t size = per_thread_scratch * max_threads[stage];
+
+ *bop = crocus_bo_alloc(bufmgr, "scratch", size);
+ }
+
+ return *bop;
+}
+
+/* ------------------------------------------------------------------- */
+
+/**
+ * The pipe->create_[stage]_state() driver hooks.
+ *
+ * Performs basic NIR preprocessing, records any state dependencies, and
+ * returns a crocus_uncompiled_shader as the Gallium CSO.
+ *
+ * Actual shader compilation to assembly happens later, at first use.
+ */
+static void *
+crocus_create_uncompiled_shader(struct pipe_context *ctx,
+ nir_shader *nir,
+ const struct pipe_stream_output_info *so_info)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_uncompiled_shader *ish =
+ calloc(1, sizeof(struct crocus_uncompiled_shader));
+ if (!ish)
+ return NULL;
+
+ if (devinfo->ver >= 6)
+ NIR_PASS(ish->needs_edge_flag, nir, crocus_fix_edge_flags);
+ else
+ ish->needs_edge_flag = false;
+
+ brw_preprocess_nir(screen->compiler, nir, NULL);
+
+ NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo, false);
+ NIR_PASS_V(nir, crocus_lower_storage_image_derefs);
+
+ nir_sweep(nir);
+
+ ish->program_id = get_new_program_id(screen);
+ ish->nir = nir;
+ if (so_info) {
+ memcpy(&ish->stream_output, so_info, sizeof(*so_info));
+ update_so_info(&ish->stream_output, nir->info.outputs_written);
+ }
+
+ /* Save this now before potentially dropping nir->info.name */
+ if (nir->info.name && strncmp(nir->info.name, "ARB", 3) == 0)
+ ish->use_alt_mode = true;
+
+ if (screen->disk_cache) {
+ /* Serialize the NIR to a binary blob that we can hash for the disk
+ * cache. Drop unnecessary information (like variable names)
+ * so the serialized NIR is smaller, and also to let us detect more
+ * isomorphic shaders when hashing, increasing cache hits.
+ */
+ struct blob blob;
+ blob_init(&blob);
+ nir_serialize(&blob, nir, true);
+ _mesa_sha1_compute(blob.data, blob.size, ish->nir_sha1);
+ blob_finish(&blob);
+ }
+
+ return ish;
+}
+
+static struct crocus_uncompiled_shader *
+crocus_create_shader_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct nir_shader *nir;
+
+ if (state->type == PIPE_SHADER_IR_TGSI)
+ nir = tgsi_to_nir(state->tokens, ctx->screen, false);
+ else
+ nir = state->ir.nir;
+
+ return crocus_create_uncompiled_shader(ctx, nir, &state->stream_output);
+}
+
+static void *
+crocus_create_vs_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ /* User clip planes or gen5 sprite coord enable */
+ if (ish->nir->info.clip_distance_array_size == 0 ||
+ screen->devinfo.ver <= 5)
+ ish->nos |= (1ull << CROCUS_NOS_RASTERIZER);
+
+ if (!screen->devinfo.is_haswell)
+ ish->nos |= (1ull << CROCUS_NOS_VERTEX_ELEMENTS);
+
+ if (screen->precompile) {
+ struct brw_vs_prog_key key = { KEY_INIT() };
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_vs(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_tcs_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+ struct shader_info *info = &ish->nir->info;
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ if (screen->precompile) {
+ const unsigned _GL_TRIANGLES = 0x0004;
+ struct brw_tcs_prog_key key = {
+ KEY_INIT(),
+ // XXX: make sure the linker fills this out from the TES...
+ .tes_primitive_mode =
+ info->tess.primitive_mode ? info->tess.primitive_mode
+ : _GL_TRIANGLES,
+ .outputs_written = info->outputs_written,
+ .patch_outputs_written = info->patch_outputs_written,
+ };
+
+ key.input_vertices = info->tess.tcs_vertices_out;
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_tcs(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_tes_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+ struct shader_info *info = &ish->nir->info;
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ /* User clip planes */
+ if (ish->nir->info.clip_distance_array_size == 0)
+ ish->nos |= (1ull << CROCUS_NOS_RASTERIZER);
+
+ if (screen->precompile) {
+ struct brw_tes_prog_key key = {
+ KEY_INIT(),
+ // XXX: not ideal, need TCS output/TES input unification
+ .inputs_read = info->inputs_read,
+ .patch_inputs_read = info->patch_inputs_read,
+ };
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_tes(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_gs_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ /* User clip planes */
+ if (ish->nir->info.clip_distance_array_size == 0)
+ ish->nos |= (1ull << CROCUS_NOS_RASTERIZER);
+
+ if (screen->precompile) {
+ struct brw_gs_prog_key key = { KEY_INIT() };
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_gs(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_fs_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+ struct shader_info *info = &ish->nir->info;
+
+ ish->nos |= (1ull << CROCUS_NOS_FRAMEBUFFER) |
+ (1ull << CROCUS_NOS_DEPTH_STENCIL_ALPHA) |
+ (1ull << CROCUS_NOS_RASTERIZER) |
+ (1ull << CROCUS_NOS_TEXTURES) |
+ (1ull << CROCUS_NOS_BLEND);
+
+ /* The program key needs the VUE map if there are > 16 inputs or gen4/5 */
+ if (screen->devinfo.ver < 6 || util_bitcount64(ish->nir->info.inputs_read &
+ BRW_FS_VARYING_INPUT_MASK) > 16) {
+ ish->nos |= (1ull << CROCUS_NOS_LAST_VUE_MAP);
+ }
+
+ if (screen->precompile) {
+ const uint64_t color_outputs = info->outputs_written &
+ ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
+ BITFIELD64_BIT(FRAG_RESULT_STENCIL) |
+ BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
+
+ bool can_rearrange_varyings =
+ screen->devinfo.ver > 6 && util_bitcount64(info->inputs_read & BRW_FS_VARYING_INPUT_MASK) <= 16;
+
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_wm_prog_key key = {
+ KEY_INIT(),
+ .nr_color_regions = util_bitcount(color_outputs),
+ .coherent_fb_fetch = false,
+ .input_slots_valid =
+ can_rearrange_varyings ? 0 : info->inputs_read | VARYING_BIT_POS,
+ };
+
+ struct brw_vue_map vue_map;
+ if (devinfo->ver < 6) {
+ brw_compute_vue_map(devinfo, &vue_map,
+ info->inputs_read | VARYING_BIT_POS,
+ false, /* pos slots */ 1);
+ }
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_fs(ice, ish, &key, &vue_map);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_compute_state(struct pipe_context *ctx,
+ const struct pipe_compute_state *state)
+{
+ assert(state->ir_type == PIPE_SHADER_IR_NIR);
+
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish =
+ crocus_create_uncompiled_shader(ctx, (void *) state->prog, NULL);
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ // XXX: disallow more than 64KB of shared variables
+
+ if (screen->precompile) {
+ struct brw_cs_prog_key key = { KEY_INIT() };
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_cs(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+/**
+ * The pipe->delete_[stage]_state() driver hooks.
+ *
+ * Frees the crocus_uncompiled_shader.
+ */
+static void
+crocus_delete_shader_state(struct pipe_context *ctx, void *state, gl_shader_stage stage)
+{
+ struct crocus_uncompiled_shader *ish = state;
+ struct crocus_context *ice = (void *) ctx;
+
+ if (ice->shaders.uncompiled[stage] == ish) {
+ ice->shaders.uncompiled[stage] = NULL;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage;
+ }
+
+ if (ish->const_data) {
+ pipe_resource_reference(&ish->const_data, NULL);
+ pipe_resource_reference(&ish->const_data_state.res, NULL);
+ }
+
+ ralloc_free(ish->nir);
+ free(ish);
+}
+
+static void
+crocus_delete_vs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_VERTEX);
+}
+
+static void
+crocus_delete_tcs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_CTRL);
+}
+
+static void
+crocus_delete_tes_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_EVAL);
+}
+
+static void
+crocus_delete_gs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_GEOMETRY);
+}
+
+static void
+crocus_delete_fs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_FRAGMENT);
+}
+
+static void
+crocus_delete_cs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_COMPUTE);
+}
+
+/**
+ * The pipe->bind_[stage]_state() driver hook.
+ *
+ * Binds an uncompiled shader as the current one for a particular stage.
+ * Updates dirty tracking to account for the shader's NOS.
+ */
+static void
+bind_shader_state(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ gl_shader_stage stage)
+{
+ uint64_t dirty_bit = CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage;
+ const uint64_t nos = ish ? ish->nos : 0;
+
+ const struct shader_info *old_info = crocus_get_shader_info(ice, stage);
+ const struct shader_info *new_info = ish ? &ish->nir->info : NULL;
+
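+ /* If the number of texture units in use changed, the sampler state
+ * tables need to be re-emitted.
+ */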
+ if ((old_info ? BITSET_LAST_BIT(old_info->textures_used) : 0) !=
+ (new_info ? BITSET_LAST_BIT(new_info->textures_used) : 0)) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
+ }
+
+ ice->shaders.uncompiled[stage] = ish;
+ ice->state.stage_dirty |= dirty_bit;
+
+ /* Record that CSOs need to mark CROCUS_STAGE_DIRTY_UNCOMPILED_XS when they change
+ * (or that they no longer need to do so).
+ */
+ for (int i = 0; i < CROCUS_NOS_COUNT; i++) {
+ if (nos & (1 << i))
+ ice->state.stage_dirty_for_nos[i] |= dirty_bit;
+ else
+ ice->state.stage_dirty_for_nos[i] &= ~dirty_bit;
+ }
+}
+
+static void
+crocus_bind_vs_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_uncompiled_shader *new_ish = state;
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (new_ish &&
+ ice->state.window_space_position !=
+ new_ish->nir->info.vs.window_space_position) {
+ ice->state.window_space_position =
+ new_ish->nir->info.vs.window_space_position;
+
+ ice->state.dirty |= CROCUS_DIRTY_CLIP |
+ CROCUS_DIRTY_RASTER |
+ CROCUS_DIRTY_CC_VIEWPORT;
+ }
+
+ if (devinfo->ver == 6) {
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+ }
+
+ bind_shader_state((void *) ctx, state, MESA_SHADER_VERTEX);
+}
+
+static void
+crocus_bind_tcs_state(struct pipe_context *ctx, void *state)
+{
+ bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_CTRL);
+}
+
+static void
+crocus_bind_tes_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ /* Enabling/disabling optional stages requires a URB reconfiguration. */
+ if (!!state != !!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL])
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+
+ bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_EVAL);
+}
+
+static void
+crocus_bind_gs_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ /* Enabling/disabling optional stages requires a URB reconfiguration. */
+ if (!!state != !!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+
+ bind_shader_state((void *) ctx, state, MESA_SHADER_GEOMETRY);
+}
+
+static void
+crocus_bind_fs_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_uncompiled_shader *old_ish =
+ ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+ struct crocus_uncompiled_shader *new_ish = state;
+
+ const unsigned color_bits =
+ BITFIELD64_BIT(FRAG_RESULT_COLOR) |
+ BITFIELD64_RANGE(FRAG_RESULT_DATA0, BRW_MAX_DRAW_BUFFERS);
+
+ /* Fragment shader outputs influence HasWriteableRT */
+ if (!old_ish || !new_ish ||
+ (old_ish->nir->info.outputs_written & color_bits) !=
+ (new_ish->nir->info.outputs_written & color_bits))
+ ice->state.dirty |= CROCUS_DIRTY_WM;
+
+ bind_shader_state((void *) ctx, state, MESA_SHADER_FRAGMENT);
+}
+
+static void
+crocus_bind_cs_state(struct pipe_context *ctx, void *state)
+{
+ bind_shader_state((void *) ctx, state, MESA_SHADER_COMPUTE);
+}
+
+void
+crocus_init_program_functions(struct pipe_context *ctx)
+{
+ ctx->create_vs_state = crocus_create_vs_state;
+ ctx->create_tcs_state = crocus_create_tcs_state;
+ ctx->create_tes_state = crocus_create_tes_state;
+ ctx->create_gs_state = crocus_create_gs_state;
+ ctx->create_fs_state = crocus_create_fs_state;
+ ctx->create_compute_state = crocus_create_compute_state;
+
+ ctx->delete_vs_state = crocus_delete_vs_state;
+ ctx->delete_tcs_state = crocus_delete_tcs_state;
+ ctx->delete_tes_state = crocus_delete_tes_state;
+ ctx->delete_gs_state = crocus_delete_gs_state;
+ ctx->delete_fs_state = crocus_delete_fs_state;
+ ctx->delete_compute_state = crocus_delete_cs_state;
+
+ ctx->bind_vs_state = crocus_bind_vs_state;
+ ctx->bind_tcs_state = crocus_bind_tcs_state;
+ ctx->bind_tes_state = crocus_bind_tes_state;
+ ctx->bind_gs_state = crocus_bind_gs_state;
+ ctx->bind_fs_state = crocus_bind_fs_state;
+ ctx->bind_compute_state = crocus_bind_cs_state;
+}
diff --git a/src/gallium/drivers/crocus/crocus_program_cache.c b/src/gallium/drivers/crocus/crocus_program_cache.c
new file mode 100644
index 00000000000..d2d4b821754
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_program_cache.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_program_cache.c
+ *
+ * The in-memory program cache. This is basically a hash table mapping
+ * API-specified shaders and a state key to a compiled variant. It also
+ * takes care of uploading shader assembly into a BO for use on the GPU.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_atomic.h"
+#include "util/u_upload_mgr.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/compiler/brw_eu.h"
+#include "intel/compiler/brw_nir.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+
+struct keybox {
+ uint16_t size;
+ enum crocus_program_cache_id cache_id;
+ uint8_t data[0];
+};
+
+static struct keybox *
+make_keybox(void *mem_ctx, enum crocus_program_cache_id cache_id,
+ const void *key, uint32_t key_size)
+{
+ struct keybox *keybox =
+ ralloc_size(mem_ctx, sizeof(struct keybox) + key_size);
+
+ keybox->cache_id = cache_id;
+ keybox->size = key_size;
+ memcpy(keybox->data, key, key_size);
+
+ return keybox;
+}
+
+static uint32_t
+keybox_hash(const void *void_key)
+{
+ const struct keybox *key = void_key;
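+ /* Hash the cache_id together with the key payload; data[] immediately
+ * follows cache_id in the keybox, so one contiguous hash covers both.
+ */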
+ return _mesa_hash_data(&key->cache_id, key->size + sizeof(key->cache_id));
+}
+
+static bool
+keybox_equals(const void *void_a, const void *void_b)
+{
+ const struct keybox *a = void_a, *b = void_b;
+ if (a->size != b->size)
+ return false;
+
+ return memcmp(a->data, b->data, a->size) == 0;
+}
+
+struct crocus_compiled_shader *
+crocus_find_cached_shader(struct crocus_context *ice,
+ enum crocus_program_cache_id cache_id,
+ uint32_t key_size, const void *key)
+{
+ struct keybox *keybox = make_keybox(NULL, cache_id, key, key_size);
+ struct hash_entry *entry =
+ _mesa_hash_table_search(ice->shaders.cache, keybox);
+
+ ralloc_free(keybox);
+
+ return entry ? entry->data : NULL;
+}
+
+const void *
+crocus_find_previous_compile(const struct crocus_context *ice,
+ enum crocus_program_cache_id cache_id,
+ unsigned program_string_id)
+{
+ hash_table_foreach(ice->shaders.cache, entry) {
+ const struct keybox *keybox = entry->key;
+ const struct brw_base_prog_key *key = (const void *)keybox->data;
+ if (keybox->cache_id == cache_id &&
+ key->program_string_id == program_string_id) {
+ return keybox->data;
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * Look for an existing entry in the cache that has identical assembly code.
+ *
+ * This is useful for programs generating shaders at runtime, where multiple
+ * distinct shaders (from an API perspective) may compile to the same assembly
+ * in our backend. This saves space in the program cache buffer.
+ */
+static const struct crocus_compiled_shader *
+find_existing_assembly(struct hash_table *cache, void *map,
+ const void *assembly, unsigned assembly_size)
+{
+ hash_table_foreach (cache, entry) {
+ const struct crocus_compiled_shader *existing = entry->data;
+
+ if (existing->map_size != assembly_size)
+ continue;
+
+ if (memcmp(map + existing->offset, assembly, assembly_size) == 0)
+ return existing;
+ }
+ return NULL;
+}
+
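+/**
+ * Grow the program cache BO: allocate a larger BO, copy the existing
+ * contents into it, and flag any state that depends on the cache BO's
+ * address so it gets re-emitted.
+ */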
+static void
+crocus_cache_new_bo(struct crocus_context *ice,
+ uint32_t new_size)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct crocus_bo *new_bo;
+ new_bo = crocus_bo_alloc(screen->bufmgr, "program cache", new_size);
+
+ void *map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE |
+ MAP_ASYNC | MAP_PERSISTENT);
+
+ if (ice->shaders.cache_next_offset != 0) {
+ memcpy(map, ice->shaders.cache_bo_map, ice->shaders.cache_next_offset);
+ }
+
+ crocus_bo_unmap(ice->shaders.cache_bo);
+ crocus_bo_unreference(ice->shaders.cache_bo);
+ ice->shaders.cache_bo = new_bo;
+ ice->shaders.cache_bo_map = map;
+
+ if (screen->devinfo.ver == 4) {
+      /* Re-emit all shaders on Gen4 only. */
+ ice->state.dirty |= CROCUS_DIRTY_CLIP | CROCUS_DIRTY_RASTER |
+ CROCUS_DIRTY_WM;
+ }
+   /* Unset state base address so it gets re-emitted for the new cache BO. */
+   ice->batches[CROCUS_BATCH_RENDER].state_base_address_emitted = false;
+   ice->batches[CROCUS_BATCH_COMPUTE].state_base_address_emitted = false;
+}
+
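+/**
+ * Allocate space for a program in the cache BO, doubling the BO size as
+ * needed.  Returns the offset of the allocation; the next offset is kept
+ * 64-byte aligned.
+ */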
+static uint32_t
+crocus_alloc_item_data(struct crocus_context *ice, uint32_t size)
+{
+ if (ice->shaders.cache_next_offset + size > ice->shaders.cache_bo->size) {
+ uint32_t new_size = ice->shaders.cache_bo->size * 2;
+ while (ice->shaders.cache_next_offset + size > new_size)
+ new_size *= 2;
+
+ crocus_cache_new_bo(ice, new_size);
+ }
+ uint32_t offset = ice->shaders.cache_next_offset;
+
+ /* Programs are always 64-byte aligned, so set up the next one now */
+ ice->shaders.cache_next_offset = ALIGN(offset + size, 64);
+ return offset;
+}
+
+struct crocus_compiled_shader *
+crocus_upload_shader(struct crocus_context *ice,
+ enum crocus_program_cache_id cache_id, uint32_t key_size,
+ const void *key, const void *assembly, uint32_t asm_size,
+ struct brw_stage_prog_data *prog_data,
+ uint32_t prog_data_size, uint32_t *streamout,
+ enum brw_param_builtin *system_values,
+ unsigned num_system_values, unsigned num_cbufs,
+ const struct crocus_binding_table *bt)
+{
+ struct hash_table *cache = ice->shaders.cache;
+ struct crocus_compiled_shader *shader =
+ rzalloc_size(cache, sizeof(struct crocus_compiled_shader));
+ const struct crocus_compiled_shader *existing = find_existing_assembly(
+ cache, ice->shaders.cache_bo_map, assembly, asm_size);
+
+   /* If we can find a matching program in the cache already, then reuse its
+    * assembly without creating a new copy in the underlying buffer object.
+    * This is notably useful for programs generating shaders at runtime,
+    * where multiple shaders may compile to the same thing in our backend.
+    */
+ if (existing) {
+ shader->offset = existing->offset;
+ shader->map_size = existing->map_size;
+ } else {
+ shader->offset = crocus_alloc_item_data(ice, asm_size);
+ shader->map_size = asm_size;
+
+ memcpy(ice->shaders.cache_bo_map + shader->offset, assembly, asm_size);
+ }
+
+ shader->prog_data = prog_data;
+ shader->prog_data_size = prog_data_size;
+ shader->streamout = streamout;
+ shader->system_values = system_values;
+ shader->num_system_values = num_system_values;
+ shader->num_cbufs = num_cbufs;
+ shader->bt = *bt;
+
+ ralloc_steal(shader, shader->prog_data);
+ if (prog_data_size > 16) {
+ ralloc_steal(shader->prog_data, prog_data->param);
+ ralloc_steal(shader->prog_data, prog_data->pull_param);
+ }
+ ralloc_steal(shader, shader->streamout);
+ ralloc_steal(shader, shader->system_values);
+
+ struct keybox *keybox = make_keybox(shader, cache_id, key, key_size);
+ _mesa_hash_table_insert(ice->shaders.cache, keybox, shader);
+
+ return shader;
+}
+
+bool
+crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch, const void *key,
+ uint32_t key_size, uint32_t *kernel_out,
+ void *prog_data_out)
+{
+ struct blorp_context *blorp = blorp_batch->blorp;
+ struct crocus_context *ice = blorp->driver_ctx;
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_BLORP, key_size, key);
+
+ if (!shader)
+ return false;
+
+ *kernel_out = shader->offset;
+ *((void **)prog_data_out) = shader->prog_data;
+
+ return true;
+}
+
+bool
+crocus_blorp_upload_shader(struct blorp_batch *blorp_batch, uint32_t stage,
+ const void *key, uint32_t key_size,
+ const void *kernel, uint32_t kernel_size,
+ const struct brw_stage_prog_data *prog_data_templ,
+ uint32_t prog_data_size, uint32_t *kernel_out,
+ void *prog_data_out)
+{
+ struct blorp_context *blorp = blorp_batch->blorp;
+ struct crocus_context *ice = blorp->driver_ctx;
+
+ struct brw_stage_prog_data *prog_data = ralloc_size(NULL, prog_data_size);
+ memcpy(prog_data, prog_data_templ, prog_data_size);
+
+ struct crocus_binding_table bt;
+ memset(&bt, 0, sizeof(bt));
+
+ struct crocus_compiled_shader *shader = crocus_upload_shader(
+ ice, CROCUS_CACHE_BLORP, key_size, key, kernel, kernel_size, prog_data,
+ prog_data_size, NULL, NULL, 0, 0, &bt);
+
+ *kernel_out = shader->offset;
+ *((void **)prog_data_out) = shader->prog_data;
+
+ return true;
+}
+
+void
+crocus_init_program_cache(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ ice->shaders.cache =
+ _mesa_hash_table_create(ice, keybox_hash, keybox_equals);
+
+ ice->shaders.cache_bo =
+ crocus_bo_alloc(screen->bufmgr, "program_cache", 16384);
+ ice->shaders.cache_bo_map =
+ crocus_bo_map(NULL, ice->shaders.cache_bo,
+ MAP_READ | MAP_WRITE | MAP_ASYNC | MAP_PERSISTENT);
+}
+
+void
+crocus_destroy_program_cache(struct crocus_context *ice)
+{
+ for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+ ice->shaders.prog[i] = NULL;
+ }
+
+ if (ice->shaders.cache_bo) {
+ crocus_bo_unmap(ice->shaders.cache_bo);
+ crocus_bo_unreference(ice->shaders.cache_bo);
+ ice->shaders.cache_bo_map = NULL;
+ ice->shaders.cache_bo = NULL;
+ }
+
+ ralloc_free(ice->shaders.cache);
+}
+
+static const char *
+cache_name(enum crocus_program_cache_id cache_id)
+{
+ if (cache_id == CROCUS_CACHE_BLORP)
+ return "BLORP";
+
+ if (cache_id == CROCUS_CACHE_SF)
+ return "SF";
+
+ if (cache_id == CROCUS_CACHE_CLIP)
+ return "CLIP";
+
+ if (cache_id == CROCUS_CACHE_FF_GS)
+ return "FF_GS";
+
+ return _mesa_shader_stage_to_string(cache_id);
+}
+
+void
+crocus_print_program_cache(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ hash_table_foreach(ice->shaders.cache, entry) {
+ const struct keybox *keybox = entry->key;
+ struct crocus_compiled_shader *shader = entry->data;
+ fprintf(stderr, "%s:\n", cache_name(keybox->cache_id));
+ brw_disassemble(devinfo, ice->shaders.cache_bo_map + shader->offset, 0,
+ shader->prog_data->program_size, NULL, stderr);
+ }
+}
diff --git a/src/gallium/drivers/crocus/crocus_query.c b/src/gallium/drivers/crocus/crocus_query.c
new file mode 100644
index 00000000000..14ba9fbce59
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_query.c
@@ -0,0 +1,996 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_query.c
+ *
+ * ============================= GENXML CODE =============================
+ * [This file is compiled once per generation.]
+ * =======================================================================
+ *
+ * Query object support. This allows measuring various simple statistics
+ * via counters on the GPU. We use GenX code for MI_MATH calculations.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "perf/intel_perf.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_inlines.h"
+#include "util/u_upload_mgr.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "crocus_fence.h"
+#include "crocus_monitor.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+
+#include "crocus_genx_macros.h"
+
+#if GFX_VER == 6
+// TODO: Add these to genxml?
+#define SO_PRIM_STORAGE_NEEDED(n) (0x2280)
+#define SO_NUM_PRIMS_WRITTEN(n) (0x2288)
+
+// TODO: remove HS/DS/CS
+#define GFX6_IA_VERTICES_COUNT_num 0x2310
+#define GFX6_IA_PRIMITIVES_COUNT_num 0x2318
+#define GFX6_VS_INVOCATION_COUNT_num 0x2320
+#define GFX6_HS_INVOCATION_COUNT_num 0x2300
+#define GFX6_DS_INVOCATION_COUNT_num 0x2308
+#define GFX6_GS_INVOCATION_COUNT_num 0x2328
+#define GFX6_GS_PRIMITIVES_COUNT_num 0x2330
+#define GFX6_CL_INVOCATION_COUNT_num 0x2338
+#define GFX6_CL_PRIMITIVES_COUNT_num 0x2340
+#define GFX6_PS_INVOCATION_COUNT_num 0x2348
+#define GFX6_CS_INVOCATION_COUNT_num 0x2290
+#define GFX6_PS_DEPTH_COUNT_num 0x2350
+
+#elif GFX_VER == 7
+#define SO_PRIM_STORAGE_NEEDED(n) (GENX(SO_PRIM_STORAGE_NEEDED0_num) + (n) * 8)
+#define SO_NUM_PRIMS_WRITTEN(n) (GENX(SO_NUM_PRIMS_WRITTEN0_num) + (n) * 8)
+#endif
+
+struct crocus_query {
+ enum pipe_query_type type;
+ int index;
+
+ bool ready;
+
+ bool stalled;
+
+ uint64_t result;
+
+ struct crocus_state_ref query_state_ref;
+ struct crocus_query_snapshots *map;
+ struct crocus_syncobj *syncobj;
+
+ int batch_idx;
+
+ struct crocus_monitor_object *monitor;
+
+ /* Fence for PIPE_QUERY_GPU_FINISHED. */
+ struct pipe_fence_handle *fence;
+};
+
+struct crocus_query_snapshots {
+ /** crocus_render_condition's saved MI_PREDICATE_RESULT value. */
+ uint64_t predicate_result;
+
+ /** Have the start/end snapshots landed? */
+ uint64_t snapshots_landed;
+
+ /** Starting and ending counter snapshots */
+ uint64_t start;
+ uint64_t end;
+};
+
+struct crocus_query_so_overflow {
+ uint64_t predicate_result;
+ uint64_t snapshots_landed;
+
+ struct {
+ uint64_t prim_storage_needed[2];
+ uint64_t num_prims[2];
+ } stream[4];
+};
+
+#if GFX_VERx10 == 75
+static struct mi_value
+query_mem64(struct crocus_query *q, uint32_t offset)
+{
+ return mi_mem64(rw_bo(crocus_resource_bo(q->query_state_ref.res),
+ q->query_state_ref.offset + offset));
+}
+#endif
+
+/**
+ * Is this type of query written by PIPE_CONTROL?
+ */
+static bool
+crocus_is_query_pipelined(struct crocus_query *q)
+{
+ switch (q->type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ case PIPE_QUERY_TIMESTAMP:
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ case PIPE_QUERY_TIME_ELAPSED:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
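+/**
+ * Write the snapshots_landed flag once the query results are available.
+ * Only compiled on Haswell (GFX_VERx10 == 75); other generations wait on
+ * the batch's syncobj instead (see crocus_get_query_result).
+ */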
+static void
+mark_available(struct crocus_context *ice, struct crocus_query *q)
+{
+#if GFX_VERx10 == 75
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+ struct crocus_screen *screen = batch->screen;
+ unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
+ unsigned offset = offsetof(struct crocus_query_snapshots, snapshots_landed);
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+ offset += q->query_state_ref.offset;
+
+ if (!crocus_is_query_pipelined(q)) {
+ screen->vtbl.store_data_imm64(batch, bo, offset, true);
+ } else {
+ /* Order available *after* the query results. */
+ flags |= PIPE_CONTROL_FLUSH_ENABLE;
+ crocus_emit_pipe_control_write(batch, "query: mark available",
+ flags, bo, offset, true);
+ }
+#endif
+}
+
+/**
+ * Write a pipelined snapshot (e.g. PS_DEPTH_COUNT or a timestamp, depending
+ * on the given flags) to the requested offset in q's BO via a PIPE_CONTROL.
+ */
+static void
+crocus_pipelined_write(struct crocus_batch *batch,
+ struct crocus_query *q,
+ enum pipe_control_flags flags,
+ unsigned offset)
+{
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+
+ crocus_emit_pipe_control_write(batch, "query: pipelined snapshot write",
+ flags,
+ bo, offset, 0ull);
+}
+
+static void
+write_value(struct crocus_context *ice, struct crocus_query *q, unsigned offset)
+{
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+#if GFX_VER >= 6
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+#endif
+
+ if (!crocus_is_query_pipelined(q)) {
+ crocus_emit_pipe_control_flush(batch,
+ "query: non-pipelined snapshot write",
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD);
+ q->stalled = true;
+ }
+
+ switch (q->type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q,
+ PIPE_CONTROL_WRITE_DEPTH_COUNT |
+ PIPE_CONTROL_DEPTH_STALL,
+ offset);
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ case PIPE_QUERY_TIMESTAMP:
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q,
+ PIPE_CONTROL_WRITE_TIMESTAMP,
+ offset);
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+#if GFX_VER >= 6
+ screen->vtbl.store_register_mem64(batch,
+ q->index == 0 ?
+ GENX(CL_INVOCATION_COUNT_num) :
+ SO_PRIM_STORAGE_NEEDED(q->index),
+ bo, offset, false);
+#endif
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+#if GFX_VER >= 6
+ screen->vtbl.store_register_mem64(batch,
+ SO_NUM_PRIMS_WRITTEN(q->index),
+ bo, offset, false);
+#endif
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
+#if GFX_VER >= 6
+ static const uint32_t index_to_reg[] = {
+ GENX(IA_VERTICES_COUNT_num),
+ GENX(IA_PRIMITIVES_COUNT_num),
+ GENX(VS_INVOCATION_COUNT_num),
+ GENX(GS_INVOCATION_COUNT_num),
+ GENX(GS_PRIMITIVES_COUNT_num),
+ GENX(CL_INVOCATION_COUNT_num),
+ GENX(CL_PRIMITIVES_COUNT_num),
+ GENX(PS_INVOCATION_COUNT_num),
+ GENX(HS_INVOCATION_COUNT_num),
+ GENX(DS_INVOCATION_COUNT_num),
+ GENX(CS_INVOCATION_COUNT_num),
+ };
+ uint32_t reg = index_to_reg[q->index];
+
+#if GFX_VER == 6
+ /* Gfx6 GS code counts full primitives, that is, it won't count individual
+ * triangles in a triangle strip. Use CL_INVOCATION_COUNT for that.
+ */
+ if (q->index == PIPE_STAT_QUERY_GS_PRIMITIVES)
+ reg = GENX(CL_INVOCATION_COUNT_num);
+#endif
+
+ screen->vtbl.store_register_mem64(batch, reg, bo, offset, false);
+#endif
+ break;
+ }
+ default:
+ assert(false);
+ }
+}
+
+#if GFX_VER >= 6
+static void
+write_overflow_values(struct crocus_context *ice, struct crocus_query *q, bool end)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+ uint32_t offset = q->query_state_ref.offset;
+ crocus_emit_pipe_control_flush(batch,
+ "query: write SO overflow snapshots",
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD);
+ for (uint32_t i = 0; i < count; i++) {
+ int s = q->index + i;
+ int g_idx = offset + offsetof(struct crocus_query_so_overflow,
+ stream[s].num_prims[end]);
+ int w_idx = offset + offsetof(struct crocus_query_so_overflow,
+ stream[s].prim_storage_needed[end]);
+ screen->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
+ bo, g_idx, false);
+ screen->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
+ bo, w_idx, false);
+ }
+}
+#endif
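+
+/**
+ * Compute time1 - time0, accounting for the raw timestamp counter wrapping
+ * around at TIMESTAMP_BITS bits.
+ */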
+static uint64_t
+crocus_raw_timestamp_delta(uint64_t time0, uint64_t time1)
+{
+ if (time0 > time1) {
+ return (1ULL << TIMESTAMP_BITS) + time1 - time0;
+ } else {
+ return time1 - time0;
+ }
+}
+
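+/**
+ * A stream overflowed if the primitives needing storage and the primitives
+ * actually written diverged between the begin and end snapshots.
+ */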
+static bool
+stream_overflowed(struct crocus_query_so_overflow *so, int s)
+{
+ return (so->stream[s].prim_storage_needed[1] -
+ so->stream[s].prim_storage_needed[0]) !=
+ (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
+}
+
+static void
+calculate_result_on_cpu(const struct intel_device_info *devinfo,
+ struct crocus_query *q)
+{
+ switch (q->type) {
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ q->result = q->map->end != q->map->start;
+ break;
+ case PIPE_QUERY_TIMESTAMP:
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ /* The timestamp is the single starting snapshot. */
+ q->result = intel_device_info_timebase_scale(devinfo, q->map->start);
+ q->result &= (1ull << TIMESTAMP_BITS) - 1;
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ q->result = crocus_raw_timestamp_delta(q->map->start, q->map->end);
+ q->result = intel_device_info_timebase_scale(devinfo, q->result);
+ q->result &= (1ull << TIMESTAMP_BITS) - 1;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ q->result = stream_overflowed((void *) q->map, q->index);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ q->result = false;
+ for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
+ q->result |= stream_overflowed((void *) q->map, i);
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
+ q->result = q->map->end - q->map->start;
+
+ /* WaDividePSInvocationCountBy4:HSW,BDW */
+ if (GFX_VER == 7 && devinfo->is_haswell && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
+ q->result /= 4;
+ break;
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ default:
+ q->result = q->map->end - q->map->start;
+ break;
+ }
+
+ q->ready = true;
+}
+
+#if GFX_VERx10 == 75
+/**
+ * Calculate the streamout overflow for stream \p idx:
+ *
+ * (num_prims[1] - num_prims[0]) - (storage_needed[1] - storage_needed[0])
+ */
+static struct mi_value
+calc_overflow_for_stream(struct mi_builder *b,
+ struct crocus_query *q,
+ int idx)
+{
+#define C(counter, i) query_mem64(q, \
+ offsetof(struct crocus_query_so_overflow, stream[idx].counter[i]))
+
+ return mi_isub(b, mi_isub(b, C(num_prims, 1), C(num_prims, 0)),
+ mi_isub(b, C(prim_storage_needed, 1),
+ C(prim_storage_needed, 0)));
+#undef C
+}
+
+/**
+ * Calculate whether any stream has overflowed.
+ */
+static struct mi_value
+calc_overflow_any_stream(struct mi_builder *b, struct crocus_query *q)
+{
+ struct mi_value stream_result[MAX_VERTEX_STREAMS];
+ for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
+ stream_result[i] = calc_overflow_for_stream(b, q, i);
+
+ struct mi_value result = stream_result[0];
+ for (int i = 1; i < MAX_VERTEX_STREAMS; i++)
+ result = mi_ior(b, result, stream_result[i]);
+
+ return result;
+}
+
+
+static bool
+query_is_boolean(enum pipe_query_type type)
+{
+ switch (type) {
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/**
+ * Calculate the result using MI_MATH.
+ */
+static struct mi_value
+calculate_result_on_gpu(const struct intel_device_info *devinfo,
+ struct mi_builder *b,
+ struct crocus_query *q)
+{
+ struct mi_value result;
+ struct mi_value start_val =
+ query_mem64(q, offsetof(struct crocus_query_snapshots, start));
+ struct mi_value end_val =
+ query_mem64(q, offsetof(struct crocus_query_snapshots, end));
+
+ switch (q->type) {
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ result = calc_overflow_for_stream(b, q, q->index);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ result = calc_overflow_any_stream(b, q);
+ break;
+ case PIPE_QUERY_TIMESTAMP: {
+ /* TODO: This discards any fractional bits of the timebase scale.
+ * We would need to do a bit of fixed point math on the CS ALU, or
+ * launch an actual shader to calculate this with full precision.
+ */
+ uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;
+ result = mi_iand(b, mi_imm((1ull << 36) - 1),
+ mi_imul_imm(b, start_val, scale));
+ break;
+ }
+ case PIPE_QUERY_TIME_ELAPSED: {
+ /* TODO: This discards fractional bits (see above). */
+ uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;
+ result = mi_imul_imm(b, mi_isub(b, end_val, start_val), scale);
+ break;
+ }
+ default:
+ result = mi_isub(b, end_val, start_val);
+ break;
+ }
+ /* WaDividePSInvocationCountBy4:HSW,BDW */
+ if (GFX_VER == 7 && devinfo->is_haswell &&
+ q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
+ q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
+ result = mi_ushr32_imm(b, result, 2);
+
+ if (query_is_boolean(q->type))
+ result = mi_iand(b, mi_nz(b, result), mi_imm(1));
+
+ return result;
+}
+#endif
+
+static struct pipe_query *
+crocus_create_query(struct pipe_context *ctx,
+ unsigned query_type,
+ unsigned index)
+{
+ struct crocus_query *q = calloc(1, sizeof(struct crocus_query));
+
+ q->type = query_type;
+ q->index = index;
+ q->monitor = NULL;
+
+ if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
+ q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
+ q->batch_idx = CROCUS_BATCH_COMPUTE;
+ else
+ q->batch_idx = CROCUS_BATCH_RENDER;
+ return (struct pipe_query *) q;
+}
+
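+/**
+ * Create a "batch query": a driver-specific group of performance-monitor
+ * counters, backed by a crocus_monitor_object rather than an ordinary query.
+ */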
+static struct pipe_query *
+crocus_create_batch_query(struct pipe_context *ctx,
+ unsigned num_queries,
+ unsigned *query_types)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = calloc(1, sizeof(struct crocus_query));
+ if (unlikely(!q))
+ return NULL;
+ q->type = PIPE_QUERY_DRIVER_SPECIFIC;
+ q->index = -1;
+ q->monitor = crocus_create_monitor_object(ice, num_queries, query_types);
+ if (unlikely(!q->monitor)) {
+ free(q);
+ return NULL;
+ }
+
+ return (struct pipe_query *) q;
+}
+
+static void
+crocus_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
+{
+ struct crocus_query *query = (void *) p_query;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ if (query->monitor) {
+ crocus_destroy_monitor_object(ctx, query->monitor);
+ query->monitor = NULL;
+ } else {
+ crocus_syncobj_reference(screen, &query->syncobj, NULL);
+ screen->base.fence_reference(ctx->screen, &query->fence, NULL);
+ }
+ free(query);
+}
+
+
+static bool
+crocus_begin_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+
+ if (q->monitor)
+ return crocus_begin_monitor(ctx, q->monitor);
+
+ void *ptr = NULL;
+ uint32_t size;
+
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ size = sizeof(struct crocus_query_so_overflow);
+ else
+ size = sizeof(struct crocus_query_snapshots);
+
+ u_upload_alloc(ice->query_buffer_uploader, 0,
+ size, size, &q->query_state_ref.offset,
+ &q->query_state_ref.res, &ptr);
+
+ if (!crocus_resource_bo(q->query_state_ref.res))
+ return false;
+
+ q->map = ptr;
+ if (!q->map)
+ return false;
+
+ q->result = 0ull;
+ q->ready = false;
+ WRITE_ONCE(q->map->snapshots_landed, false);
+
+ if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
+ ice->state.prims_generated_query_active = true;
+ ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
+ }
+
+#if GFX_VER <= 5
+ if (q->type == PIPE_QUERY_OCCLUSION_COUNTER ||
+ q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+ ice->state.stats_wm++;
+ ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE;
+ }
+#endif
+#if GFX_VER >= 6
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ write_overflow_values(ice, q, false);
+ else
+#endif
+ write_value(ice, q,
+ q->query_state_ref.offset +
+ offsetof(struct crocus_query_snapshots, start));
+
+ return true;
+}
+
+static bool
+crocus_end_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+
+ if (q->monitor)
+ return crocus_end_monitor(ctx, q->monitor);
+
+ if (q->type == PIPE_QUERY_GPU_FINISHED) {
+ ctx->flush(ctx, &q->fence, PIPE_FLUSH_DEFERRED);
+ return true;
+ }
+
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+
+ if (q->type == PIPE_QUERY_TIMESTAMP) {
+ crocus_begin_query(ctx, query);
+ crocus_batch_reference_signal_syncobj(batch, &q->syncobj);
+ mark_available(ice, q);
+ return true;
+ }
+
+#if GFX_VER <= 5
+ if (q->type == PIPE_QUERY_OCCLUSION_COUNTER ||
+ q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+ ice->state.stats_wm--;
+ ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE;
+ }
+#endif
+ if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
+ ice->state.prims_generated_query_active = false;
+ ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
+ }
+
+#if GFX_VER >= 6
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ write_overflow_values(ice, q, true);
+ else
+#endif
+ write_value(ice, q,
+ q->query_state_ref.offset +
+ offsetof(struct crocus_query_snapshots, end));
+
+ crocus_batch_reference_signal_syncobj(batch, &q->syncobj);
+ mark_available(ice, q);
+
+ return true;
+}
+
+/**
+ * See if the snapshots have landed for a query, and if so, compute the
+ * result and mark it ready. Does not flush (unlike crocus_get_query_result).
+ */
+static void
+crocus_check_query_no_flush(struct crocus_context *ice, struct crocus_query *q)
+{
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
+ calculate_result_on_cpu(devinfo, q);
+ }
+}
+
+static bool
+crocus_get_query_result(struct pipe_context *ctx,
+ struct pipe_query *query,
+ bool wait,
+ union pipe_query_result *result)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+
+ if (q->monitor)
+ return crocus_get_monitor_result(ctx, q->monitor, wait, result->batch);
+
+ struct crocus_screen *screen = (void *) ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (unlikely(screen->no_hw)) {
+ result->u64 = 0;
+ return true;
+ }
+
+ if (!q->ready) {
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+ if (q->syncobj == crocus_batch_get_signal_syncobj(batch))
+ crocus_batch_flush(batch);
+
+#if GFX_VERx10 == 75
+ while (!READ_ONCE(q->map->snapshots_landed)) {
+ if (wait)
+ crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX);
+ else
+ return false;
+ }
+ assert(READ_ONCE(q->map->snapshots_landed));
+#else
+ if (wait)
+ crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX);
+#endif
+ calculate_result_on_cpu(devinfo, q);
+ }
+
+ assert(q->ready);
+
+ result->u64 = q->result;
+
+ return true;
+}
+
+#if GFX_VER == 7
+static void
+crocus_get_query_result_resource(struct pipe_context *ctx,
+ struct pipe_query *query,
+ bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *p_res,
+ unsigned offset)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+ struct crocus_screen *screen = batch->screen;
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ struct crocus_resource *res = (void *) p_res;
+ struct crocus_bo *query_bo = crocus_resource_bo(q->query_state_ref.res);
+ struct crocus_bo *dst_bo = crocus_resource_bo(p_res);
+ unsigned snapshots_landed_offset =
+ offsetof(struct crocus_query_snapshots, snapshots_landed);
+
+ res->bind_history |= PIPE_BIND_QUERY_BUFFER;
+
+ if (index == -1) {
+ /* They're asking for the availability of the result. If we still
+ * have commands queued up which produce the result, submit them
+ * now so that progress happens. Either way, copy the snapshots
+ * landed field to the destination resource.
+ */
+ if (q->syncobj == crocus_batch_get_signal_syncobj(batch))
+ crocus_batch_flush(batch);
+
+ screen->vtbl.copy_mem_mem(batch, dst_bo, offset,
+ query_bo, snapshots_landed_offset,
+ result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
+ return;
+ }
+
+ if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
+ /* The final snapshots happen to have landed, so let's just compute
+ * the result on the CPU now...
+ */
+ calculate_result_on_cpu(devinfo, q);
+ }
+
+ if (q->ready) {
+ /* We happen to have the result on the CPU, so just copy it. */
+ if (result_type <= PIPE_QUERY_TYPE_U32) {
+ screen->vtbl.store_data_imm32(batch, dst_bo, offset, q->result);
+ } else {
+ screen->vtbl.store_data_imm64(batch, dst_bo, offset, q->result);
+ }
+
+      /* Make sure the result lands before they bind the QBO elsewhere and
+       * use the result.
+       */
+ // XXX: Why? i965 doesn't do this.
+ crocus_emit_pipe_control_flush(batch,
+ "query: unknown QBO flushing hack",
+ PIPE_CONTROL_CS_STALL);
+ return;
+ }
+
+#if GFX_VERx10 == 75
+ bool predicated = !wait && !q->stalled;
+
+ struct mi_builder b;
+ mi_builder_init(&b, &batch->screen->devinfo, batch);
+
+ struct mi_value result = calculate_result_on_gpu(devinfo, &b, q);
+ struct mi_value dst =
+ result_type <= PIPE_QUERY_TYPE_U32 ? mi_mem32(rw_bo(dst_bo, offset))
+ : mi_mem64(rw_bo(dst_bo, offset));
+
+ if (predicated) {
+ mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
+ mi_mem64(ro_bo(query_bo, snapshots_landed_offset)));
+ mi_store_if(&b, dst, result);
+ } else {
+ mi_store(&b, dst, result);
+ }
+#endif
+}
+#endif
+
+static void
+crocus_set_active_query_state(struct pipe_context *ctx, bool enable)
+{
+ struct crocus_context *ice = (void *) ctx;
+
+ if (ice->state.statistics_counters_enabled == enable)
+ return;
+
+ // XXX: most packets aren't paying attention to this yet, because it'd
+ // have to be done dynamically at draw time, which is a pain
+ ice->state.statistics_counters_enabled = enable;
+ ice->state.dirty |= CROCUS_DIRTY_CLIP |
+ CROCUS_DIRTY_RASTER |
+ CROCUS_DIRTY_STREAMOUT |
+ CROCUS_DIRTY_WM;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS |
+ CROCUS_STAGE_DIRTY_TCS |
+ CROCUS_STAGE_DIRTY_TES |
+ CROCUS_STAGE_DIRTY_VS;
+}
+
+static void
+set_predicate_enable(struct crocus_context *ice, bool value)
+{
+ if (value)
+ ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER;
+ else
+ ice->state.predicate = CROCUS_PREDICATE_STATE_DONT_RENDER;
+}
+
+#if GFX_VER == 7
+static void
+set_predicate_for_result(struct crocus_context *ice,
+ struct crocus_query *q,
+ bool inverted)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+
+#if GFX_VERx10 != 75
+ /* IVB doesn't have enough MI for this */
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+ ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY;
+ return;
+ }
+#endif
+
+ /* The CPU doesn't have the query result yet; use hardware predication */
+ ice->state.predicate = CROCUS_PREDICATE_STATE_USE_BIT;
+
+ /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
+ crocus_emit_pipe_control_flush(batch,
+ "conditional rendering: set predicate",
+ PIPE_CONTROL_FLUSH_ENABLE);
+ q->stalled = true;
+
+#if GFX_VERx10 != 75
+ struct crocus_screen *screen = batch->screen;
+ screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
+ q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, start));
+ screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
+ q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, end));
+
+ uint32_t mi_predicate = MI_PREDICATE | MI_PREDICATE_COMBINEOP_SET |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+ if (inverted)
+ mi_predicate |= MI_PREDICATE_LOADOP_LOAD;
+ else
+ mi_predicate |= MI_PREDICATE_LOADOP_LOADINV;
+ crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+#else
+ struct mi_builder b;
+ mi_builder_init(&b, &batch->screen->devinfo, batch);
+
+ struct mi_value result;
+
+ switch (q->type) {
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ result = calc_overflow_for_stream(&b, q, q->index);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ result = calc_overflow_any_stream(&b, q);
+ break;
+ default: {
+ /* PIPE_QUERY_OCCLUSION_* */
+ struct mi_value start =
+ query_mem64(q, offsetof(struct crocus_query_snapshots, start));
+ struct mi_value end =
+ query_mem64(q, offsetof(struct crocus_query_snapshots, end));
+ result = mi_isub(&b, end, start);
+ break;
+ }
+ }
+
+ result = inverted ? mi_z(&b, result) : mi_nz(&b, result);
+ result = mi_iand(&b, result, mi_imm(1));
+
+ /* We immediately set the predicate on the render batch, as all the
+ * counters come from 3D operations. However, we may need to predicate
+ * a compute dispatch, which executes in a different GEM context and has
+ * a different MI_PREDICATE_RESULT register. So, we save the result to
+ * memory and reload it in crocus_launch_grid.
+ */
+ mi_value_ref(&b, result);
+
+ mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), result);
+ mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
+
+ unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
+ MI_PREDICATE_COMBINEOP_SET |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+
+ crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+ mi_store(&b, query_mem64(q, offsetof(struct crocus_query_snapshots,
+ predicate_result)), result);
+#endif
+ ice->state.compute_predicate = bo;
+}
+#endif
+
+static void
+crocus_render_condition(struct pipe_context *ctx,
+ struct pipe_query *query,
+ bool condition,
+ enum pipe_render_cond_flag mode)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+
+ /* The old condition isn't relevant; we'll update it if necessary */
+ ice->state.compute_predicate = NULL;
+ ice->condition.query = q;
+ ice->condition.condition = condition;
+ ice->condition.mode = mode;
+
+ if (!q) {
+ ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER;
+ return;
+ }
+
+ crocus_check_query_no_flush(ice, q);
+
+ if (q->result || q->ready) {
+ set_predicate_enable(ice, (q->result != 0) ^ condition);
+ } else {
+ if (mode == PIPE_RENDER_COND_NO_WAIT ||
+ mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
+ perf_debug(&ice->dbg, "Conditional rendering demoted from "
+ "\"no wait\" to \"wait\".");
+ }
+#if GFX_VER == 7
+ set_predicate_for_result(ice, q, condition);
+#else
+ ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY;
+#endif
+ }
+}
+
+static void
+crocus_resolve_conditional_render(struct crocus_context *ice)
+{
+ struct pipe_context *ctx = (void *) ice;
+ struct crocus_query *q = ice->condition.query;
+ struct pipe_query *query = (void *) q;
+ union pipe_query_result result;
+
+ if (ice->state.predicate != CROCUS_PREDICATE_STATE_USE_BIT)
+ return;
+
+ assert(q);
+
+ crocus_get_query_result(ctx, query, true, &result);
+ set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
+}
+
+#if GFX_VER >= 7
+static void
+crocus_emit_compute_predicate(struct crocus_batch *batch)
+{
+ struct crocus_context *ice = batch->ice;
+ struct crocus_screen *screen = batch->screen;
+ screen->vtbl.load_register_mem32(batch, MI_PREDICATE_SRC0,
+ ice->state.compute_predicate, 0);
+ screen->vtbl.load_register_imm32(batch, MI_PREDICATE_SRC1, 0);
+ unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
+ MI_PREDICATE_COMBINEOP_SET |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+
+ crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+}
+#endif
+
+void
+genX(init_screen_query)(struct crocus_screen *screen)
+{
+ screen->vtbl.resolve_conditional_render = crocus_resolve_conditional_render;
+#if GFX_VER >= 7
+ screen->vtbl.emit_compute_predicate = crocus_emit_compute_predicate;
+#endif
+}
+
+void
+genX(init_query)(struct crocus_context *ice)
+{
+ struct pipe_context *ctx = &ice->ctx;
+
+ ctx->create_query = crocus_create_query;
+ ctx->create_batch_query = crocus_create_batch_query;
+ ctx->destroy_query = crocus_destroy_query;
+ ctx->begin_query = crocus_begin_query;
+ ctx->end_query = crocus_end_query;
+ ctx->get_query_result = crocus_get_query_result;
+#if GFX_VER == 7
+ ctx->get_query_result_resource = crocus_get_query_result_resource;
+#endif
+ ctx->set_active_query_state = crocus_set_active_query_state;
+ ctx->render_condition = crocus_render_condition;
+}
diff --git a/src/gallium/drivers/crocus/crocus_resolve.c b/src/gallium/drivers/crocus/crocus_resolve.c
new file mode 100644
index 00000000000..a38eb4a94a7
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_resolve.c
@@ -0,0 +1,1061 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_resolve.c
+ *
+ * This file handles resolve tracking for main and auxiliary surfaces.
+ *
+ * It also handles our cache tracking. We have sets for the render cache,
+ * depth cache, and so on. If a BO is in a cache's set, then it may have
+ * data in that cache. The helpers take care of emitting flushes for
+ * render-to-texture, format reinterpretation issues, and other situations.
+ */
+
+#include "util/hash_table.h"
+#include "util/set.h"
+#include "crocus_context.h"
+#include "compiler/nir/nir.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLORP
+
+static void
+crocus_update_stencil_shadow(struct crocus_context *ice,
+                             struct crocus_resource *res);
+
+/**
+ * Disable auxiliary buffers if a renderbuffer is also bound as a texture
+ * or shader image. This causes a self-dependency, where both rendering
+ * and sampling may concurrently read or write the CCS buffer, causing
+ * incorrect pixels.
+ */
+static bool
+disable_rb_aux_buffer(struct crocus_context *ice,
+ bool *draw_aux_buffer_disabled,
+ struct crocus_resource *tex_res,
+ unsigned min_level, unsigned num_levels,
+ const char *usage)
+{
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ bool found = false;
+
+ /* We only need to worry about fast clears. */
+ if (tex_res->aux.usage != ISL_AUX_USAGE_CCS_D)
+ return false;
+
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+ if (!surf)
+ continue;
+
+ struct crocus_resource *rb_res = (void *) surf->base.texture;
+
+ if (rb_res->bo == tex_res->bo &&
+ surf->base.u.tex.level >= min_level &&
+ surf->base.u.tex.level < min_level + num_levels) {
+ found = draw_aux_buffer_disabled[i] = true;
+ }
+ }
+
+ if (found) {
+ perf_debug(&ice->dbg,
+ "Disabling CCS because a renderbuffer is also bound %s.\n",
+ usage);
+ }
+
+ return found;
+}
+
+static void
+resolve_sampler_views(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_shader_state *shs,
+ const struct shader_info *info,
+ bool *draw_aux_buffer_disabled,
+ bool consider_framebuffer)
+{
+ uint32_t views = info ? (shs->bound_sampler_views & info->textures_used[0]) : 0;
+
+ while (views) {
+ const int i = u_bit_scan(&views);
+ struct crocus_sampler_view *isv = shs->textures[i];
+
+ if (isv->res->base.target != PIPE_BUFFER) {
+ if (consider_framebuffer) {
+ disable_rb_aux_buffer(ice, draw_aux_buffer_disabled, isv->res,
+ isv->view.base_level, isv->view.levels,
+ "for sampling");
+ }
+
+ crocus_resource_prepare_texture(ice, isv->res, isv->view.format,
+ isv->view.base_level, isv->view.levels,
+ isv->view.base_array_layer,
+ isv->view.array_len);
+ }
+
+ crocus_cache_flush_for_read(batch, isv->res->bo);
+
+ if (batch->screen->devinfo.ver >= 7 &&
+ (isv->base.format == PIPE_FORMAT_X24S8_UINT ||
+ isv->base.format == PIPE_FORMAT_X32_S8X24_UINT ||
+ isv->base.format == PIPE_FORMAT_S8_UINT)) {
+ struct crocus_resource *zres, *sres;
+ crocus_get_depth_stencil_resources(&batch->screen->devinfo, isv->base.texture, &zres, &sres);
+ crocus_update_stencil_shadow(ice, sres);
+ crocus_cache_flush_for_read(batch, sres->shadow->bo);
+ }
+ }
+}
+
+static void
+resolve_image_views(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_shader_state *shs,
+ bool *draw_aux_buffer_disabled,
+ bool consider_framebuffer)
+{
+ /* TODO: Consider images used by program */
+ uint32_t views = shs->bound_image_views;
+
+ while (views) {
+ const int i = u_bit_scan(&views);
+ struct pipe_image_view *pview = &shs->image[i].base;
+ struct crocus_resource *res = (void *) pview->resource;
+
+ if (res->base.target != PIPE_BUFFER) {
+ if (consider_framebuffer) {
+ disable_rb_aux_buffer(ice, draw_aux_buffer_disabled,
+ res, pview->u.tex.level, 1,
+ "as a shader image");
+ }
+
+ unsigned num_layers =
+ pview->u.tex.last_layer - pview->u.tex.first_layer + 1;
+
+ /* The data port doesn't understand any compression */
+ crocus_resource_prepare_access(ice, res,
+ pview->u.tex.level, 1,
+ pview->u.tex.first_layer, num_layers,
+ ISL_AUX_USAGE_NONE, false);
+ }
+
+ crocus_cache_flush_for_read(batch, res->bo);
+ }
+}
+
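+/**
+ * Copy between a surface and its alignment-workaround resource (align_res):
+ * with copy_to_wa set, the real texture is blitted into the workaround copy
+ * before rendering; otherwise the results are blitted back afterwards.
+ */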
+static void
+crocus_update_align_res(struct crocus_batch *batch,
+ struct crocus_surface *surf,
+ bool copy_to_wa)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)batch->screen;
+ struct pipe_blit_info info = { 0 };
+
+ info.src.resource = copy_to_wa ? surf->base.texture : surf->align_res;
+ info.src.level = copy_to_wa ? surf->base.u.tex.level : 0;
+ u_box_2d_zslice(0, 0, copy_to_wa ? surf->base.u.tex.first_layer : 0,
+ u_minify(surf->base.texture->width0, surf->base.u.tex.level),
+ u_minify(surf->base.texture->height0, surf->base.u.tex.level), &info.src.box);
+ info.src.format = surf->base.texture->format;
+ info.dst.resource = copy_to_wa ? surf->align_res : surf->base.texture;
+ info.dst.level = copy_to_wa ? 0 : surf->base.u.tex.level;
+ info.dst.box = info.src.box;
+ info.dst.box.z = copy_to_wa ? 0 : surf->base.u.tex.first_layer;
+ info.dst.format = surf->base.texture->format;
+ info.mask = util_format_is_depth_or_stencil(surf->base.texture->format) ? PIPE_MASK_ZS : PIPE_MASK_RGBA;
+ info.filter = 0;
+ if (!screen->vtbl.blit_blt(batch, &info)) {
+ assert(0);
+ }
+}
+
+/**
+ * \brief Resolve buffers before drawing.
+ *
+ * Resolve the depth buffer's HiZ buffer, resolve the depth buffer of each
+ * enabled depth texture, and flush the render cache for any dirty textures.
+ */
+void
+crocus_predraw_resolve_inputs(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ bool *draw_aux_buffer_disabled,
+ gl_shader_stage stage,
+ bool consider_framebuffer)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ const struct shader_info *info = crocus_get_shader_info(ice, stage);
+
+ uint64_t stage_dirty = (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage) |
+ (consider_framebuffer ? CROCUS_STAGE_DIRTY_BINDINGS_FS : 0);
+
+ if (ice->state.stage_dirty & stage_dirty) {
+ resolve_sampler_views(ice, batch, shs, info, draw_aux_buffer_disabled,
+ consider_framebuffer);
+ resolve_image_views(ice, batch, shs, draw_aux_buffer_disabled,
+ consider_framebuffer);
+ }
+}
+
+void
+crocus_predraw_resolve_framebuffer(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ bool *draw_aux_buffer_disabled)
+{
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+ const nir_shader *nir = ish->nir;
+
+ if (ice->state.dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
+ struct pipe_surface *zs_surf = cso_fb->zsbuf;
+
+ if (zs_surf) {
+ struct crocus_resource *z_res, *s_res;
+ crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res);
+ unsigned num_layers =
+ zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1;
+
+ if (z_res) {
+ crocus_resource_prepare_render(ice, z_res,
+ zs_surf->u.tex.level,
+ zs_surf->u.tex.first_layer,
+ num_layers, ice->state.hiz_usage);
+ crocus_cache_flush_for_depth(batch, z_res->bo);
+
+ if (((struct crocus_surface *)zs_surf)->align_res) {
+ crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, true);
+ }
+ }
+
+ if (s_res) {
+ crocus_cache_flush_for_depth(batch, s_res->bo);
+ }
+ }
+ }
+
+ if (nir->info.outputs_read != 0) {
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ if (cso_fb->cbufs[i]) {
+ struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+ struct crocus_resource *res = (void *) cso_fb->cbufs[i]->texture;
+
+ crocus_resource_prepare_texture(ice, res, surf->view.format,
+ surf->view.base_level, 1,
+ surf->view.base_array_layer,
+ surf->view.array_len);
+ }
+ }
+ }
+
+ if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS) {
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+ if (!surf)
+ continue;
+
+ struct crocus_resource *res = (void *) surf->base.texture;
+
+ if (surf->align_res)
+ crocus_update_align_res(batch, surf, true);
+
+ enum isl_aux_usage aux_usage =
+ crocus_resource_render_aux_usage(ice, res, surf->view.format,
+ ice->state.blend_enables & (1u << i),
+ draw_aux_buffer_disabled[i]);
+
+ if (ice->state.draw_aux_usage[i] != aux_usage) {
+ ice->state.draw_aux_usage[i] = aux_usage;
+ /* XXX: Need to track which bindings to make dirty */
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+ }
+
+ crocus_resource_prepare_render(ice, res, surf->view.base_level,
+ surf->view.base_array_layer,
+ surf->view.array_len,
+ aux_usage);
+
+ crocus_cache_flush_for_render(batch, res->bo, surf->view.format,
+ aux_usage);
+ }
+ }
+}
+
+/**
+ * \brief Call this after drawing to mark which buffers need resolving
+ *
+ * If the depth buffer was written to and if it has an accompanying HiZ
+ * buffer, then mark that it needs a depth resolve.
+ *
+ * If the color buffer is a multisample window system buffer, then
+ * mark that it needs a downsample.
+ *
+ * Also mark any render targets which will be textured as needing a render
+ * cache flush.
+ */
+void
+crocus_postdraw_update_resolve_tracking(struct crocus_context *ice,
+ struct crocus_batch *batch)
+{
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+ // XXX: front buffer drawing?
+
+ bool may_have_resolved_depth =
+ ice->state.dirty & (CROCUS_DIRTY_DEPTH_BUFFER |
+ CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL);
+
+ struct pipe_surface *zs_surf = cso_fb->zsbuf;
+ if (zs_surf) {
+ struct crocus_resource *z_res, *s_res;
+ crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res);
+ unsigned num_layers =
+ zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1;
+
+ if (z_res) {
+ if (may_have_resolved_depth && ice->state.depth_writes_enabled) {
+ crocus_resource_finish_render(ice, z_res, zs_surf->u.tex.level,
+ zs_surf->u.tex.first_layer, num_layers,
+ ice->state.hiz_usage);
+ }
+
+ if (ice->state.depth_writes_enabled)
+ crocus_depth_cache_add_bo(batch, z_res->bo);
+
+ if (((struct crocus_surface *)zs_surf)->align_res) {
+ crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, false);
+ }
+ }
+
+ if (s_res) {
+ if (may_have_resolved_depth && ice->state.stencil_writes_enabled) {
+ crocus_resource_finish_write(ice, s_res, zs_surf->u.tex.level,
+ zs_surf->u.tex.first_layer, num_layers,
+ s_res->aux.usage);
+ }
+
+ if (ice->state.stencil_writes_enabled)
+ crocus_depth_cache_add_bo(batch, s_res->bo);
+ }
+ }
+
+ bool may_have_resolved_color =
+ ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS;
+
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+ if (!surf)
+ continue;
+
+ if (surf->align_res)
+ crocus_update_align_res(batch, surf, false);
+ struct crocus_resource *res = (void *) surf->base.texture;
+ enum isl_aux_usage aux_usage = ice->state.draw_aux_usage[i];
+
+ crocus_render_cache_add_bo(batch, res->bo, surf->view.format,
+ aux_usage);
+
+ if (may_have_resolved_color) {
+ union pipe_surface_desc *desc = &surf->base.u;
+ unsigned num_layers =
+ desc->tex.last_layer - desc->tex.first_layer + 1;
+ crocus_resource_finish_render(ice, res, desc->tex.level,
+ desc->tex.first_layer, num_layers,
+ aux_usage);
+ }
+ }
+}
+
+/**
+ * Clear the cache-tracking sets.
+ */
+void
+crocus_cache_sets_clear(struct crocus_batch *batch)
+{
+ hash_table_foreach(batch->cache.render, render_entry)
+ _mesa_hash_table_remove(batch->cache.render, render_entry);
+
+ set_foreach(batch->cache.depth, depth_entry)
+ _mesa_set_remove(batch->cache.depth, depth_entry);
+}
+
+/**
+ * Emits an appropriate flush for a BO if it has been rendered to within the
+ * same batchbuffer as a read that's about to be emitted.
+ *
+ * The GPU has separate, incoherent caches for the render cache and the
+ * sampler cache, along with other caches. Usually data in the different
+ * caches don't interact (e.g. we don't render to our driver-generated
+ * immediate constant data), but for render-to-texture in FBOs we definitely
+ * do. When a batchbuffer is flushed, the kernel will ensure that everything
+ * necessary is flushed before another use of that BO, but for reuse from
+ * different caches within a batchbuffer, it's all our responsibility.
+ */
+void
+crocus_flush_depth_and_render_caches(struct crocus_batch *batch)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ if (devinfo->ver >= 6) {
+ crocus_emit_pipe_control_flush(batch,
+ "cache tracker: render-to-texture",
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+
+ crocus_emit_pipe_control_flush(batch,
+ "cache tracker: render-to-texture",
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE);
+ } else {
+ crocus_emit_mi_flush(batch);
+ }
+
+ crocus_cache_sets_clear(batch);
+}
+
+void
+crocus_cache_flush_for_read(struct crocus_batch *batch,
+ struct crocus_bo *bo)
+{
+ if (_mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo) ||
+ _mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo))
+ crocus_flush_depth_and_render_caches(batch);
+}
+
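+/**
+ * Pack a (format, aux usage) pair into a pointer-sized value, used as the
+ * data stored in the render cache hash table so a flush can be triggered
+ * when a BO is re-rendered with a different format or aux usage.
+ */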
+static void *
+format_aux_tuple(enum isl_format format, enum isl_aux_usage aux_usage)
+{
+ return (void *)(uintptr_t)((uint32_t)format << 8 | aux_usage);
+}
+
+void
+crocus_cache_flush_for_render(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ enum isl_format format,
+ enum isl_aux_usage aux_usage)
+{
+ if (_mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo))
+ crocus_flush_depth_and_render_caches(batch);
+
+ /* Check to see if this bo has been used by a previous rendering operation
+ * but with a different format or aux usage. If it has, flush the render
+ * cache so we ensure that it's only in there with one format or aux usage
+ * at a time.
+ *
+ * Even though it's not obvious, this can easily happen in practice.
+ * Suppose a client is blending on a surface with sRGB encode enabled on
+ * gen9. This implies that you get AUX_USAGE_CCS_D at best. If the client
+ * then disables sRGB decode and continues blending we will flip on
+ * AUX_USAGE_CCS_E without doing any sort of resolve in-between (this is
+ * perfectly valid since CCS_E is a subset of CCS_D). However, this means
+ * that we have fragments in-flight which are rendering with UNORM+CCS_E
+ * and other fragments in-flight with SRGB+CCS_D on the same surface at the
+ * same time and the pixel scoreboard and color blender are trying to sort
+ * it all out. This ends badly (i.e. GPU hangs).
+ *
+ * To date, we have never observed GPU hangs or even corruption to be
+ * associated with switching the format, only the aux usage. However,
+ * there are comments in various docs which indicate that the render cache
+ * isn't 100% resilient to format changes. We may as well be conservative
+ * and flush on format changes too. We can always relax this later if we
+ * find it to be a performance problem.
+ */
+ struct hash_entry *entry =
+ _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo);
+ if (entry && entry->data != format_aux_tuple(format, aux_usage))
+ crocus_flush_depth_and_render_caches(batch);
+}
+
+void
+crocus_render_cache_add_bo(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ enum isl_format format,
+ enum isl_aux_usage aux_usage)
+{
+#ifndef NDEBUG
+ struct hash_entry *entry =
+ _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo);
+ if (entry) {
+ /* Otherwise, someone didn't do a flush_for_render and that would be
+ * very bad indeed.
+ */
+ assert(entry->data == format_aux_tuple(format, aux_usage));
+ }
+#endif
+
+ _mesa_hash_table_insert_pre_hashed(batch->cache.render, bo->hash, bo,
+ format_aux_tuple(format, aux_usage));
+}
+
+void
+crocus_cache_flush_for_depth(struct crocus_batch *batch,
+ struct crocus_bo *bo)
+{
+ if (_mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo))
+ crocus_flush_depth_and_render_caches(batch);
+}
+
+void
+crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo)
+{
+ _mesa_set_add_pre_hashed(batch->cache.depth, bo->hash, bo);
+}
+
+static void
+crocus_resolve_color(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_resource *res,
+ unsigned level, unsigned layer,
+ enum isl_aux_op resolve_op)
+{
+ struct crocus_screen *screen = batch->screen;
+ DBG("%s to res %p level %u layer %u\n", __func__, res, level, layer);
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ &res->base, res->aux.usage, level, true);
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ /* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+ *
+ * "Any transition from any value in {Clear, Render, Resolve} to a
+ * different value in {Clear, Render, Resolve} requires end of pipe
+ * synchronization."
+ *
+ * In other words, fast clear ops are not properly synchronized with
+ * other drawing. We need to use a PIPE_CONTROL to ensure that the
+ * contents of the previous draw hit the render target before we resolve
+ * and again afterwards to ensure that the resolve is complete before we
+ * do any more regular drawing.
+ */
+ crocus_emit_end_of_pipe_sync(batch, "color resolve: pre-flush",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+ blorp_ccs_resolve(&blorp_batch, &surf, level, layer, 1,
+ isl_format_srgb_to_linear(res->surf.format),
+ resolve_op);
+ blorp_batch_finish(&blorp_batch);
+
+ /* See comment above */
+ crocus_emit_end_of_pipe_sync(batch, "color resolve: post-flush",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH);
+}
+
+static void
+crocus_mcs_partial_resolve(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_resource *res,
+ uint32_t start_layer,
+ uint32_t num_layers)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ DBG("%s to res %p layers %u-%u\n", __func__, res,
+ start_layer, start_layer + num_layers - 1);
+
+ assert(isl_aux_usage_has_mcs(res->aux.usage));
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ &res->base, res->aux.usage, 0, true);
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+ blorp_mcs_partial_resolve(&blorp_batch, &surf,
+ isl_format_srgb_to_linear(res->surf.format),
+ start_layer, num_layers);
+ blorp_batch_finish(&blorp_batch);
+}
+
+/**
+ * Perform a HiZ or depth resolve operation.
+ *
+ * For an overview of HiZ ops, see the following sections of the Sandy Bridge
+ * PRM, Volume 1, Part 2:
+ * - 7.5.3.1 Depth Buffer Clear
+ * - 7.5.3.2 Depth Buffer Resolve
+ * - 7.5.3.3 Hierarchical Depth Buffer Resolve
+ */
+void
+crocus_hiz_exec(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_resource *res,
+ unsigned int level, unsigned int start_layer,
+ unsigned int num_layers, enum isl_aux_op op,
+ bool update_clear_depth)
+{
+ struct crocus_screen *screen = batch->screen;
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ assert(crocus_resource_level_has_hiz(res, level));
+ assert(op != ISL_AUX_OP_NONE);
+ UNUSED const char *name = NULL;
+
+ switch (op) {
+ case ISL_AUX_OP_FULL_RESOLVE:
+ name = "depth resolve";
+ break;
+ case ISL_AUX_OP_AMBIGUATE:
+ name = "hiz ambiguate";
+ break;
+ case ISL_AUX_OP_FAST_CLEAR:
+ name = "depth clear";
+ break;
+ case ISL_AUX_OP_PARTIAL_RESOLVE:
+ case ISL_AUX_OP_NONE:
+ unreachable("Invalid HiZ op");
+ }
+
+ DBG("%s %s to res %p level %d layers %d-%d\n",
+ __func__, name, res, level, start_layer, start_layer + num_layers - 1);
+
+ /* The following stalls and flushes are only documented to be required
+ * for HiZ clear operations. However, they also seem to be required for
+ * resolve operations.
+ *
+ * From the Ivybridge PRM, volume 2, "Depth Buffer Clear":
+ *
+ * "If other rendering operations have preceded this clear, a
+ * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
+ * enabled must be issued before the rectangle primitive used for
+ * the depth buffer clear operation."
+ *
+    * The same applies to Gen8 and Gen9.
+ *
+ * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1
+ * PIPE_CONTROL, Depth Cache Flush Enable:
+ *
+ * "This bit must not be set when Depth Stall Enable bit is set in
+ * this packet."
+ *
+    * This is confirmed to hold in practice: Haswell gets immediate GPU hangs otherwise.
+ *
+ * Therefore issue two pipe control flushes, one for cache flush and
+ * another for depth stall.
+ */
+ if (devinfo->ver == 6) {
+ /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
+ *
+ * "If other rendering operations have preceded this clear, a
+ * PIPE_CONTROL with write cache flush enabled and Z-inhibit
+ * disabled must be issued before the rectangle primitive used for
+       *      the depth buffer clear operation."
+ */
+ crocus_emit_pipe_control_flush(batch,
+ "hiz op: pre-flushes (1)",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ } else if (devinfo->ver >= 7) {
+ crocus_emit_pipe_control_flush(batch,
+ "hiz op: pre-flushes (1/2)",
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ crocus_emit_pipe_control_flush(batch, "hiz op: pre-flushes (2/2)",
+ PIPE_CONTROL_DEPTH_STALL);
+ }
+
+ assert(isl_aux_usage_has_hiz(res->aux.usage) && res->aux.bo);
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ &res->base, res->aux.usage, level, true);
+
+ struct blorp_batch blorp_batch;
+ enum blorp_batch_flags flags = 0;
+ flags |= update_clear_depth ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, flags);
+ blorp_hiz_op(&blorp_batch, &surf, level, start_layer, num_layers, op);
+ blorp_batch_finish(&blorp_batch);
+
+ /* The following stalls and flushes are only documented to be required
+ * for HiZ clear operations. However, they also seem to be required for
+ * resolve operations.
+ *
+ * From the Broadwell PRM, volume 7, "Depth Buffer Clear":
+ *
+ * "Depth buffer clear pass using any of the methods (WM_STATE,
+ * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a
+ * PIPE_CONTROL command with DEPTH_STALL bit and Depth FLUSH bits
+ * "set" before starting to render. DepthStall and DepthFlush are
+ * not needed between consecutive depth clear passes nor is it
+ * required if the depth clear pass was done with
+ * 'full_surf_clear' bit set in the 3DSTATE_WM_HZ_OP."
+ *
+    * TODO: As the spec says, this could be conditional.
+ */
+ if (devinfo->ver == 6) {
+ /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
+ *
+       *    "[DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be
+       *     followed by a PIPE_CONTROL command with DEPTH_STALL bit set
+       *     and Then followed by Depth FLUSH."
+ */
+ crocus_emit_pipe_control_flush(batch,
+ "hiz op: post-flushes (1/2)",
+ PIPE_CONTROL_DEPTH_STALL);
+
+ crocus_emit_pipe_control_flush(batch,
+ "hiz op: post-flushes (2/2)",
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ }
+}
+
+/**
+ * Does the given miplevel of the resource have HiZ enabled?
+ */
+bool
+crocus_resource_level_has_hiz(const struct crocus_resource *res, uint32_t level)
+{
+ crocus_resource_check_level_layer(res, level, 0);
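+   /* aux.has_hiz is a per-level bitmask: bit N is set when HiZ is enabled
+    * for miplevel N (see crocus_resource_configure_aux).
+    */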
+ return res->aux.has_hiz & 1 << level;
+}
+
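+/**
+ * Does this miplevel of the resource have auxiliary data (HiZ, MCS, or CCS)?
+ */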
+static bool
+crocus_resource_level_has_aux(const struct crocus_resource *res, uint32_t level)
+{
+ if (isl_aux_usage_has_hiz(res->aux.usage))
+ return crocus_resource_level_has_hiz(res, level);
+ else
+ return level < res->aux.surf.levels;
+}
+
+/** \brief Assert that the level and layer are valid for the resource. */
+void
+crocus_resource_check_level_layer(UNUSED const struct crocus_resource *res,
+ UNUSED uint32_t level, UNUSED uint32_t layer)
+{
+ assert(level < res->surf.levels);
+ assert(layer < util_num_layers(&res->base, level));
+}
+
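+/**
+ * Resolve a (start_level, num_levels) range against the resource and return
+ * the number of levels, treating INTEL_REMAINING_LAYERS as the surface's
+ * full level count.
+ */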
+static inline uint32_t
+miptree_level_range_length(const struct crocus_resource *res,
+ uint32_t start_level, uint32_t num_levels)
+{
+ assert(start_level < res->surf.levels);
+
+ if (num_levels == INTEL_REMAINING_LAYERS)
+ num_levels = res->surf.levels;
+
+ /* Check for overflow */
+ assert(start_level + num_levels >= start_level);
+ assert(start_level + num_levels <= res->surf.levels);
+
+ return num_levels;
+}
+
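+/**
+ * Resolve a (start_layer, num_layers) range for the given level and return
+ * the number of layers, treating INTEL_REMAINING_LAYERS as "every layer
+ * from start_layer onward" (e.g. start_layer 2 of a 6-layer level gives 4).
+ */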
+static inline uint32_t
+miptree_layer_range_length(const struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t num_layers)
+{
+ assert(level <= res->base.last_level);
+
+ const uint32_t total_num_layers = crocus_get_num_logical_layers(res, level);
+ assert(start_layer < total_num_layers);
+ if (num_layers == INTEL_REMAINING_LAYERS)
+ num_layers = total_num_layers - start_layer;
+ /* Check for overflow */
+ assert(start_layer + num_layers >= start_layer);
+ assert(start_layer + num_layers <= total_num_layers);
+
+ return num_layers;
+}
+
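+/**
+ * Returns true if any slice in the given level/layer range is in an aux
+ * state where the primary surface data is not valid on its own, i.e. a
+ * resolve is required before the data can be read without the aux buffer.
+ */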
+bool
+crocus_has_invalid_primary(const struct crocus_resource *res,
+ unsigned start_level, unsigned num_levels,
+ unsigned start_layer, unsigned num_layers)
+{
+ if (!res->aux.bo)
+ return false;
+
+ /* Clamp the level range to fit the resource */
+ num_levels = miptree_level_range_length(res, start_level, num_levels);
+
+ for (uint32_t l = 0; l < num_levels; l++) {
+ const uint32_t level = start_level + l;
+ if (!crocus_resource_level_has_aux(res, level))
+ continue;
+
+ const uint32_t level_layers =
+ miptree_layer_range_length(res, level, start_layer, num_layers);
+ for (unsigned a = 0; a < level_layers; a++) {
+ enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, level, start_layer + a);
+ if (!isl_aux_state_has_valid_primary(aux_state))
+ return true;
+ }
+ }
+
+ return false;
+}
+
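+/**
+ * Perform any resolves required so the given level/layer range can be
+ * accessed with aux_usage (optionally with fast clears), and record the
+ * resulting aux state transitions.
+ */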
+void
+crocus_resource_prepare_access(struct crocus_context *ice,
+ struct crocus_resource *res,
+ uint32_t start_level, uint32_t num_levels,
+ uint32_t start_layer, uint32_t num_layers,
+ enum isl_aux_usage aux_usage,
+ bool fast_clear_supported)
+{
+ if (!res->aux.bo)
+ return;
+
+ /* We can't do resolves on the compute engine, so awkwardly, we have to
+ * do them on the render batch...
+ */
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+ const uint32_t clamped_levels =
+ miptree_level_range_length(res, start_level, num_levels);
+ for (uint32_t l = 0; l < clamped_levels; l++) {
+ const uint32_t level = start_level + l;
+ if (!crocus_resource_level_has_aux(res, level))
+ continue;
+
+ const uint32_t level_layers =
+ miptree_layer_range_length(res, level, start_layer, num_layers);
+ for (uint32_t a = 0; a < level_layers; a++) {
+ const uint32_t layer = start_layer + a;
+ const enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, level, layer);
+ const enum isl_aux_op aux_op =
+ isl_aux_prepare_access(aux_state, aux_usage, fast_clear_supported);
+
+ /* Prepare the aux buffer for a conditional or unconditional access.
+ * A conditional access is handled by assuming that the access will
+ * not evaluate to a no-op. If the access does in fact occur, the aux
+ * will be in the required state. If it does not, no data is lost
+ * because the aux_op performed is lossless.
+ */
+ if (aux_op == ISL_AUX_OP_NONE) {
+ /* Nothing to do here. */
+ } else if (isl_aux_usage_has_mcs(res->aux.usage)) {
+ assert(aux_op == ISL_AUX_OP_PARTIAL_RESOLVE);
+ crocus_mcs_partial_resolve(ice, batch, res, layer, 1);
+ } else if (isl_aux_usage_has_hiz(res->aux.usage)) {
+ crocus_hiz_exec(ice, batch, res, level, layer, 1, aux_op, false);
+ } else if (res->aux.usage == ISL_AUX_USAGE_STC_CCS) {
+ unreachable("crocus doesn't resolve STC_CCS resources");
+ } else {
+ assert(isl_aux_usage_has_ccs(res->aux.usage));
+ crocus_resolve_color(ice, batch, res, level, layer, aux_op);
+ }
+
+ const enum isl_aux_state new_state =
+ isl_aux_state_transition_aux_op(aux_state, res->aux.usage, aux_op);
+ crocus_resource_set_aux_state(ice, res, level, layer, 1, new_state);
+ }
+ }
+}
+
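+/**
+ * Record that the given level/layer range has been written with aux_usage,
+ * transitioning the tracked aux state accordingly.
+ */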
+void
+crocus_resource_finish_write(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t num_layers,
+ enum isl_aux_usage aux_usage)
+{
+ if (res->base.format == PIPE_FORMAT_S8_UINT)
+ res->shadow_needs_update = true;
+
+ if (!crocus_resource_level_has_aux(res, level))
+ return;
+
+ const uint32_t level_layers =
+ miptree_layer_range_length(res, level, start_layer, num_layers);
+
+ for (uint32_t a = 0; a < level_layers; a++) {
+ const uint32_t layer = start_layer + a;
+ const enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, level, layer);
+
+ /* Transition the aux state for a conditional or unconditional write. A
+ * conditional write is handled by assuming that the write applies to
+ * only part of the render target. This prevents the new state from
+ * losing the types of compression that might exist in the current state
+ * (e.g. CLEAR). If the write evaluates to a no-op, the state will still
+ * be able to communicate when resolves are necessary (but it may
+ * falsely communicate this as well).
+ */
+ const enum isl_aux_state new_aux_state =
+ isl_aux_state_transition_write(aux_state, aux_usage, false);
+
+ crocus_resource_set_aux_state(ice, res, level, layer, 1, new_aux_state);
+ }
+}
+
+enum isl_aux_state
+crocus_resource_get_aux_state(const struct crocus_resource *res,
+ uint32_t level, uint32_t layer)
+{
+ crocus_resource_check_level_layer(res, level, layer);
+ assert(crocus_resource_level_has_aux(res, level));
+
+ return res->aux.state[level][layer];
+}
+
+void
+crocus_resource_set_aux_state(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t num_layers,
+ enum isl_aux_state aux_state)
+{
+ assert(crocus_resource_level_has_aux(res, level));
+
+ num_layers = miptree_layer_range_length(res, level, start_layer, num_layers);
+ for (unsigned a = 0; a < num_layers; a++) {
+ if (res->aux.state[level][start_layer + a] != aux_state) {
+ res->aux.state[level][start_layer + a] = aux_state;
+ ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES |
+ CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES;
+ /* XXX: Need to track which bindings to make dirty */
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+ }
+ }
+}
+
+static bool
+isl_formats_are_fast_clear_compatible(enum isl_format a, enum isl_format b)
+{
+ /* On gen8 and earlier, the hardware was only capable of handling 0/1 clear
+ * values so sRGB curve application was a no-op for all fast-clearable
+ * formats.
+ *
+ * On gen9+, the hardware supports arbitrary clear values. For sRGB clear
+ * values, the hardware interprets the floats, not as what would be
+ * returned from the sampler (or written by the shader), but as being
+ * between format conversion and sRGB curve application. This means that
+ * we can switch between sRGB and UNORM without having to whack the clear
+ * color.
+ */
+ return isl_format_srgb_to_linear(a) == isl_format_srgb_to_linear(b);
+}
+
+void
+crocus_resource_prepare_texture(struct crocus_context *ice,
+ struct crocus_resource *res,
+ enum isl_format view_format,
+ uint32_t start_level, uint32_t num_levels,
+ uint32_t start_layer, uint32_t num_layers)
+{
+ enum isl_aux_usage aux_usage =
+ crocus_resource_texture_aux_usage(res);
+
+ bool clear_supported = aux_usage != ISL_AUX_USAGE_NONE;
+
+ /* Clear color is specified as ints or floats and the conversion is done by
+ * the sampler. If we have a texture view, we would have to perform the
+ * clear color conversion manually. Just disable clear color.
+ */
+ if (!isl_formats_are_fast_clear_compatible(res->surf.format, view_format))
+ clear_supported = false;
+
+ crocus_resource_prepare_access(ice, res, start_level, num_levels,
+ start_layer, num_layers,
+ aux_usage, clear_supported);
+}
+
+enum isl_aux_usage
+crocus_resource_render_aux_usage(struct crocus_context *ice,
+ struct crocus_resource *res,
+ enum isl_format render_format,
+ bool blend_enabled,
+ bool draw_aux_disabled)
+{
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (draw_aux_disabled)
+ return ISL_AUX_USAGE_NONE;
+
+ switch (res->aux.usage) {
+ case ISL_AUX_USAGE_MCS:
+ return res->aux.usage;
+
+ case ISL_AUX_USAGE_CCS_D:
+      /* Try CCS_D if the render format supports it. */
+ if (isl_format_supports_ccs_d(devinfo, render_format))
+ return ISL_AUX_USAGE_CCS_D;
+
+ return ISL_AUX_USAGE_NONE;
+
+ default:
+ return ISL_AUX_USAGE_NONE;
+ }
+}
+
+void
+crocus_resource_prepare_render(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t layer_count,
+ enum isl_aux_usage aux_usage)
+{
+ crocus_resource_prepare_access(ice, res, level, 1, start_layer,
+ layer_count, aux_usage,
+ aux_usage != ISL_AUX_USAGE_NONE);
+}
+
+void
+crocus_resource_finish_render(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t layer_count,
+ enum isl_aux_usage aux_usage)
+{
+ crocus_resource_finish_write(ice, res, level, start_layer, layer_count,
+ aux_usage);
+}
+
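+/**
+ * Refresh the shadow copy of a stencil resource from its primary data,
+ * copying every level and layer with resource_copy_region.
+ */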
+static void
+crocus_update_stencil_shadow(struct crocus_context *ice,
+ struct crocus_resource *res)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
+ assert(devinfo->ver >= 7);
+
+ if (!res->shadow_needs_update)
+ return;
+
+ struct pipe_box box;
+ for (unsigned level = 0; level <= res->base.last_level; level++) {
+ u_box_2d(0, 0,
+ u_minify(res->base.width0, level),
+ u_minify(res->base.height0, level), &box);
+ const unsigned depth = res->base.target == PIPE_TEXTURE_3D ?
+ u_minify(res->base.depth0, level) : res->base.array_size;
+
+ for (unsigned layer = 0; layer < depth; layer++) {
+ box.z = layer;
+ ice->ctx.resource_copy_region(&ice->ctx,
+ &res->shadow->base, level, 0, 0, layer,
+ &res->base, level, &box);
+ }
+ }
+ res->shadow_needs_update = false;
+}
diff --git a/src/gallium/drivers/crocus/crocus_resource.c b/src/gallium/drivers/crocus/crocus_resource.c
new file mode 100644
index 00000000000..b5bf5a42e1a
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_resource.c
@@ -0,0 +1,1946 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_resource.c
+ *
+ * Resources are images, buffers, and other objects used by the GPU.
+ *
+ * XXX: explain resources
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/os_memory.h"
+#include "util/u_cpu_detect.h"
+#include "util/u_inlines.h"
+#include "util/format/u_format.h"
+#include "util/u_threaded_context.h"
+#include "util/u_transfer.h"
+#include "util/u_transfer_helper.h"
+#include "util/u_upload_mgr.h"
+#include "util/ralloc.h"
+#include "crocus_batch.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "intel/dev/intel_debug.h"
+#include "isl/isl.h"
+#include "drm-uapi/drm_fourcc.h"
+#include "drm-uapi/i915_drm.h"
+
+enum modifier_priority {
+ MODIFIER_PRIORITY_INVALID = 0,
+ MODIFIER_PRIORITY_LINEAR,
+ MODIFIER_PRIORITY_X,
+ MODIFIER_PRIORITY_Y,
+ MODIFIER_PRIORITY_Y_CCS,
+};
+
+static const uint64_t priority_to_modifier[] = {
+ [MODIFIER_PRIORITY_INVALID] = DRM_FORMAT_MOD_INVALID,
+ [MODIFIER_PRIORITY_LINEAR] = DRM_FORMAT_MOD_LINEAR,
+ [MODIFIER_PRIORITY_X] = I915_FORMAT_MOD_X_TILED,
+ [MODIFIER_PRIORITY_Y] = I915_FORMAT_MOD_Y_TILED,
+ [MODIFIER_PRIORITY_Y_CCS] = I915_FORMAT_MOD_Y_TILED_CCS,
+};
+
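+/**
+ * Is the given format modifier usable on the current hardware generation?
+ */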
+static bool
+modifier_is_supported(const struct intel_device_info *devinfo,
+ enum pipe_format pfmt, uint64_t modifier)
+{
+ /* XXX: do something real */
+ switch (modifier) {
+ case I915_FORMAT_MOD_Y_TILED_CCS:
+ return false;
+ case I915_FORMAT_MOD_Y_TILED:
+ return devinfo->ver >= 6;
+ case I915_FORMAT_MOD_X_TILED:
+ case DRM_FORMAT_MOD_LINEAR:
+ return true;
+ case DRM_FORMAT_MOD_INVALID:
+ default:
+ return false;
+ }
+}
+
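+/**
+ * Pick the highest-priority modifier from the caller-provided list that is
+ * also supported by the hardware.
+ */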
+static uint64_t
+select_best_modifier(struct intel_device_info *devinfo, enum pipe_format pfmt,
+ const uint64_t *modifiers,
+ int count)
+{
+ enum modifier_priority prio = MODIFIER_PRIORITY_INVALID;
+
+ for (int i = 0; i < count; i++) {
+ if (!modifier_is_supported(devinfo, pfmt, modifiers[i]))
+ continue;
+
+ switch (modifiers[i]) {
+ case I915_FORMAT_MOD_Y_TILED_CCS:
+ prio = MAX2(prio, MODIFIER_PRIORITY_Y_CCS);
+ break;
+ case I915_FORMAT_MOD_Y_TILED:
+ prio = MAX2(prio, MODIFIER_PRIORITY_Y);
+ break;
+ case I915_FORMAT_MOD_X_TILED:
+ prio = MAX2(prio, MODIFIER_PRIORITY_X);
+ break;
+ case DRM_FORMAT_MOD_LINEAR:
+ prio = MAX2(prio, MODIFIER_PRIORITY_LINEAR);
+ break;
+ case DRM_FORMAT_MOD_INVALID:
+ default:
+ break;
+ }
+ }
+
+ return priority_to_modifier[prio];
+}
+
+static enum isl_surf_dim
+crocus_target_to_isl_surf_dim(enum pipe_texture_target target)
+{
+ switch (target) {
+ case PIPE_BUFFER:
+ case PIPE_TEXTURE_1D:
+ case PIPE_TEXTURE_1D_ARRAY:
+ return ISL_SURF_DIM_1D;
+ case PIPE_TEXTURE_2D:
+ case PIPE_TEXTURE_CUBE:
+ case PIPE_TEXTURE_RECT:
+ case PIPE_TEXTURE_2D_ARRAY:
+ case PIPE_TEXTURE_CUBE_ARRAY:
+ return ISL_SURF_DIM_2D;
+ case PIPE_TEXTURE_3D:
+ return ISL_SURF_DIM_3D;
+ case PIPE_MAX_TEXTURE_TYPES:
+ break;
+ }
+ unreachable("invalid texture type");
+}
+
+static void
+crocus_query_dmabuf_modifiers(struct pipe_screen *pscreen,
+ enum pipe_format pfmt,
+ int max,
+ uint64_t *modifiers,
+ unsigned int *external_only,
+ int *count)
+{
+ struct crocus_screen *screen = (void *) pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ uint64_t all_modifiers[] = {
+ DRM_FORMAT_MOD_LINEAR,
+ I915_FORMAT_MOD_X_TILED,
+ I915_FORMAT_MOD_Y_TILED,
+ I915_FORMAT_MOD_Y_TILED_CCS,
+ };
+
+ int supported_mods = 0;
+
+ for (int i = 0; i < ARRAY_SIZE(all_modifiers); i++) {
+ if (!modifier_is_supported(devinfo, pfmt, all_modifiers[i]))
+ continue;
+
+ if (supported_mods < max) {
+ if (modifiers)
+ modifiers[supported_mods] = all_modifiers[i];
+
+ if (external_only)
+ external_only[supported_mods] = util_format_is_yuv(pfmt);
+ }
+
+ supported_mods++;
+ }
+
+ *count = supported_mods;
+}
+
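+/**
+ * Translate gallium PIPE_BIND_* flags into ISL surface usage flags.
+ */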
+static isl_surf_usage_flags_t
+pipe_bind_to_isl_usage(unsigned bindings)
+{
+ isl_surf_usage_flags_t usage = 0;
+
+ if (bindings & PIPE_BIND_RENDER_TARGET)
+ usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
+
+ if (bindings & PIPE_BIND_SAMPLER_VIEW)
+ usage |= ISL_SURF_USAGE_TEXTURE_BIT;
+
+ if (bindings & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SHADER_BUFFER))
+ usage |= ISL_SURF_USAGE_STORAGE_BIT;
+
+ if (bindings & PIPE_BIND_DISPLAY_TARGET)
+ usage |= ISL_SURF_USAGE_DISPLAY_BIT;
+
+ return usage;
+}
+
+struct pipe_resource *
+crocus_resource_get_separate_stencil(struct pipe_resource *p_res)
+{
+ /* For packed depth-stencil, we treat depth as the primary resource
+ * and store S8 as the "second plane" resource.
+ */
+ if (p_res->next && p_res->next->format == PIPE_FORMAT_S8_UINT)
+ return p_res->next;
+
+ return NULL;
+}
+
+static void
+crocus_resource_set_separate_stencil(struct pipe_resource *p_res,
+ struct pipe_resource *stencil)
+{
+ assert(util_format_has_depth(util_format_description(p_res->format)));
+ pipe_resource_reference(&p_res->next, stencil);
+}
+
+void
+crocus_get_depth_stencil_resources(const struct intel_device_info *devinfo,
+ struct pipe_resource *res,
+ struct crocus_resource **out_z,
+ struct crocus_resource **out_s)
+{
+ if (!res) {
+ *out_z = NULL;
+ *out_s = NULL;
+ return;
+ }
+
+ /* gen4/5 only supports packed ds */
+ if (devinfo->ver < 6) {
+ *out_z = (void *)res;
+ *out_s = (void *)res;
+ return;
+ }
+
+ if (res->format != PIPE_FORMAT_S8_UINT) {
+ *out_z = (void *) res;
+ *out_s = (void *) crocus_resource_get_separate_stencil(res);
+ } else {
+ *out_z = NULL;
+ *out_s = (void *) res;
+ }
+}
+
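+/**
+ * Release the auxiliary buffer and reset all aux state tracking for a
+ * resource.
+ */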
+void
+crocus_resource_disable_aux(struct crocus_resource *res)
+{
+ crocus_bo_unreference(res->aux.bo);
+ free(res->aux.state);
+
+ res->aux.usage = ISL_AUX_USAGE_NONE;
+ res->aux.has_hiz = 0;
+ res->aux.surf.size_B = 0;
+ res->aux.surf.levels = 0;
+ res->aux.bo = NULL;
+ res->aux.extra_aux.surf.size_B = 0;
+ res->aux.state = NULL;
+}
+
+static void
+crocus_resource_destroy(struct pipe_screen *screen,
+ struct pipe_resource *resource)
+{
+ struct crocus_resource *res = (struct crocus_resource *)resource;
+
+ if (resource->target == PIPE_BUFFER)
+ util_range_destroy(&res->valid_buffer_range);
+
+ if (res->shadow)
+ pipe_resource_reference((struct pipe_resource **)&res->shadow, NULL);
+ crocus_resource_disable_aux(res);
+
+ crocus_bo_unreference(res->bo);
+ crocus_pscreen_unref(res->orig_screen);
+ free(res);
+}
+
+static struct crocus_resource *
+crocus_alloc_resource(struct pipe_screen *pscreen,
+ const struct pipe_resource *templ)
+{
+ struct crocus_resource *res = calloc(1, sizeof(struct crocus_resource));
+ if (!res)
+ return NULL;
+
+ res->base = *templ;
+ res->base.screen = pscreen;
+ res->orig_screen = crocus_pscreen_ref(pscreen);
+ pipe_reference_init(&res->base.reference, 1);
+
+ if (templ->target == PIPE_BUFFER)
+ util_range_init(&res->valid_buffer_range);
+
+ return res;
+}
+
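+/**
+ * Number of logical layers at a given miplevel: the minified depth for 3D
+ * surfaces, or the array length otherwise.
+ */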
+unsigned
+crocus_get_num_logical_layers(const struct crocus_resource *res, unsigned level)
+{
+ if (res->surf.dim == ISL_SURF_DIM_3D)
+ return minify(res->surf.logical_level0_px.depth, level);
+ else
+ return res->surf.logical_level0_px.array_len;
+}
+
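+/**
+ * Allocate the per-level/per-layer aux state tracking map, initializing
+ * every slice to the given initial state.
+ */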
+static enum isl_aux_state **
+create_aux_state_map(struct crocus_resource *res, enum isl_aux_state initial)
+{
+ assert(res->aux.state == NULL);
+
+ uint32_t total_slices = 0;
+ for (uint32_t level = 0; level < res->surf.levels; level++)
+ total_slices += crocus_get_num_logical_layers(res, level);
+
+ const size_t per_level_array_size =
+ res->surf.levels * sizeof(enum isl_aux_state *);
+
+ /* We're going to allocate a single chunk of data for both the per-level
+ * reference array and the arrays of aux_state. This makes cleanup
+ * significantly easier.
+ */
+ const size_t total_size =
+ per_level_array_size + total_slices * sizeof(enum isl_aux_state);
+
+ void *data = malloc(total_size);
+ if (!data)
+ return NULL;
+
+ enum isl_aux_state **per_level_arr = data;
+ enum isl_aux_state *s = data + per_level_array_size;
+ for (uint32_t level = 0; level < res->surf.levels; level++) {
+ per_level_arr[level] = s;
+ const unsigned level_layers = crocus_get_num_logical_layers(res, level);
+ for (uint32_t a = 0; a < level_layers; a++)
+ *(s++) = initial;
+ }
+ assert((void *)s == data + total_size);
+
+ return per_level_arr;
+}
+
+/**
+ * Configure aux for the resource, but don't allocate it. For images which
+ * might be shared with modifiers, we must allocate the image and aux data in
+ * a single bo.
+ *
+ * Returns false on unexpected error (e.g. allocation failed, or invalid
+ * configuration result).
+ */
+static bool
+crocus_resource_configure_aux(struct crocus_screen *screen,
+ struct crocus_resource *res, bool imported,
+ uint64_t *aux_size_B,
+ uint32_t *alloc_flags)
+{
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ /* Try to create the auxiliary surfaces allowed by the modifier or by
+ * the user if no modifier is specified.
+ */
+ assert(!res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE);
+
+ const bool has_mcs = devinfo->ver >= 7 && !res->mod_info &&
+ isl_surf_get_mcs_surf(&screen->isl_dev, &res->surf, &res->aux.surf);
+
+ const bool has_hiz = devinfo->ver >= 6 && !res->mod_info &&
+ !(INTEL_DEBUG & DEBUG_NO_HIZ) &&
+ isl_surf_get_hiz_surf(&screen->isl_dev, &res->surf, &res->aux.surf);
+
+ const bool has_ccs =
+ ((devinfo->ver >= 7 && !res->mod_info && !(INTEL_DEBUG & DEBUG_NO_RBC)) ||
+ (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE)) &&
+ isl_surf_get_ccs_surf(&screen->isl_dev, &res->surf, &res->aux.surf,
+ &res->aux.extra_aux.surf, 0);
+
+ /* Having both HIZ and MCS is impossible. */
+ assert(!has_mcs || !has_hiz);
+
+ /* Ensure aux surface creation for MCS_CCS and HIZ_CCS is correct. */
+ if (has_ccs && (has_mcs || has_hiz)) {
+ assert(res->aux.extra_aux.surf.size_B > 0 &&
+ res->aux.extra_aux.surf.usage & ISL_SURF_USAGE_CCS_BIT);
+ assert(res->aux.surf.size_B > 0 &&
+ res->aux.surf.usage &
+ (ISL_SURF_USAGE_HIZ_BIT | ISL_SURF_USAGE_MCS_BIT));
+ }
+
+ if (res->mod_info && has_ccs) {
+ res->aux.usage = res->mod_info->aux_usage;
+ } else if (has_mcs) {
+ res->aux.usage = ISL_AUX_USAGE_MCS;
+ } else if (has_hiz) {
+ res->aux.usage = ISL_AUX_USAGE_HIZ;
+ } else if (has_ccs) {
+ if (isl_format_supports_ccs_d(devinfo, res->surf.format))
+ res->aux.usage = ISL_AUX_USAGE_CCS_D;
+ }
+
+ enum isl_aux_state initial_state = ISL_AUX_STATE_AUX_INVALID;
+ *aux_size_B = 0;
+ *alloc_flags = 0;
+ assert(!res->aux.bo);
+
+ switch (res->aux.usage) {
+ case ISL_AUX_USAGE_NONE:
+ /* Having no aux buffer is only okay if there's no modifier with aux. */
+ res->aux.surf.levels = 0;
+ return !res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE;
+ case ISL_AUX_USAGE_HIZ:
+ initial_state = ISL_AUX_STATE_AUX_INVALID;
+ break;
+ case ISL_AUX_USAGE_MCS:
+ /* The Ivybridge PRM, Vol 2 Part 1 p326 says:
+ *
+ * "When MCS buffer is enabled and bound to MSRT, it is required
+ * that it is cleared prior to any rendering."
+ *
+ * Since we only use the MCS buffer for rendering, we just clear it
+ * immediately on allocation. The clear value for MCS buffers is all
+ * 1's, so we simply memset it to 0xff.
+ */
+ initial_state = ISL_AUX_STATE_CLEAR;
+ break;
+ case ISL_AUX_USAGE_CCS_D:
+ /* When CCS_E is used, we need to ensure that the CCS starts off in
+ * a valid state. From the Sky Lake PRM, "MCS Buffer for Render
+ * Target(s)":
+ *
+ * "If Software wants to enable Color Compression without Fast
+ * clear, Software needs to initialize MCS with zeros."
+ *
+ * A CCS value of 0 indicates that the corresponding block is in the
+ * pass-through state which is what we want.
+ *
+ * For CCS_D, do the same thing. On Gen9+, this avoids having any
+ * undefined bits in the aux buffer.
+ */
+ if (imported)
+ initial_state =
+ isl_drm_modifier_get_default_aux_state(res->mod_info->modifier);
+ else
+ initial_state = ISL_AUX_STATE_PASS_THROUGH;
+ *alloc_flags |= BO_ALLOC_ZEROED;
+ break;
+ default:
+ unreachable("non-crocus aux");
+ }
+
+ /* Create the aux_state for the auxiliary buffer. */
+ res->aux.state = create_aux_state_map(res, initial_state);
+ if (!res->aux.state)
+ return false;
+
+ /* Increase the aux offset if the main and aux surfaces will share a BO. */
+ res->aux.offset =
+ !res->mod_info || res->mod_info->aux_usage == res->aux.usage ?
+ ALIGN(res->surf.size_B, res->aux.surf.alignment_B) : 0;
+ uint64_t size = res->aux.surf.size_B;
+
+ /* Allocate space in the buffer for storing the CCS. */
+ if (res->aux.extra_aux.surf.size_B > 0) {
+ const uint64_t padded_aux_size =
+ ALIGN(size, res->aux.extra_aux.surf.alignment_B);
+ res->aux.extra_aux.offset = res->aux.offset + padded_aux_size;
+ size = padded_aux_size + res->aux.extra_aux.surf.size_B;
+ }
+
+ /* Allocate space in the buffer for storing the clear color. On modern
+    * platforms (gen > 9), we can read it directly from that buffer.
+ *
+ * On gen <= 9, we are going to store the clear color on the buffer
+ * anyways, and copy it back to the surface state during state emission.
+ *
+ * Also add some padding to make sure the fast clear color state buffer
+ * starts at a 4K alignment. We believe that 256B might be enough, but due
+ * to lack of testing we will leave this as 4K for now.
+ */
+ size = ALIGN(size, 4096);
+ *aux_size_B = size;
+
+ if (isl_aux_usage_has_hiz(res->aux.usage)) {
+ for (unsigned level = 0; level < res->surf.levels; ++level) {
+ uint32_t width = u_minify(res->surf.phys_level0_sa.width, level);
+ uint32_t height = u_minify(res->surf.phys_level0_sa.height, level);
+
+ /* Disable HiZ for LOD > 0 unless the width/height are 8x4 aligned.
+ * For LOD == 0, we can grow the dimensions to make it work.
+ */
+ if (!devinfo->is_haswell ||
+ (level == 0 || ((width & 7) == 0 && (height & 3) == 0)))
+ res->aux.has_hiz |= 1 << level;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Initialize the aux buffer contents.
+ *
+ * Returns false on unexpected error (e.g. mapping a BO failed).
+ */
+static bool
+crocus_resource_init_aux_buf(struct crocus_resource *res, uint32_t alloc_flags)
+{
+ if (!(alloc_flags & BO_ALLOC_ZEROED)) {
+ void *map = crocus_bo_map(NULL, res->aux.bo, MAP_WRITE | MAP_RAW);
+
+ if (!map)
+ return false;
+
+ if (crocus_resource_get_aux_state(res, 0, 0) != ISL_AUX_STATE_AUX_INVALID) {
+ uint8_t memset_value = isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0;
+ memset((char*)map + res->aux.offset, memset_value,
+ res->aux.surf.size_B);
+ }
+
+      /* The Bspec section titled "MCS/CCS Buffers for Render Target(s)" states:
+ * - If Software wants to enable Color Compression without Fast clear,
+ * Software needs to initialize MCS with zeros.
+ * - Lossless compression and CCS initialized to all F (using HW Fast
+ * Clear or SW direct Clear)
+ *
+       * We think the first bullet point above is referring to the CCS aux
+       * surface. Since we initialize the MCS in the clear state, we also
+ * initialize the CCS in the clear state (via SW direct clear) to keep
+ * the two in sync.
+ */
+ memset((char*)map + res->aux.extra_aux.offset,
+ isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0,
+ res->aux.extra_aux.surf.size_B);
+
+ crocus_bo_unmap(res->aux.bo);
+ }
+
+ return true;
+}
+
+/**
+ * Allocate the initial aux surface for a resource based on aux.usage
+ *
+ * Returns false on unexpected error (e.g. allocation failed, or invalid
+ * configuration result).
+ */
+static bool
+crocus_resource_alloc_separate_aux(struct crocus_screen *screen,
+ struct crocus_resource *res)
+{
+ uint32_t alloc_flags;
+ uint64_t size;
+ if (!crocus_resource_configure_aux(screen, res, false, &size, &alloc_flags))
+ return false;
+
+ if (size == 0)
+ return true;
+
+   /* Allocate the auxiliary buffer. ISL has a stricter set of alignment rules
+    * than the DRM allocator, so one can pass the ISL dimensions in terms
+ * of bytes instead of trying to recalculate based on different format
+ * block sizes.
+ */
+ res->aux.bo = crocus_bo_alloc_tiled(screen->bufmgr, "aux buffer", size, 4096,
+ isl_tiling_to_i915_tiling(res->aux.surf.tiling),
+ res->aux.surf.row_pitch_B, alloc_flags);
+ if (!res->aux.bo) {
+ return false;
+ }
+
+ if (!crocus_resource_init_aux_buf(res, alloc_flags))
+ return false;
+
+ return true;
+}
+
+void
+crocus_resource_finish_aux_import(struct pipe_screen *pscreen,
+ struct crocus_resource *res)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ assert(crocus_resource_unfinished_aux_import(res));
+ assert(!res->mod_info->supports_clear_color);
+
+ struct crocus_resource *aux_res = (void *) res->base.next;
+ assert(aux_res->aux.surf.row_pitch_B && aux_res->aux.offset &&
+ aux_res->aux.bo);
+
+ assert(res->bo == aux_res->aux.bo);
+ crocus_bo_reference(aux_res->aux.bo);
+ res->aux.bo = aux_res->aux.bo;
+
+ res->aux.offset = aux_res->aux.offset;
+
+ assert(res->bo->size >= (res->aux.offset + res->aux.surf.size_B));
+ assert(aux_res->aux.surf.row_pitch_B == res->aux.surf.row_pitch_B);
+
+ crocus_resource_destroy(&screen->base, res->base.next);
+ res->base.next = NULL;
+}