author     Dave Airlie <airlied@gmail.com>    2021-06-01 13:14:51 +1000
committer  Dave Airlie <airlied@gmail.com>    2021-06-14 06:34:05 +1000
commit     f3630548f1da904ec6c63b43ece7e68afdb8867e
tree       05cfc909591aba9d8bf4bdeb9ba32ce8db2c58f4
parent     8da92b5c0a358e30be557cae3303a4027b24db1c
crocus: initial gallium driver for Intel gfx 4-7
This is a gallium driver for the Intel gfx 4-7 GPUs. It was initially
cloned from the iris driver by Ilia Mirkin, then I ported over large
reams of code from i965 until it worked.

Acked-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11146>
Diffstat (limited to 'src/gallium')
-rw-r--r--  src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c  |    1
-rw-r--r--  src/gallium/auxiliary/target-helpers/drm_helper.h  |   20
-rw-r--r--  src/gallium/auxiliary/target-helpers/drm_helper_public.h  |    1
-rw-r--r--  src/gallium/drivers/crocus/crocus_batch.c  | 1047
-rw-r--r--  src/gallium/drivers/crocus/crocus_batch.h  |  325
-rw-r--r--  src/gallium/drivers/crocus/crocus_blit.c  |  836
-rw-r--r--  src/gallium/drivers/crocus/crocus_blorp.c  |  399
-rw-r--r--  src/gallium/drivers/crocus/crocus_blt.c  |  337
-rw-r--r--  src/gallium/drivers/crocus/crocus_bufmgr.c  | 1689
-rw-r--r--  src/gallium/drivers/crocus/crocus_bufmgr.h  |  331
-rw-r--r--  src/gallium/drivers/crocus/crocus_clear.c  |  859
-rw-r--r--  src/gallium/drivers/crocus/crocus_context.c  |  336
-rw-r--r--  src/gallium/drivers/crocus/crocus_context.h  |  955
-rw-r--r--  src/gallium/drivers/crocus/crocus_defines.h  |   58
-rw-r--r--  src/gallium/drivers/crocus/crocus_disk_cache.c  |  263
-rw-r--r--  src/gallium/drivers/crocus/crocus_draw.c  |  511
-rw-r--r--  src/gallium/drivers/crocus/crocus_fence.c  |  571
-rw-r--r--  src/gallium/drivers/crocus/crocus_fence.h  |   60
-rw-r--r--  src/gallium/drivers/crocus/crocus_fine_fence.c  |   85
-rw-r--r--  src/gallium/drivers/crocus/crocus_fine_fence.h  |  109
-rw-r--r--  src/gallium/drivers/crocus/crocus_formats.c  |  576
-rw-r--r--  src/gallium/drivers/crocus/crocus_genx_macros.h  |  164
-rw-r--r--  src/gallium/drivers/crocus/crocus_genx_protos.h  |   56
-rw-r--r--  src/gallium/drivers/crocus/crocus_monitor.c  |  484
-rw-r--r--  src/gallium/drivers/crocus/crocus_monitor.h  |   72
-rw-r--r--  src/gallium/drivers/crocus/crocus_pipe.h  |   74
-rw-r--r--  src/gallium/drivers/crocus/crocus_pipe_control.c  |  368
-rw-r--r--  src/gallium/drivers/crocus/crocus_program.c  | 3171
-rw-r--r--  src/gallium/drivers/crocus/crocus_program_cache.c  |  347
-rw-r--r--  src/gallium/drivers/crocus/crocus_query.c  |  996
-rw-r--r--  src/gallium/drivers/crocus/crocus_resolve.c  | 1061
-rw-r--r--  src/gallium/drivers/crocus/crocus_resource.c  | 1946
-rw-r--r--  src/gallium/drivers/crocus/crocus_resource.h  |  501
-rw-r--r--  src/gallium/drivers/crocus/crocus_screen.c  |  829
-rw-r--r--  src/gallium/drivers/crocus/crocus_screen.h  |  253
-rw-r--r--  src/gallium/drivers/crocus/crocus_state.c  | 8382
-rw-r--r--  src/gallium/drivers/crocus/crocus_todo.txt  |   16
-rw-r--r--  src/gallium/drivers/crocus/driinfo_crocus.h  |   11
-rw-r--r--  src/gallium/drivers/crocus/gen4_blorp_exec.h  |  190
-rw-r--r--  src/gallium/drivers/crocus/meson.build  |   90
-rw-r--r--  src/gallium/meson.build  |    6
-rw-r--r--  src/gallium/targets/d3dadapter9/meson.build  |    2
-rw-r--r--  src/gallium/targets/dri/meson.build  |    3
-rw-r--r--  src/gallium/targets/dri/target.c  |    4
-rw-r--r--  src/gallium/winsys/crocus/drm/crocus_drm_public.h  |   33
-rw-r--r--  src/gallium/winsys/crocus/drm/crocus_drm_winsys.c  |   39
-rw-r--r--  src/gallium/winsys/crocus/drm/meson.build  |   29
47 files changed, 28494 insertions, 2 deletions
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index 8147c3ca346..ca5bf121a88 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -70,6 +70,7 @@ static const struct pipe_loader_ops pipe_loader_drm_ops;
static const struct drm_driver_descriptor *driver_descriptors[] = {
&i915_driver_descriptor,
&iris_driver_descriptor,
+ &crocus_driver_descriptor,
&nouveau_driver_descriptor,
&r300_driver_descriptor,
&r600_driver_descriptor,
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h
index 6bab07a40e7..ff4621e1a88 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -112,6 +112,26 @@ DRM_DRIVER_DESCRIPTOR(iris, iris_driconf, ARRAY_SIZE(iris_driconf))
DRM_DRIVER_DESCRIPTOR_STUB(iris)
#endif
+#ifdef GALLIUM_CROCUS
+#include "crocus/drm/crocus_drm_public.h"
+
+static struct pipe_screen *
+pipe_crocus_create_screen(int fd, const struct pipe_screen_config *config)
+{
+ struct pipe_screen *screen;
+
+ screen = crocus_drm_screen_create(fd, config);
+ return screen ? debug_screen_wrap(screen) : NULL;
+}
+
+const driOptionDescription crocus_driconf[] = {
+ #include "crocus/driinfo_crocus.h"
+};
+DRM_DRIVER_DESCRIPTOR(crocus, crocus_driconf, ARRAY_SIZE(crocus_driconf))
+#else
+DRM_DRIVER_DESCRIPTOR_STUB(crocus)
+#endif
+
#ifdef GALLIUM_NOUVEAU
#include "nouveau/drm/nouveau_drm_public.h"
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper_public.h b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
index 5fd3084dfdb..478e72b8525 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper_public.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
@@ -6,6 +6,7 @@ struct pipe_screen_config;
extern const struct drm_driver_descriptor i915_driver_descriptor;
extern const struct drm_driver_descriptor iris_driver_descriptor;
+extern const struct drm_driver_descriptor crocus_driver_descriptor;
extern const struct drm_driver_descriptor nouveau_driver_descriptor;
extern const struct drm_driver_descriptor r300_driver_descriptor;
extern const struct drm_driver_descriptor r600_driver_descriptor;
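For context on how these three registration hunks are consumed: the pipe loader walks the driver_descriptors[] array and picks the descriptor whose driver_name matches the driver selected for the opened DRM fd, so adding &crocus_driver_descriptor to the table (plus the helper and extern above) is all the loader-side wiring the new driver needs. The sketch below is illustrative only and uses hypothetical stand-in types and names; the real probe logic lives in pipe_loader_drm.c and differs in detail.

/* Illustrative only -- not part of this patch. A simplified model of how
 * a descriptor table like driver_descriptors[] is consulted; the struct
 * and helper here are hypothetical stand-ins for the real pipe-loader
 * types and probe logic.
 */
#include <stddef.h>
#include <string.h>

struct toy_driver_descriptor {
   const char *driver_name;                    /* e.g. "iris", "crocus" */
   void *(*create_screen)(int fd, const void *config);
};

static const struct toy_driver_descriptor *
find_descriptor(const struct toy_driver_descriptor *const descriptors[],
                size_t count, const char *driver_name)
{
   for (size_t i = 0; i < count; i++) {
      /* Match the table entry against the driver chosen for this fd. */
      if (strcmp(descriptors[i]->driver_name, driver_name) == 0)
         return descriptors[i];
   }
   return NULL;   /* no gallium driver registered for this device */
}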
diff --git a/src/gallium/drivers/crocus/crocus_batch.c b/src/gallium/drivers/crocus/crocus_batch.c
new file mode 100644
index 00000000000..63cfe282de4
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_batch.c
@@ -0,0 +1,1047 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_batch.c
+ *
+ * Batchbuffer and command submission module.
+ *
+ * Every API draw call results in a number of GPU commands, which we
+ * collect into a "batch buffer". Typically, many draw calls are grouped
+ * into a single batch to amortize command submission overhead.
+ *
+ * We submit batches to the kernel using the I915_GEM_EXECBUFFER2 ioctl.
+ * One critical piece of data is the "validation list", which contains a
+ * list of the buffer objects (BOs) which the commands in the GPU need.
+ * The kernel will make sure these are resident and pinned at the correct
+ * virtual memory address before executing our batch. If a BO is not in
+ * the validation list, it effectively does not exist, so take care.
+ */
+
+#include "crocus_batch.h"
+#include "crocus_bufmgr.h"
+#include "crocus_context.h"
+#include "crocus_fence.h"
+
+#include "drm-uapi/i915_drm.h"
+
+#include "intel/common/intel_gem.h"
+#include "main/macros.h"
+#include "util/hash_table.h"
+#include "util/set.h"
+#include "util/u_upload_mgr.h"
+
+#include <errno.h>
+#include <xf86drm.h>
+
+#if HAVE_VALGRIND
+#include <memcheck.h>
+#include <valgrind.h>
+#define VG(x) x
+#else
+#define VG(x)
+#endif
+
+#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+
+/* Terminating the batch takes either 4 bytes for MI_BATCH_BUFFER_END
+ * or 12 bytes for MI_BATCH_BUFFER_START (when chaining). Plus, we may
+ * need an extra 4 bytes to pad out to the nearest QWord. So reserve 16.
+ */
+#define BATCH_RESERVED(devinfo) ((devinfo)->is_haswell ? 32 : 16)
+
+static void crocus_batch_reset(struct crocus_batch *batch);
+
+static unsigned
+num_fences(struct crocus_batch *batch)
+{
+ return util_dynarray_num_elements(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence);
+}
+
+/**
+ * Debugging code to dump the fence list, used by INTEL_DEBUG=submit.
+ */
+static void
+dump_fence_list(struct crocus_batch *batch)
+{
+ fprintf(stderr, "Fence list (length %u): ", num_fences(batch));
+
+ util_dynarray_foreach(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence, f) {
+ fprintf(stderr, "%s%u%s ",
+ (f->flags & I915_EXEC_FENCE_WAIT) ? "..." : "",
+ f->handle,
+ (f->flags & I915_EXEC_FENCE_SIGNAL) ? "!" : "");
+ }
+
+ fprintf(stderr, "\n");
+}
+
+/**
+ * Debugging code to dump the validation list, used by INTEL_DEBUG=submit.
+ */
+static void
+dump_validation_list(struct crocus_batch *batch)
+{
+ fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ uint64_t flags = batch->validation_list[i].flags;
+ assert(batch->validation_list[i].handle ==
+ batch->exec_bos[i]->gem_handle);
+ fprintf(stderr,
+ "[%2d]: %2d %-14s @ 0x%016llx (%" PRIu64 "B)\t %2d refs %s\n", i,
+ batch->validation_list[i].handle, batch->exec_bos[i]->name,
+ batch->validation_list[i].offset, batch->exec_bos[i]->size,
+ batch->exec_bos[i]->refcount,
+ (flags & EXEC_OBJECT_WRITE) ? " (write)" : "");
+ }
+}
+
+/**
+ * Return BO information to the batch decoder (for debugging).
+ */
+static struct intel_batch_decode_bo
+decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
+{
+ struct crocus_batch *batch = v_batch;
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ struct crocus_bo *bo = batch->exec_bos[i];
+ /* The decoder zeroes out the top 16 bits, so we need to as well */
+ uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);
+
+ if (address >= bo_address && address < bo_address + bo->size) {
+ return (struct intel_batch_decode_bo){
+ .addr = address,
+ .size = bo->size,
+ .map = crocus_bo_map(batch->dbg, bo, MAP_READ) +
+ (address - bo_address),
+ };
+ }
+ }
+
+ return (struct intel_batch_decode_bo) { };
+}
+
+static unsigned
+decode_get_state_size(void *v_batch, uint64_t address,
+ uint64_t base_address)
+{
+ struct crocus_batch *batch = v_batch;
+
+ /* The decoder gives us offsets from a base address, which is not great.
+ * Binding tables are relative to surface state base address, and other
+ * state is relative to dynamic state base address. These could alias,
+ * but in practice it's unlikely because surface offsets are always in
+ * the [0, 64K) range, and we assign dynamic state addresses starting at
+ * the top of the 4GB range. We should fix this but it's likely good
+ * enough for now.
+ */
+ unsigned size = (uintptr_t)
+ _mesa_hash_table_u64_search(batch->state_sizes, address - base_address);
+
+ return size;
+}
+
+/**
+ * Decode the current batch.
+ */
+static void
+decode_batch(struct crocus_batch *batch)
+{
+ void *map = crocus_bo_map(batch->dbg, batch->exec_bos[0], MAP_READ);
+ intel_print_batch(&batch->decoder, map, batch->primary_batch_size,
+ batch->exec_bos[0]->gtt_offset, false);
+}
+
+static void
+init_reloc_list(struct crocus_reloc_list *rlist, int count)
+{
+ rlist->reloc_count = 0;
+ rlist->reloc_array_size = count;
+ rlist->relocs = malloc(rlist->reloc_array_size *
+ sizeof(struct drm_i915_gem_relocation_entry));
+}
+
+void
+crocus_init_batch(struct crocus_context *ice,
+ enum crocus_batch_name name,
+ int priority)
+{
+ struct crocus_batch *batch = &ice->batches[name];
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+
+ batch->ice = ice;
+ batch->screen = screen;
+ batch->dbg = &ice->dbg;
+ batch->reset = &ice->reset;
+ batch->name = name;
+ batch->contains_fence_signal = false;
+
+ if (devinfo->ver >= 7) {
+ batch->fine_fences.uploader =
+ u_upload_create(&ice->ctx, 4096, PIPE_BIND_CUSTOM,
+ PIPE_USAGE_STAGING, 0);
+ }
+ crocus_fine_fence_init(batch);
+
+ batch->hw_ctx_id = crocus_create_hw_context(screen->bufmgr);
+ assert(batch->hw_ctx_id);
+
+ crocus_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);
+
+ batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
+ if (devinfo->ver == 6)
+ batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;
+
+ if (INTEL_DEBUG & DEBUG_BATCH) {
+ /* The shadow doesn't get relocs written so state decode fails. */
+ batch->use_shadow_copy = false;
+ } else
+ batch->use_shadow_copy = !devinfo->has_llc;
+
+ util_dynarray_init(&batch->exec_fences, ralloc_context(NULL));
+ util_dynarray_init(&batch->syncobjs, ralloc_context(NULL));
+
+ init_reloc_list(&batch->command.relocs, 250);
+ init_reloc_list(&batch->state.relocs, 250);
+
+ batch->exec_count = 0;
+ batch->exec_array_size = 100;
+ batch->exec_bos =
+ malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
+ batch->validation_list =
+ malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
+
+ batch->cache.render = _mesa_hash_table_create(NULL, NULL,
+ _mesa_key_pointer_equal);
+ batch->cache.depth = _mesa_set_create(NULL, NULL,
+ _mesa_key_pointer_equal);
+
+ memset(batch->other_batches, 0, sizeof(batch->other_batches));
+
+ for (int i = 0, j = 0; i < ice->batch_count; i++) {
+ if (i != name)
+ batch->other_batches[j++] = &ice->batches[i];
+ }
+
+ if (INTEL_DEBUG & DEBUG_BATCH) {
+
+ batch->state_sizes = _mesa_hash_table_u64_create(NULL);
+ const unsigned decode_flags =
+ INTEL_BATCH_DECODE_FULL |
+ ((INTEL_DEBUG & DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
+ INTEL_BATCH_DECODE_OFFSETS | INTEL_BATCH_DECODE_FLOATS;
+
+ intel_batch_decode_ctx_init(&batch->decoder, &screen->devinfo, stderr,
+ decode_flags, NULL, decode_get_bo,
+ decode_get_state_size, batch);
+ batch->decoder.max_vbo_decoded_lines = 32;
+ }
+
+ crocus_batch_reset(batch);
+}
+
+static struct drm_i915_gem_exec_object2 *
+find_validation_entry(struct crocus_batch *batch, struct crocus_bo *bo)
+{
+ unsigned index = READ_ONCE(bo->index);
+
+ if (index < batch->exec_count && batch->exec_bos[index] == bo)
+ return &batch->validation_list[index];
+
+ /* May have been shared between multiple active batches */
+ for (index = 0; index < batch->exec_count; index++) {
+ if (batch->exec_bos[index] == bo)
+ return &batch->validation_list[index];
+ }
+
+ return NULL;
+}
+
+static void
+ensure_exec_obj_space(struct crocus_batch *batch, uint32_t count)
+{
+ while (batch->exec_count + count > batch->exec_array_size) {
+ batch->exec_array_size *= 2;
+ batch->exec_bos = realloc(
+ batch->exec_bos, batch->exec_array_size * sizeof(batch->exec_bos[0]));
+ batch->validation_list =
+ realloc(batch->validation_list,
+ batch->exec_array_size * sizeof(batch->validation_list[0]));
+ }
+}
+
+static struct drm_i915_gem_exec_object2 *
+crocus_use_bo(struct crocus_batch *batch, struct crocus_bo *bo, bool writable)
+{
+ assert(bo->bufmgr == batch->command.bo->bufmgr);
+
+ if (bo == batch->ice->workaround_bo)
+ writable = false;
+
+ struct drm_i915_gem_exec_object2 *existing_entry =
+ find_validation_entry(batch, bo);
+
+ if (existing_entry) {
+ /* The BO is already in the validation list; mark it writable */
+ if (writable)
+ existing_entry->flags |= EXEC_OBJECT_WRITE;
+ return existing_entry;
+ }
+
+ if (bo != batch->command.bo && bo != batch->state.bo) {
+ /* This is the first time our batch has seen this BO. Before we use it,
+ * we may need to flush and synchronize with other batches.
+ */
+ for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) {
+
+ if (!batch->other_batches[b])
+ continue;
+ struct drm_i915_gem_exec_object2 *other_entry =
+ find_validation_entry(batch->other_batches[b], bo);
+
+ /* If the buffer is referenced by another batch, and either batch
+ * intends to write it, then flush the other batch and synchronize.
+ *
+ * Consider these cases:
+ *
+ * 1. They read, we read => No synchronization required.
+ * 2. They read, we write => Synchronize (they need the old value)
+ * 3. They write, we read => Synchronize (we need their new value)
+ * 4. They write, we write => Synchronize (order writes)
+ *
+ * The read/read case is very common, as multiple batches usually
+ * share a streaming state buffer or shader assembly buffer, and
+ * we want to avoid synchronizing in this case.
+ */
+ if (other_entry &&
+ ((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) {
+ crocus_batch_flush(batch->other_batches[b]);
+ crocus_batch_add_syncobj(batch,
+ batch->other_batches[b]->last_fence->syncobj,
+ I915_EXEC_FENCE_WAIT);
+ }
+ }
+ }
+
+ /* Bump the ref count since the batch is now using this bo. */
+ crocus_bo_reference(bo);
+
+ ensure_exec_obj_space(batch, 1);
+
+ batch->validation_list[batch->exec_count] =
+ (struct drm_i915_gem_exec_object2) {
+ .handle = bo->gem_handle,
+ .offset = bo->gtt_offset,
+ .flags = bo->kflags | (writable ? EXEC_OBJECT_WRITE : 0),
+ };
+
+ bo->index = batch->exec_count;
+ batch->exec_bos[batch->exec_count] = bo;
+ batch->aperture_space += bo->size;
+
+ batch->exec_count++;
+
+ return &batch->validation_list[batch->exec_count - 1];
+}
+
+static uint64_t
+emit_reloc(struct crocus_batch *batch,
+ struct crocus_reloc_list *rlist, uint32_t offset,
+ struct crocus_bo *target, int32_t target_offset,
+ unsigned int reloc_flags)
+{
+ assert(target != NULL);
+
+ bool writable = reloc_flags & RELOC_WRITE;
+
+ struct drm_i915_gem_exec_object2 *entry =
+ crocus_use_bo(batch, target, writable);
+
+ if (rlist->reloc_count == rlist->reloc_array_size) {
+ rlist->reloc_array_size *= 2;
+ rlist->relocs = realloc(rlist->relocs,
+ rlist->reloc_array_size *
+ sizeof(struct drm_i915_gem_relocation_entry));
+ }
+
+ if (reloc_flags & RELOC_32BIT) {
+ /* Restrict this buffer to the low 32 bits of the address space.
+ *
+ * Altering the validation list flags restricts it for this batch,
+ * but we also alter the BO's kflags to restrict it permanently
+ * (until the BO is destroyed and put back in the cache). Buffers
+ * may stay bound across batches, and we want to keep it constrained.
+ */
+ target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+ entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+ /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
+ reloc_flags &= ~RELOC_32BIT;
+ }
+
+ if (reloc_flags)
+ entry->flags |= reloc_flags & batch->valid_reloc_flags;
+
+ rlist->relocs[rlist->reloc_count++] =
+ (struct drm_i915_gem_relocation_entry) {
+ .offset = offset,
+ .delta = target_offset,
+ .target_handle = target->index,
+ .presumed_offset = entry->offset,
+ };
+
+ /* Using the old buffer offset, write in what the right data would be, in
+ * case the buffer doesn't move and we can short-circuit the relocation
+ * processing in the kernel
+ */
+ return entry->offset + target_offset;
+}
+
+uint64_t
+crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset,
+ struct crocus_bo *target, uint32_t target_offset,
+ unsigned int reloc_flags)
+{
+ assert(batch_offset <= batch->command.bo->size - sizeof(uint32_t));
+
+ return emit_reloc(batch, &batch->command.relocs, batch_offset,
+ target, target_offset, reloc_flags);
+}
+
+uint64_t
+crocus_state_reloc(struct crocus_batch *batch, uint32_t state_offset,
+ struct crocus_bo *target, uint32_t target_offset,
+ unsigned int reloc_flags)
+{
+ assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
+
+ return emit_reloc(batch, &batch->state.relocs, state_offset,
+ target, target_offset, reloc_flags);
+}
+
+static void
+recreate_growing_buffer(struct crocus_batch *batch,
+ struct crocus_growing_bo *grow,
+ const char *name, unsigned size)
+{
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+ grow->bo = crocus_bo_alloc(bufmgr, name, size);
+ grow->bo->kflags |= EXEC_OBJECT_CAPTURE;
+ grow->partial_bo = NULL;
+ grow->partial_bo_map = NULL;
+ grow->partial_bytes = 0;
+ if (batch->use_shadow_copy)
+ grow->map = realloc(grow->map, grow->bo->size);
+ else
+ grow->map = crocus_bo_map(NULL, grow->bo, MAP_READ | MAP_WRITE);
+ grow->map_next = grow->map;
+}
+
+static void
+create_batch(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ recreate_growing_buffer(batch, &batch->command,
+ "command buffer",
+ BATCH_SZ + BATCH_RESERVED(&screen->devinfo));
+
+ crocus_use_bo(batch, batch->command.bo, false);
+
+ recreate_growing_buffer(batch, &batch->state,
+ "state buffer",
+ STATE_SZ);
+
+ batch->state.used = 1;
+ crocus_use_bo(batch, batch->state.bo, false);
+}
+
+static void
+crocus_batch_maybe_noop(struct crocus_batch *batch)
+{
+ /* We only insert the NOOP at the beginning of the batch. */
+ assert(crocus_batch_bytes_used(batch) == 0);
+
+ if (batch->noop_enabled) {
+ /* Emit MI_BATCH_BUFFER_END to prevent any further commands from being
+ * executed.
+ */
+ uint32_t *map = batch->command.map_next;
+
+ map[0] = (0xA << 23);
+
+ batch->command.map_next += 4;
+ }
+}
+
+static void
+crocus_batch_reset(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ crocus_bo_unreference(batch->command.bo);
+ crocus_bo_unreference(batch->state.bo);
+ batch->primary_batch_size = 0;
+ batch->contains_draw = false;
+ batch->contains_fence_signal = false;
+ batch->state_base_address_emitted = false;
+ batch->screen->vtbl.batch_reset_dirty(batch);
+
+ create_batch(batch);
+ assert(batch->command.bo->index == 0);
+
+ if (batch->state_sizes)
+ _mesa_hash_table_u64_clear(batch->state_sizes);
+ struct crocus_syncobj *syncobj = crocus_create_syncobj(screen);
+ crocus_batch_add_syncobj(batch, syncobj, I915_EXEC_FENCE_SIGNAL);
+ crocus_syncobj_reference(screen, &syncobj, NULL);
+
+ crocus_cache_sets_clear(batch);
+}
+
+void
+crocus_batch_free(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+
+ if (batch->use_shadow_copy) {
+ free(batch->command.map);
+ free(batch->state.map);
+ }
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ crocus_bo_unreference(batch->exec_bos[i]);
+ }
+
+ pipe_resource_reference(&batch->fine_fences.ref.res, NULL);
+
+ free(batch->command.relocs.relocs);
+ free(batch->state.relocs.relocs);
+ free(batch->exec_bos);
+ free(batch->validation_list);
+
+ ralloc_free(batch->exec_fences.mem_ctx);
+
+ util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
+ crocus_syncobj_reference(screen, s, NULL);
+ ralloc_free(batch->syncobjs.mem_ctx);
+
+ crocus_fine_fence_reference(batch->screen, &batch->last_fence, NULL);
+ if (batch_has_fine_fence(batch))
+ u_upload_destroy(batch->fine_fences.uploader);
+
+ crocus_bo_unreference(batch->command.bo);
+ batch->command.bo = NULL;
+ batch->command.map = NULL;
+ batch->command.map_next = NULL;
+
+ crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);
+
+ _mesa_hash_table_destroy(batch->cache.render, NULL);
+ _mesa_set_destroy(batch->cache.depth, NULL);
+
+ if (batch->state_sizes) {
+ _mesa_hash_table_u64_destroy(batch->state_sizes);
+ intel_batch_decode_ctx_finish(&batch->decoder);
+ }
+}
+
+/**
+ * If we've chained to a secondary batch, or are getting near to the end,
+ * then flush. This should only be called between draws.
+ */
+void
+crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate)
+{
+ if (batch->command.bo != batch->exec_bos[0] ||
+ crocus_batch_bytes_used(batch) + estimate >= BATCH_SZ) {
+ crocus_batch_flush(batch);
+ }
+}
+
+/**
+ * Finish copying the old batch/state buffer's contents to the new one
+ * after we tried to "grow" the buffer in an earlier operation.
+ */
+static void
+finish_growing_bos(struct crocus_growing_bo *grow)
+{
+ struct crocus_bo *old_bo = grow->partial_bo;
+ if (!old_bo)
+ return;
+
+ memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
+
+ grow->partial_bo = NULL;
+ grow->partial_bo_map = NULL;
+ grow->partial_bytes = 0;
+
+ crocus_bo_unreference(old_bo);
+}
+
+void
+crocus_grow_buffer(struct crocus_batch *batch, bool grow_state,
+ unsigned used,
+ unsigned new_size)
+{
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+ struct crocus_growing_bo *grow = grow_state ? &batch->state : &batch->command;
+ struct crocus_bo *bo = grow->bo;
+
+ if (grow->partial_bo) {
+ /* We've already grown once, and now we need to do it again.
+ * Finish our last grow operation so we can start a new one.
+ * This should basically never happen.
+ */
+ finish_growing_bos(grow);
+ }
+
+ struct crocus_bo *new_bo = crocus_bo_alloc(bufmgr, bo->name, new_size);
+
+ /* Copy existing data to the new larger buffer */
+ grow->partial_bo_map = grow->map;
+
+ if (batch->use_shadow_copy) {
+ /* We can't safely use realloc, as it may move the existing buffer,
+ * breaking existing pointers the caller may still be using. Just
+ * malloc a new copy and memcpy it like the normal BO path.
+ *
+ * Use bo->size rather than new_size because the bufmgr may have
+ * rounded up the size, and we want the shadow size to match.
+ */
+ grow->map = malloc(new_bo->size);
+ } else {
+ grow->map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE);
+ }
+ /* Try to put the new BO at the same GTT offset as the old BO (which
+ * we're throwing away, so it doesn't need to be there).
+ *
+ * This guarantees that our relocations continue to work: values we've
+ * already written into the buffer, values we're going to write into the
+ * buffer, and the validation/relocation lists all will match.
+ *
+ * Also preserve kflags for EXEC_OBJECT_CAPTURE.
+ */
+ new_bo->gtt_offset = bo->gtt_offset;
+ new_bo->index = bo->index;
+ new_bo->kflags = bo->kflags;
+
+ /* Batch/state buffers are per-context, and if we've run out of space,
+ * we must have actually used them before, so...they will be in the list.
+ */
+ assert(bo->index < batch->exec_count);
+ assert(batch->exec_bos[bo->index] == bo);
+
+ /* Update the validation list to use the new BO. */
+ batch->validation_list[bo->index].handle = new_bo->gem_handle;
+ /* Exchange the two BOs...without breaking pointers to the old BO.
+ *
+ * Consider this scenario:
+ *
+ * 1. Somebody calls brw_state_batch() to get a region of memory, and
+ * then creates a brw_address pointing to brw->batch.state.bo.
+ * 2. They then call brw_state_batch() a second time, which happens to
+ * grow and replace the state buffer. They then try to emit a
+ * relocation to their first section of memory.
+ *
+ * If we replace the brw->batch.state.bo pointer at step 2, we would
+ * break the address created in step 1. They'd have a pointer to the
+ * old destroyed BO. Emitting a relocation would add this dead BO to
+ * the validation list...causing /both/ statebuffers to be in the list,
+ * and all kinds of disasters.
+ *
+ * This is not a contrived case - BLORP vertex data upload hits this.
+ *
+ * There are worse scenarios too. Fences for GL sync objects reference
+ * brw->batch.batch.bo. If we replaced the batch pointer when growing,
+ * we'd need to chase down every fence and update it to point to the
+ * new BO. Otherwise, it would refer to a "batch" that never actually
+ * gets submitted, and would fail to trigger.
+ *
+ * To work around both of these issues, we transmutate the buffers in
+ * place, making the existing struct brw_bo represent the new buffer,
+ * and "new_bo" represent the old BO. This is highly unusual, but it
+ * seems like a necessary evil.
+ *
+ * We also defer the memcpy of the existing batch's contents. Callers
+ * may make multiple brw_state_batch calls, and retain pointers to the
+ * old BO's map. We'll perform the memcpy in finish_growing_bo() when
+ * we finally submit the batch, at which point we've finished uploading
+ * state, and nobody should have any old references anymore.
+ *
+ * To do that, we keep a reference to the old BO in grow->partial_bo,
+ * and store the number of bytes to copy in grow->partial_bytes. We
+ * can monkey with the refcounts directly without atomics because these
+ * are per-context BOs and they can only be touched by this thread.
+ */
+ assert(new_bo->refcount == 1);
+ new_bo->refcount = bo->refcount;
+ bo->refcount = 1;
+
+ struct crocus_bo tmp;
+ memcpy(&tmp, bo, sizeof(struct crocus_bo));
+ memcpy(bo, new_bo, sizeof(struct crocus_bo));
+ memcpy(new_bo, &tmp, sizeof(struct crocus_bo));
+
+ grow->partial_bo = new_bo; /* the one reference of the OLD bo */
+ grow->partial_bytes = used;
+}
+
+static void
+finish_seqno(struct crocus_batch *batch)
+{
+ struct crocus_fine_fence *sq = crocus_fine_fence_new(batch, CROCUS_FENCE_END);
+ if (!sq)
+ return;
+
+ crocus_fine_fence_reference(batch->screen, &batch->last_fence, sq);
+ crocus_fine_fence_reference(batch->screen, &sq, NULL);
+}
+
+/**
+ * Terminate a batch with MI_BATCH_BUFFER_END.
+ */
+static void
+crocus_finish_batch(struct crocus_batch *batch)
+{
+
+ batch->no_wrap = true;
+ if (batch->screen->vtbl.finish_batch)
+ batch->screen->vtbl.finish_batch(batch);
+
+ finish_seqno(batch);
+
+ /* Emit MI_BATCH_BUFFER_END to finish our batch. */
+ uint32_t *map = batch->command.map_next;
+
+ map[0] = (0xA << 23);
+
+ batch->command.map_next += 4;
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->command.map, crocus_batch_bytes_used(batch)));
+
+ if (batch->command.bo == batch->exec_bos[0])
+ batch->primary_batch_size = crocus_batch_bytes_used(batch);
+ batch->no_wrap = false;
+}
+
+/**
+ * Replace our current GEM context with a new one (in case it got banned).
+ */
+static bool
+replace_hw_ctx(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+
+ uint32_t new_ctx = crocus_clone_hw_context(bufmgr, batch->hw_ctx_id);
+ if (!new_ctx)
+ return false;
+
+ crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);
+ batch->hw_ctx_id = new_ctx;
+
+ /* Notify the context that state must be re-initialized. */
+ crocus_lost_context_state(batch);
+
+ return true;
+}
+
+enum pipe_reset_status
+crocus_batch_check_for_reset(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+ enum pipe_reset_status status = PIPE_NO_RESET;
+ struct drm_i915_reset_stats stats = { .ctx_id = batch->hw_ctx_id };
+
+ if (drmIoctl(screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
+ DBG("DRM_IOCTL_I915_GET_RESET_STATS failed: %s\n", strerror(errno));
+
+ if (stats.batch_active != 0) {
+ /* A reset was observed while a batch from this hardware context was
+ * executing. Assume that this context was at fault.
+ */
+ status = PIPE_GUILTY_CONTEXT_RESET;
+ } else if (stats.batch_pending != 0) {
+ /* A reset was observed while a batch from this context was in progress,
+ * but the batch was not executing. In this case, assume that the
+ * context was not at fault.
+ */
+ status = PIPE_INNOCENT_CONTEXT_RESET;
+ }
+
+ if (status != PIPE_NO_RESET) {
+ /* Our context is likely banned, or at least in an unknown state.
+ * Throw it away and start with a fresh context. Ideally this may
+ * catch the problem before our next execbuf fails with -EIO.
+ */
+ replace_hw_ctx(batch);
+ }
+
+ return status;
+}
+
+/**
+ * Submit the batch to the GPU via execbuffer2.
+ */
+static int
+submit_batch(struct crocus_batch *batch)
+{
+
+ if (batch->use_shadow_copy) {
+ void *bo_map = crocus_bo_map(batch->dbg, batch->command.bo, MAP_WRITE);
+ memcpy(bo_map, batch->command.map, crocus_batch_bytes_used(batch));
+
+ bo_map = crocus_bo_map(batch->dbg, batch->state.bo, MAP_WRITE);
+ memcpy(bo_map, batch->state.map, batch->state.used);
+ }
+
+ crocus_bo_unmap(batch->command.bo);
+ crocus_bo_unmap(batch->state.bo);
+
+ /* The requirements for using I915_EXEC_NO_RELOC are:
+ *
+ * The addresses written in the objects must match the corresponding
+ * reloc.gtt_offset which in turn must match the corresponding
+ * execobject.offset.
+ *
+ * Any render targets written to in the batch must be flagged with
+ * EXEC_OBJECT_WRITE.
+ *
+ * To avoid stalling, execobject.offset should match the current
+ * address of that object within the active context.
+ */
+ /* Set statebuffer relocations */
+ const unsigned state_index = batch->state.bo->index;
+ if (state_index < batch->exec_count &&
+ batch->exec_bos[state_index] == batch->state.bo) {
+ struct drm_i915_gem_exec_object2 *entry =
+ &batch->validation_list[state_index];
+ assert(entry->handle == batch->state.bo->gem_handle);
+ entry->relocation_count = batch->state.relocs.reloc_count;
+ entry->relocs_ptr = (uintptr_t)batch->state.relocs.relocs;
+ }
+
+ /* Set batchbuffer relocations */
+ struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
+ assert(entry->handle == batch->command.bo->gem_handle);
+ entry->relocation_count = batch->command.relocs.reloc_count;
+ entry->relocs_ptr = (uintptr_t)batch->command.relocs.relocs;
+
+ struct drm_i915_gem_execbuffer2 execbuf = {
+ .buffers_ptr = (uintptr_t)batch->validation_list,
+ .buffer_count = batch->exec_count,
+ .batch_start_offset = 0,
+ /* This must be QWord aligned. */
+ .batch_len = ALIGN(batch->primary_batch_size, 8),
+ .flags = I915_EXEC_RENDER |
+ I915_EXEC_NO_RELOC |
+ I915_EXEC_BATCH_FIRST |
+ I915_EXEC_HANDLE_LUT,
+ .rsvd1 = batch->hw_ctx_id, /* rsvd1 is actually the context ID */
+ };
+
+ if (num_fences(batch)) {
+ execbuf.flags |= I915_EXEC_FENCE_ARRAY;
+ execbuf.num_cliprects = num_fences(batch);
+ execbuf.cliprects_ptr =
+ (uintptr_t)util_dynarray_begin(&batch->exec_fences);
+ }
+
+ int ret = 0;
+ if (!batch->screen->no_hw &&
+ intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
+ ret = -errno;
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ struct crocus_bo *bo = batch->exec_bos[i];
+
+ bo->idle = false;
+ bo->index = -1;
+
+ /* Update brw_bo::gtt_offset */
+ if (batch->validation_list[i].offset != bo->gtt_offset) {
+ DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
+ bo->gem_handle, bo->gtt_offset,
+ batch->validation_list[i].offset);
+ assert(!(bo->kflags & EXEC_OBJECT_PINNED));
+ bo->gtt_offset = batch->validation_list[i].offset;
+ }
+ }
+
+ return ret;
+}
+
+static const char *
+batch_name_to_string(enum crocus_batch_name name)
+{
+ const char *names[CROCUS_BATCH_COUNT] = {
+ [CROCUS_BATCH_RENDER] = "render",
+ [CROCUS_BATCH_COMPUTE] = "compute",
+ };
+ return names[name];
+}
+
+/**
+ * Flush the batch buffer, submitting it to the GPU and resetting it so
+ * we're ready to emit the next batch.
+ */
+void
+_crocus_batch_flush(struct crocus_batch *batch, const char *file, int line)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ /* Even an empty batch must be flushed if it contains a fence signal. */
+ if (crocus_batch_bytes_used(batch) == 0 && !batch->contains_fence_signal)
+ return;
+
+ assert(!batch->no_wrap);
+ crocus_finish_batch(batch);
+
+ finish_growing_bos(&batch->command);
+ finish_growing_bos(&batch->state);
+ int ret = submit_batch(batch);
+
+ if (unlikely(INTEL_DEBUG &
+ (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL))) {
+ int bytes_for_commands = crocus_batch_bytes_used(batch);
+ int second_bytes = 0;
+ if (batch->command.bo != batch->exec_bos[0]) {
+ second_bytes = bytes_for_commands;
+ bytes_for_commands += batch->primary_batch_size;
+ }
+ fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5d+%5db (%0.1f%%) "
+ "(cmds), %4d BOs (%0.1fMb aperture),"
+ " %4d command relocs, %4d state relocs\n",
+ file, line, batch_name_to_string(batch->name), batch->hw_ctx_id,
+ batch->primary_batch_size, second_bytes,
+ 100.0f * bytes_for_commands / BATCH_SZ,
+ batch->exec_count,
+ (float) batch->aperture_space / (1024 * 1024),
+ batch->command.relocs.reloc_count,
+ batch->state.relocs.reloc_count);
+
+ if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT)) {
+ dump_fence_list(batch);
+ dump_validation_list(batch);
+ }
+
+ if (INTEL_DEBUG & DEBUG_BATCH) {
+ decode_batch(batch);
+ }
+ }
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ struct crocus_bo *bo = batch->exec_bos[i];
+ crocus_bo_unreference(bo);
+ }
+
+ batch->command.relocs.reloc_count = 0;
+ batch->state.relocs.reloc_count = 0;
+ batch->exec_count = 0;
+ batch->aperture_space = 0;
+
+ util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
+ crocus_syncobj_reference(screen, s, NULL);
+ util_dynarray_clear(&batch->syncobjs);
+
+ util_dynarray_clear(&batch->exec_fences);
+
+ if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
+ dbg_printf("waiting for idle\n");
+ crocus_bo_wait_rendering(batch->command.bo); /* if execbuf failed; this is a nop */
+ }
+
+ /* Start a new batch buffer. */
+ crocus_batch_reset(batch);
+
+ /* EIO means our context is banned. In this case, try and replace it
+ * with a new logical context, and inform crocus_context that all state
+ * has been lost and needs to be re-initialized. If this succeeds,
+ * dubiously claim success...
+ */
+ if (ret == -EIO && replace_hw_ctx(batch)) {
+ if (batch->reset->reset) {
+ /* Tell the state tracker the device is lost and it was our fault. */
+ batch->reset->reset(batch->reset->data, PIPE_GUILTY_CONTEXT_RESET);
+ }
+
+ ret = 0;
+ }
+
+ if (ret < 0) {
+#ifdef DEBUG
+ const bool color = INTEL_DEBUG & DEBUG_COLOR;
+ fprintf(stderr, "%scrocus: Failed to submit batchbuffer: %-80s%s\n",
+ color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : "");
+#endif
+ abort();
+ }
+}
+
+/**
+ * Does the current batch refer to the given BO?
+ *
+ * (In other words, is the BO in the current batch's validation list?)
+ */
+bool
+crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo)
+{
+ return find_validation_entry(batch, bo) != NULL;
+}
+
+/**
+ * Updates the state of the noop feature. Returns true if there was a noop
+ * transition that led to state invalidation.
+ */
+bool
+crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable)
+{
+ if (batch->noop_enabled == noop_enable)
+ return 0;
+
+ batch->noop_enabled = noop_enable;
+
+ crocus_batch_flush(batch);
+
+ /* If the batch was empty, flush had no effect, so insert our noop. */
+ if (crocus_batch_bytes_used(batch) == 0)
+ crocus_batch_maybe_noop(batch);
+
+ /* We only need to update the entire state if we transition from noop ->
+ * not-noop.
+ */
+ return !batch->noop_enabled;
+}
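One subtle point in crocus_grow_buffer() above: rather than swapping pointers, the old and new BO structs swap their contents, so every existing struct crocus_bo * keeps describing the live buffer while the displaced storage lingers as grow->partial_bo until submission. A minimal standalone sketch of that idiom follows, using a hypothetical toy_bo type instead of the real struct crocus_bo.

/* Illustrative only -- not part of this patch. Mirrors the identity swap
 * in crocus_grow_buffer(): callers keep their pointer to "live", which
 * describes the new storage after the swap, while "fresh" ends up holding
 * the old storage (the role played by grow->partial_bo above).
 */
#include <string.h>

struct toy_bo {            /* hypothetical stand-in for struct crocus_bo */
   unsigned gem_handle;
   unsigned size;
   int refcount;
};

static void
swap_bo_identities(struct toy_bo *live, struct toy_bo *fresh)
{
   struct toy_bo tmp;

   memcpy(&tmp, live, sizeof(tmp));
   memcpy(live, fresh, sizeof(*live));
   memcpy(fresh, &tmp, sizeof(*fresh));
}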
diff --git a/src/gallium/drivers/crocus/crocus_batch.h b/src/gallium/drivers/crocus/crocus_batch.h
new file mode 100644
index 00000000000..fe6857d83ed
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_batch.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_BATCH_DOT_H
+#define CROCUS_BATCH_DOT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "util/u_dynarray.h"
+
+#include "common/intel_decoder.h"
+#include "drm-uapi/i915_drm.h"
+
+#include "crocus_fence.h"
+#include "crocus_fine_fence.h"
+
+#include "crocus_bufmgr.h"
+/* The kernel assumes batchbuffers are smaller than 256kB. */
+#define MAX_BATCH_SIZE (256 * 1024)
+
+/* 3DSTATE_BINDING_TABLE_POINTERS has a U16 offset from Surface State Base
+ * Address, which means that we can't put binding tables beyond 64kB. This
+ * effectively limits the maximum statebuffer size to 64kB.
+ */
+#define MAX_STATE_SIZE (64 * 1024)
+
+/* Our target batch size - flush approximately at this point. */
+#define BATCH_SZ (20 * 1024)
+#define STATE_SZ (16 * 1024)
+
+enum crocus_batch_name {
+ CROCUS_BATCH_RENDER,
+ CROCUS_BATCH_COMPUTE,
+};
+
+#define CROCUS_BATCH_COUNT 2
+
+struct crocus_address {
+ struct crocus_bo *bo;
+ int32_t offset;
+ uint32_t reloc_flags;
+};
+
+struct crocus_reloc_list {
+ struct drm_i915_gem_relocation_entry *relocs;
+ int reloc_count;
+ int reloc_array_size;
+};
+
+struct crocus_growing_bo {
+ struct crocus_bo *bo;
+ void *map;
+ void *map_next;
+ struct crocus_bo *partial_bo;
+ void *partial_bo_map;
+ unsigned partial_bytes;
+ struct crocus_reloc_list relocs;
+ unsigned used;
+};
+
+struct crocus_batch {
+ struct crocus_context *ice;
+ struct crocus_screen *screen;
+ struct pipe_debug_callback *dbg;
+ struct pipe_device_reset_callback *reset;
+
+ /** What batch is this? (e.g. CROCUS_BATCH_RENDER/COMPUTE) */
+ enum crocus_batch_name name;
+
+ /** buffers: command, state */
+ struct crocus_growing_bo command, state;
+
+ /** Size of the primary batch if we've moved on to a secondary. */
+ unsigned primary_batch_size;
+
+ bool state_base_address_emitted;
+ uint8_t pipe_controls_since_last_cs_stall;
+
+ uint32_t hw_ctx_id;
+
+ uint32_t valid_reloc_flags;
+
+ bool use_shadow_copy;
+ bool no_wrap;
+
+ /** The validation list */
+ struct drm_i915_gem_exec_object2 *validation_list;
+ struct crocus_bo **exec_bos;
+ int exec_count;
+ int exec_array_size;
+
+ /** Whether INTEL_BLACKHOLE_RENDER is enabled in the batch (aka first
+ * instruction is a MI_BATCH_BUFFER_END).
+ */
+ bool noop_enabled;
+
+ /**
+ * A list of crocus_syncobjs associated with this batch.
+ *
+ * The first list entry will always be a signalling sync-point, indicating
+ * that this batch has completed. The others are likely to be sync-points
+ * to wait on before executing the batch.
+ */
+ struct util_dynarray syncobjs;
+
+ /** A list of drm_i915_exec_fences to have execbuf signal or wait on */
+ struct util_dynarray exec_fences;
+
+ /** The amount of aperture space (in bytes) used by all exec_bos */
+ int aperture_space;
+
+ struct {
+ /** Uploader to use for sequence numbers */
+ struct u_upload_mgr *uploader;
+
+ /** GPU buffer and CPU map where our seqno's will be written. */
+ struct crocus_state_ref ref;
+ uint32_t *map;
+
+ /** The sequence number to write the next time we add a fence. */
+ uint32_t next;
+ } fine_fences;
+
+ /** A seqno (and syncobj) for the last batch that was submitted. */
+ struct crocus_fine_fence *last_fence;
+
+ /** List of other batches which we might need to flush to use a BO */
+ struct crocus_batch *other_batches[CROCUS_BATCH_COUNT - 1];
+
+ struct {
+ /**
+ * Set of struct brw_bo * that have been rendered to within this
+ * batchbuffer and would need flushing before being used from another
+ * cache domain that isn't coherent with it (i.e. the sampler).
+ */
+ struct hash_table *render;
+
+ /**
+ * Set of struct brw_bo * that have been used as a depth buffer within
+ * this batchbuffer and would need flushing before being used from
+ * another cache domain that isn't coherent with it (i.e. the sampler).
+ */
+ struct set *depth;
+ } cache;
+
+ struct intel_batch_decode_ctx decoder;
+ struct hash_table_u64 *state_sizes;
+
+ /** Have we emitted any draw calls to this batch? */
+ bool contains_draw;
+
+ /** Batch contains fence signal operation. */
+ bool contains_fence_signal;
+};
+
+static inline bool
+batch_has_fine_fence(struct crocus_batch *batch)
+{
+ return !!batch->fine_fences.uploader;
+}
+
+#define BATCH_HAS_FINE_FENCES(batch) (!!(batch)->fine_fences.uploader)
+void crocus_init_batch(struct crocus_context *ctx,
+ enum crocus_batch_name name,
+ int priority);
+void crocus_batch_free(struct crocus_batch *batch);
+void crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate);
+
+void _crocus_batch_flush(struct crocus_batch *batch, const char *file, int line);
+#define crocus_batch_flush(batch) _crocus_batch_flush((batch), __FILE__, __LINE__)
+
+bool crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo);
+
+bool crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable);
+
+#define RELOC_WRITE EXEC_OBJECT_WRITE
+#define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT
+/* Inverted meaning, but using the same bit...emit_reloc will flip it. */
+#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS
+
+void crocus_use_pinned_bo(struct crocus_batch *batch, struct crocus_bo *bo,
+ bool writable);
+uint64_t crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset,
+ struct crocus_bo *target, uint32_t target_offset,
+ unsigned int reloc_flags);
+uint64_t crocus_state_reloc(struct crocus_batch *batch, uint32_t batch_offset,
+ struct crocus_bo *target, uint32_t target_offset,
+ unsigned int reloc_flags);
+
+enum pipe_reset_status crocus_batch_check_for_reset(struct crocus_batch *batch);
+
+void crocus_grow_buffer(struct crocus_batch *batch, bool grow_state,
+ unsigned used, unsigned new_size);
+
+static inline unsigned
+crocus_batch_bytes_used(struct crocus_batch *batch)
+{
+ return batch->command.map_next - batch->command.map;
+}
+
+/**
+ * Ensure the current command buffer has \param size bytes of space
+ * remaining. If not, this creates a secondary batch buffer and emits
+ * a jump from the primary batch to the start of the secondary.
+ *
+ * Most callers want crocus_get_command_space() instead.
+ */
+static inline void
+crocus_require_command_space(struct crocus_batch *batch, unsigned size)
+{
+ const unsigned required_bytes = crocus_batch_bytes_used(batch) + size;
+ unsigned used = crocus_batch_bytes_used(batch);
+ if (required_bytes >= BATCH_SZ && !batch->no_wrap) {
+ crocus_batch_flush(batch);
+ } else if (used + size >= batch->command.bo->size) {
+ const unsigned new_size =
+ MIN2(batch->command.bo->size + batch->command.bo->size / 2,
+ MAX_BATCH_SIZE);
+
+ crocus_grow_buffer(batch, false, used, new_size);
+ batch->command.map_next = (void *)batch->command.map + used;
+ assert(crocus_batch_bytes_used(batch) + size < batch->command.bo->size);
+ }
+}
+
+/**
+ * Allocate space in the current command buffer, and return a pointer
+ * to the mapped area so the caller can write commands there.
+ *
+ * This should be called whenever emitting commands.
+ */
+static inline void *
+crocus_get_command_space(struct crocus_batch *batch, unsigned bytes)
+{
+ crocus_require_command_space(batch, bytes);
+ void *map = batch->command.map_next;
+ batch->command.map_next += bytes;
+ return map;
+}
+
+/**
+ * Helper to emit GPU commands - allocates space, copies them there.
+ */
+static inline void
+crocus_batch_emit(struct crocus_batch *batch, const void *data, unsigned size)
+{
+ void *map = crocus_get_command_space(batch, size);
+ memcpy(map, data, size);
+}
+
+/**
+ * Get a pointer to the batch's signalling syncobj. Does not refcount.
+ */
+static inline struct crocus_syncobj *
+crocus_batch_get_signal_syncobj(struct crocus_batch *batch)
+{
+ /* The signalling syncobj is the first one in the list. */
+ struct crocus_syncobj *syncobj =
+ ((struct crocus_syncobj **)util_dynarray_begin(&batch->syncobjs))[0];
+ return syncobj;
+}
+
+/**
+ * Take a reference to the batch's signalling syncobj.
+ *
+ * Callers can use this to wait for the current batch under construction
+ * to complete (after flushing it).
+ */
+static inline void
+crocus_batch_reference_signal_syncobj(struct crocus_batch *batch,
+ struct crocus_syncobj **out_syncobj)
+{
+ struct crocus_syncobj *syncobj = crocus_batch_get_signal_syncobj(batch);
+ crocus_syncobj_reference(batch->screen, out_syncobj, syncobj);
+}
+
+/**
+ * Record the size of a piece of state for use in INTEL_DEBUG=bat printing.
+ */
+static inline void
+crocus_record_state_size(struct hash_table_u64 *ht, uint32_t offset_from_base,
+ uint32_t size)
+{
+ if (ht) {
+ _mesa_hash_table_u64_insert(ht, offset_from_base,
+ (void *)(uintptr_t)size);
+ }
+}
+
+static inline bool
+crocus_ptr_in_state_buffer(struct crocus_batch *batch, void *p)
+{
+ return (char *)p >= (char *)batch->state.map &&
+ (char *)p < (char *)batch->state.map + batch->state.bo->size;
+}
+
+static inline void
+crocus_require_statebuffer_space(struct crocus_batch *batch, int size)
+{
+ if (batch->state.used + size >= STATE_SZ)
+ crocus_batch_flush(batch);
+}
+#endif
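Taken together, the helpers in this header give the state-emission code a simple pattern: reserve command space, write DWords, and record relocations. The sketch below is illustrative only; emit_example_command() and EXAMPLE_CMD_HEADER are made-up names, and the driver's real command emission goes through the generated packing macros (see crocus_genx_macros.h in the diffstat) rather than hand-written stores like this.

/* Illustrative only -- not part of this patch. A made-up emission path
 * showing how the helpers above are meant to be combined.
 */
#include "crocus_batch.h"

#define EXAMPLE_CMD_HEADER 0x7a000002u   /* hypothetical opcode/length DWord */

static void
emit_example_command(struct crocus_batch *batch, struct crocus_bo *target_bo)
{
   /* Reserve four DWords; this may flush the batch or grow the command
    * buffer first (see crocus_require_command_space()).
    */
   uint32_t *cmd = crocus_get_command_space(batch, 4 * sizeof(uint32_t));

   cmd[0] = EXAMPLE_CMD_HEADER;

   /* Record a relocation for the DWord that holds the buffer address.
    * The return value is the presumed address, written now so the kernel
    * can skip relocation processing if the BO does not move; gfx4-7
    * graphics addresses fit in 32 bits.
    */
   cmd[1] = (uint32_t) crocus_command_reloc(batch,
                                            (char *)&cmd[1] - (char *)batch->command.map,
                                            target_bo, 0, RELOC_WRITE);
   cmd[2] = 0;
   cmd[3] = 0;
}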
diff --git a/src/gallium/drivers/crocus/crocus_blit.c b/src/gallium/drivers/crocus/crocus_blit.c
new file mode 100644
index 00000000000..9cae82e3e2d
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_blit.c
@@ -0,0 +1,836 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/format/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/ralloc.h"
+#include "intel/blorp/blorp.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+
+void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond)
+{
+ util_blitter_save_vertex_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_VERTEX]);
+ util_blitter_save_tessctrl_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL]);
+ util_blitter_save_tesseval_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]);
+ util_blitter_save_geometry_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]);
+ util_blitter_save_so_targets(ice->blitter, ice->state.so_targets,
+ (struct pipe_stream_output_target**)ice->state.so_target);
+ util_blitter_save_vertex_buffer_slot(ice->blitter, ice->state.vertex_buffers);
+ util_blitter_save_vertex_elements(ice->blitter, (void *)ice->state.cso_vertex_elements);
+ if (op & CROCUS_SAVE_FRAGMENT_STATE) {
+ util_blitter_save_blend(ice->blitter, ice->state.cso_blend);
+ util_blitter_save_depth_stencil_alpha(ice->blitter, ice->state.cso_zsa);
+ util_blitter_save_stencil_ref(ice->blitter, &ice->state.stencil_ref);
+ util_blitter_save_fragment_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]);
+ util_blitter_save_sample_mask(ice->blitter, ice->state.sample_mask);
+ util_blitter_save_rasterizer(ice->blitter, ice->state.cso_rast);
+ util_blitter_save_scissor(ice->blitter, &ice->state.scissors[0]);
+ util_blitter_save_viewport(ice->blitter, &ice->state.viewports[0]);
+ util_blitter_save_fragment_constant_buffer_slot(ice->blitter, &ice->state.shaders[MESA_SHADER_FRAGMENT].constbufs[0]);
+ }
+
+ if (!render_cond)
+ util_blitter_save_render_condition(ice->blitter,
+ (struct pipe_query *)ice->condition.query,
+ ice->condition.condition,
+ ice->condition.mode);
+
+// util_blitter_save_scissor(ice->blitter, &ice->scissors[0]);
+ if (op & CROCUS_SAVE_FRAMEBUFFER)
+ util_blitter_save_framebuffer(ice->blitter, &ice->state.framebuffer);
+
+ if (op & CROCUS_SAVE_TEXTURES) {
+ util_blitter_save_fragment_sampler_states(ice->blitter, 1, (void **)ice->state.shaders[MESA_SHADER_FRAGMENT].samplers);
+ util_blitter_save_fragment_sampler_views(ice->blitter, 1, (struct pipe_sampler_view **)ice->state.shaders[MESA_SHADER_FRAGMENT].textures);
+ }
+}
+
+/**
+ * Helper function for handling mirror image blits.
+ *
+ * If coord0 > coord1, swap them and return "true" (mirrored).
+ */
+static bool
+apply_mirror(float *coord0, float *coord1)
+{
+ if (*coord0 > *coord1) {
+ float tmp = *coord0;
+ *coord0 = *coord1;
+ *coord1 = tmp;
+ return true;
+ }
+ return false;
+}
+
+/**
+ * Compute the number of pixels to clip for each side of a rect
+ *
+ * \param x0 The rect's left coordinate
+ * \param y0 The rect's bottom coordinate
+ * \param x1 The rect's right coordinate
+ * \param y1 The rect's top coordinate
+ * \param min_x The clipping region's left coordinate
+ * \param min_y The clipping region's bottom coordinate
+ * \param max_x The clipping region's right coordinate
+ * \param max_y The clipping region's top coordinate
+ * \param clipped_x0 The number of pixels to clip from the left side
+ * \param clipped_y0 The number of pixels to clip from the bottom side
+ * \param clipped_x1 The number of pixels to clip from the right side
+ * \param clipped_y1 The number of pixels to clip from the top side
+ *
+ * \return false if we clip everything away, true otherwise
+ */
+static inline bool
+compute_pixels_clipped(float x0, float y0, float x1, float y1,
+ float min_x, float min_y, float max_x, float max_y,
+ float *clipped_x0, float *clipped_y0,
+ float *clipped_x1, float *clipped_y1)
+{
+ /* If we are going to clip everything away, stop. */
+ if (!(min_x <= max_x &&
+ min_y <= max_y &&
+ x0 <= max_x &&
+ y0 <= max_y &&
+ min_x <= x1 &&
+ min_y <= y1 &&
+ x0 <= x1 &&
+ y0 <= y1)) {
+ return false;
+ }
+
+ if (x0 < min_x)
+ *clipped_x0 = min_x - x0;
+ else
+ *clipped_x0 = 0;
+ if (max_x < x1)
+ *clipped_x1 = x1 - max_x;
+ else
+ *clipped_x1 = 0;
+
+ if (y0 < min_y)
+ *clipped_y0 = min_y - y0;
+ else
+ *clipped_y0 = 0;
+ if (max_y < y1)
+ *clipped_y1 = y1 - max_y;
+ else
+ *clipped_y1 = 0;
+
+ return true;
+}
+
+/**
+ * Clips a coordinate (left, right, top or bottom) for the src or dst rect
+ * (whichever requires the largest clip) and adjusts the coordinate
+ * for the other rect accordingly.
+ *
+ * \param mirror true if mirroring is required
+ * \param src the source rect coordinate (for example src_x0)
+ * \param dst0 the dst rect coordinate (for example dst_x0)
+ * \param dst1 the opposite dst rect coordinate (for example dst_x1)
+ * \param clipped_dst0 number of pixels to clip from the dst coordinate
+ * \param clipped_dst1 number of pixels to clip from the opposite dst coordinate
+ * \param scale the src vs dst scale involved for that coordinate
+ * \param is_left_or_bottom true if we are clipping the left or bottom sides
+ * of the rect.
+ */
+static void
+clip_coordinates(bool mirror,
+ float *src, float *dst0, float *dst1,
+ float clipped_dst0,
+ float clipped_dst1,
+ float scale,
+ bool is_left_or_bottom)
+{
+ /* When clipping we need to add or subtract pixels from the original
+ * coordinates depending on whether we are acting on the left/bottom
+ * or right/top sides of the rect respectively. We assume we have to
+ * add them in the code below, and multiply by -1 when we should
+ * subtract.
+ */
+ int mult = is_left_or_bottom ? 1 : -1;
+
+ if (!mirror) {
+ *dst0 += clipped_dst0 * mult;
+ *src += clipped_dst0 * scale * mult;
+ } else {
+ *dst1 -= clipped_dst1 * mult;
+ *src += clipped_dst1 * scale * mult;
+ }
+}
+
+/**
+ * Apply a scissor rectangle to blit coordinates.
+ *
+ * Returns true if the blit was entirely scissored away.
+ */
+static bool
+apply_blit_scissor(const struct pipe_scissor_state *scissor,
+ float *src_x0, float *src_y0,
+ float *src_x1, float *src_y1,
+ float *dst_x0, float *dst_y0,
+ float *dst_x1, float *dst_y1,
+ bool mirror_x, bool mirror_y)
+{
+ float clip_dst_x0, clip_dst_x1, clip_dst_y0, clip_dst_y1;
+
+ /* Compute number of pixels to scissor away. */
+ if (!compute_pixels_clipped(*dst_x0, *dst_y0, *dst_x1, *dst_y1,
+ scissor->minx, scissor->miny,
+ scissor->maxx, scissor->maxy,
+ &clip_dst_x0, &clip_dst_y0,
+ &clip_dst_x1, &clip_dst_y1))
+ return true;
+
+ // XXX: comments assume source clipping, which we don't do
+
+ /* When clipping any of the two rects we need to adjust the coordinates
+ * in the other rect considering the scaling factor involved. To obtain
+ * the best precision we want to make sure that we only clip once per
+ * side to avoid accumulating errors due to the scaling adjustment.
+ *
+ * For example, if src_x0 and dst_x0 need both to be clipped we want to
+ * avoid the situation where we clip src_x0 first, then adjust dst_x0
+ * accordingly but then we realize that the resulting dst_x0 still needs
+ * to be clipped, so we clip dst_x0 and adjust src_x0 again. Because we are
+ * applying scaling factors to adjust the coordinates in each clipping
+ * pass we lose some precision and that can affect the results of the
+ * blorp blit operation slightly. What we want to do here is detect the
+ * rect that we should clip first for each side so that when we adjust
+ * the other rect we ensure the resulting coordinate does not need to be
+ * clipped again.
+ *
+ * The code below implements this by comparing the number of pixels that
+ * we need to clip for each side of both rects considering the scales
+ * involved. For example, clip_src_x0 represents the number of pixels
+ * to be clipped for the src rect's left side, so if clip_src_x0 = 5,
+ * clip_dst_x0 = 4 and scale_x = 2 it means that we are clipping more
+ * from the dst rect so we should clip dst_x0 only and adjust src_x0.
+ * This is because clipping 4 pixels in the dst is equivalent to
+ * clipping 4 * 2 = 8 > 5 in the src.
+ */
+
+ if (*src_x0 == *src_x1 || *src_y0 == *src_y1
+ || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1)
+ return true;
+
+ float scale_x = (float) (*src_x1 - *src_x0) / (*dst_x1 - *dst_x0);
+ float scale_y = (float) (*src_y1 - *src_y0) / (*dst_y1 - *dst_y0);
+
+ /* Clip left side */
+ clip_coordinates(mirror_x, src_x0, dst_x0, dst_x1,
+ clip_dst_x0, clip_dst_x1, scale_x, true);
+
+ /* Clip right side */
+ clip_coordinates(mirror_x, src_x1, dst_x1, dst_x0,
+ clip_dst_x1, clip_dst_x0, scale_x, false);
+
+ /* Clip bottom side */
+ clip_coordinates(mirror_y, src_y0, dst_y0, dst_y1,
+ clip_dst_y0, clip_dst_y1, scale_y, true);
+
+ /* Clip top side */
+ clip_coordinates(mirror_y, src_y1, dst_y1, dst_y0,
+ clip_dst_y1, clip_dst_y0, scale_y, false);
+
+ /* Check for invalid bounds: we can't blit zero-sized rectangles. */
+ return *src_x0 == *src_x1 || *src_y0 == *src_y1
+ || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1;
+}
+
+void
+crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl,
+ struct isl_device *isl_dev,
+ struct blorp_surf *surf,
+ struct pipe_resource *p_res,
+ enum isl_aux_usage aux_usage,
+ unsigned level,
+ bool is_render_target)
+{
+ struct crocus_resource *res = (void *) p_res;
+
+ assert(!crocus_resource_unfinished_aux_import(res));
+
+ if (isl_aux_usage_has_hiz(aux_usage) &&
+ !crocus_resource_level_has_hiz(res, level))
+ aux_usage = ISL_AUX_USAGE_NONE;
+
+ *surf = (struct blorp_surf) {
+ .surf = &res->surf,
+ .addr = (struct blorp_address) {
+ .buffer = res->bo,
+ .offset = res->offset,
+ .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0,
+ .mocs = crocus_mocs(res->bo, isl_dev),
+ },
+ .aux_usage = aux_usage,
+ };
+
+ if (aux_usage != ISL_AUX_USAGE_NONE) {
+ surf->aux_surf = &res->aux.surf;
+ surf->aux_addr = (struct blorp_address) {
+ .buffer = res->aux.bo,
+ .offset = res->aux.offset,
+ .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0,
+ .mocs = crocus_mocs(res->bo, isl_dev),
+ };
+ surf->clear_color =
+ crocus_resource_get_clear_color(res);
+ }
+}
+
+static void
+tex_cache_flush_hack(struct crocus_batch *batch,
+ enum isl_format view_format,
+ enum isl_format surf_format)
+{
+ /* The WaSamplerCacheFlushBetweenRedescribedSurfaceReads workaround says:
+ *
+ * "Currently Sampler assumes that a surface would not have two
+ * different format associate with it. It will not properly cache
+ * the different views in the MT cache, causing a data corruption."
+ *
+ * We may need to handle this for texture views in general someday, but
+ * for now we handle it here, as it hurts copies and blits particularly
+ * badly because they often reinterpret formats.
+ *
+ * If the BO hasn't been referenced in this batch yet, we assume that the
+ * texture cache doesn't contain any relevant data and doesn't need flushing.
+ *
+ * Icelake (Gen11+) claims to fix this issue, but seems to still have
+ * issues with ASTC formats.
+ */
+ bool need_flush = view_format != surf_format;
+ if (!need_flush)
+ return;
+
+ const char *reason =
+ "workaround: WaSamplerCacheFlushBetweenRedescribedSurfaceReads";
+
+ crocus_emit_pipe_control_flush(batch, reason, PIPE_CONTROL_CS_STALL);
+ crocus_emit_pipe_control_flush(batch, reason,
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+}
+
+static struct crocus_resource *
+crocus_resource_for_aspect(const struct intel_device_info *devinfo,
+ struct pipe_resource *p_res, unsigned pipe_mask)
+{
+ if (pipe_mask == PIPE_MASK_S) {
+ struct crocus_resource *junk, *s_res;
+ crocus_get_depth_stencil_resources(devinfo, p_res, &junk, &s_res);
+ return s_res;
+ } else {
+ return (struct crocus_resource *)p_res;
+ }
+}
+
+static enum pipe_format
+pipe_format_for_aspect(enum pipe_format format, unsigned pipe_mask)
+{
+ if (pipe_mask == PIPE_MASK_S) {
+ return util_format_stencil_only(format);
+ } else if (pipe_mask == PIPE_MASK_Z) {
+ return util_format_get_depth_only(format);
+ } else {
+ return format;
+ }
+}
+
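+/* Fall back to the gallium util blitter; drop the alpha write mask when the
+ * destination format has no alpha channel.
+ */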
+static void
+crocus_u_blitter(struct crocus_context *ice,
+ const struct pipe_blit_info *info)
+{
+ struct pipe_blit_info dinfo = *info;
+ if (!util_format_has_alpha(dinfo.dst.resource->format))
+ dinfo.mask &= ~PIPE_MASK_A;
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+ util_blitter_blit(ice->blitter, &dinfo);
+}
+
+/**
+ * The pipe->blit() driver hook.
+ *
+ * This performs a blit between two surfaces, which copies data but may
+ * also perform format conversion, scaling, flipping, and so on.
+ */
+static void
+crocus_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ enum blorp_batch_flags blorp_flags = 0;
+
+ /* We don't support color masking. */
+ assert((info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA ||
+ (info->mask & PIPE_MASK_RGBA) == 0);
+
+ if (info->render_condition_enable)
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (devinfo->ver <= 5) {
+ if (!screen->vtbl.blit_blt(batch, info)) {
+
+ if (!util_format_is_depth_or_stencil(info->src.resource->format) &&
+ info->dst.resource->target != PIPE_TEXTURE_3D)
+ goto use_blorp;
+
+ if (!util_blitter_is_blit_supported(ice->blitter, info)) {
+ if (util_format_is_depth_or_stencil(info->src.resource->format)) {
+
+ struct pipe_blit_info depth_blit = *info;
+ depth_blit.mask = PIPE_MASK_Z;
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+ util_blitter_blit(ice->blitter, &depth_blit);
+
+ struct pipe_surface *dst_view, dst_templ;
+ util_blitter_default_dst_texture(&dst_templ, info->dst.resource, info->dst.level, info->dst.box.z);
+ dst_view = ctx->create_surface(ctx, info->dst.resource, &dst_templ);
+
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+
+ util_blitter_clear_depth_stencil(ice->blitter, dst_view, PIPE_CLEAR_STENCIL,
+ 0, 0, info->dst.box.x, info->dst.box.y,
+ info->dst.box.width, info->dst.box.height);
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+ util_blitter_stencil_fallback(ice->blitter,
+ info->dst.resource,
+ info->dst.level,
+ &info->dst.box,
+ info->src.resource,
+ info->src.level,
+ &info->src.box, NULL);
+
+ }
+ return;
+ }
+
+ crocus_u_blitter(ice, info);
+ }
+ return;
+ }
+
+ if (devinfo->ver == 6) {
+ if (info->src.resource->target == PIPE_TEXTURE_3D &&
+ info->dst.resource->target == PIPE_TEXTURE_3D) {
+ crocus_u_blitter(ice, info);
+ return;
+ }
+ }
+
+use_blorp:
+ if (info->render_condition_enable) {
+ if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT)
+ blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
+ }
+
+ float src_x0 = info->src.box.x;
+ float src_x1 = info->src.box.x + info->src.box.width;
+ float src_y0 = info->src.box.y;
+ float src_y1 = info->src.box.y + info->src.box.height;
+ float dst_x0 = info->dst.box.x;
+ float dst_x1 = info->dst.box.x + info->dst.box.width;
+ float dst_y0 = info->dst.box.y;
+ float dst_y1 = info->dst.box.y + info->dst.box.height;
+ bool mirror_x = apply_mirror(&src_x0, &src_x1);
+ bool mirror_y = apply_mirror(&src_y0, &src_y1);
+ enum blorp_filter filter;
+
+ if (info->scissor_enable) {
+ bool noop = apply_blit_scissor(&info->scissor,
+ &src_x0, &src_y0, &src_x1, &src_y1,
+ &dst_x0, &dst_y0, &dst_x1, &dst_y1,
+ mirror_x, mirror_y);
+ if (noop)
+ return;
+ }
+
+ if (abs(info->dst.box.width) == abs(info->src.box.width) &&
+ abs(info->dst.box.height) == abs(info->src.box.height)) {
+ if (info->src.resource->nr_samples > 1 &&
+ info->dst.resource->nr_samples <= 1) {
+ /* The OpenGL ES 3.2 specification, section 16.2.1, says:
+ *
+ * "If the read framebuffer is multisampled (its effective
+ * value of SAMPLE_BUFFERS is one) and the draw framebuffer
+ * is not (its value of SAMPLE_BUFFERS is zero), the samples
+ * corresponding to each pixel location in the source are
+ * converted to a single sample before being written to the
+ * destination. The filter parameter is ignored. If the
+ * source formats are integer types or stencil values, a
+ * single sample’s value is selected for each pixel. If the
+ * source formats are floating-point or normalized types,
+ * the sample values for each pixel are resolved in an
+ * implementation-dependent manner. If the source formats
+ * are depth values, sample values are resolved in an
+ * implementation-dependent manner where the result will be
+ * between the minimum and maximum depth values in the pixel."
+ *
+ * When selecting a single sample, we always choose sample 0.
+ */
+ if (util_format_is_depth_or_stencil(info->src.format) ||
+ util_format_is_pure_integer(info->src.format)) {
+ filter = BLORP_FILTER_SAMPLE_0;
+ } else {
+ filter = BLORP_FILTER_AVERAGE;
+ }
+ } else {
+ /* The OpenGL 4.6 specification, section 18.3.1, says:
+ *
+ * "If the source and destination dimensions are identical,
+ * no filtering is applied."
+ *
+ * Using BLORP_FILTER_NONE will also handle the upsample case by
+ * replicating the one value in the source to all values in the
+ * destination.
+ */
+ filter = BLORP_FILTER_NONE;
+ }
+ } else if (info->filter == PIPE_TEX_FILTER_LINEAR) {
+ filter = BLORP_FILTER_BILINEAR;
+ } else {
+ filter = BLORP_FILTER_NEAREST;
+ }
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+ float src_z_step = (float)info->src.box.depth / (float)info->dst.box.depth;
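+ /* e.g. (illustrative sizes) blitting a 16-slice source box into an 8-slice
+ * destination box gives src_z_step = 2, so successive destination slices
+ * step through the source two slices at a time.
+ */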
+
+ /* There is no interpolation to the pixel center during rendering, so
+ * add the 0.5 offset ourselves here.
+ */
+ float depth_center_offset = 0;
+ if (info->src.resource->target == PIPE_TEXTURE_3D)
+ depth_center_offset = 0.5 / info->dst.box.depth * info->src.box.depth;
+
+ /* Perform a blit for each aspect requested by the caller. PIPE_MASK_R is
+ * used to represent the color aspect.
+ */
+ unsigned aspect_mask = info->mask & (PIPE_MASK_R | PIPE_MASK_ZS);
+ while (aspect_mask) {
+ unsigned aspect = 1 << u_bit_scan(&aspect_mask);
+
+ struct crocus_resource *src_res =
+ crocus_resource_for_aspect(devinfo, info->src.resource, aspect);
+ struct crocus_resource *dst_res =
+ crocus_resource_for_aspect(devinfo, info->dst.resource, aspect);
+
+ enum pipe_format src_pfmt =
+ pipe_format_for_aspect(info->src.format, aspect);
+ enum pipe_format dst_pfmt =
+ pipe_format_for_aspect(info->dst.format, aspect);
+
+ if (crocus_resource_unfinished_aux_import(src_res))
+ crocus_resource_finish_aux_import(ctx->screen, src_res);
+ if (crocus_resource_unfinished_aux_import(dst_res))
+ crocus_resource_finish_aux_import(ctx->screen, dst_res);
+
+ struct crocus_format_info src_fmt =
+ crocus_format_for_usage(devinfo, src_pfmt, ISL_SURF_USAGE_TEXTURE_BIT);
+ enum isl_aux_usage src_aux_usage =
+ crocus_resource_texture_aux_usage(src_res);
+
+ crocus_resource_prepare_texture(ice, src_res, src_fmt.fmt,
+ info->src.level, 1, info->src.box.z,
+ info->src.box.depth);
+ // crocus_emit_buffer_barrier_for(batch, src_res->bo,
+ // CROCUS_DOMAIN_OTHER_READ);
+
+ struct crocus_format_info dst_fmt =
+ crocus_format_for_usage(devinfo, dst_pfmt,
+ ISL_SURF_USAGE_RENDER_TARGET_BIT);
+ enum isl_aux_usage dst_aux_usage =
+ crocus_resource_render_aux_usage(ice, dst_res, info->dst.level,
+ dst_fmt.fmt, false);
+
+ struct blorp_surf src_surf, dst_surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf,
+ &src_res->base, src_aux_usage,
+ info->src.level, false);
+ crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf,
+ &dst_res->base, dst_aux_usage,
+ info->dst.level, true);
+
+ crocus_resource_prepare_render(ice, dst_res, info->dst.level,
+ info->dst.box.z, info->dst.box.depth,
+ dst_aux_usage);
+ // crocus_emit_buffer_barrier_for(batch, dst_res->bo,
+ // CROCUS_DOMAIN_RENDER_WRITE);
+
+ if (crocus_batch_references(batch, src_res->bo))
+ tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format);
+
+ if (dst_res->base.target == PIPE_BUFFER) {
+ util_range_add(&dst_res->base, &dst_res->valid_buffer_range,
+ dst_x0, dst_x1);
+ }
+
+ struct isl_swizzle src_swiz = pipe_to_isl_swizzles(src_fmt.swizzles);
+ struct isl_swizzle dst_swiz = pipe_to_isl_swizzles(dst_fmt.swizzles);
+
+ for (int slice = 0; slice < info->dst.box.depth; slice++) {
+ unsigned dst_z = info->dst.box.z + slice;
+ float src_z = info->src.box.z + slice * src_z_step +
+ depth_center_offset;
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ blorp_blit(&blorp_batch,
+ &src_surf, info->src.level, src_z,
+ src_fmt.fmt, src_swiz,
+ &dst_surf, info->dst.level, dst_z,
+ dst_fmt.fmt, dst_swiz,
+ src_x0, src_y0, src_x1, src_y1,
+ dst_x0, dst_y0, dst_x1, dst_y1,
+ filter, mirror_x, mirror_y);
+
+ }
+
+ tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format);
+
+ crocus_resource_finish_render(ice, dst_res, info->dst.level,
+ info->dst.box.z, info->dst.box.depth,
+ dst_aux_usage);
+ }
+
+ blorp_batch_finish(&blorp_batch);
+
+ crocus_flush_and_dirty_for_history(ice, batch, (struct crocus_resource *)
+ info->dst.resource,
+ PIPE_CONTROL_RENDER_TARGET_FLUSH,
+ "cache history: post-blit");
+}
+
+static void
+get_copy_region_aux_settings(struct crocus_resource *res,
+ enum isl_aux_usage *out_aux_usage,
+ bool is_render_target)
+{
+ switch (res->aux.usage) {
+ case ISL_AUX_USAGE_MCS:
+ /* A stencil resolve operation must be performed before the resource can
+ * be copied or accessed by the CPU (see HSD 1209978162).
+ */
+ if (is_render_target && isl_surf_usage_is_stencil(res->surf.usage)) {
+ *out_aux_usage = ISL_AUX_USAGE_NONE;
+ } else {
+ *out_aux_usage = res->aux.usage;
+ }
+ break;
+ default:
+ *out_aux_usage = ISL_AUX_USAGE_NONE;
+ break;
+ }
+}
+
+/**
+ * Perform a GPU-based raw memory copy between compatible view classes.
+ *
+ * Does not perform any flushing - the new data may still be left in the
+ * render cache, and old data may remain in other caches.
+ *
+ * Wraps blorp_copy() and blorp_buffer_copy().
+ */
+void
+crocus_copy_region(struct blorp_context *blorp,
+ struct crocus_batch *batch,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct blorp_batch blorp_batch;
+ struct crocus_context *ice = blorp->driver_ctx;
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_resource *src_res = (void *) src;
+ struct crocus_resource *dst_res = (void *) dst;
+
+ if (devinfo->ver <= 5) {
+ if (screen->vtbl.copy_region_blt(batch, dst_res,
+ dst_level, dstx, dsty, dstz,
+ src_res, src_level, src_box))
+ return;
+ }
+ enum isl_aux_usage src_aux_usage, dst_aux_usage;
+ get_copy_region_aux_settings(src_res, &src_aux_usage,
+ false);
+ get_copy_region_aux_settings(dst_res, &dst_aux_usage,
+ true);
+
+ if (crocus_batch_references(batch, src_res->bo))
+ tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format);
+
+ if (dst->target == PIPE_BUFFER)
+ util_range_add(&dst_res->base, &dst_res->valid_buffer_range, dstx, dstx + src_box->width);
+
+ if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+ struct blorp_address src_addr = {
+ .buffer = crocus_resource_bo(src), .offset = src_box->x,
+ };
+ struct blorp_address dst_addr = {
+ .buffer = crocus_resource_bo(dst), .offset = dstx,
+ .reloc_flags = EXEC_OBJECT_WRITE,
+ };
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+ blorp_buffer_copy(&blorp_batch, src_addr, dst_addr, src_box->width);
+ blorp_batch_finish(&blorp_batch);
+ } else {
+ // XXX: what about one surface being a buffer and not the other?
+
+ struct blorp_surf src_surf, dst_surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf,
+ src, src_aux_usage, src_level, false);
+ crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf,
+ dst, dst_aux_usage, dst_level, true);
+
+ crocus_resource_prepare_access(ice, src_res, src_level, 1,
+ src_box->z, src_box->depth,
+ src_aux_usage, false);
+ crocus_resource_prepare_access(ice, dst_res, dst_level, 1,
+ dstz, src_box->depth,
+ dst_aux_usage, false);
+
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+
+ for (int slice = 0; slice < src_box->depth; slice++) {
+ crocus_batch_maybe_flush(batch, 1500);
+
+ blorp_copy(&blorp_batch, &src_surf, src_level, src_box->z + slice,
+ &dst_surf, dst_level, dstz + slice,
+ src_box->x, src_box->y, dstx, dsty,
+ src_box->width, src_box->height);
+ }
+ blorp_batch_finish(&blorp_batch);
+
+ crocus_resource_finish_write(ice, dst_res, dst_level, dstz,
+ src_box->depth, dst_aux_usage);
+ }
+
+ tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format);
+}
+
+static struct crocus_batch *
+get_preferred_batch(struct crocus_context *ice, struct crocus_bo *bo)
+{
+ /* If the compute batch is already using this buffer, we'd prefer to
+ * continue queueing in the compute batch.
+ */
+ if (crocus_batch_references(&ice->batches[CROCUS_BATCH_COMPUTE], bo))
+ return &ice->batches[CROCUS_BATCH_COMPUTE];
+
+ /* Otherwise default to the render batch. */
+ return &ice->batches[CROCUS_BATCH_RENDER];
+}
+
+
+/**
+ * The pipe->resource_copy_region() driver hook.
+ *
+ * This implements ARB_copy_image semantics - a raw memory copy between
+ * compatible view classes.
+ */
+static void
+crocus_resource_copy_region(struct pipe_context *ctx,
+ struct pipe_resource *p_dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *p_src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_resource *src = (void *) p_src;
+ struct crocus_resource *dst = (void *) p_dst;
+
+ if (crocus_resource_unfinished_aux_import(src))
+ crocus_resource_finish_aux_import(ctx->screen, src);
+ if (crocus_resource_unfinished_aux_import(dst))
+ crocus_resource_finish_aux_import(ctx->screen, dst);
+
+ /* Use MI_COPY_MEM_MEM for tiny buffer copies (at most 16 bytes, a multiple of 4). */
+ if (p_src->target == PIPE_BUFFER && p_dst->target == PIPE_BUFFER &&
+ (src_box->width % 4 == 0) && src_box->width <= 16 &&
+ screen->vtbl.copy_mem_mem) {
+ struct crocus_bo *dst_bo = crocus_resource_bo(p_dst);
+ batch = get_preferred_batch(ice, dst_bo);
+ crocus_batch_maybe_flush(batch, 24 + 5 * (src_box->width / 4));
+ crocus_emit_pipe_control_flush(batch,
+ "stall for MI_COPY_MEM_MEM copy_region",
+ PIPE_CONTROL_CS_STALL);
+ screen->vtbl.copy_mem_mem(batch, dst_bo, dstx, crocus_resource_bo(p_src),
+ src_box->x, src_box->width);
+ return;
+ }
+
+ if (devinfo->ver < 6 && util_format_is_depth_or_stencil(p_dst->format)) {
+ util_resource_copy_region(ctx, p_dst, dst_level, dstx, dsty, dstz,
+ p_src, src_level, src_box);
+ return;
+ }
+ crocus_copy_region(&ice->blorp, batch, p_dst, dst_level, dstx, dsty, dstz,
+ p_src, src_level, src_box);
+
+ if (util_format_is_depth_and_stencil(p_dst->format) &&
+ util_format_has_stencil(util_format_description(p_src->format)) &&
+ devinfo->ver >= 6) {
+ struct crocus_resource *junk, *s_src_res, *s_dst_res;
+ crocus_get_depth_stencil_resources(devinfo, p_src, &junk, &s_src_res);
+ crocus_get_depth_stencil_resources(devinfo, p_dst, &junk, &s_dst_res);
+
+ crocus_copy_region(&ice->blorp, batch, &s_dst_res->base, dst_level, dstx,
+ dsty, dstz, &s_src_res->base, src_level, src_box);
+ }
+
+ crocus_flush_and_dirty_for_history(ice, batch, dst,
+ PIPE_CONTROL_RENDER_TARGET_FLUSH,
+ "cache history: post copy_region");
+}
+
+void
+crocus_init_blit_functions(struct pipe_context *ctx)
+{
+ ctx->blit = crocus_blit;
+ ctx->resource_copy_region = crocus_resource_copy_region;
+}
diff --git a/src/gallium/drivers/crocus/crocus_blorp.c b/src/gallium/drivers/crocus/crocus_blorp.c
new file mode 100644
index 00000000000..75f0078d535
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_blorp.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_blorp.c
+ *
+ * ============================= GENXML CODE =============================
+ * [This file is compiled once per generation.]
+ * =======================================================================
+ *
+ * GenX specific code for working with BLORP (blitting, resolves, clears
+ * on the 3D engine). This provides the driver-specific hooks needed to
+ * implement the BLORP API.
+ *
+ * See crocus_blit.c, crocus_clear.c, and so on.
+ */
+
+#include <assert.h>
+
+#include "crocus_batch.h"
+#include "crocus_resource.h"
+#include "crocus_context.h"
+
+#include "util/u_upload_mgr.h"
+#include "intel/common/intel_l3_config.h"
+
+#include "blorp/blorp_genX_exec.h"
+
+#if GFX_VER <= 5
+#include "gen4_blorp_exec.h"
+#endif
+
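+/* Allocate "size" bytes of space in the batch's dynamic state buffer, aligned
+ * to "alignment".  Returns a CPU pointer to the new space and stores its
+ * offset in *out_offset; when out_bo is non-NULL the backing BO is returned
+ * as well.  A minimal usage sketch (names as used in this file):
+ *
+ *    uint32_t offset;
+ *    uint32_t *map = stream_state(batch, 64, 32, &offset, NULL);
+ */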
+static uint32_t *
+stream_state(struct crocus_batch *batch,
+ unsigned size,
+ unsigned alignment,
+ uint32_t *out_offset,
+ struct crocus_bo **out_bo)
+{
+ uint32_t offset = ALIGN(batch->state.used, alignment);
+
+ if (offset + size >= STATE_SZ && !batch->no_wrap) {
+ crocus_batch_flush(batch);
+ offset = ALIGN(batch->state.used, alignment);
+ } else if (offset + size >= batch->state.bo->size) {
+ const unsigned new_size =
+ MIN2(batch->state.bo->size + batch->state.bo->size / 2,
+ MAX_STATE_SIZE);
+ crocus_grow_buffer(batch, true, batch->state.used, new_size);
+ assert(offset + size < batch->state.bo->size);
+ }
+
+ crocus_record_state_size(batch->state_sizes, offset, size);
+
+ batch->state.used = offset + size;
+ *out_offset = offset;
+
+ /* If the caller has asked for a BO, we leave them the responsibility of
+ * adding bo->gtt_offset (say, by handing an address to genxml). If not,
+ * we assume they want the offset from a base address.
+ */
+ if (out_bo)
+ *out_bo = batch->state.bo;
+
+ return (uint32_t *)batch->state.map + (offset >> 2);
+}
+
+static void *
+blorp_emit_dwords(struct blorp_batch *blorp_batch, unsigned n)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ return crocus_get_command_space(batch, n * sizeof(uint32_t));
+}
+
+static uint64_t
+blorp_emit_reloc(struct blorp_batch *blorp_batch, UNUSED void *location,
+ struct blorp_address addr, uint32_t delta)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ uint32_t offset;
+
+ if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) {
+ offset = (char *)location - (char *)batch->state.map;
+ return crocus_state_reloc(batch, offset,
+ addr.buffer, addr.offset + delta,
+ addr.reloc_flags);
+ }
+
+ assert(!crocus_ptr_in_state_buffer(batch, location));
+
+ offset = (char *)location - (char *)batch->command.map;
+ return crocus_command_reloc(batch, offset,
+ addr.buffer, addr.offset + delta,
+ addr.reloc_flags);
+}
+
+static void
+blorp_surface_reloc(struct blorp_batch *blorp_batch, uint32_t ss_offset,
+ struct blorp_address addr, uint32_t delta)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ struct crocus_bo *bo = addr.buffer;
+
+ uint64_t reloc_val =
+ crocus_state_reloc(batch, ss_offset, bo, addr.offset + delta,
+ addr.reloc_flags);
+
+ void *reloc_ptr = (void *)batch->state.map + ss_offset;
+ *(uint32_t *)reloc_ptr = reloc_val;
+}
+
+static uint64_t
+blorp_get_surface_address(struct blorp_batch *blorp_batch,
+ struct blorp_address addr)
+{
+ /* We'll let blorp_surface_reloc write the address. */
+ return 0ull;
+}
+
+#if GFX_VER >= 7
+static struct blorp_address
+blorp_get_surface_base_address(struct blorp_batch *blorp_batch)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ return (struct blorp_address) {
+ .buffer = batch->state.bo,
+ .offset = 0
+ };
+}
+#endif
+
+static void *
+blorp_alloc_dynamic_state(struct blorp_batch *blorp_batch,
+ uint32_t size,
+ uint32_t alignment,
+ uint32_t *offset)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ return stream_state(batch, size, alignment, offset, NULL);
+}
+
+static void
+blorp_alloc_binding_table(struct blorp_batch *blorp_batch,
+ unsigned num_entries,
+ unsigned state_size,
+ unsigned state_alignment,
+ uint32_t *bt_offset,
+ uint32_t *surface_offsets,
+ void **surface_maps)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ uint32_t *bt_map = stream_state(batch, num_entries * sizeof(uint32_t), 32,
+ bt_offset, NULL);
+
+ for (unsigned i = 0; i < num_entries; i++) {
+ surface_maps[i] = stream_state(batch,
+ state_size, state_alignment,
+ &(surface_offsets)[i], NULL);
+ bt_map[i] = surface_offsets[i];
+ }
+}
+
+static void *
+blorp_alloc_vertex_buffer(struct blorp_batch *blorp_batch,
+ uint32_t size,
+ struct blorp_address *addr)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ struct crocus_bo *bo;
+ uint32_t offset;
+
+ void *map = stream_state(batch, size, 64,
+ &offset, &bo);
+
+ *addr = (struct blorp_address) {
+ .buffer = bo,
+ .offset = offset,
+ .reloc_flags = RELOC_32BIT,
+#if GFX_VER >= 7
+ .mocs = crocus_mocs(bo, &batch->screen->isl_dev),
+#endif
+ };
+
+ return map;
+}
+
+/**
+ * No-op: crocus targets gfx 4-7, which use 32-bit vertex buffer addresses,
+ * so the 48-bit address transition workaround does not apply here.
+ */
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *blorp_batch,
+ const struct blorp_address *addrs,
+ UNUSED uint32_t *sizes,
+ unsigned num_vbs)
+{
+}
+
+static struct blorp_address
+blorp_get_workaround_address(struct blorp_batch *blorp_batch)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ return (struct blorp_address) {
+ .buffer = batch->ice->workaround_bo,
+ .offset = batch->ice->workaround_offset,
+ };
+}
+
+static void
+blorp_flush_range(UNUSED struct blorp_batch *blorp_batch,
+ UNUSED void *start,
+ UNUSED size_t size)
+{
+ /* All allocated states come from the batch which we will flush before we
+ * submit it. There's nothing for us to do here.
+ */
+}
+
+#if GFX_VER >= 7
+static const struct intel_l3_config *
+blorp_get_l3_config(struct blorp_batch *blorp_batch)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ return batch->screen->l3_config_3d;
+}
+#else /* GFX_VER < 7 */
+static void
+blorp_emit_urb_config(struct blorp_batch *blorp_batch,
+ unsigned vs_entry_size,
+ UNUSED unsigned sf_entry_size)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+#if GFX_VER <= 5
+ batch->screen->vtbl.calculate_urb_fence(batch, 0, vs_entry_size, sf_entry_size);
+#else
+ genX(upload_urb)(batch, vs_entry_size, false, vs_entry_size);
+#endif
+}
+#endif
+
+static void
+crocus_blorp_exec(struct blorp_batch *blorp_batch,
+ const struct blorp_params *params)
+{
+ struct crocus_context *ice = blorp_batch->blorp->driver_ctx;
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ /* Flush the sampler and render caches. We definitely need to flush the
+ * sampler cache so that we get updated contents from the render cache for
+ * the glBlitFramebuffer() source. Also, we are sometimes warned in the
+ * docs to flush the cache between reinterpretations of the same surface
+ * data with different formats, which blorp does for stencil and depth
+ * data.
+ */
+ if (params->src.enabled)
+ crocus_cache_flush_for_read(batch, params->src.addr.buffer);
+ if (params->dst.enabled) {
+ crocus_cache_flush_for_render(batch, params->dst.addr.buffer,
+ params->dst.view.format,
+ params->dst.aux_usage);
+ }
+ if (params->depth.enabled)
+ crocus_cache_flush_for_depth(batch, params->depth.addr.buffer);
+ if (params->stencil.enabled)
+ crocus_cache_flush_for_depth(batch, params->stencil.addr.buffer);
+
+ crocus_require_command_space(batch, 1400);
+ crocus_require_statebuffer_space(batch, 600);
+ batch->no_wrap = true;
+#if GFX_VER == 6
+ /* Emit workaround flushes when we switch from drawing to blorping. */
+ crocus_emit_post_sync_nonzero_flush(batch);
+#endif
+
+#if GFX_VER >= 6
+ crocus_emit_depth_stall_flushes(batch);
+#endif
+
+ blorp_emit(blorp_batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+ rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1;
+ rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1;
+ }
+
+ batch->screen->vtbl.update_surface_base_address(batch);
+ crocus_handle_always_flush_cache(batch);
+
+ batch->contains_draw = true;
+ blorp_exec(blorp_batch, params);
+
+ batch->no_wrap = false;
+ crocus_handle_always_flush_cache(batch);
+
+ /* We've smashed all state compared to what the normal 3D pipeline
+ * rendering tracks for GL.
+ */
+
+ uint64_t skip_bits = (CROCUS_DIRTY_POLYGON_STIPPLE |
+ CROCUS_DIRTY_GEN7_SO_BUFFERS |
+ CROCUS_DIRTY_SO_DECL_LIST |
+ CROCUS_DIRTY_LINE_STIPPLE |
+ CROCUS_ALL_DIRTY_FOR_COMPUTE |
+ CROCUS_DIRTY_GEN6_SCISSOR_RECT |
+ CROCUS_DIRTY_GEN75_VF |
+ CROCUS_DIRTY_SF_CL_VIEWPORT);
+
+ uint64_t skip_stage_bits = (CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_VS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_TCS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_TES |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_GS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_FS |
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS |
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS |
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES |
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS);
+
+ if (!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) {
+ /* BLORP disabled tessellation, that's fine for the next draw */
+ skip_stage_bits |= CROCUS_STAGE_DIRTY_TCS |
+ CROCUS_STAGE_DIRTY_TES |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TES |
+ CROCUS_STAGE_DIRTY_BINDINGS_TCS |
+ CROCUS_STAGE_DIRTY_BINDINGS_TES;
+ }
+
+ if (!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) {
+ /* BLORP disabled geometry shaders, that's fine for the next draw */
+ skip_stage_bits |= CROCUS_STAGE_DIRTY_GS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_GS |
+ CROCUS_STAGE_DIRTY_BINDINGS_GS;
+ }
+
+ /* we can skip flagging CROCUS_DIRTY_DEPTH_BUFFER, if
+ * BLORP_BATCH_NO_EMIT_DEPTH_STENCIL is set.
+ */
+ if (blorp_batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL)
+ skip_bits |= CROCUS_DIRTY_DEPTH_BUFFER;
+
+ if (!params->wm_prog_data)
+ skip_bits |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+
+ ice->state.dirty |= ~skip_bits;
+ ice->state.stage_dirty |= ~skip_stage_bits;
+
+ ice->urb.vsize = 0;
+ ice->urb.gs_present = false;
+ ice->urb.gsize = 0;
+ ice->urb.tess_present = false;
+ ice->urb.hsize = 0;
+ ice->urb.dsize = 0;
+
+ if (params->dst.enabled) {
+ crocus_render_cache_add_bo(batch, params->dst.addr.buffer,
+ params->dst.view.format,
+ params->dst.aux_usage);
+ }
+ if (params->depth.enabled)
+ crocus_depth_cache_add_bo(batch, params->depth.addr.buffer);
+ if (params->stencil.enabled)
+ crocus_depth_cache_add_bo(batch, params->stencil.addr.buffer);
+}
+
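+/* Hook expected by blorp_genX_exec.h; crocus does not record measurements,
+ * so this is intentionally a no-op.
+ */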
+static void
+blorp_measure_start(struct blorp_batch *blorp_batch,
+ const struct blorp_params *params)
+{
+}
+
+void
+genX(init_blorp)(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+
+ blorp_init(&ice->blorp, ice, &screen->isl_dev);
+ ice->blorp.compiler = screen->compiler;
+ ice->blorp.lookup_shader = crocus_blorp_lookup_shader;
+ ice->blorp.upload_shader = crocus_blorp_upload_shader;
+ ice->blorp.exec = crocus_blorp_exec;
+}
diff --git a/src/gallium/drivers/crocus/crocus_blt.c b/src/gallium/drivers/crocus/crocus_blt.c
new file mode 100644
index 00000000000..d27891352bd
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_blt.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* blt command encoding for gen4/5 */
+#include "crocus_context.h"
+
+#include "crocus_genx_macros.h"
+#include "crocus_genx_protos.h"
+#include "crocus_resource.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLIT
+
+#if GFX_VER <= 5
+
+static bool validate_blit_for_blt(struct crocus_batch *batch,
+ const struct pipe_blit_info *info)
+{
+ /* If the source and destination are the same size with no mirroring,
+ * the rectangles are within the size of the texture and there is no
+ * scissor, then we can probably use the blit engine.
+ */
+ if (info->dst.box.width != info->src.box.width ||
+ info->dst.box.height != info->src.box.height)
+ return false;
+
+ if (info->scissor_enable)
+ return false;
+
+ if (info->dst.box.height < 0 || info->src.box.height < 0)
+ return false;
+
+ if (info->dst.box.depth > 1 || info->src.box.depth > 1)
+ return false;
+
+ return true;
+}
+
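+/* Pitch as the blitter expects it: bytes for linear surfaces, DWords for
+ * tiled surfaces (hence the divide by 4 below).
+ */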
+static inline int crocus_resource_blt_pitch(struct crocus_resource *res)
+{
+ int pitch = res->surf.row_pitch_B;
+ if (res->surf.tiling != ISL_TILING_LINEAR)
+ pitch /= 4;
+ return pitch;
+}
+
+static uint32_t
+color_depth_for_cpp(int cpp)
+{
+ switch (cpp) {
+ case 4: return COLOR_DEPTH__32bit;
+ case 2: return COLOR_DEPTH__565;
+ case 1: return COLOR_DEPTH__8bit;
+ default:
+ unreachable("not reached");
+ }
+}
+
+static bool emit_copy_blt(struct crocus_batch *batch,
+ struct crocus_resource *src,
+ struct crocus_resource *dst,
+ unsigned cpp,
+ int32_t src_pitch,
+ unsigned src_offset,
+ int32_t dst_pitch,
+ unsigned dst_offset,
+ uint16_t src_x, uint16_t src_y,
+ uint16_t dst_x, uint16_t dst_y,
+ uint16_t w, uint16_t h)
+{
+ uint32_t src_tile_w, src_tile_h;
+ uint32_t dst_tile_w, dst_tile_h;
+ int dst_y2 = dst_y + h;
+ int dst_x2 = dst_x + w;
+
+ DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+ __func__,
+ src, src_pitch, src_offset, src_x, src_y,
+ dst, dst_pitch, dst_offset, dst_x, dst_y, w, h);
+
+ isl_get_tile_dims(src->surf.tiling, cpp, &src_tile_w, &src_tile_h);
+ isl_get_tile_dims(dst->surf.tiling, cpp, &dst_tile_w, &dst_tile_h);
+
+ /* For Tiled surfaces, the pitch has to be a multiple of the Tile width
+ * (X direction width of the Tile). This is ensured while allocating the
+ * buffer object.
+ */
+ assert(src->surf.tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0);
+ assert(dst->surf.tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0);
+
+ /* For big formats (such as floating point), do the copy using 16 or
+ * 32bpp and multiply the coordinates.
+ */
+ if (cpp > 4) {
+ if (cpp % 4 == 2) {
+ dst_x *= cpp / 2;
+ dst_x2 *= cpp / 2;
+ src_x *= cpp / 2;
+ cpp = 2;
+ } else {
+ assert(cpp % 4 == 0);
+ dst_x *= cpp / 4;
+ dst_x2 *= cpp / 4;
+ src_x *= cpp / 4;
+ cpp = 4;
+ }
+ }
+
+ /* For tiled source and destination, pitch value should be specified
+ * as a number of Dwords.
+ */
+ if (dst->surf.tiling != ISL_TILING_LINEAR)
+ dst_pitch /= 4;
+
+ if (src->surf.tiling != ISL_TILING_LINEAR)
+ src_pitch /= 4;
+
+ assert(cpp <= 4);
+ crocus_emit_cmd(batch, GENX(XY_SRC_COPY_BLT), xyblt) {
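+ /* ROP 0xCC is SRCCOPY: the destination is replaced with the source. */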
+ xyblt.RasterOperation = 0xCC;
+ xyblt.DestinationTilingEnable = dst->surf.tiling != ISL_TILING_LINEAR;
+ xyblt.SourceTilingEnable = src->surf.tiling != ISL_TILING_LINEAR;
+ xyblt.SourceBaseAddress = ro_bo(src->bo, src_offset);
+ xyblt.DestinationBaseAddress = rw_bo(dst->bo, dst_offset);
+ xyblt.ColorDepth = color_depth_for_cpp(cpp);
+ xyblt._32bppByteMask = cpp == 4 ? 0x3 : 0x1;
+ xyblt.DestinationX1Coordinate = dst_x;
+ xyblt.DestinationY1Coordinate = dst_y;
+ xyblt.DestinationX2Coordinate = dst_x2;
+ xyblt.DestinationY2Coordinate = dst_y2;
+ xyblt.DestinationPitch = dst_pitch;
+ xyblt.SourceX1Coordinate = src_x;
+ xyblt.SourceY1Coordinate = src_y;
+ xyblt.SourcePitch = src_pitch;
+ };
+
+ crocus_emit_mi_flush(batch);
+ return true;
+}
+
+static bool crocus_emit_blt(struct crocus_batch *batch,
+ struct crocus_resource *src,
+ struct crocus_resource *dst,
+ unsigned dst_level,
+ unsigned dst_x, unsigned dst_y,
+ unsigned dst_z,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ const struct isl_format_layout *src_fmtl = isl_format_get_layout(src->surf.format);
+ unsigned src_cpp = src_fmtl->bpb / 8;
+ const struct isl_format_layout *dst_fmtl = isl_format_get_layout(dst->surf.format);
+ const unsigned dst_cpp = dst_fmtl->bpb / 8;
+ uint16_t src_x, src_y;
+ uint32_t src_image_x, src_image_y, dst_image_x, dst_image_y;
+ uint32_t src_width = src_box->width, src_height = src_box->height;
+
+ /* gen4/5 can't handle Y tiled blits. */
+ if (src->surf.tiling == ISL_TILING_Y0 || dst->surf.tiling == ISL_TILING_Y0)
+ return false;
+
+ if (src->surf.format != dst->surf.format)
+ return false;
+
+ if (src_cpp != dst_cpp)
+ return false;
+
+ src_x = src_box->x;
+ src_y = src_box->y;
+
+ assert(src_cpp == dst_cpp);
+
+ crocus_resource_get_image_offset(src, src_level, src_box->z, &src_image_x,
+ &src_image_y);
+ if (util_format_is_compressed(src->base.format)) {
+ int bw = util_format_get_blockwidth(src->base.format);
+ int bh = util_format_get_blockheight(src->base.format);
+ assert(src_x % bw == 0);
+ assert(src_y % bh == 0);
+ src_x /= (int)bw;
+ src_y /= (int)bh;
+ src_width = DIV_ROUND_UP(src_width, (int)bw);
+ src_height = DIV_ROUND_UP(src_height, (int)bh);
+ }
+
+ crocus_resource_get_image_offset(dst, dst_level, dst_z, &dst_image_x,
+ &dst_image_y);
+ if (util_format_is_compressed(dst->base.format)) {
+ int bw = util_format_get_blockwidth(dst->base.format);
+ int bh = util_format_get_blockheight(dst->base.format);
+ assert(dst_x % bw == 0);
+ assert(dst_y % bh == 0);
+ dst_x /= (int)bw;
+ dst_y /= (int)bh;
+ }
+ src_x += src_image_x;
+ src_y += src_image_y;
+ dst_x += dst_image_x;
+ dst_y += dst_image_y;
+
+ /* According to the Ivy Bridge PRM, Vol1 Part4, section 1.2.1.2 (Graphics
+ * Data Size Limitations):
+ *
+ * The BLT engine is capable of transferring very large quantities of
+ * graphics data. Any graphics data read from and written to the
+ * destination is permitted to represent a number of pixels that
+ * occupies up to 65,536 scan lines and up to 32,768 bytes per scan line
+ * at the destination. The maximum number of pixels that may be
+ * represented per scan line’s worth of graphics data depends on the
+ * color depth.
+ *
+ * The blitter's pitch is a signed 16-bit integer, but measured in bytes
+ * for linear surfaces and DWords for tiled surfaces. So the maximum
+ * pitch is 32k linear and 128k tiled.
+ */
+ if (crocus_resource_blt_pitch(src) >= 32768 ||
+ crocus_resource_blt_pitch(dst) >= 32768) {
+ return false;
+ }
+
+ /* We need to split the blit into chunks that each fit within the blitter's
+ * restrictions. We can't use a chunk size of 32768 because we need to
+ * ensure that src_tile_x + chunk_size fits. We choose 16384 because it's
+ * a nice round power of two, big enough that performance won't suffer, and
+ * small enough to guarantee everything fits.
+ */
+ const uint32_t max_chunk_size = 16384;
+
+ for (uint32_t chunk_x = 0; chunk_x < src_width; chunk_x += max_chunk_size) {
+ for (uint32_t chunk_y = 0; chunk_y < src_height; chunk_y += max_chunk_size) {
+ const uint32_t chunk_w = MIN2(max_chunk_size, src_width - chunk_x);
+ const uint32_t chunk_h = MIN2(max_chunk_size, src_height - chunk_y);
+
+ ASSERTED uint32_t z_offset_el, array_offset;
+ uint32_t src_offset, src_tile_x, src_tile_y;
+ isl_tiling_get_intratile_offset_el(src->surf.tiling,
+ src_cpp * 8, src->surf.row_pitch_B,
+ src->surf.array_pitch_el_rows,
+ src_x + chunk_x, src_y + chunk_y, 0, 0,
+ &src_offset,
+ &src_tile_x, &src_tile_y,
+ &z_offset_el, &array_offset);
+ assert(z_offset_el == 0);
+ assert(array_offset == 0);
+
+ uint32_t dst_offset, dst_tile_x, dst_tile_y;
+ isl_tiling_get_intratile_offset_el(dst->surf.tiling,
+ dst_cpp * 8, dst->surf.row_pitch_B,
+ dst->surf.array_pitch_el_rows,
+ dst_x + chunk_x, dst_y + chunk_y, 0, 0,
+ &dst_offset,
+ &dst_tile_x, &dst_tile_y,
+ &z_offset_el, &array_offset);
+ assert(z_offset_el == 0);
+ assert(array_offset == 0);
+ if (!emit_copy_blt(batch, src, dst,
+ src_cpp, src->surf.row_pitch_B,
+ src_offset,
+ dst->surf.row_pitch_B, dst_offset,
+ src_tile_x, src_tile_y,
+ dst_tile_x, dst_tile_y,
+ chunk_w, chunk_h)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool crocus_blit_blt(struct crocus_batch *batch,
+ const struct pipe_blit_info *info)
+{
+ if (!validate_blit_for_blt(batch, info))
+ return false;
+
+ return crocus_emit_blt(batch,
+ (struct crocus_resource *)info->src.resource,
+ (struct crocus_resource *)info->dst.resource,
+ info->dst.level,
+ info->dst.box.x,
+ info->dst.box.y,
+ info->dst.box.z,
+ info->src.level,
+ &info->src.box);
+}
+
+
+static bool crocus_copy_region_blt(struct crocus_batch *batch,
+ struct crocus_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct crocus_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ if (dst->base.target == PIPE_BUFFER || src->base.target == PIPE_BUFFER)
+ return false;
+ return crocus_emit_blt(batch,
+ src,
+ dst,
+ dst_level,
+ dstx, dsty, dstz,
+ src_level,
+ src_box);
+}
+#endif
+
+void
+genX(init_blt)(struct crocus_screen *screen)
+{
+#if GFX_VER <= 5
+ screen->vtbl.blit_blt = crocus_blit_blt;
+ screen->vtbl.copy_region_blt = crocus_copy_region_blt;
+#else
+ screen->vtbl.blit_blt = NULL;
+ screen->vtbl.copy_region_blt = NULL;
+#endif
+}
diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.c b/src/gallium/drivers/crocus/crocus_bufmgr.c
new file mode 100644
index 00000000000..caca821cd7e
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_bufmgr.c
@@ -0,0 +1,1689 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_bufmgr.c
+ *
+ * The crocus buffer manager.
+ *
+ * XXX: write better comments
+ * - BOs
+ * - Explain BO cache
+ * - main interface to GEM in the kernel
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xf86drm.h>
+#include <util/u_atomic.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "errno.h"
+#include "common/intel_clflush.h"
+#include "dev/intel_debug.h"
+#include "common/intel_gem.h"
+#include "dev/intel_device_info.h"
+#include "main/macros.h"
+#include "util/debug.h"
+#include "util/macros.h"
+#include "util/hash_table.h"
+#include "util/list.h"
+#include "util/os_file.h"
+#include "util/u_dynarray.h"
+#include "util/vma.h"
+#include "crocus_bufmgr.h"
+#include "crocus_context.h"
+#include "string.h"
+
+#include "drm-uapi/i915_drm.h"
+
+#ifdef HAVE_VALGRIND
+#include <valgrind.h>
+#include <memcheck.h>
+#define VG(x) x
+#else
+#define VG(x)
+#endif
+
+/**
+ * For debugging purposes, this returns a time in seconds.
+ */
+static double
+get_time(void)
+{
+ struct timespec tp;
+
+ clock_gettime(CLOCK_MONOTONIC, &tp);
+
+ return tp.tv_sec + tp.tv_nsec / 1000000000.0;
+}
+
+/* VALGRIND_FREELIKE_BLOCK unfortunately does not actually undo the earlier
+ * VALGRIND_MALLOCLIKE_BLOCK but instead leaves vg convinced the memory is
+ * leaked. All because it does not call VG(cli_free) from its
+ * VG_USERREQ__FREELIKE_BLOCK handler. Instead of treating the memory like
+ * an allocation, we mark it available for use upon mmapping and remove
+ * it upon unmapping.
+ */
+#define VG_DEFINED(ptr, size) VG(VALGRIND_MAKE_MEM_DEFINED(ptr, size))
+#define VG_NOACCESS(ptr, size) VG(VALGRIND_MAKE_MEM_NOACCESS(ptr, size))
+
+#define PAGE_SIZE 4096
+
+#define WARN_ONCE(cond, fmt...) do { \
+ if (unlikely(cond)) { \
+ static bool _warned = false; \
+ if (!_warned) { \
+ fprintf(stderr, "WARNING: "); \
+ fprintf(stderr, fmt); \
+ _warned = true; \
+ } \
+ } \
+} while (0)
+
+#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+
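+/* Atomically add "add" to *v unless it already equals "unless".  Returns
+ * true (nonzero) if *v was equal to "unless", i.e. if no addition happened.
+ */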
+static inline int
+atomic_add_unless(int *v, int add, int unless)
+{
+ int c, old;
+ c = p_atomic_read(v);
+ while (c != unless && (old = p_atomic_cmpxchg(v, c, c + add)) != c)
+ c = old;
+ return c == unless;
+}
+
+struct bo_cache_bucket {
+ /** List of cached BOs. */
+ struct list_head head;
+
+ /** Size of this bucket, in bytes. */
+ uint64_t size;
+};
+
+struct bo_export {
+ /** File descriptor associated with a handle export. */
+ int drm_fd;
+
+ /** GEM handle in drm_fd */
+ uint32_t gem_handle;
+
+ struct list_head link;
+};
+
+struct crocus_bufmgr {
+ /**
+ * List into the list of bufmgr.
+ */
+ struct list_head link;
+
+ uint32_t refcount;
+
+ int fd;
+
+ mtx_t lock;
+
+ /** Array of lists of cached gem objects of power-of-two sizes */
+ struct bo_cache_bucket cache_bucket[14 * 4];
+ int num_buckets;
+ time_t time;
+
+ struct hash_table *name_table;
+ struct hash_table *handle_table;
+
+ /**
+ * List of BOs which we've effectively freed, but are hanging on to
+ * until they're idle before closing and returning the VMA.
+ */
+ struct list_head zombie_list;
+
+ bool has_llc:1;
+ bool has_mmap_offset:1;
+ bool has_tiling_uapi:1;
+ bool bo_reuse:1;
+};
+
+static mtx_t global_bufmgr_list_mutex = _MTX_INITIALIZER_NP;
+static struct list_head global_bufmgr_list = {
+ .next = &global_bufmgr_list,
+ .prev = &global_bufmgr_list,
+};
+
+static int bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode,
+ uint32_t stride);
+
+static void bo_free(struct crocus_bo *bo);
+
+static uint32_t
+key_hash_uint(const void *key)
+{
+ return _mesa_hash_data(key, 4);
+}
+
+static bool
+key_uint_equal(const void *a, const void *b)
+{
+ return *((unsigned *) a) == *((unsigned *) b);
+}
+
+static struct crocus_bo *
+find_and_ref_external_bo(struct hash_table *ht, unsigned int key)
+{
+ struct hash_entry *entry = _mesa_hash_table_search(ht, &key);
+ struct crocus_bo *bo = entry ? entry->data : NULL;
+
+ if (bo) {
+ assert(bo->external);
+ assert(!bo->reusable);
+
+ /* Being non-reusable, the BO cannot be in the cache lists, but it
+ * may be in the zombie list if it had reached zero references, but
+ * we hadn't yet closed it...and then reimported the same BO. If it
+ * is, then remove it since it's now been resurrected.
+ */
+ if (bo->head.prev || bo->head.next)
+ list_del(&bo->head);
+
+ crocus_bo_reference(bo);
+ }
+
+ return bo;
+}
+
+/**
+ * This function finds the correct bucket fit for the input size.
+ * The function works with O(1) complexity when the requested size
+ * was queried instead of iterating the size through all the buckets.
+ */
+static struct bo_cache_bucket *
+bucket_for_size(struct crocus_bufmgr *bufmgr, uint64_t size)
+{
+ /* Calculating the pages and rounding up to the page size. */
+ const unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ /* Row Bucket sizes clz((x-1) | 3) Row Column
+ * in pages stride size
+ * 0: 1 2 3 4 -> 30 30 30 30 4 1
+ * 1: 5 6 7 8 -> 29 29 29 29 4 1
+ * 2: 10 12 14 16 -> 28 28 28 28 8 2
+ * 3: 20 24 28 32 -> 27 27 27 27 16 4
+ */
+ const unsigned row = 30 - __builtin_clz((pages - 1) | 3);
+ const unsigned row_max_pages = 4 << row;
+
+ /* The '& ~2' is the special case for row 1. In row 1, max pages /
+ * 2 is 2, but the previous row maximum is zero (because there is
+ * no previous row). All row maximum sizes are powers of 2, so that
+ * is the only case where that bit will be set.
+ */
+ const unsigned prev_row_max_pages = (row_max_pages / 2) & ~2;
+ int col_size_log2 = row - 1;
+ col_size_log2 += (col_size_log2 < 0);
+
+ const unsigned col = (pages - prev_row_max_pages +
+ ((1 << col_size_log2) - 1)) >> col_size_log2;
+
+ /* Calculating the index based on the row and column. */
+ const unsigned index = (row * 4) + (col - 1);
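+ /* Worked example (illustrative size): a 40-page request gives row = 4 and
+ * col = 1, i.e. index 16, which following the pattern above is the
+ * 40-page bucket (row 4 holds 40/48/56/64 pages).
+ */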
+
+ return (index < bufmgr->num_buckets) ?
+ &bufmgr->cache_bucket[index] : NULL;
+}
+
+
+int
+crocus_bo_busy(struct crocus_bo *bo)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ struct drm_i915_gem_busy busy = { .handle = bo->gem_handle };
+
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
+ if (ret == 0) {
+ bo->idle = !busy.busy;
+ return busy.busy;
+ }
+ return false;
+}
+
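+/* Tell the kernel whether a BO's pages are needed (I915_MADV_WILLNEED) or may
+ * be reclaimed under memory pressure (I915_MADV_DONTNEED).  Returns whether
+ * the backing pages are still resident.
+ */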
+int
+crocus_bo_madvise(struct crocus_bo *bo, int state)
+{
+ struct drm_i915_gem_madvise madv = {
+ .handle = bo->gem_handle,
+ .madv = state,
+ .retained = 1,
+ };
+
+ intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
+
+ return madv.retained;
+}
+
+static struct crocus_bo *
+bo_calloc(void)
+{
+ struct crocus_bo *bo = calloc(1, sizeof(*bo));
+ if (!bo)
+ return NULL;
+
+ list_inithead(&bo->exports);
+ bo->hash = _mesa_hash_pointer(bo);
+ return bo;
+}
+
+static struct crocus_bo *
+alloc_bo_from_cache(struct crocus_bufmgr *bufmgr,
+ struct bo_cache_bucket *bucket,
+ uint32_t alignment,
+ unsigned flags)
+{
+ if (!bucket)
+ return NULL;
+
+ struct crocus_bo *bo = NULL;
+
+ list_for_each_entry_safe(struct crocus_bo, cur, &bucket->head, head) {
+ /* If the last BO in the cache is busy, there are no idle BOs. Bail,
+ * either falling back to a non-matching memzone, or if that fails,
+ * allocating a fresh buffer.
+ */
+ if (crocus_bo_busy(cur))
+ return NULL;
+
+ list_del(&cur->head);
+
+ /* Tell the kernel we need this BO. If it still exists, we're done! */
+ if (crocus_bo_madvise(cur, I915_MADV_WILLNEED)) {
+ bo = cur;
+ break;
+ }
+
+ /* This BO was purged, throw it out and keep looking. */
+ bo_free(cur);
+ }
+
+ if (!bo)
+ return NULL;
+
+ /* Zero the contents if necessary. If this fails, fall back to
+ * allocating a fresh BO, which will always be zeroed by the kernel.
+ */
+ if (flags & BO_ALLOC_ZEROED) {
+ void *map = crocus_bo_map(NULL, bo, MAP_WRITE | MAP_RAW);
+ if (map) {
+ memset(map, 0, bo->size);
+ } else {
+ bo_free(bo);
+ return NULL;
+ }
+ }
+
+ return bo;
+}
+
+static struct crocus_bo *
+alloc_fresh_bo(struct crocus_bufmgr *bufmgr, uint64_t bo_size)
+{
+ struct crocus_bo *bo = bo_calloc();
+ if (!bo)
+ return NULL;
+
+ struct drm_i915_gem_create create = { .size = bo_size };
+
+ /* All new BOs we get from the kernel are zeroed, so we don't need to
+ * worry about that here.
+ */
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CREATE, &create) != 0) {
+ free(bo);
+ return NULL;
+ }
+
+ bo->gem_handle = create.handle;
+ bo->bufmgr = bufmgr;
+ bo->size = bo_size;
+ bo->idle = true;
+ bo->tiling_mode = I915_TILING_NONE;
+ bo->swizzle_mode = I915_BIT_6_SWIZZLE_NONE;
+ bo->stride = 0;
+
+ /* Calling set_domain() will allocate pages for the BO outside of the
+ * struct mutex lock in the kernel, which is more efficient than waiting
+ * to create them during the first execbuf that uses the BO.
+ */
+ struct drm_i915_gem_set_domain sd = {
+ .handle = bo->gem_handle,
+ .read_domains = I915_GEM_DOMAIN_CPU,
+ .write_domain = 0,
+ };
+
+ if (intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) {
+ bo_free(bo);
+ return NULL;
+ }
+
+ return bo;
+}
+
+static struct crocus_bo *
+bo_alloc_internal(struct crocus_bufmgr *bufmgr,
+ const char *name,
+ uint64_t size,
+ uint32_t alignment,
+ unsigned flags,
+ uint32_t tiling_mode,
+ uint32_t stride)
+{
+ struct crocus_bo *bo;
+ unsigned int page_size = getpagesize();
+ struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size);
+
+ /* Round the size up to the bucket size, or if we don't have caching
+ * at this size, a multiple of the page size.
+ */
+ uint64_t bo_size =
+ bucket ? bucket->size : MAX2(ALIGN(size, page_size), page_size);
+
+ mtx_lock(&bufmgr->lock);
+
+ /* Get a buffer out of the cache if available. First, we try to find
+ * one with a matching memory zone so we can avoid reallocating VMA.
+ */
+ bo = alloc_bo_from_cache(bufmgr, bucket, alignment, flags);
+
+ mtx_unlock(&bufmgr->lock);
+
+ if (!bo) {
+ bo = alloc_fresh_bo(bufmgr, bo_size);
+ if (!bo)
+ return NULL;
+ }
+
+ if (bo_set_tiling_internal(bo, tiling_mode, stride))
+ goto err_free;
+
+ bo->name = name;
+ p_atomic_set(&bo->refcount, 1);
+ bo->reusable = bucket && bufmgr->bo_reuse;
+ bo->cache_coherent = bufmgr->has_llc;
+ bo->index = -1;
+ bo->kflags = 0;
+
+ if ((flags & BO_ALLOC_COHERENT) && !bo->cache_coherent) {
+ struct drm_i915_gem_caching arg = {
+ .handle = bo->gem_handle,
+ .caching = 1,
+ };
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) == 0) {
+ bo->cache_coherent = true;
+ bo->reusable = false;
+ }
+ }
+
+ DBG("bo_create: buf %d (%s) %llub\n", bo->gem_handle,
+ bo->name, (unsigned long long) size);
+
+ return bo;
+
+err_free:
+ bo_free(bo);
+ return NULL;
+}
+
+struct crocus_bo *
+crocus_bo_alloc(struct crocus_bufmgr *bufmgr,
+ const char *name,
+ uint64_t size)
+{
+ return bo_alloc_internal(bufmgr, name, size, 1,
+ 0, I915_TILING_NONE, 0);
+}
+
+struct crocus_bo *
+crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr, const char *name,
+ uint64_t size, uint32_t alignment,
+ uint32_t tiling_mode, uint32_t pitch, unsigned flags)
+{
+ return bo_alloc_internal(bufmgr, name, size, alignment,
+ flags, tiling_mode, pitch);
+}
+
+struct crocus_bo *
+crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr, const char *name,
+ void *ptr, size_t size)
+{
+ struct crocus_bo *bo;
+
+ bo = bo_calloc();
+ if (!bo)
+ return NULL;
+
+ struct drm_i915_gem_userptr arg = {
+ .user_ptr = (uintptr_t)ptr,
+ .user_size = size,
+ };
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_USERPTR, &arg))
+ goto err_free;
+ bo->gem_handle = arg.handle;
+
+ /* Check the buffer for validity before we try and use it in a batch */
+ struct drm_i915_gem_set_domain sd = {
+ .handle = bo->gem_handle,
+ .read_domains = I915_GEM_DOMAIN_CPU,
+ };
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd))
+ goto err_close;
+
+ bo->name = name;
+ bo->size = size;
+ bo->map_cpu = ptr;
+
+ bo->bufmgr = bufmgr;
+ bo->kflags = 0;
+
+ p_atomic_set(&bo->refcount, 1);
+ bo->userptr = true;
+ bo->cache_coherent = true;
+ bo->index = -1;
+ bo->idle = true;
+
+ return bo;
+
+err_close:
+ intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &bo->gem_handle);
+err_free:
+ free(bo);
+ return NULL;
+}
+
+/**
+ * Returns a crocus_bo wrapping the given buffer object handle.
+ *
+ * This can be used when one application needs to pass a buffer object
+ * to another.
+ */
+struct crocus_bo *
+crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr,
+ const char *name, unsigned int handle)
+{
+ struct crocus_bo *bo;
+
+ /* At the moment most applications only have a few named BOs.
+ * For instance, in a DRI client only the render buffers passed
+ * between X and the client are named. And since X returns the
+ * alternating names for the front/back buffer a linear search
+ * provides a sufficiently fast match.
+ */
+ mtx_lock(&bufmgr->lock);
+ bo = find_and_ref_external_bo(bufmgr->name_table, handle);
+ if (bo)
+ goto out;
+
+ struct drm_gem_open open_arg = { .name = handle };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_OPEN, &open_arg);
+ if (ret != 0) {
+ DBG("Couldn't reference %s handle 0x%08x: %s\n",
+ name, handle, strerror(errno));
+ bo = NULL;
+ goto out;
+ }
+ /* Now see if someone has used a prime handle to get this
+ * object from the kernel before by looking through the list
+ * again for a matching gem_handle
+ */
+ bo = find_and_ref_external_bo(bufmgr->handle_table, open_arg.handle);
+ if (bo)
+ goto out;
+
+ bo = bo_calloc();
+ if (!bo)
+ goto out;
+
+ p_atomic_set(&bo->refcount, 1);
+
+ bo->size = open_arg.size;
+ bo->gtt_offset = 0;
+ bo->bufmgr = bufmgr;
+ bo->gem_handle = open_arg.handle;
+ bo->name = name;
+ bo->global_name = handle;
+ bo->reusable = false;
+ bo->external = true;
+ bo->kflags = 0;
+
+ _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
+ _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
+
+ struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle };
+ ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling);
+ if (ret != 0)
+ goto err_unref;
+
+ bo->tiling_mode = get_tiling.tiling_mode;
+ bo->swizzle_mode = get_tiling.swizzle_mode;
+ /* XXX stride is unknown */
+ DBG("bo_create_from_handle: %d (%s)\n", handle, bo->name);
+
+out:
+ mtx_unlock(&bufmgr->lock);
+ return bo;
+
+err_unref:
+ bo_free(bo);
+ mtx_unlock(&bufmgr->lock);
+ return NULL;
+}
+
+static void
+bo_close(struct crocus_bo *bo)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (bo->external) {
+ struct hash_entry *entry;
+
+ if (bo->global_name) {
+ entry = _mesa_hash_table_search(bufmgr->name_table, &bo->global_name);
+ _mesa_hash_table_remove(bufmgr->name_table, entry);
+ }
+
+ entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle);
+ _mesa_hash_table_remove(bufmgr->handle_table, entry);
+ }
+
+ /* Close this object */
+ struct drm_gem_close close = { .handle = bo->gem_handle };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &close);
+ if (ret != 0) {
+ DBG("DRM_IOCTL_GEM_CLOSE %d failed (%s): %s\n",
+ bo->gem_handle, bo->name, strerror(errno));
+ }
+
+ free(bo);
+}
+
+static void
+bo_free(struct crocus_bo *bo)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (bo->map_cpu && !bo->userptr) {
+ VG_NOACCESS(bo->map_cpu, bo->size);
+ munmap(bo->map_cpu, bo->size);
+ }
+ if (bo->map_wc) {
+ VG_NOACCESS(bo->map_wc, bo->size);
+ munmap(bo->map_wc, bo->size);
+ }
+ if (bo->map_gtt) {
+ VG_NOACCESS(bo->map_gtt, bo->size);
+ munmap(bo->map_gtt, bo->size);
+ }
+
+ if (bo->idle) {
+ bo_close(bo);
+ } else {
+ /* Defer closing the GEM BO and returning the VMA for reuse until the
+ * BO is idle. Just move it to the dead list for now.
+ */
+ list_addtail(&bo->head, &bufmgr->zombie_list);
+ }
+}
+
+/** Frees all cached buffers significantly older than @time. */
+static void
+cleanup_bo_cache(struct crocus_bufmgr *bufmgr, time_t time)
+{
+ int i;
+
+ if (bufmgr->time == time)
+ return;
+
+ for (i = 0; i < bufmgr->num_buckets; i++) {
+ struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i];
+
+ list_for_each_entry_safe(struct crocus_bo, bo, &bucket->head, head) {
+ if (time - bo->free_time <= 1)
+ break;
+
+ list_del(&bo->head);
+
+ bo_free(bo);
+ }
+ }
+
+ list_for_each_entry_safe(struct crocus_bo, bo, &bufmgr->zombie_list, head) {
+ /* Stop once we reach a busy BO - all others past this point were
+ * freed more recently so are likely also busy.
+ */
+ if (!bo->idle && crocus_bo_busy(bo))
+ break;
+
+ list_del(&bo->head);
+ bo_close(bo);
+ }
+
+ bufmgr->time = time;
+}
+
+static void
+bo_unreference_final(struct crocus_bo *bo, time_t time)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ struct bo_cache_bucket *bucket;
+
+ DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name);
+
+ bucket = NULL;
+ if (bo->reusable)
+ bucket = bucket_for_size(bufmgr, bo->size);
+ /* Put the buffer into our internal cache for reuse if we can. */
+ if (bucket && crocus_bo_madvise(bo, I915_MADV_DONTNEED)) {
+ bo->free_time = time;
+ bo->name = NULL;
+
+ list_addtail(&bo->head, &bucket->head);
+ } else {
+ bo_free(bo);
+ }
+}
+
+void
+crocus_bo_unreference(struct crocus_bo *bo)
+{
+ if (bo == NULL)
+ return;
+
+ assert(p_atomic_read(&bo->refcount) > 0);
+
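+ /* Fast path: atomic_add_unless() drops a reference without taking the
+ * bufmgr lock, but refuses to decrement a refcount of 1. When it refuses
+ * (returns true), this may be the last reference, so the decrement is
+ * redone under the lock so that freeing or caching the BO is serialized.
+ */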
+ if (atomic_add_unless(&bo->refcount, -1, 1)) {
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ struct timespec time;
+
+ clock_gettime(CLOCK_MONOTONIC, &time);
+
+ mtx_lock(&bufmgr->lock);
+
+ if (p_atomic_dec_zero(&bo->refcount)) {
+ bo_unreference_final(bo, time.tv_sec);
+ cleanup_bo_cache(bufmgr, time.tv_sec);
+ }
+
+ mtx_unlock(&bufmgr->lock);
+ }
+}
+
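+/* Waits for the GPU to be done with a BO before it is mapped, and reports a
+ * perf warning through the debug callback when the stall took a measurable
+ * amount of time (more than 0.01 ms).
+ */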
+static void
+bo_wait_with_stall_warning(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo,
+ const char *action)
+{
+ bool busy = dbg && !bo->idle;
+ double elapsed = unlikely(busy) ? -get_time() : 0.0;
+
+ crocus_bo_wait_rendering(bo);
+
+ if (unlikely(busy)) {
+ elapsed += get_time();
+ if (elapsed > 1e-5) /* 0.01ms */ {
+ perf_debug(dbg, "%s a busy \"%s\" BO stalled and took %.03f ms.\n",
+ action, bo->name, elapsed * 1000);
+ }
+ }
+}
+
+static void
+print_flags(unsigned flags)
+{
+ if (flags & MAP_READ)
+ DBG("READ ");
+ if (flags & MAP_WRITE)
+ DBG("WRITE ");
+ if (flags & MAP_ASYNC)
+ DBG("ASYNC ");
+ if (flags & MAP_PERSISTENT)
+ DBG("PERSISTENT ");
+ if (flags & MAP_COHERENT)
+ DBG("COHERENT ");
+ if (flags & MAP_RAW)
+ DBG("RAW ");
+ DBG("\n");
+}
+
+static void *
+crocus_bo_gem_mmap_legacy(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, bool wc)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ struct drm_i915_gem_mmap mmap_arg = {
+ .handle = bo->gem_handle,
+ .size = bo->size,
+ .flags = wc ? I915_MMAP_WC : 0,
+ };
+
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg);
+ if (ret != 0) {
+ DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+ void *map = (void *) (uintptr_t) mmap_arg.addr_ptr;
+
+ return map;
+}
+
+static void *
+crocus_bo_gem_mmap_offset(struct pipe_debug_callback *dbg, struct crocus_bo *bo,
+ bool wc)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ struct drm_i915_gem_mmap_offset mmap_arg = {
+ .handle = bo->gem_handle,
+ .flags = wc ? I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB,
+ };
+
+ /* Get the fake offset back */
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &mmap_arg);
+ if (ret != 0) {
+ DBG("%s:%d: Error preparing buffer %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+
+ /* And map it */
+ void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ bufmgr->fd, mmap_arg.offset);
+ if (map == MAP_FAILED) {
+ DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+
+ return map;
+}
+
+static void *
+crocus_bo_gem_mmap(struct pipe_debug_callback *dbg, struct crocus_bo *bo, bool wc)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (bufmgr->has_mmap_offset)
+ return crocus_bo_gem_mmap_offset(dbg, bo, wc);
+ else
+ return crocus_bo_gem_mmap_legacy(dbg, bo, wc);
+}
+
+static void *
+crocus_bo_map_cpu(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags)
+{
+ /* We disallow CPU maps for writing to non-coherent buffers, as the
+ * CPU map can become invalidated when a batch is flushed out, which
+ * can happen at unpredictable times. You should use WC maps instead.
+ */
+ assert(bo->cache_coherent || !(flags & MAP_WRITE));
+
+ if (!bo->map_cpu) {
+ DBG("crocus_bo_map_cpu: %d (%s)\n", bo->gem_handle, bo->name);
+
+ void *map = crocus_bo_gem_mmap(dbg, bo, false);
+ if (!map) {
+ return NULL;
+ }
+
+ VG_DEFINED(map, bo->size);
+
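+ /* Publish the mapping. If another thread raced us and already installed
+ * one (the cmpxchg sees a non-NULL old value), drop our redundant mapping
+ * and use theirs instead.
+ */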
+ if (p_atomic_cmpxchg(&bo->map_cpu, NULL, map)) {
+ VG_NOACCESS(map, bo->size);
+ munmap(map, bo->size);
+ }
+ }
+ assert(bo->map_cpu);
+
+ DBG("crocus_bo_map_cpu: %d (%s) -> %p, ", bo->gem_handle, bo->name,
+ bo->map_cpu);
+ print_flags(flags);
+
+ if (!(flags & MAP_ASYNC)) {
+ bo_wait_with_stall_warning(dbg, bo, "CPU mapping");
+ }
+
+ if (!bo->cache_coherent && !bo->bufmgr->has_llc) {
+ /* If we're reusing an existing CPU mapping, the CPU caches may
+ * contain stale data from the last time we read from that mapping.
+ * (With the BO cache, it might even be data from a previous buffer!)
+ * Even if it's a brand new mapping, the kernel may have zeroed the
+ * buffer via CPU writes.
+ *
+ * We need to invalidate those cachelines so that we see the latest
+ * contents, and so long as we only read from the CPU mmap we do not
+ * need to write those cachelines back afterwards.
+ *
+ * On LLC, the empirical evidence suggests that writes from the GPU
+ * that bypass the LLC (i.e. for scanout) do *invalidate* the CPU
+ * cachelines. (Other reads, such as the display engine, bypass the
+ * LLC entirely requiring us to keep dirty pixels for the scanout
+ * out of any cache.)
+ */
+ intel_invalidate_range(bo->map_cpu, bo->size);
+ }
+
+ return bo->map_cpu;
+}
+
+static void *
+crocus_bo_map_wc(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags)
+{
+ if (!bo->map_wc) {
+ DBG("crocus_bo_map_wc: %d (%s)\n", bo->gem_handle, bo->name);
+
+ void *map = crocus_bo_gem_mmap(dbg, bo, true);
+ if (!map) {
+ return NULL;
+ }
+
+ VG_DEFINED(map, bo->size);
+
+ if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) {
+ VG_NOACCESS(map, bo->size);
+ munmap(map, bo->size);
+ }
+ }
+ assert(bo->map_wc);
+
+ DBG("crocus_bo_map_wc: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->map_wc);
+ print_flags(flags);
+
+ if (!(flags & MAP_ASYNC)) {
+ bo_wait_with_stall_warning(dbg, bo, "WC mapping");
+ }
+
+ return bo->map_wc;
+}
+
+/**
+ * Perform an uncached mapping via the GTT.
+ *
+ * Write access through the GTT is not quite fully coherent. On low power
+ * systems especially, like modern Atoms, we can observe reads from RAM before
+ * the write via GTT has landed. A write memory barrier that flushes the Write
+ * Combining Buffer (i.e. sfence/mfence) is not sufficient to order the later
+ * read after the write as the GTT write suffers a small delay through the GTT
+ * indirection. The kernel uses an uncached mmio read to ensure the GTT write
+ * is ordered with reads (either by the GPU, WB or WC) and unconditionally
+ * flushes prior to execbuf submission. However, if we are not informing the
+ * kernel about our GTT writes, it will not flush before earlier access, such
+ * as when using the cmdparser. Similarly, we need to be careful if we should
+ * ever issue a CPU read immediately following a GTT write.
+ *
+ * Telling the kernel about write access also has one more important
+ * side-effect. Upon receiving notification about the write, it cancels any
+ * scanout buffering for FBC/PSR and friends. Later FBC/PSR is then flushed by
+ * either SW_FINISH or DIRTYFB. The presumption is that we never write to the
+ * actual scanout via a mmapping, only to a backbuffer and so all the FBC/PSR
+ * tracking is handled on the buffer exchange instead.
+ */
+static void *
+crocus_bo_map_gtt(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ /* If we don't support get/set_tiling, there's no support for GTT mapping
+ * either (it won't do any de-tiling for us).
+ */
+ assert(bufmgr->has_tiling_uapi);
+
+ /* Get a mapping of the buffer if we haven't before. */
+ if (bo->map_gtt == NULL) {
+ DBG("bo_map_gtt: mmap %d (%s)\n", bo->gem_handle, bo->name);
+
+ struct drm_i915_gem_mmap_gtt mmap_arg = { .handle = bo->gem_handle };
+
+ /* Get the fake offset back... */
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg);
+ if (ret != 0) {
+ DBG("%s:%d: Error preparing buffer map %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+
+ /* and mmap it. */
+ void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, bufmgr->fd, mmap_arg.offset);
+ if (map == MAP_FAILED) {
+ DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+ __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+ return NULL;
+ }
+
+ /* We don't need to use VALGRIND_MALLOCLIKE_BLOCK because Valgrind will
+ * already intercept this mmap call. However, for consistency between
+ * all the mmap paths, we mark the pointer as defined now and mark it
+ * as inaccessible afterwards.
+ */
+ VG_DEFINED(map, bo->size);
+
+ if (p_atomic_cmpxchg(&bo->map_gtt, NULL, map)) {
+ VG_NOACCESS(map, bo->size);
+ munmap(map, bo->size);
+ }
+ }
+ assert(bo->map_gtt);
+
+ DBG("bo_map_gtt: %d (%s) -> %p, ", bo->gem_handle, bo->name, bo->map_gtt);
+ print_flags(flags);
+
+ if (!(flags & MAP_ASYNC)) {
+ bo_wait_with_stall_warning(dbg, bo, "GTT mapping");
+ }
+
+ return bo->map_gtt;
+}
+
+static bool
+can_map_cpu(struct crocus_bo *bo, unsigned flags)
+{
+ if (bo->cache_coherent)
+ return true;
+
+ /* Even if the buffer itself is not cache-coherent (such as a scanout), on
+ * an LLC platform reads always are coherent (as they are performed via the
+ * central system agent). It is just the writes that we need to take special
+ * care to ensure that land in main memory and not stick in the CPU cache.
+ */
+ if (!(flags & MAP_WRITE) && bo->bufmgr->has_llc)
+ return true;
+
+ /* If PERSISTENT or COHERENT are set, the mmapping needs to remain valid
+ * across batch flushes where the kernel will change cache domains of the
+ * bo, invalidating continued access to the CPU mmap on non-LLC device.
+ *
+ * Similarly, ASYNC typically means that the buffer will be accessed via
+ * both the CPU and the GPU simultaneously. Batches may be executed that
+ * use the BO even while it is mapped. While OpenGL technically disallows
+ * most drawing while non-persistent mappings are active, we may still use
+ * the GPU for blits or other operations, causing batches to happen at
+ * inconvenient times.
+ *
+ * If RAW is set, we expect the caller to be able to handle a WC buffer
+ * more efficiently than the involuntary clflushes.
+ */
+ if (flags & (MAP_PERSISTENT | MAP_COHERENT | MAP_ASYNC | MAP_RAW))
+ return false;
+
+ return !(flags & MAP_WRITE);
+}
+
+void *
+crocus_bo_map(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags)
+{
+ if (bo->tiling_mode != I915_TILING_NONE && !(flags & MAP_RAW))
+ return crocus_bo_map_gtt(dbg, bo, flags);
+
+ void *map;
+
+ if (can_map_cpu(bo, flags))
+ map = crocus_bo_map_cpu(dbg, bo, flags);
+ else
+ map = crocus_bo_map_wc(dbg, bo, flags);
+
+ /* Allow the attempt to fail by falling back to the GTT where necessary.
+ *
+ * Not every buffer can be mmaped directly using the CPU (or WC), for
+ * example buffers that wrap stolen memory or are imported from other
+ * devices. For those, we have little choice but to use a GTT mmapping.
+ * However, if we use a slow GTT mmapping for reads where we expected fast
+ * access, that order of magnitude difference in throughput will be clearly
+ * expressed by angry users.
+ *
+ * We skip MAP_RAW because we want to avoid map_gtt's fence detiling.
+ */
+ if (!map && !(flags & MAP_RAW)) {
+ perf_debug(dbg, "Fallback GTT mapping for %s with access flags %x\n",
+ bo->name, flags);
+ map = crocus_bo_map_gtt(dbg, bo, flags);
+ }
+
+ return map;
+}
+
+/** Waits for all GPU rendering with the object to have completed. */
+void
+crocus_bo_wait_rendering(struct crocus_bo *bo)
+{
+ /* We require a kernel recent enough for WAIT_IOCTL support.
+ * See intel_init_bufmgr()
+ */
+ crocus_bo_wait(bo, -1);
+}
+
+/**
+ * Waits on a BO for the given amount of time.
+ *
+ * @bo: buffer object to wait for
+ * @timeout_ns: amount of time to wait in nanoseconds.
+ * If value is less than 0, an infinite wait will occur.
+ *
+ * Returns 0 if the wait was successful, i.e. the last batch referencing the
+ * object has completed within the allotted time. Otherwise some negative return
+ * value describes the error. Of particular interest is -ETIME when the wait has
+ * failed to yield the desired result.
+ *
+ * Similar to crocus_bo_wait_rendering except a timeout parameter allows
+ * the operation to give up after a certain amount of time. Another subtle
+ * difference is the internal locking semantics are different (this variant does
+ * not hold the lock for the duration of the wait). This makes the wait subject
+ * to a larger userspace race window.
+ *
+ * The implementation shall wait until the object is no longer actively
+ * referenced within a batch buffer at the time of the call. The wait does
+ * not guarantee that the buffer will not be re-issued via another thread or
+ * a flinked handle. Userspace must make sure this race does not occur if such precision
+ * is important.
+ *
+ * Note that some kernels have broken the promise of an infinite wait for
+ * negative values; upgrade to the latest stable kernel if this is the case.
+ */
+int
+crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ /* If we know it's idle, don't bother with the kernel round trip */
+ if (bo->idle && !bo->external)
+ return 0;
+
+ struct drm_i915_gem_wait wait = {
+ .bo_handle = bo->gem_handle,
+ .timeout_ns = timeout_ns,
+ };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
+ if (ret != 0)
+ return -errno;
+
+ bo->idle = true;
+
+ return ret;
+}
+
+static void
+crocus_bufmgr_destroy(struct crocus_bufmgr *bufmgr)
+{
+ mtx_destroy(&bufmgr->lock);
+
+ /* Free any cached buffer objects we were going to reuse */
+ for (int i = 0; i < bufmgr->num_buckets; i++) {
+ struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i];
+
+ list_for_each_entry_safe(struct crocus_bo, bo, &bucket->head, head) {
+ list_del(&bo->head);
+
+ bo_free(bo);
+ }
+ }
+
+ /* Close any buffer objects on the dead list. */
+ list_for_each_entry_safe(struct crocus_bo, bo, &bufmgr->zombie_list, head) {
+ list_del(&bo->head);
+ bo_close(bo);
+ }
+
+ _mesa_hash_table_destroy(bufmgr->name_table, NULL);
+ _mesa_hash_table_destroy(bufmgr->handle_table, NULL);
+
+ close(bufmgr->fd);
+
+ free(bufmgr);
+}
+
+static int
+bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode,
+ uint32_t stride)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ struct drm_i915_gem_set_tiling set_tiling;
+ int ret;
+
+ if (bo->global_name == 0 &&
+ tiling_mode == bo->tiling_mode && stride == bo->stride)
+ return 0;
+
+ memset(&set_tiling, 0, sizeof(set_tiling));
+ do {
+ /* set_tiling is slightly broken and overwrites the
+ * input on the error path, so we have to open code
+ * drm_ioctl.
+ */
+ set_tiling.handle = bo->gem_handle;
+ set_tiling.tiling_mode = tiling_mode;
+ set_tiling.stride = stride;
+
+ ret = ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
+ } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+ if (ret == -1)
+ return -errno;
+
+ bo->tiling_mode = set_tiling.tiling_mode;
+ bo->swizzle_mode = set_tiling.swizzle_mode;
+ bo->stride = set_tiling.stride;
+ return 0;
+}
+
+int
+crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode,
+ uint32_t *swizzle_mode)
+{
+ *tiling_mode = bo->tiling_mode;
+ *swizzle_mode = bo->swizzle_mode;
+ return 0;
+}
+
+struct crocus_bo *
+crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr, int prime_fd,
+ uint32_t tiling, uint32_t stride)
+{
+ uint32_t handle;
+ struct crocus_bo *bo;
+
+ mtx_lock(&bufmgr->lock);
+ int ret = drmPrimeFDToHandle(bufmgr->fd, prime_fd, &handle);
+ if (ret) {
+ DBG("import_dmabuf: failed to obtain handle from fd: %s\n",
+ strerror(errno));
+ mtx_unlock(&bufmgr->lock);
+ return NULL;
+ }
+
+ /*
+ * See if the kernel has already returned this buffer to us. Just as
+ * for named buffers, we must not create two bo's pointing at the same
+ * kernel object
+ */
+ bo = find_and_ref_external_bo(bufmgr->handle_table, handle);
+ if (bo)
+ goto out;
+
+ bo = bo_calloc();
+ if (!bo)
+ goto out;
+
+ p_atomic_set(&bo->refcount, 1);
+
+ /* Determine size of bo. The fd-to-handle ioctl really should
+ * return the size, but it doesn't. If we have kernel 3.12 or
+ * later, we can lseek on the prime fd to get the size. Older
+ * kernels will just fail, in which case we fall back to the
+ * provided (estimated or guessed) size. */
+ ret = lseek(prime_fd, 0, SEEK_END);
+ if (ret != -1)
+ bo->size = ret;
+
+ bo->bufmgr = bufmgr;
+ bo->name = "prime";
+ bo->reusable = false;
+ bo->external = true;
+ bo->kflags = 0;
+ bo->gem_handle = handle;
+ _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
+
+ struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle };
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling))
+ goto err;
+
+ if (get_tiling.tiling_mode == tiling || tiling > I915_TILING_LAST) {
+ bo->tiling_mode = get_tiling.tiling_mode;
+ bo->swizzle_mode = get_tiling.swizzle_mode;
+ /* XXX stride is unknown */
+ } else {
+ if (bo_set_tiling_internal(bo, tiling, stride)) {
+ goto err;
+ }
+ }
+
+out:
+ mtx_unlock(&bufmgr->lock);
+ return bo;
+
+err:
+ bo_free(bo);
+ mtx_unlock(&bufmgr->lock);
+ return NULL;
+}
+
+static void
+crocus_bo_make_external_locked(struct crocus_bo *bo)
+{
+ if (!bo->external) {
+ _mesa_hash_table_insert(bo->bufmgr->handle_table, &bo->gem_handle, bo);
+ bo->external = true;
+ bo->reusable = false;
+ }
+}
+
+static void
+crocus_bo_make_external(struct crocus_bo *bo)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (bo->external) {
+ assert(!bo->reusable);
+ return;
+ }
+
+ mtx_lock(&bufmgr->lock);
+ crocus_bo_make_external_locked(bo);
+ mtx_unlock(&bufmgr->lock);
+}
+
+int
+crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ crocus_bo_make_external(bo);
+
+ if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle,
+ DRM_CLOEXEC, prime_fd) != 0)
+ return -errno;
+
+ return 0;
+}
+
+uint32_t
+crocus_bo_export_gem_handle(struct crocus_bo *bo)
+{
+ crocus_bo_make_external(bo);
+
+ return bo->gem_handle;
+}
+
+int
+crocus_bo_flink(struct crocus_bo *bo, uint32_t *name)
+{
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+ if (!bo->global_name) {
+ struct drm_gem_flink flink = { .handle = bo->gem_handle };
+
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_FLINK, &flink))
+ return -errno;
+
+ mtx_lock(&bufmgr->lock);
+ if (!bo->global_name) {
+ crocus_bo_make_external_locked(bo);
+ bo->global_name = flink.name;
+ _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
+ }
+ mtx_unlock(&bufmgr->lock);
+ }
+
+ *name = bo->global_name;
+ return 0;
+}
+
+int
+crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd,
+ uint32_t *out_handle)
+{
+ /* Only add the new GEM handle to the list of exports if it belongs to a
+ * different GEM device. Otherwise we might close the same buffer multiple
+ * times.
+ */
+ struct crocus_bufmgr *bufmgr = bo->bufmgr;
+ int ret = os_same_file_description(drm_fd, bufmgr->fd);
+ WARN_ONCE(ret < 0,
+ "Kernel has no file descriptor comparison support: %s\n",
+ strerror(errno));
+ if (ret == 0) {
+ *out_handle = crocus_bo_export_gem_handle(bo);
+ return 0;
+ }
+
+ struct bo_export *export = calloc(1, sizeof(*export));
+ if (!export)
+ return -ENOMEM;
+
+ export->drm_fd = drm_fd;
+
+ int dmabuf_fd = -1;
+ int err = crocus_bo_export_dmabuf(bo, &dmabuf_fd);
+ if (err) {
+ free(export);
+ return err;
+ }
+
+ mtx_lock(&bufmgr->lock);
+ err = drmPrimeFDToHandle(drm_fd, dmabuf_fd, &export->gem_handle);
+ close(dmabuf_fd);
+ if (err) {
+ mtx_unlock(&bufmgr->lock);
+ free(export);
+ return err;
+ }
+
+ bool found = false;
+ list_for_each_entry(struct bo_export, iter, &bo->exports, link) {
+ if (iter->drm_fd != drm_fd)
+ continue;
+ /* Here we assume that for a given DRM fd, we'll always get back the
+ * same GEM handle for a given buffer.
+ */
+ assert(iter->gem_handle == export->gem_handle);
+ free(export);
+ export = iter;
+ found = true;
+ break;
+ }
+ if (!found)
+ list_addtail(&export->link, &bo->exports);
+
+ mtx_unlock(&bufmgr->lock);
+
+ *out_handle = export->gem_handle;
+
+ return 0;
+}
+
+static void
+add_bucket(struct crocus_bufmgr *bufmgr, int size)
+{
+ unsigned int i = bufmgr->num_buckets;
+
+ assert(i < ARRAY_SIZE(bufmgr->cache_bucket));
+
+ list_inithead(&bufmgr->cache_bucket[i].head);
+ bufmgr->cache_bucket[i].size = size;
+ bufmgr->num_buckets++;
+
+ assert(bucket_for_size(bufmgr, size) == &bufmgr->cache_bucket[i]);
+ assert(bucket_for_size(bufmgr, size - 2048) == &bufmgr->cache_bucket[i]);
+ assert(bucket_for_size(bufmgr, size + 1) != &bufmgr->cache_bucket[i]);
+}
+
+static void
+init_cache_buckets(struct crocus_bufmgr *bufmgr)
+{
+ uint64_t size, cache_max_size = 64 * 1024 * 1024;
+
+ /* OK, so power of two buckets was too wasteful of memory.
+ * Give 3 other sizes between each power of two, to hopefully
+ * cover things accurately enough. (The alternative is
+ * probably to just go for exact matching of sizes, and assume
+ * that for things like composited window resize the tiled
+ * width/height alignment and rounding of sizes to pages will
+ * get us useful cache hit rates anyway)
+ */
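+ /* The resulting bucket sizes are 1, 2 and 3 pages, then for each power of
+ * two N from 4 pages up to 64 MB: N, N*1.25, N*1.5 and N*1.75.
+ */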
+ add_bucket(bufmgr, PAGE_SIZE);
+ add_bucket(bufmgr, PAGE_SIZE * 2);
+ add_bucket(bufmgr, PAGE_SIZE * 3);
+
+ /* Initialize the linked lists for BO reuse cache. */
+ for (size = 4 * PAGE_SIZE; size <= cache_max_size; size *= 2) {
+ add_bucket(bufmgr, size);
+
+ add_bucket(bufmgr, size + size * 1 / 4);
+ add_bucket(bufmgr, size + size * 2 / 4);
+ add_bucket(bufmgr, size + size * 3 / 4);
+ }
+}
+
+uint32_t
+crocus_create_hw_context(struct crocus_bufmgr *bufmgr)
+{
+ struct drm_i915_gem_context_create create = { };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
+ if (ret != 0) {
+ DBG("DRM_IOCTL_I915_GEM_CONTEXT_CREATE failed: %s\n", strerror(errno));
+ return 0;
+ }
+
+ /* Upon declaring a GPU hang, the kernel will zap the guilty context
+ * back to the default logical HW state and attempt to continue on to
+ * our next submitted batchbuffer. However, our render batches assume
+ * the previous GPU state is preserved, and only emit commands needed
+ * to incrementally change that state. In particular, we inherit the
+ * STATE_BASE_ADDRESS and PIPELINE_SELECT settings, which are critical.
+ * With default base addresses, our next batches will almost certainly
+ * cause more GPU hangs, leading to repeated hangs until we're banned
+ * or the machine is dead.
+ *
+ * Here we tell the kernel not to attempt to recover our context but
+ * immediately (on the next batchbuffer submission) report that the
+ * context is lost, and we will do the recovery ourselves. Ideally,
+ * we'll have two lost batches instead of a continual stream of hangs.
+ */
+ struct drm_i915_gem_context_param p = {
+ .ctx_id = create.ctx_id,
+ .param = I915_CONTEXT_PARAM_RECOVERABLE,
+ .value = false,
+ };
+ drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
+
+ return create.ctx_id;
+}
+
+static int
+crocus_hw_context_get_priority(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+ struct drm_i915_gem_context_param p = {
+ .ctx_id = ctx_id,
+ .param = I915_CONTEXT_PARAM_PRIORITY,
+ };
+ drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);
+ return p.value; /* on error, return 0 i.e. default priority */
+}
+
+int
+crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr,
+ uint32_t ctx_id,
+ int priority)
+{
+ struct drm_i915_gem_context_param p = {
+ .ctx_id = ctx_id,
+ .param = I915_CONTEXT_PARAM_PRIORITY,
+ .value = priority,
+ };
+ int err;
+
+ err = 0;
+ if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p))
+ err = -errno;
+
+ return err;
+}
+
+uint32_t
+crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+ uint32_t new_ctx = crocus_create_hw_context(bufmgr);
+
+ if (new_ctx) {
+ int priority = crocus_hw_context_get_priority(bufmgr, ctx_id);
+ crocus_hw_context_set_priority(bufmgr, new_ctx, priority);
+ }
+
+ return new_ctx;
+}
+
+void
+crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+ struct drm_i915_gem_context_destroy d = { .ctx_id = ctx_id };
+
+ if (ctx_id != 0 &&
+ intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &d) != 0) {
+ fprintf(stderr, "DRM_IOCTL_I915_GEM_CONTEXT_DESTROY failed: %s\n",
+ strerror(errno));
+ }
+}
+
+int
+crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset, uint64_t *result)
+{
+ struct drm_i915_reg_read reg_read = { .offset = offset };
+ int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_REG_READ, &reg_read);
+
+ *result = reg_read.val;
+ return ret;
+}
+
+static int
+gem_param(int fd, int name)
+{
+ int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+ struct drm_i915_getparam gp = { .param = name, .value = &v };
+ if (intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+ return -1;
+
+ return v;
+}
+
+/**
+ * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
+ * and manage buffer objects.
+ *
+ * \param fd File descriptor of the opened DRM device.
+ */
+static struct crocus_bufmgr *
+crocus_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse)
+{
+ struct crocus_bufmgr *bufmgr = calloc(1, sizeof(*bufmgr));
+ if (bufmgr == NULL)
+ return NULL;
+
+ /* Handles to buffer objects belong to the device fd and are not
+ * reference counted by the kernel. If the same fd is used by
+ * multiple parties (threads sharing the same screen bufmgr, or
+ * even worse the same device fd passed to multiple libraries)
+ * ownership of those handles is shared by those independent parties.
+ *
+ * Don't do this! Ensure that each library/bufmgr has its own device
+ * fd so that its namespace does not clash with another.
+ */
+ bufmgr->fd = os_dupfd_cloexec(fd);
+
+ p_atomic_set(&bufmgr->refcount, 1);
+
+ if (mtx_init(&bufmgr->lock, mtx_plain) != 0) {
+ free(bufmgr);
+ return NULL;
+ }
+
+ list_inithead(&bufmgr->zombie_list);
+
+ bufmgr->has_llc = devinfo->has_llc;
+ bufmgr->has_tiling_uapi = devinfo->has_tiling_uapi;
+ bufmgr->bo_reuse = bo_reuse;
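+ /* The DRM_IOCTL_I915_GEM_MMAP_OFFSET path is advertised via GTT mapping
+ * version 4 or newer; older kernels fall back to the legacy mmap ioctl.
+ */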
+ bufmgr->has_mmap_offset = gem_param(fd, I915_PARAM_MMAP_GTT_VERSION) >= 4;
+
+ init_cache_buckets(bufmgr);
+
+ bufmgr->name_table =
+ _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal);
+ bufmgr->handle_table =
+ _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal);
+
+ return bufmgr;
+}
+
+static struct crocus_bufmgr *
+crocus_bufmgr_ref(struct crocus_bufmgr *bufmgr)
+{
+ p_atomic_inc(&bufmgr->refcount);
+ return bufmgr;
+}
+
+void
+crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr)
+{
+ mtx_lock(&global_bufmgr_list_mutex);
+ if (p_atomic_dec_zero(&bufmgr->refcount)) {
+ list_del(&bufmgr->link);
+ crocus_bufmgr_destroy(bufmgr);
+ }
+ mtx_unlock(&global_bufmgr_list_mutex);
+}
+
+/**
+ * Gets an already existing GEM buffer manager or creates a new one.
+ *
+ * \param fd File descriptor of the opened DRM device.
+ */
+struct crocus_bufmgr *
+crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd, bool bo_reuse)
+{
+ struct stat st;
+
+ if (fstat(fd, &st))
+ return NULL;
+
+ struct crocus_bufmgr *bufmgr = NULL;
+
+ mtx_lock(&global_bufmgr_list_mutex);
+ list_for_each_entry(struct crocus_bufmgr, iter_bufmgr, &global_bufmgr_list, link) {
+ struct stat iter_st;
+ if (fstat(iter_bufmgr->fd, &iter_st))
+ continue;
+
+ if (st.st_rdev == iter_st.st_rdev) {
+ assert(iter_bufmgr->bo_reuse == bo_reuse);
+ bufmgr = crocus_bufmgr_ref(iter_bufmgr);
+ goto unlock;
+ }
+ }
+
+ bufmgr = crocus_bufmgr_create(devinfo, fd, bo_reuse);
+ if (bufmgr)
+ list_addtail(&bufmgr->link, &global_bufmgr_list);
+
+ unlock:
+ mtx_unlock(&global_bufmgr_list_mutex);
+
+ return bufmgr;
+}
+
+int
+crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr)
+{
+ return bufmgr->fd;
+}
diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.h b/src/gallium/drivers/crocus/crocus_bufmgr.h
new file mode 100644
index 00000000000..8bb328fdeae
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_bufmgr.h
@@ -0,0 +1,331 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_BUFMGR_H
+#define CROCUS_BUFMGR_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include "util/macros.h"
+#include "util/u_atomic.h"
+#include "util/list.h"
+#include "pipe/p_defines.h"
+
+struct crocus_batch;
+struct intel_device_info;
+struct pipe_debug_callback;
+
+#define CROCUS_BINDER_SIZE (64 * 1024)
+#define CROCUS_MAX_BINDERS 100
+
+struct crocus_bo {
+ /**
+ * Size in bytes of the buffer object.
+ *
+ * The size may be larger than the size originally requested for the
+ * allocation, such as being aligned to page size.
+ */
+ uint64_t size;
+
+ /** Buffer manager context associated with this buffer object */
+ struct crocus_bufmgr *bufmgr;
+
+ /** The GEM handle for this buffer object. */
+ uint32_t gem_handle;
+
+ /**
+ * Virtual address of the buffer inside the PPGTT (Per-Process Graphics
+ * Translation Table).
+ *
+ * Although each hardware context has its own VMA, we assign BO's to the
+ * same address in all contexts, for simplicity.
+ */
+ uint64_t gtt_offset;
+
+ /**
+ * The validation list index for this buffer, or -1 when not in a batch.
+ * Note that a single buffer may be in multiple batches (contexts), and
+ * this is a global field, which refers to the last batch using the BO.
+ * It should not be considered authoritative, but can be used to avoid a
+ * linear walk of the validation list in the common case by guessing that
+ * exec_bos[bo->index] == bo and confirming whether that's the case.
+ *
+ * XXX: this is not ideal now that we have more than one batch per context,
+ * XXX: as the index will flop back and forth between the render index and
+ * XXX: compute index...
+ */
+ unsigned index;
+
+ /**
+ * Boolean of whether the GPU is definitely not accessing the buffer.
+ *
+ * This is only valid when reusable, since non-reusable
+ * buffers are those that have been shared with other
+ * processes, so we don't know their state.
+ */
+ bool idle;
+
+ int refcount;
+ const char *name;
+
+ uint64_t kflags;
+
+ /**
+ * Kernel-assigned global name for this object
+ *
+ * List contains both flink named and prime fd'd objects
+ */
+ unsigned global_name;
+
+ /**
+ * Current tiling mode
+ */
+ uint32_t tiling_mode;
+ uint32_t swizzle_mode;
+ uint32_t stride;
+
+ time_t free_time;
+
+ /** Mapped address for the buffer, saved across map/unmap cycles */
+ void *map_cpu;
+ /** GTT virtual address for the buffer, saved across map/unmap cycles */
+ void *map_gtt;
+ /** WC CPU address for the buffer, saved across map/unmap cycles */
+ void *map_wc;
+
+ /** BO cache list */
+ struct list_head head;
+
+ /** List of GEM handle exports of this buffer (bo_export) */
+ struct list_head exports;
+
+ /**
+ * Boolean of whether this buffer can be re-used
+ */
+ bool reusable;
+
+ /**
+ * Boolean of whether this buffer has been shared with an external client.
+ */
+ bool external;
+
+ /**
+ * Boolean of whether this buffer is cache coherent
+ */
+ bool cache_coherent;
+
+ /**
+ * Boolean of whether this buffer points into user memory
+ */
+ bool userptr;
+
+ /** Pre-computed hash using _mesa_hash_pointer for cache tracking sets */
+ uint32_t hash;
+};
+
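+/* Flags for crocus_bo_alloc_tiled():
+ * BO_ALLOC_ZEROED guarantees the returned BO contents are zero, and
+ * BO_ALLOC_COHERENT requests a CPU-cache-coherent (snooped) BO on non-LLC
+ * platforms when the kernel supports it.
+ */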
+#define BO_ALLOC_ZEROED (1 << 0)
+#define BO_ALLOC_COHERENT (1 << 1)
+
+/**
+ * Allocate a buffer object.
+ *
+ * Buffer objects are not necessarily initially mapped into CPU virtual
+ * address space or graphics device aperture. They must be mapped
+ * using crocus_bo_map() to be used by the CPU.
+ */
+struct crocus_bo *crocus_bo_alloc(struct crocus_bufmgr *bufmgr,
+ const char *name, uint64_t size);
+
+/**
+ * Allocate a tiled buffer object.
+ *
+ * Alignment for tiled objects is set automatically; the 'flags'
+ * argument provides a hint about how the object will be used initially.
+ *
+ * Valid tiling formats are:
+ * I915_TILING_NONE
+ * I915_TILING_X
+ * I915_TILING_Y
+ */
+struct crocus_bo *crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr,
+ const char *name, uint64_t size,
+ uint32_t alignment,
+ uint32_t tiling_mode, uint32_t pitch,
+ unsigned flags);
+
+struct crocus_bo *crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr,
+ const char *name, void *ptr,
+ size_t size);
+
+/** Takes a reference on a buffer object */
+static inline void
+crocus_bo_reference(struct crocus_bo *bo)
+{
+ p_atomic_inc(&bo->refcount);
+}
+
+/**
+ * Releases a reference on a buffer object, freeing the data if
+ * no references remain.
+ */
+void crocus_bo_unreference(struct crocus_bo *bo);
+
+#define MAP_READ PIPE_MAP_READ
+#define MAP_WRITE PIPE_MAP_WRITE
+#define MAP_ASYNC PIPE_MAP_UNSYNCHRONIZED
+#define MAP_PERSISTENT PIPE_MAP_PERSISTENT
+#define MAP_COHERENT PIPE_MAP_COHERENT
+/* internal */
+#define MAP_INTERNAL_MASK (0xff << 24)
+#define MAP_RAW (0x01 << 24)
+
+#define MAP_FLAGS (MAP_READ | MAP_WRITE | MAP_ASYNC | \
+ MAP_PERSISTENT | MAP_COHERENT | MAP_INTERNAL_MASK)
+
+/**
+ * Maps the buffer into userspace.
+ *
+ * This function will block waiting for any existing execution on the
+ * buffer to complete, first. The resulting mapping is returned.
+ */
+MUST_CHECK void *crocus_bo_map(struct pipe_debug_callback *dbg,
+ struct crocus_bo *bo, unsigned flags);
+
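+/* A minimal usage sketch (the name, size and value below are illustrative):
+ *
+ *    struct crocus_bo *bo = crocus_bo_alloc(bufmgr, "scratch", 4096);
+ *    uint32_t *data = crocus_bo_map(NULL, bo, MAP_WRITE);
+ *    if (data)
+ *       data[0] = 0;
+ *    crocus_bo_unreference(bo);
+ *
+ * There is no explicit unmap step; mappings are cached on the BO and torn
+ * down when the last reference is dropped.
+ */
+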
+/**
+ * Releases a mapping obtained from crocus_bo_map(). This is a no-op;
+ * mappings are cached on the BO and torn down when the BO is freed.
+ */
+static inline int crocus_bo_unmap(struct crocus_bo *bo) { return 0; }
+
+/**
+ * Waits for rendering to an object by the GPU to have completed.
+ *
+ * This is not required for any access to the BO by bo_map,
+ * bo_subdata, etc. It is merely a way for the driver to implement
+ * glFinish.
+ */
+void crocus_bo_wait_rendering(struct crocus_bo *bo);
+
+/**
+ * Unref a buffer manager instance.
+ */
+void crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr);
+
+/**
+ * Get the current tiling (and resulting swizzling) mode for the bo.
+ *
+ * \param buf Buffer to get tiling mode for
+ * \param tiling_mode returned tiling mode
+ * \param swizzle_mode returned swizzling mode
+ */
+int crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode,
+ uint32_t *swizzle_mode);
+
+/**
+ * Create a visible name for a buffer which can be used by other apps
+ *
+ * \param buf Buffer to create a name for
+ * \param name Returned name
+ */
+int crocus_bo_flink(struct crocus_bo *bo, uint32_t *name);
+
+/**
+ * Is this buffer shared with external clients (exported)?
+ */
+static inline bool
+crocus_bo_is_external(const struct crocus_bo *bo)
+{
+ return bo->external;
+}
+
+/**
+ * Returns 1 if mapping the buffer for write could cause the process
+ * to block, due to the object being active in the GPU.
+ */
+int crocus_bo_busy(struct crocus_bo *bo);
+
+/**
+ * Specify the volatility of the buffer.
+ * \param bo Buffer whose purgeable status is being set
+ * \param madv The purgeable status
+ *
+ * Use I915_MADV_DONTNEED to mark the buffer as purgeable, and it will be
+ * reclaimed under memory pressure. If you subsequently require the buffer,
+ * then you must pass I915_MADV_WILLNEED to mark the buffer as required.
+ *
+ * Returns 1 if the buffer was retained, or 0 if it was discarded whilst
+ * marked as I915_MADV_DONTNEED.
+ */
+int crocus_bo_madvise(struct crocus_bo *bo, int madv);
+
+/* crocus_bufmgr.c */
+struct crocus_bufmgr *
+crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd,
+ bool bo_reuse);
+int crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr);
+
+struct crocus_bo *crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr,
+ const char *name,
+ unsigned handle);
+
+int crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns);
+
+uint32_t crocus_create_hw_context(struct crocus_bufmgr *bufmgr);
+uint32_t crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id);
+
+#define CROCUS_CONTEXT_LOW_PRIORITY ((I915_CONTEXT_MIN_USER_PRIORITY - 1) / 2)
+#define CROCUS_CONTEXT_MEDIUM_PRIORITY (I915_CONTEXT_DEFAULT_PRIORITY)
+#define CROCUS_CONTEXT_HIGH_PRIORITY ((I915_CONTEXT_MAX_USER_PRIORITY + 1) / 2)
+
+int crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr,
+ uint32_t ctx_id, int priority);
+
+void crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id);
+
+int crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd);
+struct crocus_bo *crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr,
+ int prime_fd, uint32_t tiling,
+ uint32_t stride);
+
+/**
+ * Exports a bo as a GEM handle into a given DRM file descriptor
+ * \param bo Buffer to export
+ * \param drm_fd File descriptor where the new handle is created
+ * \param out_handle Pointer to store the new handle
+ *
+ * Returns 0 if the buffer was successfully exported, a non-zero error code
+ * otherwise.
+ */
+int crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd,
+ uint32_t *out_handle);
+
+uint32_t crocus_bo_export_gem_handle(struct crocus_bo *bo);
+
+int crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset,
+ uint64_t *out);
+
+int drm_ioctl(int fd, unsigned long request, void *arg);
+
+#endif /* CROCUS_BUFMGR_H */
diff --git a/src/gallium/drivers/crocus/crocus_clear.c b/src/gallium/drivers/crocus/crocus_clear.c
new file mode 100644
index 00000000000..1c56e23f794
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_clear.c
@@ -0,0 +1,859 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/format/u_format.h"
+#include "util/u_upload_mgr.h"
+#include "util/ralloc.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "intel/compiler/brw_compiler.h"
+#include "util/format_srgb.h"
+
+static bool
+crocus_is_color_fast_clear_compatible(struct crocus_context *ice,
+ enum isl_format format,
+ const union isl_color_value color)
+{
+ if (isl_format_has_int_channel(format)) {
+ perf_debug(&ice->dbg, "Integer fast clear not enabled for %s",
+ isl_format_get_name(format));
+ return false;
+ }
+
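+ /* Prior to Gfx9, the clear color in SURFACE_STATE is a single bit per
+ * channel, so a color can only be fast cleared if every present channel
+ * is exactly 0.0 or 1.0.
+ */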
+ for (int i = 0; i < 4; i++) {
+ if (!isl_format_has_color_component(format, i)) {
+ continue;
+ }
+
+ if (color.f32[i] != 0.0f && color.f32[i] != 1.0f) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool
+can_fast_clear_color(struct crocus_context *ice,
+ struct pipe_resource *p_res,
+ unsigned level,
+ const struct pipe_box *box,
+ bool render_condition_enabled,
+ enum isl_format format,
+ enum isl_format render_format,
+ union isl_color_value color)
+{
+ struct crocus_resource *res = (void *) p_res;
+
+ if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR)
+ return false;
+
+ if (!isl_aux_usage_has_fast_clears(res->aux.usage))
+ return false;
+
+ /* Check for partial clear */
+ if (box->x > 0 || box->y > 0 ||
+ box->width < minify(p_res->width0, level) ||
+ box->height < minify(p_res->height0, level)) {
+ return false;
+ }
+
+ /* Avoid conditional fast clears to maintain correct tracking of the aux
+ * state (see iris_resource_finish_write for more info). Note that partial
+ * fast clears (if they existed) would not pose a problem with conditional
+ * rendering.
+ */
+ if (render_condition_enabled &&
+ ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+ return false;
+ }
+
+ /* We store clear colors as floats or uints as needed. If there are
+ * texture views in play, the formats will not properly be respected
+ * during resolves because the resolve operations only know about the
+ * resource and not the renderbuffer.
+ */
+ if (isl_format_srgb_to_linear(render_format) !=
+ isl_format_srgb_to_linear(format)) {
+ return false;
+ }
+
+ /* XXX: if (irb->mt->supports_fast_clear)
+ * see intel_miptree_create_for_dri_image()
+ */
+
+ if (!crocus_is_color_fast_clear_compatible(ice, format, color))
+ return false;
+
+ return true;
+}
+
+static union isl_color_value
+convert_fast_clear_color(struct crocus_context *ice,
+ struct crocus_resource *res,
+ enum isl_format render_format,
+ const union isl_color_value color)
+{
+ union isl_color_value override_color = color;
+ struct pipe_resource *p_res = (void *) res;
+
+ const enum pipe_format format = p_res->format;
+ const struct util_format_description *desc =
+ util_format_description(format);
+ unsigned colormask = util_format_colormask(desc);
+
+ if (util_format_is_intensity(format) ||
+ util_format_is_luminance(format) ||
+ util_format_is_luminance_alpha(format)) {
+ override_color.u32[1] = override_color.u32[0];
+ override_color.u32[2] = override_color.u32[0];
+ if (util_format_is_intensity(format))
+ override_color.u32[3] = override_color.u32[0];
+ } else {
+ for (int chan = 0; chan < 3; chan++) {
+ if (!(colormask & (1 << chan)))
+ override_color.u32[chan] = 0;
+ }
+ }
+
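+ /* Clamp or truncate the clear color to what the format can actually
+ * store, so the recorded clear value matches what rendering the clear
+ * would have produced.
+ */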
+ if (util_format_is_unorm(format)) {
+ for (int i = 0; i < 4; i++)
+ override_color.f32[i] = CLAMP(override_color.f32[i], 0.0f, 1.0f);
+ } else if (util_format_is_snorm(format)) {
+ for (int i = 0; i < 4; i++)
+ override_color.f32[i] = CLAMP(override_color.f32[i], -1.0f, 1.0f);
+ } else if (util_format_is_pure_uint(format)) {
+ for (int i = 0; i < 4; i++) {
+ unsigned bits = util_format_get_component_bits(
+ format, UTIL_FORMAT_COLORSPACE_RGB, i);
+ if (bits < 32) {
+ uint32_t max = (1u << bits) - 1;
+ override_color.u32[i] = MIN2(override_color.u32[i], max);
+ }
+ }
+ } else if (util_format_is_pure_sint(format)) {
+ for (int i = 0; i < 4; i++) {
+ unsigned bits = util_format_get_component_bits(
+ format, UTIL_FORMAT_COLORSPACE_RGB, i);
+ if (bits < 32) {
+ int32_t max = (1 << (bits - 1)) - 1;
+ int32_t min = -(1 << (bits - 1));
+ override_color.i32[i] = CLAMP(override_color.i32[i], min, max);
+ }
+ }
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+ format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ /* these packed float formats only store unsigned values */
+ for (int i = 0; i < 4; i++)
+ override_color.f32[i] = MAX2(override_color.f32[i], 0.0f);
+ }
+
+ if (!(colormask & 1 << 3)) {
+ if (util_format_is_pure_integer(format))
+ override_color.u32[3] = 1;
+ else
+ override_color.f32[3] = 1.0f;
+ }
+
+ /* Handle linear to SRGB conversion */
+ if (isl_format_is_srgb(render_format)) {
+ for (int i = 0; i < 3; i++) {
+ override_color.f32[i] =
+ util_format_linear_to_srgb_float(override_color.f32[i]);
+ }
+ }
+
+ return override_color;
+}
+
+static void
+fast_clear_color(struct crocus_context *ice,
+ struct crocus_resource *res,
+ unsigned level,
+ const struct pipe_box *box,
+ enum isl_format format,
+ union isl_color_value color,
+ enum blorp_batch_flags blorp_flags)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ struct pipe_resource *p_res = (void *) res;
+
+ color = convert_fast_clear_color(ice, res, format, color);
+
+ bool color_changed = !!memcmp(&res->aux.clear_color, &color,
+ sizeof(color));
+
+ if (color_changed) {
+ /* If we are clearing to a new clear value, we need to resolve fast
+ * clears from other levels/layers first, since we can't have different
+ * levels/layers with different fast clear colors.
+ */
+ for (unsigned res_lvl = 0; res_lvl < res->surf.levels; res_lvl++) {
+ const unsigned level_layers =
+ crocus_get_num_logical_layers(res, res_lvl);
+ for (unsigned layer = 0; layer < level_layers; layer++) {
+ if (res_lvl == level &&
+ layer >= box->z &&
+ layer < box->z + box->depth) {
+ /* We're going to clear this layer anyway. Leave it alone. */
+ continue;
+ }
+
+ enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, res_lvl, layer);
+
+ if (aux_state != ISL_AUX_STATE_CLEAR &&
+ aux_state != ISL_AUX_STATE_PARTIAL_CLEAR &&
+ aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) {
+ /* This slice doesn't have any fast-cleared bits. */
+ continue;
+ }
+
+ /* If we got here, then the level may have fast-clear bits that use
+ * the old clear value. We need to do a color resolve to get rid
+ * of their use of the clear color before we can change it.
+ * Fortunately, few applications ever change their clear color at
+ * different levels/layers, so this shouldn't happen often.
+ */
+ crocus_resource_prepare_access(ice, res,
+ res_lvl, 1, layer, 1,
+ res->aux.usage,
+ false);
+ perf_debug(&ice->dbg,
+ "Resolving resource (%p) level %d, layer %d: color changing from "
+ "(%0.2f, %0.2f, %0.2f, %0.2f) to "
+ "(%0.2f, %0.2f, %0.2f, %0.2f)\n",
+ res, res_lvl, layer,
+ res->aux.clear_color.f32[0],
+ res->aux.clear_color.f32[1],
+ res->aux.clear_color.f32[2],
+ res->aux.clear_color.f32[3],
+ color.f32[0], color.f32[1], color.f32[2], color.f32[3]);
+ }
+ }
+ }
+
+ crocus_resource_set_clear_color(ice, res, color);
+
+ /* If the buffer is already in ISL_AUX_STATE_CLEAR, and the color hasn't
+ * changed, the clear is redundant and can be skipped.
+ */
+ const enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, level, box->z);
+ if (!color_changed && box->depth == 1 && aux_state == ISL_AUX_STATE_CLEAR)
+ return;
+
+ /* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+ *
+ * "Any transition from any value in {Clear, Render, Resolve} to a
+ * different value in {Clear, Render, Resolve} requires end of pipe
+ * synchronization."
+ *
+ * In other words, fast clear ops are not properly synchronized with
+ * other drawing. We need to use a PIPE_CONTROL to ensure that the
+ * contents of the previous draw hit the render target before we resolve
+ * and again afterwards to ensure that the resolve is complete before we
+ * do any more regular drawing.
+ */
+ crocus_emit_end_of_pipe_sync(batch,
+ "fast clear: pre-flush",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+ /* If we reach this point, we need to fast clear to change the state to
+ * ISL_AUX_STATE_CLEAR, or to update the fast clear color (or both).
+ */
+ blorp_flags |= color_changed ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR;
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ p_res, res->aux.usage, level, true);
+
+ /* In newer gens (> 9), the hardware will do a linear -> sRGB conversion of
+ * the clear color during the fast clear, if the surface format is of sRGB
+ * type. We use the linear version of the surface format here to prevent
+ * that from happening, since we already do our own linear -> sRGB
+ * conversion in convert_fast_clear_color().
+ */
+ blorp_fast_clear(&blorp_batch, &surf, isl_format_srgb_to_linear(format),
+ ISL_SWIZZLE_IDENTITY,
+ level, box->z, box->depth,
+ box->x, box->y, box->x + box->width,
+ box->y + box->height);
+ blorp_batch_finish(&blorp_batch);
+ crocus_emit_end_of_pipe_sync(batch,
+ "fast clear: post flush",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+ crocus_resource_set_aux_state(ice, res, level, box->z,
+ box->depth, ISL_AUX_STATE_CLEAR);
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+ return;
+}
+
+static void
+clear_color(struct crocus_context *ice,
+ struct pipe_resource *p_res,
+ unsigned level,
+ const struct pipe_box *box,
+ bool render_condition_enabled,
+ enum isl_format format,
+ struct isl_swizzle swizzle,
+ union isl_color_value color)
+{
+ struct crocus_resource *res = (void *) p_res;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ enum blorp_batch_flags blorp_flags = 0;
+
+ if (render_condition_enabled) {
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT)
+ blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
+ }
+
+ if (p_res->target == PIPE_BUFFER)
+ util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width);
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ bool can_fast_clear = can_fast_clear_color(ice, p_res, level, box,
+ render_condition_enabled,
+ res->surf.format, format, color);
+ if (can_fast_clear) {
+ fast_clear_color(ice, res, level, box, format, color,
+ blorp_flags);
+ return;
+ }
+
+ bool color_write_disable[4] = { false, false, false, false };
+ enum isl_aux_usage aux_usage =
+ crocus_resource_render_aux_usage(ice, res, format,
+ false, false);
+
+ crocus_resource_prepare_render(ice, res, level,
+ box->z, box->depth, aux_usage);
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ p_res, aux_usage, level, true);
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+ if (!isl_format_supports_rendering(devinfo, format) &&
+ isl_format_is_rgbx(format))
+ format = isl_format_rgbx_to_rgba(format);
+
+ blorp_clear(&blorp_batch, &surf, format, swizzle,
+ level, box->z, box->depth, box->x, box->y,
+ box->x + box->width, box->y + box->height,
+ color, color_write_disable);
+
+ blorp_batch_finish(&blorp_batch);
+ crocus_flush_and_dirty_for_history(ice, batch, res,
+ PIPE_CONTROL_RENDER_TARGET_FLUSH,
+ "cache history: post color clear");
+
+ crocus_resource_finish_render(ice, res, level,
+ box->z, box->depth, aux_usage);
+}
+
+static bool
+can_fast_clear_depth(struct crocus_context *ice,
+ struct crocus_resource *res,
+ unsigned level,
+ const struct pipe_box *box,
+ bool render_condition_enabled,
+ float depth)
+{
+ struct pipe_resource *p_res = (void *) res;
+ struct pipe_context *ctx = (void *) ice;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (devinfo->ver < 6)
+ return false;
+
+ if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR)
+ return false;
+
+ /* Check for partial clears */
+ if (box->x > 0 || box->y > 0 ||
+ box->width < u_minify(p_res->width0, level) ||
+ box->height < u_minify(p_res->height0, level)) {
+ return false;
+ }
+
+ /* Avoid conditional fast clears to maintain correct tracking of the aux
+ * state (see crocus_resource_finish_write for more info). Note that partial
+ * fast clears would not pose a problem with conditional rendering.
+ */
+ if (render_condition_enabled &&
+ ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+ return false;
+ }
+
+ if (!crocus_resource_level_has_hiz(res, level))
+ return false;
+
+ if (res->base.format == PIPE_FORMAT_Z16_UNORM) {
+ /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
+ *
+ * "[DevSNB+]: Several cases exist where Depth Buffer Clear cannot be
+ * enabled (the legacy method of clearing must be performed):
+ *
+ * - [DevSNB{W/A}]: When depth buffer format is D16_UNORM and the
+ * width of the map (LOD0) is not multiple of 16, fast clear
+ * optimization must be disabled.
+ */
+ if (devinfo->ver == 6 &&
+ (minify(res->surf.phys_level0_sa.width,
+ level) % 16) != 0)
+ return false;
+ }
+ return true;
+}
+
+static void
+fast_clear_depth(struct crocus_context *ice,
+ struct crocus_resource *res,
+ unsigned level,
+ const struct pipe_box *box,
+ float depth)
+{
+ struct pipe_resource *p_res = (void *) res;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+ /* Quantize the clear value to what can be stored in the actual depth
+ * buffer. This makes the following check more accurate because it now
+ * checks if the actual depth bits will match. It also prevents us from
+ * getting a too-accurate depth value during depth testing or when sampling
+ * with HiZ enabled.
+ */
+ const unsigned nbits = p_res->format == PIPE_FORMAT_Z16_UNORM ? 16 : 24;
+ const uint32_t depth_max = (1 << nbits) - 1;
+ depth = p_res->format == PIPE_FORMAT_Z32_FLOAT ? depth :
+ (unsigned)(depth * depth_max) / (float)depth_max;
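+ /* For example (numbers purely illustrative): clearing a Z16 buffer to 0.3
+ * stores (unsigned)(0.3 * 65535) = 19660, which corresponds to
+ * 19660 / 65535.0 ~= 0.29998 -- the value the hardware actually holds.
+ */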
+
+ bool update_clear_depth = false;
+
+ /* If we're clearing to a new clear value, then we need to resolve any clear
+ * flags out of the HiZ buffer into the real depth buffer.
+ */
+ if (res->aux.clear_color.f32[0] != depth) {
+ for (unsigned res_level = 0; res_level < res->surf.levels; res_level++) {
+ if (!crocus_resource_level_has_hiz(res, res_level))
+ continue;
+
+ const unsigned level_layers =
+ crocus_get_num_logical_layers(res, res_level);
+ for (unsigned layer = 0; layer < level_layers; layer++) {
+ if (res_level == level &&
+ layer >= box->z &&
+ layer < box->z + box->depth) {
+ /* We're going to clear this layer anyway. Leave it alone. */
+ continue;
+ }
+
+ enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, res_level, layer);
+
+ if (aux_state != ISL_AUX_STATE_CLEAR &&
+ aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) {
+ /* This slice doesn't have any fast-cleared bits. */
+ continue;
+ }
+
+ /* If we got here, then the level may have fast-clear bits that
+ * use the old clear value. We need to do a depth resolve to get
+ * rid of their use of the clear value before we can change it.
+ * Fortunately, few applications ever change their depth clear
+ * value so this shouldn't happen often.
+ */
+ crocus_hiz_exec(ice, batch, res, res_level, layer, 1,
+ ISL_AUX_OP_FULL_RESOLVE, false);
+ crocus_resource_set_aux_state(ice, res, res_level, layer, 1,
+ ISL_AUX_STATE_RESOLVED);
+ }
+ }
+ const union isl_color_value clear_value = { .f32 = {depth, } };
+ crocus_resource_set_clear_color(ice, res, clear_value);
+ update_clear_depth = true;
+ }
+
+ for (unsigned l = 0; l < box->depth; l++) {
+ enum isl_aux_state aux_state =
+ crocus_resource_level_has_hiz(res, level) ?
+ crocus_resource_get_aux_state(res, level, box->z + l) :
+ ISL_AUX_STATE_AUX_INVALID;
+ if (update_clear_depth || aux_state != ISL_AUX_STATE_CLEAR) {
+ if (aux_state == ISL_AUX_STATE_CLEAR) {
+ perf_debug(&ice->dbg, "Performing HiZ clear just to update the "
+ "depth clear value\n");
+ }
+ crocus_hiz_exec(ice, batch, res, level,
+ box->z + l, 1, ISL_AUX_OP_FAST_CLEAR,
+ update_clear_depth);
+ }
+ }
+
+ crocus_resource_set_aux_state(ice, res, level, box->z, box->depth,
+ ISL_AUX_STATE_CLEAR);
+ ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
+}
+
+static void
+clear_depth_stencil(struct crocus_context *ice,
+ struct pipe_resource *p_res,
+ unsigned level,
+ const struct pipe_box *box,
+ bool render_condition_enabled,
+ bool clear_depth,
+ bool clear_stencil,
+ float depth,
+ uint8_t stencil)
+{
+ struct crocus_resource *res = (void *) p_res;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ enum blorp_batch_flags blorp_flags = 0;
+
+ if (render_condition_enabled) {
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT)
+ blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
+ }
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ struct crocus_resource *z_res;
+ struct crocus_resource *stencil_res;
+ struct blorp_surf z_surf;
+ struct blorp_surf stencil_surf;
+
+ crocus_get_depth_stencil_resources(&batch->screen->devinfo, p_res, &z_res, &stencil_res);
+ if (z_res && clear_depth &&
+ can_fast_clear_depth(ice, z_res, level, box, render_condition_enabled,
+ depth)) {
+ fast_clear_depth(ice, z_res, level, box, depth);
+ crocus_flush_and_dirty_for_history(ice, batch, res, 0,
+ "cache history: post fast Z clear");
+ clear_depth = false;
+ z_res = NULL;
+ }
+
+ /* At this point, we might have fast cleared the depth buffer. So if there's
+ * no stencil clear pending, return early.
+ */
+ if (!(clear_depth || (clear_stencil && stencil_res))) {
+ return;
+ }
+
+ if (clear_depth && z_res) {
+ const enum isl_aux_usage aux_usage =
+ crocus_resource_render_aux_usage(ice, z_res, level, z_res->surf.format,
+ false);
+ crocus_resource_prepare_render(ice, z_res, level, box->z, box->depth,
+ aux_usage);
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev,
+ &z_surf, &z_res->base, aux_usage,
+ level, true);
+ }
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+ uint8_t stencil_mask = clear_stencil && stencil_res ? 0xff : 0;
+ if (stencil_mask) {
+ crocus_resource_prepare_access(ice, stencil_res, level, 1, box->z,
+ box->depth, stencil_res->aux.usage, false);
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev,
+ &stencil_surf, &stencil_res->base,
+ stencil_res->aux.usage, level, true);
+ }
+
+ blorp_clear_depth_stencil(&blorp_batch, &z_surf, &stencil_surf,
+ level, box->z, box->depth,
+ box->x, box->y,
+ box->x + box->width,
+ box->y + box->height,
+ clear_depth && z_res, depth,
+ stencil_mask, stencil);
+
+ blorp_batch_finish(&blorp_batch);
+ crocus_flush_and_dirty_for_history(ice, batch, res, 0,
+ "cache history: post slow ZS clear");
+
+ if (clear_depth && z_res) {
+ crocus_resource_finish_render(ice, z_res, level,
+ box->z, box->depth, z_surf.aux_usage);
+ }
+
+ if (stencil_mask) {
+ crocus_resource_finish_write(ice, stencil_res, level, box->z, box->depth,
+ stencil_res->aux.usage);
+ }
+}
+
+/**
+ * The pipe->clear() driver hook.
+ *
+ * This clears buffers attached to the current draw framebuffer.
+ */
+static void
+crocus_clear(struct pipe_context *ctx,
+ unsigned buffers,
+ const struct pipe_scissor_state *scissor_state,
+ const union pipe_color_union *p_color,
+ double depth,
+ unsigned stencil)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ assert(buffers != 0);
+
+ struct pipe_box box = {
+ .width = cso_fb->width,
+ .height = cso_fb->height,
+ };
+
+ if (scissor_state) {
+ box.x = scissor_state->minx;
+ box.y = scissor_state->miny;
+ box.width = MIN2(box.width, scissor_state->maxx - scissor_state->minx);
+ box.height = MIN2(box.height, scissor_state->maxy - scissor_state->miny);
+ }
+
+ if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
+ if (devinfo->ver < 6) {
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE, true);
+ util_blitter_clear(ice->blitter, cso_fb->width, cso_fb->height,
+ util_framebuffer_get_num_layers(cso_fb),
+ buffers & PIPE_CLEAR_DEPTHSTENCIL, p_color, depth, stencil, false);
+ } else {
+ struct pipe_surface *psurf = cso_fb->zsbuf;
+ box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1;
+ box.z = psurf->u.tex.first_layer;
+
+ clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box, true,
+ buffers & PIPE_CLEAR_DEPTH,
+ buffers & PIPE_CLEAR_STENCIL,
+ depth, stencil);
+ }
+ buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
+ }
+
+ if (buffers & PIPE_CLEAR_COLOR) {
+ /* pipe_color_union and isl_color_value are interchangeable */
+ union isl_color_value *color = (void *) p_color;
+
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ if (buffers & (PIPE_CLEAR_COLOR0 << i)) {
+ struct pipe_surface *psurf = cso_fb->cbufs[i];
+ struct crocus_surface *isurf = (void *) psurf;
+ box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1;
+ box.z = psurf->u.tex.first_layer;
+
+ clear_color(ice, psurf->texture, psurf->u.tex.level, &box,
+ true, isurf->view.format, isurf->view.swizzle,
+ *color);
+ }
+ }
+ }
+}
+
+/**
+ * The pipe->clear_texture() driver hook.
+ *
+ * This clears the given texture resource.
+ */
+static void
+crocus_clear_texture(struct pipe_context *ctx,
+ struct pipe_resource *p_res,
+ unsigned level,
+ const struct pipe_box *box,
+ const void *data)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_resource *res = (void *) p_res;
+
+ if (devinfo->ver < 6) {
+ util_clear_texture(ctx, p_res,
+ level, box, data);
+ return;
+ }
+
+ if (crocus_resource_unfinished_aux_import(res))
+ crocus_resource_finish_aux_import(ctx->screen, res);
+
+ if (util_format_is_depth_or_stencil(p_res->format)) {
+ const struct util_format_unpack_description *fmt_unpack =
+ util_format_unpack_description(p_res->format);
+
+ float depth = 0.0;
+ uint8_t stencil = 0;
+
+ if (fmt_unpack->unpack_z_float)
+ fmt_unpack->unpack_z_float(&depth, 0, data, 0, 1, 1);
+
+ if (fmt_unpack->unpack_s_8uint)
+ fmt_unpack->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
+
+ clear_depth_stencil(ice, p_res, level, box, true, true, true,
+ depth, stencil);
+ } else {
+ union isl_color_value color;
+ struct crocus_resource *res = (void *) p_res;
+ enum isl_format format = res->surf.format;
+
+ if (!isl_format_supports_rendering(devinfo, format)) {
+ const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+ // XXX: actually just get_copy_format_for_bpb from BLORP
+ // XXX: don't cut and paste this
+ switch (fmtl->bpb) {
+ case 8: format = ISL_FORMAT_R8_UINT; break;
+ case 16: format = ISL_FORMAT_R8G8_UINT; break;
+ case 24: format = ISL_FORMAT_R8G8B8_UINT; break;
+ case 32: format = ISL_FORMAT_R8G8B8A8_UINT; break;
+ case 48: format = ISL_FORMAT_R16G16B16_UINT; break;
+ case 64: format = ISL_FORMAT_R16G16B16A16_UINT; break;
+ case 96: format = ISL_FORMAT_R32G32B32_UINT; break;
+ case 128: format = ISL_FORMAT_R32G32B32A32_UINT; break;
+ default:
+ unreachable("Unknown format bpb");
+ }
+
+ /* No aux surfaces for non-renderable surfaces */
+ assert(res->aux.usage == ISL_AUX_USAGE_NONE);
+ }
+
+ isl_color_value_unpack(&color, format, data);
+
+ clear_color(ice, p_res, level, box, true, format,
+ ISL_SWIZZLE_IDENTITY, color);
+ }
+}
+
+/**
+ * The pipe->clear_render_target() driver hook.
+ *
+ * This clears the given render target surface.
+ */
+static void
+crocus_clear_render_target(struct pipe_context *ctx,
+ struct pipe_surface *psurf,
+ const union pipe_color_union *p_color,
+ unsigned dst_x, unsigned dst_y,
+ unsigned width, unsigned height,
+ bool render_condition_enabled)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_surface *isurf = (void *) psurf;
+ struct pipe_box box = {
+ .x = dst_x,
+ .y = dst_y,
+ .z = psurf->u.tex.first_layer,
+ .width = width,
+ .height = height,
+ .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1
+ };
+
+ /* pipe_color_union and isl_color_value are interchangeable */
+ union isl_color_value *color = (void *) p_color;
+
+ clear_color(ice, psurf->texture, psurf->u.tex.level, &box,
+ render_condition_enabled,
+ isurf->view.format, isurf->view.swizzle, *color);
+}
+
+/**
+ * The pipe->clear_depth_stencil() driver hook.
+ *
+ * This clears the given depth/stencil surface.
+ */
+static void
+crocus_clear_depth_stencil(struct pipe_context *ctx,
+ struct pipe_surface *psurf,
+ unsigned flags,
+ double depth,
+ unsigned stencil,
+ unsigned dst_x, unsigned dst_y,
+ unsigned width, unsigned height,
+ bool render_condition_enabled)
+{
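+ /* Currently a no-op: the util_blitter/BLORP paths below are compiled out. */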
+ return;
+#if 0
+ struct crocus_context *ice = (void *) ctx;
+ struct pipe_box box = {
+ .x = dst_x,
+ .y = dst_y,
+ .z = psurf->u.tex.first_layer,
+ .width = width,
+ .height = height,
+ .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1
+ };
+ uint32_t blit_flags = 0;
+
+ assert(util_format_is_depth_or_stencil(psurf->texture->format));
+
+ crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE);
+ util_blitter_clear(ice->blitter, width, height,
+ 1, flags, NULL, depth, stencil, render_condition_enabled);
+#if 0
+ clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box,
+ render_condition_enabled,
+ flags & PIPE_CLEAR_DEPTH, flags & PIPE_CLEAR_STENCIL,
+ depth, stencil);
+#endif
+#endif
+}
+
+void
+crocus_init_clear_functions(struct pipe_context *ctx)
+{
+ ctx->clear = crocus_clear;
+ ctx->clear_texture = crocus_clear_texture;
+ ctx->clear_render_target = crocus_clear_render_target;
+ ctx->clear_depth_stencil = crocus_clear_depth_stencil;
+}
diff --git a/src/gallium/drivers/crocus/crocus_context.c b/src/gallium/drivers/crocus/crocus_context.c
new file mode 100644
index 00000000000..cd8a54d6d34
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_context.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/ralloc.h"
+#include "util/u_inlines.h"
+#include "util/format/u_format.h"
+#include "util/u_upload_mgr.h"
+#include "drm-uapi/i915_drm.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "common/intel_defines.h"
+#include "common/intel_sample_positions.h"
+
+/**
+ * The pipe->set_debug_callback() driver hook.
+ */
+static void
+crocus_set_debug_callback(struct pipe_context *ctx,
+ const struct pipe_debug_callback *cb)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ if (cb)
+ ice->dbg = *cb;
+ else
+ memset(&ice->dbg, 0, sizeof(ice->dbg));
+}
+
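+/**
+ * Map the workaround BO and write the driver identifier block into it, so
+ * the driver that produced a hang can be identified from the GPU error state
+ * (see the workaround_bo comment in crocus_context.h).
+ */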
+static bool
+crocus_init_identifier_bo(struct crocus_context *ice)
+{
+ void *bo_map;
+
+ bo_map = crocus_bo_map(NULL, ice->workaround_bo, MAP_READ | MAP_WRITE);
+ if (!bo_map)
+ return false;
+
+ ice->workaround_bo->kflags |= EXEC_OBJECT_CAPTURE;
+ ice->workaround_offset = ALIGN(
+ intel_debug_write_identifiers(bo_map, 4096, "Crocus") + 8, 8);
+
+ crocus_bo_unmap(ice->workaround_bo);
+
+ return true;
+}
+
+/**
+ * Called from the batch module when it detects a GPU hang.
+ *
+ * In this case, we've lost our GEM context, and can't rely on any existing
+ * state on the GPU. We must mark everything dirty and wipe away any saved
+ * assumptions about the last known state of the GPU.
+ */
+void
+crocus_lost_context_state(struct crocus_batch *batch)
+{
+ /* The batch module doesn't have a crocus_context, because we want to
+ * avoid introducing lots of layering violations. Unfortunately, here
+ * we do need to inform the context of a batch catastrophe. We know the
+ * batch is one of our context's, so hackily claw our way back.
+ */
+ struct crocus_context *ice = batch->ice;
+ struct crocus_screen *screen = batch->screen;
+ if (batch->name == CROCUS_BATCH_RENDER) {
+ screen->vtbl.init_render_context(batch);
+ } else if (batch->name == CROCUS_BATCH_COMPUTE) {
+ screen->vtbl.init_compute_context(batch);
+ } else {
+ unreachable("unhandled batch reset");
+ }
+
+ ice->state.dirty = ~0ull;
+ memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
+ batch->state_base_address_emitted = false;
+ screen->vtbl.lost_genx_state(ice, batch);
+}
+
+static enum pipe_reset_status
+crocus_get_device_reset_status(struct pipe_context *ctx)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ enum pipe_reset_status worst_reset = PIPE_NO_RESET;
+
+ /* Check the reset status of each batch's hardware context, and take the
+ * worst status (if one was guilty, proclaim guilt).
+ */
+ for (int i = 0; i < ice->batch_count; i++) {
+ /* This will also recreate the hardware contexts as necessary, so any
+ * future queries will show no resets. We only want to report once.
+ */
+ enum pipe_reset_status batch_reset =
+ crocus_batch_check_for_reset(&ice->batches[i]);
+
+ if (batch_reset == PIPE_NO_RESET)
+ continue;
+
+ if (worst_reset == PIPE_NO_RESET) {
+ worst_reset = batch_reset;
+ } else {
+ /* GUILTY < INNOCENT < UNKNOWN */
+ worst_reset = MIN2(worst_reset, batch_reset);
+ }
+ }
+
+ if (worst_reset != PIPE_NO_RESET && ice->reset.reset)
+ ice->reset.reset(ice->reset.data, worst_reset);
+
+ return worst_reset;
+}
+
+static void
+crocus_set_device_reset_callback(struct pipe_context *ctx,
+ const struct pipe_device_reset_callback *cb)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ if (cb)
+ ice->reset = *cb;
+ else
+ memset(&ice->reset, 0, sizeof(ice->reset));
+}
+
+static void
+crocus_get_sample_position(struct pipe_context *ctx,
+ unsigned sample_count,
+ unsigned sample_index,
+ float *out_value)
+{
+ union {
+ struct {
+ float x[16];
+ float y[16];
+ } a;
+ struct {
+ float _0XOffset, _1XOffset, _2XOffset, _3XOffset,
+ _4XOffset, _5XOffset, _6XOffset, _7XOffset,
+ _8XOffset, _9XOffset, _10XOffset, _11XOffset,
+ _12XOffset, _13XOffset, _14XOffset, _15XOffset;
+ float _0YOffset, _1YOffset, _2YOffset, _3YOffset,
+ _4YOffset, _5YOffset, _6YOffset, _7YOffset,
+ _8YOffset, _9YOffset, _10YOffset, _11YOffset,
+ _12YOffset, _13YOffset, _14YOffset, _15YOffset;
+ } v;
+ } u;
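+ /* The v struct aliases the x[]/y[] arrays above; the INTEL_SAMPLE_POS_*X
+ * macros write the prefix-named fields (_0XOffset, ...), which fills in the
+ * arrays we index below.
+ */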
+ switch (sample_count) {
+ case 1: INTEL_SAMPLE_POS_1X(u.v._); break;
+ case 2: INTEL_SAMPLE_POS_2X(u.v._); break;
+ case 4: INTEL_SAMPLE_POS_4X(u.v._); break;
+ case 8: INTEL_SAMPLE_POS_8X(u.v._); break;
+ case 16: INTEL_SAMPLE_POS_16X(u.v._); break;
+ default: unreachable("invalid sample count");
+ }
+
+ out_value[0] = u.a.x[sample_index];
+ out_value[1] = u.a.y[sample_index];
+}
+
+/**
+ * Destroy a context, freeing any associated memory.
+ */
+static void
+crocus_destroy_context(struct pipe_context *ctx)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ if (ctx->stream_uploader)
+ u_upload_destroy(ctx->stream_uploader);
+
+ if (ice->blitter)
+ util_blitter_destroy(ice->blitter);
+ screen->vtbl.destroy_state(ice);
+ crocus_destroy_program_cache(ice);
+ u_upload_destroy(ice->query_buffer_uploader);
+
+ crocus_bo_unreference(ice->workaround_bo);
+
+ slab_destroy_child(&ice->transfer_pool);
+
+ crocus_batch_free(&ice->batches[CROCUS_BATCH_RENDER]);
+ if (ice->batches[CROCUS_BATCH_COMPUTE].ice)
+ crocus_batch_free(&ice->batches[CROCUS_BATCH_COMPUTE]);
+
+ ralloc_free(ice);
+}
+
+#define genX_call(devinfo, func, ...) \
+ switch ((devinfo)->verx10) { \
+ case 75: \
+ gfx75_##func(__VA_ARGS__); \
+ break; \
+ case 70: \
+ gfx7_##func(__VA_ARGS__); \
+ break; \
+ case 60: \
+ gfx6_##func(__VA_ARGS__); \
+ break; \
+ case 50: \
+ gfx5_##func(__VA_ARGS__); \
+ break; \
+ case 45: \
+ gfx45_##func(__VA_ARGS__); \
+ break; \
+ case 40: \
+ gfx4_##func(__VA_ARGS__); \
+ break; \
+ default: \
+ unreachable("Unknown hardware generation"); \
+ }
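+/* For example, genX_call(devinfo, init_state, ice) on a gfx6 (Sandy Bridge)
+ * device calls gfx6_init_state(ice).
+ */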
+
+/**
+ * Create a context.
+ *
+ * This is where each context begins.
+ */
+struct pipe_context *
+crocus_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
+{
+ struct crocus_screen *screen = (struct crocus_screen*)pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_context *ice = rzalloc(NULL, struct crocus_context);
+
+ if (!ice)
+ return NULL;
+
+ struct pipe_context *ctx = &ice->ctx;
+
+ ctx->screen = pscreen;
+ ctx->priv = priv;
+
+ ctx->stream_uploader = u_upload_create_default(ctx);
+ if (!ctx->stream_uploader) {
+ ralloc_free(ice);
+ return NULL;
+ }
+ ctx->const_uploader = ctx->stream_uploader;
+
+ ctx->destroy = crocus_destroy_context;
+ ctx->set_debug_callback = crocus_set_debug_callback;
+ ctx->set_device_reset_callback = crocus_set_device_reset_callback;
+ ctx->get_device_reset_status = crocus_get_device_reset_status;
+ ctx->get_sample_position = crocus_get_sample_position;
+
+ ice->shaders.urb_size = devinfo->urb.size;
+
+ crocus_init_context_fence_functions(ctx);
+ crocus_init_blit_functions(ctx);
+ crocus_init_clear_functions(ctx);
+ crocus_init_program_functions(ctx);
+ crocus_init_resource_functions(ctx);
+ crocus_init_flush_functions(ctx);
+
+ crocus_init_program_cache(ice);
+
+ slab_create_child(&ice->transfer_pool, &screen->transfer_pool);
+
+ ice->query_buffer_uploader =
+ u_upload_create(ctx, 4096, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING,
+ 0);
+
+ ice->workaround_bo =
+ crocus_bo_alloc(screen->bufmgr, "workaround", 4096);
+ if (!ice->workaround_bo)
+ return NULL;
+
+ if (!crocus_init_identifier_bo(ice))
+ return NULL;
+
+ genX_call(devinfo, init_state, ice);
+ genX_call(devinfo, init_blorp, ice);
+ genX_call(devinfo, init_query, ice);
+
+ ice->blitter = util_blitter_create(&ice->ctx);
+ if (ice->blitter == NULL)
+ return NULL;
+ int priority = 0;
+ if (flags & PIPE_CONTEXT_HIGH_PRIORITY)
+ priority = INTEL_CONTEXT_HIGH_PRIORITY;
+ if (flags & PIPE_CONTEXT_LOW_PRIORITY)
+ priority = INTEL_CONTEXT_LOW_PRIORITY;
+
+ ice->batch_count = devinfo->ver >= 7 ? CROCUS_BATCH_COUNT : 1;
+ for (int i = 0; i < ice->batch_count; i++) {
+ crocus_init_batch(ice, (enum crocus_batch_name) i,
+ priority);
+ }
+
+ ice->urb.size = devinfo->urb.size;
+ screen->vtbl.init_render_context(&ice->batches[CROCUS_BATCH_RENDER]);
+ if (ice->batch_count > 1)
+ screen->vtbl.init_compute_context(&ice->batches[CROCUS_BATCH_COMPUTE]);
+
+ return ctx;
+}
+
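+/**
+ * Software conditional-rendering check: read the query result back on the
+ * CPU and return whether drawing should proceed.
+ */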
+bool
+crocus_sw_check_cond_render(struct crocus_context *ice)
+{
+ struct crocus_query *q = ice->condition.query;
+ union pipe_query_result result;
+
+ bool wait = ice->condition.mode == PIPE_RENDER_COND_WAIT ||
+ ice->condition.mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+ if (!q)
+ return true;
+
+ bool ret = ice->ctx.get_query_result(&ice->ctx, (void *)q, wait, &result);
+ if (!ret)
+ return true;
+
+ return ice->condition.condition ? result.u64 == 0 : result.u64 != 0;
+}
diff --git a/src/gallium/drivers/crocus/crocus_context.h b/src/gallium/drivers/crocus/crocus_context.h
new file mode 100644
index 00000000000..8d6e43d80f6
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_context.h
@@ -0,0 +1,955 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_CONTEXT_H
+#define CROCUS_CONTEXT_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "intel/blorp/blorp.h"
+#include "intel/dev/intel_debug.h"
+#include "intel/compiler/brw_compiler.h"
+#include "crocus_batch.h"
+#include "crocus_fence.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "util/u_blitter.h"
+
+struct crocus_bo;
+struct crocus_context;
+struct blorp_batch;
+struct blorp_params;
+
+#define CROCUS_MAX_TEXTURE_BUFFER_SIZE (1 << 27)
+#define CROCUS_MAX_TEXTURE_SAMPLERS 32
+/* CROCUS_MAX_ABOS and CROCUS_MAX_SSBOS must be the same. */
+#define CROCUS_MAX_ABOS 16
+#define CROCUS_MAX_SSBOS 16
+#define CROCUS_MAX_VIEWPORTS 16
+#define CROCUS_MAX_CLIP_PLANES 8
+
+enum crocus_param_domain {
+ BRW_PARAM_DOMAIN_BUILTIN = 0,
+ BRW_PARAM_DOMAIN_IMAGE,
+};
+
+enum {
+ DRI_CONF_BO_REUSE_DISABLED,
+ DRI_CONF_BO_REUSE_ALL
+};
+
+#define BRW_PARAM(domain, val) (BRW_PARAM_DOMAIN_##domain << 24 | (val))
+#define BRW_PARAM_DOMAIN(param) ((uint32_t)(param) >> 24)
+#define BRW_PARAM_VALUE(param) ((uint32_t)(param) & 0x00ffffff)
+#define BRW_PARAM_IMAGE(idx, offset) BRW_PARAM(IMAGE, ((idx) << 8) | (offset))
+#define BRW_PARAM_IMAGE_IDX(value) (BRW_PARAM_VALUE(value) >> 8)
+#define BRW_PARAM_IMAGE_OFFSET(value) (BRW_PARAM_VALUE(value) & 0xf)
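+/* Example encoding: BRW_PARAM(IMAGE, (3 << 8) | 2), i.e. BRW_PARAM_IMAGE(3, 2),
+ * stores the IMAGE domain in the top byte and image index 3 / offset 2 in the
+ * low bits; BRW_PARAM_IMAGE_IDX() and BRW_PARAM_IMAGE_OFFSET() recover them.
+ */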
+
+/**
+ * Dirty flags. When state changes, we flag some combination of these
+ * to indicate that particular GPU commands need to be re-emitted.
+ *
+ * Each bit typically corresponds to a single 3DSTATE_* command packet, but
+ * in rare cases they map to a group of related packets that need to be
+ * emitted together.
+ *
+ * See crocus_upload_render_state().
+ */
+#define CROCUS_DIRTY_COLOR_CALC_STATE (1ull << 0)
+#define CROCUS_DIRTY_POLYGON_STIPPLE (1ull << 1)
+#define CROCUS_DIRTY_CC_VIEWPORT (1ull << 2)
+#define CROCUS_DIRTY_SF_CL_VIEWPORT (1ull << 3)
+#define CROCUS_DIRTY_RASTER (1ull << 4)
+#define CROCUS_DIRTY_CLIP (1ull << 5)
+#define CROCUS_DIRTY_LINE_STIPPLE (1ull << 6)
+#define CROCUS_DIRTY_VERTEX_ELEMENTS (1ull << 7)
+#define CROCUS_DIRTY_VERTEX_BUFFERS (1ull << 8)
+#define CROCUS_DIRTY_DRAWING_RECTANGLE (1ull << 9)
+#define CROCUS_DIRTY_GEN6_URB (1ull << 10)
+#define CROCUS_DIRTY_DEPTH_BUFFER (1ull << 11)
+#define CROCUS_DIRTY_WM (1ull << 12)
+#define CROCUS_DIRTY_SO_DECL_LIST (1ull << 13)
+#define CROCUS_DIRTY_STREAMOUT (1ull << 14)
+#define CROCUS_DIRTY_GEN4_CONSTANT_COLOR (1ull << 15)
+#define CROCUS_DIRTY_GEN4_CURBE (1ull << 16)
+#define CROCUS_DIRTY_GEN4_URB_FENCE (1ull << 17)
+#define CROCUS_DIRTY_GEN5_PIPELINED_POINTERS (1ull << 18)
+#define CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS (1ull << 19)
+#define CROCUS_DIRTY_GEN6_BLEND_STATE (1ull << 20)
+#define CROCUS_DIRTY_GEN6_SCISSOR_RECT (1ull << 21)
+#define CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL (1ull << 22)
+#define CROCUS_DIRTY_GEN6_MULTISAMPLE (1ull << 23)
+#define CROCUS_DIRTY_GEN6_SAMPLE_MASK (1ull << 24)
+#define CROCUS_DIRTY_GEN7_SBE (1ull << 25)
+#define CROCUS_DIRTY_GEN7_L3_CONFIG (1ull << 26)
+#define CROCUS_DIRTY_GEN7_SO_BUFFERS (1ull << 27)
+#define CROCUS_DIRTY_GEN75_VF (1ull << 28)
+#define CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES (1ull << 29)
+#define CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES (1ull << 30)
+#define CROCUS_DIRTY_VF_STATISTICS (1ull << 31)
+#define CROCUS_DIRTY_GEN4_CLIP_PROG (1ull << 32)
+#define CROCUS_DIRTY_GEN4_SF_PROG (1ull << 33)
+#define CROCUS_DIRTY_GEN4_FF_GS_PROG (1ull << 34)
+#define CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS (1ull << 35)
+#define CROCUS_DIRTY_GEN6_SVBI (1ull << 36)
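+/* State changes OR these into ice->state.dirty; for example, a fast depth
+ * clear flags CROCUS_DIRTY_DEPTH_BUFFER (see crocus_clear.c), and the state
+ * upload code re-emits the corresponding packets.
+ */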
+
+#define CROCUS_ALL_DIRTY_FOR_COMPUTE (CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES)
+
+#define CROCUS_ALL_DIRTY_FOR_RENDER (~CROCUS_ALL_DIRTY_FOR_COMPUTE)
+
+/**
+ * Per-stage dirty flags. When state changes, we flag some combination of
+ * these to indicate that particular GPU commands need to be re-emitted.
+ * Unlike the CROCUS_DIRTY_* flags, these are shader stage-specific and can be
+ * indexed by shifting the mask by the shader stage index.
+ *
+ * See crocus_upload_render_state().
+ */
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS (1ull << 0)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS (1ull << 1)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES (1ull << 2)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS (1ull << 3)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS (1ull << 4)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS (1ull << 5)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_VS (1ull << 6)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_TCS (1ull << 7)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_TES (1ull << 8)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_GS (1ull << 9)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_FS (1ull << 10)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_CS (1ull << 11)
+#define CROCUS_STAGE_DIRTY_VS (1ull << 12)
+#define CROCUS_STAGE_DIRTY_TCS (1ull << 13)
+#define CROCUS_STAGE_DIRTY_TES (1ull << 14)
+#define CROCUS_STAGE_DIRTY_GS (1ull << 15)
+#define CROCUS_STAGE_DIRTY_FS (1ull << 16)
+#define CROCUS_STAGE_DIRTY_CS (1ull << 17)
+#define CROCUS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS 18
+#define CROCUS_STAGE_DIRTY_CONSTANTS_VS (1ull << 18)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_TCS (1ull << 19)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_TES (1ull << 20)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_GS (1ull << 21)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_FS (1ull << 22)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_CS (1ull << 23)
+#define CROCUS_STAGE_DIRTY_BINDINGS_VS (1ull << 24)
+#define CROCUS_STAGE_DIRTY_BINDINGS_TCS (1ull << 25)
+#define CROCUS_STAGE_DIRTY_BINDINGS_TES (1ull << 26)
+#define CROCUS_STAGE_DIRTY_BINDINGS_GS (1ull << 27)
+#define CROCUS_STAGE_DIRTY_BINDINGS_FS (1ull << 28)
+#define CROCUS_STAGE_DIRTY_BINDINGS_CS (1ull << 29)
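+/* For example, (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT)
+ * is CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS, so stage-indexed loops can shift a
+ * base mask rather than switch on the stage.
+ */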
+
+#define CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE (CROCUS_STAGE_DIRTY_CS | \
+ CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS | \
+ CROCUS_STAGE_DIRTY_UNCOMPILED_CS | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_CS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_CS)
+
+#define CROCUS_ALL_STAGE_DIRTY_FOR_RENDER (~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE)
+
+#define CROCUS_ALL_STAGE_DIRTY_BINDINGS (CROCUS_STAGE_DIRTY_BINDINGS_VS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_TCS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_TES | \
+ CROCUS_STAGE_DIRTY_BINDINGS_GS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_FS | \
+ CROCUS_STAGE_DIRTY_BINDINGS_CS)
+
+#define CROCUS_RENDER_STAGE_DIRTY_CONSTANTS (CROCUS_STAGE_DIRTY_CONSTANTS_VS | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_TCS | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_TES | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_GS | \
+ CROCUS_STAGE_DIRTY_CONSTANTS_FS)
+
+/**
+ * Non-orthogonal state (NOS) dependency flags.
+ *
+ * Shader programs may depend on non-orthogonal state. These flags are
+ * used to indicate that a shader's key depends on the state provided by
+ * a certain Gallium CSO. Changing any CSOs marked as a dependency will
+ * cause the driver to re-compute the shader key, possibly triggering a
+ * shader recompile.
+ */
+enum crocus_nos_dep {
+ CROCUS_NOS_FRAMEBUFFER,
+ CROCUS_NOS_DEPTH_STENCIL_ALPHA,
+ CROCUS_NOS_RASTERIZER,
+ CROCUS_NOS_BLEND,
+ CROCUS_NOS_LAST_VUE_MAP,
+ CROCUS_NOS_TEXTURES,
+ CROCUS_NOS_VERTEX_ELEMENTS,
+ CROCUS_NOS_COUNT,
+};
+
+struct crocus_depth_stencil_alpha_state;
+
+/**
+ * Cache IDs for the in-memory program cache (ice->shaders.cache).
+ */
+enum crocus_program_cache_id {
+ CROCUS_CACHE_VS = MESA_SHADER_VERTEX,
+ CROCUS_CACHE_TCS = MESA_SHADER_TESS_CTRL,
+ CROCUS_CACHE_TES = MESA_SHADER_TESS_EVAL,
+ CROCUS_CACHE_GS = MESA_SHADER_GEOMETRY,
+ CROCUS_CACHE_FS = MESA_SHADER_FRAGMENT,
+ CROCUS_CACHE_CS = MESA_SHADER_COMPUTE,
+ CROCUS_CACHE_BLORP,
+ CROCUS_CACHE_SF,
+ CROCUS_CACHE_CLIP,
+ CROCUS_CACHE_FF_GS,
+};
+
+/** @{
+ *
+ * Defines for PIPE_CONTROL operations, which trigger cache flushes,
+ * synchronization, pipelined memory writes, and so on.
+ *
+ * The bits here are not the actual hardware values. The actual fields
+ * move between various generations, so we just have flags for each
+ * potential operation, and use genxml to encode the actual packet.
+ */
+enum pipe_control_flags
+{
+ PIPE_CONTROL_FLUSH_LLC = (1 << 1),
+ PIPE_CONTROL_LRI_POST_SYNC_OP = (1 << 2),
+ PIPE_CONTROL_STORE_DATA_INDEX = (1 << 3),
+ PIPE_CONTROL_CS_STALL = (1 << 4),
+ PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET = (1 << 5),
+ PIPE_CONTROL_SYNC_GFDT = (1 << 6),
+ PIPE_CONTROL_TLB_INVALIDATE = (1 << 7),
+ PIPE_CONTROL_MEDIA_STATE_CLEAR = (1 << 8),
+ PIPE_CONTROL_WRITE_IMMEDIATE = (1 << 9),
+ PIPE_CONTROL_WRITE_DEPTH_COUNT = (1 << 10),
+ PIPE_CONTROL_WRITE_TIMESTAMP = (1 << 11),
+ PIPE_CONTROL_DEPTH_STALL = (1 << 12),
+ PIPE_CONTROL_RENDER_TARGET_FLUSH = (1 << 13),
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE = (1 << 14),
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE = (1 << 15),
+ PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE = (1 << 16),
+ PIPE_CONTROL_NOTIFY_ENABLE = (1 << 17),
+ PIPE_CONTROL_FLUSH_ENABLE = (1 << 18),
+ PIPE_CONTROL_DATA_CACHE_FLUSH = (1 << 19),
+ PIPE_CONTROL_VF_CACHE_INVALIDATE = (1 << 20),
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE = (1 << 21),
+ PIPE_CONTROL_STATE_CACHE_INVALIDATE = (1 << 22),
+ PIPE_CONTROL_STALL_AT_SCOREBOARD = (1 << 23),
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH = (1 << 24),
+ PIPE_CONTROL_TILE_CACHE_FLUSH = (1 << 25),
+};
+
+#define PIPE_CONTROL_CACHE_FLUSH_BITS \
+ (PIPE_CONTROL_DEPTH_CACHE_FLUSH | \
+ PIPE_CONTROL_DATA_CACHE_FLUSH | \
+ PIPE_CONTROL_RENDER_TARGET_FLUSH)
+
+#define PIPE_CONTROL_CACHE_INVALIDATE_BITS \
+ (PIPE_CONTROL_STATE_CACHE_INVALIDATE | \
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE | \
+ PIPE_CONTROL_VF_CACHE_INVALIDATE | \
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE)
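+/* Typical usage pairs these bits with the emit helpers in
+ * crocus_pipe_control.c, e.g.
+ *   crocus_emit_end_of_pipe_sync(batch, "fast clear: pre-flush",
+ *                                PIPE_CONTROL_RENDER_TARGET_FLUSH);
+ * as done around fast clears in crocus_clear.c.
+ */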
+
+enum crocus_predicate_state {
+ /* The first two states are used if we can determine whether to draw
+ * without having to look at the values in the query object buffer. This
+ * will happen if there is no conditional render in progress, if the query
+ * object is already completed or if something else has already added
+ * samples to the preliminary result.
+ */
+ CROCUS_PREDICATE_STATE_RENDER,
+ CROCUS_PREDICATE_STATE_DONT_RENDER,
+
+ /* In this case whether to draw or not depends on the result of an
+ * MI_PREDICATE command so the predicate enable bit needs to be checked.
+ */
+ CROCUS_PREDICATE_STATE_USE_BIT,
+ /* In this case, either MI_PREDICATE doesn't exist or we lack the
+ * necessary kernel features to use it. Stall for the query result.
+ */
+ CROCUS_PREDICATE_STATE_STALL_FOR_QUERY,
+};
+
+/** @} */
+
+/**
+ * An uncompiled, API-facing shader. This is the Gallium CSO for shaders.
+ * It primarily contains the NIR for the shader.
+ *
+ * Each API-facing shader can be compiled into multiple shader variants,
+ * based on non-orthogonal state dependencies, recorded in the shader key.
+ *
+ * See crocus_compiled_shader, which represents a compiled shader variant.
+ */
+struct crocus_uncompiled_shader {
+ struct nir_shader *nir;
+
+ struct pipe_stream_output_info stream_output;
+
+ /* A SHA1 of the serialized NIR for the disk cache. */
+ unsigned char nir_sha1[20];
+
+ unsigned program_id;
+
+ /** Bitfield of (1 << CROCUS_NOS_*) flags. */
+ unsigned nos;
+
+ /** Have any shader variants been compiled yet? */
+ bool compiled_once;
+
+ /** Should we use ALT mode for math? Useful for ARB programs. */
+ bool use_alt_mode;
+
+ bool needs_edge_flag;
+
+ /** Constant data scraped from the shader by nir_opt_large_constants */
+ struct pipe_resource *const_data;
+
+ /** Surface state for const_data */
+ struct crocus_state_ref const_data_state;
+};
+
+enum crocus_surface_group {
+ CROCUS_SURFACE_GROUP_RENDER_TARGET,
+ CROCUS_SURFACE_GROUP_RENDER_TARGET_READ,
+ CROCUS_SURFACE_GROUP_SOL,
+ CROCUS_SURFACE_GROUP_CS_WORK_GROUPS,
+ CROCUS_SURFACE_GROUP_TEXTURE,
+ CROCUS_SURFACE_GROUP_TEXTURE_GATHER,
+ CROCUS_SURFACE_GROUP_IMAGE,
+ CROCUS_SURFACE_GROUP_UBO,
+ CROCUS_SURFACE_GROUP_SSBO,
+
+ CROCUS_SURFACE_GROUP_COUNT,
+};
+
+enum {
+ /* Invalid value for a binding table index. */
+ CROCUS_SURFACE_NOT_USED = 0xa0a0a0a0,
+};
+
+struct crocus_binding_table {
+ uint32_t size_bytes;
+
+ /** Number of surfaces in each group, before compacting. */
+ uint32_t sizes[CROCUS_SURFACE_GROUP_COUNT];
+
+ /** Initial offset of each group. */
+ uint32_t offsets[CROCUS_SURFACE_GROUP_COUNT];
+
+ /** Mask of surfaces used in each group. */
+ uint64_t used_mask[CROCUS_SURFACE_GROUP_COUNT];
+};
+
+/**
+ * A compiled shader variant, containing a pointer to the GPU assembly,
+ * as well as program data and other packets needed by state upload.
+ *
+ * There can be several crocus_compiled_shader variants per API-level shader
+ * (crocus_uncompiled_shader), due to state-based recompiles (brw_*_prog_key).
+ */
+struct crocus_compiled_shader {
+ /** Reference to the uploaded assembly. */
+ uint32_t offset;
+
+ /* asm size in map */
+ uint32_t map_size;
+
+ /** The program data (owned by the program cache hash table) */
+ struct brw_stage_prog_data *prog_data;
+ uint32_t prog_data_size;
+
+ /** A list of system values to be uploaded as uniforms. */
+ enum brw_param_builtin *system_values;
+ unsigned num_system_values;
+
+ /** Number of constbufs expected by the shader. */
+ unsigned num_cbufs;
+
+ /**
+ * Derived 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets
+ * (the VUE-based information for transform feedback outputs).
+ */
+ uint32_t *streamout;
+
+ struct crocus_binding_table bt;
+
+ uint32_t bind_bo_offset;
+ uint32_t surf_offset[128];//TODO
+};
+
+/**
+ * API context state that is replicated per shader stage.
+ */
+struct crocus_shader_state {
+ /** Uniform Buffers */
+ struct pipe_constant_buffer constbufs[PIPE_MAX_CONSTANT_BUFFERS];
+
+ bool sysvals_need_upload;
+
+ /** Shader Storage Buffers */
+ struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS];
+
+ /** Shader Storage Images (image load store) */
+ struct crocus_image_view image[PIPE_MAX_SHADER_IMAGES];
+
+ struct crocus_sampler_state *samplers[CROCUS_MAX_TEXTURE_SAMPLERS];
+ struct crocus_sampler_view *textures[CROCUS_MAX_TEXTURE_SAMPLERS];
+
+ /** Bitfield of which constant buffers are bound (non-null). */
+ uint32_t bound_cbufs;
+
+ /** Bitfield of which image views are bound (non-null). */
+ uint32_t bound_image_views;
+
+ /** Bitfield of which sampler views are bound (non-null). */
+ uint32_t bound_sampler_views;
+
+ /** Bitfield of which shader storage buffers are bound (non-null). */
+ uint32_t bound_ssbos;
+
+ /** Bitfield of which shader storage buffers are writable. */
+ uint32_t writable_ssbos;
+
+ uint32_t sampler_offset;
+};
+
+/**
+ * The API context (derived from pipe_context).
+ *
+ * Most driver state is tracked here.
+ */
+struct crocus_context {
+ struct pipe_context ctx;
+
+ /** A debug callback for KHR_debug output. */
+ struct pipe_debug_callback dbg;
+
+ /** A device reset status callback for notifying that the GPU is hosed. */
+ struct pipe_device_reset_callback reset;
+
+ /** Slab allocator for crocus_transfer_map objects. */
+ struct slab_child_pool transfer_pool;
+
+ struct blorp_context blorp;
+
+ int batch_count;
+ struct crocus_batch batches[CROCUS_BATCH_COUNT];
+
+ struct u_upload_mgr *query_buffer_uploader;
+
+ struct blitter_context *blitter;
+
+ struct {
+ struct {
+ /**
+ * Either the value of BaseVertex for indexed draw calls or the value
+ * of the argument <first> for non-indexed draw calls.
+ */
+ int firstvertex;
+ int baseinstance;
+ } params;
+
+ /**
+ * Are the above values the ones stored in the draw_params buffer?
+ * If so, we can compare them against new values to see if anything
+ * changed. If not, we need to assume they changed.
+ */
+ bool params_valid;
+
+ /**
+ * Resource and offset that store draw_parameters, either from the
+ * indirect buffer or from the buffer that stores the previous values
+ * for non-indirect draws.
+ */
+ struct crocus_state_ref draw_params;
+
+ struct {
+ /**
+ * The value of DrawID. This always comes in from its own vertex
+ * buffer since it's not part of the indirect draw parameters.
+ */
+ int drawid;
+
+ /**
+ * Stores whether this is an indexed or non-indexed draw (~0/0). Useful
+ * for calculating BaseVertex as an AND of firstvertex and is_indexed_draw.
+ */
+ int is_indexed_draw;
+ } derived_params;
+
+ /**
+ * Resource and offset used for GL_ARB_shader_draw_parameters, which
+ * holds parameters not present in the indirect buffer, such as drawid
+ * and is_indexed_draw. They go in their own vertex element.
+ */
+ struct crocus_state_ref derived_draw_params;
+ } draw;
+
+ struct {
+ struct crocus_uncompiled_shader *uncompiled[MESA_SHADER_STAGES];
+ struct crocus_compiled_shader *prog[MESA_SHADER_STAGES];
+ struct brw_vue_map *last_vue_map;
+
+ struct crocus_bo *cache_bo;
+ uint32_t cache_next_offset;
+ void *cache_bo_map;
+ struct hash_table *cache;
+
+ unsigned urb_size;
+
+ /* gen 4/5 clip/sf progs */
+ struct crocus_compiled_shader *clip_prog;
+ struct crocus_compiled_shader *sf_prog;
+ /* gen4/5 prims, gen6 streamout */
+ struct crocus_compiled_shader *ff_gs_prog;
+ uint32_t clip_offset;
+ uint32_t sf_offset;
+ uint32_t wm_offset;
+ uint32_t vs_offset;
+ uint32_t gs_offset;
+ uint32_t cc_offset;
+
+ /** Is a GS or TES outputting points or lines? */
+ bool output_topology_is_points_or_lines;
+
+ /* Track last VS URB entry size */
+ unsigned last_vs_entry_size;
+
+ /**
+ * Scratch buffers for various sizes and stages.
+ *
+ * Indexed by the "Per-Thread Scratch Space" field's 4-bit encoding,
+ * and shader stage.
+ */
+ struct crocus_bo *scratch_bos[1 << 4][MESA_SHADER_STAGES];
+ } shaders;
+
+ struct {
+ struct crocus_query *query;
+ bool condition;
+ enum pipe_render_cond_flag mode;
+ } condition;
+
+ struct intel_perf_context *perf_ctx;
+
+ struct {
+ uint64_t dirty;
+ uint64_t stage_dirty;
+ uint64_t stage_dirty_for_nos[CROCUS_NOS_COUNT];
+
+ unsigned num_viewports;
+ unsigned sample_mask;
+ struct crocus_blend_state *cso_blend;
+ struct crocus_rasterizer_state *cso_rast;
+ struct crocus_depth_stencil_alpha_state *cso_zsa;
+ struct crocus_vertex_element_state *cso_vertex_elements;
+ struct pipe_blend_color blend_color;
+ struct pipe_poly_stipple poly_stipple;
+ struct pipe_viewport_state viewports[CROCUS_MAX_VIEWPORTS];
+ struct pipe_scissor_state scissors[CROCUS_MAX_VIEWPORTS];
+ struct pipe_stencil_ref stencil_ref;
+ struct pipe_framebuffer_state framebuffer;
+ struct pipe_clip_state clip_planes;
+
+ float default_outer_level[4];
+ float default_inner_level[2];
+
+ /** Bitfield of which vertex buffers are bound (non-null). */
+ uint32_t bound_vertex_buffers;
+ struct pipe_vertex_buffer vertex_buffers[16];
+ uint32_t vb_end[16];
+
+ bool primitive_restart;
+ unsigned cut_index;
+ enum pipe_prim_type prim_mode:8;
+ bool prim_is_points_or_lines;
+ uint8_t vertices_per_patch;
+
+ bool window_space_position;
+
+ /** The last compute group size */
+ uint32_t last_block[3];
+
+ /** The last compute grid size */
+ uint32_t last_grid[3];
+ /** Reference to the BO containing the compute grid size */
+ struct crocus_state_ref grid_size;
+
+ /**
+ * Array of aux usages for drawing, altered to account for any
+ * self-dependencies from resources bound for sampling and rendering.
+ */
+ enum isl_aux_usage draw_aux_usage[BRW_MAX_DRAW_BUFFERS];
+
+ /** Aux usage of the fb's depth buffer (which may or may not exist). */
+ enum isl_aux_usage hiz_usage;
+
+ /** Bitfield of whether color blending is enabled for RT[i] */
+ uint8_t blend_enables;
+
+ /** Are depth writes enabled? (Depth buffer may or may not exist.) */
+ bool depth_writes_enabled;
+
+ /** Are stencil writes enabled? (Stencil buffer may or may not exist.) */
+ bool stencil_writes_enabled;
+
+ /** GenX-specific current state */
+ struct crocus_genx_state *genx;
+
+ struct crocus_shader_state shaders[MESA_SHADER_STAGES];
+
+ /** Does the vertex shader use shader draw parameters? */
+ bool vs_uses_draw_params;
+ bool vs_uses_derived_draw_params;
+ bool vs_needs_sgvs_element;
+ bool vs_uses_vertexid;
+ bool vs_uses_instanceid;
+
+ /** Does the vertex shader use the edge flag? */
+ bool vs_needs_edge_flag;
+
+ struct pipe_stream_output_target *so_target[PIPE_MAX_SO_BUFFERS];
+ bool streamout_active;
+ int so_targets;
+
+ bool statistics_counters_enabled;
+
+ /** Current conditional rendering mode */
+ enum crocus_predicate_state predicate;
+ bool predicate_supported;
+
+ /**
+ * Query BO with a MI_PREDICATE_RESULT snapshot calculated on the
+ * render context that needs to be uploaded to the compute context.
+ */
+ struct crocus_bo *compute_predicate;
+
+ /** Is a PIPE_QUERY_PRIMITIVES_GENERATED query active? */
+ bool prims_generated_query_active;
+
+ /** 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets */
+ uint32_t *streamout;
+
+ /**
+ * Current index buffer binding: the bound resource plus its offset,
+ * size, index element size, and whether primitive restart is enabled.
+ */
+ struct {
+ struct pipe_resource *res;
+ uint32_t offset;
+ uint32_t size;
+ uint32_t index_size;
+ bool prim_restart;
+ } index_buffer;
+
+ uint32_t sf_vp_address;
+ uint32_t clip_vp_address;
+ uint32_t cc_vp_address;
+
+ uint32_t stats_wm;
+ float global_depth_offset_clamp;
+
+ uint32_t last_xfb_verts_per_prim;
+ uint64_t svbi;
+ } state;
+
+ /* BRW_NEW_URB_ALLOCATIONS:
+ */
+ struct {
+ uint32_t vsize; /* vertex size plus header in urb registers */
+ uint32_t gsize; /* GS output size in urb registers */
+ uint32_t hsize; /* Tessellation control output size in urb registers */
+ uint32_t dsize; /* Tessellation evaluation output size in urb registers */
+ uint32_t csize; /* constant buffer size in urb registers */
+ uint32_t sfsize; /* setup data size in urb registers */
+
+ bool constrained;
+
+ uint32_t nr_vs_entries;
+ uint32_t nr_hs_entries;
+ uint32_t nr_ds_entries;
+ uint32_t nr_gs_entries;
+ uint32_t nr_clip_entries;
+ uint32_t nr_sf_entries;
+ uint32_t nr_cs_entries;
+
+ uint32_t vs_start;
+ uint32_t hs_start;
+ uint32_t ds_start;
+ uint32_t gs_start;
+ uint32_t clip_start;
+ uint32_t sf_start;
+ uint32_t cs_start;
+ /**
+ * URB size in the current configuration. The units this is expressed
+ * in are somewhat inconsistent, see intel_device_info::urb::size.
+ *
+ * FINISHME: Represent the URB size consistently in KB on all platforms.
+ */
+ uint32_t size;
+
+ /* True if the most recently sent _3DSTATE_URB message allocated
+ * URB space for the GS.
+ */
+ bool gs_present;
+
+ /* True if the most recently sent _3DSTATE_URB message allocated
+ * URB space for the HS and DS.
+ */
+ bool tess_present;
+ } urb;
+
+ /* GEN4/5 curbe */
+ struct {
+ unsigned wm_start;
+ unsigned wm_size;
+ unsigned clip_start;
+ unsigned clip_size;
+ unsigned vs_start;
+ unsigned vs_size;
+ unsigned total_size;
+
+ struct crocus_resource *curbe_res;
+ unsigned curbe_offset;
+ } curbe;
+
+ /**
+ * A buffer containing a marker + description of the driver. This buffer is
+ * added to all execbuf syscalls so that we can identify the driver that
+ * generated a hang by looking at the content of the buffer in the error
+ * state. It is also used for hardware workarounds that require scratch
+ * writes or reads from some unimportant memory. To avoid overwriting the
+ * debug data, use the workaround_offset field for workarounds.
+ */
+ struct crocus_bo *workaround_bo;
+ unsigned workaround_offset;
+};
+
+#define perf_debug(dbg, ...) do { \
+ if (INTEL_DEBUG & DEBUG_PERF) \
+ dbg_printf(__VA_ARGS__); \
+ if (unlikely(dbg)) \
+ pipe_debug_message(dbg, PERF_INFO, __VA_ARGS__); \
+} while(0)
+
+
+struct pipe_context *
+crocus_create_context(struct pipe_screen *screen, void *priv, unsigned flags);
+
+void crocus_lost_context_state(struct crocus_batch *batch);
+
+void crocus_init_blit_functions(struct pipe_context *ctx);
+void crocus_init_clear_functions(struct pipe_context *ctx);
+void crocus_init_program_functions(struct pipe_context *ctx);
+void crocus_init_resource_functions(struct pipe_context *ctx);
+bool crocus_update_compiled_shaders(struct crocus_context *ice);
+void crocus_update_compiled_compute_shader(struct crocus_context *ice);
+void crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
+ unsigned threads, uint32_t *dst);
+
+
+/* crocus_blit.c */
+enum crocus_blitter_op
+{
+ CROCUS_SAVE_TEXTURES = 1,
+ CROCUS_SAVE_FRAMEBUFFER = 2,
+ CROCUS_SAVE_FRAGMENT_STATE = 4,
+ CROCUS_DISABLE_RENDER_COND = 8,
+};
+void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond);
+
+void crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl,
+ struct isl_device *isl_dev,
+ struct blorp_surf *surf,
+ struct pipe_resource *p_res,
+ enum isl_aux_usage aux_usage,
+ unsigned level,
+ bool is_render_target);
+void crocus_copy_region(struct blorp_context *blorp,
+ struct crocus_batch *batch,
+ struct pipe_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box);
+
+/* crocus_draw.c */
+void crocus_draw_vbo(struct pipe_context *ctx,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws);
+void crocus_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
+
+/* crocus_pipe_control.c */
+
+void crocus_emit_pipe_control_flush(struct crocus_batch *batch,
+ const char *reason, uint32_t flags);
+void crocus_emit_pipe_control_write(struct crocus_batch *batch,
+ const char *reason, uint32_t flags,
+ struct crocus_bo *bo, uint32_t offset,
+ uint64_t imm);
+void crocus_emit_mi_flush(struct crocus_batch *batch);
+void crocus_emit_depth_stall_flushes(struct crocus_batch *batch);
+void crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch);
+void crocus_emit_end_of_pipe_sync(struct crocus_batch *batch,
+ const char *reason, uint32_t flags);
+void crocus_flush_all_caches(struct crocus_batch *batch);
+
+#define crocus_handle_always_flush_cache(batch) \
+ if (unlikely(batch->screen->driconf.always_flush_cache)) \
+ crocus_flush_all_caches(batch);
+
+void crocus_init_flush_functions(struct pipe_context *ctx);
+
+/* crocus_program.c */
+const struct shader_info *crocus_get_shader_info(const struct crocus_context *ice,
+ gl_shader_stage stage);
+struct crocus_bo *crocus_get_scratch_space(struct crocus_context *ice,
+ unsigned per_thread_scratch,
+ gl_shader_stage stage);
+uint32_t crocus_group_index_to_bti(const struct crocus_binding_table *bt,
+ enum crocus_surface_group group,
+ uint32_t index);
+uint32_t crocus_bti_to_group_index(const struct crocus_binding_table *bt,
+ enum crocus_surface_group group,
+ uint32_t bti);
+
+/* crocus_disk_cache.c */
+
+void crocus_disk_cache_store(struct disk_cache *cache,
+ const struct crocus_uncompiled_shader *ish,
+ const struct crocus_compiled_shader *shader,
+ void *map,
+ const void *prog_key,
+ uint32_t prog_key_size);
+struct crocus_compiled_shader *
+crocus_disk_cache_retrieve(struct crocus_context *ice,
+ const struct crocus_uncompiled_shader *ish,
+ const void *prog_key,
+ uint32_t prog_key_size);
+
+/* crocus_program_cache.c */
+
+void crocus_init_program_cache(struct crocus_context *ice);
+void crocus_destroy_program_cache(struct crocus_context *ice);
+void crocus_print_program_cache(struct crocus_context *ice);
+struct crocus_compiled_shader *crocus_find_cached_shader(struct crocus_context *ice,
+ enum crocus_program_cache_id,
+ uint32_t key_size,
+ const void *key);
+struct crocus_compiled_shader *crocus_upload_shader(struct crocus_context *ice,
+ enum crocus_program_cache_id,
+ uint32_t key_size,
+ const void *key,
+ const void *assembly,
+ uint32_t asm_size,
+ struct brw_stage_prog_data *,
+ uint32_t prog_data_size,
+ uint32_t *streamout,
+ enum brw_param_builtin *sysv,
+ unsigned num_system_values,
+ unsigned num_cbufs,
+ const struct crocus_binding_table *bt);
+const void *crocus_find_previous_compile(const struct crocus_context *ice,
+ enum crocus_program_cache_id cache_id,
+ unsigned program_string_id);
+bool crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch,
+ const void *key,
+ uint32_t key_size,
+ uint32_t *kernel_out,
+ void *prog_data_out);
+bool crocus_blorp_upload_shader(struct blorp_batch *blorp_batch,
+ uint32_t stage,
+ const void *key, uint32_t key_size,
+ const void *kernel, uint32_t kernel_size,
+ const struct brw_stage_prog_data *prog_data,
+ uint32_t prog_data_size,
+ uint32_t *kernel_out,
+ void *prog_data_out);
+
+/* crocus_resolve.c */
+
+void crocus_predraw_resolve_inputs(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ bool *draw_aux_buffer_disabled,
+ gl_shader_stage stage,
+ bool consider_framebuffer);
+void crocus_predraw_resolve_framebuffer(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ bool *draw_aux_buffer_disabled);
+void crocus_postdraw_update_resolve_tracking(struct crocus_context *ice,
+ struct crocus_batch *batch);
+void crocus_cache_sets_clear(struct crocus_batch *batch);
+void crocus_flush_depth_and_render_caches(struct crocus_batch *batch);
+void crocus_cache_flush_for_read(struct crocus_batch *batch, struct crocus_bo *bo);
+void crocus_cache_flush_for_render(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ enum isl_format format,
+ enum isl_aux_usage aux_usage);
+void crocus_render_cache_add_bo(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ enum isl_format format,
+ enum isl_aux_usage aux_usage);
+void crocus_cache_flush_for_depth(struct crocus_batch *batch, struct crocus_bo *bo);
+void crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo);
+int crocus_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
+ struct pipe_driver_query_info *info);
+int crocus_get_driver_query_group_info(struct pipe_screen *pscreen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info);
+
+struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ctx);
+
+bool crocus_sw_check_cond_render(struct crocus_context *ice);
+static inline bool crocus_check_conditional_render(struct crocus_context *ice)
+{
+ if (ice->state.predicate == CROCUS_PREDICATE_STATE_STALL_FOR_QUERY)
+ return crocus_sw_check_cond_render(ice);
+ return ice->state.predicate != CROCUS_PREDICATE_STATE_DONT_RENDER;
+}
+
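+/* Declare the per-generation genX(...) entrypoints. When this header is
+ * included from a genxml-compiled source, genX is already defined; otherwise
+ * declare every generation's variants so callers (e.g. the genX_call macro in
+ * crocus_context.c) can pick one at runtime.
+ */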
+#ifdef genX
+# include "crocus_genx_protos.h"
+#else
+# define genX(x) gfx4_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx45_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx5_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx6_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx7_##x
+# include "crocus_genx_protos.h"
+# undef genX
+# define genX(x) gfx75_##x
+# include "crocus_genx_protos.h"
+# undef genX
+#endif
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_defines.h b/src/gallium/drivers/crocus/crocus_defines.h
new file mode 100644
index 00000000000..a634d0746b0
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_defines.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_DEFINES_H
+#define CROCUS_DEFINES_H
+
+/**
+ * @file crocus_defines.h
+ *
+ * Random hardware #defines that we're not using GENXML for.
+ */
+
+#define MI_PREDICATE (0xC << 23)
+# define MI_PREDICATE_LOADOP_KEEP (0 << 6)
+# define MI_PREDICATE_LOADOP_LOAD (2 << 6)
+# define MI_PREDICATE_LOADOP_LOADINV (3 << 6)
+# define MI_PREDICATE_COMBINEOP_SET (0 << 3)
+# define MI_PREDICATE_COMBINEOP_AND (1 << 3)
+# define MI_PREDICATE_COMBINEOP_OR (2 << 3)
+# define MI_PREDICATE_COMBINEOP_XOR (3 << 3)
+# define MI_PREDICATE_COMPAREOP_TRUE (0 << 0)
+# define MI_PREDICATE_COMPAREOP_FALSE (1 << 0)
+# define MI_PREDICATE_COMPAREOP_SRCS_EQUAL (2 << 0)
+# define MI_PREDICATE_COMPAREOP_DELTAS_EQUAL (3 << 0)
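+
+/* Editor's illustrative sketch (not part of this change): a predicate is
+ * programmed by ORing one value from each field group above into the single
+ * MI_PREDICATE command DWord, e.g. "render only when SRC0 != SRC1":
+ */
+#define EXAMPLE_MI_PREDICATE_SRCS_NOT_EQUAL        \
+   (MI_PREDICATE |                                 \
+    MI_PREDICATE_LOADOP_LOADINV |                  \
+    MI_PREDICATE_COMBINEOP_SET |                   \
+    MI_PREDICATE_COMPAREOP_SRCS_EQUAL)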
+
+/* Predicate registers */
+#define MI_PREDICATE_SRC0 0x2400
+#define MI_PREDICATE_SRC1 0x2408
+#define MI_PREDICATE_DATA 0x2410
+#define MI_PREDICATE_RESULT 0x2418
+#define MI_PREDICATE_RESULT_1 0x241C
+#define MI_PREDICATE_RESULT_2 0x2214
+
+#define CS_GPR(n) (0x2600 + (n) * 8)
+
+/* The number of bits in our TIMESTAMP queries. */
+#define TIMESTAMP_BITS 36
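+
+/* Editor's note (illustrative sketch, not part of this change): timestamp
+ * values read back from the hardware are wider than this, so query code
+ * typically masks them down before doing wraparound math, e.g.
+ *
+ *    result &= (UINT64_C(1) << TIMESTAMP_BITS) - 1;
+ */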
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_disk_cache.c b/src/gallium/drivers/crocus/crocus_disk_cache.c
new file mode 100644
index 00000000000..c84d043fbc8
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_disk_cache.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_disk_cache.c
+ *
+ * Functions for interacting with the on-disk shader cache.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+
+#include "compiler/nir/nir.h"
+#include "util/blob.h"
+#include "util/build_id.h"
+#include "util/disk_cache.h"
+#include "util/mesa-sha1.h"
+
+#include "crocus_context.h"
+
+static bool debug = false;
+
+/**
+ * Compute a disk cache key for the given uncompiled shader and NOS key.
+ */
+static void
+crocus_disk_cache_compute_key(struct disk_cache *cache,
+ const struct crocus_uncompiled_shader *ish,
+ const void *orig_prog_key,
+ uint32_t prog_key_size,
+ cache_key cache_key)
+{
+ /* Create a copy of the program key with program_string_id zeroed out.
+ * It's essentially random data which we don't want to include in our
+ * hashing and comparisons. We'll set a proper value on a cache hit.
+ */
+ union brw_any_prog_key prog_key;
+ memcpy(&prog_key, orig_prog_key, prog_key_size);
+ prog_key.base.program_string_id = 0;
+
+ uint8_t data[sizeof(prog_key) + sizeof(ish->nir_sha1)];
+ uint32_t data_size = prog_key_size + sizeof(ish->nir_sha1);
+
+ memcpy(data, ish->nir_sha1, sizeof(ish->nir_sha1));
+ memcpy(data + sizeof(ish->nir_sha1), &prog_key, prog_key_size);
+
+ disk_cache_compute_key(cache, data, data_size, cache_key);
+}
+
+/**
+ * Store the given compiled shader in the disk cache.
+ *
+ * This should only be called on newly compiled shaders. No checking is
+ * done to prevent repeated stores of the same shader.
+ */
+void
+crocus_disk_cache_store(struct disk_cache *cache,
+ const struct crocus_uncompiled_shader *ish,
+ const struct crocus_compiled_shader *shader,
+ void *map,
+ const void *prog_key,
+ uint32_t prog_key_size)
+{
+#ifdef ENABLE_SHADER_CACHE
+ if (!cache)
+ return;
+
+ gl_shader_stage stage = ish->nir->info.stage;
+ const struct brw_stage_prog_data *prog_data = shader->prog_data;
+
+ cache_key cache_key;
+ crocus_disk_cache_compute_key(cache, ish, prog_key, prog_key_size, cache_key);
+
+ if (debug) {
+ char sha1[41];
+ _mesa_sha1_format(sha1, cache_key);
+ fprintf(stderr, "[mesa disk cache] storing %s\n", sha1);
+ }
+
+ struct blob blob;
+ blob_init(&blob);
+
+ /* We write the following data to the cache blob:
+ *
+ * 1. Prog data (must come first because it has the assembly size)
+ * 2. Assembly code
+ * 3. Number of entries in the system value array
+ * 4. System value array
+ * 5. Legacy param array (only used for compute workgroup ID)
+ * 6. Binding table
+ */
+ blob_write_bytes(&blob, shader->prog_data, brw_prog_data_size(stage));
+ blob_write_bytes(&blob, map + shader->offset, shader->prog_data->program_size);
+ blob_write_bytes(&blob, &shader->num_system_values, sizeof(unsigned));
+ blob_write_bytes(&blob, shader->system_values,
+ shader->num_system_values * sizeof(enum brw_param_builtin));
+ blob_write_bytes(&blob, prog_data->param,
+ prog_data->nr_params * sizeof(uint32_t));
+ blob_write_bytes(&blob, &shader->bt, sizeof(shader->bt));
+
+ disk_cache_put(cache, cache_key, blob.data, blob.size, NULL);
+ blob_finish(&blob);
+#endif
+}
+
+/**
+ * Search for a compiled shader in the disk cache. If found, upload it
+ * to the in-memory program cache so we can use it.
+ */
+struct crocus_compiled_shader *
+crocus_disk_cache_retrieve(struct crocus_context *ice,
+ const struct crocus_uncompiled_shader *ish,
+ const void *prog_key,
+ uint32_t key_size)
+{
+#ifdef ENABLE_SHADER_CACHE
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ struct disk_cache *cache = screen->disk_cache;
+ gl_shader_stage stage = ish->nir->info.stage;
+
+ if (!cache)
+ return NULL;
+
+ cache_key cache_key;
+ crocus_disk_cache_compute_key(cache, ish, prog_key, key_size, cache_key);
+
+ if (debug) {
+ char sha1[41];
+ _mesa_sha1_format(sha1, cache_key);
+ fprintf(stderr, "[mesa disk cache] retrieving %s: ", sha1);
+ }
+
+ size_t size;
+ void *buffer = disk_cache_get(screen->disk_cache, cache_key, &size);
+
+ if (debug)
+ fprintf(stderr, "%s\n", buffer ? "found" : "missing");
+
+ if (!buffer)
+ return NULL;
+
+ const uint32_t prog_data_size = brw_prog_data_size(stage);
+
+ struct brw_stage_prog_data *prog_data = ralloc_size(NULL, prog_data_size);
+ const void *assembly;
+ uint32_t num_system_values;
+ uint32_t *system_values = NULL;
+ uint32_t *so_decls = NULL;
+
+ struct blob_reader blob;
+ blob_reader_init(&blob, buffer, size);
+ blob_copy_bytes(&blob, prog_data, prog_data_size);
+ assembly = blob_read_bytes(&blob, prog_data->program_size);
+ num_system_values = blob_read_uint32(&blob);
+ if (num_system_values) {
+ system_values =
+ ralloc_array(NULL, enum brw_param_builtin, num_system_values);
+ blob_copy_bytes(&blob, system_values,
+ num_system_values * sizeof(enum brw_param_builtin));
+ }
+
+ prog_data->param = NULL;
+ prog_data->pull_param = NULL;
+ assert(prog_data->nr_pull_params == 0);
+
+ if (prog_data->nr_params) {
+ prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
+ blob_copy_bytes(&blob, prog_data->param,
+ prog_data->nr_params * sizeof(uint32_t));
+ }
+
+ struct crocus_binding_table bt;
+ blob_copy_bytes(&blob, &bt, sizeof(bt));
+
+ if ((stage == MESA_SHADER_VERTEX ||
+ stage == MESA_SHADER_TESS_EVAL ||
+ stage == MESA_SHADER_GEOMETRY) && screen->devinfo.ver > 6) {
+ struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+ so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+ &vue_prog_data->vue_map);
+ }
+
+   /* System values and uniforms are stored in constant buffer 0, so the
+    * user-facing UBOs start at index 1.  If any constant buffer is needed
+    * at all, constant buffer 0 will be needed, so account for it.
+ */
+ unsigned num_cbufs = ish->nir->info.num_ubos;
+
+ if (num_cbufs || ish->nir->num_uniforms)
+ num_cbufs++;
+
+ if (num_system_values)
+ num_cbufs++;
+
+ /* Upload our newly read shader to the in-memory program cache and
+ * return it to the caller.
+ */
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, stage, key_size, prog_key, assembly,
+ prog_data->program_size,
+ prog_data, prog_data_size, so_decls, system_values,
+ num_system_values, num_cbufs, &bt);
+
+ free(buffer);
+
+ return shader;
+#else
+ return NULL;
+#endif
+}
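+
+/* Editor's illustrative sketch (not part of this change): how the in-memory
+ * program cache and this disk-cache path are typically combined by a caller.
+ * example_lookup_shader() is a hypothetical name, not crocus_program.c code.
+ */
+static inline struct crocus_compiled_shader *
+example_lookup_shader(struct crocus_context *ice,
+                      const struct crocus_uncompiled_shader *ish,
+                      enum crocus_program_cache_id cache_id,
+                      const void *key, uint32_t key_size)
+{
+   /* Fast path: the shader is already in the in-memory program cache. */
+   struct crocus_compiled_shader *shader =
+      crocus_find_cached_shader(ice, cache_id, key_size, key);
+
+   /* Otherwise try the on-disk cache, which uploads to the in-memory
+    * cache on a hit.
+    */
+   if (!shader)
+      shader = crocus_disk_cache_retrieve(ice, ish, key, key_size);
+
+   return shader; /* NULL means the caller has to compile from NIR. */
+}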
+
+/**
+ * Initialize the on-disk shader cache.
+ */
+void
+crocus_disk_cache_init(struct crocus_screen *screen)
+{
+#ifdef ENABLE_SHADER_CACHE
+ if (INTEL_DEBUG & DEBUG_DISK_CACHE_DISABLE_MASK)
+ return;
+
+ /* array length = print length + nul char + 1 extra to verify it's unused */
+ char renderer[13];
+ UNUSED int len =
+ snprintf(renderer, sizeof(renderer), "crocus_%04x", screen->pci_id);
+ assert(len == sizeof(renderer) - 2);
+
+ const struct build_id_note *note =
+ build_id_find_nhdr_for_addr(crocus_disk_cache_init);
+ assert(note && build_id_length(note) == 20); /* sha1 */
+
+ const uint8_t *id_sha1 = build_id_data(note);
+ assert(id_sha1);
+
+ char timestamp[41];
+ _mesa_sha1_format(timestamp, id_sha1);
+
+ const uint64_t driver_flags =
+ brw_get_compiler_config_value(screen->compiler);
+ screen->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
+#endif
+}
diff --git a/src/gallium/drivers/crocus/crocus_draw.c b/src/gallium/drivers/crocus/crocus_draw.c
new file mode 100644
index 00000000000..119c5571ae1
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_draw.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_draw.c
+ *
+ * The main driver hooks for drawing and launching compute shaders.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_draw.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+#include "util/u_upload_mgr.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/compiler/brw_eu_defines.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "util/u_prim_restart.h"
+#include "indices/u_primconvert.h"
+#include "util/u_prim.h"
+
+static bool
+prim_is_points_or_lines(enum pipe_prim_type mode)
+{
+ /* We don't need to worry about adjacency - it can only be used with
+ * geometry shaders, and we don't care about this info when GS is on.
+ */
+ return mode == PIPE_PRIM_POINTS ||
+ mode == PIPE_PRIM_LINES ||
+ mode == PIPE_PRIM_LINE_LOOP ||
+ mode == PIPE_PRIM_LINE_STRIP;
+}
+
+static bool
+can_cut_index_handle_restart_index(struct crocus_context *ice,
+ const struct pipe_draw_info *draw)
+{
+ switch (draw->index_size) {
+ case 1:
+ return draw->restart_index == 0xff;
+ case 2:
+ return draw->restart_index == 0xffff;
+ case 4:
+ return draw->restart_index == 0xffffffff;
+ default:
+ unreachable("illegal index size\n");
+ }
+
+ return false;
+}
+
+static bool
+can_cut_index_handle_prim(struct crocus_context *ice,
+ const struct pipe_draw_info *draw)
+{
+ struct crocus_screen *screen = (struct crocus_screen*)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ /* Haswell can do it all. */
+ if (devinfo->is_haswell)
+ return true;
+
+ if (!can_cut_index_handle_restart_index(ice, draw))
+ return false;
+
+ switch (draw->mode) {
+ case PIPE_PRIM_POINTS:
+ case PIPE_PRIM_LINES:
+ case PIPE_PRIM_LINE_STRIP:
+ case PIPE_PRIM_TRIANGLES:
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ case PIPE_PRIM_LINES_ADJACENCY:
+ case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+ case PIPE_PRIM_TRIANGLES_ADJACENCY:
+ case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+/**
+ * Record the current primitive mode and restart information, flagging
+ * related packets as dirty if necessary.
+ *
+ * This must be called before updating compiled shaders, because the patch
+ * information informs the TCS key.
+ */
+static void
+crocus_update_draw_info(struct crocus_context *ice,
+ const struct pipe_draw_info *info,
+ const struct pipe_draw_start_count_bias *draw)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ enum pipe_prim_type mode = info->mode;
+
+ if (screen->devinfo.ver < 6) {
+ /* Slight optimization to avoid the GS program when not needed:
+ */
+ struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+ if (mode == PIPE_PRIM_QUAD_STRIP && !rs_state->flatshade &&
+ rs_state->fill_front == PIPE_POLYGON_MODE_FILL &&
+ rs_state->fill_back == PIPE_POLYGON_MODE_FILL)
+ mode = PIPE_PRIM_TRIANGLE_STRIP;
+ if (mode == PIPE_PRIM_QUADS &&
+ draw->count == 4 &&
+ !rs_state->flatshade &&
+ rs_state->fill_front == PIPE_POLYGON_MODE_FILL &&
+ rs_state->fill_back == PIPE_POLYGON_MODE_FILL)
+ mode = PIPE_PRIM_TRIANGLE_FAN;
+ }
+
+ if (ice->state.prim_mode != mode) {
+ ice->state.prim_mode = mode;
+
+ if (screen->devinfo.ver < 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+ if (screen->devinfo.ver <= 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+
+ if (screen->devinfo.ver >= 7)
+ ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+
+ /* For XY Clip enables */
+ bool points_or_lines = prim_is_points_or_lines(mode);
+ if (points_or_lines != ice->state.prim_is_points_or_lines) {
+ ice->state.prim_is_points_or_lines = points_or_lines;
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ }
+ }
+
+ if (info->mode == PIPE_PRIM_PATCHES &&
+ ice->state.vertices_per_patch != info->vertices_per_patch) {
+ ice->state.vertices_per_patch = info->vertices_per_patch;
+
+ /* This is needed for key->input_vertices */
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_TCS;
+
+ /* Flag constants dirty for gl_PatchVerticesIn if needed. */
+ const struct shader_info *tcs_info =
+ crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
+ if (tcs_info &&
+ BITSET_TEST(tcs_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+ ice->state.shaders[MESA_SHADER_TESS_CTRL].sysvals_need_upload = true;
+ }
+ }
+
+ const unsigned cut_index = info->primitive_restart ? info->restart_index :
+ ice->state.cut_index;
+ if (ice->state.primitive_restart != info->primitive_restart ||
+ ice->state.cut_index != cut_index) {
+ if (screen->devinfo.is_haswell)
+ ice->state.dirty |= CROCUS_DIRTY_GEN75_VF;
+ ice->state.primitive_restart = info->primitive_restart;
+      ice->state.cut_index = cut_index;
+ }
+}
+
+/**
+ * Update shader draw parameters, flagging VF packets as dirty if necessary.
+ */
+static void
+crocus_update_draw_parameters(struct crocus_context *ice,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draw)
+{
+ bool changed = false;
+
+ if (ice->state.vs_uses_draw_params) {
+ struct crocus_state_ref *draw_params = &ice->draw.draw_params;
+
+ if (indirect && indirect->buffer) {
+ pipe_resource_reference(&draw_params->res, indirect->buffer);
+ draw_params->offset =
+ indirect->offset + (info->index_size ? 12 : 8);
+
+ changed = true;
+ ice->draw.params_valid = false;
+ } else {
+ int firstvertex = info->index_size ? draw->index_bias : draw->start;
+
+ if (!ice->draw.params_valid ||
+ ice->draw.params.firstvertex != firstvertex ||
+ ice->draw.params.baseinstance != info->start_instance) {
+
+ changed = true;
+ ice->draw.params.firstvertex = firstvertex;
+ ice->draw.params.baseinstance = info->start_instance;
+ ice->draw.params_valid = true;
+
+ u_upload_data(ice->ctx.stream_uploader, 0,
+ sizeof(ice->draw.params), 4, &ice->draw.params,
+ &draw_params->offset, &draw_params->res);
+ }
+ }
+ }
+
+ if (ice->state.vs_uses_derived_draw_params) {
+ struct crocus_state_ref *derived_params = &ice->draw.derived_draw_params;
+ int is_indexed_draw = info->index_size ? -1 : 0;
+
+ if (ice->draw.derived_params.drawid != drawid_offset ||
+ ice->draw.derived_params.is_indexed_draw != is_indexed_draw) {
+
+ changed = true;
+ ice->draw.derived_params.drawid = drawid_offset;
+ ice->draw.derived_params.is_indexed_draw = is_indexed_draw;
+
+ u_upload_data(ice->ctx.stream_uploader, 0,
+ sizeof(ice->draw.derived_params), 4,
+ &ice->draw.derived_params, &derived_params->offset,
+ &derived_params->res);
+ }
+ }
+
+ if (changed) {
+ ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS |
+ CROCUS_DIRTY_VERTEX_ELEMENTS;
+ }
+}
+
+static void
+crocus_indirect_draw_vbo(struct crocus_context *ice,
+ const struct pipe_draw_info *dinfo,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *dindirect,
+ const struct pipe_draw_start_count_bias *draws)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ struct pipe_draw_info info = *dinfo;
+ struct pipe_draw_indirect_info indirect = *dindirect;
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ if (devinfo->is_haswell && indirect.indirect_draw_count &&
+ ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+      /* Save MI_PREDICATE_RESULT to GPR15. */
+ screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT);
+ }
+
+ uint64_t orig_dirty = ice->state.dirty;
+ uint64_t orig_stage_dirty = ice->state.stage_dirty;
+
+ for (int i = 0; i < indirect.draw_count; i++) {
+ crocus_batch_maybe_flush(batch, 1500);
+ crocus_require_statebuffer_space(batch, 2400);
+
+ crocus_update_draw_parameters(ice, &info, drawid_offset + i, &indirect, draws);
+
+ screen->vtbl.upload_render_state(ice, batch, &info, drawid_offset + i, &indirect, draws);
+
+ ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER;
+ ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+
+ indirect.offset += indirect.stride;
+ }
+
+ if (devinfo->is_haswell && indirect.indirect_draw_count &&
+ ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+ /* Restore MI_PREDICATE_RESULT. */
+ screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15));
+ }
+
+ /* Put this back for post-draw resolves, we'll clear it again after. */
+ ice->state.dirty = orig_dirty;
+ ice->state.stage_dirty = orig_stage_dirty;
+}
+
+static void
+crocus_simple_draw_vbo(struct crocus_context *ice,
+ const struct pipe_draw_info *draw,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *sc)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+
+ crocus_batch_maybe_flush(batch, 1500);
+ crocus_require_statebuffer_space(batch, 2400);
+
+ crocus_update_draw_parameters(ice, draw, drawid_offset, indirect, sc);
+
+ screen->vtbl.upload_render_state(ice, batch, draw, drawid_offset, indirect, sc);
+}
+
+static void
+crocus_draw_vbo_get_vertex_count(struct pipe_context *ctx,
+ const struct pipe_draw_info *info_in,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ struct pipe_draw_info info = *info_in;
+ struct pipe_draw_start_count_bias draw;
+
+ uint32_t val = screen->vtbl.get_so_offset(indirect->count_from_stream_output);
+
+ draw.start = 0;
+ draw.count = val;
+ ctx->draw_vbo(ctx, &info, drawid_offset, NULL, &draw, 1);
+}
+
+/**
+ * The pipe->draw_vbo() driver hook. Performs a draw on the GPU.
+ */
+void
+crocus_draw_vbo(struct pipe_context *ctx,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws)
+{
+ if (num_draws > 1) {
+ util_draw_multi(ctx, info, drawid_offset, indirect, draws, num_draws);
+ return;
+ }
+
+ if (!indirect && (!draws[0].count || !info->instance_count))
+ return;
+
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_screen *screen = (struct crocus_screen*)ice->ctx.screen;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (info->primitive_restart && !can_cut_index_handle_prim(ice, info)) {
+ util_draw_vbo_without_prim_restart(ctx, info, drawid_offset,
+ indirect, draws);
+ return;
+ }
+
+ if (indirect && indirect->count_from_stream_output &&
+ !screen->devinfo.is_haswell) {
+ crocus_draw_vbo_get_vertex_count(ctx, info, drawid_offset, indirect);
+ return;
+ }
+
+   /* The hardware is capable of removing dangling vertices on its own;
+    * however, prior to Gen6 we sometimes convert quads into trifans (and
+    * quad strips into tristrips), since pre-Gen6 hardware requires a GS to
+    * render quads.  Manually trim dangling vertices from such draw calls
+    * here so that they won't get drawn after the conversion to
+    * trifans/tristrips.
+    */
+ if (screen->devinfo.ver < 6) {
+ if (info->mode == PIPE_PRIM_QUADS || info->mode == PIPE_PRIM_QUAD_STRIP) {
+ bool trim = u_trim_pipe_prim(info->mode, (unsigned *)&draws[0].count);
+ if (!trim)
+ return;
+ }
+ }
+
+ /* We can't safely re-emit 3DSTATE_SO_BUFFERS because it may zero the
+ * write offsets, changing the behavior.
+ */
+ if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) {
+ ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER & ~CROCUS_DIRTY_GEN7_SO_BUFFERS;
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+ }
+
+ /* Emit Sandybridge workaround flushes on every primitive, for safety. */
+ if (screen->devinfo.ver == 6)
+ crocus_emit_post_sync_nonzero_flush(batch);
+
+ crocus_update_draw_info(ice, info, draws);
+
+ if (!crocus_update_compiled_shaders(ice))
+ return;
+
+ if (ice->state.dirty & CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES) {
+ bool draw_aux_buffer_disabled[BRW_MAX_DRAW_BUFFERS] = { };
+ for (gl_shader_stage stage = 0; stage < MESA_SHADER_COMPUTE; stage++) {
+ if (ice->shaders.prog[stage])
+ crocus_predraw_resolve_inputs(ice, batch, draw_aux_buffer_disabled,
+ stage, true);
+ }
+ crocus_predraw_resolve_framebuffer(ice, batch, draw_aux_buffer_disabled);
+ }
+
+ crocus_handle_always_flush_cache(batch);
+
+ if (indirect && indirect->buffer)
+ crocus_indirect_draw_vbo(ice, info, drawid_offset, indirect, draws);
+ else
+ crocus_simple_draw_vbo(ice, info, drawid_offset, indirect, draws);
+
+ crocus_handle_always_flush_cache(batch);
+
+ crocus_postdraw_update_resolve_tracking(ice, batch);
+
+ ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER;
+ ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+}
+
+static void
+crocus_update_grid_size_resource(struct crocus_context *ice,
+ const struct pipe_grid_info *grid)
+{
+ struct crocus_state_ref *grid_ref = &ice->state.grid_size;
+ const struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_COMPUTE];
+ bool grid_needs_surface = shader->bt.used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS];
+
+ if (grid->indirect) {
+ pipe_resource_reference(&grid_ref->res, grid->indirect);
+ grid_ref->offset = grid->indirect_offset;
+
+ /* Zero out the grid size so that the next non-indirect grid launch will
+ * re-upload it properly.
+ */
+ memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
+ } else if (memcmp(ice->state.last_grid, grid->grid, sizeof(grid->grid)) != 0) {
+ memcpy(ice->state.last_grid, grid->grid, sizeof(grid->grid));
+ u_upload_data(ice->ctx.const_uploader, 0, sizeof(grid->grid), 4,
+ grid->grid, &grid_ref->offset, &grid_ref->res);
+ }
+
+   /* Skip flagging the binding table dirty if the grid surface isn't needed. */
+ if (!grid_needs_surface)
+ return;
+
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_CS;
+}
+
+
+void
+crocus_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_COMPUTE];
+ struct crocus_screen *screen = batch->screen;
+
+ if (!crocus_check_conditional_render(ice))
+ return;
+
+ if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) {
+ ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
+ }
+
+ /* We can't do resolves on the compute engine, so awkwardly, we have to
+ * do them on the render batch...
+ */
+ if (ice->state.dirty & CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES) {
+ crocus_predraw_resolve_inputs(ice, &ice->batches[CROCUS_BATCH_RENDER], NULL,
+ MESA_SHADER_COMPUTE, false);
+ }
+
+ crocus_batch_maybe_flush(batch, 1500);
+ crocus_require_statebuffer_space(batch, 2500);
+ crocus_update_compiled_compute_shader(ice);
+
+ if (memcmp(ice->state.last_block, grid->block, sizeof(grid->block)) != 0) {
+ memcpy(ice->state.last_block, grid->block, sizeof(grid->block));
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
+ ice->state.shaders[MESA_SHADER_COMPUTE].sysvals_need_upload = true;
+ }
+
+ crocus_update_grid_size_resource(ice, grid);
+
+ if (ice->state.compute_predicate) {
+ screen->vtbl.emit_compute_predicate(batch);
+ ice->state.compute_predicate = NULL;
+ }
+
+ crocus_handle_always_flush_cache(batch);
+
+ screen->vtbl.upload_compute_state(ice, batch, grid);
+
+ crocus_handle_always_flush_cache(batch);
+
+ ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_COMPUTE;
+ ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
+
+ /* Note: since compute shaders can't access the framebuffer, there's
+ * no need to call crocus_postdraw_update_resolve_tracking.
+ */
+}
diff --git a/src/gallium/drivers/crocus/crocus_fence.c b/src/gallium/drivers/crocus/crocus_fence.c
new file mode 100644
index 00000000000..fdff24b2dd4
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fence.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_fence.c
+ *
+ * Fences for driver and IPC serialisation, scheduling and synchronisation.
+ */
+
+#include "util/u_inlines.h"
+#include "intel/common/intel_gem.h"
+
+#include "crocus_batch.h"
+#include "crocus_bufmgr.h"
+#include "crocus_context.h"
+#include "crocus_fence.h"
+#include "crocus_screen.h"
+
+static uint32_t
+gem_syncobj_create(int fd, uint32_t flags)
+{
+ struct drm_syncobj_create args = {
+ .flags = flags,
+ };
+
+ intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_CREATE, &args);
+
+ return args.handle;
+}
+
+static void
+gem_syncobj_destroy(int fd, uint32_t handle)
+{
+ struct drm_syncobj_destroy args = {
+ .handle = handle,
+ };
+
+ intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_DESTROY, &args);
+}
+
+/**
+ * Make a new sync-point.
+ */
+struct crocus_syncobj *
+crocus_create_syncobj(struct crocus_screen *screen)
+{
+ struct crocus_syncobj *syncobj = malloc(sizeof(*syncobj));
+
+ if (!syncobj)
+ return NULL;
+
+ syncobj->handle = gem_syncobj_create(screen->fd, 0);
+ assert(syncobj->handle);
+
+ pipe_reference_init(&syncobj->ref, 1);
+
+ return syncobj;
+}
+
+void
+crocus_syncobj_destroy(struct crocus_screen *screen,
+ struct crocus_syncobj *syncobj)
+{
+ gem_syncobj_destroy(screen->fd, syncobj->handle);
+ free(syncobj);
+}
+
+/**
+ * Add a sync-point to the batch, with the given flags.
+ *
+ * \p flags One of I915_EXEC_FENCE_WAIT or I915_EXEC_FENCE_SIGNAL.
+ */
+void
+crocus_batch_add_syncobj(struct crocus_batch *batch,
+ struct crocus_syncobj *syncobj, unsigned flags)
+{
+ struct drm_i915_gem_exec_fence *fence =
+ util_dynarray_grow(&batch->exec_fences, struct drm_i915_gem_exec_fence, 1);
+
+ *fence = (struct drm_i915_gem_exec_fence){
+ .handle = syncobj->handle,
+ .flags = flags,
+ };
+
+ struct crocus_syncobj **store =
+ util_dynarray_grow(&batch->syncobjs, struct crocus_syncobj *, 1);
+
+ *store = NULL;
+ crocus_syncobj_reference(batch->screen, store, syncobj);
+}
+
+/**
+ * Walk through a batch's dependencies (any I915_EXEC_FENCE_WAIT syncobjs)
+ * and unreference any which have already passed.
+ *
+ * In particular, a seldom-used compute batch can accumulate references to
+ * stale render batches that are no longer of interest, so we can free
+ * those up.
+ */
+static void
+clear_stale_syncobjs(struct crocus_batch *batch)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ int n = util_dynarray_num_elements(&batch->syncobjs, struct crocus_syncobj *);
+
+ assert(n == util_dynarray_num_elements(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence));
+
+ /* Skip the first syncobj, as it's the signalling one. */
+ for (int i = n - 1; i > 1; i--) {
+ struct crocus_syncobj **syncobj =
+ util_dynarray_element(&batch->syncobjs, struct crocus_syncobj *, i);
+ struct drm_i915_gem_exec_fence *fence =
+ util_dynarray_element(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence, i);
+ assert(fence->flags & I915_EXEC_FENCE_WAIT);
+
+ if (crocus_wait_syncobj(&screen->base, *syncobj, 0))
+ continue;
+
+ /* This sync object has already passed, there's no need to continue
+ * marking it as a dependency; we can stop holding on to the reference.
+ */
+ crocus_syncobj_reference(screen, syncobj, NULL);
+
+ /* Remove it from the lists; move the last element here. */
+ struct crocus_syncobj **nth_syncobj =
+ util_dynarray_pop_ptr(&batch->syncobjs, struct crocus_syncobj *);
+ struct drm_i915_gem_exec_fence *nth_fence =
+ util_dynarray_pop_ptr(&batch->exec_fences,
+ struct drm_i915_gem_exec_fence);
+
+ if (syncobj != nth_syncobj) {
+ *syncobj = *nth_syncobj;
+ memcpy(fence, nth_fence, sizeof(*fence));
+ }
+ }
+}
+
+/* ------------------------------------------------------------------- */
+
+struct pipe_fence_handle {
+ struct pipe_reference ref;
+
+ struct pipe_context *unflushed_ctx;
+
+ struct crocus_fine_fence *fine[CROCUS_BATCH_COUNT];
+};
+
+static void
+crocus_fence_destroy(struct pipe_screen *p_screen,
+ struct pipe_fence_handle *fence)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++)
+ crocus_fine_fence_reference(screen, &fence->fine[i], NULL);
+
+ free(fence);
+}
+
+static void
+crocus_fence_reference(struct pipe_screen *p_screen,
+ struct pipe_fence_handle **dst,
+ struct pipe_fence_handle *src)
+{
+ if (pipe_reference(&(*dst)->ref, &src->ref))
+ crocus_fence_destroy(p_screen, *dst);
+
+ *dst = src;
+}
+
+bool
+crocus_wait_syncobj(struct pipe_screen *p_screen,
+ struct crocus_syncobj *syncobj, int64_t timeout_nsec)
+{
+ if (!syncobj)
+ return false;
+
+ struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+ struct drm_syncobj_wait args = {
+ .handles = (uintptr_t)&syncobj->handle,
+ .count_handles = 1,
+ .timeout_nsec = timeout_nsec,
+ };
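+
+   /* Note: intel_ioctl() returns 0 on success, so this returns false once
+    * the syncobj has signaled and true while the wait fails or times out;
+    * callers such as clear_stale_syncobjs() rely on that convention.
+    */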
+ return intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args);
+}
+
+static void
+crocus_fence_flush(struct pipe_context *ctx,
+ struct pipe_fence_handle **out_fence, unsigned flags)
+{
+ struct crocus_screen *screen = (void *)ctx->screen;
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ const bool deferred = flags & PIPE_FLUSH_DEFERRED;
+
+ if (!deferred) {
+ for (unsigned i = 0; i < ice->batch_count; i++)
+ crocus_batch_flush(&ice->batches[i]);
+ }
+
+ if (!out_fence)
+ return;
+
+ struct pipe_fence_handle *fence = calloc(1, sizeof(*fence));
+ if (!fence)
+ return;
+
+ pipe_reference_init(&fence->ref, 1);
+
+ if (deferred)
+ fence->unflushed_ctx = ctx;
+
+ for (unsigned b = 0; b < ice->batch_count; b++) {
+ struct crocus_batch *batch = &ice->batches[b];
+
+ if (deferred && crocus_batch_bytes_used(batch) > 0) {
+ struct crocus_fine_fence *fine =
+ crocus_fine_fence_new(batch, CROCUS_FENCE_BOTTOM_OF_PIPE);
+ crocus_fine_fence_reference(screen, &fence->fine[b], fine);
+ crocus_fine_fence_reference(screen, &fine, NULL);
+ } else {
+ /* This batch has no commands queued up (perhaps we just flushed,
+ * or all the commands are on the other batch). Wait for the last
+ * syncobj on this engine - unless it's already finished by now.
+ */
+ if (crocus_fine_fence_signaled(batch->last_fence))
+ continue;
+
+ crocus_fine_fence_reference(screen, &fence->fine[b],
+ batch->last_fence);
+ }
+ }
+
+ crocus_fence_reference(ctx->screen, out_fence, NULL);
+ *out_fence = fence;
+}
+
+static void
+crocus_fence_await(struct pipe_context *ctx, struct pipe_fence_handle *fence)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ /* Unflushed fences from the same context are no-ops. */
+ if (ctx && ctx == fence->unflushed_ctx)
+ return;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ for (unsigned b = 0; b < ice->batch_count; b++) {
+ struct crocus_batch *batch = &ice->batches[b];
+
+ /* We're going to make any future work in this batch wait for our
+ * fence to have gone by. But any currently queued work doesn't
+ * need to wait. Flush the batch now, so it can happen sooner.
+ */
+ crocus_batch_flush(batch);
+
+ /* Before adding a new reference, clean out any stale ones. */
+ clear_stale_syncobjs(batch);
+
+ crocus_batch_add_syncobj(batch, fine->syncobj, I915_EXEC_FENCE_WAIT);
+ }
+ }
+}
+
+#define NSEC_PER_SEC (1000 * USEC_PER_SEC)
+#define USEC_PER_SEC (1000 * MSEC_PER_SEC)
+#define MSEC_PER_SEC (1000)
+
+static uint64_t
+gettime_ns(void)
+{
+ struct timespec current;
+ clock_gettime(CLOCK_MONOTONIC, &current);
+ return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
+}
+
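+/* Convert a relative timeout in nanoseconds into the absolute CLOCK_MONOTONIC
+ * deadline that DRM_IOCTL_SYNCOBJ_WAIT expects, clamping so the addition
+ * cannot overflow INT64_MAX.  A zero timeout stays zero (non-blocking poll).
+ */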
+static uint64_t
+rel2abs(uint64_t timeout)
+{
+ if (timeout == 0)
+ return 0;
+
+ uint64_t current_time = gettime_ns();
+ uint64_t max_timeout = (uint64_t)INT64_MAX - current_time;
+
+ timeout = MIN2(max_timeout, timeout);
+
+ return current_time + timeout;
+}
+
+static bool
+crocus_fence_finish(struct pipe_screen *p_screen, struct pipe_context *ctx,
+ struct pipe_fence_handle *fence, uint64_t timeout)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+
+ /* If we created the fence with PIPE_FLUSH_DEFERRED, we may not have
+ * flushed yet. Check if our syncobj is the current batch's signalling
+ * syncobj - if so, we haven't flushed and need to now.
+ *
+ * The Gallium docs mention that a flush will occur if \p ctx matches
+ * the context the fence was created with. It may be NULL, so we check
+ * that it matches first.
+ */
+ if (ctx && ctx == fence->unflushed_ctx) {
+ for (unsigned i = 0; i < ice->batch_count; i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ if (fine->syncobj == crocus_batch_get_signal_syncobj(&ice->batches[i]))
+ crocus_batch_flush(&ice->batches[i]);
+ }
+
+ /* The fence is no longer deferred. */
+ fence->unflushed_ctx = NULL;
+ }
+
+ unsigned int handle_count = 0;
+ uint32_t handles[ARRAY_SIZE(fence->fine)];
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ handles[handle_count++] = fine->syncobj->handle;
+ }
+
+ if (handle_count == 0)
+ return true;
+
+ struct drm_syncobj_wait args = {
+ .handles = (uintptr_t)handles,
+ .count_handles = handle_count,
+ .timeout_nsec = rel2abs(timeout),
+ .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL
+ };
+ if (fence->unflushed_ctx) {
+ /* This fence had a deferred flush from another context. We can't
+ * safely flush it here, because the context might be bound to a
+ * different thread, and poking at its internals wouldn't be safe.
+ *
+ * Instead, use the WAIT_FOR_SUBMIT flag to block and hope that
+ * another thread submits the work.
+ */
+ args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
+ }
+ return intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args) == 0;
+}
+
+#ifndef SYNC_IOC_MAGIC
+/* duplicated from linux/sync_file.h to avoid build-time dependency
+ * on new (v4.7) kernel headers. Once distros are mostly using
+ * something newer than v4.7 drop this and #include <linux/sync_file.h>
+ * instead.
+ */
+struct sync_merge_data {
+ char name[32];
+ __s32 fd2;
+ __s32 fence;
+ __u32 flags;
+ __u32 pad;
+};
+
+#define SYNC_IOC_MAGIC '>'
+#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data)
+#endif
+
+static int
+sync_merge_fd(int sync_fd, int new_fd)
+{
+ if (sync_fd == -1)
+ return new_fd;
+
+ if (new_fd == -1)
+ return sync_fd;
+
+ struct sync_merge_data args = {
+ .name = "crocus fence",
+ .fd2 = new_fd,
+ .fence = -1,
+ };
+
+ intel_ioctl(sync_fd, SYNC_IOC_MERGE, &args);
+ close(new_fd);
+ close(sync_fd);
+
+ return args.fence;
+}
+
+static int
+crocus_fence_get_fd(struct pipe_screen *p_screen,
+ struct pipe_fence_handle *fence)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+ int fd = -1;
+
+ /* Deferred fences aren't supported. */
+ if (fence->unflushed_ctx)
+ return -1;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ struct drm_syncobj_handle args = {
+ .handle = fine->syncobj->handle,
+ .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE,
+ .fd = -1,
+ };
+
+ intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args);
+ fd = sync_merge_fd(fd, args.fd);
+ }
+
+ if (fd == -1) {
+      /* Our fence has no syncobjs recorded. This means that all of the
+       * batches had already completed, their syncobjs had been signalled,
+ * and so we didn't bother to record them. But we're being asked to
+ * export such a fence. So export a dummy already-signalled syncobj.
+ */
+ struct drm_syncobj_handle args = {
+ .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE,
+ .fd = -1,
+ };
+
+ args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED);
+ intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args);
+ gem_syncobj_destroy(screen->fd, args.handle);
+ return args.fd;
+ }
+
+ return fd;
+}
+
+static void
+crocus_fence_create_fd(struct pipe_context *ctx, struct pipe_fence_handle **out,
+ int fd, enum pipe_fd_type type)
+{
+ assert(type == PIPE_FD_TYPE_NATIVE_SYNC || type == PIPE_FD_TYPE_SYNCOBJ);
+
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ struct drm_syncobj_handle args = {
+ .fd = fd,
+ };
+
+ if (type == PIPE_FD_TYPE_NATIVE_SYNC) {
+ args.flags = DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE;
+ args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED);
+ }
+
+ if (intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args) == -1) {
+ fprintf(stderr, "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE failed: %s\n",
+ strerror(errno));
+ if (type == PIPE_FD_TYPE_NATIVE_SYNC)
+ gem_syncobj_destroy(screen->fd, args.handle);
+ *out = NULL;
+ return;
+ }
+
+ struct crocus_syncobj *syncobj = malloc(sizeof(*syncobj));
+ if (!syncobj) {
+ *out = NULL;
+ return;
+ }
+ syncobj->handle = args.handle;
+ pipe_reference_init(&syncobj->ref, 1);
+
+ struct crocus_fine_fence *fine = calloc(1, sizeof(*fine));
+ if (!fine) {
+ free(syncobj);
+ *out = NULL;
+ return;
+ }
+
+ static const uint32_t zero = 0;
+
+ /* Fences work in terms of crocus_fine_fence, but we don't actually have a
+ * seqno for an imported fence. So, create a fake one which always
+ * returns as 'not signaled' so we fall back to using the sync object.
+ */
+ fine->seqno = UINT32_MAX;
+ fine->map = &zero;
+ fine->syncobj = syncobj;
+ fine->flags = CROCUS_FENCE_END;
+ pipe_reference_init(&fine->reference, 1);
+
+ struct pipe_fence_handle *fence = calloc(1, sizeof(*fence));
+ if (!fence) {
+ free(fine);
+ free(syncobj);
+ *out = NULL;
+ return;
+ }
+ pipe_reference_init(&fence->ref, 1);
+ fence->fine[0] = fine;
+
+ *out = fence;
+}
+
+static void
+crocus_fence_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ if (ctx == fence->unflushed_ctx)
+ return;
+
+ for (unsigned b = 0; b < ice->batch_count; b++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+ struct crocus_fine_fence *fine = fence->fine[i];
+
+         /* Skip fences that have already signaled. */
+ if (crocus_fine_fence_signaled(fine))
+ continue;
+
+ ice->batches[b].contains_fence_signal = true;
+ crocus_batch_add_syncobj(&ice->batches[b], fine->syncobj,
+ I915_EXEC_FENCE_SIGNAL);
+ }
+ }
+}
+
+void
+crocus_init_screen_fence_functions(struct pipe_screen *screen)
+{
+ screen->fence_reference = crocus_fence_reference;
+ screen->fence_finish = crocus_fence_finish;
+ screen->fence_get_fd = crocus_fence_get_fd;
+}
+
+void
+crocus_init_context_fence_functions(struct pipe_context *ctx)
+{
+ ctx->flush = crocus_fence_flush;
+ ctx->create_fence_fd = crocus_fence_create_fd;
+ ctx->fence_server_sync = crocus_fence_await;
+ ctx->fence_server_signal = crocus_fence_signal;
+}
diff --git a/src/gallium/drivers/crocus/crocus_fence.h b/src/gallium/drivers/crocus/crocus_fence.h
new file mode 100644
index 00000000000..ef2eff5259b
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fence.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_FENCE_H
+#define CROCUS_FENCE_H
+
+#include "util/u_inlines.h"
+
+struct pipe_screen;
+struct crocus_screen;
+struct crocus_batch;
+
+struct crocus_syncobj {
+ struct pipe_reference ref;
+ uint32_t handle;
+};
+
+void crocus_init_context_fence_functions(struct pipe_context *ctx);
+void crocus_init_screen_fence_functions(struct pipe_screen *screen);
+
+struct crocus_syncobj *crocus_create_syncobj(struct crocus_screen *screen);
+void crocus_syncobj_destroy(struct crocus_screen *, struct crocus_syncobj *);
+void crocus_batch_add_syncobj(struct crocus_batch *batch,
+ struct crocus_syncobj *syncobj,
+ unsigned flags);
+bool crocus_wait_syncobj(struct pipe_screen *screen,
+ struct crocus_syncobj *syncobj,
+ int64_t timeout_nsec);
+static inline void
+crocus_syncobj_reference(struct crocus_screen *screen,
+ struct crocus_syncobj **dst,
+ struct crocus_syncobj *src)
+{
+ if (pipe_reference(&(*dst)->ref, &src->ref))
+ crocus_syncobj_destroy(screen, *dst);
+
+ *dst = src;
+}
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.c b/src/gallium/drivers/crocus/crocus_fine_fence.c
new file mode 100644
index 00000000000..9bb8a9673e3
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fine_fence.c
@@ -0,0 +1,85 @@
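+/**
+ * @file crocus_fine_fence.c
+ *
+ * Lightweight sequence-number fences written by PIPE_CONTROLs inside a
+ * batch; see crocus_fine_fence.h for the details of the mechanism.
+ */
+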
+#include "crocus_context.h"
+#include "crocus_fine_fence.h"
+#include "util/u_upload_mgr.h"
+
+static void
+crocus_fine_fence_reset(struct crocus_batch *batch)
+{
+ u_upload_alloc(batch->fine_fences.uploader,
+ 0, sizeof(uint64_t), sizeof(uint64_t),
+ &batch->fine_fences.ref.offset, &batch->fine_fences.ref.res,
+ (void **)&batch->fine_fences.map);
+ WRITE_ONCE(*batch->fine_fences.map, 0);
+ batch->fine_fences.next++;
+}
+
+void
+crocus_fine_fence_init(struct crocus_batch *batch)
+{
+ batch->fine_fences.ref.res = NULL;
+ batch->fine_fences.next = 0;
+ if (batch_has_fine_fence(batch))
+ crocus_fine_fence_reset(batch);
+}
+
+static uint32_t
+crocus_fine_fence_next(struct crocus_batch *batch)
+{
+ if (!batch_has_fine_fence(batch))
+ return UINT32_MAX;
+
+ uint32_t seqno = batch->fine_fences.next++;
+
+ if (batch->fine_fences.next == 0)
+ crocus_fine_fence_reset(batch);
+
+ return seqno;
+}
+
+void
+crocus_fine_fence_destroy(struct crocus_screen *screen,
+ struct crocus_fine_fence *fine)
+{
+ crocus_syncobj_reference(screen, &fine->syncobj, NULL);
+ pipe_resource_reference(&fine->ref.res, NULL);
+ free(fine);
+}
+
+struct crocus_fine_fence *
+crocus_fine_fence_new(struct crocus_batch *batch, unsigned flags)
+{
+ struct crocus_fine_fence *fine = calloc(1, sizeof(*fine));
+ if (!fine)
+ return NULL;
+
+ pipe_reference_init(&fine->reference, 1);
+
+ fine->seqno = crocus_fine_fence_next(batch);
+
+ crocus_syncobj_reference(batch->screen, &fine->syncobj,
+ crocus_batch_get_signal_syncobj(batch));
+
+ if (!batch_has_fine_fence(batch))
+ return fine;
+ pipe_resource_reference(&fine->ref.res, batch->fine_fences.ref.res);
+ fine->ref.offset = batch->fine_fences.ref.offset;
+ fine->map = batch->fine_fences.map;
+ fine->flags = flags;
+
+ unsigned pc;
+ if (flags & CROCUS_FENCE_TOP_OF_PIPE) {
+ pc = PIPE_CONTROL_WRITE_IMMEDIATE | PIPE_CONTROL_CS_STALL;
+ } else {
+ pc = PIPE_CONTROL_WRITE_IMMEDIATE |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_TILE_CACHE_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_DATA_CACHE_FLUSH;
+ }
+ crocus_emit_pipe_control_write(batch, "fence: fine", pc,
+ crocus_resource_bo(fine->ref.res),
+ fine->ref.offset,
+ fine->seqno);
+
+ return fine;
+}
diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.h b/src/gallium/drivers/crocus/crocus_fine_fence.h
new file mode 100644
index 00000000000..ad6f02a945a
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fine_fence.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2020 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_FINE_FENCE_DOT_H
+#define CROCUS_FINE_FENCE_DOT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "crocus_screen.h"
+#include "crocus_resource.h"
+
+/**
+ * A lightweight sequence number fence.
+ *
+ * We emit PIPE_CONTROLs inside a batch (possibly in the middle)
+ * which update a monotonically increasing, 32-bit counter. We
+ * can then check if that moment has passed by either:
+ *
+ * 1. Checking on the CPU by snooping on the DWord via a coherent map
+ *
+ * 2. Blocking on the GPU with MI_SEMAPHORE_WAIT from a second batch
+ * (relying on mid-batch preemption to switch GPU execution to the
+ * batch that writes it).
+ */
+struct crocus_fine_fence {
+ struct pipe_reference reference;
+
+ /** Buffer where the seqno lives */
+ struct crocus_state_ref ref;
+
+ /** Coherent CPU map of the buffer containing the seqno DWord. */
+ const uint32_t *map;
+
+ /**
+    * A drm_syncobj that will be signaled at the end of the
+ * batch which writes this seqno. This can be used to block until
+ * the seqno has definitely passed (but may wait longer than necessary).
+ */
+ struct crocus_syncobj *syncobj;
+
+#define CROCUS_FENCE_BOTTOM_OF_PIPE 0x0 /**< Written by bottom-of-pipe flush */
+#define CROCUS_FENCE_TOP_OF_PIPE 0x1 /**< Written by top-of-pipe flush */
+#define CROCUS_FENCE_END 0x2 /**< Written at the end of a batch */
+
+ /** Information about the type of flush involved (see CROCUS_FENCE_*) */
+ uint32_t flags;
+
+ /**
+ * Sequence number expected to be written by the flush we inserted
+ * when creating this fence. The crocus_fine_fence is 'signaled' when *@map
+ * (written by the flush on the GPU) is greater-than-or-equal to @seqno.
+ */
+ uint32_t seqno;
+};
+
+void crocus_fine_fence_init(struct crocus_batch *batch);
+
+struct crocus_fine_fence *crocus_fine_fence_new(struct crocus_batch *batch,
+ unsigned flags);
+
+void crocus_fine_fence_destroy(struct crocus_screen *screen,
+ struct crocus_fine_fence *sq);
+
+static inline void
+crocus_fine_fence_reference(struct crocus_screen *screen,
+ struct crocus_fine_fence **dst,
+ struct crocus_fine_fence *src)
+{
+ if (pipe_reference(&(*dst)->reference, &src->reference))
+ crocus_fine_fence_destroy(screen, *dst);
+
+ *dst = src;
+}
+
+/**
+ * Return true if this seqno has passed.
+ *
+ * NULL is considered signaled.
+ */
+static inline bool
+crocus_fine_fence_signaled(const struct crocus_fine_fence *sq)
+{
+ if (sq && !sq->map)
+ return false;
+ return !sq || (READ_ONCE(*sq->map) >= sq->seqno);
+}
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_formats.c b/src/gallium/drivers/crocus/crocus_formats.c
new file mode 100644
index 00000000000..31762643bdc
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_formats.c
@@ -0,0 +1,576 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_formats.c
+ *
+ * Converts Gallium formats (PIPE_FORMAT_*) to hardware ones (ISL_FORMAT_*).
+ * Provides information about which formats support what features.
+ */
+
+#include "util/bitscan.h"
+#include "util/macros.h"
+#include "util/format/u_format.h"
+
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+
+static enum isl_format
+crocus_isl_format_for_pipe_format(enum pipe_format pf)
+{
+ static const enum isl_format table[PIPE_FORMAT_COUNT] = {
+ [0 ... PIPE_FORMAT_COUNT-1] = ISL_FORMAT_UNSUPPORTED,
+
+ [PIPE_FORMAT_B8G8R8A8_UNORM] = ISL_FORMAT_B8G8R8A8_UNORM,
+ [PIPE_FORMAT_B8G8R8X8_UNORM] = ISL_FORMAT_B8G8R8X8_UNORM,
+ [PIPE_FORMAT_B5G5R5A1_UNORM] = ISL_FORMAT_B5G5R5A1_UNORM,
+ [PIPE_FORMAT_B4G4R4A4_UNORM] = ISL_FORMAT_B4G4R4A4_UNORM,
+ [PIPE_FORMAT_B5G6R5_UNORM] = ISL_FORMAT_B5G6R5_UNORM,
+ [PIPE_FORMAT_R10G10B10A2_UNORM] = ISL_FORMAT_R10G10B10A2_UNORM,
+
+ [PIPE_FORMAT_Z16_UNORM] = ISL_FORMAT_R16_UNORM,
+ [PIPE_FORMAT_Z32_UNORM] = ISL_FORMAT_R32_UNORM,
+ [PIPE_FORMAT_Z32_FLOAT] = ISL_FORMAT_R32_FLOAT,
+
+ /* We translate the combined depth/stencil formats to depth only here */
+ [PIPE_FORMAT_Z24_UNORM_S8_UINT] = ISL_FORMAT_R24_UNORM_X8_TYPELESS,
+ [PIPE_FORMAT_Z24X8_UNORM] = ISL_FORMAT_R24_UNORM_X8_TYPELESS,
+ [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = ISL_FORMAT_R32_FLOAT,
+
+ [PIPE_FORMAT_S8_UINT] = ISL_FORMAT_R8_UINT,
+ [PIPE_FORMAT_X24S8_UINT] = ISL_FORMAT_R8_UINT,
+ [PIPE_FORMAT_X32_S8X24_UINT] = ISL_FORMAT_R8_UINT,
+
+ [PIPE_FORMAT_R64_FLOAT] = ISL_FORMAT_R64_FLOAT,
+ [PIPE_FORMAT_R64G64_FLOAT] = ISL_FORMAT_R64G64_FLOAT,
+ [PIPE_FORMAT_R64G64B64_FLOAT] = ISL_FORMAT_R64G64B64_FLOAT,
+ [PIPE_FORMAT_R64G64B64A64_FLOAT] = ISL_FORMAT_R64G64B64A64_FLOAT,
+ [PIPE_FORMAT_R32_FLOAT] = ISL_FORMAT_R32_FLOAT,
+ [PIPE_FORMAT_R32G32_FLOAT] = ISL_FORMAT_R32G32_FLOAT,
+ [PIPE_FORMAT_R32G32B32_FLOAT] = ISL_FORMAT_R32G32B32_FLOAT,
+ [PIPE_FORMAT_R32G32B32A32_FLOAT] = ISL_FORMAT_R32G32B32A32_FLOAT,
+ [PIPE_FORMAT_R32_UNORM] = ISL_FORMAT_R32_UNORM,
+ [PIPE_FORMAT_R32G32_UNORM] = ISL_FORMAT_R32G32_UNORM,
+ [PIPE_FORMAT_R32G32B32_UNORM] = ISL_FORMAT_R32G32B32_UNORM,
+ [PIPE_FORMAT_R32G32B32A32_UNORM] = ISL_FORMAT_R32G32B32A32_UNORM,
+ [PIPE_FORMAT_R32_USCALED] = ISL_FORMAT_R32_USCALED,
+ [PIPE_FORMAT_R32G32_USCALED] = ISL_FORMAT_R32G32_USCALED,
+ [PIPE_FORMAT_R32G32B32_USCALED] = ISL_FORMAT_R32G32B32_USCALED,
+ [PIPE_FORMAT_R32G32B32A32_USCALED] = ISL_FORMAT_R32G32B32A32_USCALED,
+ [PIPE_FORMAT_R32_SNORM] = ISL_FORMAT_R32_SNORM,
+ [PIPE_FORMAT_R32G32_SNORM] = ISL_FORMAT_R32G32_SNORM,
+ [PIPE_FORMAT_R32G32B32_SNORM] = ISL_FORMAT_R32G32B32_SNORM,
+ [PIPE_FORMAT_R32G32B32A32_SNORM] = ISL_FORMAT_R32G32B32A32_SNORM,
+ [PIPE_FORMAT_R32_SSCALED] = ISL_FORMAT_R32_SSCALED,
+ [PIPE_FORMAT_R32G32_SSCALED] = ISL_FORMAT_R32G32_SSCALED,
+ [PIPE_FORMAT_R32G32B32_SSCALED] = ISL_FORMAT_R32G32B32_SSCALED,
+ [PIPE_FORMAT_R32G32B32A32_SSCALED] = ISL_FORMAT_R32G32B32A32_SSCALED,
+ [PIPE_FORMAT_R16_UNORM] = ISL_FORMAT_R16_UNORM,
+ [PIPE_FORMAT_R16G16_UNORM] = ISL_FORMAT_R16G16_UNORM,
+ [PIPE_FORMAT_R16G16B16_UNORM] = ISL_FORMAT_R16G16B16_UNORM,
+ [PIPE_FORMAT_R16G16B16A16_UNORM] = ISL_FORMAT_R16G16B16A16_UNORM,
+ [PIPE_FORMAT_R16_USCALED] = ISL_FORMAT_R16_USCALED,
+ [PIPE_FORMAT_R16G16_USCALED] = ISL_FORMAT_R16G16_USCALED,
+ [PIPE_FORMAT_R16G16B16_USCALED] = ISL_FORMAT_R16G16B16_USCALED,
+ [PIPE_FORMAT_R16G16B16A16_USCALED] = ISL_FORMAT_R16G16B16A16_USCALED,
+ [PIPE_FORMAT_R16_SNORM] = ISL_FORMAT_R16_SNORM,
+ [PIPE_FORMAT_R16G16_SNORM] = ISL_FORMAT_R16G16_SNORM,
+ [PIPE_FORMAT_R16G16B16_SNORM] = ISL_FORMAT_R16G16B16_SNORM,
+ [PIPE_FORMAT_R16G16B16A16_SNORM] = ISL_FORMAT_R16G16B16A16_SNORM,
+ [PIPE_FORMAT_R16_SSCALED] = ISL_FORMAT_R16_SSCALED,
+ [PIPE_FORMAT_R16G16_SSCALED] = ISL_FORMAT_R16G16_SSCALED,
+ [PIPE_FORMAT_R16G16B16_SSCALED] = ISL_FORMAT_R16G16B16_SSCALED,
+ [PIPE_FORMAT_R16G16B16A16_SSCALED] = ISL_FORMAT_R16G16B16A16_SSCALED,
+ [PIPE_FORMAT_R8_UNORM] = ISL_FORMAT_R8_UNORM,
+ [PIPE_FORMAT_R8G8_UNORM] = ISL_FORMAT_R8G8_UNORM,
+ [PIPE_FORMAT_R8G8B8_UNORM] = ISL_FORMAT_R8G8B8_UNORM,
+ [PIPE_FORMAT_R8G8B8A8_UNORM] = ISL_FORMAT_R8G8B8A8_UNORM,
+ [PIPE_FORMAT_R8_USCALED] = ISL_FORMAT_R8_USCALED,
+ [PIPE_FORMAT_R8G8_USCALED] = ISL_FORMAT_R8G8_USCALED,
+ [PIPE_FORMAT_R8G8B8_USCALED] = ISL_FORMAT_R8G8B8_USCALED,
+ [PIPE_FORMAT_R8G8B8A8_USCALED] = ISL_FORMAT_R8G8B8A8_USCALED,
+ [PIPE_FORMAT_R8_SNORM] = ISL_FORMAT_R8_SNORM,
+ [PIPE_FORMAT_R8G8_SNORM] = ISL_FORMAT_R8G8_SNORM,
+ [PIPE_FORMAT_R8G8B8_SNORM] = ISL_FORMAT_R8G8B8_SNORM,
+ [PIPE_FORMAT_R8G8B8A8_SNORM] = ISL_FORMAT_R8G8B8A8_SNORM,
+ [PIPE_FORMAT_R8_SSCALED] = ISL_FORMAT_R8_SSCALED,
+ [PIPE_FORMAT_R8G8_SSCALED] = ISL_FORMAT_R8G8_SSCALED,
+ [PIPE_FORMAT_R8G8B8_SSCALED] = ISL_FORMAT_R8G8B8_SSCALED,
+ [PIPE_FORMAT_R8G8B8A8_SSCALED] = ISL_FORMAT_R8G8B8A8_SSCALED,
+ [PIPE_FORMAT_R32_FIXED] = ISL_FORMAT_R32_SFIXED,
+ [PIPE_FORMAT_R32G32_FIXED] = ISL_FORMAT_R32G32_SFIXED,
+ [PIPE_FORMAT_R32G32B32_FIXED] = ISL_FORMAT_R32G32B32_SFIXED,
+ [PIPE_FORMAT_R32G32B32A32_FIXED] = ISL_FORMAT_R32G32B32A32_SFIXED,
+ [PIPE_FORMAT_R16_FLOAT] = ISL_FORMAT_R16_FLOAT,
+ [PIPE_FORMAT_R16G16_FLOAT] = ISL_FORMAT_R16G16_FLOAT,
+ [PIPE_FORMAT_R16G16B16_FLOAT] = ISL_FORMAT_R16G16B16_FLOAT,
+ [PIPE_FORMAT_R16G16B16A16_FLOAT] = ISL_FORMAT_R16G16B16A16_FLOAT,
+
+ [PIPE_FORMAT_R8G8B8_SRGB] = ISL_FORMAT_R8G8B8_UNORM_SRGB,
+ [PIPE_FORMAT_B8G8R8A8_SRGB] = ISL_FORMAT_B8G8R8A8_UNORM_SRGB,
+ [PIPE_FORMAT_B8G8R8X8_SRGB] = ISL_FORMAT_B8G8R8X8_UNORM_SRGB,
+ [PIPE_FORMAT_R8G8B8A8_SRGB] = ISL_FORMAT_R8G8B8A8_UNORM_SRGB,
+
+ [PIPE_FORMAT_DXT1_RGB] = ISL_FORMAT_BC1_UNORM,
+ [PIPE_FORMAT_DXT1_RGBA] = ISL_FORMAT_BC1_UNORM,
+ [PIPE_FORMAT_DXT3_RGBA] = ISL_FORMAT_BC2_UNORM,
+ [PIPE_FORMAT_DXT5_RGBA] = ISL_FORMAT_BC3_UNORM,
+
+ [PIPE_FORMAT_DXT1_SRGB] = ISL_FORMAT_BC1_UNORM_SRGB,
+ [PIPE_FORMAT_DXT1_SRGBA] = ISL_FORMAT_BC1_UNORM_SRGB,
+ [PIPE_FORMAT_DXT3_SRGBA] = ISL_FORMAT_BC2_UNORM_SRGB,
+ [PIPE_FORMAT_DXT5_SRGBA] = ISL_FORMAT_BC3_UNORM_SRGB,
+
+ [PIPE_FORMAT_RGTC1_UNORM] = ISL_FORMAT_BC4_UNORM,
+ [PIPE_FORMAT_RGTC1_SNORM] = ISL_FORMAT_BC4_SNORM,
+ [PIPE_FORMAT_RGTC2_UNORM] = ISL_FORMAT_BC5_UNORM,
+ [PIPE_FORMAT_RGTC2_SNORM] = ISL_FORMAT_BC5_SNORM,
+
+ [PIPE_FORMAT_R10G10B10A2_USCALED] = ISL_FORMAT_R10G10B10A2_USCALED,
+ [PIPE_FORMAT_R11G11B10_FLOAT] = ISL_FORMAT_R11G11B10_FLOAT,
+ [PIPE_FORMAT_R9G9B9E5_FLOAT] = ISL_FORMAT_R9G9B9E5_SHAREDEXP,
+ [PIPE_FORMAT_R1_UNORM] = ISL_FORMAT_R1_UNORM,
+ [PIPE_FORMAT_R10G10B10X2_USCALED] = ISL_FORMAT_R10G10B10X2_USCALED,
+ [PIPE_FORMAT_B10G10R10A2_UNORM] = ISL_FORMAT_B10G10R10A2_UNORM,
+ [PIPE_FORMAT_R8G8B8X8_UNORM] = ISL_FORMAT_R8G8B8X8_UNORM,
+
+ [PIPE_FORMAT_I8_UNORM] = ISL_FORMAT_R8_UNORM,
+ [PIPE_FORMAT_I16_UNORM] = ISL_FORMAT_R16_UNORM,
+ [PIPE_FORMAT_I8_SNORM] = ISL_FORMAT_R8_SNORM,
+ [PIPE_FORMAT_I16_SNORM] = ISL_FORMAT_R16_SNORM,
+ [PIPE_FORMAT_I16_FLOAT] = ISL_FORMAT_R16_FLOAT,
+ [PIPE_FORMAT_I32_FLOAT] = ISL_FORMAT_R32_FLOAT,
+
+ [PIPE_FORMAT_L8_UINT] = ISL_FORMAT_L8_UINT,
+ [PIPE_FORMAT_L8_UNORM] = ISL_FORMAT_L8_UNORM,
+ [PIPE_FORMAT_L8_SNORM] = ISL_FORMAT_R8_SNORM,
+ [PIPE_FORMAT_L8_SINT] = ISL_FORMAT_L8_SINT,
+ [PIPE_FORMAT_L16_UNORM] = ISL_FORMAT_L16_UNORM,
+ [PIPE_FORMAT_L16_SNORM] = ISL_FORMAT_R16_SNORM,
+ [PIPE_FORMAT_L16_FLOAT] = ISL_FORMAT_L16_FLOAT,
+ [PIPE_FORMAT_L32_FLOAT] = ISL_FORMAT_L32_FLOAT,
+
+ [PIPE_FORMAT_A8_UNORM] = ISL_FORMAT_A8_UNORM,
+ [PIPE_FORMAT_A16_UNORM] = ISL_FORMAT_A16_UNORM,
+ [PIPE_FORMAT_A16_FLOAT] = ISL_FORMAT_A16_FLOAT,
+ [PIPE_FORMAT_A32_FLOAT] = ISL_FORMAT_A32_FLOAT,
+
+ [PIPE_FORMAT_L8A8_UNORM] = ISL_FORMAT_L8A8_UNORM,
+ [PIPE_FORMAT_L16A16_UNORM] = ISL_FORMAT_L16A16_UNORM,
+ [PIPE_FORMAT_L16A16_FLOAT] = ISL_FORMAT_L16A16_FLOAT,
+ [PIPE_FORMAT_L32A32_FLOAT] = ISL_FORMAT_L32A32_FLOAT,
+
+ /* Sadly, we have to use luminance[-alpha] formats for sRGB decoding. */
+ [PIPE_FORMAT_R8_SRGB] = ISL_FORMAT_L8_UNORM_SRGB,
+ [PIPE_FORMAT_L8_SRGB] = ISL_FORMAT_L8_UNORM_SRGB,
+ [PIPE_FORMAT_L8A8_SRGB] = ISL_FORMAT_L8A8_UNORM_SRGB,
+
+ [PIPE_FORMAT_R10G10B10A2_SSCALED] = ISL_FORMAT_R10G10B10A2_SSCALED,
+ [PIPE_FORMAT_R10G10B10A2_SNORM] = ISL_FORMAT_R10G10B10A2_SNORM,
+
+ [PIPE_FORMAT_B10G10R10A2_USCALED] = ISL_FORMAT_B10G10R10A2_USCALED,
+ [PIPE_FORMAT_B10G10R10A2_SSCALED] = ISL_FORMAT_B10G10R10A2_SSCALED,
+ [PIPE_FORMAT_B10G10R10A2_SNORM] = ISL_FORMAT_B10G10R10A2_SNORM,
+
+ [PIPE_FORMAT_R8_UINT] = ISL_FORMAT_R8_UINT,
+ [PIPE_FORMAT_R8G8_UINT] = ISL_FORMAT_R8G8_UINT,
+ [PIPE_FORMAT_R8G8B8_UINT] = ISL_FORMAT_R8G8B8_UINT,
+ [PIPE_FORMAT_R8G8B8A8_UINT] = ISL_FORMAT_R8G8B8A8_UINT,
+
+ [PIPE_FORMAT_R8_SINT] = ISL_FORMAT_R8_SINT,
+ [PIPE_FORMAT_R8G8_SINT] = ISL_FORMAT_R8G8_SINT,
+ [PIPE_FORMAT_R8G8B8_SINT] = ISL_FORMAT_R8G8B8_SINT,
+ [PIPE_FORMAT_R8G8B8A8_SINT] = ISL_FORMAT_R8G8B8A8_SINT,
+
+ [PIPE_FORMAT_R16_UINT] = ISL_FORMAT_R16_UINT,
+ [PIPE_FORMAT_R16G16_UINT] = ISL_FORMAT_R16G16_UINT,
+ [PIPE_FORMAT_R16G16B16_UINT] = ISL_FORMAT_R16G16B16_UINT,
+ [PIPE_FORMAT_R16G16B16A16_UINT] = ISL_FORMAT_R16G16B16A16_UINT,
+
+ [PIPE_FORMAT_R16_SINT] = ISL_FORMAT_R16_SINT,
+ [PIPE_FORMAT_R16G16_SINT] = ISL_FORMAT_R16G16_SINT,
+ [PIPE_FORMAT_R16G16B16_SINT] = ISL_FORMAT_R16G16B16_SINT,
+ [PIPE_FORMAT_R16G16B16A16_SINT] = ISL_FORMAT_R16G16B16A16_SINT,
+
+ [PIPE_FORMAT_R32_UINT] = ISL_FORMAT_R32_UINT,
+ [PIPE_FORMAT_R32G32_UINT] = ISL_FORMAT_R32G32_UINT,
+ [PIPE_FORMAT_R32G32B32_UINT] = ISL_FORMAT_R32G32B32_UINT,
+ [PIPE_FORMAT_R32G32B32A32_UINT] = ISL_FORMAT_R32G32B32A32_UINT,
+
+ [PIPE_FORMAT_R32_SINT] = ISL_FORMAT_R32_SINT,
+ [PIPE_FORMAT_R32G32_SINT] = ISL_FORMAT_R32G32_SINT,
+ [PIPE_FORMAT_R32G32B32_SINT] = ISL_FORMAT_R32G32B32_SINT,
+ [PIPE_FORMAT_R32G32B32A32_SINT] = ISL_FORMAT_R32G32B32A32_SINT,
+
+ [PIPE_FORMAT_B10G10R10A2_UINT] = ISL_FORMAT_B10G10R10A2_UINT,
+
+ [PIPE_FORMAT_ETC1_RGB8] = ISL_FORMAT_ETC1_RGB8,
+
+ [PIPE_FORMAT_R8G8B8X8_SRGB] = ISL_FORMAT_R8G8B8X8_UNORM_SRGB,
+ [PIPE_FORMAT_B10G10R10X2_UNORM] = ISL_FORMAT_B10G10R10X2_UNORM,
+ [PIPE_FORMAT_R16G16B16X16_UNORM] = ISL_FORMAT_R16G16B16X16_UNORM,
+ [PIPE_FORMAT_R16G16B16X16_FLOAT] = ISL_FORMAT_R16G16B16X16_FLOAT,
+ [PIPE_FORMAT_R32G32B32X32_FLOAT] = ISL_FORMAT_R32G32B32X32_FLOAT,
+
+ [PIPE_FORMAT_R10G10B10A2_UINT] = ISL_FORMAT_R10G10B10A2_UINT,
+
+ [PIPE_FORMAT_B5G6R5_SRGB] = ISL_FORMAT_B5G6R5_UNORM_SRGB,
+
+ [PIPE_FORMAT_BPTC_RGBA_UNORM] = ISL_FORMAT_BC7_UNORM,
+ [PIPE_FORMAT_BPTC_SRGBA] = ISL_FORMAT_BC7_UNORM_SRGB,
+ [PIPE_FORMAT_BPTC_RGB_FLOAT] = ISL_FORMAT_BC6H_SF16,
+ [PIPE_FORMAT_BPTC_RGB_UFLOAT] = ISL_FORMAT_BC6H_UF16,
+
+ [PIPE_FORMAT_ETC2_RGB8] = ISL_FORMAT_ETC2_RGB8,
+ [PIPE_FORMAT_ETC2_SRGB8] = ISL_FORMAT_ETC2_SRGB8,
+ [PIPE_FORMAT_ETC2_RGB8A1] = ISL_FORMAT_ETC2_RGB8_PTA,
+ [PIPE_FORMAT_ETC2_SRGB8A1] = ISL_FORMAT_ETC2_SRGB8_PTA,
+ [PIPE_FORMAT_ETC2_RGBA8] = ISL_FORMAT_ETC2_EAC_RGBA8,
+ [PIPE_FORMAT_ETC2_SRGBA8] = ISL_FORMAT_ETC2_EAC_SRGB8_A8,
+ [PIPE_FORMAT_ETC2_R11_UNORM] = ISL_FORMAT_EAC_R11,
+ [PIPE_FORMAT_ETC2_R11_SNORM] = ISL_FORMAT_EAC_SIGNED_R11,
+ [PIPE_FORMAT_ETC2_RG11_UNORM] = ISL_FORMAT_EAC_RG11,
+ [PIPE_FORMAT_ETC2_RG11_SNORM] = ISL_FORMAT_EAC_SIGNED_RG11,
+
+ [PIPE_FORMAT_FXT1_RGB] = ISL_FORMAT_FXT1,
+ [PIPE_FORMAT_FXT1_RGBA] = ISL_FORMAT_FXT1,
+
+ [PIPE_FORMAT_ASTC_4x4] = ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16,
+ [PIPE_FORMAT_ASTC_5x4] = ISL_FORMAT_ASTC_LDR_2D_5X4_FLT16,
+ [PIPE_FORMAT_ASTC_5x5] = ISL_FORMAT_ASTC_LDR_2D_5X5_FLT16,
+ [PIPE_FORMAT_ASTC_6x5] = ISL_FORMAT_ASTC_LDR_2D_6X5_FLT16,
+ [PIPE_FORMAT_ASTC_6x6] = ISL_FORMAT_ASTC_LDR_2D_6X6_FLT16,
+ [PIPE_FORMAT_ASTC_8x5] = ISL_FORMAT_ASTC_LDR_2D_8X5_FLT16,
+ [PIPE_FORMAT_ASTC_8x6] = ISL_FORMAT_ASTC_LDR_2D_8X6_FLT16,
+ [PIPE_FORMAT_ASTC_8x8] = ISL_FORMAT_ASTC_LDR_2D_8X8_FLT16,
+ [PIPE_FORMAT_ASTC_10x5] = ISL_FORMAT_ASTC_LDR_2D_10X5_FLT16,
+ [PIPE_FORMAT_ASTC_10x6] = ISL_FORMAT_ASTC_LDR_2D_10X6_FLT16,
+ [PIPE_FORMAT_ASTC_10x8] = ISL_FORMAT_ASTC_LDR_2D_10X8_FLT16,
+ [PIPE_FORMAT_ASTC_10x10] = ISL_FORMAT_ASTC_LDR_2D_10X10_FLT16,
+ [PIPE_FORMAT_ASTC_12x10] = ISL_FORMAT_ASTC_LDR_2D_12X10_FLT16,
+ [PIPE_FORMAT_ASTC_12x12] = ISL_FORMAT_ASTC_LDR_2D_12X12_FLT16,
+
+ [PIPE_FORMAT_ASTC_4x4_SRGB] = ISL_FORMAT_ASTC_LDR_2D_4X4_U8SRGB,
+ [PIPE_FORMAT_ASTC_5x4_SRGB] = ISL_FORMAT_ASTC_LDR_2D_5X4_U8SRGB,
+ [PIPE_FORMAT_ASTC_5x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_5X5_U8SRGB,
+ [PIPE_FORMAT_ASTC_6x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_6X5_U8SRGB,
+ [PIPE_FORMAT_ASTC_6x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_6X6_U8SRGB,
+ [PIPE_FORMAT_ASTC_8x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X5_U8SRGB,
+ [PIPE_FORMAT_ASTC_8x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X6_U8SRGB,
+ [PIPE_FORMAT_ASTC_8x8_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X8_U8SRGB,
+ [PIPE_FORMAT_ASTC_10x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X5_U8SRGB,
+ [PIPE_FORMAT_ASTC_10x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X6_U8SRGB,
+ [PIPE_FORMAT_ASTC_10x8_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X8_U8SRGB,
+ [PIPE_FORMAT_ASTC_10x10_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X10_U8SRGB,
+ [PIPE_FORMAT_ASTC_12x10_SRGB] = ISL_FORMAT_ASTC_LDR_2D_12X10_U8SRGB,
+ [PIPE_FORMAT_ASTC_12x12_SRGB] = ISL_FORMAT_ASTC_LDR_2D_12X12_U8SRGB,
+
+ [PIPE_FORMAT_A1B5G5R5_UNORM] = ISL_FORMAT_A1B5G5R5_UNORM,
+
+ /* We support these so that we know the API expects no alpha channel.
+ * Otherwise, the state tracker would just give us a format with alpha
+ * and we wouldn't know to override the swizzle to 1.
+ */
+ [PIPE_FORMAT_R16G16B16X16_UINT] = ISL_FORMAT_R16G16B16A16_UINT,
+ [PIPE_FORMAT_R16G16B16X16_SINT] = ISL_FORMAT_R16G16B16A16_SINT,
+ [PIPE_FORMAT_R32G32B32X32_UINT] = ISL_FORMAT_R32G32B32A32_UINT,
+ [PIPE_FORMAT_R32G32B32X32_SINT] = ISL_FORMAT_R32G32B32A32_SINT,
+ [PIPE_FORMAT_R10G10B10X2_SNORM] = ISL_FORMAT_R10G10B10A2_SNORM,
+ };
+ assert(pf < PIPE_FORMAT_COUNT);
+ return table[pf];
+}
+
+static enum isl_format
+get_render_format(enum pipe_format pformat, enum isl_format def_format)
+{
+ switch (pformat) {
+ case PIPE_FORMAT_A16_UNORM: return ISL_FORMAT_R16_UNORM;
+ case PIPE_FORMAT_A16_FLOAT: return ISL_FORMAT_R16_FLOAT;
+ case PIPE_FORMAT_A32_FLOAT: return ISL_FORMAT_R32_FLOAT;
+
+ case PIPE_FORMAT_I8_UNORM: return ISL_FORMAT_R8_UNORM;
+ case PIPE_FORMAT_I16_UNORM: return ISL_FORMAT_R16_UNORM;
+ case PIPE_FORMAT_I16_FLOAT: return ISL_FORMAT_R16_FLOAT;
+ case PIPE_FORMAT_I32_FLOAT: return ISL_FORMAT_R32_FLOAT;
+
+ case PIPE_FORMAT_L8_UNORM: return ISL_FORMAT_R8_UNORM;
+ case PIPE_FORMAT_L8_UINT: return ISL_FORMAT_R8_UINT;
+ case PIPE_FORMAT_L8_SINT: return ISL_FORMAT_R8_SINT;
+ case PIPE_FORMAT_L16_UNORM: return ISL_FORMAT_R16_UNORM;
+ case PIPE_FORMAT_L16_FLOAT: return ISL_FORMAT_R16_FLOAT;
+ case PIPE_FORMAT_L32_FLOAT: return ISL_FORMAT_R32_FLOAT;
+
+ case PIPE_FORMAT_L8A8_UNORM: return ISL_FORMAT_R8G8_UNORM;
+ case PIPE_FORMAT_L16A16_UNORM: return ISL_FORMAT_R16G16_UNORM;
+ case PIPE_FORMAT_L16A16_FLOAT: return ISL_FORMAT_R16G16_FLOAT;
+ case PIPE_FORMAT_L32A32_FLOAT: return ISL_FORMAT_R32G32_FLOAT;
+
+ default:
+ return def_format;
+ }
+}
+
+struct crocus_format_info
+crocus_format_for_usage(const struct intel_device_info *devinfo,
+ enum pipe_format pformat,
+ isl_surf_usage_flags_t usage)
+{
+ struct crocus_format_info info = { crocus_isl_format_for_pipe_format(pformat),
+ { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W } };
+
+ if (info.fmt == ISL_FORMAT_UNSUPPORTED)
+ return info;
+
+ if (pformat == PIPE_FORMAT_A8_UNORM) {
+ info.fmt = ISL_FORMAT_A8_UNORM;
+ }
+
+ if (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)
+ info.fmt = get_render_format(pformat, info.fmt);
+ if (devinfo->ver < 6) {
+ if (pformat == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+ info.fmt = ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS;
+ if (pformat == PIPE_FORMAT_X32_S8X24_UINT)
+ info.fmt = ISL_FORMAT_X32_TYPELESS_G8X24_UINT;
+ if (pformat == PIPE_FORMAT_X24S8_UINT)
+ info.fmt = ISL_FORMAT_X24_TYPELESS_G8_UINT;
+ }
+
+ const struct isl_format_layout *fmtl = isl_format_get_layout(info.fmt);
+
+ if (util_format_is_snorm(pformat)) {
+ if (util_format_is_intensity(pformat)) {
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_X;
+ info.swizzles[2] = PIPE_SWIZZLE_X;
+ info.swizzles[3] = PIPE_SWIZZLE_X;
+ } else if (util_format_is_luminance(pformat)) {
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_X;
+ info.swizzles[2] = PIPE_SWIZZLE_X;
+ info.swizzles[3] = PIPE_SWIZZLE_1;
+ } else if (util_format_is_luminance_alpha(pformat)) {
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_X;
+ info.swizzles[2] = PIPE_SWIZZLE_X;
+ info.swizzles[3] = PIPE_SWIZZLE_Y;
+ } else if (util_format_is_alpha(pformat)) {
+ info.swizzles[0] = PIPE_SWIZZLE_0;
+ info.swizzles[1] = PIPE_SWIZZLE_0;
+ info.swizzles[2] = PIPE_SWIZZLE_0;
+ info.swizzles[3] = PIPE_SWIZZLE_X;
+ }
+ }
+
+ /* When faking RGBX pipe formats with RGBA ISL formats, override alpha. */
+ if (!util_format_has_alpha(pformat) && fmtl->channels.a.type != ISL_VOID) {
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_Y;
+ info.swizzles[2] = PIPE_SWIZZLE_Z;
+ info.swizzles[3] = PIPE_SWIZZLE_1;
+ }
+
+   /* We choose RGBA over RGBX for rendering because the hardware doesn't
+    * support rendering to RGBX. However, when this internal override is used on Gen9+,
+ * fast clears don't work correctly.
+ *
+ * i965 fixes this by pretending to not support RGBX formats, and the higher
+ * layers of Mesa pick the RGBA format instead. Gallium doesn't work that
+ * way, and might choose a different format, like BGRX instead of RGBX,
+ * which will also cause problems when sampling from a surface fast cleared
+ * as RGBX. So we always choose RGBA instead of RGBX explicitly
+ * here.
+ */
+ if (isl_format_is_rgbx(info.fmt) &&
+ !isl_format_supports_rendering(devinfo, info.fmt) &&
+ (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)) {
+ info.fmt = isl_format_rgbx_to_rgba(info.fmt);
+ info.swizzles[0] = PIPE_SWIZZLE_X;
+ info.swizzles[1] = PIPE_SWIZZLE_Y;
+ info.swizzles[2] = PIPE_SWIZZLE_Z;
+ info.swizzles[3] = PIPE_SWIZZLE_1;
+ }
+
+ return info;
+}
+
+/**
+ * The pscreen->is_format_supported() driver hook.
+ *
+ * Returns true if the given format is supported for the given usage
+ * (PIPE_BIND_*) and sample count.
+ */
+bool
+crocus_is_format_supported(struct pipe_screen *pscreen,
+ enum pipe_format pformat,
+ enum pipe_texture_target target,
+ unsigned sample_count, unsigned storage_sample_count,
+ unsigned usage)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (!util_is_power_of_two_or_zero(sample_count))
+ return false;
+ if (devinfo->ver >= 7) {
+ if (sample_count > 8 || sample_count == 2)
+ return false;
+ } else if (devinfo->ver == 6) {
+ if (sample_count > 4 || sample_count == 2)
+ return false;
+ } else if (sample_count > 1) {
+ return false;
+ }
+
+ if (pformat == PIPE_FORMAT_NONE)
+ return true;
+
+ enum isl_format format = crocus_isl_format_for_pipe_format(pformat);
+
+ if (format == ISL_FORMAT_UNSUPPORTED)
+ return false;
+
+ /* no stencil texturing prior to haswell */
+ if (!devinfo->is_haswell) {
+ if (pformat == PIPE_FORMAT_S8_UINT ||
+ pformat == PIPE_FORMAT_X24S8_UINT ||
+ pformat == PIPE_FORMAT_S8X24_UINT ||
+ pformat == PIPE_FORMAT_X32_S8X24_UINT)
+         return false;
+ }
+
+ const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+ const bool is_integer = isl_format_has_int_channel(format);
+ bool supported = true;
+
+ if (sample_count > 1)
+ supported &= isl_format_supports_multisampling(devinfo, format);
+
+ if (usage & PIPE_BIND_DEPTH_STENCIL) {
+ supported &= format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS ||
+ format == ISL_FORMAT_R32_FLOAT ||
+ format == ISL_FORMAT_R24_UNORM_X8_TYPELESS ||
+ format == ISL_FORMAT_R16_UNORM ||
+ format == ISL_FORMAT_R8_UINT;
+ }
+
+ if (usage & PIPE_BIND_RENDER_TARGET) {
+ /* Alpha and luminance-alpha formats other than A8_UNORM are not
+ * renderable.
+ *
+ * For BLORP, we can apply the swizzle in the shader. But for
+ * general rendering, this would mean recompiling the shader, which
+ * we'd like to avoid doing. So we mark these formats non-renderable.
+ *
+ * We do support A8_UNORM as it's required and is renderable.
+ */
+ if (pformat != PIPE_FORMAT_A8_UNORM &&
+ (util_format_is_alpha(pformat) ||
+ util_format_is_luminance_alpha(pformat)))
+ supported = false;
+
+ enum isl_format rt_format = format;
+
+ if (isl_format_is_rgbx(format) &&
+ !isl_format_supports_rendering(devinfo, format))
+ rt_format = isl_format_rgbx_to_rgba(format);
+
+ supported &= isl_format_supports_rendering(devinfo, rt_format);
+
+ if (!is_integer)
+ supported &= isl_format_supports_alpha_blending(devinfo, rt_format);
+ }
+
+ if (usage & PIPE_BIND_SHADER_IMAGE) {
+ /* Dataport doesn't support compression, and we can't resolve an MCS
+ * compressed surface. (Buffer images may have sample count of 0.)
+ */
+ supported &= sample_count == 0;
+
+ supported &= isl_format_supports_typed_writes(devinfo, format);
+ supported &= isl_has_matching_typed_storage_image_format(devinfo, format);
+ }
+
+ if (usage & PIPE_BIND_SAMPLER_VIEW) {
+ supported &= isl_format_supports_sampling(devinfo, format);
+ bool ignore_filtering = false;
+
+ if (is_integer)
+ ignore_filtering = true;
+
+      /* Pre-Gen5: skip the filtering support check for these formats and
+       * advertise them anyway.
+       */
+ if (devinfo->ver < 5 && (format == ISL_FORMAT_R32G32B32A32_FLOAT ||
+ format == ISL_FORMAT_R24_UNORM_X8_TYPELESS ||
+ format == ISL_FORMAT_R32_FLOAT ||
+ format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS))
+ ignore_filtering = true;
+ if (!ignore_filtering)
+ supported &= isl_format_supports_filtering(devinfo, format);
+
+ /* Don't advertise 3-component RGB formats for non-buffer textures.
+ * This ensures that they are renderable from an API perspective since
+ * the state tracker will fall back to RGBA or RGBX, which are
+ * renderable. We want to render internally for copies and blits,
+ * even if the application doesn't.
+ *
+ * Buffer textures don't need to be renderable, so we support real RGB.
+ * This is useful for PBO upload, and 32-bit RGB support is mandatory.
+ */
+ if (target != PIPE_BUFFER)
+ supported &= fmtl->bpb != 24 && fmtl->bpb != 48 && fmtl->bpb != 96;
+ }
+
+ if (usage & PIPE_BIND_VERTEX_BUFFER) {
+ supported &= isl_format_supports_vertex_fetch(devinfo, format);
+
+ if (!devinfo->is_haswell) {
+ /* W/A: Pre-Haswell, the hardware doesn't really support the formats
+ * we'd like to use here, so upload everything as UINT and fix it in
+ * the shader
+ */
+ if (format == ISL_FORMAT_R10G10B10A2_UNORM ||
+ format == ISL_FORMAT_B10G10R10A2_UNORM ||
+ format == ISL_FORMAT_R10G10B10A2_SNORM ||
+ format == ISL_FORMAT_B10G10R10A2_SNORM ||
+ format == ISL_FORMAT_R10G10B10A2_USCALED ||
+ format == ISL_FORMAT_B10G10R10A2_USCALED ||
+ format == ISL_FORMAT_R10G10B10A2_SSCALED ||
+ format == ISL_FORMAT_B10G10R10A2_SSCALED)
+ supported = true;
+
+ if (format == ISL_FORMAT_R8G8B8_SINT ||
+ format == ISL_FORMAT_R8G8B8_UINT ||
+ format == ISL_FORMAT_R16G16B16_SINT ||
+ format == ISL_FORMAT_R16G16B16_UINT)
+ supported = true;
+ }
+ }
+
+ if (usage & PIPE_BIND_INDEX_BUFFER) {
+ supported &= format == ISL_FORMAT_R8_UINT ||
+ format == ISL_FORMAT_R16_UINT ||
+ format == ISL_FORMAT_R32_UINT;
+ }
+
+ return supported;
+}
diff --git a/src/gallium/drivers/crocus/crocus_genx_macros.h b/src/gallium/drivers/crocus/crocus_genx_macros.h
new file mode 100644
index 00000000000..a0309513ed2
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_genx_macros.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Macro and function definitions needed in order to use genxml.
+ *
+ * This should only be included in sources compiled per-generation.
+ */
+
+#include "crocus_batch.h"
+
+#include "genxml/gen_macros.h"
+
+#define __gen_address_type struct crocus_address
+#define __gen_user_data struct crocus_batch
+#define __gen_combine_address crocus_combine_address
+
+static inline void *
+__gen_get_batch_dwords(struct crocus_batch *batch, unsigned dwords)
+{
+ return crocus_get_command_space(batch, dwords * sizeof(uint32_t));
+}
+
+static inline struct crocus_address
+__gen_address_offset(struct crocus_address addr, uint64_t offset)
+{
+ addr.offset += offset;
+ return addr;
+}
+
+static uint64_t
+__gen_combine_address(struct crocus_batch *batch, void *location,
+ struct crocus_address addr, uint32_t delta)
+{
+ uint32_t offset = (char *)location - (char *)batch->command.map;
+
+ if (addr.bo == NULL) {
+ return addr.offset + delta;
+ } else {
+ if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) {
+ offset = (char *) location - (char *) batch->state.map;
+ return crocus_state_reloc(batch, offset, addr.bo,
+ addr.offset + delta,
+ addr.reloc_flags);
+ }
+
+ assert(!crocus_ptr_in_state_buffer(batch, location));
+
+ offset = (char *) location - (char *) batch->command.map;
+ return crocus_command_reloc(batch, offset, addr.bo,
+ addr.offset + delta,
+ addr.reloc_flags);
+ }
+}
+
+#define __gen_address_type struct crocus_address
+#define __gen_user_data struct crocus_batch
+
+#define __genxml_cmd_length(cmd) cmd ## _length
+#define __genxml_cmd_length_bias(cmd) cmd ## _length_bias
+#define __genxml_cmd_header(cmd) cmd ## _header
+#define __genxml_cmd_pack(cmd) cmd ## _pack
+#define __genxml_reg_num(cmd) cmd ## _num
+
+#include "genxml/genX_pack.h"
+#include "genxml/gen_macros.h"
+#include "genxml/genX_bits.h"
+
+/* CS_GPR(15) is reserved for combining conditional rendering predicates
+ * with GL_ARB_indirect_parameters draw number predicates.
+ */
+#define MI_BUILDER_NUM_ALLOC_GPRS 15
+#include "common/mi_builder.h"
+
+#define _crocus_pack_command(batch, cmd, dst, name) \
+ for (struct cmd name = { __genxml_cmd_header(cmd) }, \
+ *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \
+ ({ __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name); \
+ _dst = NULL; \
+ }))
+
+#define crocus_pack_command(cmd, dst, name) \
+ _crocus_pack_command(NULL, cmd, dst, name)
+
+#define _crocus_pack_state(batch, cmd, dst, name) \
+ for (struct cmd name = {}, \
+ *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \
+ __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name), \
+ _dst = NULL)
+
+#define crocus_pack_state(cmd, dst, name) \
+ _crocus_pack_state(NULL, cmd, dst, name)
+
+#define crocus_emit_cmd(batch, cmd, name) \
+ _crocus_pack_command(batch, cmd, __gen_get_batch_dwords(batch, __genxml_cmd_length(cmd)), name)
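+
+/* Typical usage of crocus_emit_cmd(), sketched for illustration (any genxml
+ * packet works; 3DSTATE_SAMPLE_MASK is just a convenient example):
+ *
+ *    crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
+ *       ms.SampleMask = 0xffff;
+ *    }
+ *
+ * The for-loop construction lets the caller fill in the template struct by
+ * field name; the pack function then serializes it into the batch when the
+ * braced block ends.
+ */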
+
+#define crocus_emit_merge(batch, dwords0, dwords1, num_dwords) \
+ do { \
+ uint32_t *dw = __gen_get_batch_dwords(batch, num_dwords); \
+ for (uint32_t i = 0; i < num_dwords; i++) \
+ dw[i] = (dwords0)[i] | (dwords1)[i]; \
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, num_dwords)); \
+ } while (0)
+
+#define crocus_emit_reg(batch, reg, name) \
+ for (struct reg name = {}, *_cont = (struct reg *)1; _cont != NULL; \
+ ({ \
+ uint32_t _dw[__genxml_cmd_length(reg)]; \
+ __genxml_cmd_pack(reg)(NULL, _dw, &name); \
+ for (unsigned i = 0; i < __genxml_cmd_length(reg); i++) { \
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { \
+ lri.RegisterOffset = __genxml_reg_num(reg); \
+ lri.DataDWord = _dw[i]; \
+ } \
+ } \
+ _cont = NULL; \
+ }))
+
+
+/**
+ * crocus_address constructor helpers:
+ *
+ * When using these to construct a CSO, pass NULL for \p bo, and manually
+ * pin the BO later. Otherwise, genxml's address handling will add the
+ * BO to the current batch's validation list at CSO creation time, rather
+ * than at draw time as desired.
+ */
+
+UNUSED static struct crocus_address
+ro_bo(struct crocus_bo *bo, uint64_t offset)
+{
+ return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT };
+}
+
+UNUSED static struct crocus_address
+rw_bo(struct crocus_bo *bo, uint64_t offset)
+{
+ return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT | RELOC_WRITE };
+}
+
+UNUSED static struct crocus_address
+ggtt_bo(struct crocus_bo *bo, uint64_t offset)
+{
+ return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT };
+}
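+
+/* Rough usage sketch: a read-only relocation to a hypothetical "bo" at byte
+ * offset 256 would be written as ro_bo(bo, 256); rw_bo() is the same but
+ * additionally marks the BO as written by the GPU. For a CSO packed ahead
+ * of time, pass ro_bo(NULL, offset) and pin the BO manually at draw time,
+ * as described above.
+ */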
diff --git a/src/gallium/drivers/crocus/crocus_genx_protos.h b/src/gallium/drivers/crocus/crocus_genx_protos.h
new file mode 100644
index 00000000000..ba6798f991e
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_genx_protos.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* GenX-specific function declarations.
+ *
+ * Don't include this directly, it will be included by crocus_context.h.
+ *
+ * NOTE: This header can be included multiple times, from the same file.
+ */
+
+/* crocus_state.c */
+void genX(init_state)(struct crocus_context *ice);
+void genX(init_screen_state)(struct crocus_screen *screen);
+void genX(upload_urb)(struct crocus_batch *batch,
+ unsigned vs_size,
+ bool gs_present,
+ unsigned gs_size);
+void genX(emit_hashing_mode)(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ unsigned width, unsigned height,
+ unsigned scale);
+
+/* crocus_blorp.c */
+void genX(init_blorp)(struct crocus_context *ice);
+
+/* crocus_query.c */
+void genX(init_query)(struct crocus_context *ice);
+void genX(init_screen_query)(struct crocus_screen *screen);
+void genX(math_add32_gpr0)(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ uint32_t x);
+void genX(math_div32_gpr0)(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ uint32_t D);
+
+/* crocus_blt.c */
+void genX(init_blt)(struct crocus_screen *screen);
diff --git a/src/gallium/drivers/crocus/crocus_monitor.c b/src/gallium/drivers/crocus/crocus_monitor.c
new file mode 100644
index 00000000000..c0465f22875
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_monitor.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "crocus_monitor.h"
+
+#include <xf86drm.h>
+
+#include "crocus_screen.h"
+#include "crocus_context.h"
+
+#include "perf/intel_perf.h"
+#include "perf/intel_perf_query.h"
+#include "perf/intel_perf_regs.h"
+
+struct crocus_monitor_object {
+ int num_active_counters;
+ int *active_counters;
+
+ size_t result_size;
+ unsigned char *result_buffer;
+
+ struct intel_perf_query_object *query;
+};
+
+int
+crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index,
+ struct pipe_driver_query_info *info)
+{
+ const struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ assert(screen->monitor_cfg);
+ if (!screen->monitor_cfg)
+ return 0;
+
+ const struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+
+ if (!info) {
+ /* return the number of metrics */
+ return monitor_cfg->num_counters;
+ }
+
+ const struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+ const int group = monitor_cfg->counters[index].group;
+ const int counter_index = monitor_cfg->counters[index].counter;
+ struct intel_perf_query_counter *counter =
+ &perf_cfg->queries[group].counters[counter_index];
+
+ info->group_id = group;
+ info->name = counter->name;
+ info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index;
+
+ if (counter->type == INTEL_PERF_COUNTER_TYPE_THROUGHPUT)
+ info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE;
+ else
+ info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
+ switch (counter->data_type) {
+ case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
+ case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
+ info->type = PIPE_DRIVER_QUERY_TYPE_UINT;
+ info->max_value.u32 = 0;
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
+ info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
+ info->max_value.u64 = 0;
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
+ case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
+ info->type = PIPE_DRIVER_QUERY_TYPE_FLOAT;
+ info->max_value.u64 = -1;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+
+ /* indicates that this is an OA query, not a pipeline statistics query */
+ info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
+ return 1;
+}
+
+typedef void (*bo_unreference_t)(void *);
+typedef void *(*bo_map_t)(void *, void *, unsigned flags);
+typedef void (*bo_unmap_t)(void *);
+typedef void (*emit_mi_report_t)(void *, void *, uint32_t, uint32_t);
+typedef void (*emit_mi_flush_t)(void *);
+typedef void (*capture_frequency_stat_register_t)(void *, void *,
+ uint32_t );
+typedef void (*store_register_mem64_t)(void *ctx, void *bo,
+ uint32_t reg, uint32_t offset);
+typedef bool (*batch_references_t)(void *batch, void *bo);
+typedef void (*bo_wait_rendering_t)(void *bo);
+typedef int (*bo_busy_t)(void *bo);
+
+static void *
+crocus_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size)
+{
+ return crocus_bo_alloc(bufmgr, name, size);
+}
+
+#if 0
+static void
+crocus_monitor_emit_mi_flush(struct crocus_context *ice)
+{
+ const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_DATA_CACHE_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_VF_CACHE_INVALIDATE |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CS_STALL;
+ crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
+ "OA metrics", flags);
+}
+#endif
+
+static void
+crocus_monitor_emit_mi_report_perf_count(void *c,
+ void *bo,
+ uint32_t offset_in_bytes,
+ uint32_t report_id)
+{
+ struct crocus_context *ice = c;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ screen->vtbl.emit_mi_report_perf_count(batch, bo, offset_in_bytes, report_id);
+}
+
+static void
+crocus_monitor_batchbuffer_flush(void *c, const char *file, int line)
+{
+ struct crocus_context *ice = c;
+ _crocus_batch_flush(&ice->batches[CROCUS_BATCH_RENDER], __FILE__, __LINE__);
+}
+
+#if 0
+static void
+crocus_monitor_capture_frequency_stat_register(void *ctx,
+ void *bo,
+ uint32_t bo_offset)
+{
+ struct crocus_context *ice = ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ ice->vtbl.store_register_mem32(batch, GEN9_RPSTAT0, bo, bo_offset, false);
+}
+
+static void
+crocus_monitor_store_register_mem64(void *ctx, void *bo,
+ uint32_t reg, uint32_t offset)
+{
+ struct crocus_context *ice = ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
+}
+#endif
+
+static bool
+crocus_monitor_init_metrics(struct crocus_screen *screen)
+{
+ struct crocus_monitor_config *monitor_cfg =
+ rzalloc(screen, struct crocus_monitor_config);
+ struct intel_perf_config *perf_cfg = NULL;
+ if (unlikely(!monitor_cfg))
+ goto allocation_error;
+ perf_cfg = intel_perf_new(monitor_cfg);
+ if (unlikely(!perf_cfg))
+ goto allocation_error;
+
+ monitor_cfg->perf_cfg = perf_cfg;
+
+ perf_cfg->vtbl.bo_alloc = crocus_oa_bo_alloc;
+ perf_cfg->vtbl.bo_unreference = (bo_unreference_t)crocus_bo_unreference;
+ perf_cfg->vtbl.bo_map = (bo_map_t)crocus_bo_map;
+ perf_cfg->vtbl.bo_unmap = (bo_unmap_t)crocus_bo_unmap;
+
+ perf_cfg->vtbl.emit_mi_report_perf_count =
+ (emit_mi_report_t)crocus_monitor_emit_mi_report_perf_count;
+ perf_cfg->vtbl.batchbuffer_flush = crocus_monitor_batchbuffer_flush;
+ perf_cfg->vtbl.batch_references = (batch_references_t)crocus_batch_references;
+ perf_cfg->vtbl.bo_wait_rendering =
+ (bo_wait_rendering_t)crocus_bo_wait_rendering;
+ perf_cfg->vtbl.bo_busy = (bo_busy_t)crocus_bo_busy;
+
+ intel_perf_init_metrics(perf_cfg, &screen->devinfo, screen->fd, false, false);
+ screen->monitor_cfg = monitor_cfg;
+
+ /* a gallium "group" is equivalent to a gen "query"
+ * a gallium "query" is equivalent to a gen "query_counter"
+ *
+ * Each gen_query supports a specific number of query_counters. To
+ * allocate the array of crocus_monitor_counter, we need an upper bound
+ * (ignoring duplicate query_counters).
+ */
+ int gen_query_counters_count = 0;
+ for (int gen_query_id = 0;
+ gen_query_id < perf_cfg->n_queries;
+ ++gen_query_id) {
+ gen_query_counters_count += perf_cfg->queries[gen_query_id].n_counters;
+ }
+
+ monitor_cfg->counters = rzalloc_size(monitor_cfg,
+ sizeof(struct crocus_monitor_counter) *
+ gen_query_counters_count);
+ if (unlikely(!monitor_cfg->counters))
+ goto allocation_error;
+
+ int crocus_monitor_id = 0;
+ for (int group = 0; group < perf_cfg->n_queries; ++group) {
+ for (int counter = 0;
+ counter < perf_cfg->queries[group].n_counters;
+ ++counter) {
+ /* Check previously identified metrics to filter out duplicates. The
+ * user is not helped by having the same metric available in several
+ * groups. (n^2 algorithm).
+ */
+ bool duplicate = false;
+ for (int existing_group = 0;
+ existing_group < group && !duplicate;
+ ++existing_group) {
+ for (int existing_counter = 0;
+ existing_counter < perf_cfg->queries[existing_group].n_counters && !duplicate;
+ ++existing_counter) {
+ const char *current_name =
+ perf_cfg->queries[group].counters[counter].name;
+ const char *existing_name =
+ perf_cfg->queries[existing_group].counters[existing_counter].name;
+ if (strcmp(current_name, existing_name) == 0) {
+ duplicate = true;
+ }
+ }
+ }
+ if (duplicate)
+ continue;
+ monitor_cfg->counters[crocus_monitor_id].group = group;
+ monitor_cfg->counters[crocus_monitor_id].counter = counter;
+ ++crocus_monitor_id;
+ }
+ }
+ monitor_cfg->num_counters = crocus_monitor_id;
+ return monitor_cfg->num_counters;
+
+allocation_error:
+   /* These are all ralloc'ed from monitor_cfg, so one call frees everything. */
+   ralloc_free(monitor_cfg);
+ return false;
+}
+
+int
+crocus_get_monitor_group_info(struct pipe_screen *pscreen,
+ unsigned group_index,
+ struct pipe_driver_query_group_info *info)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ if (!screen->monitor_cfg) {
+ if (!crocus_monitor_init_metrics(screen))
+ return 0;
+ }
+
+ const struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+ const struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+
+ if (!info) {
+      /* Return the number of counter groups. */
+ return perf_cfg->n_queries;
+ }
+
+ if (group_index >= perf_cfg->n_queries) {
+ /* out of range */
+ return 0;
+ }
+
+ struct intel_perf_query_info *query = &perf_cfg->queries[group_index];
+
+ info->name = query->name;
+ info->max_active_queries = query->n_counters;
+ info->num_queries = query->n_counters;
+
+ return 1;
+}
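+
+/* Frontends typically probe this in two passes, roughly:
+ *
+ *    int n = crocus_get_monitor_group_info(pscreen, 0, NULL);
+ *    for (unsigned i = 0; i < n; i++)
+ *       crocus_get_monitor_group_info(pscreen, i, &info[i]);
+ *
+ * and likewise for crocus_get_monitor_info() with individual counters.
+ */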
+
+static void
+crocus_init_monitor_ctx(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+ struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+
+ ice->perf_ctx = intel_perf_new_context(ice);
+ if (unlikely(!ice->perf_ctx))
+ return;
+
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+ struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+ intel_perf_init_context(perf_ctx,
+ perf_cfg,
+ ice,
+ ice,
+ screen->bufmgr,
+ &screen->devinfo,
+ ice->batches[CROCUS_BATCH_RENDER].hw_ctx_id,
+ screen->fd);
+}
+
+/* entry point for GenPerfMonitorsAMD */
+struct crocus_monitor_object *
+crocus_create_monitor_object(struct crocus_context *ice,
+ unsigned num_queries,
+ unsigned *query_types)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+ struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+ struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+ struct intel_perf_query_object *query_obj = NULL;
+
+ /* initialize perf context if this has not already been done. This
+ * function is the first entry point that carries the gl context.
+ */
+ if (ice->perf_ctx == NULL) {
+ crocus_init_monitor_ctx(ice);
+ }
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+
+ assert(num_queries > 0);
+ int query_index = query_types[0] - PIPE_QUERY_DRIVER_SPECIFIC;
+   assert(query_index < monitor_cfg->num_counters);
+ const int group = monitor_cfg->counters[query_index].group;
+
+ struct crocus_monitor_object *monitor =
+ calloc(1, sizeof(struct crocus_monitor_object));
+ if (unlikely(!monitor))
+ goto allocation_failure;
+
+ monitor->num_active_counters = num_queries;
+ monitor->active_counters = calloc(num_queries, sizeof(int));
+ if (unlikely(!monitor->active_counters))
+ goto allocation_failure;
+
+ for (int i = 0; i < num_queries; ++i) {
+ unsigned current_query = query_types[i];
+ unsigned current_query_index = current_query - PIPE_QUERY_DRIVER_SPECIFIC;
+
+ /* all queries must be in the same group */
+      assert(current_query_index < monitor_cfg->num_counters);
+ assert(monitor_cfg->counters[current_query_index].group == group);
+ monitor->active_counters[i] =
+ monitor_cfg->counters[current_query_index].counter;
+ }
+
+ /* create the intel_perf_query */
+ query_obj = intel_perf_new_query(perf_ctx, group);
+ if (unlikely(!query_obj))
+ goto allocation_failure;
+
+ monitor->query = query_obj;
+ monitor->result_size = perf_cfg->queries[group].data_size;
+ monitor->result_buffer = calloc(1, monitor->result_size);
+ if (unlikely(!monitor->result_buffer))
+ goto allocation_failure;
+
+ return monitor;
+
+allocation_failure:
+ if (monitor) {
+ free(monitor->active_counters);
+ free(monitor->result_buffer);
+ }
+ free(query_obj);
+ free(monitor);
+ return NULL;
+}
+
+void
+crocus_destroy_monitor_object(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ intel_perf_delete_query(ice->perf_ctx, monitor->query);
+ free(monitor->result_buffer);
+ monitor->result_buffer = NULL;
+ free(monitor->active_counters);
+ monitor->active_counters = NULL;
+ free(monitor);
+}
+
+bool
+crocus_begin_monitor(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+
+ return intel_perf_begin_query(perf_ctx, monitor->query);
+}
+
+bool
+crocus_end_monitor(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+
+ intel_perf_end_query(perf_ctx, monitor->query);
+ return true;
+}
+
+bool
+crocus_get_monitor_result(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor,
+ bool wait,
+ union pipe_numeric_type_union *result)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct intel_perf_context *perf_ctx = ice->perf_ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+ bool monitor_ready =
+ intel_perf_is_query_ready(perf_ctx, monitor->query, batch);
+
+ if (!monitor_ready) {
+ if (!wait)
+ return false;
+ intel_perf_wait_query(perf_ctx, monitor->query, batch);
+ }
+
+ assert(intel_perf_is_query_ready(perf_ctx, monitor->query, batch));
+
+ unsigned bytes_written;
+ intel_perf_get_query_data(perf_ctx, monitor->query, batch,
+ monitor->result_size,
+ (unsigned*) monitor->result_buffer,
+ &bytes_written);
+ if (bytes_written != monitor->result_size)
+ return false;
+
+ /* copy metrics into the batch result */
+ for (int i = 0; i < monitor->num_active_counters; ++i) {
+ int current_counter = monitor->active_counters[i];
+ const struct intel_perf_query_info *info =
+ intel_perf_query_info(monitor->query);
+ const struct intel_perf_query_counter *counter =
+ &info->counters[current_counter];
+ assert(intel_perf_query_counter_get_size(counter));
+ switch (counter->data_type) {
+ case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
+ result[i].u64 = *(uint64_t*)(monitor->result_buffer + counter->offset);
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
+ result[i].f = *(float*)(monitor->result_buffer + counter->offset);
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
+ case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
+ result[i].u64 = *(uint32_t*)(monitor->result_buffer + counter->offset);
+ break;
+ case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE: {
+ double v = *(double*)(monitor->result_buffer + counter->offset);
+ result[i].f = v;
+ break;
+ }
+ default:
+ unreachable("unexpected counter data type");
+ }
+ }
+ return true;
+}
diff --git a/src/gallium/drivers/crocus/crocus_monitor.h b/src/gallium/drivers/crocus/crocus_monitor.h
new file mode 100644
index 00000000000..3335c8860e2
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_monitor.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_MONITOR_H
+#define CROCUS_MONITOR_H
+
+#include "pipe/p_screen.h"
+
+struct crocus_monitor_counter {
+ int group;
+ int counter;
+};
+
+struct crocus_monitor_config {
+ struct intel_perf_config *perf_cfg;
+
+ /* gallium requires an index for each counter */
+ int num_counters;
+ struct crocus_monitor_counter *counters;
+};
+
+int crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index,
+ struct pipe_driver_query_info *info);
+int crocus_get_monitor_group_info(struct pipe_screen *pscreen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info);
+
+struct crocus_context;
+struct crocus_screen;
+
+struct crocus_monitor_object *
+crocus_create_monitor_object(struct crocus_context *ice,
+ unsigned num_queries,
+ unsigned *query_types);
+
+struct pipe_query;
+void crocus_destroy_monitor_object(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor);
+
+bool
+crocus_begin_monitor(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor);
+bool
+crocus_end_monitor(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor);
+
+bool
+crocus_get_monitor_result(struct pipe_context *ctx,
+ struct crocus_monitor_object *monitor,
+ bool wait,
+ union pipe_numeric_type_union *result);
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_pipe.h b/src/gallium/drivers/crocus/crocus_pipe.h
new file mode 100644
index 00000000000..71b12d08e16
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_pipe.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_PIPE_H
+#define CROCUS_PIPE_H
+
+#include "pipe/p_defines.h"
+#include "compiler/shader_enums.h"
+
+static inline gl_shader_stage
+stage_from_pipe(enum pipe_shader_type pstage)
+{
+ static const gl_shader_stage stages[PIPE_SHADER_TYPES] = {
+ [PIPE_SHADER_VERTEX] = MESA_SHADER_VERTEX,
+ [PIPE_SHADER_TESS_CTRL] = MESA_SHADER_TESS_CTRL,
+ [PIPE_SHADER_TESS_EVAL] = MESA_SHADER_TESS_EVAL,
+ [PIPE_SHADER_GEOMETRY] = MESA_SHADER_GEOMETRY,
+ [PIPE_SHADER_FRAGMENT] = MESA_SHADER_FRAGMENT,
+ [PIPE_SHADER_COMPUTE] = MESA_SHADER_COMPUTE,
+ };
+ return stages[pstage];
+}
+
+static inline enum pipe_shader_type
+stage_to_pipe(gl_shader_stage stage)
+{
+ static const enum pipe_shader_type pstages[MESA_SHADER_STAGES] = {
+ [MESA_SHADER_VERTEX] = PIPE_SHADER_VERTEX,
+ [MESA_SHADER_TESS_CTRL] = PIPE_SHADER_TESS_CTRL,
+ [MESA_SHADER_TESS_EVAL] = PIPE_SHADER_TESS_EVAL,
+ [MESA_SHADER_GEOMETRY] = PIPE_SHADER_GEOMETRY,
+ [MESA_SHADER_FRAGMENT] = PIPE_SHADER_FRAGMENT,
+ [MESA_SHADER_COMPUTE] = PIPE_SHADER_COMPUTE,
+ };
+ return pstages[stage];
+}
+
+/**
+ * Convert a swizzle enumeration (e.g. PIPE_SWIZZLE_X) to one of the HW's
+ * "Shader Channel Select" enumerations (e.g. SCS_RED). The mappings are
+ *
+ * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE
+ * 0 1 2 3 4 5
+ * 4 5 6 7 0 1
+ * SCS_RED, SCS_GREEN, SCS_BLUE, SCS_ALPHA, SCS_ZERO, SCS_ONE
+ *
+ * which is simply adding 4 then modding by 8 (or anding with 7).
+ */
+static inline enum isl_channel_select
+pipe_swizzle_to_isl_channel(enum pipe_swizzle swizzle)
+{
+ return (swizzle + 4) & 7;
+}
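+
+/* Worked examples of the mapping above:
+ *    PIPE_SWIZZLE_X (0) -> (0 + 4) & 7 = 4 = SCS_RED
+ *    PIPE_SWIZZLE_W (3) -> (3 + 4) & 7 = 7 = SCS_ALPHA
+ *    PIPE_SWIZZLE_0 (4) -> (4 + 4) & 7 = 0 = SCS_ZERO
+ *    PIPE_SWIZZLE_1 (5) -> (5 + 4) & 7 = 1 = SCS_ONE
+ */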
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_pipe_control.c b/src/gallium/drivers/crocus/crocus_pipe_control.c
new file mode 100644
index 00000000000..7a9625c61ed
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_pipe_control.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_pipe_control.c
+ *
+ * PIPE_CONTROL is the main flushing and synchronization primitive on Intel
+ * GPUs. It can invalidate caches, stall until rendering reaches various
+ * stages of completion, write to memory, and other things. In a way, it's
+ * a swiss army knife command - it has all kinds of capabilities, but some
+ * significant limitations as well.
+ *
+ * Unfortunately, it's notoriously complicated and difficult to use. Many
+ * sub-commands can't be used together. Some are meant to be used at the
+ * top of the pipeline (invalidating caches before drawing), while some are
+ * meant to be used at the end (stalling or flushing after drawing).
+ *
+ * Also, there's a list of restrictions a mile long, which vary by generation.
+ * Do this before doing that, or suffer the consequences (usually a GPU hang).
+ *
+ * This file contains helpers for emitting them safely. You can simply call
+ * crocus_emit_pipe_control_flush() with the desired operations (as logical
+ * PIPE_CONTROL_* bits), and it will take care of splitting it into multiple
+ * PIPE_CONTROL commands as necessary. The per-generation workarounds are
+ * applied in crocus_emit_raw_pipe_control() in crocus_state.c.
+ */
+
+#include "crocus_context.h"
+#include "util/hash_table.h"
+#include "util/set.h"
+
+/**
+ * Emit a PIPE_CONTROL with various flushing flags.
+ *
+ * The caller is responsible for deciding what flags are appropriate for the
+ * given generation.
+ */
+void
+crocus_emit_pipe_control_flush(struct crocus_batch *batch,
+ const char *reason,
+ uint32_t flags)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ if (devinfo->ver >= 6 &&
+ (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) &&
+ (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) {
+ /* A pipe control command with flush and invalidate bits set
+ * simultaneously is an inherently racy operation on Gen6+ if the
+ * contents of the flushed caches were intended to become visible from
+ * any of the invalidated caches. Split it in two PIPE_CONTROLs, the
+ * first one should stall the pipeline to make sure that the flushed R/W
+ * caches are coherent with memory once the specified R/O caches are
+ * invalidated. On pre-Gen6 hardware the (implicit) R/O cache
+ * invalidation seems to happen at the bottom of the pipeline together
+ * with any write cache flush, so this shouldn't be a concern. In order
+ * to ensure a full stall, we do an end-of-pipe sync.
+ */
+ crocus_emit_end_of_pipe_sync(batch, reason,
+ flags & PIPE_CONTROL_CACHE_FLUSH_BITS);
+ flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
+ }
+
+ batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, NULL, 0, 0);
+}
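+
+/* A representative call, for illustration only (the exact flag combination
+ * depends on the situation):
+ *
+ *    crocus_emit_pipe_control_flush(batch, "flush RT, invalidate textures",
+ *                                   PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ *                                   PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+ *
+ * On Gen6+, the helper above splits this into an end-of-pipe sync for the
+ * flush bits followed by a second PIPE_CONTROL carrying the invalidate bits.
+ */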
+
+/**
+ * Emit a PIPE_CONTROL that writes to a buffer object.
+ *
+ * \p flags should contain one of the following items:
+ * - PIPE_CONTROL_WRITE_IMMEDIATE
+ * - PIPE_CONTROL_WRITE_TIMESTAMP
+ * - PIPE_CONTROL_WRITE_DEPTH_COUNT
+ */
+void
+crocus_emit_pipe_control_write(struct crocus_batch *batch,
+ const char *reason, uint32_t flags,
+ struct crocus_bo *bo, uint32_t offset,
+ uint64_t imm)
+{
+ batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, bo, offset, imm);
+}
+
+/**
+ * Restriction [DevSNB, DevIVB]:
+ *
+ * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
+ * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
+ * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
+ * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
+ * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
+ * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
+ * unless SW can otherwise guarantee that the pipeline from WM onwards is
+ * already flushed (e.g., via a preceding MI_FLUSH).
+ */
+void
+crocus_emit_depth_stall_flushes(struct crocus_batch *batch)
+{
+ UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ assert(devinfo->ver >= 6);
+
+ crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_STALL);
+ crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_CACHE_FLUSH);
+ crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_STALL);
+}
+
+/*
+ * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
+ *
+ * Write synchronization is a special case of end-of-pipe
+ * synchronization that requires that the render cache and/or depth
+ * related caches are flushed to memory, where the data will become
+ * globally visible. This type of synchronization is required prior to
+ * SW (CPU) actually reading the result data from memory, or initiating
+ * an operation that will use as a read surface (such as a texture
+ * surface) a previous render target and/or depth/stencil buffer
+ *
+ * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
+ *
+ * Exercising the write cache flush bits (Render Target Cache Flush
+ * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
+ * ensures the write caches are flushed and doesn't guarantee the data
+ * is globally visible.
+ *
+ * SW can track the completion of the end-of-pipe-synchronization by
+ * using "Notify Enable" and "PostSync Operation - Write Immediate
+ * Data" in the PIPE_CONTROL command.
+ */
+void
+crocus_emit_end_of_pipe_sync(struct crocus_batch *batch,
+ const char *reason, uint32_t flags)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ if (devinfo->ver >= 6) {
+ /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
+ *
+ * "The most common action to perform upon reaching a synchronization
+ * point is to write a value out to memory. An immediate value
+ * (included with the synchronization command) may be written."
+ *
+ * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
+ *
+ * "In case the data flushed out by the render engine is to be read
+ * back in to the render engine in coherent manner, then the render
+ * engine has to wait for the fence completion before accessing the
+ * flushed data. This can be achieved by following means on various
+ * products: PIPE_CONTROL command with CS Stall and the required
+ * write caches flushed with Post-Sync-Operation as Write Immediate
+ * Data.
+ *
+ * Example:
+ * - Workload-1 (3D/GPGPU/MEDIA)
+ * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate
+ * Data, Required Write Cache Flush bits set)
+ * - Workload-2 (Can use the data produce or output by Workload-1)
+ */
+ crocus_emit_pipe_control_write(batch, reason,
+ flags | PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_WRITE_IMMEDIATE,
+ batch->ice->workaround_bo,
+ batch->ice->workaround_offset, 0);
+
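+ /* Haswell workaround: read the just-written value back into a register
+ * with MI_LOAD_REGISTER_MEM so the command streamer waits for the
+ * post-sync write to actually land before continuing.
+ */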
+ if (batch->screen->devinfo.is_haswell) {
+#define GEN7_3DPRIM_START_INSTANCE 0x243C
+ batch->screen->vtbl.load_register_mem32(batch, GEN7_3DPRIM_START_INSTANCE,
+ batch->ice->workaround_bo,
+ batch->ice->workaround_offset);
+ }
+ } else {
+ /* On gen4-5, a regular pipe control seems to suffice. */
+ crocus_emit_pipe_control_flush(batch, reason, flags);
+ }
+}
+
+/* Emit a pipelined flush to either flush render and texture cache for
+ * reading from a FBO-drawn texture, or flush so that frontbuffer
+ * rendering appears on the screen in DRI1.
+ *
+ * This is also used for the always_flush_cache driconf debug option.
+ */
+void
+crocus_emit_mi_flush(struct crocus_batch *batch)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH;
+ if (devinfo->ver >= 6) {
+ flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_DATA_CACHE_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_VF_CACHE_INVALIDATE |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CS_STALL;
+ }
+ crocus_emit_pipe_control_flush(batch, "mi flush", flags);
+}
+
+/**
+ * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
+ * implementing two workarounds on gen6. From section 1.4.7.1
+ * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
+ *
+ * [DevSNB-C+{W/A}] Before any depth stall flush (including those
+ * produced by non-pipelined state commands), software needs to first
+ * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
+ * 0.
+ *
+ * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
+ * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
+ *
+ * And the workaround for these two requires this workaround first:
+ *
+ * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
+ * BEFORE the pipe-control with a post-sync op and no write-cache
+ * flushes.
+ *
+ * And this last workaround is tricky because of the requirements on
+ * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
+ * volume 2 part 1:
+ *
+ * "1 of the following must also be set:
+ * - Render Target Cache Flush Enable ([12] of DW1)
+ * - Depth Cache Flush Enable ([0] of DW1)
+ * - Stall at Pixel Scoreboard ([1] of DW1)
+ * - Depth Stall ([13] of DW1)
+ * - Post-Sync Operation ([13] of DW1)
+ * - Notify Enable ([8] of DW1)"
+ *
+ * The cache flushes require the workaround flush that triggered this
+ * one, so we can't use it. Depth stall would trigger the same.
+ * Post-sync nonzero is what triggered this second workaround, so we
+ * can't use that one either. Notify enable is IRQs, which aren't
+ * really our business. That leaves only stall at scoreboard.
+ */
+void
+crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch)
+{
+ crocus_emit_pipe_control_flush(batch, "nonzero",
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD);
+
+ crocus_emit_pipe_control_write(batch, "nonzero",
+ PIPE_CONTROL_WRITE_IMMEDIATE,
+ batch->ice->workaround_bo,
+ batch->ice->workaround_offset, 0);
+}
+
+/**
+ * Flush and invalidate all caches (for debugging purposes).
+ */
+void
+crocus_flush_all_caches(struct crocus_batch *batch)
+{
+ crocus_emit_pipe_control_flush(batch, "debug: flush all caches",
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_DATA_CACHE_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_VF_CACHE_INVALIDATE |
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+}
+
+static void
+crocus_texture_barrier(struct pipe_context *ctx, unsigned flags)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_batch *render_batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_batch *compute_batch = &ice->batches[CROCUS_BATCH_COMPUTE];
+ const struct intel_device_info *devinfo = &render_batch->screen->devinfo;
+
+ if (devinfo->ver < 6) {
+ crocus_emit_mi_flush(render_batch);
+ return;
+ }
+
+ if (render_batch->contains_draw) {
+ crocus_batch_maybe_flush(render_batch, 48);
+ crocus_emit_pipe_control_flush(render_batch,
+ "API: texture barrier (1/2)",
+ (flags == 1 ? PIPE_CONTROL_DEPTH_CACHE_FLUSH : 0) |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ crocus_emit_pipe_control_flush(render_batch,
+ "API: texture barrier (2/2)",
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+ }
+
+ if (compute_batch->contains_draw) {
+ crocus_batch_maybe_flush(compute_batch, 48);
+ crocus_emit_pipe_control_flush(compute_batch,
+ "API: texture barrier (1/2)",
+ PIPE_CONTROL_CS_STALL);
+ crocus_emit_pipe_control_flush(compute_batch,
+ "API: texture barrier (2/2)",
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+ }
+}
+
+static void
+crocus_memory_barrier(struct pipe_context *ctx, unsigned flags)
+{
+ struct crocus_context *ice = (void *) ctx;
+ unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
+ const struct intel_device_info *devinfo = &ice->batches[0].screen->devinfo;
+
+ assert(devinfo->ver == 7);
+
+ if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
+ PIPE_BARRIER_INDEX_BUFFER |
+ PIPE_BARRIER_INDIRECT_BUFFER)) {
+ bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+ }
+
+ if (flags & PIPE_BARRIER_CONSTANT_BUFFER) {
+ bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+ }
+
+ if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_FRAMEBUFFER)) {
+ bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH;
+ }
+
+ /* Typed surface messages are handled by the render cache on IVB, so we
+ * need to flush it too.
+ */
+ if (!devinfo->is_haswell)
+ bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
+
+ for (int i = 0; i < ice->batch_count; i++) {
+ if (ice->batches[i].contains_draw) {
+ crocus_batch_maybe_flush(&ice->batches[i], 24);
+ crocus_emit_pipe_control_flush(&ice->batches[i], "API: memory barrier",
+ bits);
+ }
+ }
+}
+
+void
+crocus_init_flush_functions(struct pipe_context *ctx)
+{
+ ctx->memory_barrier = crocus_memory_barrier;
+ ctx->texture_barrier = crocus_texture_barrier;
+}
diff --git a/src/gallium/drivers/crocus/crocus_program.c b/src/gallium/drivers/crocus/crocus_program.c
new file mode 100644
index 00000000000..fb8216b71ab
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_program.c
@@ -0,0 +1,3171 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_program.c
+ *
+ * This file contains the driver interface for compiling shaders.
+ *
+ * See crocus_program_cache.c for the in-memory program cache where the
+ * compiled shaders are stored.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_atomic.h"
+#include "util/u_upload_mgr.h"
+#include "util/debug.h"
+#include "util/u_prim.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_serialize.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/compiler/brw_nir.h"
+#include "crocus_context.h"
+#include "nir/tgsi_to_nir.h"
+
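+/* Common shader key defaults; 0x688 is the identity swizzle (X/Y/Z/W packed
+ * into consecutive 3-bit fields, i.e. SWIZZLE_NOOP).
+ */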
+#define KEY_INIT_NO_ID() \
+ .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \
+ .base.tex.swizzles[0 ... MAX_SAMPLERS - 1] = 0x688, \
+ .base.tex.compressed_multisample_layout_mask = ~0
+#define KEY_INIT() .base.program_string_id = ish->program_id, KEY_INIT_NO_ID()
+
+static void
+crocus_sanitize_tex_key(struct brw_sampler_prog_key_data *key)
+{
+ key->gather_channel_quirk_mask = 0;
+ for (unsigned s = 0; s < MAX_SAMPLERS; s++) {
+ key->swizzles[s] = SWIZZLE_NOOP;
+ key->gfx6_gather_wa[s] = 0;
+ }
+}
+
+static uint32_t
+crocus_get_texture_swizzle(const struct crocus_context *ice,
+ const struct crocus_sampler_view *t)
+{
+ uint32_t swiz = 0;
+
+ for (int i = 0; i < 4; i++) {
+ swiz |= t->swizzle[i] << (i * 3);
+ }
+ return swiz;
+}
+
+static inline bool can_push_ubo(const struct intel_device_info *devinfo)
+{
+ /* push works for everyone except SNB at the moment */
+ return devinfo->ver != 6;
+}
+
+static uint8_t
+gfx6_gather_workaround(enum pipe_format pformat)
+{
+ switch (pformat) {
+ case PIPE_FORMAT_R8_SINT: return WA_SIGN | WA_8BIT;
+ case PIPE_FORMAT_R8_UINT: return WA_8BIT;
+ case PIPE_FORMAT_R16_SINT: return WA_SIGN | WA_16BIT;
+ case PIPE_FORMAT_R16_UINT: return WA_16BIT;
+ default:
+ /* Note that even though PIPE_FORMAT_R32_SINT and
+ * PIPE_FORMAT_R32_UINT have format overrides in
+ * the surface state, there is no shader w/a required.
+ */
+ return 0;
+ }
+}
+
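+/* Map a stream output's start_component to a vec4 swizzle that begins reading
+ * at that component (repeating the last component to fill the slot).
+ */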
+static const unsigned crocus_gfx6_swizzle_for_offset[4] = {
+ BRW_SWIZZLE4(0, 1, 2, 3),
+ BRW_SWIZZLE4(1, 2, 3, 3),
+ BRW_SWIZZLE4(2, 3, 3, 3),
+ BRW_SWIZZLE4(3, 3, 3, 3)
+};
+
+static void
+gfx6_gs_xfb_setup(const struct pipe_stream_output_info *so_info,
+ struct brw_gs_prog_data *gs_prog_data)
+{
+ /* Make sure that the VUE slots won't overflow the unsigned chars in
+ * prog_data->transform_feedback_bindings[].
+ */
+ STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
+
+ /* Make sure that we don't need more binding table entries than we've
+ * set aside for use in transform feedback. (We shouldn't, since we
+ * set aside enough binding table entries to have one per component).
+ */
+ assert(so_info->num_outputs <= BRW_MAX_SOL_BINDINGS);
+
+ gs_prog_data->num_transform_feedback_bindings = so_info->num_outputs;
+ for (unsigned i = 0; i < so_info->num_outputs; i++) {
+ gs_prog_data->transform_feedback_bindings[i] =
+ so_info->output[i].register_index;
+ gs_prog_data->transform_feedback_swizzles[i] =
+ crocus_gfx6_swizzle_for_offset[so_info->output[i].start_component];
+ }
+}
+
+static void
+gfx6_ff_gs_xfb_setup(const struct pipe_stream_output_info *so_info,
+ struct brw_ff_gs_prog_key *key)
+{
+ key->num_transform_feedback_bindings = so_info->num_outputs;
+ for (unsigned i = 0; i < so_info->num_outputs; i++) {
+ key->transform_feedback_bindings[i] =
+ so_info->output[i].register_index;
+ key->transform_feedback_swizzles[i] =
+ crocus_gfx6_swizzle_for_offset[so_info->output[i].start_component];
+ }
+}
+
+static void
+crocus_populate_sampler_prog_key_data(struct crocus_context *ice,
+ const struct intel_device_info *devinfo,
+ gl_shader_stage stage,
+ struct crocus_uncompiled_shader *ish,
+ bool uses_texture_gather,
+ struct brw_sampler_prog_key_data *key)
+{
+ uint32_t mask = ish->nir->info.textures_used[0];
+
+ while (mask) {
+ const int s = u_bit_scan(&mask);
+
+ struct crocus_sampler_view *texture = ice->state.shaders[stage].textures[s];
+ key->swizzles[s] = SWIZZLE_NOOP;
+ key->scale_factors[s] = 0.0f;
+
+ if (!texture)
+ continue;
+ if (texture->base.target == PIPE_BUFFER)
+ continue;
+ if (!devinfo->is_haswell) {
+ key->swizzles[s] = crocus_get_texture_swizzle(ice, texture);
+ }
+
+ /* gather4 for RG32* is broken in multiple ways on Gen7. */
+ if (devinfo->ver == 7 && uses_texture_gather) {
+ switch (texture->base.format) {
+ case PIPE_FORMAT_R32G32_UINT:
+ case PIPE_FORMAT_R32G32_SINT: {
+ /* We have to override the format to R32G32_FLOAT_LD.
+ * This means that SCS_ALPHA and SCS_ONE will return 0x3f800000
+ * (1.0) rather than integer 1. This needs shader hacks.
+ *
+ * On Ivybridge, we whack W (alpha) to ONE in our key's
+ * swizzle. On Haswell, we look at the original texture
+ * swizzle, and use XYZW with channels overridden to ONE,
+ * leaving normal texture swizzling to SCS.
+ */
+ unsigned src_swizzle = key->swizzles[s];
+ for (int i = 0; i < 4; i++) {
+ unsigned src_comp = GET_SWZ(src_swizzle, i);
+ if (src_comp == SWIZZLE_ONE || src_comp == SWIZZLE_W) {
+ key->swizzles[i] &= ~(0x7 << (3 * i));
+ key->swizzles[i] |= SWIZZLE_ONE << (3 * i);
+ }
+ }
+ }
+ FALLTHROUGH;
+ case PIPE_FORMAT_R32G32_FLOAT:
+ /* The channel select for green doesn't work - we have to
+ * request blue. Haswell can use SCS for this, but Ivybridge
+ * needs a shader workaround.
+ */
+ if (!devinfo->is_haswell)
+ key->gather_channel_quirk_mask |= 1 << s;
+ break;
+ default:
+ break;
+ }
+ }
+ if (devinfo->ver == 6 && uses_texture_gather) {
+ key->gfx6_gather_wa[s] = gfx6_gather_workaround(texture->base.format);
+ }
+ }
+}
+
+static void
+crocus_lower_swizzles(struct nir_shader *nir,
+ const struct brw_sampler_prog_key_data *key_tex)
+{
+ struct nir_lower_tex_options tex_options = { 0 };
+ uint32_t mask = nir->info.textures_used[0];
+
+ while (mask) {
+ const int s = u_bit_scan(&mask);
+
+ if (key_tex->swizzles[s] == SWIZZLE_NOOP)
+ continue;
+
+ tex_options.swizzle_result |= (1 << s);
+ for (unsigned c = 0; c < 4; c++)
+ tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c);
+ }
+ if (tex_options.swizzle_result)
+ nir_lower_tex(nir, &tex_options);
+}
+
+static unsigned
+get_new_program_id(struct crocus_screen *screen)
+{
+ return p_atomic_inc_return(&screen->program_id);
+}
+
+static nir_ssa_def *
+get_aoa_deref_offset(nir_builder *b,
+ nir_deref_instr *deref,
+ unsigned elem_size)
+{
+ unsigned array_size = elem_size;
+ nir_ssa_def *offset = nir_imm_int(b, 0);
+
+ while (deref->deref_type != nir_deref_type_var) {
+ assert(deref->deref_type == nir_deref_type_array);
+
+ /* This level's element size is the previous level's array size */
+ nir_ssa_def *index = nir_ssa_for_src(b, deref->arr.index, 1);
+ assert(deref->arr.index.ssa);
+ offset = nir_iadd(b, offset,
+ nir_imul(b, index, nir_imm_int(b, array_size)));
+
+ deref = nir_deref_instr_parent(deref);
+ assert(glsl_type_is_array(deref->type));
+ array_size *= glsl_get_length(deref->type);
+ }
+
+ /* Accessing an invalid surface index with the dataport can result in a
+ * hang. According to the spec "if the index used to select an individual
+ * element is negative or greater than or equal to the size of the array,
+ * the results of the operation are undefined but may not lead to
+ * termination" -- which is one of the possible outcomes of the hang.
+ * Clamp the index to prevent access outside of the array bounds.
+ */
+ return nir_umin(b, offset, nir_imm_int(b, array_size - elem_size));
+}
+
+static void
+crocus_lower_storage_image_derefs(nir_shader *nir)
+{
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_deref_store:
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_imin:
+ case nir_intrinsic_image_deref_atomic_umin:
+ case nir_intrinsic_image_deref_atomic_imax:
+ case nir_intrinsic_image_deref_atomic_umax:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ case nir_intrinsic_image_deref_size:
+ case nir_intrinsic_image_deref_samples:
+ case nir_intrinsic_image_deref_load_raw_intel:
+ case nir_intrinsic_image_deref_store_raw_intel: {
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+
+ b.cursor = nir_before_instr(&intrin->instr);
+ nir_ssa_def *index =
+ nir_iadd(&b, nir_imm_int(&b, var->data.driver_location),
+ get_aoa_deref_offset(&b, deref, 1));
+ nir_rewrite_image_intrinsic(intrin, index, false);
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+ }
+}
+
+// XXX: need unify_interfaces() at link time...
+
+/**
+ * Undo nir_lower_passthrough_edgeflags but keep the inputs_read flag.
+ */
+static bool
+crocus_fix_edge_flags(nir_shader *nir)
+{
+ if (nir->info.stage != MESA_SHADER_VERTEX) {
+ nir_shader_preserve_all_metadata(nir);
+ return false;
+ }
+
+ nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_out,
+ VARYING_SLOT_EDGE);
+ if (!var) {
+ nir_shader_preserve_all_metadata(nir);
+ return false;
+ }
+
+ var->data.mode = nir_var_shader_temp;
+ nir->info.outputs_written &= ~VARYING_BIT_EDGE;
+ nir->info.inputs_read &= ~VERT_BIT_EDGEFLAG;
+ nir_fixup_deref_modes(nir);
+
+ nir_foreach_function(f, nir) {
+ if (f->impl) {
+ nir_metadata_preserve(f->impl, nir_metadata_block_index |
+ nir_metadata_dominance |
+ nir_metadata_live_ssa_defs |
+ nir_metadata_loop_analysis);
+ } else {
+ nir_metadata_preserve(f->impl, nir_metadata_all);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Fix an uncompiled shader's stream output info.
+ *
+ * Core Gallium stores output->register_index as a "slot" number, where
+ * slots are assigned consecutively to all outputs in info->outputs_written.
+ * This naive packing of outputs doesn't work for us - we too have slots,
+ * but the layout is defined by the VUE map, which we won't have until we
+ * compile a specific shader variant. So, we remap these and simply store
+ * VARYING_SLOT_* in our copy's output->register_index fields.
+ *
+ * We also fix up VARYING_SLOT_{LAYER,VIEWPORT,PSIZ} to select the Y/Z/W
+ * components of our VUE header. See brw_vue_map.c for the layout.
+ */
+static void
+update_so_info(struct pipe_stream_output_info *so_info,
+ uint64_t outputs_written)
+{
+ uint8_t reverse_map[64] = {};
+ unsigned slot = 0;
+ while (outputs_written) {
+ reverse_map[slot++] = u_bit_scan64(&outputs_written);
+ }
+
+ for (unsigned i = 0; i < so_info->num_outputs; i++) {
+ struct pipe_stream_output *output = &so_info->output[i];
+
+ /* Map Gallium's condensed "slots" back to real VARYING_SLOT_* enums */
+ output->register_index = reverse_map[output->register_index];
+
+ /* The VUE header contains three scalar fields packed together:
+ * - gl_PointSize is stored in VARYING_SLOT_PSIZ.w
+ * - gl_Layer is stored in VARYING_SLOT_PSIZ.y
+ * - gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
+ */
+ switch (output->register_index) {
+ case VARYING_SLOT_LAYER:
+ assert(output->num_components == 1);
+ output->register_index = VARYING_SLOT_PSIZ;
+ output->start_component = 1;
+ break;
+ case VARYING_SLOT_VIEWPORT:
+ assert(output->num_components == 1);
+ output->register_index = VARYING_SLOT_PSIZ;
+ output->start_component = 2;
+ break;
+ case VARYING_SLOT_PSIZ:
+ assert(output->num_components == 1);
+ output->start_component = 3;
+ break;
+ }
+
+ //info->outputs_written |= 1ull << output->register_index;
+ }
+}
+
+static void
+setup_vec4_image_sysval(uint32_t *sysvals, uint32_t idx,
+ unsigned offset, unsigned n)
+{
+ assert(offset % sizeof(uint32_t) == 0);
+
+ for (unsigned i = 0; i < n; ++i)
+ sysvals[i] = BRW_PARAM_IMAGE(idx, offset / sizeof(uint32_t) + i);
+
+ for (unsigned i = n; i < 4; ++i)
+ sysvals[i] = BRW_PARAM_BUILTIN_ZERO;
+}
+
+/**
+ * Associate NIR uniform variables with the prog_data->param[] mechanism
+ * used by the backend. Also, decide which UBOs we'd like to push in an
+ * ideal situation (though the backend can reduce this).
+ */
+static void
+crocus_setup_uniforms(const struct brw_compiler *compiler,
+ void *mem_ctx,
+ nir_shader *nir,
+ struct brw_stage_prog_data *prog_data,
+ enum brw_param_builtin **out_system_values,
+ unsigned *out_num_system_values,
+ unsigned *out_num_cbufs)
+{
+ UNUSED const struct intel_device_info *devinfo = compiler->devinfo;
+
+ const unsigned CROCUS_MAX_SYSTEM_VALUES =
+ PIPE_MAX_SHADER_IMAGES * BRW_IMAGE_PARAM_SIZE;
+ enum brw_param_builtin *system_values =
+ rzalloc_array(mem_ctx, enum brw_param_builtin, CROCUS_MAX_SYSTEM_VALUES);
+ unsigned num_system_values = 0;
+
+ unsigned patch_vert_idx = -1;
+ unsigned ucp_idx[CROCUS_MAX_CLIP_PLANES];
+ unsigned img_idx[PIPE_MAX_SHADER_IMAGES];
+ unsigned variable_group_size_idx = -1;
+ memset(ucp_idx, -1, sizeof(ucp_idx));
+ memset(img_idx, -1, sizeof(img_idx));
+
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ b.cursor = nir_before_block(nir_start_block(impl));
+ nir_ssa_def *temp_ubo_name = nir_ssa_undef(&b, 1, 32);
+ nir_ssa_def *temp_const_ubo_name = NULL;
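+ /* Placeholder UBO indices: temp_ubo_name is later rewritten to the system
+ * value cbuf index, and temp_const_ubo_name to the shader constant data
+ * cbuf, once the final number of cbufs is known.
+ */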
+
+ /* Turn system value intrinsics into uniforms */
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ nir_ssa_def *offset;
+
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_load_constant: {
+ /* This one is special because it reads from the shader constant
+ * data and not cbuf0 which gallium uploads for us.
+ */
+ b.cursor = nir_before_instr(instr);
+ nir_ssa_def *offset =
+ nir_iadd_imm(&b, nir_ssa_for_src(&b, intrin->src[0], 1),
+ nir_intrinsic_base(intrin));
+
+ if (temp_const_ubo_name == NULL)
+ temp_const_ubo_name = nir_imm_int(&b, 0);
+
+ nir_intrinsic_instr *load_ubo =
+ nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ubo);
+ load_ubo->num_components = intrin->num_components;
+ load_ubo->src[0] = nir_src_for_ssa(temp_const_ubo_name);
+ load_ubo->src[1] = nir_src_for_ssa(offset);
+ nir_intrinsic_set_align(load_ubo, 4, 0);
+ nir_intrinsic_set_range_base(load_ubo, 0);
+ nir_intrinsic_set_range(load_ubo, ~0);
+ nir_ssa_dest_init(&load_ubo->instr, &load_ubo->dest,
+ intrin->dest.ssa.num_components,
+ intrin->dest.ssa.bit_size,
+ intrin->dest.ssa.name);
+ nir_builder_instr_insert(&b, &load_ubo->instr);
+
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+ &load_ubo->dest.ssa);
+ nir_instr_remove(&intrin->instr);
+ continue;
+ }
+ case nir_intrinsic_load_user_clip_plane: {
+ unsigned ucp = nir_intrinsic_ucp_id(intrin);
+
+ if (ucp_idx[ucp] == -1) {
+ ucp_idx[ucp] = num_system_values;
+ num_system_values += 4;
+ }
+
+ for (int i = 0; i < 4; i++) {
+ system_values[ucp_idx[ucp] + i] =
+ BRW_PARAM_BUILTIN_CLIP_PLANE(ucp, i);
+ }
+
+ b.cursor = nir_before_instr(instr);
+ offset = nir_imm_int(&b, ucp_idx[ucp] * sizeof(uint32_t));
+ break;
+ }
+ case nir_intrinsic_load_patch_vertices_in:
+ if (patch_vert_idx == -1)
+ patch_vert_idx = num_system_values++;
+
+ system_values[patch_vert_idx] =
+ BRW_PARAM_BUILTIN_PATCH_VERTICES_IN;
+
+ b.cursor = nir_before_instr(instr);
+ offset = nir_imm_int(&b, patch_vert_idx * sizeof(uint32_t));
+ break;
+ case nir_intrinsic_image_deref_load_param_intel: {
+ assert(devinfo->ver < 9);
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+
+ if (img_idx[var->data.binding] == -1) {
+ /* GL only allows arrays of arrays of images. */
+ assert(glsl_type_is_image(glsl_without_array(var->type)));
+ unsigned num_images = MAX2(1, glsl_get_aoa_size(var->type));
+
+ for (int i = 0; i < num_images; i++) {
+ const unsigned img = var->data.binding + i;
+
+ img_idx[img] = num_system_values;
+ num_system_values += BRW_IMAGE_PARAM_SIZE;
+
+ uint32_t *img_sv = &system_values[img_idx[img]];
+
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_OFFSET_OFFSET, img,
+ offsetof(struct brw_image_param, offset), 2);
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_SIZE_OFFSET, img,
+ offsetof(struct brw_image_param, size), 3);
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_STRIDE_OFFSET, img,
+ offsetof(struct brw_image_param, stride), 4);
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_TILING_OFFSET, img,
+ offsetof(struct brw_image_param, tiling), 3);
+ setup_vec4_image_sysval(
+ img_sv + BRW_IMAGE_PARAM_SWIZZLING_OFFSET, img,
+ offsetof(struct brw_image_param, swizzling), 2);
+ }
+ }
+
+ b.cursor = nir_before_instr(instr);
+ offset = nir_iadd(&b,
+ get_aoa_deref_offset(&b, deref, BRW_IMAGE_PARAM_SIZE * 4),
+ nir_imm_int(&b, img_idx[var->data.binding] * 4 +
+ nir_intrinsic_base(intrin) * 16));
+ break;
+ }
+ case nir_intrinsic_load_workgroup_size: {
+ assert(nir->info.workgroup_size_variable);
+ if (variable_group_size_idx == -1) {
+ variable_group_size_idx = num_system_values;
+ num_system_values += 3;
+ for (int i = 0; i < 3; i++) {
+ system_values[variable_group_size_idx + i] =
+ BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i;
+ }
+ }
+
+ b.cursor = nir_before_instr(instr);
+ offset = nir_imm_int(&b,
+ variable_group_size_idx * sizeof(uint32_t));
+ break;
+ }
+ default:
+ continue;
+ }
+
+ unsigned comps = nir_intrinsic_dest_components(intrin);
+
+ nir_intrinsic_instr *load =
+ nir_intrinsic_instr_create(nir, nir_intrinsic_load_ubo);
+ load->num_components = comps;
+ load->src[0] = nir_src_for_ssa(temp_ubo_name);
+ load->src[1] = nir_src_for_ssa(offset);
+ nir_intrinsic_set_align(load, 4, 0);
+ nir_intrinsic_set_range_base(load, 0);
+ nir_intrinsic_set_range(load, ~0);
+ nir_ssa_dest_init(&load->instr, &load->dest, comps, 32, NULL);
+ nir_builder_instr_insert(&b, &load->instr);
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+ &load->dest.ssa);
+ nir_instr_remove(instr);
+ }
+ }
+
+ nir_validate_shader(nir, "before remapping");
+
+ /* Uniforms are stored in constant buffer 0; the user-facing UBOs start
+ * at index one. So if any constant buffer is needed, constant buffer 0
+ * will be needed as well, so account for it.
+ */
+ unsigned num_cbufs = nir->info.num_ubos;
+ if (num_cbufs || nir->num_uniforms)
+ num_cbufs++;
+
+ /* Place the new params in a new cbuf. */
+ if (num_system_values > 0) {
+ unsigned sysval_cbuf_index = num_cbufs;
+ num_cbufs++;
+
+ system_values = reralloc(mem_ctx, system_values, enum brw_param_builtin,
+ num_system_values);
+
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+
+ if (load->intrinsic != nir_intrinsic_load_ubo)
+ continue;
+
+ b.cursor = nir_before_instr(instr);
+
+ assert(load->src[0].is_ssa);
+
+ if (load->src[0].ssa == temp_ubo_name) {
+ nir_ssa_def *imm = nir_imm_int(&b, sysval_cbuf_index);
+ nir_instr_rewrite_src(instr, &load->src[0],
+ nir_src_for_ssa(imm));
+ }
+ }
+ }
+
+ /* We need to fold the new iadds for brw_nir_analyze_ubo_ranges */
+ nir_opt_constant_folding(nir);
+ } else {
+ ralloc_free(system_values);
+ system_values = NULL;
+ }
+
+ assert(num_cbufs < PIPE_MAX_CONSTANT_BUFFERS);
+ nir_validate_shader(nir, "after remap");
+
+ /* We don't use params[] but gallium leaves num_uniforms set. We use this
+ * to detect when cbuf0 exists but we don't need it anymore when we get
+ * here. Instead, zero it out so that the back-end doesn't get confused
+ * when nr_params * 4 != num_uniforms != nr_params * 4.
+ */
+ nir->num_uniforms = 0;
+
+ /* Constant loads (if any) need to go at the end of the constant buffers so
+ * we need to know num_cbufs before we can lower to them.
+ */
+ if (temp_const_ubo_name != NULL) {
+ nir_load_const_instr *const_ubo_index =
+ nir_instr_as_load_const(temp_const_ubo_name->parent_instr);
+ assert(const_ubo_index->def.bit_size == 32);
+ const_ubo_index->value[0].u32 = num_cbufs;
+ }
+
+ *out_system_values = system_values;
+ *out_num_system_values = num_system_values;
+ *out_num_cbufs = num_cbufs;
+}
+
+static const char *surface_group_names[] = {
+ [CROCUS_SURFACE_GROUP_RENDER_TARGET] = "render target",
+ [CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = "non-coherent render target read",
+ [CROCUS_SURFACE_GROUP_SOL] = "streamout",
+ [CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = "CS work groups",
+ [CROCUS_SURFACE_GROUP_TEXTURE] = "texture",
+ [CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = "texture gather",
+ [CROCUS_SURFACE_GROUP_UBO] = "ubo",
+ [CROCUS_SURFACE_GROUP_SSBO] = "ssbo",
+ [CROCUS_SURFACE_GROUP_IMAGE] = "image",
+};
+
+static void
+crocus_print_binding_table(FILE *fp, const char *name,
+ const struct crocus_binding_table *bt)
+{
+ STATIC_ASSERT(ARRAY_SIZE(surface_group_names) == CROCUS_SURFACE_GROUP_COUNT);
+
+ uint32_t total = 0;
+ uint32_t compacted = 0;
+
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) {
+ uint32_t size = bt->sizes[i];
+ total += size;
+ if (size)
+ compacted += util_bitcount64(bt->used_mask[i]);
+ }
+
+ if (total == 0) {
+ fprintf(fp, "Binding table for %s is empty\n\n", name);
+ return;
+ }
+
+ if (total != compacted) {
+ fprintf(fp, "Binding table for %s "
+ "(compacted to %u entries from %u entries)\n",
+ name, compacted, total);
+ } else {
+ fprintf(fp, "Binding table for %s (%u entries)\n", name, total);
+ }
+
+ uint32_t entry = 0;
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) {
+ uint64_t mask = bt->used_mask[i];
+ while (mask) {
+ int index = u_bit_scan64(&mask);
+ fprintf(fp, " [%u] %s #%d\n", entry++, surface_group_names[i], index);
+ }
+ }
+ fprintf(fp, "\n");
+}
+
+enum {
+ /* Max elements in a surface group. */
+ SURFACE_GROUP_MAX_ELEMENTS = 64,
+};
+
+/**
+ * Map a <group, index> pair to a binding table index.
+ *
+ * For example: <UBO, 5> => binding table index 12
+ */
+uint32_t
+crocus_group_index_to_bti(const struct crocus_binding_table *bt,
+ enum crocus_surface_group group, uint32_t index)
+{
+ assert(index < bt->sizes[group]);
+ uint64_t mask = bt->used_mask[group];
+ uint64_t bit = 1ull << index;
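+ /* A compacted binding table index: the group's base offset plus the
+ * number of used surfaces below this index within the group.
+ */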
+ if (bit & mask) {
+ return bt->offsets[group] + util_bitcount64((bit - 1) & mask);
+ } else {
+ return CROCUS_SURFACE_NOT_USED;
+ }
+}
+
+/**
+ * Map a binding table index back to a <group, index> pair.
+ *
+ * For example: binding table index 12 => <UBO, 5>
+ */
+uint32_t
+crocus_bti_to_group_index(const struct crocus_binding_table *bt,
+ enum crocus_surface_group group, uint32_t bti)
+{
+ uint64_t used_mask = bt->used_mask[group];
+ assert(bti >= bt->offsets[group]);
+
+ uint32_t c = bti - bt->offsets[group];
+ while (used_mask) {
+ int i = u_bit_scan64(&used_mask);
+ if (c == 0)
+ return i;
+ c--;
+ }
+
+ return CROCUS_SURFACE_NOT_USED;
+}
+
+static void
+rewrite_src_with_bti(nir_builder *b, struct crocus_binding_table *bt,
+ nir_instr *instr, nir_src *src,
+ enum crocus_surface_group group)
+{
+ assert(bt->sizes[group] > 0);
+
+ b->cursor = nir_before_instr(instr);
+ nir_ssa_def *bti;
+ if (nir_src_is_const(*src)) {
+ uint32_t index = nir_src_as_uint(*src);
+ bti = nir_imm_intN_t(b, crocus_group_index_to_bti(bt, group, index),
+ src->ssa->bit_size);
+ } else {
+ /* Indirect usage makes all the surfaces of the group available,
+ * so we can just add the base.
+ */
+ assert(bt->used_mask[group] == BITFIELD64_MASK(bt->sizes[group]));
+ bti = nir_iadd_imm(b, src->ssa, bt->offsets[group]);
+ }
+ nir_instr_rewrite_src(instr, src, nir_src_for_ssa(bti));
+}
+
+static void
+mark_used_with_src(struct crocus_binding_table *bt, nir_src *src,
+ enum crocus_surface_group group)
+{
+ assert(bt->sizes[group] > 0);
+
+ if (nir_src_is_const(*src)) {
+ uint64_t index = nir_src_as_uint(*src);
+ assert(index < bt->sizes[group]);
+ bt->used_mask[group] |= 1ull << index;
+ } else {
+ /* There's an indirect usage, we need all the surfaces. */
+ bt->used_mask[group] = BITFIELD64_MASK(bt->sizes[group]);
+ }
+}
+
+static bool
+skip_compacting_binding_tables(void)
+{
+ static int skip = -1;
+ if (skip < 0)
+ skip = env_var_as_boolean("INTEL_DISABLE_COMPACT_BINDING_TABLE", false);
+ return skip;
+}
+
+/**
+ * Set up the binding table indices and apply to the shader.
+ */
+static void
+crocus_setup_binding_table(const struct intel_device_info *devinfo,
+ struct nir_shader *nir,
+ struct crocus_binding_table *bt,
+ unsigned num_render_targets,
+ unsigned num_system_values,
+ unsigned num_cbufs,
+ const struct brw_sampler_prog_key_data *key)
+{
+ const struct shader_info *info = &nir->info;
+
+ memset(bt, 0, sizeof(*bt));
+
+ /* Set the sizes for each surface group. For some groups, we already know
+ * upfront how many will be used, so mark them.
+ */
+ if (info->stage == MESA_SHADER_FRAGMENT) {
+ bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET] = num_render_targets;
+ /* All render targets used. */
+ bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET] =
+ BITFIELD64_MASK(num_render_targets);
+
+ /* Set up the render target read surface group in order to support
+ * non-coherent framebuffer fetch on Gfx7.
+ */
+ if (devinfo->ver >= 6 && info->outputs_read) {
+ bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = num_render_targets;
+ bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] =
+ BITFIELD64_MASK(num_render_targets);
+ }
+ } else if (info->stage == MESA_SHADER_COMPUTE) {
+ bt->sizes[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1;
+ } else if (info->stage == MESA_SHADER_GEOMETRY) {
+ /* In gfx6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
+ * feedback surfaces.
+ */
+ if (devinfo->ver == 6) {
+ bt->sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS;
+ bt->used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1;
+ }
+ }
+
+ bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE] = BITSET_LAST_BIT(info->textures_used);
+ bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE] = info->textures_used[0];
+
+ if (info->uses_texture_gather) {
+ bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = BITSET_LAST_BIT(info->textures_used);
+ bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = info->textures_used[0];
+ }
+
+ bt->sizes[CROCUS_SURFACE_GROUP_IMAGE] = info->num_images;
+
+ /* Allocate an extra slot in the UBO section for NIR constants.
+ * Binding table compaction will remove it if unnecessary.
+ *
+ * We don't include them in crocus_compiled_shader::num_cbufs because
+ * they are uploaded separately from shs->constbufs[], but from a shader
+ * point of view, they're another UBO (at the end of the section).
+ */
+ bt->sizes[CROCUS_SURFACE_GROUP_UBO] = num_cbufs + 1;
+
+ bt->sizes[CROCUS_SURFACE_GROUP_SSBO] = info->num_ssbos;
+
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++)
+ assert(bt->sizes[i] <= SURFACE_GROUP_MAX_ELEMENTS);
+
+ /* Mark surfaces used for the cases we don't have the information available
+ * upfront.
+ */
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_foreach_block (block, impl) {
+ nir_foreach_instr (instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_load_num_workgroups:
+ bt->used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1;
+ break;
+
+ case nir_intrinsic_load_output:
+ if (devinfo->ver >= 6) {
+ mark_used_with_src(bt, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_RENDER_TARGET_READ);
+ }
+ break;
+
+ case nir_intrinsic_image_size:
+ case nir_intrinsic_image_load:
+ case nir_intrinsic_image_store:
+ case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_atomic_imin:
+ case nir_intrinsic_image_atomic_umin:
+ case nir_intrinsic_image_atomic_imax:
+ case nir_intrinsic_image_atomic_umax:
+ case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_image_load_raw_intel:
+ case nir_intrinsic_image_store_raw_intel:
+ mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_IMAGE);
+ break;
+
+ case nir_intrinsic_load_ubo:
+ mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_UBO);
+ break;
+
+ case nir_intrinsic_store_ssbo:
+ mark_used_with_src(bt, &intrin->src[1], CROCUS_SURFACE_GROUP_SSBO);
+ break;
+
+ case nir_intrinsic_get_ssbo_size:
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ case nir_intrinsic_ssbo_atomic_fmin:
+ case nir_intrinsic_ssbo_atomic_fmax:
+ case nir_intrinsic_ssbo_atomic_fcomp_swap:
+ case nir_intrinsic_load_ssbo:
+ mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_SSBO);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+
+ /* When disabled, we just mark everything as used. */
+ if (unlikely(skip_compacting_binding_tables())) {
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++)
+ bt->used_mask[i] = BITFIELD64_MASK(bt->sizes[i]);
+ }
+
+ /* Calculate the offsets and the binding table size based on the used
+ * surfaces. After this point, the functions to go between "group indices"
+ * and binding table indices can be used.
+ */
+ uint32_t next = 0;
+ for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) {
+ if (bt->used_mask[i] != 0) {
+ bt->offsets[i] = next;
+ next += util_bitcount64(bt->used_mask[i]);
+ }
+ }
+ bt->size_bytes = next * 4;
+
+ if (unlikely(INTEL_DEBUG & DEBUG_BT)) {
+ crocus_print_binding_table(stderr, gl_shader_stage_name(info->stage), bt);
+ }
+
+ /* Apply the binding table indices. The backend compiler is not expected
+ * to change those, as we haven't set any of the *_start entries in brw
+ * binding_table.
+ */
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ nir_foreach_block (block, impl) {
+ nir_foreach_instr (instr, block) {
+ if (instr->type == nir_instr_type_tex) {
+ nir_tex_instr *tex = nir_instr_as_tex(instr);
+ bool is_gather = tex->op == nir_texop_tg4;
+
+ /* Rewrite the tg4 component from green to blue before replacing the
+ * texture index.
+ */
+ if (devinfo->ver == 7 && !devinfo->is_haswell) {
+ if (tex->component == 1)
+ if (key->gather_channel_quirk_mask & (1 << tex->texture_index))
+ tex->component = 2;
+ }
+
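+ /* Apply the gfx6 gather workaround: the sampler returns UNORM data for
+ * 8/16-bit integer formats, so rescale the result to an integer and
+ * sign-extend it for signed formats.
+ */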
+ if (is_gather && devinfo->ver == 6 && key->gfx6_gather_wa[tex->texture_index]) {
+ b.cursor = nir_after_instr(instr);
+ enum gfx6_gather_sampler_wa wa = key->gfx6_gather_wa[tex->texture_index];
+ int width = (wa & WA_8BIT) ? 8 : 16;
+
+ nir_ssa_def *val = nir_fmul_imm(&b, &tex->dest.ssa, (1 << width) - 1);
+ val = nir_f2u32(&b, val);
+ if (wa & WA_SIGN) {
+ val = nir_ishl(&b, val, nir_imm_int(&b, 32 - width));
+ val = nir_ishr(&b, val, nir_imm_int(&b, 32 - width));
+ }
+ nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, val, val->parent_instr);
+ }
+
+ tex->texture_index =
+ crocus_group_index_to_bti(bt, is_gather ? CROCUS_SURFACE_GROUP_TEXTURE_GATHER : CROCUS_SURFACE_GROUP_TEXTURE,
+ tex->texture_index);
+ continue;
+ }
+
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_image_size:
+ case nir_intrinsic_image_load:
+ case nir_intrinsic_image_store:
+ case nir_intrinsic_image_atomic_add:
+ case nir_intrinsic_image_atomic_imin:
+ case nir_intrinsic_image_atomic_umin:
+ case nir_intrinsic_image_atomic_imax:
+ case nir_intrinsic_image_atomic_umax:
+ case nir_intrinsic_image_atomic_and:
+ case nir_intrinsic_image_atomic_or:
+ case nir_intrinsic_image_atomic_xor:
+ case nir_intrinsic_image_atomic_exchange:
+ case nir_intrinsic_image_atomic_comp_swap:
+ case nir_intrinsic_image_load_raw_intel:
+ case nir_intrinsic_image_store_raw_intel:
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_IMAGE);
+ break;
+
+ case nir_intrinsic_load_ubo:
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_UBO);
+ break;
+
+ case nir_intrinsic_store_ssbo:
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[1],
+ CROCUS_SURFACE_GROUP_SSBO);
+ break;
+
+ case nir_intrinsic_load_output:
+ if (devinfo->ver >= 6) {
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_RENDER_TARGET_READ);
+ }
+ break;
+
+ case nir_intrinsic_get_ssbo_size:
+ case nir_intrinsic_ssbo_atomic_add:
+ case nir_intrinsic_ssbo_atomic_imin:
+ case nir_intrinsic_ssbo_atomic_umin:
+ case nir_intrinsic_ssbo_atomic_imax:
+ case nir_intrinsic_ssbo_atomic_umax:
+ case nir_intrinsic_ssbo_atomic_and:
+ case nir_intrinsic_ssbo_atomic_or:
+ case nir_intrinsic_ssbo_atomic_xor:
+ case nir_intrinsic_ssbo_atomic_exchange:
+ case nir_intrinsic_ssbo_atomic_comp_swap:
+ case nir_intrinsic_ssbo_atomic_fmin:
+ case nir_intrinsic_ssbo_atomic_fmax:
+ case nir_intrinsic_ssbo_atomic_fcomp_swap:
+ case nir_intrinsic_load_ssbo:
+ rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+ CROCUS_SURFACE_GROUP_SSBO);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+}
+
+static void
+crocus_debug_recompile(struct crocus_context *ice,
+ struct shader_info *info,
+ const struct brw_base_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+ const struct brw_compiler *c = screen->compiler;
+
+ if (!info)
+ return;
+
+ c->shader_perf_log(&ice->dbg, "Recompiling %s shader for program %s: %s\n",
+ _mesa_shader_stage_to_string(info->stage),
+ info->name ? info->name : "(no identifier)",
+ info->label ? info->label : "");
+
+ const void *old_key =
+ crocus_find_previous_compile(ice, info->stage, key->program_string_id);
+
+ brw_debug_key_recompile(c, &ice->dbg, info->stage, old_key, key);
+}
+
+/**
+ * Get the shader for the last enabled geometry stage.
+ *
+ * This stage is the one which will feed stream output and the rasterizer.
+ */
+static gl_shader_stage
+last_vue_stage(struct crocus_context *ice)
+{
+ if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
+ return MESA_SHADER_GEOMETRY;
+
+ if (ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL])
+ return MESA_SHADER_TESS_EVAL;
+
+ return MESA_SHADER_VERTEX;
+}
+
+static GLbitfield64
+crocus_vs_outputs_written(struct crocus_context *ice,
+ const struct brw_vs_prog_key *key,
+ GLbitfield64 user_varyings)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ GLbitfield64 outputs_written = user_varyings;
+
+ if (devinfo->ver < 6) {
+
+ if (key->copy_edgeflag)
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
+
+ /* Put dummy slots into the VUE for the SF to put the replaced
+ * point sprite coords in. We shouldn't need these dummy slots,
+ * which take up precious URB space, but it would mean that the SF
+ * doesn't get nice aligned pairs of input coords into output
+ * coords, which would be a pain to handle.
+ */
+ for (unsigned i = 0; i < 8; i++) {
+ if (key->point_coord_replace & (1 << i))
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
+ }
+
+ /* if back colors are written, allocate slots for front colors too */
+ if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0))
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0);
+ if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1))
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1);
+ }
+
+ /* In order for legacy clipping to work, we need to populate the clip
+ * distance varying slots whenever clipping is enabled, even if the vertex
+ * shader doesn't write to gl_ClipDistance.
+ */
+ if (key->nr_userclip_plane_consts > 0) {
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
+ outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
+ }
+
+ return outputs_written;
+}
+
+/*
+ * If no edgeflags come from the user, gen4/5
+ * require giving the clip shader a default edgeflag.
+ *
+ * This will always be 1.0.
+ */
+static void
+crocus_lower_default_edgeflags(struct nir_shader *nir)
+{
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ b.cursor = nir_after_cf_list(&b.impl->body);
+ nir_variable *var = nir_variable_create(nir, nir_var_shader_out,
+ glsl_float_type(),
+ "edgeflag");
+ var->data.location = VARYING_SLOT_EDGE;
+ nir_store_var(&b, var, nir_imm_float(&b, 1.0), 0x1);
+}
+
+/**
+ * Compile a vertex shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_vs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_vs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_vs_prog_data *vs_prog_data =
+ rzalloc(mem_ctx, struct brw_vs_prog_data);
+ struct brw_vue_prog_data *vue_prog_data = &vs_prog_data->base;
+ struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+ enum brw_param_builtin *system_values;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
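+ /* Lower legacy user clip planes to clip distance writes in NIR, then
+ * re-gather shader info since new outputs were added.
+ */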
+ if (key->nr_userclip_plane_consts) {
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true,
+ false, NULL);
+ nir_lower_io_to_temporaries(nir, impl, true, false);
+ nir_lower_global_vars_to_local(nir);
+ nir_lower_vars_to_ssa(nir);
+ nir_shader_gather_info(nir, impl);
+ }
+
+ prog_data->use_alt_mode = ish->use_alt_mode;
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+
+ crocus_lower_swizzles(nir, &key->base.tex);
+
+ if (devinfo->ver <= 5 &&
+ !(nir->info.inputs_read & BITFIELD64_BIT(VERT_ATTRIB_EDGEFLAG)))
+ crocus_lower_default_edgeflags(nir);
+
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+ uint64_t outputs_written =
+ crocus_vs_outputs_written(ice, key, nir->info.outputs_written);
+ brw_compute_vue_map(devinfo,
+ &vue_prog_data->vue_map, outputs_written,
+ nir->info.separate_shader, /* pos slots */ 1);
+
+ /* Don't tell the backend about our clip plane constants, we've already
+ * lowered them in NIR and we don't want it doing it again.
+ */
+ struct brw_vs_prog_key key_no_ucp = *key;
+ key_no_ucp.nr_userclip_plane_consts = 0;
+ key_no_ucp.copy_edgeflag = false;
+ crocus_sanitize_tex_key(&key_no_ucp.base.tex);
+
+ struct brw_compile_vs_params params = {
+ .nir = nir,
+ .key = &key_no_ucp,
+ .prog_data = vs_prog_data,
+ .edgeflag_is_last = devinfo->ver < 6,
+ .log_data = &ice->dbg,
+ };
+ const unsigned *program =
+ brw_compile_vs(compiler, mem_ctx, &params);
+ if (program == NULL) {
+ dbg_printf("Failed to compile vertex shader: %s\n", params.error_str);
+ ralloc_free(mem_ctx);
+ return false;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ uint32_t *so_decls = NULL;
+ if (devinfo->ver > 6)
+ so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+ &vue_prog_data->vue_map);
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_VS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*vs_prog_data), so_decls,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current vertex shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_vs(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_VERTEX];
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_vs_prog_key key = { KEY_INIT() };
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_VERTEX, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ screen->vtbl.populate_vs_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_VS];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_VS, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_vs(ice, ish, &key);
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_VS] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS |
+ CROCUS_STAGE_DIRTY_BINDINGS_VS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_VS;
+ shs->sysvals_need_upload = true;
+
+ const struct brw_vs_prog_data *vs_prog_data =
+ (void *) shader->prog_data;
+ const bool uses_draw_params = vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance;
+ const bool uses_derived_draw_params = vs_prog_data->uses_drawid ||
+ vs_prog_data->uses_is_indexed_draw;
+ const bool needs_sgvs_element = uses_draw_params ||
+ vs_prog_data->uses_instanceid ||
+ vs_prog_data->uses_vertexid;
+
+ if (ice->state.vs_uses_draw_params != uses_draw_params ||
+ ice->state.vs_uses_derived_draw_params != uses_derived_draw_params ||
+ ice->state.vs_needs_edge_flag != ish->needs_edge_flag ||
+ ice->state.vs_uses_vertexid != vs_prog_data->uses_vertexid ||
+ ice->state.vs_uses_instanceid != vs_prog_data->uses_instanceid) {
+ ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS |
+ CROCUS_DIRTY_VERTEX_ELEMENTS;
+ }
+ ice->state.vs_uses_draw_params = uses_draw_params;
+ ice->state.vs_uses_derived_draw_params = uses_derived_draw_params;
+ ice->state.vs_needs_sgvs_element = needs_sgvs_element;
+ ice->state.vs_needs_edge_flag = ish->needs_edge_flag;
+ ice->state.vs_uses_vertexid = vs_prog_data->uses_vertexid;
+ ice->state.vs_uses_instanceid = vs_prog_data->uses_instanceid;
+ }
+}
+
+/**
+ * Get the shader_info for a given stage, or NULL if the stage is disabled.
+ */
+const struct shader_info *
+crocus_get_shader_info(const struct crocus_context *ice, gl_shader_stage stage)
+{
+ const struct crocus_uncompiled_shader *ish = ice->shaders.uncompiled[stage];
+
+ if (!ish)
+ return NULL;
+
+ const nir_shader *nir = ish->nir;
+ return &nir->info;
+}
+
+/**
+ * Get the union of TCS output and TES input slots.
+ *
+ * TCS and TES need to agree on a common URB entry layout. In particular,
+ * the data for all patch vertices is stored in a single URB entry (unlike
+ * GS which has one entry per input vertex). This means that per-vertex
+ * array indexing needs a stride.
+ *
+ * SSO requires locations to match, but doesn't require the number of
+ * outputs/inputs to match (in fact, the TCS often has extra outputs).
+ * So, we need to take the extra step of unifying these on the fly.
+ */
+static void
+get_unified_tess_slots(const struct crocus_context *ice,
+ uint64_t *per_vertex_slots,
+ uint32_t *per_patch_slots)
+{
+ const struct shader_info *tcs =
+ crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
+ const struct shader_info *tes =
+ crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL);
+
+ *per_vertex_slots = tes->inputs_read;
+ *per_patch_slots = tes->patch_inputs_read;
+
+ if (tcs) {
+ *per_vertex_slots |= tcs->outputs_written;
+ *per_patch_slots |= tcs->patch_outputs_written;
+ }
+}
+
+/**
+ * Compile a tessellation control shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_tcs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_tcs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ const struct nir_shader_compiler_options *options =
+ compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].NirOptions;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_tcs_prog_data *tcs_prog_data =
+ rzalloc(mem_ctx, struct brw_tcs_prog_data);
+ struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
+ struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ enum brw_param_builtin *system_values = NULL;
+ unsigned num_system_values = 0;
+ unsigned num_cbufs = 0;
+
+ nir_shader *nir;
+
+ struct crocus_binding_table bt;
+
+ if (ish) {
+ nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+
+ crocus_lower_swizzles(nir, &key->base.tex);
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+ } else {
+ nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, options, key);
+
+ /* Reserve space for passing the default tess levels as constants. */
+ num_cbufs = 1;
+ num_system_values = 8;
+ system_values =
+ rzalloc_array(mem_ctx, enum brw_param_builtin, num_system_values);
+ prog_data->param = rzalloc_array(mem_ctx, uint32_t, num_system_values);
+ prog_data->nr_params = num_system_values;
+
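+ /* Fill in the default tessellation level constants for the passthrough
+ * TCS; note the reversed slot order (outer levels occupy the high slots).
+ */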
+ if (key->tes_primitive_mode == GL_QUADS) {
+ for (int i = 0; i < 4; i++)
+ system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
+
+ system_values[3] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
+ system_values[2] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y;
+ } else if (key->tes_primitive_mode == GL_TRIANGLES) {
+ for (int i = 0; i < 3; i++)
+ system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
+
+ system_values[4] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
+ } else {
+ assert(key->tes_primitive_mode == GL_ISOLINES);
+ system_values[7] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y;
+ system_values[6] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
+ }
+
+ /* Manually setup the TCS binding table. */
+ memset(&bt, 0, sizeof(bt));
+ bt.sizes[CROCUS_SURFACE_GROUP_UBO] = 1;
+ bt.used_mask[CROCUS_SURFACE_GROUP_UBO] = 1;
+ bt.size_bytes = 4;
+
+ prog_data->ubo_ranges[0].length = 1;
+ }
+
+ struct brw_tcs_prog_key key_clean = *key;
+ crocus_sanitize_tex_key(&key_clean.base.tex);
+ char *error_str = NULL;
+ const unsigned *program =
+ brw_compile_tcs(compiler, &ice->dbg, mem_ctx, &key_clean, tcs_prog_data, nir,
+ -1, NULL, &error_str);
+ if (program == NULL) {
+ dbg_printf("Failed to compile control shader: %s\n", error_str);
+ ralloc_free(mem_ctx);
+ return false;
+ }
+
+ if (ish) {
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+ }
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_TCS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*tcs_prog_data), NULL,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ if (ish)
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current tessellation control shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_tcs(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
+ struct crocus_uncompiled_shader *tcs =
+ ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL];
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ const struct shader_info *tes_info =
+ crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL);
+ struct brw_tcs_prog_key key = {
+ KEY_INIT_NO_ID(),
+ .base.program_string_id = tcs ? tcs->program_id : 0,
+ .tes_primitive_mode = tes_info->tess.primitive_mode,
+ .input_vertices = ice->state.vertices_per_patch,
+ .quads_workaround = tes_info->tess.primitive_mode == GL_QUADS &&
+ tes_info->tess.spacing == TESS_SPACING_EQUAL,
+ };
+
+ if (tcs && tcs->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_CTRL, tcs,
+ tcs->nir->info.uses_texture_gather, &key.base.tex);
+ get_unified_tess_slots(ice, &key.outputs_written,
+ &key.patch_outputs_written);
+ screen->vtbl.populate_tcs_key(ice, &key);
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TCS];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_TCS, sizeof(key), &key);
+
+ if (tcs && !shader)
+ shader = crocus_disk_cache_retrieve(ice, tcs, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_tcs(ice, tcs, &key);
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_TCS] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TCS |
+ CROCUS_STAGE_DIRTY_BINDINGS_TCS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+ shs->sysvals_need_upload = true;
+ }
+}
+
+/**
+ * Compile a tessellation evaluation shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_tes(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_tes_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_tes_prog_data *tes_prog_data =
+ rzalloc(mem_ctx, struct brw_tes_prog_data);
+ struct brw_vue_prog_data *vue_prog_data = &tes_prog_data->base;
+ struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+ enum brw_param_builtin *system_values;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ if (key->nr_userclip_plane_consts) {
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true,
+ false, NULL);
+ nir_lower_io_to_temporaries(nir, impl, true, false);
+ nir_lower_global_vars_to_local(nir);
+ nir_lower_vars_to_ssa(nir);
+ nir_shader_gather_info(nir, impl);
+ }
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+ crocus_lower_swizzles(nir, &key->base.tex);
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+ struct brw_vue_map input_vue_map;
+ brw_compute_tess_vue_map(&input_vue_map, key->inputs_read,
+ key->patch_inputs_read);
+
+ struct brw_tes_prog_key key_clean = *key;
+ crocus_sanitize_tex_key(&key_clean.base.tex);
+ char *error_str = NULL;
+ const unsigned *program =
+ brw_compile_tes(compiler, &ice->dbg, mem_ctx, &key_clean, &input_vue_map,
+ tes_prog_data, nir, -1, NULL, &error_str);
+ if (program == NULL) {
+ dbg_printf("Failed to compile evaluation shader: %s\n", error_str);
+ ralloc_free(mem_ctx);
+ return false;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ uint32_t *so_decls = NULL;
+ if (devinfo->ver > 6)
+ so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+ &vue_prog_data->vue_map);
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_TES, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*tes_prog_data), so_decls,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current tessellation evaluation shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_tes(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL];
+ struct brw_tes_prog_key key = { KEY_INIT() };
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_EVAL, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ get_unified_tess_slots(ice, &key.inputs_read, &key.patch_inputs_read);
+ screen->vtbl.populate_tes_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TES];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_TES, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_tes(ice, ish, &key);
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_TES] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TES |
+ CROCUS_STAGE_DIRTY_BINDINGS_TES |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+ shs->sysvals_need_upload = true;
+ }
+
+ /* TODO: Could compare and avoid flagging this. */
+ const struct shader_info *tes_info = &ish->nir->info;
+ if (BITSET_TEST(tes_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+ ice->state.shaders[MESA_SHADER_TESS_EVAL].sysvals_need_upload = true;
+ }
+}
+
+/**
+ * Compile a geometry shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_gs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_gs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_gs_prog_data *gs_prog_data =
+ rzalloc(mem_ctx, struct brw_gs_prog_data);
+ struct brw_vue_prog_data *vue_prog_data = &gs_prog_data->base;
+ struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+ enum brw_param_builtin *system_values;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ if (key->nr_userclip_plane_consts) {
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_lower_clip_gs(nir, (1 << key->nr_userclip_plane_consts) - 1, false,
+ NULL);
+ nir_lower_io_to_temporaries(nir, impl, true, false);
+ nir_lower_global_vars_to_local(nir);
+ nir_lower_vars_to_ssa(nir);
+ nir_shader_gather_info(nir, impl);
+ }
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+ crocus_lower_swizzles(nir, &key->base.tex);
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+ brw_compute_vue_map(devinfo,
+ &vue_prog_data->vue_map, nir->info.outputs_written,
+ nir->info.separate_shader, /* pos slots */ 1);
+
+ if (devinfo->ver == 6)
+ gfx6_gs_xfb_setup(&ish->stream_output, gs_prog_data);
+ struct brw_gs_prog_key key_clean = *key;
+ crocus_sanitize_tex_key(&key_clean.base.tex);
+
+ char *error_str = NULL;
+ const unsigned *program =
+ brw_compile_gs(compiler, &ice->dbg, mem_ctx, &key_clean, gs_prog_data, nir,
+ -1, NULL, &error_str);
+ if (program == NULL) {
+ dbg_printf("Failed to compile geometry shader: %s\n", error_str);
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ uint32_t *so_decls = NULL;
+ if (devinfo->ver > 6)
+ so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+ &vue_prog_data->vue_map);
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_GS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*gs_prog_data), so_decls,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current geometry shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_gs(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_GEOMETRY];
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_GS];
+ struct crocus_compiled_shader *shader = NULL;
+
+ if (ish) {
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_gs_prog_key key = { KEY_INIT() };
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_GEOMETRY, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ screen->vtbl.populate_gs_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
+
+ shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_GS, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_gs(ice, ish, &key);
+ }
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_GS] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS |
+ CROCUS_STAGE_DIRTY_BINDINGS_GS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_GS;
+ shs->sysvals_need_upload = true;
+ }
+}
+
+/**
+ * Compile a fragment (pixel) shader, and upload the assembly.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_fs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_wm_prog_key *key,
+ struct brw_vue_map *vue_map)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_wm_prog_data *fs_prog_data =
+ rzalloc(mem_ctx, struct brw_wm_prog_data);
+ struct brw_stage_prog_data *prog_data = &fs_prog_data->base;
+ enum brw_param_builtin *system_values;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ prog_data->use_alt_mode = ish->use_alt_mode;
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+
+ /* Lower output variables to load_output intrinsics before setting up
+ * binding tables, so crocus_setup_binding_table can map any load_output
+ * intrinsics to CROCUS_SURFACE_GROUP_RENDER_TARGET_READ on Gen8 for
+ * non-coherent framebuffer fetches.
+ */
+ brw_nir_lower_fs_outputs(nir);
+
+ /* lower swizzles before binding table */
+ crocus_lower_swizzles(nir, &key->base.tex);
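+ /* Reserve at least one render target slot even if the shader writes no
+ * color outputs, so a null render target surface can be bound (e.g. for
+ * depth-only rendering).
+ */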
+ int null_rts = 1;
+
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt,
+ MAX2(key->nr_color_regions, null_rts),
+ num_system_values, num_cbufs,
+ &key->base.tex);
+
+ if (can_push_ubo(devinfo))
+ brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+ struct brw_wm_prog_key key_clean = *key;
+ crocus_sanitize_tex_key(&key_clean.base.tex);
+
+ struct brw_compile_fs_params params = {
+ .nir = nir,
+ .key = &key_clean,
+ .prog_data = fs_prog_data,
+
+ .allow_spilling = true,
+ .vue_map = vue_map,
+
+ .log_data = &ice->dbg,
+ };
+ const unsigned *program =
+ brw_compile_fs(compiler, mem_ctx, &params);
+ if (program == NULL) {
+ dbg_printf("Failed to compile fragment shader: %s\n", params.error_str);
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_FS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*fs_prog_data), NULL,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+/**
+ * Update the current fragment shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ */
+static void
+crocus_update_compiled_fs(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+ struct brw_wm_prog_key key = { KEY_INIT() };
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_FRAGMENT, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ screen->vtbl.populate_fs_key(ice, &ish->nir->info, &key);
+
+ if (ish->nos & (1ull << CROCUS_NOS_LAST_VUE_MAP))
+ key.input_slots_valid = ice->shaders.last_vue_map->slots_valid;
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_FS];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_FS, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_fs(ice, ish, &key, ice->shaders.last_vue_map);
+
+ if (old != shader) {
+ // XXX: only need to flag CLIP if barycentric has NONPERSPECTIVE
+ // toggles. might be able to avoid flagging SBE too.
+ ice->shaders.prog[CROCUS_CACHE_FS] = shader;
+ ice->state.dirty |= CROCUS_DIRTY_WM;
+ /* gen4 clip/sf rely on fs prog_data */
+ if (devinfo->ver < 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+ else
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ if (devinfo->ver == 6)
+ ice->state.dirty |= CROCUS_DIRTY_RASTER;
+ if (devinfo->ver >= 7)
+ ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS |
+ CROCUS_STAGE_DIRTY_BINDINGS_FS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_FS;
+ shs->sysvals_need_upload = true;
+ }
+}
+
+/**
+ * Update the last enabled stage's VUE map.
+ *
+ * When the shader feeding the rasterizer's output interface changes, we
+ * need to re-emit various packets.
+ */
+static void
+update_last_vue_map(struct crocus_context *ice,
+ struct brw_stage_prog_data *prog_data)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+ struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
+ struct brw_vue_map *old_map = ice->shaders.last_vue_map;
+ const uint64_t changed_slots =
+ (old_map ? old_map->slots_valid : 0ull) ^ vue_map->slots_valid;
+
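+ /* For example, if the new last stage starts (or stops) writing
+ * VARYING_SLOT_VIEWPORT, the number of usable viewports changes and the
+ * viewport and scissor state below must be re-emitted.
+ */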
+ if (changed_slots & VARYING_BIT_VIEWPORT) {
+ ice->state.num_viewports =
+ (vue_map->slots_valid & VARYING_BIT_VIEWPORT) ? CROCUS_MAX_VIEWPORTS : 1;
+ ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT |
+ CROCUS_DIRTY_CC_VIEWPORT;
+ if (devinfo->ver < 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+
+ if (devinfo->ver <= 6)
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+
+ if (devinfo->ver >= 6)
+ ice->state.dirty |= CROCUS_DIRTY_CLIP |
+ CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS |
+ ice->state.stage_dirty_for_nos[CROCUS_NOS_LAST_VUE_MAP];
+ }
+
+ if (changed_slots || (old_map && old_map->separate != vue_map->separate)) {
+ ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS;
+ }
+
+ ice->shaders.last_vue_map = &vue_prog_data->vue_map;
+}
+
+static void
+crocus_update_pull_constant_descriptors(struct crocus_context *ice,
+ gl_shader_stage stage)
+{
+ struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+
+ if (!shader || !shader->prog_data->has_ubo_pull)
+ return;
+
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ bool any_new_descriptors =
+ shader->num_system_values > 0 && shs->sysvals_need_upload;
+
+ unsigned bound_cbufs = shs->bound_cbufs;
+
+ while (bound_cbufs) {
+ const int i = u_bit_scan(&bound_cbufs);
+ struct pipe_constant_buffer *cbuf = &shs->constbufs[i];
+ if (cbuf->buffer) {
+ any_new_descriptors = true;
+ }
+ }
+
+ if (any_new_descriptors)
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
+}
+
+/**
+ * Get the prog_data for a given stage, or NULL if the stage is disabled.
+ */
+static struct brw_vue_prog_data *
+get_vue_prog_data(struct crocus_context *ice, gl_shader_stage stage)
+{
+ if (!ice->shaders.prog[stage])
+ return NULL;
+
+ return (void *) ice->shaders.prog[stage]->prog_data;
+}
+
+static struct crocus_compiled_shader *
+crocus_compile_clip(struct crocus_context *ice, struct brw_clip_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx;
+ unsigned program_size;
+ mem_ctx = ralloc_context(NULL);
+
+ struct brw_clip_prog_data *clip_prog_data =
+ rzalloc(mem_ctx, struct brw_clip_prog_data);
+
+ const unsigned *program = brw_compile_clip(compiler, mem_ctx, key, clip_prog_data,
+ ice->shaders.last_vue_map, &program_size);
+
+ if (program == NULL) {
+ dbg_printf("failed to compile clip shader\n");
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+ struct crocus_binding_table bt;
+ memset(&bt, 0, sizeof(bt));
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_CLIP, sizeof(*key), key, program,
+ program_size,
+ (struct brw_stage_prog_data *)clip_prog_data, sizeof(*clip_prog_data),
+ NULL, NULL, 0, 0, &bt);
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+static void
+crocus_update_compiled_clip(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct brw_clip_prog_key key;
+ struct crocus_compiled_shader *old = ice->shaders.clip_prog;
+ memset(&key, 0, sizeof(key));
+
+ const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
+ if (wm_prog_data) {
+ key.contains_flat_varying = wm_prog_data->contains_flat_varying;
+ key.contains_noperspective_varying =
+ wm_prog_data->contains_noperspective_varying;
+ memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode));
+ }
+
+ key.primitive = u_reduced_prim(ice->state.prim_mode);
+ key.attrs = ice->shaders.last_vue_map->slots_valid;
+
+ struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+ key.pv_first = rs_state->flatshade_first;
+
+ if (rs_state->clip_plane_enable)
+ key.nr_userclip = util_logbase2(rs_state->clip_plane_enable) + 1;
+
+ if (screen->devinfo.ver == 5)
+ key.clip_mode = BRW_CLIP_MODE_KERNEL_CLIP;
+ else
+ key.clip_mode = BRW_CLIP_MODE_NORMAL;
+
+ if (key.primitive == PIPE_PRIM_TRIANGLES) {
+ if (rs_state->cull_face == PIPE_FACE_FRONT_AND_BACK)
+ key.clip_mode = BRW_CLIP_MODE_REJECT_ALL;
+ else {
+ uint32_t fill_front = BRW_CLIP_FILL_MODE_CULL;
+ uint32_t fill_back = BRW_CLIP_FILL_MODE_CULL;
+ uint32_t offset_front = 0;
+ uint32_t offset_back = 0;
+
+ if (!(rs_state->cull_face & PIPE_FACE_FRONT)) {
+ switch (rs_state->fill_front) {
+ case PIPE_POLYGON_MODE_FILL:
+ fill_front = BRW_CLIP_FILL_MODE_FILL;
+ offset_front = 0;
+ break;
+ case PIPE_POLYGON_MODE_LINE:
+ fill_front = BRW_CLIP_FILL_MODE_LINE;
+ offset_front = rs_state->offset_line;
+ break;
+ case PIPE_POLYGON_MODE_POINT:
+ fill_front = BRW_CLIP_FILL_MODE_POINT;
+ offset_front = rs_state->offset_point;
+ break;
+ }
+ }
+
+ if (!(rs_state->cull_face & PIPE_FACE_BACK)) {
+ switch (rs_state->fill_back) {
+ case PIPE_POLYGON_MODE_FILL:
+ fill_back = BRW_CLIP_FILL_MODE_FILL;
+ offset_back = 0;
+ break;
+ case PIPE_POLYGON_MODE_LINE:
+ fill_back = BRW_CLIP_FILL_MODE_LINE;
+ offset_back = rs_state->offset_line;
+ break;
+ case PIPE_POLYGON_MODE_POINT:
+ fill_back = BRW_CLIP_FILL_MODE_POINT;
+ offset_back = rs_state->offset_point;
+ break;
+ }
+ }
+
+ if (rs_state->fill_back != PIPE_POLYGON_MODE_FILL ||
+ rs_state->fill_front != PIPE_POLYGON_MODE_FILL) {
+ key.do_unfilled = 1;
+
+ /* The fixed-function units handle most cases; when one or more
+ * polygon faces are unfilled, the clip kernel has to help:
+ */
+ key.clip_mode = BRW_CLIP_MODE_CLIP_NON_REJECTED;
+
+ if (offset_back || offset_front) {
+ double mrd = 0.0;
+ if (ice->state.framebuffer.zsbuf)
+ mrd = util_get_depth_format_mrd(util_format_description(ice->state.framebuffer.zsbuf->format));
+ key.offset_units = rs_state->offset_units * mrd * 2;
+ key.offset_factor = rs_state->offset_scale * mrd;
+ key.offset_clamp = rs_state->offset_clamp * mrd;
+ }
+
+ if (!(rs_state->front_ccw ^ rs_state->bottom_edge_rule)) {
+ key.fill_ccw = fill_front;
+ key.fill_cw = fill_back;
+ key.offset_ccw = offset_front;
+ key.offset_cw = offset_back;
+ if (rs_state->light_twoside &&
+ key.fill_cw != BRW_CLIP_FILL_MODE_CULL)
+ key.copy_bfc_cw = 1;
+ } else {
+ key.fill_cw = fill_front;
+ key.fill_ccw = fill_back;
+ key.offset_cw = offset_front;
+ key.offset_ccw = offset_back;
+ if (rs_state->light_twoside &&
+ key.fill_ccw != BRW_CLIP_FILL_MODE_CULL)
+ key.copy_bfc_ccw = 1;
+ }
+ }
+ }
+ }
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_CLIP, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_compile_clip(ice, &key);
+
+ if (old != shader) {
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ ice->shaders.clip_prog = shader;
+ }
+}
+
+static struct crocus_compiled_shader *
+crocus_compile_sf(struct crocus_context *ice, struct brw_sf_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx;
+ unsigned program_size;
+ mem_ctx = ralloc_context(NULL);
+
+ struct brw_sf_prog_data *sf_prog_data =
+ rzalloc(mem_ctx, struct brw_sf_prog_data);
+
+ const unsigned *program = brw_compile_sf(compiler, mem_ctx, key, sf_prog_data,
+ ice->shaders.last_vue_map, &program_size);
+
+ if (program == NULL) {
+ dbg_printf("failed to compile sf shader\n");
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ struct crocus_binding_table bt;
+ memset(&bt, 0, sizeof(bt));
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_SF, sizeof(*key), key, program,
+ program_size,
+ (struct brw_stage_prog_data *)sf_prog_data, sizeof(*sf_prog_data),
+ NULL, NULL, 0, 0, &bt);
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+static void
+crocus_update_compiled_sf(struct crocus_context *ice)
+{
+ struct brw_sf_prog_key key;
+ struct crocus_compiled_shader *old = ice->shaders.sf_prog;
+ memset(&key, 0, sizeof(key));
+
+ key.attrs = ice->shaders.last_vue_map->slots_valid;
+
+ switch (u_reduced_prim(ice->state.prim_mode)) {
+ case GL_TRIANGLES:
+ default:
+ if (key.attrs & BITFIELD64_BIT(VARYING_SLOT_EDGE))
+ key.primitive = BRW_SF_PRIM_UNFILLED_TRIS;
+ else
+ key.primitive = BRW_SF_PRIM_TRIANGLES;
+ break;
+ case GL_LINES:
+ key.primitive = BRW_SF_PRIM_LINES;
+ break;
+ case GL_POINTS:
+ key.primitive = BRW_SF_PRIM_POINTS;
+ break;
+ }
+
+ struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+ key.userclip_active = rs_state->clip_plane_enable != 0;
+ const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
+ if (wm_prog_data) {
+ key.contains_flat_varying = wm_prog_data->contains_flat_varying;
+ memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode));
+ }
+
+ key.do_twoside_color = rs_state->light_twoside;
+
+ key.do_point_sprite = rs_state->point_quad_rasterization;
+ if (key.do_point_sprite) {
+ key.point_sprite_coord_replace = rs_state->sprite_coord_enable & 0xff;
+ if (rs_state->sprite_coord_enable & (1 << 8))
+ key.do_point_coord = 1;
+ if (wm_prog_data && wm_prog_data->urb_setup[VARYING_SLOT_PNTC] != -1)
+ key.do_point_coord = 1;
+ }
+
+ key.sprite_origin_lower_left = rs_state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT;
+
+ if (key.do_twoside_color) {
+ key.frontface_ccw = rs_state->front_ccw;
+ }
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_SF, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_compile_sf(ice, &key);
+
+ if (old != shader) {
+ ice->state.dirty |= CROCUS_DIRTY_RASTER;
+ ice->shaders.sf_prog = shader;
+ }
+}
+
+static struct crocus_compiled_shader *
+crocus_compile_ff_gs(struct crocus_context *ice, struct brw_ff_gs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx;
+ unsigned program_size;
+ mem_ctx = ralloc_context(NULL);
+
+ struct brw_ff_gs_prog_data *ff_gs_prog_data =
+ rzalloc(mem_ctx, struct brw_ff_gs_prog_data);
+
+ const unsigned *program = brw_compile_ff_gs_prog(compiler, mem_ctx, key, ff_gs_prog_data,
+ ice->shaders.last_vue_map, &program_size);
+
+ if (program == NULL) {
+ dbg_printf("failed to compile sf shader\n");
+ ralloc_free(mem_ctx);
+ return false;
+ }
+
+ struct crocus_binding_table bt;
+ memset(&bt, 0, sizeof(bt));
+
+ if (screen->devinfo.ver == 6) {
+ bt.sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS;
+ bt.used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1;
+
+ bt.size_bytes = BRW_MAX_SOL_BINDINGS * 4;
+ }
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_FF_GS, sizeof(*key), key, program,
+ program_size,
+ (struct brw_stage_prog_data *)ff_gs_prog_data, sizeof(*ff_gs_prog_data),
+ NULL, NULL, 0, 0, &bt);
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+static void
+crocus_update_compiled_ff_gs(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_ff_gs_prog_key key;
+ struct crocus_compiled_shader *old = ice->shaders.ff_gs_prog;
+ memset(&key, 0, sizeof(key));
+
+ assert(devinfo->ver < 7);
+
+ key.attrs = ice->shaders.last_vue_map->slots_valid;
+
+ key.primitive = screen->vtbl.translate_prim_type(ice->state.prim_mode, 0);
+
+ struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+ key.pv_first = rs_state->flatshade_first;
+
+ if (key.primitive == _3DPRIM_QUADLIST && !rs_state->flatshade) {
+ /* Provide consistent primitive order with brw_set_prim's
+ * optimization of single quads to trifans.
+ */
+ key.pv_first = true;
+ }
+
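+ /* On gen6 the fixed-function GS kernel is only needed when transform
+ * feedback is active; on gen4/5 it is needed to decompose quad lists,
+ * quad strips and line loops into primitives the rasterizer can handle.
+ */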
+ if (devinfo->ver >= 6) {
+ key.need_gs_prog = ice->state.streamout_active;
+ if (key.need_gs_prog) {
+ struct crocus_uncompiled_shader *vs =
+ ice->shaders.uncompiled[MESA_SHADER_VERTEX];
+ gfx6_ff_gs_xfb_setup(&vs->stream_output,
+ &key);
+ }
+ } else {
+ key.need_gs_prog = (key.primitive == _3DPRIM_QUADLIST ||
+ key.primitive == _3DPRIM_QUADSTRIP ||
+ key.primitive == _3DPRIM_LINELOOP);
+ }
+
+ struct crocus_compiled_shader *shader = NULL;
+ if (key.need_gs_prog) {
+ shader = crocus_find_cached_shader(ice, CROCUS_CACHE_FF_GS,
+ sizeof(key), &key);
+ if (!shader)
+ shader = crocus_compile_ff_gs(ice, &key);
+ }
+ if (old != shader) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
+ if (!!old != !!shader)
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+ ice->shaders.ff_gs_prog = shader;
+ if (shader) {
+ const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
+ ice->state.last_xfb_verts_per_prim = gs_prog_data->svbi_postincrement_value;
+ }
+ }
+}
+
+// XXX: crocus_compiled_shaders are space-leaking :(
+// XXX: do remember to unbind them if deleting them.
+
+/**
+ * Update the current shader variants for the given state.
+ *
+ * This should be called on every draw call to ensure that the correct
+ * shaders are bound. It will also flag any dirty state triggered by
+ * swapping out those shaders.
+ */
+bool
+crocus_update_compiled_shaders(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ const uint64_t stage_dirty = ice->state.stage_dirty;
+
+ struct brw_vue_prog_data *old_prog_datas[4];
+ if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) {
+ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++)
+ old_prog_datas[i] = get_vue_prog_data(ice, i);
+ }
+
+ if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_TCS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) {
+ struct crocus_uncompiled_shader *tes =
+ ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL];
+ if (tes) {
+ crocus_update_compiled_tcs(ice);
+ crocus_update_compiled_tes(ice);
+ } else {
+ ice->shaders.prog[CROCUS_CACHE_TCS] = NULL;
+ ice->shaders.prog[CROCUS_CACHE_TES] = NULL;
+ ice->state.stage_dirty |=
+ CROCUS_STAGE_DIRTY_TCS | CROCUS_STAGE_DIRTY_TES |
+ CROCUS_STAGE_DIRTY_BINDINGS_TCS | CROCUS_STAGE_DIRTY_BINDINGS_TES |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TCS | CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+ }
+ }
+
+ if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_VS)
+ crocus_update_compiled_vs(ice);
+ if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_GS)
+ crocus_update_compiled_gs(ice);
+
+ if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_GS |
+ CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) {
+ const struct crocus_compiled_shader *gs =
+ ice->shaders.prog[MESA_SHADER_GEOMETRY];
+ const struct crocus_compiled_shader *tes =
+ ice->shaders.prog[MESA_SHADER_TESS_EVAL];
+
+ bool points_or_lines = false;
+
+ if (gs) {
+ const struct brw_gs_prog_data *gs_prog_data = (void *) gs->prog_data;
+ points_or_lines =
+ gs_prog_data->output_topology == _3DPRIM_POINTLIST ||
+ gs_prog_data->output_topology == _3DPRIM_LINESTRIP;
+ } else if (tes) {
+ const struct brw_tes_prog_data *tes_data = (void *) tes->prog_data;
+ points_or_lines =
+ tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_LINE ||
+ tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+ }
+
+ if (ice->shaders.output_topology_is_points_or_lines != points_or_lines) {
+ /* Outbound to XY Clip enables */
+ ice->shaders.output_topology_is_points_or_lines = points_or_lines;
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ }
+ }
+
+ if (!ice->shaders.prog[MESA_SHADER_VERTEX])
+ return false;
+
+ gl_shader_stage last_stage = last_vue_stage(ice);
+ struct crocus_compiled_shader *shader = ice->shaders.prog[last_stage];
+ struct crocus_uncompiled_shader *ish = ice->shaders.uncompiled[last_stage];
+ update_last_vue_map(ice, shader->prog_data);
+ if (ice->state.streamout != shader->streamout) {
+ ice->state.streamout = shader->streamout;
+ ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST | CROCUS_DIRTY_STREAMOUT;
+ }
+
+ if (ice->state.streamout_active) {
+ screen->vtbl.update_so_strides(ice, ish->stream_output.stride);
+ }
+
+ /* use ice->state version as last_vue_map can dirty this bit */
+ if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_FS)
+ crocus_update_compiled_fs(ice);
+
+ if (screen->devinfo.ver <= 6) {
+ if (ice->state.dirty & CROCUS_DIRTY_GEN4_FF_GS_PROG &&
+ !ice->shaders.prog[MESA_SHADER_GEOMETRY])
+ crocus_update_compiled_ff_gs(ice);
+ }
+
+ if (screen->devinfo.ver < 6) {
+ if (ice->state.dirty & CROCUS_DIRTY_GEN4_CLIP_PROG)
+ crocus_update_compiled_clip(ice);
+ if (ice->state.dirty & CROCUS_DIRTY_GEN4_SF_PROG)
+ crocus_update_compiled_sf(ice);
+ }
+
+ /* Changing shader interfaces may require a URB reconfiguration. */
+ if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) {
+ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+ struct brw_vue_prog_data *old = old_prog_datas[i];
+ struct brw_vue_prog_data *new = get_vue_prog_data(ice, i);
+ if (!!old != !!new ||
+ (new && new->urb_entry_size != old->urb_entry_size)) {
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+ break;
+ }
+ }
+ }
+
+ if (ice->state.stage_dirty & CROCUS_RENDER_STAGE_DIRTY_CONSTANTS) {
+ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_FRAGMENT; i++) {
+ if (ice->state.stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << i))
+ crocus_update_pull_constant_descriptors(ice, i);
+ }
+ }
+ return true;
+}
+
+static struct crocus_compiled_shader *
+crocus_compile_cs(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ const struct brw_cs_prog_key *key)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct brw_compiler *compiler = screen->compiler;
+ void *mem_ctx = ralloc_context(NULL);
+ struct brw_cs_prog_data *cs_prog_data =
+ rzalloc(mem_ctx, struct brw_cs_prog_data);
+ struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
+ enum brw_param_builtin *system_values;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ unsigned num_system_values;
+ unsigned num_cbufs;
+
+ nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+ NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
+
+ crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+ &num_system_values, &num_cbufs);
+ crocus_lower_swizzles(nir, &key->base.tex);
+ struct crocus_binding_table bt;
+ crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+ num_system_values, num_cbufs, &key->base.tex);
+
+ struct brw_compile_cs_params params = {
+ .nir = nir,
+ .key = key,
+ .prog_data = cs_prog_data,
+ .log_data = &ice->dbg,
+ };
+
+ const unsigned *program =
+ brw_compile_cs(compiler, mem_ctx, &params);
+ if (program == NULL) {
+ dbg_printf("Failed to compile compute shader: %s\n", params.error_str);
+ ralloc_free(mem_ctx);
+ return NULL;
+ }
+
+ if (ish->compiled_once) {
+ crocus_debug_recompile(ice, &nir->info, &key->base);
+ } else {
+ ish->compiled_once = true;
+ }
+
+ struct crocus_compiled_shader *shader =
+ crocus_upload_shader(ice, CROCUS_CACHE_CS, sizeof(*key), key, program,
+ prog_data->program_size,
+ prog_data, sizeof(*cs_prog_data), NULL,
+ system_values, num_system_values,
+ num_cbufs, &bt);
+
+ crocus_disk_cache_store(screen->disk_cache, ish, shader,
+ ice->shaders.cache_bo_map,
+ key, sizeof(*key));
+
+ ralloc_free(mem_ctx);
+ return shader;
+}
+
+static void
+crocus_update_compiled_cs(struct crocus_context *ice)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_cs_prog_key key = { KEY_INIT() };
+
+ if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+ crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_COMPUTE, ish,
+ ish->nir->info.uses_texture_gather, &key.base.tex);
+ screen->vtbl.populate_cs_key(ice, &key);
+
+ struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_CS];
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_CS, sizeof(key), &key);
+
+ if (!shader)
+ shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+ if (!shader)
+ shader = crocus_compile_cs(ice, ish, &key);
+
+ if (old != shader) {
+ ice->shaders.prog[CROCUS_CACHE_CS] = shader;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS |
+ CROCUS_STAGE_DIRTY_BINDINGS_CS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_CS;
+ shs->sysvals_need_upload = true;
+ }
+}
+
+void
+crocus_update_compiled_compute_shader(struct crocus_context *ice)
+{
+ if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_CS)
+ crocus_update_compiled_cs(ice);
+
+ if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS)
+ crocus_update_pull_constant_descriptors(ice, MESA_SHADER_COMPUTE);
+}
+
+void
+crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
+ unsigned threads,
+ uint32_t *dst)
+{
+ assert(brw_cs_push_const_total_size(cs_prog_data, threads) > 0);
+ assert(cs_prog_data->push.cross_thread.size == 0);
+ assert(cs_prog_data->push.per_thread.dwords == 1);
+ assert(cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
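+ /* Each thread's block of per-thread push constants is one register (8
+ * DWords) and only its first DWord, the subgroup ID, is populated: e.g.
+ * for threads == 4 this writes dst[0] = 0, dst[8] = 1, dst[16] = 2,
+ * dst[24] = 3.
+ */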
+ for (unsigned t = 0; t < threads; t++)
+ dst[8 * t] = t;
+}
+
+/**
+ * Allocate scratch BOs as needed for the given per-thread size and stage.
+ */
+struct crocus_bo *
+crocus_get_scratch_space(struct crocus_context *ice,
+ unsigned per_thread_scratch,
+ gl_shader_stage stage)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ unsigned encoded_size = ffs(per_thread_scratch) - 11;
+ assert(encoded_size < (1 << 16));
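+ /* per_thread_scratch is expected to be a power of two of at least 1KB,
+ * so this maps 1KB -> slot 0, 2KB -> slot 1, 4KB -> slot 2, and so on.
+ */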
+
+ struct crocus_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage];
+
+ unsigned subslice_total = screen->subslice_total;
+ subslice_total = 4 * devinfo->num_slices;
+ // assert(subslice_total >= screen->subslice_total);
+
+ if (!*bop) {
+ unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;
+
+ uint32_t max_threads[] = {
+ [MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
+ [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
+ [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
+ [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
+ [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
+ [MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslice_total,
+ };
+
+ uint32_t size = per_thread_scratch * max_threads[stage];
+
+ *bop = crocus_bo_alloc(bufmgr, "scratch", size);
+ }
+
+ return *bop;
+}
+
+/* ------------------------------------------------------------------- */
+
+/**
+ * The pipe->create_[stage]_state() driver hooks.
+ *
+ * Performs basic NIR preprocessing, records any state dependencies, and
+ * returns a crocus_uncompiled_shader as the Gallium CSO.
+ *
+ * Actual shader compilation to assembly happens later, at first use.
+ */
+static void *
+crocus_create_uncompiled_shader(struct pipe_context *ctx,
+ nir_shader *nir,
+ const struct pipe_stream_output_info *so_info)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_uncompiled_shader *ish =
+ calloc(1, sizeof(struct crocus_uncompiled_shader));
+ if (!ish)
+ return NULL;
+
+ if (devinfo->ver >= 6)
+ NIR_PASS(ish->needs_edge_flag, nir, crocus_fix_edge_flags);
+ else
+ ish->needs_edge_flag = false;
+
+ brw_preprocess_nir(screen->compiler, nir, NULL);
+
+ NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo, false);
+ NIR_PASS_V(nir, crocus_lower_storage_image_derefs);
+
+ nir_sweep(nir);
+
+ ish->program_id = get_new_program_id(screen);
+ ish->nir = nir;
+ if (so_info) {
+ memcpy(&ish->stream_output, so_info, sizeof(*so_info));
+ update_so_info(&ish->stream_output, nir->info.outputs_written);
+ }
+
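+ /* ARB assembly programs want the EU's ALT floating-point mode, which
+ * gives the non-IEEE results they expect (e.g. 0^0 == 1); flag them so
+ * the compiler can enable it.
+ */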
+ /* Save this now before potentially dropping nir->info.name */
+ if (nir->info.name && strncmp(nir->info.name, "ARB", 3) == 0)
+ ish->use_alt_mode = true;
+
+ if (screen->disk_cache) {
+ /* Serialize the NIR to a binary blob that we can hash for the disk
+ * cache. Drop unnecessary information (like variable names)
+ * so the serialized NIR is smaller, and also to let us detect more
+ * isomorphic shaders when hashing, increasing cache hits.
+ */
+ struct blob blob;
+ blob_init(&blob);
+ nir_serialize(&blob, nir, true);
+ _mesa_sha1_compute(blob.data, blob.size, ish->nir_sha1);
+ blob_finish(&blob);
+ }
+
+ return ish;
+}
+
+static struct crocus_uncompiled_shader *
+crocus_create_shader_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct nir_shader *nir;
+
+ if (state->type == PIPE_SHADER_IR_TGSI)
+ nir = tgsi_to_nir(state->tokens, ctx->screen, false);
+ else
+ nir = state->ir.nir;
+
+ return crocus_create_uncompiled_shader(ctx, nir, &state->stream_output);
+}
+
+static void *
+crocus_create_vs_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ /* User clip planes or gen5 sprite coord enable */
+ if (ish->nir->info.clip_distance_array_size == 0 ||
+ screen->devinfo.ver <= 5)
+ ish->nos |= (1ull << CROCUS_NOS_RASTERIZER);
+
+ if (!screen->devinfo.is_haswell)
+ ish->nos |= (1ull << CROCUS_NOS_VERTEX_ELEMENTS);
+
+ if (screen->precompile) {
+ struct brw_vs_prog_key key = { KEY_INIT() };
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_vs(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_tcs_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+ struct shader_info *info = &ish->nir->info;
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ if (screen->precompile) {
+ const unsigned _GL_TRIANGLES = 0x0004;
+ struct brw_tcs_prog_key key = {
+ KEY_INIT(),
+ // XXX: make sure the linker fills this out from the TES...
+ .tes_primitive_mode =
+ info->tess.primitive_mode ? info->tess.primitive_mode
+ : _GL_TRIANGLES,
+ .outputs_written = info->outputs_written,
+ .patch_outputs_written = info->patch_outputs_written,
+ };
+
+ key.input_vertices = info->tess.tcs_vertices_out;
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_tcs(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_tes_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+ struct shader_info *info = &ish->nir->info;
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ /* User clip planes */
+ if (ish->nir->info.clip_distance_array_size == 0)
+ ish->nos |= (1ull << CROCUS_NOS_RASTERIZER);
+
+ if (screen->precompile) {
+ struct brw_tes_prog_key key = {
+ KEY_INIT(),
+ // XXX: not ideal, need TCS output/TES input unification
+ .inputs_read = info->inputs_read,
+ .patch_inputs_read = info->patch_inputs_read,
+ };
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_tes(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_gs_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ /* User clip planes */
+ if (ish->nir->info.clip_distance_array_size == 0)
+ ish->nos |= (1ull << CROCUS_NOS_RASTERIZER);
+
+ if (screen->precompile) {
+ struct brw_gs_prog_key key = { KEY_INIT() };
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_gs(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_fs_state(struct pipe_context *ctx,
+ const struct pipe_shader_state *state)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+ struct shader_info *info = &ish->nir->info;
+
+ ish->nos |= (1ull << CROCUS_NOS_FRAMEBUFFER) |
+ (1ull << CROCUS_NOS_DEPTH_STENCIL_ALPHA) |
+ (1ull << CROCUS_NOS_RASTERIZER) |
+ (1ull << CROCUS_NOS_TEXTURES) |
+ (1ull << CROCUS_NOS_BLEND);
+
+ /* The program key needs the VUE map if there are > 16 inputs or gen4/5 */
+ if (screen->devinfo.ver < 6 || util_bitcount64(ish->nir->info.inputs_read &
+ BRW_FS_VARYING_INPUT_MASK) > 16) {
+ ish->nos |= (1ull << CROCUS_NOS_LAST_VUE_MAP);
+ }
+
+ if (screen->precompile) {
+ const uint64_t color_outputs = info->outputs_written &
+ ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
+ BITFIELD64_BIT(FRAG_RESULT_STENCIL) |
+ BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
+
+ bool can_rearrange_varyings =
+ screen->devinfo.ver > 6 && util_bitcount64(info->inputs_read & BRW_FS_VARYING_INPUT_MASK) <= 16;
+
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct brw_wm_prog_key key = {
+ KEY_INIT(),
+ .nr_color_regions = util_bitcount(color_outputs),
+ .coherent_fb_fetch = false,
+ .input_slots_valid =
+ can_rearrange_varyings ? 0 : info->inputs_read | VARYING_BIT_POS,
+ };
+
+ struct brw_vue_map vue_map;
+ if (devinfo->ver < 6) {
+ brw_compute_vue_map(devinfo, &vue_map,
+ info->inputs_read | VARYING_BIT_POS,
+ false, /* pos slots */ 1);
+ }
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_fs(ice, ish, &key, &vue_map);
+ }
+
+ return ish;
+}
+
+static void *
+crocus_create_compute_state(struct pipe_context *ctx,
+ const struct pipe_compute_state *state)
+{
+ assert(state->ir_type == PIPE_SHADER_IR_NIR);
+
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_uncompiled_shader *ish =
+ crocus_create_uncompiled_shader(ctx, (void *) state->prog, NULL);
+
+ ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+ // XXX: disallow more than 64KB of shared variables
+
+ if (screen->precompile) {
+ struct brw_cs_prog_key key = { KEY_INIT() };
+
+ if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+ crocus_compile_cs(ice, ish, &key);
+ }
+
+ return ish;
+}
+
+/**
+ * The pipe->delete_[stage]_state() driver hooks.
+ *
+ * Frees the crocus_uncompiled_shader.
+ */
+static void
+crocus_delete_shader_state(struct pipe_context *ctx, void *state, gl_shader_stage stage)
+{
+ struct crocus_uncompiled_shader *ish = state;
+ struct crocus_context *ice = (void *) ctx;
+
+ if (ice->shaders.uncompiled[stage] == ish) {
+ ice->shaders.uncompiled[stage] = NULL;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage;
+ }
+
+ if (ish->const_data) {
+ pipe_resource_reference(&ish->const_data, NULL);
+ pipe_resource_reference(&ish->const_data_state.res, NULL);
+ }
+
+ ralloc_free(ish->nir);
+ free(ish);
+}
+
+static void
+crocus_delete_vs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_VERTEX);
+}
+
+static void
+crocus_delete_tcs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_CTRL);
+}
+
+static void
+crocus_delete_tes_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_EVAL);
+}
+
+static void
+crocus_delete_gs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_GEOMETRY);
+}
+
+static void
+crocus_delete_fs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_FRAGMENT);
+}
+
+static void
+crocus_delete_cs_state(struct pipe_context *ctx, void *state)
+{
+ crocus_delete_shader_state(ctx, state, MESA_SHADER_COMPUTE);
+}
+
+/**
+ * The pipe->bind_[stage]_state() driver hook.
+ *
+ * Binds an uncompiled shader as the current one for a particular stage.
+ * Updates dirty tracking to account for the shader's NOS.
+ */
+static void
+bind_shader_state(struct crocus_context *ice,
+ struct crocus_uncompiled_shader *ish,
+ gl_shader_stage stage)
+{
+ uint64_t dirty_bit = CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage;
+ const uint64_t nos = ish ? ish->nos : 0;
+
+ const struct shader_info *old_info = crocus_get_shader_info(ice, stage);
+ const struct shader_info *new_info = ish ? &ish->nir->info : NULL;
+
+ if ((old_info ? BITSET_LAST_BIT(old_info->textures_used) : 0) !=
+ (new_info ? BITSET_LAST_BIT(new_info->textures_used) : 0)) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
+ }
+
+ ice->shaders.uncompiled[stage] = ish;
+ ice->state.stage_dirty |= dirty_bit;
+
+ /* Record that CSOs need to mark CROCUS_STAGE_DIRTY_UNCOMPILED_XS when they change
+ * (or that they no longer need to do so).
+ */
+ for (int i = 0; i < CROCUS_NOS_COUNT; i++) {
+ if (nos & (1 << i))
+ ice->state.stage_dirty_for_nos[i] |= dirty_bit;
+ else
+ ice->state.stage_dirty_for_nos[i] &= ~dirty_bit;
+ }
+}
+
+static void
+crocus_bind_vs_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_uncompiled_shader *new_ish = state;
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (new_ish &&
+ ice->state.window_space_position !=
+ new_ish->nir->info.vs.window_space_position) {
+ ice->state.window_space_position =
+ new_ish->nir->info.vs.window_space_position;
+
+ ice->state.dirty |= CROCUS_DIRTY_CLIP |
+ CROCUS_DIRTY_RASTER |
+ CROCUS_DIRTY_CC_VIEWPORT;
+ }
+
+ if (devinfo->ver == 6) {
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+ }
+
+ bind_shader_state((void *) ctx, state, MESA_SHADER_VERTEX);
+}
+
+static void
+crocus_bind_tcs_state(struct pipe_context *ctx, void *state)
+{
+ bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_CTRL);
+}
+
+static void
+crocus_bind_tes_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ /* Enabling/disabling optional stages requires a URB reconfiguration. */
+ if (!!state != !!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL])
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+
+ bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_EVAL);
+}
+
+static void
+crocus_bind_gs_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+
+ /* Enabling/disabling optional stages requires a URB reconfiguration. */
+ if (!!state != !!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+
+ bind_shader_state((void *) ctx, state, MESA_SHADER_GEOMETRY);
+}
+
+static void
+crocus_bind_fs_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_uncompiled_shader *old_ish =
+ ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+ struct crocus_uncompiled_shader *new_ish = state;
+
+ const unsigned color_bits =
+ BITFIELD64_BIT(FRAG_RESULT_COLOR) |
+ BITFIELD64_RANGE(FRAG_RESULT_DATA0, BRW_MAX_DRAW_BUFFERS);
+
+ /* Fragment shader outputs influence HasWriteableRT */
+ if (!old_ish || !new_ish ||
+ (old_ish->nir->info.outputs_written & color_bits) !=
+ (new_ish->nir->info.outputs_written & color_bits))
+ ice->state.dirty |= CROCUS_DIRTY_WM;
+
+ bind_shader_state((void *) ctx, state, MESA_SHADER_FRAGMENT);
+}
+
+static void
+crocus_bind_cs_state(struct pipe_context *ctx, void *state)
+{
+ bind_shader_state((void *) ctx, state, MESA_SHADER_COMPUTE);
+}
+
+void
+crocus_init_program_functions(struct pipe_context *ctx)
+{
+ ctx->create_vs_state = crocus_create_vs_state;
+ ctx->create_tcs_state = crocus_create_tcs_state;
+ ctx->create_tes_state = crocus_create_tes_state;
+ ctx->create_gs_state = crocus_create_gs_state;
+ ctx->create_fs_state = crocus_create_fs_state;
+ ctx->create_compute_state = crocus_create_compute_state;
+
+ ctx->delete_vs_state = crocus_delete_vs_state;
+ ctx->delete_tcs_state = crocus_delete_tcs_state;
+ ctx->delete_tes_state = crocus_delete_tes_state;
+ ctx->delete_gs_state = crocus_delete_gs_state;
+ ctx->delete_fs_state = crocus_delete_fs_state;
+ ctx->delete_compute_state = crocus_delete_cs_state;
+
+ ctx->bind_vs_state = crocus_bind_vs_state;
+ ctx->bind_tcs_state = crocus_bind_tcs_state;
+ ctx->bind_tes_state = crocus_bind_tes_state;
+ ctx->bind_gs_state = crocus_bind_gs_state;
+ ctx->bind_fs_state = crocus_bind_fs_state;
+ ctx->bind_compute_state = crocus_bind_cs_state;
+}
diff --git a/src/gallium/drivers/crocus/crocus_program_cache.c b/src/gallium/drivers/crocus/crocus_program_cache.c
new file mode 100644
index 00000000000..d2d4b821754
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_program_cache.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_program_cache.c
+ *
+ * The in-memory program cache. This is basically a hash table mapping
+ * API-specified shaders and a state key to a compiled variant. It also
+ * takes care of uploading shader assembly into a BO for use on the GPU.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_atomic.h"
+#include "util/u_upload_mgr.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/compiler/brw_eu.h"
+#include "intel/compiler/brw_nir.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+
+struct keybox {
+ uint16_t size;
+ enum crocus_program_cache_id cache_id;
+ uint8_t data[0];
+};
+
+static struct keybox *
+make_keybox(void *mem_ctx, enum crocus_program_cache_id cache_id,
+ const void *key, uint32_t key_size)
+{
+ struct keybox *keybox =
+ ralloc_size(mem_ctx, sizeof(struct keybox) + key_size);
+
+ keybox->cache_id = cache_id;
+ keybox->size = key_size;
+ memcpy(keybox->data, key, key_size);
+
+ return keybox;
+}
+
+static uint32_t
+keybox_hash(const void *void_key)
+{
+ const struct keybox *key = void_key;
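+ /* The cache id and the key payload are contiguous in the keybox, so
+ * hashing from &cache_id for size + sizeof(cache_id) bytes covers both.
+ */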
+ return _mesa_hash_data(&key->cache_id, key->size + sizeof(key->cache_id));
+}
+
+static bool
+keybox_equals(const void *void_a, const void *void_b)
+{
+ const struct keybox *a = void_a, *b = void_b;
+ if (a->size != b->size)
+ return false;
+
+ return memcmp(a->data, b->data, a->size) == 0;
+}
+
+struct crocus_compiled_shader *
+crocus_find_cached_shader(struct crocus_context *ice,
+ enum crocus_program_cache_id cache_id,
+ uint32_t key_size, const void *key)
+{
+ struct keybox *keybox = make_keybox(NULL, cache_id, key, key_size);
+ struct hash_entry *entry =
+ _mesa_hash_table_search(ice->shaders.cache, keybox);
+
+ ralloc_free(keybox);
+
+ return entry ? entry->data : NULL;
+}
+
+const void *
+crocus_find_previous_compile(const struct crocus_context *ice,
+ enum crocus_program_cache_id cache_id,
+ unsigned program_string_id)
+{
+ hash_table_foreach(ice->shaders.cache, entry) {
+ const struct keybox *keybox = entry->key;
+ const struct brw_base_prog_key *key = (const void *)keybox->data;
+ if (keybox->cache_id == cache_id &&
+ key->program_string_id == program_string_id) {
+ return keybox->data;
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * Look for an existing entry in the cache that has identical assembly code.
+ *
+ * This is useful for programs generating shaders at runtime, where multiple
+ * distinct shaders (from an API perspective) may compile to the same assembly
+ * in our backend. This saves space in the program cache buffer.
+ */
+static const struct crocus_compiled_shader *
+find_existing_assembly(struct hash_table *cache, void *map,
+ const void *assembly, unsigned assembly_size)
+{
+ hash_table_foreach (cache, entry) {
+ const struct crocus_compiled_shader *existing = entry->data;
+
+ if (existing->map_size != assembly_size)
+ continue;
+
+ if (memcmp(map + existing->offset, assembly, assembly_size) == 0)
+ return existing;
+ }
+ return NULL;
+}
+
+static void
+crocus_cache_new_bo(struct crocus_context *ice,
+ uint32_t new_size)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ struct crocus_bo *new_bo;
+ new_bo = crocus_bo_alloc(screen->bufmgr, "program cache", new_size);
+
+ void *map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE |
+ MAP_ASYNC | MAP_PERSISTENT);
+
+ if (ice->shaders.cache_next_offset != 0) {
+ memcpy(map, ice->shaders.cache_bo_map, ice->shaders.cache_next_offset);
+ }
+
+ crocus_bo_unmap(ice->shaders.cache_bo);
+ crocus_bo_unreference(ice->shaders.cache_bo);
+ ice->shaders.cache_bo = new_bo;
+ ice->shaders.cache_bo_map = map;
+
+ if (screen->devinfo.ver == 4) {
+      /* Re-emit all shaders on Gen4 only. */
+ ice->state.dirty |= CROCUS_DIRTY_CLIP | CROCUS_DIRTY_RASTER |
+ CROCUS_DIRTY_WM;
+ }
+   /* The shader assembly now lives in a new BO, so unset the state base
+    * address flag: STATE_BASE_ADDRESS must be re-emitted on both batches
+    * before any cached program is used again.
+    */
+   ice->batches[CROCUS_BATCH_RENDER].state_base_address_emitted = false;
+   ice->batches[CROCUS_BATCH_COMPUTE].state_base_address_emitted = false;
+}
+
+static uint32_t
+crocus_alloc_item_data(struct crocus_context *ice, uint32_t size)
+{
+ if (ice->shaders.cache_next_offset + size > ice->shaders.cache_bo->size) {
+ uint32_t new_size = ice->shaders.cache_bo->size * 2;
+ while (ice->shaders.cache_next_offset + size > new_size)
+ new_size *= 2;
+
+ crocus_cache_new_bo(ice, new_size);
+ }
+ uint32_t offset = ice->shaders.cache_next_offset;
+
+ /* Programs are always 64-byte aligned, so set up the next one now */
+ ice->shaders.cache_next_offset = ALIGN(offset + size, 64);
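+   /* For example, a 100-byte program placed at offset 0 leaves the next
+    * allocation starting at offset 128 (ALIGN(100, 64)).
+    */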
+ return offset;
+}
+
+struct crocus_compiled_shader *
+crocus_upload_shader(struct crocus_context *ice,
+ enum crocus_program_cache_id cache_id, uint32_t key_size,
+ const void *key, const void *assembly, uint32_t asm_size,
+ struct brw_stage_prog_data *prog_data,
+ uint32_t prog_data_size, uint32_t *streamout,
+ enum brw_param_builtin *system_values,
+ unsigned num_system_values, unsigned num_cbufs,
+ const struct crocus_binding_table *bt)
+{
+ struct hash_table *cache = ice->shaders.cache;
+ struct crocus_compiled_shader *shader =
+ rzalloc_size(cache, sizeof(struct crocus_compiled_shader));
+ const struct crocus_compiled_shader *existing = find_existing_assembly(
+ cache, ice->shaders.cache_bo_map, assembly, asm_size);
+
+   /* If we can find a matching program in the cache already, reuse the
+    * existing entry without creating a new copy in the underlying buffer
+    * object. This is notably useful for programs that generate shaders at
+    * runtime, where multiple distinct shaders may compile to the same
+    * assembly in our backend.
+    */
+ if (existing) {
+ shader->offset = existing->offset;
+ shader->map_size = existing->map_size;
+ } else {
+ shader->offset = crocus_alloc_item_data(ice, asm_size);
+ shader->map_size = asm_size;
+
+ memcpy(ice->shaders.cache_bo_map + shader->offset, assembly, asm_size);
+ }
+
+ shader->prog_data = prog_data;
+ shader->prog_data_size = prog_data_size;
+ shader->streamout = streamout;
+ shader->system_values = system_values;
+ shader->num_system_values = num_system_values;
+ shader->num_cbufs = num_cbufs;
+ shader->bt = *bt;
+
+ ralloc_steal(shader, shader->prog_data);
+ if (prog_data_size > 16) {
+ ralloc_steal(shader->prog_data, prog_data->param);
+ ralloc_steal(shader->prog_data, prog_data->pull_param);
+ }
+ ralloc_steal(shader, shader->streamout);
+ ralloc_steal(shader, shader->system_values);
+
+ struct keybox *keybox = make_keybox(shader, cache_id, key, key_size);
+ _mesa_hash_table_insert(ice->shaders.cache, keybox, shader);
+
+ return shader;
+}
+
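+/**
+ * BLORP program cache hook: look up a previously uploaded BLORP shader in
+ * the same program cache, keyed under CROCUS_CACHE_BLORP.
+ */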
+bool
+crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch, const void *key,
+ uint32_t key_size, uint32_t *kernel_out,
+ void *prog_data_out)
+{
+ struct blorp_context *blorp = blorp_batch->blorp;
+ struct crocus_context *ice = blorp->driver_ctx;
+ struct crocus_compiled_shader *shader =
+ crocus_find_cached_shader(ice, CROCUS_CACHE_BLORP, key_size, key);
+
+ if (!shader)
+ return false;
+
+ *kernel_out = shader->offset;
+ *((void **)prog_data_out) = shader->prog_data;
+
+ return true;
+}
+
+bool
+crocus_blorp_upload_shader(struct blorp_batch *blorp_batch, uint32_t stage,
+ const void *key, uint32_t key_size,
+ const void *kernel, uint32_t kernel_size,
+ const struct brw_stage_prog_data *prog_data_templ,
+ uint32_t prog_data_size, uint32_t *kernel_out,
+ void *prog_data_out)
+{
+ struct blorp_context *blorp = blorp_batch->blorp;
+ struct crocus_context *ice = blorp->driver_ctx;
+
+ struct brw_stage_prog_data *prog_data = ralloc_size(NULL, prog_data_size);
+ memcpy(prog_data, prog_data_templ, prog_data_size);
+
+ struct crocus_binding_table bt;
+ memset(&bt, 0, sizeof(bt));
+
+ struct crocus_compiled_shader *shader = crocus_upload_shader(
+ ice, CROCUS_CACHE_BLORP, key_size, key, kernel, kernel_size, prog_data,
+ prog_data_size, NULL, NULL, 0, 0, &bt);
+
+ *kernel_out = shader->offset;
+ *((void **)prog_data_out) = shader->prog_data;
+
+ return true;
+}
+
+void
+crocus_init_program_cache(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ ice->shaders.cache =
+ _mesa_hash_table_create(ice, keybox_hash, keybox_equals);
+
+ ice->shaders.cache_bo =
+ crocus_bo_alloc(screen->bufmgr, "program_cache", 16384);
+ ice->shaders.cache_bo_map =
+ crocus_bo_map(NULL, ice->shaders.cache_bo,
+ MAP_READ | MAP_WRITE | MAP_ASYNC | MAP_PERSISTENT);
+}
+
+void
+crocus_destroy_program_cache(struct crocus_context *ice)
+{
+ for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+ ice->shaders.prog[i] = NULL;
+ }
+
+ if (ice->shaders.cache_bo) {
+ crocus_bo_unmap(ice->shaders.cache_bo);
+ crocus_bo_unreference(ice->shaders.cache_bo);
+ ice->shaders.cache_bo_map = NULL;
+ ice->shaders.cache_bo = NULL;
+ }
+
+ ralloc_free(ice->shaders.cache);
+}
+
+static const char *
+cache_name(enum crocus_program_cache_id cache_id)
+{
+ if (cache_id == CROCUS_CACHE_BLORP)
+ return "BLORP";
+
+ if (cache_id == CROCUS_CACHE_SF)
+ return "SF";
+
+ if (cache_id == CROCUS_CACHE_CLIP)
+ return "CLIP";
+
+ if (cache_id == CROCUS_CACHE_FF_GS)
+ return "FF_GS";
+
+ return _mesa_shader_stage_to_string(cache_id);
+}
+
+void
+crocus_print_program_cache(struct crocus_context *ice)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ hash_table_foreach(ice->shaders.cache, entry) {
+ const struct keybox *keybox = entry->key;
+ struct crocus_compiled_shader *shader = entry->data;
+ fprintf(stderr, "%s:\n", cache_name(keybox->cache_id));
+ brw_disassemble(devinfo, ice->shaders.cache_bo_map + shader->offset, 0,
+ shader->prog_data->program_size, NULL, stderr);
+ }
+}
diff --git a/src/gallium/drivers/crocus/crocus_query.c b/src/gallium/drivers/crocus/crocus_query.c
new file mode 100644
index 00000000000..14ba9fbce59
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_query.c
@@ -0,0 +1,996 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_query.c
+ *
+ * ============================= GENXML CODE =============================
+ * [This file is compiled once per generation.]
+ * =======================================================================
+ *
+ * Query object support. This allows measuring various simple statistics
+ * via counters on the GPU. We use GenX code for MI_MATH calculations.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "perf/intel_perf.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_inlines.h"
+#include "util/u_upload_mgr.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "crocus_fence.h"
+#include "crocus_monitor.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+
+#include "crocus_genx_macros.h"
+
+#if GFX_VER == 6
+// TODO: Add these to genxml?
+#define SO_PRIM_STORAGE_NEEDED(n) (0x2280)
+#define SO_NUM_PRIMS_WRITTEN(n) (0x2288)
+
+// TODO: remove HS/DS/CS
+#define GFX6_IA_VERTICES_COUNT_num 0x2310
+#define GFX6_IA_PRIMITIVES_COUNT_num 0x2318
+#define GFX6_VS_INVOCATION_COUNT_num 0x2320
+#define GFX6_HS_INVOCATION_COUNT_num 0x2300
+#define GFX6_DS_INVOCATION_COUNT_num 0x2308
+#define GFX6_GS_INVOCATION_COUNT_num 0x2328
+#define GFX6_GS_PRIMITIVES_COUNT_num 0x2330
+#define GFX6_CL_INVOCATION_COUNT_num 0x2338
+#define GFX6_CL_PRIMITIVES_COUNT_num 0x2340
+#define GFX6_PS_INVOCATION_COUNT_num 0x2348
+#define GFX6_CS_INVOCATION_COUNT_num 0x2290
+#define GFX6_PS_DEPTH_COUNT_num 0x2350
+
+#elif GFX_VER == 7
+#define SO_PRIM_STORAGE_NEEDED(n) (GENX(SO_PRIM_STORAGE_NEEDED0_num) + (n) * 8)
+#define SO_NUM_PRIMS_WRITTEN(n) (GENX(SO_NUM_PRIMS_WRITTEN0_num) + (n) * 8)
+#endif
+
+struct crocus_query {
+ enum pipe_query_type type;
+ int index;
+
+ bool ready;
+
+ bool stalled;
+
+ uint64_t result;
+
+ struct crocus_state_ref query_state_ref;
+ struct crocus_query_snapshots *map;
+ struct crocus_syncobj *syncobj;
+
+ int batch_idx;
+
+ struct crocus_monitor_object *monitor;
+
+ /* Fence for PIPE_QUERY_GPU_FINISHED. */
+ struct pipe_fence_handle *fence;
+};
+
+struct crocus_query_snapshots {
+ /** crocus_render_condition's saved MI_PREDICATE_RESULT value. */
+ uint64_t predicate_result;
+
+ /** Have the start/end snapshots landed? */
+ uint64_t snapshots_landed;
+
+ /** Starting and ending counter snapshots */
+ uint64_t start;
+ uint64_t end;
+};
+
+struct crocus_query_so_overflow {
+ uint64_t predicate_result;
+ uint64_t snapshots_landed;
+
+ struct {
+ uint64_t prim_storage_needed[2];
+ uint64_t num_prims[2];
+ } stream[4];
+};
+
+#if GFX_VERx10 == 75
+static struct mi_value
+query_mem64(struct crocus_query *q, uint32_t offset)
+{
+ return mi_mem64(rw_bo(crocus_resource_bo(q->query_state_ref.res),
+ q->query_state_ref.offset + offset));
+}
+#endif
+
+/**
+ * Is this type of query written by PIPE_CONTROL?
+ */
+static bool
+crocus_is_query_pipelined(struct crocus_query *q)
+{
+ switch (q->type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ case PIPE_QUERY_TIMESTAMP:
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ case PIPE_QUERY_TIME_ELAPSED:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static void
+mark_available(struct crocus_context *ice, struct crocus_query *q)
+{
+#if GFX_VERx10 == 75
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+ struct crocus_screen *screen = batch->screen;
+ unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
+ unsigned offset = offsetof(struct crocus_query_snapshots, snapshots_landed);
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+ offset += q->query_state_ref.offset;
+
+ if (!crocus_is_query_pipelined(q)) {
+ screen->vtbl.store_data_imm64(batch, bo, offset, true);
+ } else {
+ /* Order available *after* the query results. */
+ flags |= PIPE_CONTROL_FLUSH_ENABLE;
+ crocus_emit_pipe_control_write(batch, "query: mark available",
+ flags, bo, offset, true);
+ }
+#endif
+}
+
+/**
+ * Write a pipelined counter snapshot (e.g. PS_DEPTH_COUNT or a timestamp)
+ * into the query object's BO at the given offset via a PIPE_CONTROL.
+ */
+static void
+crocus_pipelined_write(struct crocus_batch *batch,
+ struct crocus_query *q,
+ enum pipe_control_flags flags,
+ unsigned offset)
+{
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+
+ crocus_emit_pipe_control_write(batch, "query: pipelined snapshot write",
+ flags,
+ bo, offset, 0ull);
+}
+
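+/**
+ * Write the counter snapshot appropriate for q->type into the query BO at
+ * the given offset; used for both the start and end snapshots.
+ */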
+static void
+write_value(struct crocus_context *ice, struct crocus_query *q, unsigned offset)
+{
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+#if GFX_VER >= 6
+ struct crocus_screen *screen = batch->screen;
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+#endif
+
+ if (!crocus_is_query_pipelined(q)) {
+ crocus_emit_pipe_control_flush(batch,
+ "query: non-pipelined snapshot write",
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD);
+ q->stalled = true;
+ }
+
+ switch (q->type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q,
+ PIPE_CONTROL_WRITE_DEPTH_COUNT |
+ PIPE_CONTROL_DEPTH_STALL,
+ offset);
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ case PIPE_QUERY_TIMESTAMP:
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q,
+ PIPE_CONTROL_WRITE_TIMESTAMP,
+ offset);
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+#if GFX_VER >= 6
+ screen->vtbl.store_register_mem64(batch,
+ q->index == 0 ?
+ GENX(CL_INVOCATION_COUNT_num) :
+ SO_PRIM_STORAGE_NEEDED(q->index),
+ bo, offset, false);
+#endif
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+#if GFX_VER >= 6
+ screen->vtbl.store_register_mem64(batch,
+ SO_NUM_PRIMS_WRITTEN(q->index),
+ bo, offset, false);
+#endif
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
+#if GFX_VER >= 6
+ static const uint32_t index_to_reg[] = {
+ GENX(IA_VERTICES_COUNT_num),
+ GENX(IA_PRIMITIVES_COUNT_num),
+ GENX(VS_INVOCATION_COUNT_num),
+ GENX(GS_INVOCATION_COUNT_num),
+ GENX(GS_PRIMITIVES_COUNT_num),
+ GENX(CL_INVOCATION_COUNT_num),
+ GENX(CL_PRIMITIVES_COUNT_num),
+ GENX(PS_INVOCATION_COUNT_num),
+ GENX(HS_INVOCATION_COUNT_num),
+ GENX(DS_INVOCATION_COUNT_num),
+ GENX(CS_INVOCATION_COUNT_num),
+ };
+ uint32_t reg = index_to_reg[q->index];
+
+#if GFX_VER == 6
+ /* Gfx6 GS code counts full primitives, that is, it won't count individual
+ * triangles in a triangle strip. Use CL_INVOCATION_COUNT for that.
+ */
+ if (q->index == PIPE_STAT_QUERY_GS_PRIMITIVES)
+ reg = GENX(CL_INVOCATION_COUNT_num);
+#endif
+
+ screen->vtbl.store_register_mem64(batch, reg, bo, offset, false);
+#endif
+ break;
+ }
+ default:
+ assert(false);
+ }
+}
+
+#if GFX_VER >= 6
+static void
+write_overflow_values(struct crocus_context *ice, struct crocus_query *q, bool end)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_screen *screen = batch->screen;
+ uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+ uint32_t offset = q->query_state_ref.offset;
+ crocus_emit_pipe_control_flush(batch,
+ "query: write SO overflow snapshots",
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD);
+ for (uint32_t i = 0; i < count; i++) {
+ int s = q->index + i;
+ int g_idx = offset + offsetof(struct crocus_query_so_overflow,
+ stream[s].num_prims[end]);
+ int w_idx = offset + offsetof(struct crocus_query_so_overflow,
+ stream[s].prim_storage_needed[end]);
+ screen->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
+ bo, g_idx, false);
+ screen->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
+ bo, w_idx, false);
+ }
+}
+#endif
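+
+/**
+ * Compute the delta between two raw timestamp snapshots, accounting for the
+ * counter wrapping at TIMESTAMP_BITS (e.g. with a 36-bit counter, a start
+ * value near the top of the range and a small end value still yield a small
+ * positive delta).
+ */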
+static uint64_t
+crocus_raw_timestamp_delta(uint64_t time0, uint64_t time1)
+{
+ if (time0 > time1) {
+ return (1ULL << TIMESTAMP_BITS) + time1 - time0;
+ } else {
+ return time1 - time0;
+ }
+}
+
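+/**
+ * A stream has overflowed if the number of primitives that needed storage
+ * differs from the number of primitives actually written between the two
+ * snapshots.
+ */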
+static bool
+stream_overflowed(struct crocus_query_so_overflow *so, int s)
+{
+ return (so->stream[s].prim_storage_needed[1] -
+ so->stream[s].prim_storage_needed[0]) !=
+ (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
+}
+
+static void
+calculate_result_on_cpu(const struct intel_device_info *devinfo,
+ struct crocus_query *q)
+{
+ switch (q->type) {
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ q->result = q->map->end != q->map->start;
+ break;
+ case PIPE_QUERY_TIMESTAMP:
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ /* The timestamp is the single starting snapshot. */
+ q->result = intel_device_info_timebase_scale(devinfo, q->map->start);
+ q->result &= (1ull << TIMESTAMP_BITS) - 1;
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ q->result = crocus_raw_timestamp_delta(q->map->start, q->map->end);
+ q->result = intel_device_info_timebase_scale(devinfo, q->result);
+ q->result &= (1ull << TIMESTAMP_BITS) - 1;
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ q->result = stream_overflowed((void *) q->map, q->index);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ q->result = false;
+ for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
+ q->result |= stream_overflowed((void *) q->map, i);
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
+ q->result = q->map->end - q->map->start;
+
+ /* WaDividePSInvocationCountBy4:HSW,BDW */
+ if (GFX_VER == 7 && devinfo->is_haswell && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
+ q->result /= 4;
+ break;
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ default:
+ q->result = q->map->end - q->map->start;
+ break;
+ }
+
+ q->ready = true;
+}
+
+#if GFX_VERx10 == 75
+/**
+ * Calculate the streamout overflow for stream \p idx:
+ *
+ * (num_prims[1] - num_prims[0]) - (storage_needed[1] - storage_needed[0])
+ */
+static struct mi_value
+calc_overflow_for_stream(struct mi_builder *b,
+ struct crocus_query *q,
+ int idx)
+{
+#define C(counter, i) query_mem64(q, \
+ offsetof(struct crocus_query_so_overflow, stream[idx].counter[i]))
+
+ return mi_isub(b, mi_isub(b, C(num_prims, 1), C(num_prims, 0)),
+ mi_isub(b, C(prim_storage_needed, 1),
+ C(prim_storage_needed, 0)));
+#undef C
+}
+
+/**
+ * Calculate whether any stream has overflowed.
+ */
+static struct mi_value
+calc_overflow_any_stream(struct mi_builder *b, struct crocus_query *q)
+{
+ struct mi_value stream_result[MAX_VERTEX_STREAMS];
+ for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
+ stream_result[i] = calc_overflow_for_stream(b, q, i);
+
+ struct mi_value result = stream_result[0];
+ for (int i = 1; i < MAX_VERTEX_STREAMS; i++)
+ result = mi_ior(b, result, stream_result[i]);
+
+ return result;
+}
+
+
+static bool
+query_is_boolean(enum pipe_query_type type)
+{
+ switch (type) {
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/**
+ * Calculate the result using MI_MATH.
+ */
+static struct mi_value
+calculate_result_on_gpu(const struct intel_device_info *devinfo,
+ struct mi_builder *b,
+ struct crocus_query *q)
+{
+ struct mi_value result;
+ struct mi_value start_val =
+ query_mem64(q, offsetof(struct crocus_query_snapshots, start));
+ struct mi_value end_val =
+ query_mem64(q, offsetof(struct crocus_query_snapshots, end));
+
+ switch (q->type) {
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ result = calc_overflow_for_stream(b, q, q->index);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ result = calc_overflow_any_stream(b, q);
+ break;
+ case PIPE_QUERY_TIMESTAMP: {
+ /* TODO: This discards any fractional bits of the timebase scale.
+ * We would need to do a bit of fixed point math on the CS ALU, or
+ * launch an actual shader to calculate this with full precision.
+ */
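+      /* For example, a 12.5 MHz timestamp clock gives a scale of exactly
+       * 80 ns per tick, while a frequency that doesn't divide 10^9 evenly
+       * loses its fractional part here.
+       */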
+ uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;
+ result = mi_iand(b, mi_imm((1ull << 36) - 1),
+ mi_imul_imm(b, start_val, scale));
+ break;
+ }
+ case PIPE_QUERY_TIME_ELAPSED: {
+ /* TODO: This discards fractional bits (see above). */
+ uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;
+ result = mi_imul_imm(b, mi_isub(b, end_val, start_val), scale);
+ break;
+ }
+ default:
+ result = mi_isub(b, end_val, start_val);
+ break;
+ }
+ /* WaDividePSInvocationCountBy4:HSW,BDW */
+ if (GFX_VER == 7 && devinfo->is_haswell &&
+ q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
+ q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
+ result = mi_ushr32_imm(b, result, 2);
+
+ if (query_is_boolean(q->type))
+ result = mi_iand(b, mi_nz(b, result), mi_imm(1));
+
+ return result;
+}
+#endif
+
+static struct pipe_query *
+crocus_create_query(struct pipe_context *ctx,
+ unsigned query_type,
+ unsigned index)
+{
+ struct crocus_query *q = calloc(1, sizeof(struct crocus_query));
+
+ q->type = query_type;
+ q->index = index;
+ q->monitor = NULL;
+
+ if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
+ q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
+ q->batch_idx = CROCUS_BATCH_COMPUTE;
+ else
+ q->batch_idx = CROCUS_BATCH_RENDER;
+ return (struct pipe_query *) q;
+}
+
+static struct pipe_query *
+crocus_create_batch_query(struct pipe_context *ctx,
+ unsigned num_queries,
+ unsigned *query_types)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = calloc(1, sizeof(struct crocus_query));
+ if (unlikely(!q))
+ return NULL;
+ q->type = PIPE_QUERY_DRIVER_SPECIFIC;
+ q->index = -1;
+ q->monitor = crocus_create_monitor_object(ice, num_queries, query_types);
+ if (unlikely(!q->monitor)) {
+ free(q);
+ return NULL;
+ }
+
+ return (struct pipe_query *) q;
+}
+
+static void
+crocus_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
+{
+ struct crocus_query *query = (void *) p_query;
+ struct crocus_screen *screen = (void *) ctx->screen;
+ if (query->monitor) {
+ crocus_destroy_monitor_object(ctx, query->monitor);
+ query->monitor = NULL;
+ } else {
+ crocus_syncobj_reference(screen, &query->syncobj, NULL);
+ screen->base.fence_reference(ctx->screen, &query->fence, NULL);
+ }
+ free(query);
+}
+
+
+static bool
+crocus_begin_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+
+ if (q->monitor)
+ return crocus_begin_monitor(ctx, q->monitor);
+
+ void *ptr = NULL;
+ uint32_t size;
+
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ size = sizeof(struct crocus_query_so_overflow);
+ else
+ size = sizeof(struct crocus_query_snapshots);
+
+ u_upload_alloc(ice->query_buffer_uploader, 0,
+ size, size, &q->query_state_ref.offset,
+ &q->query_state_ref.res, &ptr);
+
+ if (!crocus_resource_bo(q->query_state_ref.res))
+ return false;
+
+ q->map = ptr;
+ if (!q->map)
+ return false;
+
+ q->result = 0ull;
+ q->ready = false;
+ WRITE_ONCE(q->map->snapshots_landed, false);
+
+ if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
+ ice->state.prims_generated_query_active = true;
+ ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
+ }
+
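+   /* Gen4-5 only gather PS_DEPTH_COUNT while the statistics enable is set,
+    * so count active occlusion queries and re-emit the state carrying that
+    * bit.
+    */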
+#if GFX_VER <= 5
+ if (q->type == PIPE_QUERY_OCCLUSION_COUNTER ||
+ q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+ ice->state.stats_wm++;
+ ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE;
+ }
+#endif
+#if GFX_VER >= 6
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ write_overflow_values(ice, q, false);
+ else
+#endif
+ write_value(ice, q,
+ q->query_state_ref.offset +
+ offsetof(struct crocus_query_snapshots, start));
+
+ return true;
+}
+
+static bool
+crocus_end_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+
+ if (q->monitor)
+ return crocus_end_monitor(ctx, q->monitor);
+
+ if (q->type == PIPE_QUERY_GPU_FINISHED) {
+ ctx->flush(ctx, &q->fence, PIPE_FLUSH_DEFERRED);
+ return true;
+ }
+
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+
+ if (q->type == PIPE_QUERY_TIMESTAMP) {
+ crocus_begin_query(ctx, query);
+ crocus_batch_reference_signal_syncobj(batch, &q->syncobj);
+ mark_available(ice, q);
+ return true;
+ }
+
+#if GFX_VER <= 5
+ if (q->type == PIPE_QUERY_OCCLUSION_COUNTER ||
+ q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+ ice->state.stats_wm--;
+ ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE;
+ }
+#endif
+ if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
+ ice->state.prims_generated_query_active = false;
+ ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
+ }
+
+#if GFX_VER >= 6
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+ write_overflow_values(ice, q, true);
+ else
+#endif
+ write_value(ice, q,
+ q->query_state_ref.offset +
+ offsetof(struct crocus_query_snapshots, end));
+
+ crocus_batch_reference_signal_syncobj(batch, &q->syncobj);
+ mark_available(ice, q);
+
+ return true;
+}
+
+/**
+ * See if the snapshots have landed for a query, and if so, compute the
+ * result and mark it ready. Does not flush (unlike crocus_get_query_result).
+ */
+static void
+crocus_check_query_no_flush(struct crocus_context *ice, struct crocus_query *q)
+{
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
+ calculate_result_on_cpu(devinfo, q);
+ }
+}
+
+static bool
+crocus_get_query_result(struct pipe_context *ctx,
+ struct pipe_query *query,
+ bool wait,
+ union pipe_query_result *result)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+
+ if (q->monitor)
+ return crocus_get_monitor_result(ctx, q->monitor, wait, result->batch);
+
+ struct crocus_screen *screen = (void *) ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (unlikely(screen->no_hw)) {
+ result->u64 = 0;
+ return true;
+ }
+
+ if (!q->ready) {
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+ if (q->syncobj == crocus_batch_get_signal_syncobj(batch))
+ crocus_batch_flush(batch);
+
+#if GFX_VERx10 == 75
+ while (!READ_ONCE(q->map->snapshots_landed)) {
+ if (wait)
+ crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX);
+ else
+ return false;
+ }
+ assert(READ_ONCE(q->map->snapshots_landed));
+#else
+ if (wait)
+ crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX);
+#endif
+ calculate_result_on_cpu(devinfo, q);
+ }
+
+ assert(q->ready);
+
+ result->u64 = q->result;
+
+ return true;
+}
+
+#if GFX_VER == 7
+static void
+crocus_get_query_result_resource(struct pipe_context *ctx,
+ struct pipe_query *query,
+ bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *p_res,
+ unsigned offset)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+ struct crocus_batch *batch = &ice->batches[q->batch_idx];
+ struct crocus_screen *screen = batch->screen;
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ struct crocus_resource *res = (void *) p_res;
+ struct crocus_bo *query_bo = crocus_resource_bo(q->query_state_ref.res);
+ struct crocus_bo *dst_bo = crocus_resource_bo(p_res);
+ unsigned snapshots_landed_offset =
+ offsetof(struct crocus_query_snapshots, snapshots_landed);
+
+ res->bind_history |= PIPE_BIND_QUERY_BUFFER;
+
+ if (index == -1) {
+ /* They're asking for the availability of the result. If we still
+ * have commands queued up which produce the result, submit them
+ * now so that progress happens. Either way, copy the snapshots
+ * landed field to the destination resource.
+ */
+ if (q->syncobj == crocus_batch_get_signal_syncobj(batch))
+ crocus_batch_flush(batch);
+
+ screen->vtbl.copy_mem_mem(batch, dst_bo, offset,
+ query_bo, snapshots_landed_offset,
+ result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
+ return;
+ }
+
+ if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
+ /* The final snapshots happen to have landed, so let's just compute
+ * the result on the CPU now...
+ */
+ calculate_result_on_cpu(devinfo, q);
+ }
+
+ if (q->ready) {
+ /* We happen to have the result on the CPU, so just copy it. */
+ if (result_type <= PIPE_QUERY_TYPE_U32) {
+ screen->vtbl.store_data_imm32(batch, dst_bo, offset, q->result);
+ } else {
+ screen->vtbl.store_data_imm64(batch, dst_bo, offset, q->result);
+ }
+
+      /* Make sure the result lands before the caller binds the QBO
+       * elsewhere and uses the result.
+       */
+ // XXX: Why? i965 doesn't do this.
+ crocus_emit_pipe_control_flush(batch,
+ "query: unknown QBO flushing hack",
+ PIPE_CONTROL_CS_STALL);
+ return;
+ }
+
+#if GFX_VERx10 == 75
+ bool predicated = !wait && !q->stalled;
+
+ struct mi_builder b;
+ mi_builder_init(&b, &batch->screen->devinfo, batch);
+
+ struct mi_value result = calculate_result_on_gpu(devinfo, &b, q);
+ struct mi_value dst =
+ result_type <= PIPE_QUERY_TYPE_U32 ? mi_mem32(rw_bo(dst_bo, offset))
+ : mi_mem64(rw_bo(dst_bo, offset));
+
+ if (predicated) {
+ mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
+ mi_mem64(ro_bo(query_bo, snapshots_landed_offset)));
+ mi_store_if(&b, dst, result);
+ } else {
+ mi_store(&b, dst, result);
+ }
+#endif
+}
+#endif
+
+static void
+crocus_set_active_query_state(struct pipe_context *ctx, bool enable)
+{
+ struct crocus_context *ice = (void *) ctx;
+
+ if (ice->state.statistics_counters_enabled == enable)
+ return;
+
+ // XXX: most packets aren't paying attention to this yet, because it'd
+ // have to be done dynamically at draw time, which is a pain
+ ice->state.statistics_counters_enabled = enable;
+ ice->state.dirty |= CROCUS_DIRTY_CLIP |
+ CROCUS_DIRTY_RASTER |
+ CROCUS_DIRTY_STREAMOUT |
+ CROCUS_DIRTY_WM;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS |
+ CROCUS_STAGE_DIRTY_TCS |
+ CROCUS_STAGE_DIRTY_TES |
+ CROCUS_STAGE_DIRTY_VS;
+}
+
+static void
+set_predicate_enable(struct crocus_context *ice, bool value)
+{
+ if (value)
+ ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER;
+ else
+ ice->state.predicate = CROCUS_PREDICATE_STATE_DONT_RENDER;
+}
+
+#if GFX_VER == 7
+static void
+set_predicate_for_result(struct crocus_context *ice,
+ struct crocus_query *q,
+ bool inverted)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+
+#if GFX_VERx10 != 75
+ /* IVB doesn't have enough MI for this */
+ if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+ q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+ ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY;
+ return;
+ }
+#endif
+
+ /* The CPU doesn't have the query result yet; use hardware predication */
+ ice->state.predicate = CROCUS_PREDICATE_STATE_USE_BIT;
+
+ /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
+ crocus_emit_pipe_control_flush(batch,
+ "conditional rendering: set predicate",
+ PIPE_CONTROL_FLUSH_ENABLE);
+ q->stalled = true;
+
+#if GFX_VERx10 != 75
+ struct crocus_screen *screen = batch->screen;
+ screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
+ q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, start));
+ screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
+ q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, end));
+
+ uint32_t mi_predicate = MI_PREDICATE | MI_PREDICATE_COMBINEOP_SET |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+ if (inverted)
+ mi_predicate |= MI_PREDICATE_LOADOP_LOAD;
+ else
+ mi_predicate |= MI_PREDICATE_LOADOP_LOADINV;
+ crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+#else
+ struct mi_builder b;
+ mi_builder_init(&b, &batch->screen->devinfo, batch);
+
+ struct mi_value result;
+
+ switch (q->type) {
+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ result = calc_overflow_for_stream(&b, q, q->index);
+ break;
+ case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+ result = calc_overflow_any_stream(&b, q);
+ break;
+ default: {
+ /* PIPE_QUERY_OCCLUSION_* */
+ struct mi_value start =
+ query_mem64(q, offsetof(struct crocus_query_snapshots, start));
+ struct mi_value end =
+ query_mem64(q, offsetof(struct crocus_query_snapshots, end));
+ result = mi_isub(&b, end, start);
+ break;
+ }
+ }
+
+ result = inverted ? mi_z(&b, result) : mi_nz(&b, result);
+ result = mi_iand(&b, result, mi_imm(1));
+
+ /* We immediately set the predicate on the render batch, as all the
+ * counters come from 3D operations. However, we may need to predicate
+ * a compute dispatch, which executes in a different GEM context and has
+ * a different MI_PREDICATE_RESULT register. So, we save the result to
+ * memory and reload it in crocus_launch_grid.
+ */
+ mi_value_ref(&b, result);
+
+ mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), result);
+ mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
+
+ unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
+ MI_PREDICATE_COMBINEOP_SET |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+
+ crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+ mi_store(&b, query_mem64(q, offsetof(struct crocus_query_snapshots,
+ predicate_result)), result);
+#endif
+ ice->state.compute_predicate = bo;
+}
+#endif
+
+static void
+crocus_render_condition(struct pipe_context *ctx,
+ struct pipe_query *query,
+ bool condition,
+ enum pipe_render_cond_flag mode)
+{
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_query *q = (void *) query;
+
+ /* The old condition isn't relevant; we'll update it if necessary */
+ ice->state.compute_predicate = NULL;
+ ice->condition.query = q;
+ ice->condition.condition = condition;
+ ice->condition.mode = mode;
+
+ if (!q) {
+ ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER;
+ return;
+ }
+
+ crocus_check_query_no_flush(ice, q);
+
+ if (q->result || q->ready) {
+ set_predicate_enable(ice, (q->result != 0) ^ condition);
+ } else {
+ if (mode == PIPE_RENDER_COND_NO_WAIT ||
+ mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
+ perf_debug(&ice->dbg, "Conditional rendering demoted from "
+ "\"no wait\" to \"wait\".");
+ }
+#if GFX_VER == 7
+ set_predicate_for_result(ice, q, condition);
+#else
+ ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY;
+#endif
+ }
+}
+
+static void
+crocus_resolve_conditional_render(struct crocus_context *ice)
+{
+ struct pipe_context *ctx = (void *) ice;
+ struct crocus_query *q = ice->condition.query;
+ struct pipe_query *query = (void *) q;
+ union pipe_query_result result;
+
+ if (ice->state.predicate != CROCUS_PREDICATE_STATE_USE_BIT)
+ return;
+
+ assert(q);
+
+ crocus_get_query_result(ctx, query, true, &result);
+ set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
+}
+
+#if GFX_VER >= 7
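+/**
+ * Reload the predicate result saved by set_predicate_for_result() into
+ * MI_PREDICATE for the compute batch, which has its own predicate register.
+ */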
+static void
+crocus_emit_compute_predicate(struct crocus_batch *batch)
+{
+ struct crocus_context *ice = batch->ice;
+ struct crocus_screen *screen = batch->screen;
+ screen->vtbl.load_register_mem32(batch, MI_PREDICATE_SRC0,
+ ice->state.compute_predicate, 0);
+ screen->vtbl.load_register_imm32(batch, MI_PREDICATE_SRC1, 0);
+ unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
+ MI_PREDICATE_COMBINEOP_SET |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+
+ crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+}
+#endif
+
+void
+genX(init_screen_query)(struct crocus_screen *screen)
+{
+ screen->vtbl.resolve_conditional_render = crocus_resolve_conditional_render;
+#if GFX_VER >= 7
+ screen->vtbl.emit_compute_predicate = crocus_emit_compute_predicate;
+#endif
+}
+
+void
+genX(init_query)(struct crocus_context *ice)
+{
+ struct pipe_context *ctx = &ice->ctx;
+
+ ctx->create_query = crocus_create_query;
+ ctx->create_batch_query = crocus_create_batch_query;
+ ctx->destroy_query = crocus_destroy_query;
+ ctx->begin_query = crocus_begin_query;
+ ctx->end_query = crocus_end_query;
+ ctx->get_query_result = crocus_get_query_result;
+#if GFX_VER == 7
+ ctx->get_query_result_resource = crocus_get_query_result_resource;
+#endif
+ ctx->set_active_query_state = crocus_set_active_query_state;
+ ctx->render_condition = crocus_render_condition;
+}
diff --git a/src/gallium/drivers/crocus/crocus_resolve.c b/src/gallium/drivers/crocus/crocus_resolve.c
new file mode 100644
index 00000000000..a38eb4a94a7
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_resolve.c
@@ -0,0 +1,1061 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_resolve.c
+ *
+ * This file handles resolve tracking for main and auxiliary surfaces.
+ *
+ * It also handles our cache tracking. We have sets for the render cache,
+ * depth cache, and so on. If a BO is in a cache's set, then it may have
+ * data in that cache. The helpers take care of emitting flushes for
+ * render-to-texture, format reinterpretation issues, and other situations.
+ */
+
+#include "util/hash_table.h"
+#include "util/set.h"
+#include "crocus_context.h"
+#include "compiler/nir/nir.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLORP
+
+static void
+crocus_update_stencil_shadow(struct crocus_context *ice,
+ struct crocus_resource *res);
+/**
+ * Disable auxiliary buffers if a renderbuffer is also bound as a texture
+ * or shader image. This causes a self-dependency, where both rendering
+ * and sampling may concurrently read or write the CCS buffer, causing
+ * incorrect pixels.
+ */
+static bool
+disable_rb_aux_buffer(struct crocus_context *ice,
+ bool *draw_aux_buffer_disabled,
+ struct crocus_resource *tex_res,
+ unsigned min_level, unsigned num_levels,
+ const char *usage)
+{
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ bool found = false;
+
+ /* We only need to worry about fast clears. */
+ if (tex_res->aux.usage != ISL_AUX_USAGE_CCS_D)
+ return false;
+
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+ if (!surf)
+ continue;
+
+ struct crocus_resource *rb_res = (void *) surf->base.texture;
+
+ if (rb_res->bo == tex_res->bo &&
+ surf->base.u.tex.level >= min_level &&
+ surf->base.u.tex.level < min_level + num_levels) {
+ found = draw_aux_buffer_disabled[i] = true;
+ }
+ }
+
+ if (found) {
+ perf_debug(&ice->dbg,
+ "Disabling CCS because a renderbuffer is also bound %s.\n",
+ usage);
+ }
+
+ return found;
+}
+
+static void
+resolve_sampler_views(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_shader_state *shs,
+ const struct shader_info *info,
+ bool *draw_aux_buffer_disabled,
+ bool consider_framebuffer)
+{
+ uint32_t views = info ? (shs->bound_sampler_views & info->textures_used[0]) : 0;
+
+ while (views) {
+ const int i = u_bit_scan(&views);
+ struct crocus_sampler_view *isv = shs->textures[i];
+
+ if (isv->res->base.target != PIPE_BUFFER) {
+ if (consider_framebuffer) {
+ disable_rb_aux_buffer(ice, draw_aux_buffer_disabled, isv->res,
+ isv->view.base_level, isv->view.levels,
+ "for sampling");
+ }
+
+ crocus_resource_prepare_texture(ice, isv->res, isv->view.format,
+ isv->view.base_level, isv->view.levels,
+ isv->view.base_array_layer,
+ isv->view.array_len);
+ }
+
+ crocus_cache_flush_for_read(batch, isv->res->bo);
+
+ if (batch->screen->devinfo.ver >= 7 &&
+ (isv->base.format == PIPE_FORMAT_X24S8_UINT ||
+ isv->base.format == PIPE_FORMAT_X32_S8X24_UINT ||
+ isv->base.format == PIPE_FORMAT_S8_UINT)) {
+ struct crocus_resource *zres, *sres;
+ crocus_get_depth_stencil_resources(&batch->screen->devinfo, isv->base.texture, &zres, &sres);
+ crocus_update_stencil_shadow(ice, sres);
+ crocus_cache_flush_for_read(batch, sres->shadow->bo);
+ }
+ }
+}
+
+static void
+resolve_image_views(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_shader_state *shs,
+ bool *draw_aux_buffer_disabled,
+ bool consider_framebuffer)
+{
+ /* TODO: Consider images used by program */
+ uint32_t views = shs->bound_image_views;
+
+ while (views) {
+ const int i = u_bit_scan(&views);
+ struct pipe_image_view *pview = &shs->image[i].base;
+ struct crocus_resource *res = (void *) pview->resource;
+
+ if (res->base.target != PIPE_BUFFER) {
+ if (consider_framebuffer) {
+ disable_rb_aux_buffer(ice, draw_aux_buffer_disabled,
+ res, pview->u.tex.level, 1,
+ "as a shader image");
+ }
+
+ unsigned num_layers =
+ pview->u.tex.last_layer - pview->u.tex.first_layer + 1;
+
+ /* The data port doesn't understand any compression */
+ crocus_resource_prepare_access(ice, res,
+ pview->u.tex.level, 1,
+ pview->u.tex.first_layer, num_layers,
+ ISL_AUX_USAGE_NONE, false);
+ }
+
+ crocus_cache_flush_for_read(batch, res->bo);
+ }
+}
+
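+/**
+ * Copy between a surface and its aligned workaround copy (align_res):
+ * copy_to_wa copies the real texture into the workaround resource before
+ * drawing, and !copy_to_wa copies the rendered results back afterwards.
+ */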
+static void
+crocus_update_align_res(struct crocus_batch *batch,
+ struct crocus_surface *surf,
+ bool copy_to_wa)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)batch->screen;
+ struct pipe_blit_info info = { 0 };
+
+ info.src.resource = copy_to_wa ? surf->base.texture : surf->align_res;
+ info.src.level = copy_to_wa ? surf->base.u.tex.level : 0;
+ u_box_2d_zslice(0, 0, copy_to_wa ? surf->base.u.tex.first_layer : 0,
+ u_minify(surf->base.texture->width0, surf->base.u.tex.level),
+ u_minify(surf->base.texture->height0, surf->base.u.tex.level), &info.src.box);
+ info.src.format = surf->base.texture->format;
+ info.dst.resource = copy_to_wa ? surf->align_res : surf->base.texture;
+ info.dst.level = copy_to_wa ? 0 : surf->base.u.tex.level;
+ info.dst.box = info.src.box;
+ info.dst.box.z = copy_to_wa ? 0 : surf->base.u.tex.first_layer;
+ info.dst.format = surf->base.texture->format;
+ info.mask = util_format_is_depth_or_stencil(surf->base.texture->format) ? PIPE_MASK_ZS : PIPE_MASK_RGBA;
+ info.filter = 0;
+ if (!screen->vtbl.blit_blt(batch, &info)) {
+ assert(0);
+ }
+}
+
+/**
+ * \brief Resolve buffers before drawing.
+ *
+ * Resolve the depth buffer's HiZ buffer, resolve the depth buffer of each
+ * enabled depth texture, and flush the render cache for any dirty textures.
+ */
+void
+crocus_predraw_resolve_inputs(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ bool *draw_aux_buffer_disabled,
+ gl_shader_stage stage,
+ bool consider_framebuffer)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ const struct shader_info *info = crocus_get_shader_info(ice, stage);
+
+ uint64_t stage_dirty = (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage) |
+ (consider_framebuffer ? CROCUS_STAGE_DIRTY_BINDINGS_FS : 0);
+
+ if (ice->state.stage_dirty & stage_dirty) {
+ resolve_sampler_views(ice, batch, shs, info, draw_aux_buffer_disabled,
+ consider_framebuffer);
+ resolve_image_views(ice, batch, shs, draw_aux_buffer_disabled,
+ consider_framebuffer);
+ }
+}
+
+void
+crocus_predraw_resolve_framebuffer(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ bool *draw_aux_buffer_disabled)
+{
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_uncompiled_shader *ish =
+ ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+ const nir_shader *nir = ish->nir;
+
+ if (ice->state.dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
+ struct pipe_surface *zs_surf = cso_fb->zsbuf;
+
+ if (zs_surf) {
+ struct crocus_resource *z_res, *s_res;
+ crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res);
+ unsigned num_layers =
+ zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1;
+
+ if (z_res) {
+ crocus_resource_prepare_render(ice, z_res,
+ zs_surf->u.tex.level,
+ zs_surf->u.tex.first_layer,
+ num_layers, ice->state.hiz_usage);
+ crocus_cache_flush_for_depth(batch, z_res->bo);
+
+ if (((struct crocus_surface *)zs_surf)->align_res) {
+ crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, true);
+ }
+ }
+
+ if (s_res) {
+ crocus_cache_flush_for_depth(batch, s_res->bo);
+ }
+ }
+ }
+
+ if (nir->info.outputs_read != 0) {
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ if (cso_fb->cbufs[i]) {
+ struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+ struct crocus_resource *res = (void *) cso_fb->cbufs[i]->texture;
+
+ crocus_resource_prepare_texture(ice, res, surf->view.format,
+ surf->view.base_level, 1,
+ surf->view.base_array_layer,
+ surf->view.array_len);
+ }
+ }
+ }
+
+ if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS) {
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+ if (!surf)
+ continue;
+
+ struct crocus_resource *res = (void *) surf->base.texture;
+
+ if (surf->align_res)
+ crocus_update_align_res(batch, surf, true);
+
+ enum isl_aux_usage aux_usage =
+ crocus_resource_render_aux_usage(ice, res, surf->view.format,
+ ice->state.blend_enables & (1u << i),
+ draw_aux_buffer_disabled[i]);
+
+ if (ice->state.draw_aux_usage[i] != aux_usage) {
+ ice->state.draw_aux_usage[i] = aux_usage;
+ /* XXX: Need to track which bindings to make dirty */
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+ }
+
+ crocus_resource_prepare_render(ice, res, surf->view.base_level,
+ surf->view.base_array_layer,
+ surf->view.array_len,
+ aux_usage);
+
+ crocus_cache_flush_for_render(batch, res->bo, surf->view.format,
+ aux_usage);
+ }
+ }
+}
+
+/**
+ * \brief Call this after drawing to mark which buffers need resolving
+ *
+ * If the depth buffer was written to and if it has an accompanying HiZ
+ * buffer, then mark that it needs a depth resolve.
+ *
+ * If the color buffer is a multisample window system buffer, then
+ * mark that it needs a downsample.
+ *
+ * Also mark any render targets which will be textured as needing a render
+ * cache flush.
+ */
+void
+crocus_postdraw_update_resolve_tracking(struct crocus_context *ice,
+ struct crocus_batch *batch)
+{
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+ // XXX: front buffer drawing?
+
+ bool may_have_resolved_depth =
+ ice->state.dirty & (CROCUS_DIRTY_DEPTH_BUFFER |
+ CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL);
+
+ struct pipe_surface *zs_surf = cso_fb->zsbuf;
+ if (zs_surf) {
+ struct crocus_resource *z_res, *s_res;
+ crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res);
+ unsigned num_layers =
+ zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1;
+
+ if (z_res) {
+ if (may_have_resolved_depth && ice->state.depth_writes_enabled) {
+ crocus_resource_finish_render(ice, z_res, zs_surf->u.tex.level,
+ zs_surf->u.tex.first_layer, num_layers,
+ ice->state.hiz_usage);
+ }
+
+ if (ice->state.depth_writes_enabled)
+ crocus_depth_cache_add_bo(batch, z_res->bo);
+
+ if (((struct crocus_surface *)zs_surf)->align_res) {
+ crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, false);
+ }
+ }
+
+ if (s_res) {
+ if (may_have_resolved_depth && ice->state.stencil_writes_enabled) {
+ crocus_resource_finish_write(ice, s_res, zs_surf->u.tex.level,
+ zs_surf->u.tex.first_layer, num_layers,
+ s_res->aux.usage);
+ }
+
+ if (ice->state.stencil_writes_enabled)
+ crocus_depth_cache_add_bo(batch, s_res->bo);
+ }
+ }
+
+ bool may_have_resolved_color =
+ ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS;
+
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+ if (!surf)
+ continue;
+
+ if (surf->align_res)
+ crocus_update_align_res(batch, surf, false);
+ struct crocus_resource *res = (void *) surf->base.texture;
+ enum isl_aux_usage aux_usage = ice->state.draw_aux_usage[i];
+
+ crocus_render_cache_add_bo(batch, res->bo, surf->view.format,
+ aux_usage);
+
+ if (may_have_resolved_color) {
+ union pipe_surface_desc *desc = &surf->base.u;
+ unsigned num_layers =
+ desc->tex.last_layer - desc->tex.first_layer + 1;
+ crocus_resource_finish_render(ice, res, desc->tex.level,
+ desc->tex.first_layer, num_layers,
+ aux_usage);
+ }
+ }
+}
+
+/**
+ * Clear the cache-tracking sets.
+ */
+void
+crocus_cache_sets_clear(struct crocus_batch *batch)
+{
+ hash_table_foreach(batch->cache.render, render_entry)
+ _mesa_hash_table_remove(batch->cache.render, render_entry);
+
+ set_foreach(batch->cache.depth, depth_entry)
+ _mesa_set_remove(batch->cache.depth, depth_entry);
+}
+
+/**
+ * Emits an appropriate flush for a BO if it has been rendered to within the
+ * same batchbuffer as a read that's about to be emitted.
+ *
+ * The GPU has separate, incoherent caches for the render cache and the
+ * sampler cache, along with other caches. Usually data in the different
+ * caches don't interact (e.g. we don't render to our driver-generated
+ * immediate constant data), but for render-to-texture in FBOs we definitely
+ * do. When a batchbuffer is flushed, the kernel will ensure that everything
+ * necessary is flushed before another use of that BO, but for reuse from
+ * different caches within a batchbuffer, it's all our responsibility.
+ */
+void
+crocus_flush_depth_and_render_caches(struct crocus_batch *batch)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ if (devinfo->ver >= 6) {
+ crocus_emit_pipe_control_flush(batch,
+ "cache tracker: render-to-texture",
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+
+ crocus_emit_pipe_control_flush(batch,
+ "cache tracker: render-to-texture",
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE);
+ } else {
+ crocus_emit_mi_flush(batch);
+ }
+
+ crocus_cache_sets_clear(batch);
+}
+
+void
+crocus_cache_flush_for_read(struct crocus_batch *batch,
+ struct crocus_bo *bo)
+{
+ if (_mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo) ||
+ _mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo))
+ crocus_flush_depth_and_render_caches(batch);
+}
+
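+/**
+ * Pack a (format, aux usage) pair into a single pointer-sized value so it
+ * can be stored as the data of a render cache hash table entry.
+ */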
+static void *
+format_aux_tuple(enum isl_format format, enum isl_aux_usage aux_usage)
+{
+ return (void *)(uintptr_t)((uint32_t)format << 8 | aux_usage);
+}
+
+void
+crocus_cache_flush_for_render(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ enum isl_format format,
+ enum isl_aux_usage aux_usage)
+{
+ if (_mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo))
+ crocus_flush_depth_and_render_caches(batch);
+
+ /* Check to see if this bo has been used by a previous rendering operation
+ * but with a different format or aux usage. If it has, flush the render
+ * cache so we ensure that it's only in there with one format or aux usage
+ * at a time.
+ *
+ * Even though it's not obvious, this can easily happen in practice.
+ * Suppose a client is blending on a surface with sRGB encode enabled on
+ * gen9. This implies that you get AUX_USAGE_CCS_D at best. If the client
+ * then disables sRGB decode and continues blending we will flip on
+ * AUX_USAGE_CCS_E without doing any sort of resolve in-between (this is
+ * perfectly valid since CCS_E is a subset of CCS_D). However, this means
+ * that we have fragments in-flight which are rendering with UNORM+CCS_E
+ * and other fragments in-flight with SRGB+CCS_D on the same surface at the
+ * same time and the pixel scoreboard and color blender are trying to sort
+ * it all out. This ends badly (i.e. GPU hangs).
+ *
+ * To date, we have never observed GPU hangs or even corruption to be
+ * associated with switching the format, only the aux usage. However,
+ * there are comments in various docs which indicate that the render cache
+ * isn't 100% resilient to format changes. We may as well be conservative
+ * and flush on format changes too. We can always relax this later if we
+ * find it to be a performance problem.
+ */
+ struct hash_entry *entry =
+ _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo);
+ if (entry && entry->data != format_aux_tuple(format, aux_usage))
+ crocus_flush_depth_and_render_caches(batch);
+}
+
+void
+crocus_render_cache_add_bo(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ enum isl_format format,
+ enum isl_aux_usage aux_usage)
+{
+#ifndef NDEBUG
+ struct hash_entry *entry =
+ _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo);
+ if (entry) {
+ /* Otherwise, someone didn't do a flush_for_render and that would be
+ * very bad indeed.
+ */
+ assert(entry->data == format_aux_tuple(format, aux_usage));
+ }
+#endif
+
+ _mesa_hash_table_insert_pre_hashed(batch->cache.render, bo->hash, bo,
+ format_aux_tuple(format, aux_usage));
+}
+
+void
+crocus_cache_flush_for_depth(struct crocus_batch *batch,
+ struct crocus_bo *bo)
+{
+ if (_mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo))
+ crocus_flush_depth_and_render_caches(batch);
+}
+
+void
+crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo)
+{
+ _mesa_set_add_pre_hashed(batch->cache.depth, bo->hash, bo);
+}
+
+static void
+crocus_resolve_color(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_resource *res,
+ unsigned level, unsigned layer,
+ enum isl_aux_op resolve_op)
+{
+ struct crocus_screen *screen = batch->screen;
+ DBG("%s to res %p level %u layer %u\n", __func__, res, level, layer);
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ &res->base, res->aux.usage, level, true);
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ /* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+ *
+ * "Any transition from any value in {Clear, Render, Resolve} to a
+ * different value in {Clear, Render, Resolve} requires end of pipe
+ * synchronization."
+ *
+ * In other words, fast clear ops are not properly synchronized with
+ * other drawing. We need to use a PIPE_CONTROL to ensure that the
+ * contents of the previous draw hit the render target before we resolve
+ * and again afterwards to ensure that the resolve is complete before we
+ * do any more regular drawing.
+ */
+ crocus_emit_end_of_pipe_sync(batch, "color resolve: pre-flush",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+ blorp_ccs_resolve(&blorp_batch, &surf, level, layer, 1,
+ isl_format_srgb_to_linear(res->surf.format),
+ resolve_op);
+ blorp_batch_finish(&blorp_batch);
+
+ /* See comment above */
+ crocus_emit_end_of_pipe_sync(batch, "color resolve: post-flush",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH);
+}
+
+static void
+crocus_mcs_partial_resolve(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_resource *res,
+ uint32_t start_layer,
+ uint32_t num_layers)
+{
+ struct crocus_screen *screen = batch->screen;
+
+ DBG("%s to res %p layers %u-%u\n", __func__, res,
+ start_layer, start_layer + num_layers - 1);
+
+ assert(isl_aux_usage_has_mcs(res->aux.usage));
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ &res->base, res->aux.usage, 0, true);
+
+ struct blorp_batch blorp_batch;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+ blorp_mcs_partial_resolve(&blorp_batch, &surf,
+ isl_format_srgb_to_linear(res->surf.format),
+ start_layer, num_layers);
+ blorp_batch_finish(&blorp_batch);
+}
+
+/**
+ * Perform a HiZ or depth resolve operation.
+ *
+ * For an overview of HiZ ops, see the following sections of the Sandy Bridge
+ * PRM, Volume 1, Part 2:
+ * - 7.5.3.1 Depth Buffer Clear
+ * - 7.5.3.2 Depth Buffer Resolve
+ * - 7.5.3.3 Hierarchical Depth Buffer Resolve
+ */
+void
+crocus_hiz_exec(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_resource *res,
+ unsigned int level, unsigned int start_layer,
+ unsigned int num_layers, enum isl_aux_op op,
+ bool update_clear_depth)
+{
+ struct crocus_screen *screen = batch->screen;
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ assert(crocus_resource_level_has_hiz(res, level));
+ assert(op != ISL_AUX_OP_NONE);
+ UNUSED const char *name = NULL;
+
+ switch (op) {
+ case ISL_AUX_OP_FULL_RESOLVE:
+ name = "depth resolve";
+ break;
+ case ISL_AUX_OP_AMBIGUATE:
+ name = "hiz ambiguate";
+ break;
+ case ISL_AUX_OP_FAST_CLEAR:
+ name = "depth clear";
+ break;
+ case ISL_AUX_OP_PARTIAL_RESOLVE:
+ case ISL_AUX_OP_NONE:
+ unreachable("Invalid HiZ op");
+ }
+
+ DBG("%s %s to res %p level %d layers %d-%d\n",
+ __func__, name, res, level, start_layer, start_layer + num_layers - 1);
+
+ /* The following stalls and flushes are only documented to be required
+ * for HiZ clear operations. However, they also seem to be required for
+ * resolve operations.
+ *
+ * From the Ivybridge PRM, volume 2, "Depth Buffer Clear":
+ *
+ * "If other rendering operations have preceded this clear, a
+ * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
+ * enabled must be issued before the rectangle primitive used for
+ * the depth buffer clear operation."
+ *
+ * Same applies for Gen8 and Gen9.
+ *
+ * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1
+ * PIPE_CONTROL, Depth Cache Flush Enable:
+ *
+ * "This bit must not be set when Depth Stall Enable bit is set in
+ * this packet."
+ *
+ * This is confirmed to hold in practice; Haswell gets immediate GPU hangs.
+ *
+ * Therefore issue two pipe control flushes, one for cache flush and
+ * another for depth stall.
+ */
+ if (devinfo->ver == 6) {
+ /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
+ *
+ * "If other rendering operations have preceded this clear, a
+ * PIPE_CONTROL with write cache flush enabled and Z-inhibit
+ * disabled must be issued before the rectangle primitive used for
+ * the depth buffer clear operation.
+ */
+ crocus_emit_pipe_control_flush(batch,
+ "hiz op: pre-flushes (1)",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ } else if (devinfo->ver >= 7) {
+ crocus_emit_pipe_control_flush(batch,
+ "hiz op: pre-flushes (1/2)",
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ crocus_emit_pipe_control_flush(batch, "hiz op: pre-flushes (2/2)",
+ PIPE_CONTROL_DEPTH_STALL);
+ }
+
+ assert(isl_aux_usage_has_hiz(res->aux.usage) && res->aux.bo);
+
+ crocus_batch_maybe_flush(batch, 1500);
+
+ struct blorp_surf surf;
+ crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+ &res->base, res->aux.usage, level, true);
+
+ struct blorp_batch blorp_batch;
+ enum blorp_batch_flags flags = 0;
+ flags |= update_clear_depth ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR;
+ blorp_batch_init(&ice->blorp, &blorp_batch, batch, flags);
+ blorp_hiz_op(&blorp_batch, &surf, level, start_layer, num_layers, op);
+ blorp_batch_finish(&blorp_batch);
+
+ /* The following stalls and flushes are only documented to be required
+ * for HiZ clear operations. However, they also seem to be required for
+ * resolve operations.
+ *
+ * From the Broadwell PRM, volume 7, "Depth Buffer Clear":
+ *
+ * "Depth buffer clear pass using any of the methods (WM_STATE,
+ * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a
+ * PIPE_CONTROL command with DEPTH_STALL bit and Depth FLUSH bits
+ * "set" before starting to render. DepthStall and DepthFlush are
+ * not needed between consecutive depth clear passes nor is it
+ * required if the depth clear pass was done with
+ * 'full_surf_clear' bit set in the 3DSTATE_WM_HZ_OP."
+ *
+ * TODO: As the spec says, this could be conditional.
+ */
+ if (devinfo->ver == 6) {
+ /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
+ *
+ * "DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be
+ * followed by a PIPE_CONTROL command with DEPTH_STALL bit set
+ * and Then followed by Depth FLUSH'
+ */
+ crocus_emit_pipe_control_flush(batch,
+ "hiz op: post-flushes (1/2)",
+ PIPE_CONTROL_DEPTH_STALL);
+
+ crocus_emit_pipe_control_flush(batch,
+ "hiz op: post-flushes (2/2)",
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ }
+}
+
+/**
+ * Does the resource's slice have hiz enabled?
+ */
+bool
+crocus_resource_level_has_hiz(const struct crocus_resource *res, uint32_t level)
+{
+ crocus_resource_check_level_layer(res, level, 0);
+ return res->aux.has_hiz & 1 << level;
+}
+
+static bool
+crocus_resource_level_has_aux(const struct crocus_resource *res, uint32_t level)
+{
+ if (isl_aux_usage_has_hiz(res->aux.usage))
+ return crocus_resource_level_has_hiz(res, level);
+ else
+ return level < res->aux.surf.levels;
+}
+
+/** \brief Assert that the level and layer are valid for the resource. */
+void
+crocus_resource_check_level_layer(UNUSED const struct crocus_resource *res,
+ UNUSED uint32_t level, UNUSED uint32_t layer)
+{
+ assert(level < res->surf.levels);
+ assert(layer < util_num_layers(&res->base, level));
+}
+
+static inline uint32_t
+miptree_level_range_length(const struct crocus_resource *res,
+ uint32_t start_level, uint32_t num_levels)
+{
+ assert(start_level < res->surf.levels);
+
+ if (num_levels == INTEL_REMAINING_LAYERS)
+ num_levels = res->surf.levels;
+
+ /* Check for overflow */
+ assert(start_level + num_levels >= start_level);
+ assert(start_level + num_levels <= res->surf.levels);
+
+ return num_levels;
+}
+
+static inline uint32_t
+miptree_layer_range_length(const struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t num_layers)
+{
+ assert(level <= res->base.last_level);
+
+ const uint32_t total_num_layers = crocus_get_num_logical_layers(res, level);
+ assert(start_layer < total_num_layers);
+ if (num_layers == INTEL_REMAINING_LAYERS)
+ num_layers = total_num_layers - start_layer;
+ /* Check for overflow */
+ assert(start_layer + num_layers >= start_layer);
+ assert(start_layer + num_layers <= total_num_layers);
+
+ return num_layers;
+}
+
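+/**
+ * Return true if any slice in the given level/layer range is in an aux
+ * state where the primary surface data is not valid (for example, data
+ * that currently only exists as a fast-clear value in the aux buffer).
+ */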
+bool
+crocus_has_invalid_primary(const struct crocus_resource *res,
+ unsigned start_level, unsigned num_levels,
+ unsigned start_layer, unsigned num_layers)
+{
+ if (!res->aux.bo)
+ return false;
+
+ /* Clamp the level range to fit the resource */
+ num_levels = miptree_level_range_length(res, start_level, num_levels);
+
+ for (uint32_t l = 0; l < num_levels; l++) {
+ const uint32_t level = start_level + l;
+ if (!crocus_resource_level_has_aux(res, level))
+ continue;
+
+ const uint32_t level_layers =
+ miptree_layer_range_length(res, level, start_layer, num_layers);
+ for (unsigned a = 0; a < level_layers; a++) {
+ enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, level, start_layer + a);
+ if (!isl_aux_state_has_valid_primary(aux_state))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void
+crocus_resource_prepare_access(struct crocus_context *ice,
+ struct crocus_resource *res,
+ uint32_t start_level, uint32_t num_levels,
+ uint32_t start_layer, uint32_t num_layers,
+ enum isl_aux_usage aux_usage,
+ bool fast_clear_supported)
+{
+ if (!res->aux.bo)
+ return;
+
+ /* We can't do resolves on the compute engine, so awkwardly, we have to
+ * do them on the render batch...
+ */
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+ const uint32_t clamped_levels =
+ miptree_level_range_length(res, start_level, num_levels);
+ for (uint32_t l = 0; l < clamped_levels; l++) {
+ const uint32_t level = start_level + l;
+ if (!crocus_resource_level_has_aux(res, level))
+ continue;
+
+ const uint32_t level_layers =
+ miptree_layer_range_length(res, level, start_layer, num_layers);
+ for (uint32_t a = 0; a < level_layers; a++) {
+ const uint32_t layer = start_layer + a;
+ const enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, level, layer);
+ const enum isl_aux_op aux_op =
+ isl_aux_prepare_access(aux_state, aux_usage, fast_clear_supported);
+
+ /* Prepare the aux buffer for a conditional or unconditional access.
+ * A conditional access is handled by assuming that the access will
+ * not evaluate to a no-op. If the access does in fact occur, the aux
+ * will be in the required state. If it does not, no data is lost
+ * because the aux_op performed is lossless.
+ */
+ if (aux_op == ISL_AUX_OP_NONE) {
+ /* Nothing to do here. */
+ } else if (isl_aux_usage_has_mcs(res->aux.usage)) {
+ assert(aux_op == ISL_AUX_OP_PARTIAL_RESOLVE);
+ crocus_mcs_partial_resolve(ice, batch, res, layer, 1);
+ } else if (isl_aux_usage_has_hiz(res->aux.usage)) {
+ crocus_hiz_exec(ice, batch, res, level, layer, 1, aux_op, false);
+ } else if (res->aux.usage == ISL_AUX_USAGE_STC_CCS) {
+ unreachable("crocus doesn't resolve STC_CCS resources");
+ } else {
+ assert(isl_aux_usage_has_ccs(res->aux.usage));
+ crocus_resolve_color(ice, batch, res, level, layer, aux_op);
+ }
+
+ const enum isl_aux_state new_state =
+ isl_aux_state_transition_aux_op(aux_state, res->aux.usage, aux_op);
+ crocus_resource_set_aux_state(ice, res, level, layer, 1, new_state);
+ }
+ }
+}
+
+void
+crocus_resource_finish_write(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t num_layers,
+ enum isl_aux_usage aux_usage)
+{
+ if (res->base.format == PIPE_FORMAT_S8_UINT)
+ res->shadow_needs_update = true;
+
+ if (!crocus_resource_level_has_aux(res, level))
+ return;
+
+ const uint32_t level_layers =
+ miptree_layer_range_length(res, level, start_layer, num_layers);
+
+ for (uint32_t a = 0; a < level_layers; a++) {
+ const uint32_t layer = start_layer + a;
+ const enum isl_aux_state aux_state =
+ crocus_resource_get_aux_state(res, level, layer);
+
+ /* Transition the aux state for a conditional or unconditional write. A
+ * conditional write is handled by assuming that the write applies to
+ * only part of the render target. This prevents the new state from
+ * losing the types of compression that might exist in the current state
+ * (e.g. CLEAR). If the write evaluates to a no-op, the state will still
+ * be able to communicate when resolves are necessary (but it may
+ * falsely communicate this as well).
+ */
+ const enum isl_aux_state new_aux_state =
+ isl_aux_state_transition_write(aux_state, aux_usage, false);
+
+ crocus_resource_set_aux_state(ice, res, level, layer, 1, new_aux_state);
+ }
+}
+
+enum isl_aux_state
+crocus_resource_get_aux_state(const struct crocus_resource *res,
+ uint32_t level, uint32_t layer)
+{
+ crocus_resource_check_level_layer(res, level, layer);
+ assert(crocus_resource_level_has_aux(res, level));
+
+ return res->aux.state[level][layer];
+}
+
+void
+crocus_resource_set_aux_state(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t num_layers,
+ enum isl_aux_state aux_state)
+{
+ assert(crocus_resource_level_has_aux(res, level));
+
+ num_layers = miptree_layer_range_length(res, level, start_layer, num_layers);
+ for (unsigned a = 0; a < num_layers; a++) {
+ if (res->aux.state[level][start_layer + a] != aux_state) {
+ res->aux.state[level][start_layer + a] = aux_state;
+ ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES |
+ CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES;
+ /* XXX: Need to track which bindings to make dirty */
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+ }
+ }
+}
+
+static bool
+isl_formats_are_fast_clear_compatible(enum isl_format a, enum isl_format b)
+{
+ /* On gen8 and earlier, the hardware was only capable of handling 0/1 clear
+ * values so sRGB curve application was a no-op for all fast-clearable
+ * formats.
+ *
+ * On gen9+, the hardware supports arbitrary clear values. For sRGB clear
+ * values, the hardware interprets the floats, not as what would be
+ * returned from the sampler (or written by the shader), but as being
+ * between format conversion and sRGB curve application. This means that
+ * we can switch between sRGB and UNORM without having to whack the clear
+ * color.
+ */
+ return isl_format_srgb_to_linear(a) == isl_format_srgb_to_linear(b);
+}
+
+void
+crocus_resource_prepare_texture(struct crocus_context *ice,
+ struct crocus_resource *res,
+ enum isl_format view_format,
+ uint32_t start_level, uint32_t num_levels,
+ uint32_t start_layer, uint32_t num_layers)
+{
+ enum isl_aux_usage aux_usage =
+ crocus_resource_texture_aux_usage(res);
+
+ bool clear_supported = aux_usage != ISL_AUX_USAGE_NONE;
+
+ /* Clear color is specified as ints or floats and the conversion is done by
+ * the sampler. If we have a texture view, we would have to perform the
+ * clear color conversion manually. Just disable clear color.
+ */
+ if (!isl_formats_are_fast_clear_compatible(res->surf.format, view_format))
+ clear_supported = false;
+
+ crocus_resource_prepare_access(ice, res, start_level, num_levels,
+ start_layer, num_layers,
+ aux_usage, clear_supported);
+}
+
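+/**
+ * Pick the aux usage to use when rendering to this resource with the given
+ * format: keep MCS for multisampled surfaces, use CCS_D when the render
+ * format supports it, and otherwise disable aux.
+ */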
+enum isl_aux_usage
+crocus_resource_render_aux_usage(struct crocus_context *ice,
+ struct crocus_resource *res,
+ enum isl_format render_format,
+ bool blend_enabled,
+ bool draw_aux_disabled)
+{
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (draw_aux_disabled)
+ return ISL_AUX_USAGE_NONE;
+
+ switch (res->aux.usage) {
+ case ISL_AUX_USAGE_MCS:
+ return res->aux.usage;
+
+ case ISL_AUX_USAGE_CCS_D:
+ /* Use CCS_D only if the render format supports it. */
+ if (isl_format_supports_ccs_d(devinfo, render_format))
+ return ISL_AUX_USAGE_CCS_D;
+
+ return ISL_AUX_USAGE_NONE;
+
+ default:
+ return ISL_AUX_USAGE_NONE;
+ }
+}
+
+void
+crocus_resource_prepare_render(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t layer_count,
+ enum isl_aux_usage aux_usage)
+{
+ crocus_resource_prepare_access(ice, res, level, 1, start_layer,
+ layer_count, aux_usage,
+ aux_usage != ISL_AUX_USAGE_NONE);
+}
+
+void
+crocus_resource_finish_render(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t layer_count,
+ enum isl_aux_usage aux_usage)
+{
+ crocus_resource_finish_write(ice, res, level, start_layer, layer_count,
+ aux_usage);
+}
+
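+/**
+ * Copy a dirty W-tiled S8 stencil resource into its R8_UINT shadow copy so
+ * the gen7 sampler can read it (the shadow is allocated at resource
+ * creation time for S8 sampler views).
+ */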
+static void
+crocus_update_stencil_shadow(struct crocus_context *ice,
+ struct crocus_resource *res)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+ UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
+ assert(devinfo->ver >= 7);
+
+ if (!res->shadow_needs_update)
+ return;
+
+ struct pipe_box box;
+ for (unsigned level = 0; level <= res->base.last_level; level++) {
+ u_box_2d(0, 0,
+ u_minify(res->base.width0, level),
+ u_minify(res->base.height0, level), &box);
+ const unsigned depth = res->base.target == PIPE_TEXTURE_3D ?
+ u_minify(res->base.depth0, level) : res->base.array_size;
+
+ for (unsigned layer = 0; layer < depth; layer++) {
+ box.z = layer;
+ ice->ctx.resource_copy_region(&ice->ctx,
+ &res->shadow->base, level, 0, 0, layer,
+ &res->base, level, &box);
+ }
+ }
+ res->shadow_needs_update = false;
+}
diff --git a/src/gallium/drivers/crocus/crocus_resource.c b/src/gallium/drivers/crocus/crocus_resource.c
new file mode 100644
index 00000000000..b5bf5a42e1a
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_resource.c
@@ -0,0 +1,1946 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_resource.c
+ *
+ * Resources are images, buffers, and other objects used by the GPU.
+ *
+ * XXX: explain resources
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/os_memory.h"
+#include "util/u_cpu_detect.h"
+#include "util/u_inlines.h"
+#include "util/format/u_format.h"
+#include "util/u_threaded_context.h"
+#include "util/u_transfer.h"
+#include "util/u_transfer_helper.h"
+#include "util/u_upload_mgr.h"
+#include "util/ralloc.h"
+#include "crocus_batch.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "intel/dev/intel_debug.h"
+#include "isl/isl.h"
+#include "drm-uapi/drm_fourcc.h"
+#include "drm-uapi/i915_drm.h"
+
+enum modifier_priority {
+ MODIFIER_PRIORITY_INVALID = 0,
+ MODIFIER_PRIORITY_LINEAR,
+ MODIFIER_PRIORITY_X,
+ MODIFIER_PRIORITY_Y,
+ MODIFIER_PRIORITY_Y_CCS,
+};
+
+static const uint64_t priority_to_modifier[] = {
+ [MODIFIER_PRIORITY_INVALID] = DRM_FORMAT_MOD_INVALID,
+ [MODIFIER_PRIORITY_LINEAR] = DRM_FORMAT_MOD_LINEAR,
+ [MODIFIER_PRIORITY_X] = I915_FORMAT_MOD_X_TILED,
+ [MODIFIER_PRIORITY_Y] = I915_FORMAT_MOD_Y_TILED,
+ [MODIFIER_PRIORITY_Y_CCS] = I915_FORMAT_MOD_Y_TILED_CCS,
+};
+
+static bool
+modifier_is_supported(const struct intel_device_info *devinfo,
+ enum pipe_format pfmt, uint64_t modifier)
+{
+ /* XXX: do something real */
+ switch (modifier) {
+ case I915_FORMAT_MOD_Y_TILED_CCS:
+ return false;
+ case I915_FORMAT_MOD_Y_TILED:
+ return devinfo->ver >= 6;
+ case I915_FORMAT_MOD_X_TILED:
+ case DRM_FORMAT_MOD_LINEAR:
+ return true;
+ case DRM_FORMAT_MOD_INVALID:
+ default:
+ return false;
+ }
+}
+
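+/**
+ * Pick the highest-priority supported modifier from the caller's list,
+ * using the priority ordering defined above.
+ */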
+static uint64_t
+select_best_modifier(struct intel_device_info *devinfo, enum pipe_format pfmt,
+ const uint64_t *modifiers,
+ int count)
+{
+ enum modifier_priority prio = MODIFIER_PRIORITY_INVALID;
+
+ for (int i = 0; i < count; i++) {
+ if (!modifier_is_supported(devinfo, pfmt, modifiers[i]))
+ continue;
+
+ switch (modifiers[i]) {
+ case I915_FORMAT_MOD_Y_TILED_CCS:
+ prio = MAX2(prio, MODIFIER_PRIORITY_Y_CCS);
+ break;
+ case I915_FORMAT_MOD_Y_TILED:
+ prio = MAX2(prio, MODIFIER_PRIORITY_Y);
+ break;
+ case I915_FORMAT_MOD_X_TILED:
+ prio = MAX2(prio, MODIFIER_PRIORITY_X);
+ break;
+ case DRM_FORMAT_MOD_LINEAR:
+ prio = MAX2(prio, MODIFIER_PRIORITY_LINEAR);
+ break;
+ case DRM_FORMAT_MOD_INVALID:
+ default:
+ break;
+ }
+ }
+
+ return priority_to_modifier[prio];
+}
+
+static enum isl_surf_dim
+crocus_target_to_isl_surf_dim(enum pipe_texture_target target)
+{
+ switch (target) {
+ case PIPE_BUFFER:
+ case PIPE_TEXTURE_1D:
+ case PIPE_TEXTURE_1D_ARRAY:
+ return ISL_SURF_DIM_1D;
+ case PIPE_TEXTURE_2D:
+ case PIPE_TEXTURE_CUBE:
+ case PIPE_TEXTURE_RECT:
+ case PIPE_TEXTURE_2D_ARRAY:
+ case PIPE_TEXTURE_CUBE_ARRAY:
+ return ISL_SURF_DIM_2D;
+ case PIPE_TEXTURE_3D:
+ return ISL_SURF_DIM_3D;
+ case PIPE_MAX_TEXTURE_TYPES:
+ break;
+ }
+ unreachable("invalid texture type");
+}
+
+static void
+crocus_query_dmabuf_modifiers(struct pipe_screen *pscreen,
+ enum pipe_format pfmt,
+ int max,
+ uint64_t *modifiers,
+ unsigned int *external_only,
+ int *count)
+{
+ struct crocus_screen *screen = (void *) pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ uint64_t all_modifiers[] = {
+ DRM_FORMAT_MOD_LINEAR,
+ I915_FORMAT_MOD_X_TILED,
+ I915_FORMAT_MOD_Y_TILED,
+ I915_FORMAT_MOD_Y_TILED_CCS,
+ };
+
+ int supported_mods = 0;
+
+ for (int i = 0; i < ARRAY_SIZE(all_modifiers); i++) {
+ if (!modifier_is_supported(devinfo, pfmt, all_modifiers[i]))
+ continue;
+
+ if (supported_mods < max) {
+ if (modifiers)
+ modifiers[supported_mods] = all_modifiers[i];
+
+ if (external_only)
+ external_only[supported_mods] = util_format_is_yuv(pfmt);
+ }
+
+ supported_mods++;
+ }
+
+ *count = supported_mods;
+}
+
+static isl_surf_usage_flags_t
+pipe_bind_to_isl_usage(unsigned bindings)
+{
+ isl_surf_usage_flags_t usage = 0;
+
+ if (bindings & PIPE_BIND_RENDER_TARGET)
+ usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
+
+ if (bindings & PIPE_BIND_SAMPLER_VIEW)
+ usage |= ISL_SURF_USAGE_TEXTURE_BIT;
+
+ if (bindings & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SHADER_BUFFER))
+ usage |= ISL_SURF_USAGE_STORAGE_BIT;
+
+ if (bindings & PIPE_BIND_DISPLAY_TARGET)
+ usage |= ISL_SURF_USAGE_DISPLAY_BIT;
+
+ return usage;
+}
+
+struct pipe_resource *
+crocus_resource_get_separate_stencil(struct pipe_resource *p_res)
+{
+ /* For packed depth-stencil, we treat depth as the primary resource
+ * and store S8 as the "second plane" resource.
+ */
+ if (p_res->next && p_res->next->format == PIPE_FORMAT_S8_UINT)
+ return p_res->next;
+
+ return NULL;
+
+}
+
+static void
+crocus_resource_set_separate_stencil(struct pipe_resource *p_res,
+ struct pipe_resource *stencil)
+{
+ assert(util_format_has_depth(util_format_description(p_res->format)));
+ pipe_resource_reference(&p_res->next, stencil);
+}
+
+void
+crocus_get_depth_stencil_resources(const struct intel_device_info *devinfo,
+ struct pipe_resource *res,
+ struct crocus_resource **out_z,
+ struct crocus_resource **out_s)
+{
+ if (!res) {
+ *out_z = NULL;
+ *out_s = NULL;
+ return;
+ }
+
+ /* gen4/5 only supports packed ds */
+ if (devinfo->ver < 6) {
+ *out_z = (void *)res;
+ *out_s = (void *)res;
+ return;
+ }
+
+ if (res->format != PIPE_FORMAT_S8_UINT) {
+ *out_z = (void *) res;
+ *out_s = (void *) crocus_resource_get_separate_stencil(res);
+ } else {
+ *out_z = NULL;
+ *out_s = (void *) res;
+ }
+}
+
+void
+crocus_resource_disable_aux(struct crocus_resource *res)
+{
+ crocus_bo_unreference(res->aux.bo);
+ free(res->aux.state);
+
+ res->aux.usage = ISL_AUX_USAGE_NONE;
+ res->aux.has_hiz = 0;
+ res->aux.surf.size_B = 0;
+ res->aux.surf.levels = 0;
+ res->aux.bo = NULL;
+ res->aux.extra_aux.surf.size_B = 0;
+ res->aux.state = NULL;
+}
+
+static void
+crocus_resource_destroy(struct pipe_screen *screen,
+ struct pipe_resource *resource)
+{
+ struct crocus_resource *res = (struct crocus_resource *)resource;
+
+ if (resource->target == PIPE_BUFFER)
+ util_range_destroy(&res->valid_buffer_range);
+
+ if (res->shadow)
+ pipe_resource_reference((struct pipe_resource **)&res->shadow, NULL);
+ crocus_resource_disable_aux(res);
+
+ crocus_bo_unreference(res->bo);
+ crocus_pscreen_unref(res->orig_screen);
+ free(res);
+}
+
+static struct crocus_resource *
+crocus_alloc_resource(struct pipe_screen *pscreen,
+ const struct pipe_resource *templ)
+{
+ struct crocus_resource *res = calloc(1, sizeof(struct crocus_resource));
+ if (!res)
+ return NULL;
+
+ res->base = *templ;
+ res->base.screen = pscreen;
+ res->orig_screen = crocus_pscreen_ref(pscreen);
+ pipe_reference_init(&res->base.reference, 1);
+
+ if (templ->target == PIPE_BUFFER)
+ util_range_init(&res->valid_buffer_range);
+
+ return res;
+}
+
+unsigned
+crocus_get_num_logical_layers(const struct crocus_resource *res, unsigned level)
+{
+ if (res->surf.dim == ISL_SURF_DIM_3D)
+ return minify(res->surf.logical_level0_px.depth, level);
+ else
+ return res->surf.logical_level0_px.array_len;
+}
+
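+/**
+ * Allocate the per-level, per-layer aux state tracking map as a single
+ * allocation: an array of per-level pointers followed by the flattened
+ * isl_aux_state entries, all set to the given initial state.
+ */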
+static enum isl_aux_state **
+create_aux_state_map(struct crocus_resource *res, enum isl_aux_state initial)
+{
+ assert(res->aux.state == NULL);
+
+ uint32_t total_slices = 0;
+ for (uint32_t level = 0; level < res->surf.levels; level++)
+ total_slices += crocus_get_num_logical_layers(res, level);
+
+ const size_t per_level_array_size =
+ res->surf.levels * sizeof(enum isl_aux_state *);
+
+ /* We're going to allocate a single chunk of data for both the per-level
+ * reference array and the arrays of aux_state. This makes cleanup
+ * significantly easier.
+ */
+ const size_t total_size =
+ per_level_array_size + total_slices * sizeof(enum isl_aux_state);
+
+ void *data = malloc(total_size);
+ if (!data)
+ return NULL;
+
+ enum isl_aux_state **per_level_arr = data;
+ enum isl_aux_state *s = data + per_level_array_size;
+ for (uint32_t level = 0; level < res->surf.levels; level++) {
+ per_level_arr[level] = s;
+ const unsigned level_layers = crocus_get_num_logical_layers(res, level);
+ for (uint32_t a = 0; a < level_layers; a++)
+ *(s++) = initial;
+ }
+ assert((void *)s == data + total_size);
+
+ return per_level_arr;
+}
+
+/**
+ * Configure aux for the resource, but don't allocate it. For images which
+ * might be shared with modifiers, we must allocate the image and aux data in
+ * a single bo.
+ *
+ * Returns false on unexpected error (e.g. allocation failed, or invalid
+ * configuration result).
+ */
+static bool
+crocus_resource_configure_aux(struct crocus_screen *screen,
+ struct crocus_resource *res, bool imported,
+ uint64_t *aux_size_B,
+ uint32_t *alloc_flags)
+{
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ /* Try to create the auxiliary surfaces allowed by the modifier or by
+ * the user if no modifier is specified.
+ */
+ assert(!res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE);
+
+ const bool has_mcs = devinfo->ver >= 7 && !res->mod_info &&
+ isl_surf_get_mcs_surf(&screen->isl_dev, &res->surf, &res->aux.surf);
+
+ const bool has_hiz = devinfo->ver >= 6 && !res->mod_info &&
+ !(INTEL_DEBUG & DEBUG_NO_HIZ) &&
+ isl_surf_get_hiz_surf(&screen->isl_dev, &res->surf, &res->aux.surf);
+
+ const bool has_ccs =
+ ((devinfo->ver >= 7 && !res->mod_info && !(INTEL_DEBUG & DEBUG_NO_RBC)) ||
+ (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE)) &&
+ isl_surf_get_ccs_surf(&screen->isl_dev, &res->surf, &res->aux.surf,
+ &res->aux.extra_aux.surf, 0);
+
+ /* Having both HIZ and MCS is impossible. */
+ assert(!has_mcs || !has_hiz);
+
+ /* Ensure aux surface creation for MCS_CCS and HIZ_CCS is correct. */
+ if (has_ccs && (has_mcs || has_hiz)) {
+ assert(res->aux.extra_aux.surf.size_B > 0 &&
+ res->aux.extra_aux.surf.usage & ISL_SURF_USAGE_CCS_BIT);
+ assert(res->aux.surf.size_B > 0 &&
+ res->aux.surf.usage &
+ (ISL_SURF_USAGE_HIZ_BIT | ISL_SURF_USAGE_MCS_BIT));
+ }
+
+ if (res->mod_info && has_ccs) {
+ res->aux.usage = res->mod_info->aux_usage;
+ } else if (has_mcs) {
+ res->aux.usage = ISL_AUX_USAGE_MCS;
+ } else if (has_hiz) {
+ res->aux.usage = ISL_AUX_USAGE_HIZ;
+ } else if (has_ccs) {
+ if (isl_format_supports_ccs_d(devinfo, res->surf.format))
+ res->aux.usage = ISL_AUX_USAGE_CCS_D;
+ }
+
+ enum isl_aux_state initial_state = ISL_AUX_STATE_AUX_INVALID;
+ *aux_size_B = 0;
+ *alloc_flags = 0;
+ assert(!res->aux.bo);
+
+ switch (res->aux.usage) {
+ case ISL_AUX_USAGE_NONE:
+ /* Having no aux buffer is only okay if there's no modifier with aux. */
+ res->aux.surf.levels = 0;
+ return !res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE;
+ case ISL_AUX_USAGE_HIZ:
+ initial_state = ISL_AUX_STATE_AUX_INVALID;
+ break;
+ case ISL_AUX_USAGE_MCS:
+ /* The Ivybridge PRM, Vol 2 Part 1 p326 says:
+ *
+ * "When MCS buffer is enabled and bound to MSRT, it is required
+ * that it is cleared prior to any rendering."
+ *
+ * Since we only use the MCS buffer for rendering, we just clear it
+ * immediately on allocation. The clear value for MCS buffers is all
+ * 1's, so we simply memset it to 0xff.
+ */
+ initial_state = ISL_AUX_STATE_CLEAR;
+ break;
+ case ISL_AUX_USAGE_CCS_D:
+ /* When CCS_E is used, we need to ensure that the CCS starts off in
+ * a valid state. From the Sky Lake PRM, "MCS Buffer for Render
+ * Target(s)":
+ *
+ * "If Software wants to enable Color Compression without Fast
+ * clear, Software needs to initialize MCS with zeros."
+ *
+ * A CCS value of 0 indicates that the corresponding block is in the
+ * pass-through state which is what we want.
+ *
+ * For CCS_D, do the same thing. On Gen9+, this avoids having any
+ * undefined bits in the aux buffer.
+ */
+ if (imported)
+ initial_state =
+ isl_drm_modifier_get_default_aux_state(res->mod_info->modifier);
+ else
+ initial_state = ISL_AUX_STATE_PASS_THROUGH;
+ *alloc_flags |= BO_ALLOC_ZEROED;
+ break;
+ default:
+ unreachable("non-crocus aux");
+ }
+
+ /* Create the aux_state for the auxiliary buffer. */
+ res->aux.state = create_aux_state_map(res, initial_state);
+ if (!res->aux.state)
+ return false;
+
+ /* Increase the aux offset if the main and aux surfaces will share a BO. */
+ res->aux.offset =
+ !res->mod_info || res->mod_info->aux_usage == res->aux.usage ?
+ ALIGN(res->surf.size_B, res->aux.surf.alignment_B) : 0;
+ uint64_t size = res->aux.surf.size_B;
+
+ /* Allocate space in the buffer for storing the CCS. */
+ if (res->aux.extra_aux.surf.size_B > 0) {
+ const uint64_t padded_aux_size =
+ ALIGN(size, res->aux.extra_aux.surf.alignment_B);
+ res->aux.extra_aux.offset = res->aux.offset + padded_aux_size;
+ size = padded_aux_size + res->aux.extra_aux.surf.size_B;
+ }
+
+ /* Allocate space in the buffer for storing the clear color. On modern
+ * platforms (gen > 9), we can read it directly from such a buffer.
+ *
+ * On gen <= 9, we are going to store the clear color in the buffer
+ * anyway, and copy it back to the surface state during state emission.
+ *
+ * Also add some padding to make sure the fast clear color state buffer
+ * starts at a 4K alignment. We believe that 256B might be enough, but due
+ * to lack of testing we will leave this as 4K for now.
+ */
+ size = ALIGN(size, 4096);
+ *aux_size_B = size;
+
+ if (isl_aux_usage_has_hiz(res->aux.usage)) {
+ for (unsigned level = 0; level < res->surf.levels; ++level) {
+ uint32_t width = u_minify(res->surf.phys_level0_sa.width, level);
+ uint32_t height = u_minify(res->surf.phys_level0_sa.height, level);
+
+ /* Disable HiZ for LOD > 0 unless the width/height are 8x4 aligned.
+ * For LOD == 0, we can grow the dimensions to make it work.
+ */
+ if (!devinfo->is_haswell ||
+ (level == 0 || ((width & 7) == 0 && (height & 3) == 0)))
+ res->aux.has_hiz |= 1 << level;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Initialize the aux buffer contents.
+ *
+ * Returns false on unexpected error (e.g. mapping a BO failed).
+ */
+static bool
+crocus_resource_init_aux_buf(struct crocus_resource *res, uint32_t alloc_flags)
+{
+ if (!(alloc_flags & BO_ALLOC_ZEROED)) {
+ void *map = crocus_bo_map(NULL, res->aux.bo, MAP_WRITE | MAP_RAW);
+
+ if (!map)
+ return false;
+
+ if (crocus_resource_get_aux_state(res, 0, 0) != ISL_AUX_STATE_AUX_INVALID) {
+ uint8_t memset_value = isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0;
+ memset((char*)map + res->aux.offset, memset_value,
+ res->aux.surf.size_B);
+ }
+
+ /* Bspec section titled : MCS/CCS Buffers for Render Target(s) states:
+ * - If Software wants to enable Color Compression without Fast clear,
+ * Software needs to initialize MCS with zeros.
+ * - Lossless compression and CCS initialized to all F (using HW Fast
+ * Clear or SW direct Clear)
+ *
+ * We think the first bullet point above is referring to the CCS aux
+ * surface. Since we initialize the MCS in the clear state, we also
+ * initialize the CCS in the clear state (via SW direct clear) to keep
+ * the two in sync.
+ */
+ memset((char*)map + res->aux.extra_aux.offset,
+ isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0,
+ res->aux.extra_aux.surf.size_B);
+
+ crocus_bo_unmap(res->aux.bo);
+ }
+
+ return true;
+}
+
+/**
+ * Allocate the initial aux surface for a resource based on aux.usage
+ *
+ * Returns false on unexpected error (e.g. allocation failed, or invalid
+ * configuration result).
+ */
+static bool
+crocus_resource_alloc_separate_aux(struct crocus_screen *screen,
+ struct crocus_resource *res)
+{
+ uint32_t alloc_flags;
+ uint64_t size;
+ if (!crocus_resource_configure_aux(screen, res, false, &size, &alloc_flags))
+ return false;
+
+ if (size == 0)
+ return true;
+
+ /* Allocate the auxiliary buffer. ISL has a stricter set of alignment rules
+ * than the drm allocator. Therefore, one can pass the ISL dimensions in terms
+ * of bytes instead of trying to recalculate based on different format
+ * block sizes.
+ */
+ res->aux.bo = crocus_bo_alloc_tiled(screen->bufmgr, "aux buffer", size, 4096,
+ isl_tiling_to_i915_tiling(res->aux.surf.tiling),
+ res->aux.surf.row_pitch_B, alloc_flags);
+ if (!res->aux.bo) {
+ return false;
+ }
+
+ if (!crocus_resource_init_aux_buf(res, alloc_flags))
+ return false;
+
+ return true;
+}
+
+void
+crocus_resource_finish_aux_import(struct pipe_screen *pscreen,
+ struct crocus_resource *res)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ assert(crocus_resource_unfinished_aux_import(res));
+ assert(!res->mod_info->supports_clear_color);
+
+ struct crocus_resource *aux_res = (void *) res->base.next;
+ assert(aux_res->aux.surf.row_pitch_B && aux_res->aux.offset &&
+ aux_res->aux.bo);
+
+ assert(res->bo == aux_res->aux.bo);
+ crocus_bo_reference(aux_res->aux.bo);
+ res->aux.bo = aux_res->aux.bo;
+
+ res->aux.offset = aux_res->aux.offset;
+
+ assert(res->bo->size >= (res->aux.offset + res->aux.surf.size_B));
+ assert(aux_res->aux.surf.row_pitch_B == res->aux.surf.row_pitch_B);
+
+ crocus_resource_destroy(&screen->base, res->base.next);
+ res->base.next = NULL;
+}
+
+static struct pipe_resource *
+crocus_resource_create_for_buffer(struct pipe_screen *pscreen,
+ const struct pipe_resource *templ)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ struct crocus_resource *res = crocus_alloc_resource(pscreen, templ);
+
+ assert(templ->target == PIPE_BUFFER);
+ assert(templ->height0 <= 1);
+ assert(templ->depth0 <= 1);
+ assert(templ->format == PIPE_FORMAT_NONE ||
+ util_format_get_blocksize(templ->format) == 1);
+
+ res->internal_format = templ->format;
+ res->surf.tiling = ISL_TILING_LINEAR;
+
+ const char *name = templ->target == PIPE_BUFFER ? "buffer" : "miptree";
+
+ res->bo = crocus_bo_alloc(screen->bufmgr, name, templ->width0);
+ if (!res->bo) {
+ crocus_resource_destroy(pscreen, &res->base);
+ return NULL;
+ }
+
+ return &res->base;
+}
+
+static struct pipe_resource *
+crocus_resource_create_with_modifiers(struct pipe_screen *pscreen,
+ const struct pipe_resource *templ,
+ const uint64_t *modifiers,
+ int modifiers_count)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_resource *res = crocus_alloc_resource(pscreen, templ);
+
+ if (!res)
+ return NULL;
+
+ const struct util_format_description *format_desc =
+ util_format_description(templ->format);
+ const bool has_depth = util_format_has_depth(format_desc);
+ uint64_t modifier =
+ select_best_modifier(devinfo, templ->format, modifiers, modifiers_count);
+
+ isl_tiling_flags_t tiling_flags = ISL_TILING_ANY_MASK;
+
+ /* TODO: This used to be because there wasn't BLORP to handle Y-tiling. */
+ if (devinfo->ver < 6 && !util_format_is_depth_or_stencil(templ->format))
+ tiling_flags &= ~ISL_TILING_Y0_BIT;
+
+ if (modifier != DRM_FORMAT_MOD_INVALID) {
+ res->mod_info = isl_drm_modifier_get_info(modifier);
+
+ tiling_flags = 1 << res->mod_info->tiling;
+ } else {
+ if (modifiers_count > 0) {
+ fprintf(stderr, "Unsupported modifier, resource creation failed.\n");
+ goto fail;
+ }
+
+ if (templ->bind & PIPE_BIND_RENDER_TARGET && devinfo->ver < 6) {
+ modifier = I915_FORMAT_MOD_X_TILED;
+ res->mod_info = isl_drm_modifier_get_info(modifier);
+ tiling_flags = 1 << res->mod_info->tiling;
+ }
+ /* Use linear for staging buffers */
+ if (templ->usage == PIPE_USAGE_STAGING ||
+ templ->bind & (PIPE_BIND_LINEAR | PIPE_BIND_CURSOR) )
+ tiling_flags = ISL_TILING_LINEAR_BIT;
+ }
+
+ isl_surf_usage_flags_t usage = pipe_bind_to_isl_usage(templ->bind);
+
+ if (templ->target == PIPE_TEXTURE_CUBE ||
+ templ->target == PIPE_TEXTURE_CUBE_ARRAY)
+ usage |= ISL_SURF_USAGE_CUBE_BIT;
+
+ if (templ->usage != PIPE_USAGE_STAGING) {
+ if (templ->format == PIPE_FORMAT_S8_UINT)
+ usage |= ISL_SURF_USAGE_STENCIL_BIT;
+ else if (has_depth) {
+ /* combined DS only on gen4/5 */
+ if (devinfo->ver < 6) {
+ if (templ->format == PIPE_FORMAT_Z24X8_UNORM ||
+ templ->format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
+ templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+ usage |= ISL_SURF_USAGE_STENCIL_BIT;
+ }
+ usage |= ISL_SURF_USAGE_DEPTH_BIT;
+ }
+
+ if (templ->format == PIPE_FORMAT_S8_UINT)
+ tiling_flags = ISL_TILING_W_BIT;
+ }
+
+ if (templ->usage == PIPE_USAGE_STAGING &&
+ templ->bind == PIPE_BIND_DEPTH_STENCIL &&
+ devinfo->ver < 6) {
+ crocus_resource_destroy(pscreen, &res->base);
+ return NULL;
+ }
+
+ enum pipe_format pfmt = templ->format;
+ res->internal_format = pfmt;
+
+ /* Should be handled by u_transfer_helper */
+// assert(!util_format_is_depth_and_stencil(pfmt));
+
+ struct crocus_format_info fmt = crocus_format_for_usage(devinfo, pfmt, usage);
+ assert(fmt.fmt != ISL_FORMAT_UNSUPPORTED);
+ enum isl_surf_dim dim = crocus_target_to_isl_surf_dim(templ->target);
+
+ UNUSED const bool isl_surf_created_successfully =
+ isl_surf_init(&screen->isl_dev, &res->surf,
+ .dim = dim,
+ .format = fmt.fmt,
+ .width = templ->width0,
+ .height = templ->height0,
+ .depth = templ->depth0,
+ .levels = templ->last_level + 1,
+ .array_len = templ->array_size,
+ .samples = MAX2(templ->nr_samples, 1),
+ .min_alignment_B = 0,
+ .row_pitch_B = 0,
+ .usage = usage,
+ .tiling_flags = tiling_flags);
+ assert(isl_surf_created_successfully);
+
+ const char *name = "miptree";
+
+ unsigned int flags = 0;
+ if (templ->usage == PIPE_USAGE_STAGING)
+ flags |= BO_ALLOC_COHERENT;
+
+ uint64_t aux_size = 0;
+ uint32_t aux_preferred_alloc_flags;
+
+ if (!crocus_resource_configure_aux(screen, res, false, &aux_size,
+ &aux_preferred_alloc_flags)) {
+ goto fail;
+ }
+
+ /* Modifiers require the aux data to be in the same buffer as the main
+ * surface, but we combine them even when a modifier is not being used.
+ */
+ const uint64_t bo_size =
+ MAX2(res->surf.size_B, res->aux.offset + aux_size);
+ uint32_t alignment = MAX2(4096, res->surf.alignment_B);
+ res->bo = crocus_bo_alloc_tiled(screen->bufmgr, name, bo_size, alignment,
+ isl_tiling_to_i915_tiling(res->surf.tiling),
+ res->surf.row_pitch_B, flags);
+
+ if (!res->bo)
+ goto fail;
+
+ if (aux_size > 0) {
+ res->aux.bo = res->bo;
+ crocus_bo_reference(res->aux.bo);
+ if (!crocus_resource_init_aux_buf(res, flags))
+ goto fail;
+ }
+
+ if (templ->format == PIPE_FORMAT_S8_UINT && !(templ->usage == PIPE_USAGE_STAGING) &&
+ devinfo->ver == 7 && (templ->bind & PIPE_BIND_SAMPLER_VIEW)) {
+ struct pipe_resource templ_shadow = (struct pipe_resource) {
+ .usage = 0,
+ .bind = PIPE_BIND_SAMPLER_VIEW,
+ .width0 = res->base.width0,
+ .height0 = res->base.height0,
+ .depth0 = res->base.depth0,
+ .last_level = res->base.last_level,
+ .nr_samples = res->base.nr_samples,
+ .nr_storage_samples = res->base.nr_storage_samples,
+ .array_size = res->base.array_size,
+ .format = PIPE_FORMAT_R8_UINT,
+ .target = res->base.target,
+ };
+ res->shadow = (struct crocus_resource *)screen->base.resource_create(&screen->base, &templ_shadow);
+ assert(res->shadow);
+ }
+
+ return &res->base;
+
+fail:
+ fprintf(stderr, "XXX: resource creation failed\n");
+ crocus_resource_destroy(pscreen, &res->base);
+ return NULL;
+
+}
+
+static struct pipe_resource *
+crocus_resource_create(struct pipe_screen *pscreen,
+ const struct pipe_resource *templ)
+{
+ if (templ->target == PIPE_BUFFER)
+ return crocus_resource_create_for_buffer(pscreen, templ);
+ else
+ return crocus_resource_create_with_modifiers(pscreen, templ, NULL, 0);
+}
+
+static uint64_t
+tiling_to_modifier(uint32_t tiling)
+{
+ static const uint64_t map[] = {
+ [I915_TILING_NONE] = DRM_FORMAT_MOD_LINEAR,
+ [I915_TILING_X] = I915_FORMAT_MOD_X_TILED,
+ [I915_TILING_Y] = I915_FORMAT_MOD_Y_TILED,
+ };
+
+ assert(tiling < ARRAY_SIZE(map));
+
+ return map[tiling];
+}
+
+static struct pipe_resource *
+crocus_resource_from_user_memory(struct pipe_screen *pscreen,
+ const struct pipe_resource *templ,
+ void *user_memory)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+ struct crocus_resource *res = crocus_alloc_resource(pscreen, templ);
+ if (!res)
+ return NULL;
+
+ assert(templ->target == PIPE_BUFFER);
+
+ res->internal_format = templ->format;
+ res->bo = crocus_bo_create_userptr(bufmgr, "user",
+ user_memory, templ->width0);
+ if (!res->bo) {
+ free(res);
+ return NULL;
+ }
+
+ util_range_add(&res->base, &res->valid_buffer_range, 0, templ->width0);
+
+ return &res->base;
+}
+
+static struct pipe_resource *
+crocus_resource_from_handle(struct pipe_screen *pscreen,
+ const struct pipe_resource *templ,
+ struct winsys_handle *whandle,
+ unsigned usage)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_bufmgr *bufmgr = screen->bufmgr;
+ struct crocus_resource *res = crocus_alloc_resource(pscreen, templ);
+ const struct isl_drm_modifier_info *mod_inf =
+ isl_drm_modifier_get_info(whandle->modifier);
+ uint32_t tiling;
+
+ if (!res)
+ return NULL;
+
+ switch (whandle->type) {
+ case WINSYS_HANDLE_TYPE_FD:
+ if (mod_inf)
+ tiling = isl_tiling_to_i915_tiling(mod_inf->tiling);
+ else
+ tiling = I915_TILING_LAST + 1;
+ res->bo = crocus_bo_import_dmabuf(bufmgr, whandle->handle,
+ tiling, whandle->stride);
+ break;
+ case WINSYS_HANDLE_TYPE_SHARED:
+ res->bo = crocus_bo_gem_create_from_name(bufmgr, "winsys image",
+ whandle->handle);
+ break;
+ default:
+ unreachable("invalid winsys handle type");
+ }
+ if (!res->bo) {
+ crocus_resource_destroy(pscreen, &res->base);
+ return NULL;
+ }
+
+ res->offset = whandle->offset;
+
+ if (mod_inf == NULL) {
+ mod_inf =
+ isl_drm_modifier_get_info(tiling_to_modifier(res->bo->tiling_mode));
+ }
+ assert(mod_inf);
+
+ res->external_format = whandle->format;
+ res->mod_info = mod_inf;
+
+ isl_surf_usage_flags_t isl_usage = pipe_bind_to_isl_usage(templ->bind);
+
+ const struct crocus_format_info fmt =
+ crocus_format_for_usage(devinfo, templ->format, isl_usage);
+ res->internal_format = templ->format;
+
+ if (templ->target == PIPE_BUFFER) {
+ res->surf.tiling = ISL_TILING_LINEAR;
+ } else {
+ if (whandle->plane < util_format_get_num_planes(whandle->format)) {
+ UNUSED const bool isl_surf_created_successfully =
+ isl_surf_init(&screen->isl_dev, &res->surf,
+ .dim = crocus_target_to_isl_surf_dim(templ->target),
+ .format = fmt.fmt,
+ .width = templ->width0,
+ .height = templ->height0,
+ .depth = templ->depth0,
+ .levels = templ->last_level + 1,
+ .array_len = templ->array_size,
+ .samples = MAX2(templ->nr_samples, 1),
+ .min_alignment_B = 0,
+ .row_pitch_B = whandle->stride,
+ .usage = isl_usage,
+ .tiling_flags = 1 << res->mod_info->tiling);
+ assert(isl_surf_created_successfully);
+ assert(res->bo->tiling_mode ==
+ isl_tiling_to_i915_tiling(res->surf.tiling));
+
+ // XXX: create_ccs_buf_for_image?
+ if (whandle->modifier == DRM_FORMAT_MOD_INVALID) {
+ if (!crocus_resource_alloc_separate_aux(screen, res))
+ goto fail;
+ } else {
+ if (res->mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+ uint32_t alloc_flags;
+ uint64_t size;
+ UNUSED bool ok = crocus_resource_configure_aux(screen, res, true, &size,
+ &alloc_flags);
+ assert(ok);
+ /* The gallium dri layer will create a separate plane resource
+ * for the aux image. crocus_resource_finish_aux_import will
+ * merge the separate aux parameters back into a single
+ * crocus_resource.
+ */
+ }
+ }
+ } else {
+ /* Save modifier import information to reconstruct later. After
+ * import, this will be available under a second image accessible
+ * from the main image with res->base.next. See
+ * crocus_resource_finish_aux_import.
+ */
+ res->aux.surf.row_pitch_B = whandle->stride;
+ res->aux.offset = whandle->offset;
+ res->aux.bo = res->bo;
+ res->bo = NULL;
+ }
+ }
+
+ return &res->base;
+
+fail:
+ crocus_resource_destroy(pscreen, &res->base);
+ return NULL;
+}
+
+static void
+crocus_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_resource *res = (void *) resource;
+ const struct isl_drm_modifier_info *mod = res->mod_info;
+
+ crocus_resource_prepare_access(ice, res,
+ 0, INTEL_REMAINING_LEVELS,
+ 0, INTEL_REMAINING_LAYERS,
+ mod ? mod->aux_usage : ISL_AUX_USAGE_NONE,
+ mod ? mod->supports_clear_color : false);
+}
+
+static void
+crocus_resource_disable_aux_on_first_query(struct pipe_resource *resource,
+ unsigned usage)
+{
+ struct crocus_resource *res = (struct crocus_resource *)resource;
+ bool mod_with_aux =
+ res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE;
+
+ /* Disable aux usage if explicit flush not set and this is the first time
+ * we are dealing with this resource and the resource was not created with
+ * a modifier with aux.
+ */
+ if (!mod_with_aux &&
+ (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && res->aux.usage != 0) &&
+ p_atomic_read(&resource->reference.count) == 1) {
+ crocus_resource_disable_aux(res);
+ }
+}
+
+static bool
+crocus_resource_get_param(struct pipe_screen *pscreen,
+ struct pipe_context *context,
+ struct pipe_resource *resource,
+ unsigned plane,
+ unsigned layer,
+ unsigned level,
+ enum pipe_resource_param param,
+ unsigned handle_usage,
+ uint64_t *value)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ struct crocus_resource *res = (struct crocus_resource *)resource;
+ bool mod_with_aux =
+ res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE;
+ bool wants_aux = mod_with_aux && plane > 0;
+ bool result;
+ unsigned handle;
+
+ if (crocus_resource_unfinished_aux_import(res))
+ crocus_resource_finish_aux_import(pscreen, res);
+
+ struct crocus_bo *bo = wants_aux ? res->aux.bo : res->bo;
+
+ crocus_resource_disable_aux_on_first_query(resource, handle_usage);
+
+ switch (param) {
+ case PIPE_RESOURCE_PARAM_NPLANES:
+ if (mod_with_aux) {
+ *value = util_format_get_num_planes(res->external_format);
+ } else {
+ unsigned count = 0;
+ for (struct pipe_resource *cur = resource; cur; cur = cur->next)
+ count++;
+ *value = count;
+ }
+ return true;
+ case PIPE_RESOURCE_PARAM_STRIDE:
+ *value = wants_aux ? res->aux.surf.row_pitch_B : res->surf.row_pitch_B;
+ return true;
+ case PIPE_RESOURCE_PARAM_OFFSET:
+ *value = wants_aux ? res->aux.offset : 0;
+ return true;
+ case PIPE_RESOURCE_PARAM_MODIFIER:
+ *value = res->mod_info ? res->mod_info->modifier :
+ tiling_to_modifier(res->bo->tiling_mode);
+ return true;
+ case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED:
+ result = crocus_bo_flink(bo, &handle) == 0;
+ if (result)
+ *value = handle;
+ return result;
+ case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: {
+ /* Because we share the same drm file across multiple crocus_screen, when
+ * we export a GEM handle we must make sure it is valid in the DRM file
+ * descriptor the caller is using (this is the FD given at screen
+ * creation).
+ */
+ uint32_t handle;
+ if (crocus_bo_export_gem_handle_for_device(bo, screen->winsys_fd, &handle))
+ return false;
+ *value = handle;
+ return true;
+ }
+ case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD:
+ result = crocus_bo_export_dmabuf(bo, (int *) &handle) == 0;
+ if (result)
+ *value = handle;
+ return result;
+ default:
+ return false;
+ }
+}
+
+static bool
+crocus_resource_get_handle(struct pipe_screen *pscreen,
+ struct pipe_context *ctx,
+ struct pipe_resource *resource,
+ struct winsys_handle *whandle,
+ unsigned usage)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) pscreen;
+ struct crocus_resource *res = (struct crocus_resource *)resource;
+ bool mod_with_aux =
+ res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE;
+
+ crocus_resource_disable_aux_on_first_query(resource, usage);
+
+ struct crocus_bo *bo;
+ if (mod_with_aux && whandle->plane > 0) {
+ assert(res->aux.bo);
+ bo = res->aux.bo;
+ whandle->stride = res->aux.surf.row_pitch_B;
+ whandle->offset = res->aux.offset;
+ } else {
+ /* If this is a buffer, stride should be 0 - no need to special case */
+ whandle->stride = res->surf.row_pitch_B;
+ bo = res->bo;
+ }
+ whandle->format = res->external_format;
+ whandle->modifier =
+ res->mod_info ? res->mod_info->modifier
+ : tiling_to_modifier(res->bo->tiling_mode);
+
+#ifndef NDEBUG
+ enum isl_aux_usage allowed_usage =
+ res->mod_info ? res->mod_info->aux_usage : ISL_AUX_USAGE_NONE;
+
+ if (res->aux.usage != allowed_usage) {
+ enum isl_aux_state aux_state = crocus_resource_get_aux_state(res, 0, 0);
+ assert(aux_state == ISL_AUX_STATE_RESOLVED ||
+ aux_state == ISL_AUX_STATE_PASS_THROUGH);
+ }
+#endif
+
+ switch (whandle->type) {
+ case WINSYS_HANDLE_TYPE_SHARED:
+ return crocus_bo_flink(bo, &whandle->handle) == 0;
+ case WINSYS_HANDLE_TYPE_KMS: {
+ /* Because we share the same drm file across multiple crocus_screen, when
+ * we export a GEM handle we must make sure it is valid in the DRM file
+ * descriptor the caller is using (this is the FD given at screen
+ * creation).
+ */
+ uint32_t handle;
+ if (crocus_bo_export_gem_handle_for_device(bo, screen->winsys_fd, &handle))
+ return false;
+ whandle->handle = handle;
+ return true;
+ }
+ case WINSYS_HANDLE_TYPE_FD:
+ return crocus_bo_export_dmabuf(bo, (int *) &whandle->handle) == 0;
+ }
+
+ return false;
+}
+
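+/**
+ * Check whether the resource's BO is busy on the GPU or referenced by any
+ * of the context's unflushed batches.
+ */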
+static bool
+resource_is_busy(struct crocus_context *ice,
+ struct crocus_resource *res)
+{
+ bool busy = crocus_bo_busy(res->bo);
+
+ for (int i = 0; i < ice->batch_count; i++)
+ busy |= crocus_batch_references(&ice->batches[i], res->bo);
+
+ return busy;
+}
+
+static void
+crocus_invalidate_resource(struct pipe_context *ctx,
+ struct pipe_resource *resource)
+{
+ struct crocus_screen *screen = (void *) ctx->screen;
+ struct crocus_context *ice = (void *) ctx;
+ struct crocus_resource *res = (void *) resource;
+
+ if (resource->target != PIPE_BUFFER)
+ return;
+
+ if (!resource_is_busy(ice, res)) {
+ /* The resource is idle, so just mark that it contains no data and
+ * keep using the same underlying buffer object.
+ */
+ util_range_set_empty(&res->valid_buffer_range);
+ return;
+ }
+
+ /* Otherwise, try and replace the backing storage with a new BO. */
+
+ /* We can't reallocate memory we didn't allocate in the first place. */
+ if (res->bo->userptr)
+ return;
+
+ // XXX: We should support this.
+ if (res->bind_history & PIPE_BIND_STREAM_OUTPUT)
+ return;
+
+ struct crocus_bo *old_bo = res->bo;
+ struct crocus_bo *new_bo =
+ crocus_bo_alloc(screen->bufmgr, res->bo->name, resource->width0);
+
+ if (!new_bo)
+ return;
+
+ /* Swap out the backing storage */
+ res->bo = new_bo;
+
+ /* Rebind the buffer, replacing any state referring to the old BO's
+ * address, and marking state dirty so it's reemitted.
+ */
+ screen->vtbl.rebind_buffer(ice, res);
+
+ util_range_set_empty(&res->valid_buffer_range);
+
+ crocus_bo_unreference(old_bo);
+}
+
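+/**
+ * Copy a written staging region back into the real resource, accounting for
+ * the extra alignment padding added to buffer maps.
+ */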
+static void
+crocus_flush_staging_region(struct pipe_transfer *xfer,
+ const struct pipe_box *flush_box)
+{
+ if (!(xfer->usage & PIPE_MAP_WRITE))
+ return;
+
+ struct crocus_transfer *map = (void *) xfer;
+
+ struct pipe_box src_box = *flush_box;
+
+ /* Account for extra alignment padding in staging buffer */
+ if (xfer->resource->target == PIPE_BUFFER)
+ src_box.x += xfer->box.x % CROCUS_MAP_BUFFER_ALIGNMENT;
+
+ struct pipe_box dst_box = (struct pipe_box) {
+ .x = xfer->box.x + flush_box->x,
+ .y = xfer->box.y + flush_box->y,
+ .z = xfer->box.z + flush_box->z,
+ .width = flush_box->width,
+ .height = flush_box->height,
+ .depth = flush_box->depth,
+ };
+
+ crocus_copy_region(map->blorp, map->batch, xfer->resource, xfer->level,
+ dst_box.x, dst_box.y, dst_box.z, map->staging, 0,
+ &src_box);
+}
+
+static void
+crocus_unmap_copy_region(struct crocus_transfer *map)
+{
+ crocus_resource_destroy(map->staging->screen, map->staging);
+
+ map->ptr = NULL;
+}
+
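+/**
+ * Map a resource through a linear staging copy: blit the region into a
+ * staging resource (unless the range is being discarded), flush so the
+ * copy lands, and map the staging BO in place of the original. Written
+ * data is copied back to the real resource via crocus_flush_staging_region.
+ */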
+static void
+crocus_map_copy_region(struct crocus_transfer *map)
+{
+ struct pipe_screen *pscreen = &map->batch->screen->base;
+ struct pipe_transfer *xfer = &map->base;
+ struct pipe_box *box = &xfer->box;
+ struct crocus_resource *res = (void *) xfer->resource;
+
+ unsigned extra = xfer->resource->target == PIPE_BUFFER ?
+ box->x % CROCUS_MAP_BUFFER_ALIGNMENT : 0;
+
+ struct pipe_resource templ = (struct pipe_resource) {
+ .usage = PIPE_USAGE_STAGING,
+ .width0 = box->width + extra,
+ .height0 = box->height,
+ .depth0 = 1,
+ .nr_samples = xfer->resource->nr_samples,
+ .nr_storage_samples = xfer->resource->nr_storage_samples,
+ .array_size = box->depth,
+ .format = res->internal_format,
+ };
+
+ if (xfer->resource->target == PIPE_BUFFER)
+ templ.target = PIPE_BUFFER;
+ else if (templ.array_size > 1)
+ templ.target = PIPE_TEXTURE_2D_ARRAY;
+ else
+ templ.target = PIPE_TEXTURE_2D;
+
+ map->staging = crocus_resource_create(pscreen, &templ);
+ assert(map->staging);
+
+ if (templ.target != PIPE_BUFFER) {
+ struct isl_surf *surf = &((struct crocus_resource *) map->staging)->surf;
+ xfer->stride = isl_surf_get_row_pitch_B(surf);
+ xfer->layer_stride = isl_surf_get_array_pitch(surf);
+ }
+
+ if (!(xfer->usage & PIPE_MAP_DISCARD_RANGE)) {
+ crocus_copy_region(map->blorp, map->batch, map->staging, 0, extra, 0, 0,
+ xfer->resource, xfer->level, box);
+ /* Ensure writes to the staging BO land before we map it below. */
+ crocus_emit_pipe_control_flush(map->batch,
+ "transfer read: flush before mapping",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ }
+
+ struct crocus_bo *staging_bo = crocus_resource_bo(map->staging);
+
+ if (crocus_batch_references(map->batch, staging_bo))
+ crocus_batch_flush(map->batch);
+
+ map->ptr =
+ crocus_bo_map(map->dbg, staging_bo, xfer->usage & MAP_FLAGS) + extra;
+
+ map->unmap = crocus_unmap_copy_region;
+}
+
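+/**
+ * Get the x/y offset (in surface elements) of the given miplevel/slice
+ * within the surface, asserting that the z and array offset components
+ * are zero.
+ */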
+static void
+get_image_offset_el(const struct isl_surf *surf, unsigned level, unsigned z,
+ unsigned *out_x0_el, unsigned *out_y0_el)
+{
+ ASSERTED uint32_t z0_el, a0_el;
+ if (surf->dim == ISL_SURF_DIM_3D) {
+ isl_surf_get_image_offset_el(surf, level, 0, z,
+ out_x0_el, out_y0_el, &z0_el, &a0_el);
+ } else {
+ isl_surf_get_image_offset_el(surf, level, z, 0,
+ out_x0_el, out_y0_el, &z0_el, &a0_el);
+ }
+ assert(z0_el == 0 && a0_el == 0);
+}
+
+void
+crocus_resource_get_image_offset(struct crocus_resource *res,
+ uint32_t level, uint32_t z,
+ uint32_t *x, uint32_t *y)
+{
+ get_image_offset_el(&res->surf, level, z, x, y);
+}
+
+/**
+ * Get pointer offset into stencil buffer.
+ *
+ * The stencil buffer is W tiled. Since the GTT is incapable of W fencing, we
+ * must decode the tile's layout in software.
+ *
+ * See
+ * - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.2.1 W-Major Tile
+ * Format.
+ * - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.3 Tiling Algorithm
+ *
+ * Even though the returned offset is always positive, the return type is
+ * signed due to
+ * commit e8b1c6d6f55f5be3bef25084fdd8b6127517e137
+ * mesa: Fix return type of _mesa_get_format_bytes() (#37351)
+ */
+static intptr_t
+s8_offset(uint32_t stride, uint32_t x, uint32_t y, bool swizzled)
+{
+ uint32_t tile_size = 4096;
+ uint32_t tile_width = 64;
+ uint32_t tile_height = 64;
+ uint32_t row_size = 64 * stride / 2; /* Two rows are interleaved. */
+
+ uint32_t tile_x = x / tile_width;
+ uint32_t tile_y = y / tile_height;
+
+ /* The byte's address relative to the tile's base address. */
+ uint32_t byte_x = x % tile_width;
+ uint32_t byte_y = y % tile_height;
+
+ uintptr_t u = tile_y * row_size
+ + tile_x * tile_size
+ + 512 * (byte_x / 8)
+ + 64 * (byte_y / 8)
+ + 32 * ((byte_y / 4) % 2)
+ + 16 * ((byte_x / 4) % 2)
+ + 8 * ((byte_y / 2) % 2)
+ + 4 * ((byte_x / 2) % 2)
+ + 2 * (byte_y % 2)
+ + 1 * (byte_x % 2);
+
+ if (swizzled) {
+ /* adjust for bit6 swizzling */
+ if (((byte_x / 8) % 2) == 1) {
+ if (((byte_y / 8) % 2) == 0) {
+ u += 64;
+ } else {
+ u -= 64;
+ }
+ }
+ }
+
+ return u;
+}
+
+static void
+crocus_unmap_s8(struct crocus_transfer *map)
+{
+ struct pipe_transfer *xfer = &map->base;
+ const struct pipe_box *box = &xfer->box;
+ struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+ struct isl_surf *surf = &res->surf;
+
+ if (xfer->usage & PIPE_MAP_WRITE) {
+ uint8_t *untiled_s8_map = map->ptr;
+ uint8_t *tiled_s8_map =
+ crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS);
+
+ for (int s = 0; s < box->depth; s++) {
+ unsigned x0_el, y0_el;
+ get_image_offset_el(surf, xfer->level, box->z + s, &x0_el, &y0_el);
+
+ for (uint32_t y = 0; y < box->height; y++) {
+ for (uint32_t x = 0; x < box->width; x++) {
+ ptrdiff_t offset = s8_offset(surf->row_pitch_B,
+ x0_el + box->x + x,
+ y0_el + box->y + y,
+ map->has_swizzling);
+ tiled_s8_map[offset] =
+ untiled_s8_map[s * xfer->layer_stride + y * xfer->stride + x];
+ }
+ }
+ }
+ }
+
+ free(map->buffer);
+}
+
+static void
+crocus_map_s8(struct crocus_transfer *map)
+{
+ struct pipe_transfer *xfer = &map->base;
+ const struct pipe_box *box = &xfer->box;
+ struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+ struct isl_surf *surf = &res->surf;
+
+ xfer->stride = surf->row_pitch_B;
+ xfer->layer_stride = xfer->stride * box->height;
+
+   /* Allocate a linear staging buffer.  Unlike the tiled_memcpy path below,
+    * the byte-at-a-time copy loops here do not need any particular alignment
+    * of the linear buffer.
+    */
+ map->buffer = map->ptr = malloc(xfer->layer_stride * box->depth);
+ assert(map->buffer);
+
+   /* One of either PIPE_MAP_READ or PIPE_MAP_WRITE (or both) is set.  Reads
+    * imply no PIPE_MAP_DISCARD_RANGE.  Writes need the original values read
+    * in unless the discard flag is set, since we'll be writing the whole
+    * rectangle from our temporary buffer back out.
+    */
+ if (!(xfer->usage & PIPE_MAP_DISCARD_RANGE)) {
+ uint8_t *untiled_s8_map = map->ptr;
+ uint8_t *tiled_s8_map =
+ crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS);
+
+ for (int s = 0; s < box->depth; s++) {
+ unsigned x0_el, y0_el;
+ get_image_offset_el(surf, xfer->level, box->z + s, &x0_el, &y0_el);
+
+ for (uint32_t y = 0; y < box->height; y++) {
+ for (uint32_t x = 0; x < box->width; x++) {
+ ptrdiff_t offset = s8_offset(surf->row_pitch_B,
+ x0_el + box->x + x,
+ y0_el + box->y + y,
+ map->has_swizzling);
+ untiled_s8_map[s * xfer->layer_stride + y * xfer->stride + x] =
+ tiled_s8_map[offset];
+ }
+ }
+ }
+ }
+
+ map->unmap = crocus_unmap_s8;
+}
+
+/* Compute extent parameters for use with the tiled_memcpy functions.
+ * The x values are in units of bytes and the y values are in units of rows
+ * (element rows within the surface).
+ */
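+/* For instance, with a 4x4 block-compressed format at 8 bytes per block
+ * (fmtl->bw == fmtl->bh == 4, cpp == 8) and x0_el == 0, a box with x == 8
+ * and width == 16 gives x1_B == 16 and x2_B == 48, i.e. block columns 2..5
+ * expressed in bytes.  (Illustrative values only.)
+ */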
+static inline void
+tile_extents(const struct isl_surf *surf,
+ const struct pipe_box *box,
+ unsigned level, int z,
+ unsigned *x1_B, unsigned *x2_B,
+ unsigned *y1_el, unsigned *y2_el)
+{
+ const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format);
+ const unsigned cpp = fmtl->bpb / 8;
+
+ assert(box->x % fmtl->bw == 0);
+ assert(box->y % fmtl->bh == 0);
+
+ unsigned x0_el, y0_el;
+ get_image_offset_el(surf, level, box->z + z, &x0_el, &y0_el);
+
+ *x1_B = (box->x / fmtl->bw + x0_el) * cpp;
+ *y1_el = box->y / fmtl->bh + y0_el;
+ *x2_B = (DIV_ROUND_UP(box->x + box->width, fmtl->bw) + x0_el) * cpp;
+ *y2_el = DIV_ROUND_UP(box->y + box->height, fmtl->bh) + y0_el;
+}
+
+static void
+crocus_unmap_tiled_memcpy(struct crocus_transfer *map)
+{
+ struct pipe_transfer *xfer = &map->base;
+ const struct pipe_box *box = &xfer->box;
+ struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+ struct isl_surf *surf = &res->surf;
+
+ if (xfer->usage & PIPE_MAP_WRITE) {
+ char *dst =
+ crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS);
+
+ for (int s = 0; s < box->depth; s++) {
+ unsigned x1, x2, y1, y2;
+ tile_extents(surf, box, xfer->level, s, &x1, &x2, &y1, &y2);
+
+ void *ptr = map->ptr + s * xfer->layer_stride;
+
+ isl_memcpy_linear_to_tiled(x1, x2, y1, y2, dst, ptr,
+ surf->row_pitch_B, xfer->stride,
+ map->has_swizzling,
+ surf->tiling, ISL_MEMCPY);
+ }
+ }
+ os_free_aligned(map->buffer);
+ map->buffer = map->ptr = NULL;
+}
+
+static void
+crocus_map_tiled_memcpy(struct crocus_transfer *map)
+{
+ struct pipe_transfer *xfer = &map->base;
+ const struct pipe_box *box = &xfer->box;
+ struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+ struct isl_surf *surf = &res->surf;
+
+ xfer->stride = ALIGN(surf->row_pitch_B, 16);
+ xfer->layer_stride = xfer->stride * box->height;
+
+ unsigned x1, x2, y1, y2;
+ tile_extents(surf, box, xfer->level, 0, &x1, &x2, &y1, &y2);
+
+ /* The tiling and detiling functions require that the linear buffer has
+ * a 16-byte alignment (that is, its `x0` is 16-byte aligned). Here we
+ * over-allocate the linear buffer to get the proper alignment.
+ */
+ map->buffer =
+ os_malloc_aligned(xfer->layer_stride * box->depth, 16);
+ assert(map->buffer);
+ map->ptr = (char *)map->buffer + (x1 & 0xf);
+
+ if (!(xfer->usage & PIPE_MAP_DISCARD_RANGE)) {
+ char *src =
+ crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS);
+
+ for (int s = 0; s < box->depth; s++) {
+ unsigned x1, x2, y1, y2;
+ tile_extents(surf, box, xfer->level, s, &x1, &x2, &y1, &y2);
+
+ /* Use 's' rather than 'box->z' to rebase the first slice to 0. */
+ void *ptr = map->ptr + s * xfer->layer_stride;
+
+ isl_memcpy_tiled_to_linear(x1, x2, y1, y2, ptr, src, xfer->stride,
+ surf->row_pitch_B,
+ map->has_swizzling,
+ surf->tiling,
+#if defined(USE_SSE41)
+ util_get_cpu_caps()->has_sse4_1 ? ISL_MEMCPY_STREAMING_LOAD :
+#endif
+ ISL_MEMCPY);
+ }
+ }
+
+ map->unmap = crocus_unmap_tiled_memcpy;
+}
+
+static void
+crocus_map_direct(struct crocus_transfer *map)
+{
+ struct pipe_transfer *xfer = &map->base;
+ struct pipe_box *box = &xfer->box;
+ struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+
+ void *ptr = crocus_bo_map(map->dbg, res->bo, xfer->usage & MAP_FLAGS);
+
+ if (res->base.target == PIPE_BUFFER) {
+ xfer->stride = 0;
+ xfer->layer_stride = 0;
+
+ map->ptr = ptr + box->x;
+ } else {
+ struct isl_surf *surf = &res->surf;
+ const struct isl_format_layout *fmtl =
+ isl_format_get_layout(surf->format);
+ const unsigned cpp = fmtl->bpb / 8;
+ unsigned x0_el, y0_el;
+
+ get_image_offset_el(surf, xfer->level, box->z, &x0_el, &y0_el);
+
+ xfer->stride = isl_surf_get_row_pitch_B(surf);
+ xfer->layer_stride = isl_surf_get_array_pitch(surf);
+
+ map->ptr = ptr + (y0_el + box->y) * xfer->stride + (x0_el + box->x) * cpp;
+ }
+}
+
+static bool
+can_promote_to_async(const struct crocus_resource *res,
+ const struct pipe_box *box,
+ unsigned usage)
+{
+ /* If we're writing to a section of the buffer that hasn't even been
+ * initialized with useful data, then we can safely promote this write
+ * to be unsynchronized. This helps the common pattern of appending data.
+ */
+ return res->base.target == PIPE_BUFFER && (usage & PIPE_MAP_WRITE) &&
+ !(usage & TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED) &&
+ !util_ranges_intersect(&res->valid_buffer_range, box->x,
+ box->x + box->width);
+}
+
+static void *
+crocus_transfer_map(struct pipe_context *ctx,
+ struct pipe_resource *resource,
+ unsigned level,
+ unsigned usage,
+ const struct pipe_box *box,
+ struct pipe_transfer **ptransfer)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_resource *res = (struct crocus_resource *)resource;
+ struct isl_surf *surf = &res->surf;
+
+ if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) {
+ /* Replace the backing storage with a fresh buffer for non-async maps */
+ if (!(usage & (PIPE_MAP_UNSYNCHRONIZED |
+ TC_TRANSFER_MAP_NO_INVALIDATE)))
+ crocus_invalidate_resource(ctx, resource);
+
+ /* If we can discard the whole resource, we can discard the range. */
+ usage |= PIPE_MAP_DISCARD_RANGE;
+ }
+
+ if (!(usage & PIPE_MAP_UNSYNCHRONIZED) &&
+ can_promote_to_async(res, box, usage)) {
+ usage |= PIPE_MAP_UNSYNCHRONIZED;
+ }
+
+ bool map_would_stall = false;
+
+ if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
+ map_would_stall = resource_is_busy(ice, res) ||
+ crocus_has_invalid_primary(res, level, 1, box->z, box->depth);
+
+ if (map_would_stall && (usage & PIPE_MAP_DONTBLOCK) &&
+ (usage & PIPE_MAP_DIRECTLY))
+ return NULL;
+ }
+
+ if (surf->tiling != ISL_TILING_LINEAR &&
+ (usage & PIPE_MAP_DIRECTLY))
+ return NULL;
+
+ struct crocus_transfer *map = slab_alloc(&ice->transfer_pool);
+ struct pipe_transfer *xfer = &map->base;
+
+ if (!map)
+ return NULL;
+
+ memset(map, 0, sizeof(*map));
+ map->dbg = &ice->dbg;
+
+ map->has_swizzling = ((struct crocus_screen *)ctx->screen)->has_swizzling;
+ pipe_resource_reference(&xfer->resource, resource);
+ xfer->level = level;
+ xfer->usage = usage;
+ xfer->box = *box;
+ *ptransfer = xfer;
+
+ map->dest_had_defined_contents =
+ util_ranges_intersect(&res->valid_buffer_range, box->x,
+ box->x + box->width);
+
+ if (usage & PIPE_MAP_WRITE)
+ util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width);
+
+ /* Avoid using GPU copies for persistent/coherent buffers, as the idea
+ * there is to access them simultaneously on the CPU & GPU. This also
+ * avoids trying to use GPU copies for our u_upload_mgr buffers which
+ * contain state we're constructing for a GPU draw call, which would
+ * kill us with infinite stack recursion.
+ */
+ bool no_gpu = usage & (PIPE_MAP_PERSISTENT |
+ PIPE_MAP_COHERENT |
+ PIPE_MAP_DIRECTLY);
+
+ /* GPU copies are not useful for buffer reads. Instead of stalling to
+ * read from the original buffer, we'd simply copy it to a temporary...
+ * then stall (a bit longer) to read from that buffer.
+ *
+ * Images are less clear-cut. Color resolves are destructive, removing
+ * the underlying compression, so we'd rather blit the data to a linear
+ * temporary and map that, to avoid the resolve. (It might be better to
+    * blit to a tiled temporary and use the tiled_memcpy paths...)
+ */
+ if (!(usage & PIPE_MAP_DISCARD_RANGE) &&
+ !crocus_has_invalid_primary(res, level, 1, box->z, box->depth))
+ no_gpu = true;
+
+ const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format);
+ if (fmtl->txc == ISL_TXC_ASTC)
+ no_gpu = true;
+
+ if (map_would_stall && !no_gpu) {
+ /* If we need a synchronous mapping and the resource is busy, or needs
+ * resolving, we copy to/from a linear temporary buffer using the GPU.
+ */
+ map->batch = &ice->batches[CROCUS_BATCH_RENDER];
+ map->blorp = &ice->blorp;
+ crocus_map_copy_region(map);
+ } else {
+ /* Otherwise we're free to map on the CPU. */
+
+ if (resource->target != PIPE_BUFFER) {
+ crocus_resource_access_raw(ice, res,
+ level, box->z, box->depth,
+ usage & PIPE_MAP_WRITE);
+ }
+
+ if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
+ for (int i = 0; i < ice->batch_count; i++) {
+ if (crocus_batch_references(&ice->batches[i], res->bo))
+ crocus_batch_flush(&ice->batches[i]);
+ }
+ }
+
+ if (surf->tiling == ISL_TILING_W) {
+ /* TODO: Teach crocus_map_tiled_memcpy about W-tiling... */
+ crocus_map_s8(map);
+ } else if (surf->tiling != ISL_TILING_LINEAR) {
+ crocus_map_tiled_memcpy(map);
+ } else {
+ crocus_map_direct(map);
+ }
+ }
+
+ return map->ptr;
+}
+
+static void
+crocus_transfer_flush_region(struct pipe_context *ctx,
+ struct pipe_transfer *xfer,
+ const struct pipe_box *box)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+ struct crocus_transfer *map = (void *) xfer;
+
+ if (map->staging)
+ crocus_flush_staging_region(xfer, box);
+
+ uint32_t history_flush = 0;
+
+ if (res->base.target == PIPE_BUFFER) {
+ if (map->staging)
+ history_flush |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
+
+ if (map->dest_had_defined_contents)
+ history_flush |= crocus_flush_bits_for_history(res);
+
+ util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width);
+ }
+
+ if (history_flush & ~PIPE_CONTROL_CS_STALL) {
+ for (int i = 0; i < ice->batch_count; i++) {
+ struct crocus_batch *batch = &ice->batches[i];
+
+ if (!batch->command.bo)
+ continue;
+ if (batch->contains_draw || batch->cache.render->entries) {
+ crocus_batch_maybe_flush(batch, 24);
+ crocus_emit_pipe_control_flush(batch,
+ "cache history: transfer flush",
+ history_flush);
+ }
+ }
+ }
+
+ /* Make sure we flag constants dirty even if there's no need to emit
+ * any PIPE_CONTROLs to a batch.
+ */
+ crocus_dirty_for_history(ice, res);
+}
+
+static void
+crocus_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *xfer)
+{
+ struct crocus_context *ice = (struct crocus_context *)ctx;
+ struct crocus_transfer *map = (void *) xfer;
+
+ if (!(xfer->usage & (PIPE_MAP_FLUSH_EXPLICIT |
+ PIPE_MAP_COHERENT))) {
+ struct pipe_box flush_box = {
+ .x = 0, .y = 0, .z = 0,
+ .width = xfer->box.width,
+ .height = xfer->box.height,
+ .depth = xfer->box.depth,
+ };
+ crocus_transfer_flush_region(ctx, xfer, &flush_box);
+ }
+
+ if (map->unmap)
+ map->unmap(map);
+
+ pipe_resource_reference(&xfer->resource, NULL);
+ slab_free(&ice->transfer_pool, map);
+}
+
+/**
+ * Mark state dirty that needs to be re-emitted when a resource is written.
+ */
+void
+crocus_dirty_for_history(struct crocus_context *ice,
+ struct crocus_resource *res)
+{
+ uint64_t stage_dirty = 0ull;
+
+ if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
+ stage_dirty |= ((uint64_t)res->bind_stages) << CROCUS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS;
+ }
+
+ ice->state.stage_dirty |= stage_dirty;
+}
+
+/**
+ * Produce a set of PIPE_CONTROL bits which ensure data written to a
+ * resource becomes visible, and any stale read cache data is invalidated.
+ */
+uint32_t
+crocus_flush_bits_for_history(struct crocus_resource *res)
+{
+ uint32_t flush = PIPE_CONTROL_CS_STALL;
+
+ if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
+ flush |= PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+ }
+
+ if (res->bind_history & PIPE_BIND_SAMPLER_VIEW)
+ flush |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+
+ if (res->bind_history & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER))
+ flush |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+
+ if (res->bind_history & (PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE))
+ flush |= PIPE_CONTROL_DATA_CACHE_FLUSH;
+
+ return flush;
+}
+
+void
+crocus_flush_and_dirty_for_history(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_resource *res,
+ uint32_t extra_flags,
+ const char *reason)
+{
+ if (res->base.target != PIPE_BUFFER)
+ return;
+
+ uint32_t flush = crocus_flush_bits_for_history(res) | extra_flags;
+
+ crocus_emit_pipe_control_flush(batch, reason, flush);
+
+ crocus_dirty_for_history(ice, res);
+}
+
+bool
+crocus_resource_set_clear_color(struct crocus_context *ice,
+ struct crocus_resource *res,
+ union isl_color_value color)
+{
+ if (memcmp(&res->aux.clear_color, &color, sizeof(color)) != 0) {
+ res->aux.clear_color = color;
+ return true;
+ }
+
+ return false;
+}
+
+union isl_color_value
+crocus_resource_get_clear_color(const struct crocus_resource *res)
+{
+ assert(res->aux.bo);
+
+ return res->aux.clear_color;
+}
+
+static enum pipe_format
+crocus_resource_get_internal_format(struct pipe_resource *p_res)
+{
+ struct crocus_resource *res = (void *) p_res;
+ return res->internal_format;
+}
+
+static const struct u_transfer_vtbl transfer_vtbl = {
+ .resource_create = crocus_resource_create,
+ .resource_destroy = crocus_resource_destroy,
+ .transfer_map = crocus_transfer_map,
+ .transfer_unmap = crocus_transfer_unmap,
+ .transfer_flush_region = crocus_transfer_flush_region,
+ .get_internal_format = crocus_resource_get_internal_format,
+ .set_stencil = crocus_resource_set_separate_stencil,
+ .get_stencil = crocus_resource_get_separate_stencil,
+};
+
+static bool
+crocus_is_dmabuf_modifier_supported(struct pipe_screen *pscreen,
+ uint64_t modifier, enum pipe_format pfmt,
+ bool *external_only)
+{
+ struct crocus_screen *screen = (void *) pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (modifier_is_supported(devinfo, pfmt, modifier)) {
+ if (external_only)
+ *external_only = false;
+
+ return true;
+ }
+
+ return false;
+}
+
+static unsigned int
+crocus_get_dmabuf_modifier_planes(struct pipe_screen *pscreen, uint64_t modifier,
+ enum pipe_format format)
+{
+ return util_format_get_num_planes(format);
+}
+
+void
+crocus_init_screen_resource_functions(struct pipe_screen *pscreen)
+{
+ struct crocus_screen *screen = (void *) pscreen;
+ pscreen->query_dmabuf_modifiers = crocus_query_dmabuf_modifiers;
+ pscreen->is_dmabuf_modifier_supported = crocus_is_dmabuf_modifier_supported;
+ pscreen->get_dmabuf_modifier_planes = crocus_get_dmabuf_modifier_planes;
+ pscreen->resource_create_with_modifiers =
+ crocus_resource_create_with_modifiers;
+ pscreen->resource_create = u_transfer_helper_resource_create;
+ pscreen->resource_from_user_memory = crocus_resource_from_user_memory;
+ pscreen->resource_from_handle = crocus_resource_from_handle;
+ pscreen->resource_get_handle = crocus_resource_get_handle;
+ pscreen->resource_get_param = crocus_resource_get_param;
+ pscreen->resource_destroy = u_transfer_helper_resource_destroy;
+ pscreen->transfer_helper =
+ u_transfer_helper_create(&transfer_vtbl, screen->devinfo.ver >= 6,
+ screen->devinfo.ver >= 6, false, true);
+}
+
+void
+crocus_init_resource_functions(struct pipe_context *ctx)
+{
+ ctx->flush_resource = crocus_flush_resource;
+ ctx->invalidate_resource = crocus_invalidate_resource;
+ ctx->buffer_map = u_transfer_helper_transfer_map;
+ ctx->texture_map = u_transfer_helper_transfer_map;
+ ctx->transfer_flush_region = u_transfer_helper_transfer_flush_region;
+ ctx->buffer_unmap = u_transfer_helper_transfer_unmap;
+ ctx->texture_unmap = u_transfer_helper_transfer_unmap;
+ ctx->buffer_subdata = u_default_buffer_subdata;
+ ctx->texture_subdata = u_default_texture_subdata;
+}
diff --git a/src/gallium/drivers/crocus/crocus_resource.h b/src/gallium/drivers/crocus/crocus_resource.h
new file mode 100644
index 00000000000..8eb49118f54
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_resource.h
@@ -0,0 +1,501 @@
+/*
+ * Copyright 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_RESOURCE_H
+#define CROCUS_RESOURCE_H
+
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_range.h"
+#include "intel/isl/isl.h"
+
+#include "crocus_bufmgr.h"
+
+struct crocus_batch;
+struct crocus_context;
+
+#define CROCUS_MAX_MIPLEVELS 15
+
+struct crocus_format_info {
+ enum isl_format fmt;
+ enum pipe_swizzle swizzles[4];
+};
+
+static inline enum isl_channel_select
+pipe_to_isl_swizzle(const enum pipe_swizzle pswz, bool green_to_blue)
+{
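+   /* PIPE_SWIZZLE_X..W are 0..3 while ISL_CHANNEL_SELECT_RED..ALPHA are 4..7,
+    * so adding 4 maps the component selects directly; PIPE_SWIZZLE_0 (4) and
+    * PIPE_SWIZZLE_1 (5) wrap around to ISL_CHANNEL_SELECT_ZERO/ONE (0/1).
+    */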
+ unsigned swz = (pswz + 4) & 7;
+
+ return (green_to_blue && swz == ISL_CHANNEL_SELECT_GREEN) ? ISL_CHANNEL_SELECT_BLUE : swz;
+}
+
+static inline struct isl_swizzle
+pipe_to_isl_swizzles(const enum pipe_swizzle pswz[4])
+{
+ struct isl_swizzle swz;
+ swz.r = pipe_to_isl_swizzle(pswz[0], false);
+ swz.g = pipe_to_isl_swizzle(pswz[1], false);
+ swz.b = pipe_to_isl_swizzle(pswz[2], false);
+ swz.a = pipe_to_isl_swizzle(pswz[3], false);
+ return swz;
+}
+
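+/**
+ * Compose a view swizzle (vswz) on top of a format swizzle (fswz).
+ *
+ * For example, a format swizzle of (R, G, B, 1) combined with a view
+ * swizzle of (Z, Y, X, W) produces (B, G, R, 1).
+ */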
+static inline void
+crocus_combine_swizzle(enum pipe_swizzle outswz[4],
+ const enum pipe_swizzle fswz[4],
+ const enum pipe_swizzle vswz[4])
+{
+ for (unsigned i = 0; i < 4; i++) {
+ switch (vswz[i]) {
+ case PIPE_SWIZZLE_X: outswz[i] = fswz[0]; break;
+ case PIPE_SWIZZLE_Y: outswz[i] = fswz[1]; break;
+ case PIPE_SWIZZLE_Z: outswz[i] = fswz[2]; break;
+ case PIPE_SWIZZLE_W: outswz[i] = fswz[3]; break;
+ case PIPE_SWIZZLE_1: outswz[i] = PIPE_SWIZZLE_1; break;
+ case PIPE_SWIZZLE_0: outswz[i] = PIPE_SWIZZLE_0; break;
+ default: unreachable("invalid swizzle");
+ }
+ }
+}
+
+/**
+ * Resources represent a GPU buffer object or image (mipmap tree).
+ *
+ * They contain the storage (BO) and layout information (ISL surface).
+ */
+struct crocus_resource {
+ struct pipe_resource base;
+ enum pipe_format internal_format;
+
+ /**
+ * The ISL surface layout information for this resource.
+ *
+ * This is not filled out for PIPE_BUFFER resources, but is guaranteed
+ * to be zeroed. Note that this also guarantees that res->surf.tiling
+ * will be ISL_TILING_LINEAR, so it's safe to check that.
+ */
+ struct isl_surf surf;
+
+ /** Backing storage for the resource */
+ struct crocus_bo *bo;
+
+ /** offset at which data starts in the BO */
+ uint64_t offset;
+
+ /**
+ * A bitfield of PIPE_BIND_* indicating how this resource was bound
+ * in the past. Only meaningful for PIPE_BUFFER; used for flushing.
+ */
+ unsigned bind_history;
+
+ /**
+ * A bitfield of MESA_SHADER_* stages indicating where this resource
+ * was bound.
+ */
+ unsigned bind_stages;
+
+ /**
+ * For PIPE_BUFFER resources, a range which may contain valid data.
+ *
+ * This is a conservative estimate of what part of the buffer contains
+ * valid data that we have to preserve. The rest of the buffer is
+ * considered invalid, and we can promote writes to that region to
+ * be unsynchronized writes, avoiding blit copies.
+ */
+ struct util_range valid_buffer_range;
+
+ /**
+ * Auxiliary buffer information (CCS, MCS, or HiZ).
+ */
+ struct {
+ /** The surface layout for the auxiliary buffer. */
+ struct isl_surf surf;
+
+ /** The buffer object containing the auxiliary data. */
+ struct crocus_bo *bo;
+
+ /** Offset into 'bo' where the auxiliary surface starts. */
+ uint32_t offset;
+
+ struct {
+ struct isl_surf surf;
+
+ /** Offset into 'bo' where the auxiliary surface starts. */
+ uint32_t offset;
+ } extra_aux;
+
+ /**
+ * Fast clear color for this surface. For depth surfaces, the clear
+ * value is stored as a float32 in the red component.
+ */
+ union isl_color_value clear_color;
+
+ /**
+ * \brief The type of auxiliary compression used by this resource.
+ *
+ * This describes the type of auxiliary compression that is intended to
+ * be used by this resource. An aux usage of ISL_AUX_USAGE_NONE means
+ * that auxiliary compression is permanently disabled. An aux usage
+ * other than ISL_AUX_USAGE_NONE does not imply that auxiliary
+ * compression will always be enabled for this surface.
+ */
+ enum isl_aux_usage usage;
+
+ /**
+ * \brief Maps miptree slices to their current aux state.
+ *
+ * This two-dimensional array is indexed as [level][layer] and stores an
+ * aux state for each slice.
+ */
+ enum isl_aux_state **state;
+
+ /**
+ * If (1 << level) is set, HiZ is enabled for that miplevel.
+ */
+ uint16_t has_hiz;
+ } aux;
+
+ /**
+   * \brief Shadow miptree for sampling when HW can't sample the main surface.
+ *
+ * To workaround various sampler bugs and limitations, we blit the main
+ * texture into a new texture that can be sampled.
+ *
+ * This miptree may be used for:
+ * - Stencil texturing (pre-BDW) as required by GL_ARB_stencil_texturing.
+ */
+ struct crocus_resource *shadow;
+ bool shadow_needs_update;
+
+ /**
+   * For external surfaces, this is the format that was used to create or import
+ * the surface. For internal surfaces, this will always be
+ * PIPE_FORMAT_NONE.
+ */
+ enum pipe_format external_format;
+
+ /**
+   * For external surfaces, this is the DRM format modifier that was used to
+ * create or import the surface. For internal surfaces, this will always
+ * be DRM_FORMAT_MOD_INVALID.
+ */
+ const struct isl_drm_modifier_info *mod_info;
+
+ /**
+ * The screen the resource was originally created with, stored for refcounting.
+ */
+ struct pipe_screen *orig_screen;
+};
+
+/**
+ * A simple <resource, offset> tuple for storing a reference to a
+ * piece of state stored in a GPU buffer object.
+ */
+struct crocus_state_ref {
+ struct pipe_resource *res;
+ uint32_t offset;
+};
+
+/**
+ * Gallium CSO for sampler views (texture views).
+ *
+ * In addition to the normal pipe_resource, this adds an ISL view
+ * which may reinterpret the format or restrict levels/layers.
+ *
+ * These can also be linear texture buffers.
+ */
+struct crocus_sampler_view {
+ struct pipe_sampler_view base;
+ struct isl_view view;
+ struct isl_view gather_view;
+
+ enum pipe_swizzle swizzle[4];
+ union isl_color_value clear_color;
+
+ /* A short-cut (not a reference) to the actual resource being viewed.
+ * Multi-planar (or depth+stencil) images may have multiple resources
+ * chained together; this skips having to traverse base->texture->*.
+ */
+ struct crocus_resource *res;
+};
+
+/**
+ * Image view representation.
+ */
+struct crocus_image_view {
+ struct pipe_image_view base;
+ struct isl_view view;
+};
+
+/**
+ * Gallium CSO for surfaces (framebuffer attachments).
+ *
+ * A view of a surface that can be bound to a color render target or
+ * depth/stencil attachment.
+ */
+struct crocus_surface {
+ struct pipe_surface base;
+ struct isl_view view;
+ struct isl_view read_view;
+ struct isl_surf surf;
+ union isl_color_value clear_color;
+
+ struct pipe_resource *align_res;
+};
+
+/**
+ * Transfer object - information about a buffer mapping.
+ */
+struct crocus_transfer {
+ struct pipe_transfer base;
+ struct pipe_debug_callback *dbg;
+ void *buffer;
+ void *ptr;
+
+ /** A linear staging resource for GPU-based copy_region transfers. */
+ struct pipe_resource *staging;
+ struct blorp_context *blorp;
+ struct crocus_batch *batch;
+
+ bool dest_had_defined_contents;
+ bool has_swizzling;
+
+ void (*unmap)(struct crocus_transfer *);
+};
+
+/**
+ * Unwrap a pipe_resource to get the underlying crocus_bo (for convenience).
+ */
+static inline struct crocus_bo *
+crocus_resource_bo(struct pipe_resource *p_res)
+{
+ struct crocus_resource *res = (void *) p_res;
+ return res->bo;
+}
+
+static inline uint32_t
+crocus_mocs(const struct crocus_bo *bo,
+ const struct isl_device *dev)
+{
+ return isl_mocs(dev, 0, bo && crocus_bo_is_external(bo));
+}
+
+struct crocus_format_info crocus_format_for_usage(const struct intel_device_info *,
+ enum pipe_format pf,
+ isl_surf_usage_flags_t usage);
+
+struct pipe_resource *crocus_resource_get_separate_stencil(struct pipe_resource *);
+
+void crocus_get_depth_stencil_resources(const struct intel_device_info *devinfo,
+ struct pipe_resource *res,
+ struct crocus_resource **out_z,
+ struct crocus_resource **out_s);
+bool crocus_resource_set_clear_color(struct crocus_context *ice,
+ struct crocus_resource *res,
+ union isl_color_value color);
+union isl_color_value
+crocus_resource_get_clear_color(const struct crocus_resource *res);
+
+void crocus_init_screen_resource_functions(struct pipe_screen *pscreen);
+
+void crocus_dirty_for_history(struct crocus_context *ice,
+ struct crocus_resource *res);
+uint32_t crocus_flush_bits_for_history(struct crocus_resource *res);
+
+void crocus_flush_and_dirty_for_history(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_resource *res,
+ uint32_t extra_flags,
+ const char *reason);
+
+unsigned crocus_get_num_logical_layers(const struct crocus_resource *res,
+ unsigned level);
+
+void crocus_resource_disable_aux(struct crocus_resource *res);
+
+#define INTEL_REMAINING_LAYERS UINT32_MAX
+#define INTEL_REMAINING_LEVELS UINT32_MAX
+
+void
+crocus_hiz_exec(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_resource *res,
+ unsigned int level, unsigned int start_layer,
+ unsigned int num_layers, enum isl_aux_op op,
+ bool update_clear_depth);
+
+/**
+ * Prepare a miptree for access
+ *
+ * This function should be called prior to any access to miptree in order to
+ * perform any needed resolves.
+ *
+ * \param[in] start_level The first mip level to be accessed
+ *
+ * \param[in] num_levels The number of miplevels to be accessed or
+ * INTEL_REMAINING_LEVELS to indicate every level
+ * above start_level will be accessed
+ *
+ * \param[in] start_layer The first array slice or 3D layer to be accessed
+ *
+ * \param[in]  num_layers       The number of array slices or 3D layers to be
+ * accessed or INTEL_REMAINING_LAYERS to indicate
+ * every layer above start_layer will be accessed
+ *
+ * \param[in]  aux_usage        The auxiliary compression usage with which
+ *                              the access will be done; use
+ *                              ISL_AUX_USAGE_NONE for accesses that cannot
+ *                              handle compressed data
+ *
+ * \param[in] fast_clear_supported Whether or not the access will support
+ * fast clears in the miptree's auxiliary
+ * compression format
+ */
+void
+crocus_resource_prepare_access(struct crocus_context *ice,
+ struct crocus_resource *res,
+ uint32_t start_level, uint32_t num_levels,
+ uint32_t start_layer, uint32_t num_layers,
+ enum isl_aux_usage aux_usage,
+ bool fast_clear_supported);
+
+/**
+ * Complete a write operation
+ *
+ * This function should be called after any operation writes to a miptree.
+ * This will update the miptree's compression state so that future resolves
+ * happen correctly. Technically, this function can be called before the
+ * write occurs but the caller must ensure that they don't interlace
+ * crocus_resource_prepare_access and crocus_resource_finish_write calls to
+ * overlapping layer/level ranges.
+ *
+ * \param[in] level The mip level that was written
+ *
+ * \param[in] start_layer The first array slice or 3D layer written
+ *
+ * \param[in] num_layers The number of array slices or 3D layers
+ * written or INTEL_REMAINING_LAYERS to indicate
+ * every layer above start_layer was written
+ *
+ * \param[in] written_with_aux Whether or not the write was done with
+ * auxiliary compression enabled
+ */
+void
+crocus_resource_finish_write(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t num_layers,
+ enum isl_aux_usage aux_usage);
+
+/** Get the auxiliary compression state of a miptree slice */
+enum isl_aux_state
+crocus_resource_get_aux_state(const struct crocus_resource *res,
+ uint32_t level, uint32_t layer);
+
+/**
+ * Set the auxiliary compression state of a miptree slice range
+ *
+ * This function directly sets the auxiliary compression state of a slice
+ * range of a miptree. It only modifies data structures and does not do any
+ * resolves. This should only be called by code which directly performs
+ * compression operations such as fast clears and resolves. Most code should
+ * use crocus_resource_prepare_access or crocus_resource_finish_write.
+ */
+void
+crocus_resource_set_aux_state(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t num_layers,
+ enum isl_aux_state aux_state);
+
+/**
+ * Prepare a miptree for raw access
+ *
+ * This helper prepares the miptree for access that knows nothing about any
+ * sort of compression whatsoever. This is useful when mapping the surface or
+ * using it with the blitter.
+ */
+static inline void
+crocus_resource_access_raw(struct crocus_context *ice,
+ struct crocus_resource *res,
+ uint32_t level, uint32_t layer,
+ uint32_t num_layers,
+ bool write)
+{
+ crocus_resource_prepare_access(ice, res, level, 1, layer, num_layers,
+ ISL_AUX_USAGE_NONE, false);
+ if (write) {
+ crocus_resource_finish_write(ice, res, level, layer, num_layers,
+ ISL_AUX_USAGE_NONE);
+ }
+}
+
+void
+crocus_resource_get_image_offset(struct crocus_resource *res,
+ uint32_t level, uint32_t z,
+ uint32_t *x, uint32_t *y);
+static inline enum isl_aux_usage
+crocus_resource_texture_aux_usage(const struct crocus_resource *res)
+{
+ return res->aux.usage == ISL_AUX_USAGE_MCS ? ISL_AUX_USAGE_MCS : ISL_AUX_USAGE_NONE;
+}
+
+void crocus_resource_prepare_texture(struct crocus_context *ice,
+ struct crocus_resource *res,
+ enum isl_format view_format,
+ uint32_t start_level, uint32_t num_levels,
+ uint32_t start_layer, uint32_t num_layers);
+
+static inline bool
+crocus_resource_unfinished_aux_import(struct crocus_resource *res)
+{
+ return res->base.next != NULL && res->mod_info &&
+ res->mod_info->aux_usage != ISL_AUX_USAGE_NONE;
+}
+
+void crocus_resource_finish_aux_import(struct pipe_screen *pscreen,
+ struct crocus_resource *res);
+
+bool crocus_has_invalid_primary(const struct crocus_resource *res,
+ unsigned start_level, unsigned num_levels,
+ unsigned start_layer, unsigned num_layers);
+
+void crocus_resource_check_level_layer(const struct crocus_resource *res,
+ uint32_t level, uint32_t layer);
+
+bool crocus_resource_level_has_hiz(const struct crocus_resource *res,
+ uint32_t level);
+bool crocus_has_color_unresolved(const struct crocus_resource *res,
+ unsigned start_level, unsigned num_levels,
+ unsigned start_layer, unsigned num_layers);
+
+enum isl_aux_usage crocus_resource_render_aux_usage(struct crocus_context *ice,
+ struct crocus_resource *res,
+ enum isl_format render_fmt,
+ bool blend_enabled,
+ bool draw_aux_disabled);
+void crocus_resource_prepare_render(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t layer_count,
+ enum isl_aux_usage aux_usage);
+void crocus_resource_finish_render(struct crocus_context *ice,
+ struct crocus_resource *res, uint32_t level,
+ uint32_t start_layer, uint32_t layer_count,
+ enum isl_aux_usage aux_usage);
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_screen.c b/src/gallium/drivers/crocus/crocus_screen.c
new file mode 100644
index 00000000000..d5331d66730
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_screen.c
@@ -0,0 +1,829 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_screen.c
+ *
+ * Screen related driver hooks and capability lists.
+ *
+ * A program may use multiple rendering contexts (crocus_context), but
+ * they all share a common screen (crocus_screen). Global driver state
+ * can be stored in the screen; it may be accessed by multiple threads.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/debug.h"
+#include "util/u_inlines.h"
+#include "util/format/u_format.h"
+#include "util/u_transfer_helper.h"
+#include "util/u_upload_mgr.h"
+#include "util/ralloc.h"
+#include "util/xmlconfig.h"
+#include "drm-uapi/i915_drm.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "crocus_fence.h"
+#include "crocus_pipe.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/common/intel_gem.h"
+#include "intel/common/intel_l3_config.h"
+#include "crocus_monitor.h"
+
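+/* Dispatch a per-generation function based on the device's verx10.  For
+ * example, genX_call(&screen->devinfo, init_screen_state, screen) expands to
+ * a call to gfx6_init_screen_state(screen) on Sandy Bridge (verx10 == 60).
+ */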
+#define genX_call(devinfo, func, ...) \
+ switch ((devinfo)->verx10) { \
+ case 75: \
+ gfx75_##func(__VA_ARGS__); \
+ break; \
+ case 70: \
+ gfx7_##func(__VA_ARGS__); \
+ break; \
+ case 60: \
+ gfx6_##func(__VA_ARGS__); \
+ break; \
+ case 50: \
+ gfx5_##func(__VA_ARGS__); \
+ break; \
+ case 45: \
+ gfx45_##func(__VA_ARGS__); \
+ break; \
+ case 40: \
+ gfx4_##func(__VA_ARGS__); \
+ break; \
+ default: \
+ unreachable("Unknown hardware generation"); \
+ }
+
+static void
+crocus_flush_frontbuffer(struct pipe_screen *_screen,
+ struct pipe_context *_pipe,
+ struct pipe_resource *resource,
+ unsigned level, unsigned layer,
+ void *context_private, struct pipe_box *box)
+{
+}
+
+static const char *
+crocus_get_vendor(struct pipe_screen *pscreen)
+{
+ return "Intel";
+}
+
+static const char *
+crocus_get_device_vendor(struct pipe_screen *pscreen)
+{
+ return "Intel";
+}
+
+static const char *
+crocus_get_name(struct pipe_screen *pscreen)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ static char buf[128];
+
+ const char *name = intel_get_device_name(screen->pci_id);
+
+ if (!name)
+ name = "Intel Unknown";
+
+ snprintf(buf, sizeof(buf), "Mesa %s", name);
+ return buf;
+}
+
+static uint64_t
+get_aperture_size(int fd)
+{
+ struct drm_i915_gem_get_aperture aperture = {};
+ intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
+ return aperture.aper_size;
+}
+
+static int
+crocus_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ switch (param) {
+ case PIPE_CAP_NPOT_TEXTURES:
+ case PIPE_CAP_ANISOTROPIC_FILTER:
+ case PIPE_CAP_POINT_SPRITE:
+ case PIPE_CAP_OCCLUSION_QUERY:
+ case PIPE_CAP_TEXTURE_SWIZZLE:
+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+ case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD:
+ case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES:
+ case PIPE_CAP_VERTEX_SHADER_SATURATE:
+ case PIPE_CAP_PRIMITIVE_RESTART:
+ case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX:
+ case PIPE_CAP_INDEP_BLEND_ENABLE:
+ case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+ case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+ case PIPE_CAP_DEPTH_CLIP_DISABLE:
+ case PIPE_CAP_TGSI_INSTANCEID:
+ case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+ case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+ case PIPE_CAP_SEAMLESS_CUBE_MAP:
+ case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+ case PIPE_CAP_CONDITIONAL_RENDER:
+ case PIPE_CAP_TEXTURE_BARRIER:
+ case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+ case PIPE_CAP_START_INSTANCE:
+ case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+ case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+ case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+ case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
+ case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+ case PIPE_CAP_ACCELERATED:
+ case PIPE_CAP_UMA:
+ case PIPE_CAP_CLIP_HALFZ:
+ case PIPE_CAP_TGSI_TEXCOORD:
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+ case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+ case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
+ case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+ case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+ case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+ case PIPE_CAP_TGSI_TEX_TXF_LZ:
+ case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+ case PIPE_CAP_CLEAR_TEXTURE:
+ case PIPE_CAP_TGSI_VOTE:
+ case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
+ case PIPE_CAP_TEXTURE_GATHER_SM5:
+ case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
+ case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
+ case PIPE_CAP_NIR_COMPACT_ARRAYS:
+ case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+ case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+ case PIPE_CAP_INVALIDATE_BUFFER:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_CS_DERIVED_SYSTEM_VALUES_SUPPORTED:
+ case PIPE_CAP_FENCE_SIGNAL:
+ case PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION:
+ return true;
+ case PIPE_CAP_INT64:
+ case PIPE_CAP_INT64_DIVMOD:
+ case PIPE_CAP_TGSI_BALLOT:
+ case PIPE_CAP_PACKED_UNIFORMS:
+ case PIPE_CAP_GL_CLAMP:
+ return false;
+ case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+ return devinfo->ver <= 5;
+ case PIPE_CAP_TEXTURE_QUERY_LOD:
+ case PIPE_CAP_QUERY_TIME_ELAPSED:
+ return devinfo->ver >= 5;
+ case PIPE_CAP_DRAW_INDIRECT:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+ case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+ case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+ case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
+ case PIPE_CAP_TGSI_CLOCK:
+ case PIPE_CAP_TGSI_TXQS:
+ case PIPE_CAP_COMPUTE:
+ case PIPE_CAP_SAMPLER_VIEW_TARGET:
+ case PIPE_CAP_SHADER_SAMPLES_IDENTICAL:
+ case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+ case PIPE_CAP_GL_SPIRV:
+ case PIPE_CAP_GL_SPIRV_VARIABLE_POINTERS:
+ case PIPE_CAP_COMPUTE_SHADER_DERIVATIVES:
+ case PIPE_CAP_DOUBLES:
+ return devinfo->ver >= 7;
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+ return devinfo->is_haswell;
+ case PIPE_CAP_CULL_DISTANCE:
+ case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE:
+ case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+ case PIPE_CAP_SAMPLE_SHADING:
+ case PIPE_CAP_CUBE_MAP_ARRAY:
+ case PIPE_CAP_QUERY_SO_OVERFLOW:
+ case PIPE_CAP_TEXTURE_MULTISAMPLE:
+ case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+ case PIPE_CAP_QUERY_TIMESTAMP:
+ case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+ case PIPE_CAP_INDEP_BLEND_FUNC:
+ case PIPE_CAP_TEXTURE_SHADOW_LOD:
+ case PIPE_CAP_LOAD_CONSTBUF:
+ case PIPE_CAP_DRAW_PARAMETERS:
+ case PIPE_CAP_CLEAR_SCISSORED:
+ return devinfo->ver >= 6;
+ case PIPE_CAP_FBFETCH:
+ return devinfo->verx10 >= 45 ? BRW_MAX_DRAW_BUFFERS : 0;
+ case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+ return devinfo->ver >= 6 ? 1 : 0;
+ case PIPE_CAP_MAX_RENDER_TARGETS:
+ return BRW_MAX_DRAW_BUFFERS;
+ case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
+ if (devinfo->ver >= 7)
+ return 16384;
+ else
+ return 8192;
+ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+ if (devinfo->ver >= 7)
+ return CROCUS_MAX_MIPLEVELS; /* 16384x16384 */
+ else
+ return CROCUS_MAX_MIPLEVELS - 1; /* 8192x8192 */
+ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+ return 12; /* 2048x2048 */
+ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+ return (devinfo->ver >= 6) ? 4 : 0;
+ case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+ return devinfo->ver >= 7 ? 2048 : 512;
+ case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+ return BRW_MAX_SOL_BINDINGS / CROCUS_MAX_SOL_BUFFERS;
+ case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+ return BRW_MAX_SOL_BINDINGS;
+ case PIPE_CAP_GLSL_FEATURE_LEVEL: {
+ if (devinfo->is_haswell)
+ return 460;
+ else if (devinfo->ver >= 7)
+ return 420;
+ else if (devinfo->ver >= 6)
+ return 330;
+ return 120;
+ }
+ case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+ return devinfo->ver < 6 ? 120 : 130;
+
+ case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+ /* 3DSTATE_CONSTANT_XS requires the start of UBOs to be 32B aligned */
+ return 32;
+ case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+ return CROCUS_MAP_BUFFER_ALIGNMENT;
+ case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+ /* Choose a cacheline (64 bytes) so that we can safely have the CPU and
+ * GPU writing the same SSBO on non-coherent systems (Atom CPUs). With
+ * UBOs, the GPU never writes, so there's no problem. For an SSBO, the
+ * GPU and the CPU can be updating disjoint regions of the buffer
+ * simultaneously and that will break if the regions overlap the same
+ * cacheline.
+ */
+ return devinfo->ver >= 7 ? 64 : 0;
+ case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
+ return devinfo->ver >= 7 ? (1 << 27) : 0;
+ case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+ return 16; // XXX: u_screen says 256 is the minimum value...
+ case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+ return true;
+ case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+ return CROCUS_MAX_TEXTURE_BUFFER_SIZE;
+ case PIPE_CAP_MAX_VIEWPORTS:
+ return devinfo->ver >= 6 ? 16 : 1;
+ case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
+ return devinfo->ver >= 6 ? 256 : 0;
+ case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
+ return devinfo->ver >= 6 ? 1024 : 0;
+ case PIPE_CAP_MAX_GS_INVOCATIONS:
+ return devinfo->ver >= 7 ? 32 : 1;
+ case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+ if (devinfo->ver >= 7)
+ return 4;
+ else if (devinfo->ver == 6)
+ return 1;
+ else
+ return 0;
+ case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+ if (devinfo->ver >= 7)
+ return -32;
+ else if (devinfo->ver == 6)
+ return -8;
+ else
+ return 0;
+ case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
+ if (devinfo->ver >= 7)
+ return 31;
+ else if (devinfo->ver == 6)
+ return 7;
+ else
+ return 0;
+ case PIPE_CAP_MAX_VERTEX_STREAMS:
+ return devinfo->ver >= 7 ? 4 : 1;
+ case PIPE_CAP_VENDOR_ID:
+ return 0x8086;
+ case PIPE_CAP_DEVICE_ID:
+ return screen->pci_id;
+ case PIPE_CAP_VIDEO_MEMORY: {
+ /* Once a batch uses more than 75% of the maximum mappable size, we
+ * assume that there's some fragmentation, and we start doing extra
+ * flushing, etc. That's the big cliff apps will care about.
+ */
+ const unsigned gpu_mappable_megabytes =
+ (screen->aperture_bytes * 3 / 4) / (1024 * 1024);
+
+ const long system_memory_pages = sysconf(_SC_PHYS_PAGES);
+ const long system_page_size = sysconf(_SC_PAGE_SIZE);
+
+ if (system_memory_pages <= 0 || system_page_size <= 0)
+ return -1;
+
+ const uint64_t system_memory_bytes =
+ (uint64_t) system_memory_pages * (uint64_t) system_page_size;
+
+ const unsigned system_memory_megabytes =
+ (unsigned) (system_memory_bytes / (1024 * 1024));
+
+ return MIN2(system_memory_megabytes, gpu_mappable_megabytes);
+ }
+ case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+ case PIPE_CAP_MAX_VARYINGS:
+ return (screen->devinfo.ver >= 6) ? 32 : 16;
+ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ /* AMD_pinned_memory assumes the flexibility of using client memory
+ * for any buffer (incl. vertex buffers) which rules out the prospect
+ * of using snooped buffers, as using snooped buffers without
+       * cognizance is likely to be detrimental to performance and require
+ * extensive checking in the driver for correctness, e.g. to prevent
+ * illegal snoop <-> snoop transfers.
+ */
+ return devinfo->has_llc;
+ case PIPE_CAP_THROTTLE:
+ return screen->driconf.disable_throttling ? 0 : 1;
+
+ case PIPE_CAP_CONTEXT_PRIORITY_MASK:
+ return PIPE_CONTEXT_PRIORITY_LOW |
+ PIPE_CONTEXT_PRIORITY_MEDIUM |
+ PIPE_CONTEXT_PRIORITY_HIGH;
+
+ case PIPE_CAP_FRONTEND_NOOP:
+ return true;
+ // XXX: don't hardcode 00:00:02.0 PCI here
+ case PIPE_CAP_PCI_GROUP:
+ return 0;
+ case PIPE_CAP_PCI_BUS:
+ return 0;
+ case PIPE_CAP_PCI_DEVICE:
+ return 2;
+ case PIPE_CAP_PCI_FUNCTION:
+ return 0;
+
+ default:
+ return u_pipe_screen_get_param_defaults(pscreen, param);
+ }
+ return 0;
+}
+
+static float
+crocus_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ switch (param) {
+ case PIPE_CAPF_MAX_LINE_WIDTH:
+ case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+ if (devinfo->ver >= 6)
+ return 7.375f;
+ else
+ return 7.0f;
+
+ case PIPE_CAPF_MAX_POINT_WIDTH:
+ case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+ return 255.0f;
+
+ case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+ return 16.0f;
+ case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+ return 15.0f;
+ case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+ case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+ case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+ return 0.0f;
+ default:
+ unreachable("unknown param");
+ }
+}
+
+static int
+crocus_get_shader_param(struct pipe_screen *pscreen,
+ enum pipe_shader_type p_stage,
+ enum pipe_shader_cap param)
+{
+ gl_shader_stage stage = stage_from_pipe(p_stage);
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ if (devinfo->ver < 6 &&
+ p_stage != PIPE_SHADER_VERTEX &&
+ p_stage != PIPE_SHADER_FRAGMENT)
+ return 0;
+
+ if (devinfo->ver == 6 &&
+ p_stage != PIPE_SHADER_VERTEX &&
+ p_stage != PIPE_SHADER_FRAGMENT &&
+ p_stage != PIPE_SHADER_GEOMETRY)
+ return 0;
+
+   /* this is probably not totally correct... but it's a start: */
+ switch (param) {
+ case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+ return stage == MESA_SHADER_FRAGMENT ? 1024 : 16384;
+ case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+ case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+ case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+ return stage == MESA_SHADER_FRAGMENT ? 1024 : 0;
+
+ case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+ return UINT_MAX;
+
+ case PIPE_SHADER_CAP_MAX_INPUTS:
+ if (stage == MESA_SHADER_VERTEX ||
+ stage == MESA_SHADER_GEOMETRY)
+ return 16; /* Gen7 vec4 geom backend */
+ return 32;
+ case PIPE_SHADER_CAP_MAX_OUTPUTS:
+ return 32;
+ case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
+ return 16 * 1024 * sizeof(float);
+ case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+ return devinfo->ver >= 6 ? 16 : 1;
+ case PIPE_SHADER_CAP_MAX_TEMPS:
+ return 256; /* GL_MAX_PROGRAM_TEMPORARIES_ARB */
+ case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+ return 0;
+ case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+ case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+ case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+ case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+ /* Lie about these to avoid st/mesa's GLSL IR lowering of indirects,
+ * which we don't want. Our compiler backend will check brw_compiler's
+ * options and call nir_lower_indirect_derefs appropriately anyway.
+ */
+ return true;
+ case PIPE_SHADER_CAP_SUBROUTINES:
+ return 0;
+ case PIPE_SHADER_CAP_INTEGERS:
+ return 1;
+ case PIPE_SHADER_CAP_INT64_ATOMICS:
+ case PIPE_SHADER_CAP_FP16:
+ return 0;
+ case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+ case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
+ return devinfo->is_haswell ? CROCUS_MAX_TEXTURE_SAMPLERS : 16;
+ case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+ if (devinfo->ver >= 7 &&
+ (p_stage == PIPE_SHADER_FRAGMENT ||
+ p_stage == PIPE_SHADER_COMPUTE))
+ return CROCUS_MAX_TEXTURE_SAMPLERS;
+ return 0;
+ case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+ return devinfo->ver >= 7 ? (CROCUS_MAX_ABOS + CROCUS_MAX_SSBOS) : 0;
+ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+ case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+ return 0;
+ case PIPE_SHADER_CAP_PREFERRED_IR:
+ return PIPE_SHADER_IR_NIR;
+ case PIPE_SHADER_CAP_SUPPORTED_IRS:
+ return 1 << PIPE_SHADER_IR_NIR;
+ case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
+ return 1;
+ case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+ case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+ case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+ case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+ case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+ case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+ case PIPE_SHADER_CAP_FP16_DERIVATIVES:
+ case PIPE_SHADER_CAP_INT16:
+ case PIPE_SHADER_CAP_GLSL_16BIT_CONSTS:
+ case PIPE_SHADER_CAP_FP16_CONST_BUFFERS:
+ return 0;
+ default:
+ unreachable("unknown shader param");
+ }
+}
+
+static int
+crocus_get_compute_param(struct pipe_screen *pscreen,
+ enum pipe_shader_ir ir_type,
+ enum pipe_compute_cap param,
+ void *ret)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ const unsigned max_threads = MIN2(64, devinfo->max_cs_threads);
+ const uint32_t max_invocations = 32 * max_threads;
+
+ if (devinfo->ver < 7)
+ return 0;
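+/* RET(x) copies the answer into *ret when the caller provided a buffer and
+ * returns the size of the answer in bytes, which is how get_compute_param
+ * reports how much data each capability produces.
+ */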
+#define RET(x) do { \
+ if (ret) \
+ memcpy(ret, x, sizeof(x)); \
+ return sizeof(x); \
+} while (0)
+
+ switch (param) {
+ case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+ RET((uint32_t []){ 32 });
+
+ case PIPE_COMPUTE_CAP_IR_TARGET:
+ if (ret)
+ strcpy(ret, "gen");
+ return 4;
+
+ case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+ RET((uint64_t []) { 3 });
+
+ case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+ RET(((uint64_t []) { 65535, 65535, 65535 }));
+
+ case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+ /* MaxComputeWorkGroupSize[0..2] */
+ RET(((uint64_t []) {max_invocations, max_invocations, max_invocations}));
+
+ case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+ /* MaxComputeWorkGroupInvocations */
+ RET((uint64_t []) { max_invocations });
+
+ case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+ /* MaxComputeSharedMemorySize */
+ RET((uint64_t []) { 64 * 1024 });
+
+ case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+ RET((uint32_t []) { 1 });
+
+ case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+ RET((uint32_t []) { BRW_SUBGROUP_SIZE });
+
+ case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+ RET((uint64_t []) { max_invocations });
+
+ case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+ case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+ case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+ case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+ case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
+ case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+
+ // XXX: I think these are for Clover...
+ return 0;
+
+ default:
+ unreachable("unknown compute param");
+ }
+}
+
+static uint64_t
+crocus_get_timestamp(struct pipe_screen *pscreen)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) pscreen;
+ const unsigned TIMESTAMP = 0x2358;
+ uint64_t result;
+
+ crocus_reg_read(screen->bufmgr, TIMESTAMP | 1, &result);
+
+ result = intel_device_info_timebase_scale(&screen->devinfo, result);
+ result &= (1ull << TIMESTAMP_BITS) - 1;
+
+ return result;
+}
+
+void
+crocus_screen_destroy(struct crocus_screen *screen)
+{
+ u_transfer_helper_destroy(screen->base.transfer_helper);
+ crocus_bufmgr_unref(screen->bufmgr);
+ disk_cache_destroy(screen->disk_cache);
+ close(screen->winsys_fd);
+ ralloc_free(screen);
+}
+
+static void
+crocus_screen_unref(struct pipe_screen *pscreen)
+{
+ crocus_pscreen_unref(pscreen);
+}
+
+static void
+crocus_query_memory_info(struct pipe_screen *pscreen,
+ struct pipe_memory_info *info)
+{
+}
+
+static const void *
+crocus_get_compiler_options(struct pipe_screen *pscreen,
+ enum pipe_shader_ir ir,
+ enum pipe_shader_type pstage)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) pscreen;
+ gl_shader_stage stage = stage_from_pipe(pstage);
+ assert(ir == PIPE_SHADER_IR_NIR);
+
+ return screen->compiler->glsl_compiler_options[stage].NirOptions;
+}
+
+static struct disk_cache *
+crocus_get_disk_shader_cache(struct pipe_screen *pscreen)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) pscreen;
+ return screen->disk_cache;
+}
+
+static const struct intel_l3_config *
+crocus_get_default_l3_config(const struct intel_device_info *devinfo,
+ bool compute)
+{
+ bool wants_dc_cache = true;
+ bool has_slm = compute;
+ const struct intel_l3_weights w =
+ intel_get_default_l3_weights(devinfo, wants_dc_cache, has_slm);
+ return intel_get_l3_config(devinfo, w);
+}
+
+static void
+crocus_shader_debug_log(void *data, const char *fmt, ...)
+{
+ struct pipe_debug_callback *dbg = data;
+ unsigned id = 0;
+ va_list args;
+
+ if (!dbg->debug_message)
+ return;
+
+ va_start(args, fmt);
+ dbg->debug_message(dbg->data, &id, PIPE_DEBUG_TYPE_SHADER_INFO, fmt, args);
+ va_end(args);
+}
+
+static void
+crocus_shader_perf_log(void *data, const char *fmt, ...)
+{
+ struct pipe_debug_callback *dbg = data;
+ unsigned id = 0;
+ va_list args;
+ va_start(args, fmt);
+
+ if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+ va_list args_copy;
+ va_copy(args_copy, args);
+ vfprintf(stderr, fmt, args_copy);
+ va_end(args_copy);
+ }
+
+ if (dbg->debug_message) {
+ dbg->debug_message(dbg->data, &id, PIPE_DEBUG_TYPE_PERF_INFO, fmt, args);
+ }
+
+ va_end(args);
+}
+
+static bool
+crocus_detect_swizzling(struct crocus_screen *screen)
+{
+ /* Broadwell PRM says:
+ *
+ * "Before Gen8, there was a historical configuration control field to
+ * swizzle address bit[6] for in X/Y tiling modes. This was set in three
+ * different places: TILECTL[1:0], ARB_MODE[5:4], and
+ * DISP_ARB_CTL[14:13].
+ *
+ * For Gen8 and subsequent generations, the swizzle fields are all
+ * reserved, and the CPU's memory controller performs all address
+ * swizzling modifications."
+ */
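+   /* Allocate a small X-tiled BO and query which bit-6 swizzle mode it was
+    * given; anything other than NONE means CPU mappings of tiled buffers see
+    * swizzled addresses and the driver must compensate.
+    */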
+ uint32_t tiling = I915_TILING_X;
+ uint32_t swizzle_mode = 0;
+ struct crocus_bo *buffer =
+ crocus_bo_alloc_tiled(screen->bufmgr, "swizzle test", 32768,
+ 0, tiling, 512, 0);
+ if (buffer == NULL)
+ return false;
+
+ crocus_bo_get_tiling(buffer, &tiling, &swizzle_mode);
+ crocus_bo_unreference(buffer);
+
+ return swizzle_mode != I915_BIT_6_SWIZZLE_NONE;
+}
+
+struct pipe_screen *
+crocus_screen_create(int fd, const struct pipe_screen_config *config)
+{
+ struct crocus_screen *screen = rzalloc(NULL, struct crocus_screen);
+ if (!screen)
+ return NULL;
+
+ if (!intel_get_device_info_from_fd(fd, &screen->devinfo))
+ return NULL;
+ screen->pci_id = screen->devinfo.chipset_id;
+ screen->no_hw = screen->devinfo.no_hw;
+
+ if (screen->devinfo.ver >= 8)
+ return NULL;
+
+ p_atomic_set(&screen->refcount, 1);
+
+ screen->aperture_bytes = get_aperture_size(fd);
+
+ if (getenv("INTEL_NO_HW") != NULL)
+ screen->no_hw = true;
+
+ bool bo_reuse = false;
+ int bo_reuse_mode = driQueryOptioni(config->options, "bo_reuse");
+ switch (bo_reuse_mode) {
+ case DRI_CONF_BO_REUSE_DISABLED:
+ break;
+ case DRI_CONF_BO_REUSE_ALL:
+ bo_reuse = true;
+ break;
+ }
+
+ screen->bufmgr = crocus_bufmgr_get_for_fd(&screen->devinfo, fd, bo_reuse);
+ if (!screen->bufmgr)
+ return NULL;
+ screen->fd = crocus_bufmgr_get_fd(screen->bufmgr);
+ screen->winsys_fd = fd;
+
+ screen->has_swizzling = crocus_detect_swizzling(screen);
+ brw_process_intel_debug_variable();
+
+ screen->driconf.dual_color_blend_by_location =
+ driQueryOptionb(config->options, "dual_color_blend_by_location");
+ screen->driconf.disable_throttling =
+ driQueryOptionb(config->options, "disable_throttling");
+ screen->driconf.always_flush_cache =
+ driQueryOptionb(config->options, "always_flush_cache");
+
+ screen->precompile = env_var_as_boolean("shader_precompile", true);
+
+ isl_device_init(&screen->isl_dev, &screen->devinfo,
+ screen->has_swizzling);
+
+ screen->compiler = brw_compiler_create(screen, &screen->devinfo);
+ screen->compiler->shader_debug_log = crocus_shader_debug_log;
+ screen->compiler->shader_perf_log = crocus_shader_perf_log;
+ screen->compiler->supports_pull_constants = false;
+ screen->compiler->supports_shader_constants = false;
+ screen->compiler->compact_params = false;
+ screen->compiler->constant_buffer_0_is_relative = true;
+
+ if (screen->devinfo.ver == 7) {
+ screen->l3_config_3d = crocus_get_default_l3_config(&screen->devinfo, false);
+ screen->l3_config_cs = crocus_get_default_l3_config(&screen->devinfo, true);
+ }
+
+ crocus_disk_cache_init(screen);
+
+ slab_create_parent(&screen->transfer_pool,
+ sizeof(struct crocus_transfer), 64);
+
+ screen->subslice_total = intel_device_info_subslice_total(&screen->devinfo);
+ assert(screen->subslice_total >= 1);
+
+ struct pipe_screen *pscreen = &screen->base;
+
+ crocus_init_screen_fence_functions(pscreen);
+ crocus_init_screen_resource_functions(pscreen);
+
+ pscreen->destroy = crocus_screen_unref;
+ pscreen->get_name = crocus_get_name;
+ pscreen->get_vendor = crocus_get_vendor;
+ pscreen->get_device_vendor = crocus_get_device_vendor;
+ pscreen->get_param = crocus_get_param;
+ pscreen->get_shader_param = crocus_get_shader_param;
+ pscreen->get_compute_param = crocus_get_compute_param;
+ pscreen->get_paramf = crocus_get_paramf;
+ pscreen->get_compiler_options = crocus_get_compiler_options;
+ pscreen->get_disk_shader_cache = crocus_get_disk_shader_cache;
+ pscreen->is_format_supported = crocus_is_format_supported;
+ pscreen->context_create = crocus_create_context;
+ pscreen->flush_frontbuffer = crocus_flush_frontbuffer;
+ pscreen->get_timestamp = crocus_get_timestamp;
+ pscreen->query_memory_info = crocus_query_memory_info;
+ pscreen->get_driver_query_group_info = crocus_get_monitor_group_info;
+ pscreen->get_driver_query_info = crocus_get_monitor_info;
+
+ genX_call(&screen->devinfo, init_screen_state, screen);
+ genX_call(&screen->devinfo, init_screen_query, screen);
+ return pscreen;
+}
diff --git a/src/gallium/drivers/crocus/crocus_screen.h b/src/gallium/drivers/crocus/crocus_screen.h
new file mode 100644
index 00000000000..4d942eb8415
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_screen.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_SCREEN_H
+#define CROCUS_SCREEN_H
+
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "frontend/drm_driver.h"
+#include "util/disk_cache.h"
+#include "util/slab.h"
+#include "util/u_screen.h"
+#include "intel/dev/intel_device_info.h"
+#include "intel/isl/isl.h"
+#include "crocus_bufmgr.h"
+#include "compiler/shader_enums.h"
+
+struct crocus_monitor_config;
+struct crocus_resource;
+struct crocus_context;
+struct crocus_sampler_state;
+struct brw_vue_map;
+struct brw_tcs_prog_key;
+struct brw_tes_prog_key;
+struct brw_cs_prog_key;
+struct brw_wm_prog_key;
+struct brw_vs_prog_key;
+struct brw_gs_prog_key;
+struct shader_info;
+
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+#define WRITE_ONCE(x, v) *(volatile __typeof__(x) *)&(x) = (v)
+
+#define CROCUS_MAX_TEXTURE_SAMPLERS 32
+#define CROCUS_MAX_SOL_BUFFERS 4
+#define CROCUS_MAP_BUFFER_ALIGNMENT 64
+
+
+/**
+ * Virtual table for generation-specific (genxml) function calls.
+ */
+struct crocus_vtable {
+ void (*destroy_state)(struct crocus_context *ice);
+ void (*init_render_context)(struct crocus_batch *batch);
+ void (*init_compute_context)(struct crocus_batch *batch);
+ void (*upload_render_state)(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ const struct pipe_draw_info *draw,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *sc);
+ void (*update_surface_base_address)(struct crocus_batch *batch);
+
+ void (*upload_compute_state)(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ const struct pipe_grid_info *grid);
+ void (*rebind_buffer)(struct crocus_context *ice,
+ struct crocus_resource *res);
+ void (*resolve_conditional_render)(struct crocus_context *ice);
+ void (*emit_compute_predicate)(struct crocus_batch *batch);
+ void (*load_register_reg32)(struct crocus_batch *batch, uint32_t dst,
+ uint32_t src);
+ void (*load_register_reg64)(struct crocus_batch *batch, uint32_t dst,
+ uint32_t src);
+ void (*load_register_imm32)(struct crocus_batch *batch, uint32_t reg,
+ uint32_t val);
+ void (*load_register_imm64)(struct crocus_batch *batch, uint32_t reg,
+ uint64_t val);
+ void (*load_register_mem32)(struct crocus_batch *batch, uint32_t reg,
+ struct crocus_bo *bo, uint32_t offset);
+ void (*load_register_mem64)(struct crocus_batch *batch, uint32_t reg,
+ struct crocus_bo *bo, uint32_t offset);
+ void (*store_register_mem32)(struct crocus_batch *batch, uint32_t reg,
+ struct crocus_bo *bo, uint32_t offset,
+ bool predicated);
+ void (*store_register_mem64)(struct crocus_batch *batch, uint32_t reg,
+ struct crocus_bo *bo, uint32_t offset,
+ bool predicated);
+ void (*store_data_imm32)(struct crocus_batch *batch,
+ struct crocus_bo *bo, uint32_t offset,
+ uint32_t value);
+ void (*store_data_imm64)(struct crocus_batch *batch,
+ struct crocus_bo *bo, uint32_t offset,
+ uint64_t value);
+ void (*copy_mem_mem)(struct crocus_batch *batch,
+ struct crocus_bo *dst_bo, uint32_t dst_offset,
+ struct crocus_bo *src_bo, uint32_t src_offset,
+ unsigned bytes);
+ void (*emit_raw_pipe_control)(struct crocus_batch *batch,
+ const char *reason, uint32_t flags,
+ struct crocus_bo *bo, uint32_t offset,
+ uint64_t imm);
+
+ void (*emit_mi_report_perf_count)(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ uint32_t offset_in_bytes,
+ uint32_t report_id);
+
+ uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol,
+ const struct brw_vue_map *vue_map);
+ void (*populate_vs_key)(const struct crocus_context *ice,
+ const struct shader_info *info,
+ gl_shader_stage last_stage,
+ struct brw_vs_prog_key *key);
+ void (*populate_tcs_key)(const struct crocus_context *ice,
+ struct brw_tcs_prog_key *key);
+ void (*populate_tes_key)(const struct crocus_context *ice,
+ const struct shader_info *info,
+ gl_shader_stage last_stage,
+ struct brw_tes_prog_key *key);
+ void (*populate_gs_key)(const struct crocus_context *ice,
+ const struct shader_info *info,
+ gl_shader_stage last_stage,
+ struct brw_gs_prog_key *key);
+ void (*populate_fs_key)(const struct crocus_context *ice,
+ const struct shader_info *info,
+ struct brw_wm_prog_key *key);
+ void (*populate_cs_key)(const struct crocus_context *ice,
+ struct brw_cs_prog_key *key);
+ void (*lost_genx_state)(struct crocus_context *ice, struct crocus_batch *batch);
+
+ void (*finish_batch)(struct crocus_batch *batch); /* haswell only */
+
+ void (*upload_urb_fence)(struct crocus_batch *batch); /* gen4/5 only */
+
+ bool (*blit_blt)(struct crocus_batch *batch,
+ const struct pipe_blit_info *info);
+ bool (*copy_region_blt)(struct crocus_batch *batch,
+ struct crocus_resource *dst,
+ unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct crocus_resource *src,
+ unsigned src_level,
+ const struct pipe_box *src_box);
+ bool (*calculate_urb_fence)(struct crocus_batch *batch, unsigned csize,
+ unsigned vsize, unsigned sfsize);
+ void (*batch_reset_dirty)(struct crocus_batch *batch);
+ unsigned (*translate_prim_type)(enum pipe_prim_type prim, uint8_t verts_per_patch);
+
+ void (*update_so_strides)(struct crocus_context *ice,
+ uint16_t *strides);
+
+ uint32_t (*get_so_offset)(struct pipe_stream_output_target *tgt);
+};
+
+struct crocus_screen {
+ struct pipe_screen base;
+
+ uint32_t refcount;
+
+ /** Global slab allocator for crocus_transfer_map objects */
+ struct slab_parent_pool transfer_pool;
+
+ /** drm device file descriptor, shared with bufmgr, do not close. */
+ int fd;
+
+ /**
+ * drm device file descriptor used for window system integration, owned
+ * by crocus_screen; it can be a different DRM instance than fd.
+ */
+ int winsys_fd;
+
+ /** PCI ID for our GPU device */
+ int pci_id;
+
+ bool no_hw;
+
+ struct crocus_vtable vtbl;
+
+ /** Global program_string_id counter (see get_program_string_id()) */
+ unsigned program_id;
+
+ /** Precompile shaders at link time? (Can be disabled for debugging.) */
+ bool precompile;
+
+ /** driconf options and application workarounds */
+ struct {
+ /** Dual color blend by location instead of index (for broken apps) */
+ bool dual_color_blend_by_location;
+ bool disable_throttling;
+ bool always_flush_cache;
+ } driconf;
+
+ unsigned subslice_total;
+
+ uint64_t aperture_bytes;
+
+ struct intel_device_info devinfo;
+ struct isl_device isl_dev;
+ struct crocus_bufmgr *bufmgr;
+ struct brw_compiler *compiler;
+ struct crocus_monitor_config *monitor_cfg;
+ bool has_swizzling;
+
+ const struct intel_l3_config *l3_config_3d;
+ const struct intel_l3_config *l3_config_cs;
+
+ struct disk_cache *disk_cache;
+};
+
+struct pipe_screen *
+crocus_screen_create(int fd, const struct pipe_screen_config *config);
+
+void crocus_screen_destroy(struct crocus_screen *screen);
+
+UNUSED static inline struct pipe_screen *
+crocus_pscreen_ref(struct pipe_screen *pscreen)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) pscreen;
+
+ p_atomic_inc(&screen->refcount);
+ return pscreen;
+}
+
+UNUSED static inline void
+crocus_pscreen_unref(struct pipe_screen *pscreen)
+{
+ struct crocus_screen *screen = (struct crocus_screen *) pscreen;
+
+ if (p_atomic_dec_zero(&screen->refcount))
+ crocus_screen_destroy(screen);
+}
+
+bool
+crocus_is_format_supported(struct pipe_screen *pscreen,
+ enum pipe_format format,
+ enum pipe_texture_target target,
+ unsigned sample_count,
+ unsigned storage_sample_count,
+ unsigned usage);
+
+void crocus_disk_cache_init(struct crocus_screen *screen);
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_state.c b/src/gallium/drivers/crocus/crocus_state.c
new file mode 100644
index 00000000000..7202140df02
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_state.c
@@ -0,0 +1,8382 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_state.c
+ *
+ * ============================= GENXML CODE =============================
+ * [This file is compiled once per generation.]
+ * =======================================================================
+ *
+ * This is the main state upload code.
+ *
+ * Gallium uses Constant State Objects, or CSOs, for most state. Large,
+ * complex, or highly reusable state can be created once, and bound and
+ * rebound multiple times. This is modeled with the pipe->create_*_state()
+ * and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is
+ * streamed out on the fly, via pipe->set_*_state() hooks.
+ *
+ * OpenGL involves frequently mutating context state, which is mirrored in
+ * core Mesa by highly mutable data structures. However, most applications
+ * typically draw the same things over and over - from frame to frame, most
+ * of the same objects are still visible and need to be redrawn. So, rather
+ * than inventing new state all the time, applications usually mutate to swap
+ * between known states that we've seen before.
+ *
+ * Gallium isolates us from this mutation by tracking API state, and
+ * distilling it into a set of Constant State Objects, or CSOs. Large,
+ * complex, or typically reusable state can be created once, then reused
+ * multiple times. Drivers can create and store their own associated data.
+ * This create/bind model corresponds to the pipe->create_*_state() and
+ * pipe->bind_*_state() driver hooks.
+ *
+ * Some state is cheap to create, or expected to be highly dynamic. Rather
+ * than creating and caching piles of CSOs for these, Gallium simply streams
+ * them out, via the pipe->set_*_state() driver hooks.
+ *
+ * To reduce draw time overhead, we try to compute as much state at create
+ * time as possible. Wherever possible, we translate the Gallium pipe state
+ * to 3DSTATE commands, and store those commands in the CSO. At draw time,
+ * we can simply memcpy them into a batch buffer.
+ *
+ * No hardware matches the abstraction perfectly, so some commands require
+ * information from multiple CSOs. In this case, we can store two copies
+ * of the packet (one in each CSO), and simply | together their DWords at
+ * draw time. Sometimes the second set is trivial (one or two fields), so
+ * we simply pack it at draw time.
+ *
+ * There are two main components in the file below. First, the CSO hooks
+ * create/bind/track state. The second are the draw-time upload functions,
+ * crocus_upload_render_state() and crocus_upload_compute_state(), which read
+ * the context state and emit the commands into the actual batch.
+ */
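+
+/*
+ * A rough sketch of that pattern (illustrative only: the real CSOs, macros,
+ * and dirty-flag handling appear further down in this file, and the packet
+ * and field choices here are arbitrary):
+ *
+ *    // create hook: translate the pipe state once and cache the DWords
+ *    struct example_cso { uint32_t wm[GENX(3DSTATE_WM_length)]; };
+ *    _crocus_pack_command(batch, GENX(3DSTATE_WM), cso->wm, wm) {
+ *       wm.StatisticsEnable = true;
+ *    }
+ *
+ *    // draw hook: OR the cached DWords with a second partial packet
+ *    uint32_t dws[GENX(3DSTATE_WM_length)];
+ *    for (int i = 0; i < GENX(3DSTATE_WM_length); i++)
+ *       dws[i] = cso->wm[i] | dynamic_wm[i];
+ *    crocus_batch_emit(batch, dws, sizeof(dws));
+ */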
+
+#include <errno.h>
+#include <stdio.h>
+
+#if HAVE_VALGRIND
+#include <memcheck.h>
+#include <valgrind.h>
+#define VG(x) x
+#ifdef DEBUG
+#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
+#endif
+#else
+#define VG(x)
+#endif
+
+#include "drm-uapi/i915_drm.h"
+#include "intel/common/intel_l3_config.h"
+#include "intel/common/intel_sample_positions.h"
+#include "intel/compiler/brw_compiler.h"
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "util/format/u_format.h"
+#include "util/half_float.h"
+#include "util/u_dual_blend.h"
+#include "util/u_framebuffer.h"
+#include "util/u_helpers.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_prim.h"
+#include "util/u_transfer.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_viewport.h"
+#include "crocus_batch.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "crocus_pipe.h"
+#include "crocus_resource.h"
+
+#include "crocus_genx_macros.h"
+#include "intel/common/intel_guardband.h"
+
+/**
+ * Statically assert that PIPE_* enums match the hardware packets.
+ * (As long as they match, we don't need to translate them.)
+ */
+UNUSED static void pipe_asserts()
+{
+#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
+
+ /* pipe_logicop happens to match the hardware. */
+ PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
+ PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
+ PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
+ PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
+ PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
+ PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
+ PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
+ PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
+ PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
+ PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
+ PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
+ PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
+ PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
+ PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
+ PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
+ PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
+
+ /* pipe_blend_func happens to match the hardware. */
+ PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
+ PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
+
+ /* pipe_blend_func happens to match the hardware. */
+ PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
+ PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
+ PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
+ PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
+ PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
+
+ /* pipe_stencil_op happens to match the hardware. */
+ PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
+ PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
+ PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
+ PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
+ PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
+ PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
+ PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
+ PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
+
+#if GFX_VER >= 6
+ /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
+ PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
+ PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
+#endif
+#undef PIPE_ASSERT
+}
+
+static unsigned
+translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
+{
+ static const unsigned map[] = {
+ [PIPE_PRIM_POINTS] = _3DPRIM_POINTLIST,
+ [PIPE_PRIM_LINES] = _3DPRIM_LINELIST,
+ [PIPE_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
+ [PIPE_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
+ [PIPE_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
+ [PIPE_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
+ [PIPE_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
+ [PIPE_PRIM_QUADS] = _3DPRIM_QUADLIST,
+ [PIPE_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
+ [PIPE_PRIM_POLYGON] = _3DPRIM_POLYGON,
+#if GFX_VER >= 6
+ [PIPE_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
+ [PIPE_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
+ [PIPE_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
+ [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
+#endif
+#if GFX_VER >= 7
+ [PIPE_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
+#endif
+ };
+
+ return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
+}
+
+static unsigned
+translate_compare_func(enum pipe_compare_func pipe_func)
+{
+ static const unsigned map[] = {
+ [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
+ [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
+ [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
+ [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
+ [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
+ [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
+ [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
+ [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
+ };
+ return map[pipe_func];
+}
+
+static unsigned
+translate_shadow_func(enum pipe_compare_func pipe_func)
+{
+ /* Gallium specifies the result of shadow comparisons as:
+ *
+ * 1 if ref <op> texel,
+ * 0 otherwise.
+ *
+ * The hardware does:
+ *
+ * 0 if texel <op> ref,
+ * 1 otherwise.
+ *
+ * So we need to flip the operator and also negate.
+ */
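+ /* For example, PIPE_FUNC_LEQUAL ("1 if ref <= texel") maps to
+ * PREFILTEROP_LESS below: the hardware then yields 0 when texel < ref
+ * and 1 otherwise, which is 1 exactly when ref <= texel.
+ */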
+ static const unsigned map[] = {
+ [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
+ [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
+ [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
+ [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
+ [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
+ [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
+ [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
+ [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
+ };
+ return map[pipe_func];
+}
+
+static unsigned
+translate_cull_mode(unsigned pipe_face)
+{
+ static const unsigned map[4] = {
+ [PIPE_FACE_NONE] = CULLMODE_NONE,
+ [PIPE_FACE_FRONT] = CULLMODE_FRONT,
+ [PIPE_FACE_BACK] = CULLMODE_BACK,
+ [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
+ };
+ return map[pipe_face];
+}
+
+#if GFX_VER >= 6
+static unsigned
+translate_fill_mode(unsigned pipe_polymode)
+{
+ static const unsigned map[4] = {
+ [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
+ [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
+ [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
+ [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
+ };
+ return map[pipe_polymode];
+}
+#endif
+
+static unsigned
+translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
+{
+ static const unsigned map[] = {
+ [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
+ [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
+ [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
+ };
+ return map[pipe_mip];
+}
+
+static uint32_t
+translate_wrap(unsigned pipe_wrap, bool either_nearest)
+{
+ static const unsigned map[] = {
+ [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
+ [PIPE_TEX_WRAP_CLAMP] = TCM_CLAMP_BORDER,
+ [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
+ [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
+ [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
+ [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
+
+ /* These are unsupported. */
+ [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
+ [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
+ };
+ if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
+ return TCM_CLAMP;
+ return map[pipe_wrap];
+}
+
+/**
+ * Equivalent of brw_state_batch.
+ */
+static uint32_t *
+stream_state(struct crocus_batch *batch,
+ unsigned size,
+ unsigned alignment,
+ uint32_t *out_offset)
+{
+ uint32_t offset = ALIGN(batch->state.used, alignment);
+
+ if (offset + size >= STATE_SZ && !batch->no_wrap) {
+ crocus_batch_flush(batch);
+ offset = ALIGN(batch->state.used, alignment);
+ } else if (offset + size >= batch->state.bo->size) {
+ const unsigned new_size =
+ MIN2(batch->state.bo->size + batch->state.bo->size / 2,
+ MAX_STATE_SIZE);
+ crocus_grow_buffer(batch, true, batch->state.used, new_size);
+ assert(offset + size < batch->state.bo->size);
+ }
+
+ crocus_record_state_size(batch->state_sizes, offset, size);
+
+ batch->state.used = offset + size;
+ *out_offset = offset;
+
+ return (uint32_t *)batch->state.map + (offset >> 2);
+}
+
+/**
+ * stream_state() + memcpy.
+ */
+static uint32_t
+emit_state(struct crocus_batch *batch, const void *data, unsigned size,
+ unsigned alignment)
+{
+ unsigned offset = 0;
+ uint32_t *map = stream_state(batch, size, alignment, &offset);
+
+ if (map)
+ memcpy(map, data, size);
+
+ return offset;
+}
+
+#if GFX_VER <= 5
+static void
+upload_pipelined_state_pointers(struct crocus_batch *batch,
+ bool gs_active, uint32_t gs_offset,
+ uint32_t vs_offset, uint32_t sf_offset,
+ uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
+{
+#if GFX_VER == 5
+ /* Need to flush before changing clip max threads for errata. */
+ crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
+#endif
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
+ pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
+ pp.GSEnable = gs_active;
+ if (gs_active)
+ pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
+ pp.ClipEnable = true;
+ pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
+ pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
+ pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
+ pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
+ }
+}
+
+#endif
+/**
+ * Did field 'x' change between 'old_cso' and 'new_cso'?
+ *
+ * (If so, we may want to set some dirty flags.)
+ */
+#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
+#define cso_changed_memcmp(x) \
+ (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
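+
+/*
+ * Usage sketch (illustrative only; the real call sites in the bind hooks
+ * below choose the appropriate dirty bits for each field):
+ *
+ *    struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
+ *    struct crocus_rasterizer_state *new_cso = state;
+ *    if (cso_changed(cso.line_width))
+ *       ice->state.dirty |= CROCUS_DIRTY_WM;  // hypothetical dirty-bit choice
+ */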
+
+static void
+flush_before_state_base_change(struct crocus_batch *batch)
+{
+#if GFX_VER >= 6
+ /* Flush before emitting STATE_BASE_ADDRESS.
+ *
+ * This isn't documented anywhere in the PRM. However, it seems to be
+ * necessary prior to changing the surface state base address. We've
+ * seen issues in Vulkan where we get GPU hangs when using multi-level
+ * command buffers which clear depth, reset state base address, and then
+ * go render stuff.
+ *
+ * Normally, in GL, we would trust the kernel to do sufficient stalls
+ * and flushes prior to executing our batch. However, it doesn't seem
+ * as if the kernel's flushing is always sufficient and we don't want to
+ * rely on it.
+ *
+ * We make this an end-of-pipe sync instead of a normal flush because we
+ * do not know the current status of the GPU. On Haswell at least,
+ * having a fast-clear operation in flight at the same time as a normal
+ * rendering operation can cause hangs. Since the kernel's flushing is
+ * insufficient, we need to ensure that any rendering operations from
+ * other processes are definitely complete before we try to do our own
+ * rendering. It's a bit of a big hammer but it appears to work.
+ */
+ const unsigned dc_flush =
+ batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
+ crocus_emit_end_of_pipe_sync(batch,
+ "change STATE_BASE_ADDRESS (flushes)",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ dc_flush |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH);
+#endif
+}
+
+static void
+flush_after_state_base_change(struct crocus_batch *batch)
+{
+ /* After re-setting the surface state base address, we have to do some
+ * cache flushing so that the sampler engine will pick up the new
+ * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
+ * Shared Function > 3D Sampler > State > State Caching (page 96):
+ *
+ * Coherency with system memory in the state cache, like the texture
+ * cache is handled partially by software. It is expected that the
+ * command stream or shader will issue Cache Flush operation or
+ * Cache_Flush sampler message to ensure that the L1 cache remains
+ * coherent with system memory.
+ *
+ * [...]
+ *
+ * Whenever the value of the Dynamic_State_Base_Addr,
+ * Surface_State_Base_Addr are altered, the L1 state cache must be
+ * invalidated to ensure the new surface or sampler state is fetched
+ * from system memory.
+ *
+ * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
+ * which, according to the PIPE_CONTROL instruction documentation in the
+ * Broadwell PRM:
+ *
+ * Setting this bit is independent of any other bit in this packet.
+ * This bit controls the invalidation of the L1 and L2 state caches
+ * at the top of the pipe i.e. at the parsing time.
+ *
+ * Unfortunately, experimentation seems to indicate that state cache
+ * invalidation through a PIPE_CONTROL does nothing whatsoever in
+ * regard to surface state and binding tables. Instead, it seems that
+ * invalidating the texture cache is what is actually needed.
+ *
+ * XXX: As far as we have been able to determine through
+ * experimentation, flushing the texture cache appears to be
+ * sufficient. The theory here is that all of the sampling/rendering
+ * units cache the binding table in the texture cache. However, we have
+ * yet to be able to actually confirm this.
+ */
+#if GFX_VER >= 6
+ crocus_emit_end_of_pipe_sync(batch,
+ "change STATE_BASE_ADDRESS (invalidates)",
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+#endif
+}
+
+#if GFX_VER >= 6
+static void
+crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
+ struct crocus_bo *bo, uint32_t offset,
+ bool predicated)
+{
+ crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+ srm.RegisterAddress = reg;
+ srm.MemoryAddress = ggtt_bo(bo, offset);
+#if GFX_VERx10 == 75
+ srm.PredicateEnable = predicated;
+#else
+ if (predicated)
+ unreachable("unsupported predication");
+#endif
+ }
+}
+
+static void
+crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
+ struct crocus_bo *bo, uint32_t offset,
+ bool predicated)
+{
+ crocus_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated);
+ crocus_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated);
+}
+#endif
+
+#if GFX_VER >= 7
+static void
+_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
+{
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = reg;
+ lri.DataDWord = val;
+ }
+}
+#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
+
+#if GFX_VERx10 == 75
+static void
+_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
+{
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
+ lrr.SourceRegisterAddress = src;
+ lrr.DestinationRegisterAddress = dst;
+ }
+}
+
+static void
+crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
+ uint32_t src)
+{
+ _crocus_emit_lrr(batch, dst, src);
+}
+
+static void
+crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
+ uint32_t src)
+{
+ _crocus_emit_lrr(batch, dst, src);
+ _crocus_emit_lrr(batch, dst + 4, src + 4);
+}
+#endif
+
+static void
+crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
+ uint32_t val)
+{
+ _crocus_emit_lri(batch, reg, val);
+}
+
+static void
+crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
+ uint64_t val)
+{
+ _crocus_emit_lri(batch, reg + 0, val & 0xffffffff);
+ _crocus_emit_lri(batch, reg + 4, val >> 32);
+}
+
+/**
+ * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
+ */
+static void
+crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
+ struct crocus_bo *bo, uint32_t offset)
+{
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = reg;
+ lrm.MemoryAddress = ro_bo(bo, offset);
+ }
+}
+
+/**
+ * Load a 64-bit value from a buffer into a MMIO register via
+ * two MI_LOAD_REGISTER_MEM commands.
+ */
+static void
+crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
+ struct crocus_bo *bo, uint32_t offset)
+{
+ crocus_load_register_mem32(batch, reg + 0, bo, offset + 0);
+ crocus_load_register_mem32(batch, reg + 4, bo, offset + 4);
+}
+
+#if GFX_VERx10 == 75
+static void
+crocus_store_data_imm32(struct crocus_batch *batch,
+ struct crocus_bo *bo, uint32_t offset,
+ uint32_t imm)
+{
+ crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+ sdi.Address = rw_bo(bo, offset);
+#if GFX_VER >= 6
+ sdi.ImmediateData = imm;
+#endif
+ }
+}
+
+static void
+crocus_store_data_imm64(struct crocus_batch *batch,
+ struct crocus_bo *bo, uint32_t offset,
+ uint64_t imm)
+{
+ /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
+ * 2 in genxml but it's actually variable length and we need 5 DWords.
+ */
+ void *map = crocus_get_command_space(batch, 4 * 5);
+ _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
+ sdi.DWordLength = 5 - 2;
+ sdi.Address = rw_bo(bo, offset);
+#if GFX_VER >= 6
+ sdi.ImmediateData = imm;
+#endif
+ }
+}
+#endif
+
+static void
+crocus_copy_mem_mem(struct crocus_batch *batch,
+ struct crocus_bo *dst_bo, uint32_t dst_offset,
+ struct crocus_bo *src_bo, uint32_t src_offset,
+ unsigned bytes)
+{
+ assert(bytes % 4 == 0);
+ assert(dst_offset % 4 == 0);
+ assert(src_offset % 4 == 0);
+
+#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
+ for (unsigned i = 0; i < bytes; i += 4) {
+ crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
+ src_bo, src_offset + i);
+ crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
+ dst_bo, dst_offset + i, false);
+ }
+}
+#endif
+
+/**
+ * Gallium CSO for rasterizer state.
+ */
+struct crocus_rasterizer_state {
+ struct pipe_rasterizer_state cso;
+#if GFX_VER >= 6
+ uint32_t sf[GENX(3DSTATE_SF_length)];
+ uint32_t clip[GENX(3DSTATE_CLIP_length)];
+#endif
+ uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
+
+ uint8_t num_clip_plane_consts;
+ bool fill_mode_point_or_line;
+};
+
+#if GFX_VER <= 5
+#define URB_VS 0
+#define URB_GS 1
+#define URB_CLP 2
+#define URB_SF 3
+#define URB_CS 4
+
+static const struct {
+ uint32_t min_nr_entries;
+ uint32_t preferred_nr_entries;
+ uint32_t min_entry_size;
+ uint32_t max_entry_size;
+} limits[URB_CS+1] = {
+ { 16, 32, 1, 5 }, /* vs */
+ { 4, 8, 1, 5 }, /* gs */
+ { 5, 10, 1, 5 }, /* clp */
+ { 1, 8, 1, 12 }, /* sf */
+ { 1, 4, 1, 32 } /* cs */
+};
+
+static bool check_urb_layout(struct crocus_context *ice)
+{
+ ice->urb.vs_start = 0;
+ ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
+ ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
+ ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
+ ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;
+
+ return ice->urb.cs_start + ice->urb.nr_cs_entries *
+ ice->urb.csize <= ice->urb.size;
+}
+
+
+static bool
+crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
+ unsigned vsize, unsigned sfsize)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ struct crocus_context *ice = batch->ice;
+ if (csize < limits[URB_CS].min_entry_size)
+ csize = limits[URB_CS].min_entry_size;
+
+ if (vsize < limits[URB_VS].min_entry_size)
+ vsize = limits[URB_VS].min_entry_size;
+
+ if (sfsize < limits[URB_SF].min_entry_size)
+ sfsize = limits[URB_SF].min_entry_size;
+
+ if (ice->urb.vsize < vsize ||
+ ice->urb.sfsize < sfsize ||
+ ice->urb.csize < csize ||
+ (ice->urb.constrained && (ice->urb.vsize > vsize ||
+ ice->urb.sfsize > sfsize ||
+ ice->urb.csize > csize))) {
+
+
+ ice->urb.csize = csize;
+ ice->urb.sfsize = sfsize;
+ ice->urb.vsize = vsize;
+
+ ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
+ ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
+ ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
+ ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
+ ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;
+
+ ice->urb.constrained = 0;
+
+ if (devinfo->ver == 5) {
+ ice->urb.nr_vs_entries = 128;
+ ice->urb.nr_sf_entries = 48;
+ if (check_urb_layout(ice)) {
+ goto done;
+ } else {
+ ice->urb.constrained = 1;
+ ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
+ ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
+ }
+ } else if (devinfo->is_g4x) {
+ ice->urb.nr_vs_entries = 64;
+ if (check_urb_layout(ice)) {
+ goto done;
+ } else {
+ ice->urb.constrained = 1;
+ ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
+ }
+ }
+
+ if (!check_urb_layout(ice)) {
+ ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
+ ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
+ ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
+ ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
+ ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;
+
+ /* Mark us as operating with constrained nr_entries, so that next
+ * time we recalculate we'll resize the fences in the hope of
+ * escaping constrained mode and getting back to normal performance.
+ */
+ ice->urb.constrained = 1;
+
+ if (!check_urb_layout(ice)) {
+ /* This is impossible, given the maximal sizes of urb
+ * entries and the values for minimum nr of entries
+ * provided above.
+ */
+ fprintf(stderr, "couldn't calculate URB layout!\n");
+ exit(1);
+ }
+
+ if (unlikely(INTEL_DEBUG & (DEBUG_URB|DEBUG_PERF)))
+ fprintf(stderr, "URB CONSTRAINED\n");
+ }
+
+done:
+ if (unlikely(INTEL_DEBUG & DEBUG_URB))
+ fprintf(stderr,
+ "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+ ice->urb.vs_start,
+ ice->urb.gs_start,
+ ice->urb.clip_start,
+ ice->urb.sf_start,
+ ice->urb.cs_start,
+ ice->urb.size);
+ return true;
+ }
+ return false;
+}
+
+static void
+crocus_upload_urb_fence(struct crocus_batch *batch)
+{
+ uint32_t urb_fence[3];
+ _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
+ urb.VSUnitURBReallocationRequest = 1;
+ urb.GSUnitURBReallocationRequest = 1;
+ urb.CLIPUnitURBReallocationRequest = 1;
+ urb.SFUnitURBReallocationRequest = 1;
+ urb.VFEUnitURBReallocationRequest = 1;
+ urb.CSUnitURBReallocationRequest = 1;
+
+ urb.VSFence = batch->ice->urb.gs_start;
+ urb.GSFence = batch->ice->urb.clip_start;
+ urb.CLIPFence = batch->ice->urb.sf_start;
+ urb.SFFence = batch->ice->urb.cs_start;
+ urb.CSFence = batch->ice->urb.size;
+ }
+
+ /* erratum: URB_FENCE must not cross a 64-byte cache line */
+ if ((crocus_batch_bytes_used(batch) & 15) > 12) {
+ int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
+ do {
+ *(uint32_t *)batch->command.map_next = 0;
+ batch->command.map_next += sizeof(uint32_t);
+ } while (--pad);
+ }
+
+ crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
+}
+
+static bool
+calculate_curbe_offsets(struct crocus_batch *batch)
+{
+ struct crocus_context *ice = batch->ice;
+
+ unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
+ unsigned total_regs;
+
+ nr_fp_regs = 0;
+ for (int i = 0; i < 4; i++) {
+ const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
+ if (range->length == 0)
+ continue;
+
+ /* ubo range tracks at 256-bit, we need 512-bit */
+ nr_fp_regs += (range->length + 1) / 2;
+ }
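+ /* For example, a UBO range of length 3 in the loop above (three 256-bit
+ * units) rounds up to (3 + 1) / 2 = 2 of the 512-bit CURBE units.
+ */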
+
+ if (ice->state.cso_rast->cso.clip_plane_enable) {
+ unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
+ nr_clip_regs = (nr_planes * 4 + 15) / 16;
+ }
+
+ nr_vp_regs = 0;
+ for (int i = 0; i < 4; i++) {
+ const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
+ if (range->length == 0)
+ continue;
+
+ /* ubo range tracks at 256-bit, we need 512-bit */
+ nr_vp_regs += (range->length + 1) / 2;
+ }
+ if (nr_vp_regs == 0) {
+ /* The pre-gen6 VS requires that some push constants get loaded no
+ * matter what, or the GPU would hang.
+ */
+ nr_vp_regs = 1;
+ }
+ total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
+
+ /* The CURBE allocation size is limited to 32 512-bit units (128 EU
+ * registers, or 1024 floats). See CS_URB_STATE in the gen4 or gen5
+ * (volume 1, part 1) PRMs.
+ *
+ * Note that in brw_fs.cpp we're only loading up to 16 EU registers of
+ * values as push constants before spilling to pull constants, and in
+ * brw_vec4.cpp we're loading up to 32 registers of push constants. An EU
+ * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
+ * regs for clip.
+ */
+ assert(total_regs <= 32);
+
+ /* Lazy resize:
+ */
+ if (nr_fp_regs > ice->curbe.wm_size ||
+ nr_vp_regs > ice->curbe.vs_size ||
+ nr_clip_regs != ice->curbe.clip_size ||
+ (total_regs < ice->curbe.total_size / 4 &&
+ ice->curbe.total_size > 16)) {
+
+ GLuint reg = 0;
+
+ /* Calculate a new layout:
+ */
+ reg = 0;
+ ice->curbe.wm_start = reg;
+ ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
+ ice->curbe.clip_start = reg;
+ ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
+ ice->curbe.vs_start = reg;
+ ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
+ ice->curbe.total_size = reg;
+
+ if (0)
+ fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
+ ice->curbe.wm_start,
+ ice->curbe.wm_size,
+ ice->curbe.clip_start,
+ ice->curbe.clip_size,
+ ice->curbe.vs_start,
+ ice->curbe.vs_size );
+ return true;
+ }
+ return false;
+}
+
+static void
+upload_shader_consts(struct crocus_context *ice,
+ gl_shader_stage stage,
+ uint32_t *map,
+ unsigned start)
+{
+ struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+ struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
+ uint32_t *cmap;
+ bool found = false;
+ unsigned offset = start * 16;
+ int total = 0;
+ for (int i = 0; i < 4; i++) {
+ const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+
+ if (range->length == 0)
+ continue;
+
+ unsigned block_index = crocus_bti_to_group_index(
+ &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
+ unsigned len = range->length * 8 * sizeof(float);
+ unsigned start = range->start * 8 * sizeof(float);
+ struct pipe_transfer *transfer;
+
+ cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
+ ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
+ PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
+ if (cmap)
+ memcpy(&map[offset + (total * 8)], cmap, len);
+ pipe_buffer_unmap(&ice->ctx, transfer);
+ total += range->length;
+ found = true;
+ }
+
+ if (stage == MESA_SHADER_VERTEX && !found) {
+ /* The pre-gen6 VS requires that some push constants get loaded no
+ * matter what, or the GPU would hang.
+ */
+ unsigned len = 16;
+ memset(&map[offset], 0, len);
+ }
+}
+
+static const float fixed_plane[6][4] = {
+ { 0, 0, -1, 1 },
+ { 0, 0, 1, 1 },
+ { 0, -1, 0, 1 },
+ { 0, 1, 0, 1 },
+ {-1, 0, 0, 1 },
+ { 1, 0, 0, 1 }
+};
+
+static void
+gen4_upload_curbe(struct crocus_batch *batch)
+{
+ struct crocus_context *ice = batch->ice;
+ const unsigned sz = ice->curbe.total_size;
+ const unsigned buf_sz = sz * 16 * sizeof(float);
+
+ if (sz == 0)
+ goto emit;
+
+ uint32_t *map;
+ u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
+ &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);
+
+ /* fragment shader constants */
+ if (ice->curbe.wm_size) {
+ upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
+ }
+
+ /* clipper constants */
+ if (ice->curbe.clip_size) {
+ unsigned offset = ice->curbe.clip_start * 16;
+ float *fmap = (float *)map;
+ unsigned i;
+ /* If any planes are going this way, send them all this way:
+ */
+ for (i = 0; i < 6; i++) {
+ fmap[offset + i * 4 + 0] = fixed_plane[i][0];
+ fmap[offset + i * 4 + 1] = fixed_plane[i][1];
+ fmap[offset + i * 4 + 2] = fixed_plane[i][2];
+ fmap[offset + i * 4 + 3] = fixed_plane[i][3];
+ }
+
+ unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
+ struct pipe_clip_state *cp = &ice->state.clip_planes;
+ while (mask) {
+ const int j = u_bit_scan(&mask);
+ fmap[offset + i * 4 + 0] = cp->ucp[j][0];
+ fmap[offset + i * 4 + 1] = cp->ucp[j][1];
+ fmap[offset + i * 4 + 2] = cp->ucp[j][2];
+ fmap[offset + i * 4 + 3] = cp->ucp[j][3];
+ i++;
+ }
+ }
+
+ /* vertex shader constants */
+ if (ice->curbe.vs_size) {
+ upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
+ }
+ if (0) {
+ for (int i = 0; i < sz*16; i+=4) {
+ float *f = (float *)map;
+ fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+ f[i+0], f[i+1], f[i+2], f[i+3]);
+ }
+ }
+
+emit:
+ crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
+ if (ice->curbe.curbe_res) {
+ cb.BufferLength = ice->curbe.total_size - 1;
+ cb.Valid = 1;
+ cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
+ }
+ }
+
+#if GFX_VER == 4 && GFX_VERx10 != 45
+ /* Work around a Broadwater/Crestline depth interpolator bug. The
+ * following sequence will cause GPU hangs:
+ *
+ * 1. Change state so that all depth related fields in CC_STATE are
+ * disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
+ * 2. Emit a CONSTANT_BUFFER packet.
+ * 3. Draw via 3DPRIMITIVE.
+ *
+ * The recommended workaround is to emit a non-pipelined state change after
+ * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
+ *
+ * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP (as it's small),
+ * and always emit it when "PS Use Source Depth" is set. We could be more
+ * precise, but the additional complexity is probably not worth it.
+ */
+ const struct shader_info *fs_info =
+ crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
+
+ if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
+ ice->state.global_depth_offset_clamp = 0;
+ crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
+ }
+#endif
+}
+#endif
+
+#if GFX_VER == 7
+
+#define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000
+#define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000
+#define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000
+
+static void
+setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
+ const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
+ cfg->n[INTEL_L3P_ALL];
+ const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
+ cfg->n[INTEL_L3P_ALL];
+ const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
+ cfg->n[INTEL_L3P_ALL];
+ const bool has_slm = cfg->n[INTEL_L3P_SLM];
+
+ /* According to the hardware docs, the L3 partitioning can only be changed
+ * while the pipeline is completely drained and the caches are flushed,
+ * which involves a first PIPE_CONTROL flush which stalls the pipeline...
+ */
+ crocus_emit_pipe_control_flush(batch, "l3_config",
+ PIPE_CONTROL_DATA_CACHE_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+
+ /* ...followed by a second pipelined PIPE_CONTROL that initiates
+ * invalidation of the relevant caches. Note that because RO invalidation
+ * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
+ * command is processed by the CS) we cannot combine it with the previous
+ * stalling flush as the hardware documentation suggests, because that
+ * would cause the CS to stall on previous rendering *after* RO
+ * invalidation and wouldn't prevent the RO caches from being polluted by
+ * concurrent rendering before the stall completes. This intentionally
+ * doesn't implement the SKL+ hardware workaround suggesting to enable CS
+ * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
+ * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
+ * already guarantee that there is no concurrent GPGPU kernel execution
+ * (see SKL HSD 2132585).
+ */
+ crocus_emit_pipe_control_flush(batch, "l3 config",
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+ PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+
+ /* Now send a third stalling flush to make sure that invalidation is
+ * complete when the L3 configuration registers are modified.
+ */
+ crocus_emit_pipe_control_flush(batch, "l3 config",
+ PIPE_CONTROL_DATA_CACHE_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+
+
+ assert(!cfg->n[INTEL_L3P_ALL]);
+
+ /* When enabled, SLM only uses a portion of the L3 on half of the banks;
+ * the matching space on the remaining banks has to be allocated to a
+ * client (URB for all validated configurations) set to the
+ * lower-bandwidth 2-bank address hashing mode.
+ */
+ const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
+ assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
+
+ /* Minimum number of ways that can be allocated to the URB. */
+ const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0);
+ assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
+
+ uint32_t l3sqcr1, l3cr2, l3cr3;
+
+ crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
+ reg.ConvertDC_UC = !has_dc;
+ reg.ConvertIS_UC = !has_is;
+ reg.ConvertC_UC = !has_c;
+ reg.ConvertT_UC = !has_t;
+#if GFX_VERx10 == 75
+ reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
+#else
+ reg.L3SQGeneralPriorityCreditInitialization =
+ devinfo->is_baytrail ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
+#endif
+ reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
+ };
+
+ crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
+ reg.SLMEnable = has_slm;
+ reg.URBLowBandwidth = urb_low_bw;
+ reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
+#if !(GFX_VERx10 == 75)
+ reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
+#endif
+ reg.ROAllocation = cfg->n[INTEL_L3P_RO];
+ reg.DCAllocation = cfg->n[INTEL_L3P_DC];
+ };
+
+ crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
+ reg.ISAllocation = cfg->n[INTEL_L3P_IS];
+ reg.ISLowBandwidth = 0;
+ reg.CAllocation = cfg->n[INTEL_L3P_C];
+ reg.CLowBandwidth = 0;
+ reg.TAllocation = cfg->n[INTEL_L3P_T];
+ reg.TLowBandwidth = 0;
+ };
+
+ /* Set up the L3 partitioning. */
+ crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
+ crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
+ crocus_emit_lri(batch, L3CNTLREG3, l3cr3);
+
+#if GFX_VERx10 == 75
+ /* TODO: Fail screen creation if command parser version < 4 */
+ uint32_t scratch1, chicken3;
+ crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
+ reg.L3AtomicDisable = !has_dc;
+ }
+ crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
+ reg.L3AtomicDisableMask = true;
+ reg.L3AtomicDisable = !has_dc;
+ }
+ crocus_emit_lri(batch, SCRATCH1, scratch1);
+ crocus_emit_lri(batch, CHICKEN3, chicken3);
+#endif
+}
+
+static void
+emit_l3_state(struct crocus_batch *batch, bool compute)
+{
+ const struct intel_l3_config *const cfg =
+ compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
+
+ setup_l3_config(batch, cfg);
+ if (unlikely(INTEL_DEBUG & DEBUG_L3)) {
+ intel_dump_l3_config(cfg, stderr);
+ }
+}
+
+/**
+ * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
+ */
+static void
+gen7_emit_cs_stall_flush(struct crocus_batch *batch)
+{
+ crocus_emit_pipe_control_write(batch,
+ "workaround",
+ PIPE_CONTROL_CS_STALL
+ | PIPE_CONTROL_WRITE_IMMEDIATE,
+ batch->ice->workaround_bo,
+ batch->ice->workaround_offset, 0);
+}
+#endif
+
+static void
+emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
+{
+#if GFX_VER >= 6
+ /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+ * PIPELINE_SELECT [DevBWR+]":
+ *
+ * "Project: DEVSNB+
+ *
+ * Software must ensure all the write caches are flushed through a
+ * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
+ * command to invalidate read only caches prior to programming
+ * MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
+ */
+ const unsigned dc_flush =
+ batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
+ crocus_emit_pipe_control_flush(batch,
+ "workaround: PIPELINE_SELECT flushes (1/2)",
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ dc_flush |
+ PIPE_CONTROL_CS_STALL);
+
+ crocus_emit_pipe_control_flush(batch,
+ "workaround: PIPELINE_SELECT flushes (2/2)",
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+ PIPE_CONTROL_STATE_CACHE_INVALIDATE |
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE);
+#else
+ /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+ * PIPELINE_SELECT [DevBWR+]":
+ *
+ * Project: PRE-DEVSNB
+ *
+ * Software must ensure the current pipeline is flushed via an
+ * MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
+ */
+ crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
+#endif
+
+ crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
+ sel.PipelineSelection = pipeline;
+ }
+
+#if GFX_VER == 7 && !(GFX_VERx10 == 75)
+ if (pipeline == _3D) {
+ gen7_emit_cs_stall_flush(batch);
+
+ crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
+ prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
+ };
+ }
+#endif
+}
+
+/**
+ * The following diagram shows how we partition the URB:
+ *
+ * 16kB or 32kB Rest of the URB space
+ * __________-__________ _________________-_________________
+ * / \ / \
+ * +-------------------------------------------------------------+
+ * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB |
+ * | Constants | Entries |
+ * +-------------------------------------------------------------+
+ *
+ * Notably, push constants must be stored at the beginning of the URB
+ * space, while entries can be stored anywhere. Ivybridge and Haswell
+ * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
+ * doubles this (32kB).
+ *
+ * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
+ * sized) in increments of 1kB. Haswell GT3 requires them to be located and
+ * sized in increments of 2kB.
+ *
+ * Currently we split the constant buffer space evenly among whatever stages
+ * are active. This is probably not ideal, but simple.
+ *
+ * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
+ * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
+ * Haswell GT3 has 512kB of URB space.
+ *
+ * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
+ * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
+ */
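+/*
+ * A worked example of the split done below (assuming the 16kB case): with
+ * push_constant_kb = 16 and five stages, size_per_stage = 16 / 5 = 3kB for
+ * VS/HS/DS/GS, and the FS gets the remainder, 16 - 4 * 3 = 4kB.
+ */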
+#if GFX_VER == 7
+static void
+crocus_alloc_push_constants(struct crocus_batch *batch)
+{
+#if GFX_VERx10 == 75
+ const unsigned push_constant_kb = batch->screen->devinfo.gt == 3 ? 32 : 16;
+#else
+ const unsigned push_constant_kb = 16;
+#endif
+ unsigned size_per_stage = push_constant_kb / 5;
+
+ /* For now, we set a static partitioning of the push constant area,
+ * assuming that all stages could be in use.
+ *
+ * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
+ * see if that improves performance by offering more space to
+ * the VS/FS when those aren't in use. Also, try dynamically
+ * enabling/disabling it like i965 does. This would cause more
+ * stalls and may not actually help; we don't know yet.
+ */
+ for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
+ alloc._3DCommandSubOpcode = 18 + i;
+ alloc.ConstantBufferOffset = size_per_stage * i;
+ alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
+ }
+ }
+
+ /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
+ *
+ * A PIPE_CONTROL command with the CS Stall bit set must be programmed
+ * in the ring after this instruction.
+ *
+ * No such restriction exists for Haswell or Baytrail.
+ */
+ if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail)
+ gen7_emit_cs_stall_flush(batch);
+}
+#endif
+
+/**
+ * Upload the initial GPU state for a render context.
+ *
+ * This sets some invariant state that needs to be programmed a particular
+ * way, but that we never actually change.
+ */
+static void
+crocus_init_render_context(struct crocus_batch *batch)
+{
+ UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ emit_pipeline_select(batch, _3D);
+
+ crocus_emit_cmd(batch, GENX(STATE_SIP), foo);
+
+#if GFX_VER == 7
+ emit_l3_state(batch, false);
+#endif
+#if GFX_VER == 7 && GFX_VERx10 != 75
+ crocus_emit_reg(batch, GENX(INSTPM), reg) {
+ reg.CONSTANT_BUFFERAddressOffsetDisable = true;
+ reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
+ }
+#endif
+#if GFX_VER >= 5 || GFX_VERx10 == 45
+ /* Use the legacy AA line coverage computation. */
+ crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
+#endif
+
+ /* No polygon stippling offsets are necessary. */
+ /* TODO: may need to set an offset for origin-UL framebuffers */
+ crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
+
+#if GFX_VER == 7
+ crocus_alloc_push_constants(batch);
+#endif
+}
+
+#if GFX_VER == 7
+static void
+crocus_init_compute_context(struct crocus_batch *batch)
+{
+ UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ emit_pipeline_select(batch, GPGPU);
+
+#if GFX_VER == 7
+ emit_l3_state(batch, true);
+#endif
+}
+#endif
+
+/**
+ * Generation-specific context state (ice->state.genx->...).
+ *
+ * Most state can go in crocus_context directly, but these encode hardware
+ * packets which vary by generation.
+ */
+struct crocus_genx_state {
+ struct {
+#if GFX_VER == 7
+ struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
+#endif
+ } shaders[MESA_SHADER_STAGES];
+};
+
+/**
+ * The pipe->set_blend_color() driver hook.
+ *
+ * This corresponds to our COLOR_CALC_STATE.
+ */
+static void
+crocus_set_blend_color(struct pipe_context *ctx,
+ const struct pipe_blend_color *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+
+ /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
+ memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
+#if GFX_VER <= 5
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
+#else
+ ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+#endif
+}
+
+/**
+ * Gallium CSO for blend state (see pipe_blend_state).
+ */
+struct crocus_blend_state {
+ /** copy of BLEND_STATE */
+ struct pipe_blend_state cso;
+
+ /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
+ uint8_t blend_enables;
+
+ /** Bitfield of whether color writes are enabled for RT[i] */
+ uint8_t color_write_enables;
+
+ /** Does RT[0] use dual color blending? */
+ bool dual_color_blending;
+};
+
+#if GFX_VER >= 6
+static enum pipe_blendfactor
+fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
+{
+ if (alpha_to_one) {
+ if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
+ return PIPE_BLENDFACTOR_ONE;
+
+ if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
+ return PIPE_BLENDFACTOR_ZERO;
+ }
+
+ return f;
+}
+#endif
+
+/**
+ * The pipe->create_blend_state() driver hook.
+ *
+ * Translates a pipe_blend_state into crocus_blend_state.
+ */
+static void *
+crocus_create_blend_state(struct pipe_context *ctx,
+ const struct pipe_blend_state *state)
+{
+ struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
+
+ cso->blend_enables = 0;
+ cso->color_write_enables = 0;
+ STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);
+
+ cso->cso = *state;
+ cso->dual_color_blending = util_blend_state_is_dual(state, 0);
+ for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
+ const struct pipe_rt_blend_state *rt =
+ &state->rt[state->independent_blend_enable ? i : 0];
+ if (rt->blend_enable)
+ cso->blend_enables |= 1u << i;
+ if (rt->colormask)
+ cso->color_write_enables |= 1u << i;
+ }
+ return cso;
+}
+
+/**
+ * The pipe->bind_blend_state() driver hook.
+ *
+ * Bind a blending CSO and flag related dirty bits.
+ */
+static void
+crocus_bind_blend_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_blend_state *cso = state;
+
+ ice->state.cso_blend = cso;
+ ice->state.blend_enables = cso ? cso->blend_enables : 0;
+
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
+ ice->state.dirty |= CROCUS_DIRTY_WM;
+#if GFX_VER >= 6
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+#endif
+#if GFX_VER >= 7
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
+#endif
+ ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+ ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+ ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
+}
+
+/**
+ * Return true if the FS writes to any color outputs which are not disabled
+ * via color masking.
+ */
+static bool
+has_writeable_rt(const struct crocus_blend_state *cso_blend,
+ const struct shader_info *fs_info)
+{
+ if (!fs_info)
+ return false;
+
+ unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
+
+ if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
+ rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;
+
+ return cso_blend->color_write_enables & rt_outputs;
+}
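+/* For example (illustrative): an FS writing FRAG_RESULT_DATA0 and
+ * FRAG_RESULT_DATA2 yields rt_outputs = 0b101, which is then tested against
+ * the per-RT color write enables; writing FRAG_RESULT_COLOR instead
+ * broadcasts to every render target.
+ */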
+
+/**
+ * Gallium CSO for depth, stencil, and alpha testing state.
+ */
+struct crocus_depth_stencil_alpha_state {
+ struct pipe_depth_stencil_alpha_state cso;
+
+ bool depth_writes_enabled;
+ bool stencil_writes_enabled;
+};
+
+/**
+ * The pipe->create_depth_stencil_alpha_state() driver hook.
+ *
+ * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
+ * testing state since we need pieces of it in a variety of places.
+ */
+static void *
+crocus_create_zsa_state(struct pipe_context *ctx,
+ const struct pipe_depth_stencil_alpha_state *state)
+{
+ struct crocus_depth_stencil_alpha_state *cso =
+ malloc(sizeof(struct crocus_depth_stencil_alpha_state));
+
+ bool two_sided_stencil = state->stencil[1].enabled;
+ cso->cso = *state;
+
+ cso->depth_writes_enabled = state->depth_writemask;
+ cso->stencil_writes_enabled =
+ state->stencil[0].writemask != 0 ||
+ (two_sided_stencil && state->stencil[1].writemask != 0);
+
+ /* The state tracker needs to optimize away EQUAL writes for us. */
+ assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
+
+ return cso;
+}
+
+/**
+ * The pipe->bind_depth_stencil_alpha_state() driver hook.
+ *
+ * Bind a depth/stencil/alpha CSO and flag related dirty bits.
+ */
+static void
+crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
+ struct crocus_depth_stencil_alpha_state *new_cso = state;
+
+ if (new_cso) {
+ if (cso_changed(cso.alpha_ref_value))
+ ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+
+#if GFX_VER >= 6
+ if (cso_changed(cso.alpha_enabled))
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+
+ if (cso_changed(cso.alpha_func))
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+#endif
+
+ if (cso_changed(depth_writes_enabled))
+ ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+
+ ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
+ ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
+
+#if GFX_VER <= 5
+ ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+#endif
+ }
+
+ ice->state.cso_zsa = new_cso;
+ ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
+#if GFX_VER >= 6
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
+#endif
+ ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
+}
+
+static float
+get_line_width(const struct pipe_rasterizer_state *state)
+{
+ float line_width = state->line_width;
+
+ /* From the OpenGL 4.4 spec:
+ *
+ * "The actual width of non-antialiased lines is determined by rounding
+ * the supplied width to the nearest integer, then clamping it to the
+ * implementation-dependent maximum non-antialiased line width."
+ */
+ if (!state->multisample && !state->line_smooth)
+ line_width = roundf(state->line_width);
+
+ if (!state->multisample && state->line_smooth && line_width < 1.5f) {
+ /* For 1 pixel line thickness or less, the general anti-aliasing
+ * algorithm gives up, and a garbage line is generated. Setting a
+ * Line Width of 0.0 specifies the rasterization of the "thinnest"
+ * (one-pixel-wide), non-antialiased lines.
+ *
+ * Lines rendered with zero Line Width are rasterized using the
+ * "Grid Intersection Quantization" rules as specified by the
+ * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
+ */
+ line_width = 0.0f;
+ }
+
+ return line_width;
+}
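+/* Illustrative examples of the rules above: a non-multisampled,
+ * non-smoothed width of 1.4 is rounded to 1.0, while a smoothed width of
+ * 1.2 (below the 1.5 threshold) is forced to 0.0 so the hardware rasterizes
+ * a cosmetic one-pixel line instead of a garbage AA line.
+ */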
+
+/**
+ * The pipe->create_rasterizer_state() driver hook.
+ */
+static void *
+crocus_create_rasterizer_state(struct pipe_context *ctx,
+ const struct pipe_rasterizer_state *state)
+{
+ struct crocus_rasterizer_state *cso =
+ malloc(sizeof(struct crocus_rasterizer_state));
+
+ cso->fill_mode_point_or_line =
+ state->fill_front == PIPE_POLYGON_MODE_LINE ||
+ state->fill_front == PIPE_POLYGON_MODE_POINT ||
+ state->fill_back == PIPE_POLYGON_MODE_LINE ||
+ state->fill_back == PIPE_POLYGON_MODE_POINT;
+
+ if (state->clip_plane_enable != 0)
+ cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
+ else
+ cso->num_clip_plane_consts = 0;
+
+ cso->cso = *state;
+
+#if GFX_VER >= 6
+ float line_width = get_line_width(state);
+
+ crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
+ sf.StatisticsEnable = true;
+ sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
+ sf.LineEndCapAntialiasingRegionWidth =
+ state->line_smooth ? _10pixels : _05pixels;
+ sf.LastPixelEnable = state->line_last_pixel;
+ sf.LineWidth = line_width;
+ sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
+ sf.PointWidth = state->point_size;
+
+ if (state->flatshade_first) {
+ sf.TriangleFanProvokingVertexSelect = 1;
+ } else {
+ sf.TriangleStripListProvokingVertexSelect = 2;
+ sf.TriangleFanProvokingVertexSelect = 2;
+ sf.LineStripListProvokingVertexSelect = 1;
+ }
+
+ sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
+ sf.CullMode = translate_cull_mode(state->cull_face);
+
+ sf.ScissorRectangleEnable = true;
+
+#if GFX_VER == 6
+ sf.AttributeSwizzleEnable = true;
+ if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
+ sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
+ else
+ sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
+#endif
+
+#if GFX_VER >= 6
+ sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
+ sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
+ sf.GlobalDepthOffsetEnablePoint = state->offset_point;
+ sf.GlobalDepthOffsetConstant = state->offset_units * 2;
+ sf.GlobalDepthOffsetScale = state->offset_scale;
+ sf.GlobalDepthOffsetClamp = state->offset_clamp;
+
+ sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
+ sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
+#endif
+
+#if GFX_VERx10 == 75
+ sf.LineStippleEnable = state->line_stipple_enable;
+#endif
+ }
+#endif
+
+#if GFX_VER >= 6
+ crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
+ /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
+ * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
+ */
+#if GFX_VER >= 7
+ cl.EarlyCullEnable = true;
+#endif
+
+#if GFX_VER == 7
+ cl.FrontWinding = state->front_ccw ? 1 : 0;
+ cl.CullMode = translate_cull_mode(state->cull_face);
+#endif
+ cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
+ cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
+ cl.GuardbandClipTestEnable = true;
+ cl.ClipEnable = true;
+ cl.MinimumPointWidth = 0.125;
+ cl.MaximumPointWidth = 255.875;
+ cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
+
+ if (state->flatshade_first) {
+ cl.TriangleFanProvokingVertexSelect = 1;
+ } else {
+ cl.TriangleStripListProvokingVertexSelect = 2;
+ cl.TriangleFanProvokingVertexSelect = 2;
+ cl.LineStripListProvokingVertexSelect = 1;
+ }
+ }
+#endif
+
+ /* Remap from 0..255 back to 1..256 */
+ const unsigned line_stipple_factor = state->line_stipple_factor + 1;
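+ /* Illustrative example: a gallium factor field of 3 becomes a hardware
+ * repeat count of 4 below, with an inverse repeat count of 0.25.
+ */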
+
+ crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
+ if (state->line_stipple_enable) {
+ line.LineStipplePattern = state->line_stipple_pattern;
+ line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
+ line.LineStippleRepeatCount = line_stipple_factor;
+ }
+ }
+
+ return cso;
+}
+
+/**
+ * The pipe->bind_rasterizer_state() driver hook.
+ *
+ * Bind a rasterizer CSO and flag related dirty bits.
+ */
+static void
+crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
+ struct crocus_rasterizer_state *new_cso = state;
+
+ if (new_cso) {
+ /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
+ if (cso_changed_memcmp(line_stipple))
+ ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
+#if GFX_VER >= 6
+ if (cso_changed(cso.half_pixel_center))
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
+ if (cso_changed(cso.scissor))
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+#else
+ if (cso_changed(cso.scissor))
+ ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
+#endif
+
+ if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
+ ice->state.dirty |= CROCUS_DIRTY_WM;
+
+#if GFX_VER >= 6
+ if (cso_changed(cso.rasterizer_discard))
+ ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
+
+ if (cso_changed(cso.flatshade_first))
+ ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
+#endif
+
+ if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
+ cso_changed(cso.clip_halfz))
+ ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
+
+#if GFX_VER >= 7
+ if (cso_changed(cso.sprite_coord_enable) ||
+ cso_changed(cso.sprite_coord_mode) ||
+ cso_changed(cso.light_twoside))
+ ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+#endif
+#if GFX_VER <= 5
+ if (cso_changed(cso.clip_plane_enable))
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
+#endif
+ }
+
+ ice->state.cso_rast = new_cso;
+ ice->state.dirty |= CROCUS_DIRTY_RASTER;
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+#if GFX_VER <= 5
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+ ice->state.dirty |= CROCUS_DIRTY_WM;
+#endif
+#if GFX_VER <= 6
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+#endif
+ ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
+}
+
+/**
+ * Return true if the given wrap mode requires the border color to exist.
+ *
+ * (We can skip uploading it if the sampler isn't going to use it.)
+ */
+static bool
+wrap_mode_needs_border_color(unsigned wrap_mode)
+{
+ return wrap_mode == TCM_CLAMP_BORDER;
+}
+
+/**
+ * Gallium CSO for sampler state.
+ */
+struct crocus_sampler_state {
+ struct pipe_sampler_state pstate;
+ union pipe_color_union border_color;
+ bool needs_border_color;
+ unsigned wrap_s;
+ unsigned wrap_t;
+ unsigned wrap_r;
+ unsigned mag_img_filter;
+ float min_lod;
+};
+
+/**
+ * The pipe->create_sampler_state() driver hook.
+ *
+ * We fill out SAMPLER_STATE (except for the border color pointer), and
+ * store that on the CPU. It doesn't make sense to upload it to a GPU
+ * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
+ * all bound sampler states to be in contiguous memory.
+ */
+static void *
+crocus_create_sampler_state(struct pipe_context *ctx,
+ const struct pipe_sampler_state *state)
+{
+ struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
+
+ if (!cso)
+ return NULL;
+
+ STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
+ STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
+
+ bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
+ state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
+ cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
+ cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
+ cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);
+
+ cso->pstate = *state;
+
+ memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
+
+ cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
+ wrap_mode_needs_border_color(cso->wrap_t) ||
+ wrap_mode_needs_border_color(cso->wrap_r);
+
+ cso->min_lod = state->min_lod;
+ cso->mag_img_filter = state->mag_img_filter;
+
+ // XXX: explain this code ported from ilo...I don't get it at all...
+ if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+ state->min_lod > 0.0f) {
+ cso->min_lod = 0.0f;
+ cso->mag_img_filter = state->min_img_filter;
+ }
+
+ return cso;
+}
+
+/**
+ * The pipe->bind_sampler_states() driver hook.
+ */
+static void
+crocus_bind_sampler_states(struct pipe_context *ctx,
+ enum pipe_shader_type p_stage,
+ unsigned start, unsigned count,
+ void **states)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ gl_shader_stage stage = stage_from_pipe(p_stage);
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+ assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
+
+ bool dirty = false;
+
+ for (int i = 0; i < count; i++) {
+ if (shs->samplers[start + i] != states[i]) {
+ shs->samplers[start + i] = states[i];
+ dirty = true;
+ }
+ }
+
+ if (dirty) {
+#if GFX_VER <= 5
+ if (p_stage == PIPE_SHADER_FRAGMENT)
+ ice->state.dirty |= CROCUS_DIRTY_WM;
+ else if (p_stage == PIPE_SHADER_VERTEX)
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
+#endif
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
+ ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
+ }
+}
+
+enum samp_workaround {
+ SAMP_NORMAL,
+ SAMP_CUBE_CLAMP,
+ SAMP_CUBE_CUBE,
+ SAMP_T_WRAP,
+};
+
+static void
+crocus_upload_sampler_state(struct crocus_batch *batch,
+ struct crocus_sampler_state *cso,
+ uint32_t border_color_offset,
+ enum samp_workaround samp_workaround,
+ uint32_t first_level,
+ void *map)
+{
+ struct pipe_sampler_state *state = &cso->pstate;
+ uint32_t wrap_s, wrap_t, wrap_r;
+
+ wrap_s = cso->wrap_s;
+ wrap_t = cso->wrap_t;
+ wrap_r = cso->wrap_r;
+
+ switch (samp_workaround) {
+ case SAMP_CUBE_CLAMP:
+ wrap_s = TCM_CLAMP;
+ wrap_t = TCM_CLAMP;
+ wrap_r = TCM_CLAMP;
+ break;
+ case SAMP_CUBE_CUBE:
+ wrap_s = TCM_CUBE;
+ wrap_t = TCM_CUBE;
+ wrap_r = TCM_CUBE;
+ break;
+ case SAMP_T_WRAP:
+ wrap_t = TCM_WRAP;
+ break;
+ default:
+ break;
+ }
+
+ _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
+ samp.TCXAddressControlMode = wrap_s;
+ samp.TCYAddressControlMode = wrap_t;
+ samp.TCZAddressControlMode = wrap_r;
+
+#if GFX_VER >= 6
+ samp.NonnormalizedCoordinateEnable = !state->normalized_coords;
+#endif
+ samp.MinModeFilter = state->min_img_filter;
+ samp.MagModeFilter = cso->mag_img_filter;
+ samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
+ samp.MaximumAnisotropy = RATIO21;
+
+ if (state->max_anisotropy >= 2) {
+ if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+ samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
+#if GFX_VER >= 7
+ samp.AnisotropicAlgorithm = EWAApproximation;
+#endif
+ }
+
+ if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
+ samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
+
+ samp.MaximumAnisotropy =
+ MIN2((state->max_anisotropy - 2) / 2, RATIO161);
+ }
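+ /* Note (editorial): the MIN2 above maps the API's even anisotropy values
+ * onto the hardware encodings linearly: max_anisotropy == 2 gives RATIO21
+ * (2:1), 16 gives RATIO161 (16:1), and anything larger clamps to 16:1.
+ */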
+
+ /* Set address rounding bits if not using nearest filtering. */
+ if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
+ samp.UAddressMinFilterRoundingEnable = true;
+ samp.VAddressMinFilterRoundingEnable = true;
+ samp.RAddressMinFilterRoundingEnable = true;
+ }
+
+ if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
+ samp.UAddressMagFilterRoundingEnable = true;
+ samp.VAddressMagFilterRoundingEnable = true;
+ samp.RAddressMagFilterRoundingEnable = true;
+ }
+
+ if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
+ samp.ShadowFunction = translate_shadow_func(state->compare_func);
+
+ const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
+
+ samp.LODPreClampEnable = true;
+ samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
+ samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
+ samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
+
+#if GFX_VER == 6
+ samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
+ samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
+#endif
+
+#if GFX_VER < 6
+ samp.BorderColorPointer =
+ ro_bo(batch->state.bo, border_color_offset);
+#else
+ samp.BorderColorPointer = border_color_offset;
+#endif
+ }
+}
+
+static void
+crocus_upload_border_color(struct crocus_batch *batch,
+ struct crocus_sampler_state *cso,
+ struct crocus_sampler_view *tex,
+ uint32_t *bc_offset)
+{
+ /* We may need to swizzle the border color for format faking.
+ * A/LA formats are faked as R/RG with 000R or R00G swizzles.
+ * This means we need to move the border color's A channel into
+ * the R or G channels so that those read swizzles will move it
+ * back into A.
+ */
+ enum pipe_format internal_format = PIPE_FORMAT_NONE;
+ union pipe_color_union *color = &cso->border_color;
+ union pipe_color_union tmp;
+ if (tex) {
+ internal_format = tex->res->internal_format;
+
+ if (util_format_is_alpha(internal_format)) {
+ unsigned char swz[4] = {
+ PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
+ PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
+ };
+ util_format_apply_color_swizzle(&tmp, color, swz, true);
+ color = &tmp;
+ } else if (util_format_is_luminance_alpha(internal_format) &&
+ internal_format != PIPE_FORMAT_L8A8_SRGB) {
+ unsigned char swz[4] = {
+ PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
+ PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
+ };
+ util_format_apply_color_swizzle(&tmp, color, swz, true);
+ color = &tmp;
+ }
+ }
+ bool is_integer_format = util_format_is_pure_integer(internal_format);
+ unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
+ const int sbc_align = (GFX_VERx10 == 75 && is_integer_format) ? 512 : 32;
+ uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);
+
+ struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
+
+#define ASSIGN(dst, src) \
+ do { \
+ dst = src; \
+ } while (0)
+
+#define ASSIGNu16(dst, src) \
+ do { \
+ dst = (uint16_t)src; \
+ } while (0)
+
+#define ASSIGNu8(dst, src) \
+ do { \
+ dst = (uint8_t)src; \
+ } while (0)
+
+#define BORDER_COLOR_ATTR(macro, _color_type, src) \
+ macro(state.BorderColor ## _color_type ## Red, src[0]); \
+ macro(state.BorderColor ## _color_type ## Green, src[1]); \
+ macro(state.BorderColor ## _color_type ## Blue, src[2]); \
+ macro(state.BorderColor ## _color_type ## Alpha, src[3]);
+
+#if GFX_VERx10 == 75
+ if (is_integer_format) {
+ const struct util_format_description *format_desc =
+ util_format_description(internal_format);
+
+ /* From the Haswell PRM, "Command Reference: Structures", Page 36:
+ * "If any color channel is missing from the surface format,
+ * corresponding border color should be programmed as zero and if
+ * alpha channel is missing, corresponding Alpha border color should
+ * be programmed as 1."
+ */
+ unsigned c[4] = { 0, 0, 0, 1 };
+ for (int i = 0; i < 4; i++) {
+ if (format_desc->channel[i].size)
+ c[i] = color->ui[i];
+ }
+
+ switch (format_desc->channel[0].size) {
+ case 8:
+ /* Copy RGBA in order. */
+ BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
+ break;
+ case 10:
+ /* R10G10B10A2_UINT is treated like a 16-bit format. */
+ case 16:
+ BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
+ break;
+ case 32:
+ if (format_desc->channel[1].size && !format_desc->channel[2].size) {
+ /* Careful inspection of the tables reveals that for RG32 formats,
+ * the green channel needs to go where blue normally belongs.
+ */
+ state.BorderColor32bitRed = c[0];
+ state.BorderColor32bitBlue = c[1];
+ state.BorderColor32bitAlpha = 1;
+ } else {
+ /* Copy RGBA in order. */
+ BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
+ }
+ break;
+ default:
+ assert(!"Invalid number of bits per channel in integer format.");
+ break;
+ }
+ } else {
+ BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
+ }
+#elif GFX_VER == 5 || GFX_VER == 6
+ BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
+ BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
+ BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);
+
+#define MESA_FLOAT_TO_HALF(dst, src) \
+ dst = _mesa_float_to_half(src);
+
+ BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);
+
+#undef MESA_FLOAT_TO_HALF
+
+ state.BorderColorSnorm8Red = state.BorderColorSnorm16Red >> 8;
+ state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
+ state.BorderColorSnorm8Blue = state.BorderColorSnorm16Blue >> 8;
+ state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
+
+ BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
+
+#elif GFX_VER == 4
+ BORDER_COLOR_ATTR(ASSIGN, , color->f);
+#else
+ BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
+#endif
+
+#undef ASSIGN
+#undef BORDER_COLOR_ATTR
+
+ GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
+}
+
+/**
+ * Upload the sampler states into a contiguous area of GPU memory,
+ * for 3DSTATE_SAMPLER_STATE_POINTERS_*.
+ *
+ * Also fill out the border color state pointers.
+ */
+static void
+crocus_upload_sampler_states(struct crocus_context *ice,
+ struct crocus_batch *batch, gl_shader_stage stage)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ const struct shader_info *info = crocus_get_shader_info(ice, stage);
+
+ /* We assume the state tracker will call pipe->bind_sampler_states()
+ * if the program's number of textures changes.
+ */
+ unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
+
+ if (!count)
+ return;
+
+ /* Assemble the SAMPLER_STATEs into a contiguous table that lives
+ * in the dynamic state memory zone, so we can point to it via the
+ * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
+ */
+ unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
+ uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
+
+ if (unlikely(!map))
+ return;
+
+ for (int i = 0; i < count; i++) {
+ struct crocus_sampler_state *state = shs->samplers[i];
+ struct crocus_sampler_view *tex = shs->textures[i];
+
+ if (!state || !tex) {
+ memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
+ } else {
+ unsigned border_color_offset = 0;
+ if (state->needs_border_color) {
+ crocus_upload_border_color(batch, state, tex, &border_color_offset);
+ }
+
+ enum samp_workaround wa = SAMP_NORMAL;
+ /* There's a bug in 1D texture sampling - it actually pays
+ * attention to the wrap_t value, though it should not.
+ * Override the wrap_t value here to GL_REPEAT to keep
+ * any nonexistent border pixels from floating in.
+ */
+ if (tex->base.target == PIPE_TEXTURE_1D)
+ wa = SAMP_T_WRAP;
+ else if (tex->base.target == PIPE_TEXTURE_CUBE ||
+ tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
+ /* Cube maps must use the same wrap mode for all three coordinate
+ * dimensions. Prior to Haswell, only CUBE and CLAMP are valid.
+ *
+ * Ivybridge and Baytrail seem to have problems with CUBE mode and
+ * integer formats. Fall back to CLAMP for now.
+ */
+ if (state->pstate.seamless_cube_map &&
+ !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
+ wa = SAMP_CUBE_CUBE;
+ else
+ wa = SAMP_CUBE_CLAMP;
+ }
+
+ uint32_t first_level = 0;
+ if (tex->base.target != PIPE_BUFFER)
+ first_level = tex->base.u.tex.first_level;
+
+ crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
+ }
+
+ map += GENX(SAMPLER_STATE_length);
+ }
+}
+
+/**
+ * The pipe->create_sampler_view() driver hook.
+ */
+static struct pipe_sampler_view *
+crocus_create_sampler_view(struct pipe_context *ctx,
+ struct pipe_resource *tex,
+ const struct pipe_sampler_view *tmpl)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));
+
+ if (!isv)
+ return NULL;
+
+ /* initialize base object */
+ isv->base = *tmpl;
+ isv->base.context = ctx;
+ isv->base.texture = NULL;
+ pipe_reference_init(&isv->base.reference, 1);
+ pipe_resource_reference(&isv->base.texture, tex);
+
+ if (util_format_is_depth_or_stencil(tmpl->format)) {
+ struct crocus_resource *zres, *sres;
+ const struct util_format_description *desc =
+ util_format_description(tmpl->format);
+
+ crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);
+
+ tex = util_format_has_depth(desc) ? &zres->base : &sres->base;
+
+ if (tex->format == PIPE_FORMAT_S8_UINT)
+ if (devinfo->ver == 7 && sres->shadow)
+ tex = &sres->shadow->base;
+ }
+
+ isv->res = (struct crocus_resource *) tex;
+
+ isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
+
+ if (isv->base.target == PIPE_TEXTURE_CUBE ||
+ isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
+ usage |= ISL_SURF_USAGE_CUBE_BIT;
+
+ const struct crocus_format_info fmt =
+ crocus_format_for_usage(devinfo, tmpl->format, usage);
+
+ enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
+ crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);
+
+ /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
+ if (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
+ tmpl->format == PIPE_FORMAT_X24S8_UINT) {
+ isv->swizzle[0] = tmpl->swizzle_g;
+ isv->swizzle[1] = tmpl->swizzle_g;
+ isv->swizzle[2] = tmpl->swizzle_g;
+ isv->swizzle[3] = tmpl->swizzle_g;
+ }
+
+ isv->clear_color = isv->res->aux.clear_color;
+
+ isv->view = (struct isl_view) {
+ .format = fmt.fmt,
+#if GFX_VERx10 >= 75
+ .swizzle = (struct isl_swizzle) {
+ .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
+ .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
+ .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
+ .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
+ },
+#else
+ /* swizzling handled in shader code */
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+#endif
+ .usage = usage,
+ };
+
+ /* Fill out SURFACE_STATE for this view. */
+ if (tmpl->target != PIPE_BUFFER) {
+ isv->view.base_level = tmpl->u.tex.first_level;
+ isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
+ // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
+ isv->view.base_array_layer = tmpl->u.tex.first_layer;
+ isv->view.array_len =
+ tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
+ }
+#if GFX_VER >= 6
+ /* just create a second view struct for texture gather just in case */
+ isv->gather_view = isv->view;
+
+#if GFX_VER >= 7
+ if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
+ fmt.fmt == ISL_FORMAT_R32G32_SINT ||
+ fmt.fmt == ISL_FORMAT_R32G32_UINT) {
+ isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
+#if GFX_VERx10 >= 75
+ isv->gather_view.swizzle = (struct isl_swizzle) {
+ .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
+ .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
+ .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
+ .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
+ };
+#endif
+ }
+#endif
+#if GFX_VER == 6
+ /* Sandybridge's gather4 message is broken for integer formats.
+ * To work around this, we pretend the surface is UNORM for
+ * 8 or 16-bit formats, and emit shader instructions to recover
+ * the real INT/UINT value. For 32-bit formats, we pretend
+ * the surface is FLOAT, and simply reinterpret the resulting
+ * bits.
+ */
+ switch (fmt.fmt) {
+ case ISL_FORMAT_R8_SINT:
+ case ISL_FORMAT_R8_UINT:
+ isv->gather_view.format = ISL_FORMAT_R8_UNORM;
+ break;
+
+ case ISL_FORMAT_R16_SINT:
+ case ISL_FORMAT_R16_UINT:
+ isv->gather_view.format = ISL_FORMAT_R16_UNORM;
+ break;
+
+ case ISL_FORMAT_R32_SINT:
+ case ISL_FORMAT_R32_UINT:
+ isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
+ break;
+
+ default:
+ break;
+ }
+#endif
+#endif
+ /* Finish any deferred aux import before the view is used. */
+ if (tmpl->target != PIPE_BUFFER) {
+ if (crocus_resource_unfinished_aux_import(isv->res))
+ crocus_resource_finish_aux_import(&screen->base, isv->res);
+ }
+
+ return &isv->base;
+}
+
+static void
+crocus_sampler_view_destroy(struct pipe_context *ctx,
+ struct pipe_sampler_view *state)
+{
+ struct crocus_sampler_view *isv = (void *) state;
+ pipe_resource_reference(&state->texture, NULL);
+ free(isv);
+}
+
+/**
+ * The pipe->create_surface() driver hook.
+ *
+ * In Gallium nomenclature, "surfaces" are a view of a resource that
+ * can be bound as a render target or depth/stencil buffer.
+ */
+static struct pipe_surface *
+crocus_create_surface(struct pipe_context *ctx,
+ struct pipe_resource *tex,
+ const struct pipe_surface *tmpl)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+
+ isl_surf_usage_flags_t usage = 0;
+ if (tmpl->writable)
+ usage = ISL_SURF_USAGE_STORAGE_BIT;
+ else if (util_format_is_depth_or_stencil(tmpl->format))
+ usage = ISL_SURF_USAGE_DEPTH_BIT;
+ else
+ usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
+
+ const struct crocus_format_info fmt =
+ crocus_format_for_usage(devinfo, tmpl->format, usage);
+
+ if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
+ !isl_format_supports_rendering(devinfo, fmt.fmt)) {
+ /* Framebuffer validation will reject this invalid case, but it
+ * hasn't had the opportunity yet. In the meantime, we need to
+ * avoid hitting ISL asserts about unsupported formats below.
+ */
+ return NULL;
+ }
+
+ struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
+ struct pipe_surface *psurf = &surf->base;
+ struct crocus_resource *res = (struct crocus_resource *) tex;
+
+ if (!surf)
+ return NULL;
+
+ pipe_reference_init(&psurf->reference, 1);
+ pipe_resource_reference(&psurf->texture, tex);
+ psurf->context = ctx;
+ psurf->format = tmpl->format;
+ psurf->width = tex->width0;
+ psurf->height = tex->height0;
+ psurf->texture = tex;
+ psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
+ psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
+ psurf->u.tex.level = tmpl->u.tex.level;
+
+ uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
+
+ struct isl_view *view = &surf->view;
+ *view = (struct isl_view) {
+ .format = fmt.fmt,
+ .base_level = tmpl->u.tex.level,
+ .levels = 1,
+ .base_array_layer = tmpl->u.tex.first_layer,
+ .array_len = array_len,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .usage = usage,
+ };
+
+#if GFX_VER >= 6
+ struct isl_view *read_view = &surf->read_view;
+ *read_view = (struct isl_view) {
+ .format = fmt.fmt,
+ .base_level = tmpl->u.tex.level,
+ .levels = 1,
+ .base_array_layer = tmpl->u.tex.first_layer,
+ .array_len = array_len,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .usage = ISL_SURF_USAGE_TEXTURE_BIT,
+ };
+#endif
+
+ surf->clear_color = res->aux.clear_color;
+
+ /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
+ if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
+ ISL_SURF_USAGE_STENCIL_BIT))
+ return psurf;
+
+ if (!isl_format_is_compressed(res->surf.format)) {
+ if (crocus_resource_unfinished_aux_import(res))
+ crocus_resource_finish_aux_import(&screen->base, res);
+
+ memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
+ uint32_t temp_offset, temp_x, temp_y;
+
+ isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
+ res->base.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
+ res->base.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
+ &temp_offset, &temp_x, &temp_y);
+ if (!devinfo->has_surface_tile_offset &&
+ (temp_x || temp_y)) {
+ /* Original gfx4 hardware couldn't draw to a non-tile-aligned
+ * destination.
+ */
+ /* move to temp */
+ struct pipe_resource wa_templ = (struct pipe_resource) {
+ .width0 = u_minify(res->base.width0, tmpl->u.tex.level),
+ .height0 = u_minify(res->base.height0, tmpl->u.tex.level),
+ .depth0 = 1,
+ .array_size = 1,
+ .format = res->base.format,
+ .target = PIPE_TEXTURE_2D,
+ .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
+ };
+ surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
+ view->base_level = 0;
+ view->base_array_layer = 0;
+ view->array_len = 1;
+ struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
+ memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
+ }
+ return psurf;
+ }
+
+ /* The resource has a compressed format, which is not renderable, but we
+ * have a renderable view format. We must be attempting to upload blocks
+ * of compressed data via an uncompressed view.
+ *
+ * In this case, we can assume there are no auxiliary buffers, a single
+ * miplevel, and that the resource is single-sampled. Gallium may try
+ * and create an uncompressed view with multiple layers, however.
+ */
+ assert(!isl_format_is_compressed(fmt.fmt));
+ assert(res->surf.samples == 1);
+ assert(view->levels == 1);
+
+ /* TODO: compressed pbo uploads aren't working here */
+ return NULL;
+
+ uint32_t offset_B = 0, tile_x_sa = 0, tile_y_sa = 0;
+
+ if (view->base_level > 0) {
+ /* We can't rely on the hardware's miplevel selection with such
+ * a substantial lie about the format, so we select a single image
+ * using the Tile X/Y Offset fields. In this case, we can't handle
+ * multiple array slices.
+ *
+ * On Broadwell, HALIGN and VALIGN are specified in pixels and are
+ * hard-coded to align to exactly the block size of the compressed
+ * texture. This means that, when reinterpreted as a non-compressed
+ * texture, the tile offsets may be anything and we can't rely on
+ * X/Y Offset.
+ *
+ * Return NULL to force the state tracker to take fallback paths.
+ */
+ // TODO: check if the gen7 check is right, originally gen8
+ if (view->array_len > 1 || GFX_VER == 7)
+ return NULL;
+
+ const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
+ isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
+ view->base_level,
+ is_3d ? 0 : view->base_array_layer,
+ is_3d ? view->base_array_layer : 0,
+ &surf->surf,
+ &offset_B, &tile_x_sa, &tile_y_sa);
+
+ /* We use address and tile offsets to access a single level/layer
+ * as a subimage, so reset level/layer so it doesn't offset again.
+ */
+ view->base_array_layer = 0;
+ view->base_level = 0;
+ } else {
+ /* Level 0 doesn't require tile offsets, and the hardware can find
+ * array slices using QPitch even with the format override, so we
+ * can allow layers in this case. Copy the original ISL surface.
+ */
+ memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
+ }
+
+ /* Scale down the image dimensions by the block size. */
+ const struct isl_format_layout *fmtl =
+ isl_format_get_layout(res->surf.format);
+ surf->surf.format = fmt.fmt;
+ surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
+ surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
+ tile_x_sa /= fmtl->bw;
+ tile_y_sa /= fmtl->bh;
+
+ psurf->width = surf->surf.logical_level0_px.width;
+ psurf->height = surf->surf.logical_level0_px.height;
+
+ return psurf;
+}
+
+#if GFX_VER == 7
+static void
+fill_default_image_param(struct brw_image_param *param)
+{
+ memset(param, 0, sizeof(*param));
+ /* Set the swizzling shifts to all-ones to effectively disable swizzling --
+ * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
+ * detailed explanation of these parameters.
+ */
+ param->swizzling[0] = 0xff;
+ param->swizzling[1] = 0xff;
+}
+
+static void
+fill_buffer_image_param(struct brw_image_param *param,
+ enum pipe_format pfmt,
+ unsigned size)
+{
+ const unsigned cpp = util_format_get_blocksize(pfmt);
+
+ fill_default_image_param(param);
+ param->size[0] = size / cpp;
+ param->stride[0] = cpp;
+}
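+/* For example (illustrative): a 256-byte buffer image in
+ * PIPE_FORMAT_R32G32B32A32_FLOAT (16 bytes per texel) ends up with
+ * size[0] = 16 texels and stride[0] = 16 bytes.
+ */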
+
+#endif
+
+/**
+ * The pipe->set_shader_images() driver hook.
+ */
+static void
+crocus_set_shader_images(struct pipe_context *ctx,
+ enum pipe_shader_type p_stage,
+ unsigned start_slot, unsigned count,
+ unsigned unbind_num_trailing_slots,
+ const struct pipe_image_view *p_images)
+{
+#if GFX_VER == 7
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ gl_shader_stage stage = stage_from_pipe(p_stage);
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ struct crocus_genx_state *genx = ice->state.genx;
+ struct brw_image_param *image_params = genx->shaders[stage].image_param;
+
+ shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);
+
+ for (unsigned i = 0; i < count; i++) {
+ struct crocus_image_view *iv = &shs->image[start_slot + i];
+
+ if (p_images && p_images[i].resource) {
+ const struct pipe_image_view *img = &p_images[i];
+ struct crocus_resource *res = (void *) img->resource;
+
+ util_copy_image_view(&iv->base, img);
+
+ shs->bound_image_views |= 1 << (start_slot + i);
+
+ res->bind_history |= PIPE_BIND_SHADER_IMAGE;
+ res->bind_stages |= 1 << stage;
+
+ isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
+ struct crocus_format_info fmt =
+ crocus_format_for_usage(devinfo, img->format, usage);
+
+ struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
+ if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
+ /* Try to use typed surface reads (which support a limited
+ * number of formats), and if that's not possible, fall back
+ * to untyped reads.
+ */
+ if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
+ fmt.fmt = ISL_FORMAT_RAW;
+ else
+ fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
+ }
+
+ if (res->base.target != PIPE_BUFFER) {
+ struct isl_view view = {
+ .format = fmt.fmt,
+ .base_level = img->u.tex.level,
+ .levels = 1,
+ .base_array_layer = img->u.tex.first_layer,
+ .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
+ .swizzle = swiz,
+ .usage = usage,
+ };
+
+ iv->view = view;
+
+ isl_surf_fill_image_param(&screen->isl_dev,
+ &image_params[start_slot + i],
+ &res->surf, &view);
+ } else {
+ struct isl_view view = {
+ .format = fmt.fmt,
+ .swizzle = swiz,
+ .usage = usage,
+ };
+ iv->view = view;
+
+ util_range_add(&res->base, &res->valid_buffer_range, img->u.buf.offset,
+ img->u.buf.offset + img->u.buf.size);
+ fill_buffer_image_param(&image_params[start_slot + i],
+ img->format, img->u.buf.size);
+ }
+ } else {
+ pipe_resource_reference(&iv->base.resource, NULL);
+ fill_default_image_param(&image_params[start_slot + i]);
+ }
+ }
+
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
+ ice->state.dirty |=
+ stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
+ : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+
+ /* The brw_image_params are passed in via system values, so the push constants need re-uploading too */
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
+ shs->sysvals_need_upload = true;
+#endif
+}
+
+
+/**
+ * The pipe->set_sampler_views() driver hook.
+ */
+static void
+crocus_set_sampler_views(struct pipe_context *ctx,
+ enum pipe_shader_type p_stage,
+ unsigned start, unsigned count,
+ unsigned unbind_num_trailing_slots,
+ struct pipe_sampler_view **views)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ gl_shader_stage stage = stage_from_pipe(p_stage);
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+ shs->bound_sampler_views &= ~u_bit_consecutive(start, count);
+
+ for (unsigned i = 0; i < count; i++) {
+ struct pipe_sampler_view *pview = views ? views[i] : NULL;
+ pipe_sampler_view_reference((struct pipe_sampler_view **)
+ &shs->textures[start + i], pview);
+ struct crocus_sampler_view *view = (void *) pview;
+ if (view) {
+ view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
+ view->res->bind_stages |= 1 << stage;
+
+ shs->bound_sampler_views |= 1 << (start + i);
+ }
+ }
+
+ ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
+ ice->state.dirty |=
+ stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
+ : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+ ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
+}
+
+/**
+ * The pipe->set_tess_state() driver hook.
+ */
+static void
+crocus_set_tess_state(struct pipe_context *ctx,
+ const float default_outer_level[4],
+ const float default_inner_level[2])
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
+
+ memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
+ memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
+
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+ shs->sysvals_need_upload = true;
+}
+
+static void
+crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
+{
+ struct crocus_surface *surf = (void *) p_surf;
+ pipe_resource_reference(&p_surf->texture, NULL);
+
+ pipe_resource_reference(&surf->align_res, NULL);
+ free(surf);
+}
+
+static void
+crocus_set_clip_state(struct pipe_context *ctx,
+ const struct pipe_clip_state *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
+ struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
+ struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
+
+ memcpy(&ice->state.clip_planes, state, sizeof(*state));
+
+#if GFX_VER <= 5
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
+#endif
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+ shs->sysvals_need_upload = true;
+ gshs->sysvals_need_upload = true;
+ tshs->sysvals_need_upload = true;
+}
+
+/**
+ * The pipe->set_polygon_stipple() driver hook.
+ */
+static void
+crocus_set_polygon_stipple(struct pipe_context *ctx,
+ const struct pipe_poly_stipple *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ memcpy(&ice->state.poly_stipple, state, sizeof(*state));
+ ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
+}
+
+/**
+ * The pipe->set_sample_mask() driver hook.
+ */
+static void
+crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+
+ /* We only support 8x MSAA at most, so we have 8 bits of sample mask.
+ * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
+ */
+ ice->state.sample_mask = sample_mask & 0xff;
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
+}
+
+static void
+crocus_fill_scissor_rect(struct crocus_context *ice,
+ int idx,
+ struct pipe_scissor_state *ss)
+{
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
+ const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
+ struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
+ .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
+ .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
+ .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
+ .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
+ };
+ if (cso_state->scissor) {
+ struct pipe_scissor_state *s = &ice->state.scissors[idx];
+ scissor.minx = MAX2(scissor.minx, s->minx);
+ scissor.miny = MAX2(scissor.miny, s->miny);
+ scissor.maxx = MIN2(scissor.maxx, s->maxx);
+ scissor.maxy = MIN2(scissor.maxy, s->maxy);
+ }
+ *ss = scissor;
+}
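+/* Worked example (illustrative): a viewport with scale (320, -240) and
+ * translate (320, 240) over a 640x480 framebuffer produces the rectangle
+ * [0, 639] x [0, 479]; an API scissor, if enabled, only ever shrinks it
+ * further via the MAX2/MIN2 intersection above.
+ */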
+
+/**
+ * The pipe->set_scissor_states() driver hook.
+ *
+ * This corresponds to our SCISSOR_RECT state structures. It's an
+ * exact match, so we just store them, and memcpy them out later.
+ */
+static void
+crocus_set_scissor_states(struct pipe_context *ctx,
+ unsigned start_slot,
+ unsigned num_scissors,
+ const struct pipe_scissor_state *rects)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+
+ for (unsigned i = 0; i < num_scissors; i++) {
+ if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
+ /* If the scissor was out of bounds and got clamped to 0 width/height
+ * at the bounds, the subtraction of 1 from maximums could produce a
+ * negative number and thus not clip anything. Instead, just provide
+ * a min > max scissor inside the bounds, which produces the expected
+ * no rendering.
+ */
+ ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
+ .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
+ };
+ } else {
+ ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
+ .minx = rects[i].minx, .miny = rects[i].miny,
+ .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
+ };
+ }
+ }
+
+#if GFX_VER < 6
+ ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
+#else
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+#endif
+ ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
+
+}
+
+/**
+ * The pipe->set_stencil_ref() driver hook.
+ *
+ * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
+ */
+static void
+crocus_set_stencil_ref(struct pipe_context *ctx,
+ const struct pipe_stencil_ref ref)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ ice->state.stencil_ref = ref;
+ ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+}
+
+/**
+ * The pipe->set_viewport_states() driver hook.
+ *
+ * This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate
+ * the guardband yet, as we need the framebuffer dimensions, but we can
+ * at least fill out the rest.
+ */
+static void
+crocus_set_viewport_states(struct pipe_context *ctx,
+ unsigned start_slot,
+ unsigned count,
+ const struct pipe_viewport_state *states)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+
+ memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
+
+ ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
+ ice->state.dirty |= CROCUS_DIRTY_RASTER;
+#if GFX_VER >= 6
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+#endif
+
+ if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
+ !ice->state.cso_rast->cso.depth_clip_far))
+ ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
+}
+
+/**
+ * The pipe->set_framebuffer_state() driver hook.
+ *
+ * Sets the current draw FBO, including color render targets, depth,
+ * and stencil buffers.
+ */
+static void
+crocus_set_framebuffer_state(struct pipe_context *ctx,
+ const struct pipe_framebuffer_state *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+#if 0
+ struct isl_device *isl_dev = &screen->isl_dev;
+ struct crocus_resource *zres;
+ struct crocus_resource *stencil_res;
+#endif
+
+ unsigned samples = util_framebuffer_get_num_samples(state);
+ unsigned layers = util_framebuffer_get_num_layers(state);
+
+#if GFX_VER >= 6
+ if (cso->samples != samples) {
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
+ ice->state.dirty |= CROCUS_DIRTY_RASTER;
+#if GFX_VERx10 == 75
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
+#endif
+ }
+#endif
+
+#if GFX_VER >= 6
+ if (cso->nr_cbufs != state->nr_cbufs) {
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+ }
+#endif
+
+ if ((cso->layers == 0) != (layers == 0)) {
+ ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ }
+
+ if (cso->width != state->width || cso->height != state->height) {
+ ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
+ ice->state.dirty |= CROCUS_DIRTY_RASTER;
+ ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
+#if GFX_VER >= 6
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+#endif
+ }
+
+ if (cso->zsbuf || state->zsbuf) {
+ ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
+
+ /* update SF's depth buffer format */
+ if (GFX_VER == 7 && cso->zsbuf)
+ ice->state.dirty |= CROCUS_DIRTY_RASTER;
+ }
+
+ /* wm thread dispatch enable */
+ ice->state.dirty |= CROCUS_DIRTY_WM;
+ util_copy_framebuffer_state(cso, state);
+ cso->samples = samples;
+ cso->layers = layers;
+
+ if (cso->zsbuf) {
+ struct crocus_resource *zres;
+ struct crocus_resource *stencil_res;
+ enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
+ crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
+ &stencil_res);
+ if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
+ aux_usage = zres->aux.usage;
+ }
+ ice->state.hiz_usage = aux_usage;
+ }
+
+ /* Render target change */
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
+
+ ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+
+ ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
+}
+
+/**
+ * The pipe->set_constant_buffer() driver hook.
+ *
+ * This uploads any constant data in user buffers, and references
+ * any UBO resources containing constant data.
+ */
+static void
+crocus_set_constant_buffer(struct pipe_context *ctx,
+ enum pipe_shader_type p_stage, unsigned index,
+ bool take_ownership,
+ const struct pipe_constant_buffer *input)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ gl_shader_stage stage = stage_from_pipe(p_stage);
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ struct pipe_constant_buffer *cbuf = &shs->constbufs[index];
+
+ util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);
+
+ if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
+ shs->bound_cbufs |= 1u << index;
+
+ if (input->user_buffer) {
+ void *map = NULL;
+ pipe_resource_reference(&cbuf->buffer, NULL);
+ u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
+ &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
+
+ if (!cbuf->buffer) {
+ /* Allocation was unsuccessful - just unbind */
+ crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
+ return;
+ }
+
+ assert(map);
+ memcpy(map, input->user_buffer, input->buffer_size);
+ }
+ cbuf->buffer_size =
+ MIN2(input->buffer_size,
+ crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
+
+ struct crocus_resource *res = (void *) cbuf->buffer;
+ res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
+ res->bind_stages |= 1 << stage;
+ } else {
+ shs->bound_cbufs &= ~(1u << index);
+ }
+
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
+}
+
+static void
+upload_sysvals(struct crocus_context *ice,
+ gl_shader_stage stage)
+{
+ UNUSED struct crocus_genx_state *genx = ice->state.genx;
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+ struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+ if (!shader || shader->num_system_values == 0)
+ return;
+
+ assert(shader->num_cbufs > 0);
+
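+ /* System values are packed into the shader's last constant buffer slot
+ * (editorial note: that is the convention the index computation below
+ * relies on).
+ */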
+ unsigned sysval_cbuf_index = shader->num_cbufs - 1;
+ struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
+ unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
+ uint32_t *map = NULL;
+
+ assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
+ u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
+ &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
+
+ for (int i = 0; i < shader->num_system_values; i++) {
+ uint32_t sysval = shader->system_values[i];
+ uint32_t value = 0;
+
+ if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
+#if GFX_VER == 7
+ unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
+ unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
+ struct brw_image_param *param =
+ &genx->shaders[stage].image_param[img];
+
+ assert(offset < sizeof(struct brw_image_param));
+ value = ((uint32_t *) param)[offset];
+#endif
+ } else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
+ value = 0;
+ } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
+ int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
+ int comp = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
+ value = fui(ice->state.clip_planes.ucp[plane][comp]);
+ } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
+ if (stage == MESA_SHADER_TESS_CTRL) {
+ value = ice->state.vertices_per_patch;
+ } else {
+ assert(stage == MESA_SHADER_TESS_EVAL);
+ const struct shader_info *tcs_info =
+ crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
+ if (tcs_info)
+ value = tcs_info->tess.tcs_vertices_out;
+ else
+ value = ice->state.vertices_per_patch;
+ }
+ } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
+ sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
+ unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
+ value = fui(ice->state.default_outer_level[i]);
+ } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
+ value = fui(ice->state.default_inner_level[0]);
+ } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
+ value = fui(ice->state.default_inner_level[1]);
+ } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
+ sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
+ unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
+ value = ice->state.last_block[i];
+ } else {
+ assert(!"unhandled system value");
+ }
+
+ *map++ = value;
+ }
+
+ cbuf->buffer_size = upload_size;
+ shs->sysvals_need_upload = false;
+}
+
+/**
+ * The pipe->set_shader_buffers() driver hook.
+ *
+ * This binds SSBOs and ABOs. Unfortunately, we need to stream out
+ * SURFACE_STATE here, as the buffer offset may change each time.
+ */
+static void
+crocus_set_shader_buffers(struct pipe_context *ctx,
+ enum pipe_shader_type p_stage,
+ unsigned start_slot, unsigned count,
+ const struct pipe_shader_buffer *buffers,
+ unsigned writable_bitmask)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ gl_shader_stage stage = stage_from_pipe(p_stage);
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+ unsigned modified_bits = u_bit_consecutive(start_slot, count);
+
+ shs->bound_ssbos &= ~modified_bits;
+ shs->writable_ssbos &= ~modified_bits;
+ shs->writable_ssbos |= writable_bitmask << start_slot;
+
+ for (unsigned i = 0; i < count; i++) {
+ if (buffers && buffers[i].buffer) {
+ struct crocus_resource *res = (void *) buffers[i].buffer;
+ struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
+ pipe_resource_reference(&ssbo->buffer, &res->base);
+ ssbo->buffer_offset = buffers[i].buffer_offset;
+ ssbo->buffer_size =
+ MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
+
+ shs->bound_ssbos |= 1 << (start_slot + i);
+
+ res->bind_history |= PIPE_BIND_SHADER_BUFFER;
+ res->bind_stages |= 1 << stage;
+
+ util_range_add(&res->base, &res->valid_buffer_range, ssbo->buffer_offset,
+ ssbo->buffer_offset + ssbo->buffer_size);
+ } else {
+ pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
+ }
+ }
+
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
+}
+
+static void
+crocus_delete_state(struct pipe_context *ctx, void *state)
+{
+ free(state);
+}
+
+/**
+ * The pipe->set_vertex_buffers() driver hook.
+ *
+ * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
+ */
+static void
+crocus_set_vertex_buffers(struct pipe_context *ctx,
+ unsigned start_slot, unsigned count,
+ unsigned unbind_num_trailing_slots,
+ bool take_ownership,
+ const struct pipe_vertex_buffer *buffers)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
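+   /* Haswell and Baytrail need no end-address padding; everything else gets
+    * 2 bytes added to the recorded vertex buffer end (vb_end is consumed
+    * later when the vertex buffer packets are emitted).
+    */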
+ const unsigned padding =
+ (!(GFX_VERx10 == 75) && !screen->devinfo.is_baytrail) * 2;
+ ice->state.bound_vertex_buffers &=
+ ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
+
+ util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
+ buffers, start_slot, count, unbind_num_trailing_slots,
+ take_ownership);
+
+ for (unsigned i = 0; i < count; i++) {
+ struct pipe_vertex_buffer *state =
+ &ice->state.vertex_buffers[start_slot + i];
+
+ if (!state->is_user_buffer && state->buffer.resource) {
+ struct crocus_resource *res = (void *)state->buffer.resource;
+ res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
+ }
+
+ uint32_t end = 0;
+ if (state->buffer.resource)
+ end = state->buffer.resource->width0 + padding;
+ ice->state.vb_end[start_slot + i] = end;
+ }
+ ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
+}
+
+#if !(GFX_VERx10 == 75)
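+/* Map a vertex format to the BRW_ATTRIB_WA_* fixups (sign extension, scaling,
+ * normalization, BGRA swizzle) that the vertex shader must apply on parts
+ * that cannot fetch the format directly.
+ */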
+static uint8_t get_wa_flags(enum isl_format format)
+{
+ uint8_t wa_flags = 0;
+
+ switch (format) {
+ case ISL_FORMAT_R10G10B10A2_USCALED:
+ wa_flags = BRW_ATTRIB_WA_SCALE;
+ break;
+ case ISL_FORMAT_R10G10B10A2_SSCALED:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
+ break;
+ case ISL_FORMAT_R10G10B10A2_UNORM:
+ wa_flags = BRW_ATTRIB_WA_NORMALIZE;
+ break;
+ case ISL_FORMAT_R10G10B10A2_SNORM:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
+ break;
+ case ISL_FORMAT_R10G10B10A2_SINT:
+ wa_flags = BRW_ATTRIB_WA_SIGN;
+ break;
+ case ISL_FORMAT_B10G10R10A2_USCALED:
+ wa_flags = BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_SSCALED:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_UNORM:
+ wa_flags = BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_SNORM:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_SINT:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_UINT:
+ wa_flags = BRW_ATTRIB_WA_BGRA;
+ break;
+ default:
+ break;
+ }
+ return wa_flags;
+}
+#endif
+
+/**
+ * Gallium CSO for vertex elements.
+ */
+struct crocus_vertex_element_state {
+   /* Packed 3DSTATE_VERTEX_ELEMENTS: header DWord plus up to 33 elements. */
+   uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
+   /* Alternative last element with EdgeFlag enabled, for use at draw time. */
+   uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
+   /* Per-vertex-buffer instance divisors. */
+   uint32_t step_rate[16];
+   /* Per-element vertex attribute workaround flags (not needed on Haswell). */
+   uint8_t wa_flags[33];
+   unsigned count;
+};
+
+/**
+ * The pipe->create_vertex_elements() driver hook.
+ *
+ * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
+ * packet.  The vertex_elements array is ready to be emitted at draw time
+ * if no EdgeFlag or SGVs are needed; otherwise we need information that is
+ * only available at draw time.  For that case we also set up edgeflag_ve,
+ * an alternative last VERTEX_ELEMENT_STATE that can be substituted if we
+ * detect that the Vertex Shader uses EdgeFlag.
+ */
+static void *
+crocus_create_vertex_elements(struct pipe_context *ctx,
+ unsigned count,
+ const struct pipe_vertex_element *state)
+{
+ struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_vertex_element_state *cso =
+ malloc(sizeof(struct crocus_vertex_element_state));
+
+ cso->count = count;
+
+ crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
+ ve.DWordLength =
+ 1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
+ }
+
+ uint32_t *ve_pack_dest = &cso->vertex_elements[1];
+
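+   /* With no vertex elements bound, still emit one valid element that
+    * stores the constants (0, 0, 0, 1).
+    */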
+ if (count == 0) {
+ crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
+ ve.Valid = true;
+ ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
+ ve.Component0Control = VFCOMP_STORE_0;
+ ve.Component1Control = VFCOMP_STORE_0;
+ ve.Component2Control = VFCOMP_STORE_0;
+ ve.Component3Control = VFCOMP_STORE_1_FP;
+ }
+ }
+
+ for (int i = 0; i < count; i++) {
+ const struct crocus_format_info fmt =
+ crocus_format_for_usage(devinfo, state[i].src_format, 0);
+ unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
+ VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
+ enum isl_format actual_fmt = fmt.fmt;
+
+#if !(GFX_VERx10 == 75)
+ cso->wa_flags[i] = get_wa_flags(fmt.fmt);
+
+ if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
+ fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
+ fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
+ fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
+ fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
+ fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
+ fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
+ fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
+ fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
+ fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
+ fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
+ actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
+ if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
+ actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
+ if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
+ actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
+ if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
+ actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
+ if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
+ actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
+#endif
+
+ cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
+
+ switch (isl_format_get_num_channels(fmt.fmt)) {
+ case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
+ case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
+ case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
+ case 3:
+ comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
+ : VFCOMP_STORE_1_FP;
+ break;
+ }
+ crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
+#if GFX_VER >= 6
+ ve.EdgeFlagEnable = false;
+#endif
+ ve.VertexBufferIndex = state[i].vertex_buffer_index;
+ ve.Valid = true;
+ ve.SourceElementOffset = state[i].src_offset;
+ ve.SourceElementFormat = actual_fmt;
+ ve.Component0Control = comp[0];
+ ve.Component1Control = comp[1];
+ ve.Component2Control = comp[2];
+ ve.Component3Control = comp[3];
+#if GFX_VER < 5
+ ve.DestinationElementOffset = i * 4;
+#endif
+ }
+
+ ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
+ }
+
+   /* An alternative version of the last VE is stored so it can be used
+    * at draw time in case the Vertex Shader uses EdgeFlag.
+    */
+ if (count) {
+ const unsigned edgeflag_index = count - 1;
+ const struct crocus_format_info fmt =
+ crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
+ crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
+#if GFX_VER >= 6
+ ve.EdgeFlagEnable = true;
+#endif
+ ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
+ ve.Valid = true;
+ ve.SourceElementOffset = state[edgeflag_index].src_offset;
+ ve.SourceElementFormat = fmt.fmt;
+ ve.Component0Control = VFCOMP_STORE_SRC;
+ ve.Component1Control = VFCOMP_STORE_0;
+ ve.Component2Control = VFCOMP_STORE_0;
+ ve.Component3Control = VFCOMP_STORE_0;
+ }
+ }
+
+ return cso;
+}
+
+/**
+ * The pipe->bind_vertex_elements_state() driver hook.
+ */
+static void
+crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+
+ ice->state.cso_vertex_elements = state;
+ ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
+ ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
+}
+
+#if GFX_VER >= 6
+struct crocus_streamout_counter {
+ uint32_t offset_start;
+ uint32_t offset_end;
+
+ uint64_t accum;
+};
+
+/**
+ * Gallium CSO for stream output (transform feedback) targets.
+ */
+struct crocus_stream_output_target {
+ struct pipe_stream_output_target base;
+
+ /** Stride (bytes-per-vertex) during this transform feedback operation */
+ uint16_t stride;
+
+ /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
+ bool zeroed;
+
+ struct crocus_resource *offset_res;
+ uint32_t offset_offset;
+
+#if GFX_VER == 6
+ void *prim_map;
+ struct crocus_streamout_counter prev_count;
+ struct crocus_streamout_counter count;
+#endif
+};
+
+#if GFX_VER >= 7
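+/* Read the current streamout write offset back from its buffer and convert
+ * it from bytes to a vertex count.
+ */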
+static uint32_t
+crocus_get_so_offset(struct pipe_stream_output_target *so)
+{
+ struct crocus_stream_output_target *tgt = (void *)so;
+ struct pipe_transfer *transfer;
+ struct pipe_box box;
+ uint32_t result;
+ u_box_1d(tgt->offset_offset, 4, &box);
+ void *val = so->context->buffer_map(so->context, &tgt->offset_res->base,
+ 0, PIPE_MAP_DIRECTLY,
+ &box, &transfer);
+ assert(val);
+ result = *(uint32_t *)val;
+ so->context->buffer_unmap(so->context, transfer);
+
+ return result / tgt->stride;
+}
+#endif
+
+#if GFX_VER == 6
+static void
+compute_vertices_written_so_far(struct crocus_context *ice,
+ struct crocus_stream_output_target *tgt,
+ struct crocus_streamout_counter *count,
+ uint64_t *svbi);
+
+static uint32_t
+crocus_get_so_offset(struct pipe_stream_output_target *so)
+{
+ struct crocus_stream_output_target *tgt = (void *)so;
+ struct crocus_context *ice = (void *)so->context;
+
+ uint64_t vert_written;
+ compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
+ return vert_written;
+}
+#endif
+
+/**
+ * The pipe->create_stream_output_target() driver hook.
+ *
+ * "Target" here refers to a destination buffer. We translate this into
+ * a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet
+ * know which buffer this represents, or whether we ought to zero the
+ * write-offsets, or append. Those are handled in the set() hook.
+ */
+static struct pipe_stream_output_target *
+crocus_create_stream_output_target(struct pipe_context *ctx,
+ struct pipe_resource *p_res,
+ unsigned buffer_offset,
+ unsigned buffer_size)
+{
+ struct crocus_resource *res = (void *) p_res;
+ struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
+ if (!cso)
+ return NULL;
+
+ res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
+
+ pipe_reference_init(&cso->base.reference, 1);
+ pipe_resource_reference(&cso->base.buffer, p_res);
+ cso->base.buffer_offset = buffer_offset;
+ cso->base.buffer_size = buffer_size;
+ cso->base.context = ctx;
+
+ util_range_add(&res->base, &res->valid_buffer_range, buffer_offset,
+ buffer_offset + buffer_size);
+#if GFX_VER >= 7
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ void *temp;
+ u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
+ &cso->offset_offset,
+ (struct pipe_resource **)&cso->offset_res,
+ &temp);
+#endif
+
+ return &cso->base;
+}
+
+static void
+crocus_stream_output_target_destroy(struct pipe_context *ctx,
+ struct pipe_stream_output_target *state)
+{
+ struct crocus_stream_output_target *cso = (void *) state;
+
+ pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
+ pipe_resource_reference(&cso->base.buffer, NULL);
+
+ free(cso);
+}
+
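+/* Registers used to track streamout progress: the gen6 primitive counter and
+ * the gen7 per-buffer SO write offsets.
+ */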
+#define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
+#define GEN7_SO_WRITE_OFFSET(n) (0x5280 + (n) * 4)
+
+#if GFX_VER == 6
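+/* Fold the (start, end) pairs of primitive counts recorded in tgt->prim_map
+ * into counter->accum, flushing and waiting first if the batch still
+ * references the buffer.
+ */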
+static void
+aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
+ struct crocus_streamout_counter *counter)
+{
+ uint64_t *prim_counts = tgt->prim_map;
+
+ if (crocus_batch_references(batch, tgt->offset_res->bo)) {
+ struct pipe_fence_handle *out_fence = NULL;
+ batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
+ batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
+ batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
+ }
+
+ for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
+ counter->accum += prim_counts[i + 1] - prim_counts[i];
+ }
+ tgt->count.offset_start = tgt->count.offset_end = 0;
+}
+
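+/* Snapshot SO_NUM_PRIMS_WRITTEN into the next slot of the target's counter
+ * buffer, allocating the buffer on first use and aggregating old entries
+ * when it is nearly full.
+ */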
+static void
+crocus_stream_store_prims_written(struct crocus_batch *batch,
+ struct crocus_stream_output_target *tgt)
+{
+ if (!tgt->offset_res) {
+ u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
+ &tgt->offset_offset,
+ (struct pipe_resource **)&tgt->offset_res,
+ &tgt->prim_map);
+ tgt->count.offset_start = tgt->count.offset_end = 0;
+ }
+
+ if (tgt->count.offset_end + 16 >= 4096) {
+ aggregate_stream_counter(batch, tgt, &tgt->prev_count);
+ aggregate_stream_counter(batch, tgt, &tgt->count);
+ }
+
+ crocus_emit_mi_flush(batch);
+ crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
+ tgt->offset_res->bo,
+ tgt->count.offset_end + tgt->offset_offset, false);
+ tgt->count.offset_end += 8;
+}
+
+static void
+compute_vertices_written_so_far(struct crocus_context *ice,
+ struct crocus_stream_output_target *tgt,
+ struct crocus_streamout_counter *counter,
+ uint64_t *svbi)
+{
+ //TODO vertices per prim
+ aggregate_stream_counter(&ice->batches[0], tgt, counter);
+
+ *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
+}
+#endif
+
+/**
+ * The pipe->set_stream_output_targets() driver hook.
+ *
+ * At this point, we know which targets are bound to a particular index,
+ * and also whether we want to append or start over. We can finish the
+ * 3DSTATE_SO_BUFFER packets we started earlier.
+ */
+static void
+crocus_set_stream_output_targets(struct pipe_context *ctx,
+ unsigned num_targets,
+ struct pipe_stream_output_target **targets,
+ const unsigned *offsets)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+ struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
+ const bool active = num_targets > 0;
+ if (ice->state.streamout_active != active) {
+ ice->state.streamout_active = active;
+#if GFX_VER >= 7
+ ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
+#else
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+#endif
+
+ /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
+ * it's a non-pipelined command. If we're switching streamout on, we
+ * may have missed emitting it earlier, so do so now. (We're already
+ * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
+ */
+ if (active) {
+#if GFX_VER >= 7
+ ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
+#endif
+ } else {
+ uint32_t flush = 0;
+ for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+ struct crocus_stream_output_target *tgt =
+ (void *) ice->state.so_target[i];
+ if (tgt) {
+ struct crocus_resource *res = (void *) tgt->base.buffer;
+
+ flush |= crocus_flush_bits_for_history(res);
+ crocus_dirty_for_history(ice, res);
+ }
+ }
+ crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
+ "make streamout results visible", flush);
+ }
+ }
+
+ ice->state.so_targets = num_targets;
+ for (int i = 0; i < 4; i++) {
+ pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
+ pipe_so_target_reference(&ice->state.so_target[i],
+ i < num_targets ? targets[i] : NULL);
+ }
+
+#if GFX_VER == 6
+ bool stored_num_prims = false;
+ for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+ if (num_targets) {
+ struct crocus_stream_output_target *tgt =
+ (void *) ice->state.so_target[i];
+
+ if (!tgt)
+ continue;
+ if (offsets[i] == 0) {
+ // This means that we're supposed to ignore anything written to
+ // the buffer before. We can do this by just clearing out the
+ // count of writes to the prim count buffer.
+ tgt->count.offset_start = tgt->count.offset_end;
+ tgt->count.accum = 0;
+ ice->state.svbi = 0;
+ } else {
+ if (tgt->offset_res) {
+ compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
+ tgt->count.offset_start = tgt->count.offset_end;
+ }
+ }
+
+ if (!stored_num_prims) {
+ crocus_stream_store_prims_written(batch, tgt);
+ stored_num_prims = true;
+ }
+ } else {
+ struct crocus_stream_output_target *tgt =
+ (void *) old_tgt[i];
+ if (tgt) {
+ if (!stored_num_prims) {
+ crocus_stream_store_prims_written(batch, tgt);
+ stored_num_prims = true;
+ }
+
+ if (tgt->offset_res) {
+ tgt->prev_count = tgt->count;
+ }
+ }
+ }
+ pipe_so_target_reference(&old_tgt[i], NULL);
+ }
+
+#else
+ for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+ if (num_targets) {
+ struct crocus_stream_output_target *tgt =
+ (void *) ice->state.so_target[i];
+
+ if (offsets[i] == 0)
+ crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
+ else if (tgt)
+ crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
+ tgt->offset_res->bo,
+ tgt->offset_offset);
+ } else {
+ struct crocus_stream_output_target *tgt =
+ (void *) old_tgt[i];
+ if (tgt)
+ crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
+ tgt->offset_res->bo,
+ tgt->offset_offset, false);
+ }
+ pipe_so_target_reference(&old_tgt[i], NULL);
+ }
+#endif
+ /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
+ if (!active)
+ return;
+#if GFX_VER >= 7
+ ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
+#elif GFX_VER == 6
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
+#endif
+}
+
+#endif
+
+#if GFX_VER >= 7
+/**
+ * A crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
+ * 3DSTATE_STREAMOUT packets.
+ *
+ * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
+ * hardware to record. We can create it entirely based on the shader, with
+ * no dynamic state dependencies.
+ *
+ * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
+ * state-based settings. We capture the shader-related ones here, and merge
+ * the rest in at draw time.
+ */
+static uint32_t *
+crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
+ const struct brw_vue_map *vue_map)
+{
+ struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
+ int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+ int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+ int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+ int max_decls = 0;
+ STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
+
+ memset(so_decl, 0, sizeof(so_decl));
+
+ /* Construct the list of SO_DECLs to be emitted. The formatting of the
+ * command feels strange -- each dword pair contains a SO_DECL per stream.
+ */
+ for (unsigned i = 0; i < info->num_outputs; i++) {
+ const struct pipe_stream_output *output = &info->output[i];
+ const int buffer = output->output_buffer;
+ const int varying = output->register_index;
+ const unsigned stream_id = output->stream;
+ assert(stream_id < MAX_VERTEX_STREAMS);
+
+ buffer_mask[stream_id] |= 1 << buffer;
+
+ assert(vue_map->varying_to_slot[varying] >= 0);
+
+ /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
+ * array. Instead, it simply increments DstOffset for the following
+ * input by the number of components that should be skipped.
+ *
+ * Our hardware is unusual in that it requires us to program SO_DECLs
+ * for fake "hole" components, rather than simply taking the offset
+ * for each real varying. Each hole can have size 1, 2, 3, or 4; we
+ * program as many size = 4 holes as we can, then a final hole to
+ * accommodate the final 1, 2, or 3 remaining.
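+       * For example, a skip of 7 components becomes one size-4 hole
+       * followed by one size-3 hole.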
+ */
+ int skip_components = output->dst_offset - next_offset[buffer];
+
+ while (skip_components > 0) {
+ so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+ .HoleFlag = 1,
+ .OutputBufferSlot = output->output_buffer,
+ .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
+ };
+ skip_components -= 4;
+ }
+
+ next_offset[buffer] = output->dst_offset + output->num_components;
+
+ so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+ .OutputBufferSlot = output->output_buffer,
+ .RegisterIndex = vue_map->varying_to_slot[varying],
+ .ComponentMask =
+ ((1 << output->num_components) - 1) << output->start_component,
+ };
+
+ if (decls[stream_id] > max_decls)
+ max_decls = decls[stream_id];
+ }
+
+ unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
+ uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
+ uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
+
+ crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
+ int urb_entry_read_offset = 0;
+ int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
+ urb_entry_read_offset;
+
+ /* We always read the whole vertex. This could be reduced at some
+ * point by reading less and offsetting the register index in the
+ * SO_DECLs.
+ */
+ sol.Stream0VertexReadOffset = urb_entry_read_offset;
+ sol.Stream0VertexReadLength = urb_entry_read_length - 1;
+ sol.Stream1VertexReadOffset = urb_entry_read_offset;
+ sol.Stream1VertexReadLength = urb_entry_read_length - 1;
+ sol.Stream2VertexReadOffset = urb_entry_read_offset;
+ sol.Stream2VertexReadLength = urb_entry_read_length - 1;
+ sol.Stream3VertexReadOffset = urb_entry_read_offset;
+ sol.Stream3VertexReadLength = urb_entry_read_length - 1;
+
+ // TODO: Double-check that stride == 0 means no buffer. Probably this
+ // needs to go elsewhere, where the buffer enable stuff is actually
+ // known.
+ sol.SOBufferEnable0 = !!info->stride[0];
+ sol.SOBufferEnable1 = !!info->stride[1];
+ sol.SOBufferEnable2 = !!info->stride[2];
+ sol.SOBufferEnable3 = !!info->stride[3];
+ }
+
+ crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
+ list.DWordLength = 3 + 2 * max_decls - 2;
+ list.StreamtoBufferSelects0 = buffer_mask[0];
+ list.StreamtoBufferSelects1 = buffer_mask[1];
+ list.StreamtoBufferSelects2 = buffer_mask[2];
+ list.StreamtoBufferSelects3 = buffer_mask[3];
+ list.NumEntries0 = decls[0];
+ list.NumEntries1 = decls[1];
+ list.NumEntries2 = decls[2];
+ list.NumEntries3 = decls[3];
+ }
+
+ for (int i = 0; i < max_decls; i++) {
+ crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
+ entry.Stream0Decl = so_decl[0][i];
+ entry.Stream1Decl = so_decl[1][i];
+ entry.Stream2Decl = so_decl[2][i];
+ entry.Stream3Decl = so_decl[3][i];
+ }
+ }
+
+ return map;
+}
+#endif
+
+#if GFX_VER == 6
+static void
+crocus_emit_so_svbi(struct crocus_context *ice)
+{
+ struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+ unsigned max_vertex = 0xffffffff;
+ for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+ struct crocus_stream_output_target *tgt =
+ (void *) ice->state.so_target[i];
+ if (tgt)
+ max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
+ }
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
+ svbi.IndexNumber = 0;
+ svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
+ svbi.MaximumIndex = max_vertex;
+ }
+
+   /* Initialize the rest of the SVBIs to reasonable values so that we don't
+    * run out of room writing the regular data.
+    */
+ for (int i = 1; i < 4; i++) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
+ svbi.IndexNumber = i;
+ svbi.StreamedVertexBufferIndex = 0;
+ svbi.MaximumIndex = 0xffffffff;
+ }
+ }
+}
+
+#endif
+
+#if GFX_VER >= 6
+static bool
+crocus_is_drawing_points(const struct crocus_context *ice)
+{
+ const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+
+ if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
+ cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
+ return true;
+
+ if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
+ const struct brw_gs_prog_data *gs_prog_data =
+ (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
+ return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
+ } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
+ const struct brw_tes_prog_data *tes_data =
+ (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
+ return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+ } else {
+ return ice->state.prim_mode == PIPE_PRIM_POINTS;
+ }
+}
+#endif
+
+#if GFX_VER >= 6
+static void
+get_attr_override(
+ struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
+ const struct brw_vue_map *vue_map,
+ int urb_entry_read_offset, int fs_attr,
+ bool two_side_color, uint32_t *max_source_attr)
+{
+ /* Find the VUE slot for this attribute. */
+ int slot = vue_map->varying_to_slot[fs_attr];
+
+ /* Viewport and Layer are stored in the VUE header. We need to override
+ * them to zero if earlier stages didn't write them, as GL requires that
+ * they read back as zero when not explicitly set.
+ */
+ if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
+ attr->ComponentOverrideX = true;
+ attr->ComponentOverrideW = true;
+ attr->ConstantSource = CONST_0000;
+
+ if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
+ attr->ComponentOverrideY = true;
+ if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
+ attr->ComponentOverrideZ = true;
+
+ return;
+ }
+
+   /* If only a back color was written but not a front color, use the
+    * back color instead of undefined.
+    */
+ if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
+ slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
+ if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
+ slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
+
+ if (slot == -1) {
+ /* This attribute does not exist in the VUE--that means that the vertex
+ * shader did not write to it. This means that either:
+ *
+ * (a) This attribute is a texture coordinate, and it is going to be
+ * replaced with point coordinates (as a consequence of a call to
+ * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
+ * hardware will ignore whatever attribute override we supply.
+ *
+ * (b) This attribute is read by the fragment shader but not written by
+ * the vertex shader, so its value is undefined. Therefore the
+ * attribute override we supply doesn't matter.
+ *
+ * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
+ * previous shader stage.
+ *
+ * Note that we don't have to worry about the cases where the attribute
+ * is gl_PointCoord or is undergoing point sprite coordinate
+ * replacement, because in those cases, this function isn't called.
+ *
+ * In case (c), we need to program the attribute overrides so that the
+ * primitive ID will be stored in this slot. In every other case, the
+ * attribute override we supply doesn't matter. So just go ahead and
+ * program primitive ID in every case.
+ */
+ attr->ComponentOverrideW = true;
+ attr->ComponentOverrideX = true;
+ attr->ComponentOverrideY = true;
+ attr->ComponentOverrideZ = true;
+ attr->ConstantSource = PRIM_ID;
+ return;
+ }
+
+ /* Compute the location of the attribute relative to urb_entry_read_offset.
+ * Each increment of urb_entry_read_offset represents a 256-bit value, so
+ * it counts for two 128-bit VUE slots.
+ */
+ int source_attr = slot - 2 * urb_entry_read_offset;
+ assert(source_attr >= 0 && source_attr < 32);
+
+ /* If we are doing two-sided color, and the VUE slot following this one
+ * represents a back-facing color, then we need to instruct the SF unit to
+ * do back-facing swizzling.
+ */
+ bool swizzling = two_side_color &&
+ ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
+ vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
+ (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
+ vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
+
+ /* Update max_source_attr. If swizzling, the SF will read this slot + 1. */
+ if (*max_source_attr < source_attr + swizzling)
+ *max_source_attr = source_attr + swizzling;
+
+ attr->SourceAttribute = source_attr;
+ if (swizzling)
+ attr->SwizzleSelect = INPUTATTR_FACING;
+}
+
+static void
+calculate_attr_overrides(
+ const struct crocus_context *ice,
+ struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
+ uint32_t *point_sprite_enables,
+ uint32_t *urb_entry_read_length,
+ uint32_t *urb_entry_read_offset)
+{
+ const struct brw_wm_prog_data *wm_prog_data = (void *)
+ ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
+ const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
+ const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+ uint32_t max_source_attr = 0;
+ const struct shader_info *fs_info =
+ crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
+
+ int first_slot =
+ brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);
+
+ /* Each URB offset packs two varying slots */
+ assert(first_slot % 2 == 0);
+ *urb_entry_read_offset = first_slot / 2;
+ *point_sprite_enables = 0;
+
+ for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
+ const int input_index = wm_prog_data->urb_setup[fs_attr];
+
+ if (input_index < 0)
+ continue;
+
+ bool point_sprite = false;
+ if (crocus_is_drawing_points(ice)) {
+ if (fs_attr >= VARYING_SLOT_TEX0 &&
+ fs_attr <= VARYING_SLOT_TEX7 &&
+ cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
+ point_sprite = true;
+
+ if (fs_attr == VARYING_SLOT_PNTC)
+ point_sprite = true;
+
+ if (point_sprite)
+ *point_sprite_enables |= 1U << input_index;
+ }
+
+ struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
+ if (!point_sprite) {
+ get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
+ cso_rast->cso.light_twoside, &max_source_attr);
+ }
+
+      /* The hardware can only apply overrides to the first 16 attributes;
+       * the rest (up to 16 more) have to be lined up so that the input
+       * index equals the output index.  We'll need to do some tweaking to
+       * make sure that's the case.
+       */
+ if (input_index < 16)
+ attr_overrides[input_index] = attribute;
+ else
+ assert(attribute.SourceAttribute == input_index);
+ }
+
+ /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
+ * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
+ *
+ * "This field should be set to the minimum length required to read the
+ * maximum source attribute. The maximum source attribute is indicated
+ * by the maximum value of the enabled Attribute # Source Attribute if
+ * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
+ * enable is not set.
+ * read_length = ceiling((max_source_attr + 1) / 2)
+ *
+ * [errata] Corruption/Hang possible if length programmed larger than
+ * recommended"
+ *
+ * Similar text exists for Ivy Bridge.
+ */
+ *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
+}
+#endif
+
+#if GFX_VER == 7
+static void
+crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
+{
+ const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+ const struct brw_wm_prog_data *wm_prog_data = (void *)
+ ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
+
+ uint32_t urb_entry_read_length;
+ uint32_t urb_entry_read_offset;
+ uint32_t point_sprite_enables;
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
+ sbe.AttributeSwizzleEnable = true;
+ sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+ sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;
+
+ calculate_attr_overrides(ice,
+ sbe.Attribute,
+ &point_sprite_enables,
+ &urb_entry_read_length,
+ &urb_entry_read_offset);
+ sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
+ sbe.VertexURBEntryReadLength = urb_entry_read_length;
+ sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
+ sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
+ }
+}
+#endif
+
+/* ------------------------------------------------------------------- */
+
+/**
+ * Populate VS program key fields based on the current state.
+ */
+static void
+crocus_populate_vs_key(const struct crocus_context *ice,
+ const struct shader_info *info,
+ gl_shader_stage last_stage,
+ struct brw_vs_prog_key *key)
+{
+ const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+
+ if (info->clip_distance_array_size == 0 &&
+ (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
+ last_stage == MESA_SHADER_VERTEX)
+ key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
+
+#if GFX_VER <= 5
+ key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
+ cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
+ key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
+#endif
+
+ key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;
+
+#if !(GFX_VERx10 == 75)
+ uint64_t inputs_read = info->inputs_read;
+ int ve_idx = 0;
+ while (inputs_read) {
+ int i = u_bit_scan64(&inputs_read);
+ key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
+ ve_idx++;
+ }
+#endif
+}
+
+/**
+ * Populate TCS program key fields based on the current state.
+ */
+static void
+crocus_populate_tcs_key(const struct crocus_context *ice,
+ struct brw_tcs_prog_key *key)
+{
+}
+
+/**
+ * Populate TES program key fields based on the current state.
+ */
+static void
+crocus_populate_tes_key(const struct crocus_context *ice,
+ const struct shader_info *info,
+ gl_shader_stage last_stage,
+ struct brw_tes_prog_key *key)
+{
+ const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+
+ if (info->clip_distance_array_size == 0 &&
+ (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
+ last_stage == MESA_SHADER_TESS_EVAL)
+ key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
+}
+
+/**
+ * Populate GS program key fields based on the current state.
+ */
+static void
+crocus_populate_gs_key(const struct crocus_context *ice,
+ const struct shader_info *info,
+ gl_shader_stage last_stage,
+ struct brw_gs_prog_key *key)
+{
+ const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+
+ if (info->clip_distance_array_size == 0 &&
+ (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
+ last_stage == MESA_SHADER_GEOMETRY)
+ key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
+}
+
+/**
+ * Populate FS program key fields based on the current state.
+ */
+static void
+crocus_populate_fs_key(const struct crocus_context *ice,
+ const struct shader_info *info,
+ struct brw_wm_prog_key *key)
+{
+ struct crocus_screen *screen = (void *) ice->ctx.screen;
+ const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
+ const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
+ const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
+ const struct crocus_blend_state *blend = ice->state.cso_blend;
+
+#if GFX_VER < 6
+ uint32_t lookup = 0;
+
+ if (info->fs.uses_discard || zsa->cso.alpha_enabled)
+ lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;
+
+ if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+ lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;
+
+ if (fb->zsbuf && zsa->cso.depth_enabled) {
+ lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;
+
+ if (zsa->cso.depth_writemask)
+ lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
+
+ }
+ if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
+ lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT;
+ if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
+ lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
+ }
+ key->iz_lookup = lookup;
+ key->stats_wm = ice->state.stats_wm;
+#endif
+
+ uint32_t line_aa = BRW_WM_AA_NEVER;
+ if (rast->cso.line_smooth) {
+ int reduced_prim = u_reduced_prim(ice->state.prim_mode);
+ if (reduced_prim == PIPE_PRIM_LINES)
+ line_aa = BRW_WM_AA_ALWAYS;
+ else if (reduced_prim == PIPE_PRIM_TRIANGLES) {
+ if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
+ line_aa = BRW_WM_AA_SOMETIMES;
+
+ if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
+ rast->cso.cull_face == PIPE_FACE_BACK)
+ line_aa = BRW_WM_AA_ALWAYS;
+ } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
+ line_aa = BRW_WM_AA_SOMETIMES;
+
+ if (rast->cso.cull_face == PIPE_FACE_FRONT)
+ line_aa = BRW_WM_AA_ALWAYS;
+ }
+ }
+ }
+ key->line_aa = line_aa;
+
+ key->nr_color_regions = fb->nr_cbufs;
+
+ key->clamp_fragment_color = rast->cso.clamp_fragment_color;
+
+ key->alpha_to_coverage = blend->cso.alpha_to_coverage;
+
+ key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;
+
+ key->flat_shade = rast->cso.flatshade &&
+ (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
+
+ key->persample_interp = rast->cso.force_persample_interp;
+ key->multisample_fbo = rast->cso.multisample && fb->samples > 1;
+
+ key->ignore_sample_mask_out = !key->multisample_fbo;
+ key->coherent_fb_fetch = false; // TODO: needed?
+
+ key->force_dual_color_blend =
+ screen->driconf.dual_color_blend_by_location &&
+ (blend->blend_enables & 1) && blend->dual_color_blending;
+
+ /* TODO: Respect glHint for key->high_quality_derivatives */
+
+#if GFX_VER <= 5
+ if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
+ key->alpha_test_func = zsa->cso.alpha_func;
+ key->alpha_test_ref = zsa->cso.alpha_ref_value;
+ }
+#endif
+}
+
+static void
+crocus_populate_cs_key(const struct crocus_context *ice,
+ struct brw_cs_prog_key *key)
+{
+}
+
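+/* Kernel Start Pointer: on gen4 this is a relocation into the program cache
+ * BO; on gen5+ it is simply an offset from the Instruction Base Address
+ * programmed in STATE_BASE_ADDRESS.
+ */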
+#if GFX_VER == 4
+#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset)
+#elif GFX_VER >= 5
+static uint64_t
+KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
+{
+ return shader->offset;
+}
+#endif
+
+/* Gen11 workaround table #2056 WABTPPrefetchDisable suggests disabling
+ * prefetching of binding tables in A0 and B0 steppings. XXX: Revisit
+ * this WA on C0 stepping.
+ *
+ * TODO: Fill out SamplerCount for prefetching?
+ */
+
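+/* Fields shared by the per-stage shader packets: kernel pointer, binding
+ * table size, URB read length/offset, and per-thread scratch space.
+ */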
+#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \
+ pkt.KernelStartPointer = KSP(ice, shader); \
+ pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \
+ pkt.FloatingPointMode = prog_data->use_alt_mode; \
+ \
+ pkt.DispatchGRFStartRegisterForURBData = \
+ prog_data->dispatch_grf_start_reg; \
+ pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \
+ pkt.prefix##URBEntryReadOffset = 0; \
+ \
+ pkt.StatisticsEnable = true; \
+ pkt.Enable = true; \
+ \
+ if (prog_data->total_scratch) { \
+ struct crocus_bo *bo = \
+ crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
+ pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \
+ pkt.ScratchSpaceBasePointer = rw_bo(bo, 0); \
+ }
+
+/* ------------------------------------------------------------------- */
+#if GFX_VER >= 6
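+/* Sub-opcodes for the per-stage 3DSTATE_CONSTANT_* push constant packets;
+ * compute is handled separately, hence the 0.
+ */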
+static const uint32_t push_constant_opcodes[] = {
+ [MESA_SHADER_VERTEX] = 21,
+ [MESA_SHADER_TESS_CTRL] = 25, /* HS */
+ [MESA_SHADER_TESS_EVAL] = 26, /* DS */
+ [MESA_SHADER_GEOMETRY] = 22,
+ [MESA_SHADER_FRAGMENT] = 23,
+ [MESA_SHADER_COMPUTE] = 0,
+};
+#endif
+
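+/* Emit a "null" SURFACE_STATE of the given dimensions for unbound slots. */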
+static void
+emit_sized_null_surface(struct crocus_batch *batch,
+ unsigned width, unsigned height,
+ unsigned layers, unsigned levels,
+ unsigned minimum_array_element,
+ uint32_t *out_offset)
+{
+ struct isl_device *isl_dev = &batch->screen->isl_dev;
+ uint32_t *surf = stream_state(batch, isl_dev->ss.size,
+ isl_dev->ss.align,
+ out_offset);
+ //TODO gen 6 multisample crash
+ isl_null_fill_state(isl_dev, surf,
+ .size = isl_extent3d(width, height, layers),
+ .levels = levels,
+ .minimum_array_element = minimum_array_element);
+}
+
+static void
+emit_null_surface(struct crocus_batch *batch,
+ uint32_t *out_offset)
+{
+ emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
+}
+
+static void
+emit_null_fb_surface(struct crocus_batch *batch,
+ struct crocus_context *ice,
+ uint32_t *out_offset)
+{
+ uint32_t width, height, layers, level, layer;
+ /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
+ if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
+ emit_null_surface(batch, out_offset);
+ return;
+ }
+
+ struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
+ width = MAX2(cso->width, 1);
+ height = MAX2(cso->height, 1);
+ layers = cso->layers ? cso->layers : 1;
+ level = 0;
+ layer = 0;
+
+ if (cso->nr_cbufs == 0 && cso->zsbuf) {
+ width = cso->zsbuf->width;
+ height = cso->zsbuf->height;
+ level = cso->zsbuf->u.tex.level;
+ layer = cso->zsbuf->u.tex.first_layer;
+ }
+ emit_sized_null_surface(batch, width, height,
+ layers, level, layer,
+ out_offset);
+}
+
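+/* Fill out a SURFACE_STATE for a resource.  On older gens some targets
+ * (3D slices, gen4 cube faces, 1D arrays) are first rebased to a
+ * single-image surface; relocations are emitted for both the main and any
+ * auxiliary surface address.
+ */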
+static void
+emit_surface_state(struct crocus_batch *batch,
+ struct crocus_resource *res,
+ const struct isl_surf *in_surf,
+ bool adjust_surf,
+ struct isl_view *view,
+ bool writeable,
+ enum isl_aux_usage aux_usage,
+ bool blend_enable,
+ uint32_t write_disables,
+ uint32_t *surf_state,
+ uint32_t addr_offset)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ struct isl_device *isl_dev = &batch->screen->isl_dev;
+ uint32_t reloc = RELOC_32BIT;
+ uint32_t offset = res->offset, tile_x_sa = 0, tile_y_sa = 0;
+
+ if (writeable)
+ reloc |= RELOC_WRITE;
+
+ struct isl_surf surf = *in_surf;
+ if (adjust_surf) {
+ if (res->base.target == PIPE_TEXTURE_3D && view->array_len == 1) {
+ isl_surf_get_image_surf(isl_dev, in_surf,
+ view->base_level, 0,
+ view->base_array_layer,
+ &surf, &offset,
+ &tile_x_sa, &tile_y_sa);
+ view->base_array_layer = 0;
+ view->base_level = 0;
+ } else if (res->base.target == PIPE_TEXTURE_CUBE && devinfo->ver == 4) {
+ isl_surf_get_image_surf(isl_dev, in_surf,
+ view->base_level, view->base_array_layer,
+ 0,
+ &surf, &offset,
+ &tile_x_sa, &tile_y_sa);
+ view->base_array_layer = 0;
+ view->base_level = 0;
+ } else if (res->base.target == PIPE_TEXTURE_1D_ARRAY)
+ surf.dim = ISL_SURF_DIM_2D;
+ }
+
+ union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
+ struct crocus_bo *aux_bo = NULL;
+ uint32_t aux_offset = 0;
+ struct isl_surf *aux_surf = NULL;
+ if (aux_usage != ISL_AUX_USAGE_NONE) {
+ aux_surf = &res->aux.surf;
+ aux_offset = res->aux.offset;
+ aux_bo = res->aux.bo;
+
+ clear_color = crocus_resource_get_clear_color(res);
+ }
+
+ isl_surf_fill_state(isl_dev, surf_state,
+ .surf = &surf,
+ .view = view,
+ .address = crocus_state_reloc(batch,
+ addr_offset + isl_dev->ss.addr_offset,
+ res->bo, offset, reloc),
+ .aux_surf = aux_surf,
+ .aux_usage = aux_usage,
+ .aux_address = aux_offset,
+ .mocs = crocus_mocs(res->bo, isl_dev),
+ .clear_color = clear_color,
+ .use_clear_address = false,
+ .clear_address = 0,
+ .x_offset_sa = tile_x_sa,
+ .y_offset_sa = tile_y_sa,
+#if GFX_VER <= 5
+ .blend_enable = blend_enable,
+ .write_disables = write_disables,
+#endif
+ );
+
+ if (aux_surf) {
+ /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
+ * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
+ * contain other control information. Since buffer addresses are always
+ * on 4k boundaries (and thus have their lower 12 bits zero), we can use
+ * an ordinary reloc to do the necessary address translation.
+ *
+ * FIXME: move to the point of assignment.
+ */
+ uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
+ *aux_addr = crocus_state_reloc(batch,
+ addr_offset + isl_dev->ss.aux_addr_offset,
+ aux_bo, *aux_addr,
+ reloc);
+ }
+}
+
+static uint32_t
+emit_surface(struct crocus_batch *batch,
+ struct crocus_surface *surf,
+ enum isl_aux_usage aux_usage,
+ bool blend_enable,
+ uint32_t write_disables)
+{
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ struct isl_device *isl_dev = &batch->screen->isl_dev;
+ struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
+ struct isl_view *view = &surf->view;
+ uint32_t offset = 0;
+ enum pipe_texture_target target = res->base.target;
+ bool adjust_surf = false;
+
+ if (devinfo->ver == 4 && target == PIPE_TEXTURE_CUBE)
+ adjust_surf = true;
+
+ if (surf->align_res)
+ res = (struct crocus_resource *)surf->align_res;
+
+ uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
+
+ emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
+ aux_usage, blend_enable,
+ write_disables,
+ surf_state, offset);
+ return offset;
+}
+
+static uint32_t
+emit_rt_surface(struct crocus_batch *batch,
+ struct crocus_surface *surf,
+ enum isl_aux_usage aux_usage)
+{
+ struct isl_device *isl_dev = &batch->screen->isl_dev;
+ struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
+ struct isl_view *view = &surf->read_view;
+ uint32_t offset = 0;
+ uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
+
+ emit_surface_state(batch, res, &surf->surf, true, view, false,
+ aux_usage, 0, false,
+ surf_state, offset);
+ return offset;
+}
+
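+/* Emit a raw buffer SURFACE_STATE covering the 12-byte compute grid size
+ * (the three work-group count dwords).
+ */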
+static uint32_t
+emit_grid(struct crocus_context *ice,
+ struct crocus_batch *batch)
+{
+ UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
+ uint32_t offset = 0;
+ struct crocus_state_ref *grid_ref = &ice->state.grid_size;
+ uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
+ isl_dev->ss.align, &offset);
+ isl_buffer_fill_state(isl_dev, surf_state,
+ .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
+ crocus_resource_bo(grid_ref->res),
+ grid_ref->offset,
+ RELOC_32BIT),
+ .size_B = 12,
+ .format = ISL_FORMAT_RAW,
+ .stride_B = 1,
+ .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
+ return offset;
+}
+
+static uint32_t
+emit_ubo_buffer(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct pipe_constant_buffer *buffer)
+{
+ UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
+ uint32_t offset = 0;
+
+ uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
+ isl_dev->ss.align, &offset);
+ isl_buffer_fill_state(isl_dev, surf_state,
+ .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
+ crocus_resource_bo(buffer->buffer),
+ buffer->buffer_offset,
+ RELOC_32BIT),
+ .size_B = buffer->buffer_size,
+ .format = 0,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .stride_B = 1,
+ .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
+
+ return offset;
+}
+
+static uint32_t
+emit_ssbo_buffer(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct pipe_shader_buffer *buffer, bool writeable)
+{
+ UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
+ uint32_t offset = 0;
+ uint32_t reloc = RELOC_32BIT;
+
+ if (writeable)
+ reloc |= RELOC_WRITE;
+ uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
+ isl_dev->ss.align, &offset);
+ isl_buffer_fill_state(isl_dev, surf_state,
+ .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
+ crocus_resource_bo(buffer->buffer),
+ buffer->buffer_offset,
+ reloc),
+ .size_B = buffer->buffer_size,
+ .format = ISL_FORMAT_RAW,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .stride_B = 1,
+ .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
+
+ return offset;
+}
+
+static uint32_t
+emit_sampler_view(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ bool for_gather,
+ struct crocus_sampler_view *isv)
+{
+ UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
+ uint32_t offset = 0;
+
+ uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
+ isl_dev->ss.align, &offset);
+
+ if (isv->base.target == PIPE_BUFFER) {
+ const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
+ const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
+ unsigned final_size =
+ MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
+ CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
+ isl_buffer_fill_state(isl_dev, surf_state,
+ .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
+ isv->res->bo,
+ isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
+ .size_B = final_size,
+ .format = isv->view.format,
+ .swizzle = isv->view.swizzle,
+ .stride_B = cpp,
+ .mocs = crocus_mocs(isv->res->bo, isl_dev)
+ );
+ } else {
+ enum isl_aux_usage aux_usage =
+ crocus_resource_texture_aux_usage(isv->res);
+
+ emit_surface_state(batch, isv->res, &isv->res->surf, false,
+ for_gather ? &isv->gather_view : &isv->view,
+ false, aux_usage, false,
+ 0, surf_state, offset);
+ }
+ return offset;
+}
+
+static uint32_t
+emit_image_view(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ struct crocus_image_view *iv)
+{
+ UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
+ uint32_t offset = 0;
+
+ struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
+ uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
+ isl_dev->ss.align, &offset);
+ bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
+ uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
+ if (res->base.target == PIPE_BUFFER) {
+ const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
+ const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
+ unsigned final_size =
+ MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
+ CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
+ isl_buffer_fill_state(isl_dev, surf_state,
+ .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
+ res->bo,
+ res->offset + iv->base.u.buf.offset, reloc),
+ .size_B = final_size,
+ .format = iv->view.format,
+ .swizzle = iv->view.swizzle,
+ .stride_B = cpp,
+ .mocs = crocus_mocs(res->bo, isl_dev)
+ );
+ } else {
+ if (iv->view.format == ISL_FORMAT_RAW) {
+ isl_buffer_fill_state(isl_dev, surf_state,
+ .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
+ res->bo,
+ res->offset, reloc),
+ .size_B = res->bo->size - res->offset,
+ .format = iv->view.format,
+ .swizzle = iv->view.swizzle,
+ .stride_B = 1,
+ .mocs = crocus_mocs(res->bo, isl_dev),
+ );
+ } else {
+ emit_surface_state(batch, res,
+ &res->surf, false, &iv->view,
+ write, 0, false,
+ 0, surf_state, offset);
+ }
+ }
+
+ return offset;
+}
+
+#if GFX_VER == 6
+static uint32_t
+emit_sol_surface(struct crocus_batch *batch,
+ struct pipe_stream_output_info *so_info,
+ uint32_t idx)
+{
+ struct crocus_context *ice = batch->ice;
+
+ if (idx >= so_info->num_outputs || !ice->state.streamout_active)
+ return 0;
+ const struct pipe_stream_output *output = &so_info->output[idx];
+ const int buffer = output->output_buffer;
+ assert(output->stream == 0);
+
+ struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
+ unsigned stride_dwords = so_info->stride[buffer];
+ unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;
+
+ size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
+ unsigned num_vector_components = output->num_components;
+ unsigned num_elements;
+ /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
+ * too big to map using a single binding table entry?
+ */
+ // assert((size_dwords - offset_dwords) / stride_dwords
+ // <= BRW_MAX_NUM_BUFFER_ENTRIES);
+
+ if (size_dwords > offset_dwords + num_vector_components) {
+ /* There is room for at least 1 transform feedback output in the buffer.
+ * Compute the number of additional transform feedback outputs the
+ * buffer has room for.
+ */
+ num_elements =
+ (size_dwords - offset_dwords - num_vector_components);
+ } else {
+ /* There isn't even room for a single transform feedback output in the
+ * buffer. We can't configure the binding table entry to prevent output
+ * entirely; we'll have to rely on the geometry shader to detect
+ * overflow. But to minimize the damage in case of a bug, set up the
+ * binding table entry to just allow a single output.
+ */
+ num_elements = 0;
+ }
+ num_elements += stride_dwords;
+
+ uint32_t surface_format;
+ switch (num_vector_components) {
+ case 1:
+ surface_format = ISL_FORMAT_R32_FLOAT;
+ break;
+ case 2:
+ surface_format = ISL_FORMAT_R32G32_FLOAT;
+ break;
+ case 3:
+ surface_format = ISL_FORMAT_R32G32B32_FLOAT;
+ break;
+ case 4:
+ surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
+ break;
+ default:
+ unreachable("Invalid vector size for transform feedback output");
+ }
+
+ UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
+ uint32_t offset = 0;
+
+ uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
+ isl_dev->ss.align, &offset);
+ isl_buffer_fill_state(isl_dev, surf_state,
+ .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
+ crocus_resource_bo(&buf->base),
+ offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
+ .size_B = num_elements * 4,
+ .stride_B = stride_dwords * 4,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .format = surface_format);
+ return offset;
+}
+#endif
+
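+/* Iterate over the slots of a binding table group that are actually used. */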
+#define foreach_surface_used(index, group) \
+ for (int index = 0; index < bt->sizes[group]; index++) \
+ if (crocus_group_index_to_bti(bt, group, index) != \
+ CROCUS_SURFACE_NOT_USED)
+
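+/* Walk the shader's binding table layout, emitting a SURFACE_STATE for every
+ * used slot (render targets, SOL buffers, textures, images, UBOs, SSBOs) and
+ * recording each one's offset in shader->surf_offset.
+ */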
+static void
+crocus_populate_binding_table(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ gl_shader_stage stage, bool ff_gs)
+{
+ struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
+ struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
+ if (!shader)
+ return;
+
+ struct crocus_binding_table *bt = &shader->bt;
+ int s = 0;
+ uint32_t *surf_offsets = shader->surf_offset;
+
+ const struct shader_info *info = crocus_get_shader_info(ice, stage);
+
+ if (stage == MESA_SHADER_FRAGMENT) {
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
+ if (cso_fb->nr_cbufs) {
+ for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+ uint32_t write_disables = 0;
+ bool blend_enable = false;
+#if GFX_VER <= 5
+ const struct pipe_rt_blend_state *rt =
+ &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
+ write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
+ write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
+ write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
+ write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
+ blend_enable = rt->blend_enable;
+#endif
+ if (cso_fb->cbufs[i]) {
+ surf_offsets[s] = emit_surface(batch,
+ (struct crocus_surface *)cso_fb->cbufs[i],
+ ice->state.draw_aux_usage[i],
+ blend_enable,
+ write_disables);
+ } else {
+ emit_null_fb_surface(batch, ice, &surf_offsets[s]);
+ }
+ s++;
+ }
+ } else {
+ emit_null_fb_surface(batch, ice, &surf_offsets[s]);
+ s++;
+ }
+
+ foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ if (cso_fb->cbufs[i]) {
+ surf_offsets[s++] = emit_rt_surface(batch,
+ (struct crocus_surface *)cso_fb->cbufs[i],
+ ice->state.draw_aux_usage[i]);
+ }
+ }
+ }
+
+ if (stage == MESA_SHADER_COMPUTE) {
+ foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
+ surf_offsets[s] = emit_grid(ice, batch);
+ s++;
+ }
+ }
+
+#if GFX_VER == 6
+ if (stage == MESA_SHADER_GEOMETRY) {
+ struct pipe_stream_output_info *so_info;
+ if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
+ so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
+ else
+ so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;
+
+ foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
+ surf_offsets[s] = emit_sol_surface(batch, so_info, i);
+ s++;
+ }
+ }
+#endif
+
+ foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
+ struct crocus_sampler_view *view = shs->textures[i];
+ if (view)
+ surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
+ else
+ emit_null_surface(batch, &surf_offsets[s]);
+ s++;
+ }
+
+ if (info && info->uses_texture_gather) {
+ foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
+ struct crocus_sampler_view *view = shs->textures[i];
+ if (view)
+ surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
+ else
+ emit_null_surface(batch, &surf_offsets[s]);
+ s++;
+ }
+ }
+
+ foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
+ struct crocus_image_view *view = &shs->image[i];
+ if (view->base.resource)
+ surf_offsets[s] = emit_image_view(ice, batch, view);
+ else
+ emit_null_surface(batch, &surf_offsets[s]);
+ s++;
+ }
+ foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
+ if (shs->constbufs[i].buffer)
+ surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
+ else
+ emit_null_surface(batch, &surf_offsets[s]);
+ s++;
+ }
+ foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
+ if (shs->ssbo[i].buffer)
+ surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
+ !!(shs->writable_ssbos & (1 << i)));
+ else
+ emit_null_surface(batch, &surf_offsets[s]);
+ s++;
+ }
+}
+
+/* ------------------------------------------------------------------- */
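+/**
+ * Copy an assembled binding table into the batch's surface state buffer.
+ *
+ * The 32-byte alignment matches the granularity of the binding table
+ * pointer fields in the *_BINDING_TABLE_POINTERS commands.
+ */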
+static uint32_t
+crocus_upload_binding_table(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ uint32_t *table,
+ uint32_t size)
+{
+ if (size == 0)
+ return 0;
+ return emit_state(batch, table, size, 32);
+}
+
+/**
+ * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
+ */
+static void
+crocus_update_surface_base_address(struct crocus_batch *batch)
+{
+ if (batch->state_base_address_emitted)
+ return;
+#if GFX_VER >= 6
+ uint32_t mocs = batch->screen->isl_dev.mocs.internal;
+#endif
+ flush_before_state_base_change(batch);
+
+ crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
+
+ sba.SurfaceStateBaseAddressModifyEnable = true;
+ sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);
+
+#if GFX_VER >= 5
+ sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
+#endif
+
+ sba.GeneralStateBaseAddressModifyEnable = true;
+ sba.IndirectObjectBaseAddressModifyEnable = true;
+#if GFX_VER >= 5
+ sba.InstructionBaseAddressModifyEnable = true;
+#endif
+
+ sba.GeneralStateAccessUpperBoundModifyEnable = true;
+#if GFX_VER >= 5
+ sba.IndirectObjectAccessUpperBoundModifyEnable = true;
+ sba.InstructionAccessUpperBoundModifyEnable = true;
+#endif
+#if GFX_VER <= 5
+ sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
+#endif
+#if GFX_VER >= 6
+ /* The hardware appears to pay attention to the MOCS fields even
+ * if you don't set the "Address Modify Enable" bit for the base.
+ */
+ sba.GeneralStateMOCS = mocs;
+ sba.StatelessDataPortAccessMOCS = mocs;
+
+ sba.DynamicStateBaseAddressModifyEnable = true;
+
+ sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
+
+ /* Dynamic state upper bound. Although the documentation says that
+ * programming it to zero will cause it to be ignored, that is a lie.
+ * If this isn't programmed to a real bound, the sampler border color
+ * pointer is rejected, causing border color to mysteriously fail.
+ */
+ sba.DynamicStateAccessUpperBoundModifyEnable = true;
+ sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
+#endif
+ }
+
+ flush_after_state_base_change(batch);
+
+ /* According to section 3.6.1 of VOL1 of the 965 PRM,
+ * STATE_BASE_ADDRESS updates require a reissue of:
+ *
+ * 3DSTATE_PIPELINE_POINTERS
+ * 3DSTATE_BINDING_TABLE_POINTERS
+ * MEDIA_STATE_POINTERS
+ *
+ * and this continues through Ironlake. The Sandy Bridge PRM, vol
+ * 1 part 1 says that the following packets must be reissued:
+ *
+ * 3DSTATE_CC_POINTERS
+ * 3DSTATE_BINDING_TABLE_POINTERS
+ * 3DSTATE_SAMPLER_STATE_POINTERS
+ * 3DSTATE_VIEWPORT_STATE_POINTERS
+ * MEDIA_STATE_POINTERS
+ *
+ * Those are always reissued following SBA updates anyway (new
+ * batch time), except in the case of the program cache BO
+ * changing. Having a separate state flag makes the sequence more
+ * obvious.
+ */
+#if GFX_VER <= 5
+ batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
+#elif GFX_VER == 6
+ batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
+#endif
+ batch->state_base_address_emitted = true;
+}
+
+static inline void
+crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
+ bool window_space_position, float *zmin, float *zmax)
+{
+ if (window_space_position) {
+ *zmin = 0.f;
+ *zmax = 1.f;
+ return;
+ }
+ util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
+}
+
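+/* Push constant ranges gathered for one shader stage: up to four UBO
+ * ranges, each with the address to read from and its read length, as
+ * consumed by the 3DSTATE_CONSTANT_* packets below.
+ */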
+struct push_bos {
+ struct {
+ struct crocus_address addr;
+ uint32_t length;
+ } buffers[4];
+ int buffer_count;
+ uint32_t max_length;
+};
+
+#if GFX_VER >= 6
+static void
+setup_constant_buffers(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ int stage,
+ struct push_bos *push_bos)
+{
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+ struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
+
+ uint32_t push_range_sum = 0;
+
+ int n = 0;
+ for (int i = 0; i < 4; i++) {
+ const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+
+ if (range->length == 0)
+ continue;
+
+ push_range_sum += range->length;
+
+ if (range->length > push_bos->max_length)
+ push_bos->max_length = range->length;
+
+ /* Range block is a binding table index; map it back to a UBO index. */
+ unsigned block_index = crocus_bti_to_group_index(
+ &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
+ assert(block_index != CROCUS_SURFACE_NOT_USED);
+
+ struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
+ struct crocus_resource *res = (void *) cbuf->buffer;
+
+ assert(cbuf->buffer_offset % 32 == 0);
+
+ push_bos->buffers[n].length = range->length;
+ push_bos->buffers[n].addr =
+ res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
+ : ro_bo(batch->ice->workaround_bo,
+ batch->ice->workaround_offset);
+ n++;
+ }
+
+ /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
+ *
+ * "The sum of all four read length fields must be less than or
+ * equal to the size of 64."
+ */
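+ /* The read lengths are in 256-bit units, so this caps the total push
+ * constant space at 64 registers (2 KB) across the four ranges.
+ */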
+ assert(push_range_sum <= 64);
+
+ push_bos->buffer_count = n;
+}
+
+#if GFX_VER == 7
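+/* Ivy Bridge needs a depth-stalling PIPE_CONTROL with a post-sync write
+ * to the workaround BO before certain VS-related state is reprogrammed
+ * (e.g. 3DSTATE_CONSTANT_VS and the URB layout). Haswell and Baytrail
+ * don't need this, hence the checks at the callers.
+ */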
+static void
+gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
+{
+ ASSERTED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+ assert(devinfo->ver == 7);
+ crocus_emit_pipe_control_write(batch,
+ "vs workaround",
+ PIPE_CONTROL_WRITE_IMMEDIATE
+ | PIPE_CONTROL_DEPTH_STALL,
+ batch->ice->workaround_bo,
+ batch->ice->workaround_offset, 0);
+}
+#endif
+
+static void
+emit_push_constant_packets(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ int stage,
+ const struct push_bos *push_bos)
+{
+ struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+ struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
+
+#if GFX_VER == 7
+ if (stage == MESA_SHADER_VERTEX) {
+ if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail)
+ gen7_emit_vs_workaround_flush(batch);
+ }
+#endif
+ crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
+ pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
+#if GFX_VER == 7
+ if (prog_data) {
+ /* The Skylake PRM contains the following restriction:
+ *
+ * "The driver must ensure The following case does not occur
+ * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+ * buffer 3 read length equal to zero committed followed by a
+ * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+ * zero committed."
+ *
+ * To avoid this, we program the buffers in the highest slots.
+ * This way, slot 0 is only used if slot 3 is also used.
+ */
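+ /* For example, with two push ranges on Haswell the shift below is
+ * 4 - 2 = 2, so the buffers land in slots 2 and 3 while slots 0 and 1
+ * stay unused, satisfying the restriction above.
+ */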
+ int n = push_bos->buffer_count;
+ assert(n <= 4);
+#if GFX_VERx10 >= 75
+ const unsigned shift = 4 - n;
+#else
+ const unsigned shift = 0;
+#endif
+ for (int i = 0; i < n; i++) {
+ pkt.ConstantBody.ReadLength[i + shift] =
+ push_bos->buffers[i].length;
+ pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
+ }
+ }
+#else
+ if (prog_data) {
+ int n = push_bos->buffer_count;
+ assert(n <= 1);
+ if (n == 1) {
+ pkt.Buffer0Valid = true;
+ pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
+ pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
+ }
+ }
+#endif
+ }
+}
+
+#endif
+
+#if GFX_VER >= 6
+typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML;
+#else
+typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML;
+#endif
+
+static inline void
+set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
+{
+ struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
+ ds->DepthTestEnable = cso->cso.depth_enabled;
+ ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
+ ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);
+
+ ds->StencilFailOp = cso->cso.stencil[0].fail_op;
+ ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
+ ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
+ ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);
+
+ ds->StencilTestMask = cso->cso.stencil[0].valuemask;
+ ds->StencilWriteMask = cso->cso.stencil[0].writemask;
+
+ ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
+ ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
+ ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
+ ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);
+
+ ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
+ ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
+ ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
+ ds->StencilTestEnable = cso->cso.stencil[0].enabled;
+ ds->StencilBufferWriteEnable =
+ cso->cso.stencil[0].writemask != 0 ||
+ (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
+}
+
+static void
+emit_vertex_buffer_state(struct crocus_batch *batch,
+ unsigned buffer_id,
+ struct crocus_bo *bo,
+ unsigned start_offset,
+ unsigned end_offset,
+ unsigned stride,
+ unsigned step_rate,
+ uint32_t **map)
+{
+ const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
+ _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
+ vb.BufferStartingAddress = ro_bo(bo, start_offset);
+ vb.VertexBufferIndex = buffer_id;
+ vb.BufferPitch = stride;
+#if GFX_VER == 7
+ vb.AddressModifyEnable = true;
+#endif
+#if GFX_VER >= 6
+ vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
+#endif
+ vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
+ vb.InstanceDataStepRate = step_rate;
+#if GFX_VER >= 5
+ vb.EndAddress = ro_bo(bo, end_offset - 1);
+#endif
+ }
+ *map += vb_dwords;
+}
+
+static bool
+can_emit_logic_op(struct crocus_context *ice)
+{
+ /* All pre-gen8 hardware restricts logic ops to UNORM destination formats. */
+ enum pipe_format pformat = PIPE_FORMAT_NONE;
+ for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
+ if (ice->state.framebuffer.cbufs[i]) {
+ pformat = ice->state.framebuffer.cbufs[i]->format;
+ break;
+ }
+ }
+ return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
+}
+
+#if GFX_VER >= 6
+static uint32_t
+determine_sample_mask(struct crocus_context *ice)
+{
+ uint32_t num_samples = ice->state.framebuffer.samples;
+
+ if (num_samples <= 1)
+ return 1;
+
+ uint32_t fb_mask = (1 << num_samples) - 1;
+ return ice->state.sample_mask & fb_mask;
+}
+#endif
+
+static void
+crocus_upload_dirty_render_state(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ const struct pipe_draw_info *draw)
+{
+ uint64_t dirty = ice->state.dirty;
+ uint64_t stage_dirty = ice->state.stage_dirty;
+
+ if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
+ !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
+ return;
+
+ if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
+ vf.StatisticsEnable = true;
+ }
+ }
+
+#if GFX_VER <= 5
+ if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
+ bool ret = calculate_curbe_offsets(batch);
+ if (ret) {
+ dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
+ stage_dirty |= CROCUS_STAGE_DIRTY_VS;
+ }
+ }
+
+ if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
+ stage_dirty & CROCUS_STAGE_DIRTY_VS) {
+ bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
+ brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
+ ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
+ if (ret)
+ dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+ }
+#endif
+ if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
+ const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+ uint32_t cc_vp_address;
+
+ /* XXX: could avoid streaming for depth_clip [0,1] case. */
+ uint32_t *cc_vp_map =
+ stream_state(batch,
+ 4 * ice->state.num_viewports *
+ GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
+ for (int i = 0; i < ice->state.num_viewports; i++) {
+ float zmin, zmax;
+ crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
+ ice->state.window_space_position,
+ &zmin, &zmax);
+ if (cso_rast->cso.depth_clip_near)
+ zmin = 0.0;
+ if (cso_rast->cso.depth_clip_far)
+ zmax = 1.0;
+
+ crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
+ ccv.MinimumDepth = zmin;
+ ccv.MaximumDepth = zmax;
+ }
+
+ cc_vp_map += GENX(CC_VIEWPORT_length);
+ }
+
+#if GFX_VER >= 7
+ crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
+ ptr.CCViewportPointer = cc_vp_address;
+ }
+#elif GFX_VER == 6
+ crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
+ vp.CCViewportStateChange = 1;
+ vp.PointertoCC_VIEWPORT = cc_vp_address;
+ }
+#else
+ ice->state.cc_vp_address = cc_vp_address;
+ dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+#endif
+ }
+
+ if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+#if GFX_VER == 7
+ uint32_t sf_cl_vp_address;
+ uint32_t *vp_map =
+ stream_state(batch,
+ 4 * ice->state.num_viewports *
+ GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
+#else
+ uint32_t *vp_map =
+ stream_state(batch,
+ 4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
+ 32, &ice->state.sf_vp_address);
+ uint32_t *clip_map =
+ stream_state(batch,
+ 4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
+ 32, &ice->state.clip_vp_address);
+#endif
+
+ for (unsigned i = 0; i < ice->state.num_viewports; i++) {
+ const struct pipe_viewport_state *state = &ice->state.viewports[i];
+ float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
+
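+ /* The guardband describes a region around the viewport that the
+ * clipper may trivially accept, so primitives extending only slightly
+ * beyond the viewport avoid full clipping.
+ */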
+ intel_calculate_guardband_size(cso_fb->width, cso_fb->height,
+ state->scale[0], state->scale[1],
+ state->translate[0], state->translate[1],
+ &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
+#if GFX_VER == 7
+ crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
+#else
+ crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
+#endif
+ {
+ vp.ViewportMatrixElementm00 = state->scale[0];
+ vp.ViewportMatrixElementm11 = state->scale[1];
+ vp.ViewportMatrixElementm22 = state->scale[2];
+ vp.ViewportMatrixElementm30 = state->translate[0];
+ vp.ViewportMatrixElementm31 = state->translate[1];
+ vp.ViewportMatrixElementm32 = state->translate[2];
+#if GFX_VER < 6
+ struct pipe_scissor_state scissor;
+ crocus_fill_scissor_rect(ice, 0, &scissor);
+ vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
+ vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
+ vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
+ vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
+#endif
+
+#if GFX_VER == 7
+ vp.XMinClipGuardband = gb_xmin;
+ vp.XMaxClipGuardband = gb_xmax;
+ vp.YMinClipGuardband = gb_ymin;
+ vp.YMaxClipGuardband = gb_ymax;
+#endif
+ }
+#if GFX_VER < 7
+ crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
+ clip.XMinClipGuardband = gb_xmin;
+ clip.XMaxClipGuardband = gb_xmax;
+ clip.YMinClipGuardband = gb_ymin;
+ clip.YMaxClipGuardband = gb_ymax;
+ }
+#endif
+#if GFX_VER == 7
+ vp_map += GENX(SF_CLIP_VIEWPORT_length);
+#else
+ vp_map += GENX(SF_VIEWPORT_length);
+ clip_map += GENX(CLIP_VIEWPORT_length);
+#endif
+ }
+#if GFX_VER == 7
+ crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
+ ptr.SFClipViewportPointer = sf_cl_vp_address;
+ }
+#elif GFX_VER == 6
+ crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
+ vp.SFViewportStateChange = 1;
+ vp.CLIPViewportStateChange = 1;
+ vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
+ vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
+ }
+#endif
+ }
+
+#if GFX_VER >= 6
+ if (dirty & CROCUS_DIRTY_GEN6_URB) {
+#if GFX_VER == 6
+ bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
+ || ice->shaders.ff_gs_prog;
+
+ struct brw_vue_prog_data *vue_prog_data =
+ (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
+ const unsigned vs_size = vue_prog_data->urb_entry_size;
+ unsigned gs_size = vs_size;
+ if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
+ struct brw_vue_prog_data *gs_vue_prog_data =
+ (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
+ gs_size = gs_vue_prog_data->urb_entry_size;
+ }
+
+ genX(upload_urb)(batch, vs_size, gs_present, gs_size);
+#endif
+#if GFX_VER == 7
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
+ bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
+ unsigned entry_size[4];
+
+ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+ if (!ice->shaders.prog[i]) {
+ entry_size[i] = 1;
+ } else {
+ struct brw_vue_prog_data *vue_prog_data =
+ (void *) ice->shaders.prog[i]->prog_data;
+ entry_size[i] = vue_prog_data->urb_entry_size;
+ }
+ assert(entry_size[i] != 0);
+ }
+
+ /* If we're just switching between programs with the same URB requirements,
+ * skip the rest of the logic.
+ */
+ bool no_change = false;
+ if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
+ ice->urb.gs_present == gs_present &&
+ ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
+ ice->urb.tess_present == tess_present &&
+ ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
+ ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
+ no_change = true;
+ }
+
+ if (!no_change) {
+ ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
+ ice->urb.gs_present = gs_present;
+ ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
+ ice->urb.tess_present = tess_present;
+ ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
+ ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];
+
+ unsigned entries[4];
+ unsigned start[4];
+ bool constrained;
+ intel_get_urb_config(devinfo,
+ batch->screen->l3_config_3d,
+ tess_present,
+ gs_present,
+ entry_size,
+ entries, start, NULL, &constrained);
+
+ if (!(GFX_VERx10 == 75) && !devinfo->is_baytrail)
+ gen7_emit_vs_workaround_flush(batch);
+ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ urb.VSURBStartingAddress = start[i];
+ urb.VSURBEntryAllocationSize = entry_size[i] - 1;
+ urb.VSNumberofURBEntries = entries[i];
+ }
+ }
+ }
+#endif
+ }
+
+ if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
+ struct crocus_blend_state *cso_blend = ice->state.cso_blend;
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
+
+ STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
+
+ const int rt_dwords =
+ MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
+
+ uint32_t blend_offset;
+ uint32_t *blend_map =
+ stream_state(batch,
+ 4 * rt_dwords, 64, &blend_offset);
+
+ bool indep_alpha_blend = false;
+ for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
+ const struct pipe_rt_blend_state *rt =
+ &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
+
+ enum pipe_blendfactor src_rgb =
+ fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
+ enum pipe_blendfactor src_alpha =
+ fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
+ enum pipe_blendfactor dst_rgb =
+ fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
+ enum pipe_blendfactor dst_alpha =
+ fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);
+
+ if (rt->rgb_func != rt->alpha_func ||
+ src_rgb != src_alpha || dst_rgb != dst_alpha)
+ indep_alpha_blend = true;
+
+ crocus_pack_state(GENX(BLEND_STATE_ENTRY), blend_map, be) {
+ if (can_emit_logic_op(ice)) {
+ be.LogicOpEnable = cso_blend->cso.logicop_enable;
+ be.LogicOpFunction = cso_blend->cso.logicop_func;
+ }
+
+ be.ColorClampRange = COLORCLAMP_RTFORMAT;
+ be.PreBlendColorClampEnable = true;
+ be.PostBlendColorClampEnable = true;
+
+ if (i == 0) {
+ struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
+ struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
+ be.ColorBufferBlendEnable = rt->blend_enable &&
+ (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
+ } else
+ be.ColorBufferBlendEnable = rt->blend_enable;
+
+ be.ColorBlendFunction = rt->rgb_func;
+ be.AlphaBlendFunction = rt->alpha_func;
+ be.SourceBlendFactor = (int) src_rgb;
+ be.SourceAlphaBlendFactor = (int) src_alpha;
+ be.DestinationBlendFactor = (int) dst_rgb;
+ be.DestinationAlphaBlendFactor = (int) dst_alpha;
+
+ be.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);
+ be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
+ be.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);
+ be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
+
+ be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
+ be.IndependentAlphaBlendEnable = indep_alpha_blend;
+ be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
+ be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage;
+ be.ColorDitherEnable = cso_blend->cso.dither;
+
+ /* bl.AlphaTestEnable and bs.AlphaTestFunction are supposed to be
+ * filled in later, but currently are not; fixing that can't be done
+ * here since it needs to be conditional on non-integer RTs.
+ */
+ be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
+ be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
+ }
+ blend_map += GENX(BLEND_STATE_ENTRY_length);
+ }
+
+#if GFX_VER < 7
+ crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+ ptr.PointertoBLEND_STATE = blend_offset;
+ ptr.BLEND_STATEChange = true;
+ }
+#else
+ crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
+ ptr.BlendStatePointer = blend_offset;
+ }
+#endif
+ }
+#endif
+
+ if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
+ struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
+ UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
+ struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
+ uint32_t cc_offset;
+ void *cc_map =
+ stream_state(batch,
+ sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
+ 64, &cc_offset);
+#if GFX_VER <= 5
+ dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+ int blend_idx = 0;
+
+ if (cso_blend->cso.independent_blend_enable) {
+ for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+ if (cso_blend->cso.rt[i].blend_enable) {
+ blend_idx = i;
+ break;
+ }
+ }
+ }
+ const struct pipe_rt_blend_state *rt = &cso_blend->cso.rt[blend_idx];
+#endif
+ _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
+ cc.AlphaTestFormat = ALPHATEST_FLOAT32;
+ cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
+
+#if GFX_VER <= 5
+
+ set_depth_stencil_bits(ice, &cc);
+
+ cc.ColorBufferBlendEnable = rt->blend_enable;
+
+ if (cso_blend->cso.logicop_enable) {
+ if (can_emit_logic_op(ice)) {
+ cc.LogicOpEnable = cso_blend->cso.logicop_enable;
+ cc.LogicOpFunction = cso_blend->cso.logicop_func;
+ }
+ }
+ cc.ColorDitherEnable = cso_blend->cso.dither;
+ cc.ColorBlendFunction = rt->rgb_func;
+ cc.AlphaBlendFunction = rt->alpha_func;
+ cc.SourceBlendFactor = rt->rgb_src_factor;
+ cc.SourceAlphaBlendFactor = rt->alpha_src_factor;
+ cc.DestinationBlendFactor = rt->rgb_dst_factor;
+ cc.DestinationAlphaBlendFactor = rt->alpha_dst_factor;
+
+ if (rt->rgb_func != rt->alpha_func ||
+ rt->rgb_src_factor != rt->alpha_src_factor ||
+ rt->rgb_dst_factor != rt->alpha_dst_factor)
+ cc.IndependentAlphaBlendEnable = true;
+
+ if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
+ cc.AlphaTestEnable = cso->cso.alpha_enabled;
+ cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
+ }
+ cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
+ cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
+#else
+ cc.AlphaTestFormat = ALPHATEST_FLOAT32;
+ cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
+
+ cc.BlendConstantColorRed = ice->state.blend_color.color[0];
+ cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
+ cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
+ cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
+#endif
+ cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
+ cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
+ }
+ ice->shaders.cc_offset = cc_offset;
+#if GFX_VER >= 6
+ crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+ ptr.ColorCalcStatePointer = cc_offset;
+#if GFX_VER != 7
+ ptr.ColorCalcStatePointerValid = true;
+#endif
+ }
+#endif
+ }
+#if GFX_VER <= 5
+ if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
+ blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
+ blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
+ blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
+ blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
+ }
+ }
+#endif
+ for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+ if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
+ continue;
+
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+
+ if (!shader)
+ continue;
+
+ if (shs->sysvals_need_upload)
+ upload_sysvals(ice, stage);
+
+#if GFX_VER <= 5
+ dirty |= CROCUS_DIRTY_GEN4_CURBE;
+#endif
+#if GFX_VER >= 7
+ struct push_bos push_bos = {};
+ setup_constant_buffers(ice, batch, stage, &push_bos);
+
+ emit_push_constant_packets(ice, batch, stage, &push_bos);
+#endif
+ }
+
+ for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+ if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
+ if (ice->shaders.prog[stage]) {
+#if GFX_VER <= 6
+ dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
+#endif
+ crocus_populate_binding_table(ice, batch, stage, false);
+ ice->shaders.prog[stage]->bind_bo_offset =
+ crocus_upload_binding_table(ice, batch,
+ ice->shaders.prog[stage]->surf_offset,
+ ice->shaders.prog[stage]->bt.size_bytes);
+
+#if GFX_VER == 7
+ crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
+ ptr._3DCommandSubOpcode = 38 + stage;
+ ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
+ }
+#endif
+#if GFX_VER == 6
+ } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
+ dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
+ crocus_populate_binding_table(ice, batch, stage, true);
+ ice->shaders.ff_gs_prog->bind_bo_offset =
+ crocus_upload_binding_table(ice, batch,
+ ice->shaders.ff_gs_prog->surf_offset,
+ ice->shaders.ff_gs_prog->bt.size_bytes);
+#endif
+ }
+ }
+ }
+#if GFX_VER <= 6
+ if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
+ struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
+ if (gs == NULL)
+ gs = ice->shaders.ff_gs_prog;
+ crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
+ ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
+ ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
+#if GFX_VER == 6
+ ptr.VSBindingTableChange = true;
+ ptr.PSBindingTableChange = true;
+ ptr.GSBindingTableChange = gs ? true : false;
+ ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
+#endif
+ }
+ }
+#endif
+
+ bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
+ for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+ if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
+ !ice->shaders.prog[stage])
+ continue;
+
+ crocus_upload_sampler_states(ice, batch, stage);
+
+ sampler_updates = true;
+
+#if GFX_VER >= 7
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
+ ptr._3DCommandSubOpcode = 43 + stage;
+ ptr.PointertoVSSamplerState = shs->sampler_offset;
+ }
+#endif
+ }
+
+ if (sampler_updates) {
+#if GFX_VER == 6
+ struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
+ struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
+ struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
+ crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
+ if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
+ (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
+ stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
+ ptr.VSSamplerStateChange = true;
+ ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
+ }
+ if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
+ (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
+ stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
+ ptr.GSSamplerStateChange = true;
+ ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
+ }
+ if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
+ (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
+ stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
+ ptr.PSSamplerStateChange = true;
+ ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
+ }
+ }
+#endif
+ }
+
+#if GFX_VER >= 6
+ if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
+ ms.PixelLocation =
+ ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
+ if (ice->state.framebuffer.samples > 0)
+ ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
+#if GFX_VER == 6
+ INTEL_SAMPLE_POS_4X(ms.Sample);
+#elif GFX_VER == 7
+ switch (ice->state.framebuffer.samples) {
+ case 1:
+ INTEL_SAMPLE_POS_1X(ms.Sample);
+ break;
+ case 2:
+ INTEL_SAMPLE_POS_2X(ms.Sample);
+ break;
+ case 4:
+ INTEL_SAMPLE_POS_4X(ms.Sample);
+ break;
+ case 8:
+ INTEL_SAMPLE_POS_8X(ms.Sample);
+ break;
+ default:
+ break;
+ }
+#endif
+ }
+ }
+
+ if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
+ ms.SampleMask = determine_sample_mask(ice);
+ }
+ }
+#endif
+
+#if GFX_VER >= 7
+ struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
+ if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
+ struct brw_stage_prog_data *prog_data = shader->prog_data;
+ struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
+ ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+ ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+ ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+
+ ps.DispatchGRFStartRegisterForConstantSetupData0 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
+ ps.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
+ ps.DispatchGRFStartRegisterForConstantSetupData2 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
+
+ ps.KernelStartPointer0 = KSP(ice, shader) +
+ brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
+ ps.KernelStartPointer1 = KSP(ice, shader) +
+ brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
+ ps.KernelStartPointer2 = KSP(ice, shader) +
+ brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
+
+#if GFX_VERx10 == 75
+ ps.SampleMask = determine_sample_mask(ice);
+#endif
+ // XXX: WABTPPrefetchDisable, see above, drop at C0
+ ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
+ ps.FloatingPointMode = prog_data->use_alt_mode;
+ ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
+
+ ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
+
+ ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
+ ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
+ ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
+ /* From the documentation for this packet:
+ * "If the PS kernel does not need the Position XY Offsets to
+ * compute a Position Value, then this field should be programmed
+ * to POSOFFSET_NONE."
+ *
+ * "SW Recommendation: If the PS kernel needs the Position Offsets
+ * to compute a Position XY value, this field should match Position
+ * ZW Interpolation Mode to ensure a consistent position.xyzw
+ * computation."
+ *
+ * We only require XY sample offsets. So, this recommendation doesn't
+ * look useful at the moment. We might need this in the future.
+ */
+ ps.PositionXYOffsetSelect =
+ wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
+
+ if (wm_prog_data->base.total_scratch) {
+ struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
+ ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
+ ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
+ }
+ }
+ }
+#endif
+
+#if GFX_VER >= 7
+ if (ice->state.streamout_active) {
+ if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
+ for (int i = 0; i < 4; i++) {
+ struct crocus_stream_output_target *tgt =
+ (void *) ice->state.so_target[i];
+
+ if (!tgt) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
+ sob.SOBufferIndex = i;
+ }
+ continue;
+ }
+ struct crocus_resource *res = (void *) tgt->base.buffer;
+ uint32_t start = tgt->base.buffer_offset;
+ uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
+ crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
+ sob.SOBufferIndex = i;
+
+ sob.SurfaceBaseAddress = rw_bo(res->bo, start);
+ sob.SurfacePitch = tgt->stride;
+ sob.SurfaceEndAddress = rw_bo(res->bo, end);
+ }
+ }
+ }
+
+ if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
+ uint32_t *decl_list =
+ ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
+ crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
+ }
+
+ if (dirty & CROCUS_DIRTY_STREAMOUT) {
+ const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+
+ uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
+ crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
+ sol.SOFunctionEnable = true;
+ sol.SOStatisticsEnable = true;
+
+ sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
+ !ice->state.prims_generated_query_active;
+ sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
+ }
+
+ assert(ice->state.streamout);
+
+ crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
+ GENX(3DSTATE_STREAMOUT_length));
+ }
+ } else {
+ if (dirty & CROCUS_DIRTY_STREAMOUT) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
+ }
+ }
+#endif
+#if GFX_VER == 6
+ if (ice->state.streamout_active) {
+ if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
+ crocus_emit_so_svbi(ice);
+ }
+ }
+#endif
+
+ if (dirty & CROCUS_DIRTY_CLIP) {
+#if GFX_VER < 6
+ const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data;
+ struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
+
+ uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
+ dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+ _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
+ clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
+ clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+ clip.SingleProgramFlow = true;
+ clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
+
+ clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
+ clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
+
+ clip.DispatchGRFStartRegisterForURBData = 1;
+ clip.VertexURBEntryReadOffset = 0;
+ clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
+
+ clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
+ clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
+
+ if (batch->ice->urb.nr_clip_entries >= 10) {
+ /* Half of the URB entries go to each thread, and it has to be an
+ * even number.
+ */
+ assert(batch->ice->urb.nr_clip_entries % 2 == 0);
+
+ /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
+ * only 2 threads can output VUEs at a time.
+ */
+ clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
+ } else {
+ assert(batch->ice->urb.nr_clip_entries >= 5);
+ clip.MaximumNumberofThreads = 1 - 1;
+ }
+ clip.VertexPositionSpace = VPOS_NDCSPACE;
+ clip.UserClipFlagsMustClipEnable = true;
+ clip.GuardbandClipTestEnable = true;
+
+ clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
+ clip.ScreenSpaceViewportXMin = -1.0;
+ clip.ScreenSpaceViewportXMax = 1.0;
+ clip.ScreenSpaceViewportYMin = -1.0;
+ clip.ScreenSpaceViewportYMax = 1.0;
+ clip.ViewportXYClipTestEnable = true;
+ clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
+
+#if GFX_VER == 5 || GFX_VERx10 == 45
+ clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
+#else
+ /* Up to 6 actual clip flags, plus the 7th for the negative RHW
+ * workaround.
+ */
+ clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
+#endif
+
+ clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
+ clip.GuardbandClipTestEnable = true;
+
+ clip.ClipMode = clip_prog_data->clip_mode;
+#if GFX_VERx10 == 45
+ clip.NegativeWClipTestEnable = true;
+#endif
+ }
+
+#else //if GFX_VER >= 6
+ struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+ const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
+ ice->shaders.prog[MESA_SHADER_TESS_EVAL];
+ bool points_or_lines = cso_rast->fill_mode_point_or_line ||
+ (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
+ : ice->state.prim_is_points_or_lines);
+ uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
+ crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
+ cl.StatisticsEnable = ice->state.statistics_counters_enabled;
+ if (cso_rast->cso.rasterizer_discard)
+ cl.ClipMode = CLIPMODE_REJECT_ALL;
+ else if (ice->state.window_space_position)
+ cl.ClipMode = CLIPMODE_ACCEPT_ALL;
+ else
+ cl.ClipMode = CLIPMODE_NORMAL;
+
+ cl.PerspectiveDivideDisable = ice->state.window_space_position;
+ cl.ViewportXYClipTestEnable = !points_or_lines;
+
+ cl.UserClipDistanceCullTestEnableBitmask =
+ brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
+
+ if (wm_prog_data->barycentric_interp_modes &
+ BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
+ cl.NonPerspectiveBarycentricEnable = true;
+
+ cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
+ cl.MaximumVPIndex = ice->state.num_viewports - 1;
+ }
+ crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
+ ARRAY_SIZE(cso_rast->clip));
+#endif
+ }
+
+ if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
+ struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
+ const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
+ const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+#if GFX_VER == 7
+ if (batch->screen->devinfo.is_ivybridge)
+ gen7_emit_vs_workaround_flush(batch);
+#endif
+
+#if GFX_VER == 6
+ struct push_bos push_bos = {};
+ setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
+
+ emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
+#endif
+#if GFX_VER >= 6
+ crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
+#else
+ uint32_t *vs_ptr = stream_state(batch,
+ GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
+ dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+ _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
+#endif
+ {
+ INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
+
+ vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
+
+#if GFX_VER < 6
+ vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
+ vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
+ vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
+
+ vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
+ vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
+
+ vs.MaximumNumberofThreads =
+ CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
+ vs.StatisticsEnable = false;
+ vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
+#endif
+#if GFX_VER == 5
+ /* Force single program flow on Ironlake. We cannot reliably get
+ * all applications working without it. See:
+ * https://bugs.freedesktop.org/show_bug.cgi?id=29172
+ *
+ * The most notable and reliably failing application is the Humus
+ * demo "CelShading".
+ */
+ vs.SingleProgramFlow = true;
+ vs.SamplerCount = 0; /* hardware requirement */
+
+#endif
+ }
+
+#if GFX_VER == 6
+ crocus_emit_pipe_control_flush(batch,
+ "post VS const",
+ PIPE_CONTROL_DEPTH_STALL |
+ PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+ PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+#endif
+ }
+
+ if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
+ struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
+ bool active = GFX_VER >= 6 && shader;
+#if GFX_VER == 6
+ struct push_bos push_bos = {};
+ if (shader)
+ setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
+
+ emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
+#endif
+#if GFX_VER >= 6
+ crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
+#else
+ uint32_t *gs_ptr = stream_state(batch,
+ GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
+ dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+ _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
+#endif
+ {
+#if GFX_VER >= 6
+ if (active) {
+ const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data);
+ const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
+ const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base;
+
+ INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
+#if GFX_VER >= 7
+ gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
+ gs.OutputTopology = gs_prog_data->output_topology;
+ gs.ControlDataHeaderSize =
+ gs_prog_data->control_data_header_size_hwords;
+
+ gs.InstanceControl = gs_prog_data->invocations - 1;
+ gs.DispatchMode = vue_prog_data->dispatch_mode;
+
+ gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
+
+ gs.ControlDataFormat = gs_prog_data->control_data_format;
+#endif
+
+ /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
+ * Ivy Bridge and Haswell.
+ *
+ * On Ivy Bridge, setting this bit causes the vertices of a triangle
+ * strip to be delivered to the geometry shader in an order that does
+ * not strictly follow the OpenGL spec, but preserves triangle
+ * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then
+ * the geometry shader sees triangles:
+ *
+ * (1, 2, 3), (2, 4, 3), (3, 4, 5)
+ *
+ * (Clearing the bit is even worse, because it fails to preserve
+ * orientation).
+ *
+ * Triangle strips with adjacency are always ordered in a way that
+ * preserves triangle orientation but does not strictly follow the
+ * OpenGL spec, regardless of the setting of this bit.
+ *
+ * On Haswell, both triangle strips and triangle strips with adjacency
+ * are always ordered in a way that preserves triangle orientation.
+ * Setting this bit causes the ordering to strictly follow the OpenGL
+ * spec.
+ *
+ * So in either case we want to set the bit. Unfortunately on Ivy
+ * Bridge this will get the order close to correct but not perfect.
+ */
+ gs.ReorderMode = TRAILING;
+ gs.MaximumNumberofThreads = (batch->screen->devinfo.max_gs_threads - 1);
+
+#if GFX_VER < 7
+ gs.SOStatisticsEnable = true;
+ if (gs_prog_data->num_transform_feedback_bindings)
+ gs.SVBIPayloadEnable = ice->state.streamout_active;
+
+ /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled, as was
+ * previously done for gen6.
+ *
+ * TODO: test with both disabled to see if the HW is behaving
+ * as expected, like in gen7.
+ */
+ gs.SingleProgramFlow = true;
+ gs.VectorMaskEnable = true;
+#endif
+ }
+#endif
+#if GFX_VER <= 6
+ if (!active && ice->shaders.ff_gs_prog) {
+ const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
+ /* In gen6, transform feedback for the VS stage is done with an
+ * ad-hoc GS program; this block provides the 3DSTATE_GS setup
+ * needed for it.
+ */
+ gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
+ gs.SingleProgramFlow = true;
+ gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
+ gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
+
+#if GFX_VER <= 5
+ gs.GRFRegisterCount =
+ DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
+ /* BRW_NEW_URB_FENCE */
+ gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
+ gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
+ gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
+ gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+#else
+ gs.Enable = true;
+ gs.VectorMaskEnable = true;
+ gs.SVBIPayloadEnable = true;
+ gs.SVBIPostIncrementEnable = true;
+ gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
+ gs.SOStatisticsEnable = true;
+ gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
+#endif
+ }
+#endif
+ if (!active && !ice->shaders.ff_gs_prog) {
+ gs.DispatchGRFStartRegisterForURBData = 1;
+#if GFX_VER >= 7
+ gs.IncludeVertexHandles = true;
+#endif
+ }
+#if GFX_VER >= 6
+ gs.StatisticsEnable = true;
+#endif
+#if GFX_VER == 5 || GFX_VER == 6
+ gs.RenderingEnabled = true;
+#endif
+#if GFX_VER <= 5
+ gs.MaximumVPIndex = ice->state.num_viewports - 1;
+#endif
+ }
+ }
+
+#if GFX_VER >= 7
+ if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
+ struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
+
+ if (shader) {
+ const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data);
+ const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
+ const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base;
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
+ INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
+ hs.InstanceCount = tcs_prog_data->instances - 1;
+ hs.IncludeVertexHandles = true;
+ hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
+ }
+ } else {
+ crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
+ }
+
+ }
+
+ if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
+ struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
+ if (shader) {
+ const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data);
+ const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
+ const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base;
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
+ te.Partitioning = tes_prog_data->partitioning;
+ te.OutputTopology = tes_prog_data->output_topology;
+ te.TEDomain = tes_prog_data->domain;
+ te.TEEnable = true;
+ te.MaximumTessellationFactorOdd = 63.0;
+ te.MaximumTessellationFactorNotOdd = 64.0;
+ };
+ crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
+ INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
+
+ ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
+ ds.ComputeWCoordinateEnable =
+ tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+ };
+ } else {
+ crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
+ crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
+ }
+ }
+#endif
+ if (dirty & CROCUS_DIRTY_RASTER) {
+
+#if GFX_VER < 6
+ const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data;
+ struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
+ uint32_t *sf_ptr = stream_state(batch,
+ GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
+ dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+ _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
+ sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
+ sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+ sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
+ sf.DispatchGRFStartRegisterForURBData = 3;
+ sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
+ sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
+ sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
+ sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
+ sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+
+ sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
+
+ sf.MaximumNumberofThreads =
+ MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
+
+ sf.SpritePointEnable = cso_state->point_quad_rasterization;
+ sf.DestinationOriginHorizontalBias = 0.5;
+ sf.DestinationOriginVerticalBias = 0.5;
+
+ sf.LastPixelEnable = cso_state->line_last_pixel;
+ sf.LineWidth = get_line_width(cso_state);
+ sf.PointWidth = cso_state->point_size;
+ sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
+#if GFX_VERx10 == 45 || GFX_VER >= 5
+ sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
+#endif
+ sf.ViewportTransformEnable = true;
+ sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
+ sf.ScissorRectangleEnable = true;
+ sf.CullMode = translate_cull_mode(cso_state->cull_face);
+
+ if (cso_state->flatshade_first) {
+ sf.TriangleFanProvokingVertexSelect = 1;
+ } else {
+ sf.TriangleStripListProvokingVertexSelect = 2;
+ sf.TriangleFanProvokingVertexSelect = 2;
+ sf.LineStripListProvokingVertexSelect = 1;
+ }
+ }
+#else
+ struct crocus_rasterizer_state *cso = ice->state.cso_rast;
+ uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
+ crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
+ sf.ViewportTransformEnable = !ice->state.window_space_position;
+
+#if GFX_VER == 6
+ const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
+ uint32_t urb_entry_read_length;
+ uint32_t urb_entry_read_offset;
+ uint32_t point_sprite_enables;
+ calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
+ &urb_entry_read_length,
+ &urb_entry_read_offset);
+ sf.VertexURBEntryReadLength = urb_entry_read_length;
+ sf.VertexURBEntryReadOffset = urb_entry_read_offset;
+ sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
+ sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
+ sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+#endif
+
+#if GFX_VER >= 6
+ if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
+ sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+#endif
+#if GFX_VER == 7
+ if (ice->state.framebuffer.zsbuf) {
+ struct crocus_resource *zres, *sres;
+ crocus_get_depth_stencil_resources(&batch->screen->devinfo,
+ ice->state.framebuffer.zsbuf->texture,
+ &zres, &sres);
+ /* ANV thinks that the stencil-ness doesn't matter; this is just
+ * about handling polygon offset scaling.
+ */
+ sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
+ }
+#endif
+ }
+ crocus_emit_merge(batch, cso->sf, dynamic_sf,
+ ARRAY_SIZE(dynamic_sf));
+#endif
+ }
+
+ if (dirty & CROCUS_DIRTY_WM) {
+ struct crocus_rasterizer_state *cso = ice->state.cso_rast;
+ const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
+ UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
+ UNUSED const struct shader_info *fs_info =
+ crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
+
+#if GFX_VER == 6
+ struct push_bos push_bos = {};
+ setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
+
+ emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
+#endif
+#if GFX_VER >= 6
+ crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
+#else
+ uint32_t *wm_ptr = stream_state(batch,
+ GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
+
+ dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+
+ _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
+#endif
+ {
+#if GFX_VER <= 6
+ wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+ wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+ wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+#endif
+#if GFX_VER == 4
+ /* On gen4, we only have one shader kernel */
+ if (brw_wm_state_has_ksp(wm, 0)) {
+ wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
+ wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+ wm.DispatchGRFStartRegisterForConstantSetupData0 =
+ wm_prog_data->base.dispatch_grf_start_reg;
+ }
+#elif GFX_VER == 5
+ wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+ wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+ wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+ wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+ wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
+ wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
+
+ wm.DispatchGRFStartRegisterForConstantSetupData0 =
+ wm_prog_data->base.dispatch_grf_start_reg;
+#elif GFX_VER == 6
+ wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+ wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+ wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+ brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+ wm.DispatchGRFStartRegisterForConstantSetupData0 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
+ wm.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
+ wm.DispatchGRFStartRegisterForConstantSetupData2 =
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
+#endif
+#if GFX_VER <= 5
+ wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
+ wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
+ wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+ wm.SetupURBEntryReadOffset = 0;
+ wm.EarlyDepthTestEnable = true;
+ wm.LineAntialiasingRegionWidth = _05pixels;
+ wm.LineEndCapAntialiasingRegionWidth = _10pixels;
+ wm.DepthCoefficientURBReadOffset = 1;
+
+ if (cso->cso.offset_tri) {
+ wm.GlobalDepthOffsetEnable = true;
+
+ /* Something weird is going on with legacy_global_depth_bias,
+ * offset_constant, scaling and MRD. This value passes glean
+ * but gives some odd results elsewhere (e.g. the
+ * quad-offset-units test).
+ */
+ wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
+ wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
+ }
+ wm.SamplerStatePointer = ro_bo(batch->state.bo,
+ ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
+#endif
+
+ wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
+ ice->state.statistics_counters_enabled : 0;
+
+#if GFX_VER >= 6
+ wm.LineAntialiasingRegionWidth = _10pixels;
+ wm.LineEndCapAntialiasingRegionWidth = _05pixels;
+
+ wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+ wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
+#endif
+#if GFX_VER == 6
+ wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
+ ice->state.cso_blend->dual_color_blending;
+ wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
+ wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+
+ /* From the SNB PRM, volume 2 part 1, page 281:
+ * "If the PS kernel does not need the Position XY Offsets
+ * to compute a Position XY value, then this field should be
+ * programmed to POSOFFSET_NONE."
+ *
+ * "SW Recommendation: If the PS kernel needs the Position Offsets
+ * to compute a Position XY value, this field should match Position
+ * ZW Interpolation Mode to ensure a consistent position.xyzw
+ * computation."
+ *
+ * We only require XY sample offsets. So, this recommendation doesn't
+ * look useful at the moment. We might need this in the future.
+ */
+ if (wm_prog_data->uses_pos_offset)
+ wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
+ else
+ wm.PositionXYOffsetSelect = POSOFFSET_NONE;
+#endif
+ wm.LineStippleEnable = cso->cso.line_stipple_enable;
+ wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
+
+#if GFX_VER < 7
+ if (wm_prog_data->base.use_alt_mode)
+ wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+ wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
+ wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
+#endif
+
+#if GFX_VER >= 6
+ wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
+
+ struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
+ if (fb->samples > 1) {
+ if (cso->cso.multisample)
+ wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+ else
+ wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
+
+ if (wm_prog_data->persample_dispatch)
+ wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+ else
+ wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
+ } else {
+ wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
+ wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+ }
+#endif
+
+ wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
+
+ if (wm_prog_data->uses_kill ||
+ ice->state.cso_zsa->cso.alpha_enabled ||
+ ice->state.cso_blend->cso.alpha_to_coverage ||
+ (GFX_VER >= 6 && wm_prog_data->uses_omask))
+ wm.PixelShaderKillsPixel = true;
+
+ if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
+ writes_depth || wm.PixelShaderKillsPixel ||
+ (GFX_VER >= 6 && wm_prog_data->has_side_effects))
+ wm.ThreadDispatchEnable = true;
+
+#if GFX_VER >= 7
+ wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
+ wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
+#else
+ if (wm_prog_data->base.total_scratch) {
+ struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
+ MESA_SHADER_FRAGMENT);
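+      /* Pre-gen7 encodes the per-thread scratch size as a power of two in
+       * 1KB units: ffs(size) - 11 gives 0 = 1KB, 1 = 2KB, ...
+       */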
+ wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
+ wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
+ }
+
+ wm.PixelShaderComputedDepth = writes_depth;
+
+#endif
+ /* The "UAV access enable" bits are unnecessary on HSW because they only
+ * seem to have an effect on the HW-assisted coherency mechanism which we
+ * don't need, and the rasterization-related UAV_ONLY flag and the
+ * DISPATCH_ENABLE bit can be set independently from it.
+ * C.f. gen8_upload_ps_extra().
+ *
+ * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
+ * _NEW_COLOR
+ */
+#if GFX_VERx10 == 75
+ if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
+ wm_prog_data->has_side_effects)
+ wm.PSUAVonly = ON;
+#endif
+
+#if GFX_VER >= 7
+ /* BRW_NEW_FS_PROG_DATA */
+ if (wm_prog_data->early_fragment_tests)
+ wm.EarlyDepthStencilControl = EDSC_PREPS;
+ else if (wm_prog_data->has_side_effects)
+ wm.EarlyDepthStencilControl = EDSC_PSEXEC;
+#endif
+ };
+
+#if GFX_VER <= 5
+ if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
+ clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
+ }
+ ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
+ }
+#endif
+ }
+
+#if GFX_VER >= 7
+ if (dirty & CROCUS_DIRTY_GEN7_SBE) {
+ crocus_emit_sbe(batch, ice);
+ }
+#endif
+
+#if GFX_VER >= 6
+ if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
+ uint32_t ds_offset;
+ void *ds_map = stream_state(batch,
+ sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
+ 64, &ds_offset);
+ _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
+ set_depth_stencil_bits(ice, &ds);
+ }
+
+#if GFX_VER == 6
+ crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+ ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
+ ptr.DEPTH_STENCIL_STATEChange = true;
+ }
+#else
+ crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
+ ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
+ }
+#endif
+ }
+
+ if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
+ /* Align to 64-byte boundary as per anv. */
+ uint32_t scissor_offset;
+ struct pipe_scissor_state *scissor_map = (void *)
+ stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
+ 64, &scissor_offset);
+ for (int i = 0; i < ice->state.num_viewports; i++) {
+ struct pipe_scissor_state scissor;
+ crocus_fill_scissor_rect(ice, i, &scissor);
+ scissor_map[i] = scissor;
+ }
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
+ ptr.ScissorRectPointer = scissor_offset;
+ }
+ }
+#endif
+
+ if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
+ struct isl_device *isl_dev = &batch->screen->isl_dev;
+#if GFX_VER >= 6
+ crocus_emit_depth_stall_flushes(batch);
+#endif
+ void *batch_ptr;
+ struct crocus_resource *zres, *sres;
+ struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
+ batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
+
+ struct isl_view view = {
+ .base_level = 0,
+ .levels = 1,
+ .base_array_layer = 0,
+ .array_len = 1,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ };
+ struct isl_depth_stencil_hiz_emit_info info = { .view = &view };
+
+ if (cso->zsbuf) {
+ crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
+ struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
+ if (zsbuf->align_res) {
+ zres = (struct crocus_resource *)zsbuf->align_res;
+ }
+ view.base_level = cso->zsbuf->u.tex.level;
+ view.base_array_layer = cso->zsbuf->u.tex.first_layer;
+ view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
+
+ if (zres) {
+ view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
+
+ info.depth_surf = &zres->surf;
+ info.depth_address = crocus_command_reloc(batch,
+ (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
+ zres->bo, 0, RELOC_32BIT);
+
+ info.mocs = crocus_mocs(zres->bo, isl_dev);
+ view.format = zres->surf.format;
+
+ if (crocus_resource_level_has_hiz(zres, view.base_level)) {
+ info.hiz_usage = zres->aux.usage;
+ info.hiz_surf = &zres->aux.surf;
+ uint32_t hiz_offset = 0;
+
+#if GFX_VER == 6
+ /* HiZ surfaces on Sandy Bridge technically don't support
+ * mip-mapping. However, we can fake it by offsetting to the
+ * first slice of LOD0 in the HiZ surface.
+ */
+ isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
+ view.base_level, 0, 0,
+ &hiz_offset, NULL, NULL);
+#endif
+ info.hiz_address = crocus_command_reloc(batch,
+ (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
+ zres->aux.bo, zres->aux.offset + hiz_offset,
+ RELOC_32BIT);
+ info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
+ }
+ }
+
+#if GFX_VER >= 6
+ if (sres) {
+ view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
+ info.stencil_aux_usage = sres->aux.usage;
+ info.stencil_surf = &sres->surf;
+
+ uint32_t stencil_offset = 0;
+#if GFX_VER == 6
+ /* Stencil surfaces on Sandy Bridge technically don't support
+ * mip-mapping. However, we can fake it by offsetting to the
+ * first slice of LOD0 in the stencil surface.
+ */
+ isl_surf_get_image_offset_B_tile_sa(&sres->surf,
+ view.base_level, 0, 0,
+ &stencil_offset, NULL, NULL);
+#endif
+
+ info.stencil_address = crocus_command_reloc(batch,
+ (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
+ sres->bo, stencil_offset, RELOC_32BIT);
+ if (!zres) {
+ view.format = sres->surf.format;
+ info.mocs = crocus_mocs(sres->bo, isl_dev);
+ }
+ }
+#endif
+ }
+ isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
+ }
+
+ /* TODO: Disable emitting this until something uses a stipple. */
+ if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
+ for (int i = 0; i < 32; i++) {
+ poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
+ }
+ }
+ }
+
+ if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
+ struct crocus_rasterizer_state *cso = ice->state.cso_rast;
+ crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
+ }
+
+#if GFX_VER <= 5
+ if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
+ upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
+ ice->shaders.vs_offset, ice->shaders.sf_offset,
+ ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
+ crocus_upload_urb_fence(batch);
+
+ crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
+ cs.NumberofURBEntries = ice->urb.nr_cs_entries;
+ cs.URBEntryAllocationSize = ice->urb.csize - 1;
+ }
+ dirty |= CROCUS_DIRTY_GEN4_CURBE;
+ }
+#endif
+ if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
+ struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
+ if (fb->width && fb->height) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+ rect.ClippedDrawingRectangleXMax = fb->width - 1;
+ rect.ClippedDrawingRectangleYMax = fb->height - 1;
+ }
+ }
+ }
+
+ if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
+ const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
+ const uint32_t count = user_count +
+ ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
+ uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
+
+ if (count) {
+ const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
+
+ uint32_t *map =
+ crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
+ _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
+ vb.DWordLength = (vb_dwords * count + 1) - 2;
+ }
+ map += 1;
+
+ uint32_t bound = dynamic_bound;
+ int i;
+ while (bound) {
+ i = u_bit_scan(&bound);
+ struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
+ struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
+ uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
+
+ emit_vertex_buffer_state(batch, i, bo,
+ buf->buffer_offset,
+ ice->state.vb_end[i],
+ buf->stride,
+ step_rate,
+ &map);
+ }
+ i = user_count;
+ if (ice->state.vs_uses_draw_params) {
+ struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
+ emit_vertex_buffer_state(batch, i++,
+ res->bo,
+ ice->draw.draw_params.offset,
+ ice->draw.draw_params.res->width0,
+ 0, 0, &map);
+ }
+ if (ice->state.vs_uses_derived_draw_params) {
+ struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
+ emit_vertex_buffer_state(batch, i++,
+ res->bo,
+ ice->draw.derived_draw_params.offset,
+ ice->draw.derived_draw_params.res->width0,
+ 0, 0, &map);
+ }
+ }
+ }
+
+ if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
+ struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
+ const unsigned entries = MAX2(cso->count, 1);
+ if (!(ice->state.vs_needs_sgvs_element ||
+ ice->state.vs_uses_derived_draw_params ||
+ ice->state.vs_needs_edge_flag)) {
+ crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
+ (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
+ } else {
+ uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
+ const unsigned dyn_count = cso->count +
+ ice->state.vs_needs_sgvs_element +
+ ice->state.vs_uses_derived_draw_params;
+
+ crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
+ &dynamic_ves, ve) {
+ ve.DWordLength =
+ 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
+ }
+ memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
+ (cso->count - ice->state.vs_needs_edge_flag) *
+ GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
+ uint32_t *ve_pack_dest =
+ &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
+ GENX(VERTEX_ELEMENT_STATE_length)];
+
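+         /* Append an internal vertex element for the system-generated values:
+          * components 0/1 read base vertex/instance from the draw parameters
+          * buffer (or store zero), components 2/3 supply VertexID/InstanceID.
+          */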
+ if (ice->state.vs_needs_sgvs_element) {
+ uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
+ VFCOMP_STORE_SRC : VFCOMP_STORE_0;
+ crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
+ ve.Valid = true;
+ ve.VertexBufferIndex =
+ util_bitcount64(ice->state.bound_vertex_buffers);
+ ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
+ ve.Component0Control = base_ctrl;
+ ve.Component1Control = base_ctrl;
+ ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
+ ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
+#if GFX_VER < 5
+ ve.DestinationElementOffset = cso->count * 4;
+#endif
+ }
+ ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
+ }
+ if (ice->state.vs_uses_derived_draw_params) {
+ crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
+ ve.Valid = true;
+ ve.VertexBufferIndex =
+ util_bitcount64(ice->state.bound_vertex_buffers) +
+ ice->state.vs_uses_draw_params;
+ ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
+ ve.Component0Control = VFCOMP_STORE_SRC;
+ ve.Component1Control = VFCOMP_STORE_SRC;
+ ve.Component2Control = VFCOMP_STORE_0;
+ ve.Component3Control = VFCOMP_STORE_0;
+#if GFX_VER < 5
+ ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
+#endif
+ }
+ ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
+ }
+ if (ice->state.vs_needs_edge_flag) {
+ for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)
+ ve_pack_dest[i] = cso->edgeflag_ve[i];
+ }
+
+ crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
+ (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
+ }
+ }
+
+#if GFX_VERx10 == 75
+ if (dirty & CROCUS_DIRTY_GEN75_VF) {
+ crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
+ if (draw->primitive_restart) {
+ vf.IndexedDrawCutIndexEnable = true;
+ vf.CutIndex = draw->restart_index;
+ }
+ }
+ }
+#endif
+
+#if GFX_VER <= 5
+ if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
+ gen4_upload_curbe(batch);
+ }
+#endif
+}
+
+static void
+crocus_upload_render_state(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ const struct pipe_draw_info *draw,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *sc)
+{
+#if GFX_VER == 7
+ bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
+#endif
+ bool emit_index = false;
+ batch->no_wrap = true;
+
+ if (!batch->contains_draw) {
+ emit_index = true;
+ batch->contains_draw = true;
+ }
+ crocus_update_surface_base_address(batch);
+
+ crocus_upload_dirty_render_state(ice, batch, draw);
+
+ batch->no_wrap = false;
+ if (draw->index_size > 0) {
+ unsigned offset;
+ unsigned size;
+
+ if (draw->has_user_indices) {
+ unsigned start_offset = draw->index_size * sc->start;
+ u_upload_data(ice->ctx.stream_uploader, 0,
+ sc->count * draw->index_size, 4,
+ (char *)draw->index.user + start_offset,
+ &offset, &ice->state.index_buffer.res);
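+         /* Only indices [start, start + count) were uploaded; bias the offset
+          * so that the hardware's start index still lands on the uploaded data.
+          */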
+ offset -= start_offset;
+ size = start_offset + sc->count * draw->index_size;
+ emit_index = true;
+ } else {
+ struct crocus_resource *res = (void *) draw->index.resource;
+ res->bind_history |= PIPE_BIND_INDEX_BUFFER;
+
+ if (ice->state.index_buffer.res != draw->index.resource) {
+ pipe_resource_reference(&ice->state.index_buffer.res,
+ draw->index.resource);
+ emit_index = true;
+ }
+ offset = 0;
+ size = draw->index.resource->width0;
+ }
+
+ if (!emit_index &&
+ (ice->state.index_buffer.size != size ||
+ ice->state.index_buffer.index_size != draw->index_size ||
+ ice->state.index_buffer.prim_restart != draw->primitive_restart))
+ emit_index = true;
+
+ if (emit_index) {
+ struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
+#if !(GFX_VERx10 == 75)
+ ib.CutIndexEnable = draw->primitive_restart;
+#endif
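+            /* index_size is in bytes (1/2/4); >> 1 gives the
+             * INDEX_BYTE/WORD/DWORD encoding.
+             */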
+ ib.IndexFormat = draw->index_size >> 1;
+ ib.BufferStartingAddress = ro_bo(bo, offset);
+ ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
+ }
+ ice->state.index_buffer.size = size;
+ ice->state.index_buffer.offset = offset;
+ ice->state.index_buffer.index_size = draw->index_size;
+ ice->state.index_buffer.prim_restart = draw->primitive_restart;
+ }
+ }
+
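+/* MMIO registers from which 3DPRIMITIVE fetches its parameters when
+ * IndirectParameterEnable is set; for indirect draws we load them with
+ * MI_LOAD_REGISTER_MEM below.
+ */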
+#define _3DPRIM_END_OFFSET 0x2420
+#define _3DPRIM_START_VERTEX 0x2430
+#define _3DPRIM_VERTEX_COUNT 0x2434
+#define _3DPRIM_INSTANCE_COUNT 0x2438
+#define _3DPRIM_START_INSTANCE 0x243C
+#define _3DPRIM_BASE_VERTEX 0x2440
+
+#if GFX_VER == 7
+ if (indirect && !indirect->count_from_stream_output) {
+ if (indirect->indirect_draw_count) {
+ use_predicate = true;
+
+ struct crocus_bo *draw_count_bo =
+ crocus_resource_bo(indirect->indirect_draw_count);
+ unsigned draw_count_offset =
+ indirect->indirect_draw_count_offset;
+
+ crocus_emit_pipe_control_flush(batch,
+ "ensure indirect draw buffer is flushed",
+ PIPE_CONTROL_FLUSH_ENABLE);
+ if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+#if GFX_VERx10 == 75
+ struct mi_builder b;
+ mi_builder_init(&b, &batch->screen->devinfo, batch);
+
+ /* comparison = draw id < draw count */
+ struct mi_value comparison =
+ mi_ult(&b, mi_imm(drawid_offset),
+ mi_mem32(ro_bo(draw_count_bo,
+ draw_count_offset)));
+
+ /* predicate = comparison & conditional rendering predicate */
+ struct mi_value pred = mi_iand(&b, comparison,
+ mi_reg32(CS_GPR(15)));
+
+ mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
+ mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
+
+ unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
+ MI_PREDICATE_COMBINEOP_SET |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+
+ crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+#endif
+ } else {
+ uint32_t mi_predicate;
+
+ /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
+ crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
+ /* Upload the current draw count from the draw parameters buffer
+ * to MI_PREDICATE_SRC0.
+ */
+ crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
+ draw_count_bo, draw_count_offset);
+ /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
+ crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);
+
+ if (drawid_offset == 0) {
+ mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
+ MI_PREDICATE_COMBINEOP_SET |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+ } else {
+ /* While draw_index < draw_count the predicate's result will be
+ * (draw_index == draw_count) ^ TRUE = TRUE
+ * When draw_index == draw_count the result is
+ * (TRUE) ^ TRUE = FALSE
+ * After this all results will be:
+ * (FALSE) ^ FALSE = FALSE
+ */
+ mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
+ MI_PREDICATE_COMBINEOP_XOR |
+ MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+ }
+ crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+ }
+ }
+
+#if GFX_VER >= 7
+ struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
+ assert(bo);
+
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
+ lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
+ }
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
+ lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
+ }
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = _3DPRIM_START_VERTEX;
+ lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
+ }
+ if (draw->index_size) {
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
+ lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
+ }
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
+ lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
+ }
+ } else {
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
+ lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
+ }
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
+ lri.DataDWord = 0;
+ }
+ }
+#endif
+ } else if (indirect && indirect->count_from_stream_output) {
+#if GFX_VERx10 == 75
+ struct crocus_stream_output_target *so =
+ (void *) indirect->count_from_stream_output;
+
+ /* XXX: Replace with actual cache tracking */
+ crocus_emit_pipe_control_flush(batch,
+ "draw count from stream output stall",
+ PIPE_CONTROL_CS_STALL);
+
+ struct mi_builder b;
+ mi_builder_init(&b, &batch->screen->devinfo, batch);
+
+ struct crocus_address addr =
+ ro_bo(crocus_resource_bo(&so->offset_res->base), so->offset_offset);
+ struct mi_value offset =
+ mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
+
+ mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
+ mi_udiv32_imm(&b, offset, so->stride));
+
+ _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
+ _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
+ _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
+ _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
+#endif
+ }
+#else
+ assert(!indirect);
+#endif
+
+ crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
+ prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
+#if GFX_VER == 7
+ prim.PredicateEnable = use_predicate;
+#endif
+
+ prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, draw->vertices_per_patch);
+ if (indirect) {
+ // XXX Probably have to do something for gen6 here?
+#if GFX_VER == 7
+ prim.IndirectParameterEnable = true;
+#endif
+ } else {
+#if GFX_VER >= 5
+ prim.StartInstanceLocation = draw->start_instance;
+#endif
+ prim.InstanceCount = draw->instance_count;
+ prim.VertexCountPerInstance = sc->count;
+
+ prim.StartVertexLocation = sc->start;
+
+ if (draw->index_size) {
+ prim.BaseVertexLocation += sc->index_bias;
+ }
+ }
+ }
+}
+
+#if GFX_VER == 7
+
+static void
+crocus_upload_compute_state(struct crocus_context *ice,
+ struct crocus_batch *batch,
+ const struct pipe_grid_info *grid)
+{
+ const uint64_t stage_dirty = ice->state.stage_dirty;
+ struct crocus_screen *screen = batch->screen;
+ const struct intel_device_info *devinfo = &screen->devinfo;
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
+ struct crocus_compiled_shader *shader =
+ ice->shaders.prog[MESA_SHADER_COMPUTE];
+ struct brw_stage_prog_data *prog_data = shader->prog_data;
+ struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
+ const struct brw_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);
+
+ crocus_update_surface_base_address(batch);
+ if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
+ upload_sysvals(ice, MESA_SHADER_COMPUTE);
+
+ if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
+ crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
+ ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
+ crocus_upload_binding_table(ice, batch,
+ ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
+ ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
+ }
+
+ if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
+ crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);
+
+ if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
+ cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
+ /* The MEDIA_VFE_STATE documentation for Gen8+ says:
+ *
+ * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+ * the only bits that are changed are scoreboard related: Scoreboard
+ * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
+ * these scoreboard related states, a MEDIA_STATE_FLUSH is
+ * sufficient."
+ */
+ crocus_emit_pipe_control_flush(batch,
+ "workaround: stall before MEDIA_VFE_STATE",
+ PIPE_CONTROL_CS_STALL);
+
+ crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
+ if (prog_data->total_scratch) {
+ struct crocus_bo *bo =
+ crocus_get_scratch_space(ice, prog_data->total_scratch,
+ MESA_SHADER_COMPUTE);
+#if GFX_VERx10 == 75
+ /* Haswell's Per Thread Scratch Space is in the range [0, 10]
+ * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
+ */
+ vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
+#else
+ /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
+ * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
+ */
+ vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
+#endif
+ vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
+ }
+
+ vfe.MaximumNumberofThreads =
+ devinfo->max_cs_threads * screen->subslice_total - 1;
+ vfe.ResetGatewayTimer =
+ Resettingrelativetimerandlatchingtheglobaltimestamp;
+ vfe.BypassGatewayControl = true;
+ vfe.GPGPUMode = 1;
+ vfe.NumberofURBEntries = 0;
+ vfe.URBEntryAllocationSize = 0;
+
+ vfe.CURBEAllocationSize =
+ ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
+ cs_prog_data->push.cross_thread.regs, 2);
+ }
+ }
+
+ /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
+ if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
+ cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
+ uint32_t curbe_data_offset = 0;
+ assert(cs_prog_data->push.cross_thread.dwords == 0 &&
+ cs_prog_data->push.per_thread.dwords == 1 &&
+ cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
+ const unsigned push_const_size =
+ brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
+ uint32_t *curbe_data_map =
+ stream_state(batch,
+ ALIGN(push_const_size, 64), 64,
+ &curbe_data_offset);
+ assert(curbe_data_map);
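+      /* Fill the buffer with a 0x5a poison pattern so any slots not written
+       * by crocus_fill_cs_push_const_buffer() are easy to recognize.
+       */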
+ memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
+ crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
+ curbe_data_map);
+
+ crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
+ curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
+ curbe.CURBEDataStartAddress = curbe_data_offset;
+ }
+ }
+
+ if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
+ CROCUS_STAGE_DIRTY_BINDINGS_CS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_CS |
+ CROCUS_STAGE_DIRTY_CS)) {
+ uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
+ const uint64_t ksp = KSP(ice,shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
+ crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
+ idd.KernelStartPointer = ksp;
+ idd.SamplerStatePointer = shs->sampler_offset;
+ idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
+ idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
+ idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
+ idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
+ idd.BarrierEnable = cs_prog_data->uses_barrier;
+ idd.SharedLocalMemorySize = encode_slm_size(GFX_VER,
+ prog_data->total_shared);
+#if GFX_VERx10 >= 75
+ idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
+#endif
+ }
+
+ crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
+ load.InterfaceDescriptorTotalLength =
+ GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+ load.InterfaceDescriptorDataStartAddress =
+ emit_state(batch, desc, sizeof(desc), 64);
+ }
+ }
+
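+/* MMIO registers holding the GPGPU_WALKER thread group counts; for indirect
+ * dispatches we load them from the grid size buffer below.
+ */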
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
+ if (grid->indirect) {
+ struct crocus_state_ref *grid_size = &ice->state.grid_size;
+ struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
+ lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
+ }
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
+ lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
+ }
+ crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
+ lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
+ }
+
+ /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
+ _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
+ crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);
+
+ /* Load compute_dispatch_indirect_x_size into SRC0 */
+ crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);
+
+ /* predicate = (compute_dispatch_indirect_x_size == 0); */
+ crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_SET;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ };
+
+ /* Load compute_dispatch_indirect_y_size into SRC0 */
+ crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);
+
+ /* predicate = (compute_dispatch_indirect_y_size == 0); */
+ crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_OR;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ };
+
+ /* Load compute_dispatch_indirect_z_size into SRC0 */
+ crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);
+
+ /* predicate = (compute_dispatch_indirect_z_size == 0); */
+ crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_OR;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ };
+
+ /* predicate = !predicate; */
+#define COMPARE_FALSE 1
+ crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOADINV;
+ mip.CombineOperation = COMBINE_OR;
+ mip.CompareOperation = COMPARE_FALSE;
+ }
+
+ }
+
+ crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
+ ggw.IndirectParameterEnable = grid->indirect != NULL;
+ ggw.PredicateEnable = grid->indirect != NULL;
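+      /* SIMDSize encodes SIMD8/16/32 as 0/1/2. */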
+ ggw.SIMDSize = dispatch.simd_size / 16;
+ ggw.ThreadDepthCounterMaximum = 0;
+ ggw.ThreadHeightCounterMaximum = 0;
+ ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
+ ggw.ThreadGroupIDXDimension = grid->grid[0];
+ ggw.ThreadGroupIDYDimension = grid->grid[1];
+ ggw.ThreadGroupIDZDimension = grid->grid[2];
+ ggw.RightExecutionMask = dispatch.right_mask;
+ ggw.BottomExecutionMask = 0xffffffff;
+ }
+
+ crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
+
+ batch->contains_draw = true;
+}
+
+#endif /* GFX_VER == 7 */
+
+/**
+ * State module teardown.
+ */
+static void
+crocus_destroy_state(struct crocus_context *ice)
+{
+
+ pipe_resource_reference(&ice->draw.draw_params.res, NULL);
+ pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
+
+ for (int i = 0; i < 4; i++) {
+ pipe_so_target_reference(&ice->state.so_target[i], NULL);
+ }
+
+ for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
+ pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
+ }
+ pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);
+
+ for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+ struct crocus_shader_state *shs = &ice->state.shaders[stage];
+ for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+ pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
+ }
+ for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
+ pipe_resource_reference(&shs->image[i].base.resource, NULL);
+ }
+ for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
+ pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
+ }
+ for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
+ pipe_sampler_view_reference((struct pipe_sampler_view **)
+ &shs->textures[i], NULL);
+ }
+ }
+
+ pipe_resource_reference(&ice->state.grid_size.res, NULL);
+
+ pipe_resource_reference(&ice->state.index_buffer.res, NULL);
+}
+
+/* ------------------------------------------------------------------- */
+
+static void
+crocus_rebind_buffer(struct crocus_context *ice,
+ struct crocus_resource *res)
+{
+ struct pipe_context *ctx = &ice->ctx;
+
+ assert(res->base.target == PIPE_BUFFER);
+
+ /* Buffers can't be framebuffer attachments, nor display related,
+ * and we don't have upstream Clover support.
+ */
+ assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
+ PIPE_BIND_RENDER_TARGET |
+ PIPE_BIND_BLENDABLE |
+ PIPE_BIND_DISPLAY_TARGET |
+ PIPE_BIND_CURSOR |
+ PIPE_BIND_COMPUTE_RESOURCE |
+ PIPE_BIND_GLOBAL)));
+
+ if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
+ uint64_t bound_vbs = ice->state.bound_vertex_buffers;
+ while (bound_vbs) {
+ const int i = u_bit_scan64(&bound_vbs);
+ struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];
+
+ if (!buffer->is_user_buffer && &res->base == buffer->buffer.resource)
+ ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
+ }
+ }
+
+ if (res->bind_history & PIPE_BIND_INDEX_BUFFER) {
+ if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
+ pipe_resource_reference(&ice->state.index_buffer.res, NULL);
+ }
+ /* There is no need to handle these:
+ * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
+ * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
+ */
+
+ if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
+ /* XXX: be careful about resetting vs appending... */
+ assert(false);
+ }
+
+ for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
+ struct crocus_shader_state *shs = &ice->state.shaders[s];
+ enum pipe_shader_type p_stage = stage_to_pipe(s);
+
+ if (!(res->bind_stages & (1 << s)))
+ continue;
+
+ if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
+ /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
+ uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
+ while (bound_cbufs) {
+ const int i = u_bit_scan(&bound_cbufs);
+ struct pipe_constant_buffer *cbuf = &shs->constbufs[i];
+
+ if (res->bo == crocus_resource_bo(cbuf->buffer)) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
+ }
+ }
+ }
+
+ if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
+ uint32_t bound_ssbos = shs->bound_ssbos;
+ while (bound_ssbos) {
+ const int i = u_bit_scan(&bound_ssbos);
+ struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
+
+ if (res->bo == crocus_resource_bo(ssbo->buffer)) {
+ struct pipe_shader_buffer buf = {
+ .buffer = &res->base,
+ .buffer_offset = ssbo->buffer_offset,
+ .buffer_size = ssbo->buffer_size,
+ };
+ crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
+ (shs->writable_ssbos >> i) & 1);
+ }
+ }
+ }
+
+ if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
+ uint32_t bound_sampler_views = shs->bound_sampler_views;
+ while (bound_sampler_views) {
+ const int i = u_bit_scan(&bound_sampler_views);
+ struct crocus_sampler_view *isv = shs->textures[i];
+ struct crocus_bo *bo = isv->res->bo;
+
+ if (res->bo == bo) {
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
+ }
+ }
+ }
+
+ if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
+ uint32_t bound_image_views = shs->bound_image_views;
+ while (bound_image_views) {
+ const int i = u_bit_scan(&bound_image_views);
+ struct crocus_image_view *iv = &shs->image[i];
+ struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);
+
+ if (res->bo == bo)
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
+ }
+ }
+ }
+}
+
+/* ------------------------------------------------------------------- */
+
+static unsigned
+flags_to_post_sync_op(uint32_t flags)
+{
+ if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
+ return WriteImmediateData;
+
+ if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
+ return WritePSDepthCount;
+
+ if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
+ return WriteTimestamp;
+
+ return 0;
+}
+
+/*
+ * Do the given flags have a Post Sync or LRI Post Sync operation?
+ */
+static enum pipe_control_flags
+get_post_sync_flags(enum pipe_control_flags flags)
+{
+ flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
+ PIPE_CONTROL_WRITE_DEPTH_COUNT |
+ PIPE_CONTROL_WRITE_TIMESTAMP |
+ PIPE_CONTROL_LRI_POST_SYNC_OP;
+
+ /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
+ * "LRI Post Sync Operation". So more than one bit set would be illegal.
+ */
+ assert(util_bitcount(flags) <= 1);
+
+ return flags;
+}
+
+#define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE)
+
+/**
+ * Emit a series of PIPE_CONTROL commands, taking into account any
+ * workarounds necessary to actually accomplish the caller's request.
+ *
+ * Unless otherwise noted, spec quotations in this function come from:
+ *
+ * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
+ * Restrictions for PIPE_CONTROL.
+ *
+ * You should not use this function directly. Use the helpers in
+ * crocus_pipe_control.c instead, which may split the pipe control further.
+ */
+static void
+crocus_emit_raw_pipe_control(struct crocus_batch *batch,
+ const char *reason,
+ uint32_t flags,
+ struct crocus_bo *bo,
+ uint32_t offset,
+ uint64_t imm)
+{
+ UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+ enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
+ UNUSED enum pipe_control_flags non_lri_post_sync_flags =
+ post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
+
+ /* Recursive PIPE_CONTROL workarounds --------------------------------
+ * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
+ *
+ * We do these first because we want to look at the original operation,
+ * rather than any workarounds we set.
+ */
+
+ /* "Flush Types" workarounds ---------------------------------------------
+ * We do these now because they may add post-sync operations or CS stalls.
+ */
+
+ if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
+ /* Hardware workaround: SNB B-Spec says:
+ *
+ * "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
+ * Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
+ * required."
+ */
+ crocus_emit_post_sync_nonzero_flush(batch);
+ }
+
+ if (!(GFX_VERx10 == 75) && (flags & PIPE_CONTROL_DEPTH_STALL)) {
+ /* Project: PRE-HSW / Argument: Depth Stall
+ *
+ * "The following bits must be clear:
+ * - Render Target Cache Flush Enable ([12] of DW1)
+ * - Depth Cache Flush Enable ([0] of DW1)"
+ */
+ assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
+ }
+
+ if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
+ /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
+ *
+ * "This bit must be DISABLED for operations other than writing
+ * PS_DEPTH_COUNT."
+ *
+ * This seems like nonsense. An Ivybridge workaround requires us to
+ * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
+ * operation. Gen8+ requires us to emit depth stalls and depth cache
+ * flushes together. So, it's hard to imagine this means anything other
+ * than "we originally intended this to be used for PS_DEPTH_COUNT".
+ *
+ * We ignore the supposed restriction and do nothing.
+ */
+ }
+
+ if (!(GFX_VERx10 == 75) && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
+ /* Project: PRE-HSW / Argument: Depth Cache Flush
+ *
+ * "Depth Stall must be clear ([13] of DW1)."
+ */
+ assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
+ }
+
+ if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
+ /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
+ *
+ * "This bit must be DISABLED for End-of-pipe (Read) fences,
+ * PS_DEPTH_COUNT or TIMESTAMP queries."
+ *
+ * TODO: Implement end-of-pipe checking.
+ */
+ assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
+ PIPE_CONTROL_WRITE_TIMESTAMP)));
+ }
+
+ if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
+ /* From the PIPE_CONTROL instruction table, bit 1:
+ *
+ * "This bit is ignored if Depth Stall Enable is set.
+ * Further, the render cache is not flushed even if Write Cache
+ * Flush Enable bit is set."
+ *
+ * We assert that the caller doesn't do this combination, to try and
+ * prevent mistakes. It shouldn't hurt the GPU, though.
+ *
+ * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
+ * and "Render Target Flush" combo is explicitly required for BTI
+ * update workarounds.
+ */
+ assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH)));
+ }
+
+ /* PIPE_CONTROL page workarounds ------------------------------------- */
+
+ if (GFX_VER == 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
+ /* From the PIPE_CONTROL page itself:
+ *
+ * "IVB, HSW, BDW
+ * Restriction: Pipe_control with CS-stall bit set must be issued
+ * before a pipe-control command that has the State Cache
+ * Invalidate bit set."
+ */
+ flags |= PIPE_CONTROL_CS_STALL;
+ }
+
+ if ((GFX_VERx10 == 75)) {
+ /* From the PIPE_CONTROL page itself:
+ *
+ * "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
+ * Prior to programming a PIPECONTROL command with any of the RO
+ * cache invalidation bit set, program a PIPECONTROL flush command
+ * with “CS stall” bit and “HDC Flush” bit set."
+ *
+ * TODO: Actually implement this. What's an HDC Flush?
+ */
+ }
+
+ if (flags & PIPE_CONTROL_FLUSH_LLC) {
+ /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
+ *
+ * "Project: ALL
+ * SW must always program Post-Sync Operation to "Write Immediate
+ * Data" when Flush LLC is set."
+ *
+ * For now, we just require the caller to do it.
+ */
+ assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
+ }
+
+ /* "Post-Sync Operation" workarounds -------------------------------- */
+
+ /* Project: All / Argument: Global Snapshot Count Reset [19]
+ *
+ * "This bit must not be exercised on any product.
+ * Requires stall bit ([20] of DW1) set."
+ *
+ * We don't use this, so we just assert that it isn't used. The
+ * PIPE_CONTROL instruction page indicates that they intended this
+ * as a debug feature and don't think it is useful in production,
+ * but it may actually be usable, should we ever want to.
+ */
+ assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
+
+ if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
+ PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
+ /* Project: All / Arguments:
+ *
+ * - Generic Media State Clear [16]
+ * - Indirect State Pointers Disable [16]
+ *
+ * "Requires stall bit ([20] of DW1) set."
+ *
+ * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
+ * State Clear) says:
+ *
+ * "PIPECONTROL command with “Command Streamer Stall Enable” must be
+ * programmed prior to programming a PIPECONTROL command with "Media
+ * State Clear" set in GPGPU mode of operation"
+ *
+ * This is a subset of the earlier rule, so there's nothing to do.
+ */
+ flags |= PIPE_CONTROL_CS_STALL;
+ }
+
+ if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
+ /* Project: All / Argument: Store Data Index
+ *
+ * "Post-Sync Operation ([15:14] of DW1) must be set to something other
+ * than '0'."
+ *
+ * For now, we just assert that the caller does this. We might want to
+ * automatically add a write to the workaround BO...
+ */
+ assert(non_lri_post_sync_flags != 0);
+ }
+
+ if (flags & PIPE_CONTROL_SYNC_GFDT) {
+ /* Project: All / Argument: Sync GFDT
+ *
+ * "Post-Sync Operation ([15:14] of DW1) must be set to something other
+ * than '0' or 0x2520[13] must be set."
+ *
+ * For now, we just assert that the caller does this.
+ */
+ assert(non_lri_post_sync_flags != 0);
+ }
+
+ if (GFX_VER >= 6 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
+ /* Project: SNB, IVB, HSW / Argument: TLB inv
+ *
+ * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
+ * must be set to something other than '0'."
+ *
+ * For now, we just assert that the caller does this.
+ */
+ assert(non_lri_post_sync_flags != 0);
+ }
+
+ if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
+ /* Project: IVB+ / Argument: TLB inv
+ *
+ * "Requires stall bit ([20] of DW1) set."
+ *
+ * Also, from the PIPE_CONTROL instruction table:
+ *
+ * "Project: SKL+
+ * Post Sync Operation or CS stall must be set to ensure a TLB
+ * invalidation occurs. Otherwise no cycle will occur to the TLB
+ * cache to invalidate."
+ *
+ * This is not a subset of the earlier rule, so there's nothing to do.
+ */
+ flags |= PIPE_CONTROL_CS_STALL;
+ }
+
+ /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+ *
+ * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
+ * only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+ *
+ * Note that the kernel does CS stalls between batches, so we only need
+ * to count them within a batch. We currently naively count every 4, and
+ * don't skip the ones with only read-cache-invalidate bits set. This
+ * may or may not be a problem...
+ */
+ if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
+ if (flags & PIPE_CONTROL_CS_STALL) {
+ /* If we're doing a CS stall, reset the counter and carry on. */
+ batch->pipe_controls_since_last_cs_stall = 0;
+ }
+
+ /* If this is the fourth pipe control without a CS stall, do one now. */
+ if (++batch->pipe_controls_since_last_cs_stall == 4) {
+ batch->pipe_controls_since_last_cs_stall = 0;
+ flags |= PIPE_CONTROL_CS_STALL;
+ }
+ }
+
+ /* "Stall" workarounds ----------------------------------------------
+ * These have to come after the earlier ones because we may have added
+ * some additional CS stalls above.
+ */
+
+ if (flags & PIPE_CONTROL_CS_STALL) {
+ /* Project: PRE-SKL, VLV, CHV
+ *
+ * "[All Stepping][All SKUs]:
+ *
+ * One of the following must also be set:
+ *
+ * - Render Target Cache Flush Enable ([12] of DW1)
+ * - Depth Cache Flush Enable ([0] of DW1)
+ * - Stall at Pixel Scoreboard ([1] of DW1)
+ * - Depth Stall ([13] of DW1)
+ * - Post-Sync Operation ([13] of DW1)
+ * - DC Flush Enable ([5] of DW1)"
+ *
+ * If we don't already have one of those bits set, we choose to add
+ * "Stall at Pixel Scoreboard". Some of the other bits require a
+ * CS stall as a workaround (see above), which would send us into
+ * an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard"
+ * appears to be safe, so we choose that.
+ */
+ const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_WRITE_IMMEDIATE |
+ PIPE_CONTROL_WRITE_DEPTH_COUNT |
+ PIPE_CONTROL_WRITE_TIMESTAMP |
+ PIPE_CONTROL_STALL_AT_SCOREBOARD |
+ PIPE_CONTROL_DEPTH_STALL |
+ PIPE_CONTROL_DATA_CACHE_FLUSH;
+ if (!(flags & wa_bits))
+ flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+ }
+
+ /* Emit --------------------------------------------------------------- */
+
+ if (INTEL_DEBUG & DEBUG_PIPE_CONTROL) {
+ fprintf(stderr,
+ " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
+ (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
+ (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
+ (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
+ (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
+ (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
+ (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
+ (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
+ (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
+ (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
+ (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
+ (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
+ (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
+ (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
+ (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
+ (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
+ (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
+ "SnapRes" : "",
+ (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
+ "ISPDis" : "",
+ (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
+ (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
+ (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
+ imm, reason);
+ }
+
+ crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
+#if GFX_VER >= 7
+ pc.LRIPostSyncOperation = NoLRIOperation;
+ pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
+ pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
+#endif
+#if GFX_VER >= 6
+ pc.StoreDataIndex = 0;
+ pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
+ pc.GlobalSnapshotCountReset =
+ flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
+ pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
+ pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
+ pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
+ pc.RenderTargetCacheFlushEnable =
+ flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
+ pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+ pc.StateCacheInvalidationEnable =
+ flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+ pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
+ pc.ConstantCacheInvalidationEnable =
+ flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+#else
+ pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
+#endif
+ pc.PostSyncOperation = flags_to_post_sync_op(flags);
+ pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
+ pc.InstructionCacheInvalidateEnable =
+ flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
+ pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
+#if GFX_VER >= 5 || GFX_VERx10 == 45
+ pc.IndirectStatePointersDisable =
+ flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
+#endif
+#if GFX_VER >= 6
+ pc.TextureCacheInvalidationEnable =
+ flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+#elif GFX_VER == 5 || GFX_VERx10 == 45
+ pc.TextureCacheFlushEnable =
+ flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+#endif
+ pc.Address = ggtt_bo(bo, offset);
+ if (GFX_VER < 7 && bo)
+ pc.DestinationAddressType = DAT_GGTT;
+ pc.ImmediateData = imm;
+ }
+}
+
+#if GFX_VER == 6
+void
+genX(upload_urb)(struct crocus_batch *batch,
+ unsigned vs_size,
+ bool gs_present,
+ unsigned gs_size)
+{
+ struct crocus_context *ice = batch->ice;
+ int nr_vs_entries, nr_gs_entries;
+ int total_urb_size = ice->urb.size * 1024; /* in bytes */
+ const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
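+   /* vs_size and gs_size are URB entry allocation sizes in 128-byte units. */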
+ /* Calculate how many entries fit in each stage's section of the URB */
+ if (gs_present) {
+ nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
+ nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
+ } else {
+ nr_vs_entries = total_urb_size / (vs_size * 128);
+ nr_gs_entries = 0;
+ }
+
+ /* Then clamp to the maximum allowed by the hardware */
+ if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
+ nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];
+
+ if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
+ nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];
+
+ /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
+ ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
+ ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
+
+ assert(ice->urb.nr_vs_entries >=
+ devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
+ assert(ice->urb.nr_vs_entries % 4 == 0);
+ assert(ice->urb.nr_gs_entries % 4 == 0);
+ assert(vs_size <= 5);
+ assert(gs_size <= 5);
+
+ crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
+ urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
+ urb.VSURBEntryAllocationSize = vs_size - 1;
+
+ urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
+ urb.GSURBEntryAllocationSize = gs_size - 1;
+ };
+ /* From the PRM Volume 2 part 1, section 1.4.7:
+ *
+ * Because of a urb corruption caused by allocating a previous gsunit’s
+ * urb entry to vsunit software is required to send a "GS NULL
+ * Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
+ * a dummy DRAW call before any case where VS will be taking over GS URB
+ * space.
+ *
+ * It is not clear exactly what this means ("URB fence" is a command that
+ * doesn't exist on Gen6). So for now we just do a full pipeline flush as
+ * a workaround.
+ */
+ if (ice->urb.gs_present && !gs_present)
+ crocus_emit_mi_flush(batch);
+ ice->urb.gs_present = gs_present;
+}
+#endif
+
+static void
+crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
+{
+}
+
+static void
+crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
+ struct crocus_bo *bo,
+ uint32_t offset_in_bytes,
+ uint32_t report_id)
+{
+#if GFX_VER >= 7
+ crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
+ mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
+ mi_rpc.ReportID = report_id;
+ }
+#endif
+}
+
+/**
+ * From the PRM, Volume 2a:
+ *
+ * "Indirect State Pointers Disable
+ *
+ * At the completion of the post-sync operation associated with this pipe
+ * control packet, the indirect state pointers in the hardware are
+ * considered invalid; the indirect pointers are not saved in the context.
+ * If any new indirect state commands are executed in the command stream
+ * while the pipe control is pending, the new indirect state commands are
+ * preserved.
+ *
+ * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
+ * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
+ * commands are only considered as Indirect State Pointers. Once ISP is
+ * issued in a context, SW must initialize by programming push constant
+ * commands for all the shaders (at least to zero length) before attempting
+ * any rendering operation for the same context."
+ *
+ * 3DSTATE_CONSTANT_* packets are restored during a context restore,
+ * even though they point to a BO that has been already unreferenced at
+ * the end of the previous batch buffer. This has been fine so far since
+ * we are protected by the scratch page (every address not covered by
+ * a BO should be pointing to the scratch page). But on CNL, it is
+ * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
+ * instruction.
+ *
+ * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
+ * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
+ * context restore, so the mentioned hang doesn't happen. However,
+ * software must program push constant commands for all stages prior to
+ * rendering anything, so we flag them as dirty.
+ *
+ * Finally, we also stall at the pixel scoreboard to make sure the constants
+ * have been loaded into the EUs before the push constants are disabled, so
+ * that a previous 3DPRIMITIVE doesn't hang.
+ */
+#if GFX_VER >= 7
+static void
+gen7_emit_isp_disable(struct crocus_batch *batch)
+{
+ crocus_emit_raw_pipe_control(batch, "isp disable",
+ PIPE_CONTROL_STALL_AT_SCOREBOARD |
+ PIPE_CONTROL_CS_STALL,
+ NULL, 0, 0);
+ crocus_emit_raw_pipe_control(batch, "isp disable",
+ PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
+ PIPE_CONTROL_CS_STALL,
+ NULL, 0, 0);
+
+ struct crocus_context *ice = batch->ice;
+ ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TES |
+ CROCUS_STAGE_DIRTY_CONSTANTS_GS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_FS);
+}
+#endif
+
+#if GFX_VER >= 7
+static void
+crocus_state_finish_batch(struct crocus_batch *batch)
+{
+#if GFX_VERx10 == 75
+ if (batch->name == CROCUS_BATCH_RENDER) {
+ crocus_emit_mi_flush(batch);
+ crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+ ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
+ }
+
+ crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+ }
+#endif
+ gen7_emit_isp_disable(batch);
+}
+#endif
+
+static void
+crocus_batch_reset_dirty(struct crocus_batch *batch)
+{
+   /* On GEN4/5 we need to re-emit anything that lives in the state batch and
+    * points at other state batch contents, as the old state batch will no
+    * longer be available.
+    */
+ batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
+ CROCUS_DIRTY_COLOR_CALC_STATE;
+
+ batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
+
+ batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
+
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
+
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
+ batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
+ batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
+
+#if GFX_VER >= 6
+ /* SCISSOR_STATE */
+ batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+ batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+ batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
+
+#endif
+#if GFX_VER <= 5
+ /* dirty the SF state on gen4/5 */
+ batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
+ batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
+ batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
+ batch->ice->state.dirty |= CROCUS_DIRTY_WM;
+#endif
+#if GFX_VER >= 7
+ /* Streamout dirty */
+ batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
+ batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
+ batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
+#endif
+}
+
+#if GFX_VERx10 == 75
+struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
+{
+ return &ice->state.cso_rast->cso;
+}
+#endif
+
+#if GFX_VER >= 6
+static void update_so_strides(struct crocus_context *ice,
+ uint16_t *strides)
+{
+ for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+ struct crocus_stream_output_target *so = (void *)ice->state.so_target[i];
+ if (so)
+ so->stride = strides[i] * sizeof(uint32_t);
+ }
+}
+#endif
+
+static void
+crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+
+ if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
+ ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+ }
+
+ if (ice->batch_count == 1)
+ return;
+
+ if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
+ ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
+ ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
+ }
+}
+
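+/* Fill in the screen's per-generation vtable.  This file is compiled once per
+ * GFX_VERx10, so each entry resolves to the variant for that generation.
+ */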
+void
+genX(init_screen_state)(struct crocus_screen *screen)
+{
+ assert(screen->devinfo.verx10 == GFX_VERx10);
+ screen->vtbl.destroy_state = crocus_destroy_state;
+ screen->vtbl.init_render_context = crocus_init_render_context;
+ screen->vtbl.upload_render_state = crocus_upload_render_state;
+#if GFX_VER == 7
+ screen->vtbl.init_compute_context = crocus_init_compute_context;
+ screen->vtbl.upload_compute_state = crocus_upload_compute_state;
+#endif
+ screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
+ screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
+ screen->vtbl.rebind_buffer = crocus_rebind_buffer;
+#if GFX_VERx10 == 75
+ screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
+ screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
+ screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
+ screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
+ screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
+ screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
+#endif
+#if GFX_VER >= 7
+ screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
+ screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
+ screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
+ screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
+#endif
+ screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
+#if GFX_VER >= 6
+ screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
+ screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
+#endif
+ screen->vtbl.populate_vs_key = crocus_populate_vs_key;
+ screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
+ screen->vtbl.populate_tes_key = crocus_populate_tes_key;
+ screen->vtbl.populate_gs_key = crocus_populate_gs_key;
+ screen->vtbl.populate_fs_key = crocus_populate_fs_key;
+ screen->vtbl.populate_cs_key = crocus_populate_cs_key;
+ screen->vtbl.lost_genx_state = crocus_lost_genx_state;
+#if GFX_VER >= 7
+ screen->vtbl.finish_batch = crocus_state_finish_batch;
+#endif
+#if GFX_VER <= 5
+ screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
+ screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
+#endif
+ screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
+ screen->vtbl.translate_prim_type = translate_prim_type;
+#if GFX_VER >= 6
+ screen->vtbl.update_so_strides = update_so_strides;
+ screen->vtbl.get_so_offset = crocus_get_so_offset;
+#endif
+
+ genX(init_blt)(screen);
+}
+
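+/* Hook up the state-related pipe_context entrypoints and establish the
+ * context's initial state defaults.
+ */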
+void
+genX(init_state)(struct crocus_context *ice)
+{
+ struct pipe_context *ctx = &ice->ctx;
+
+ ctx->create_blend_state = crocus_create_blend_state;
+ ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
+ ctx->create_rasterizer_state = crocus_create_rasterizer_state;
+ ctx->create_sampler_state = crocus_create_sampler_state;
+ ctx->create_sampler_view = crocus_create_sampler_view;
+ ctx->create_surface = crocus_create_surface;
+ ctx->create_vertex_elements_state = crocus_create_vertex_elements;
+ ctx->bind_blend_state = crocus_bind_blend_state;
+ ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
+ ctx->bind_sampler_states = crocus_bind_sampler_states;
+ ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
+ ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
+ ctx->delete_blend_state = crocus_delete_state;
+ ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
+ ctx->delete_rasterizer_state = crocus_delete_state;
+ ctx->delete_sampler_state = crocus_delete_state;
+ ctx->delete_vertex_elements_state = crocus_delete_state;
+ ctx->set_blend_color = crocus_set_blend_color;
+ ctx->set_clip_state = crocus_set_clip_state;
+ ctx->set_constant_buffer = crocus_set_constant_buffer;
+ ctx->set_shader_buffers = crocus_set_shader_buffers;
+ ctx->set_shader_images = crocus_set_shader_images;
+ ctx->set_sampler_views = crocus_set_sampler_views;
+ ctx->set_tess_state = crocus_set_tess_state;
+ ctx->set_framebuffer_state = crocus_set_framebuffer_state;
+ ctx->set_polygon_stipple = crocus_set_polygon_stipple;
+ ctx->set_sample_mask = crocus_set_sample_mask;
+ ctx->set_scissor_states = crocus_set_scissor_states;
+ ctx->set_stencil_ref = crocus_set_stencil_ref;
+ ctx->set_vertex_buffers = crocus_set_vertex_buffers;
+ ctx->set_viewport_states = crocus_set_viewport_states;
+ ctx->sampler_view_destroy = crocus_sampler_view_destroy;
+ ctx->surface_destroy = crocus_surface_destroy;
+ ctx->draw_vbo = crocus_draw_vbo;
+ ctx->launch_grid = crocus_launch_grid;
+
+ ctx->set_frontend_noop = crocus_set_frontend_noop;
+
+#if GFX_VER >= 6
+ ctx->create_stream_output_target = crocus_create_stream_output_target;
+ ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
+ ctx->set_stream_output_targets = crocus_set_stream_output_targets;
+#endif
+
+ ice->state.dirty = ~0ull;
+ ice->state.stage_dirty = ~0ull;
+
+ ice->state.statistics_counters_enabled = true;
+
+ ice->state.sample_mask = 0xff;
+ ice->state.num_viewports = 1;
+ ice->state.prim_mode = PIPE_PRIM_MAX;
+ ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
+ ice->draw.derived_params.drawid = -1;
+
+ /* Default all scissor rectangles to be empty regions. */
+ for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
+ ice->state.scissors[i] = (struct pipe_scissor_state) {
+ .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
+ };
+ }
+}
diff --git a/src/gallium/drivers/crocus/crocus_todo.txt b/src/gallium/drivers/crocus/crocus_todo.txt
new file mode 100644
index 00000000000..1a6d3c9a710
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_todo.txt
@@ -0,0 +1,16 @@
+Quick TODO list from what I can see:
+
+General:
+Re-emit SURFACE_STATE_BASE_ADDRESS at the top of every batch
+
+Gen4:
+rgb32 issue
+
+Gen5:
+rgb32 issue
+
+Gen6:
+vec4 push constants
+
+Gen7:
+
diff --git a/src/gallium/drivers/crocus/driinfo_crocus.h b/src/gallium/drivers/crocus/driinfo_crocus.h
new file mode 100644
index 00000000000..829bf7f818c
--- /dev/null
+++ b/src/gallium/drivers/crocus/driinfo_crocus.h
@@ -0,0 +1,11 @@
+// crocus specific driconf options
+
+DRI_CONF_SECTION_DEBUG
+ DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION(false)
+ DRI_CONF_DISABLE_THROTTLING(false)
+ DRI_CONF_ALWAYS_FLUSH_CACHE(false)
+DRI_CONF_SECTION_END
+
+DRI_CONF_SECTION_PERFORMANCE
+ DRI_CONF_OPT_E(bo_reuse, 1, 0, 1, "Buffer object reuse",)
+DRI_CONF_SECTION_END
diff --git a/src/gallium/drivers/crocus/gen4_blorp_exec.h b/src/gallium/drivers/crocus/gen4_blorp_exec.h
new file mode 100644
index 00000000000..bc19a1b39fc
--- /dev/null
+++ b/src/gallium/drivers/crocus/gen4_blorp_exec.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
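+/* BLORP's dynamic state (unit state, viewports, samplers) is uploaded into
+ * the batch's state buffer, so these addresses are relative to that BO.
+ */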
+static inline struct blorp_address
+dynamic_state_address(struct blorp_batch *blorp_batch, uint32_t offset)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ return (struct blorp_address) {
+ .buffer = batch->state.bo,
+ .offset = offset,
+   };
+}
+
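+/* Shader kernels live in the context's program cache BO. */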
+static inline struct blorp_address
+instruction_state_address(struct blorp_batch *blorp_batch, uint32_t offset)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ return (struct blorp_address) {
+ .buffer = batch->ice->shaders.cache_bo,
+ .offset = offset,
+ };
+}
+
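+/* Emit a disabled VS unit: BLORP supplies already-transformed vertices, so
+ * only the URB allocation needs to be programmed.
+ */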
+static struct blorp_address
+blorp_emit_vs_state(struct blorp_batch *blorp_batch)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ uint32_t offset;
+ blorp_emit_dynamic(blorp_batch, GENX(VS_STATE), vs, 64, &offset) {
+ vs.Enable = false;
+ vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
+#if GFX_VER == 5
+ vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> 2;
+#else
+ vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries;
+#endif
+ }
+
+ return dynamic_state_address(blorp_batch, offset);
+}
+
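+/* SF unit state for BLORP: points at BLORP's SF program and disables the
+ * viewport transform and culling, since the rectangle is already in screen
+ * space.
+ */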
+static struct blorp_address
+blorp_emit_sf_state(struct blorp_batch *blorp_batch,
+ const struct blorp_params *params)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+ const struct brw_sf_prog_data *prog_data = params->sf_prog_data;
+
+ uint32_t offset;
+ blorp_emit_dynamic(blorp_batch, GENX(SF_STATE), sf, 64, &offset) {
+#if GFX_VER == 4
+ sf.KernelStartPointer =
+ instruction_state_address(blorp_batch, params->sf_prog_kernel);
+#else
+ sf.KernelStartPointer = params->sf_prog_kernel;
+#endif
+ sf.GRFRegisterCount = DIV_ROUND_UP(prog_data->total_grf, 16) - 1;
+ sf.VertexURBEntryReadLength = prog_data->urb_read_length;
+ sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
+ sf.DispatchGRFStartRegisterForURBData = 3;
+ sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
+ sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
+
+#if GFX_VER == 5
+ sf.MaximumNumberofThreads = MIN2(48, batch->ice->urb.nr_sf_entries) - 1;
+#else
+ sf.MaximumNumberofThreads = MIN2(24, batch->ice->urb.nr_sf_entries) - 1;
+#endif
+ sf.ViewportTransformEnable = false;
+
+ sf.CullMode = CULLMODE_NONE;
+ }
+
+ return dynamic_state_address(blorp_batch, offset);
+}
+
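+/* WM (pixel shader) unit state: programs the blit/clear kernel and, when a
+ * source surface is bound, a sampler for reading it.
+ */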
+static struct blorp_address
+blorp_emit_wm_state(struct blorp_batch *blorp_batch,
+ const struct blorp_params *params)
+{
+ const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
+
+ uint32_t offset;
+ blorp_emit_dynamic(blorp_batch, GENX(WM_STATE), wm, 64, &offset) {
+ if (params->src.enabled) {
+ /* Iron Lake can't do sampler prefetch */
+ wm.SamplerCount = (GFX_VER != 5);
+ wm.BindingTableEntryCount = 2;
+ uint32_t sampler = blorp_emit_sampler_state(blorp_batch);
+ wm.SamplerStatePointer = dynamic_state_address(blorp_batch, sampler);
+ }
+
+ if (prog_data) {
+ wm.DispatchGRFStartRegisterForConstantSetupData0 =
+ prog_data->base.dispatch_grf_start_reg;
+ wm.SetupURBEntryReadLength = prog_data->num_varying_inputs * 2;
+ wm.SetupURBEntryReadOffset = 0;
+
+ wm.DepthCoefficientURBReadOffset = 1;
+ wm.PixelShaderKillsPixel = prog_data->uses_kill;
+ wm.ThreadDispatchEnable = true;
+ wm.EarlyDepthTestEnable = true;
+
+ wm._8PixelDispatchEnable = prog_data->dispatch_8;
+ wm._16PixelDispatchEnable = prog_data->dispatch_16;
+ wm._32PixelDispatchEnable = prog_data->dispatch_32;
+
+#if GFX_VER == 4
+ wm.KernelStartPointer0 =
+ instruction_state_address(blorp_batch, params->wm_prog_kernel);
+ wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0);
+#else
+ wm.KernelStartPointer0 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+ wm.KernelStartPointer1 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+ wm.KernelStartPointer2 = params->wm_prog_kernel +
+ brw_wm_prog_data_prog_offset(prog_data, wm, 2);
+ wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0);
+ wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(prog_data, wm, 1);
+ wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(prog_data, wm, 2);
+#endif
+ }
+
+ wm.MaximumNumberofThreads =
+ blorp_batch->blorp->compiler->devinfo->max_wm_threads - 1;
+ }
+
+ return dynamic_state_address(blorp_batch, offset);
+}
+
+static struct blorp_address
+blorp_emit_color_calc_state(struct blorp_batch *blorp_batch)
+{
+ uint32_t cc_viewport = blorp_emit_cc_viewport(blorp_batch);
+
+ uint32_t offset;
+ blorp_emit_dynamic(blorp_batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
+ cc.CCViewportStatePointer = dynamic_state_address(blorp_batch, cc_viewport);
+ }
+
+ return dynamic_state_address(blorp_batch, offset);
+}
+
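+/* Gen4/5 point at indirect unit state through 3DSTATE_PIPELINED_POINTERS
+ * rather than emitting it inline: set up the URB, upload the VS/SF/WM/CC
+ * state, then aim the pipeline at it.
+ */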
+static void
+blorp_emit_pipeline(struct blorp_batch *blorp_batch,
+ const struct blorp_params *params)
+{
+ struct crocus_batch *batch = blorp_batch->driver_batch;
+
+ emit_urb_config(blorp_batch, params, NULL);
+
+ blorp_emit(blorp_batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
+ pp.PointertoVSState = blorp_emit_vs_state(blorp_batch);
+ pp.GSEnable = false;
+ pp.ClipEnable = false;
+ pp.PointertoSFState = blorp_emit_sf_state(blorp_batch, params);
+ pp.PointertoWMState = blorp_emit_wm_state(blorp_batch, params);
+ pp.PointertoColorCalcState = blorp_emit_color_calc_state(blorp_batch);
+ }
+
+ batch->screen->vtbl.upload_urb_fence(batch);
+
+ blorp_emit(blorp_batch, GENX(CS_URB_STATE), curb);
+ blorp_emit(blorp_batch, GENX(CONSTANT_BUFFER), curb);
+}
diff --git a/src/gallium/drivers/crocus/meson.build b/src/gallium/drivers/crocus/meson.build
new file mode 100644
index 00000000000..2bdb1f2cfb5
--- /dev/null
+++ b/src/gallium/drivers/crocus/meson.build
@@ -0,0 +1,90 @@
+# Copyright © 2017-2019 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+files_libcrocus = files(
+ 'gen4_blorp_exec.h',
+ 'driinfo_crocus.h',
+ 'crocus_batch.c',
+ 'crocus_batch.h',
+ 'crocus_blit.c',
+ 'crocus_bufmgr.c',
+ 'crocus_bufmgr.h',
+ 'crocus_clear.c',
+ 'crocus_context.c',
+ 'crocus_context.h',
+ 'crocus_draw.c',
+ 'crocus_fence.c',
+ 'crocus_fence.h',
+ 'crocus_fine_fence.c',
+ 'crocus_fine_fence.h',
+ 'crocus_formats.c',
+ 'crocus_genx_macros.h',
+ 'crocus_genx_protos.h',
+ 'crocus_monitor.c',
+ 'crocus_pipe.h',
+ 'crocus_pipe_control.c',
+ 'crocus_program.c',
+ 'crocus_program_cache.c',
+ 'crocus_resolve.c',
+ 'crocus_resource.c',
+ 'crocus_resource.h',
+ 'crocus_screen.c',
+ 'crocus_screen.h',
+ 'crocus_disk_cache.c',
+)
+
+crocus_per_hw_ver_libs = []
+foreach v : ['40', '45', '50', '60', '70', '75']
+ crocus_per_hw_ver_libs += static_library(
+ 'crocus_per_hw_ver@0@'.format(v),
+ ['crocus_blorp.c', 'crocus_query.c', 'crocus_state.c', 'crocus_blt.c', gen_xml_pack],
+ include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_intel],
+ c_args : [
+ no_override_init_args, c_sse2_args,
+ '-DGFX_VERx10=@0@'.format(v),
+ ],
+ gnu_symbol_visibility : 'hidden',
+ dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
+ )
+endforeach
+
+libcrocus = static_library(
+ 'crocus',
+ [files_libcrocus, gen_xml_pack],
+ include_directories : [
+ inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_intel,
+ inc_gallium_drivers,
+ # these should not be necessary, but main/macros.h...
+ inc_mesa, inc_mapi
+ ],
+ c_args : [c_sse2_args],
+ cpp_args : [c_sse2_args],
+ gnu_symbol_visibility : 'hidden',
+ dependencies : [dep_libdrm, dep_valgrind, idep_genxml, idep_libintel_common, idep_nir_headers],
+ link_with : [
+ crocus_per_hw_ver_libs, libintel_compiler, libintel_dev, libisl,
+ libblorp, libintel_perf
+ ],
+)
+
+driver_crocus = declare_dependency(
+ compile_args : '-DGALLIUM_CROCUS',
+ link_with : [libcrocus, libcrocuswinsys],
+)
diff --git a/src/gallium/meson.build b/src/gallium/meson.build
index 3b3bb07f1de..e64d7399ae1 100644
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@@ -129,6 +129,12 @@ if with_gallium_tegra
else
driver_tegra = declare_dependency()
endif
+if with_gallium_crocus
+ subdir('winsys/crocus/drm')
+ subdir('drivers/crocus')
+else
+ driver_crocus = declare_dependency()
+endif
if with_gallium_iris
subdir('winsys/iris/drm')
subdir('drivers/iris')
diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build
index daef41613db..cc6c805641b 100644
--- a/src/gallium/targets/d3dadapter9/meson.build
+++ b/src/gallium/targets/d3dadapter9/meson.build
@@ -64,7 +64,7 @@ libgallium_nine = shared_library(
dep_selinux, dep_libdrm, dep_llvm, dep_thread,
idep_xmlconfig, idep_mesautil, idep_nir,
driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau,
- driver_i915, driver_svga, driver_iris
+ driver_i915, driver_svga, driver_iris, driver_crocus
],
name_prefix : '',
version : '.'.join(nine_version),
diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
index 90b48bf508e..e4cc199b363 100644
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -58,7 +58,7 @@ libgallium_dri = shared_library(
driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
driver_tegra, driver_i915, driver_svga, driver_virgl,
driver_swr, driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12,
- driver_asahi
+ driver_asahi, driver_crocus
],
# Will be deleted during installation, see install_megadrivers.py
install : true,
@@ -98,6 +98,7 @@ foreach d : [[with_gallium_kmsro, [
[with_gallium_panfrost, 'panfrost_dri.so'],
[with_gallium_etnaviv, 'etnaviv_dri.so'],
[with_gallium_tegra, 'tegra_dri.so'],
+ [with_gallium_crocus, 'crocus_dri.so'],
[with_gallium_iris, 'iris_dri.so'],
[with_gallium_i915, 'i915_dri.so'],
[with_gallium_r300, 'r300_dri.so'],
diff --git a/src/gallium/targets/dri/target.c b/src/gallium/targets/dri/target.c
index 9df8da61803..3c7c2325f17 100644
--- a/src/gallium/targets/dri/target.c
+++ b/src/gallium/targets/dri/target.c
@@ -42,6 +42,10 @@ DEFINE_LOADER_DRM_ENTRYPOINT(i915)
DEFINE_LOADER_DRM_ENTRYPOINT(iris)
#endif
+#if defined(GALLIUM_CROCUS)
+DEFINE_LOADER_DRM_ENTRYPOINT(crocus)
+#endif
+
#if defined(GALLIUM_NOUVEAU)
DEFINE_LOADER_DRM_ENTRYPOINT(nouveau)
#endif
diff --git a/src/gallium/winsys/crocus/drm/crocus_drm_public.h b/src/gallium/winsys/crocus/drm/crocus_drm_public.h
new file mode 100644
index 00000000000..614543136be
--- /dev/null
+++ b/src/gallium/winsys/crocus/drm/crocus_drm_public.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_DRM_PUBLIC_H
+#define CROCUS_DRM_PUBLIC_H
+
+struct pipe_screen;
+struct pipe_screen_config;
+
+struct pipe_screen *
+crocus_drm_screen_create(int drm_fd, const struct pipe_screen_config *config);
+
+#endif /* CROCUS_DRM_PUBLIC_H */
diff --git a/src/gallium/winsys/crocus/drm/crocus_drm_winsys.c b/src/gallium/winsys/crocus/drm/crocus_drm_winsys.c
new file mode 100644
index 00000000000..ffeeba567ac
--- /dev/null
+++ b/src/gallium/winsys/crocus/drm/crocus_drm_winsys.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "util/os_file.h"
+
+#include "crocus_drm_public.h"
+#include "crocus/crocus_screen.h"
+
+struct pipe_screen *
+crocus_drm_screen_create(int fd, const struct pipe_screen_config *config)
+{
+ int newfd = os_dupfd_cloexec(fd);
+ if (newfd < 0)
+ return NULL;
+ return crocus_screen_create(newfd, config);
+}
diff --git a/src/gallium/winsys/crocus/drm/meson.build b/src/gallium/winsys/crocus/drm/meson.build
new file mode 100644
index 00000000000..4e82fe52437
--- /dev/null
+++ b/src/gallium/winsys/crocus/drm/meson.build
@@ -0,0 +1,29 @@
+# Copyright © 2017 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libcrocuswinsys = static_library(
+ 'crocuswinsys',
+ files('crocus_drm_winsys.c'),
+ include_directories : [
+ inc_src, inc_include,
+ inc_gallium, inc_gallium_aux, inc_gallium_drivers,
+ ],
+ gnu_symbol_visibility : 'hidden',
+)