From f3630548f1da904ec6c63b43ece7e68afdb8867e Mon Sep 17 00:00:00 2001
From: Dave Airlie
Date: Tue, 1 Jun 2021 13:14:51 +1000
Subject: crocus: initial gallium driver for Intel gfx 4-7

This is a gallium driver for the Intel gfx 4-7 GPUs.

It was initially cloned from the iris driver by Ilia Mirkin, then I
ported over large reams of code from i965 until it worked.

Acked-by: Jason Ekstrand
Part-of:
---
 .../auxiliary/pipe-loader/pipe_loader_drm.c        |    1 +
 src/gallium/auxiliary/target-helpers/drm_helper.h  |   20 +
 .../auxiliary/target-helpers/drm_helper_public.h   |    1 +
 src/gallium/drivers/crocus/crocus_batch.c          | 1047 +++
 src/gallium/drivers/crocus/crocus_batch.h          |  325 +
 src/gallium/drivers/crocus/crocus_blit.c           |  836 ++
 src/gallium/drivers/crocus/crocus_blorp.c          |  399 +
 src/gallium/drivers/crocus/crocus_blt.c            |  337 +
 src/gallium/drivers/crocus/crocus_bufmgr.c         | 1689 ++++
 src/gallium/drivers/crocus/crocus_bufmgr.h         |  331 +
 src/gallium/drivers/crocus/crocus_clear.c          |  859 ++
 src/gallium/drivers/crocus/crocus_context.c        |  336 +
 src/gallium/drivers/crocus/crocus_context.h        |  955 +++
 src/gallium/drivers/crocus/crocus_defines.h        |   58 +
 src/gallium/drivers/crocus/crocus_disk_cache.c     |  263 +
 src/gallium/drivers/crocus/crocus_draw.c           |  511 ++
 src/gallium/drivers/crocus/crocus_fence.c          |  571 ++
 src/gallium/drivers/crocus/crocus_fence.h          |   60 +
 src/gallium/drivers/crocus/crocus_fine_fence.c     |   85 +
 src/gallium/drivers/crocus/crocus_fine_fence.h     |  109 +
 src/gallium/drivers/crocus/crocus_formats.c        |  576 ++
 src/gallium/drivers/crocus/crocus_genx_macros.h    |  164 +
 src/gallium/drivers/crocus/crocus_genx_protos.h    |   56 +
 src/gallium/drivers/crocus/crocus_monitor.c        |  484 ++
 src/gallium/drivers/crocus/crocus_monitor.h        |   72 +
 src/gallium/drivers/crocus/crocus_pipe.h           |   74 +
 src/gallium/drivers/crocus/crocus_pipe_control.c   |  368 +
 src/gallium/drivers/crocus/crocus_program.c        | 3171 ++++
 src/gallium/drivers/crocus/crocus_program_cache.c  |  347 +
 src/gallium/drivers/crocus/crocus_query.c          |  996 +++
 src/gallium/drivers/crocus/crocus_resolve.c        | 1061 +++
 src/gallium/drivers/crocus/crocus_resource.c       | 1946 +++++
 src/gallium/drivers/crocus/crocus_resource.h       |  501 ++
 src/gallium/drivers/crocus/crocus_screen.c         |  829 ++
 src/gallium/drivers/crocus/crocus_screen.h         |  253 +
 src/gallium/drivers/crocus/crocus_state.c          | 8382 ++++++++++++++++++++
 src/gallium/drivers/crocus/crocus_todo.txt         |   16 +
 src/gallium/drivers/crocus/driinfo_crocus.h        |   11 +
 src/gallium/drivers/crocus/gen4_blorp_exec.h       |  190 +
 src/gallium/drivers/crocus/meson.build             |   90 +
 src/gallium/meson.build                            |    6 +
 src/gallium/targets/d3dadapter9/meson.build        |    2 +-
 src/gallium/targets/dri/meson.build                |    3 +-
 src/gallium/targets/dri/target.c                   |    4 +
 src/gallium/winsys/crocus/drm/crocus_drm_public.h  |   33 +
 src/gallium/winsys/crocus/drm/crocus_drm_winsys.c  |   39 +
 src/gallium/winsys/crocus/drm/meson.build          |   29 +
 47 files changed, 28494 insertions(+), 2 deletions(-)
 create mode 100644 src/gallium/drivers/crocus/crocus_batch.c
 create mode 100644 src/gallium/drivers/crocus/crocus_batch.h
 create mode 100644 src/gallium/drivers/crocus/crocus_blit.c
 create mode 100644 src/gallium/drivers/crocus/crocus_blorp.c
 create mode 100644 src/gallium/drivers/crocus/crocus_blt.c
 create mode 100644 src/gallium/drivers/crocus/crocus_bufmgr.c
 create mode 100644 src/gallium/drivers/crocus/crocus_bufmgr.h
 create mode 100644 src/gallium/drivers/crocus/crocus_clear.c
 create mode 100644 src/gallium/drivers/crocus/crocus_context.c
 create mode 100644 src/gallium/drivers/crocus/crocus_context.h
 create mode 100644 src/gallium/drivers/crocus/crocus_defines.h
 create mode 100644 src/gallium/drivers/crocus/crocus_disk_cache.c
 create mode 100644 src/gallium/drivers/crocus/crocus_draw.c
 create mode 100644 src/gallium/drivers/crocus/crocus_fence.c
 create mode 100644 src/gallium/drivers/crocus/crocus_fence.h
 create mode 100644 src/gallium/drivers/crocus/crocus_fine_fence.c
 create mode 100644 src/gallium/drivers/crocus/crocus_fine_fence.h
 create mode 100644 src/gallium/drivers/crocus/crocus_formats.c
 create mode 100644 src/gallium/drivers/crocus/crocus_genx_macros.h
 create mode 100644 src/gallium/drivers/crocus/crocus_genx_protos.h
 create mode 100644 src/gallium/drivers/crocus/crocus_monitor.c
 create mode 100644 src/gallium/drivers/crocus/crocus_monitor.h
 create mode 100644 src/gallium/drivers/crocus/crocus_pipe.h
 create mode 100644 src/gallium/drivers/crocus/crocus_pipe_control.c
 create mode 100644 src/gallium/drivers/crocus/crocus_program.c
 create mode 100644 src/gallium/drivers/crocus/crocus_program_cache.c
 create mode 100644 src/gallium/drivers/crocus/crocus_query.c
 create mode 100644 src/gallium/drivers/crocus/crocus_resolve.c
 create mode 100644 src/gallium/drivers/crocus/crocus_resource.c
 create mode 100644 src/gallium/drivers/crocus/crocus_resource.h
 create mode 100644 src/gallium/drivers/crocus/crocus_screen.c
 create mode 100644 src/gallium/drivers/crocus/crocus_screen.h
 create mode 100644 src/gallium/drivers/crocus/crocus_state.c
 create mode 100644 src/gallium/drivers/crocus/crocus_todo.txt
 create mode 100644 src/gallium/drivers/crocus/driinfo_crocus.h
 create mode 100644 src/gallium/drivers/crocus/gen4_blorp_exec.h
 create mode 100644 src/gallium/drivers/crocus/meson.build
 create mode 100644 src/gallium/winsys/crocus/drm/crocus_drm_public.h
 create mode 100644 src/gallium/winsys/crocus/drm/crocus_drm_winsys.c
 create mode 100644 src/gallium/winsys/crocus/drm/meson.build

diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index 8147c3ca346..ca5bf121a88 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -70,6 +70,7 @@ static const struct pipe_loader_ops pipe_loader_drm_ops;
 static const struct drm_driver_descriptor *driver_descriptors[] = {
    &i915_driver_descriptor,
    &iris_driver_descriptor,
+   &crocus_driver_descriptor,
    &nouveau_driver_descriptor,
    &r300_driver_descriptor,
    &r600_driver_descriptor,
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h
index 6bab07a40e7..ff4621e1a88 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -112,6 +112,26 @@ DRM_DRIVER_DESCRIPTOR(iris, iris_driconf, ARRAY_SIZE(iris_driconf))
 DRM_DRIVER_DESCRIPTOR_STUB(iris)
 #endif
 
+#ifdef GALLIUM_CROCUS
+#include "crocus/drm/crocus_drm_public.h"
+
+static struct pipe_screen *
+pipe_crocus_create_screen(int fd, const struct pipe_screen_config *config)
+{
+   struct pipe_screen *screen;
+
+   screen = crocus_drm_screen_create(fd, config);
+   return screen ?
debug_screen_wrap(screen) : NULL; +} + +const driOptionDescription crocus_driconf[] = { + #include "crocus/driinfo_crocus.h" +}; +DRM_DRIVER_DESCRIPTOR(crocus, crocus_driconf, ARRAY_SIZE(crocus_driconf)) +#else +DRM_DRIVER_DESCRIPTOR_STUB(crocus) +#endif + #ifdef GALLIUM_NOUVEAU #include "nouveau/drm/nouveau_drm_public.h" diff --git a/src/gallium/auxiliary/target-helpers/drm_helper_public.h b/src/gallium/auxiliary/target-helpers/drm_helper_public.h index 5fd3084dfdb..478e72b8525 100644 --- a/src/gallium/auxiliary/target-helpers/drm_helper_public.h +++ b/src/gallium/auxiliary/target-helpers/drm_helper_public.h @@ -6,6 +6,7 @@ struct pipe_screen_config; extern const struct drm_driver_descriptor i915_driver_descriptor; extern const struct drm_driver_descriptor iris_driver_descriptor; +extern const struct drm_driver_descriptor crocus_driver_descriptor; extern const struct drm_driver_descriptor nouveau_driver_descriptor; extern const struct drm_driver_descriptor r300_driver_descriptor; extern const struct drm_driver_descriptor r600_driver_descriptor; diff --git a/src/gallium/drivers/crocus/crocus_batch.c b/src/gallium/drivers/crocus/crocus_batch.c new file mode 100644 index 00000000000..63cfe282de4 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_batch.c @@ -0,0 +1,1047 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_batch.c + * + * Batchbuffer and command submission module. + * + * Every API draw call results in a number of GPU commands, which we + * collect into a "batch buffer". Typically, many draw calls are grouped + * into a single batch to amortize command submission overhead. + * + * We submit batches to the kernel using the I915_GEM_EXECBUFFER2 ioctl. + * One critical piece of data is the "validation list", which contains a + * list of the buffer objects (BOs) which the commands in the GPU need. + * The kernel will make sure these are resident and pinned at the correct + * virtual memory address before executing our batch. If a BO is not in + * the validation list, it effectively does not exist, so take care. 
+ */ + +#include "crocus_batch.h" +#include "crocus_bufmgr.h" +#include "crocus_context.h" +#include "crocus_fence.h" + +#include "drm-uapi/i915_drm.h" + +#include "intel/common/intel_gem.h" +#include "main/macros.h" +#include "util/hash_table.h" +#include "util/set.h" +#include "util/u_upload_mgr.h" + +#include +#include + +#if HAVE_VALGRIND +#include +#include +#define VG(x) x +#else +#define VG(x) +#endif + +#define FILE_DEBUG_FLAG DEBUG_BUFMGR + +/* Terminating the batch takes either 4 bytes for MI_BATCH_BUFFER_END + * or 12 bytes for MI_BATCH_BUFFER_START (when chaining). Plus, we may + * need an extra 4 bytes to pad out to the nearest QWord. So reserve 16. + */ +#define BATCH_RESERVED(devinfo) ((devinfo)->is_haswell ? 32 : 16) + +static void crocus_batch_reset(struct crocus_batch *batch); + +static unsigned +num_fences(struct crocus_batch *batch) +{ + return util_dynarray_num_elements(&batch->exec_fences, + struct drm_i915_gem_exec_fence); +} + +/** + * Debugging code to dump the fence list, used by INTEL_DEBUG=submit. + */ +static void +dump_fence_list(struct crocus_batch *batch) +{ + fprintf(stderr, "Fence list (length %u): ", num_fences(batch)); + + util_dynarray_foreach(&batch->exec_fences, + struct drm_i915_gem_exec_fence, f) { + fprintf(stderr, "%s%u%s ", + (f->flags & I915_EXEC_FENCE_WAIT) ? "..." : "", + f->handle, + (f->flags & I915_EXEC_FENCE_SIGNAL) ? "!" : ""); + } + + fprintf(stderr, "\n"); +} + +/** + * Debugging code to dump the validation list, used by INTEL_DEBUG=submit. + */ +static void +dump_validation_list(struct crocus_batch *batch) +{ + fprintf(stderr, "Validation list (length %d):\n", batch->exec_count); + + for (int i = 0; i < batch->exec_count; i++) { + uint64_t flags = batch->validation_list[i].flags; + assert(batch->validation_list[i].handle == + batch->exec_bos[i]->gem_handle); + fprintf(stderr, + "[%2d]: %2d %-14s @ 0x%016llx (%" PRIu64 "B)\t %2d refs %s\n", i, + batch->validation_list[i].handle, batch->exec_bos[i]->name, + batch->validation_list[i].offset, batch->exec_bos[i]->size, + batch->exec_bos[i]->refcount, + (flags & EXEC_OBJECT_WRITE) ? " (write)" : ""); + } +} + +/** + * Return BO information to the batch decoder (for debugging). + */ +static struct intel_batch_decode_bo +decode_get_bo(void *v_batch, bool ppgtt, uint64_t address) +{ + struct crocus_batch *batch = v_batch; + + for (int i = 0; i < batch->exec_count; i++) { + struct crocus_bo *bo = batch->exec_bos[i]; + /* The decoder zeroes out the top 16 bits, so we need to as well */ + uint64_t bo_address = bo->gtt_offset & (~0ull >> 16); + + if (address >= bo_address && address < bo_address + bo->size) { + return (struct intel_batch_decode_bo){ + .addr = address, + .size = bo->size, + .map = crocus_bo_map(batch->dbg, bo, MAP_READ) + + (address - bo_address), + }; + } + } + + return (struct intel_batch_decode_bo) { }; +} + +static unsigned +decode_get_state_size(void *v_batch, uint64_t address, + uint64_t base_address) +{ + struct crocus_batch *batch = v_batch; + + /* The decoder gives us offsets from a base address, which is not great. + * Binding tables are relative to surface state base address, and other + * state is relative to dynamic state base address. These could alias, + * but in practice it's unlikely because surface offsets are always in + * the [0, 64K) range, and we assign dynamic state addresses starting at + * the top of the 4GB range. We should fix this but it's likely good + * enough for now. 
+ */ + unsigned size = (uintptr_t) + _mesa_hash_table_u64_search(batch->state_sizes, address - base_address); + + return size; +} + +/** + * Decode the current batch. + */ +static void +decode_batch(struct crocus_batch *batch) +{ + void *map = crocus_bo_map(batch->dbg, batch->exec_bos[0], MAP_READ); + intel_print_batch(&batch->decoder, map, batch->primary_batch_size, + batch->exec_bos[0]->gtt_offset, false); +} + +static void +init_reloc_list(struct crocus_reloc_list *rlist, int count) +{ + rlist->reloc_count = 0; + rlist->reloc_array_size = count; + rlist->relocs = malloc(rlist->reloc_array_size * + sizeof(struct drm_i915_gem_relocation_entry)); +} + +void +crocus_init_batch(struct crocus_context *ice, + enum crocus_batch_name name, + int priority) +{ + struct crocus_batch *batch = &ice->batches[name]; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct intel_device_info *devinfo = &screen->devinfo; + + batch->ice = ice; + batch->screen = screen; + batch->dbg = &ice->dbg; + batch->reset = &ice->reset; + batch->name = name; + batch->contains_fence_signal = false; + + if (devinfo->ver >= 7) { + batch->fine_fences.uploader = + u_upload_create(&ice->ctx, 4096, PIPE_BIND_CUSTOM, + PIPE_USAGE_STAGING, 0); + } + crocus_fine_fence_init(batch); + + batch->hw_ctx_id = crocus_create_hw_context(screen->bufmgr); + assert(batch->hw_ctx_id); + + crocus_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority); + + batch->valid_reloc_flags = EXEC_OBJECT_WRITE; + if (devinfo->ver == 6) + batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT; + + if (INTEL_DEBUG & DEBUG_BATCH) { + /* The shadow doesn't get relocs written so state decode fails. */ + batch->use_shadow_copy = false; + } else + batch->use_shadow_copy = !devinfo->has_llc; + + util_dynarray_init(&batch->exec_fences, ralloc_context(NULL)); + util_dynarray_init(&batch->syncobjs, ralloc_context(NULL)); + + init_reloc_list(&batch->command.relocs, 250); + init_reloc_list(&batch->state.relocs, 250); + + batch->exec_count = 0; + batch->exec_array_size = 100; + batch->exec_bos = + malloc(batch->exec_array_size * sizeof(batch->exec_bos[0])); + batch->validation_list = + malloc(batch->exec_array_size * sizeof(batch->validation_list[0])); + + batch->cache.render = _mesa_hash_table_create(NULL, NULL, + _mesa_key_pointer_equal); + batch->cache.depth = _mesa_set_create(NULL, NULL, + _mesa_key_pointer_equal); + + memset(batch->other_batches, 0, sizeof(batch->other_batches)); + + for (int i = 0, j = 0; i < ice->batch_count; i++) { + if (i != name) + batch->other_batches[j++] = &ice->batches[i]; + } + + if (INTEL_DEBUG & DEBUG_BATCH) { + + batch->state_sizes = _mesa_hash_table_u64_create(NULL); + const unsigned decode_flags = + INTEL_BATCH_DECODE_FULL | + ((INTEL_DEBUG & DEBUG_COLOR) ? 
INTEL_BATCH_DECODE_IN_COLOR : 0) | + INTEL_BATCH_DECODE_OFFSETS | INTEL_BATCH_DECODE_FLOATS; + + intel_batch_decode_ctx_init(&batch->decoder, &screen->devinfo, stderr, + decode_flags, NULL, decode_get_bo, + decode_get_state_size, batch); + batch->decoder.max_vbo_decoded_lines = 32; + } + + crocus_batch_reset(batch); +} + +static struct drm_i915_gem_exec_object2 * +find_validation_entry(struct crocus_batch *batch, struct crocus_bo *bo) +{ + unsigned index = READ_ONCE(bo->index); + + if (index < batch->exec_count && batch->exec_bos[index] == bo) + return &batch->validation_list[index]; + + /* May have been shared between multiple active batches */ + for (index = 0; index < batch->exec_count; index++) { + if (batch->exec_bos[index] == bo) + return &batch->validation_list[index]; + } + + return NULL; +} + +static void +ensure_exec_obj_space(struct crocus_batch *batch, uint32_t count) +{ + while (batch->exec_count + count > batch->exec_array_size) { + batch->exec_array_size *= 2; + batch->exec_bos = realloc( + batch->exec_bos, batch->exec_array_size * sizeof(batch->exec_bos[0])); + batch->validation_list = + realloc(batch->validation_list, + batch->exec_array_size * sizeof(batch->validation_list[0])); + } +} + +static struct drm_i915_gem_exec_object2 * +crocus_use_bo(struct crocus_batch *batch, struct crocus_bo *bo, bool writable) +{ + assert(bo->bufmgr == batch->command.bo->bufmgr); + + if (bo == batch->ice->workaround_bo) + writable = false; + + struct drm_i915_gem_exec_object2 *existing_entry = + find_validation_entry(batch, bo); + + if (existing_entry) { + /* The BO is already in the validation list; mark it writable */ + if (writable) + existing_entry->flags |= EXEC_OBJECT_WRITE; + return existing_entry; + } + + if (bo != batch->command.bo && bo != batch->state.bo) { + /* This is the first time our batch has seen this BO. Before we use it, + * we may need to flush and synchronize with other batches. + */ + for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) { + + if (!batch->other_batches[b]) + continue; + struct drm_i915_gem_exec_object2 *other_entry = + find_validation_entry(batch->other_batches[b], bo); + + /* If the buffer is referenced by another batch, and either batch + * intends to write it, then flush the other batch and synchronize. + * + * Consider these cases: + * + * 1. They read, we read => No synchronization required. + * 2. They read, we write => Synchronize (they need the old value) + * 3. They write, we read => Synchronize (we need their new value) + * 4. They write, we write => Synchronize (order writes) + * + * The read/read case is very common, as multiple batches usually + * share a streaming state buffer or shader assembly buffer, and + * we want to avoid synchronizing in this case. + */ + if (other_entry && + ((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) { + crocus_batch_flush(batch->other_batches[b]); + crocus_batch_add_syncobj(batch, + batch->other_batches[b]->last_fence->syncobj, + I915_EXEC_FENCE_WAIT); + } + } + } + + /* Bump the ref count since the batch is now using this bo. */ + crocus_bo_reference(bo); + + ensure_exec_obj_space(batch, 1); + + batch->validation_list[batch->exec_count] = + (struct drm_i915_gem_exec_object2) { + .handle = bo->gem_handle, + .offset = bo->gtt_offset, + .flags = bo->kflags | (writable ? 
EXEC_OBJECT_WRITE : 0), + }; + + bo->index = batch->exec_count; + batch->exec_bos[batch->exec_count] = bo; + batch->aperture_space += bo->size; + + batch->exec_count++; + + return &batch->validation_list[batch->exec_count - 1]; +} + +static uint64_t +emit_reloc(struct crocus_batch *batch, + struct crocus_reloc_list *rlist, uint32_t offset, + struct crocus_bo *target, int32_t target_offset, + unsigned int reloc_flags) +{ + assert(target != NULL); + + bool writable = reloc_flags & RELOC_WRITE; + + struct drm_i915_gem_exec_object2 *entry = + crocus_use_bo(batch, target, writable); + + if (rlist->reloc_count == rlist->reloc_array_size) { + rlist->reloc_array_size *= 2; + rlist->relocs = realloc(rlist->relocs, + rlist->reloc_array_size * + sizeof(struct drm_i915_gem_relocation_entry)); + } + + if (reloc_flags & RELOC_32BIT) { + /* Restrict this buffer to the low 32 bits of the address space. + * + * Altering the validation list flags restricts it for this batch, + * but we also alter the BO's kflags to restrict it permanently + * (until the BO is destroyed and put back in the cache). Buffers + * may stay bound across batches, and we want keep it constrained. + */ + target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + + /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */ + reloc_flags &= ~RELOC_32BIT; + } + + if (reloc_flags) + entry->flags |= reloc_flags & batch->valid_reloc_flags; + + rlist->relocs[rlist->reloc_count++] = + (struct drm_i915_gem_relocation_entry) { + .offset = offset, + .delta = target_offset, + .target_handle = target->index, + .presumed_offset = entry->offset, + }; + + /* Using the old buffer offset, write in what the right data would be, in + * case the buffer doesn't move and we can short-circuit the relocation + * processing in the kernel + */ + return entry->offset + target_offset; +} + +uint64_t +crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset, + struct crocus_bo *target, uint32_t target_offset, + unsigned int reloc_flags) +{ + assert(batch_offset <= batch->command.bo->size - sizeof(uint32_t)); + + return emit_reloc(batch, &batch->command.relocs, batch_offset, + target, target_offset, reloc_flags); +} + +uint64_t +crocus_state_reloc(struct crocus_batch *batch, uint32_t state_offset, + struct crocus_bo *target, uint32_t target_offset, + unsigned int reloc_flags) +{ + assert(state_offset <= batch->state.bo->size - sizeof(uint32_t)); + + return emit_reloc(batch, &batch->state.relocs, state_offset, + target, target_offset, reloc_flags); +} + +static void +recreate_growing_buffer(struct crocus_batch *batch, + struct crocus_growing_bo *grow, + const char *name, unsigned size) +{ + struct crocus_screen *screen = batch->screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + grow->bo = crocus_bo_alloc(bufmgr, name, size); + grow->bo->kflags |= EXEC_OBJECT_CAPTURE; + grow->partial_bo = NULL; + grow->partial_bo_map = NULL; + grow->partial_bytes = 0; + if (batch->use_shadow_copy) + grow->map = realloc(grow->map, grow->bo->size); + else + grow->map = crocus_bo_map(NULL, grow->bo, MAP_READ | MAP_WRITE); + grow->map_next = grow->map; +} + +static void +create_batch(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + + recreate_growing_buffer(batch, &batch->command, + "command buffer", + BATCH_SZ + BATCH_RESERVED(&screen->devinfo)); + + crocus_use_bo(batch, batch->command.bo, false); + + recreate_growing_buffer(batch, &batch->state, + "state buffer", + STATE_SZ); 
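+
+   /* The state buffer below starts with offset 0 already marked as used,
+    * presumably so that no piece of state is ever handed out at offset 0
+    * -- the same convention the i965 batchbuffer code this driver was
+    * ported from appears to follow.
+    */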
+ + batch->state.used = 1; + crocus_use_bo(batch, batch->state.bo, false); +} + +static void +crocus_batch_maybe_noop(struct crocus_batch *batch) +{ + /* We only insert the NOOP at the beginning of the batch. */ + assert(crocus_batch_bytes_used(batch) == 0); + + if (batch->noop_enabled) { + /* Emit MI_BATCH_BUFFER_END to prevent any further command to be + * executed. + */ + uint32_t *map = batch->command.map_next; + + map[0] = (0xA << 23); + + batch->command.map_next += 4; + } +} + +static void +crocus_batch_reset(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + + crocus_bo_unreference(batch->command.bo); + crocus_bo_unreference(batch->state.bo); + batch->primary_batch_size = 0; + batch->contains_draw = false; + batch->contains_fence_signal = false; + batch->state_base_address_emitted = false; + batch->screen->vtbl.batch_reset_dirty(batch); + + create_batch(batch); + assert(batch->command.bo->index == 0); + + if (batch->state_sizes) + _mesa_hash_table_u64_clear(batch->state_sizes); + struct crocus_syncobj *syncobj = crocus_create_syncobj(screen); + crocus_batch_add_syncobj(batch, syncobj, I915_EXEC_FENCE_SIGNAL); + crocus_syncobj_reference(screen, &syncobj, NULL); + + crocus_cache_sets_clear(batch); +} + +void +crocus_batch_free(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + + if (batch->use_shadow_copy) { + free(batch->command.map); + free(batch->state.map); + } + + for (int i = 0; i < batch->exec_count; i++) { + crocus_bo_unreference(batch->exec_bos[i]); + } + + pipe_resource_reference(&batch->fine_fences.ref.res, NULL); + + free(batch->command.relocs.relocs); + free(batch->state.relocs.relocs); + free(batch->exec_bos); + free(batch->validation_list); + + ralloc_free(batch->exec_fences.mem_ctx); + + util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s) + crocus_syncobj_reference(screen, s, NULL); + ralloc_free(batch->syncobjs.mem_ctx); + + crocus_fine_fence_reference(batch->screen, &batch->last_fence, NULL); + if (batch_has_fine_fence(batch)) + u_upload_destroy(batch->fine_fences.uploader); + + crocus_bo_unreference(batch->command.bo); + batch->command.bo = NULL; + batch->command.map = NULL; + batch->command.map_next = NULL; + + crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id); + + _mesa_hash_table_destroy(batch->cache.render, NULL); + _mesa_set_destroy(batch->cache.depth, NULL); + + if (batch->state_sizes) { + _mesa_hash_table_u64_destroy(batch->state_sizes); + intel_batch_decode_ctx_finish(&batch->decoder); + } +} + +/** + * If we've chained to a secondary batch, or are getting near to the end, + * then flush. This should only be called between draws. + */ +void +crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate) +{ + if (batch->command.bo != batch->exec_bos[0] || + crocus_batch_bytes_used(batch) + estimate >= BATCH_SZ) { + crocus_batch_flush(batch); + } +} + +/** + * Finish copying the old batch/state buffer's contents to the new one + * after we tried to "grow" the buffer in an earlier operation. 
+ */ +static void +finish_growing_bos(struct crocus_growing_bo *grow) +{ + struct crocus_bo *old_bo = grow->partial_bo; + if (!old_bo) + return; + + memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes); + + grow->partial_bo = NULL; + grow->partial_bo_map = NULL; + grow->partial_bytes = 0; + + crocus_bo_unreference(old_bo); +} + +void +crocus_grow_buffer(struct crocus_batch *batch, bool grow_state, + unsigned used, + unsigned new_size) +{ + struct crocus_screen *screen = batch->screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + struct crocus_growing_bo *grow = grow_state ? &batch->state : &batch->command; + struct crocus_bo *bo = grow->bo; + + if (grow->partial_bo) { + /* We've already grown once, and now we need to do it again. + * Finish our last grow operation so we can start a new one. + * This should basically never happen. + */ + finish_growing_bos(grow); + } + + struct crocus_bo *new_bo = crocus_bo_alloc(bufmgr, bo->name, new_size); + + /* Copy existing data to the new larger buffer */ + grow->partial_bo_map = grow->map; + + if (batch->use_shadow_copy) { + /* We can't safely use realloc, as it may move the existing buffer, + * breaking existing pointers the caller may still be using. Just + * malloc a new copy and memcpy it like the normal BO path. + * + * Use bo->size rather than new_size because the bufmgr may have + * rounded up the size, and we want the shadow size to match. + */ + grow->map = malloc(new_bo->size); + } else { + grow->map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE); + } + /* Try to put the new BO at the same GTT offset as the old BO (which + * we're throwing away, so it doesn't need to be there). + * + * This guarantees that our relocations continue to work: values we've + * already written into the buffer, values we're going to write into the + * buffer, and the validation/relocation lists all will match. + * + * Also preserve kflags for EXEC_OBJECT_CAPTURE. + */ + new_bo->gtt_offset = bo->gtt_offset; + new_bo->index = bo->index; + new_bo->kflags = bo->kflags; + + /* Batch/state buffers are per-context, and if we've run out of space, + * we must have actually used them before, so...they will be in the list. + */ + assert(bo->index < batch->exec_count); + assert(batch->exec_bos[bo->index] == bo); + + /* Update the validation list to use the new BO. */ + batch->validation_list[bo->index].handle = new_bo->gem_handle; + /* Exchange the two BOs...without breaking pointers to the old BO. + * + * Consider this scenario: + * + * 1. Somebody calls brw_state_batch() to get a region of memory, and + * and then creates a brw_address pointing to brw->batch.state.bo. + * 2. They then call brw_state_batch() a second time, which happens to + * grow and replace the state buffer. They then try to emit a + * relocation to their first section of memory. + * + * If we replace the brw->batch.state.bo pointer at step 2, we would + * break the address created in step 1. They'd have a pointer to the + * old destroyed BO. Emitting a relocation would add this dead BO to + * the validation list...causing /both/ statebuffers to be in the list, + * and all kinds of disasters. + * + * This is not a contrived case - BLORP vertex data upload hits this. + * + * There are worse scenarios too. Fences for GL sync objects reference + * brw->batch.batch.bo. If we replaced the batch pointer when growing, + * we'd need to chase down every fence and update it to point to the + * new BO. 
Otherwise, it would refer to a "batch" that never actually + * gets submitted, and would fail to trigger. + * + * To work around both of these issues, we transmutate the buffers in + * place, making the existing struct brw_bo represent the new buffer, + * and "new_bo" represent the old BO. This is highly unusual, but it + * seems like a necessary evil. + * + * We also defer the memcpy of the existing batch's contents. Callers + * may make multiple brw_state_batch calls, and retain pointers to the + * old BO's map. We'll perform the memcpy in finish_growing_bo() when + * we finally submit the batch, at which point we've finished uploading + * state, and nobody should have any old references anymore. + * + * To do that, we keep a reference to the old BO in grow->partial_bo, + * and store the number of bytes to copy in grow->partial_bytes. We + * can monkey with the refcounts directly without atomics because these + * are per-context BOs and they can only be touched by this thread. + */ + assert(new_bo->refcount == 1); + new_bo->refcount = bo->refcount; + bo->refcount = 1; + + struct crocus_bo tmp; + memcpy(&tmp, bo, sizeof(struct crocus_bo)); + memcpy(bo, new_bo, sizeof(struct crocus_bo)); + memcpy(new_bo, &tmp, sizeof(struct crocus_bo)); + + grow->partial_bo = new_bo; /* the one reference of the OLD bo */ + grow->partial_bytes = used; +} + +static void +finish_seqno(struct crocus_batch *batch) +{ + struct crocus_fine_fence *sq = crocus_fine_fence_new(batch, CROCUS_FENCE_END); + if (!sq) + return; + + crocus_fine_fence_reference(batch->screen, &batch->last_fence, sq); + crocus_fine_fence_reference(batch->screen, &sq, NULL); +} + +/** + * Terminate a batch with MI_BATCH_BUFFER_END. + */ +static void +crocus_finish_batch(struct crocus_batch *batch) +{ + + batch->no_wrap = true; + if (batch->screen->vtbl.finish_batch) + batch->screen->vtbl.finish_batch(batch); + + finish_seqno(batch); + + /* Emit MI_BATCH_BUFFER_END to finish our batch. */ + uint32_t *map = batch->command.map_next; + + map[0] = (0xA << 23); + + batch->command.map_next += 4; + VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->command.map, crocus_batch_bytes_used(batch))); + + if (batch->command.bo == batch->exec_bos[0]) + batch->primary_batch_size = crocus_batch_bytes_used(batch); + batch->no_wrap = false; +} + +/** + * Replace our current GEM context with a new one (in case it got banned). + */ +static bool +replace_hw_ctx(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + + uint32_t new_ctx = crocus_clone_hw_context(bufmgr, batch->hw_ctx_id); + if (!new_ctx) + return false; + + crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id); + batch->hw_ctx_id = new_ctx; + + /* Notify the context that state must be re-initialized. */ + crocus_lost_context_state(batch); + + return true; +} + +enum pipe_reset_status +crocus_batch_check_for_reset(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + enum pipe_reset_status status = PIPE_NO_RESET; + struct drm_i915_reset_stats stats = { .ctx_id = batch->hw_ctx_id }; + + if (drmIoctl(screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats)) + DBG("DRM_IOCTL_I915_GET_RESET_STATS failed: %s\n", strerror(errno)); + + if (stats.batch_active != 0) { + /* A reset was observed while a batch from this hardware context was + * executing. Assume that this context was at fault. 
+ */ + status = PIPE_GUILTY_CONTEXT_RESET; + } else if (stats.batch_pending != 0) { + /* A reset was observed while a batch from this context was in progress, + * but the batch was not executing. In this case, assume that the + * context was not at fault. + */ + status = PIPE_INNOCENT_CONTEXT_RESET; + } + + if (status != PIPE_NO_RESET) { + /* Our context is likely banned, or at least in an unknown state. + * Throw it away and start with a fresh context. Ideally this may + * catch the problem before our next execbuf fails with -EIO. + */ + replace_hw_ctx(batch); + } + + return status; +} + +/** + * Submit the batch to the GPU via execbuffer2. + */ +static int +submit_batch(struct crocus_batch *batch) +{ + + if (batch->use_shadow_copy) { + void *bo_map = crocus_bo_map(batch->dbg, batch->command.bo, MAP_WRITE); + memcpy(bo_map, batch->command.map, crocus_batch_bytes_used(batch)); + + bo_map = crocus_bo_map(batch->dbg, batch->state.bo, MAP_WRITE); + memcpy(bo_map, batch->state.map, batch->state.used); + } + + crocus_bo_unmap(batch->command.bo); + crocus_bo_unmap(batch->state.bo); + + /* The requirement for using I915_EXEC_NO_RELOC are: + * + * The addresses written in the objects must match the corresponding + * reloc.gtt_offset which in turn must match the corresponding + * execobject.offset. + * + * Any render targets written to in the batch must be flagged with + * EXEC_OBJECT_WRITE. + * + * To avoid stalling, execobject.offset should match the current + * address of that object within the active context. + */ + /* Set statebuffer relocations */ + const unsigned state_index = batch->state.bo->index; + if (state_index < batch->exec_count && + batch->exec_bos[state_index] == batch->state.bo) { + struct drm_i915_gem_exec_object2 *entry = + &batch->validation_list[state_index]; + assert(entry->handle == batch->state.bo->gem_handle); + entry->relocation_count = batch->state.relocs.reloc_count; + entry->relocs_ptr = (uintptr_t)batch->state.relocs.relocs; + } + + /* Set batchbuffer relocations */ + struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0]; + assert(entry->handle == batch->command.bo->gem_handle); + entry->relocation_count = batch->command.relocs.reloc_count; + entry->relocs_ptr = (uintptr_t)batch->command.relocs.relocs; + + struct drm_i915_gem_execbuffer2 execbuf = { + .buffers_ptr = (uintptr_t)batch->validation_list, + .buffer_count = batch->exec_count, + .batch_start_offset = 0, + /* This must be QWord aligned. 
*/ + .batch_len = ALIGN(batch->primary_batch_size, 8), + .flags = I915_EXEC_RENDER | + I915_EXEC_NO_RELOC | + I915_EXEC_BATCH_FIRST | + I915_EXEC_HANDLE_LUT, + .rsvd1 = batch->hw_ctx_id, /* rsvd1 is actually the context ID */ + }; + + if (num_fences(batch)) { + execbuf.flags |= I915_EXEC_FENCE_ARRAY; + execbuf.num_cliprects = num_fences(batch); + execbuf.cliprects_ptr = + (uintptr_t)util_dynarray_begin(&batch->exec_fences); + } + + int ret = 0; + if (!batch->screen->no_hw && + intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf)) + ret = -errno; + + for (int i = 0; i < batch->exec_count; i++) { + struct crocus_bo *bo = batch->exec_bos[i]; + + bo->idle = false; + bo->index = -1; + + /* Update brw_bo::gtt_offset */ + if (batch->validation_list[i].offset != bo->gtt_offset) { + DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n", + bo->gem_handle, bo->gtt_offset, + batch->validation_list[i].offset); + assert(!(bo->kflags & EXEC_OBJECT_PINNED)); + bo->gtt_offset = batch->validation_list[i].offset; + } + } + + return ret; +} + +static const char * +batch_name_to_string(enum crocus_batch_name name) +{ + const char *names[CROCUS_BATCH_COUNT] = { + [CROCUS_BATCH_RENDER] = "render", + [CROCUS_BATCH_COMPUTE] = "compute", + }; + return names[name]; +} + +/** + * Flush the batch buffer, submitting it to the GPU and resetting it so + * we're ready to emit the next batch. + * + * \param in_fence_fd is ignored if -1. Otherwise, this function takes + * ownership of the fd. + * + * \param out_fence_fd is ignored if NULL. Otherwise, the caller must + * take ownership of the returned fd. + */ +void +_crocus_batch_flush(struct crocus_batch *batch, const char *file, int line) +{ + struct crocus_screen *screen = batch->screen; + + /* If a fence signals we need to flush it. 
*/ + if (crocus_batch_bytes_used(batch) == 0 && !batch->contains_fence_signal) + return; + + assert(!batch->no_wrap); + crocus_finish_batch(batch); + + finish_growing_bos(&batch->command); + finish_growing_bos(&batch->state); + int ret = submit_batch(batch); + + if (unlikely(INTEL_DEBUG & + (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL))) { + int bytes_for_commands = crocus_batch_bytes_used(batch); + int second_bytes = 0; + if (batch->command.bo != batch->exec_bos[0]) { + second_bytes = bytes_for_commands; + bytes_for_commands += batch->primary_batch_size; + } + fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5d+%5db (%0.1f%%) " + "(cmds), %4d BOs (%0.1fMb aperture)," + " %4d command relocs, %4d state relocs\n", + file, line, batch_name_to_string(batch->name), batch->hw_ctx_id, + batch->primary_batch_size, second_bytes, + 100.0f * bytes_for_commands / BATCH_SZ, + batch->exec_count, + (float) batch->aperture_space / (1024 * 1024), + batch->command.relocs.reloc_count, + batch->state.relocs.reloc_count); + + if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT)) { + dump_fence_list(batch); + dump_validation_list(batch); + } + + if (INTEL_DEBUG & DEBUG_BATCH) { + decode_batch(batch); + } + } + + for (int i = 0; i < batch->exec_count; i++) { + struct crocus_bo *bo = batch->exec_bos[i]; + crocus_bo_unreference(bo); + } + + batch->command.relocs.reloc_count = 0; + batch->state.relocs.reloc_count = 0; + batch->exec_count = 0; + batch->aperture_space = 0; + + util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s) + crocus_syncobj_reference(screen, s, NULL); + util_dynarray_clear(&batch->syncobjs); + + util_dynarray_clear(&batch->exec_fences); + + if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) { + dbg_printf("waiting for idle\n"); + crocus_bo_wait_rendering(batch->command.bo); /* if execbuf failed; this is a nop */ + } + + /* Start a new batch buffer. */ + crocus_batch_reset(batch); + + /* EIO means our context is banned. In this case, try and replace it + * with a new logical context, and inform crocus_context that all state + * has been lost and needs to be re-initialized. If this succeeds, + * dubiously claim success... + */ + if (ret == -EIO && replace_hw_ctx(batch)) { + if (batch->reset->reset) { + /* Tell the state tracker the device is lost and it was our fault. */ + batch->reset->reset(batch->reset->data, PIPE_GUILTY_CONTEXT_RESET); + } + + ret = 0; + } + + if (ret < 0) { +#ifdef DEBUG + const bool color = INTEL_DEBUG & DEBUG_COLOR; + fprintf(stderr, "%scrocus: Failed to submit batchbuffer: %-80s%s\n", + color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : ""); +#endif + abort(); + } +} + +/** + * Does the current batch refer to the given BO? + * + * (In other words, is the BO in the current batch's validation list?) + */ +bool +crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo) +{ + return find_validation_entry(batch, bo) != NULL; +} + +/** + * Updates the state of the noop feature. Returns true if there was a noop + * transition that led to state invalidation. + */ +bool +crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable) +{ + if (batch->noop_enabled == noop_enable) + return 0; + + batch->noop_enabled = noop_enable; + + crocus_batch_flush(batch); + + /* If the batch was empty, flush had no effect, so insert our noop. */ + if (crocus_batch_bytes_used(batch) == 0) + crocus_batch_maybe_noop(batch); + + /* We only need to update the entire state if we transition from noop -> + * not-noop. 
+ */ + return !batch->noop_enabled; +} diff --git a/src/gallium/drivers/crocus/crocus_batch.h b/src/gallium/drivers/crocus/crocus_batch.h new file mode 100644 index 00000000000..fe6857d83ed --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_batch.h @@ -0,0 +1,325 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef CROCUS_BATCH_DOT_H +#define CROCUS_BATCH_DOT_H + +#include +#include +#include + +#include "util/u_dynarray.h" + +#include "common/intel_decoder.h" +#include "drm-uapi/i915_drm.h" + +#include "crocus_fence.h" +#include "crocus_fine_fence.h" + +#include "crocus_bufmgr.h" +/* The kernel assumes batchbuffers are smaller than 256kB. */ +#define MAX_BATCH_SIZE (256 * 1024) + +/* 3DSTATE_BINDING_TABLE_POINTERS has a U16 offset from Surface State Base + * Address, which means that we can't put binding tables beyond 64kB. This + * effectively limits the maximum statebuffer size to 64kB. + */ +#define MAX_STATE_SIZE (64 * 1024) + +/* Our target batch size - flush approximately at this point. */ +#define BATCH_SZ (20 * 1024) +#define STATE_SZ (16 * 1024) + +enum crocus_batch_name { + CROCUS_BATCH_RENDER, + CROCUS_BATCH_COMPUTE, +}; + +#define CROCUS_BATCH_COUNT 2 + +struct crocus_address { + struct crocus_bo *bo; + int32_t offset; + uint32_t reloc_flags; +}; + +struct crocus_reloc_list { + struct drm_i915_gem_relocation_entry *relocs; + int reloc_count; + int reloc_array_size; +}; + +struct crocus_growing_bo { + struct crocus_bo *bo; + void *map; + void *map_next; + struct crocus_bo *partial_bo; + void *partial_bo_map; + unsigned partial_bytes; + struct crocus_reloc_list relocs; + unsigned used; +}; + +struct crocus_batch { + struct crocus_context *ice; + struct crocus_screen *screen; + struct pipe_debug_callback *dbg; + struct pipe_device_reset_callback *reset; + + /** What batch is this? (e.g. CROCUS_BATCH_RENDER/COMPUTE) */ + enum crocus_batch_name name; + + /** buffers: command, state */ + struct crocus_growing_bo command, state; + + /** Size of the primary batch if we've moved on to a secondary. 
*/ + unsigned primary_batch_size; + + bool state_base_address_emitted; + uint8_t pipe_controls_since_last_cs_stall; + + uint32_t hw_ctx_id; + + uint32_t valid_reloc_flags; + + bool use_shadow_copy; + bool no_wrap; + + /** The validation list */ + struct drm_i915_gem_exec_object2 *validation_list; + struct crocus_bo **exec_bos; + int exec_count; + int exec_array_size; + + /** Whether INTEL_BLACKHOLE_RENDER is enabled in the batch (aka first + * instruction is a MI_BATCH_BUFFER_END). + */ + bool noop_enabled; + + /** + * A list of crocus_syncobjs associated with this batch. + * + * The first list entry will always be a signalling sync-point, indicating + * that this batch has completed. The others are likely to be sync-points + * to wait on before executing the batch. + */ + struct util_dynarray syncobjs; + + /** A list of drm_i915_exec_fences to have execbuf signal or wait on */ + struct util_dynarray exec_fences; + + /** The amount of aperture space (in bytes) used by all exec_bos */ + int aperture_space; + + struct { + /** Uploader to use for sequence numbers */ + struct u_upload_mgr *uploader; + + /** GPU buffer and CPU map where our seqno's will be written. */ + struct crocus_state_ref ref; + uint32_t *map; + + /** The sequence number to write the next time we add a fence. */ + uint32_t next; + } fine_fences; + + /** A seqno (and syncobj) for the last batch that was submitted. */ + struct crocus_fine_fence *last_fence; + + /** List of other batches which we might need to flush to use a BO */ + struct crocus_batch *other_batches[CROCUS_BATCH_COUNT - 1]; + + struct { + /** + * Set of struct brw_bo * that have been rendered to within this + * batchbuffer and would need flushing before being used from another + * cache domain that isn't coherent with it (i.e. the sampler). + */ + struct hash_table *render; + + /** + * Set of struct brw_bo * that have been used as a depth buffer within + * this batchbuffer and would need flushing before being used from + * another cache domain that isn't coherent with it (i.e. the sampler). + */ + struct set *depth; + } cache; + + struct intel_batch_decode_ctx decoder; + struct hash_table_u64 *state_sizes; + + /** Have we emitted any draw calls to this batch? */ + bool contains_draw; + + /** Batch contains fence signal operation. */ + bool contains_fence_signal; +}; + +static inline bool +batch_has_fine_fence(struct crocus_batch *batch) +{ + return !!batch->fine_fences.uploader; +} + +#define BATCH_HAS_FINE_FENCES(batch) (!!(batch)->fine_fences.uploader) +void crocus_init_batch(struct crocus_context *ctx, + enum crocus_batch_name name, + int priority); +void crocus_batch_free(struct crocus_batch *batch); +void crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate); + +void _crocus_batch_flush(struct crocus_batch *batch, const char *file, int line); +#define crocus_batch_flush(batch) _crocus_batch_flush((batch), __FILE__, __LINE__) + +bool crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo); + +bool crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable); + +#define RELOC_WRITE EXEC_OBJECT_WRITE +#define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT +/* Inverted meaning, but using the same bit...emit_reloc will flip it. 
*/ +#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS + +void crocus_use_pinned_bo(struct crocus_batch *batch, struct crocus_bo *bo, + bool writable); +uint64_t crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset, + struct crocus_bo *target, uint32_t target_offset, + unsigned int reloc_flags); +uint64_t crocus_state_reloc(struct crocus_batch *batch, uint32_t batch_offset, + struct crocus_bo *target, uint32_t target_offset, + unsigned int reloc_flags); + +enum pipe_reset_status crocus_batch_check_for_reset(struct crocus_batch *batch); + +void crocus_grow_buffer(struct crocus_batch *batch, bool grow_state, + unsigned used, unsigned new_size); + +static inline unsigned +crocus_batch_bytes_used(struct crocus_batch *batch) +{ + return batch->command.map_next - batch->command.map; +} + +/** + * Ensure the current command buffer has \param size bytes of space + * remaining. If not, this creates a secondary batch buffer and emits + * a jump from the primary batch to the start of the secondary. + * + * Most callers want crocus_get_command_space() instead. + */ +static inline void +crocus_require_command_space(struct crocus_batch *batch, unsigned size) +{ + const unsigned required_bytes = crocus_batch_bytes_used(batch) + size; + unsigned used = crocus_batch_bytes_used(batch); + if (required_bytes >= BATCH_SZ && !batch->no_wrap) { + crocus_batch_flush(batch); + } else if (used + size >= batch->command.bo->size) { + const unsigned new_size = + MIN2(batch->command.bo->size + batch->command.bo->size / 2, + MAX_BATCH_SIZE); + + crocus_grow_buffer(batch, false, used, new_size); + batch->command.map_next = (void *)batch->command.map + used; + assert(crocus_batch_bytes_used(batch) + size < batch->command.bo->size); + } +} + +/** + * Allocate space in the current command buffer, and return a pointer + * to the mapped area so the caller can write commands there. + * + * This should be called whenever emitting commands. + */ +static inline void * +crocus_get_command_space(struct crocus_batch *batch, unsigned bytes) +{ + crocus_require_command_space(batch, bytes); + void *map = batch->command.map_next; + batch->command.map_next += bytes; + return map; +} + +/** + * Helper to emit GPU commands - allocates space, copies them there. + */ +static inline void +crocus_batch_emit(struct crocus_batch *batch, const void *data, unsigned size) +{ + void *map = crocus_get_command_space(batch, size); + memcpy(map, data, size); +} + +/** + * Get a pointer to the batch's signalling syncobj. Does not refcount. + */ +static inline struct crocus_syncobj * +crocus_batch_get_signal_syncobj(struct crocus_batch *batch) +{ + /* The signalling syncobj is the first one in the list. */ + struct crocus_syncobj *syncobj = + ((struct crocus_syncobj **)util_dynarray_begin(&batch->syncobjs))[0]; + return syncobj; +} + +/** + * Take a reference to the batch's signalling syncobj. + * + * Callers can use this to wait for the the current batch under construction + * to complete (after flushing it). + */ +static inline void +crocus_batch_reference_signal_syncobj(struct crocus_batch *batch, + struct crocus_syncobj **out_syncobj) +{ + struct crocus_syncobj *syncobj = crocus_batch_get_signal_syncobj(batch); + crocus_syncobj_reference(batch->screen, out_syncobj, syncobj); +} + +/** + * Record the size of a piece of state for use in INTEL_DEBUG=bat printing. 
+ */ +static inline void +crocus_record_state_size(struct hash_table_u64 *ht, uint32_t offset_from_base, + uint32_t size) +{ + if (ht) { + _mesa_hash_table_u64_insert(ht, offset_from_base, + (void *)(uintptr_t)size); + } +} + +static inline bool +crocus_ptr_in_state_buffer(struct crocus_batch *batch, void *p) +{ + return (char *)p >= (char *)batch->state.map && + (char *)p < (char *)batch->state.map + batch->state.bo->size; +} + +static inline void +crocus_require_statebuffer_space(struct crocus_batch *batch, int size) +{ + if (batch->state.used + size >= STATE_SZ) + crocus_batch_flush(batch); +} +#endif diff --git a/src/gallium/drivers/crocus/crocus_blit.c b/src/gallium/drivers/crocus/crocus_blit.c new file mode 100644 index 00000000000..9cae82e3e2d --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_blit.c @@ -0,0 +1,836 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_surface.h" +#include "util/ralloc.h" +#include "intel/blorp/blorp.h" +#include "crocus_context.h" +#include "crocus_resource.h" +#include "crocus_screen.h" + +void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond) +{ + util_blitter_save_vertex_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_VERTEX]); + util_blitter_save_tessctrl_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL]); + util_blitter_save_tesseval_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]); + util_blitter_save_geometry_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]); + util_blitter_save_so_targets(ice->blitter, ice->state.so_targets, + (struct pipe_stream_output_target**)ice->state.so_target); + util_blitter_save_vertex_buffer_slot(ice->blitter, ice->state.vertex_buffers); + util_blitter_save_vertex_elements(ice->blitter, (void *)ice->state.cso_vertex_elements); + if (op & CROCUS_SAVE_FRAGMENT_STATE) { + util_blitter_save_blend(ice->blitter, ice->state.cso_blend); + util_blitter_save_depth_stencil_alpha(ice->blitter, ice->state.cso_zsa); + util_blitter_save_stencil_ref(ice->blitter, &ice->state.stencil_ref); + util_blitter_save_fragment_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]); + util_blitter_save_sample_mask(ice->blitter, ice->state.sample_mask); + util_blitter_save_rasterizer(ice->blitter, ice->state.cso_rast); + util_blitter_save_scissor(ice->blitter, &ice->state.scissors[0]); + util_blitter_save_viewport(ice->blitter, &ice->state.viewports[0]); + util_blitter_save_fragment_constant_buffer_slot(ice->blitter, &ice->state.shaders[MESA_SHADER_FRAGMENT].constbufs[0]); + } + + if (!render_cond) + util_blitter_save_render_condition(ice->blitter, + (struct pipe_query *)ice->condition.query, + ice->condition.condition, + ice->condition.mode); + +// util_blitter_save_scissor(ice->blitter, &ice->scissors[0]); + if (op & CROCUS_SAVE_FRAMEBUFFER) + util_blitter_save_framebuffer(ice->blitter, &ice->state.framebuffer); + + if (op & CROCUS_SAVE_TEXTURES) { + util_blitter_save_fragment_sampler_states(ice->blitter, 1, (void **)ice->state.shaders[MESA_SHADER_FRAGMENT].samplers); + util_blitter_save_fragment_sampler_views(ice->blitter, 1, (struct pipe_sampler_view **)ice->state.shaders[MESA_SHADER_FRAGMENT].textures); + } +} + +/** + * Helper function for handling mirror image blits. + * + * If coord0 > coord1, swap them and return "true" (mirrored). 
+ */ +static bool +apply_mirror(float *coord0, float *coord1) +{ + if (*coord0 > *coord1) { + float tmp = *coord0; + *coord0 = *coord1; + *coord1 = tmp; + return true; + } + return false; +} + +/** + * Compute the number of pixels to clip for each side of a rect + * + * \param x0 The rect's left coordinate + * \param y0 The rect's bottom coordinate + * \param x1 The rect's right coordinate + * \param y1 The rect's top coordinate + * \param min_x The clipping region's left coordinate + * \param min_y The clipping region's bottom coordinate + * \param max_x The clipping region's right coordinate + * \param max_y The clipping region's top coordinate + * \param clipped_x0 The number of pixels to clip from the left side + * \param clipped_y0 The number of pixels to clip from the bottom side + * \param clipped_x1 The number of pixels to clip from the right side + * \param clipped_y1 The number of pixels to clip from the top side + * + * \return false if we clip everything away, true otherwise + */ +static inline bool +compute_pixels_clipped(float x0, float y0, float x1, float y1, + float min_x, float min_y, float max_x, float max_y, + float *clipped_x0, float *clipped_y0, + float *clipped_x1, float *clipped_y1) +{ + /* If we are going to clip everything away, stop. */ + if (!(min_x <= max_x && + min_y <= max_y && + x0 <= max_x && + y0 <= max_y && + min_x <= x1 && + min_y <= y1 && + x0 <= x1 && + y0 <= y1)) { + return false; + } + + if (x0 < min_x) + *clipped_x0 = min_x - x0; + else + *clipped_x0 = 0; + if (max_x < x1) + *clipped_x1 = x1 - max_x; + else + *clipped_x1 = 0; + + if (y0 < min_y) + *clipped_y0 = min_y - y0; + else + *clipped_y0 = 0; + if (max_y < y1) + *clipped_y1 = y1 - max_y; + else + *clipped_y1 = 0; + + return true; +} + +/** + * Clips a coordinate (left, right, top or bottom) for the src or dst rect + * (whichever requires the largest clip) and adjusts the coordinate + * for the other rect accordingly. + * + * \param mirror true if mirroring is required + * \param src the source rect coordinate (for example src_x0) + * \param dst0 the dst rect coordinate (for example dst_x0) + * \param dst1 the opposite dst rect coordinate (for example dst_x1) + * \param clipped_dst0 number of pixels to clip from the dst coordinate + * \param clipped_dst1 number of pixels to clip from the opposite dst coordinate + * \param scale the src vs dst scale involved for that coordinate + * \param is_left_or_bottom true if we are clipping the left or bottom sides + * of the rect. + */ +static void +clip_coordinates(bool mirror, + float *src, float *dst0, float *dst1, + float clipped_dst0, + float clipped_dst1, + float scale, + bool is_left_or_bottom) +{ + /* When clipping we need to add or subtract pixels from the original + * coordinates depending on whether we are acting on the left/bottom + * or right/top sides of the rect respectively. We assume we have to + * add them in the code below, and multiply by -1 when we should + * subtract. + */ + int mult = is_left_or_bottom ? 1 : -1; + + if (!mirror) { + *dst0 += clipped_dst0 * mult; + *src += clipped_dst0 * scale * mult; + } else { + *dst1 -= clipped_dst1 * mult; + *src += clipped_dst1 * scale * mult; + } +} + +/** + * Apply a scissor rectangle to blit coordinates. + * + * Returns true if the blit was entirely scissored away. 
+ */ +static bool +apply_blit_scissor(const struct pipe_scissor_state *scissor, + float *src_x0, float *src_y0, + float *src_x1, float *src_y1, + float *dst_x0, float *dst_y0, + float *dst_x1, float *dst_y1, + bool mirror_x, bool mirror_y) +{ + float clip_dst_x0, clip_dst_x1, clip_dst_y0, clip_dst_y1; + + /* Compute number of pixels to scissor away. */ + if (!compute_pixels_clipped(*dst_x0, *dst_y0, *dst_x1, *dst_y1, + scissor->minx, scissor->miny, + scissor->maxx, scissor->maxy, + &clip_dst_x0, &clip_dst_y0, + &clip_dst_x1, &clip_dst_y1)) + return true; + + // XXX: comments assume source clipping, which we don't do + + /* When clipping any of the two rects we need to adjust the coordinates + * in the other rect considering the scaling factor involved. To obtain + * the best precision we want to make sure that we only clip once per + * side to avoid accumulating errors due to the scaling adjustment. + * + * For example, if src_x0 and dst_x0 need both to be clipped we want to + * avoid the situation where we clip src_x0 first, then adjust dst_x0 + * accordingly but then we realize that the resulting dst_x0 still needs + * to be clipped, so we clip dst_x0 and adjust src_x0 again. Because we are + * applying scaling factors to adjust the coordinates in each clipping + * pass we lose some precision and that can affect the results of the + * blorp blit operation slightly. What we want to do here is detect the + * rect that we should clip first for each side so that when we adjust + * the other rect we ensure the resulting coordinate does not need to be + * clipped again. + * + * The code below implements this by comparing the number of pixels that + * we need to clip for each side of both rects considering the scales + * involved. For example, clip_src_x0 represents the number of pixels + * to be clipped for the src rect's left side, so if clip_src_x0 = 5, + * clip_dst_x0 = 4 and scale_x = 2 it means that we are clipping more + * from the dst rect so we should clip dst_x0 only and adjust src_x0. + * This is because clipping 4 pixels in the dst is equivalent to + * clipping 4 * 2 = 8 > 5 in the src. 
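+ *
+ * (As the XXX above notes, this function only ever clips the destination
+ * rect, so the "which rect to clip first" decision described here does
+ * not actually arise in the code below.)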
+ */ + + if (*src_x0 == *src_x1 || *src_y0 == *src_y1 + || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1) + return true; + + float scale_x = (float) (*src_x1 - *src_x0) / (*dst_x1 - *dst_x0); + float scale_y = (float) (*src_y1 - *src_y0) / (*dst_y1 - *dst_y0); + + /* Clip left side */ + clip_coordinates(mirror_x, src_x0, dst_x0, dst_x1, + clip_dst_x0, clip_dst_x1, scale_x, true); + + /* Clip right side */ + clip_coordinates(mirror_x, src_x1, dst_x1, dst_x0, + clip_dst_x1, clip_dst_x0, scale_x, false); + + /* Clip bottom side */ + clip_coordinates(mirror_y, src_y0, dst_y0, dst_y1, + clip_dst_y0, clip_dst_y1, scale_y, true); + + /* Clip top side */ + clip_coordinates(mirror_y, src_y1, dst_y1, dst_y0, + clip_dst_y1, clip_dst_y0, scale_y, false); + + /* Check for invalid bounds + * Can't blit for 0-dimensions + */ + return *src_x0 == *src_x1 || *src_y0 == *src_y1 + || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1; +} + +void +crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl, + struct isl_device *isl_dev, + struct blorp_surf *surf, + struct pipe_resource *p_res, + enum isl_aux_usage aux_usage, + unsigned level, + bool is_render_target) +{ + struct crocus_resource *res = (void *) p_res; + + assert(!crocus_resource_unfinished_aux_import(res)); + + if (isl_aux_usage_has_hiz(aux_usage) && + !crocus_resource_level_has_hiz(res, level)) + aux_usage = ISL_AUX_USAGE_NONE; + + *surf = (struct blorp_surf) { + .surf = &res->surf, + .addr = (struct blorp_address) { + .buffer = res->bo, + .offset = res->offset, + .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0, + .mocs = crocus_mocs(res->bo, isl_dev), + }, + .aux_usage = aux_usage, + }; + + if (aux_usage != ISL_AUX_USAGE_NONE) { + surf->aux_surf = &res->aux.surf; + surf->aux_addr = (struct blorp_address) { + .buffer = res->aux.bo, + .offset = res->aux.offset, + .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0, + .mocs = crocus_mocs(res->bo, isl_dev), + }; + surf->clear_color = + crocus_resource_get_clear_color(res); + } +} + +static void +tex_cache_flush_hack(struct crocus_batch *batch, + enum isl_format view_format, + enum isl_format surf_format) +{ + /* The WaSamplerCacheFlushBetweenRedescribedSurfaceReads workaround says: + * + * "Currently Sampler assumes that a surface would not have two + * different format associate with it. It will not properly cache + * the different views in the MT cache, causing a data corruption." + * + * We may need to handle this for texture views in general someday, but + * for now we handle it here, as it hurts copies and blits particularly + * badly because they ofter reinterpret formats. + * + * If the BO hasn't been referenced yet this batch, we assume that the + * texture cache doesn't contain any relevant data nor need flushing. + * + * Icelake (Gen11+) claims to fix this issue, but seems to still have + * issues with ASTC formats. 
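+ *
+ * The workaround itself is a heavy hammer: whenever the view format
+ * differs from the surface format, emit a CS stall followed by a texture
+ * cache invalidate.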
+ */ + bool need_flush = view_format != surf_format; + if (!need_flush) + return; + + const char *reason = + "workaround: WaSamplerCacheFlushBetweenRedescribedSurfaceReads"; + + crocus_emit_pipe_control_flush(batch, reason, PIPE_CONTROL_CS_STALL); + crocus_emit_pipe_control_flush(batch, reason, + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE); +} + +static struct crocus_resource * +crocus_resource_for_aspect(const struct intel_device_info *devinfo, + struct pipe_resource *p_res, unsigned pipe_mask) +{ + if (pipe_mask == PIPE_MASK_S) { + struct crocus_resource *junk, *s_res; + crocus_get_depth_stencil_resources(devinfo, p_res, &junk, &s_res); + return s_res; + } else { + return (struct crocus_resource *)p_res; + } +} + +static enum pipe_format +pipe_format_for_aspect(enum pipe_format format, unsigned pipe_mask) +{ + if (pipe_mask == PIPE_MASK_S) { + return util_format_stencil_only(format); + } else if (pipe_mask == PIPE_MASK_Z) { + return util_format_get_depth_only(format); + } else { + return format; + } +} + +static void +crocus_u_blitter(struct crocus_context *ice, + const struct pipe_blit_info *info) +{ + struct pipe_blit_info dinfo = *info; + if (!util_format_has_alpha(dinfo.dst.resource->format)) + dinfo.mask &= ~PIPE_MASK_A; + crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable); + util_blitter_blit(ice->blitter, &dinfo); +} + +/** + * The pipe->blit() driver hook. + * + * This performs a blit between two surfaces, which copies data but may + * also perform format conversion, scaling, flipping, and so on. + */ +static void +crocus_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + enum blorp_batch_flags blorp_flags = 0; + + /* We don't support color masking. 
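+ * The mask must select either all of RGBA or none of it, which is what
+ * the assert below checks.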
*/ + assert((info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA || + (info->mask & PIPE_MASK_RGBA) == 0); + + if (info->render_condition_enable) + if (!crocus_check_conditional_render(ice)) + return; + + if (devinfo->ver <= 5) { + if (!screen->vtbl.blit_blt(batch, info)) { + + if (!util_format_is_depth_or_stencil(info->src.resource->format) && + info->dst.resource->target != PIPE_TEXTURE_3D) + goto use_blorp; + + if (!util_blitter_is_blit_supported(ice->blitter, info)) { + if (util_format_is_depth_or_stencil(info->src.resource->format)) { + + struct pipe_blit_info depth_blit = *info; + depth_blit.mask = PIPE_MASK_Z; + crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable); + util_blitter_blit(ice->blitter, &depth_blit); + + struct pipe_surface *dst_view, dst_templ; + util_blitter_default_dst_texture(&dst_templ, info->dst.resource, info->dst.level, info->dst.box.z); + dst_view = ctx->create_surface(ctx, info->dst.resource, &dst_templ); + + crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable); + + util_blitter_clear_depth_stencil(ice->blitter, dst_view, PIPE_CLEAR_STENCIL, + 0, 0, info->dst.box.x, info->dst.box.y, + info->dst.box.width, info->dst.box.height); + crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable); + util_blitter_stencil_fallback(ice->blitter, + info->dst.resource, + info->dst.level, + &info->dst.box, + info->src.resource, + info->src.level, + &info->src.box, NULL); + + } + return; + } + + crocus_u_blitter(ice, info); + } + return; + } + + if (devinfo->ver == 6) { + if (info->src.resource->target == PIPE_TEXTURE_3D && + info->dst.resource->target == PIPE_TEXTURE_3D) { + crocus_u_blitter(ice, info); + return; + } + } + +use_blorp: + if (info->render_condition_enable) { + if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) + blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE; + } + + float src_x0 = info->src.box.x; + float src_x1 = info->src.box.x + info->src.box.width; + float src_y0 = info->src.box.y; + float src_y1 = info->src.box.y + info->src.box.height; + float dst_x0 = info->dst.box.x; + float dst_x1 = info->dst.box.x + info->dst.box.width; + float dst_y0 = info->dst.box.y; + float dst_y1 = info->dst.box.y + info->dst.box.height; + bool mirror_x = apply_mirror(&src_x0, &src_x1); + bool mirror_y = apply_mirror(&src_y0, &src_y1); + enum blorp_filter filter; + + if (info->scissor_enable) { + bool noop = apply_blit_scissor(&info->scissor, + &src_x0, &src_y0, &src_x1, &src_y1, + &dst_x0, &dst_y0, &dst_x1, &dst_y1, + mirror_x, mirror_y); + if (noop) + return; + } + + if (abs(info->dst.box.width) == abs(info->src.box.width) && + abs(info->dst.box.height) == abs(info->src.box.height)) { + if (info->src.resource->nr_samples > 1 && + info->dst.resource->nr_samples <= 1) { + /* The OpenGL ES 3.2 specification, section 16.2.1, says: + * + * "If the read framebuffer is multisampled (its effective + * value of SAMPLE_BUFFERS is one) and the draw framebuffer + * is not (its value of SAMPLE_BUFFERS is zero), the samples + * corresponding to each pixel location in the source are + * converted to a single sample before being written to the + * destination. The filter parameter is ignored. If the + * source formats are integer types or stencil values, a + * single sample’s value is selected for each pixel. 
If the + * source formats are floating-point or normalized types, + * the sample values for each pixel are resolved in an + * implementation-dependent manner. If the source formats + * are depth values, sample values are resolved in an + * implementation-dependent manner where the result will be + * between the minimum and maximum depth values in the pixel." + * + * When selecting a single sample, we always choose sample 0. + */ + if (util_format_is_depth_or_stencil(info->src.format) || + util_format_is_pure_integer(info->src.format)) { + filter = BLORP_FILTER_SAMPLE_0; + } else { + filter = BLORP_FILTER_AVERAGE; + } + } else { + /* The OpenGL 4.6 specification, section 18.3.1, says: + * + * "If the source and destination dimensions are identical, + * no filtering is applied." + * + * Using BLORP_FILTER_NONE will also handle the upsample case by + * replicating the one value in the source to all values in the + * destination. + */ + filter = BLORP_FILTER_NONE; + } + } else if (info->filter == PIPE_TEX_FILTER_LINEAR) { + filter = BLORP_FILTER_BILINEAR; + } else { + filter = BLORP_FILTER_NEAREST; + } + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); + + float src_z_step = (float)info->src.box.depth / (float)info->dst.box.depth; + + /* There is no interpolation to the pixel center during rendering, so + * add the 0.5 offset ourselves here. + */ + float depth_center_offset = 0; + if (info->src.resource->target == PIPE_TEXTURE_3D) + depth_center_offset = 0.5 / info->dst.box.depth * info->src.box.depth; + + /* Perform a blit for each aspect requested by the caller. PIPE_MASK_R is + * used to represent the color aspect. */ + unsigned aspect_mask = info->mask & (PIPE_MASK_R | PIPE_MASK_ZS); + while (aspect_mask) { + unsigned aspect = 1 << u_bit_scan(&aspect_mask); + + struct crocus_resource *src_res = + crocus_resource_for_aspect(devinfo, info->src.resource, aspect); + struct crocus_resource *dst_res = + crocus_resource_for_aspect(devinfo, info->dst.resource, aspect); + + enum pipe_format src_pfmt = + pipe_format_for_aspect(info->src.format, aspect); + enum pipe_format dst_pfmt = + pipe_format_for_aspect(info->dst.format, aspect); + + if (crocus_resource_unfinished_aux_import(src_res)) + crocus_resource_finish_aux_import(ctx->screen, src_res); + if (crocus_resource_unfinished_aux_import(dst_res)) + crocus_resource_finish_aux_import(ctx->screen, dst_res); + + struct crocus_format_info src_fmt = + crocus_format_for_usage(devinfo, src_pfmt, ISL_SURF_USAGE_TEXTURE_BIT); + enum isl_aux_usage src_aux_usage = + crocus_resource_texture_aux_usage(src_res); + + crocus_resource_prepare_texture(ice, src_res, src_fmt.fmt, + info->src.level, 1, info->src.box.z, + info->src.box.depth); + // crocus_emit_buffer_barrier_for(batch, src_res->bo, + // CROCUS_DOMAIN_OTHER_READ); + + struct crocus_format_info dst_fmt = + crocus_format_for_usage(devinfo, dst_pfmt, + ISL_SURF_USAGE_RENDER_TARGET_BIT); + enum isl_aux_usage dst_aux_usage = + crocus_resource_render_aux_usage(ice, dst_res, info->dst.level, + dst_fmt.fmt, false); + + struct blorp_surf src_surf, dst_surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf, + &src_res->base, src_aux_usage, + info->src.level, false); + crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf, + &dst_res->base, dst_aux_usage, + info->dst.level, true); + + crocus_resource_prepare_render(ice, dst_res, info->dst.level, + info->dst.box.z, info->dst.box.depth, + dst_aux_usage); + // 
crocus_emit_buffer_barrier_for(batch, dst_res->bo, + // CROCUS_DOMAIN_RENDER_WRITE); + + if (crocus_batch_references(batch, src_res->bo)) + tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format); + + if (dst_res->base.target == PIPE_BUFFER) { + util_range_add(&dst_res->base, &dst_res->valid_buffer_range, + dst_x0, dst_x1); + } + + struct isl_swizzle src_swiz = pipe_to_isl_swizzles(src_fmt.swizzles); + struct isl_swizzle dst_swiz = pipe_to_isl_swizzles(dst_fmt.swizzles); + + for (int slice = 0; slice < info->dst.box.depth; slice++) { + unsigned dst_z = info->dst.box.z + slice; + float src_z = info->src.box.z + slice * src_z_step + + depth_center_offset; + + crocus_batch_maybe_flush(batch, 1500); + + blorp_blit(&blorp_batch, + &src_surf, info->src.level, src_z, + src_fmt.fmt, src_swiz, + &dst_surf, info->dst.level, dst_z, + dst_fmt.fmt, dst_swiz, + src_x0, src_y0, src_x1, src_y1, + dst_x0, dst_y0, dst_x1, dst_y1, + filter, mirror_x, mirror_y); + + } + + tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format); + + crocus_resource_finish_render(ice, dst_res, info->dst.level, + info->dst.box.z, info->dst.box.depth, + dst_aux_usage); + } + + blorp_batch_finish(&blorp_batch); + + crocus_flush_and_dirty_for_history(ice, batch, (struct crocus_resource *) + info->dst.resource, + PIPE_CONTROL_RENDER_TARGET_FLUSH, + "cache history: post-blit"); +} + +static void +get_copy_region_aux_settings(struct crocus_resource *res, + enum isl_aux_usage *out_aux_usage, + bool is_render_target) +{ + switch (res->aux.usage) { + case ISL_AUX_USAGE_MCS: + /* A stencil resolve operation must be performed prior to doing resource + * copies or used by CPU. + * (see HSD 1209978162) + */ + if (is_render_target && isl_surf_usage_is_stencil(res->surf.usage)) { + *out_aux_usage = ISL_AUX_USAGE_NONE; + } else { + *out_aux_usage = res->aux.usage; + } + break; + default: + *out_aux_usage = ISL_AUX_USAGE_NONE; + break; + } +} + +/** + * Perform a GPU-based raw memory copy between compatible view classes. + * + * Does not perform any flushing - the new data may still be left in the + * render cache, and old data may remain in other caches. + * + * Wraps blorp_copy() and blorp_buffer_copy(). 
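+ *
+ * Buffer-to-buffer copies go through blorp_buffer_copy(); everything else
+ * is copied one slice at a time with blorp_copy().  On gfx4/5 a BLT engine
+ * copy is attempted first via the copy_region_blt() vtbl hook.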
+ */ +void +crocus_copy_region(struct blorp_context *blorp, + struct crocus_batch *batch, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct blorp_batch blorp_batch; + struct crocus_context *ice = blorp->driver_ctx; + struct crocus_screen *screen = (void *) ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_resource *src_res = (void *) src; + struct crocus_resource *dst_res = (void *) dst; + + if (devinfo->ver <= 5) { + if (screen->vtbl.copy_region_blt(batch, dst_res, + dst_level, dstx, dsty, dstz, + src_res, src_level, src_box)) + return; + } + enum isl_aux_usage src_aux_usage, dst_aux_usage; + get_copy_region_aux_settings(src_res, &src_aux_usage, + false); + get_copy_region_aux_settings(dst_res, &dst_aux_usage, + true); + + if (crocus_batch_references(batch, src_res->bo)) + tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format); + + if (dst->target == PIPE_BUFFER) + util_range_add(&dst_res->base, &dst_res->valid_buffer_range, dstx, dstx + src_box->width); + + if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { + struct blorp_address src_addr = { + .buffer = crocus_resource_bo(src), .offset = src_box->x, + }; + struct blorp_address dst_addr = { + .buffer = crocus_resource_bo(dst), .offset = dstx, + .reloc_flags = EXEC_OBJECT_WRITE, + }; + + crocus_batch_maybe_flush(batch, 1500); + + blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); + blorp_buffer_copy(&blorp_batch, src_addr, dst_addr, src_box->width); + blorp_batch_finish(&blorp_batch); + } else { + // XXX: what about one surface being a buffer and not the other? + + struct blorp_surf src_surf, dst_surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf, + src, src_aux_usage, src_level, false); + crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf, + dst, dst_aux_usage, dst_level, true); + + crocus_resource_prepare_access(ice, src_res, src_level, 1, + src_box->z, src_box->depth, + src_aux_usage, false); + crocus_resource_prepare_access(ice, dst_res, dst_level, 1, + dstz, src_box->depth, + dst_aux_usage, false); + + blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); + + for (int slice = 0; slice < src_box->depth; slice++) { + crocus_batch_maybe_flush(batch, 1500); + + blorp_copy(&blorp_batch, &src_surf, src_level, src_box->z + slice, + &dst_surf, dst_level, dstz + slice, + src_box->x, src_box->y, dstx, dsty, + src_box->width, src_box->height); + } + blorp_batch_finish(&blorp_batch); + + crocus_resource_finish_write(ice, dst_res, dst_level, dstz, + src_box->depth, dst_aux_usage); + } + + tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format); +} + +static struct crocus_batch * +get_preferred_batch(struct crocus_context *ice, struct crocus_bo *bo) +{ + /* If the compute batch is already using this buffer, we'd prefer to + * continue queueing in the compute batch. + */ + if (crocus_batch_references(&ice->batches[CROCUS_BATCH_COMPUTE], bo)) + return &ice->batches[CROCUS_BATCH_COMPUTE]; + + /* Otherwise default to the render batch. */ + return &ice->batches[CROCUS_BATCH_RENDER]; +} + + +/** + * The pipe->resource_copy_region() driver hook. + * + * This implements ARB_copy_image semantics - a raw memory copy between + * compatible view classes. 
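+ *
+ * Tiny aligned buffer copies (a multiple of 4 bytes, at most 16 bytes) use
+ * MI_COPY_MEM_MEM instead, and combined depth/stencil destinations get a
+ * second crocus_copy_region() call for the separate stencil resource on
+ * gfx6+.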
+ */ +static void +crocus_resource_copy_region(struct pipe_context *ctx, + struct pipe_resource *p_dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *p_src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_resource *src = (void *) p_src; + struct crocus_resource *dst = (void *) p_dst; + + if (crocus_resource_unfinished_aux_import(src)) + crocus_resource_finish_aux_import(ctx->screen, src); + if (crocus_resource_unfinished_aux_import(dst)) + crocus_resource_finish_aux_import(ctx->screen, dst); + + /* Use MI_COPY_MEM_MEM for tiny (<= 16 byte, % 4) buffer copies. */ + if (p_src->target == PIPE_BUFFER && p_dst->target == PIPE_BUFFER && + (src_box->width % 4 == 0) && src_box->width <= 16 && + screen->vtbl.copy_mem_mem) { + struct crocus_bo *dst_bo = crocus_resource_bo(p_dst); + batch = get_preferred_batch(ice, dst_bo); + crocus_batch_maybe_flush(batch, 24 + 5 * (src_box->width / 4)); + crocus_emit_pipe_control_flush(batch, + "stall for MI_COPY_MEM_MEM copy_region", + PIPE_CONTROL_CS_STALL); + screen->vtbl.copy_mem_mem(batch, dst_bo, dstx, crocus_resource_bo(p_src), + src_box->x, src_box->width); + return; + } + + if (devinfo->ver < 6 && util_format_is_depth_or_stencil(p_dst->format)) { + util_resource_copy_region(ctx, p_dst, dst_level, dstx, dsty, dstz, + p_src, src_level, src_box); + return; + } + crocus_copy_region(&ice->blorp, batch, p_dst, dst_level, dstx, dsty, dstz, + p_src, src_level, src_box); + + if (util_format_is_depth_and_stencil(p_dst->format) && + util_format_has_stencil(util_format_description(p_src->format)) && + devinfo->ver >= 6) { + struct crocus_resource *junk, *s_src_res, *s_dst_res; + crocus_get_depth_stencil_resources(devinfo, p_src, &junk, &s_src_res); + crocus_get_depth_stencil_resources(devinfo, p_dst, &junk, &s_dst_res); + + crocus_copy_region(&ice->blorp, batch, &s_dst_res->base, dst_level, dstx, + dsty, dstz, &s_src_res->base, src_level, src_box); + } + + crocus_flush_and_dirty_for_history(ice, batch, dst, + PIPE_CONTROL_RENDER_TARGET_FLUSH, + "cache history: post copy_region"); +} + +void +crocus_init_blit_functions(struct pipe_context *ctx) +{ + ctx->blit = crocus_blit; + ctx->resource_copy_region = crocus_resource_copy_region; +} diff --git a/src/gallium/drivers/crocus/crocus_blorp.c b/src/gallium/drivers/crocus/crocus_blorp.c new file mode 100644 index 00000000000..75f0078d535 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_blorp.c @@ -0,0 +1,399 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_blorp.c + * + * ============================= GENXML CODE ============================= + * [This file is compiled once per generation.] + * ======================================================================= + * + * GenX specific code for working with BLORP (blitting, resolves, clears + * on the 3D engine). This provides the driver-specific hooks needed to + * implement the BLORP API. + * + * See crocus_blit.c, crocus_clear.c, and so on. + */ + +#include + +#include "crocus_batch.h" +#include "crocus_resource.h" +#include "crocus_context.h" + +#include "util/u_upload_mgr.h" +#include "intel/common/intel_l3_config.h" + +#include "blorp/blorp_genX_exec.h" + +#if GFX_VER <= 5 +#include "gen4_blorp_exec.h" +#endif + +static uint32_t * +stream_state(struct crocus_batch *batch, + unsigned size, + unsigned alignment, + uint32_t *out_offset, + struct crocus_bo **out_bo) +{ + uint32_t offset = ALIGN(batch->state.used, alignment); + + if (offset + size >= STATE_SZ && !batch->no_wrap) { + crocus_batch_flush(batch); + offset = ALIGN(batch->state.used, alignment); + } else if (offset + size >= batch->state.bo->size) { + const unsigned new_size = + MIN2(batch->state.bo->size + batch->state.bo->size / 2, + MAX_STATE_SIZE); + crocus_grow_buffer(batch, true, batch->state.used, new_size); + assert(offset + size < batch->state.bo->size); + } + + crocus_record_state_size(batch->state_sizes, offset, size); + + batch->state.used = offset + size; + *out_offset = offset; + + /* If the caller has asked for a BO, we leave them the responsibility of + * adding bo->gtt_offset (say, by handing an address to genxml). If not, + * we assume they want the offset from a base address. 
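+ *
+ * Either way, the returned pointer is a CPU mapping into the batch state
+ * buffer at *out_offset, ready for the caller to fill in.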
+ */ + if (out_bo) + *out_bo = batch->state.bo; + + return (uint32_t *)batch->state.map + (offset >> 2); +} + +static void * +blorp_emit_dwords(struct blorp_batch *blorp_batch, unsigned n) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + return crocus_get_command_space(batch, n * sizeof(uint32_t)); +} + +static uint64_t +blorp_emit_reloc(struct blorp_batch *blorp_batch, UNUSED void *location, + struct blorp_address addr, uint32_t delta) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + uint32_t offset; + + if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) { + offset = (char *)location - (char *)batch->state.map; + return crocus_state_reloc(batch, offset, + addr.buffer, addr.offset + delta, + addr.reloc_flags); + } + + assert(!crocus_ptr_in_state_buffer(batch, location)); + + offset = (char *)location - (char *)batch->command.map; + return crocus_command_reloc(batch, offset, + addr.buffer, addr.offset + delta, + addr.reloc_flags); +} + +static void +blorp_surface_reloc(struct blorp_batch *blorp_batch, uint32_t ss_offset, + struct blorp_address addr, uint32_t delta) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + struct crocus_bo *bo = addr.buffer; + + uint64_t reloc_val = + crocus_state_reloc(batch, ss_offset, bo, addr.offset + delta, + addr.reloc_flags); + + void *reloc_ptr = (void *)batch->state.map + ss_offset; + *(uint32_t *)reloc_ptr = reloc_val; +} + +static uint64_t +blorp_get_surface_address(struct blorp_batch *blorp_batch, + struct blorp_address addr) +{ + /* We'll let blorp_surface_reloc write the address. */ + return 0ull; +} + +#if GFX_VER >= 7 +static struct blorp_address +blorp_get_surface_base_address(struct blorp_batch *blorp_batch) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + return (struct blorp_address) { + .buffer = batch->state.bo, + .offset = 0 + }; +} +#endif + +static void * +blorp_alloc_dynamic_state(struct blorp_batch *blorp_batch, + uint32_t size, + uint32_t alignment, + uint32_t *offset) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + + return stream_state(batch, size, alignment, offset, NULL); +} + +static void +blorp_alloc_binding_table(struct blorp_batch *blorp_batch, + unsigned num_entries, + unsigned state_size, + unsigned state_alignment, + uint32_t *bt_offset, + uint32_t *surface_offsets, + void **surface_maps) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + uint32_t *bt_map = stream_state(batch, num_entries * sizeof(uint32_t), 32, + bt_offset, NULL); + + for (unsigned i = 0; i < num_entries; i++) { + surface_maps[i] = stream_state(batch, + state_size, state_alignment, + &(surface_offsets)[i], NULL); + bt_map[i] = surface_offsets[i]; + } +} + +static void * +blorp_alloc_vertex_buffer(struct blorp_batch *blorp_batch, + uint32_t size, + struct blorp_address *addr) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + struct crocus_bo *bo; + uint32_t offset; + + void *map = stream_state(batch, size, 64, + &offset, &bo); + + *addr = (struct blorp_address) { + .buffer = bo, + .offset = offset, + .reloc_flags = RELOC_32BIT, +#if GFX_VER >= 7 + .mocs = crocus_mocs(bo, &batch->screen->isl_dev), +#endif + }; + + return map; +} + +/** + */ +static void +blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *blorp_batch, + const struct blorp_address *addrs, + UNUSED uint32_t *sizes, + unsigned num_vbs) +{ +} + +static struct blorp_address +blorp_get_workaround_address(struct blorp_batch *blorp_batch) +{ + struct crocus_batch *batch = 
blorp_batch->driver_batch; + + return (struct blorp_address) { + .buffer = batch->ice->workaround_bo, + .offset = batch->ice->workaround_offset, + }; +} + +static void +blorp_flush_range(UNUSED struct blorp_batch *blorp_batch, + UNUSED void *start, + UNUSED size_t size) +{ + /* All allocated states come from the batch which we will flush before we + * submit it. There's nothing for us to do here. + */ +} + +#if GFX_VER >= 7 +static const struct intel_l3_config * +blorp_get_l3_config(struct blorp_batch *blorp_batch) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + return batch->screen->l3_config_3d; +} +#else /* GFX_VER < 7 */ +static void +blorp_emit_urb_config(struct blorp_batch *blorp_batch, + unsigned vs_entry_size, + UNUSED unsigned sf_entry_size) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; +#if GFX_VER <= 5 + batch->screen->vtbl.calculate_urb_fence(batch, 0, vs_entry_size, sf_entry_size); +#else + genX(upload_urb)(batch, vs_entry_size, false, vs_entry_size); +#endif +} +#endif + +static void +crocus_blorp_exec(struct blorp_batch *blorp_batch, + const struct blorp_params *params) +{ + struct crocus_context *ice = blorp_batch->blorp->driver_ctx; + struct crocus_batch *batch = blorp_batch->driver_batch; + + /* Flush the sampler and render caches. We definitely need to flush the + * sampler cache so that we get updated contents from the render cache for + * the glBlitFramebuffer() source. Also, we are sometimes warned in the + * docs to flush the cache between reinterpretations of the same surface + * data with different formats, which blorp does for stencil and depth + * data. + */ + if (params->src.enabled) + crocus_cache_flush_for_read(batch, params->src.addr.buffer); + if (params->dst.enabled) { + crocus_cache_flush_for_render(batch, params->dst.addr.buffer, + params->dst.view.format, + params->dst.aux_usage); + } + if (params->depth.enabled) + crocus_cache_flush_for_depth(batch, params->depth.addr.buffer); + if (params->stencil.enabled) + crocus_cache_flush_for_depth(batch, params->stencil.addr.buffer); + + crocus_require_command_space(batch, 1400); + crocus_require_statebuffer_space(batch, 600); + batch->no_wrap = true; +#if GFX_VER == 6 + /* Emit workaround flushes when we switch from drawing to blorping. */ + crocus_emit_post_sync_nonzero_flush(batch); +#endif + +#if GFX_VER >= 6 + crocus_emit_depth_stall_flushes(batch); +#endif + + blorp_emit(blorp_batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { + rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1; + rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1; + } + + batch->screen->vtbl.update_surface_base_address(batch); + crocus_handle_always_flush_cache(batch); + + batch->contains_draw = true; + blorp_exec(blorp_batch, params); + + batch->no_wrap = false; + crocus_handle_always_flush_cache(batch); + + /* We've smashed all state compared to what the normal 3D pipeline + * rendering tracks for GL. 
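+ * Flag (nearly) everything dirty again, except for the bits gathered
+ * below, which BLORP either leaves alone or re-emits itself.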
+ */ + + uint64_t skip_bits = (CROCUS_DIRTY_POLYGON_STIPPLE | + CROCUS_DIRTY_GEN7_SO_BUFFERS | + CROCUS_DIRTY_SO_DECL_LIST | + CROCUS_DIRTY_LINE_STIPPLE | + CROCUS_ALL_DIRTY_FOR_COMPUTE | + CROCUS_DIRTY_GEN6_SCISSOR_RECT | + CROCUS_DIRTY_GEN75_VF | + CROCUS_DIRTY_SF_CL_VIEWPORT); + + uint64_t skip_stage_bits = (CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE | + CROCUS_STAGE_DIRTY_UNCOMPILED_VS | + CROCUS_STAGE_DIRTY_UNCOMPILED_TCS | + CROCUS_STAGE_DIRTY_UNCOMPILED_TES | + CROCUS_STAGE_DIRTY_UNCOMPILED_GS | + CROCUS_STAGE_DIRTY_UNCOMPILED_FS | + CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS | + CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS | + CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES | + CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS); + + if (!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) { + /* BLORP disabled tessellation, that's fine for the next draw */ + skip_stage_bits |= CROCUS_STAGE_DIRTY_TCS | + CROCUS_STAGE_DIRTY_TES | + CROCUS_STAGE_DIRTY_CONSTANTS_TCS | + CROCUS_STAGE_DIRTY_CONSTANTS_TES | + CROCUS_STAGE_DIRTY_BINDINGS_TCS | + CROCUS_STAGE_DIRTY_BINDINGS_TES; + } + + if (!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) { + /* BLORP disabled geometry shaders, that's fine for the next draw */ + skip_stage_bits |= CROCUS_STAGE_DIRTY_GS | + CROCUS_STAGE_DIRTY_CONSTANTS_GS | + CROCUS_STAGE_DIRTY_BINDINGS_GS; + } + + /* we can skip flagging CROCUS_DIRTY_DEPTH_BUFFER, if + * BLORP_BATCH_NO_EMIT_DEPTH_STENCIL is set. + */ + if (blorp_batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) + skip_bits |= CROCUS_DIRTY_DEPTH_BUFFER; + + if (!params->wm_prog_data) + skip_bits |= CROCUS_DIRTY_GEN6_BLEND_STATE; + + ice->state.dirty |= ~skip_bits; + ice->state.stage_dirty |= ~skip_stage_bits; + + ice->urb.vsize = 0; + ice->urb.gs_present = false; + ice->urb.gsize = 0; + ice->urb.tess_present = false; + ice->urb.hsize = 0; + ice->urb.dsize = 0; + + if (params->dst.enabled) { + crocus_render_cache_add_bo(batch, params->dst.addr.buffer, + params->dst.view.format, + params->dst.aux_usage); + } + if (params->depth.enabled) + crocus_depth_cache_add_bo(batch, params->depth.addr.buffer); + if (params->stencil.enabled) + crocus_depth_cache_add_bo(batch, params->stencil.addr.buffer); +} + +static void +blorp_measure_start(struct blorp_batch *blorp_batch, + const struct blorp_params *params) +{ +} + +void +genX(init_blorp)(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + + blorp_init(&ice->blorp, ice, &screen->isl_dev); + ice->blorp.compiler = screen->compiler; + ice->blorp.lookup_shader = crocus_blorp_lookup_shader; + ice->blorp.upload_shader = crocus_blorp_upload_shader; + ice->blorp.exec = crocus_blorp_exec; +} diff --git a/src/gallium/drivers/crocus/crocus_blt.c b/src/gallium/drivers/crocus/crocus_blt.c new file mode 100644 index 00000000000..d27891352bd --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_blt.c @@ -0,0 +1,337 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* blt command encoding for gen4/5 */ +#include "crocus_context.h" + +#include "crocus_genx_macros.h" +#include "crocus_genx_protos.h" +#include "crocus_resource.h" + +#define FILE_DEBUG_FLAG DEBUG_BLIT + +#if GFX_VER <= 5 + +static bool validate_blit_for_blt(struct crocus_batch *batch, + const struct pipe_blit_info *info) +{ + /* If the source and destination are the same size with no mirroring, + * the rectangles are within the size of the texture and there is no + * scissor, then we can probably use the blit engine. + */ + if (info->dst.box.width != info->src.box.width || + info->dst.box.height != info->src.box.height) + return false; + + if (info->scissor_enable) + return false; + + if (info->dst.box.height < 0 || info->src.box.height < 0) + return false; + + if (info->dst.box.depth > 1 || info->src.box.depth > 1) + return false; + + return true; +} + +static inline int crocus_resource_blt_pitch(struct crocus_resource *res) +{ + int pitch = res->surf.row_pitch_B; + if (res->surf.tiling != ISL_TILING_LINEAR) + pitch /= 4; + return pitch; +} + +static uint32_t +color_depth_for_cpp(int cpp) +{ + switch (cpp) { + case 4: return COLOR_DEPTH__32bit; + case 2: return COLOR_DEPTH__565; + case 1: return COLOR_DEPTH__8bit; + default: + unreachable("not reached"); + } +} + +static bool emit_copy_blt(struct crocus_batch *batch, + struct crocus_resource *src, + struct crocus_resource *dst, + unsigned cpp, + int32_t src_pitch, + unsigned src_offset, + int32_t dst_pitch, + unsigned dst_offset, + uint16_t src_x, uint16_t src_y, + uint16_t dst_x, uint16_t dst_y, + uint16_t w, uint16_t h) + +{ + uint32_t src_tile_w, src_tile_h; + uint32_t dst_tile_w, dst_tile_h; + int dst_y2 = dst_y + h; + int dst_x2 = dst_x + w; + + DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", + __func__, + src, src_pitch, src_offset, src_x, src_y, + dst, dst_pitch, dst_offset, dst_x, dst_y, w, h); + + isl_get_tile_dims(src->surf.tiling, cpp, &src_tile_w, &src_tile_h); + isl_get_tile_dims(dst->surf.tiling, cpp, &dst_tile_w, &dst_tile_h); + + /* For Tiled surfaces, the pitch has to be a multiple of the Tile width + * (X direction width of the Tile). This is ensured while allocating the + * buffer object. + */ + assert(src->surf.tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0); + assert(dst->surf.tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0); + + /* For big formats (such as floating point), do the copy using 16 or + * 32bpp and multiply the coordinates. + */ + if (cpp > 4) { + if (cpp % 4 == 2) { + dst_x *= cpp / 2; + dst_x2 *= cpp / 2; + src_x *= cpp / 2; + cpp = 2; + } else { + assert(cpp % 4 == 0); + dst_x *= cpp / 4; + dst_x2 *= cpp / 4; + src_x *= cpp / 4; + cpp = 4; + } + } + + /* For tiled source and destination, pitch value should be specified + * as a number of Dwords. 
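+ * (i.e. the byte pitch divided by four), which is what the divisions
+ * below do.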
+ */ + if (dst->surf.tiling != ISL_TILING_LINEAR) + dst_pitch /= 4; + + if (src->surf.tiling != ISL_TILING_LINEAR) + src_pitch /= 4; + + assert(cpp <= 4); + crocus_emit_cmd(batch, GENX(XY_SRC_COPY_BLT), xyblt) { + xyblt.RasterOperation = 0xCC; + xyblt.DestinationTilingEnable = dst->surf.tiling != ISL_TILING_LINEAR; + xyblt.SourceTilingEnable = src->surf.tiling != ISL_TILING_LINEAR; + xyblt.SourceBaseAddress = ro_bo(src->bo, src_offset); + xyblt.DestinationBaseAddress = rw_bo(dst->bo, dst_offset); + xyblt.ColorDepth = color_depth_for_cpp(cpp); + xyblt._32bppByteMask = cpp == 4 ? 0x3 : 0x1; + xyblt.DestinationX1Coordinate = dst_x; + xyblt.DestinationY1Coordinate = dst_y; + xyblt.DestinationX2Coordinate = dst_x2; + xyblt.DestinationY2Coordinate = dst_y2; + xyblt.DestinationPitch = dst_pitch; + xyblt.SourceX1Coordinate = src_x; + xyblt.SourceY1Coordinate = src_y; + xyblt.SourcePitch = src_pitch; + }; + + crocus_emit_mi_flush(batch); + return true; +} + +static bool crocus_emit_blt(struct crocus_batch *batch, + struct crocus_resource *src, + struct crocus_resource *dst, + unsigned dst_level, + unsigned dst_x, unsigned dst_y, + unsigned dst_z, + unsigned src_level, + const struct pipe_box *src_box) +{ + const struct isl_format_layout *src_fmtl = isl_format_get_layout(src->surf.format); + unsigned src_cpp = src_fmtl->bpb / 8; + const struct isl_format_layout *dst_fmtl = isl_format_get_layout(dst->surf.format); + const unsigned dst_cpp = dst_fmtl->bpb / 8; + uint16_t src_x, src_y; + uint32_t src_image_x, src_image_y, dst_image_x, dst_image_y; + uint32_t src_width = src_box->width, src_height = src_box->height; + + /* gen4/5 can't handle Y tiled blits. */ + if (src->surf.tiling == ISL_TILING_Y0 || dst->surf.tiling == ISL_TILING_Y0) + return false; + + if (src->surf.format != dst->surf.format) + return false; + + if (src_cpp != dst_cpp) + return false; + + src_x = src_box->x; + src_y = src_box->y; + + assert(src_cpp == dst_cpp); + + crocus_resource_get_image_offset(src, src_level, src_box->z, &src_image_x, + &src_image_y); + if (util_format_is_compressed(src->base.format)) { + int bw = util_format_get_blockwidth(src->base.format); + int bh = util_format_get_blockheight(src->base.format); + assert(src_x % bw == 0); + assert(src_y % bh == 0); + src_x /= (int)bw; + src_y /= (int)bh; + src_width = DIV_ROUND_UP(src_width, (int)bw); + src_height = DIV_ROUND_UP(src_height, (int)bh); + } + + crocus_resource_get_image_offset(dst, dst_level, dst_z, &dst_image_x, + &dst_image_y); + if (util_format_is_compressed(dst->base.format)) { + int bw = util_format_get_blockwidth(dst->base.format); + int bh = util_format_get_blockheight(dst->base.format); + assert(dst_x % bw == 0); + assert(dst_y % bh == 0); + dst_x /= (int)bw; + dst_y /= (int)bh; + } + src_x += src_image_x; + src_y += src_image_y; + dst_x += dst_image_x; + dst_y += dst_image_y; + + /* According to the Ivy Bridge PRM, Vol1 Part4, section 1.2.1.2 (Graphics + * Data Size Limitations): + * + * The BLT engine is capable of transferring very large quantities of + * graphics data. Any graphics data read from and written to the + * destination is permitted to represent a number of pixels that + * occupies up to 65,536 scan lines and up to 32,768 bytes per scan line + * at the destination. The maximum number of pixels that may be + * represented per scan line’s worth of graphics data depends on the + * color depth. + * + * The blitter's pitch is a signed 16-bit integer, but measured in bytes + * for linear surfaces and DWords for tiled surfaces. 
So the maximum + * pitch is 32k linear and 128k tiled. + */ + if (crocus_resource_blt_pitch(src) >= 32768 || + crocus_resource_blt_pitch(dst) >= 32768) { + return false; + } + + /* We need to split the blit into chunks that each fit within the blitter's + * restrictions. We can't use a chunk size of 32768 because we need to + * ensure that src_tile_x + chunk_size fits. We choose 16384 because it's + * a nice round power of two, big enough that performance won't suffer, and + * small enough to guarantee everything fits. + */ + const uint32_t max_chunk_size = 16384; + + for (uint32_t chunk_x = 0; chunk_x < src_width; chunk_x += max_chunk_size) { + for (uint32_t chunk_y = 0; chunk_y < src_height; chunk_y += max_chunk_size) { + const uint32_t chunk_w = MIN2(max_chunk_size, src_width - chunk_x); + const uint32_t chunk_h = MIN2(max_chunk_size, src_height - chunk_y); + + ASSERTED uint32_t z_offset_el, array_offset; + uint32_t src_offset, src_tile_x, src_tile_y; + isl_tiling_get_intratile_offset_el(src->surf.tiling, + src_cpp * 8, src->surf.row_pitch_B, + src->surf.array_pitch_el_rows, + src_x + chunk_x, src_y + chunk_y, 0, 0, + &src_offset, + &src_tile_x, &src_tile_y, + &z_offset_el, &array_offset); + assert(z_offset_el == 0); + assert(array_offset == 0); + + uint32_t dst_offset, dst_tile_x, dst_tile_y; + isl_tiling_get_intratile_offset_el(dst->surf.tiling, + dst_cpp * 8, dst->surf.row_pitch_B, + dst->surf.array_pitch_el_rows, + dst_x + chunk_x, dst_y + chunk_y, 0, 0, + &dst_offset, + &dst_tile_x, &dst_tile_y, + &z_offset_el, &array_offset); + assert(z_offset_el == 0); + assert(array_offset == 0); + if (!emit_copy_blt(batch, src, dst, + src_cpp, src->surf.row_pitch_B, + src_offset, + dst->surf.row_pitch_B, dst_offset, + src_tile_x, src_tile_y, + dst_tile_x, dst_tile_y, + chunk_w, chunk_h)) { + return false; + } + } + } + return true; +} + +static bool crocus_blit_blt(struct crocus_batch *batch, + const struct pipe_blit_info *info) +{ + if (!validate_blit_for_blt(batch, info)) + return false; + + return crocus_emit_blt(batch, + (struct crocus_resource *)info->src.resource, + (struct crocus_resource *)info->dst.resource, + info->dst.level, + info->dst.box.x, + info->dst.box.y, + info->dst.box.z, + info->src.level, + &info->src.box); +} + + +static bool crocus_copy_region_blt(struct crocus_batch *batch, + struct crocus_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct crocus_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + if (dst->base.target == PIPE_BUFFER || src->base.target == PIPE_BUFFER) + return false; + return crocus_emit_blt(batch, + src, + dst, + dst_level, + dstx, dsty, dstz, + src_level, + src_box); +} +#endif + +void +genX(init_blt)(struct crocus_screen *screen) +{ +#if GFX_VER <= 5 + screen->vtbl.blit_blt = crocus_blit_blt; + screen->vtbl.copy_region_blt = crocus_copy_region_blt; +#else + screen->vtbl.blit_blt = NULL; + screen->vtbl.copy_region_blt = NULL; +#endif +} diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.c b/src/gallium/drivers/crocus/crocus_bufmgr.c new file mode 100644 index 00000000000..caca821cd7e --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_bufmgr.c @@ -0,0 +1,1689 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, 
publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_bufmgr.c + * + * The crocus buffer manager. + * + * XXX: write better comments + * - BOs + * - Explain BO cache + * - main interface to GEM in the kernel + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "errno.h" +#include "common/intel_clflush.h" +#include "dev/intel_debug.h" +#include "common/intel_gem.h" +#include "dev/intel_device_info.h" +#include "main/macros.h" +#include "util/debug.h" +#include "util/macros.h" +#include "util/hash_table.h" +#include "util/list.h" +#include "util/os_file.h" +#include "util/u_dynarray.h" +#include "util/vma.h" +#include "crocus_bufmgr.h" +#include "crocus_context.h" +#include "string.h" + +#include "drm-uapi/i915_drm.h" + +#ifdef HAVE_VALGRIND +#include +#include +#define VG(x) x +#else +#define VG(x) +#endif + +/** + * For debugging purposes, this returns a time in seconds. + */ +static double +get_time(void) +{ + struct timespec tp; + + clock_gettime(CLOCK_MONOTONIC, &tp); + + return tp.tv_sec + tp.tv_nsec / 1000000000.0; +} + +/* VALGRIND_FREELIKE_BLOCK unfortunately does not actually undo the earlier + * VALGRIND_MALLOCLIKE_BLOCK but instead leaves vg convinced the memory is + * leaked. All because it does not call VG(cli_free) from its + * VG_USERREQ__FREELIKE_BLOCK handler. Instead of treating the memory like + * and allocation, we mark it available for use upon mmapping and remove + * it upon unmapping. + */ +#define VG_DEFINED(ptr, size) VG(VALGRIND_MAKE_MEM_DEFINED(ptr, size)) +#define VG_NOACCESS(ptr, size) VG(VALGRIND_MAKE_MEM_NOACCESS(ptr, size)) + +#define PAGE_SIZE 4096 + +#define WARN_ONCE(cond, fmt...) do { \ + if (unlikely(cond)) { \ + static bool _warned = false; \ + if (!_warned) { \ + fprintf(stderr, "WARNING: "); \ + fprintf(stderr, fmt); \ + _warned = true; \ + } \ + } \ +} while (0) + +#define FILE_DEBUG_FLAG DEBUG_BUFMGR + +static inline int +atomic_add_unless(int *v, int add, int unless) +{ + int c, old; + c = p_atomic_read(v); + while (c != unless && (old = p_atomic_cmpxchg(v, c, c + add)) != c) + c = old; + return c == unless; +} + +struct bo_cache_bucket { + /** List of cached BOs. */ + struct list_head head; + + /** Size of this bucket, in bytes. */ + uint64_t size; +}; + +struct bo_export { + /** File descriptor associated with a handle export. */ + int drm_fd; + + /** GEM handle in drm_fd */ + uint32_t gem_handle; + + struct list_head link; +}; + +struct crocus_bufmgr { + /** + * List into the list of bufmgr. 
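+ * (global_bufmgr_list below), presumably so that screens opening the same
+ * DRM fd can find and share an existing bufmgr rather than creating a new
+ * one.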
+ */ + struct list_head link; + + uint32_t refcount; + + int fd; + + mtx_t lock; + + /** Array of lists of cached gem objects of power-of-two sizes */ + struct bo_cache_bucket cache_bucket[14 * 4]; + int num_buckets; + time_t time; + + struct hash_table *name_table; + struct hash_table *handle_table; + + /** + * List of BOs which we've effectively freed, but are hanging on to + * until they're idle before closing and returning the VMA. + */ + struct list_head zombie_list; + + bool has_llc:1; + bool has_mmap_offset:1; + bool has_tiling_uapi:1; + bool bo_reuse:1; +}; + +static mtx_t global_bufmgr_list_mutex = _MTX_INITIALIZER_NP; +static struct list_head global_bufmgr_list = { + .next = &global_bufmgr_list, + .prev = &global_bufmgr_list, +}; + +static int bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode, + uint32_t stride); + +static void bo_free(struct crocus_bo *bo); + +static uint32_t +key_hash_uint(const void *key) +{ + return _mesa_hash_data(key, 4); +} + +static bool +key_uint_equal(const void *a, const void *b) +{ + return *((unsigned *) a) == *((unsigned *) b); +} + +static struct crocus_bo * +find_and_ref_external_bo(struct hash_table *ht, unsigned int key) +{ + struct hash_entry *entry = _mesa_hash_table_search(ht, &key); + struct crocus_bo *bo = entry ? entry->data : NULL; + + if (bo) { + assert(bo->external); + assert(!bo->reusable); + + /* Being non-reusable, the BO cannot be in the cache lists, but it + * may be in the zombie list if it had reached zero references, but + * we hadn't yet closed it...and then reimported the same BO. If it + * is, then remove it since it's now been resurrected. + */ + if (bo->head.prev || bo->head.next) + list_del(&bo->head); + + crocus_bo_reference(bo); + } + + return bo; +} + +/** + * This function finds the correct bucket fit for the input size. + * The function works with O(1) complexity when the requested size + * was queried instead of iterating the size through all the buckets. + */ +static struct bo_cache_bucket * +bucket_for_size(struct crocus_bufmgr *bufmgr, uint64_t size) +{ + /* Calculating the pages and rounding up to the page size. */ + const unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + + /* Row Bucket sizes clz((x-1) | 3) Row Column + * in pages stride size + * 0: 1 2 3 4 -> 30 30 30 30 4 1 + * 1: 5 6 7 8 -> 29 29 29 29 4 1 + * 2: 10 12 14 16 -> 28 28 28 28 8 2 + * 3: 20 24 28 32 -> 27 27 27 27 16 4 + */ + const unsigned row = 30 - __builtin_clz((pages - 1) | 3); + const unsigned row_max_pages = 4 << row; + + /* The '& ~2' is the special case for row 1. In row 1, max pages / + * 2 is 2, but the previous row maximum is zero (because there is + * no previous row). All row maximum sizes are power of 2, so that + * is the only case where that bit will be set. + */ + const unsigned prev_row_max_pages = (row_max_pages / 2) & ~2; + int col_size_log2 = row - 1; + col_size_log2 += (col_size_log2 < 0); + + const unsigned col = (pages - prev_row_max_pages + + ((1 << col_size_log2) - 1)) >> col_size_log2; + + /* Calculating the index based on the row and column. */ + const unsigned index = (row * 4) + (col - 1); + + return (index < bufmgr->num_buckets) ? 
+ &bufmgr->cache_bucket[index] : NULL; +} + + +int +crocus_bo_busy(struct crocus_bo *bo) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + struct drm_i915_gem_busy busy = { .handle = bo->gem_handle }; + + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_BUSY, &busy); + if (ret == 0) { + bo->idle = !busy.busy; + return busy.busy; + } + return false; +} + +int +crocus_bo_madvise(struct crocus_bo *bo, int state) +{ + struct drm_i915_gem_madvise madv = { + .handle = bo->gem_handle, + .madv = state, + .retained = 1, + }; + + intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv); + + return madv.retained; +} + +static struct crocus_bo * +bo_calloc(void) +{ + struct crocus_bo *bo = calloc(1, sizeof(*bo)); + if (!bo) + return NULL; + + list_inithead(&bo->exports); + bo->hash = _mesa_hash_pointer(bo); + return bo; +} + +static struct crocus_bo * +alloc_bo_from_cache(struct crocus_bufmgr *bufmgr, + struct bo_cache_bucket *bucket, + uint32_t alignment, + unsigned flags) +{ + if (!bucket) + return NULL; + + struct crocus_bo *bo = NULL; + + list_for_each_entry_safe(struct crocus_bo, cur, &bucket->head, head) { + /* If the last BO in the cache is busy, there are no idle BOs. Bail, + * either falling back to a non-matching memzone, or if that fails, + * allocating a fresh buffer. + */ + if (crocus_bo_busy(cur)) + return NULL; + + list_del(&cur->head); + + /* Tell the kernel we need this BO. If it still exists, we're done! */ + if (crocus_bo_madvise(cur, I915_MADV_WILLNEED)) { + bo = cur; + break; + } + + /* This BO was purged, throw it out and keep looking. */ + bo_free(cur); + } + + if (!bo) + return NULL; + + /* Zero the contents if necessary. If this fails, fall back to + * allocating a fresh BO, which will always be zeroed by the kernel. + */ + if (flags & BO_ALLOC_ZEROED) { + void *map = crocus_bo_map(NULL, bo, MAP_WRITE | MAP_RAW); + if (map) { + memset(map, 0, bo->size); + } else { + bo_free(bo); + return NULL; + } + } + + return bo; +} + +static struct crocus_bo * +alloc_fresh_bo(struct crocus_bufmgr *bufmgr, uint64_t bo_size) +{ + struct crocus_bo *bo = bo_calloc(); + if (!bo) + return NULL; + + struct drm_i915_gem_create create = { .size = bo_size }; + + /* All new BOs we get from the kernel are zeroed, so we don't need to + * worry about that here. + */ + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CREATE, &create) != 0) { + free(bo); + return NULL; + } + + bo->gem_handle = create.handle; + bo->bufmgr = bufmgr; + bo->size = bo_size; + bo->idle = true; + bo->tiling_mode = I915_TILING_NONE; + bo->swizzle_mode = I915_BIT_6_SWIZZLE_NONE; + bo->stride = 0; + + /* Calling set_domain() will allocate pages for the BO outside of the + * struct mutex lock in the kernel, which is more efficient than waiting + * to create them during the first execbuf that uses the BO. + */ + struct drm_i915_gem_set_domain sd = { + .handle = bo->gem_handle, + .read_domains = I915_GEM_DOMAIN_CPU, + .write_domain = 0, + }; + + if (intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) { + bo_free(bo); + return NULL; + } + + return bo; +} + +static struct crocus_bo * +bo_alloc_internal(struct crocus_bufmgr *bufmgr, + const char *name, + uint64_t size, + uint32_t alignment, + unsigned flags, + uint32_t tiling_mode, + uint32_t stride) +{ + struct crocus_bo *bo; + unsigned int page_size = getpagesize(); + struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size); + + /* Round the size up to the bucket size, or if we don't have caching + * at this size, a multiple of the page size. 
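+ * (Bucket sizes follow the 1 2 3 4, 5 6 7 8, 10 12 14 16, ... pages
+ * progression laid out in the bucket_for_size() comment above.)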
+ */ + uint64_t bo_size = + bucket ? bucket->size : MAX2(ALIGN(size, page_size), page_size); + + mtx_lock(&bufmgr->lock); + + /* Get a buffer out of the cache if available. First, we try to find + * one with a matching memory zone so we can avoid reallocating VMA. + */ + bo = alloc_bo_from_cache(bufmgr, bucket, alignment, flags); + + mtx_unlock(&bufmgr->lock); + + if (!bo) { + bo = alloc_fresh_bo(bufmgr, bo_size); + if (!bo) + return NULL; + } + + if (bo_set_tiling_internal(bo, tiling_mode, stride)) + goto err_free; + + bo->name = name; + p_atomic_set(&bo->refcount, 1); + bo->reusable = bucket && bufmgr->bo_reuse; + bo->cache_coherent = bufmgr->has_llc; + bo->index = -1; + bo->kflags = 0; + + if ((flags & BO_ALLOC_COHERENT) && !bo->cache_coherent) { + struct drm_i915_gem_caching arg = { + .handle = bo->gem_handle, + .caching = 1, + }; + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) == 0) { + bo->cache_coherent = true; + bo->reusable = false; + } + } + + DBG("bo_create: buf %d (%s) %llub\n", bo->gem_handle, + bo->name, (unsigned long long) size); + + return bo; + +err_free: + bo_free(bo); + return NULL; +} + +struct crocus_bo * +crocus_bo_alloc(struct crocus_bufmgr *bufmgr, + const char *name, + uint64_t size) +{ + return bo_alloc_internal(bufmgr, name, size, 1, + 0, I915_TILING_NONE, 0); +} + +struct crocus_bo * +crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr, const char *name, + uint64_t size, uint32_t alignment, + uint32_t tiling_mode, uint32_t pitch, unsigned flags) +{ + return bo_alloc_internal(bufmgr, name, size, alignment, + flags, tiling_mode, pitch); +} + +struct crocus_bo * +crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr, const char *name, + void *ptr, size_t size) +{ + struct crocus_bo *bo; + + bo = bo_calloc(); + if (!bo) + return NULL; + + struct drm_i915_gem_userptr arg = { + .user_ptr = (uintptr_t)ptr, + .user_size = size, + }; + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_USERPTR, &arg)) + goto err_free; + bo->gem_handle = arg.handle; + + /* Check the buffer for validity before we try and use it in a batch */ + struct drm_i915_gem_set_domain sd = { + .handle = bo->gem_handle, + .read_domains = I915_GEM_DOMAIN_CPU, + }; + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd)) + goto err_close; + + bo->name = name; + bo->size = size; + bo->map_cpu = ptr; + + bo->bufmgr = bufmgr; + bo->kflags = 0; + + if (bo->gtt_offset == 0ull) + goto err_close; + + p_atomic_set(&bo->refcount, 1); + bo->userptr = true; + bo->cache_coherent = true; + bo->index = -1; + bo->idle = true; + + return bo; + +err_close: + intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &bo->gem_handle); +err_free: + free(bo); + return NULL; +} + +/** + * Returns a crocus_bo wrapping the given buffer object handle. + * + * This can be used when one application needs to pass a buffer object + * to another. + */ +struct crocus_bo * +crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr, + const char *name, unsigned int handle) +{ + struct crocus_bo *bo; + + /* At the moment most applications only have a few named bo. + * For instance, in a DRI client only the render buffers passed + * between X and the client are named. And since X returns the + * alternating names for the front/back buffer a linear search + * provides a sufficiently fast match. 
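+ *
+ * (In this driver the lookup below actually goes through the name_table
+ * hash via find_and_ref_external_bo() rather than a literal linear walk,
+ * but the point about the small number of named BOs still stands.)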
+ */ + mtx_lock(&bufmgr->lock); + bo = find_and_ref_external_bo(bufmgr->name_table, handle); + if (bo) + goto out; + + struct drm_gem_open open_arg = { .name = handle }; + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_OPEN, &open_arg); + if (ret != 0) { + DBG("Couldn't reference %s handle 0x%08x: %s\n", + name, handle, strerror(errno)); + bo = NULL; + goto out; + } + /* Now see if someone has used a prime handle to get this + * object from the kernel before by looking through the list + * again for a matching gem_handle + */ + bo = find_and_ref_external_bo(bufmgr->handle_table, open_arg.handle); + if (bo) + goto out; + + bo = bo_calloc(); + if (!bo) + goto out; + + p_atomic_set(&bo->refcount, 1); + + bo->size = open_arg.size; + bo->gtt_offset = 0; + bo->bufmgr = bufmgr; + bo->gem_handle = open_arg.handle; + bo->name = name; + bo->global_name = handle; + bo->reusable = false; + bo->external = true; + bo->kflags = 0; + + _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); + _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo); + + struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle }; + ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling); + if (ret != 0) + goto err_unref; + + bo->tiling_mode = get_tiling.tiling_mode; + bo->swizzle_mode = get_tiling.swizzle_mode; + /* XXX stride is unknown */ + DBG("bo_create_from_handle: %d (%s)\n", handle, bo->name); + +out: + mtx_unlock(&bufmgr->lock); + return bo; + +err_unref: + bo_free(bo); + mtx_unlock(&bufmgr->lock); + return NULL; +} + +static void +bo_close(struct crocus_bo *bo) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (bo->external) { + struct hash_entry *entry; + + if (bo->global_name) { + entry = _mesa_hash_table_search(bufmgr->name_table, &bo->global_name); + _mesa_hash_table_remove(bufmgr->name_table, entry); + } + + entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle); + _mesa_hash_table_remove(bufmgr->handle_table, entry); + } + + /* Close this object */ + struct drm_gem_close close = { .handle = bo->gem_handle }; + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &close); + if (ret != 0) { + DBG("DRM_IOCTL_GEM_CLOSE %d failed (%s): %s\n", + bo->gem_handle, bo->name, strerror(errno)); + } + + free(bo); +} + +static void +bo_free(struct crocus_bo *bo) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (bo->map_cpu && !bo->userptr) { + VG_NOACCESS(bo->map_cpu, bo->size); + munmap(bo->map_cpu, bo->size); + } + if (bo->map_wc) { + VG_NOACCESS(bo->map_wc, bo->size); + munmap(bo->map_wc, bo->size); + } + if (bo->map_gtt) { + VG_NOACCESS(bo->map_gtt, bo->size); + munmap(bo->map_gtt, bo->size); + } + + if (bo->idle) { + bo_close(bo); + } else { + /* Defer closing the GEM BO and returning the VMA for reuse until the + * BO is idle. Just move it to the dead list for now. + */ + list_addtail(&bo->head, &bufmgr->zombie_list); + } +} + +/** Frees all cached buffers significantly older than @time. 
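+ *
+ * "Significantly older" here means a free_time more than one second before
+ * @time; younger buffers stay in their bucket.  Zombie BOs (freed while
+ * still busy on the GPU) are also closed here once they have gone idle.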
*/ +static void +cleanup_bo_cache(struct crocus_bufmgr *bufmgr, time_t time) +{ + int i; + + if (bufmgr->time == time) + return; + + for (i = 0; i < bufmgr->num_buckets; i++) { + struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i]; + + list_for_each_entry_safe(struct crocus_bo, bo, &bucket->head, head) { + if (time - bo->free_time <= 1) + break; + + list_del(&bo->head); + + bo_free(bo); + } + } + + list_for_each_entry_safe(struct crocus_bo, bo, &bufmgr->zombie_list, head) { + /* Stop once we reach a busy BO - all others past this point were + * freed more recently so are likely also busy. + */ + if (!bo->idle && crocus_bo_busy(bo)) + break; + + list_del(&bo->head); + bo_close(bo); + } + + bufmgr->time = time; +} + +static void +bo_unreference_final(struct crocus_bo *bo, time_t time) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + struct bo_cache_bucket *bucket; + + DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name); + + bucket = NULL; + if (bo->reusable) + bucket = bucket_for_size(bufmgr, bo->size); + /* Put the buffer into our internal cache for reuse if we can. */ + if (bucket && crocus_bo_madvise(bo, I915_MADV_DONTNEED)) { + bo->free_time = time; + bo->name = NULL; + + list_addtail(&bo->head, &bucket->head); + } else { + bo_free(bo); + } +} + +void +crocus_bo_unreference(struct crocus_bo *bo) +{ + if (bo == NULL) + return; + + assert(p_atomic_read(&bo->refcount) > 0); + + if (atomic_add_unless(&bo->refcount, -1, 1)) { + struct crocus_bufmgr *bufmgr = bo->bufmgr; + struct timespec time; + + clock_gettime(CLOCK_MONOTONIC, &time); + + mtx_lock(&bufmgr->lock); + + if (p_atomic_dec_zero(&bo->refcount)) { + bo_unreference_final(bo, time.tv_sec); + cleanup_bo_cache(bufmgr, time.tv_sec); + } + + mtx_unlock(&bufmgr->lock); + } +} + +static void +bo_wait_with_stall_warning(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, + const char *action) +{ + bool busy = dbg && !bo->idle; + double elapsed = unlikely(busy) ? -get_time() : 0.0; + + crocus_bo_wait_rendering(bo); + + if (unlikely(busy)) { + elapsed += get_time(); + if (elapsed > 1e-5) /* 0.01ms */ { + perf_debug(dbg, "%s a busy \"%s\" BO stalled and took %.03f ms.\n", + action, bo->name, elapsed * 1000); + } + } +} + +static void +print_flags(unsigned flags) +{ + if (flags & MAP_READ) + DBG("READ "); + if (flags & MAP_WRITE) + DBG("WRITE "); + if (flags & MAP_ASYNC) + DBG("ASYNC "); + if (flags & MAP_PERSISTENT) + DBG("PERSISTENT "); + if (flags & MAP_COHERENT) + DBG("COHERENT "); + if (flags & MAP_RAW) + DBG("RAW "); + DBG("\n"); +} + +static void * +crocus_bo_gem_mmap_legacy(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, bool wc) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + struct drm_i915_gem_mmap mmap_arg = { + .handle = bo->gem_handle, + .size = bo->size, + .flags = wc ? I915_MMAP_WC : 0, + }; + + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg); + if (ret != 0) { + DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + void *map = (void *) (uintptr_t) mmap_arg.addr_ptr; + + return map; +} + +static void * +crocus_bo_gem_mmap_offset(struct pipe_debug_callback *dbg, struct crocus_bo *bo, + bool wc) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + struct drm_i915_gem_mmap_offset mmap_arg = { + .handle = bo->gem_handle, + .flags = wc ? 
I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB, + }; + + /* Get the fake offset back */ + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &mmap_arg); + if (ret != 0) { + DBG("%s:%d: Error preparing buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + /* And map it */ + void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, + bufmgr->fd, mmap_arg.offset); + if (map == MAP_FAILED) { + DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + return map; +} + +static void * +crocus_bo_gem_mmap(struct pipe_debug_callback *dbg, struct crocus_bo *bo, bool wc) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (bufmgr->has_mmap_offset) + return crocus_bo_gem_mmap_offset(dbg, bo, wc); + else + return crocus_bo_gem_mmap_legacy(dbg, bo, wc); +} + +static void * +crocus_bo_map_cpu(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags) +{ + /* We disallow CPU maps for writing to non-coherent buffers, as the + * CPU map can become invalidated when a batch is flushed out, which + * can happen at unpredictable times. You should use WC maps instead. + */ + assert(bo->cache_coherent || !(flags & MAP_WRITE)); + + if (!bo->map_cpu) { + DBG("crocus_bo_map_cpu: %d (%s)\n", bo->gem_handle, bo->name); + + void *map = crocus_bo_gem_mmap(dbg, bo, false); + if (!map) { + return NULL; + } + + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_cpu, NULL, map)) { + VG_NOACCESS(map, bo->size); + munmap(map, bo->size); + } + } + assert(bo->map_cpu); + + DBG("crocus_bo_map_cpu: %d (%s) -> %p, ", bo->gem_handle, bo->name, + bo->map_cpu); + print_flags(flags); + + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(dbg, bo, "CPU mapping"); + } + + if (!bo->cache_coherent && !bo->bufmgr->has_llc) { + /* If we're reusing an existing CPU mapping, the CPU caches may + * contain stale data from the last time we read from that mapping. + * (With the BO cache, it might even be data from a previous buffer!) + * Even if it's a brand new mapping, the kernel may have zeroed the + * buffer via CPU writes. + * + * We need to invalidate those cachelines so that we see the latest + * contents, and so long as we only read from the CPU mmap we do not + * need to write those cachelines back afterwards. + * + * On LLC, the emprical evidence suggests that writes from the GPU + * that bypass the LLC (i.e. for scanout) do *invalidate* the CPU + * cachelines. (Other reads, such as the display engine, bypass the + * LLC entirely requiring us to keep dirty pixels for the scanout + * out of any cache.) + */ + intel_invalidate_range(bo->map_cpu, bo->size); + } + + return bo->map_cpu; +} + +static void * +crocus_bo_map_wc(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags) +{ + if (!bo->map_wc) { + DBG("crocus_bo_map_wc: %d (%s)\n", bo->gem_handle, bo->name); + + void *map = crocus_bo_gem_mmap(dbg, bo, true); + if (!map) { + return NULL; + } + + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) { + VG_NOACCESS(map, bo->size); + munmap(map, bo->size); + } + } + assert(bo->map_wc); + + DBG("crocus_bo_map_wc: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->map_wc); + print_flags(flags); + + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(dbg, bo, "WC mapping"); + } + + return bo->map_wc; +} + +/** + * Perform an uncached mapping via the GTT. 
+ * + * Write access through the GTT is not quite fully coherent. On low power + * systems especially, like modern Atoms, we can observe reads from RAM before + * the write via GTT has landed. A write memory barrier that flushes the Write + * Combining Buffer (i.e. sfence/mfence) is not sufficient to order the later + * read after the write as the GTT write suffers a small delay through the GTT + * indirection. The kernel uses an uncached mmio read to ensure the GTT write + * is ordered with reads (either by the GPU, WB or WC) and unconditionally + * flushes prior to execbuf submission. However, if we are not informing the + * kernel about our GTT writes, it will not flush before earlier access, such + * as when using the cmdparser. Similarly, we need to be careful if we should + * ever issue a CPU read immediately following a GTT write. + * + * Telling the kernel about write access also has one more important + * side-effect. Upon receiving notification about the write, it cancels any + * scanout buffering for FBC/PSR and friends. Later FBC/PSR is then flushed by + * either SW_FINISH or DIRTYFB. The presumption is that we never write to the + * actual scanout via a mmaping, only to a backbuffer and so all the FBC/PSR + * tracking is handled on the buffer exchange instead. + */ +static void * +crocus_bo_map_gtt(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + /* If we don't support get/set_tiling, there's no support for GTT mapping + * either (it won't do any de-tiling for us). + */ + assert(bufmgr->has_tiling_uapi); + + /* Get a mapping of the buffer if we haven't before. */ + if (bo->map_gtt == NULL) { + DBG("bo_map_gtt: mmap %d (%s)\n", bo->gem_handle, bo->name); + + struct drm_i915_gem_mmap_gtt mmap_arg = { .handle = bo->gem_handle }; + + /* Get the fake offset back... */ + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg); + if (ret != 0) { + DBG("%s:%d: Error preparing buffer map %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + /* and mmap it. */ + void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, + MAP_SHARED, bufmgr->fd, mmap_arg.offset); + if (map == MAP_FAILED) { + DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + /* We don't need to use VALGRIND_MALLOCLIKE_BLOCK because Valgrind will + * already intercept this mmap call. However, for consistency between + * all the mmap paths, we mark the pointer as defined now and mark it + * as inaccessible afterwards. + */ + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_gtt, NULL, map)) { + VG_NOACCESS(map, bo->size); + munmap(map, bo->size); + } + } + assert(bo->map_gtt); + + DBG("bo_map_gtt: %d (%s) -> %p, ", bo->gem_handle, bo->name, bo->map_gtt); + print_flags(flags); + + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(dbg, bo, "GTT mapping"); + } + + return bo->map_gtt; +} + +static bool +can_map_cpu(struct crocus_bo *bo, unsigned flags) +{ + if (bo->cache_coherent) + return true; + + /* Even if the buffer itself is not cache-coherent (such as a scanout), on + * an LLC platform reads always are coherent (as they are performed via the + * central system agent). It is just the writes that we need to take special + * care to ensure that land in main memory and not stick in the CPU cache. 
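+    *
+    * In short: writes only qualify for a CPU map when the BO is
+    * cache-coherent, while reads qualify unless one of the flags handled
+    * below (PERSISTENT, COHERENT, ASYNC or RAW) forces the WC path.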
+ */ + if (!(flags & MAP_WRITE) && bo->bufmgr->has_llc) + return true; + + /* If PERSISTENT or COHERENT are set, the mmapping needs to remain valid + * across batch flushes where the kernel will change cache domains of the + * bo, invalidating continued access to the CPU mmap on non-LLC device. + * + * Similarly, ASYNC typically means that the buffer will be accessed via + * both the CPU and the GPU simultaneously. Batches may be executed that + * use the BO even while it is mapped. While OpenGL technically disallows + * most drawing while non-persistent mappings are active, we may still use + * the GPU for blits or other operations, causing batches to happen at + * inconvenient times. + * + * If RAW is set, we expect the caller to be able to handle a WC buffer + * more efficiently than the involuntary clflushes. + */ + if (flags & (MAP_PERSISTENT | MAP_COHERENT | MAP_ASYNC | MAP_RAW)) + return false; + + return !(flags & MAP_WRITE); +} + +void * +crocus_bo_map(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags) +{ + if (bo->tiling_mode != I915_TILING_NONE && !(flags & MAP_RAW)) + return crocus_bo_map_gtt(dbg, bo, flags); + + void *map; + + if (can_map_cpu(bo, flags)) + map = crocus_bo_map_cpu(dbg, bo, flags); + else + map = crocus_bo_map_wc(dbg, bo, flags); + + /* Allow the attempt to fail by falling back to the GTT where necessary. + * + * Not every buffer can be mmaped directly using the CPU (or WC), for + * example buffers that wrap stolen memory or are imported from other + * devices. For those, we have little choice but to use a GTT mmapping. + * However, if we use a slow GTT mmapping for reads where we expected fast + * access, that order of magnitude difference in throughput will be clearly + * expressed by angry users. + * + * We skip MAP_RAW because we want to avoid map_gtt's fence detiling. + */ + if (!map && !(flags & MAP_RAW)) { + perf_debug(dbg, "Fallback GTT mapping for %s with access flags %x\n", + bo->name, flags); + map = crocus_bo_map_gtt(dbg, bo, flags); + } + + return map; +} + +/** Waits for all GPU rendering with the object to have completed. */ +void +crocus_bo_wait_rendering(struct crocus_bo *bo) +{ + /* We require a kernel recent enough for WAIT_IOCTL support. + * See intel_init_bufmgr() + */ + crocus_bo_wait(bo, -1); +} + +/** + * Waits on a BO for the given amount of time. + * + * @bo: buffer object to wait for + * @timeout_ns: amount of time to wait in nanoseconds. + * If value is less than 0, an infinite wait will occur. + * + * Returns 0 if the wait was successful ie. the last batch referencing the + * object has completed within the allotted time. Otherwise some negative return + * value describes the error. Of particular interest is -ETIME when the wait has + * failed to yield the desired result. + * + * Similar to crocus_bo_wait_rendering except a timeout parameter allows + * the operation to give up after a certain amount of time. Another subtle + * difference is the internal locking semantics are different (this variant does + * not hold the lock for the duration of the wait). This makes the wait subject + * to a larger userspace race window. + * + * The implementation shall wait until the object is no longer actively + * referenced within a batch buffer at the time of the call. The wait will + * not guarantee that the buffer is re-issued via another thread, or an flinked + * handle. Userspace must make sure this race does not occur if such precision + * is important. 
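+ *
+ * Illustrative use: crocus_bo_wait(bo, 0) polls without blocking and
+ * returns -ETIME while the BO is still busy, whereas crocus_bo_wait(bo, -1)
+ * blocks until rendering completes (which is what crocus_bo_wait_rendering()
+ * does).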
+ * + * Note that some kernels have broken the inifite wait for negative values + * promise, upgrade to latest stable kernels if this is the case. + */ +int +crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + /* If we know it's idle, don't bother with the kernel round trip */ + if (bo->idle && !bo->external) + return 0; + + struct drm_i915_gem_wait wait = { + .bo_handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait); + if (ret != 0) + return -errno; + + bo->idle = true; + + return ret; +} + +static void +crocus_bufmgr_destroy(struct crocus_bufmgr *bufmgr) +{ + mtx_destroy(&bufmgr->lock); + + /* Free any cached buffer objects we were going to reuse */ + for (int i = 0; i < bufmgr->num_buckets; i++) { + struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i]; + + list_for_each_entry_safe(struct crocus_bo, bo, &bucket->head, head) { + list_del(&bo->head); + + bo_free(bo); + } + } + + /* Close any buffer objects on the dead list. */ + list_for_each_entry_safe(struct crocus_bo, bo, &bufmgr->zombie_list, head) { + list_del(&bo->head); + bo_close(bo); + } + + _mesa_hash_table_destroy(bufmgr->name_table, NULL); + _mesa_hash_table_destroy(bufmgr->handle_table, NULL); + + close(bufmgr->fd); + + free(bufmgr); +} + +static int +bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode, + uint32_t stride) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + struct drm_i915_gem_set_tiling set_tiling; + int ret; + + if (bo->global_name == 0 && + tiling_mode == bo->tiling_mode && stride == bo->stride) + return 0; + + memset(&set_tiling, 0, sizeof(set_tiling)); + do { + /* set_tiling is slightly broken and overwrites the + * input on the error path, so we have to open code + * drm_ioctl. + */ + set_tiling.handle = bo->gem_handle; + set_tiling.tiling_mode = tiling_mode; + set_tiling.stride = stride; + + ret = ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling); + } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); + if (ret == -1) + return -errno; + + bo->tiling_mode = set_tiling.tiling_mode; + bo->swizzle_mode = set_tiling.swizzle_mode; + bo->stride = set_tiling.stride; + return 0; +} + +int +crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode, + uint32_t *swizzle_mode) +{ + *tiling_mode = bo->tiling_mode; + *swizzle_mode = bo->swizzle_mode; + return 0; +} + +struct crocus_bo * +crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr, int prime_fd, + uint32_t tiling, uint32_t stride) +{ + uint32_t handle; + struct crocus_bo *bo; + + mtx_lock(&bufmgr->lock); + int ret = drmPrimeFDToHandle(bufmgr->fd, prime_fd, &handle); + if (ret) { + DBG("import_dmabuf: failed to obtain handle from fd: %s\n", + strerror(errno)); + mtx_unlock(&bufmgr->lock); + return NULL; + } + + /* + * See if the kernel has already returned this buffer to us. Just as + * for named buffers, we must not create two bo's pointing at the same + * kernel object + */ + bo = find_and_ref_external_bo(bufmgr->handle_table, handle); + if (bo) + goto out; + + bo = bo_calloc(); + if (!bo) + goto out; + + p_atomic_set(&bo->refcount, 1); + + /* Determine size of bo. The fd-to-handle ioctl really should + * return the size, but it doesn't. If we have kernel 3.12 or + * later, we can lseek on the prime fd to get the size. Older + * kernels will just fail, in which case we fall back to the + * provided (estimated or guess size). 
*/ + ret = lseek(prime_fd, 0, SEEK_END); + if (ret != -1) + bo->size = ret; + + bo->bufmgr = bufmgr; + bo->name = "prime"; + bo->reusable = false; + bo->external = true; + bo->kflags = 0; + bo->gem_handle = handle; + _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); + + struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle }; + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) + goto err; + + if (get_tiling.tiling_mode == tiling || tiling > I915_TILING_LAST) { + bo->tiling_mode = get_tiling.tiling_mode; + bo->swizzle_mode = get_tiling.swizzle_mode; + /* XXX stride is unknown */ + } else { + if (bo_set_tiling_internal(bo, tiling, stride)) { + goto err; + } + } + +out: + mtx_unlock(&bufmgr->lock); + return bo; + +err: + bo_free(bo); + mtx_unlock(&bufmgr->lock); + return NULL; +} + +static void +crocus_bo_make_external_locked(struct crocus_bo *bo) +{ + if (!bo->external) { + _mesa_hash_table_insert(bo->bufmgr->handle_table, &bo->gem_handle, bo); + bo->external = true; + bo->reusable = false; + } +} + +static void +crocus_bo_make_external(struct crocus_bo *bo) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (bo->external) { + assert(!bo->reusable); + return; + } + + mtx_lock(&bufmgr->lock); + crocus_bo_make_external_locked(bo); + mtx_unlock(&bufmgr->lock); +} + +int +crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + crocus_bo_make_external(bo); + + if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle, + DRM_CLOEXEC, prime_fd) != 0) + return -errno; + + return 0; +} + +uint32_t +crocus_bo_export_gem_handle(struct crocus_bo *bo) +{ + crocus_bo_make_external(bo); + + return bo->gem_handle; +} + +int +crocus_bo_flink(struct crocus_bo *bo, uint32_t *name) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (!bo->global_name) { + struct drm_gem_flink flink = { .handle = bo->gem_handle }; + + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_FLINK, &flink)) + return -errno; + + mtx_lock(&bufmgr->lock); + if (!bo->global_name) { + crocus_bo_make_external_locked(bo); + bo->global_name = flink.name; + _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo); + } + mtx_unlock(&bufmgr->lock); + } + + *name = bo->global_name; + return 0; +} + +int +crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd, + uint32_t *out_handle) +{ + /* Only add the new GEM handle to the list of export if it belongs to a + * different GEM device. Otherwise we might close the same buffer multiple + * times. + */ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + int ret = os_same_file_description(drm_fd, bufmgr->fd); + WARN_ONCE(ret < 0, + "Kernel has no file descriptor comparison support: %s\n", + strerror(errno)); + if (ret == 0) { + *out_handle = crocus_bo_export_gem_handle(bo); + return 0; + } + + struct bo_export *export = calloc(1, sizeof(*export)); + if (!export) + return -ENOMEM; + + export->drm_fd = drm_fd; + + int dmabuf_fd = -1; + int err = crocus_bo_export_dmabuf(bo, &dmabuf_fd); + if (err) { + free(export); + return err; + } + + mtx_lock(&bufmgr->lock); + err = drmPrimeFDToHandle(drm_fd, dmabuf_fd, &export->gem_handle); + close(dmabuf_fd); + if (err) { + mtx_unlock(&bufmgr->lock); + free(export); + return err; + } + + bool found = false; + list_for_each_entry(struct bo_export, iter, &bo->exports, link) { + if (iter->drm_fd != drm_fd) + continue; + /* Here we assume that for a given DRM fd, we'll always get back the + * same GEM handle for a given buffer. 
+ */ + assert(iter->gem_handle == export->gem_handle); + free(export); + export = iter; + found = true; + break; + } + if (!found) + list_addtail(&export->link, &bo->exports); + + mtx_unlock(&bufmgr->lock); + + *out_handle = export->gem_handle; + + return 0; +} + +static void +add_bucket(struct crocus_bufmgr *bufmgr, int size) +{ + unsigned int i = bufmgr->num_buckets; + + assert(i < ARRAY_SIZE(bufmgr->cache_bucket)); + + list_inithead(&bufmgr->cache_bucket[i].head); + bufmgr->cache_bucket[i].size = size; + bufmgr->num_buckets++; + + assert(bucket_for_size(bufmgr, size) == &bufmgr->cache_bucket[i]); + assert(bucket_for_size(bufmgr, size - 2048) == &bufmgr->cache_bucket[i]); + assert(bucket_for_size(bufmgr, size + 1) != &bufmgr->cache_bucket[i]); +} + +static void +init_cache_buckets(struct crocus_bufmgr *bufmgr) +{ + uint64_t size, cache_max_size = 64 * 1024 * 1024; + + /* OK, so power of two buckets was too wasteful of memory. + * Give 3 other sizes between each power of two, to hopefully + * cover things accurately enough. (The alternative is + * probably to just go for exact matching of sizes, and assume + * that for things like composited window resize the tiled + * width/height alignment and rounding of sizes to pages will + * get us useful cache hit rates anyway) + */ + add_bucket(bufmgr, PAGE_SIZE); + add_bucket(bufmgr, PAGE_SIZE * 2); + add_bucket(bufmgr, PAGE_SIZE * 3); + + /* Initialize the linked lists for BO reuse cache. */ + for (size = 4 * PAGE_SIZE; size <= cache_max_size; size *= 2) { + add_bucket(bufmgr, size); + + add_bucket(bufmgr, size + size * 1 / 4); + add_bucket(bufmgr, size + size * 2 / 4); + add_bucket(bufmgr, size + size * 3 / 4); + } +} + +uint32_t +crocus_create_hw_context(struct crocus_bufmgr *bufmgr) +{ + struct drm_i915_gem_context_create create = { }; + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create); + if (ret != 0) { + DBG("DRM_IOCTL_I915_GEM_CONTEXT_CREATE failed: %s\n", strerror(errno)); + return 0; + } + + /* Upon declaring a GPU hang, the kernel will zap the guilty context + * back to the default logical HW state and attempt to continue on to + * our next submitted batchbuffer. However, our render batches assume + * the previous GPU state is preserved, and only emit commands needed + * to incrementally change that state. In particular, we inherit the + * STATE_BASE_ADDRESS and PIPELINE_SELECT settings, which are critical. + * With default base addresses, our next batches will almost certainly + * cause more GPU hangs, leading to repeated hangs until we're banned + * or the machine is dead. + * + * Here we tell the kernel not to attempt to recover our context but + * immediately (on the next batchbuffer submission) report that the + * context is lost, and we will do the recovery ourselves. Ideally, + * we'll have two lost batches instead of a continual stream of hangs. + */ + struct drm_i915_gem_context_param p = { + .ctx_id = create.ctx_id, + .param = I915_CONTEXT_PARAM_RECOVERABLE, + .value = false, + }; + drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p); + + return create.ctx_id; +} + +static int +crocus_hw_context_get_priority(struct crocus_bufmgr *bufmgr, uint32_t ctx_id) +{ + struct drm_i915_gem_context_param p = { + .ctx_id = ctx_id, + .param = I915_CONTEXT_PARAM_PRIORITY, + }; + drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p); + return p.value; /* on error, return 0 i.e. 
default priority */
+}
+
+int
+crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr,
+                               uint32_t ctx_id,
+                               int priority)
+{
+   struct drm_i915_gem_context_param p = {
+      .ctx_id = ctx_id,
+      .param = I915_CONTEXT_PARAM_PRIORITY,
+      .value = priority,
+   };
+   int err;
+
+   err = 0;
+   if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p))
+      err = -errno;
+
+   return err;
+}
+
+uint32_t
+crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   uint32_t new_ctx = crocus_create_hw_context(bufmgr);
+
+   if (new_ctx) {
+      int priority = crocus_hw_context_get_priority(bufmgr, ctx_id);
+      crocus_hw_context_set_priority(bufmgr, new_ctx, priority);
+   }
+
+   return new_ctx;
+}
+
+void
+crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   struct drm_i915_gem_context_destroy d = { .ctx_id = ctx_id };
+
+   if (ctx_id != 0 &&
+       intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &d) != 0) {
+      fprintf(stderr, "DRM_IOCTL_I915_GEM_CONTEXT_DESTROY failed: %s\n",
+              strerror(errno));
+   }
+}
+
+int
+crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset, uint64_t *result)
+{
+   struct drm_i915_reg_read reg_read = { .offset = offset };
+   int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_REG_READ, &reg_read);
+
+   *result = reg_read.val;
+   return ret;
+}
+
+static int
+gem_param(int fd, int name)
+{
+   int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+   struct drm_i915_getparam gp = { .param = name, .value = &v };
+   if (intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+      return -1;
+
+   return v;
+}
+
+/**
+ * Initializes the GEM buffer manager, which uses the kernel to allocate,
+ * map, and manage buffer objects.
+ *
+ * \param fd File descriptor of the opened DRM device.
+ */
+static struct crocus_bufmgr *
+crocus_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse)
+{
+   struct crocus_bufmgr *bufmgr = calloc(1, sizeof(*bufmgr));
+   if (bufmgr == NULL)
+      return NULL;
+
+   /* Handles to buffer objects belong to the device fd and are not
+    * reference counted by the kernel.  If the same fd is used by
+    * multiple parties (threads sharing the same screen bufmgr, or
+    * even worse the same device fd passed to multiple libraries)
+    * ownership of those handles is shared by those independent parties.
+    *
+    * Don't do this!  Ensure that each library/bufmgr has its own device
+    * fd so that its namespace does not clash with another.
+ */ + bufmgr->fd = os_dupfd_cloexec(fd); + + p_atomic_set(&bufmgr->refcount, 1); + + if (mtx_init(&bufmgr->lock, mtx_plain) != 0) { + free(bufmgr); + return NULL; + } + + list_inithead(&bufmgr->zombie_list); + + bufmgr->has_llc = devinfo->has_llc; + bufmgr->has_tiling_uapi = devinfo->has_tiling_uapi; + bufmgr->bo_reuse = bo_reuse; + bufmgr->has_mmap_offset = gem_param(fd, I915_PARAM_MMAP_GTT_VERSION) >= 4; + + init_cache_buckets(bufmgr); + + bufmgr->name_table = + _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal); + bufmgr->handle_table = + _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal); + + return bufmgr; +} + +static struct crocus_bufmgr * +crocus_bufmgr_ref(struct crocus_bufmgr *bufmgr) +{ + p_atomic_inc(&bufmgr->refcount); + return bufmgr; +} + +void +crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr) +{ + mtx_lock(&global_bufmgr_list_mutex); + if (p_atomic_dec_zero(&bufmgr->refcount)) { + list_del(&bufmgr->link); + crocus_bufmgr_destroy(bufmgr); + } + mtx_unlock(&global_bufmgr_list_mutex); +} + +/** + * Gets an already existing GEM buffer manager or create a new one. + * + * \param fd File descriptor of the opened DRM device. + */ +struct crocus_bufmgr * +crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd, bool bo_reuse) +{ + struct stat st; + + if (fstat(fd, &st)) + return NULL; + + struct crocus_bufmgr *bufmgr = NULL; + + mtx_lock(&global_bufmgr_list_mutex); + list_for_each_entry(struct crocus_bufmgr, iter_bufmgr, &global_bufmgr_list, link) { + struct stat iter_st; + if (fstat(iter_bufmgr->fd, &iter_st)) + continue; + + if (st.st_rdev == iter_st.st_rdev) { + assert(iter_bufmgr->bo_reuse == bo_reuse); + bufmgr = crocus_bufmgr_ref(iter_bufmgr); + goto unlock; + } + } + + bufmgr = crocus_bufmgr_create(devinfo, fd, bo_reuse); + if (bufmgr) + list_addtail(&bufmgr->link, &global_bufmgr_list); + + unlock: + mtx_unlock(&global_bufmgr_list_mutex); + + return bufmgr; +} + +int +crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr) +{ + return bufmgr->fd; +} diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.h b/src/gallium/drivers/crocus/crocus_bufmgr.h new file mode 100644 index 00000000000..8bb328fdeae --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_bufmgr.h @@ -0,0 +1,331 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef CROCUS_BUFMGR_H +#define CROCUS_BUFMGR_H + +#include +#include +#include +#include +#include "util/macros.h" +#include "util/u_atomic.h" +#include "util/list.h" +#include "pipe/p_defines.h" + +struct crocus_batch; +struct intel_device_info; +struct pipe_debug_callback; + +#define CROCUS_BINDER_SIZE (64 * 1024) +#define CROCUS_MAX_BINDERS 100 + +struct crocus_bo { + /** + * Size in bytes of the buffer object. + * + * The size may be larger than the size originally requested for the + * allocation, such as being aligned to page size. + */ + uint64_t size; + + /** Buffer manager context associated with this buffer object */ + struct crocus_bufmgr *bufmgr; + + /** The GEM handle for this buffer object. */ + uint32_t gem_handle; + + /** + * Virtual address of the buffer inside the PPGTT (Per-Process Graphics + * Translation Table). + * + * Although each hardware context has its own VMA, we assign BO's to the + * same address in all contexts, for simplicity. + */ + uint64_t gtt_offset; + + /** + * The validation list index for this buffer, or -1 when not in a batch. + * Note that a single buffer may be in multiple batches (contexts), and + * this is a global field, which refers to the last batch using the BO. + * It should not be considered authoritative, but can be used to avoid a + * linear walk of the validation list in the common case by guessing that + * exec_bos[bo->index] == bo and confirming whether that's the case. + * + * XXX: this is not ideal now that we have more than one batch per context, + * XXX: as the index will flop back and forth between the render index and + * XXX: compute index... + */ + unsigned index; + + /** + * Boolean of whether the GPU is definitely not accessing the buffer. + * + * This is only valid when reusable, since non-reusable + * buffers are those that have been shared with other + * processes, so we don't know their state. + */ + bool idle; + + int refcount; + const char *name; + + uint64_t kflags; + + /** + * Kenel-assigned global name for this object + * + * List contains both flink named and prime fd'd objects + */ + unsigned global_name; + + /** + * Current tiling mode + */ + uint32_t tiling_mode; + uint32_t swizzle_mode; + uint32_t stride; + + time_t free_time; + + /** Mapped address for the buffer, saved across map/unmap cycles */ + void *map_cpu; + /** GTT virtual address for the buffer, saved across map/unmap cycles */ + void *map_gtt; + /** WC CPU address for the buffer, saved across map/unmap cycles */ + void *map_wc; + + /** BO cache list */ + struct list_head head; + + /** List of GEM handle exports of this buffer (bo_export) */ + struct list_head exports; + + /** + * Boolean of whether this buffer can be re-used + */ + bool reusable; + + /** + * Boolean of whether this buffer has been shared with an external client. + */ + bool external; + + /** + * Boolean of whether this buffer is cache coherent + */ + bool cache_coherent; + + /** + * Boolean of whether this buffer points into user memory + */ + bool userptr; + + /** Pre-computed hash using _mesa_hash_pointer for cache tracking sets */ + uint32_t hash; +}; + +#define BO_ALLOC_ZEROED (1 << 0) +#define BO_ALLOC_COHERENT (1 << 1) + +/** + * Allocate a buffer object. + * + * Buffer objects are not necessarily initially mapped into CPU virtual + * address space or graphics device aperture. They must be mapped + * using crocus_bo_map() to be used by the CPU. 
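+ *
+ * A minimal usage sketch (illustrative only; error handling omitted):
+ *
+ *    struct crocus_bo *bo = crocus_bo_alloc(bufmgr, "scratch", 4096);
+ *    void *map = crocus_bo_map(NULL, bo, MAP_WRITE);
+ *    memset(map, 0, 4096);
+ *    crocus_bo_unreference(bo);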
+ */ +struct crocus_bo *crocus_bo_alloc(struct crocus_bufmgr *bufmgr, + const char *name, uint64_t size); + +/** + * Allocate a tiled buffer object. + * + * Alignment for tiled objects is set automatically; the 'flags' + * argument provides a hint about how the object will be used initially. + * + * Valid tiling formats are: + * I915_TILING_NONE + * I915_TILING_X + * I915_TILING_Y + */ +struct crocus_bo *crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr, + const char *name, uint64_t size, + uint32_t alignment, + uint32_t tiling_mode, uint32_t pitch, + unsigned flags); + +struct crocus_bo *crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr, + const char *name, void *ptr, + size_t size); + +/** Takes a reference on a buffer object */ +static inline void +crocus_bo_reference(struct crocus_bo *bo) +{ + p_atomic_inc(&bo->refcount); +} + +/** + * Releases a reference on a buffer object, freeing the data if + * no references remain. + */ +void crocus_bo_unreference(struct crocus_bo *bo); + +#define MAP_READ PIPE_MAP_READ +#define MAP_WRITE PIPE_MAP_WRITE +#define MAP_ASYNC PIPE_MAP_UNSYNCHRONIZED +#define MAP_PERSISTENT PIPE_MAP_PERSISTENT +#define MAP_COHERENT PIPE_MAP_COHERENT +/* internal */ +#define MAP_INTERNAL_MASK (0xff << 24) +#define MAP_RAW (0x01 << 24) + +#define MAP_FLAGS (MAP_READ | MAP_WRITE | MAP_ASYNC | \ + MAP_PERSISTENT | MAP_COHERENT | MAP_INTERNAL_MASK) + +/** + * Maps the buffer into userspace. + * + * This function will block waiting for any existing execution on the + * buffer to complete, first. The resulting mapping is returned. + */ +MUST_CHECK void *crocus_bo_map(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags); + +/** + * Reduces the refcount on the userspace mapping of the buffer + * object. + */ +static inline int crocus_bo_unmap(struct crocus_bo *bo) { return 0; } + +/** + * Waits for rendering to an object by the GPU to have completed. + * + * This is not required for any access to the BO by bo_map, + * bo_subdata, etc. It is merely a way for the driver to implement + * glFinish. + */ +void crocus_bo_wait_rendering(struct crocus_bo *bo); + +/** + * Unref a buffer manager instance. + */ +void crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr); + +/** + * Get the current tiling (and resulting swizzling) mode for the bo. + * + * \param buf Buffer to get tiling mode for + * \param tiling_mode returned tiling mode + * \param swizzle_mode returned swizzling mode + */ +int crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode, + uint32_t *swizzle_mode); + +/** + * Create a visible name for a buffer which can be used by other apps + * + * \param buf Buffer to create a name for + * \param name Returned name + */ +int crocus_bo_flink(struct crocus_bo *bo, uint32_t *name); + +/** + * Is this buffer shared with external clients (exported)? + */ +static inline bool +crocus_bo_is_external(const struct crocus_bo *bo) +{ + return bo->external; +} + +/** + * Returns 1 if mapping the buffer for write could cause the process + * to block, due to the object being active in the GPU. + */ +int crocus_bo_busy(struct crocus_bo *bo); + +/** + * Specify the volatility of the buffer. + * \param bo Buffer to create a name for + * \param madv The purgeable status + * + * Use I915_MADV_DONTNEED to mark the buffer as purgeable, and it will be + * reclaimed under memory pressure. If you subsequently require the buffer, + * then you must pass I915_MADV_WILLNEED to mark the buffer as required. 
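+ *
+ * This is how the internal BO cache uses it: idle buffers parked in a
+ * bucket are marked I915_MADV_DONTNEED, and crocus_bo_madvise(bo,
+ * I915_MADV_WILLNEED) is called before one is handed back out; if that
+ * call reports the pages were purged, the BO is freed rather than reused.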
+ * + * Returns 1 if the buffer was retained, or 0 if it was discarded whilst + * marked as I915_MADV_DONTNEED. + */ +int crocus_bo_madvise(struct crocus_bo *bo, int madv); + +/* drm_bacon_bufmgr_gem.c */ +struct crocus_bufmgr * +crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd, + bool bo_reuse); +int crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr); + +struct crocus_bo *crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr, + const char *name, + unsigned handle); + +int crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns); + +uint32_t crocus_create_hw_context(struct crocus_bufmgr *bufmgr); +uint32_t crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id); + +#define CROCUS_CONTEXT_LOW_PRIORITY ((I915_CONTEXT_MIN_USER_PRIORITY - 1) / 2) +#define CROCUS_CONTEXT_MEDIUM_PRIORITY (I915_CONTEXT_DEFAULT_PRIORITY) +#define CROCUS_CONTEXT_HIGH_PRIORITY ((I915_CONTEXT_MAX_USER_PRIORITY + 1) / 2) + +int crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr, + uint32_t ctx_id, int priority); + +void crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id); + +int crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd); +struct crocus_bo *crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr, + int prime_fd, uint32_t tiling, + uint32_t stride); + +/** + * Exports a bo as a GEM handle into a given DRM file descriptor + * \param bo Buffer to export + * \param drm_fd File descriptor where the new handle is created + * \param out_handle Pointer to store the new handle + * + * Returns 0 if the buffer was successfully exported, a non zero error code + * otherwise. + */ +int crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd, + uint32_t *out_handle); + +uint32_t crocus_bo_export_gem_handle(struct crocus_bo *bo); + +int crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset, + uint64_t *out); + +int drm_ioctl(int fd, unsigned long request, void *arg); + +#endif /* CROCUS_BUFMGR_H */ diff --git a/src/gallium/drivers/crocus/crocus_clear.c b/src/gallium/drivers/crocus/crocus_clear.c new file mode 100644 index 00000000000..1c56e23f794 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_clear.c @@ -0,0 +1,859 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_inlines.h" +#include "util/u_surface.h" +#include "util/format/u_format.h" +#include "util/u_upload_mgr.h" +#include "util/ralloc.h" +#include "crocus_context.h" +#include "crocus_resource.h" +#include "crocus_screen.h" +#include "intel/compiler/brw_compiler.h" +#include "util/format_srgb.h" + +static bool +crocus_is_color_fast_clear_compatible(struct crocus_context *ice, + enum isl_format format, + const union isl_color_value color) +{ + if (isl_format_has_int_channel(format)) { + perf_debug(&ice->dbg, "Integer fast clear not enabled for %s", + isl_format_get_name(format)); + return false; + } + + for (int i = 0; i < 4; i++) { + if (!isl_format_has_color_component(format, i)) { + continue; + } + + if (color.f32[i] != 0.0f && color.f32[i] != 1.0f) { + return false; + } + } + + return true; +} + +static bool +can_fast_clear_color(struct crocus_context *ice, + struct pipe_resource *p_res, + unsigned level, + const struct pipe_box *box, + bool render_condition_enabled, + enum isl_format format, + enum isl_format render_format, + union isl_color_value color) +{ + struct crocus_resource *res = (void *) p_res; + + if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + return false; + + if (!isl_aux_usage_has_fast_clears(res->aux.usage)) + return false; + + /* Check for partial clear */ + if (box->x > 0 || box->y > 0 || + box->width < minify(p_res->width0, level) || + box->height < minify(p_res->height0, level)) { + return false; + } + + /* Avoid conditional fast clears to maintain correct tracking of the aux + * state (see iris_resource_finish_write for more info). Note that partial + * fast clears (if they existed) would not pose a problem with conditional + * rendering. + */ + if (render_condition_enabled && + ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) { + return false; + } + + /* We store clear colors as floats or uints as needed. If there are + * texture views in play, the formats will not properly be respected + * during resolves because the resolve operations only know about the + * resource and not the renderbuffer. 
+ */ + if (isl_format_srgb_to_linear(render_format) != + isl_format_srgb_to_linear(format)) { + return false; + } + + /* XXX: if (irb->mt->supports_fast_clear) + * see intel_miptree_create_for_dri_image() + */ + + if (!crocus_is_color_fast_clear_compatible(ice, format, color)) + return false; + + return true; +} + +static union isl_color_value +convert_fast_clear_color(struct crocus_context *ice, + struct crocus_resource *res, + enum isl_format render_format, + const union isl_color_value color) +{ + union isl_color_value override_color = color; + struct pipe_resource *p_res = (void *) res; + + const enum pipe_format format = p_res->format; + const struct util_format_description *desc = + util_format_description(format); + unsigned colormask = util_format_colormask(desc); + + if (util_format_is_intensity(format) || + util_format_is_luminance(format) || + util_format_is_luminance_alpha(format)) { + override_color.u32[1] = override_color.u32[0]; + override_color.u32[2] = override_color.u32[0]; + if (util_format_is_intensity(format)) + override_color.u32[3] = override_color.u32[0]; + } else { + for (int chan = 0; chan < 3; chan++) { + if (!(colormask & (1 << chan))) + override_color.u32[chan] = 0; + } + } + + if (util_format_is_unorm(format)) { + for (int i = 0; i < 4; i++) + override_color.f32[i] = CLAMP(override_color.f32[i], 0.0f, 1.0f); + } else if (util_format_is_snorm(format)) { + for (int i = 0; i < 4; i++) + override_color.f32[i] = CLAMP(override_color.f32[i], -1.0f, 1.0f); + } else if (util_format_is_pure_uint(format)) { + for (int i = 0; i < 4; i++) { + unsigned bits = util_format_get_component_bits( + format, UTIL_FORMAT_COLORSPACE_RGB, i); + if (bits < 32) { + uint32_t max = (1u << bits) - 1; + override_color.u32[i] = MIN2(override_color.u32[i], max); + } + } + } else if (util_format_is_pure_sint(format)) { + for (int i = 0; i < 4; i++) { + unsigned bits = util_format_get_component_bits( + format, UTIL_FORMAT_COLORSPACE_RGB, i); + if (bits < 32) { + int32_t max = (1 << (bits - 1)) - 1; + int32_t min = -(1 << (bits - 1)); + override_color.i32[i] = CLAMP(override_color.i32[i], min, max); + } + } + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT || + format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + /* these packed float formats only store unsigned values */ + for (int i = 0; i < 4; i++) + override_color.f32[i] = MAX2(override_color.f32[i], 0.0f); + } + + if (!(colormask & 1 << 3)) { + if (util_format_is_pure_integer(format)) + override_color.u32[3] = 1; + else + override_color.f32[3] = 1.0f; + } + + /* Handle linear to SRGB conversion */ + if (isl_format_is_srgb(render_format)) { + for (int i = 0; i < 3; i++) { + override_color.f32[i] = + util_format_linear_to_srgb_float(override_color.f32[i]); + } + } + + return override_color; +} + +static void +fast_clear_color(struct crocus_context *ice, + struct crocus_resource *res, + unsigned level, + const struct pipe_box *box, + enum isl_format format, + union isl_color_value color, + enum blorp_batch_flags blorp_flags) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + struct pipe_resource *p_res = (void *) res; + + color = convert_fast_clear_color(ice, res, format, color); + + bool color_changed = !!memcmp(&res->aux.clear_color, &color, + sizeof(color)); + + if (color_changed) { + /* If we are clearing to a new clear value, we need to resolve fast + * clears from other levels/layers first, since we can't have different + * levels/layers with different fast clear colors. 
+ */ + for (unsigned res_lvl = 0; res_lvl < res->surf.levels; res_lvl++) { + const unsigned level_layers = + crocus_get_num_logical_layers(res, res_lvl); + for (unsigned layer = 0; layer < level_layers; layer++) { + if (res_lvl == level && + layer >= box->z && + layer < box->z + box->depth) { + /* We're going to clear this layer anyway. Leave it alone. */ + continue; + } + + enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, res_lvl, layer); + + if (aux_state != ISL_AUX_STATE_CLEAR && + aux_state != ISL_AUX_STATE_PARTIAL_CLEAR && + aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) { + /* This slice doesn't have any fast-cleared bits. */ + continue; + } + + /* If we got here, then the level may have fast-clear bits that use + * the old clear value. We need to do a color resolve to get rid + * of their use of the clear color before we can change it. + * Fortunately, few applications ever change their clear color at + * different levels/layers, so this shouldn't happen often. + */ + crocus_resource_prepare_access(ice, res, + res_lvl, 1, layer, 1, + res->aux.usage, + false); + perf_debug(&ice->dbg, + "Resolving resource (%p) level %d, layer %d: color changing from " + "(%0.2f, %0.2f, %0.2f, %0.2f) to " + "(%0.2f, %0.2f, %0.2f, %0.2f)\n", + res, res_lvl, layer, + res->aux.clear_color.f32[0], + res->aux.clear_color.f32[1], + res->aux.clear_color.f32[2], + res->aux.clear_color.f32[3], + color.f32[0], color.f32[1], color.f32[2], color.f32[3]); + } + } + } + + crocus_resource_set_clear_color(ice, res, color); + + /* If the buffer is already in ISL_AUX_STATE_CLEAR, and the color hasn't + * changed, the clear is redundant and can be skipped. + */ + const enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, level, box->z); + if (!color_changed && box->depth == 1 && aux_state == ISL_AUX_STATE_CLEAR) + return; + + /* Ivybrigde PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)": + * + * "Any transition from any value in {Clear, Render, Resolve} to a + * different value in {Clear, Render, Resolve} requires end of pipe + * synchronization." + * + * In other words, fast clear ops are not properly synchronized with + * other drawing. We need to use a PIPE_CONTROL to ensure that the + * contents of the previous draw hit the render target before we resolve + * and again afterwards to ensure that the resolve is complete before we + * do any more regular drawing. + */ + crocus_emit_end_of_pipe_sync(batch, + "fast clear: pre-flush", + PIPE_CONTROL_RENDER_TARGET_FLUSH); + + /* If we reach this point, we need to fast clear to change the state to + * ISL_AUX_STATE_CLEAR, or to update the fast clear color (or both). + */ + blorp_flags |= color_changed ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR; + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + p_res, res->aux.usage, level, true); + + /* In newer gens (> 9), the hardware will do a linear -> sRGB conversion of + * the clear color during the fast clear, if the surface format is of sRGB + * type. We use the linear version of the surface format here to prevent + * that from happening, since we already do our own linear -> sRGB + * conversion in convert_fast_clear_color(). 
+ */ + blorp_fast_clear(&blorp_batch, &surf, isl_format_srgb_to_linear(format), + ISL_SWIZZLE_IDENTITY, + level, box->z, box->depth, + box->x, box->y, box->x + box->width, + box->y + box->height); + blorp_batch_finish(&blorp_batch); + crocus_emit_end_of_pipe_sync(batch, + "fast clear: post flush", + PIPE_CONTROL_RENDER_TARGET_FLUSH); + + crocus_resource_set_aux_state(ice, res, level, box->z, + box->depth, ISL_AUX_STATE_CLEAR); + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS; + return; +} + +static void +clear_color(struct crocus_context *ice, + struct pipe_resource *p_res, + unsigned level, + const struct pipe_box *box, + bool render_condition_enabled, + enum isl_format format, + struct isl_swizzle swizzle, + union isl_color_value color) +{ + struct crocus_resource *res = (void *) p_res; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + const struct intel_device_info *devinfo = &batch->screen->devinfo; + enum blorp_batch_flags blorp_flags = 0; + + if (render_condition_enabled) { + if (!crocus_check_conditional_render(ice)) + return; + + if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) + blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE; + } + + if (p_res->target == PIPE_BUFFER) + util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width); + + crocus_batch_maybe_flush(batch, 1500); + + bool can_fast_clear = can_fast_clear_color(ice, p_res, level, box, + render_condition_enabled, + res->surf.format, format, color); + if (can_fast_clear) { + fast_clear_color(ice, res, level, box, format, color, + blorp_flags); + return; + } + + bool color_write_disable[4] = { false, false, false, false }; + enum isl_aux_usage aux_usage = + crocus_resource_render_aux_usage(ice, res, format, + false, false); + + crocus_resource_prepare_render(ice, res, level, + box->z, box->depth, aux_usage); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + p_res, aux_usage, level, true); + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); + + if (!isl_format_supports_rendering(devinfo, format) && + isl_format_is_rgbx(format)) + format = isl_format_rgbx_to_rgba(format); + + blorp_clear(&blorp_batch, &surf, format, swizzle, + level, box->z, box->depth, box->x, box->y, + box->x + box->width, box->y + box->height, + color, color_write_disable); + + blorp_batch_finish(&blorp_batch); + crocus_flush_and_dirty_for_history(ice, batch, res, + PIPE_CONTROL_RENDER_TARGET_FLUSH, + "cache history: post color clear"); + + crocus_resource_finish_render(ice, res, level, + box->z, box->depth, aux_usage); +} + +static bool +can_fast_clear_depth(struct crocus_context *ice, + struct crocus_resource *res, + unsigned level, + const struct pipe_box *box, + bool render_condition_enabled, + float depth) +{ + struct pipe_resource *p_res = (void *) res; + struct pipe_context *ctx = (void *) ice; + struct crocus_screen *screen = (void *) ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (devinfo->ver < 6) + return false; + + if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + return false; + + /* Check for partial clears */ + if (box->x > 0 || box->y > 0 || + box->width < u_minify(p_res->width0, level) || + box->height < u_minify(p_res->height0, level)) { + return false; + } + + /* Avoid conditional fast clears to maintain correct tracking of the aux + * state (see iris_resource_finish_write for more info). 
Note that partial + * fast clears would not pose a problem with conditional rendering. + */ + if (render_condition_enabled && + ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) { + return false; + } + + if (!crocus_resource_level_has_hiz(res, level)) + return false; + + if (res->base.format == PIPE_FORMAT_Z16_UNORM) { + /* From the Sandy Bridge PRM, volume 2 part 1, page 314: + * + * "[DevSNB+]: Several cases exist where Depth Buffer Clear cannot be + * enabled (the legacy method of clearing must be performed): + * + * - DevSNB{W/A}]: When depth buffer format is D16_UNORM and the + * width of the map (LOD0) is not multiple of 16, fast clear + * optimization must be disabled. + */ + if (devinfo->ver == 6 && + (minify(res->surf.phys_level0_sa.width, + level) % 16) != 0) + return false; + } + return true; +} + +static void +fast_clear_depth(struct crocus_context *ice, + struct crocus_resource *res, + unsigned level, + const struct pipe_box *box, + float depth) +{ + struct pipe_resource *p_res = (void *) res; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + + /* Quantize the clear value to what can be stored in the actual depth + * buffer. This makes the following check more accurate because it now + * checks if the actual depth bits will match. It also prevents us from + * getting a too-accurate depth value during depth testing or when sampling + * with HiZ enabled. + */ + const unsigned nbits = p_res->format == PIPE_FORMAT_Z16_UNORM ? 16 : 24; + const uint32_t depth_max = (1 << nbits) - 1; + depth = p_res->format == PIPE_FORMAT_Z32_FLOAT ? depth : + (unsigned)(depth * depth_max) / (float)depth_max; + + bool update_clear_depth = false; + + /* If we're clearing to a new clear value, then we need to resolve any clear + * flags out of the HiZ buffer into the real depth buffer. + */ + if (res->aux.clear_color.f32[0] != depth) { + for (unsigned res_level = 0; res_level < res->surf.levels; res_level++) { + if (!crocus_resource_level_has_hiz(res, res_level)) + continue; + + const unsigned level_layers = + crocus_get_num_logical_layers(res, res_level); + for (unsigned layer = 0; layer < level_layers; layer++) { + if (res_level == level && + layer >= box->z && + layer < box->z + box->depth) { + /* We're going to clear this layer anyway. Leave it alone. */ + continue; + } + + enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, res_level, layer); + + if (aux_state != ISL_AUX_STATE_CLEAR && + aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) { + /* This slice doesn't have any fast-cleared bits. */ + continue; + } + + /* If we got here, then the level may have fast-clear bits that + * use the old clear value. We need to do a depth resolve to get + * rid of their use of the clear value before we can change it. + * Fortunately, few applications ever change their depth clear + * value so this shouldn't happen often. + */ + crocus_hiz_exec(ice, batch, res, res_level, layer, 1, + ISL_AUX_OP_FULL_RESOLVE, false); + crocus_resource_set_aux_state(ice, res, res_level, layer, 1, + ISL_AUX_STATE_RESOLVED); + } + } + const union isl_color_value clear_value = { .f32 = {depth, } }; + crocus_resource_set_clear_color(ice, res, clear_value); + update_clear_depth = true; + } + + for (unsigned l = 0; l < box->depth; l++) { + enum isl_aux_state aux_state = + crocus_resource_level_has_hiz(res, level) ? 
+ crocus_resource_get_aux_state(res, level, box->z + l) : + ISL_AUX_STATE_AUX_INVALID; + if (update_clear_depth || aux_state != ISL_AUX_STATE_CLEAR) { + if (aux_state == ISL_AUX_STATE_CLEAR) { + perf_debug(&ice->dbg, "Performing HiZ clear just to update the " + "depth clear value\n"); + } + crocus_hiz_exec(ice, batch, res, level, + box->z + l, 1, ISL_AUX_OP_FAST_CLEAR, + update_clear_depth); + } + } + + crocus_resource_set_aux_state(ice, res, level, box->z, box->depth, + ISL_AUX_STATE_CLEAR); + ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER; +} + +static void +clear_depth_stencil(struct crocus_context *ice, + struct pipe_resource *p_res, + unsigned level, + const struct pipe_box *box, + bool render_condition_enabled, + bool clear_depth, + bool clear_stencil, + float depth, + uint8_t stencil) +{ + struct crocus_resource *res = (void *) p_res; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + enum blorp_batch_flags blorp_flags = 0; + + if (render_condition_enabled) { + if (!crocus_check_conditional_render(ice)) + return; + + if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) + blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE; + } + + crocus_batch_maybe_flush(batch, 1500); + + struct crocus_resource *z_res; + struct crocus_resource *stencil_res; + struct blorp_surf z_surf; + struct blorp_surf stencil_surf; + + crocus_get_depth_stencil_resources(&batch->screen->devinfo, p_res, &z_res, &stencil_res); + if (z_res && clear_depth && + can_fast_clear_depth(ice, z_res, level, box, render_condition_enabled, + depth)) { + fast_clear_depth(ice, z_res, level, box, depth); + crocus_flush_and_dirty_for_history(ice, batch, res, 0, + "cache history: post fast Z clear"); + clear_depth = false; + z_res = NULL; + } + + /* At this point, we might have fast cleared the depth buffer. So if there's + * no stencil clear pending, return early. + */ + if (!(clear_depth || (clear_stencil && stencil_res))) { + return; + } + + if (clear_depth && z_res) { + const enum isl_aux_usage aux_usage = + crocus_resource_render_aux_usage(ice, z_res, level, z_res->surf.format, + false); + crocus_resource_prepare_render(ice, z_res, level, box->z, box->depth, + aux_usage); + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, + &z_surf, &z_res->base, aux_usage, + level, true); + } + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); + + uint8_t stencil_mask = clear_stencil && stencil_res ? 0xff : 0; + if (stencil_mask) { + crocus_resource_prepare_access(ice, stencil_res, level, 1, box->z, + box->depth, stencil_res->aux.usage, false); + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, + &stencil_surf, &stencil_res->base, + stencil_res->aux.usage, level, true); + } + + blorp_clear_depth_stencil(&blorp_batch, &z_surf, &stencil_surf, + level, box->z, box->depth, + box->x, box->y, + box->x + box->width, + box->y + box->height, + clear_depth && z_res, depth, + stencil_mask, stencil); + + blorp_batch_finish(&blorp_batch); + crocus_flush_and_dirty_for_history(ice, batch, res, 0, + "cache history: post slow ZS clear"); + + if (clear_depth && z_res) { + crocus_resource_finish_render(ice, z_res, level, + box->z, box->depth, z_surf.aux_usage); + } + + if (stencil_mask) { + crocus_resource_finish_write(ice, stencil_res, level, box->z, box->depth, + stencil_res->aux.usage); + } +} + +/** + * The pipe->clear() driver hook. 
+ * + * This clears buffers attached to the current draw framebuffer. + */ +static void +crocus_clear(struct pipe_context *ctx, + unsigned buffers, + const struct pipe_scissor_state *scissor_state, + const union pipe_color_union *p_color, + double depth, + unsigned stencil) +{ + struct crocus_context *ice = (void *) ctx; + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + struct crocus_screen *screen = (void *) ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + assert(buffers != 0); + + struct pipe_box box = { + .width = cso_fb->width, + .height = cso_fb->height, + }; + + if (scissor_state) { + box.x = scissor_state->minx; + box.y = scissor_state->miny; + box.width = MIN2(box.width, scissor_state->maxx - scissor_state->minx); + box.height = MIN2(box.height, scissor_state->maxy - scissor_state->miny); + } + + if (buffers & PIPE_CLEAR_DEPTHSTENCIL) { + if (devinfo->ver < 6) { + crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE, true); + util_blitter_clear(ice->blitter, cso_fb->width, cso_fb->height, + util_framebuffer_get_num_layers(cso_fb), + buffers & PIPE_CLEAR_DEPTHSTENCIL, p_color, depth, stencil, false); + } else { + struct pipe_surface *psurf = cso_fb->zsbuf; + box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1; + box.z = psurf->u.tex.first_layer; + + clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box, true, + buffers & PIPE_CLEAR_DEPTH, + buffers & PIPE_CLEAR_STENCIL, + depth, stencil); + } + buffers &= ~PIPE_CLEAR_DEPTHSTENCIL; + } + + if (buffers & PIPE_CLEAR_COLOR) { + /* pipe_color_union and isl_color_value are interchangeable */ + union isl_color_value *color = (void *) p_color; + + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + if (buffers & (PIPE_CLEAR_COLOR0 << i)) { + struct pipe_surface *psurf = cso_fb->cbufs[i]; + struct crocus_surface *isurf = (void *) psurf; + box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1, + box.z = psurf->u.tex.first_layer, + + clear_color(ice, psurf->texture, psurf->u.tex.level, &box, + true, isurf->view.format, isurf->view.swizzle, + *color); + } + } + } +} + +/** + * The pipe->clear_texture() driver hook. + * + * This clears the given texture resource. 
+ */ +static void +crocus_clear_texture(struct pipe_context *ctx, + struct pipe_resource *p_res, + unsigned level, + const struct pipe_box *box, + const void *data) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_resource *res = (void *) p_res; + + if (devinfo->ver < 6) { + util_clear_texture(ctx, p_res, + level, box, data); + return; + } + + if (crocus_resource_unfinished_aux_import(res)) + crocus_resource_finish_aux_import(ctx->screen, res); + + if (util_format_is_depth_or_stencil(p_res->format)) { + const struct util_format_unpack_description *fmt_unpack = + util_format_unpack_description(p_res->format); + + float depth = 0.0; + uint8_t stencil = 0; + + if (fmt_unpack->unpack_z_float) + fmt_unpack->unpack_z_float(&depth, 0, data, 0, 1, 1); + + if (fmt_unpack->unpack_s_8uint) + fmt_unpack->unpack_s_8uint(&stencil, 0, data, 0, 1, 1); + + clear_depth_stencil(ice, p_res, level, box, true, true, true, + depth, stencil); + } else { + union isl_color_value color; + struct crocus_resource *res = (void *) p_res; + enum isl_format format = res->surf.format; + + if (!isl_format_supports_rendering(devinfo, format)) { + const struct isl_format_layout *fmtl = isl_format_get_layout(format); + // XXX: actually just get_copy_format_for_bpb from BLORP + // XXX: don't cut and paste this + switch (fmtl->bpb) { + case 8: format = ISL_FORMAT_R8_UINT; break; + case 16: format = ISL_FORMAT_R8G8_UINT; break; + case 24: format = ISL_FORMAT_R8G8B8_UINT; break; + case 32: format = ISL_FORMAT_R8G8B8A8_UINT; break; + case 48: format = ISL_FORMAT_R16G16B16_UINT; break; + case 64: format = ISL_FORMAT_R16G16B16A16_UINT; break; + case 96: format = ISL_FORMAT_R32G32B32_UINT; break; + case 128: format = ISL_FORMAT_R32G32B32A32_UINT; break; + default: + unreachable("Unknown format bpb"); + } + + /* No aux surfaces for non-renderable surfaces */ + assert(res->aux.usage == ISL_AUX_USAGE_NONE); + } + + isl_color_value_unpack(&color, format, data); + + clear_color(ice, p_res, level, box, true, format, + ISL_SWIZZLE_IDENTITY, color); + } +} + +/** + * The pipe->clear_render_target() driver hook. + * + * This clears the given render target surface. + */ +static void +crocus_clear_render_target(struct pipe_context *ctx, + struct pipe_surface *psurf, + const union pipe_color_union *p_color, + unsigned dst_x, unsigned dst_y, + unsigned width, unsigned height, + bool render_condition_enabled) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_surface *isurf = (void *) psurf; + struct pipe_box box = { + .x = dst_x, + .y = dst_y, + .z = psurf->u.tex.first_layer, + .width = width, + .height = height, + .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1 + }; + + /* pipe_color_union and isl_color_value are interchangeable */ + union isl_color_value *color = (void *) p_color; + + clear_color(ice, psurf->texture, psurf->u.tex.level, &box, + render_condition_enabled, + isurf->view.format, isurf->view.swizzle, *color); +} + +/** + * The pipe->clear_depth_stencil() driver hook. + * + * This clears the given depth/stencil surface. 
+ */ +static void +crocus_clear_depth_stencil(struct pipe_context *ctx, + struct pipe_surface *psurf, + unsigned flags, + double depth, + unsigned stencil, + unsigned dst_x, unsigned dst_y, + unsigned width, unsigned height, + bool render_condition_enabled) +{ + return; +#if 0 + struct crocus_context *ice = (void *) ctx; + struct pipe_box box = { + .x = dst_x, + .y = dst_y, + .z = psurf->u.tex.first_layer, + .width = width, + .height = height, + .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1 + }; + uint32_t blit_flags = 0; + + assert(util_format_is_depth_or_stencil(psurf->texture->format)); + + crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE); + util_blitter_clear(ice->blitter, width, height, + 1, flags, NULL, depth, stencil, render_condition_enabled); +#if 0 + clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box, + render_condition_enabled, + flags & PIPE_CLEAR_DEPTH, flags & PIPE_CLEAR_STENCIL, + depth, stencil); +#endif +#endif +} + +void +crocus_init_clear_functions(struct pipe_context *ctx) +{ + ctx->clear = crocus_clear; + ctx->clear_texture = crocus_clear_texture; + ctx->clear_render_target = crocus_clear_render_target; + ctx->clear_depth_stencil = crocus_clear_depth_stencil; +} diff --git a/src/gallium/drivers/crocus/crocus_context.c b/src/gallium/drivers/crocus/crocus_context.c new file mode 100644 index 00000000000..cd8a54d6d34 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_context.c @@ -0,0 +1,336 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "util/ralloc.h" +#include "util/u_inlines.h" +#include "util/format/u_format.h" +#include "util/u_upload_mgr.h" +#include "drm-uapi/i915_drm.h" +#include "crocus_context.h" +#include "crocus_resource.h" +#include "crocus_screen.h" +#include "common/intel_defines.h" +#include "common/intel_sample_positions.h" + +/** + * The pipe->set_debug_callback() driver hook. 
+ */ +static void +crocus_set_debug_callback(struct pipe_context *ctx, + const struct pipe_debug_callback *cb) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + if (cb) + ice->dbg = *cb; + else + memset(&ice->dbg, 0, sizeof(ice->dbg)); +} + +static bool +crocus_init_identifier_bo(struct crocus_context *ice) +{ + void *bo_map; + + bo_map = crocus_bo_map(NULL, ice->workaround_bo, MAP_READ | MAP_WRITE); + if (!bo_map) + return false; + + ice->workaround_bo->kflags |= EXEC_OBJECT_CAPTURE; + ice->workaround_offset = ALIGN( + intel_debug_write_identifiers(bo_map, 4096, "Crocus") + 8, 8); + + crocus_bo_unmap(ice->workaround_bo); + + return true; +} + +/** + * Called from the batch module when it detects a GPU hang. + * + * In this case, we've lost our GEM context, and can't rely on any existing + * state on the GPU. We must mark everything dirty and wipe away any saved + * assumptions about the last known state of the GPU. + */ +void +crocus_lost_context_state(struct crocus_batch *batch) +{ + /* The batch module doesn't have an crocus_context, because we want to + * avoid introducing lots of layering violations. Unfortunately, here + * we do need to inform the context of batch catastrophe. We know the + * batch is one of our context's, so hackily claw our way back. + */ + struct crocus_context *ice = batch->ice; + struct crocus_screen *screen = batch->screen; + if (batch->name == CROCUS_BATCH_RENDER) { + screen->vtbl.init_render_context(batch); + } else if (batch->name == CROCUS_BATCH_COMPUTE) { + screen->vtbl.init_compute_context(batch); + } else { + unreachable("unhandled batch reset"); + } + + ice->state.dirty = ~0ull; + memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid)); + batch->state_base_address_emitted = false; + screen->vtbl.lost_genx_state(ice, batch); +} + +static enum pipe_reset_status +crocus_get_device_reset_status(struct pipe_context *ctx) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + enum pipe_reset_status worst_reset = PIPE_NO_RESET; + + /* Check the reset status of each batch's hardware context, and take the + * worst status (if one was guilty, proclaim guilt). + */ + for (int i = 0; i < ice->batch_count; i++) { + /* This will also recreate the hardware contexts as necessary, so any + * future queries will show no resets. We only want to report once. 
+ */ + enum pipe_reset_status batch_reset = + crocus_batch_check_for_reset(&ice->batches[i]); + + if (batch_reset == PIPE_NO_RESET) + continue; + + if (worst_reset == PIPE_NO_RESET) { + worst_reset = batch_reset; + } else { + /* GUILTY < INNOCENT < UNKNOWN */ + worst_reset = MIN2(worst_reset, batch_reset); + } + } + + if (worst_reset != PIPE_NO_RESET && ice->reset.reset) + ice->reset.reset(ice->reset.data, worst_reset); + + return worst_reset; +} + +static void +crocus_set_device_reset_callback(struct pipe_context *ctx, + const struct pipe_device_reset_callback *cb) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + if (cb) + ice->reset = *cb; + else + memset(&ice->reset, 0, sizeof(ice->reset)); +} + +static void +crocus_get_sample_position(struct pipe_context *ctx, + unsigned sample_count, + unsigned sample_index, + float *out_value) +{ + union { + struct { + float x[16]; + float y[16]; + } a; + struct { + float _0XOffset, _1XOffset, _2XOffset, _3XOffset, + _4XOffset, _5XOffset, _6XOffset, _7XOffset, + _8XOffset, _9XOffset, _10XOffset, _11XOffset, + _12XOffset, _13XOffset, _14XOffset, _15XOffset; + float _0YOffset, _1YOffset, _2YOffset, _3YOffset, + _4YOffset, _5YOffset, _6YOffset, _7YOffset, + _8YOffset, _9YOffset, _10YOffset, _11YOffset, + _12YOffset, _13YOffset, _14YOffset, _15YOffset; + } v; + } u; + switch (sample_count) { + case 1: INTEL_SAMPLE_POS_1X(u.v._); break; + case 2: INTEL_SAMPLE_POS_2X(u.v._); break; + case 4: INTEL_SAMPLE_POS_4X(u.v._); break; + case 8: INTEL_SAMPLE_POS_8X(u.v._); break; + case 16: INTEL_SAMPLE_POS_16X(u.v._); break; + default: unreachable("invalid sample count"); + } + + out_value[0] = u.a.x[sample_index]; + out_value[1] = u.a.y[sample_index]; +} + +/** + * Destroy a context, freeing any associated memory. + */ +static void +crocus_destroy_context(struct pipe_context *ctx) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + if (ctx->stream_uploader) + u_upload_destroy(ctx->stream_uploader); + + if (ice->blitter) + util_blitter_destroy(ice->blitter); + screen->vtbl.destroy_state(ice); + crocus_destroy_program_cache(ice); + u_upload_destroy(ice->query_buffer_uploader); + + crocus_bo_unreference(ice->workaround_bo); + + slab_destroy_child(&ice->transfer_pool); + + crocus_batch_free(&ice->batches[CROCUS_BATCH_RENDER]); + if (ice->batches[CROCUS_BATCH_COMPUTE].ice) + crocus_batch_free(&ice->batches[CROCUS_BATCH_COMPUTE]); + + ralloc_free(ice); +} + +#define genX_call(devinfo, func, ...) \ + switch ((devinfo)->verx10) { \ + case 75: \ + gfx75_##func(__VA_ARGS__); \ + break; \ + case 70: \ + gfx7_##func(__VA_ARGS__); \ + break; \ + case 60: \ + gfx6_##func(__VA_ARGS__); \ + break; \ + case 50: \ + gfx5_##func(__VA_ARGS__); \ + break; \ + case 45: \ + gfx45_##func(__VA_ARGS__); \ + break; \ + case 40: \ + gfx4_##func(__VA_ARGS__); \ + break; \ + default: \ + unreachable("Unknown hardware generation"); \ + } + +/** + * Create a context. + * + * This is where each context begins. 
+ */ +struct pipe_context * +crocus_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags) +{ + struct crocus_screen *screen = (struct crocus_screen*)pscreen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_context *ice = rzalloc(NULL, struct crocus_context); + + if (!ice) + return NULL; + + struct pipe_context *ctx = &ice->ctx; + + ctx->screen = pscreen; + ctx->priv = priv; + + ctx->stream_uploader = u_upload_create_default(ctx); + if (!ctx->stream_uploader) { + free(ctx); + return NULL; + } + ctx->const_uploader = ctx->stream_uploader; + + ctx->destroy = crocus_destroy_context; + ctx->set_debug_callback = crocus_set_debug_callback; + ctx->set_device_reset_callback = crocus_set_device_reset_callback; + ctx->get_device_reset_status = crocus_get_device_reset_status; + ctx->get_sample_position = crocus_get_sample_position; + + ice->shaders.urb_size = devinfo->urb.size; + + crocus_init_context_fence_functions(ctx); + crocus_init_blit_functions(ctx); + crocus_init_clear_functions(ctx); + crocus_init_program_functions(ctx); + crocus_init_resource_functions(ctx); + crocus_init_flush_functions(ctx); + + crocus_init_program_cache(ice); + + slab_create_child(&ice->transfer_pool, &screen->transfer_pool); + + ice->query_buffer_uploader = + u_upload_create(ctx, 4096, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, + 0); + + ice->workaround_bo = + crocus_bo_alloc(screen->bufmgr, "workaround", 4096); + if (!ice->workaround_bo) + return NULL; + + if (!crocus_init_identifier_bo(ice)) + return NULL; + + genX_call(devinfo, init_state, ice); + genX_call(devinfo, init_blorp, ice); + genX_call(devinfo, init_query, ice); + + ice->blitter = util_blitter_create(&ice->ctx); + if (ice->blitter == NULL) + return NULL; + int priority = 0; + if (flags & PIPE_CONTEXT_HIGH_PRIORITY) + priority = INTEL_CONTEXT_HIGH_PRIORITY; + if (flags & PIPE_CONTEXT_LOW_PRIORITY) + priority = INTEL_CONTEXT_LOW_PRIORITY; + + ice->batch_count = devinfo->ver >= 7 ? CROCUS_BATCH_COUNT : 1; + for (int i = 0; i < ice->batch_count; i++) { + crocus_init_batch(ice, (enum crocus_batch_name) i, + priority); + } + + ice->urb.size = devinfo->urb.size; + screen->vtbl.init_render_context(&ice->batches[CROCUS_BATCH_RENDER]); + if (ice->batch_count > 1) + screen->vtbl.init_compute_context(&ice->batches[CROCUS_BATCH_COMPUTE]); + + return ctx; +} + +bool +crocus_sw_check_cond_render(struct crocus_context *ice) +{ + struct crocus_query *q = ice->condition.query; + union pipe_query_result result; + + bool wait = ice->condition.mode == PIPE_RENDER_COND_WAIT || + ice->condition.mode == PIPE_RENDER_COND_BY_REGION_WAIT; + if (!q) + return true; + + bool ret = ice->ctx.get_query_result(&ice->ctx, (void *)q, wait, &result); + if (!ret) + return true; + + return ice->condition.condition ? 
result.u64 == 0 : result.u64 != 0; +} diff --git a/src/gallium/drivers/crocus/crocus_context.h b/src/gallium/drivers/crocus/crocus_context.h new file mode 100644 index 00000000000..8d6e43d80f6 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_context.h @@ -0,0 +1,955 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef CROCUS_CONTEXT_H +#define CROCUS_CONTEXT_H + +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/u_debug.h" +#include "intel/blorp/blorp.h" +#include "intel/dev/intel_debug.h" +#include "intel/compiler/brw_compiler.h" +#include "crocus_batch.h" +#include "crocus_fence.h" +#include "crocus_resource.h" +#include "crocus_screen.h" +#include "util/u_blitter.h" + +struct crocus_bo; +struct crocus_context; +struct blorp_batch; +struct blorp_params; + +#define CROCUS_MAX_TEXTURE_BUFFER_SIZE (1 << 27) +#define CROCUS_MAX_TEXTURE_SAMPLERS 32 +/* CROCUS_MAX_ABOS and CROCUS_MAX_SSBOS must be the same. */ +#define CROCUS_MAX_ABOS 16 +#define CROCUS_MAX_SSBOS 16 +#define CROCUS_MAX_VIEWPORTS 16 +#define CROCUS_MAX_CLIP_PLANES 8 + +enum crocus_param_domain { + BRW_PARAM_DOMAIN_BUILTIN = 0, + BRW_PARAM_DOMAIN_IMAGE, +}; + +enum { + DRI_CONF_BO_REUSE_DISABLED, + DRI_CONF_BO_REUSE_ALL +}; + +#define BRW_PARAM(domain, val) (BRW_PARAM_DOMAIN_##domain << 24 | (val)) +#define BRW_PARAM_DOMAIN(param) ((uint32_t)(param) >> 24) +#define BRW_PARAM_VALUE(param) ((uint32_t)(param) & 0x00ffffff) +#define BRW_PARAM_IMAGE(idx, offset) BRW_PARAM(IMAGE, ((idx) << 8) | (offset)) +#define BRW_PARAM_IMAGE_IDX(value) (BRW_PARAM_VALUE(value) >> 8) +#define BRW_PARAM_IMAGE_OFFSET(value)(BRW_PARAM_VALUE(value) & 0xf) + +/** + * Dirty flags. When state changes, we flag some combination of these + * to indicate that particular GPU commands need to be re-emitted. + * + * Each bit typically corresponds to a single 3DSTATE_* command packet, but + * in rare cases they map to a group of related packets that need to be + * emitted together. + * + * See crocus_upload_render_state(). 
+ */ +#define CROCUS_DIRTY_COLOR_CALC_STATE (1ull << 0) +#define CROCUS_DIRTY_POLYGON_STIPPLE (1ull << 1) +#define CROCUS_DIRTY_CC_VIEWPORT (1ull << 2) +#define CROCUS_DIRTY_SF_CL_VIEWPORT (1ull << 3) +#define CROCUS_DIRTY_RASTER (1ull << 4) +#define CROCUS_DIRTY_CLIP (1ull << 5) +#define CROCUS_DIRTY_LINE_STIPPLE (1ull << 6) +#define CROCUS_DIRTY_VERTEX_ELEMENTS (1ull << 7) +#define CROCUS_DIRTY_VERTEX_BUFFERS (1ull << 8) +#define CROCUS_DIRTY_DRAWING_RECTANGLE (1ull << 9) +#define CROCUS_DIRTY_GEN6_URB (1ull << 10) +#define CROCUS_DIRTY_DEPTH_BUFFER (1ull << 11) +#define CROCUS_DIRTY_WM (1ull << 12) +#define CROCUS_DIRTY_SO_DECL_LIST (1ull << 13) +#define CROCUS_DIRTY_STREAMOUT (1ull << 14) +#define CROCUS_DIRTY_GEN4_CONSTANT_COLOR (1ull << 15) +#define CROCUS_DIRTY_GEN4_CURBE (1ull << 16) +#define CROCUS_DIRTY_GEN4_URB_FENCE (1ull << 17) +#define CROCUS_DIRTY_GEN5_PIPELINED_POINTERS (1ull << 18) +#define CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS (1ull << 19) +#define CROCUS_DIRTY_GEN6_BLEND_STATE (1ull << 20) +#define CROCUS_DIRTY_GEN6_SCISSOR_RECT (1ull << 21) +#define CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL (1ull << 22) +#define CROCUS_DIRTY_GEN6_MULTISAMPLE (1ull << 23) +#define CROCUS_DIRTY_GEN6_SAMPLE_MASK (1ull << 24) +#define CROCUS_DIRTY_GEN7_SBE (1ull << 25) +#define CROCUS_DIRTY_GEN7_L3_CONFIG (1ull << 26) +#define CROCUS_DIRTY_GEN7_SO_BUFFERS (1ull << 27) +#define CROCUS_DIRTY_GEN75_VF (1ull << 28) +#define CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES (1ull << 29) +#define CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES (1ull << 30) +#define CROCUS_DIRTY_VF_STATISTICS (1ull << 31) +#define CROCUS_DIRTY_GEN4_CLIP_PROG (1ull << 32) +#define CROCUS_DIRTY_GEN4_SF_PROG (1ull << 33) +#define CROCUS_DIRTY_GEN4_FF_GS_PROG (1ull << 34) +#define CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS (1ull << 35) +#define CROCUS_DIRTY_GEN6_SVBI (1ull << 36) + +#define CROCUS_ALL_DIRTY_FOR_COMPUTE (CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES) + +#define CROCUS_ALL_DIRTY_FOR_RENDER (~CROCUS_ALL_DIRTY_FOR_COMPUTE) + +/** + * Per-stage dirty flags. When state changes, we flag some combination of + * these to indicate that particular GPU commands need to be re-emitted. + * Unlike the IRIS_DIRTY_* flags these are shader stage-specific and can be + * indexed by shifting the mask by the shader stage index. + * + * See crocus_upload_render_state(). 
+ */ +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS (1ull << 0) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS (1ull << 1) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES (1ull << 2) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS (1ull << 3) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS (1ull << 4) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS (1ull << 5) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_VS (1ull << 6) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_TCS (1ull << 7) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_TES (1ull << 8) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_GS (1ull << 9) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_FS (1ull << 10) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_CS (1ull << 11) +#define CROCUS_STAGE_DIRTY_VS (1ull << 12) +#define CROCUS_STAGE_DIRTY_TCS (1ull << 13) +#define CROCUS_STAGE_DIRTY_TES (1ull << 14) +#define CROCUS_STAGE_DIRTY_GS (1ull << 15) +#define CROCUS_STAGE_DIRTY_FS (1ull << 16) +#define CROCUS_STAGE_DIRTY_CS (1ull << 17) +#define CROCUS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS 18 +#define CROCUS_STAGE_DIRTY_CONSTANTS_VS (1ull << 18) +#define CROCUS_STAGE_DIRTY_CONSTANTS_TCS (1ull << 19) +#define CROCUS_STAGE_DIRTY_CONSTANTS_TES (1ull << 20) +#define CROCUS_STAGE_DIRTY_CONSTANTS_GS (1ull << 21) +#define CROCUS_STAGE_DIRTY_CONSTANTS_FS (1ull << 22) +#define CROCUS_STAGE_DIRTY_CONSTANTS_CS (1ull << 23) +#define CROCUS_STAGE_DIRTY_BINDINGS_VS (1ull << 24) +#define CROCUS_STAGE_DIRTY_BINDINGS_TCS (1ull << 25) +#define CROCUS_STAGE_DIRTY_BINDINGS_TES (1ull << 26) +#define CROCUS_STAGE_DIRTY_BINDINGS_GS (1ull << 27) +#define CROCUS_STAGE_DIRTY_BINDINGS_FS (1ull << 28) +#define CROCUS_STAGE_DIRTY_BINDINGS_CS (1ull << 29) + +#define CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE (CROCUS_STAGE_DIRTY_CS | \ + CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS | \ + CROCUS_STAGE_DIRTY_UNCOMPILED_CS | \ + CROCUS_STAGE_DIRTY_CONSTANTS_CS | \ + CROCUS_STAGE_DIRTY_BINDINGS_CS) + +#define CROCUS_ALL_STAGE_DIRTY_FOR_RENDER (~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE) + +#define CROCUS_ALL_STAGE_DIRTY_BINDINGS (CROCUS_STAGE_DIRTY_BINDINGS_VS | \ + CROCUS_STAGE_DIRTY_BINDINGS_TCS | \ + CROCUS_STAGE_DIRTY_BINDINGS_TES | \ + CROCUS_STAGE_DIRTY_BINDINGS_GS | \ + CROCUS_STAGE_DIRTY_BINDINGS_FS | \ + CROCUS_STAGE_DIRTY_BINDINGS_CS) + +#define CROCUS_RENDER_STAGE_DIRTY_CONSTANTS (CROCUS_STAGE_DIRTY_CONSTANTS_VS | \ + CROCUS_STAGE_DIRTY_CONSTANTS_TCS | \ + CROCUS_STAGE_DIRTY_CONSTANTS_TES | \ + CROCUS_STAGE_DIRTY_CONSTANTS_GS | \ + CROCUS_STAGE_DIRTY_CONSTANTS_FS) + +/** + * Non-orthogonal state (NOS) dependency flags. + * + * Shader programs may depend on non-orthogonal state. These flags are + * used to indicate that a shader's key depends on the state provided by + * a certain Gallium CSO. Changing any CSOs marked as a dependency will + * cause the driver to re-compute the shader key, possibly triggering a + * shader recompile. + */ +enum crocus_nos_dep { + CROCUS_NOS_FRAMEBUFFER, + CROCUS_NOS_DEPTH_STENCIL_ALPHA, + CROCUS_NOS_RASTERIZER, + CROCUS_NOS_BLEND, + CROCUS_NOS_LAST_VUE_MAP, + CROCUS_NOS_TEXTURES, + CROCUS_NOS_VERTEX_ELEMENTS, + CROCUS_NOS_COUNT, +}; + +struct crocus_depth_stencil_alpha_state; + +/** + * Cache IDs for the in-memory program cache (ice->shaders.cache). 
+ */ +enum crocus_program_cache_id { + CROCUS_CACHE_VS = MESA_SHADER_VERTEX, + CROCUS_CACHE_TCS = MESA_SHADER_TESS_CTRL, + CROCUS_CACHE_TES = MESA_SHADER_TESS_EVAL, + CROCUS_CACHE_GS = MESA_SHADER_GEOMETRY, + CROCUS_CACHE_FS = MESA_SHADER_FRAGMENT, + CROCUS_CACHE_CS = MESA_SHADER_COMPUTE, + CROCUS_CACHE_BLORP, + CROCUS_CACHE_SF, + CROCUS_CACHE_CLIP, + CROCUS_CACHE_FF_GS, +}; + +/** @{ + * + * Defines for PIPE_CONTROL operations, which trigger cache flushes, + * synchronization, pipelined memory writes, and so on. + * + * The bits here are not the actual hardware values. The actual fields + * move between various generations, so we just have flags for each + * potential operation, and use genxml to encode the actual packet. + */ +enum pipe_control_flags +{ + PIPE_CONTROL_FLUSH_LLC = (1 << 1), + PIPE_CONTROL_LRI_POST_SYNC_OP = (1 << 2), + PIPE_CONTROL_STORE_DATA_INDEX = (1 << 3), + PIPE_CONTROL_CS_STALL = (1 << 4), + PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET = (1 << 5), + PIPE_CONTROL_SYNC_GFDT = (1 << 6), + PIPE_CONTROL_TLB_INVALIDATE = (1 << 7), + PIPE_CONTROL_MEDIA_STATE_CLEAR = (1 << 8), + PIPE_CONTROL_WRITE_IMMEDIATE = (1 << 9), + PIPE_CONTROL_WRITE_DEPTH_COUNT = (1 << 10), + PIPE_CONTROL_WRITE_TIMESTAMP = (1 << 11), + PIPE_CONTROL_DEPTH_STALL = (1 << 12), + PIPE_CONTROL_RENDER_TARGET_FLUSH = (1 << 13), + PIPE_CONTROL_INSTRUCTION_INVALIDATE = (1 << 14), + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE = (1 << 15), + PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE = (1 << 16), + PIPE_CONTROL_NOTIFY_ENABLE = (1 << 17), + PIPE_CONTROL_FLUSH_ENABLE = (1 << 18), + PIPE_CONTROL_DATA_CACHE_FLUSH = (1 << 19), + PIPE_CONTROL_VF_CACHE_INVALIDATE = (1 << 20), + PIPE_CONTROL_CONST_CACHE_INVALIDATE = (1 << 21), + PIPE_CONTROL_STATE_CACHE_INVALIDATE = (1 << 22), + PIPE_CONTROL_STALL_AT_SCOREBOARD = (1 << 23), + PIPE_CONTROL_DEPTH_CACHE_FLUSH = (1 << 24), + PIPE_CONTROL_TILE_CACHE_FLUSH = (1 << 25), +}; + +#define PIPE_CONTROL_CACHE_FLUSH_BITS \ + (PIPE_CONTROL_DEPTH_CACHE_FLUSH | \ + PIPE_CONTROL_DATA_CACHE_FLUSH | \ + PIPE_CONTROL_RENDER_TARGET_FLUSH) + +#define PIPE_CONTROL_CACHE_INVALIDATE_BITS \ + (PIPE_CONTROL_STATE_CACHE_INVALIDATE | \ + PIPE_CONTROL_CONST_CACHE_INVALIDATE | \ + PIPE_CONTROL_VF_CACHE_INVALIDATE | \ + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \ + PIPE_CONTROL_INSTRUCTION_INVALIDATE) + +enum crocus_predicate_state { + /* The first two states are used if we can determine whether to draw + * without having to look at the values in the query object buffer. This + * will happen if there is no conditional render in progress, if the query + * object is already completed or if something else has already added + * samples to the preliminary result. + */ + CROCUS_PREDICATE_STATE_RENDER, + CROCUS_PREDICATE_STATE_DONT_RENDER, + + /* In this case whether to draw or not depends on the result of an + * MI_PREDICATE command so the predicate enable bit needs to be checked. + */ + CROCUS_PREDICATE_STATE_USE_BIT, + /* In this case, either MI_PREDICATE doesn't exist or we lack the + * necessary kernel features to use it. Stall for the query result. + */ + CROCUS_PREDICATE_STATE_STALL_FOR_QUERY, +}; + +/** @} */ + +/** + * An uncompiled, API-facing shader. This is the Gallium CSO for shaders. + * It primarily contains the NIR for the shader. + * + * Each API-facing shader can be compiled into multiple shader variants, + * based on non-orthogonal state dependencies, recorded in the shader key. + * + * See crocus_compiled_shader, which represents a compiled shader variant. 
+ */ +struct crocus_uncompiled_shader { + struct nir_shader *nir; + + struct pipe_stream_output_info stream_output; + + /* A SHA1 of the serialized NIR for the disk cache. */ + unsigned char nir_sha1[20]; + + unsigned program_id; + + /** Bitfield of (1 << CROCUS_NOS_*) flags. */ + unsigned nos; + + /** Have any shader variants been compiled yet? */ + bool compiled_once; + + /** Should we use ALT mode for math? Useful for ARB programs. */ + bool use_alt_mode; + + bool needs_edge_flag; + + /** Constant data scraped from the shader by nir_opt_large_constants */ + struct pipe_resource *const_data; + + /** Surface state for const_data */ + struct crocus_state_ref const_data_state; +}; + +enum crocus_surface_group { + CROCUS_SURFACE_GROUP_RENDER_TARGET, + CROCUS_SURFACE_GROUP_RENDER_TARGET_READ, + CROCUS_SURFACE_GROUP_SOL, + CROCUS_SURFACE_GROUP_CS_WORK_GROUPS, + CROCUS_SURFACE_GROUP_TEXTURE, + CROCUS_SURFACE_GROUP_TEXTURE_GATHER, + CROCUS_SURFACE_GROUP_IMAGE, + CROCUS_SURFACE_GROUP_UBO, + CROCUS_SURFACE_GROUP_SSBO, + + CROCUS_SURFACE_GROUP_COUNT, +}; + +enum { + /* Invalid value for a binding table index. */ + CROCUS_SURFACE_NOT_USED = 0xa0a0a0a0, +}; + +struct crocus_binding_table { + uint32_t size_bytes; + + /** Number of surfaces in each group, before compacting. */ + uint32_t sizes[CROCUS_SURFACE_GROUP_COUNT]; + + /** Initial offset of each group. */ + uint32_t offsets[CROCUS_SURFACE_GROUP_COUNT]; + + /** Mask of surfaces used in each group. */ + uint64_t used_mask[CROCUS_SURFACE_GROUP_COUNT]; +}; + +/** + * A compiled shader variant, containing a pointer to the GPU assembly, + * as well as program data and other packets needed by state upload. + * + * There can be several crocus_compiled_shader variants per API-level shader + * (crocus_uncompiled_shader), due to state-based recompiles (brw_*_prog_key). + */ +struct crocus_compiled_shader { + /** Reference to the uploaded assembly. */ + uint32_t offset; + + /* asm size in map */ + uint32_t map_size; + + /** The program data (owned by the program cache hash table) */ + struct brw_stage_prog_data *prog_data; + uint32_t prog_data_size; + + /** A list of system values to be uploaded as uniforms. */ + enum brw_param_builtin *system_values; + unsigned num_system_values; + + /** Number of constbufs expected by the shader. */ + unsigned num_cbufs; + + /** + * Derived 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets + * (the VUE-based information for transform feedback outputs). + */ + uint32_t *streamout; + + struct crocus_binding_table bt; + + uint32_t bind_bo_offset; + uint32_t surf_offset[128];//TODO +}; + +/** + * API context state that is replicated per shader stage. + */ +struct crocus_shader_state { + /** Uniform Buffers */ + struct pipe_constant_buffer constbufs[PIPE_MAX_CONSTANT_BUFFERS]; + + bool sysvals_need_upload; + + /** Shader Storage Buffers */ + struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS]; + + /** Shader Storage Images (image load store) */ + struct crocus_image_view image[PIPE_MAX_SHADER_IMAGES]; + + struct crocus_sampler_state *samplers[CROCUS_MAX_TEXTURE_SAMPLERS]; + struct crocus_sampler_view *textures[CROCUS_MAX_TEXTURE_SAMPLERS]; + + /** Bitfield of which constant buffers are bound (non-null). */ + uint32_t bound_cbufs; + + /** Bitfield of which image views are bound (non-null). */ + uint32_t bound_image_views; + + /** Bitfield of which sampler views are bound (non-null). */ + uint32_t bound_sampler_views; + + /** Bitfield of which shader storage buffers are bound (non-null). 
*/ + uint32_t bound_ssbos; + + /** Bitfield of which shader storage buffers are writable. */ + uint32_t writable_ssbos; + + uint32_t sampler_offset; +}; + +/** + * The API context (derived from pipe_context). + * + * Most driver state is tracked here. + */ +struct crocus_context { + struct pipe_context ctx; + + /** A debug callback for KHR_debug output. */ + struct pipe_debug_callback dbg; + + /** A device reset status callback for notifying that the GPU is hosed. */ + struct pipe_device_reset_callback reset; + + /** Slab allocator for crocus_transfer_map objects. */ + struct slab_child_pool transfer_pool; + + struct blorp_context blorp; + + int batch_count; + struct crocus_batch batches[CROCUS_BATCH_COUNT]; + + struct u_upload_mgr *query_buffer_uploader; + + struct blitter_context *blitter; + + struct { + struct { + /** + * Either the value of BaseVertex for indexed draw calls or the value + * of the argument for non-indexed draw calls. + */ + int firstvertex; + int baseinstance; + } params; + + /** + * Are the above values the ones stored in the draw_params buffer? + * If so, we can compare them against new values to see if anything + * changed. If not, we need to assume they changed. + */ + bool params_valid; + + /** + * Resource and offset that stores draw_parameters from the indirect + * buffer or to the buffer that stures the previous values for non + * indirect draws. + */ + struct crocus_state_ref draw_params; + + struct { + /** + * The value of DrawID. This always comes in from it's own vertex + * buffer since it's not part of the indirect draw parameters. + */ + int drawid; + + /** + * Stores if an indexed or non-indexed draw (~0/0). Useful to + * calculate BaseVertex as an AND of firstvertex and is_indexed_draw. + */ + int is_indexed_draw; + } derived_params; + + /** + * Resource and offset used for GL_ARB_shader_draw_parameters which + * contains parameters that are not present in the indirect buffer as + * drawid and is_indexed_draw. They will go in their own vertex element. + */ + struct crocus_state_ref derived_draw_params; + } draw; + + struct { + struct crocus_uncompiled_shader *uncompiled[MESA_SHADER_STAGES]; + struct crocus_compiled_shader *prog[MESA_SHADER_STAGES]; + struct brw_vue_map *last_vue_map; + + struct crocus_bo *cache_bo; + uint32_t cache_next_offset; + void *cache_bo_map; + struct hash_table *cache; + + unsigned urb_size; + + /* gen 4/5 clip/sf progs */ + struct crocus_compiled_shader *clip_prog; + struct crocus_compiled_shader *sf_prog; + /* gen4/5 prims, gen6 streamout */ + struct crocus_compiled_shader *ff_gs_prog; + uint32_t clip_offset; + uint32_t sf_offset; + uint32_t wm_offset; + uint32_t vs_offset; + uint32_t gs_offset; + uint32_t cc_offset; + + /** Is a GS or TES outputting points or lines? */ + bool output_topology_is_points_or_lines; + + /* Track last VS URB entry size */ + unsigned last_vs_entry_size; + + /** + * Scratch buffers for various sizes and stages. + * + * Indexed by the "Per-Thread Scratch Space" field's 4-bit encoding, + * and shader stage. 
+ */ + struct crocus_bo *scratch_bos[1 << 4][MESA_SHADER_STAGES]; + } shaders; + + struct { + struct crocus_query *query; + bool condition; + enum pipe_render_cond_flag mode; + } condition; + + struct intel_perf_context *perf_ctx; + + struct { + uint64_t dirty; + uint64_t stage_dirty; + uint64_t stage_dirty_for_nos[CROCUS_NOS_COUNT]; + + unsigned num_viewports; + unsigned sample_mask; + struct crocus_blend_state *cso_blend; + struct crocus_rasterizer_state *cso_rast; + struct crocus_depth_stencil_alpha_state *cso_zsa; + struct crocus_vertex_element_state *cso_vertex_elements; + struct pipe_blend_color blend_color; + struct pipe_poly_stipple poly_stipple; + struct pipe_viewport_state viewports[CROCUS_MAX_VIEWPORTS]; + struct pipe_scissor_state scissors[CROCUS_MAX_VIEWPORTS]; + struct pipe_stencil_ref stencil_ref; + struct pipe_framebuffer_state framebuffer; + struct pipe_clip_state clip_planes; + + float default_outer_level[4]; + float default_inner_level[2]; + + /** Bitfield of which vertex buffers are bound (non-null). */ + uint32_t bound_vertex_buffers; + struct pipe_vertex_buffer vertex_buffers[16]; + uint32_t vb_end[16]; + + bool primitive_restart; + unsigned cut_index; + enum pipe_prim_type prim_mode:8; + bool prim_is_points_or_lines; + uint8_t vertices_per_patch; + + bool window_space_position; + + /** The last compute group size */ + uint32_t last_block[3]; + + /** The last compute grid size */ + uint32_t last_grid[3]; + /** Reference to the BO containing the compute grid size */ + struct crocus_state_ref grid_size; + + /** + * Array of aux usages for drawing, altered to account for any + * self-dependencies from resources bound for sampling and rendering. + */ + enum isl_aux_usage draw_aux_usage[BRW_MAX_DRAW_BUFFERS]; + + /** Aux usage of the fb's depth buffer (which may or may not exist). */ + enum isl_aux_usage hiz_usage; + + /** Bitfield of whether color blending is enabled for RT[i] */ + uint8_t blend_enables; + + /** Are depth writes enabled? (Depth buffer may or may not exist.) */ + bool depth_writes_enabled; + + /** Are stencil writes enabled? (Stencil buffer may or may not exist.) */ + bool stencil_writes_enabled; + + /** GenX-specific current state */ + struct crocus_genx_state *genx; + + struct crocus_shader_state shaders[MESA_SHADER_STAGES]; + + /** Do vertex shader uses shader draw parameters ? */ + bool vs_uses_draw_params; + bool vs_uses_derived_draw_params; + bool vs_needs_sgvs_element; + bool vs_uses_vertexid; + bool vs_uses_instanceid; + + /** Do vertex shader uses edge flag ? */ + bool vs_needs_edge_flag; + + struct pipe_stream_output_target *so_target[PIPE_MAX_SO_BUFFERS]; + bool streamout_active; + int so_targets; + + bool statistics_counters_enabled; + + /** Current conditional rendering mode */ + enum crocus_predicate_state predicate; + bool predicate_supported; + + /** + * Query BO with a MI_PREDICATE_RESULT snapshot calculated on the + * render context that needs to be uploaded to the compute context. + */ + struct crocus_bo *compute_predicate; + + /** Is a PIPE_QUERY_PRIMITIVES_GENERATED query active? */ + bool prims_generated_query_active; + + /** 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets */ + uint32_t *streamout; + + /** + * Resources containing streamed state which our render context + * currently points to. Used to re-add these to the validation + * list when we start a new batch and haven't resubmitted commands. 
+ */ + struct { + struct pipe_resource *res; + uint32_t offset; + uint32_t size; + uint32_t index_size; + bool prim_restart; + } index_buffer; + + uint32_t sf_vp_address; + uint32_t clip_vp_address; + uint32_t cc_vp_address; + + uint32_t stats_wm; + float global_depth_offset_clamp; + + uint32_t last_xfb_verts_per_prim; + uint64_t svbi; + } state; + + /* BRW_NEW_URB_ALLOCATIONS: + */ + struct { + uint32_t vsize; /* vertex size plus header in urb registers */ + uint32_t gsize; /* GS output size in urb registers */ + uint32_t hsize; /* Tessellation control output size in urb registers */ + uint32_t dsize; /* Tessellation evaluation output size in urb registers */ + uint32_t csize; /* constant buffer size in urb registers */ + uint32_t sfsize; /* setup data size in urb registers */ + + bool constrained; + + uint32_t nr_vs_entries; + uint32_t nr_hs_entries; + uint32_t nr_ds_entries; + uint32_t nr_gs_entries; + uint32_t nr_clip_entries; + uint32_t nr_sf_entries; + uint32_t nr_cs_entries; + + uint32_t vs_start; + uint32_t hs_start; + uint32_t ds_start; + uint32_t gs_start; + uint32_t clip_start; + uint32_t sf_start; + uint32_t cs_start; + /** + * URB size in the current configuration. The units this is expressed + * in are somewhat inconsistent, see intel_device_info::urb::size. + * + * FINISHME: Represent the URB size consistently in KB on all platforms. + */ + uint32_t size; + + /* True if the most recently sent _3DSTATE_URB message allocated + * URB space for the GS. + */ + bool gs_present; + + /* True if the most recently sent _3DSTATE_URB message allocated + * URB space for the HS and DS. + */ + bool tess_present; + } urb; + + /* GEN4/5 curbe */ + struct { + unsigned wm_start; + unsigned wm_size; + unsigned clip_start; + unsigned clip_size; + unsigned vs_start; + unsigned vs_size; + unsigned total_size; + + struct crocus_resource *curbe_res; + unsigned curbe_offset; + } curbe; + + /** + * A buffer containing a marker + description of the driver. This buffer is + * added to all execbufs syscalls so that we can identify the driver that + * generated a hang by looking at the content of the buffer in the error + * state. It is also used for hardware workarounds that require scratch + * writes or reads from some unimportant memory. To avoid overriding the + * debug data, use the workaround_address field for workarounds. + */ + struct crocus_bo *workaround_bo; + unsigned workaround_offset; +}; + +#define perf_debug(dbg, ...) 
do { \ + if (INTEL_DEBUG & DEBUG_PERF) \ + dbg_printf(__VA_ARGS__); \ + if (unlikely(dbg)) \ + pipe_debug_message(dbg, PERF_INFO, __VA_ARGS__); \ +} while(0) + + +struct pipe_context * +crocus_create_context(struct pipe_screen *screen, void *priv, unsigned flags); + +void crocus_lost_context_state(struct crocus_batch *batch); + +void crocus_init_blit_functions(struct pipe_context *ctx); +void crocus_init_clear_functions(struct pipe_context *ctx); +void crocus_init_program_functions(struct pipe_context *ctx); +void crocus_init_resource_functions(struct pipe_context *ctx); +bool crocus_update_compiled_shaders(struct crocus_context *ice); +void crocus_update_compiled_compute_shader(struct crocus_context *ice); +void crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data, + unsigned threads, uint32_t *dst); + + +/* crocus_blit.c */ +enum crocus_blitter_op +{ + CROCUS_SAVE_TEXTURES = 1, + CROCUS_SAVE_FRAMEBUFFER = 2, + CROCUS_SAVE_FRAGMENT_STATE = 4, + CROCUS_DISABLE_RENDER_COND = 8, +}; +void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond); + +void crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl, + struct isl_device *isl_dev, + struct blorp_surf *surf, + struct pipe_resource *p_res, + enum isl_aux_usage aux_usage, + unsigned level, + bool is_render_target); +void crocus_copy_region(struct blorp_context *blorp, + struct crocus_batch *batch, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box); + +/* crocus_draw.c */ +void crocus_draw_vbo(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws); +void crocus_launch_grid(struct pipe_context *, const struct pipe_grid_info *); + +/* crocus_pipe_control.c */ + +void crocus_emit_pipe_control_flush(struct crocus_batch *batch, + const char *reason, uint32_t flags); +void crocus_emit_pipe_control_write(struct crocus_batch *batch, + const char *reason, uint32_t flags, + struct crocus_bo *bo, uint32_t offset, + uint64_t imm); +void crocus_emit_mi_flush(struct crocus_batch *batch); +void crocus_emit_depth_stall_flushes(struct crocus_batch *batch); +void crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch); +void crocus_emit_end_of_pipe_sync(struct crocus_batch *batch, + const char *reason, uint32_t flags); +void crocus_flush_all_caches(struct crocus_batch *batch); + +#define crocus_handle_always_flush_cache(batch) \ + if (unlikely(batch->screen->driconf.always_flush_cache)) \ + crocus_flush_all_caches(batch); + +void crocus_init_flush_functions(struct pipe_context *ctx); + +/* crocus_program.c */ +const struct shader_info *crocus_get_shader_info(const struct crocus_context *ice, + gl_shader_stage stage); +struct crocus_bo *crocus_get_scratch_space(struct crocus_context *ice, + unsigned per_thread_scratch, + gl_shader_stage stage); +uint32_t crocus_group_index_to_bti(const struct crocus_binding_table *bt, + enum crocus_surface_group group, + uint32_t index); +uint32_t crocus_bti_to_group_index(const struct crocus_binding_table *bt, + enum crocus_surface_group group, + uint32_t bti); + +/* crocus_disk_cache.c */ + +void crocus_disk_cache_store(struct disk_cache *cache, + const struct crocus_uncompiled_shader *ish, + const struct crocus_compiled_shader *shader, + void *map, + const void *prog_key, + 
uint32_t prog_key_size); +struct crocus_compiled_shader * +crocus_disk_cache_retrieve(struct crocus_context *ice, + const struct crocus_uncompiled_shader *ish, + const void *prog_key, + uint32_t prog_key_size); + +/* crocus_program_cache.c */ + +void crocus_init_program_cache(struct crocus_context *ice); +void crocus_destroy_program_cache(struct crocus_context *ice); +void crocus_print_program_cache(struct crocus_context *ice); +struct crocus_compiled_shader *crocus_find_cached_shader(struct crocus_context *ice, + enum crocus_program_cache_id, + uint32_t key_size, + const void *key); +struct crocus_compiled_shader *crocus_upload_shader(struct crocus_context *ice, + enum crocus_program_cache_id, + uint32_t key_size, + const void *key, + const void *assembly, + uint32_t asm_size, + struct brw_stage_prog_data *, + uint32_t prog_data_size, + uint32_t *streamout, + enum brw_param_builtin *sysv, + unsigned num_system_values, + unsigned num_cbufs, + const struct crocus_binding_table *bt); +const void *crocus_find_previous_compile(const struct crocus_context *ice, + enum crocus_program_cache_id cache_id, + unsigned program_string_id); +bool crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch, + const void *key, + uint32_t key_size, + uint32_t *kernel_out, + void *prog_data_out); +bool crocus_blorp_upload_shader(struct blorp_batch *blorp_batch, + uint32_t stage, + const void *key, uint32_t key_size, + const void *kernel, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + uint32_t *kernel_out, + void *prog_data_out); + +/* crocus_resolve.c */ + +void crocus_predraw_resolve_inputs(struct crocus_context *ice, + struct crocus_batch *batch, + bool *draw_aux_buffer_disabled, + gl_shader_stage stage, + bool consider_framebuffer); +void crocus_predraw_resolve_framebuffer(struct crocus_context *ice, + struct crocus_batch *batch, + bool *draw_aux_buffer_disabled); +void crocus_postdraw_update_resolve_tracking(struct crocus_context *ice, + struct crocus_batch *batch); +void crocus_cache_sets_clear(struct crocus_batch *batch); +void crocus_flush_depth_and_render_caches(struct crocus_batch *batch); +void crocus_cache_flush_for_read(struct crocus_batch *batch, struct crocus_bo *bo); +void crocus_cache_flush_for_render(struct crocus_batch *batch, + struct crocus_bo *bo, + enum isl_format format, + enum isl_aux_usage aux_usage); +void crocus_render_cache_add_bo(struct crocus_batch *batch, + struct crocus_bo *bo, + enum isl_format format, + enum isl_aux_usage aux_usage); +void crocus_cache_flush_for_depth(struct crocus_batch *batch, struct crocus_bo *bo); +void crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo); +int crocus_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info); +int crocus_get_driver_query_group_info(struct pipe_screen *pscreen, + unsigned index, + struct pipe_driver_query_group_info *info); + +struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ctx); + +bool crocus_sw_check_cond_render(struct crocus_context *ice); +static inline bool crocus_check_conditional_render(struct crocus_context *ice) +{ + if (ice->state.predicate == CROCUS_PREDICATE_STATE_STALL_FOR_QUERY) + return crocus_sw_check_cond_render(ice); + return ice->state.predicate != CROCUS_PREDICATE_STATE_DONT_RENDER; +} + +#ifdef genX +# include "crocus_genx_protos.h" +#else +# define genX(x) gfx4_##x +# include "crocus_genx_protos.h" +# undef genX +# define genX(x) gfx45_##x +# include 
"crocus_genx_protos.h" +# undef genX +# define genX(x) gfx5_##x +# include "crocus_genx_protos.h" +# undef genX +# define genX(x) gfx6_##x +# include "crocus_genx_protos.h" +# undef genX +# define genX(x) gfx7_##x +# include "crocus_genx_protos.h" +# undef genX +# define genX(x) gfx75_##x +# include "crocus_genx_protos.h" +# undef genX +#endif + +#endif diff --git a/src/gallium/drivers/crocus/crocus_defines.h b/src/gallium/drivers/crocus/crocus_defines.h new file mode 100644 index 00000000000..a634d0746b0 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_defines.h @@ -0,0 +1,58 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef CROCUS_DEFINES_H +#define CROCUS_DEFINES_H + +/** + * @file crocus_defines.h + * + * Random hardware #defines that we're not using GENXML for. + */ + +#define MI_PREDICATE (0xC << 23) +# define MI_PREDICATE_LOADOP_KEEP (0 << 6) +# define MI_PREDICATE_LOADOP_LOAD (2 << 6) +# define MI_PREDICATE_LOADOP_LOADINV (3 << 6) +# define MI_PREDICATE_COMBINEOP_SET (0 << 3) +# define MI_PREDICATE_COMBINEOP_AND (1 << 3) +# define MI_PREDICATE_COMBINEOP_OR (2 << 3) +# define MI_PREDICATE_COMBINEOP_XOR (3 << 3) +# define MI_PREDICATE_COMPAREOP_TRUE (0 << 0) +# define MI_PREDICATE_COMPAREOP_FALSE (1 << 0) +# define MI_PREDICATE_COMPAREOP_SRCS_EQUAL (2 << 0) +# define MI_PREDICATE_COMPAREOP_DELTAS_EQUAL (3 << 0) + +/* Predicate registers */ +#define MI_PREDICATE_SRC0 0x2400 +#define MI_PREDICATE_SRC1 0x2408 +#define MI_PREDICATE_DATA 0x2410 +#define MI_PREDICATE_RESULT 0x2418 +#define MI_PREDICATE_RESULT_1 0x241C +#define MI_PREDICATE_RESULT_2 0x2214 + +#define CS_GPR(n) (0x2600 + (n) * 8) + +/* The number of bits in our TIMESTAMP queries. 
*/ +#define TIMESTAMP_BITS 36 + +#endif diff --git a/src/gallium/drivers/crocus/crocus_disk_cache.c b/src/gallium/drivers/crocus/crocus_disk_cache.c new file mode 100644 index 00000000000..c84d043fbc8 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_disk_cache.c @@ -0,0 +1,263 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_disk_cache.c + * + * Functions for interacting with the on-disk shader cache. + */ + +#include +#include +#include +#include + +#include "compiler/nir/nir.h" +#include "util/blob.h" +#include "util/build_id.h" +#include "util/disk_cache.h" +#include "util/mesa-sha1.h" + +#include "crocus_context.h" + +static bool debug = false; + +/** + * Compute a disk cache key for the given uncompiled shader and NOS key. + */ +static void +crocus_disk_cache_compute_key(struct disk_cache *cache, + const struct crocus_uncompiled_shader *ish, + const void *orig_prog_key, + uint32_t prog_key_size, + cache_key cache_key) +{ + /* Create a copy of the program key with program_string_id zeroed out. + * It's essentially random data which we don't want to include in our + * hashing and comparisons. We'll set a proper value on a cache hit. + */ + union brw_any_prog_key prog_key; + memcpy(&prog_key, orig_prog_key, prog_key_size); + prog_key.base.program_string_id = 0; + + uint8_t data[sizeof(prog_key) + sizeof(ish->nir_sha1)]; + uint32_t data_size = prog_key_size + sizeof(ish->nir_sha1); + + memcpy(data, ish->nir_sha1, sizeof(ish->nir_sha1)); + memcpy(data + sizeof(ish->nir_sha1), &prog_key, prog_key_size); + + disk_cache_compute_key(cache, data, data_size, cache_key); +} + +/** + * Store the given compiled shader in the disk cache. + * + * This should only be called on newly compiled shaders. No checking is + * done to prevent repeated stores of the same shader. 
+ */ +void +crocus_disk_cache_store(struct disk_cache *cache, + const struct crocus_uncompiled_shader *ish, + const struct crocus_compiled_shader *shader, + void *map, + const void *prog_key, + uint32_t prog_key_size) +{ +#ifdef ENABLE_SHADER_CACHE + if (!cache) + return; + + gl_shader_stage stage = ish->nir->info.stage; + const struct brw_stage_prog_data *prog_data = shader->prog_data; + + cache_key cache_key; + crocus_disk_cache_compute_key(cache, ish, prog_key, prog_key_size, cache_key); + + if (debug) { + char sha1[41]; + _mesa_sha1_format(sha1, cache_key); + fprintf(stderr, "[mesa disk cache] storing %s\n", sha1); + } + + struct blob blob; + blob_init(&blob); + + /* We write the following data to the cache blob: + * + * 1. Prog data (must come first because it has the assembly size) + * 2. Assembly code + * 3. Number of entries in the system value array + * 4. System value array + * 5. Legacy param array (only used for compute workgroup ID) + * 6. Binding table + */ + blob_write_bytes(&blob, shader->prog_data, brw_prog_data_size(stage)); + blob_write_bytes(&blob, map + shader->offset, shader->prog_data->program_size); + blob_write_bytes(&blob, &shader->num_system_values, sizeof(unsigned)); + blob_write_bytes(&blob, shader->system_values, + shader->num_system_values * sizeof(enum brw_param_builtin)); + blob_write_bytes(&blob, prog_data->param, + prog_data->nr_params * sizeof(uint32_t)); + blob_write_bytes(&blob, &shader->bt, sizeof(shader->bt)); + + disk_cache_put(cache, cache_key, blob.data, blob.size, NULL); + blob_finish(&blob); +#endif +} + +/** + * Search for a compiled shader in the disk cache. If found, upload it + * to the in-memory program cache so we can use it. + */ +struct crocus_compiled_shader * +crocus_disk_cache_retrieve(struct crocus_context *ice, + const struct crocus_uncompiled_shader *ish, + const void *prog_key, + uint32_t key_size) +{ +#ifdef ENABLE_SHADER_CACHE + struct crocus_screen *screen = (void *) ice->ctx.screen; + struct disk_cache *cache = screen->disk_cache; + gl_shader_stage stage = ish->nir->info.stage; + + if (!cache) + return NULL; + + cache_key cache_key; + crocus_disk_cache_compute_key(cache, ish, prog_key, key_size, cache_key); + + if (debug) { + char sha1[41]; + _mesa_sha1_format(sha1, cache_key); + fprintf(stderr, "[mesa disk cache] retrieving %s: ", sha1); + } + + size_t size; + void *buffer = disk_cache_get(screen->disk_cache, cache_key, &size); + + if (debug) + fprintf(stderr, "%s\n", buffer ? 
"found" : "missing"); + + if (!buffer) + return NULL; + + const uint32_t prog_data_size = brw_prog_data_size(stage); + + struct brw_stage_prog_data *prog_data = ralloc_size(NULL, prog_data_size); + const void *assembly; + uint32_t num_system_values; + uint32_t *system_values = NULL; + uint32_t *so_decls = NULL; + + struct blob_reader blob; + blob_reader_init(&blob, buffer, size); + blob_copy_bytes(&blob, prog_data, prog_data_size); + assembly = blob_read_bytes(&blob, prog_data->program_size); + num_system_values = blob_read_uint32(&blob); + if (num_system_values) { + system_values = + ralloc_array(NULL, enum brw_param_builtin, num_system_values); + blob_copy_bytes(&blob, system_values, + num_system_values * sizeof(enum brw_param_builtin)); + } + + prog_data->param = NULL; + prog_data->pull_param = NULL; + assert(prog_data->nr_pull_params == 0); + + if (prog_data->nr_params) { + prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params); + blob_copy_bytes(&blob, prog_data->param, + prog_data->nr_params * sizeof(uint32_t)); + } + + struct crocus_binding_table bt; + blob_copy_bytes(&blob, &bt, sizeof(bt)); + + if ((stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_EVAL || + stage == MESA_SHADER_GEOMETRY) && screen->devinfo.ver > 6) { + struct brw_vue_prog_data *vue_prog_data = (void *) prog_data; + so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output, + &vue_prog_data->vue_map); + } + + /* System values and uniforms are stored in constant buffer 0, the + * user-facing UBOs are indexed by one. So if any constant buffer is + * needed, the constant buffer 0 will be needed, so account for it. + */ + unsigned num_cbufs = ish->nir->info.num_ubos; + + if (num_cbufs || ish->nir->num_uniforms) + num_cbufs++; + + if (num_system_values) + num_cbufs++; + + /* Upload our newly read shader to the in-memory program cache and + * return it to the caller. + */ + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, stage, key_size, prog_key, assembly, + prog_data->program_size, + prog_data, prog_data_size, so_decls, system_values, + num_system_values, num_cbufs, &bt); + + free(buffer); + + return shader; +#else + return NULL; +#endif +} + +/** + * Initialize the on-disk shader cache. 
+ */ +void +crocus_disk_cache_init(struct crocus_screen *screen) +{ +#ifdef ENABLE_SHADER_CACHE + if (INTEL_DEBUG & DEBUG_DISK_CACHE_DISABLE_MASK) + return; + + /* array length = print length + nul char + 1 extra to verify it's unused */ + char renderer[13]; + UNUSED int len = + snprintf(renderer, sizeof(renderer), "crocus_%04x", screen->pci_id); + assert(len == sizeof(renderer) - 2); + + const struct build_id_note *note = + build_id_find_nhdr_for_addr(crocus_disk_cache_init); + assert(note && build_id_length(note) == 20); /* sha1 */ + + const uint8_t *id_sha1 = build_id_data(note); + assert(id_sha1); + + char timestamp[41]; + _mesa_sha1_format(timestamp, id_sha1); + + const uint64_t driver_flags = + brw_get_compiler_config_value(screen->compiler); + screen->disk_cache = disk_cache_create(renderer, timestamp, driver_flags); +#endif +} diff --git a/src/gallium/drivers/crocus/crocus_draw.c b/src/gallium/drivers/crocus/crocus_draw.c new file mode 100644 index 00000000000..119c5571ae1 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_draw.c @@ -0,0 +1,511 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_draw.c + * + * The main driver hooks for drawing and launching compute shaders. + */ + +#include +#include +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_draw.h" +#include "util/u_inlines.h" +#include "util/u_transfer.h" +#include "util/u_upload_mgr.h" +#include "intel/compiler/brw_compiler.h" +#include "intel/compiler/brw_eu_defines.h" +#include "crocus_context.h" +#include "crocus_defines.h" +#include "util/u_prim_restart.h" +#include "indices/u_primconvert.h" +#include "util/u_prim.h" + +static bool +prim_is_points_or_lines(enum pipe_prim_type mode) +{ + /* We don't need to worry about adjacency - it can only be used with + * geometry shaders, and we don't care about this info when GS is on. 
+ */ + return mode == PIPE_PRIM_POINTS || + mode == PIPE_PRIM_LINES || + mode == PIPE_PRIM_LINE_LOOP || + mode == PIPE_PRIM_LINE_STRIP; +} + +static bool +can_cut_index_handle_restart_index(struct crocus_context *ice, + const struct pipe_draw_info *draw) +{ + switch (draw->index_size) { + case 1: + return draw->restart_index == 0xff; + case 2: + return draw->restart_index == 0xffff; + case 4: + return draw->restart_index == 0xffffffff; + default: + unreachable("illegal index size\n"); + } + + return false; +} + +static bool +can_cut_index_handle_prim(struct crocus_context *ice, + const struct pipe_draw_info *draw) +{ + struct crocus_screen *screen = (struct crocus_screen*)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + /* Haswell can do it all. */ + if (devinfo->is_haswell) + return true; + + if (!can_cut_index_handle_restart_index(ice, draw)) + return false; + + switch (draw->mode) { + case PIPE_PRIM_POINTS: + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_LINES_ADJACENCY: + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + case PIPE_PRIM_TRIANGLES_ADJACENCY: + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + return true; + default: + break; + } + return false; +} + +/** + * Record the current primitive mode and restart information, flagging + * related packets as dirty if necessary. + * + * This must be called before updating compiled shaders, because the patch + * information informs the TCS key. + */ +static void +crocus_update_draw_info(struct crocus_context *ice, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + enum pipe_prim_type mode = info->mode; + + if (screen->devinfo.ver < 6) { + /* Slight optimization to avoid the GS program when not needed: + */ + struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice); + if (mode == PIPE_PRIM_QUAD_STRIP && !rs_state->flatshade && + rs_state->fill_front == PIPE_POLYGON_MODE_FILL && + rs_state->fill_back == PIPE_POLYGON_MODE_FILL) + mode = PIPE_PRIM_TRIANGLE_STRIP; + if (mode == PIPE_PRIM_QUADS && + draw->count == 4 && + !rs_state->flatshade && + rs_state->fill_front == PIPE_POLYGON_MODE_FILL && + rs_state->fill_back == PIPE_POLYGON_MODE_FILL) + mode = PIPE_PRIM_TRIANGLE_FAN; + } + + if (ice->state.prim_mode != mode) { + ice->state.prim_mode = mode; + + if (screen->devinfo.ver < 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG; + if (screen->devinfo.ver <= 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG; + + if (screen->devinfo.ver >= 7) + ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE; + + /* For XY Clip enables */ + bool points_or_lines = prim_is_points_or_lines(mode); + if (points_or_lines != ice->state.prim_is_points_or_lines) { + ice->state.prim_is_points_or_lines = points_or_lines; + ice->state.dirty |= CROCUS_DIRTY_CLIP; + } + } + + if (info->mode == PIPE_PRIM_PATCHES && + ice->state.vertices_per_patch != info->vertices_per_patch) { + ice->state.vertices_per_patch = info->vertices_per_patch; + + /* This is needed for key->input_vertices */ + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_TCS; + + /* Flag constants dirty for gl_PatchVerticesIn if needed. 
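+       * gl_PatchVerticesIn reaches the TCS as a system value in constant
+       * buffer 0, so a change in patch size means those constants have to
+       * be re-uploaded.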
*/ + const struct shader_info *tcs_info = + crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL); + if (tcs_info && + BITSET_TEST(tcs_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS; + ice->state.shaders[MESA_SHADER_TESS_CTRL].sysvals_need_upload = true; + } + } + + const unsigned cut_index = info->primitive_restart ? info->restart_index : + ice->state.cut_index; + if (ice->state.primitive_restart != info->primitive_restart || + ice->state.cut_index != cut_index) { + if (screen->devinfo.is_haswell) + ice->state.dirty |= CROCUS_DIRTY_GEN75_VF; + ice->state.primitive_restart = info->primitive_restart; + ice->state.cut_index = info->restart_index; + } +} + +/** + * Update shader draw parameters, flagging VF packets as dirty if necessary. + */ +static void +crocus_update_draw_parameters(struct crocus_context *ice, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draw) +{ + bool changed = false; + + if (ice->state.vs_uses_draw_params) { + struct crocus_state_ref *draw_params = &ice->draw.draw_params; + + if (indirect && indirect->buffer) { + pipe_resource_reference(&draw_params->res, indirect->buffer); + draw_params->offset = + indirect->offset + (info->index_size ? 12 : 8); + + changed = true; + ice->draw.params_valid = false; + } else { + int firstvertex = info->index_size ? draw->index_bias : draw->start; + + if (!ice->draw.params_valid || + ice->draw.params.firstvertex != firstvertex || + ice->draw.params.baseinstance != info->start_instance) { + + changed = true; + ice->draw.params.firstvertex = firstvertex; + ice->draw.params.baseinstance = info->start_instance; + ice->draw.params_valid = true; + + u_upload_data(ice->ctx.stream_uploader, 0, + sizeof(ice->draw.params), 4, &ice->draw.params, + &draw_params->offset, &draw_params->res); + } + } + } + + if (ice->state.vs_uses_derived_draw_params) { + struct crocus_state_ref *derived_params = &ice->draw.derived_draw_params; + int is_indexed_draw = info->index_size ? 
-1 : 0; + + if (ice->draw.derived_params.drawid != drawid_offset || + ice->draw.derived_params.is_indexed_draw != is_indexed_draw) { + + changed = true; + ice->draw.derived_params.drawid = drawid_offset; + ice->draw.derived_params.is_indexed_draw = is_indexed_draw; + + u_upload_data(ice->ctx.stream_uploader, 0, + sizeof(ice->draw.derived_params), 4, + &ice->draw.derived_params, &derived_params->offset, + &derived_params->res); + } + } + + if (changed) { + ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS | + CROCUS_DIRTY_VERTEX_ELEMENTS; + } +} + +static void +crocus_indirect_draw_vbo(struct crocus_context *ice, + const struct pipe_draw_info *dinfo, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *dindirect, + const struct pipe_draw_start_count_bias *draws) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + struct pipe_draw_info info = *dinfo; + struct pipe_draw_indirect_info indirect = *dindirect; + const struct intel_device_info *devinfo = &batch->screen->devinfo; + + if (devinfo->is_haswell && indirect.indirect_draw_count && + ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) { + /* Upload MI_PREDICATE_RESULT to GPR15.*/ + screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT); + } + + uint64_t orig_dirty = ice->state.dirty; + uint64_t orig_stage_dirty = ice->state.stage_dirty; + + for (int i = 0; i < indirect.draw_count; i++) { + crocus_batch_maybe_flush(batch, 1500); + crocus_require_statebuffer_space(batch, 2400); + + crocus_update_draw_parameters(ice, &info, drawid_offset + i, &indirect, draws); + + screen->vtbl.upload_render_state(ice, batch, &info, drawid_offset + i, &indirect, draws); + + ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER; + ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER; + + indirect.offset += indirect.stride; + } + + if (devinfo->is_haswell && indirect.indirect_draw_count && + ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) { + /* Restore MI_PREDICATE_RESULT. */ + screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15)); + } + + /* Put this back for post-draw resolves, we'll clear it again after. */ + ice->state.dirty = orig_dirty; + ice->state.stage_dirty = orig_stage_dirty; +} + +static void +crocus_simple_draw_vbo(struct crocus_context *ice, + const struct pipe_draw_info *draw, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *sc) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + + crocus_batch_maybe_flush(batch, 1500); + crocus_require_statebuffer_space(batch, 2400); + + crocus_update_draw_parameters(ice, draw, drawid_offset, indirect, sc); + + screen->vtbl.upload_render_state(ice, batch, draw, drawid_offset, indirect, sc); +} + +static void +crocus_draw_vbo_get_vertex_count(struct pipe_context *ctx, + const struct pipe_draw_info *info_in, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect) +{ + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + struct pipe_draw_info info = *info_in; + struct pipe_draw_start_count_bias draw; + + uint32_t val = screen->vtbl.get_so_offset(indirect->count_from_stream_output); + + draw.start = 0; + draw.count = val; + ctx->draw_vbo(ctx, &info, drawid_offset, NULL, &draw, 1); +} + +/** + * The pipe->draw_vbo() driver hook. Performs a draw on the GPU. 
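+ *
+ * Roughly: multi-draws are split via util_draw_multi(), primitive-restart
+ * cases the hardware can't handle fall back to
+ * util_draw_vbo_without_prim_restart(), pre-draw resolves and flushes are
+ * emitted, and then either the indirect or the simple path uploads the
+ * render state for the actual draw.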
+ */ +void +crocus_draw_vbo(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + if (num_draws > 1) { + util_draw_multi(ctx, info, drawid_offset, indirect, draws, num_draws); + return; + } + + if (!indirect && (!draws[0].count || !info->instance_count)) + return; + + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_screen *screen = (struct crocus_screen*)ice->ctx.screen; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + + if (!crocus_check_conditional_render(ice)) + return; + + if (info->primitive_restart && !can_cut_index_handle_prim(ice, info)) { + util_draw_vbo_without_prim_restart(ctx, info, drawid_offset, + indirect, draws); + return; + } + + if (indirect && indirect->count_from_stream_output && + !screen->devinfo.is_haswell) { + crocus_draw_vbo_get_vertex_count(ctx, info, drawid_offset, indirect); + return; + } + + /** + * The hardware is capable of removing dangling vertices on its own; however, + * prior to Gen6, we sometimes convert quads into trifans (and quad strips + * into tristrips), since pre-Gen6 hardware requires a GS to render quads. + * This function manually trims dangling vertices from a draw call involving + * quads so that those dangling vertices won't get drawn when we convert to + * trifans/tristrips. + */ + if (screen->devinfo.ver < 6) { + if (info->mode == PIPE_PRIM_QUADS || info->mode == PIPE_PRIM_QUAD_STRIP) { + bool trim = u_trim_pipe_prim(info->mode, (unsigned *)&draws[0].count); + if (!trim) + return; + } + } + + /* We can't safely re-emit 3DSTATE_SO_BUFFERS because it may zero the + * write offsets, changing the behavior. + */ + if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) { + ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER & ~CROCUS_DIRTY_GEN7_SO_BUFFERS; + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER; + } + + /* Emit Sandybridge workaround flushes on every primitive, for safety. 
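+    * (This is the gen6 "post-sync non-zero" PIPE_CONTROL workaround; see
+    * crocus_emit_post_sync_nonzero_flush.)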
*/ + if (screen->devinfo.ver == 6) + crocus_emit_post_sync_nonzero_flush(batch); + + crocus_update_draw_info(ice, info, draws); + + if (!crocus_update_compiled_shaders(ice)) + return; + + if (ice->state.dirty & CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES) { + bool draw_aux_buffer_disabled[BRW_MAX_DRAW_BUFFERS] = { }; + for (gl_shader_stage stage = 0; stage < MESA_SHADER_COMPUTE; stage++) { + if (ice->shaders.prog[stage]) + crocus_predraw_resolve_inputs(ice, batch, draw_aux_buffer_disabled, + stage, true); + } + crocus_predraw_resolve_framebuffer(ice, batch, draw_aux_buffer_disabled); + } + + crocus_handle_always_flush_cache(batch); + + if (indirect && indirect->buffer) + crocus_indirect_draw_vbo(ice, info, drawid_offset, indirect, draws); + else + crocus_simple_draw_vbo(ice, info, drawid_offset, indirect, draws); + + crocus_handle_always_flush_cache(batch); + + crocus_postdraw_update_resolve_tracking(ice, batch); + + ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER; + ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER; +} + +static void +crocus_update_grid_size_resource(struct crocus_context *ice, + const struct pipe_grid_info *grid) +{ + struct crocus_state_ref *grid_ref = &ice->state.grid_size; + const struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_COMPUTE]; + bool grid_needs_surface = shader->bt.used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS]; + + if (grid->indirect) { + pipe_resource_reference(&grid_ref->res, grid->indirect); + grid_ref->offset = grid->indirect_offset; + + /* Zero out the grid size so that the next non-indirect grid launch will + * re-upload it properly. + */ + memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid)); + } else if (memcmp(ice->state.last_grid, grid->grid, sizeof(grid->grid)) != 0) { + memcpy(ice->state.last_grid, grid->grid, sizeof(grid->grid)); + u_upload_data(ice->ctx.const_uploader, 0, sizeof(grid->grid), 4, + grid->grid, &grid_ref->offset, &grid_ref->res); + } + + /* Skip surface upload if we don't need it or we already have one */ + if (!grid_needs_surface) + return; + + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_CS; +} + + +void +crocus_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_COMPUTE]; + struct crocus_screen *screen = batch->screen; + + if (!crocus_check_conditional_render(ice)) + return; + + if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) { + ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE; + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE; + } + + /* We can't do resolves on the compute engine, so awkwardly, we have to + * do them on the render batch... 
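+    * Presumably the usual cross-batch dependency tracking then orders that
+    * render work ahead of this compute dispatch.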
+ */ + if (ice->state.dirty & CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES) { + crocus_predraw_resolve_inputs(ice, &ice->batches[CROCUS_BATCH_RENDER], NULL, + MESA_SHADER_COMPUTE, false); + } + + crocus_batch_maybe_flush(batch, 1500); + crocus_require_statebuffer_space(batch, 2500); + crocus_update_compiled_compute_shader(ice); + + if (memcmp(ice->state.last_block, grid->block, sizeof(grid->block)) != 0) { + memcpy(ice->state.last_block, grid->block, sizeof(grid->block)); + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS; + ice->state.shaders[MESA_SHADER_COMPUTE].sysvals_need_upload = true; + } + + crocus_update_grid_size_resource(ice, grid); + + if (ice->state.compute_predicate) { + screen->vtbl.emit_compute_predicate(batch); + ice->state.compute_predicate = NULL; + } + + crocus_handle_always_flush_cache(batch); + + screen->vtbl.upload_compute_state(ice, batch, grid); + + crocus_handle_always_flush_cache(batch); + + ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_COMPUTE; + ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE; + + /* Note: since compute shaders can't access the framebuffer, there's + * no need to call crocus_postdraw_update_resolve_tracking. + */ +} diff --git a/src/gallium/drivers/crocus/crocus_fence.c b/src/gallium/drivers/crocus/crocus_fence.c new file mode 100644 index 00000000000..fdff24b2dd4 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_fence.c @@ -0,0 +1,571 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_fence.c + * + * Fences for driver and IPC serialisation, scheduling and synchronisation. + */ + +#include "util/u_inlines.h" +#include "intel/common/intel_gem.h" + +#include "crocus_batch.h" +#include "crocus_bufmgr.h" +#include "crocus_context.h" +#include "crocus_fence.h" +#include "crocus_screen.h" + +static uint32_t +gem_syncobj_create(int fd, uint32_t flags) +{ + struct drm_syncobj_create args = { + .flags = flags, + }; + + intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_CREATE, &args); + + return args.handle; +} + +static void +gem_syncobj_destroy(int fd, uint32_t handle) +{ + struct drm_syncobj_destroy args = { + .handle = handle, + }; + + intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_DESTROY, &args); +} + +/** + * Make a new sync-point. 
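+ *
+ * This is a thin wrapper around DRM_IOCTL_SYNCOBJ_CREATE; the resulting
+ * handle is what crocus_batch_add_syncobj() later attaches to execbuf as
+ * an I915_EXEC_FENCE_WAIT or I915_EXEC_FENCE_SIGNAL fence.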
+ */ +struct crocus_syncobj * +crocus_create_syncobj(struct crocus_screen *screen) +{ + struct crocus_syncobj *syncobj = malloc(sizeof(*syncobj)); + + if (!syncobj) + return NULL; + + syncobj->handle = gem_syncobj_create(screen->fd, 0); + assert(syncobj->handle); + + pipe_reference_init(&syncobj->ref, 1); + + return syncobj; +} + +void +crocus_syncobj_destroy(struct crocus_screen *screen, + struct crocus_syncobj *syncobj) +{ + gem_syncobj_destroy(screen->fd, syncobj->handle); + free(syncobj); +} + +/** + * Add a sync-point to the batch, with the given flags. + * + * \p flags One of I915_EXEC_FENCE_WAIT or I915_EXEC_FENCE_SIGNAL. + */ +void +crocus_batch_add_syncobj(struct crocus_batch *batch, + struct crocus_syncobj *syncobj, unsigned flags) +{ + struct drm_i915_gem_exec_fence *fence = + util_dynarray_grow(&batch->exec_fences, struct drm_i915_gem_exec_fence, 1); + + *fence = (struct drm_i915_gem_exec_fence){ + .handle = syncobj->handle, + .flags = flags, + }; + + struct crocus_syncobj **store = + util_dynarray_grow(&batch->syncobjs, struct crocus_syncobj *, 1); + + *store = NULL; + crocus_syncobj_reference(batch->screen, store, syncobj); +} + +/** + * Walk through a batch's dependencies (any I915_EXEC_FENCE_WAIT syncobjs) + * and unreference any which have already passed. + * + * Sometimes the compute batch is seldom used, and accumulates references + * to stale render batches that are no longer of interest, so we can free + * those up. + */ +static void +clear_stale_syncobjs(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + + int n = util_dynarray_num_elements(&batch->syncobjs, struct crocus_syncobj *); + + assert(n == util_dynarray_num_elements(&batch->exec_fences, + struct drm_i915_gem_exec_fence)); + + /* Skip the first syncobj, as it's the signalling one. */ + for (int i = n - 1; i > 1; i--) { + struct crocus_syncobj **syncobj = + util_dynarray_element(&batch->syncobjs, struct crocus_syncobj *, i); + struct drm_i915_gem_exec_fence *fence = + util_dynarray_element(&batch->exec_fences, + struct drm_i915_gem_exec_fence, i); + assert(fence->flags & I915_EXEC_FENCE_WAIT); + + if (crocus_wait_syncobj(&screen->base, *syncobj, 0)) + continue; + + /* This sync object has already passed, there's no need to continue + * marking it as a dependency; we can stop holding on to the reference. + */ + crocus_syncobj_reference(screen, syncobj, NULL); + + /* Remove it from the lists; move the last element here. 
*/ + struct crocus_syncobj **nth_syncobj = + util_dynarray_pop_ptr(&batch->syncobjs, struct crocus_syncobj *); + struct drm_i915_gem_exec_fence *nth_fence = + util_dynarray_pop_ptr(&batch->exec_fences, + struct drm_i915_gem_exec_fence); + + if (syncobj != nth_syncobj) { + *syncobj = *nth_syncobj; + memcpy(fence, nth_fence, sizeof(*fence)); + } + } +} + +/* ------------------------------------------------------------------- */ + +struct pipe_fence_handle { + struct pipe_reference ref; + + struct pipe_context *unflushed_ctx; + + struct crocus_fine_fence *fine[CROCUS_BATCH_COUNT]; +}; + +static void +crocus_fence_destroy(struct pipe_screen *p_screen, + struct pipe_fence_handle *fence) +{ + struct crocus_screen *screen = (struct crocus_screen *)p_screen; + + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) + crocus_fine_fence_reference(screen, &fence->fine[i], NULL); + + free(fence); +} + +static void +crocus_fence_reference(struct pipe_screen *p_screen, + struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) +{ + if (pipe_reference(&(*dst)->ref, &src->ref)) + crocus_fence_destroy(p_screen, *dst); + + *dst = src; +} + +bool +crocus_wait_syncobj(struct pipe_screen *p_screen, + struct crocus_syncobj *syncobj, int64_t timeout_nsec) +{ + if (!syncobj) + return false; + + struct crocus_screen *screen = (struct crocus_screen *)p_screen; + struct drm_syncobj_wait args = { + .handles = (uintptr_t)&syncobj->handle, + .count_handles = 1, + .timeout_nsec = timeout_nsec, + }; + return intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args); +} + +static void +crocus_fence_flush(struct pipe_context *ctx, + struct pipe_fence_handle **out_fence, unsigned flags) +{ + struct crocus_screen *screen = (void *)ctx->screen; + struct crocus_context *ice = (struct crocus_context *)ctx; + + const bool deferred = flags & PIPE_FLUSH_DEFERRED; + + if (!deferred) { + for (unsigned i = 0; i < ice->batch_count; i++) + crocus_batch_flush(&ice->batches[i]); + } + + if (!out_fence) + return; + + struct pipe_fence_handle *fence = calloc(1, sizeof(*fence)); + if (!fence) + return; + + pipe_reference_init(&fence->ref, 1); + + if (deferred) + fence->unflushed_ctx = ctx; + + for (unsigned b = 0; b < ice->batch_count; b++) { + struct crocus_batch *batch = &ice->batches[b]; + + if (deferred && crocus_batch_bytes_used(batch) > 0) { + struct crocus_fine_fence *fine = + crocus_fine_fence_new(batch, CROCUS_FENCE_BOTTOM_OF_PIPE); + crocus_fine_fence_reference(screen, &fence->fine[b], fine); + crocus_fine_fence_reference(screen, &fine, NULL); + } else { + /* This batch has no commands queued up (perhaps we just flushed, + * or all the commands are on the other batch). Wait for the last + * syncobj on this engine - unless it's already finished by now. + */ + if (crocus_fine_fence_signaled(batch->last_fence)) + continue; + + crocus_fine_fence_reference(screen, &fence->fine[b], + batch->last_fence); + } + } + + crocus_fence_reference(ctx->screen, out_fence, NULL); + *out_fence = fence; +} + +static void +crocus_fence_await(struct pipe_context *ctx, struct pipe_fence_handle *fence) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + /* Unflushed fences from the same context are no-ops. 
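+    * The commands they refer to are still sitting in our own batches, so
+    * anything we record afterwards is already ordered behind them by
+    * submission order alone.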
*/ + if (ctx && ctx == fence->unflushed_ctx) + return; + + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + if (crocus_fine_fence_signaled(fine)) + continue; + + for (unsigned b = 0; b < ice->batch_count; b++) { + struct crocus_batch *batch = &ice->batches[b]; + + /* We're going to make any future work in this batch wait for our + * fence to have gone by. But any currently queued work doesn't + * need to wait. Flush the batch now, so it can happen sooner. + */ + crocus_batch_flush(batch); + + /* Before adding a new reference, clean out any stale ones. */ + clear_stale_syncobjs(batch); + + crocus_batch_add_syncobj(batch, fine->syncobj, I915_EXEC_FENCE_WAIT); + } + } +} + +#define NSEC_PER_SEC (1000 * USEC_PER_SEC) +#define USEC_PER_SEC (1000 * MSEC_PER_SEC) +#define MSEC_PER_SEC (1000) + +static uint64_t +gettime_ns(void) +{ + struct timespec current; + clock_gettime(CLOCK_MONOTONIC, ¤t); + return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec; +} + +static uint64_t +rel2abs(uint64_t timeout) +{ + if (timeout == 0) + return 0; + + uint64_t current_time = gettime_ns(); + uint64_t max_timeout = (uint64_t)INT64_MAX - current_time; + + timeout = MIN2(max_timeout, timeout); + + return current_time + timeout; +} + +static bool +crocus_fence_finish(struct pipe_screen *p_screen, struct pipe_context *ctx, + struct pipe_fence_handle *fence, uint64_t timeout) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_screen *screen = (struct crocus_screen *)p_screen; + + /* If we created the fence with PIPE_FLUSH_DEFERRED, we may not have + * flushed yet. Check if our syncobj is the current batch's signalling + * syncobj - if so, we haven't flushed and need to now. + * + * The Gallium docs mention that a flush will occur if \p ctx matches + * the context the fence was created with. It may be NULL, so we check + * that it matches first. + */ + if (ctx && ctx == fence->unflushed_ctx) { + for (unsigned i = 0; i < ice->batch_count; i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + if (crocus_fine_fence_signaled(fine)) + continue; + + if (fine->syncobj == crocus_batch_get_signal_syncobj(&ice->batches[i])) + crocus_batch_flush(&ice->batches[i]); + } + + /* The fence is no longer deferred. */ + fence->unflushed_ctx = NULL; + } + + unsigned int handle_count = 0; + uint32_t handles[ARRAY_SIZE(fence->fine)]; + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + if (crocus_fine_fence_signaled(fine)) + continue; + + handles[handle_count++] = fine->syncobj->handle; + } + + if (handle_count == 0) + return true; + + struct drm_syncobj_wait args = { + .handles = (uintptr_t)handles, + .count_handles = handle_count, + .timeout_nsec = rel2abs(timeout), + .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL + }; + if (fence->unflushed_ctx) { + /* This fence had a deferred flush from another context. We can't + * safely flush it here, because the context might be bound to a + * different thread, and poking at its internals wouldn't be safe. + * + * Instead, use the WAIT_FOR_SUBMIT flag to block and hope that + * another thread submits the work. + */ + args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT; + } + return intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args) == 0; +} + +#ifndef SYNC_IOC_MAGIC +/* duplicated from linux/sync_file.h to avoid build-time dependency + * on new (v4.7) kernel headers. 
Once distro's are mostly using + * something newer than v4.7 drop this and #include + * instead. + */ +struct sync_merge_data { + char name[32]; + __s32 fd2; + __s32 fence; + __u32 flags; + __u32 pad; +}; + +#define SYNC_IOC_MAGIC '>' +#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data) +#endif + +static int +sync_merge_fd(int sync_fd, int new_fd) +{ + if (sync_fd == -1) + return new_fd; + + if (new_fd == -1) + return sync_fd; + + struct sync_merge_data args = { + .name = "crocus fence", + .fd2 = new_fd, + .fence = -1, + }; + + intel_ioctl(sync_fd, SYNC_IOC_MERGE, &args); + close(new_fd); + close(sync_fd); + + return args.fence; +} + +static int +crocus_fence_get_fd(struct pipe_screen *p_screen, + struct pipe_fence_handle *fence) +{ + struct crocus_screen *screen = (struct crocus_screen *)p_screen; + int fd = -1; + + /* Deferred fences aren't supported. */ + if (fence->unflushed_ctx) + return -1; + + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + if (crocus_fine_fence_signaled(fine)) + continue; + + struct drm_syncobj_handle args = { + .handle = fine->syncobj->handle, + .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE, + .fd = -1, + }; + + intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args); + fd = sync_merge_fd(fd, args.fd); + } + + if (fd == -1) { + /* Our fence has no syncobj's recorded. This means that all of the + * batches had already completed, their syncobj's had been signalled, + * and so we didn't bother to record them. But we're being asked to + * export such a fence. So export a dummy already-signalled syncobj. + */ + struct drm_syncobj_handle args = { + .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE, + .fd = -1, + }; + + args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED); + intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args); + gem_syncobj_destroy(screen->fd, args.handle); + return args.fd; + } + + return fd; +} + +static void +crocus_fence_create_fd(struct pipe_context *ctx, struct pipe_fence_handle **out, + int fd, enum pipe_fd_type type) +{ + assert(type == PIPE_FD_TYPE_NATIVE_SYNC || type == PIPE_FD_TYPE_SYNCOBJ); + + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + struct drm_syncobj_handle args = { + .fd = fd, + }; + + if (type == PIPE_FD_TYPE_NATIVE_SYNC) { + args.flags = DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE; + args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED); + } + + if (intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args) == -1) { + fprintf(stderr, "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE failed: %s\n", + strerror(errno)); + if (type == PIPE_FD_TYPE_NATIVE_SYNC) + gem_syncobj_destroy(screen->fd, args.handle); + *out = NULL; + return; + } + + struct crocus_syncobj *syncobj = malloc(sizeof(*syncobj)); + if (!syncobj) { + *out = NULL; + return; + } + syncobj->handle = args.handle; + pipe_reference_init(&syncobj->ref, 1); + + struct crocus_fine_fence *fine = calloc(1, sizeof(*fine)); + if (!fine) { + free(syncobj); + *out = NULL; + return; + } + + static const uint32_t zero = 0; + + /* Fences work in terms of crocus_fine_fence, but we don't actually have a + * seqno for an imported fence. So, create a fake one which always + * returns as 'not signaled' so we fall back to using the sync object. 
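+    * (seqno is UINT32_MAX and map points at a static zero dword, so the
+    * "*map >= seqno" check in crocus_fine_fence_signaled() can never pass.)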
+ */ + fine->seqno = UINT32_MAX; + fine->map = &zero; + fine->syncobj = syncobj; + fine->flags = CROCUS_FENCE_END; + pipe_reference_init(&fine->reference, 1); + + struct pipe_fence_handle *fence = calloc(1, sizeof(*fence)); + if (!fence) { + free(fine); + free(syncobj); + *out = NULL; + return; + } + pipe_reference_init(&fence->ref, 1); + fence->fine[0] = fine; + + *out = fence; +} + +static void +crocus_fence_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + if (ctx == fence->unflushed_ctx) + return; + + for (unsigned b = 0; b < ice->batch_count; b++) { + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + /* already signaled fence skipped */ + if (crocus_fine_fence_signaled(fine)) + continue; + + ice->batches[b].contains_fence_signal = true; + crocus_batch_add_syncobj(&ice->batches[b], fine->syncobj, + I915_EXEC_FENCE_SIGNAL); + } + } +} + +void +crocus_init_screen_fence_functions(struct pipe_screen *screen) +{ + screen->fence_reference = crocus_fence_reference; + screen->fence_finish = crocus_fence_finish; + screen->fence_get_fd = crocus_fence_get_fd; +} + +void +crocus_init_context_fence_functions(struct pipe_context *ctx) +{ + ctx->flush = crocus_fence_flush; + ctx->create_fence_fd = crocus_fence_create_fd; + ctx->fence_server_sync = crocus_fence_await; + ctx->fence_server_signal = crocus_fence_signal; +} diff --git a/src/gallium/drivers/crocus/crocus_fence.h b/src/gallium/drivers/crocus/crocus_fence.h new file mode 100644 index 00000000000..ef2eff5259b --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_fence.h @@ -0,0 +1,60 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef CROCUS_FENCE_H +#define CROCUS_FENCE_H + +#include "util/u_inlines.h" + +struct pipe_screen; +struct crocus_screen; +struct crocus_batch; + +struct crocus_syncobj { + struct pipe_reference ref; + uint32_t handle; +}; + +void crocus_init_context_fence_functions(struct pipe_context *ctx); +void crocus_init_screen_fence_functions(struct pipe_screen *screen); + +struct crocus_syncobj *crocus_create_syncobj(struct crocus_screen *screen); +void crocus_syncobj_destroy(struct crocus_screen *, struct crocus_syncobj *); +void crocus_batch_add_syncobj(struct crocus_batch *batch, + struct crocus_syncobj *syncobj, + unsigned flags); +bool crocus_wait_syncobj(struct pipe_screen *screen, + struct crocus_syncobj *syncobj, + int64_t timeout_nsec); +static inline void +crocus_syncobj_reference(struct crocus_screen *screen, + struct crocus_syncobj **dst, + struct crocus_syncobj *src) +{ + if (pipe_reference(&(*dst)->ref, &src->ref)) + crocus_syncobj_destroy(screen, *dst); + + *dst = src; +} + +#endif diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.c b/src/gallium/drivers/crocus/crocus_fine_fence.c new file mode 100644 index 00000000000..9bb8a9673e3 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_fine_fence.c @@ -0,0 +1,85 @@ +#include "crocus_context.h" +#include "crocus_fine_fence.h" +#include "util/u_upload_mgr.h" + +static void +crocus_fine_fence_reset(struct crocus_batch *batch) +{ + u_upload_alloc(batch->fine_fences.uploader, + 0, sizeof(uint64_t), sizeof(uint64_t), + &batch->fine_fences.ref.offset, &batch->fine_fences.ref.res, + (void **)&batch->fine_fences.map); + WRITE_ONCE(*batch->fine_fences.map, 0); + batch->fine_fences.next++; +} + +void +crocus_fine_fence_init(struct crocus_batch *batch) +{ + batch->fine_fences.ref.res = NULL; + batch->fine_fences.next = 0; + if (batch_has_fine_fence(batch)) + crocus_fine_fence_reset(batch); +} + +static uint32_t +crocus_fine_fence_next(struct crocus_batch *batch) +{ + if (!batch_has_fine_fence(batch)) + return UINT32_MAX; + + uint32_t seqno = batch->fine_fences.next++; + + if (batch->fine_fences.next == 0) + crocus_fine_fence_reset(batch); + + return seqno; +} + +void +crocus_fine_fence_destroy(struct crocus_screen *screen, + struct crocus_fine_fence *fine) +{ + crocus_syncobj_reference(screen, &fine->syncobj, NULL); + pipe_resource_reference(&fine->ref.res, NULL); + free(fine); +} + +struct crocus_fine_fence * +crocus_fine_fence_new(struct crocus_batch *batch, unsigned flags) +{ + struct crocus_fine_fence *fine = calloc(1, sizeof(*fine)); + if (!fine) + return NULL; + + pipe_reference_init(&fine->reference, 1); + + fine->seqno = crocus_fine_fence_next(batch); + + crocus_syncobj_reference(batch->screen, &fine->syncobj, + crocus_batch_get_signal_syncobj(batch)); + + if (!batch_has_fine_fence(batch)) + return fine; + pipe_resource_reference(&fine->ref.res, batch->fine_fences.ref.res); + fine->ref.offset = batch->fine_fences.ref.offset; + fine->map = batch->fine_fences.map; + fine->flags = flags; + + unsigned pc; + if (flags & CROCUS_FENCE_TOP_OF_PIPE) { + pc = PIPE_CONTROL_WRITE_IMMEDIATE | PIPE_CONTROL_CS_STALL; + } else { + pc = PIPE_CONTROL_WRITE_IMMEDIATE | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_TILE_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DATA_CACHE_FLUSH; + } + crocus_emit_pipe_control_write(batch, "fence: fine", pc, + crocus_resource_bo(fine->ref.res), + fine->ref.offset, + fine->seqno); + + return fine; +} diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.h 
b/src/gallium/drivers/crocus/crocus_fine_fence.h new file mode 100644 index 00000000000..ad6f02a945a --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_fine_fence.h @@ -0,0 +1,109 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef CROCUS_FINE_FENCE_DOT_H +#define CROCUS_FINE_FENCE_DOT_H + +#include +#include + +#include "crocus_screen.h" +#include "crocus_resource.h" + +/** + * A lightweight sequence number fence. + * + * We emit PIPE_CONTROLs inside a batch (possibly in the middle) + * which update a monotonically increasing, 32-bit counter. We + * can then check if that moment has passed by either: + * + * 1. Checking on the CPU by snooping on the DWord via a coherent map + * + * 2. Blocking on the GPU with MI_SEMAPHORE_WAIT from a second batch + * (relying on mid-batch preemption to switch GPU execution to the + * batch that writes it). + */ +struct crocus_fine_fence { + struct pipe_reference reference; + + /** Buffer where the seqno lives */ + struct crocus_state_ref ref; + + /** Coherent CPU map of the buffer containing the seqno DWord. */ + const uint32_t *map; + + /** + * A drm_syncobj pointing which will be signaled at the end of the + * batch which writes this seqno. This can be used to block until + * the seqno has definitely passed (but may wait longer than necessary). + */ + struct crocus_syncobj *syncobj; + +#define CROCUS_FENCE_BOTTOM_OF_PIPE 0x0 /**< Written by bottom-of-pipe flush */ +#define CROCUS_FENCE_TOP_OF_PIPE 0x1 /**< Written by top-of-pipe flush */ +#define CROCUS_FENCE_END 0x2 /**< Written at the end of a batch */ + + /** Information about the type of flush involved (see CROCUS_FENCE_*) */ + uint32_t flags; + + /** + * Sequence number expected to be written by the flush we inserted + * when creating this fence. The crocus_fine_fence is 'signaled' when *@map + * (written by the flush on the GPU) is greater-than-or-equal to @seqno. 
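+    *
+    * For example, a fence minted while the counter was at 8 stores
+    * seqno = 8, and reads as signaled once the GPU has written 8 or any
+    * later value to the dword that @map points at.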
+ */ + uint32_t seqno; +}; + +void crocus_fine_fence_init(struct crocus_batch *batch); + +struct crocus_fine_fence *crocus_fine_fence_new(struct crocus_batch *batch, + unsigned flags); + +void crocus_fine_fence_destroy(struct crocus_screen *screen, + struct crocus_fine_fence *sq); + +static inline void +crocus_fine_fence_reference(struct crocus_screen *screen, + struct crocus_fine_fence **dst, + struct crocus_fine_fence *src) +{ + if (pipe_reference(&(*dst)->reference, &src->reference)) + crocus_fine_fence_destroy(screen, *dst); + + *dst = src; +} + +/** + * Return true if this seqno has passed. + * + * NULL is considered signaled. + */ +static inline bool +crocus_fine_fence_signaled(const struct crocus_fine_fence *sq) +{ + if (sq && !sq->map) + return false; + return !sq || (READ_ONCE(*sq->map) >= sq->seqno); +} + +#endif diff --git a/src/gallium/drivers/crocus/crocus_formats.c b/src/gallium/drivers/crocus/crocus_formats.c new file mode 100644 index 00000000000..31762643bdc --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_formats.c @@ -0,0 +1,576 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_formats.c + * + * Converts Gallium formats (PIPE_FORMAT_*) to hardware ones (ISL_FORMAT_*). + * Provides information about which formats support what features. + */ + +#include "util/bitscan.h" +#include "util/macros.h" +#include "util/format/u_format.h" + +#include "crocus_resource.h" +#include "crocus_screen.h" + +static enum isl_format +crocus_isl_format_for_pipe_format(enum pipe_format pf) +{ + static const enum isl_format table[PIPE_FORMAT_COUNT] = { + [0 ... 
PIPE_FORMAT_COUNT-1] = ISL_FORMAT_UNSUPPORTED, + + [PIPE_FORMAT_B8G8R8A8_UNORM] = ISL_FORMAT_B8G8R8A8_UNORM, + [PIPE_FORMAT_B8G8R8X8_UNORM] = ISL_FORMAT_B8G8R8X8_UNORM, + [PIPE_FORMAT_B5G5R5A1_UNORM] = ISL_FORMAT_B5G5R5A1_UNORM, + [PIPE_FORMAT_B4G4R4A4_UNORM] = ISL_FORMAT_B4G4R4A4_UNORM, + [PIPE_FORMAT_B5G6R5_UNORM] = ISL_FORMAT_B5G6R5_UNORM, + [PIPE_FORMAT_R10G10B10A2_UNORM] = ISL_FORMAT_R10G10B10A2_UNORM, + + [PIPE_FORMAT_Z16_UNORM] = ISL_FORMAT_R16_UNORM, + [PIPE_FORMAT_Z32_UNORM] = ISL_FORMAT_R32_UNORM, + [PIPE_FORMAT_Z32_FLOAT] = ISL_FORMAT_R32_FLOAT, + + /* We translate the combined depth/stencil formats to depth only here */ + [PIPE_FORMAT_Z24_UNORM_S8_UINT] = ISL_FORMAT_R24_UNORM_X8_TYPELESS, + [PIPE_FORMAT_Z24X8_UNORM] = ISL_FORMAT_R24_UNORM_X8_TYPELESS, + [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = ISL_FORMAT_R32_FLOAT, + + [PIPE_FORMAT_S8_UINT] = ISL_FORMAT_R8_UINT, + [PIPE_FORMAT_X24S8_UINT] = ISL_FORMAT_R8_UINT, + [PIPE_FORMAT_X32_S8X24_UINT] = ISL_FORMAT_R8_UINT, + + [PIPE_FORMAT_R64_FLOAT] = ISL_FORMAT_R64_FLOAT, + [PIPE_FORMAT_R64G64_FLOAT] = ISL_FORMAT_R64G64_FLOAT, + [PIPE_FORMAT_R64G64B64_FLOAT] = ISL_FORMAT_R64G64B64_FLOAT, + [PIPE_FORMAT_R64G64B64A64_FLOAT] = ISL_FORMAT_R64G64B64A64_FLOAT, + [PIPE_FORMAT_R32_FLOAT] = ISL_FORMAT_R32_FLOAT, + [PIPE_FORMAT_R32G32_FLOAT] = ISL_FORMAT_R32G32_FLOAT, + [PIPE_FORMAT_R32G32B32_FLOAT] = ISL_FORMAT_R32G32B32_FLOAT, + [PIPE_FORMAT_R32G32B32A32_FLOAT] = ISL_FORMAT_R32G32B32A32_FLOAT, + [PIPE_FORMAT_R32_UNORM] = ISL_FORMAT_R32_UNORM, + [PIPE_FORMAT_R32G32_UNORM] = ISL_FORMAT_R32G32_UNORM, + [PIPE_FORMAT_R32G32B32_UNORM] = ISL_FORMAT_R32G32B32_UNORM, + [PIPE_FORMAT_R32G32B32A32_UNORM] = ISL_FORMAT_R32G32B32A32_UNORM, + [PIPE_FORMAT_R32_USCALED] = ISL_FORMAT_R32_USCALED, + [PIPE_FORMAT_R32G32_USCALED] = ISL_FORMAT_R32G32_USCALED, + [PIPE_FORMAT_R32G32B32_USCALED] = ISL_FORMAT_R32G32B32_USCALED, + [PIPE_FORMAT_R32G32B32A32_USCALED] = ISL_FORMAT_R32G32B32A32_USCALED, + [PIPE_FORMAT_R32_SNORM] = ISL_FORMAT_R32_SNORM, + [PIPE_FORMAT_R32G32_SNORM] = ISL_FORMAT_R32G32_SNORM, + [PIPE_FORMAT_R32G32B32_SNORM] = ISL_FORMAT_R32G32B32_SNORM, + [PIPE_FORMAT_R32G32B32A32_SNORM] = ISL_FORMAT_R32G32B32A32_SNORM, + [PIPE_FORMAT_R32_SSCALED] = ISL_FORMAT_R32_SSCALED, + [PIPE_FORMAT_R32G32_SSCALED] = ISL_FORMAT_R32G32_SSCALED, + [PIPE_FORMAT_R32G32B32_SSCALED] = ISL_FORMAT_R32G32B32_SSCALED, + [PIPE_FORMAT_R32G32B32A32_SSCALED] = ISL_FORMAT_R32G32B32A32_SSCALED, + [PIPE_FORMAT_R16_UNORM] = ISL_FORMAT_R16_UNORM, + [PIPE_FORMAT_R16G16_UNORM] = ISL_FORMAT_R16G16_UNORM, + [PIPE_FORMAT_R16G16B16_UNORM] = ISL_FORMAT_R16G16B16_UNORM, + [PIPE_FORMAT_R16G16B16A16_UNORM] = ISL_FORMAT_R16G16B16A16_UNORM, + [PIPE_FORMAT_R16_USCALED] = ISL_FORMAT_R16_USCALED, + [PIPE_FORMAT_R16G16_USCALED] = ISL_FORMAT_R16G16_USCALED, + [PIPE_FORMAT_R16G16B16_USCALED] = ISL_FORMAT_R16G16B16_USCALED, + [PIPE_FORMAT_R16G16B16A16_USCALED] = ISL_FORMAT_R16G16B16A16_USCALED, + [PIPE_FORMAT_R16_SNORM] = ISL_FORMAT_R16_SNORM, + [PIPE_FORMAT_R16G16_SNORM] = ISL_FORMAT_R16G16_SNORM, + [PIPE_FORMAT_R16G16B16_SNORM] = ISL_FORMAT_R16G16B16_SNORM, + [PIPE_FORMAT_R16G16B16A16_SNORM] = ISL_FORMAT_R16G16B16A16_SNORM, + [PIPE_FORMAT_R16_SSCALED] = ISL_FORMAT_R16_SSCALED, + [PIPE_FORMAT_R16G16_SSCALED] = ISL_FORMAT_R16G16_SSCALED, + [PIPE_FORMAT_R16G16B16_SSCALED] = ISL_FORMAT_R16G16B16_SSCALED, + [PIPE_FORMAT_R16G16B16A16_SSCALED] = ISL_FORMAT_R16G16B16A16_SSCALED, + [PIPE_FORMAT_R8_UNORM] = ISL_FORMAT_R8_UNORM, + [PIPE_FORMAT_R8G8_UNORM] = ISL_FORMAT_R8G8_UNORM, + [PIPE_FORMAT_R8G8B8_UNORM] = 
ISL_FORMAT_R8G8B8_UNORM, + [PIPE_FORMAT_R8G8B8A8_UNORM] = ISL_FORMAT_R8G8B8A8_UNORM, + [PIPE_FORMAT_R8_USCALED] = ISL_FORMAT_R8_USCALED, + [PIPE_FORMAT_R8G8_USCALED] = ISL_FORMAT_R8G8_USCALED, + [PIPE_FORMAT_R8G8B8_USCALED] = ISL_FORMAT_R8G8B8_USCALED, + [PIPE_FORMAT_R8G8B8A8_USCALED] = ISL_FORMAT_R8G8B8A8_USCALED, + [PIPE_FORMAT_R8_SNORM] = ISL_FORMAT_R8_SNORM, + [PIPE_FORMAT_R8G8_SNORM] = ISL_FORMAT_R8G8_SNORM, + [PIPE_FORMAT_R8G8B8_SNORM] = ISL_FORMAT_R8G8B8_SNORM, + [PIPE_FORMAT_R8G8B8A8_SNORM] = ISL_FORMAT_R8G8B8A8_SNORM, + [PIPE_FORMAT_R8_SSCALED] = ISL_FORMAT_R8_SSCALED, + [PIPE_FORMAT_R8G8_SSCALED] = ISL_FORMAT_R8G8_SSCALED, + [PIPE_FORMAT_R8G8B8_SSCALED] = ISL_FORMAT_R8G8B8_SSCALED, + [PIPE_FORMAT_R8G8B8A8_SSCALED] = ISL_FORMAT_R8G8B8A8_SSCALED, + [PIPE_FORMAT_R32_FIXED] = ISL_FORMAT_R32_SFIXED, + [PIPE_FORMAT_R32G32_FIXED] = ISL_FORMAT_R32G32_SFIXED, + [PIPE_FORMAT_R32G32B32_FIXED] = ISL_FORMAT_R32G32B32_SFIXED, + [PIPE_FORMAT_R32G32B32A32_FIXED] = ISL_FORMAT_R32G32B32A32_SFIXED, + [PIPE_FORMAT_R16_FLOAT] = ISL_FORMAT_R16_FLOAT, + [PIPE_FORMAT_R16G16_FLOAT] = ISL_FORMAT_R16G16_FLOAT, + [PIPE_FORMAT_R16G16B16_FLOAT] = ISL_FORMAT_R16G16B16_FLOAT, + [PIPE_FORMAT_R16G16B16A16_FLOAT] = ISL_FORMAT_R16G16B16A16_FLOAT, + + [PIPE_FORMAT_R8G8B8_SRGB] = ISL_FORMAT_R8G8B8_UNORM_SRGB, + [PIPE_FORMAT_B8G8R8A8_SRGB] = ISL_FORMAT_B8G8R8A8_UNORM_SRGB, + [PIPE_FORMAT_B8G8R8X8_SRGB] = ISL_FORMAT_B8G8R8X8_UNORM_SRGB, + [PIPE_FORMAT_R8G8B8A8_SRGB] = ISL_FORMAT_R8G8B8A8_UNORM_SRGB, + + [PIPE_FORMAT_DXT1_RGB] = ISL_FORMAT_BC1_UNORM, + [PIPE_FORMAT_DXT1_RGBA] = ISL_FORMAT_BC1_UNORM, + [PIPE_FORMAT_DXT3_RGBA] = ISL_FORMAT_BC2_UNORM, + [PIPE_FORMAT_DXT5_RGBA] = ISL_FORMAT_BC3_UNORM, + + [PIPE_FORMAT_DXT1_SRGB] = ISL_FORMAT_BC1_UNORM_SRGB, + [PIPE_FORMAT_DXT1_SRGBA] = ISL_FORMAT_BC1_UNORM_SRGB, + [PIPE_FORMAT_DXT3_SRGBA] = ISL_FORMAT_BC2_UNORM_SRGB, + [PIPE_FORMAT_DXT5_SRGBA] = ISL_FORMAT_BC3_UNORM_SRGB, + + [PIPE_FORMAT_RGTC1_UNORM] = ISL_FORMAT_BC4_UNORM, + [PIPE_FORMAT_RGTC1_SNORM] = ISL_FORMAT_BC4_SNORM, + [PIPE_FORMAT_RGTC2_UNORM] = ISL_FORMAT_BC5_UNORM, + [PIPE_FORMAT_RGTC2_SNORM] = ISL_FORMAT_BC5_SNORM, + + [PIPE_FORMAT_R10G10B10A2_USCALED] = ISL_FORMAT_R10G10B10A2_USCALED, + [PIPE_FORMAT_R11G11B10_FLOAT] = ISL_FORMAT_R11G11B10_FLOAT, + [PIPE_FORMAT_R9G9B9E5_FLOAT] = ISL_FORMAT_R9G9B9E5_SHAREDEXP, + [PIPE_FORMAT_R1_UNORM] = ISL_FORMAT_R1_UNORM, + [PIPE_FORMAT_R10G10B10X2_USCALED] = ISL_FORMAT_R10G10B10X2_USCALED, + [PIPE_FORMAT_B10G10R10A2_UNORM] = ISL_FORMAT_B10G10R10A2_UNORM, + [PIPE_FORMAT_R8G8B8X8_UNORM] = ISL_FORMAT_R8G8B8X8_UNORM, + + [PIPE_FORMAT_I8_UNORM] = ISL_FORMAT_R8_UNORM, + [PIPE_FORMAT_I16_UNORM] = ISL_FORMAT_R16_UNORM, + [PIPE_FORMAT_I8_SNORM] = ISL_FORMAT_R8_SNORM, + [PIPE_FORMAT_I16_SNORM] = ISL_FORMAT_R16_SNORM, + [PIPE_FORMAT_I16_FLOAT] = ISL_FORMAT_R16_FLOAT, + [PIPE_FORMAT_I32_FLOAT] = ISL_FORMAT_R32_FLOAT, + + [PIPE_FORMAT_L8_UINT] = ISL_FORMAT_L8_UINT, + [PIPE_FORMAT_L8_UNORM] = ISL_FORMAT_L8_UNORM, + [PIPE_FORMAT_L8_SNORM] = ISL_FORMAT_R8_SNORM, + [PIPE_FORMAT_L8_SINT] = ISL_FORMAT_L8_SINT, + [PIPE_FORMAT_L16_UNORM] = ISL_FORMAT_L16_UNORM, + [PIPE_FORMAT_L16_SNORM] = ISL_FORMAT_R16_SNORM, + [PIPE_FORMAT_L16_FLOAT] = ISL_FORMAT_L16_FLOAT, + [PIPE_FORMAT_L32_FLOAT] = ISL_FORMAT_L32_FLOAT, + + [PIPE_FORMAT_A8_UNORM] = ISL_FORMAT_A8_UNORM, + [PIPE_FORMAT_A16_UNORM] = ISL_FORMAT_A16_UNORM, + [PIPE_FORMAT_A16_FLOAT] = ISL_FORMAT_A16_FLOAT, + [PIPE_FORMAT_A32_FLOAT] = ISL_FORMAT_A32_FLOAT, + + [PIPE_FORMAT_L8A8_UNORM] = ISL_FORMAT_L8A8_UNORM, + [PIPE_FORMAT_L16A16_UNORM] = 
ISL_FORMAT_L16A16_UNORM, + [PIPE_FORMAT_L16A16_FLOAT] = ISL_FORMAT_L16A16_FLOAT, + [PIPE_FORMAT_L32A32_FLOAT] = ISL_FORMAT_L32A32_FLOAT, + + /* Sadly, we have to use luminance[-alpha] formats for sRGB decoding. */ + [PIPE_FORMAT_R8_SRGB] = ISL_FORMAT_L8_UNORM_SRGB, + [PIPE_FORMAT_L8_SRGB] = ISL_FORMAT_L8_UNORM_SRGB, + [PIPE_FORMAT_L8A8_SRGB] = ISL_FORMAT_L8A8_UNORM_SRGB, + + [PIPE_FORMAT_R10G10B10A2_SSCALED] = ISL_FORMAT_R10G10B10A2_SSCALED, + [PIPE_FORMAT_R10G10B10A2_SNORM] = ISL_FORMAT_R10G10B10A2_SNORM, + + [PIPE_FORMAT_B10G10R10A2_USCALED] = ISL_FORMAT_B10G10R10A2_USCALED, + [PIPE_FORMAT_B10G10R10A2_SSCALED] = ISL_FORMAT_B10G10R10A2_SSCALED, + [PIPE_FORMAT_B10G10R10A2_SNORM] = ISL_FORMAT_B10G10R10A2_SNORM, + + [PIPE_FORMAT_R8_UINT] = ISL_FORMAT_R8_UINT, + [PIPE_FORMAT_R8G8_UINT] = ISL_FORMAT_R8G8_UINT, + [PIPE_FORMAT_R8G8B8_UINT] = ISL_FORMAT_R8G8B8_UINT, + [PIPE_FORMAT_R8G8B8A8_UINT] = ISL_FORMAT_R8G8B8A8_UINT, + + [PIPE_FORMAT_R8_SINT] = ISL_FORMAT_R8_SINT, + [PIPE_FORMAT_R8G8_SINT] = ISL_FORMAT_R8G8_SINT, + [PIPE_FORMAT_R8G8B8_SINT] = ISL_FORMAT_R8G8B8_SINT, + [PIPE_FORMAT_R8G8B8A8_SINT] = ISL_FORMAT_R8G8B8A8_SINT, + + [PIPE_FORMAT_R16_UINT] = ISL_FORMAT_R16_UINT, + [PIPE_FORMAT_R16G16_UINT] = ISL_FORMAT_R16G16_UINT, + [PIPE_FORMAT_R16G16B16_UINT] = ISL_FORMAT_R16G16B16_UINT, + [PIPE_FORMAT_R16G16B16A16_UINT] = ISL_FORMAT_R16G16B16A16_UINT, + + [PIPE_FORMAT_R16_SINT] = ISL_FORMAT_R16_SINT, + [PIPE_FORMAT_R16G16_SINT] = ISL_FORMAT_R16G16_SINT, + [PIPE_FORMAT_R16G16B16_SINT] = ISL_FORMAT_R16G16B16_SINT, + [PIPE_FORMAT_R16G16B16A16_SINT] = ISL_FORMAT_R16G16B16A16_SINT, + + [PIPE_FORMAT_R32_UINT] = ISL_FORMAT_R32_UINT, + [PIPE_FORMAT_R32G32_UINT] = ISL_FORMAT_R32G32_UINT, + [PIPE_FORMAT_R32G32B32_UINT] = ISL_FORMAT_R32G32B32_UINT, + [PIPE_FORMAT_R32G32B32A32_UINT] = ISL_FORMAT_R32G32B32A32_UINT, + + [PIPE_FORMAT_R32_SINT] = ISL_FORMAT_R32_SINT, + [PIPE_FORMAT_R32G32_SINT] = ISL_FORMAT_R32G32_SINT, + [PIPE_FORMAT_R32G32B32_SINT] = ISL_FORMAT_R32G32B32_SINT, + [PIPE_FORMAT_R32G32B32A32_SINT] = ISL_FORMAT_R32G32B32A32_SINT, + + [PIPE_FORMAT_B10G10R10A2_UINT] = ISL_FORMAT_B10G10R10A2_UINT, + + [PIPE_FORMAT_ETC1_RGB8] = ISL_FORMAT_ETC1_RGB8, + + [PIPE_FORMAT_R8G8B8X8_SRGB] = ISL_FORMAT_R8G8B8X8_UNORM_SRGB, + [PIPE_FORMAT_B10G10R10X2_UNORM] = ISL_FORMAT_B10G10R10X2_UNORM, + [PIPE_FORMAT_R16G16B16X16_UNORM] = ISL_FORMAT_R16G16B16X16_UNORM, + [PIPE_FORMAT_R16G16B16X16_FLOAT] = ISL_FORMAT_R16G16B16X16_FLOAT, + [PIPE_FORMAT_R32G32B32X32_FLOAT] = ISL_FORMAT_R32G32B32X32_FLOAT, + + [PIPE_FORMAT_R10G10B10A2_UINT] = ISL_FORMAT_R10G10B10A2_UINT, + + [PIPE_FORMAT_B5G6R5_SRGB] = ISL_FORMAT_B5G6R5_UNORM_SRGB, + + [PIPE_FORMAT_BPTC_RGBA_UNORM] = ISL_FORMAT_BC7_UNORM, + [PIPE_FORMAT_BPTC_SRGBA] = ISL_FORMAT_BC7_UNORM_SRGB, + [PIPE_FORMAT_BPTC_RGB_FLOAT] = ISL_FORMAT_BC6H_SF16, + [PIPE_FORMAT_BPTC_RGB_UFLOAT] = ISL_FORMAT_BC6H_UF16, + + [PIPE_FORMAT_ETC2_RGB8] = ISL_FORMAT_ETC2_RGB8, + [PIPE_FORMAT_ETC2_SRGB8] = ISL_FORMAT_ETC2_SRGB8, + [PIPE_FORMAT_ETC2_RGB8A1] = ISL_FORMAT_ETC2_RGB8_PTA, + [PIPE_FORMAT_ETC2_SRGB8A1] = ISL_FORMAT_ETC2_SRGB8_PTA, + [PIPE_FORMAT_ETC2_RGBA8] = ISL_FORMAT_ETC2_EAC_RGBA8, + [PIPE_FORMAT_ETC2_SRGBA8] = ISL_FORMAT_ETC2_EAC_SRGB8_A8, + [PIPE_FORMAT_ETC2_R11_UNORM] = ISL_FORMAT_EAC_R11, + [PIPE_FORMAT_ETC2_R11_SNORM] = ISL_FORMAT_EAC_SIGNED_R11, + [PIPE_FORMAT_ETC2_RG11_UNORM] = ISL_FORMAT_EAC_RG11, + [PIPE_FORMAT_ETC2_RG11_SNORM] = ISL_FORMAT_EAC_SIGNED_RG11, + + [PIPE_FORMAT_FXT1_RGB] = ISL_FORMAT_FXT1, + [PIPE_FORMAT_FXT1_RGBA] = ISL_FORMAT_FXT1, + + [PIPE_FORMAT_ASTC_4x4] = 
ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16, + [PIPE_FORMAT_ASTC_5x4] = ISL_FORMAT_ASTC_LDR_2D_5X4_FLT16, + [PIPE_FORMAT_ASTC_5x5] = ISL_FORMAT_ASTC_LDR_2D_5X5_FLT16, + [PIPE_FORMAT_ASTC_6x5] = ISL_FORMAT_ASTC_LDR_2D_6X5_FLT16, + [PIPE_FORMAT_ASTC_6x6] = ISL_FORMAT_ASTC_LDR_2D_6X6_FLT16, + [PIPE_FORMAT_ASTC_8x5] = ISL_FORMAT_ASTC_LDR_2D_8X5_FLT16, + [PIPE_FORMAT_ASTC_8x6] = ISL_FORMAT_ASTC_LDR_2D_8X6_FLT16, + [PIPE_FORMAT_ASTC_8x8] = ISL_FORMAT_ASTC_LDR_2D_8X8_FLT16, + [PIPE_FORMAT_ASTC_10x5] = ISL_FORMAT_ASTC_LDR_2D_10X5_FLT16, + [PIPE_FORMAT_ASTC_10x6] = ISL_FORMAT_ASTC_LDR_2D_10X6_FLT16, + [PIPE_FORMAT_ASTC_10x8] = ISL_FORMAT_ASTC_LDR_2D_10X8_FLT16, + [PIPE_FORMAT_ASTC_10x10] = ISL_FORMAT_ASTC_LDR_2D_10X10_FLT16, + [PIPE_FORMAT_ASTC_12x10] = ISL_FORMAT_ASTC_LDR_2D_12X10_FLT16, + [PIPE_FORMAT_ASTC_12x12] = ISL_FORMAT_ASTC_LDR_2D_12X12_FLT16, + + [PIPE_FORMAT_ASTC_4x4_SRGB] = ISL_FORMAT_ASTC_LDR_2D_4X4_U8SRGB, + [PIPE_FORMAT_ASTC_5x4_SRGB] = ISL_FORMAT_ASTC_LDR_2D_5X4_U8SRGB, + [PIPE_FORMAT_ASTC_5x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_5X5_U8SRGB, + [PIPE_FORMAT_ASTC_6x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_6X5_U8SRGB, + [PIPE_FORMAT_ASTC_6x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_6X6_U8SRGB, + [PIPE_FORMAT_ASTC_8x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X5_U8SRGB, + [PIPE_FORMAT_ASTC_8x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X6_U8SRGB, + [PIPE_FORMAT_ASTC_8x8_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X8_U8SRGB, + [PIPE_FORMAT_ASTC_10x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X5_U8SRGB, + [PIPE_FORMAT_ASTC_10x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X6_U8SRGB, + [PIPE_FORMAT_ASTC_10x8_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X8_U8SRGB, + [PIPE_FORMAT_ASTC_10x10_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X10_U8SRGB, + [PIPE_FORMAT_ASTC_12x10_SRGB] = ISL_FORMAT_ASTC_LDR_2D_12X10_U8SRGB, + [PIPE_FORMAT_ASTC_12x12_SRGB] = ISL_FORMAT_ASTC_LDR_2D_12X12_U8SRGB, + + [PIPE_FORMAT_A1B5G5R5_UNORM] = ISL_FORMAT_A1B5G5R5_UNORM, + + /* We support these so that we know the API expects no alpha channel. + * Otherwise, the state tracker would just give us a format with alpha + * and we wouldn't know to override the swizzle to 1. 
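+ *
+ * Illustrative note (not part of the original patch): a hedged sketch of how
+ * these alpha-less aliases behave once crocus_format_for_usage() (defined
+ * below) applies its swizzle override; the values follow from the table
+ * entries and the override code, and ISL_SURF_USAGE_TEXTURE_BIT is only an
+ * example usage flag.
+ *
+ *    struct crocus_format_info info =
+ *       crocus_format_for_usage(devinfo, PIPE_FORMAT_R16G16B16X16_UINT,
+ *                               ISL_SURF_USAGE_TEXTURE_BIT);
+ *    // info.fmt == ISL_FORMAT_R16G16B16A16_UINT (the RGBA alias), and since
+ *    // the pipe format has no alpha, info.swizzles[3] == PIPE_SWIZZLE_1 so
+ *    // reads of the missing alpha channel return 1.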
+ */ + [PIPE_FORMAT_R16G16B16X16_UINT] = ISL_FORMAT_R16G16B16A16_UINT, + [PIPE_FORMAT_R16G16B16X16_SINT] = ISL_FORMAT_R16G16B16A16_SINT, + [PIPE_FORMAT_R32G32B32X32_UINT] = ISL_FORMAT_R32G32B32A32_UINT, + [PIPE_FORMAT_R32G32B32X32_SINT] = ISL_FORMAT_R32G32B32A32_SINT, + [PIPE_FORMAT_R10G10B10X2_SNORM] = ISL_FORMAT_R10G10B10A2_SNORM, + }; + assert(pf < PIPE_FORMAT_COUNT); + return table[pf]; +} + +static enum isl_format +get_render_format(enum pipe_format pformat, enum isl_format def_format) +{ + switch (pformat) { + case PIPE_FORMAT_A16_UNORM: return ISL_FORMAT_R16_UNORM; + case PIPE_FORMAT_A16_FLOAT: return ISL_FORMAT_R16_FLOAT; + case PIPE_FORMAT_A32_FLOAT: return ISL_FORMAT_R32_FLOAT; + + case PIPE_FORMAT_I8_UNORM: return ISL_FORMAT_R8_UNORM; + case PIPE_FORMAT_I16_UNORM: return ISL_FORMAT_R16_UNORM; + case PIPE_FORMAT_I16_FLOAT: return ISL_FORMAT_R16_FLOAT; + case PIPE_FORMAT_I32_FLOAT: return ISL_FORMAT_R32_FLOAT; + + case PIPE_FORMAT_L8_UNORM: return ISL_FORMAT_R8_UNORM; + case PIPE_FORMAT_L8_UINT: return ISL_FORMAT_R8_UINT; + case PIPE_FORMAT_L8_SINT: return ISL_FORMAT_R8_SINT; + case PIPE_FORMAT_L16_UNORM: return ISL_FORMAT_R16_UNORM; + case PIPE_FORMAT_L16_FLOAT: return ISL_FORMAT_R16_FLOAT; + case PIPE_FORMAT_L32_FLOAT: return ISL_FORMAT_R32_FLOAT; + + case PIPE_FORMAT_L8A8_UNORM: return ISL_FORMAT_R8G8_UNORM; + case PIPE_FORMAT_L16A16_UNORM: return ISL_FORMAT_R16G16_UNORM; + case PIPE_FORMAT_L16A16_FLOAT: return ISL_FORMAT_R16G16_FLOAT; + case PIPE_FORMAT_L32A32_FLOAT: return ISL_FORMAT_R32G32_FLOAT; + + default: + return def_format; + } +} + +struct crocus_format_info +crocus_format_for_usage(const struct intel_device_info *devinfo, + enum pipe_format pformat, + isl_surf_usage_flags_t usage) +{ + struct crocus_format_info info = { crocus_isl_format_for_pipe_format(pformat), + { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W } }; + + if (info.fmt == ISL_FORMAT_UNSUPPORTED) + return info; + + if (pformat == PIPE_FORMAT_A8_UNORM) { + info.fmt = ISL_FORMAT_A8_UNORM; + } + + if (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) + info.fmt = get_render_format(pformat, info.fmt); + if (devinfo->ver < 6) { + if (pformat == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + info.fmt = ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS; + if (pformat == PIPE_FORMAT_X32_S8X24_UINT) + info.fmt = ISL_FORMAT_X32_TYPELESS_G8X24_UINT; + if (pformat == PIPE_FORMAT_X24S8_UINT) + info.fmt = ISL_FORMAT_X24_TYPELESS_G8_UINT; + } + + const struct isl_format_layout *fmtl = isl_format_get_layout(info.fmt); + + if (util_format_is_snorm(pformat)) { + if (util_format_is_intensity(pformat)) { + info.swizzles[0] = PIPE_SWIZZLE_X; + info.swizzles[1] = PIPE_SWIZZLE_X; + info.swizzles[2] = PIPE_SWIZZLE_X; + info.swizzles[3] = PIPE_SWIZZLE_X; + } else if (util_format_is_luminance(pformat)) { + info.swizzles[0] = PIPE_SWIZZLE_X; + info.swizzles[1] = PIPE_SWIZZLE_X; + info.swizzles[2] = PIPE_SWIZZLE_X; + info.swizzles[3] = PIPE_SWIZZLE_1; + } else if (util_format_is_luminance_alpha(pformat)) { + info.swizzles[0] = PIPE_SWIZZLE_X; + info.swizzles[1] = PIPE_SWIZZLE_X; + info.swizzles[2] = PIPE_SWIZZLE_X; + info.swizzles[3] = PIPE_SWIZZLE_Y; + } else if (util_format_is_alpha(pformat)) { + info.swizzles[0] = PIPE_SWIZZLE_0; + info.swizzles[1] = PIPE_SWIZZLE_0; + info.swizzles[2] = PIPE_SWIZZLE_0; + info.swizzles[3] = PIPE_SWIZZLE_X; + } + } + + /* When faking RGBX pipe formats with RGBA ISL formats, override alpha. 
*/ + if (!util_format_has_alpha(pformat) && fmtl->channels.a.type != ISL_VOID) { + info.swizzles[0] = PIPE_SWIZZLE_X; + info.swizzles[1] = PIPE_SWIZZLE_Y; + info.swizzles[2] = PIPE_SWIZZLE_Z; + info.swizzles[3] = PIPE_SWIZZLE_1; + } + + /* We choose RGBA over RGBX for rendering when the hardware doesn't support + * rendering to RGBX. However, when this internal override is used on Gen9+, + * fast clears don't work correctly. + * + * i965 fixes this by pretending to not support RGBX formats, and the higher + * layers of Mesa pick the RGBA format instead. Gallium doesn't work that + * way, and might choose a different format, like BGRX instead of RGBX, + * which will also cause problems when sampling from a surface fast cleared + * as RGBX. So we always choose RGBA instead of RGBX explicitly + * here. + */ + if (isl_format_is_rgbx(info.fmt) && + !isl_format_supports_rendering(devinfo, info.fmt) && + (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)) { + info.fmt = isl_format_rgbx_to_rgba(info.fmt); + info.swizzles[0] = PIPE_SWIZZLE_X; + info.swizzles[1] = PIPE_SWIZZLE_Y; + info.swizzles[2] = PIPE_SWIZZLE_Z; + info.swizzles[3] = PIPE_SWIZZLE_1; + } + + return info; +} + +/** + * The pscreen->is_format_supported() driver hook. + * + * Returns true if the given format is supported for the given usage + * (PIPE_BIND_*) and sample count. + */ +bool +crocus_is_format_supported(struct pipe_screen *pscreen, + enum pipe_format pformat, + enum pipe_texture_target target, + unsigned sample_count, unsigned storage_sample_count, + unsigned usage) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (!util_is_power_of_two_or_zero(sample_count)) + return false; + if (devinfo->ver >= 7) { + if (sample_count > 8 || sample_count == 2) + return false; + } else if (devinfo->ver == 6) { + if (sample_count > 4 || sample_count == 2) + return false; + } else if (sample_count > 1) { + return false; + } + + if (pformat == PIPE_FORMAT_NONE) + return true; + + enum isl_format format = crocus_isl_format_for_pipe_format(pformat); + + if (format == ISL_FORMAT_UNSUPPORTED) + return false; + + /* No stencil texturing prior to Haswell. */ + if (!devinfo->is_haswell) { + if (pformat == PIPE_FORMAT_S8_UINT || + pformat == PIPE_FORMAT_X24S8_UINT || + pformat == PIPE_FORMAT_S8X24_UINT || + pformat == PIPE_FORMAT_X32_S8X24_UINT) + return false; + } + + const struct isl_format_layout *fmtl = isl_format_get_layout(format); + const bool is_integer = isl_format_has_int_channel(format); + bool supported = true; + + if (sample_count > 1) + supported &= isl_format_supports_multisampling(devinfo, format); + + if (usage & PIPE_BIND_DEPTH_STENCIL) { + supported &= format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS || + format == ISL_FORMAT_R32_FLOAT || + format == ISL_FORMAT_R24_UNORM_X8_TYPELESS || + format == ISL_FORMAT_R16_UNORM || + format == ISL_FORMAT_R8_UINT; + } + + if (usage & PIPE_BIND_RENDER_TARGET) { + /* Alpha and luminance-alpha formats other than A8_UNORM are not + * renderable. + * + * For BLORP, we can apply the swizzle in the shader. But for + * general rendering, this would mean recompiling the shader, which + * we'd like to avoid doing. So we mark these formats non-renderable. + * + * We do support A8_UNORM as it's required and is renderable.
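+ *
+ * Illustrative note (not part of the original patch): A8_UNORM stays
+ * renderable because crocus_format_for_usage() above keeps it as
+ * ISL_FORMAT_A8_UNORM instead of remapping it to an R8 format the way
+ * get_render_format() does for the other alpha and luminance formats, so
+ * no shader-side swizzle fixup is needed for it.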
+ */ + if (pformat != PIPE_FORMAT_A8_UNORM && + (util_format_is_alpha(pformat) || + util_format_is_luminance_alpha(pformat))) + supported = false; + + enum isl_format rt_format = format; + + if (isl_format_is_rgbx(format) && + !isl_format_supports_rendering(devinfo, format)) + rt_format = isl_format_rgbx_to_rgba(format); + + supported &= isl_format_supports_rendering(devinfo, rt_format); + + if (!is_integer) + supported &= isl_format_supports_alpha_blending(devinfo, rt_format); + } + + if (usage & PIPE_BIND_SHADER_IMAGE) { + /* Dataport doesn't support compression, and we can't resolve an MCS + * compressed surface. (Buffer images may have sample count of 0.) + */ + supported &= sample_count == 0; + + supported &= isl_format_supports_typed_writes(devinfo, format); + supported &= isl_has_matching_typed_storage_image_format(devinfo, format); + } + + if (usage & PIPE_BIND_SAMPLER_VIEW) { + supported &= isl_format_supports_sampling(devinfo, format); + bool ignore_filtering = false; + + if (is_integer) + ignore_filtering = true; + + /* Pre-Gen5, the format tables claim these can't be filtered, but we + * skip the filtering check and advertise them for sampling anyway. */ + if (devinfo->ver < 5 && (format == ISL_FORMAT_R32G32B32A32_FLOAT || + format == ISL_FORMAT_R24_UNORM_X8_TYPELESS || + format == ISL_FORMAT_R32_FLOAT || + format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS)) + ignore_filtering = true; + if (!ignore_filtering) + supported &= isl_format_supports_filtering(devinfo, format); + + /* Don't advertise 3-component RGB formats for non-buffer textures. + * This ensures that they are renderable from an API perspective since + * the state tracker will fall back to RGBA or RGBX, which are + * renderable. We want to render internally for copies and blits, + * even if the application doesn't. + * + * Buffer textures don't need to be renderable, so we support real RGB. + * This is useful for PBO upload, and 32-bit RGB support is mandatory.
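+ *
+ * Illustrative note (not part of the original patch): concretely, a 96-bpb
+ * format such as PIPE_FORMAT_R32G32B32_FLOAT is rejected by the bpb check
+ * below for PIPE_TEXTURE_2D sampler views, but can still be reported as
+ * supported when the target is PIPE_BUFFER.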
+ */ + if (target != PIPE_BUFFER) + supported &= fmtl->bpb != 24 && fmtl->bpb != 48 && fmtl->bpb != 96; + } + + if (usage & PIPE_BIND_VERTEX_BUFFER) { + supported &= isl_format_supports_vertex_fetch(devinfo, format); + + if (!devinfo->is_haswell) { + /* W/A: Pre-Haswell, the hardware doesn't really support the formats + * we'd like to use here, so upload everything as UINT and fix it in + * the shader + */ + if (format == ISL_FORMAT_R10G10B10A2_UNORM || + format == ISL_FORMAT_B10G10R10A2_UNORM || + format == ISL_FORMAT_R10G10B10A2_SNORM || + format == ISL_FORMAT_B10G10R10A2_SNORM || + format == ISL_FORMAT_R10G10B10A2_USCALED || + format == ISL_FORMAT_B10G10R10A2_USCALED || + format == ISL_FORMAT_R10G10B10A2_SSCALED || + format == ISL_FORMAT_B10G10R10A2_SSCALED) + supported = true; + + if (format == ISL_FORMAT_R8G8B8_SINT || + format == ISL_FORMAT_R8G8B8_UINT || + format == ISL_FORMAT_R16G16B16_SINT || + format == ISL_FORMAT_R16G16B16_UINT) + supported = true; + } + } + + if (usage & PIPE_BIND_INDEX_BUFFER) { + supported &= format == ISL_FORMAT_R8_UINT || + format == ISL_FORMAT_R16_UINT || + format == ISL_FORMAT_R32_UINT; + } + + return supported; +} diff --git a/src/gallium/drivers/crocus/crocus_genx_macros.h b/src/gallium/drivers/crocus/crocus_genx_macros.h new file mode 100644 index 00000000000..a0309513ed2 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_genx_macros.h @@ -0,0 +1,164 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * Macro and function definitions needed in order to use genxml. + * + * This should only be included in sources compiled per-generation. 
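+ *
+ * Illustrative note (not part of the original patch): "compiled
+ * per-generation" means each including source file (crocus_state.c,
+ * crocus_blorp.c, etc.) is built once for every supported GFX_VERx10 value,
+ * with the version supplied as a compile-time define, so the GFX_VER
+ * comparisons and GENX() expansions below resolve statically. The exact set
+ * of per-gen libraries lives in this driver's meson.build and is assumed
+ * here rather than restated.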
+ */ + +#include "crocus_batch.h" + +#include "genxml/gen_macros.h" + +#define __gen_address_type struct crocus_address +#define __gen_user_data struct crocus_batch +#define __gen_combine_address crocus_combine_address + +static inline void * +__gen_get_batch_dwords(struct crocus_batch *batch, unsigned dwords) +{ + return crocus_get_command_space(batch, dwords * sizeof(uint32_t)); +} + +static inline struct crocus_address +__gen_address_offset(struct crocus_address addr, uint64_t offset) +{ + addr.offset += offset; + return addr; +} + +static uint64_t +__gen_combine_address(struct crocus_batch *batch, void *location, + struct crocus_address addr, uint32_t delta) +{ + uint32_t offset = (char *)location - (char *)batch->command.map; + + if (addr.bo == NULL) { + return addr.offset + delta; + } else { + if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) { + offset = (char *) location - (char *) batch->state.map; + return crocus_state_reloc(batch, offset, addr.bo, + addr.offset + delta, + addr.reloc_flags); + } + + assert(!crocus_ptr_in_state_buffer(batch, location)); + + offset = (char *) location - (char *) batch->command.map; + return crocus_command_reloc(batch, offset, addr.bo, + addr.offset + delta, + addr.reloc_flags); + } +} + +#define __gen_address_type struct crocus_address +#define __gen_user_data struct crocus_batch + +#define __genxml_cmd_length(cmd) cmd ## _length +#define __genxml_cmd_length_bias(cmd) cmd ## _length_bias +#define __genxml_cmd_header(cmd) cmd ## _header +#define __genxml_cmd_pack(cmd) cmd ## _pack +#define __genxml_reg_num(cmd) cmd ## _num + +#include "genxml/genX_pack.h" +#include "genxml/gen_macros.h" +#include "genxml/genX_bits.h" + +/* CS_GPR(15) is reserved for combining conditional rendering predicates + * with GL_ARB_indirect_parameters draw number predicates. 
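+ *
+ * Illustrative note (not part of the original patch): MI_BUILDER_NUM_ALLOC_GPRS
+ * being 15 below means mi_builder only hands out GPRs 0..14 for its own
+ * temporaries, which is what keeps CS_GPR(15) free for the predicate use
+ * described above. For reference, the crocus_emit_cmd() helper defined
+ * further down is used in the usual genxml pack-struct style, e.g.
+ *
+ *    crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ *       lri.RegisterOffset = reg;
+ *       lri.DataDWord = value;
+ *    }
+ *
+ * which mirrors the expansion of crocus_emit_reg() below; reg and value are
+ * placeholders.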
+ */ +#define MI_BUILDER_NUM_ALLOC_GPRS 15 +#include "common/mi_builder.h" + +#define _crocus_pack_command(batch, cmd, dst, name) \ + for (struct cmd name = { __genxml_cmd_header(cmd) }, \ + *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \ + ({ __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name); \ + _dst = NULL; \ + })) + +#define crocus_pack_command(cmd, dst, name) \ + _crocus_pack_command(NULL, cmd, dst, name) + +#define _crocus_pack_state(batch, cmd, dst, name) \ + for (struct cmd name = {}, \ + *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \ + __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name), \ + _dst = NULL) + +#define crocus_pack_state(cmd, dst, name) \ + _crocus_pack_state(NULL, cmd, dst, name) + +#define crocus_emit_cmd(batch, cmd, name) \ + _crocus_pack_command(batch, cmd, __gen_get_batch_dwords(batch, __genxml_cmd_length(cmd)), name) + +#define crocus_emit_merge(batch, dwords0, dwords1, num_dwords) \ + do { \ + uint32_t *dw = __gen_get_batch_dwords(batch, num_dwords); \ + for (uint32_t i = 0; i < num_dwords; i++) \ + dw[i] = (dwords0)[i] | (dwords1)[i]; \ + VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, num_dwords)); \ + } while (0) + +#define crocus_emit_reg(batch, reg, name) \ + for (struct reg name = {}, *_cont = (struct reg *)1; _cont != NULL; \ + ({ \ + uint32_t _dw[__genxml_cmd_length(reg)]; \ + __genxml_cmd_pack(reg)(NULL, _dw, &name); \ + for (unsigned i = 0; i < __genxml_cmd_length(reg); i++) { \ + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { \ + lri.RegisterOffset = __genxml_reg_num(reg); \ + lri.DataDWord = _dw[i]; \ + } \ + } \ + _cont = NULL; \ + })) + + +/** + * crocus_address constructor helpers: + * + * When using these to construct a CSO, pass NULL for \p bo, and manually + * pin the BO later. Otherwise, genxml's address handling will add the + * BO to the current batch's validation list at CSO creation time, rather + * than at draw time as desired. + */ + +UNUSED static struct crocus_address +ro_bo(struct crocus_bo *bo, uint64_t offset) +{ + return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT }; +} + +UNUSED static struct crocus_address +rw_bo(struct crocus_bo *bo, uint64_t offset) +{ + return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT | RELOC_WRITE }; +} + +UNUSED static struct crocus_address +ggtt_bo(struct crocus_bo *bo, uint64_t offset) +{ + return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT }; +} diff --git a/src/gallium/drivers/crocus/crocus_genx_protos.h b/src/gallium/drivers/crocus/crocus_genx_protos.h new file mode 100644 index 00000000000..ba6798f991e --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_genx_protos.h @@ -0,0 +1,56 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* GenX-specific function declarations. + * + * Don't include this directly, it will be included by crocus_context.h. + * + * NOTE: This header can be included multiple times, from the same file. + */ + +/* crocus_state.c */ +void genX(init_state)(struct crocus_context *ice); +void genX(init_screen_state)(struct crocus_screen *screen); +void genX(upload_urb)(struct crocus_batch *batch, + unsigned vs_size, + bool gs_present, + unsigned gs_size); +void genX(emit_hashing_mode)(struct crocus_context *ice, + struct crocus_batch *batch, + unsigned width, unsigned height, + unsigned scale); + +/* crocus_blorp.c */ +void genX(init_blorp)(struct crocus_context *ice); + +/* crocus_query.c */ +void genX(init_query)(struct crocus_context *ice); +void genX(init_screen_query)(struct crocus_screen *screen); +void genX(math_add32_gpr0)(struct crocus_context *ice, + struct crocus_batch *batch, + uint32_t x); +void genX(math_div32_gpr0)(struct crocus_context *ice, + struct crocus_batch *batch, + uint32_t D); + +/* crocus_blt.c */ +void genX(init_blt)(struct crocus_screen *screen); diff --git a/src/gallium/drivers/crocus/crocus_monitor.c b/src/gallium/drivers/crocus/crocus_monitor.c new file mode 100644 index 00000000000..c0465f22875 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_monitor.c @@ -0,0 +1,484 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "crocus_monitor.h" + +#include + +#include "crocus_screen.h" +#include "crocus_context.h" + +#include "perf/intel_perf.h" +#include "perf/intel_perf_query.h" +#include "perf/intel_perf_regs.h" + +struct crocus_monitor_object { + int num_active_counters; + int *active_counters; + + size_t result_size; + unsigned char *result_buffer; + + struct intel_perf_query_object *query; +}; + +int +crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info) +{ + const struct crocus_screen *screen = (struct crocus_screen *)pscreen; + assert(screen->monitor_cfg); + if (!screen->monitor_cfg) + return 0; + + const struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg; + + if (!info) { + /* return the number of metrics */ + return monitor_cfg->num_counters; + } + + const struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg; + const int group = monitor_cfg->counters[index].group; + const int counter_index = monitor_cfg->counters[index].counter; + struct intel_perf_query_counter *counter = + &perf_cfg->queries[group].counters[counter_index]; + + info->group_id = group; + info->name = counter->name; + info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index; + + if (counter->type == INTEL_PERF_COUNTER_TYPE_THROUGHPUT) + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE; + else + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE; + switch (counter->data_type) { + case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32: + case INTEL_PERF_COUNTER_DATA_TYPE_UINT32: + info->type = PIPE_DRIVER_QUERY_TYPE_UINT; + info->max_value.u32 = 0; + break; + case INTEL_PERF_COUNTER_DATA_TYPE_UINT64: + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; + info->max_value.u64 = 0; + break; + case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT: + case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE: + info->type = PIPE_DRIVER_QUERY_TYPE_FLOAT; + info->max_value.u64 = -1; + break; + default: + assert(false); + break; + } + + /* indicates that this is an OA query, not a pipeline statistics query */ + info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; + return 1; +} + +typedef void (*bo_unreference_t)(void *); +typedef void *(*bo_map_t)(void *, void *, unsigned flags); +typedef void (*bo_unmap_t)(void *); +typedef void (*emit_mi_report_t)(void *, void *, uint32_t, uint32_t); +typedef void (*emit_mi_flush_t)(void *); +typedef void (*capture_frequency_stat_register_t)(void *, void *, + uint32_t ); +typedef void (*store_register_mem64_t)(void *ctx, void *bo, + uint32_t reg, uint32_t offset); +typedef bool (*batch_references_t)(void *batch, void *bo); +typedef void (*bo_wait_rendering_t)(void *bo); +typedef int (*bo_busy_t)(void *bo); + +static void * +crocus_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size) +{ + return crocus_bo_alloc(bufmgr, name, size); +} + +#if 0 +static void +crocus_monitor_emit_mi_flush(struct crocus_context *ice) +{ + const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_VF_CACHE_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CS_STALL; + crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER], + "OA metrics", flags); +} +#endif + +static void +crocus_monitor_emit_mi_report_perf_count(void *c, + void *bo, + uint32_t offset_in_bytes, + uint32_t report_id) +{ + struct crocus_context *ice = c; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct 
crocus_screen *screen = batch->screen; + screen->vtbl.emit_mi_report_perf_count(batch, bo, offset_in_bytes, report_id); +} + +static void +crocus_monitor_batchbuffer_flush(void *c, const char *file, int line) +{ + struct crocus_context *ice = c; + _crocus_batch_flush(&ice->batches[CROCUS_BATCH_RENDER], __FILE__, __LINE__); +} + +#if 0 +static void +crocus_monitor_capture_frequency_stat_register(void *ctx, + void *bo, + uint32_t bo_offset) +{ + struct crocus_context *ice = ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + ice->vtbl.store_register_mem32(batch, GEN9_RPSTAT0, bo, bo_offset, false); +} + +static void +crocus_monitor_store_register_mem64(void *ctx, void *bo, + uint32_t reg, uint32_t offset) +{ + struct crocus_context *ice = ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + ice->vtbl.store_register_mem64(batch, reg, bo, offset, false); +} +#endif + +static bool +crocus_monitor_init_metrics(struct crocus_screen *screen) +{ + struct crocus_monitor_config *monitor_cfg = + rzalloc(screen, struct crocus_monitor_config); + struct intel_perf_config *perf_cfg = NULL; + if (unlikely(!monitor_cfg)) + goto allocation_error; + perf_cfg = intel_perf_new(monitor_cfg); + if (unlikely(!perf_cfg)) + goto allocation_error; + + monitor_cfg->perf_cfg = perf_cfg; + + perf_cfg->vtbl.bo_alloc = crocus_oa_bo_alloc; + perf_cfg->vtbl.bo_unreference = (bo_unreference_t)crocus_bo_unreference; + perf_cfg->vtbl.bo_map = (bo_map_t)crocus_bo_map; + perf_cfg->vtbl.bo_unmap = (bo_unmap_t)crocus_bo_unmap; + + perf_cfg->vtbl.emit_mi_report_perf_count = + (emit_mi_report_t)crocus_monitor_emit_mi_report_perf_count; + perf_cfg->vtbl.batchbuffer_flush = crocus_monitor_batchbuffer_flush; + perf_cfg->vtbl.batch_references = (batch_references_t)crocus_batch_references; + perf_cfg->vtbl.bo_wait_rendering = + (bo_wait_rendering_t)crocus_bo_wait_rendering; + perf_cfg->vtbl.bo_busy = (bo_busy_t)crocus_bo_busy; + + intel_perf_init_metrics(perf_cfg, &screen->devinfo, screen->fd, false, false); + screen->monitor_cfg = monitor_cfg; + + /* a gallium "group" is equivalent to a gen "query" + * a gallium "query" is equivalent to a gen "query_counter" + * + * Each gen_query supports a specific number of query_counters. To + * allocate the array of crocus_monitor_counter, we need an upper bound + * (ignoring duplicate query_counters). + */ + int gen_query_counters_count = 0; + for (int gen_query_id = 0; + gen_query_id < perf_cfg->n_queries; + ++gen_query_id) { + gen_query_counters_count += perf_cfg->queries[gen_query_id].n_counters; + } + + monitor_cfg->counters = rzalloc_size(monitor_cfg, + sizeof(struct crocus_monitor_counter) * + gen_query_counters_count); + if (unlikely(!monitor_cfg->counters)) + goto allocation_error; + + int crocus_monitor_id = 0; + for (int group = 0; group < perf_cfg->n_queries; ++group) { + for (int counter = 0; + counter < perf_cfg->queries[group].n_counters; + ++counter) { + /* Check previously identified metrics to filter out duplicates. The + * user is not helped by having the same metric available in several + * groups. (n^2 algorithm). 
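+ *
+ * Illustrative note (not part of the original patch): e.g. if a counter
+ * named "GPU Busy" (a hypothetical name) appeared in two intel_perf query
+ * groups, only its first occurrence would be assigned a crocus_monitor_id;
+ * the duplicate check below is a plain strcmp on the counter names.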
+ */ + bool duplicate = false; + for (int existing_group = 0; + existing_group < group && !duplicate; + ++existing_group) { + for (int existing_counter = 0; + existing_counter < perf_cfg->queries[existing_group].n_counters && !duplicate; + ++existing_counter) { + const char *current_name = + perf_cfg->queries[group].counters[counter].name; + const char *existing_name = + perf_cfg->queries[existing_group].counters[existing_counter].name; + if (strcmp(current_name, existing_name) == 0) { + duplicate = true; + } + } + } + if (duplicate) + continue; + monitor_cfg->counters[crocus_monitor_id].group = group; + monitor_cfg->counters[crocus_monitor_id].counter = counter; + ++crocus_monitor_id; + } + } + monitor_cfg->num_counters = crocus_monitor_id; + return monitor_cfg->num_counters; + +allocation_error: + if (monitor_cfg) + free(monitor_cfg->counters); + free(perf_cfg); + free(monitor_cfg); + return false; +} + +int +crocus_get_monitor_group_info(struct pipe_screen *pscreen, + unsigned group_index, + struct pipe_driver_query_group_info *info) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + if (!screen->monitor_cfg) { + if (!crocus_monitor_init_metrics(screen)) + return 0; + } + + const struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg; + const struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg; + + if (!info) { + /* return the count that can be queried */ + return perf_cfg->n_queries; + } + + if (group_index >= perf_cfg->n_queries) { + /* out of range */ + return 0; + } + + struct intel_perf_query_info *query = &perf_cfg->queries[group_index]; + + info->name = query->name; + info->max_active_queries = query->n_counters; + info->num_queries = query->n_counters; + + return 1; +} + +static void +crocus_init_monitor_ctx(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen; + struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg; + + ice->perf_ctx = intel_perf_new_context(ice); + if (unlikely(!ice->perf_ctx)) + return; + + struct intel_perf_context *perf_ctx = ice->perf_ctx; + struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg; + intel_perf_init_context(perf_ctx, + perf_cfg, + ice, + ice, + screen->bufmgr, + &screen->devinfo, + ice->batches[CROCUS_BATCH_RENDER].hw_ctx_id, + screen->fd); +} + +/* entry point for GenPerfMonitorsAMD */ +struct crocus_monitor_object * +crocus_create_monitor_object(struct crocus_context *ice, + unsigned num_queries, + unsigned *query_types) +{ + struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen; + struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg; + struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg; + struct intel_perf_query_object *query_obj = NULL; + + /* initialize perf context if this has not already been done. This + * function is the first entry point that carries the gl context. 
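+ *
+ * Illustrative note (not part of the original patch): the query_types[]
+ * values passed in are PIPE_QUERY_DRIVER_SPECIFIC + index, where index is
+ * what crocus_get_monitor_info() reported above, so subtracting
+ * PIPE_QUERY_DRIVER_SPECIFIC below recovers the index into
+ * monitor_cfg->counters[].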
+ */ + if (ice->perf_ctx == NULL) { + crocus_init_monitor_ctx(ice); + } + struct intel_perf_context *perf_ctx = ice->perf_ctx; + + assert(num_queries > 0); + int query_index = query_types[0] - PIPE_QUERY_DRIVER_SPECIFIC; + assert(query_index <= monitor_cfg->num_counters); + const int group = monitor_cfg->counters[query_index].group; + + struct crocus_monitor_object *monitor = + calloc(1, sizeof(struct crocus_monitor_object)); + if (unlikely(!monitor)) + goto allocation_failure; + + monitor->num_active_counters = num_queries; + monitor->active_counters = calloc(num_queries, sizeof(int)); + if (unlikely(!monitor->active_counters)) + goto allocation_failure; + + for (int i = 0; i < num_queries; ++i) { + unsigned current_query = query_types[i]; + unsigned current_query_index = current_query - PIPE_QUERY_DRIVER_SPECIFIC; + + /* all queries must be in the same group */ + assert(current_query_index <= monitor_cfg->num_counters); + assert(monitor_cfg->counters[current_query_index].group == group); + monitor->active_counters[i] = + monitor_cfg->counters[current_query_index].counter; + } + + /* create the intel_perf_query */ + query_obj = intel_perf_new_query(perf_ctx, group); + if (unlikely(!query_obj)) + goto allocation_failure; + + monitor->query = query_obj; + monitor->result_size = perf_cfg->queries[group].data_size; + monitor->result_buffer = calloc(1, monitor->result_size); + if (unlikely(!monitor->result_buffer)) + goto allocation_failure; + + return monitor; + +allocation_failure: + if (monitor) { + free(monitor->active_counters); + free(monitor->result_buffer); + } + free(query_obj); + free(monitor); + return NULL; +} + +void +crocus_destroy_monitor_object(struct pipe_context *ctx, + struct crocus_monitor_object *monitor) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + intel_perf_delete_query(ice->perf_ctx, monitor->query); + free(monitor->result_buffer); + monitor->result_buffer = NULL; + free(monitor->active_counters); + monitor->active_counters = NULL; + free(monitor); +} + +bool +crocus_begin_monitor(struct pipe_context *ctx, + struct crocus_monitor_object *monitor) +{ + struct crocus_context *ice = (void *) ctx; + struct intel_perf_context *perf_ctx = ice->perf_ctx; + + return intel_perf_begin_query(perf_ctx, monitor->query); +} + +bool +crocus_end_monitor(struct pipe_context *ctx, + struct crocus_monitor_object *monitor) +{ + struct crocus_context *ice = (void *) ctx; + struct intel_perf_context *perf_ctx = ice->perf_ctx; + + intel_perf_end_query(perf_ctx, monitor->query); + return true; +} + +bool +crocus_get_monitor_result(struct pipe_context *ctx, + struct crocus_monitor_object *monitor, + bool wait, + union pipe_numeric_type_union *result) +{ + struct crocus_context *ice = (void *) ctx; + struct intel_perf_context *perf_ctx = ice->perf_ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + + bool monitor_ready = + intel_perf_is_query_ready(perf_ctx, monitor->query, batch); + + if (!monitor_ready) { + if (!wait) + return false; + intel_perf_wait_query(perf_ctx, monitor->query, batch); + } + + assert(intel_perf_is_query_ready(perf_ctx, monitor->query, batch)); + + unsigned bytes_written; + intel_perf_get_query_data(perf_ctx, monitor->query, batch, + monitor->result_size, + (unsigned*) monitor->result_buffer, + &bytes_written); + if (bytes_written != monitor->result_size) + return false; + + /* copy metrics into the batch result */ + for (int i = 0; i < monitor->num_active_counters; ++i) { + int current_counter = monitor->active_counters[i]; + 
const struct intel_perf_query_info *info = + intel_perf_query_info(monitor->query); + const struct intel_perf_query_counter *counter = + &info->counters[current_counter]; + assert(intel_perf_query_counter_get_size(counter)); + switch (counter->data_type) { + case INTEL_PERF_COUNTER_DATA_TYPE_UINT64: + result[i].u64 = *(uint64_t*)(monitor->result_buffer + counter->offset); + break; + case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT: + result[i].f = *(float*)(monitor->result_buffer + counter->offset); + break; + case INTEL_PERF_COUNTER_DATA_TYPE_UINT32: + case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32: + result[i].u64 = *(uint32_t*)(monitor->result_buffer + counter->offset); + break; + case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE: { + double v = *(double*)(monitor->result_buffer + counter->offset); + result[i].f = v; + break; + } + default: + unreachable("unexpected counter data type"); + } + } + return true; +} diff --git a/src/gallium/drivers/crocus/crocus_monitor.h b/src/gallium/drivers/crocus/crocus_monitor.h new file mode 100644 index 00000000000..3335c8860e2 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_monitor.h @@ -0,0 +1,72 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef CROCUS_MONITOR_H +#define CROCUS_MONITOR_H + +#include "pipe/p_screen.h" + +struct crocus_monitor_counter { + int group; + int counter; +}; + +struct crocus_monitor_config { + struct intel_perf_config *perf_cfg; + + /* gallium requires an index for each counter */ + int num_counters; + struct crocus_monitor_counter *counters; +}; + +int crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info); +int crocus_get_monitor_group_info(struct pipe_screen *pscreen, + unsigned index, + struct pipe_driver_query_group_info *info); + +struct crocus_context; +struct crocus_screen; + +struct crocus_monitor_object * +crocus_create_monitor_object(struct crocus_context *ice, + unsigned num_queries, + unsigned *query_types); + +struct pipe_query; +void crocus_destroy_monitor_object(struct pipe_context *ctx, + struct crocus_monitor_object *monitor); + +bool +crocus_begin_monitor(struct pipe_context *ctx, + struct crocus_monitor_object *monitor); +bool +crocus_end_monitor(struct pipe_context *ctx, + struct crocus_monitor_object *monitor); + +bool +crocus_get_monitor_result(struct pipe_context *ctx, + struct crocus_monitor_object *monitor, + bool wait, + union pipe_numeric_type_union *result); + +#endif diff --git a/src/gallium/drivers/crocus/crocus_pipe.h b/src/gallium/drivers/crocus/crocus_pipe.h new file mode 100644 index 00000000000..71b12d08e16 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_pipe.h @@ -0,0 +1,74 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef CROCUS_PIPE_H +#define CROCUS_PIPE_H + +#include "pipe/p_defines.h" +#include "compiler/shader_enums.h" + +static inline gl_shader_stage +stage_from_pipe(enum pipe_shader_type pstage) +{ + static const gl_shader_stage stages[PIPE_SHADER_TYPES] = { + [PIPE_SHADER_VERTEX] = MESA_SHADER_VERTEX, + [PIPE_SHADER_TESS_CTRL] = MESA_SHADER_TESS_CTRL, + [PIPE_SHADER_TESS_EVAL] = MESA_SHADER_TESS_EVAL, + [PIPE_SHADER_GEOMETRY] = MESA_SHADER_GEOMETRY, + [PIPE_SHADER_FRAGMENT] = MESA_SHADER_FRAGMENT, + [PIPE_SHADER_COMPUTE] = MESA_SHADER_COMPUTE, + }; + return stages[pstage]; +} + +static inline enum pipe_shader_type +stage_to_pipe(gl_shader_stage stage) +{ + static const enum pipe_shader_type pstages[MESA_SHADER_STAGES] = { + [MESA_SHADER_VERTEX] = PIPE_SHADER_VERTEX, + [MESA_SHADER_TESS_CTRL] = PIPE_SHADER_TESS_CTRL, + [MESA_SHADER_TESS_EVAL] = PIPE_SHADER_TESS_EVAL, + [MESA_SHADER_GEOMETRY] = PIPE_SHADER_GEOMETRY, + [MESA_SHADER_FRAGMENT] = PIPE_SHADER_FRAGMENT, + [MESA_SHADER_COMPUTE] = PIPE_SHADER_COMPUTE, + }; + return pstages[stage]; +} + +/** + * Convert an swizzle enumeration (i.e. PIPE_SWIZZLE_X) to one of the HW's + * "Shader Channel Select" enumerations (i.e. SCS_RED). The mappings are + * + * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE + * 0 1 2 3 4 5 + * 4 5 6 7 0 1 + * SCS_RED, SCS_GREEN, SCS_BLUE, SCS_ALPHA, SCS_ZERO, SCS_ONE + * + * which is simply adding 4 then modding by 8 (or anding with 7). + */ +static inline enum isl_channel_select +pipe_swizzle_to_isl_channel(enum pipe_swizzle swizzle) +{ + return (swizzle + 4) & 7; +} + +#endif diff --git a/src/gallium/drivers/crocus/crocus_pipe_control.c b/src/gallium/drivers/crocus/crocus_pipe_control.c new file mode 100644 index 00000000000..7a9625c61ed --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_pipe_control.c @@ -0,0 +1,368 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_pipe_control.c + * + * PIPE_CONTROL is the main flushing and synchronization primitive on Intel + * GPUs. It can invalidate caches, stall until rendering reaches various + * stages of completion, write to memory, and other things. In a way, it's + * a swiss army knife command - it has all kinds of capabilities, but some + * significant limitations as well. + * + * Unfortunately, it's notoriously complicated and difficult to use. Many + * sub-commands can't be used together. 
Some are meant to be used at the + * top of the pipeline (invalidating caches before drawing), while some are + * meant to be used at the end (stalling or flushing after drawing). + * + * Also, there's a list of restrictions a mile long, which vary by generation. + * Do this before doing that, or suffer the consequences (usually a GPU hang). + * + * This file contains helpers for emitting them safely. You can simply call + * crocus_emit_pipe_control_flush() with the desired operations (as logical + * PIPE_CONTROL_* bits), and it will take care of splitting it into multiple + * PIPE_CONTROL commands as necessary. The per-generation workarounds are + * applied in crocus_emit_raw_pipe_control() in crocus_state.c. + */ + +#include "crocus_context.h" +#include "util/hash_table.h" +#include "util/set.h" + +/** + * Emit a PIPE_CONTROL with various flushing flags. + * + * The caller is responsible for deciding what flags are appropriate for the + * given generation. + */ +void +crocus_emit_pipe_control_flush(struct crocus_batch *batch, + const char *reason, + uint32_t flags) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + + if (devinfo->ver >= 6 && + (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) && + (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) { + /* A pipe control command with flush and invalidate bits set + * simultaneously is an inherently racy operation on Gen6+ if the + * contents of the flushed caches were intended to become visible from + * any of the invalidated caches. Split it in two PIPE_CONTROLs, the + * first one should stall the pipeline to make sure that the flushed R/W + * caches are coherent with memory once the specified R/O caches are + * invalidated. On pre-Gen6 hardware the (implicit) R/O cache + * invalidation seems to happen at the bottom of the pipeline together + * with any write cache flush, so this shouldn't be a concern. In order + * to ensure a full stall, we do an end-of-pipe sync. + */ + crocus_emit_end_of_pipe_sync(batch, reason, + flags & PIPE_CONTROL_CACHE_FLUSH_BITS); + flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL); + } + + batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, NULL, 0, 0); +} + +/** + * Emit a PIPE_CONTROL that writes to a buffer object. + * + * \p flags should contain one of the following items: + * - PIPE_CONTROL_WRITE_IMMEDIATE + * - PIPE_CONTROL_WRITE_TIMESTAMP + * - PIPE_CONTROL_WRITE_DEPTH_COUNT + */ +void +crocus_emit_pipe_control_write(struct crocus_batch *batch, + const char *reason, uint32_t flags, + struct crocus_bo *bo, uint32_t offset, + uint64_t imm) +{ + batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, bo, offset, imm); +} + +/** + * Restriction [DevSNB, DevIVB]: + * + * Prior to changing Depth/Stencil Buffer state (i.e. any combination of + * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER, + * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall + * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth + * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by + * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set), + * unless SW can otherwise guarantee that the pipeline from WM onwards is + * already flushed (e.g., via a preceding MI_FLUSH). 
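+ *
+ * Illustrative note (not part of the original patch): the helper directly
+ * below implements exactly this sequence as three back-to-back PIPE_CONTROLs
+ * (depth stall, depth cache flush, depth stall); callers are expected to
+ * invoke it before emitting any of the depth/stencil state packets listed
+ * above.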
+ */ +void +crocus_emit_depth_stall_flushes(struct crocus_batch *batch) +{ + UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo; + + assert(devinfo->ver >= 6); + + crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_STALL); + crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_CACHE_FLUSH); + crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_STALL); +} + +/* + * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization": + * + * Write synchronization is a special case of end-of-pipe + * synchronization that requires that the render cache and/or depth + * related caches are flushed to memory, where the data will become + * globally visible. This type of synchronization is required prior to + * SW (CPU) actually reading the result data from memory, or initiating + * an operation that will use as a read surface (such as a texture + * surface) a previous render target and/or depth/stencil buffer + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Exercising the write cache flush bits (Render Target Cache Flush + * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only + * ensures the write caches are flushed and doesn't guarantee the data + * is globally visible. + * + * SW can track the completion of the end-of-pipe-synchronization by + * using "Notify Enable" and "PostSync Operation - Write Immediate + * Data" in the PIPE_CONTROL command. + */ +void +crocus_emit_end_of_pipe_sync(struct crocus_batch *batch, + const char *reason, uint32_t flags) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + + if (devinfo->ver >= 6) { + /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": + * + * "The most common action to perform upon reaching a synchronization + * point is to write a value out to memory. An immediate value + * (included with the synchronization command) may be written." + * + * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": + * + * "In case the data flushed out by the render engine is to be read + * back in to the render engine in coherent manner, then the render + * engine has to wait for the fence completion before accessing the + * flushed data. This can be achieved by following means on various + * products: PIPE_CONTROL command with CS Stall and the required + * write caches flushed with Post-Sync-Operation as Write Immediate + * Data. + * + * Example: + * - Workload-1 (3D/GPGPU/MEDIA) + * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate + * Data, Required Write Cache Flush bits set) + * - Workload-2 (Can use the data produce or output by Workload-1) + */ + crocus_emit_pipe_control_write(batch, reason, + flags | PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_WRITE_IMMEDIATE, + batch->ice->workaround_bo, + batch->ice->workaround_offset, 0); + + if (batch->screen->devinfo.is_haswell) { +#define GEN7_3DPRIM_START_INSTANCE 0x243C + batch->screen->vtbl.load_register_mem32(batch, GEN7_3DPRIM_START_INSTANCE, + batch->ice->workaround_bo, + batch->ice->workaround_offset); + } + } else { + /* On gen4-5, a regular pipe control seems to suffice. */ + crocus_emit_pipe_control_flush(batch, reason, flags); + } +} + +/* Emit a pipelined flush to either flush render and texture cache for + * reading from a FBO-drawn texture, or flush so that frontbuffer + * render appears on the screen in DRI1. + * + * This is also used for the always_flush_cache driconf debug option. 
+ */ +void +crocus_emit_mi_flush(struct crocus_batch *batch) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH; + if (devinfo->ver >= 6) { + flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_VF_CACHE_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CS_STALL; + } + crocus_emit_pipe_control_flush(batch, "mi flush", flags); +} + +/** + * Emits a PIPE_CONTROL with a non-zero post-sync operation, for + * implementing two workarounds on gen6. From section 1.4.7.1 + * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: + * + * [DevSNB-C+{W/A}] Before any depth stall flush (including those + * produced by non-pipelined state commands), software needs to first + * send a PIPE_CONTROL with no bits set except Post-Sync Operation != + * 0. + * + * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable + * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. + * + * And the workaround for these two requires this workaround first: + * + * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent + * BEFORE the pipe-control with a post-sync op and no write-cache + * flushes. + * + * And this last workaround is tricky because of the requirements on + * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM + * volume 2 part 1: + * + * "1 of the following must also be set: + * - Render Target Cache Flush Enable ([12] of DW1) + * - Depth Cache Flush Enable ([0] of DW1) + * - Stall at Pixel Scoreboard ([1] of DW1) + * - Depth Stall ([13] of DW1) + * - Post-Sync Operation ([13] of DW1) + * - Notify Enable ([8] of DW1)" + * + * The cache flushes require the workaround flush that triggered this + * one, so we can't use it. Depth stall would trigger the same. + * Post-sync nonzero is what triggered this second workaround, so we + * can't use that one either. Notify enable is IRQs, which aren't + * really our business. That leaves only stall at scoreboard. + */ +void +crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch) +{ + crocus_emit_pipe_control_flush(batch, "nonzero", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_STALL_AT_SCOREBOARD); + + crocus_emit_pipe_control_write(batch, "nonzero", + PIPE_CONTROL_WRITE_IMMEDIATE, + batch->ice->workaround_bo, + batch->ice->workaround_offset, 0); +} + +/** + * Flush and invalidate all caches (for debugging purposes). 
+ */ +void +crocus_flush_all_caches(struct crocus_batch *batch) +{ + crocus_emit_pipe_control_flush(batch, "debug: flush all caches", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_VF_CACHE_INVALIDATE | + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE); +} + +static void +crocus_texture_barrier(struct pipe_context *ctx, unsigned flags) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_batch *render_batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_batch *compute_batch = &ice->batches[CROCUS_BATCH_COMPUTE]; + const struct intel_device_info *devinfo = &render_batch->screen->devinfo; + + if (devinfo->ver < 6) { + crocus_emit_mi_flush(render_batch); + return; + } + + if (render_batch->contains_draw) { + crocus_batch_maybe_flush(render_batch, 48); + crocus_emit_pipe_control_flush(render_batch, + "API: texture barrier (1/2)", + (flags == 1 ? PIPE_CONTROL_DEPTH_CACHE_FLUSH : 0) | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_CS_STALL); + crocus_emit_pipe_control_flush(render_batch, + "API: texture barrier (2/2)", + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE); + } + + if (compute_batch->contains_draw) { + crocus_batch_maybe_flush(compute_batch, 48); + crocus_emit_pipe_control_flush(compute_batch, + "API: texture barrier (1/2)", + PIPE_CONTROL_CS_STALL); + crocus_emit_pipe_control_flush(compute_batch, + "API: texture barrier (2/2)", + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE); + } +} + +static void +crocus_memory_barrier(struct pipe_context *ctx, unsigned flags) +{ + struct crocus_context *ice = (void *) ctx; + unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL; + const struct intel_device_info *devinfo = &ice->batches[0].screen->devinfo; + + assert(devinfo->ver == 7); + + if (flags & (PIPE_BARRIER_VERTEX_BUFFER | + PIPE_BARRIER_INDEX_BUFFER | + PIPE_BARRIER_INDIRECT_BUFFER)) { + bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + } + + if (flags & PIPE_BARRIER_CONSTANT_BUFFER) { + bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE; + } + + if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_FRAMEBUFFER)) { + bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_RENDER_TARGET_FLUSH; + } + + /* Typed surface messages are handled by the render cache on IVB, so we + * need to flush it too. 
+ */ + if (!devinfo->is_haswell) + bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH; + + for (int i = 0; i < ice->batch_count; i++) { + if (ice->batches[i].contains_draw) { + crocus_batch_maybe_flush(&ice->batches[i], 24); + crocus_emit_pipe_control_flush(&ice->batches[i], "API: memory barrier", + bits); + } + } +} + +void +crocus_init_flush_functions(struct pipe_context *ctx) +{ + ctx->memory_barrier = crocus_memory_barrier; + ctx->texture_barrier = crocus_texture_barrier; +} diff --git a/src/gallium/drivers/crocus/crocus_program.c b/src/gallium/drivers/crocus/crocus_program.c new file mode 100644 index 00000000000..fb8216b71ab --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_program.c @@ -0,0 +1,3171 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_program.c + * + * This file contains the driver interface for compiling shaders. + * + * See crocus_program_cache.c for the in-memory program cache where the + * compiled shaders are stored. + */ + +#include +#include +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_atomic.h" +#include "util/u_upload_mgr.h" +#include "util/debug.h" +#include "util/u_prim.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_serialize.h" +#include "intel/compiler/brw_compiler.h" +#include "intel/compiler/brw_nir.h" +#include "crocus_context.h" +#include "nir/tgsi_to_nir.h" + +#define KEY_INIT_NO_ID() \ + .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \ + .base.tex.swizzles[0 ... 
MAX_SAMPLERS - 1] = 0x688, \ + .base.tex.compressed_multisample_layout_mask = ~0 +#define KEY_INIT() .base.program_string_id = ish->program_id, KEY_INIT_NO_ID() + +static void +crocus_sanitize_tex_key(struct brw_sampler_prog_key_data *key) +{ + key->gather_channel_quirk_mask = 0; + for (unsigned s = 0; s < MAX_SAMPLERS; s++) { + key->swizzles[s] = SWIZZLE_NOOP; + key->gfx6_gather_wa[s] = 0; + } +} + +static uint32_t +crocus_get_texture_swizzle(const struct crocus_context *ice, + const struct crocus_sampler_view *t) +{ + uint32_t swiz = 0; + + for (int i = 0; i < 4; i++) { + swiz |= t->swizzle[i] << (i * 3); + } + return swiz; +} + +static inline bool can_push_ubo(const struct intel_device_info *devinfo) +{ + /* push works for everyone except SNB at the moment */ + return devinfo->ver != 6; +} + +static uint8_t +gfx6_gather_workaround(enum pipe_format pformat) +{ + switch (pformat) { + case PIPE_FORMAT_R8_SINT: return WA_SIGN | WA_8BIT; + case PIPE_FORMAT_R8_UINT: return WA_8BIT; + case PIPE_FORMAT_R16_SINT: return WA_SIGN | WA_16BIT; + case PIPE_FORMAT_R16_UINT: return WA_16BIT; + default: + /* Note that even though PIPE_FORMAT_R32_SINT and + * PIPE_FORMAT_R32_UINThave format overrides in + * the surface state, there is no shader w/a required. + */ + return 0; + } +} + +static const unsigned crocus_gfx6_swizzle_for_offset[4] = { + BRW_SWIZZLE4(0, 1, 2, 3), + BRW_SWIZZLE4(1, 2, 3, 3), + BRW_SWIZZLE4(2, 3, 3, 3), + BRW_SWIZZLE4(3, 3, 3, 3) +}; + +static void +gfx6_gs_xfb_setup(const struct pipe_stream_output_info *so_info, + struct brw_gs_prog_data *gs_prog_data) +{ + /* Make sure that the VUE slots won't overflow the unsigned chars in + * prog_data->transform_feedback_bindings[]. + */ + STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256); + + /* Make sure that we don't need more binding table entries than we've + * set aside for use in transform feedback. (We shouldn't, since we + * set aside enough binding table entries to have one per component). 
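The 0x688 default in KEY_INIT_NO_ID() and crocus_get_texture_swizzle() above use the same encoding: four 3-bit channel selectors, with channel i stored in bits [3i+2:3i]. A minimal standalone sketch of that packing (the function name is illustrative; selector values X=0, Y=1, Z=2, W=3 are the usual Mesa ones), showing that 0x688 is simply the identity XYZW swizzle:

   #include <stdio.h>

   /* Illustrative re-implementation of the 3-bits-per-channel swizzle
    * packing used by the sampler key (not driver code). */
   static unsigned
   pack_swizzle(unsigned x, unsigned y, unsigned z, unsigned w)
   {
      return x | (y << 3) | (z << 6) | (w << 9);
   }

   int main(void)
   {
      printf("0x%x\n", pack_swizzle(0, 1, 2, 3));   /* prints 0x688 */
      return 0;
   }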
+ */ + assert(so_info->num_outputs <= BRW_MAX_SOL_BINDINGS); + + gs_prog_data->num_transform_feedback_bindings = so_info->num_outputs; + for (unsigned i = 0; i < so_info->num_outputs; i++) { + gs_prog_data->transform_feedback_bindings[i] = + so_info->output[i].register_index; + gs_prog_data->transform_feedback_swizzles[i] = + crocus_gfx6_swizzle_for_offset[so_info->output[i].start_component]; + } +} + +static void +gfx6_ff_gs_xfb_setup(const struct pipe_stream_output_info *so_info, + struct brw_ff_gs_prog_key *key) +{ + key->num_transform_feedback_bindings = so_info->num_outputs; + for (unsigned i = 0; i < so_info->num_outputs; i++) { + key->transform_feedback_bindings[i] = + so_info->output[i].register_index; + key->transform_feedback_swizzles[i] = + crocus_gfx6_swizzle_for_offset[so_info->output[i].start_component]; + } +} + +static void +crocus_populate_sampler_prog_key_data(struct crocus_context *ice, + const struct intel_device_info *devinfo, + gl_shader_stage stage, + struct crocus_uncompiled_shader *ish, + bool uses_texture_gather, + struct brw_sampler_prog_key_data *key) +{ + uint32_t mask = ish->nir->info.textures_used[0]; + + while (mask) { + const int s = u_bit_scan(&mask); + + struct crocus_sampler_view *texture = ice->state.shaders[stage].textures[s]; + key->swizzles[s] = SWIZZLE_NOOP; + key->scale_factors[s] = 0.0f; + + if (!texture) + continue; + if (texture->base.target == PIPE_BUFFER) + continue; + if (!devinfo->is_haswell) { + key->swizzles[s] = crocus_get_texture_swizzle(ice, texture); + } + + /* gather4 for RG32* is broken in multiple ways on Gen7. */ + if (devinfo->ver == 7 && uses_texture_gather) { + switch (texture->base.format) { + case PIPE_FORMAT_R32G32_UINT: + case PIPE_FORMAT_R32G32_SINT: { + /* We have to override the format to R32G32_FLOAT_LD. + * This means that SCS_ALPHA and SCS_ONE will return 0x3f8 + * (1.0) rather than integer 1. This needs shader hacks. + * + * On Ivybridge, we whack W (alpha) to ONE in our key's + * swizzle. On Haswell, we look at the original texture + * swizzle, and use XYZW with channels overridden to ONE, + * leaving normal texture swizzling to SCS. + */ + unsigned src_swizzle = key->swizzles[s]; + for (int i = 0; i < 4; i++) { + unsigned src_comp = GET_SWZ(src_swizzle, i); + if (src_comp == SWIZZLE_ONE || src_comp == SWIZZLE_W) { + key->swizzles[i] &= ~(0x7 << (3 * i)); + key->swizzles[i] |= SWIZZLE_ONE << (3 * i); + } + } + } + FALLTHROUGH; + case PIPE_FORMAT_R32G32_FLOAT: + /* The channel select for green doesn't work - we have to + * request blue. Haswell can use SCS for this, but Ivybridge + * needs a shader workaround. 
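As a side note on the gfx6 gather workaround selected by gfx6_gather_workaround() above: crocus_setup_binding_table() later in this file emits NIR that rescales the normalized value the sampler returns and, for signed formats, sign-extends it. A standalone plain-C sketch of that arithmetic (the function name and the main() example are illustrative, not driver code):

   #include <stdbool.h>
   #include <stdint.h>
   #include <stdio.h>

   /* Reconstruct an 8- or 16-bit integer texel from the normalized float
    * a gfx6 gather4 returns for R8/R16 integer formats (sketch only). */
   static int32_t
   gfx6_gather_fixup(float sampled, int width, bool is_signed)
   {
      uint32_t val = (uint32_t)(sampled * ((1u << width) - 1));  /* rescale */
      if (is_signed) {
         int shift = 32 - width;
         return ((int32_t)(val << shift)) >> shift;              /* sign-extend */
      }
      return (int32_t)val;
   }

   int main(void)
   {
      /* An R8_SINT texel of 0xff is sampled back as 1.0f; the fix-up
       * recovers -1. */
      printf("%d\n", gfx6_gather_fixup(1.0f, 8, true));
      return 0;
   }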
+ */ + if (!devinfo->is_haswell) + key->gather_channel_quirk_mask |= 1 << s; + break; + default: + break; + } + } + if (devinfo->ver == 6 && uses_texture_gather) { + key->gfx6_gather_wa[s] = gfx6_gather_workaround(texture->base.format); + } + } +} + +static void +crocus_lower_swizzles(struct nir_shader *nir, + const struct brw_sampler_prog_key_data *key_tex) +{ + struct nir_lower_tex_options tex_options = { 0 }; + uint32_t mask = nir->info.textures_used[0]; + + while (mask) { + const int s = u_bit_scan(&mask); + + if (key_tex->swizzles[s] == SWIZZLE_NOOP) + continue; + + tex_options.swizzle_result |= (1 << s); + for (unsigned c = 0; c < 4; c++) + tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c); + } + if (tex_options.swizzle_result) + nir_lower_tex(nir, &tex_options); +} + +static unsigned +get_new_program_id(struct crocus_screen *screen) +{ + return p_atomic_inc_return(&screen->program_id); +} + +static nir_ssa_def * +get_aoa_deref_offset(nir_builder *b, + nir_deref_instr *deref, + unsigned elem_size) +{ + unsigned array_size = elem_size; + nir_ssa_def *offset = nir_imm_int(b, 0); + + while (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + + /* This level's element size is the previous level's array size */ + nir_ssa_def *index = nir_ssa_for_src(b, deref->arr.index, 1); + assert(deref->arr.index.ssa); + offset = nir_iadd(b, offset, + nir_imul(b, index, nir_imm_int(b, array_size))); + + deref = nir_deref_instr_parent(deref); + assert(glsl_type_is_array(deref->type)); + array_size *= glsl_get_length(deref->type); + } + + /* Accessing an invalid surface index with the dataport can result in a + * hang. According to the spec "if the index used to select an individual + * element is negative or greater than or equal to the size of the array, + * the results of the operation are undefined but may not lead to + * termination" -- which is one of the possible outcomes of the hang. + * Clamp the index to prevent access outside of the array bounds. 
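A plain-C sketch of what the NIR built just below computes, assuming a two-level image array with illustrative dimensions (the helper name is made up): nested indices are flattened into one surface index and clamped, matching the nir_umin() that follows.

   /* Illustrative only: flatten img[i][j] (outer x inner images) into a
    * single surface index, clamped to the last valid element so an
    * out-of-bounds index cannot select a surface outside the array. */
   static unsigned
   flatten_image_index(unsigned i, unsigned j, unsigned outer, unsigned inner)
   {
      unsigned flat = i * inner + j;       /* innermost index varies fastest */
      unsigned last = outer * inner - 1;   /* highest in-bounds flat index */
      return flat < last ? flat : last;
   }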
+ */ + return nir_umin(b, offset, nir_imm_int(b, array_size - elem_size)); +} + +static void +crocus_lower_storage_image_derefs(nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_load_raw_intel: + case nir_intrinsic_image_deref_store_raw_intel: { + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + + b.cursor = nir_before_instr(&intrin->instr); + nir_ssa_def *index = + nir_iadd(&b, nir_imm_int(&b, var->data.driver_location), + get_aoa_deref_offset(&b, deref, 1)); + nir_rewrite_image_intrinsic(intrin, index, false); + break; + } + + default: + break; + } + } + } +} + +// XXX: need unify_interfaces() at link time... + +/** + * Undo nir_lower_passthrough_edgeflags but keep the inputs_read flag. + */ +static bool +crocus_fix_edge_flags(nir_shader *nir) +{ + if (nir->info.stage != MESA_SHADER_VERTEX) { + nir_shader_preserve_all_metadata(nir); + return false; + } + + nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_out, + VARYING_SLOT_EDGE); + if (!var) { + nir_shader_preserve_all_metadata(nir); + return false; + } + + var->data.mode = nir_var_shader_temp; + nir->info.outputs_written &= ~VARYING_BIT_EDGE; + nir->info.inputs_read &= ~VERT_BIT_EDGEFLAG; + nir_fixup_deref_modes(nir); + + nir_foreach_function(f, nir) { + if (f->impl) { + nir_metadata_preserve(f->impl, nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs | + nir_metadata_loop_analysis); + } else { + nir_metadata_preserve(f->impl, nir_metadata_all); + } + } + + return true; +} + +/** + * Fix an uncompiled shader's stream output info. + * + * Core Gallium stores output->register_index as a "slot" number, where + * slots are assigned consecutively to all outputs in info->outputs_written. + * This naive packing of outputs doesn't work for us - we too have slots, + * but the layout is defined by the VUE map, which we won't have until we + * compile a specific shader variant. So, we remap these and simply store + * VARYING_SLOT_* in our copy's output->register_index fields. + * + * We also fix up VARYING_SLOT_{LAYER,VIEWPORT,PSIZ} to select the Y/Z/W + * components of our VUE header. See brw_vue_map.c for the layout. 
+ */ +static void +update_so_info(struct pipe_stream_output_info *so_info, + uint64_t outputs_written) +{ + uint8_t reverse_map[64] = {}; + unsigned slot = 0; + while (outputs_written) { + reverse_map[slot++] = u_bit_scan64(&outputs_written); + } + + for (unsigned i = 0; i < so_info->num_outputs; i++) { + struct pipe_stream_output *output = &so_info->output[i]; + + /* Map Gallium's condensed "slots" back to real VARYING_SLOT_* enums */ + output->register_index = reverse_map[output->register_index]; + + /* The VUE header contains three scalar fields packed together: + * - gl_PointSize is stored in VARYING_SLOT_PSIZ.w + * - gl_Layer is stored in VARYING_SLOT_PSIZ.y + * - gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z + */ + switch (output->register_index) { + case VARYING_SLOT_LAYER: + assert(output->num_components == 1); + output->register_index = VARYING_SLOT_PSIZ; + output->start_component = 1; + break; + case VARYING_SLOT_VIEWPORT: + assert(output->num_components == 1); + output->register_index = VARYING_SLOT_PSIZ; + output->start_component = 2; + break; + case VARYING_SLOT_PSIZ: + assert(output->num_components == 1); + output->start_component = 3; + break; + } + + //info->outputs_written |= 1ull << output->register_index; + } +} + +static void +setup_vec4_image_sysval(uint32_t *sysvals, uint32_t idx, + unsigned offset, unsigned n) +{ + assert(offset % sizeof(uint32_t) == 0); + + for (unsigned i = 0; i < n; ++i) + sysvals[i] = BRW_PARAM_IMAGE(idx, offset / sizeof(uint32_t) + i); + + for (unsigned i = n; i < 4; ++i) + sysvals[i] = BRW_PARAM_BUILTIN_ZERO; +} + +/** + * Associate NIR uniform variables with the prog_data->param[] mechanism + * used by the backend. Also, decide which UBOs we'd like to push in an + * ideal situation (though the backend can reduce this). + */ +static void +crocus_setup_uniforms(const struct brw_compiler *compiler, + void *mem_ctx, + nir_shader *nir, + struct brw_stage_prog_data *prog_data, + enum brw_param_builtin **out_system_values, + unsigned *out_num_system_values, + unsigned *out_num_cbufs) +{ + UNUSED const struct intel_device_info *devinfo = compiler->devinfo; + + const unsigned CROCUS_MAX_SYSTEM_VALUES = + PIPE_MAX_SHADER_IMAGES * BRW_IMAGE_PARAM_SIZE; + enum brw_param_builtin *system_values = + rzalloc_array(mem_ctx, enum brw_param_builtin, CROCUS_MAX_SYSTEM_VALUES); + unsigned num_system_values = 0; + + unsigned patch_vert_idx = -1; + unsigned ucp_idx[CROCUS_MAX_CLIP_PLANES]; + unsigned img_idx[PIPE_MAX_SHADER_IMAGES]; + unsigned variable_group_size_idx = -1; + memset(ucp_idx, -1, sizeof(ucp_idx)); + memset(img_idx, -1, sizeof(img_idx)); + + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, impl); + + b.cursor = nir_before_block(nir_start_block(impl)); + nir_ssa_def *temp_ubo_name = nir_ssa_undef(&b, 1, 32); + nir_ssa_def *temp_const_ubo_name = NULL; + + /* Turn system value intrinsics into uniforms */ + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + nir_ssa_def *offset; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_constant: { + /* This one is special because it reads from the shader constant + * data and not cbuf0 which gallium uploads for us. 
+ */ + b.cursor = nir_before_instr(instr); + nir_ssa_def *offset = + nir_iadd_imm(&b, nir_ssa_for_src(&b, intrin->src[0], 1), + nir_intrinsic_base(intrin)); + + if (temp_const_ubo_name == NULL) + temp_const_ubo_name = nir_imm_int(&b, 0); + + nir_intrinsic_instr *load_ubo = + nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ubo); + load_ubo->num_components = intrin->num_components; + load_ubo->src[0] = nir_src_for_ssa(temp_const_ubo_name); + load_ubo->src[1] = nir_src_for_ssa(offset); + nir_intrinsic_set_align(load_ubo, 4, 0); + nir_intrinsic_set_range_base(load_ubo, 0); + nir_intrinsic_set_range(load_ubo, ~0); + nir_ssa_dest_init(&load_ubo->instr, &load_ubo->dest, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, + intrin->dest.ssa.name); + nir_builder_instr_insert(&b, &load_ubo->instr); + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + &load_ubo->dest.ssa); + nir_instr_remove(&intrin->instr); + continue; + } + case nir_intrinsic_load_user_clip_plane: { + unsigned ucp = nir_intrinsic_ucp_id(intrin); + + if (ucp_idx[ucp] == -1) { + ucp_idx[ucp] = num_system_values; + num_system_values += 4; + } + + for (int i = 0; i < 4; i++) { + system_values[ucp_idx[ucp] + i] = + BRW_PARAM_BUILTIN_CLIP_PLANE(ucp, i); + } + + b.cursor = nir_before_instr(instr); + offset = nir_imm_int(&b, ucp_idx[ucp] * sizeof(uint32_t)); + break; + } + case nir_intrinsic_load_patch_vertices_in: + if (patch_vert_idx == -1) + patch_vert_idx = num_system_values++; + + system_values[patch_vert_idx] = + BRW_PARAM_BUILTIN_PATCH_VERTICES_IN; + + b.cursor = nir_before_instr(instr); + offset = nir_imm_int(&b, patch_vert_idx * sizeof(uint32_t)); + break; + case nir_intrinsic_image_deref_load_param_intel: { + assert(devinfo->ver < 9); + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + + if (img_idx[var->data.binding] == -1) { + /* GL only allows arrays of arrays of images. 
*/ + assert(glsl_type_is_image(glsl_without_array(var->type))); + unsigned num_images = MAX2(1, glsl_get_aoa_size(var->type)); + + for (int i = 0; i < num_images; i++) { + const unsigned img = var->data.binding + i; + + img_idx[img] = num_system_values; + num_system_values += BRW_IMAGE_PARAM_SIZE; + + uint32_t *img_sv = &system_values[img_idx[img]]; + + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_OFFSET_OFFSET, img, + offsetof(struct brw_image_param, offset), 2); + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_SIZE_OFFSET, img, + offsetof(struct brw_image_param, size), 3); + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_STRIDE_OFFSET, img, + offsetof(struct brw_image_param, stride), 4); + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_TILING_OFFSET, img, + offsetof(struct brw_image_param, tiling), 3); + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_SWIZZLING_OFFSET, img, + offsetof(struct brw_image_param, swizzling), 2); + } + } + + b.cursor = nir_before_instr(instr); + offset = nir_iadd(&b, + get_aoa_deref_offset(&b, deref, BRW_IMAGE_PARAM_SIZE * 4), + nir_imm_int(&b, img_idx[var->data.binding] * 4 + + nir_intrinsic_base(intrin) * 16)); + break; + } + case nir_intrinsic_load_workgroup_size: { + assert(nir->info.workgroup_size_variable); + if (variable_group_size_idx == -1) { + variable_group_size_idx = num_system_values; + num_system_values += 3; + for (int i = 0; i < 3; i++) { + system_values[variable_group_size_idx + i] = + BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i; + } + } + + b.cursor = nir_before_instr(instr); + offset = nir_imm_int(&b, + variable_group_size_idx * sizeof(uint32_t)); + break; + } + default: + continue; + } + + unsigned comps = nir_intrinsic_dest_components(intrin); + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(nir, nir_intrinsic_load_ubo); + load->num_components = comps; + load->src[0] = nir_src_for_ssa(temp_ubo_name); + load->src[1] = nir_src_for_ssa(offset); + nir_intrinsic_set_align(load, 4, 0); + nir_intrinsic_set_range_base(load, 0); + nir_intrinsic_set_range(load, ~0); + nir_ssa_dest_init(&load->instr, &load->dest, comps, 32, NULL); + nir_builder_instr_insert(&b, &load->instr); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + &load->dest.ssa); + nir_instr_remove(instr); + } + } + + nir_validate_shader(nir, "before remapping"); + + /* Uniforms are stored in constant buffer 0, the + * user-facing UBOs are indexed by one. So if any constant buffer is + * needed, the constant buffer 0 will be needed, so account for it. + */ + unsigned num_cbufs = nir->info.num_ubos; + if (num_cbufs || nir->num_uniforms) + num_cbufs++; + + /* Place the new params in a new cbuf. 
*/ + if (num_system_values > 0) { + unsigned sysval_cbuf_index = num_cbufs; + num_cbufs++; + + system_values = reralloc(mem_ctx, system_values, enum brw_param_builtin, + num_system_values); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); + + if (load->intrinsic != nir_intrinsic_load_ubo) + continue; + + b.cursor = nir_before_instr(instr); + + assert(load->src[0].is_ssa); + + if (load->src[0].ssa == temp_ubo_name) { + nir_ssa_def *imm = nir_imm_int(&b, sysval_cbuf_index); + nir_instr_rewrite_src(instr, &load->src[0], + nir_src_for_ssa(imm)); + } + } + } + + /* We need to fold the new iadds for brw_nir_analyze_ubo_ranges */ + nir_opt_constant_folding(nir); + } else { + ralloc_free(system_values); + system_values = NULL; + } + + assert(num_cbufs < PIPE_MAX_CONSTANT_BUFFERS); + nir_validate_shader(nir, "after remap"); + + /* We don't use params[] but gallium leaves num_uniforms set. We use this + * to detect when cbuf0 exists but we don't need it anymore when we get + * here. Instead, zero it out so that the back-end doesn't get confused + * when nr_params * 4 != num_uniforms != nr_params * 4. + */ + nir->num_uniforms = 0; + + /* Constant loads (if any) need to go at the end of the constant buffers so + * we need to know num_cbufs before we can lower to them. + */ + if (temp_const_ubo_name != NULL) { + nir_load_const_instr *const_ubo_index = + nir_instr_as_load_const(temp_const_ubo_name->parent_instr); + assert(const_ubo_index->def.bit_size == 32); + const_ubo_index->value[0].u32 = num_cbufs; + } + + *out_system_values = system_values; + *out_num_system_values = num_system_values; + *out_num_cbufs = num_cbufs; +} + +static const char *surface_group_names[] = { + [CROCUS_SURFACE_GROUP_RENDER_TARGET] = "render target", + [CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = "non-coherent render target read", + [CROCUS_SURFACE_GROUP_SOL] = "streamout", + [CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = "CS work groups", + [CROCUS_SURFACE_GROUP_TEXTURE] = "texture", + [CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = "texture gather", + [CROCUS_SURFACE_GROUP_UBO] = "ubo", + [CROCUS_SURFACE_GROUP_SSBO] = "ssbo", + [CROCUS_SURFACE_GROUP_IMAGE] = "image", +}; + +static void +crocus_print_binding_table(FILE *fp, const char *name, + const struct crocus_binding_table *bt) +{ + STATIC_ASSERT(ARRAY_SIZE(surface_group_names) == CROCUS_SURFACE_GROUP_COUNT); + + uint32_t total = 0; + uint32_t compacted = 0; + + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) { + uint32_t size = bt->sizes[i]; + total += size; + if (size) + compacted += util_bitcount64(bt->used_mask[i]); + } + + if (total == 0) { + fprintf(fp, "Binding table for %s is empty\n\n", name); + return; + } + + if (total != compacted) { + fprintf(fp, "Binding table for %s " + "(compacted to %u entries from %u entries)\n", + name, compacted, total); + } else { + fprintf(fp, "Binding table for %s (%u entries)\n", name, total); + } + + uint32_t entry = 0; + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) { + uint64_t mask = bt->used_mask[i]; + while (mask) { + int index = u_bit_scan64(&mask); + fprintf(fp, " [%u] %s #%d\n", entry++, surface_group_names[i], index); + } + } + fprintf(fp, "\n"); +} + +enum { + /* Max elements in a surface group. */ + SURFACE_GROUP_MAX_ELEMENTS = 64, +}; + +/** + * Map a pair to a binding table index. 
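crocus_group_index_to_bti() below maps a (group, index) pair to a compacted binding table slot by counting only the used entries beneath the requested bit. A worked standalone sketch with made-up numbers (the offset, mask and helper name are illustrative; the driver uses util_bitcount64 rather than the GCC builtin):

   #include <stdio.h>

   /* 'offset' is where the group starts in the compacted table; 'used_mask'
    * marks which group indices any instruction actually referenced. */
   static unsigned
   group_index_to_bti(unsigned offset, unsigned long long used_mask, unsigned index)
   {
      unsigned long long bit = 1ull << index;
      if (!(used_mask & bit))
         return ~0u;   /* analogous to CROCUS_SURFACE_NOT_USED */
      return offset + __builtin_popcountll(used_mask & (bit - 1));
   }

   int main(void)
   {
      /* A UBO group starting at slot 7 with only UBOs 0, 2 and 5 used:
       * UBO 5 compacts down to binding table index 7 + 2 = 9. */
      unsigned long long mask = (1ull << 0) | (1ull << 2) | (1ull << 5);
      printf("%u\n", group_index_to_bti(7, mask, 5));
      return 0;
   }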
+ * + * For example: => binding table index 12 + */ +uint32_t +crocus_group_index_to_bti(const struct crocus_binding_table *bt, + enum crocus_surface_group group, uint32_t index) +{ + assert(index < bt->sizes[group]); + uint64_t mask = bt->used_mask[group]; + uint64_t bit = 1ull << index; + if (bit & mask) { + return bt->offsets[group] + util_bitcount64((bit - 1) & mask); + } else { + return CROCUS_SURFACE_NOT_USED; + } +} + +/** + * Map a binding table index back to a pair. + * + * For example: binding table index 12 => + */ +uint32_t +crocus_bti_to_group_index(const struct crocus_binding_table *bt, + enum crocus_surface_group group, uint32_t bti) +{ + uint64_t used_mask = bt->used_mask[group]; + assert(bti >= bt->offsets[group]); + + uint32_t c = bti - bt->offsets[group]; + while (used_mask) { + int i = u_bit_scan64(&used_mask); + if (c == 0) + return i; + c--; + } + + return CROCUS_SURFACE_NOT_USED; +} + +static void +rewrite_src_with_bti(nir_builder *b, struct crocus_binding_table *bt, + nir_instr *instr, nir_src *src, + enum crocus_surface_group group) +{ + assert(bt->sizes[group] > 0); + + b->cursor = nir_before_instr(instr); + nir_ssa_def *bti; + if (nir_src_is_const(*src)) { + uint32_t index = nir_src_as_uint(*src); + bti = nir_imm_intN_t(b, crocus_group_index_to_bti(bt, group, index), + src->ssa->bit_size); + } else { + /* Indirect usage makes all the surfaces of the group to be available, + * so we can just add the base. + */ + assert(bt->used_mask[group] == BITFIELD64_MASK(bt->sizes[group])); + bti = nir_iadd_imm(b, src->ssa, bt->offsets[group]); + } + nir_instr_rewrite_src(instr, src, nir_src_for_ssa(bti)); +} + +static void +mark_used_with_src(struct crocus_binding_table *bt, nir_src *src, + enum crocus_surface_group group) +{ + assert(bt->sizes[group] > 0); + + if (nir_src_is_const(*src)) { + uint64_t index = nir_src_as_uint(*src); + assert(index < bt->sizes[group]); + bt->used_mask[group] |= 1ull << index; + } else { + /* There's an indirect usage, we need all the surfaces. */ + bt->used_mask[group] = BITFIELD64_MASK(bt->sizes[group]); + } +} + +static bool +skip_compacting_binding_tables(void) +{ + static int skip = -1; + if (skip < 0) + skip = env_var_as_boolean("INTEL_DISABLE_COMPACT_BINDING_TABLE", false); + return skip; +} + +/** + * Set up the binding table indices and apply to the shader. + */ +static void +crocus_setup_binding_table(const struct intel_device_info *devinfo, + struct nir_shader *nir, + struct crocus_binding_table *bt, + unsigned num_render_targets, + unsigned num_system_values, + unsigned num_cbufs, + const struct brw_sampler_prog_key_data *key) +{ + const struct shader_info *info = &nir->info; + + memset(bt, 0, sizeof(*bt)); + + /* Set the sizes for each surface group. For some groups, we already know + * upfront how many will be used, so mark them. + */ + if (info->stage == MESA_SHADER_FRAGMENT) { + bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET] = num_render_targets; + /* All render targets used. 
*/ + bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET] = + BITFIELD64_MASK(num_render_targets); + + /* Setup render target read surface group in order to support non-coherent + * framebuffer fetch on Gfx7 + */ + if (devinfo->ver >= 6 && info->outputs_read) { + bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = num_render_targets; + bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = + BITFIELD64_MASK(num_render_targets); + } + } else if (info->stage == MESA_SHADER_COMPUTE) { + bt->sizes[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1; + } else if (info->stage == MESA_SHADER_GEOMETRY) { + /* In gfx6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform + * feedback surfaces. + */ + if (devinfo->ver == 6) { + bt->sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS; + bt->used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1; + } + } + + bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE] = BITSET_LAST_BIT(info->textures_used); + bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE] = info->textures_used[0]; + + if (info->uses_texture_gather) { + bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = BITSET_LAST_BIT(info->textures_used); + bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = info->textures_used[0]; + } + + bt->sizes[CROCUS_SURFACE_GROUP_IMAGE] = info->num_images; + + /* Allocate an extra slot in the UBO section for NIR constants. + * Binding table compaction will remove it if unnecessary. + * + * We don't include them in crocus_compiled_shader::num_cbufs because + * they are uploaded separately from shs->constbufs[], but from a shader + * point of view, they're another UBO (at the end of the section). + */ + bt->sizes[CROCUS_SURFACE_GROUP_UBO] = num_cbufs + 1; + + bt->sizes[CROCUS_SURFACE_GROUP_SSBO] = info->num_ssbos; + + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) + assert(bt->sizes[i] <= SURFACE_GROUP_MAX_ELEMENTS); + + /* Mark surfaces used for the cases we don't have the information available + * upfront. 
+ */ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_foreach_block (block, impl) { + nir_foreach_instr (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_num_workgroups: + bt->used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1; + break; + + case nir_intrinsic_load_output: + if (devinfo->ver >= 6) { + mark_used_with_src(bt, &intrin->src[0], + CROCUS_SURFACE_GROUP_RENDER_TARGET_READ); + } + break; + + case nir_intrinsic_image_size: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_image_load_raw_intel: + case nir_intrinsic_image_store_raw_intel: + mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_IMAGE); + break; + + case nir_intrinsic_load_ubo: + mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_UBO); + break; + + case nir_intrinsic_store_ssbo: + mark_used_with_src(bt, &intrin->src[1], CROCUS_SURFACE_GROUP_SSBO); + break; + + case nir_intrinsic_get_ssbo_size: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_ssbo_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmax: + case nir_intrinsic_ssbo_atomic_fcomp_swap: + case nir_intrinsic_load_ssbo: + mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_SSBO); + break; + + default: + break; + } + } + } + + /* When disable we just mark everything as used. */ + if (unlikely(skip_compacting_binding_tables())) { + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) + bt->used_mask[i] = BITFIELD64_MASK(bt->sizes[i]); + } + + /* Calculate the offsets and the binding table size based on the used + * surfaces. After this point, the functions to go between "group indices" + * and binding table indices can be used. + */ + uint32_t next = 0; + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) { + if (bt->used_mask[i] != 0) { + bt->offsets[i] = next; + next += util_bitcount64(bt->used_mask[i]); + } + } + bt->size_bytes = next * 4; + + if (unlikely(INTEL_DEBUG & DEBUG_BT)) { + crocus_print_binding_table(stderr, gl_shader_stage_name(info->stage), bt); + } + + /* Apply the binding table indices. The backend compiler is not expected + * to change those, as we haven't set any of the *_start entries in brw + * binding_table. 
+ */ + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block (block, impl) { + nir_foreach_instr (instr, block) { + if (instr->type == nir_instr_type_tex) { + nir_tex_instr *tex = nir_instr_as_tex(instr); + bool is_gather = tex->op == nir_texop_tg4; + + /* rewrite the tg4 component from green to blue before replacing the + texture index */ + if (devinfo->ver == 7 && !devinfo->is_haswell) { + if (tex->component == 1) + if (key->gather_channel_quirk_mask & (1 << tex->texture_index)) + tex->component = 2; + } + + if (is_gather && devinfo->ver == 6 && key->gfx6_gather_wa[tex->texture_index]) { + b.cursor = nir_after_instr(instr); + enum gfx6_gather_sampler_wa wa = key->gfx6_gather_wa[tex->texture_index]; + int width = (wa & WA_8BIT) ? 8 : 16; + + nir_ssa_def *val = nir_fmul_imm(&b, &tex->dest.ssa, (1 << width) - 1); + val = nir_f2u32(&b, val); + if (wa & WA_SIGN) { + val = nir_ishl(&b, val, nir_imm_int(&b, 32 - width)); + val = nir_ishr(&b, val, nir_imm_int(&b, 32 - width)); + } + nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, val, val->parent_instr); + } + + tex->texture_index = + crocus_group_index_to_bti(bt, is_gather ? CROCUS_SURFACE_GROUP_TEXTURE_GATHER : CROCUS_SURFACE_GROUP_TEXTURE, + tex->texture_index); + continue; + } + + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_image_size: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_image_load_raw_intel: + case nir_intrinsic_image_store_raw_intel: + rewrite_src_with_bti(&b, bt, instr, &intrin->src[0], + CROCUS_SURFACE_GROUP_IMAGE); + break; + + case nir_intrinsic_load_ubo: + rewrite_src_with_bti(&b, bt, instr, &intrin->src[0], + CROCUS_SURFACE_GROUP_UBO); + break; + + case nir_intrinsic_store_ssbo: + rewrite_src_with_bti(&b, bt, instr, &intrin->src[1], + CROCUS_SURFACE_GROUP_SSBO); + break; + + case nir_intrinsic_load_output: + if (devinfo->ver >= 6) { + rewrite_src_with_bti(&b, bt, instr, &intrin->src[0], + CROCUS_SURFACE_GROUP_RENDER_TARGET_READ); + } + break; + + case nir_intrinsic_get_ssbo_size: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_ssbo_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmax: + case nir_intrinsic_ssbo_atomic_fcomp_swap: + case nir_intrinsic_load_ssbo: + rewrite_src_with_bti(&b, bt, instr, &intrin->src[0], + CROCUS_SURFACE_GROUP_SSBO); + break; + + default: + break; + } + } + } +} + +static void +crocus_debug_recompile(struct crocus_context *ice, + struct shader_info *info, + const struct brw_base_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen; + const struct brw_compiler *c = screen->compiler; + + if (!info) + return; + + 
c->shader_perf_log(&ice->dbg, "Recompiling %s shader for program %s: %s\n", + _mesa_shader_stage_to_string(info->stage), + info->name ? info->name : "(no identifier)", + info->label ? info->label : ""); + + const void *old_key = + crocus_find_previous_compile(ice, info->stage, key->program_string_id); + + brw_debug_key_recompile(c, &ice->dbg, info->stage, old_key, key); +} + +/** + * Get the shader for the last enabled geometry stage. + * + * This stage is the one which will feed stream output and the rasterizer. + */ +static gl_shader_stage +last_vue_stage(struct crocus_context *ice) +{ + if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) + return MESA_SHADER_GEOMETRY; + + if (ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) + return MESA_SHADER_TESS_EVAL; + + return MESA_SHADER_VERTEX; +} + +static GLbitfield64 +crocus_vs_outputs_written(struct crocus_context *ice, + const struct brw_vs_prog_key *key, + GLbitfield64 user_varyings) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + GLbitfield64 outputs_written = user_varyings; + + if (devinfo->ver < 6) { + + if (key->copy_edgeflag) + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE); + + /* Put dummy slots into the VUE for the SF to put the replaced + * point sprite coords in. We shouldn't need these dummy slots, + * which take up precious URB space, but it would mean that the SF + * doesn't get nice aligned pairs of input coords into output + * coords, which would be a pain to handle. + */ + for (unsigned i = 0; i < 8; i++) { + if (key->point_coord_replace & (1 << i)) + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i); + } + + /* if back colors are written, allocate slots for front colors too */ + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0)) + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0); + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1)) + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1); + } + + /* In order for legacy clipping to work, we need to populate the clip + * distance varying slots whenever clipping is enabled, even if the vertex + * shader doesn't write to gl_ClipDistance. + */ + if (key->nr_userclip_plane_consts > 0) { + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0); + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); + } + + return outputs_written; +} + +/* + * If no edgeflags come from the user, gen4/5 + * require giving the clip shader a default edgeflag. + * + * This will always be 1.0. + */ +static void +crocus_lower_default_edgeflags(struct nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, impl); + + b.cursor = nir_after_cf_list(&b.impl->body); + nir_variable *var = nir_variable_create(nir, nir_var_shader_out, + glsl_float_type(), + "edgeflag"); + var->data.location = VARYING_SLOT_EDGE; + nir_store_var(&b, var, nir_imm_float(&b, 1.0), 0x1); +} + +/** + * Compile a vertex shader, and upload the assembly. 
+ */ +static struct crocus_compiled_shader * +crocus_compile_vs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_vs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + const struct intel_device_info *devinfo = &screen->devinfo; + void *mem_ctx = ralloc_context(NULL); + struct brw_vs_prog_data *vs_prog_data = + rzalloc(mem_ctx, struct brw_vs_prog_data); + struct brw_vue_prog_data *vue_prog_data = &vs_prog_data->base; + struct brw_stage_prog_data *prog_data = &vue_prog_data->base; + enum brw_param_builtin *system_values; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + if (key->nr_userclip_plane_consts) { + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true, + false, NULL); + nir_lower_io_to_temporaries(nir, impl, true, false); + nir_lower_global_vars_to_local(nir); + nir_lower_vars_to_ssa(nir); + nir_shader_gather_info(nir, impl); + } + + prog_data->use_alt_mode = ish->use_alt_mode; + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + + crocus_lower_swizzles(nir, &key->base.tex); + + if (devinfo->ver <= 5 && + !(nir->info.inputs_read & BITFIELD64_BIT(VERT_ATTRIB_EDGEFLAG))) + crocus_lower_default_edgeflags(nir); + + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + uint64_t outputs_written = + crocus_vs_outputs_written(ice, key, nir->info.outputs_written); + brw_compute_vue_map(devinfo, + &vue_prog_data->vue_map, outputs_written, + nir->info.separate_shader, /* pos slots */ 1); + + /* Don't tell the backend about our clip plane constants, we've already + * lowered them in NIR and we don't want it doing it again. + */ + struct brw_vs_prog_key key_no_ucp = *key; + key_no_ucp.nr_userclip_plane_consts = 0; + key_no_ucp.copy_edgeflag = false; + crocus_sanitize_tex_key(&key_no_ucp.base.tex); + + struct brw_compile_vs_params params = { + .nir = nir, + .key = &key_no_ucp, + .prog_data = vs_prog_data, + .edgeflag_is_last = devinfo->ver < 6, + .log_data = &ice->dbg, + }; + const unsigned *program = + brw_compile_vs(compiler, mem_ctx, ¶ms); + if (program == NULL) { + dbg_printf("Failed to compile vertex shader: %s\n", params.error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + uint32_t *so_decls = NULL; + if (devinfo->ver > 6) + so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output, + &vue_prog_data->vue_map); + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_VS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*vs_prog_data), so_decls, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current vertex shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. 
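The crocus_update_compiled_*() functions that follow all use the same lookup order: in-memory program cache, then disk cache, then a fresh compile, flagging state dirty only when the bound variant changes. A compressed sketch of that flow with stub types (everything here is illustrative; the real helpers also take the context, the uncompiled shader and the key size):

   struct variant;   /* stands in for struct crocus_compiled_shader */
   struct key;       /* stands in for the per-stage brw_*_prog_key */

   struct variant *find_in_memory_cache(const struct key *k);  /* crocus_find_cached_shader */
   struct variant *find_in_disk_cache(const struct key *k);    /* crocus_disk_cache_retrieve */
   struct variant *compile_and_upload(const struct key *k);    /* crocus_compile_vs() etc. */

   static struct variant *
   get_variant(const struct key *k)
   {
      struct variant *v = find_in_memory_cache(k);
      if (!v)
         v = find_in_disk_cache(k);
      if (!v)
         v = compile_and_upload(k);
      /* the caller flags CROCUS_STAGE_DIRTY_* only if 'v' differs from the
       * previously bound variant */
      return v;
   }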
+ */ +static void +crocus_update_compiled_vs(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_VERTEX]; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_vs_prog_key key = { KEY_INIT() }; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_VERTEX, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + screen->vtbl.populate_vs_key(ice, &ish->nir->info, last_vue_stage(ice), &key); + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_VS]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_VS, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_vs(ice, ish, &key); + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_VS] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS | + CROCUS_STAGE_DIRTY_BINDINGS_VS | + CROCUS_STAGE_DIRTY_CONSTANTS_VS; + shs->sysvals_need_upload = true; + + const struct brw_vs_prog_data *vs_prog_data = + (void *) shader->prog_data; + const bool uses_draw_params = vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance; + const bool uses_derived_draw_params = vs_prog_data->uses_drawid || + vs_prog_data->uses_is_indexed_draw; + const bool needs_sgvs_element = uses_draw_params || + vs_prog_data->uses_instanceid || + vs_prog_data->uses_vertexid; + + if (ice->state.vs_uses_draw_params != uses_draw_params || + ice->state.vs_uses_derived_draw_params != uses_derived_draw_params || + ice->state.vs_needs_edge_flag != ish->needs_edge_flag || + ice->state.vs_uses_vertexid != vs_prog_data->uses_vertexid || + ice->state.vs_uses_instanceid != vs_prog_data->uses_instanceid) { + ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS | + CROCUS_DIRTY_VERTEX_ELEMENTS; + } + ice->state.vs_uses_draw_params = uses_draw_params; + ice->state.vs_uses_derived_draw_params = uses_derived_draw_params; + ice->state.vs_needs_sgvs_element = needs_sgvs_element; + ice->state.vs_needs_edge_flag = ish->needs_edge_flag; + ice->state.vs_uses_vertexid = vs_prog_data->uses_vertexid; + ice->state.vs_uses_instanceid = vs_prog_data->uses_instanceid; + } +} + +/** + * Get the shader_info for a given stage, or NULL if the stage is disabled. + */ +const struct shader_info * +crocus_get_shader_info(const struct crocus_context *ice, gl_shader_stage stage) +{ + const struct crocus_uncompiled_shader *ish = ice->shaders.uncompiled[stage]; + + if (!ish) + return NULL; + + const nir_shader *nir = ish->nir; + return &nir->info; +} + +/** + * Get the union of TCS output and TES input slots. + * + * TCS and TES need to agree on a common URB entry layout. In particular, + * the data for all patch vertices is stored in a single URB entry (unlike + * GS which has one entry per input vertex). This means that per-vertex + * array indexing needs a stride. + * + * SSO requires locations to match, but doesn't require the number of + * outputs/inputs to match (in fact, the TCS often has extra outputs). + * So, we need to take the extra step of unifying these on the fly. 
+ */ +static void +get_unified_tess_slots(const struct crocus_context *ice, + uint64_t *per_vertex_slots, + uint32_t *per_patch_slots) +{ + const struct shader_info *tcs = + crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL); + const struct shader_info *tes = + crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL); + + *per_vertex_slots = tes->inputs_read; + *per_patch_slots = tes->patch_inputs_read; + + if (tcs) { + *per_vertex_slots |= tcs->outputs_written; + *per_patch_slots |= tcs->patch_outputs_written; + } +} + +/** + * Compile a tessellation control shader, and upload the assembly. + */ +static struct crocus_compiled_shader * +crocus_compile_tcs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_tcs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + const struct nir_shader_compiler_options *options = + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].NirOptions; + void *mem_ctx = ralloc_context(NULL); + struct brw_tcs_prog_data *tcs_prog_data = + rzalloc(mem_ctx, struct brw_tcs_prog_data); + struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; + struct brw_stage_prog_data *prog_data = &vue_prog_data->base; + const struct intel_device_info *devinfo = &screen->devinfo; + enum brw_param_builtin *system_values = NULL; + unsigned num_system_values = 0; + unsigned num_cbufs = 0; + + nir_shader *nir; + + struct crocus_binding_table bt; + + if (ish) { + nir = nir_shader_clone(mem_ctx, ish->nir); + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + + crocus_lower_swizzles(nir, &key->base.tex); + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + } else { + nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, options, key); + + /* Reserve space for passing the default tess levels as constants. */ + num_cbufs = 1; + num_system_values = 8; + system_values = + rzalloc_array(mem_ctx, enum brw_param_builtin, num_system_values); + prog_data->param = rzalloc_array(mem_ctx, uint32_t, num_system_values); + prog_data->nr_params = num_system_values; + + if (key->tes_primitive_mode == GL_QUADS) { + for (int i = 0; i < 4; i++) + system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; + + system_values[3] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X; + system_values[2] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y; + } else if (key->tes_primitive_mode == GL_TRIANGLES) { + for (int i = 0; i < 3; i++) + system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; + + system_values[4] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X; + } else { + assert(key->tes_primitive_mode == GL_ISOLINES); + system_values[7] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y; + system_values[6] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X; + } + + /* Manually setup the TCS binding table. 
*/ + memset(&bt, 0, sizeof(bt)); + bt.sizes[CROCUS_SURFACE_GROUP_UBO] = 1; + bt.used_mask[CROCUS_SURFACE_GROUP_UBO] = 1; + bt.size_bytes = 4; + + prog_data->ubo_ranges[0].length = 1; + } + + struct brw_tcs_prog_key key_clean = *key; + crocus_sanitize_tex_key(&key_clean.base.tex); + char *error_str = NULL; + const unsigned *program = + brw_compile_tcs(compiler, &ice->dbg, mem_ctx, &key_clean, tcs_prog_data, nir, + -1, NULL, &error_str); + if (program == NULL) { + dbg_printf("Failed to compile control shader: %s\n", error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish) { + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + } + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_TCS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*tcs_prog_data), NULL, + system_values, num_system_values, + num_cbufs, &bt); + + if (ish) + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current tessellation control shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. + */ +static void +crocus_update_compiled_tcs(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL]; + struct crocus_uncompiled_shader *tcs = + ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL]; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + const struct shader_info *tes_info = + crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL); + struct brw_tcs_prog_key key = { + KEY_INIT_NO_ID(), + .base.program_string_id = tcs ? tcs->program_id : 0, + .tes_primitive_mode = tes_info->tess.primitive_mode, + .input_vertices = ice->state.vertices_per_patch, + .quads_workaround = tes_info->tess.primitive_mode == GL_QUADS && + tes_info->tess.spacing == TESS_SPACING_EQUAL, + }; + + if (tcs && tcs->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_CTRL, tcs, + tcs->nir->info.uses_texture_gather, &key.base.tex); + get_unified_tess_slots(ice, &key.outputs_written, + &key.patch_outputs_written); + screen->vtbl.populate_tcs_key(ice, &key); + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TCS]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_TCS, sizeof(key), &key); + + if (tcs && !shader) + shader = crocus_disk_cache_retrieve(ice, tcs, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_tcs(ice, tcs, &key); + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_TCS] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TCS | + CROCUS_STAGE_DIRTY_BINDINGS_TCS | + CROCUS_STAGE_DIRTY_CONSTANTS_TCS; + shs->sysvals_need_upload = true; + } +} + +/** + * Compile a tessellation evaluation shader, and upload the assembly. 
+ */ +static struct crocus_compiled_shader * +crocus_compile_tes(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_tes_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx = ralloc_context(NULL); + struct brw_tes_prog_data *tes_prog_data = + rzalloc(mem_ctx, struct brw_tes_prog_data); + struct brw_vue_prog_data *vue_prog_data = &tes_prog_data->base; + struct brw_stage_prog_data *prog_data = &vue_prog_data->base; + enum brw_param_builtin *system_values; + const struct intel_device_info *devinfo = &screen->devinfo; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + if (key->nr_userclip_plane_consts) { + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true, + false, NULL); + nir_lower_io_to_temporaries(nir, impl, true, false); + nir_lower_global_vars_to_local(nir); + nir_lower_vars_to_ssa(nir); + nir_shader_gather_info(nir, impl); + } + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + crocus_lower_swizzles(nir, &key->base.tex); + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + struct brw_vue_map input_vue_map; + brw_compute_tess_vue_map(&input_vue_map, key->inputs_read, + key->patch_inputs_read); + + struct brw_tes_prog_key key_clean = *key; + crocus_sanitize_tex_key(&key_clean.base.tex); + char *error_str = NULL; + const unsigned *program = + brw_compile_tes(compiler, &ice->dbg, mem_ctx, &key_clean, &input_vue_map, + tes_prog_data, nir, -1, NULL, &error_str); + if (program == NULL) { + dbg_printf("Failed to compile evaluation shader: %s\n", error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + uint32_t *so_decls = NULL; + if (devinfo->ver > 6) + so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output, + &vue_prog_data->vue_map); + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_TES, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*tes_prog_data), so_decls, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current tessellation evaluation shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. 
+ */ +static void +crocus_update_compiled_tes(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_EVAL]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]; + struct brw_tes_prog_key key = { KEY_INIT() }; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_EVAL, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + get_unified_tess_slots(ice, &key.inputs_read, &key.patch_inputs_read); + screen->vtbl.populate_tes_key(ice, &ish->nir->info, last_vue_stage(ice), &key); + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TES]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_TES, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_tes(ice, ish, &key); + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_TES] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TES | + CROCUS_STAGE_DIRTY_BINDINGS_TES | + CROCUS_STAGE_DIRTY_CONSTANTS_TES; + shs->sysvals_need_upload = true; + } + + /* TODO: Could compare and avoid flagging this. */ + const struct shader_info *tes_info = &ish->nir->info; + if (BITSET_TEST(tes_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES; + ice->state.shaders[MESA_SHADER_TESS_EVAL].sysvals_need_upload = true; + } +} + +/** + * Compile a geometry shader, and upload the assembly. + */ +static struct crocus_compiled_shader * +crocus_compile_gs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_gs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + const struct intel_device_info *devinfo = &screen->devinfo; + void *mem_ctx = ralloc_context(NULL); + struct brw_gs_prog_data *gs_prog_data = + rzalloc(mem_ctx, struct brw_gs_prog_data); + struct brw_vue_prog_data *vue_prog_data = &gs_prog_data->base; + struct brw_stage_prog_data *prog_data = &vue_prog_data->base; + enum brw_param_builtin *system_values; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + if (key->nr_userclip_plane_consts) { + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_lower_clip_gs(nir, (1 << key->nr_userclip_plane_consts) - 1, false, + NULL); + nir_lower_io_to_temporaries(nir, impl, true, false); + nir_lower_global_vars_to_local(nir); + nir_lower_vars_to_ssa(nir); + nir_shader_gather_info(nir, impl); + } + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + crocus_lower_swizzles(nir, &key->base.tex); + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + brw_compute_vue_map(devinfo, + &vue_prog_data->vue_map, nir->info.outputs_written, + nir->info.separate_shader, /* pos slots */ 1); + + if (devinfo->ver == 6) + gfx6_gs_xfb_setup(&ish->stream_output, gs_prog_data); + struct brw_gs_prog_key key_clean = *key; + 
crocus_sanitize_tex_key(&key_clean.base.tex); + + char *error_str = NULL; + const unsigned *program = + brw_compile_gs(compiler, &ice->dbg, mem_ctx, &key_clean, gs_prog_data, nir, + -1, NULL, &error_str); + if (program == NULL) { + dbg_printf("Failed to compile geometry shader: %s\n", error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + uint32_t *so_decls = NULL; + if (devinfo->ver > 6) + so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output, + &vue_prog_data->vue_map); + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_GS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*gs_prog_data), so_decls, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current geometry shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. + */ +static void +crocus_update_compiled_gs(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_GEOMETRY]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]; + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_GS]; + struct crocus_compiled_shader *shader = NULL; + + if (ish) { + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_gs_prog_key key = { KEY_INIT() }; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_GEOMETRY, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + screen->vtbl.populate_gs_key(ice, &ish->nir->info, last_vue_stage(ice), &key); + + shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_GS, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_gs(ice, ish, &key); + } + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_GS] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS | + CROCUS_STAGE_DIRTY_BINDINGS_GS | + CROCUS_STAGE_DIRTY_CONSTANTS_GS; + shs->sysvals_need_upload = true; + } +} + +/** + * Compile a fragment (pixel) shader, and upload the assembly. 
+ */ +static struct crocus_compiled_shader * +crocus_compile_fs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_wm_prog_key *key, + struct brw_vue_map *vue_map) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx = ralloc_context(NULL); + struct brw_wm_prog_data *fs_prog_data = + rzalloc(mem_ctx, struct brw_wm_prog_data); + struct brw_stage_prog_data *prog_data = &fs_prog_data->base; + enum brw_param_builtin *system_values; + const struct intel_device_info *devinfo = &screen->devinfo; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + prog_data->use_alt_mode = ish->use_alt_mode; + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + + /* Lower output variables to load_output intrinsics before setting up + * binding tables, so crocus_setup_binding_table can map any load_output + * intrinsics to CROCUS_SURFACE_GROUP_RENDER_TARGET_READ on Gen8 for + * non-coherent framebuffer fetches. + */ + brw_nir_lower_fs_outputs(nir); + + /* lower swizzles before binding table */ + crocus_lower_swizzles(nir, &key->base.tex); + int null_rts = 1; + + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, + MAX2(key->nr_color_regions, null_rts), + num_system_values, num_cbufs, + &key->base.tex); + + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + struct brw_wm_prog_key key_clean = *key; + crocus_sanitize_tex_key(&key_clean.base.tex); + + struct brw_compile_fs_params params = { + .nir = nir, + .key = &key_clean, + .prog_data = fs_prog_data, + + .allow_spilling = true, + .vue_map = vue_map, + + .log_data = &ice->dbg, + }; + const unsigned *program = + brw_compile_fs(compiler, mem_ctx, ¶ms); + if (program == NULL) { + dbg_printf("Failed to compile fragment shader: %s\n", params.error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_FS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*fs_prog_data), NULL, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current fragment shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. 
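+ *
+ * As with the other stages, the lookup order is the in-memory variant
+ * cache first, then the disk cache, and only then a fresh compile.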
+ */ +static void +crocus_update_compiled_fs(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_FRAGMENT]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]; + struct brw_wm_prog_key key = { KEY_INIT() }; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_FRAGMENT, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + screen->vtbl.populate_fs_key(ice, &ish->nir->info, &key); + + if (ish->nos & (1ull << CROCUS_NOS_LAST_VUE_MAP)) + key.input_slots_valid = ice->shaders.last_vue_map->slots_valid; + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_FS]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_FS, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_fs(ice, ish, &key, ice->shaders.last_vue_map); + + if (old != shader) { + // XXX: only need to flag CLIP if barycentric has NONPERSPECTIVE + // toggles. might be able to avoid flagging SBE too. + ice->shaders.prog[CROCUS_CACHE_FS] = shader; + ice->state.dirty |= CROCUS_DIRTY_WM; + /* gen4 clip/sf rely on fs prog_data */ + if (devinfo->ver < 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG; + else + ice->state.dirty |= CROCUS_DIRTY_CLIP; + if (devinfo->ver == 6) + ice->state.dirty |= CROCUS_DIRTY_RASTER; + if (devinfo->ver >= 7) + ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS | + CROCUS_STAGE_DIRTY_BINDINGS_FS | + CROCUS_STAGE_DIRTY_CONSTANTS_FS; + shs->sysvals_need_upload = true; + } +} + +/** + * Update the last enabled stage's VUE map. + * + * When the shader feeding the rasterizer's output interface changes, we + * need to re-emit various packets. + */ +static void +update_last_vue_map(struct crocus_context *ice, + struct brw_stage_prog_data *prog_data) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_vue_prog_data *vue_prog_data = (void *) prog_data; + struct brw_vue_map *vue_map = &vue_prog_data->vue_map; + struct brw_vue_map *old_map = ice->shaders.last_vue_map; + const uint64_t changed_slots = + (old_map ? old_map->slots_valid : 0ull) ^ vue_map->slots_valid; + + if (changed_slots & VARYING_BIT_VIEWPORT) { + ice->state.num_viewports = + (vue_map->slots_valid & VARYING_BIT_VIEWPORT) ? 
CROCUS_MAX_VIEWPORTS : 1; + ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT | + CROCUS_DIRTY_CC_VIEWPORT; + if (devinfo->ver < 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG; + + if (devinfo->ver <= 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG; + + if (devinfo->ver >= 6) + ice->state.dirty |= CROCUS_DIRTY_CLIP | + CROCUS_DIRTY_GEN6_SCISSOR_RECT;; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS | + ice->state.stage_dirty_for_nos[CROCUS_NOS_LAST_VUE_MAP]; + } + + if (changed_slots || (old_map && old_map->separate != vue_map->separate)) { + ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS; + } + + ice->shaders.last_vue_map = &vue_prog_data->vue_map; +} + +static void +crocus_update_pull_constant_descriptors(struct crocus_context *ice, + gl_shader_stage stage) +{ + struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; + + if (!shader || !shader->prog_data->has_ubo_pull) + return; + + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + bool any_new_descriptors = + shader->num_system_values > 0 && shs->sysvals_need_upload; + + unsigned bound_cbufs = shs->bound_cbufs; + + while (bound_cbufs) { + const int i = u_bit_scan(&bound_cbufs); + struct pipe_constant_buffer *cbuf = &shs->constbufs[i]; + if (cbuf->buffer) { + any_new_descriptors = true; + } + } + + if (any_new_descriptors) + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage; +} + +/** + * Get the prog_data for a given stage, or NULL if the stage is disabled. + */ +static struct brw_vue_prog_data * +get_vue_prog_data(struct crocus_context *ice, gl_shader_stage stage) +{ + if (!ice->shaders.prog[stage]) + return NULL; + + return (void *) ice->shaders.prog[stage]->prog_data; +} + +static struct crocus_compiled_shader * +crocus_compile_clip(struct crocus_context *ice, struct brw_clip_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx; + unsigned program_size; + mem_ctx = ralloc_context(NULL); + + struct brw_clip_prog_data *clip_prog_data = + rzalloc(mem_ctx, struct brw_clip_prog_data); + + const unsigned *program = brw_compile_clip(compiler, mem_ctx, key, clip_prog_data, + ice->shaders.last_vue_map, &program_size); + + if (program == NULL) { + dbg_printf("failed to compile clip shader\n"); + ralloc_free(mem_ctx); + return false; + } + struct crocus_binding_table bt; + memset(&bt, 0, sizeof(bt)); + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_CLIP, sizeof(*key), key, program, + program_size, + (struct brw_stage_prog_data *)clip_prog_data, sizeof(*clip_prog_data), + NULL, NULL, 0, 0, &bt); + ralloc_free(mem_ctx); + return shader; +} +static void +crocus_update_compiled_clip(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct brw_clip_prog_key key; + struct crocus_compiled_shader *old = ice->shaders.clip_prog; + memset(&key, 0, sizeof(key)); + + const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data); + if (wm_prog_data) { + key.contains_flat_varying = wm_prog_data->contains_flat_varying; + key.contains_noperspective_varying = + wm_prog_data->contains_noperspective_varying; + memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode)); + } + + key.primitive = u_reduced_prim(ice->state.prim_mode); + key.attrs = 
ice->shaders.last_vue_map->slots_valid; + + struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice); + key.pv_first = rs_state->flatshade_first; + + if (rs_state->clip_plane_enable) + key.nr_userclip = util_logbase2(rs_state->clip_plane_enable) + 1; + + if (screen->devinfo.ver == 5) + key.clip_mode = BRW_CLIP_MODE_KERNEL_CLIP; + else + key.clip_mode = BRW_CLIP_MODE_NORMAL; + + if (key.primitive == PIPE_PRIM_TRIANGLES) { + if (rs_state->cull_face == PIPE_FACE_FRONT_AND_BACK) + key.clip_mode = BRW_CLIP_MODE_REJECT_ALL; + else { + uint32_t fill_front = BRW_CLIP_FILL_MODE_CULL; + uint32_t fill_back = BRW_CLIP_FILL_MODE_CULL; + uint32_t offset_front = 0; + uint32_t offset_back = 0; + + if (!(rs_state->cull_face & PIPE_FACE_FRONT)) { + switch (rs_state->fill_front) { + case PIPE_POLYGON_MODE_FILL: + fill_front = BRW_CLIP_FILL_MODE_FILL; + offset_front = 0; + break; + case PIPE_POLYGON_MODE_LINE: + fill_front = BRW_CLIP_FILL_MODE_LINE; + offset_front = rs_state->offset_line; + break; + case PIPE_POLYGON_MODE_POINT: + fill_front = BRW_CLIP_FILL_MODE_POINT; + offset_front = rs_state->offset_point; + break; + } + } + + if (!(rs_state->cull_face & PIPE_FACE_BACK)) { + switch (rs_state->fill_back) { + case PIPE_POLYGON_MODE_FILL: + fill_back = BRW_CLIP_FILL_MODE_FILL; + offset_back = 0; + break; + case PIPE_POLYGON_MODE_LINE: + fill_back = BRW_CLIP_FILL_MODE_LINE; + offset_back = rs_state->offset_line; + break; + case PIPE_POLYGON_MODE_POINT: + fill_back = BRW_CLIP_FILL_MODE_POINT; + offset_back = rs_state->offset_point; + break; + } + } + + if (rs_state->fill_back != PIPE_POLYGON_MODE_FILL || + rs_state->fill_front != PIPE_POLYGON_MODE_FILL) { + key.do_unfilled = 1; + + /* Most cases the fixed function units will handle. Cases where + * one or more polygon faces are unfilled will require help: + */ + key.clip_mode = BRW_CLIP_MODE_CLIP_NON_REJECTED; + + if (offset_back || offset_front) { + double mrd = 0.0; + if (ice->state.framebuffer.zsbuf) + mrd = util_get_depth_format_mrd(util_format_description(ice->state.framebuffer.zsbuf->format)); + key.offset_units = rs_state->offset_units * mrd * 2; + key.offset_factor = rs_state->offset_scale * mrd; + key.offset_clamp = rs_state->offset_clamp * mrd; + } + + if (!(rs_state->front_ccw ^ rs_state->bottom_edge_rule)) { + key.fill_ccw = fill_front; + key.fill_cw = fill_back; + key.offset_ccw = offset_front; + key.offset_cw = offset_back; + if (rs_state->light_twoside && + key.fill_cw != BRW_CLIP_FILL_MODE_CULL) + key.copy_bfc_cw = 1; + } else { + key.fill_cw = fill_front; + key.fill_ccw = fill_back; + key.offset_cw = offset_front; + key.offset_ccw = offset_back; + if (rs_state->light_twoside && + key.fill_ccw != BRW_CLIP_FILL_MODE_CULL) + key.copy_bfc_ccw = 1; + } + } + } + } + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_CLIP, sizeof(key), &key); + + if (!shader) + shader = crocus_compile_clip(ice, &key); + + if (old != shader) { + ice->state.dirty |= CROCUS_DIRTY_CLIP; + ice->shaders.clip_prog = shader; + } +} + +static struct crocus_compiled_shader * +crocus_compile_sf(struct crocus_context *ice, struct brw_sf_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx; + unsigned program_size; + mem_ctx = ralloc_context(NULL); + + struct brw_sf_prog_data *sf_prog_data = + rzalloc(mem_ctx, struct brw_sf_prog_data); + + const unsigned *program = brw_compile_sf(compiler, mem_ctx, key, sf_prog_data, + 
ice->shaders.last_vue_map, &program_size); + + if (program == NULL) { + dbg_printf("failed to compile sf shader\n"); + ralloc_free(mem_ctx); + return false; + } + + struct crocus_binding_table bt; + memset(&bt, 0, sizeof(bt)); + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_SF, sizeof(*key), key, program, + program_size, + (struct brw_stage_prog_data *)sf_prog_data, sizeof(*sf_prog_data), + NULL, NULL, 0, 0, &bt); + ralloc_free(mem_ctx); + return shader; +} + +static void +crocus_update_compiled_sf(struct crocus_context *ice) +{ + struct brw_sf_prog_key key; + struct crocus_compiled_shader *old = ice->shaders.sf_prog; + memset(&key, 0, sizeof(key)); + + key.attrs = ice->shaders.last_vue_map->slots_valid; + + switch (u_reduced_prim(ice->state.prim_mode)) { + case GL_TRIANGLES: + default: + if (key.attrs & BITFIELD64_BIT(VARYING_SLOT_EDGE)) + key.primitive = BRW_SF_PRIM_UNFILLED_TRIS; + else + key.primitive = BRW_SF_PRIM_TRIANGLES; + break; + case GL_LINES: + key.primitive = BRW_SF_PRIM_LINES; + break; + case GL_POINTS: + key.primitive = BRW_SF_PRIM_POINTS; + break; + } + + struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice); + key.userclip_active = rs_state->clip_plane_enable != 0; + const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data); + if (wm_prog_data) { + key.contains_flat_varying = wm_prog_data->contains_flat_varying; + memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode)); + } + + key.do_twoside_color = rs_state->light_twoside; + + key.do_point_sprite = rs_state->point_quad_rasterization; + if (key.do_point_sprite) { + key.point_sprite_coord_replace = rs_state->sprite_coord_enable & 0xff; + if (rs_state->sprite_coord_enable & (1 << 8)) + key.do_point_coord = 1; + if (wm_prog_data && wm_prog_data->urb_setup[VARYING_SLOT_PNTC] != -1) + key.do_point_coord = 1; + } + + key.sprite_origin_lower_left = rs_state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT; + + if (key.do_twoside_color) { + key.frontface_ccw = rs_state->front_ccw; + } + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_SF, sizeof(key), &key); + + if (!shader) + shader = crocus_compile_sf(ice, &key); + + if (old != shader) { + ice->state.dirty |= CROCUS_DIRTY_RASTER; + ice->shaders.sf_prog = shader; + } +} + +static struct crocus_compiled_shader * +crocus_compile_ff_gs(struct crocus_context *ice, struct brw_ff_gs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct brw_compiler *compiler = screen->compiler; + void *mem_ctx; + unsigned program_size; + mem_ctx = ralloc_context(NULL); + + struct brw_ff_gs_prog_data *ff_gs_prog_data = + rzalloc(mem_ctx, struct brw_ff_gs_prog_data); + + const unsigned *program = brw_compile_ff_gs_prog(compiler, mem_ctx, key, ff_gs_prog_data, + ice->shaders.last_vue_map, &program_size); + + if (program == NULL) { + dbg_printf("failed to compile sf shader\n"); + ralloc_free(mem_ctx); + return false; + } + + struct crocus_binding_table bt; + memset(&bt, 0, sizeof(bt)); + + if (screen->devinfo.ver == 6) { + bt.sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS; + bt.used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1; + + bt.size_bytes = BRW_MAX_SOL_BINDINGS * 4; + } + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_FF_GS, sizeof(*key), key, program, + program_size, + (struct brw_stage_prog_data *)ff_gs_prog_data, sizeof(*ff_gs_prog_data), + 
NULL, NULL, 0, 0, &bt); + ralloc_free(mem_ctx); + return shader; +} + +static void +crocus_update_compiled_ff_gs(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_ff_gs_prog_key key; + struct crocus_compiled_shader *old = ice->shaders.ff_gs_prog; + memset(&key, 0, sizeof(key)); + + assert(devinfo->ver < 7); + + key.attrs = ice->shaders.last_vue_map->slots_valid; + + key.primitive = screen->vtbl.translate_prim_type(ice->state.prim_mode, 0); + + struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice); + key.pv_first = rs_state->flatshade_first; + + if (key.primitive == _3DPRIM_QUADLIST && !rs_state->flatshade) { + /* Provide consistent primitive order with brw_set_prim's + * optimization of single quads to trifans. + */ + key.pv_first = true; + } + + if (devinfo->ver >= 6) { + key.need_gs_prog = ice->state.streamout_active; + if (key.need_gs_prog) { + struct crocus_uncompiled_shader *vs = + ice->shaders.uncompiled[MESA_SHADER_VERTEX]; + gfx6_ff_gs_xfb_setup(&vs->stream_output, + &key); + } + } else { + key.need_gs_prog = (key.primitive == _3DPRIM_QUADLIST || + key.primitive == _3DPRIM_QUADSTRIP || + key.primitive == _3DPRIM_LINELOOP); + } + + struct crocus_compiled_shader *shader = NULL; + if (key.need_gs_prog) { + shader = crocus_find_cached_shader(ice, CROCUS_CACHE_FF_GS, + sizeof(key), &key); + if (!shader) + shader = crocus_compile_ff_gs(ice, &key); + } + if (old != shader) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS; + if (!!old != !!shader) + ice->state.dirty |= CROCUS_DIRTY_GEN6_URB; + ice->shaders.ff_gs_prog = shader; + if (shader) { + const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data; + ice->state.last_xfb_verts_per_prim = gs_prog_data->svbi_postincrement_value; + } + } +} + +// XXX: crocus_compiled_shaders are space-leaking :( +// XXX: do remember to unbind them if deleting them. + +/** + * Update the current shader variants for the given state. + * + * This should be called on every draw call to ensure that the correct + * shaders are bound. It will also flag any dirty state triggered by + * swapping out those shaders.
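+ *
+ * Returns false when no vertex shader is bound, in which case the draw
+ * should be dropped.  A minimal sketch of the expected call site
+ * (illustrative only; the real caller lives in the draw code):
+ *
+ *    if (!crocus_update_compiled_shaders(ice))
+ *       return;   // no VS bound, nothing sensible to draw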
+ */ +bool +crocus_update_compiled_shaders(struct crocus_context *ice) +{ + struct crocus_screen *screen = (void *) ice->ctx.screen; + const uint64_t stage_dirty = ice->state.stage_dirty; + + struct brw_vue_prog_data *old_prog_datas[4]; + if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) { + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) + old_prog_datas[i] = get_vue_prog_data(ice, i); + } + + if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_TCS | + CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) { + struct crocus_uncompiled_shader *tes = + ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]; + if (tes) { + crocus_update_compiled_tcs(ice); + crocus_update_compiled_tes(ice); + } else { + ice->shaders.prog[CROCUS_CACHE_TCS] = NULL; + ice->shaders.prog[CROCUS_CACHE_TES] = NULL; + ice->state.stage_dirty |= + CROCUS_STAGE_DIRTY_TCS | CROCUS_STAGE_DIRTY_TES | + CROCUS_STAGE_DIRTY_BINDINGS_TCS | CROCUS_STAGE_DIRTY_BINDINGS_TES | + CROCUS_STAGE_DIRTY_CONSTANTS_TCS | CROCUS_STAGE_DIRTY_CONSTANTS_TES; + } + } + + if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_VS) + crocus_update_compiled_vs(ice); + if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_GS) + crocus_update_compiled_gs(ice); + + if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_GS | + CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) { + const struct crocus_compiled_shader *gs = + ice->shaders.prog[MESA_SHADER_GEOMETRY]; + const struct crocus_compiled_shader *tes = + ice->shaders.prog[MESA_SHADER_TESS_EVAL]; + + bool points_or_lines = false; + + if (gs) { + const struct brw_gs_prog_data *gs_prog_data = (void *) gs->prog_data; + points_or_lines = + gs_prog_data->output_topology == _3DPRIM_POINTLIST || + gs_prog_data->output_topology == _3DPRIM_LINESTRIP; + } else if (tes) { + const struct brw_tes_prog_data *tes_data = (void *) tes->prog_data; + points_or_lines = + tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_LINE || + tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT; + } + + if (ice->shaders.output_topology_is_points_or_lines != points_or_lines) { + /* Outbound to XY Clip enables */ + ice->shaders.output_topology_is_points_or_lines = points_or_lines; + ice->state.dirty |= CROCUS_DIRTY_CLIP; + } + } + + if (!ice->shaders.prog[MESA_SHADER_VERTEX]) + return false; + + gl_shader_stage last_stage = last_vue_stage(ice); + struct crocus_compiled_shader *shader = ice->shaders.prog[last_stage]; + struct crocus_uncompiled_shader *ish = ice->shaders.uncompiled[last_stage]; + update_last_vue_map(ice, shader->prog_data); + if (ice->state.streamout != shader->streamout) { + ice->state.streamout = shader->streamout; + ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST | CROCUS_DIRTY_STREAMOUT; + } + + if (ice->state.streamout_active) { + screen->vtbl.update_so_strides(ice, ish->stream_output.stride); + } + + /* use ice->state version as last_vue_map can dirty this bit */ + if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_FS) + crocus_update_compiled_fs(ice); + + if (screen->devinfo.ver <= 6) { + if (ice->state.dirty & CROCUS_DIRTY_GEN4_FF_GS_PROG && + !ice->shaders.prog[MESA_SHADER_GEOMETRY]) + crocus_update_compiled_ff_gs(ice); + } + + if (screen->devinfo.ver < 6) { + if (ice->state.dirty & CROCUS_DIRTY_GEN4_CLIP_PROG) + crocus_update_compiled_clip(ice); + if (ice->state.dirty & CROCUS_DIRTY_GEN4_SF_PROG) + crocus_update_compiled_sf(ice); + } + + + /* Changing shader interfaces may require a URB configuration. 
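+ * The loop below compares each VUE stage's old and new urb_entry_size to
+ * decide whether CROCUS_DIRTY_GEN6_URB needs to be flagged.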
*/ + if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) { + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + struct brw_vue_prog_data *old = old_prog_datas[i]; + struct brw_vue_prog_data *new = get_vue_prog_data(ice, i); + if (!!old != !!new || + (new && new->urb_entry_size != old->urb_entry_size)) { + ice->state.dirty |= CROCUS_DIRTY_GEN6_URB; + break; + } + } + } + + if (ice->state.stage_dirty & CROCUS_RENDER_STAGE_DIRTY_CONSTANTS) { + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_FRAGMENT; i++) { + if (ice->state.stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << i)) + crocus_update_pull_constant_descriptors(ice, i); + } + } + return true; +} + +static struct crocus_compiled_shader * +crocus_compile_cs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_cs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx = ralloc_context(NULL); + struct brw_cs_prog_data *cs_prog_data = + rzalloc(mem_ctx, struct brw_cs_prog_data); + struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + enum brw_param_builtin *system_values; + const struct intel_device_info *devinfo = &screen->devinfo; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics); + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + crocus_lower_swizzles(nir, &key->base.tex); + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + + struct brw_compile_cs_params params = { + .nir = nir, + .key = key, + .prog_data = cs_prog_data, + .log_data = &ice->dbg, + }; + + const unsigned *program = + brw_compile_cs(compiler, mem_ctx, ¶ms); + if (program == NULL) { + dbg_printf("Failed to compile compute shader: %s\n", params.error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_CS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*cs_prog_data), NULL, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +static void +crocus_update_compiled_cs(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_COMPUTE]; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_cs_prog_key key = { KEY_INIT() }; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_COMPUTE, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + screen->vtbl.populate_cs_key(ice, &key); + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_CS]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_CS, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if 
(!shader) + shader = crocus_compile_cs(ice, ish, &key); + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_CS] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS | + CROCUS_STAGE_DIRTY_BINDINGS_CS | + CROCUS_STAGE_DIRTY_CONSTANTS_CS; + shs->sysvals_need_upload = true; + } +} + +void +crocus_update_compiled_compute_shader(struct crocus_context *ice) +{ + if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_CS) + crocus_update_compiled_cs(ice); + + if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) + crocus_update_pull_constant_descriptors(ice, MESA_SHADER_COMPUTE); +} + +void +crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data, + unsigned threads, + uint32_t *dst) +{ + assert(brw_cs_push_const_total_size(cs_prog_data, threads) > 0); + assert(cs_prog_data->push.cross_thread.size == 0); + assert(cs_prog_data->push.per_thread.dwords == 1); + assert(cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID); + for (unsigned t = 0; t < threads; t++) + dst[8 * t] = t; +} + +/** + * Allocate scratch BOs as needed for the given per-thread size and stage. + */ +struct crocus_bo * +crocus_get_scratch_space(struct crocus_context *ice, + unsigned per_thread_scratch, + gl_shader_stage stage) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + const struct intel_device_info *devinfo = &screen->devinfo; + + unsigned encoded_size = ffs(per_thread_scratch) - 11; + assert(encoded_size < (1 << 16)); + + struct crocus_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage]; + + unsigned subslice_total = screen->subslice_total; + subslice_total = 4 * devinfo->num_slices; + // assert(subslice_total >= screen->subslice_total); + + if (!*bop) { + unsigned scratch_ids_per_subslice = devinfo->max_cs_threads; + + uint32_t max_threads[] = { + [MESA_SHADER_VERTEX] = devinfo->max_vs_threads, + [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads, + [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads, + [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads, + [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads, + [MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslice_total, + }; + + uint32_t size = per_thread_scratch * max_threads[stage]; + + *bop = crocus_bo_alloc(bufmgr, "scratch", size); + } + + return *bop; +} + +/* ------------------------------------------------------------------- */ + +/** + * The pipe->create_[stage]_state() driver hooks. + * + * Performs basic NIR preprocessing, records any state dependencies, and + * returns an crocus_uncompiled_shader as the Gallium CSO. + * + * Actual shader compilation to assembly happens later, at first use. 
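+ *
+ * (When the crocus precompile option is enabled, a variant for a default
+ * key is also compiled eagerly here; see the screen->precompile blocks in
+ * the per-stage hooks below.)
+ *
+ * The Gallium call sequence these hooks sit behind looks roughly like
+ * this (illustrative only):
+ *
+ *    void *cso = ctx->create_fs_state(ctx, &state);  // crocus_uncompiled_shader
+ *    ctx->bind_fs_state(ctx, cso);                    // variants compile at draw time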
+ */ +static void * +crocus_create_uncompiled_shader(struct pipe_context *ctx, + nir_shader *nir, + const struct pipe_stream_output_info *so_info) +{ + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_uncompiled_shader *ish = + calloc(1, sizeof(struct crocus_uncompiled_shader)); + if (!ish) + return NULL; + + if (devinfo->ver >= 6) + NIR_PASS(ish->needs_edge_flag, nir, crocus_fix_edge_flags); + else + ish->needs_edge_flag = false; + + brw_preprocess_nir(screen->compiler, nir, NULL); + + NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo, false); + NIR_PASS_V(nir, crocus_lower_storage_image_derefs); + + nir_sweep(nir); + + ish->program_id = get_new_program_id(screen); + ish->nir = nir; + if (so_info) { + memcpy(&ish->stream_output, so_info, sizeof(*so_info)); + update_so_info(&ish->stream_output, nir->info.outputs_written); + } + + /* Save this now before potentially dropping nir->info.name */ + if (nir->info.name && strncmp(nir->info.name, "ARB", 3) == 0) + ish->use_alt_mode = true; + + if (screen->disk_cache) { + /* Serialize the NIR to a binary blob that we can hash for the disk + * cache. Drop unnecessary information (like variable names) + * so the serialized NIR is smaller, and also to let us detect more + * isomorphic shaders when hashing, increasing cache hits. + */ + struct blob blob; + blob_init(&blob); + nir_serialize(&blob, nir, true); + _mesa_sha1_compute(blob.data, blob.size, ish->nir_sha1); + blob_finish(&blob); + } + + return ish; +} + +static struct crocus_uncompiled_shader * +crocus_create_shader_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct nir_shader *nir; + + if (state->type == PIPE_SHADER_IR_TGSI) + nir = tgsi_to_nir(state->tokens, ctx->screen, false); + else + nir = state->ir.nir; + + return crocus_create_uncompiled_shader(ctx, nir, &state->stream_output); +} + +static void * +crocus_create_vs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + /* User clip planes or gen5 sprite coord enable */ + if (ish->nir->info.clip_distance_array_size == 0 || + screen->devinfo.ver <= 5) + ish->nos |= (1ull << CROCUS_NOS_RASTERIZER); + + if (!screen->devinfo.is_haswell) + ish->nos |= (1ull << CROCUS_NOS_VERTEX_ELEMENTS); + + if (screen->precompile) { + struct brw_vs_prog_key key = { KEY_INIT() }; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_vs(ice, ish, &key); + } + + return ish; +} + +static void * +crocus_create_tcs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + struct shader_info *info = &ish->nir->info; + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + if (screen->precompile) { + const unsigned _GL_TRIANGLES = 0x0004; + struct brw_tcs_prog_key key = { + KEY_INIT(), + // XXX: make sure the linker fills this out from the TES... + .tes_primitive_mode = + info->tess.primitive_mode ? 
info->tess.primitive_mode + : _GL_TRIANGLES, + .outputs_written = info->outputs_written, + .patch_outputs_written = info->patch_outputs_written, + }; + + key.input_vertices = info->tess.tcs_vertices_out; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_tcs(ice, ish, &key); + } + + return ish; +} + +static void * +crocus_create_tes_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + struct shader_info *info = &ish->nir->info; + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + /* User clip planes */ + if (ish->nir->info.clip_distance_array_size == 0) + ish->nos |= (1ull << CROCUS_NOS_RASTERIZER); + + if (screen->precompile) { + struct brw_tes_prog_key key = { + KEY_INIT(), + // XXX: not ideal, need TCS output/TES input unification + .inputs_read = info->inputs_read, + .patch_inputs_read = info->patch_inputs_read, + }; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_tes(ice, ish, &key); + } + + return ish; +} + +static void * +crocus_create_gs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + /* User clip planes */ + if (ish->nir->info.clip_distance_array_size == 0) + ish->nos |= (1ull << CROCUS_NOS_RASTERIZER); + + if (screen->precompile) { + struct brw_gs_prog_key key = { KEY_INIT() }; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_gs(ice, ish, &key); + } + + return ish; +} + +static void * +crocus_create_fs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + struct shader_info *info = &ish->nir->info; + + ish->nos |= (1ull << CROCUS_NOS_FRAMEBUFFER) | + (1ull << CROCUS_NOS_DEPTH_STENCIL_ALPHA) | + (1ull << CROCUS_NOS_RASTERIZER) | + (1ull << CROCUS_NOS_TEXTURES) | + (1ull << CROCUS_NOS_BLEND); + + /* The program key needs the VUE map if there are > 16 inputs or gen4/5 */ + if (screen->devinfo.ver < 6 || util_bitcount64(ish->nir->info.inputs_read & + BRW_FS_VARYING_INPUT_MASK) > 16) { + ish->nos |= (1ull << CROCUS_NOS_LAST_VUE_MAP); + } + + if (screen->precompile) { + const uint64_t color_outputs = info->outputs_written & + ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) | + BITFIELD64_BIT(FRAG_RESULT_STENCIL) | + BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + + bool can_rearrange_varyings = + screen->devinfo.ver > 6 && util_bitcount64(info->inputs_read & BRW_FS_VARYING_INPUT_MASK) <= 16; + + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_wm_prog_key key = { + KEY_INIT(), + .nr_color_regions = util_bitcount(color_outputs), + .coherent_fb_fetch = false, + .input_slots_valid = + can_rearrange_varyings ? 
0 : info->inputs_read | VARYING_BIT_POS, + }; + + struct brw_vue_map vue_map; + if (devinfo->ver < 6) { + brw_compute_vue_map(devinfo, &vue_map, + info->inputs_read | VARYING_BIT_POS, + false, /* pos slots */ 1); + } + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_fs(ice, ish, &key, &vue_map); + } + + return ish; +} + +static void * +crocus_create_compute_state(struct pipe_context *ctx, + const struct pipe_compute_state *state) +{ + assert(state->ir_type == PIPE_SHADER_IR_NIR); + + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = + crocus_create_uncompiled_shader(ctx, (void *) state->prog, NULL); + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + // XXX: disallow more than 64KB of shared variables + + if (screen->precompile) { + struct brw_cs_prog_key key = { KEY_INIT() }; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_cs(ice, ish, &key); + } + + return ish; +} + +/** + * The pipe->delete_[stage]_state() driver hooks. + * + * Frees the crocus_uncompiled_shader. + */ +static void +crocus_delete_shader_state(struct pipe_context *ctx, void *state, gl_shader_stage stage) +{ + struct crocus_uncompiled_shader *ish = state; + struct crocus_context *ice = (void *) ctx; + + if (ice->shaders.uncompiled[stage] == ish) { + ice->shaders.uncompiled[stage] = NULL; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage; + } + + if (ish->const_data) { + pipe_resource_reference(&ish->const_data, NULL); + pipe_resource_reference(&ish->const_data_state.res, NULL); + } + + ralloc_free(ish->nir); + free(ish); +} + +static void +crocus_delete_vs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_VERTEX); +} + +static void +crocus_delete_tcs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_CTRL); +} + +static void +crocus_delete_tes_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_EVAL); +} + +static void +crocus_delete_gs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_GEOMETRY); +} + +static void +crocus_delete_fs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_FRAGMENT); +} + +static void +crocus_delete_cs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_COMPUTE); +} + +/** + * The pipe->bind_[stage]_state() driver hook. + * + * Binds an uncompiled shader as the current one for a particular stage. + * Updates dirty tracking to account for the shader's NOS. + */ +static void +bind_shader_state(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + gl_shader_stage stage) +{ + uint64_t dirty_bit = CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage; + const uint64_t nos = ish ? ish->nos : 0; + + const struct shader_info *old_info = crocus_get_shader_info(ice, stage); + const struct shader_info *new_info = ish ? &ish->nir->info : NULL; + + if ((old_info ? BITSET_LAST_BIT(old_info->textures_used) : 0) != + (new_info ? 
BITSET_LAST_BIT(new_info->textures_used) : 0)) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage; + } + + ice->shaders.uncompiled[stage] = ish; + ice->state.stage_dirty |= dirty_bit; + + /* Record that CSOs need to mark CROCUS_DIRTY_UNCOMPILED_XS when they change + * (or that they no longer need to do so). + */ + for (int i = 0; i < CROCUS_NOS_COUNT; i++) { + if (nos & (1 << i)) + ice->state.stage_dirty_for_nos[i] |= dirty_bit; + else + ice->state.stage_dirty_for_nos[i] &= ~dirty_bit; + } +} + +static void +crocus_bind_vs_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_uncompiled_shader *new_ish = state; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (new_ish && + ice->state.window_space_position != + new_ish->nir->info.vs.window_space_position) { + ice->state.window_space_position = + new_ish->nir->info.vs.window_space_position; + + ice->state.dirty |= CROCUS_DIRTY_CLIP | + CROCUS_DIRTY_RASTER | + CROCUS_DIRTY_CC_VIEWPORT; + } + + if (devinfo->ver == 6) { + ice->state.stage_dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG; + } + + bind_shader_state((void *) ctx, state, MESA_SHADER_VERTEX); +} + +static void +crocus_bind_tcs_state(struct pipe_context *ctx, void *state) +{ + bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_CTRL); +} + +static void +crocus_bind_tes_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + /* Enabling/disabling optional stages requires a URB reconfiguration. */ + if (!!state != !!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) + ice->state.dirty |= CROCUS_DIRTY_GEN6_URB; + + bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_EVAL); +} + +static void +crocus_bind_gs_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + /* Enabling/disabling optional stages requires a URB reconfiguration. 
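+ * (mirroring the identical check in crocus_bind_tes_state() above)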
*/ + if (!!state != !!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) + ice->state.dirty |= CROCUS_DIRTY_GEN6_URB; + + bind_shader_state((void *) ctx, state, MESA_SHADER_GEOMETRY); +} + +static void +crocus_bind_fs_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_uncompiled_shader *old_ish = + ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]; + struct crocus_uncompiled_shader *new_ish = state; + + const unsigned color_bits = + BITFIELD64_BIT(FRAG_RESULT_COLOR) | + BITFIELD64_RANGE(FRAG_RESULT_DATA0, BRW_MAX_DRAW_BUFFERS); + + /* Fragment shader outputs influence HasWriteableRT */ + if (!old_ish || !new_ish || + (old_ish->nir->info.outputs_written & color_bits) != + (new_ish->nir->info.outputs_written & color_bits)) + ice->state.dirty |= CROCUS_DIRTY_WM; + + bind_shader_state((void *) ctx, state, MESA_SHADER_FRAGMENT); +} + +static void +crocus_bind_cs_state(struct pipe_context *ctx, void *state) +{ + bind_shader_state((void *) ctx, state, MESA_SHADER_COMPUTE); +} + +void +crocus_init_program_functions(struct pipe_context *ctx) +{ + ctx->create_vs_state = crocus_create_vs_state; + ctx->create_tcs_state = crocus_create_tcs_state; + ctx->create_tes_state = crocus_create_tes_state; + ctx->create_gs_state = crocus_create_gs_state; + ctx->create_fs_state = crocus_create_fs_state; + ctx->create_compute_state = crocus_create_compute_state; + + ctx->delete_vs_state = crocus_delete_vs_state; + ctx->delete_tcs_state = crocus_delete_tcs_state; + ctx->delete_tes_state = crocus_delete_tes_state; + ctx->delete_gs_state = crocus_delete_gs_state; + ctx->delete_fs_state = crocus_delete_fs_state; + ctx->delete_compute_state = crocus_delete_cs_state; + + ctx->bind_vs_state = crocus_bind_vs_state; + ctx->bind_tcs_state = crocus_bind_tcs_state; + ctx->bind_tes_state = crocus_bind_tes_state; + ctx->bind_gs_state = crocus_bind_gs_state; + ctx->bind_fs_state = crocus_bind_fs_state; + ctx->bind_compute_state = crocus_bind_cs_state; +} diff --git a/src/gallium/drivers/crocus/crocus_program_cache.c b/src/gallium/drivers/crocus/crocus_program_cache.c new file mode 100644 index 00000000000..d2d4b821754 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_program_cache.c @@ -0,0 +1,347 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_program_cache.c + * + * The in-memory program cache. 
This is basically a hash table mapping + * API-specified shaders and a state key to a compiled variant. It also + * takes care of uploading shader assembly into a BO for use on the GPU. + */ + +#include +#include +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_atomic.h" +#include "util/u_upload_mgr.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "intel/compiler/brw_compiler.h" +#include "intel/compiler/brw_eu.h" +#include "intel/compiler/brw_nir.h" +#include "crocus_context.h" +#include "crocus_resource.h" + +struct keybox { + uint16_t size; + enum crocus_program_cache_id cache_id; + uint8_t data[0]; +}; + +static struct keybox * +make_keybox(void *mem_ctx, enum crocus_program_cache_id cache_id, + const void *key, uint32_t key_size) +{ + struct keybox *keybox = + ralloc_size(mem_ctx, sizeof(struct keybox) + key_size); + + keybox->cache_id = cache_id; + keybox->size = key_size; + memcpy(keybox->data, key, key_size); + + return keybox; +} + +static uint32_t +keybox_hash(const void *void_key) +{ + const struct keybox *key = void_key; + return _mesa_hash_data(&key->cache_id, key->size + sizeof(key->cache_id)); +} + +static bool +keybox_equals(const void *void_a, const void *void_b) +{ + const struct keybox *a = void_a, *b = void_b; + if (a->size != b->size) + return false; + + return memcmp(a->data, b->data, a->size) == 0; +} + +struct crocus_compiled_shader * +crocus_find_cached_shader(struct crocus_context *ice, + enum crocus_program_cache_id cache_id, + uint32_t key_size, const void *key) +{ + struct keybox *keybox = make_keybox(NULL, cache_id, key, key_size); + struct hash_entry *entry = + _mesa_hash_table_search(ice->shaders.cache, keybox); + + ralloc_free(keybox); + + return entry ? entry->data : NULL; +} + +const void * +crocus_find_previous_compile(const struct crocus_context *ice, + enum crocus_program_cache_id cache_id, + unsigned program_string_id) +{ + hash_table_foreach(ice->shaders.cache, entry) { + const struct keybox *keybox = entry->key; + const struct brw_base_prog_key *key = (const void *)keybox->data; + if (keybox->cache_id == cache_id && + key->program_string_id == program_string_id) { + return keybox->data; + } + } + + return NULL; +} + +/** + * Look for an existing entry in the cache that has identical assembly code. + * + * This is useful for programs generating shaders at runtime, where multiple + * distinct shaders (from an API perspective) may compile to the same assembly + * in our backend. This saves space in the program cache buffer. 
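+ *
+ * (The search below is a linear walk over every cache entry with a
+ * memcmp, so it trades a little CPU time at shader upload for space in
+ * the cache BO.)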
+ */ +static const struct crocus_compiled_shader * +find_existing_assembly(struct hash_table *cache, void *map, + const void *assembly, unsigned assembly_size) +{ + hash_table_foreach (cache, entry) { + const struct crocus_compiled_shader *existing = entry->data; + + if (existing->map_size != assembly_size) + continue; + + if (memcmp(map + existing->offset, assembly, assembly_size) == 0) + return existing; + } + return NULL; +} + +static void +crocus_cache_new_bo(struct crocus_context *ice, + uint32_t new_size) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct crocus_bo *new_bo; + new_bo = crocus_bo_alloc(screen->bufmgr, "program cache", new_size); + + void *map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE | + MAP_ASYNC | MAP_PERSISTENT); + + if (ice->shaders.cache_next_offset != 0) { + memcpy(map, ice->shaders.cache_bo_map, ice->shaders.cache_next_offset); + } + + crocus_bo_unmap(ice->shaders.cache_bo); + crocus_bo_unreference(ice->shaders.cache_bo); + ice->shaders.cache_bo = new_bo; + ice->shaders.cache_bo_map = map; + + if (screen->devinfo.ver == 4) { + /* reemit all shaders on GEN4 only. */ + ice->state.dirty |= CROCUS_DIRTY_CLIP | CROCUS_DIRTY_RASTER | + CROCUS_DIRTY_WM; + } + ice->batches[CROCUS_BATCH_RENDER].state_base_address_emitted = false; + ice->batches[CROCUS_BATCH_COMPUTE].state_base_address_emitted = false; + /* unset state base address */ +} + +static uint32_t +crocus_alloc_item_data(struct crocus_context *ice, uint32_t size) +{ + if (ice->shaders.cache_next_offset + size > ice->shaders.cache_bo->size) { + uint32_t new_size = ice->shaders.cache_bo->size * 2; + while (ice->shaders.cache_next_offset + size > new_size) + new_size *= 2; + + crocus_cache_new_bo(ice, new_size); + } + uint32_t offset = ice->shaders.cache_next_offset; + + /* Programs are always 64-byte aligned, so set up the next one now */ + ice->shaders.cache_next_offset = ALIGN(offset + size, 64); + return offset; +} + +struct crocus_compiled_shader * +crocus_upload_shader(struct crocus_context *ice, + enum crocus_program_cache_id cache_id, uint32_t key_size, + const void *key, const void *assembly, uint32_t asm_size, + struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, uint32_t *streamout, + enum brw_param_builtin *system_values, + unsigned num_system_values, unsigned num_cbufs, + const struct crocus_binding_table *bt) +{ + struct hash_table *cache = ice->shaders.cache; + struct crocus_compiled_shader *shader = + rzalloc_size(cache, sizeof(struct crocus_compiled_shader)); + const struct crocus_compiled_shader *existing = find_existing_assembly( + cache, ice->shaders.cache_bo_map, assembly, asm_size); + + /* If we can find a matching prog in the cache already, then reuse the + * existing stuff without creating new copy into the underlying buffer + * object. This is notably useful for programs generating shaders at + * runtime, where multiple shaders may compile to the same thing in our + * backend. 
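+ * Only the assembly bytes are shared in that case; the new variant still
+ * keeps its own prog_data, streamout declarations, system values and
+ * binding table.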
+ */ + if (existing) { + shader->offset = existing->offset; + shader->map_size = existing->map_size; + } else { + shader->offset = crocus_alloc_item_data(ice, asm_size); + shader->map_size = asm_size; + + memcpy(ice->shaders.cache_bo_map + shader->offset, assembly, asm_size); + } + + shader->prog_data = prog_data; + shader->prog_data_size = prog_data_size; + shader->streamout = streamout; + shader->system_values = system_values; + shader->num_system_values = num_system_values; + shader->num_cbufs = num_cbufs; + shader->bt = *bt; + + ralloc_steal(shader, shader->prog_data); + if (prog_data_size > 16) { + ralloc_steal(shader->prog_data, prog_data->param); + ralloc_steal(shader->prog_data, prog_data->pull_param); + } + ralloc_steal(shader, shader->streamout); + ralloc_steal(shader, shader->system_values); + + struct keybox *keybox = make_keybox(shader, cache_id, key, key_size); + _mesa_hash_table_insert(ice->shaders.cache, keybox, shader); + + return shader; +} + +bool +crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch, const void *key, + uint32_t key_size, uint32_t *kernel_out, + void *prog_data_out) +{ + struct blorp_context *blorp = blorp_batch->blorp; + struct crocus_context *ice = blorp->driver_ctx; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_BLORP, key_size, key); + + if (!shader) + return false; + + *kernel_out = shader->offset; + *((void **)prog_data_out) = shader->prog_data; + + return true; +} + +bool +crocus_blorp_upload_shader(struct blorp_batch *blorp_batch, uint32_t stage, + const void *key, uint32_t key_size, + const void *kernel, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data_templ, + uint32_t prog_data_size, uint32_t *kernel_out, + void *prog_data_out) +{ + struct blorp_context *blorp = blorp_batch->blorp; + struct crocus_context *ice = blorp->driver_ctx; + + struct brw_stage_prog_data *prog_data = ralloc_size(NULL, prog_data_size); + memcpy(prog_data, prog_data_templ, prog_data_size); + + struct crocus_binding_table bt; + memset(&bt, 0, sizeof(bt)); + + struct crocus_compiled_shader *shader = crocus_upload_shader( + ice, CROCUS_CACHE_BLORP, key_size, key, kernel, kernel_size, prog_data, + prog_data_size, NULL, NULL, 0, 0, &bt); + + *kernel_out = shader->offset; + *((void **)prog_data_out) = shader->prog_data; + + return true; +} + +void +crocus_init_program_cache(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + ice->shaders.cache = + _mesa_hash_table_create(ice, keybox_hash, keybox_equals); + + ice->shaders.cache_bo = + crocus_bo_alloc(screen->bufmgr, "program_cache", 16384); + ice->shaders.cache_bo_map = + crocus_bo_map(NULL, ice->shaders.cache_bo, + MAP_READ | MAP_WRITE | MAP_ASYNC | MAP_PERSISTENT); +} + +void +crocus_destroy_program_cache(struct crocus_context *ice) +{ + for (int i = 0; i < MESA_SHADER_STAGES; i++) { + ice->shaders.prog[i] = NULL; + } + + if (ice->shaders.cache_bo) { + crocus_bo_unmap(ice->shaders.cache_bo); + crocus_bo_unreference(ice->shaders.cache_bo); + ice->shaders.cache_bo_map = NULL; + ice->shaders.cache_bo = NULL; + } + + ralloc_free(ice->shaders.cache); +} + +static const char * +cache_name(enum crocus_program_cache_id cache_id) +{ + if (cache_id == CROCUS_CACHE_BLORP) + return "BLORP"; + + if (cache_id == CROCUS_CACHE_SF) + return "SF"; + + if (cache_id == CROCUS_CACHE_CLIP) + return "CLIP"; + + if (cache_id == CROCUS_CACHE_FF_GS) + return "FF_GS"; + + return _mesa_shader_stage_to_string(cache_id); +} + +void 
+crocus_print_program_cache(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + hash_table_foreach(ice->shaders.cache, entry) { + const struct keybox *keybox = entry->key; + struct crocus_compiled_shader *shader = entry->data; + fprintf(stderr, "%s:\n", cache_name(keybox->cache_id)); + brw_disassemble(devinfo, ice->shaders.cache_bo_map + shader->offset, 0, + shader->prog_data->program_size, NULL, stderr); + } +} diff --git a/src/gallium/drivers/crocus/crocus_query.c b/src/gallium/drivers/crocus/crocus_query.c new file mode 100644 index 00000000000..14ba9fbce59 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_query.c @@ -0,0 +1,996 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_query.c + * + * ============================= GENXML CODE ============================= + * [This file is compiled once per generation.] + * ======================================================================= + * + * Query object support. This allows measuring various simple statistics + * via counters on the GPU. We use GenX code for MI_MATH calculations. + */ + +#include +#include +#include "perf/intel_perf.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_inlines.h" +#include "util/u_upload_mgr.h" +#include "crocus_context.h" +#include "crocus_defines.h" +#include "crocus_fence.h" +#include "crocus_monitor.h" +#include "crocus_resource.h" +#include "crocus_screen.h" + +#include "crocus_genx_macros.h" + +#if GFX_VER == 6 +// TOOD: Add these to genxml? 
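+// These are raw MMIO register offsets; gfx6 has only a single SO stream,
+// which is presumably why the (n) argument is ignored in these macros.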
+#define SO_PRIM_STORAGE_NEEDED(n) (0x2280) +#define SO_NUM_PRIMS_WRITTEN(n) (0x2288) + +// TODO: remove HS/DS/CS +#define GFX6_IA_VERTICES_COUNT_num 0x2310 +#define GFX6_IA_PRIMITIVES_COUNT_num 0x2318 +#define GFX6_VS_INVOCATION_COUNT_num 0x2320 +#define GFX6_HS_INVOCATION_COUNT_num 0x2300 +#define GFX6_DS_INVOCATION_COUNT_num 0x2308 +#define GFX6_GS_INVOCATION_COUNT_num 0x2328 +#define GFX6_GS_PRIMITIVES_COUNT_num 0x2330 +#define GFX6_CL_INVOCATION_COUNT_num 0x2338 +#define GFX6_CL_PRIMITIVES_COUNT_num 0x2340 +#define GFX6_PS_INVOCATION_COUNT_num 0x2348 +#define GFX6_CS_INVOCATION_COUNT_num 0x2290 +#define GFX6_PS_DEPTH_COUNT_num 0x2350 + +#elif GFX_VER == 7 +#define SO_PRIM_STORAGE_NEEDED(n) (GENX(SO_PRIM_STORAGE_NEEDED0_num) + (n) * 8) +#define SO_NUM_PRIMS_WRITTEN(n) (GENX(SO_NUM_PRIMS_WRITTEN0_num) + (n) * 8) +#endif + +struct crocus_query { + enum pipe_query_type type; + int index; + + bool ready; + + bool stalled; + + uint64_t result; + + struct crocus_state_ref query_state_ref; + struct crocus_query_snapshots *map; + struct crocus_syncobj *syncobj; + + int batch_idx; + + struct crocus_monitor_object *monitor; + + /* Fence for PIPE_QUERY_GPU_FINISHED. */ + struct pipe_fence_handle *fence; +}; + +struct crocus_query_snapshots { + /** crocus_render_condition's saved MI_PREDICATE_RESULT value. */ + uint64_t predicate_result; + + /** Have the start/end snapshots landed? */ + uint64_t snapshots_landed; + + /** Starting and ending counter snapshots */ + uint64_t start; + uint64_t end; +}; + +struct crocus_query_so_overflow { + uint64_t predicate_result; + uint64_t snapshots_landed; + + struct { + uint64_t prim_storage_needed[2]; + uint64_t num_prims[2]; + } stream[4]; +}; + +#if GFX_VERx10 == 75 +static struct mi_value +query_mem64(struct crocus_query *q, uint32_t offset) +{ + return mi_mem64(rw_bo(crocus_resource_bo(q->query_state_ref.res), + q->query_state_ref.offset + offset)); +} +#endif + +/** + * Is this type of query written by PIPE_CONTROL? + */ +static bool +crocus_is_query_pipelined(struct crocus_query *q) +{ + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIMESTAMP_DISJOINT: + case PIPE_QUERY_TIME_ELAPSED: + return true; + + default: + return false; + } +} + +static void +mark_available(struct crocus_context *ice, struct crocus_query *q) +{ +#if GFX_VERx10 == 75 + struct crocus_batch *batch = &ice->batches[q->batch_idx]; + struct crocus_screen *screen = batch->screen; + unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE; + unsigned offset = offsetof(struct crocus_query_snapshots, snapshots_landed); + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); + offset += q->query_state_ref.offset; + + if (!crocus_is_query_pipelined(q)) { + screen->vtbl.store_data_imm64(batch, bo, offset, true); + } else { + /* Order available *after* the query results. */ + flags |= PIPE_CONTROL_FLUSH_ENABLE; + crocus_emit_pipe_control_write(batch, "query: mark available", + flags, bo, offset, true); + } +#endif +} + +/** + * Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL. 
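+ * (Despite the name it is also used for timestamp snapshots; the caller
+ * chooses the PIPE_CONTROL write flags.)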
+ */ +static void +crocus_pipelined_write(struct crocus_batch *batch, + struct crocus_query *q, + enum pipe_control_flags flags, + unsigned offset) +{ + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); + + crocus_emit_pipe_control_write(batch, "query: pipelined snapshot write", + flags, + bo, offset, 0ull); +} + +static void +write_value(struct crocus_context *ice, struct crocus_query *q, unsigned offset) +{ + struct crocus_batch *batch = &ice->batches[q->batch_idx]; +#if GFX_VER >= 6 + struct crocus_screen *screen = batch->screen; + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); +#endif + + if (!crocus_is_query_pipelined(q)) { + crocus_emit_pipe_control_flush(batch, + "query: non-pipelined snapshot write", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_STALL_AT_SCOREBOARD); + q->stalled = true; + } + + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q, + PIPE_CONTROL_WRITE_DEPTH_COUNT | + PIPE_CONTROL_DEPTH_STALL, + offset); + break; + case PIPE_QUERY_TIME_ELAPSED: + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIMESTAMP_DISJOINT: + crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q, + PIPE_CONTROL_WRITE_TIMESTAMP, + offset); + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: +#if GFX_VER >= 6 + screen->vtbl.store_register_mem64(batch, + q->index == 0 ? + GENX(CL_INVOCATION_COUNT_num) : + SO_PRIM_STORAGE_NEEDED(q->index), + bo, offset, false); +#endif + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: +#if GFX_VER >= 6 + screen->vtbl.store_register_mem64(batch, + SO_NUM_PRIMS_WRITTEN(q->index), + bo, offset, false); +#endif + break; + case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: { +#if GFX_VER >= 6 + static const uint32_t index_to_reg[] = { + GENX(IA_VERTICES_COUNT_num), + GENX(IA_PRIMITIVES_COUNT_num), + GENX(VS_INVOCATION_COUNT_num), + GENX(GS_INVOCATION_COUNT_num), + GENX(GS_PRIMITIVES_COUNT_num), + GENX(CL_INVOCATION_COUNT_num), + GENX(CL_PRIMITIVES_COUNT_num), + GENX(PS_INVOCATION_COUNT_num), + GENX(HS_INVOCATION_COUNT_num), + GENX(DS_INVOCATION_COUNT_num), + GENX(CS_INVOCATION_COUNT_num), + }; + uint32_t reg = index_to_reg[q->index]; + +#if GFX_VER == 6 + /* Gfx6 GS code counts full primitives, that is, it won't count individual + * triangles in a triangle strip. Use CL_INVOCATION_COUNT for that. + */ + if (q->index == PIPE_STAT_QUERY_GS_PRIMITIVES) + reg = GENX(CL_INVOCATION_COUNT_num); +#endif + + screen->vtbl.store_register_mem64(batch, reg, bo, offset, false); +#endif + break; + } + default: + assert(false); + } +} + +#if GFX_VER >= 6 +static void +write_overflow_values(struct crocus_context *ice, struct crocus_query *q, bool end) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 
1 : 4; + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); + uint32_t offset = q->query_state_ref.offset; + crocus_emit_pipe_control_flush(batch, + "query: write SO overflow snapshots", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_STALL_AT_SCOREBOARD); + for (uint32_t i = 0; i < count; i++) { + int s = q->index + i; + int g_idx = offset + offsetof(struct crocus_query_so_overflow, + stream[s].num_prims[end]); + int w_idx = offset + offsetof(struct crocus_query_so_overflow, + stream[s].prim_storage_needed[end]); + screen->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s), + bo, g_idx, false); + screen->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s), + bo, w_idx, false); + } +} +#endif +static uint64_t +crocus_raw_timestamp_delta(uint64_t time0, uint64_t time1) +{ + if (time0 > time1) { + return (1ULL << TIMESTAMP_BITS) + time1 - time0; + } else { + return time1 - time0; + } +} + +static bool +stream_overflowed(struct crocus_query_so_overflow *so, int s) +{ + return (so->stream[s].prim_storage_needed[1] - + so->stream[s].prim_storage_needed[0]) != + (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]); +} + +static void +calculate_result_on_cpu(const struct intel_device_info *devinfo, + struct crocus_query *q) +{ + switch (q->type) { + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + q->result = q->map->end != q->map->start; + break; + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* The timestamp is the single starting snapshot. */ + q->result = intel_device_info_timebase_scale(devinfo, q->map->start); + q->result &= (1ull << TIMESTAMP_BITS) - 1; + break; + case PIPE_QUERY_TIME_ELAPSED: + q->result = crocus_raw_timestamp_delta(q->map->start, q->map->end); + q->result = intel_device_info_timebase_scale(devinfo, q->result); + q->result &= (1ull << TIMESTAMP_BITS) - 1; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + q->result = stream_overflowed((void *) q->map, q->index); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + q->result = false; + for (int i = 0; i < MAX_VERTEX_STREAMS; i++) + q->result |= stream_overflowed((void *) q->map, i); + break; + case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: + q->result = q->map->end - q->map->start; + + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if (GFX_VER == 7 && devinfo->is_haswell && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS) + q->result /= 4; + break; + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + default: + q->result = q->map->end - q->map->start; + break; + } + + q->ready = true; +} + +#if GFX_VERx10 == 75 +/** + * Calculate the streamout overflow for stream \p idx: + * + * (num_prims[1] - num_prims[0]) - (storage_needed[1] - storage_needed[0]) + */ +static struct mi_value +calc_overflow_for_stream(struct mi_builder *b, + struct crocus_query *q, + int idx) +{ +#define C(counter, i) query_mem64(q, \ + offsetof(struct crocus_query_so_overflow, stream[idx].counter[i])) + + return mi_isub(b, mi_isub(b, C(num_prims, 1), C(num_prims, 0)), + mi_isub(b, C(prim_storage_needed, 1), + C(prim_storage_needed, 0))); +#undef C +} + +/** + * Calculate whether any stream has overflowed. 
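+ *
+ * Conceptually (a sketch of the math the MI_MATH sequence encodes, not
+ * additional driver code):
+ *
+ *    overflowed = OR over all streams s of
+ *       (num_prims[1] - num_prims[0]) -
+ *       (prim_storage_needed[1] - prim_storage_needed[0])  !=  0
+ *
+ * calc_overflow_for_stream() builds the per-stream difference; the caller
+ * later turns "non-zero" into a 0/1 value with mi_nz()/mi_iand().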
+ */ +static struct mi_value +calc_overflow_any_stream(struct mi_builder *b, struct crocus_query *q) +{ + struct mi_value stream_result[MAX_VERTEX_STREAMS]; + for (int i = 0; i < MAX_VERTEX_STREAMS; i++) + stream_result[i] = calc_overflow_for_stream(b, q, i); + + struct mi_value result = stream_result[0]; + for (int i = 1; i < MAX_VERTEX_STREAMS; i++) + result = mi_ior(b, result, stream_result[i]); + + return result; +} + + +static bool +query_is_boolean(enum pipe_query_type type) +{ + switch (type) { + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + return true; + default: + return false; + } +} + +/** + * Calculate the result using MI_MATH. + */ +static struct mi_value +calculate_result_on_gpu(const struct intel_device_info *devinfo, + struct mi_builder *b, + struct crocus_query *q) +{ + struct mi_value result; + struct mi_value start_val = + query_mem64(q, offsetof(struct crocus_query_snapshots, start)); + struct mi_value end_val = + query_mem64(q, offsetof(struct crocus_query_snapshots, end)); + + switch (q->type) { + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result = calc_overflow_for_stream(b, q, q->index); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + result = calc_overflow_any_stream(b, q); + break; + case PIPE_QUERY_TIMESTAMP: { + /* TODO: This discards any fractional bits of the timebase scale. + * We would need to do a bit of fixed point math on the CS ALU, or + * launch an actual shader to calculate this with full precision. + */ + uint32_t scale = 1000000000ull / devinfo->timestamp_frequency; + result = mi_iand(b, mi_imm((1ull << 36) - 1), + mi_imul_imm(b, start_val, scale)); + break; + } + case PIPE_QUERY_TIME_ELAPSED: { + /* TODO: This discards fractional bits (see above). 
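+       * For instance (a hypothetical but representative value: a 12.5 MHz
+       * timestamp counter):
+       *
+       *    scale = 1000000000 / 12500000 = 80 ns per tick
+       *    elapsed_ns ~ (end - start) * 80
+       *
+       * and any remainder of that division is simply dropped.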
*/ + uint32_t scale = 1000000000ull / devinfo->timestamp_frequency; + result = mi_imul_imm(b, mi_isub(b, end_val, start_val), scale); + break; + } + default: + result = mi_isub(b, end_val, start_val); + break; + } + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if (GFX_VER == 7 && devinfo->is_haswell && + q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE && + q->index == PIPE_STAT_QUERY_PS_INVOCATIONS) + result = mi_ushr32_imm(b, result, 2); + + if (query_is_boolean(q->type)) + result = mi_iand(b, mi_nz(b, result), mi_imm(1)); + + return result; +} +#endif + +static struct pipe_query * +crocus_create_query(struct pipe_context *ctx, + unsigned query_type, + unsigned index) +{ + struct crocus_query *q = calloc(1, sizeof(struct crocus_query)); + + q->type = query_type; + q->index = index; + q->monitor = NULL; + + if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE && + q->index == PIPE_STAT_QUERY_CS_INVOCATIONS) + q->batch_idx = CROCUS_BATCH_COMPUTE; + else + q->batch_idx = CROCUS_BATCH_RENDER; + return (struct pipe_query *) q; +} + +static struct pipe_query * +crocus_create_batch_query(struct pipe_context *ctx, + unsigned num_queries, + unsigned *query_types) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = calloc(1, sizeof(struct crocus_query)); + if (unlikely(!q)) + return NULL; + q->type = PIPE_QUERY_DRIVER_SPECIFIC; + q->index = -1; + q->monitor = crocus_create_monitor_object(ice, num_queries, query_types); + if (unlikely(!q->monitor)) { + free(q); + return NULL; + } + + return (struct pipe_query *) q; +} + +static void +crocus_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query) +{ + struct crocus_query *query = (void *) p_query; + struct crocus_screen *screen = (void *) ctx->screen; + if (query->monitor) { + crocus_destroy_monitor_object(ctx, query->monitor); + query->monitor = NULL; + } else { + crocus_syncobj_reference(screen, &query->syncobj, NULL); + screen->base.fence_reference(ctx->screen, &query->fence, NULL); + } + free(query); +} + + +static bool +crocus_begin_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + + if (q->monitor) + return crocus_begin_monitor(ctx, q->monitor); + + void *ptr = NULL; + uint32_t size; + + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) + size = sizeof(struct crocus_query_so_overflow); + else + size = sizeof(struct crocus_query_snapshots); + + u_upload_alloc(ice->query_buffer_uploader, 0, + size, size, &q->query_state_ref.offset, + &q->query_state_ref.res, &ptr); + + if (!crocus_resource_bo(q->query_state_ref.res)) + return false; + + q->map = ptr; + if (!q->map) + return false; + + q->result = 0ull; + q->ready = false; + WRITE_ONCE(q->map->snapshots_landed, false); + + if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) { + ice->state.prims_generated_query_active = true; + ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP; + } + +#if GFX_VER <= 5 + if (q->type == PIPE_QUERY_OCCLUSION_COUNTER || + q->type == PIPE_QUERY_OCCLUSION_PREDICATE) { + ice->state.stats_wm++; + ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE; + } +#endif +#if GFX_VER >= 6 + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) + write_overflow_values(ice, q, false); + else +#endif + write_value(ice, q, + q->query_state_ref.offset + + offsetof(struct crocus_query_snapshots, start)); + + return 
true; +} + +static bool +crocus_end_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + + if (q->monitor) + return crocus_end_monitor(ctx, q->monitor); + + if (q->type == PIPE_QUERY_GPU_FINISHED) { + ctx->flush(ctx, &q->fence, PIPE_FLUSH_DEFERRED); + return true; + } + + struct crocus_batch *batch = &ice->batches[q->batch_idx]; + + if (q->type == PIPE_QUERY_TIMESTAMP) { + crocus_begin_query(ctx, query); + crocus_batch_reference_signal_syncobj(batch, &q->syncobj); + mark_available(ice, q); + return true; + } + +#if GFX_VER <= 5 + if (q->type == PIPE_QUERY_OCCLUSION_COUNTER || + q->type == PIPE_QUERY_OCCLUSION_PREDICATE) { + ice->state.stats_wm--; + ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE; + } +#endif + if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) { + ice->state.prims_generated_query_active = false; + ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP; + } + +#if GFX_VER >= 6 + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) + write_overflow_values(ice, q, true); + else +#endif + write_value(ice, q, + q->query_state_ref.offset + + offsetof(struct crocus_query_snapshots, end)); + + crocus_batch_reference_signal_syncobj(batch, &q->syncobj); + mark_available(ice, q); + + return true; +} + +/** + * See if the snapshots have landed for a query, and if so, compute the + * result and mark it ready. Does not flush (unlike crocus_get_query_result). + */ +static void +crocus_check_query_no_flush(struct crocus_context *ice, struct crocus_query *q) +{ + struct crocus_screen *screen = (void *) ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (!q->ready && READ_ONCE(q->map->snapshots_landed)) { + calculate_result_on_cpu(devinfo, q); + } +} + +static bool +crocus_get_query_result(struct pipe_context *ctx, + struct pipe_query *query, + bool wait, + union pipe_query_result *result) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + + if (q->monitor) + return crocus_get_monitor_result(ctx, q->monitor, wait, result->batch); + + struct crocus_screen *screen = (void *) ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (unlikely(screen->no_hw)) { + result->u64 = 0; + return true; + } + + if (!q->ready) { + struct crocus_batch *batch = &ice->batches[q->batch_idx]; + if (q->syncobj == crocus_batch_get_signal_syncobj(batch)) + crocus_batch_flush(batch); + +#if GFX_VERx10 == 75 + while (!READ_ONCE(q->map->snapshots_landed)) { + if (wait) + crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX); + else + return false; + } + assert(READ_ONCE(q->map->snapshots_landed)); +#else + if (wait) + crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX); +#endif + calculate_result_on_cpu(devinfo, q); + } + + assert(q->ready); + + result->u64 = q->result; + + return true; +} + +#if GFX_VER == 7 +static void +crocus_get_query_result_resource(struct pipe_context *ctx, + struct pipe_query *query, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *p_res, + unsigned offset) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + struct crocus_batch *batch = &ice->batches[q->batch_idx]; + struct crocus_screen *screen = batch->screen; + const struct intel_device_info *devinfo = &batch->screen->devinfo; + struct crocus_resource *res = (void 
*) p_res; + struct crocus_bo *query_bo = crocus_resource_bo(q->query_state_ref.res); + struct crocus_bo *dst_bo = crocus_resource_bo(p_res); + unsigned snapshots_landed_offset = + offsetof(struct crocus_query_snapshots, snapshots_landed); + + res->bind_history |= PIPE_BIND_QUERY_BUFFER; + + if (index == -1) { + /* They're asking for the availability of the result. If we still + * have commands queued up which produce the result, submit them + * now so that progress happens. Either way, copy the snapshots + * landed field to the destination resource. + */ + if (q->syncobj == crocus_batch_get_signal_syncobj(batch)) + crocus_batch_flush(batch); + + screen->vtbl.copy_mem_mem(batch, dst_bo, offset, + query_bo, snapshots_landed_offset, + result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8); + return; + } + + if (!q->ready && READ_ONCE(q->map->snapshots_landed)) { + /* The final snapshots happen to have landed, so let's just compute + * the result on the CPU now... + */ + calculate_result_on_cpu(devinfo, q); + } + + if (q->ready) { + /* We happen to have the result on the CPU, so just copy it. */ + if (result_type <= PIPE_QUERY_TYPE_U32) { + screen->vtbl.store_data_imm32(batch, dst_bo, offset, q->result); + } else { + screen->vtbl.store_data_imm64(batch, dst_bo, offset, q->result); + } + + /* Make sure the result lands before they use bind the QBO elsewhere + * and use the result. + */ + // XXX: Why? i965 doesn't do this. + crocus_emit_pipe_control_flush(batch, + "query: unknown QBO flushing hack", + PIPE_CONTROL_CS_STALL); + return; + } + +#if GFX_VERx10 == 75 + bool predicated = !wait && !q->stalled; + + struct mi_builder b; + mi_builder_init(&b, &batch->screen->devinfo, batch); + + struct mi_value result = calculate_result_on_gpu(devinfo, &b, q); + struct mi_value dst = + result_type <= PIPE_QUERY_TYPE_U32 ? 
mi_mem32(rw_bo(dst_bo, offset)) + : mi_mem64(rw_bo(dst_bo, offset)); + + if (predicated) { + mi_store(&b, mi_reg32(MI_PREDICATE_RESULT), + mi_mem64(ro_bo(query_bo, snapshots_landed_offset))); + mi_store_if(&b, dst, result); + } else { + mi_store(&b, dst, result); + } +#endif +} +#endif + +static void +crocus_set_active_query_state(struct pipe_context *ctx, bool enable) +{ + struct crocus_context *ice = (void *) ctx; + + if (ice->state.statistics_counters_enabled == enable) + return; + + // XXX: most packets aren't paying attention to this yet, because it'd + // have to be done dynamically at draw time, which is a pain + ice->state.statistics_counters_enabled = enable; + ice->state.dirty |= CROCUS_DIRTY_CLIP | + CROCUS_DIRTY_RASTER | + CROCUS_DIRTY_STREAMOUT | + CROCUS_DIRTY_WM; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS | + CROCUS_STAGE_DIRTY_TCS | + CROCUS_STAGE_DIRTY_TES | + CROCUS_STAGE_DIRTY_VS; +} + +static void +set_predicate_enable(struct crocus_context *ice, bool value) +{ + if (value) + ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER; + else + ice->state.predicate = CROCUS_PREDICATE_STATE_DONT_RENDER; +} + +#if GFX_VER == 7 +static void +set_predicate_for_result(struct crocus_context *ice, + struct crocus_query *q, + bool inverted) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); + +#if GFX_VERx10 != 75 + /* IVB doesn't have enough MI for this */ + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { + ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY; + return; + } +#endif + + /* The CPU doesn't have the query result yet; use hardware predication */ + ice->state.predicate = CROCUS_PREDICATE_STATE_USE_BIT; + + /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */ + crocus_emit_pipe_control_flush(batch, + "conditional rendering: set predicate", + PIPE_CONTROL_FLUSH_ENABLE); + q->stalled = true; + +#if GFX_VERx10 != 75 + struct crocus_screen *screen = batch->screen; + screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo, + q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, start)); + screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo, + q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, end)); + + uint32_t mi_predicate = MI_PREDICATE | MI_PREDICATE_COMBINEOP_SET | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL; + if (inverted) + mi_predicate |= MI_PREDICATE_LOADOP_LOAD; + else + mi_predicate |= MI_PREDICATE_LOADOP_LOADINV; + crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t)); +#else + struct mi_builder b; + mi_builder_init(&b, &batch->screen->devinfo, batch); + + struct mi_value result; + + switch (q->type) { + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result = calc_overflow_for_stream(&b, q, q->index); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + result = calc_overflow_any_stream(&b, q); + break; + default: { + /* PIPE_QUERY_OCCLUSION_* */ + struct mi_value start = + query_mem64(q, offsetof(struct crocus_query_snapshots, start)); + struct mi_value end = + query_mem64(q, offsetof(struct crocus_query_snapshots, end)); + result = mi_isub(&b, end, start); + break; + } + } + + result = inverted ? mi_z(&b, result) : mi_nz(&b, result); + result = mi_iand(&b, result, mi_imm(1)); + + /* We immediately set the predicate on the render batch, as all the + * counters come from 3D operations. 
However, we may need to predicate + * a compute dispatch, which executes in a different GEM context and has + * a different MI_PREDICATE_RESULT register. So, we save the result to + * memory and reload it in crocus_launch_grid. + */ + mi_value_ref(&b, result); + + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), result); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + + unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV | + MI_PREDICATE_COMBINEOP_SET | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL; + + crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t)); + mi_store(&b, query_mem64(q, offsetof(struct crocus_query_snapshots, + predicate_result)), result); +#endif + ice->state.compute_predicate = bo; +} +#endif + +static void +crocus_render_condition(struct pipe_context *ctx, + struct pipe_query *query, + bool condition, + enum pipe_render_cond_flag mode) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + + /* The old condition isn't relevant; we'll update it if necessary */ + ice->state.compute_predicate = NULL; + ice->condition.query = q; + ice->condition.condition = condition; + ice->condition.mode = mode; + + if (!q) { + ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER; + return; + } + + crocus_check_query_no_flush(ice, q); + + if (q->result || q->ready) { + set_predicate_enable(ice, (q->result != 0) ^ condition); + } else { + if (mode == PIPE_RENDER_COND_NO_WAIT || + mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) { + perf_debug(&ice->dbg, "Conditional rendering demoted from " + "\"no wait\" to \"wait\"."); + } +#if GFX_VER == 7 + set_predicate_for_result(ice, q, condition); +#else + ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY; +#endif + } +} + +static void +crocus_resolve_conditional_render(struct crocus_context *ice) +{ + struct pipe_context *ctx = (void *) ice; + struct crocus_query *q = ice->condition.query; + struct pipe_query *query = (void *) q; + union pipe_query_result result; + + if (ice->state.predicate != CROCUS_PREDICATE_STATE_USE_BIT) + return; + + assert(q); + + crocus_get_query_result(ctx, query, true, &result); + set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition); +} + +#if GFX_VER >= 7 +static void +crocus_emit_compute_predicate(struct crocus_batch *batch) +{ + struct crocus_context *ice = batch->ice; + struct crocus_screen *screen = batch->screen; + screen->vtbl.load_register_mem32(batch, MI_PREDICATE_SRC0, + ice->state.compute_predicate, 0); + screen->vtbl.load_register_imm32(batch, MI_PREDICATE_SRC1, 0); + unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV | + MI_PREDICATE_COMBINEOP_SET | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL; + + crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t)); +} +#endif + +void +genX(init_screen_query)(struct crocus_screen *screen) +{ + screen->vtbl.resolve_conditional_render = crocus_resolve_conditional_render; +#if GFX_VER >= 7 + screen->vtbl.emit_compute_predicate = crocus_emit_compute_predicate; +#endif +} + +void +genX(init_query)(struct crocus_context *ice) +{ + struct pipe_context *ctx = &ice->ctx; + + ctx->create_query = crocus_create_query; + ctx->create_batch_query = crocus_create_batch_query; + ctx->destroy_query = crocus_destroy_query; + ctx->begin_query = crocus_begin_query; + ctx->end_query = crocus_end_query; + ctx->get_query_result = crocus_get_query_result; +#if GFX_VER == 7 + ctx->get_query_result_resource = crocus_get_query_result_resource; +#endif + ctx->set_active_query_state = crocus_set_active_query_state; + 
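+
+   /* A rough sketch of how a gallium frontend typically drives these hooks
+    * (illustrative only, not crocus code):
+    *
+    *    struct pipe_query *q =
+    *       ctx->create_query(ctx, PIPE_QUERY_OCCLUSION_COUNTER, 0);
+    *    ctx->begin_query(ctx, q);
+    *    ...draw...
+    *    ctx->end_query(ctx, q);
+    *    union pipe_query_result r;
+    *    ctx->get_query_result(ctx, q, true, &r);  // r.u64 holds the count
+    *    ctx->destroy_query(ctx, q);
+    */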
ctx->render_condition = crocus_render_condition; + +} diff --git a/src/gallium/drivers/crocus/crocus_resolve.c b/src/gallium/drivers/crocus/crocus_resolve.c new file mode 100644 index 00000000000..a38eb4a94a7 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_resolve.c @@ -0,0 +1,1061 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_resolve.c + * + * This file handles resolve tracking for main and auxiliary surfaces. + * + * It also handles our cache tracking. We have sets for the render cache, + * depth cache, and so on. If a BO is in a cache's set, then it may have + * data in that cache. The helpers take care of emitting flushes for + * render-to-texture, format reinterpretation issues, and other situations. + */ + +#include "util/hash_table.h" +#include "util/set.h" +#include "crocus_context.h" +#include "compiler/nir/nir.h" + +#define FILE_DEBUG_FLAG DEBUG_BLORP + +static void +crocus_update_stencil_shadow(struct crocus_context *ice, + struct crocus_resource *res); +/** + * Disable auxiliary buffers if a renderbuffer is also bound as a texture + * or shader image. This causes a self-dependency, where both rendering + * and sampling may concurrently read or write the CCS buffer, causing + * incorrect pixels. + */ +static bool +disable_rb_aux_buffer(struct crocus_context *ice, + bool *draw_aux_buffer_disabled, + struct crocus_resource *tex_res, + unsigned min_level, unsigned num_levels, + const char *usage) +{ + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + bool found = false; + + /* We only need to worry about fast clears. */ + if (tex_res->aux.usage != ISL_AUX_USAGE_CCS_D) + return false; + + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + struct crocus_surface *surf = (void *) cso_fb->cbufs[i]; + if (!surf) + continue; + + struct crocus_resource *rb_res = (void *) surf->base.texture; + + if (rb_res->bo == tex_res->bo && + surf->base.u.tex.level >= min_level && + surf->base.u.tex.level < min_level + num_levels) { + found = draw_aux_buffer_disabled[i] = true; + } + } + + if (found) { + perf_debug(&ice->dbg, + "Disabling CCS because a renderbuffer is also bound %s.\n", + usage); + } + + return found; +} + +static void +resolve_sampler_views(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_shader_state *shs, + const struct shader_info *info, + bool *draw_aux_buffer_disabled, + bool consider_framebuffer) +{ + uint32_t views = info ? 
(shs->bound_sampler_views & info->textures_used[0]) : 0; + + while (views) { + const int i = u_bit_scan(&views); + struct crocus_sampler_view *isv = shs->textures[i]; + + if (isv->res->base.target != PIPE_BUFFER) { + if (consider_framebuffer) { + disable_rb_aux_buffer(ice, draw_aux_buffer_disabled, isv->res, + isv->view.base_level, isv->view.levels, + "for sampling"); + } + + crocus_resource_prepare_texture(ice, isv->res, isv->view.format, + isv->view.base_level, isv->view.levels, + isv->view.base_array_layer, + isv->view.array_len); + } + + crocus_cache_flush_for_read(batch, isv->res->bo); + + if (batch->screen->devinfo.ver >= 7 && + (isv->base.format == PIPE_FORMAT_X24S8_UINT || + isv->base.format == PIPE_FORMAT_X32_S8X24_UINT || + isv->base.format == PIPE_FORMAT_S8_UINT)) { + struct crocus_resource *zres, *sres; + crocus_get_depth_stencil_resources(&batch->screen->devinfo, isv->base.texture, &zres, &sres); + crocus_update_stencil_shadow(ice, sres); + crocus_cache_flush_for_read(batch, sres->shadow->bo); + } + } +} + +static void +resolve_image_views(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_shader_state *shs, + bool *draw_aux_buffer_disabled, + bool consider_framebuffer) +{ + /* TODO: Consider images used by program */ + uint32_t views = shs->bound_image_views; + + while (views) { + const int i = u_bit_scan(&views); + struct pipe_image_view *pview = &shs->image[i].base; + struct crocus_resource *res = (void *) pview->resource; + + if (res->base.target != PIPE_BUFFER) { + if (consider_framebuffer) { + disable_rb_aux_buffer(ice, draw_aux_buffer_disabled, + res, pview->u.tex.level, 1, + "as a shader image"); + } + + unsigned num_layers = + pview->u.tex.last_layer - pview->u.tex.first_layer + 1; + + /* The data port doesn't understand any compression */ + crocus_resource_prepare_access(ice, res, + pview->u.tex.level, 1, + pview->u.tex.first_layer, num_layers, + ISL_AUX_USAGE_NONE, false); + } + + crocus_cache_flush_for_read(batch, res->bo); + } +} + +static void +crocus_update_align_res(struct crocus_batch *batch, + struct crocus_surface *surf, + bool copy_to_wa) +{ + struct crocus_screen *screen = (struct crocus_screen *)batch->screen; + struct pipe_blit_info info = { 0 }; + + info.src.resource = copy_to_wa ? surf->base.texture : surf->align_res; + info.src.level = copy_to_wa ? surf->base.u.tex.level : 0; + u_box_2d_zslice(0, 0, copy_to_wa ? surf->base.u.tex.first_layer : 0, + u_minify(surf->base.texture->width0, surf->base.u.tex.level), + u_minify(surf->base.texture->height0, surf->base.u.tex.level), &info.src.box); + info.src.format = surf->base.texture->format; + info.dst.resource = copy_to_wa ? surf->align_res : surf->base.texture; + info.dst.level = copy_to_wa ? 0 : surf->base.u.tex.level; + info.dst.box = info.src.box; + info.dst.box.z = copy_to_wa ? 0 : surf->base.u.tex.first_layer; + info.dst.format = surf->base.texture->format; + info.mask = util_format_is_depth_or_stencil(surf->base.texture->format) ? PIPE_MASK_ZS : PIPE_MASK_RGBA; + info.filter = 0; + if (!screen->vtbl.blit_blt(batch, &info)) { + assert(0); + } +} + +/** + * \brief Resolve buffers before drawing. + * + * Resolve the depth buffer's HiZ buffer, resolve the depth buffer of each + * enabled depth texture, and flush the render cache for any dirty textures. 
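+ *
+ * "Resolving" here means making the main surface consistent with its
+ * auxiliary data before the GPU reads it through another path.  For
+ * example (informally): if a texture that is about to be sampled still
+ * carries fast-clear data in its CCS, crocus_resource_prepare_texture()
+ * (via resolve_sampler_views()) schedules a resolve so the sampler sees
+ * the real pixel values.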
+ */ +void +crocus_predraw_resolve_inputs(struct crocus_context *ice, + struct crocus_batch *batch, + bool *draw_aux_buffer_disabled, + gl_shader_stage stage, + bool consider_framebuffer) +{ + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + const struct shader_info *info = crocus_get_shader_info(ice, stage); + + uint64_t stage_dirty = (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage) | + (consider_framebuffer ? CROCUS_STAGE_DIRTY_BINDINGS_FS : 0); + + if (ice->state.stage_dirty & stage_dirty) { + resolve_sampler_views(ice, batch, shs, info, draw_aux_buffer_disabled, + consider_framebuffer); + resolve_image_views(ice, batch, shs, draw_aux_buffer_disabled, + consider_framebuffer); + } +} + +void +crocus_predraw_resolve_framebuffer(struct crocus_context *ice, + struct crocus_batch *batch, + bool *draw_aux_buffer_disabled) +{ + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + struct crocus_screen *screen = (void *) ice->ctx.screen; + struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]; + const nir_shader *nir = ish->nir; + + if (ice->state.dirty & CROCUS_DIRTY_DEPTH_BUFFER) { + struct pipe_surface *zs_surf = cso_fb->zsbuf; + + if (zs_surf) { + struct crocus_resource *z_res, *s_res; + crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res); + unsigned num_layers = + zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1; + + if (z_res) { + crocus_resource_prepare_render(ice, z_res, + zs_surf->u.tex.level, + zs_surf->u.tex.first_layer, + num_layers, ice->state.hiz_usage); + crocus_cache_flush_for_depth(batch, z_res->bo); + + if (((struct crocus_surface *)zs_surf)->align_res) { + crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, true); + } + } + + if (s_res) { + crocus_cache_flush_for_depth(batch, s_res->bo); + } + } + } + + if (nir->info.outputs_read != 0) { + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + if (cso_fb->cbufs[i]) { + struct crocus_surface *surf = (void *) cso_fb->cbufs[i]; + struct crocus_resource *res = (void *) cso_fb->cbufs[i]->texture; + + crocus_resource_prepare_texture(ice, res, surf->view.format, + surf->view.base_level, 1, + surf->view.base_array_layer, + surf->view.array_len); + } + } + } + + if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS) { + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + struct crocus_surface *surf = (void *) cso_fb->cbufs[i]; + if (!surf) + continue; + + struct crocus_resource *res = (void *) surf->base.texture; + + if (surf->align_res) + crocus_update_align_res(batch, surf, true); + + enum isl_aux_usage aux_usage = + crocus_resource_render_aux_usage(ice, res, surf->view.format, + ice->state.blend_enables & (1u << i), + draw_aux_buffer_disabled[i]); + + if (ice->state.draw_aux_usage[i] != aux_usage) { + ice->state.draw_aux_usage[i] = aux_usage; + /* XXX: Need to track which bindings to make dirty */ + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS; + } + + crocus_resource_prepare_render(ice, res, surf->view.base_level, + surf->view.base_array_layer, + surf->view.array_len, + aux_usage); + + crocus_cache_flush_for_render(batch, res->bo, surf->view.format, + aux_usage); + } + } +} + +/** + * \brief Call this after drawing to mark which buffers need resolving + * + * If the depth buffer was written to and if it has an accompanying HiZ + * buffer, then mark that it needs a depth resolve. 
+ * + * If the color buffer is a multisample window system buffer, then + * mark that it needs a downsample. + * + * Also mark any render targets which will be textured as needing a render + * cache flush. + */ +void +crocus_postdraw_update_resolve_tracking(struct crocus_context *ice, + struct crocus_batch *batch) +{ + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + struct crocus_screen *screen = (void *) ice->ctx.screen; + struct intel_device_info *devinfo = &screen->devinfo; + // XXX: front buffer drawing? + + bool may_have_resolved_depth = + ice->state.dirty & (CROCUS_DIRTY_DEPTH_BUFFER | + CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL); + + struct pipe_surface *zs_surf = cso_fb->zsbuf; + if (zs_surf) { + struct crocus_resource *z_res, *s_res; + crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res); + unsigned num_layers = + zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1; + + if (z_res) { + if (may_have_resolved_depth && ice->state.depth_writes_enabled) { + crocus_resource_finish_render(ice, z_res, zs_surf->u.tex.level, + zs_surf->u.tex.first_layer, num_layers, + ice->state.hiz_usage); + } + + if (ice->state.depth_writes_enabled) + crocus_depth_cache_add_bo(batch, z_res->bo); + + if (((struct crocus_surface *)zs_surf)->align_res) { + crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, false); + } + } + + if (s_res) { + if (may_have_resolved_depth && ice->state.stencil_writes_enabled) { + crocus_resource_finish_write(ice, s_res, zs_surf->u.tex.level, + zs_surf->u.tex.first_layer, num_layers, + s_res->aux.usage); + } + + if (ice->state.stencil_writes_enabled) + crocus_depth_cache_add_bo(batch, s_res->bo); + } + } + + bool may_have_resolved_color = + ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS; + + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + struct crocus_surface *surf = (void *) cso_fb->cbufs[i]; + if (!surf) + continue; + + if (surf->align_res) + crocus_update_align_res(batch, surf, false); + struct crocus_resource *res = (void *) surf->base.texture; + enum isl_aux_usage aux_usage = ice->state.draw_aux_usage[i]; + + crocus_render_cache_add_bo(batch, res->bo, surf->view.format, + aux_usage); + + if (may_have_resolved_color) { + union pipe_surface_desc *desc = &surf->base.u; + unsigned num_layers = + desc->tex.last_layer - desc->tex.first_layer + 1; + crocus_resource_finish_render(ice, res, desc->tex.level, + desc->tex.first_layer, num_layers, + aux_usage); + } + } +} + +/** + * Clear the cache-tracking sets. + */ +void +crocus_cache_sets_clear(struct crocus_batch *batch) +{ + hash_table_foreach(batch->cache.render, render_entry) + _mesa_hash_table_remove(batch->cache.render, render_entry); + + set_foreach(batch->cache.depth, depth_entry) + _mesa_set_remove(batch->cache.depth, depth_entry); +} + +/** + * Emits an appropriate flush for a BO if it has been rendered to within the + * same batchbuffer as a read that's about to be emitted. + * + * The GPU has separate, incoherent caches for the render cache and the + * sampler cache, along with other caches. Usually data in the different + * caches don't interact (e.g. we don't render to our driver-generated + * immediate constant data), but for render-to-texture in FBOs we definitely + * do. When a batchbuffer is flushed, the kernel will ensure that everything + * necessary is flushed before another use of that BO, but for reuse from + * different caches within a batchbuffer, it's all our responsibility. 
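+ *
+ * A concrete (hypothetical) sequence: a BO is rendered to, so
+ * crocus_render_cache_add_bo() records it in batch->cache.render; later in
+ * the same batch it is bound as a texture, so crocus_cache_flush_for_read()
+ * finds it in that set and calls the function below, which flushes the
+ * render/depth caches and invalidates the texture cache before the read.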
+ */ +void +crocus_flush_depth_and_render_caches(struct crocus_batch *batch) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + if (devinfo->ver >= 6) { + crocus_emit_pipe_control_flush(batch, + "cache tracker: render-to-texture", + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_CS_STALL); + + crocus_emit_pipe_control_flush(batch, + "cache tracker: render-to-texture", + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE); + } else { + crocus_emit_mi_flush(batch); + } + + crocus_cache_sets_clear(batch); +} + +void +crocus_cache_flush_for_read(struct crocus_batch *batch, + struct crocus_bo *bo) +{ + if (_mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo) || + _mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo)) + crocus_flush_depth_and_render_caches(batch); +} + +static void * +format_aux_tuple(enum isl_format format, enum isl_aux_usage aux_usage) +{ + return (void *)(uintptr_t)((uint32_t)format << 8 | aux_usage); +} + +void +crocus_cache_flush_for_render(struct crocus_batch *batch, + struct crocus_bo *bo, + enum isl_format format, + enum isl_aux_usage aux_usage) +{ + if (_mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo)) + crocus_flush_depth_and_render_caches(batch); + + /* Check to see if this bo has been used by a previous rendering operation + * but with a different format or aux usage. If it has, flush the render + * cache so we ensure that it's only in there with one format or aux usage + * at a time. + * + * Even though it's not obvious, this can easily happen in practice. + * Suppose a client is blending on a surface with sRGB encode enabled on + * gen9. This implies that you get AUX_USAGE_CCS_D at best. If the client + * then disables sRGB decode and continues blending we will flip on + * AUX_USAGE_CCS_E without doing any sort of resolve in-between (this is + * perfectly valid since CCS_E is a subset of CCS_D). However, this means + * that we have fragments in-flight which are rendering with UNORM+CCS_E + * and other fragments in-flight with SRGB+CCS_D on the same surface at the + * same time and the pixel scoreboard and color blender are trying to sort + * it all out. This ends badly (i.e. GPU hangs). + * + * To date, we have never observed GPU hangs or even corruption to be + * associated with switching the format, only the aux usage. However, + * there are comments in various docs which indicate that the render cache + * isn't 100% resilient to format changes. We may as well be conservative + * and flush on format changes too. We can always relax this later if we + * find it to be a performance problem. + */ + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo); + if (entry && entry->data != format_aux_tuple(format, aux_usage)) + crocus_flush_depth_and_render_caches(batch); +} + +void +crocus_render_cache_add_bo(struct crocus_batch *batch, + struct crocus_bo *bo, + enum isl_format format, + enum isl_aux_usage aux_usage) +{ +#ifndef NDEBUG + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo); + if (entry) { + /* Otherwise, someone didn't do a flush_for_render and that would be + * very bad indeed. 
+ */ + assert(entry->data == format_aux_tuple(format, aux_usage)); + } +#endif + + _mesa_hash_table_insert_pre_hashed(batch->cache.render, bo->hash, bo, + format_aux_tuple(format, aux_usage)); +} + +void +crocus_cache_flush_for_depth(struct crocus_batch *batch, + struct crocus_bo *bo) +{ + if (_mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo)) + crocus_flush_depth_and_render_caches(batch); +} + +void +crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo) +{ + _mesa_set_add_pre_hashed(batch->cache.depth, bo->hash, bo); +} + +static void +crocus_resolve_color(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_resource *res, + unsigned level, unsigned layer, + enum isl_aux_op resolve_op) +{ + struct crocus_screen *screen = batch->screen; + DBG("%s to res %p level %u layer %u\n", __func__, res, level, layer); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + &res->base, res->aux.usage, level, true); + + crocus_batch_maybe_flush(batch, 1500); + + /* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)": + * + * "Any transition from any value in {Clear, Render, Resolve} to a + * different value in {Clear, Render, Resolve} requires end of pipe + * synchronization." + * + * In other words, fast clear ops are not properly synchronized with + * other drawing. We need to use a PIPE_CONTROL to ensure that the + * contents of the previous draw hit the render target before we resolve + * and again afterwards to ensure that the resolve is complete before we + * do any more regular drawing. + */ + crocus_emit_end_of_pipe_sync(batch, "color resolve: pre-flush", + PIPE_CONTROL_RENDER_TARGET_FLUSH); + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); + blorp_ccs_resolve(&blorp_batch, &surf, level, layer, 1, + isl_format_srgb_to_linear(res->surf.format), + resolve_op); + blorp_batch_finish(&blorp_batch); + + /* See comment above */ + crocus_emit_end_of_pipe_sync(batch, "color resolve: post-flush", + PIPE_CONTROL_RENDER_TARGET_FLUSH); +} + +static void +crocus_mcs_partial_resolve(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_resource *res, + uint32_t start_layer, + uint32_t num_layers) +{ + struct crocus_screen *screen = batch->screen; + + DBG("%s to res %p layers %u-%u\n", __func__, res, + start_layer, start_layer + num_layers - 1); + + assert(isl_aux_usage_has_mcs(res->aux.usage)); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + &res->base, res->aux.usage, 0, true); + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); + blorp_mcs_partial_resolve(&blorp_batch, &surf, + isl_format_srgb_to_linear(res->surf.format), + start_layer, num_layers); + blorp_batch_finish(&blorp_batch); +} + +/** + * Perform a HiZ or depth resolve operation. 
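+ *
+ * Loosely speaking: ISL_AUX_OP_FULL_RESOLVE writes any depth data that is
+ * still only represented in HiZ (such as fast-cleared regions) back into
+ * the depth buffer so it can be read directly, ISL_AUX_OP_AMBIGUATE resets
+ * HiZ to a pass-through state that simply mirrors the current depth buffer
+ * contents, and ISL_AUX_OP_FAST_CLEAR makes HiZ represent the clear value
+ * without writing every pixel of the depth buffer.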
+ * + * For an overview of HiZ ops, see the following sections of the Sandy Bridge + * PRM, Volume 1, Part 2: + * - 7.5.3.1 Depth Buffer Clear + * - 7.5.3.2 Depth Buffer Resolve + * - 7.5.3.3 Hierarchical Depth Buffer Resolve + */ +void +crocus_hiz_exec(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_resource *res, + unsigned int level, unsigned int start_layer, + unsigned int num_layers, enum isl_aux_op op, + bool update_clear_depth) +{ + struct crocus_screen *screen = batch->screen; + const struct intel_device_info *devinfo = &batch->screen->devinfo; + assert(crocus_resource_level_has_hiz(res, level)); + assert(op != ISL_AUX_OP_NONE); + UNUSED const char *name = NULL; + + switch (op) { + case ISL_AUX_OP_FULL_RESOLVE: + name = "depth resolve"; + break; + case ISL_AUX_OP_AMBIGUATE: + name = "hiz ambiguate"; + break; + case ISL_AUX_OP_FAST_CLEAR: + name = "depth clear"; + break; + case ISL_AUX_OP_PARTIAL_RESOLVE: + case ISL_AUX_OP_NONE: + unreachable("Invalid HiZ op"); + } + + DBG("%s %s to res %p level %d layers %d-%d\n", + __func__, name, res, level, start_layer, start_layer + num_layers - 1); + + /* The following stalls and flushes are only documented to be required + * for HiZ clear operations. However, they also seem to be required for + * resolve operations. + * + * From the Ivybridge PRM, volume 2, "Depth Buffer Clear": + * + * "If other rendering operations have preceded this clear, a + * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit + * enabled must be issued before the rectangle primitive used for + * the depth buffer clear operation." + * + * Same applies for Gen8 and Gen9. + * + * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1 + * PIPE_CONTROL, Depth Cache Flush Enable: + * + * "This bit must not be set when Depth Stall Enable bit is set in + * this packet." + * + * This is confirmed to hold for real, Haswell gets immediate gpu hangs. + * + * Therefore issue two pipe control flushes, one for cache flush and + * another for depth stall. + */ + if (devinfo->ver == 6) { + /* From the Sandy Bridge PRM, volume 2 part 1, page 313: + * + * "If other rendering operations have preceded this clear, a + * PIPE_CONTROL with write cache flush enabled and Z-inhibit + * disabled must be issued before the rectangle primitive used for + * the depth buffer clear operation. + */ + crocus_emit_pipe_control_flush(batch, + "hiz op: pre-flushes (1)", + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_CS_STALL); + } else if (devinfo->ver >= 7) { + crocus_emit_pipe_control_flush(batch, + "hiz op: pre-flushes (1/2)", + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_CS_STALL); + crocus_emit_pipe_control_flush(batch, "hiz op: pre-flushes (2/2)", + PIPE_CONTROL_DEPTH_STALL); + } + + assert(isl_aux_usage_has_hiz(res->aux.usage) && res->aux.bo); + + crocus_batch_maybe_flush(batch, 1500); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + &res->base, res->aux.usage, level, true); + + struct blorp_batch blorp_batch; + enum blorp_batch_flags flags = 0; + flags |= update_clear_depth ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, flags); + blorp_hiz_op(&blorp_batch, &surf, level, start_layer, num_layers, op); + blorp_batch_finish(&blorp_batch); + + /* The following stalls and flushes are only documented to be required + * for HiZ clear operations. However, they also seem to be required for + * resolve operations. 
+ * + * From the Broadwell PRM, volume 7, "Depth Buffer Clear": + * + * "Depth buffer clear pass using any of the methods (WM_STATE, + * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a + * PIPE_CONTROL command with DEPTH_STALL bit and Depth FLUSH bits + * "set" before starting to render. DepthStall and DepthFlush are + * not needed between consecutive depth clear passes nor is it + * required if the depth clear pass was done with + * 'full_surf_clear' bit set in the 3DSTATE_WM_HZ_OP." + * + * TODO: Such as the spec says, this could be conditional. + */ + if (devinfo->ver == 6) { + /* From the Sandy Bridge PRM, volume 2 part 1, page 314: + * + * "DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be + * followed by a PIPE_CONTROL command with DEPTH_STALL bit set + * and Then followed by Depth FLUSH' + */ + crocus_emit_pipe_control_flush(batch, + "hiz op: post-flushes (1/2)", + PIPE_CONTROL_DEPTH_STALL); + + crocus_emit_pipe_control_flush(batch, + "hiz op: post-flushes (2/2)", + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_CS_STALL); + } +} + +/** + * Does the resource's slice have hiz enabled? + */ +bool +crocus_resource_level_has_hiz(const struct crocus_resource *res, uint32_t level) +{ + crocus_resource_check_level_layer(res, level, 0); + return res->aux.has_hiz & 1 << level; +} + +static bool +crocus_resource_level_has_aux(const struct crocus_resource *res, uint32_t level) +{ + if (isl_aux_usage_has_hiz(res->aux.usage)) + return crocus_resource_level_has_hiz(res, level); + else + return level < res->aux.surf.levels; +} + +/** \brief Assert that the level and layer are valid for the resource. */ +void +crocus_resource_check_level_layer(UNUSED const struct crocus_resource *res, + UNUSED uint32_t level, UNUSED uint32_t layer) +{ + assert(level < res->surf.levels); + assert(layer < util_num_layers(&res->base, level)); +} + +static inline uint32_t +miptree_level_range_length(const struct crocus_resource *res, + uint32_t start_level, uint32_t num_levels) +{ + assert(start_level < res->surf.levels); + + if (num_levels == INTEL_REMAINING_LAYERS) + num_levels = res->surf.levels; + + /* Check for overflow */ + assert(start_level + num_levels >= start_level); + assert(start_level + num_levels <= res->surf.levels); + + return num_levels; +} + +static inline uint32_t +miptree_layer_range_length(const struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t num_layers) +{ + assert(level <= res->base.last_level); + + const uint32_t total_num_layers = crocus_get_num_logical_layers(res, level); + assert(start_layer < total_num_layers); + if (num_layers == INTEL_REMAINING_LAYERS) + num_layers = total_num_layers - start_layer; + /* Check for overflow */ + assert(start_layer + num_layers >= start_layer); + assert(start_layer + num_layers <= total_num_layers); + + return num_layers; +} + +bool +crocus_has_invalid_primary(const struct crocus_resource *res, + unsigned start_level, unsigned num_levels, + unsigned start_layer, unsigned num_layers) +{ + if (!res->aux.bo) + return false; + + /* Clamp the level range to fit the resource */ + num_levels = miptree_level_range_length(res, start_level, num_levels); + + for (uint32_t l = 0; l < num_levels; l++) { + const uint32_t level = start_level + l; + if (!crocus_resource_level_has_aux(res, level)) + continue; + + const uint32_t level_layers = + miptree_layer_range_length(res, level, start_layer, num_layers); + for (unsigned a = 0; a < level_layers; a++) { + enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, level, 
start_layer + a); + if (!isl_aux_state_has_valid_primary(aux_state)) + return true; + } + } + + return false; +} + +void +crocus_resource_prepare_access(struct crocus_context *ice, + struct crocus_resource *res, + uint32_t start_level, uint32_t num_levels, + uint32_t start_layer, uint32_t num_layers, + enum isl_aux_usage aux_usage, + bool fast_clear_supported) +{ + if (!res->aux.bo) + return; + + /* We can't do resolves on the compute engine, so awkwardly, we have to + * do them on the render batch... + */ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + + const uint32_t clamped_levels = + miptree_level_range_length(res, start_level, num_levels); + for (uint32_t l = 0; l < clamped_levels; l++) { + const uint32_t level = start_level + l; + if (!crocus_resource_level_has_aux(res, level)) + continue; + + const uint32_t level_layers = + miptree_layer_range_length(res, level, start_layer, num_layers); + for (uint32_t a = 0; a < level_layers; a++) { + const uint32_t layer = start_layer + a; + const enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, level, layer); + const enum isl_aux_op aux_op = + isl_aux_prepare_access(aux_state, aux_usage, fast_clear_supported); + + /* Prepare the aux buffer for a conditional or unconditional access. + * A conditional access is handled by assuming that the access will + * not evaluate to a no-op. If the access does in fact occur, the aux + * will be in the required state. If it does not, no data is lost + * because the aux_op performed is lossless. + */ + if (aux_op == ISL_AUX_OP_NONE) { + /* Nothing to do here. */ + } else if (isl_aux_usage_has_mcs(res->aux.usage)) { + assert(aux_op == ISL_AUX_OP_PARTIAL_RESOLVE); + crocus_mcs_partial_resolve(ice, batch, res, layer, 1); + } else if (isl_aux_usage_has_hiz(res->aux.usage)) { + crocus_hiz_exec(ice, batch, res, level, layer, 1, aux_op, false); + } else if (res->aux.usage == ISL_AUX_USAGE_STC_CCS) { + unreachable("crocus doesn't resolve STC_CCS resources"); + } else { + assert(isl_aux_usage_has_ccs(res->aux.usage)); + crocus_resolve_color(ice, batch, res, level, layer, aux_op); + } + + const enum isl_aux_state new_state = + isl_aux_state_transition_aux_op(aux_state, res->aux.usage, aux_op); + crocus_resource_set_aux_state(ice, res, level, layer, 1, new_state); + } + } +} + +void +crocus_resource_finish_write(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t num_layers, + enum isl_aux_usage aux_usage) +{ + if (res->base.format == PIPE_FORMAT_S8_UINT) + res->shadow_needs_update = true; + + if (!crocus_resource_level_has_aux(res, level)) + return; + + const uint32_t level_layers = + miptree_layer_range_length(res, level, start_layer, num_layers); + + for (uint32_t a = 0; a < level_layers; a++) { + const uint32_t layer = start_layer + a; + const enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, level, layer); + + /* Transition the aux state for a conditional or unconditional write. A + * conditional write is handled by assuming that the write applies to + * only part of the render target. This prevents the new state from + * losing the types of compression that might exist in the current state + * (e.g. CLEAR). If the write evaluates to a no-op, the state will still + * be able to communicate when resolves are necessary (but it may + * falsely communicate this as well). 
+ */ + const enum isl_aux_state new_aux_state = + isl_aux_state_transition_write(aux_state, aux_usage, false); + + crocus_resource_set_aux_state(ice, res, level, layer, 1, new_aux_state); + } +} + +enum isl_aux_state +crocus_resource_get_aux_state(const struct crocus_resource *res, + uint32_t level, uint32_t layer) +{ + crocus_resource_check_level_layer(res, level, layer); + assert(crocus_resource_level_has_aux(res, level)); + + return res->aux.state[level][layer]; +} + +void +crocus_resource_set_aux_state(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t num_layers, + enum isl_aux_state aux_state) +{ + assert(crocus_resource_level_has_aux(res, level)); + + num_layers = miptree_layer_range_length(res, level, start_layer, num_layers); + for (unsigned a = 0; a < num_layers; a++) { + if (res->aux.state[level][start_layer + a] != aux_state) { + res->aux.state[level][start_layer + a] = aux_state; + ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES | + CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES; + /* XXX: Need to track which bindings to make dirty */ + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS; + } + } +} + +static bool +isl_formats_are_fast_clear_compatible(enum isl_format a, enum isl_format b) +{ + /* On gen8 and earlier, the hardware was only capable of handling 0/1 clear + * values so sRGB curve application was a no-op for all fast-clearable + * formats. + * + * On gen9+, the hardware supports arbitrary clear values. For sRGB clear + * values, the hardware interprets the floats, not as what would be + * returned from the sampler (or written by the shader), but as being + * between format conversion and sRGB curve application. This means that + * we can switch between sRGB and UNORM without having to whack the clear + * color. + */ + return isl_format_srgb_to_linear(a) == isl_format_srgb_to_linear(b); +} + +void +crocus_resource_prepare_texture(struct crocus_context *ice, + struct crocus_resource *res, + enum isl_format view_format, + uint32_t start_level, uint32_t num_levels, + uint32_t start_layer, uint32_t num_layers) +{ + enum isl_aux_usage aux_usage = + crocus_resource_texture_aux_usage(res); + + bool clear_supported = aux_usage != ISL_AUX_USAGE_NONE; + + /* Clear color is specified as ints or floats and the conversion is done by + * the sampler. If we have a texture view, we would have to perform the + * clear color conversion manually. Just disable clear color. 
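+ *
+ * For example (hypothetical formats): viewing a B8G8R8A8_UNORM surface
+ * through a B8G8R8A8_UNORM_SRGB view is still fine, because
+ * isl_formats_are_fast_clear_compatible() compares the formats after
+ * stripping sRGB; reinterpreting it as an integer format is not, so
+ * clear_supported is dropped in that case.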
+ */ + if (!isl_formats_are_fast_clear_compatible(res->surf.format, view_format)) + clear_supported = false; + + crocus_resource_prepare_access(ice, res, start_level, num_levels, + start_layer, num_layers, + aux_usage, clear_supported); +} + +enum isl_aux_usage +crocus_resource_render_aux_usage(struct crocus_context *ice, + struct crocus_resource *res, + enum isl_format render_format, + bool blend_enabled, + bool draw_aux_disabled) +{ + struct crocus_screen *screen = (void *) ice->ctx.screen; + struct intel_device_info *devinfo = &screen->devinfo; + + if (draw_aux_disabled) + return ISL_AUX_USAGE_NONE; + + switch (res->aux.usage) { + case ISL_AUX_USAGE_MCS: + return res->aux.usage; + + case ISL_AUX_USAGE_CCS_D: + /* Otherwise, we try to fall back to CCS_D */ + if (isl_format_supports_ccs_d(devinfo, render_format)) + return ISL_AUX_USAGE_CCS_D; + + return ISL_AUX_USAGE_NONE; + + default: + return ISL_AUX_USAGE_NONE; + } +} + +void +crocus_resource_prepare_render(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t layer_count, + enum isl_aux_usage aux_usage) +{ + crocus_resource_prepare_access(ice, res, level, 1, start_layer, + layer_count, aux_usage, + aux_usage != ISL_AUX_USAGE_NONE); +} + +void +crocus_resource_finish_render(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t layer_count, + enum isl_aux_usage aux_usage) +{ + crocus_resource_finish_write(ice, res, level, start_layer, layer_count, + aux_usage); +} + +static void +crocus_update_stencil_shadow(struct crocus_context *ice, + struct crocus_resource *res) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + UNUSED const struct intel_device_info *devinfo = &screen->devinfo; + assert(devinfo->ver >= 7); + + if (!res->shadow_needs_update) + return; + + struct pipe_box box; + for (unsigned level = 0; level <= res->base.last_level; level++) { + u_box_2d(0, 0, + u_minify(res->base.width0, level), + u_minify(res->base.height0, level), &box); + const unsigned depth = res->base.target == PIPE_TEXTURE_3D ? + u_minify(res->base.depth0, level) : res->base.array_size; + + for (unsigned layer = 0; layer < depth; layer++) { + box.z = layer; + ice->ctx.resource_copy_region(&ice->ctx, + &res->shadow->base, level, 0, 0, layer, + &res->base, level, &box); + } + } + res->shadow_needs_update = false; +} diff --git a/src/gallium/drivers/crocus/crocus_resource.c b/src/gallium/drivers/crocus/crocus_resource.c new file mode 100644 index 00000000000..b5bf5a42e1a --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_resource.c @@ -0,0 +1,1946 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_resource.c + * + * Resources are images, buffers, and other objects used by the GPU. + * + * XXX: explain resources + */ + +#include +#include +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/os_memory.h" +#include "util/u_cpu_detect.h" +#include "util/u_inlines.h" +#include "util/format/u_format.h" +#include "util/u_threaded_context.h" +#include "util/u_transfer.h" +#include "util/u_transfer_helper.h" +#include "util/u_upload_mgr.h" +#include "util/ralloc.h" +#include "crocus_batch.h" +#include "crocus_context.h" +#include "crocus_resource.h" +#include "crocus_screen.h" +#include "intel/dev/intel_debug.h" +#include "isl/isl.h" +#include "drm-uapi/drm_fourcc.h" +#include "drm-uapi/i915_drm.h" + +enum modifier_priority { + MODIFIER_PRIORITY_INVALID = 0, + MODIFIER_PRIORITY_LINEAR, + MODIFIER_PRIORITY_X, + MODIFIER_PRIORITY_Y, + MODIFIER_PRIORITY_Y_CCS, +}; + +static const uint64_t priority_to_modifier[] = { + [MODIFIER_PRIORITY_INVALID] = DRM_FORMAT_MOD_INVALID, + [MODIFIER_PRIORITY_LINEAR] = DRM_FORMAT_MOD_LINEAR, + [MODIFIER_PRIORITY_X] = I915_FORMAT_MOD_X_TILED, + [MODIFIER_PRIORITY_Y] = I915_FORMAT_MOD_Y_TILED, + [MODIFIER_PRIORITY_Y_CCS] = I915_FORMAT_MOD_Y_TILED_CCS, +}; + +static bool +modifier_is_supported(const struct intel_device_info *devinfo, + enum pipe_format pfmt, uint64_t modifier) +{ + /* XXX: do something real */ + switch (modifier) { + case I915_FORMAT_MOD_Y_TILED_CCS: + return false; + case I915_FORMAT_MOD_Y_TILED: + return devinfo->ver >= 6; + case I915_FORMAT_MOD_X_TILED: + case DRM_FORMAT_MOD_LINEAR: + return true; + case DRM_FORMAT_MOD_INVALID: + default: + return false; + } +} + +static uint64_t +select_best_modifier(struct intel_device_info *devinfo, enum pipe_format pfmt, + const uint64_t *modifiers, + int count) +{ + enum modifier_priority prio = MODIFIER_PRIORITY_INVALID; + + for (int i = 0; i < count; i++) { + if (!modifier_is_supported(devinfo, pfmt, modifiers[i])) + continue; + + switch (modifiers[i]) { + case I915_FORMAT_MOD_Y_TILED_CCS: + prio = MAX2(prio, MODIFIER_PRIORITY_Y_CCS); + break; + case I915_FORMAT_MOD_Y_TILED: + prio = MAX2(prio, MODIFIER_PRIORITY_Y); + break; + case I915_FORMAT_MOD_X_TILED: + prio = MAX2(prio, MODIFIER_PRIORITY_X); + break; + case DRM_FORMAT_MOD_LINEAR: + prio = MAX2(prio, MODIFIER_PRIORITY_LINEAR); + break; + case DRM_FORMAT_MOD_INVALID: + default: + break; + } + } + + return priority_to_modifier[prio]; +} + +static enum isl_surf_dim +crocus_target_to_isl_surf_dim(enum pipe_texture_target target) +{ + switch (target) { + case PIPE_BUFFER: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return ISL_SURF_DIM_1D; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_CUBE_ARRAY: + return ISL_SURF_DIM_2D; + case PIPE_TEXTURE_3D: + return ISL_SURF_DIM_3D; + case PIPE_MAX_TEXTURE_TYPES: + break; + } + unreachable("invalid texture type"); +} + +static void +crocus_query_dmabuf_modifiers(struct pipe_screen *pscreen, + enum pipe_format pfmt, + int max, + uint64_t *modifiers, + unsigned int *external_only, + int *count) +{ + struct crocus_screen *screen = (void *) pscreen; + const 
struct intel_device_info *devinfo = &screen->devinfo; + + uint64_t all_modifiers[] = { + DRM_FORMAT_MOD_LINEAR, + I915_FORMAT_MOD_X_TILED, + I915_FORMAT_MOD_Y_TILED, + I915_FORMAT_MOD_Y_TILED_CCS, + }; + + int supported_mods = 0; + + for (int i = 0; i < ARRAY_SIZE(all_modifiers); i++) { + if (!modifier_is_supported(devinfo, pfmt, all_modifiers[i])) + continue; + + if (supported_mods < max) { + if (modifiers) + modifiers[supported_mods] = all_modifiers[i]; + + if (external_only) + external_only[supported_mods] = util_format_is_yuv(pfmt); + } + + supported_mods++; + } + + *count = supported_mods; +} + +static isl_surf_usage_flags_t +pipe_bind_to_isl_usage(unsigned bindings) +{ + isl_surf_usage_flags_t usage = 0; + + if (bindings & PIPE_BIND_RENDER_TARGET) + usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT; + + if (bindings & PIPE_BIND_SAMPLER_VIEW) + usage |= ISL_SURF_USAGE_TEXTURE_BIT; + + if (bindings & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SHADER_BUFFER)) + usage |= ISL_SURF_USAGE_STORAGE_BIT; + + if (bindings & PIPE_BIND_DISPLAY_TARGET) + usage |= ISL_SURF_USAGE_DISPLAY_BIT; + + return usage; +} + +struct pipe_resource * +crocus_resource_get_separate_stencil(struct pipe_resource *p_res) +{ + /* For packed depth-stencil, we treat depth as the primary resource + * and store S8 as the "second plane" resource. + */ + if (p_res->next && p_res->next->format == PIPE_FORMAT_S8_UINT) + return p_res->next; + + return NULL; + +} + +static void +crocus_resource_set_separate_stencil(struct pipe_resource *p_res, + struct pipe_resource *stencil) +{ + assert(util_format_has_depth(util_format_description(p_res->format))); + pipe_resource_reference(&p_res->next, stencil); +} + +void +crocus_get_depth_stencil_resources(const struct intel_device_info *devinfo, + struct pipe_resource *res, + struct crocus_resource **out_z, + struct crocus_resource **out_s) +{ + if (!res) { + *out_z = NULL; + *out_s = NULL; + return; + } + + /* gen4/5 only supports packed ds */ + if (devinfo->ver < 6) { + *out_z = (void *)res; + *out_s = (void *)res; + return; + } + + if (res->format != PIPE_FORMAT_S8_UINT) { + *out_z = (void *) res; + *out_s = (void *) crocus_resource_get_separate_stencil(res); + } else { + *out_z = NULL; + *out_s = (void *) res; + } +} + +void +crocus_resource_disable_aux(struct crocus_resource *res) +{ + crocus_bo_unreference(res->aux.bo); + free(res->aux.state); + + res->aux.usage = ISL_AUX_USAGE_NONE; + res->aux.has_hiz = 0; + res->aux.surf.size_B = 0; + res->aux.surf.levels = 0; + res->aux.bo = NULL; + res->aux.extra_aux.surf.size_B = 0; + res->aux.state = NULL; +} + +static void +crocus_resource_destroy(struct pipe_screen *screen, + struct pipe_resource *resource) +{ + struct crocus_resource *res = (struct crocus_resource *)resource; + + if (resource->target == PIPE_BUFFER) + util_range_destroy(&res->valid_buffer_range); + + if (res->shadow) + pipe_resource_reference((struct pipe_resource **)&res->shadow, NULL); + crocus_resource_disable_aux(res); + + crocus_bo_unreference(res->bo); + crocus_pscreen_unref(res->orig_screen); + free(res); +} + +static struct crocus_resource * +crocus_alloc_resource(struct pipe_screen *pscreen, + const struct pipe_resource *templ) +{ + struct crocus_resource *res = calloc(1, sizeof(struct crocus_resource)); + if (!res) + return NULL; + + res->base = *templ; + res->base.screen = pscreen; + res->orig_screen = crocus_pscreen_ref(pscreen); + pipe_reference_init(&res->base.reference, 1); + + if (templ->target == PIPE_BUFFER) + util_range_init(&res->valid_buffer_range); + + return res; 
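   /* Illustrative aside, not part of this patch: valid_buffer_range starts out
    * empty here and is maintained by the code later in this file, roughly:
    *
    *    util_range_add(&res->base, &res->valid_buffer_range, x, x + width); // mapped writes
    *    util_range_set_empty(&res->valid_buffer_range);                     // invalidation
    *
    * can_promote_to_async() then tests util_ranges_intersect() against it to
    * promote writes into never-written regions to unsynchronized maps.
    */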
+} + +unsigned +crocus_get_num_logical_layers(const struct crocus_resource *res, unsigned level) +{ + if (res->surf.dim == ISL_SURF_DIM_3D) + return minify(res->surf.logical_level0_px.depth, level); + else + return res->surf.logical_level0_px.array_len; +} + +static enum isl_aux_state ** +create_aux_state_map(struct crocus_resource *res, enum isl_aux_state initial) +{ + assert(res->aux.state == NULL); + + uint32_t total_slices = 0; + for (uint32_t level = 0; level < res->surf.levels; level++) + total_slices += crocus_get_num_logical_layers(res, level); + + const size_t per_level_array_size = + res->surf.levels * sizeof(enum isl_aux_state *); + + /* We're going to allocate a single chunk of data for both the per-level + * reference array and the arrays of aux_state. This makes cleanup + * significantly easier. + */ + const size_t total_size = + per_level_array_size + total_slices * sizeof(enum isl_aux_state); + + void *data = malloc(total_size); + if (!data) + return NULL; + + enum isl_aux_state **per_level_arr = data; + enum isl_aux_state *s = data + per_level_array_size; + for (uint32_t level = 0; level < res->surf.levels; level++) { + per_level_arr[level] = s; + const unsigned level_layers = crocus_get_num_logical_layers(res, level); + for (uint32_t a = 0; a < level_layers; a++) + *(s++) = initial; + } + assert((void *)s == data + total_size); + + return per_level_arr; +} + +/** + * Configure aux for the resource, but don't allocate it. For images which + * might be shared with modifiers, we must allocate the image and aux data in + * a single bo. + * + * Returns false on unexpected error (e.g. allocation failed, or invalid + * configuration result). + */ +static bool +crocus_resource_configure_aux(struct crocus_screen *screen, + struct crocus_resource *res, bool imported, + uint64_t *aux_size_B, + uint32_t *alloc_flags) +{ + const struct intel_device_info *devinfo = &screen->devinfo; + + /* Try to create the auxiliary surfaces allowed by the modifier or by + * the user if no modifier is specified. + */ + assert(!res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE); + + const bool has_mcs = devinfo->ver >= 7 && !res->mod_info && + isl_surf_get_mcs_surf(&screen->isl_dev, &res->surf, &res->aux.surf); + + const bool has_hiz = devinfo->ver >= 6 && !res->mod_info && + !(INTEL_DEBUG & DEBUG_NO_HIZ) && + isl_surf_get_hiz_surf(&screen->isl_dev, &res->surf, &res->aux.surf); + + const bool has_ccs = + ((devinfo->ver >= 7 && !res->mod_info && !(INTEL_DEBUG & DEBUG_NO_RBC)) || + (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE)) && + isl_surf_get_ccs_surf(&screen->isl_dev, &res->surf, &res->aux.surf, + &res->aux.extra_aux.surf, 0); + + /* Having both HIZ and MCS is impossible. */ + assert(!has_mcs || !has_hiz); + + /* Ensure aux surface creation for MCS_CCS and HIZ_CCS is correct. 
*/ + if (has_ccs && (has_mcs || has_hiz)) { + assert(res->aux.extra_aux.surf.size_B > 0 && + res->aux.extra_aux.surf.usage & ISL_SURF_USAGE_CCS_BIT); + assert(res->aux.surf.size_B > 0 && + res->aux.surf.usage & + (ISL_SURF_USAGE_HIZ_BIT | ISL_SURF_USAGE_MCS_BIT)); + } + + if (res->mod_info && has_ccs) { + res->aux.usage = res->mod_info->aux_usage; + } else if (has_mcs) { + res->aux.usage = ISL_AUX_USAGE_MCS; + } else if (has_hiz) { + res->aux.usage = ISL_AUX_USAGE_HIZ; + } else if (has_ccs) { + if (isl_format_supports_ccs_d(devinfo, res->surf.format)) + res->aux.usage = ISL_AUX_USAGE_CCS_D; + } + + enum isl_aux_state initial_state = ISL_AUX_STATE_AUX_INVALID; + *aux_size_B = 0; + *alloc_flags = 0; + assert(!res->aux.bo); + + switch (res->aux.usage) { + case ISL_AUX_USAGE_NONE: + /* Having no aux buffer is only okay if there's no modifier with aux. */ + res->aux.surf.levels = 0; + return !res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE; + case ISL_AUX_USAGE_HIZ: + initial_state = ISL_AUX_STATE_AUX_INVALID; + break; + case ISL_AUX_USAGE_MCS: + /* The Ivybridge PRM, Vol 2 Part 1 p326 says: + * + * "When MCS buffer is enabled and bound to MSRT, it is required + * that it is cleared prior to any rendering." + * + * Since we only use the MCS buffer for rendering, we just clear it + * immediately on allocation. The clear value for MCS buffers is all + * 1's, so we simply memset it to 0xff. + */ + initial_state = ISL_AUX_STATE_CLEAR; + break; + case ISL_AUX_USAGE_CCS_D: + /* When CCS_E is used, we need to ensure that the CCS starts off in + * a valid state. From the Sky Lake PRM, "MCS Buffer for Render + * Target(s)": + * + * "If Software wants to enable Color Compression without Fast + * clear, Software needs to initialize MCS with zeros." + * + * A CCS value of 0 indicates that the corresponding block is in the + * pass-through state which is what we want. + * + * For CCS_D, do the same thing. On Gen9+, this avoids having any + * undefined bits in the aux buffer. + */ + if (imported) + initial_state = + isl_drm_modifier_get_default_aux_state(res->mod_info->modifier); + else + initial_state = ISL_AUX_STATE_PASS_THROUGH; + *alloc_flags |= BO_ALLOC_ZEROED; + break; + default: + unreachable("non-crocus aux"); + } + + /* Create the aux_state for the auxiliary buffer. */ + res->aux.state = create_aux_state_map(res, initial_state); + if (!res->aux.state) + return false; + + /* Increase the aux offset if the main and aux surfaces will share a BO. */ + res->aux.offset = + !res->mod_info || res->mod_info->aux_usage == res->aux.usage ? + ALIGN(res->surf.size_B, res->aux.surf.alignment_B) : 0; + uint64_t size = res->aux.surf.size_B; + + /* Allocate space in the buffer for storing the CCS. */ + if (res->aux.extra_aux.surf.size_B > 0) { + const uint64_t padded_aux_size = + ALIGN(size, res->aux.extra_aux.surf.alignment_B); + res->aux.extra_aux.offset = res->aux.offset + padded_aux_size; + size = padded_aux_size + res->aux.extra_aux.surf.size_B; + } + + /* Allocate space in the buffer for storing the clear color. On modern + * platforms (gen > 9), we can read it directly from such buffer. + * + * On gen <= 9, we are going to store the clear color on the buffer + * anyways, and copy it back to the surface state during state emission. + * + * Also add some padding to make sure the fast clear color state buffer + * starts at a 4K alignment. We believe that 256B might be enough, but due + * to lack of testing we will leave this as 4K for now. 
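 *
 * (Illustrative aside, not part of this patch, with made-up numbers: for a
 * 0x100000-byte main surface and a 0x8000-byte aux surface with 0x1000
 * alignment and no extra CCS, the surrounding code yields
 * res->aux.offset = 0x100000 and *aux_size_B = ALIGN(0x8000, 4096) = 0x8000.)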
+ */ + size = ALIGN(size, 4096); + *aux_size_B = size; + + if (isl_aux_usage_has_hiz(res->aux.usage)) { + for (unsigned level = 0; level < res->surf.levels; ++level) { + uint32_t width = u_minify(res->surf.phys_level0_sa.width, level); + uint32_t height = u_minify(res->surf.phys_level0_sa.height, level); + + /* Disable HiZ for LOD > 0 unless the width/height are 8x4 aligned. + * For LOD == 0, we can grow the dimensions to make it work. + */ + if (!devinfo->is_haswell || + (level == 0 || ((width & 7) == 0 && (height & 3) == 0))) + res->aux.has_hiz |= 1 << level; + } + } + + return true; +} + +/** + * Initialize the aux buffer contents. + * + * Returns false on unexpected error (e.g. mapping a BO failed). + */ +static bool +crocus_resource_init_aux_buf(struct crocus_resource *res, uint32_t alloc_flags) +{ + if (!(alloc_flags & BO_ALLOC_ZEROED)) { + void *map = crocus_bo_map(NULL, res->aux.bo, MAP_WRITE | MAP_RAW); + + if (!map) + return false; + + if (crocus_resource_get_aux_state(res, 0, 0) != ISL_AUX_STATE_AUX_INVALID) { + uint8_t memset_value = isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0; + memset((char*)map + res->aux.offset, memset_value, + res->aux.surf.size_B); + } + + /* Bspec section titled : MCS/CCS Buffers for Render Target(s) states: + * - If Software wants to enable Color Compression without Fast clear, + * Software needs to initialize MCS with zeros. + * - Lossless compression and CCS initialized to all F (using HW Fast + * Clear or SW direct Clear) + * + * We think, the first bullet point above is referring to CCS aux + * surface. Since we initialize the MCS in the clear state, we also + * initialize the CCS in the clear state (via SW direct clear) to keep + * the two in sync. + */ + memset((char*)map + res->aux.extra_aux.offset, + isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0, + res->aux.extra_aux.surf.size_B); + + crocus_bo_unmap(res->aux.bo); + } + + return true; +} + +/** + * Allocate the initial aux surface for a resource based on aux.usage + * + * Returns false on unexpected error (e.g. allocation failed, or invalid + * configuration result). + */ +static bool +crocus_resource_alloc_separate_aux(struct crocus_screen *screen, + struct crocus_resource *res) +{ + uint32_t alloc_flags; + uint64_t size; + if (!crocus_resource_configure_aux(screen, res, false, &size, &alloc_flags)) + return false; + + if (size == 0) + return true; + + /* Allocate the auxiliary buffer. ISL has stricter set of alignment rules + * the drm allocator. Therefore, one can pass the ISL dimensions in terms + * of bytes instead of trying to recalculate based on different format + * block sizes. 
+ */ + res->aux.bo = crocus_bo_alloc_tiled(screen->bufmgr, "aux buffer", size, 4096, + isl_tiling_to_i915_tiling(res->aux.surf.tiling), + res->aux.surf.row_pitch_B, alloc_flags); + if (!res->aux.bo) { + return false; + } + + if (!crocus_resource_init_aux_buf(res, alloc_flags)) + return false; + + return true; +} + +void +crocus_resource_finish_aux_import(struct pipe_screen *pscreen, + struct crocus_resource *res) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + assert(crocus_resource_unfinished_aux_import(res)); + assert(!res->mod_info->supports_clear_color); + + struct crocus_resource *aux_res = (void *) res->base.next; + assert(aux_res->aux.surf.row_pitch_B && aux_res->aux.offset && + aux_res->aux.bo); + + assert(res->bo == aux_res->aux.bo); + crocus_bo_reference(aux_res->aux.bo); + res->aux.bo = aux_res->aux.bo; + + res->aux.offset = aux_res->aux.offset; + + assert(res->bo->size >= (res->aux.offset + res->aux.surf.size_B)); + assert(aux_res->aux.surf.row_pitch_B == res->aux.surf.row_pitch_B); + + crocus_resource_destroy(&screen->base, res->base.next); + res->base.next = NULL; +} + +static struct pipe_resource * +crocus_resource_create_for_buffer(struct pipe_screen *pscreen, + const struct pipe_resource *templ) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + struct crocus_resource *res = crocus_alloc_resource(pscreen, templ); + + assert(templ->target == PIPE_BUFFER); + assert(templ->height0 <= 1); + assert(templ->depth0 <= 1); + assert(templ->format == PIPE_FORMAT_NONE || + util_format_get_blocksize(templ->format) == 1); + + res->internal_format = templ->format; + res->surf.tiling = ISL_TILING_LINEAR; + + const char *name = templ->target == PIPE_BUFFER ? "buffer" : "miptree"; + + res->bo = crocus_bo_alloc(screen->bufmgr, name, templ->width0); + if (!res->bo) { + crocus_resource_destroy(pscreen, &res->base); + return NULL; + } + + return &res->base; +} + +static struct pipe_resource * +crocus_resource_create_with_modifiers(struct pipe_screen *pscreen, + const struct pipe_resource *templ, + const uint64_t *modifiers, + int modifiers_count) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_resource *res = crocus_alloc_resource(pscreen, templ); + + if (!res) + return NULL; + + const struct util_format_description *format_desc = + util_format_description(templ->format); + const bool has_depth = util_format_has_depth(format_desc); + uint64_t modifier = + select_best_modifier(devinfo, templ->format, modifiers, modifiers_count); + + isl_tiling_flags_t tiling_flags = ISL_TILING_ANY_MASK; + + /* TODO: This used to be because there wasn't BLORP to handle Y-tiling. 
*/ + if (devinfo->ver < 6 && !util_format_is_depth_or_stencil(templ->format)) + tiling_flags &= ~ISL_TILING_Y0_BIT; + + if (modifier != DRM_FORMAT_MOD_INVALID) { + res->mod_info = isl_drm_modifier_get_info(modifier); + + tiling_flags = 1 << res->mod_info->tiling; + } else { + if (modifiers_count > 0) { + fprintf(stderr, "Unsupported modifier, resource creation failed.\n"); + goto fail; + } + + if (templ->bind & PIPE_BIND_RENDER_TARGET && devinfo->ver < 6) { + modifier = I915_FORMAT_MOD_X_TILED; + res->mod_info = isl_drm_modifier_get_info(modifier); + tiling_flags = 1 << res->mod_info->tiling; + } + /* Use linear for staging buffers */ + if (templ->usage == PIPE_USAGE_STAGING || + templ->bind & (PIPE_BIND_LINEAR | PIPE_BIND_CURSOR) ) + tiling_flags = ISL_TILING_LINEAR_BIT; + } + + isl_surf_usage_flags_t usage = pipe_bind_to_isl_usage(templ->bind); + + if (templ->target == PIPE_TEXTURE_CUBE || + templ->target == PIPE_TEXTURE_CUBE_ARRAY) + usage |= ISL_SURF_USAGE_CUBE_BIT; + + if (templ->usage != PIPE_USAGE_STAGING) { + if (templ->format == PIPE_FORMAT_S8_UINT) + usage |= ISL_SURF_USAGE_STENCIL_BIT; + else if (has_depth) { + /* combined DS only on gen4/5 */ + if (devinfo->ver < 6) { + if (templ->format == PIPE_FORMAT_Z24X8_UNORM || + templ->format == PIPE_FORMAT_Z24_UNORM_S8_UINT || + templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + usage |= ISL_SURF_USAGE_STENCIL_BIT; + } + usage |= ISL_SURF_USAGE_DEPTH_BIT; + } + + if (templ->format == PIPE_FORMAT_S8_UINT) + tiling_flags = ISL_TILING_W_BIT; + } + + if (templ->usage == PIPE_USAGE_STAGING && + templ->bind == PIPE_BIND_DEPTH_STENCIL && + devinfo->ver < 6) + return NULL; + + enum pipe_format pfmt = templ->format; + res->internal_format = pfmt; + + /* Should be handled by u_transfer_helper */ +// assert(!util_format_is_depth_and_stencil(pfmt)); + + struct crocus_format_info fmt = crocus_format_for_usage(devinfo, pfmt, usage); + assert(fmt.fmt != ISL_FORMAT_UNSUPPORTED); + enum isl_surf_dim dim = crocus_target_to_isl_surf_dim(templ->target); + + UNUSED const bool isl_surf_created_successfully = + isl_surf_init(&screen->isl_dev, &res->surf, + .dim = dim, + .format = fmt.fmt, + .width = templ->width0, + .height = templ->height0, + .depth = templ->depth0, + .levels = templ->last_level + 1, + .array_len = templ->array_size, + .samples = MAX2(templ->nr_samples, 1), + .min_alignment_B = 0, + .row_pitch_B = 0, + .usage = usage, + .tiling_flags = tiling_flags); + assert(isl_surf_created_successfully); + + const char *name = "miptree"; + + unsigned int flags = 0; + if (templ->usage == PIPE_USAGE_STAGING) + flags |= BO_ALLOC_COHERENT; + + uint64_t aux_size = 0; + uint32_t aux_preferred_alloc_flags; + + if (!crocus_resource_configure_aux(screen, res, false, &aux_size, + &aux_preferred_alloc_flags)) { + goto fail; + } + + /* Modifiers require the aux data to be in the same buffer as the main + * surface, but we combine them even when a modifiers is not being used. 
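 *
 * (Illustrative aside, not part of this patch: with the made-up numbers from
 * the configure-aux note above, bo_size below becomes
 * MAX2(0x100000, 0x100000 + 0x8000) = 0x108000, i.e. one BO holding the main
 * surface followed by its aux data.)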
+ */ + const uint64_t bo_size = + MAX2(res->surf.size_B, res->aux.offset + aux_size); + uint32_t alignment = MAX2(4096, res->surf.alignment_B); + res->bo = crocus_bo_alloc_tiled(screen->bufmgr, name, bo_size, alignment, + isl_tiling_to_i915_tiling(res->surf.tiling), + res->surf.row_pitch_B, flags); + + if (!res->bo) + goto fail; + + if (aux_size > 0) { + res->aux.bo = res->bo; + crocus_bo_reference(res->aux.bo); + if (!crocus_resource_init_aux_buf(res, flags)) + goto fail; + } + + if (templ->format == PIPE_FORMAT_S8_UINT && !(templ->usage == PIPE_USAGE_STAGING) && + devinfo->ver == 7 && (templ->bind & PIPE_BIND_SAMPLER_VIEW)) { + struct pipe_resource templ_shadow = (struct pipe_resource) { + .usage = 0, + .bind = PIPE_BIND_SAMPLER_VIEW, + .width0 = res->base.width0, + .height0 = res->base.height0, + .depth0 = res->base.depth0, + .last_level = res->base.last_level, + .nr_samples = res->base.nr_samples, + .nr_storage_samples = res->base.nr_storage_samples, + .array_size = res->base.array_size, + .format = PIPE_FORMAT_R8_UINT, + .target = res->base.target, + }; + res->shadow = (struct crocus_resource *)screen->base.resource_create(&screen->base, &templ_shadow); + assert(res->shadow); + } + + return &res->base; + +fail: + fprintf(stderr, "XXX: resource creation failed\n"); + crocus_resource_destroy(pscreen, &res->base); + return NULL; + +} + +static struct pipe_resource * +crocus_resource_create(struct pipe_screen *pscreen, + const struct pipe_resource *templ) +{ + if (templ->target == PIPE_BUFFER) + return crocus_resource_create_for_buffer(pscreen, templ); + else + return crocus_resource_create_with_modifiers(pscreen, templ, NULL, 0); +} + +static uint64_t +tiling_to_modifier(uint32_t tiling) +{ + static const uint64_t map[] = { + [I915_TILING_NONE] = DRM_FORMAT_MOD_LINEAR, + [I915_TILING_X] = I915_FORMAT_MOD_X_TILED, + [I915_TILING_Y] = I915_FORMAT_MOD_Y_TILED, + }; + + assert(tiling < ARRAY_SIZE(map)); + + return map[tiling]; +} + +static struct pipe_resource * +crocus_resource_from_user_memory(struct pipe_screen *pscreen, + const struct pipe_resource *templ, + void *user_memory) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + struct crocus_resource *res = crocus_alloc_resource(pscreen, templ); + if (!res) + return NULL; + + assert(templ->target == PIPE_BUFFER); + + res->internal_format = templ->format; + res->bo = crocus_bo_create_userptr(bufmgr, "user", + user_memory, templ->width0); + if (!res->bo) { + free(res); + return NULL; + } + + util_range_add(&res->base, &res->valid_buffer_range, 0, templ->width0); + + return &res->base; +} + +static struct pipe_resource * +crocus_resource_from_handle(struct pipe_screen *pscreen, + const struct pipe_resource *templ, + struct winsys_handle *whandle, + unsigned usage) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + struct crocus_resource *res = crocus_alloc_resource(pscreen, templ); + const struct isl_drm_modifier_info *mod_inf = + isl_drm_modifier_get_info(whandle->modifier); + uint32_t tiling; + + if (!res) + return NULL; + + switch (whandle->type) { + case WINSYS_HANDLE_TYPE_FD: + if (mod_inf) + tiling = isl_tiling_to_i915_tiling(mod_inf->tiling); + else + tiling = I915_TILING_LAST + 1; + res->bo = crocus_bo_import_dmabuf(bufmgr, whandle->handle, + tiling, whandle->stride); + break; + case WINSYS_HANDLE_TYPE_SHARED: + res->bo = 
crocus_bo_gem_create_from_name(bufmgr, "winsys image", + whandle->handle); + break; + default: + unreachable("invalid winsys handle type"); + } + if (!res->bo) + return NULL; + + res->offset = whandle->offset; + + if (mod_inf == NULL) { + mod_inf = + isl_drm_modifier_get_info(tiling_to_modifier(res->bo->tiling_mode)); + } + assert(mod_inf); + + res->external_format = whandle->format; + res->mod_info = mod_inf; + + isl_surf_usage_flags_t isl_usage = pipe_bind_to_isl_usage(templ->bind); + + const struct crocus_format_info fmt = + crocus_format_for_usage(devinfo, templ->format, isl_usage); + res->internal_format = templ->format; + + if (templ->target == PIPE_BUFFER) { + res->surf.tiling = ISL_TILING_LINEAR; + } else { + if (whandle->plane < util_format_get_num_planes(whandle->format)) { + UNUSED const bool isl_surf_created_successfully = + isl_surf_init(&screen->isl_dev, &res->surf, + .dim = crocus_target_to_isl_surf_dim(templ->target), + .format = fmt.fmt, + .width = templ->width0, + .height = templ->height0, + .depth = templ->depth0, + .levels = templ->last_level + 1, + .array_len = templ->array_size, + .samples = MAX2(templ->nr_samples, 1), + .min_alignment_B = 0, + .row_pitch_B = whandle->stride, + .usage = isl_usage, + .tiling_flags = 1 << res->mod_info->tiling); + assert(isl_surf_created_successfully); + assert(res->bo->tiling_mode == + isl_tiling_to_i915_tiling(res->surf.tiling)); + + // XXX: create_ccs_buf_for_image? + if (whandle->modifier == DRM_FORMAT_MOD_INVALID) { + if (!crocus_resource_alloc_separate_aux(screen, res)) + goto fail; + } else { + if (res->mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + uint32_t alloc_flags; + uint64_t size; + UNUSED bool ok = crocus_resource_configure_aux(screen, res, true, &size, + &alloc_flags); + assert(ok); + /* The gallium dri layer will create a separate plane resource + * for the aux image. crocus_resource_finish_aux_import will + * merge the separate aux parameters back into a single + * crocus_resource. + */ + } + } + } else { + /* Save modifier import information to reconstruct later. After + * import, this will be available under a second image accessible + * from the main image with res->base.next. See + * crocus_resource_finish_aux_import. + */ + res->aux.surf.row_pitch_B = whandle->stride; + res->aux.offset = whandle->offset; + res->aux.bo = res->bo; + res->bo = NULL; + } + } + + return &res->base; + +fail: + crocus_resource_destroy(pscreen, &res->base); + return NULL; +} + +static void +crocus_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_resource *res = (void *) resource; + const struct isl_drm_modifier_info *mod = res->mod_info; + + crocus_resource_prepare_access(ice, res, + 0, INTEL_REMAINING_LEVELS, + 0, INTEL_REMAINING_LAYERS, + mod ? mod->aux_usage : ISL_AUX_USAGE_NONE, + mod ? mod->supports_clear_color : false); +} + +static void +crocus_resource_disable_aux_on_first_query(struct pipe_resource *resource, + unsigned usage) +{ + struct crocus_resource *res = (struct crocus_resource *)resource; + bool mod_with_aux = + res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE; + + /* Disable aux usage if explicit flush not set and this is the first time + * we are dealing with this resource and the resource was not created with + * a modifier with aux. 
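 *
 * (Illustrative aside, not part of this patch: concretely, the first
 * resource_get_handle()/resource_get_param() call on a resource with a
 * reference count of 1, no PIPE_HANDLE_USAGE_EXPLICIT_FLUSH and no
 * aux-carrying modifier drops its CCS/MCS/HiZ data via
 * crocus_resource_disable_aux(), so an importer never sees unresolved
 * auxiliary state.)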
+ */ + if (!mod_with_aux && + (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && res->aux.usage != 0) && + p_atomic_read(&resource->reference.count) == 1) { + crocus_resource_disable_aux(res); + } +} + +static bool +crocus_resource_get_param(struct pipe_screen *pscreen, + struct pipe_context *context, + struct pipe_resource *resource, + unsigned plane, + unsigned layer, + unsigned level, + enum pipe_resource_param param, + unsigned handle_usage, + uint64_t *value) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + struct crocus_resource *res = (struct crocus_resource *)resource; + bool mod_with_aux = + res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE; + bool wants_aux = mod_with_aux && plane > 0; + bool result; + unsigned handle; + + if (crocus_resource_unfinished_aux_import(res)) + crocus_resource_finish_aux_import(pscreen, res); + + struct crocus_bo *bo = wants_aux ? res->aux.bo : res->bo; + + crocus_resource_disable_aux_on_first_query(resource, handle_usage); + + switch (param) { + case PIPE_RESOURCE_PARAM_NPLANES: + if (mod_with_aux) { + *value = util_format_get_num_planes(res->external_format); + } else { + unsigned count = 0; + for (struct pipe_resource *cur = resource; cur; cur = cur->next) + count++; + *value = count; + } + return true; + case PIPE_RESOURCE_PARAM_STRIDE: + *value = wants_aux ? res->aux.surf.row_pitch_B : res->surf.row_pitch_B; + return true; + case PIPE_RESOURCE_PARAM_OFFSET: + *value = wants_aux ? res->aux.offset : 0; + return true; + case PIPE_RESOURCE_PARAM_MODIFIER: + *value = res->mod_info ? res->mod_info->modifier : + tiling_to_modifier(res->bo->tiling_mode); + return true; + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED: + result = crocus_bo_flink(bo, &handle) == 0; + if (result) + *value = handle; + return result; + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: { + /* Because we share the same drm file across multiple crocus_screen, when + * we export a GEM handle we must make sure it is valid in the DRM file + * descriptor the caller is using (this is the FD given at screen + * creation). + */ + uint32_t handle; + if (crocus_bo_export_gem_handle_for_device(bo, screen->winsys_fd, &handle)) + return false; + *value = handle; + return true; + } + case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD: + result = crocus_bo_export_dmabuf(bo, (int *) &handle) == 0; + if (result) + *value = handle; + return result; + default: + return false; + } +} + +static bool +crocus_resource_get_handle(struct pipe_screen *pscreen, + struct pipe_context *ctx, + struct pipe_resource *resource, + struct winsys_handle *whandle, + unsigned usage) +{ + struct crocus_screen *screen = (struct crocus_screen *) pscreen; + struct crocus_resource *res = (struct crocus_resource *)resource; + bool mod_with_aux = + res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE; + + crocus_resource_disable_aux_on_first_query(resource, usage); + + struct crocus_bo *bo; + if (mod_with_aux && whandle->plane > 0) { + assert(res->aux.bo); + bo = res->aux.bo; + whandle->stride = res->aux.surf.row_pitch_B; + whandle->offset = res->aux.offset; + } else { + /* If this is a buffer, stride should be 0 - no need to special case */ + whandle->stride = res->surf.row_pitch_B; + bo = res->bo; + } + whandle->format = res->external_format; + whandle->modifier = + res->mod_info ? res->mod_info->modifier + : tiling_to_modifier(res->bo->tiling_mode); + +#ifndef NDEBUG + enum isl_aux_usage allowed_usage = + res->mod_info ? 
res->mod_info->aux_usage : ISL_AUX_USAGE_NONE; + + if (res->aux.usage != allowed_usage) { + enum isl_aux_state aux_state = crocus_resource_get_aux_state(res, 0, 0); + assert(aux_state == ISL_AUX_STATE_RESOLVED || + aux_state == ISL_AUX_STATE_PASS_THROUGH); + } +#endif + + switch (whandle->type) { + case WINSYS_HANDLE_TYPE_SHARED: + return crocus_bo_flink(bo, &whandle->handle) == 0; + case WINSYS_HANDLE_TYPE_KMS: { + /* Because we share the same drm file across multiple crocus_screen, when + * we export a GEM handle we must make sure it is valid in the DRM file + * descriptor the caller is using (this is the FD given at screen + * creation). + */ + uint32_t handle; + if (crocus_bo_export_gem_handle_for_device(bo, screen->winsys_fd, &handle)) + return false; + whandle->handle = handle; + return true; + } + case WINSYS_HANDLE_TYPE_FD: + return crocus_bo_export_dmabuf(bo, (int *) &whandle->handle) == 0; + } + + return false; +} + +static bool +resource_is_busy(struct crocus_context *ice, + struct crocus_resource *res) +{ + bool busy = crocus_bo_busy(res->bo); + + for (int i = 0; i < ice->batch_count; i++) + busy |= crocus_batch_references(&ice->batches[i], res->bo); + + return busy; +} + +static void +crocus_invalidate_resource(struct pipe_context *ctx, + struct pipe_resource *resource) +{ + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_context *ice = (void *) ctx; + struct crocus_resource *res = (void *) resource; + + if (resource->target != PIPE_BUFFER) + return; + + if (!resource_is_busy(ice, res)) { + /* The resource is idle, so just mark that it contains no data and + * keep using the same underlying buffer object. + */ + util_range_set_empty(&res->valid_buffer_range); + return; + } + + /* Otherwise, try and replace the backing storage with a new BO. */ + + /* We can't reallocate memory we didn't allocate in the first place. */ + if (res->bo->userptr) + return; + + // XXX: We should support this. + if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) + return; + + struct crocus_bo *old_bo = res->bo; + struct crocus_bo *new_bo = + crocus_bo_alloc(screen->bufmgr, res->bo->name, resource->width0); + + if (!new_bo) + return; + + /* Swap out the backing storage */ + res->bo = new_bo; + + /* Rebind the buffer, replacing any state referring to the old BO's + * address, and marking state dirty so it's reemitted. 
+ */ + screen->vtbl.rebind_buffer(ice, res); + + util_range_set_empty(&res->valid_buffer_range); + + crocus_bo_unreference(old_bo); +} + +static void +crocus_flush_staging_region(struct pipe_transfer *xfer, + const struct pipe_box *flush_box) +{ + if (!(xfer->usage & PIPE_MAP_WRITE)) + return; + + struct crocus_transfer *map = (void *) xfer; + + struct pipe_box src_box = *flush_box; + + /* Account for extra alignment padding in staging buffer */ + if (xfer->resource->target == PIPE_BUFFER) + src_box.x += xfer->box.x % CROCUS_MAP_BUFFER_ALIGNMENT; + + struct pipe_box dst_box = (struct pipe_box) { + .x = xfer->box.x + flush_box->x, + .y = xfer->box.y + flush_box->y, + .z = xfer->box.z + flush_box->z, + .width = flush_box->width, + .height = flush_box->height, + .depth = flush_box->depth, + }; + + crocus_copy_region(map->blorp, map->batch, xfer->resource, xfer->level, + dst_box.x, dst_box.y, dst_box.z, map->staging, 0, + &src_box); +} + +static void +crocus_unmap_copy_region(struct crocus_transfer *map) +{ + crocus_resource_destroy(map->staging->screen, map->staging); + + map->ptr = NULL; +} + +static void +crocus_map_copy_region(struct crocus_transfer *map) +{ + struct pipe_screen *pscreen = &map->batch->screen->base; + struct pipe_transfer *xfer = &map->base; + struct pipe_box *box = &xfer->box; + struct crocus_resource *res = (void *) xfer->resource; + + unsigned extra = xfer->resource->target == PIPE_BUFFER ? + box->x % CROCUS_MAP_BUFFER_ALIGNMENT : 0; + + struct pipe_resource templ = (struct pipe_resource) { + .usage = PIPE_USAGE_STAGING, + .width0 = box->width + extra, + .height0 = box->height, + .depth0 = 1, + .nr_samples = xfer->resource->nr_samples, + .nr_storage_samples = xfer->resource->nr_storage_samples, + .array_size = box->depth, + .format = res->internal_format, + }; + + if (xfer->resource->target == PIPE_BUFFER) + templ.target = PIPE_BUFFER; + else if (templ.array_size > 1) + templ.target = PIPE_TEXTURE_2D_ARRAY; + else + templ.target = PIPE_TEXTURE_2D; + + map->staging = crocus_resource_create(pscreen, &templ); + assert(map->staging); + + if (templ.target != PIPE_BUFFER) { + struct isl_surf *surf = &((struct crocus_resource *) map->staging)->surf; + xfer->stride = isl_surf_get_row_pitch_B(surf); + xfer->layer_stride = isl_surf_get_array_pitch(surf); + } + + if (!(xfer->usage & PIPE_MAP_DISCARD_RANGE)) { + crocus_copy_region(map->blorp, map->batch, map->staging, 0, extra, 0, 0, + xfer->resource, xfer->level, box); + /* Ensure writes to the staging BO land before we map it below. 
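 *
 * (Illustrative aside, not part of this patch: the staging copy above goes
 * through crocus_copy_region()/BLORP, which presumably writes via the render
 * target cache, hence the RENDER_TARGET_FLUSH + CS_STALL pair emitted below
 * before the CPU mapping.)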
*/ + crocus_emit_pipe_control_flush(map->batch, + "transfer read: flush before mapping", + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_CS_STALL); + } + + struct crocus_bo *staging_bo = crocus_resource_bo(map->staging); + + if (crocus_batch_references(map->batch, staging_bo)) + crocus_batch_flush(map->batch); + + map->ptr = + crocus_bo_map(map->dbg, staging_bo, xfer->usage & MAP_FLAGS) + extra; + + map->unmap = crocus_unmap_copy_region; +} + +static void +get_image_offset_el(const struct isl_surf *surf, unsigned level, unsigned z, + unsigned *out_x0_el, unsigned *out_y0_el) +{ + ASSERTED uint32_t z0_el, a0_el; + if (surf->dim == ISL_SURF_DIM_3D) { + isl_surf_get_image_offset_el(surf, level, 0, z, + out_x0_el, out_y0_el, &z0_el, &a0_el); + } else { + isl_surf_get_image_offset_el(surf, level, z, 0, + out_x0_el, out_y0_el, &z0_el, &a0_el); + } + assert(z0_el == 0 && a0_el == 0); +} + +void +crocus_resource_get_image_offset(struct crocus_resource *res, + uint32_t level, uint32_t z, + uint32_t *x, uint32_t *y) +{ + get_image_offset_el(&res->surf, level, z, x, y); +} + +/** + * Get pointer offset into stencil buffer. + * + * The stencil buffer is W tiled. Since the GTT is incapable of W fencing, we + * must decode the tile's layout in software. + * + * See + * - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.2.1 W-Major Tile + * Format. + * - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.3 Tiling Algorithm + * + * Even though the returned offset is always positive, the return type is + * signed due to + * commit e8b1c6d6f55f5be3bef25084fdd8b6127517e137 + * mesa: Fix return type of _mesa_get_format_bytes() (#37351) + */ +static intptr_t +s8_offset(uint32_t stride, uint32_t x, uint32_t y, bool swizzled) +{ + uint32_t tile_size = 4096; + uint32_t tile_width = 64; + uint32_t tile_height = 64; + uint32_t row_size = 64 * stride / 2; /* Two rows are interleaved. */ + + uint32_t tile_x = x / tile_width; + uint32_t tile_y = y / tile_height; + + /* The byte's address relative to the tile's base addres. 
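 *
 * (Illustrative worked example, not part of this patch: s8_offset(stride, 17,
 * 5, false) stays in tile (0, 0), so the sum below is
 * 512 * (17 / 8) + 32 * ((5 / 4) % 2) + 2 * (5 % 2) + 1 * (17 % 2)
 * = 1024 + 32 + 2 + 1 = 1059; every other term is zero and no bit-6 swizzle
 * correction applies since (17 / 8) % 2 == 0.)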
*/ + uint32_t byte_x = x % tile_width; + uint32_t byte_y = y % tile_height; + + uintptr_t u = tile_y * row_size + + tile_x * tile_size + + 512 * (byte_x / 8) + + 64 * (byte_y / 8) + + 32 * ((byte_y / 4) % 2) + + 16 * ((byte_x / 4) % 2) + + 8 * ((byte_y / 2) % 2) + + 4 * ((byte_x / 2) % 2) + + 2 * (byte_y % 2) + + 1 * (byte_x % 2); + + if (swizzled) { + /* adjust for bit6 swizzling */ + if (((byte_x / 8) % 2) == 1) { + if (((byte_y / 8) % 2) == 0) { + u += 64; + } else { + u -= 64; + } + } + } + + return u; +} + +static void +crocus_unmap_s8(struct crocus_transfer *map) +{ + struct pipe_transfer *xfer = &map->base; + const struct pipe_box *box = &xfer->box; + struct crocus_resource *res = (struct crocus_resource *) xfer->resource; + struct isl_surf *surf = &res->surf; + + if (xfer->usage & PIPE_MAP_WRITE) { + uint8_t *untiled_s8_map = map->ptr; + uint8_t *tiled_s8_map = + crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS); + + for (int s = 0; s < box->depth; s++) { + unsigned x0_el, y0_el; + get_image_offset_el(surf, xfer->level, box->z + s, &x0_el, &y0_el); + + for (uint32_t y = 0; y < box->height; y++) { + for (uint32_t x = 0; x < box->width; x++) { + ptrdiff_t offset = s8_offset(surf->row_pitch_B, + x0_el + box->x + x, + y0_el + box->y + y, + map->has_swizzling); + tiled_s8_map[offset] = + untiled_s8_map[s * xfer->layer_stride + y * xfer->stride + x]; + } + } + } + } + + free(map->buffer); +} + +static void +crocus_map_s8(struct crocus_transfer *map) +{ + struct pipe_transfer *xfer = &map->base; + const struct pipe_box *box = &xfer->box; + struct crocus_resource *res = (struct crocus_resource *) xfer->resource; + struct isl_surf *surf = &res->surf; + + xfer->stride = surf->row_pitch_B; + xfer->layer_stride = xfer->stride * box->height; + + /* The tiling and detiling functions require that the linear buffer has + * a 16-byte alignment (that is, its `x0` is 16-byte aligned). Here we + * over-allocate the linear buffer to get the proper alignment. + */ + map->buffer = map->ptr = malloc(xfer->layer_stride * box->depth); + assert(map->buffer); + + /* One of either READ_BIT or WRITE_BIT or both is set. READ_BIT implies no + * INVALIDATE_RANGE_BIT. WRITE_BIT needs the original values read in unless + * invalidate is set, since we'll be writing the whole rectangle from our + * temporary buffer back out. + */ + if (!(xfer->usage & PIPE_MAP_DISCARD_RANGE)) { + uint8_t *untiled_s8_map = map->ptr; + uint8_t *tiled_s8_map = + crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS); + + for (int s = 0; s < box->depth; s++) { + unsigned x0_el, y0_el; + get_image_offset_el(surf, xfer->level, box->z + s, &x0_el, &y0_el); + + for (uint32_t y = 0; y < box->height; y++) { + for (uint32_t x = 0; x < box->width; x++) { + ptrdiff_t offset = s8_offset(surf->row_pitch_B, + x0_el + box->x + x, + y0_el + box->y + y, + map->has_swizzling); + untiled_s8_map[s * xfer->layer_stride + y * xfer->stride + x] = + tiled_s8_map[offset]; + } + } + } + } + + map->unmap = crocus_unmap_s8; +} + +/* Compute extent parameters for use with tiled_memcpy functions. + * xs are in units of bytes and ys are in units of strides. 
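 *
 * (Illustrative worked example, not part of this patch: for a BC1 surface,
 * whose 4x4 blocks are 64 bits each so cpp == 8, a box at (8, 8) of size
 * 16x16 with no intra-image offset gives *x1_B = 16, *y1_el = 2, *x2_B = 48
 * and *y2_el = 6.)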
+ */ +static inline void +tile_extents(const struct isl_surf *surf, + const struct pipe_box *box, + unsigned level, int z, + unsigned *x1_B, unsigned *x2_B, + unsigned *y1_el, unsigned *y2_el) +{ + const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format); + const unsigned cpp = fmtl->bpb / 8; + + assert(box->x % fmtl->bw == 0); + assert(box->y % fmtl->bh == 0); + + unsigned x0_el, y0_el; + get_image_offset_el(surf, level, box->z + z, &x0_el, &y0_el); + + *x1_B = (box->x / fmtl->bw + x0_el) * cpp; + *y1_el = box->y / fmtl->bh + y0_el; + *x2_B = (DIV_ROUND_UP(box->x + box->width, fmtl->bw) + x0_el) * cpp; + *y2_el = DIV_ROUND_UP(box->y + box->height, fmtl->bh) + y0_el; +} + +static void +crocus_unmap_tiled_memcpy(struct crocus_transfer *map) +{ + struct pipe_transfer *xfer = &map->base; + const struct pipe_box *box = &xfer->box; + struct crocus_resource *res = (struct crocus_resource *) xfer->resource; + struct isl_surf *surf = &res->surf; + + if (xfer->usage & PIPE_MAP_WRITE) { + char *dst = + crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS); + + for (int s = 0; s < box->depth; s++) { + unsigned x1, x2, y1, y2; + tile_extents(surf, box, xfer->level, s, &x1, &x2, &y1, &y2); + + void *ptr = map->ptr + s * xfer->layer_stride; + + isl_memcpy_linear_to_tiled(x1, x2, y1, y2, dst, ptr, + surf->row_pitch_B, xfer->stride, + map->has_swizzling, + surf->tiling, ISL_MEMCPY); + } + } + os_free_aligned(map->buffer); + map->buffer = map->ptr = NULL; +} + +static void +crocus_map_tiled_memcpy(struct crocus_transfer *map) +{ + struct pipe_transfer *xfer = &map->base; + const struct pipe_box *box = &xfer->box; + struct crocus_resource *res = (struct crocus_resource *) xfer->resource; + struct isl_surf *surf = &res->surf; + + xfer->stride = ALIGN(surf->row_pitch_B, 16); + xfer->layer_stride = xfer->stride * box->height; + + unsigned x1, x2, y1, y2; + tile_extents(surf, box, xfer->level, 0, &x1, &x2, &y1, &y2); + + /* The tiling and detiling functions require that the linear buffer has + * a 16-byte alignment (that is, its `x0` is 16-byte aligned). Here we + * over-allocate the linear buffer to get the proper alignment. + */ + map->buffer = + os_malloc_aligned(xfer->layer_stride * box->depth, 16); + assert(map->buffer); + map->ptr = (char *)map->buffer + (x1 & 0xf); + + if (!(xfer->usage & PIPE_MAP_DISCARD_RANGE)) { + char *src = + crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS); + + for (int s = 0; s < box->depth; s++) { + unsigned x1, x2, y1, y2; + tile_extents(surf, box, xfer->level, s, &x1, &x2, &y1, &y2); + + /* Use 's' rather than 'box->z' to rebase the first slice to 0. */ + void *ptr = map->ptr + s * xfer->layer_stride; + + isl_memcpy_tiled_to_linear(x1, x2, y1, y2, ptr, src, xfer->stride, + surf->row_pitch_B, + map->has_swizzling, + surf->tiling, +#if defined(USE_SSE41) + util_get_cpu_caps()->has_sse4_1 ? 
ISL_MEMCPY_STREAMING_LOAD : +#endif + ISL_MEMCPY); + } + } + + map->unmap = crocus_unmap_tiled_memcpy; +} + +static void +crocus_map_direct(struct crocus_transfer *map) +{ + struct pipe_transfer *xfer = &map->base; + struct pipe_box *box = &xfer->box; + struct crocus_resource *res = (struct crocus_resource *) xfer->resource; + + void *ptr = crocus_bo_map(map->dbg, res->bo, xfer->usage & MAP_FLAGS); + + if (res->base.target == PIPE_BUFFER) { + xfer->stride = 0; + xfer->layer_stride = 0; + + map->ptr = ptr + box->x; + } else { + struct isl_surf *surf = &res->surf; + const struct isl_format_layout *fmtl = + isl_format_get_layout(surf->format); + const unsigned cpp = fmtl->bpb / 8; + unsigned x0_el, y0_el; + + get_image_offset_el(surf, xfer->level, box->z, &x0_el, &y0_el); + + xfer->stride = isl_surf_get_row_pitch_B(surf); + xfer->layer_stride = isl_surf_get_array_pitch(surf); + + map->ptr = ptr + (y0_el + box->y) * xfer->stride + (x0_el + box->x) * cpp; + } +} + +static bool +can_promote_to_async(const struct crocus_resource *res, + const struct pipe_box *box, + unsigned usage) +{ + /* If we're writing to a section of the buffer that hasn't even been + * initialized with useful data, then we can safely promote this write + * to be unsynchronized. This helps the common pattern of appending data. + */ + return res->base.target == PIPE_BUFFER && (usage & PIPE_MAP_WRITE) && + !(usage & TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED) && + !util_ranges_intersect(&res->valid_buffer_range, box->x, + box->x + box->width); +} + +static void * +crocus_transfer_map(struct pipe_context *ctx, + struct pipe_resource *resource, + unsigned level, + unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **ptransfer) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_resource *res = (struct crocus_resource *)resource; + struct isl_surf *surf = &res->surf; + + if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) { + /* Replace the backing storage with a fresh buffer for non-async maps */ + if (!(usage & (PIPE_MAP_UNSYNCHRONIZED | + TC_TRANSFER_MAP_NO_INVALIDATE))) + crocus_invalidate_resource(ctx, resource); + + /* If we can discard the whole resource, we can discard the range. 
*/ + usage |= PIPE_MAP_DISCARD_RANGE; + } + + if (!(usage & PIPE_MAP_UNSYNCHRONIZED) && + can_promote_to_async(res, box, usage)) { + usage |= PIPE_MAP_UNSYNCHRONIZED; + } + + bool map_would_stall = false; + + if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) { + map_would_stall = resource_is_busy(ice, res) || + crocus_has_invalid_primary(res, level, 1, box->z, box->depth); + + + if (map_would_stall && (usage & PIPE_MAP_DONTBLOCK) && + (usage & PIPE_MAP_DIRECTLY)) + return NULL; + } + + if (surf->tiling != ISL_TILING_LINEAR && + (usage & PIPE_MAP_DIRECTLY)) + return NULL; + + struct crocus_transfer *map = slab_alloc(&ice->transfer_pool); + struct pipe_transfer *xfer = &map->base; + + if (!map) + return NULL; + + memset(map, 0, sizeof(*map)); + map->dbg = &ice->dbg; + + map->has_swizzling = ((struct crocus_screen *)ctx->screen)->has_swizzling; + pipe_resource_reference(&xfer->resource, resource); + xfer->level = level; + xfer->usage = usage; + xfer->box = *box; + *ptransfer = xfer; + + map->dest_had_defined_contents = + util_ranges_intersect(&res->valid_buffer_range, box->x, + box->x + box->width); + + if (usage & PIPE_MAP_WRITE) + util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width); + + /* Avoid using GPU copies for persistent/coherent buffers, as the idea + * there is to access them simultaneously on the CPU & GPU. This also + * avoids trying to use GPU copies for our u_upload_mgr buffers which + * contain state we're constructing for a GPU draw call, which would + * kill us with infinite stack recursion. + */ + bool no_gpu = usage & (PIPE_MAP_PERSISTENT | + PIPE_MAP_COHERENT | + PIPE_MAP_DIRECTLY); + + /* GPU copies are not useful for buffer reads. Instead of stalling to + * read from the original buffer, we'd simply copy it to a temporary... + * then stall (a bit longer) to read from that buffer. + * + * Images are less clear-cut. Color resolves are destructive, removing + * the underlying compression, so we'd rather blit the data to a linear + * temporary and map that, to avoid the resolve. (It might be better to + * a tiled temporary and use the tiled_memcpy paths...) + */ + if (!(usage & PIPE_MAP_DISCARD_RANGE) && + !crocus_has_invalid_primary(res, level, 1, box->z, box->depth)) + no_gpu = true; + + const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format); + if (fmtl->txc == ISL_TXC_ASTC) + no_gpu = true; + + if (map_would_stall && !no_gpu) { + /* If we need a synchronous mapping and the resource is busy, or needs + * resolving, we copy to/from a linear temporary buffer using the GPU. + */ + map->batch = &ice->batches[CROCUS_BATCH_RENDER]; + map->blorp = &ice->blorp; + crocus_map_copy_region(map); + } else { + /* Otherwise we're free to map on the CPU. */ + + if (resource->target != PIPE_BUFFER) { + crocus_resource_access_raw(ice, res, + level, box->z, box->depth, + usage & PIPE_MAP_WRITE); + } + + if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) { + for (int i = 0; i < ice->batch_count; i++) { + if (crocus_batch_references(&ice->batches[i], res->bo)) + crocus_batch_flush(&ice->batches[i]); + } + } + + if (surf->tiling == ISL_TILING_W) { + /* TODO: Teach crocus_map_tiled_memcpy about W-tiling... 
*/ + crocus_map_s8(map); + } else if (surf->tiling != ISL_TILING_LINEAR) { + crocus_map_tiled_memcpy(map); + } else { + crocus_map_direct(map); + } + } + + return map->ptr; +} + +static void +crocus_transfer_flush_region(struct pipe_context *ctx, + struct pipe_transfer *xfer, + const struct pipe_box *box) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_resource *res = (struct crocus_resource *) xfer->resource; + struct crocus_transfer *map = (void *) xfer; + + if (map->staging) + crocus_flush_staging_region(xfer, box); + + uint32_t history_flush = 0; + + if (res->base.target == PIPE_BUFFER) { + if (map->staging) + history_flush |= PIPE_CONTROL_RENDER_TARGET_FLUSH; + + if (map->dest_had_defined_contents) + history_flush |= crocus_flush_bits_for_history(res); + + util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width); + } + + if (history_flush & ~PIPE_CONTROL_CS_STALL) { + for (int i = 0; i < ice->batch_count; i++) { + struct crocus_batch *batch = &ice->batches[i]; + + if (!batch->command.bo) + continue; + if (batch->contains_draw || batch->cache.render->entries) { + crocus_batch_maybe_flush(batch, 24); + crocus_emit_pipe_control_flush(batch, + "cache history: transfer flush", + history_flush); + } + } + } + + /* Make sure we flag constants dirty even if there's no need to emit + * any PIPE_CONTROLs to a batch. + */ + crocus_dirty_for_history(ice, res); +} + +static void +crocus_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *xfer) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_transfer *map = (void *) xfer; + + if (!(xfer->usage & (PIPE_MAP_FLUSH_EXPLICIT | + PIPE_MAP_COHERENT))) { + struct pipe_box flush_box = { + .x = 0, .y = 0, .z = 0, + .width = xfer->box.width, + .height = xfer->box.height, + .depth = xfer->box.depth, + }; + crocus_transfer_flush_region(ctx, xfer, &flush_box); + } + + if (map->unmap) + map->unmap(map); + + pipe_resource_reference(&xfer->resource, NULL); + slab_free(&ice->transfer_pool, map); +} + +/** + * Mark state dirty that needs to be re-emitted when a resource is written. + */ +void +crocus_dirty_for_history(struct crocus_context *ice, + struct crocus_resource *res) +{ + uint64_t stage_dirty = 0ull; + + if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) { + stage_dirty |= ((uint64_t)res->bind_stages) << CROCUS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS; + } + + ice->state.stage_dirty |= stage_dirty; +} + +/** + * Produce a set of PIPE_CONTROL bits which ensure data written to a + * resource becomes visible, and any stale read cache data is invalidated. 
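 *
 * (Illustrative aside, not part of this patch: a buffer whose bind_history
 * covers PIPE_BIND_CONSTANT_BUFFER and PIPE_BIND_VERTEX_BUFFER gets
 * PIPE_CONTROL_CS_STALL | PIPE_CONTROL_CONST_CACHE_INVALIDATE |
 * PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | PIPE_CONTROL_VF_CACHE_INVALIDATE
 * from the code below.)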
+ */ +uint32_t +crocus_flush_bits_for_history(struct crocus_resource *res) +{ + uint32_t flush = PIPE_CONTROL_CS_STALL; + + if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) { + flush |= PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + } + + if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) + flush |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + + if (res->bind_history & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER)) + flush |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + + if (res->bind_history & (PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE)) + flush |= PIPE_CONTROL_DATA_CACHE_FLUSH; + + return flush; +} + +void +crocus_flush_and_dirty_for_history(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_resource *res, + uint32_t extra_flags, + const char *reason) +{ + if (res->base.target != PIPE_BUFFER) + return; + + uint32_t flush = crocus_flush_bits_for_history(res) | extra_flags; + + crocus_emit_pipe_control_flush(batch, reason, flush); + + crocus_dirty_for_history(ice, res); +} + +bool +crocus_resource_set_clear_color(struct crocus_context *ice, + struct crocus_resource *res, + union isl_color_value color) +{ + if (memcmp(&res->aux.clear_color, &color, sizeof(color)) != 0) { + res->aux.clear_color = color; + return true; + } + + return false; +} + +union isl_color_value +crocus_resource_get_clear_color(const struct crocus_resource *res) +{ + assert(res->aux.bo); + + return res->aux.clear_color; +} + +static enum pipe_format +crocus_resource_get_internal_format(struct pipe_resource *p_res) +{ + struct crocus_resource *res = (void *) p_res; + return res->internal_format; +} + +static const struct u_transfer_vtbl transfer_vtbl = { + .resource_create = crocus_resource_create, + .resource_destroy = crocus_resource_destroy, + .transfer_map = crocus_transfer_map, + .transfer_unmap = crocus_transfer_unmap, + .transfer_flush_region = crocus_transfer_flush_region, + .get_internal_format = crocus_resource_get_internal_format, + .set_stencil = crocus_resource_set_separate_stencil, + .get_stencil = crocus_resource_get_separate_stencil, +}; + +static bool +crocus_is_dmabuf_modifier_supported(struct pipe_screen *pscreen, + uint64_t modifier, enum pipe_format pfmt, + bool *external_only) +{ + struct crocus_screen *screen = (void *) pscreen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (modifier_is_supported(devinfo, pfmt, modifier)) { + if (external_only) + *external_only = false; + + return true; + } + + return false; +} + +static unsigned int +crocus_get_dmabuf_modifier_planes(struct pipe_screen *pscreen, uint64_t modifier, + enum pipe_format format) +{ + return util_format_get_num_planes(format); +} + +void +crocus_init_screen_resource_functions(struct pipe_screen *pscreen) +{ + struct crocus_screen *screen = (void *) pscreen; + pscreen->query_dmabuf_modifiers = crocus_query_dmabuf_modifiers; + pscreen->is_dmabuf_modifier_supported = crocus_is_dmabuf_modifier_supported; + pscreen->get_dmabuf_modifier_planes = crocus_get_dmabuf_modifier_planes; + pscreen->resource_create_with_modifiers = + crocus_resource_create_with_modifiers; + pscreen->resource_create = u_transfer_helper_resource_create; + pscreen->resource_from_user_memory = crocus_resource_from_user_memory; + pscreen->resource_from_handle = crocus_resource_from_handle; + pscreen->resource_get_handle = crocus_resource_get_handle; + pscreen->resource_get_param = crocus_resource_get_param; + pscreen->resource_destroy = u_transfer_helper_resource_destroy; + 
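   /* Illustrative aside, not part of this patch: the pipe_context hooks set up
    * in crocus_init_resource_functions() below reach the driver through
    * u_transfer_helper and the transfer_vtbl above, roughly:
    *
    *    ctx->texture_map() -> u_transfer_helper_transfer_map() -> crocus_transfer_map()
    */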
pscreen->transfer_helper = + u_transfer_helper_create(&transfer_vtbl, screen->devinfo.ver >= 6, + screen->devinfo.ver >= 6, false, true); +} + +void +crocus_init_resource_functions(struct pipe_context *ctx) +{ + ctx->flush_resource = crocus_flush_resource; + ctx->invalidate_resource = crocus_invalidate_resource; + ctx->buffer_map = u_transfer_helper_transfer_map; + ctx->texture_map = u_transfer_helper_transfer_map; + ctx->transfer_flush_region = u_transfer_helper_transfer_flush_region; + ctx->buffer_unmap = u_transfer_helper_transfer_unmap; + ctx->texture_unmap = u_transfer_helper_transfer_unmap; + ctx->buffer_subdata = u_default_buffer_subdata; + ctx->texture_subdata = u_default_texture_subdata; +} diff --git a/src/gallium/drivers/crocus/crocus_resource.h b/src/gallium/drivers/crocus/crocus_resource.h new file mode 100644 index 00000000000..8eb49118f54 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_resource.h @@ -0,0 +1,501 @@ +/* + * Copyright 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef CROCUS_RESOURCE_H +#define CROCUS_RESOURCE_H + +#include "pipe/p_state.h" +#include "util/u_inlines.h" +#include "util/u_range.h" +#include "intel/isl/isl.h" + +#include "crocus_bufmgr.h" + +struct crocus_batch; +struct crocus_context; + +#define CROCUS_MAX_MIPLEVELS 15 + +struct crocus_format_info { + enum isl_format fmt; + enum pipe_swizzle swizzles[4]; +}; + +static inline enum isl_channel_select +pipe_to_isl_swizzle(const enum pipe_swizzle pswz, bool green_to_blue) +{ + unsigned swz = (pswz + 4) & 7; + + return (green_to_blue && swz == ISL_CHANNEL_SELECT_GREEN) ? 
ISL_CHANNEL_SELECT_BLUE : swz; +} + +static inline struct isl_swizzle +pipe_to_isl_swizzles(const enum pipe_swizzle pswz[4]) +{ + struct isl_swizzle swz; + swz.r = pipe_to_isl_swizzle(pswz[0], false); + swz.g = pipe_to_isl_swizzle(pswz[1], false); + swz.b = pipe_to_isl_swizzle(pswz[2], false); + swz.a = pipe_to_isl_swizzle(pswz[3], false); + return swz; +} + +static inline void +crocus_combine_swizzle(enum pipe_swizzle outswz[4], + const enum pipe_swizzle fswz[4], + const enum pipe_swizzle vswz[4]) +{ + for (unsigned i = 0; i < 4; i++) { + switch (vswz[i]) { + case PIPE_SWIZZLE_X: outswz[i] = fswz[0]; break; + case PIPE_SWIZZLE_Y: outswz[i] = fswz[1]; break; + case PIPE_SWIZZLE_Z: outswz[i] = fswz[2]; break; + case PIPE_SWIZZLE_W: outswz[i] = fswz[3]; break; + case PIPE_SWIZZLE_1: outswz[i] = PIPE_SWIZZLE_1; break; + case PIPE_SWIZZLE_0: outswz[i] = PIPE_SWIZZLE_0; break; + default: unreachable("invalid swizzle"); + } + } +} + +/** + * Resources represent a GPU buffer object or image (mipmap tree). + * + * They contain the storage (BO) and layout information (ISL surface). + */ +struct crocus_resource { + struct pipe_resource base; + enum pipe_format internal_format; + + /** + * The ISL surface layout information for this resource. + * + * This is not filled out for PIPE_BUFFER resources, but is guaranteed + * to be zeroed. Note that this also guarantees that res->surf.tiling + * will be ISL_TILING_LINEAR, so it's safe to check that. + */ + struct isl_surf surf; + + /** Backing storage for the resource */ + struct crocus_bo *bo; + + /** offset at which data starts in the BO */ + uint64_t offset; + + /** + * A bitfield of PIPE_BIND_* indicating how this resource was bound + * in the past. Only meaningful for PIPE_BUFFER; used for flushing. + */ + unsigned bind_history; + + /** + * A bitfield of MESA_SHADER_* stages indicating where this resource + * was bound. + */ + unsigned bind_stages; + + /** + * For PIPE_BUFFER resources, a range which may contain valid data. + * + * This is a conservative estimate of what part of the buffer contains + * valid data that we have to preserve. The rest of the buffer is + * considered invalid, and we can promote writes to that region to + * be unsynchronized writes, avoiding blit copies. + */ + struct util_range valid_buffer_range; + + /** + * Auxiliary buffer information (CCS, MCS, or HiZ). + */ + struct { + /** The surface layout for the auxiliary buffer. */ + struct isl_surf surf; + + /** The buffer object containing the auxiliary data. */ + struct crocus_bo *bo; + + /** Offset into 'bo' where the auxiliary surface starts. */ + uint32_t offset; + + struct { + struct isl_surf surf; + + /** Offset into 'bo' where the auxiliary surface starts. */ + uint32_t offset; + } extra_aux; + + /** + * Fast clear color for this surface. For depth surfaces, the clear + * value is stored as a float32 in the red component. + */ + union isl_color_value clear_color; + + /** + * \brief The type of auxiliary compression used by this resource. + * + * This describes the type of auxiliary compression that is intended to + * be used by this resource. An aux usage of ISL_AUX_USAGE_NONE means + * that auxiliary compression is permanently disabled. An aux usage + * other than ISL_AUX_USAGE_NONE does not imply that auxiliary + * compression will always be enabled for this surface. + */ + enum isl_aux_usage usage; + + /** + * \brief Maps miptree slices to their current aux state. 
+ * + * This two-dimensional array is indexed as [level][layer] and stores an + * aux state for each slice. + */ + enum isl_aux_state **state; + + /** + * If (1 << level) is set, HiZ is enabled for that miplevel. + */ + uint16_t has_hiz; + } aux; + + /** + * \brief Shadow miptree for sampling when the main isn't supported by HW. + * + * To workaround various sampler bugs and limitations, we blit the main + * texture into a new texture that can be sampled. + * + * This miptree may be used for: + * - Stencil texturing (pre-BDW) as required by GL_ARB_stencil_texturing. + */ + struct crocus_resource *shadow; + bool shadow_needs_update; + + /** + * For external surfaces, this is format that was used to create or import + * the surface. For internal surfaces, this will always be + * PIPE_FORMAT_NONE. + */ + enum pipe_format external_format; + + /** + * For external surfaces, this is DRM format modifier that was used to + * create or import the surface. For internal surfaces, this will always + * be DRM_FORMAT_MOD_INVALID. + */ + const struct isl_drm_modifier_info *mod_info; + + /** + * The screen the resource was originally created with, stored for refcounting. + */ + struct pipe_screen *orig_screen; +}; + +/** + * A simple tuple for storing a reference to a + * piece of state stored in a GPU buffer object. + */ +struct crocus_state_ref { + struct pipe_resource *res; + uint32_t offset; +}; + +/** + * Gallium CSO for sampler views (texture views). + * + * In addition to the normal pipe_resource, this adds an ISL view + * which may reinterpret the format or restrict levels/layers. + * + * These can also be linear texture buffers. + */ +struct crocus_sampler_view { + struct pipe_sampler_view base; + struct isl_view view; + struct isl_view gather_view; + + enum pipe_swizzle swizzle[4]; + union isl_color_value clear_color; + + /* A short-cut (not a reference) to the actual resource being viewed. + * Multi-planar (or depth+stencil) images may have multiple resources + * chained together; this skips having to traverse base->texture->*. + */ + struct crocus_resource *res; +}; + +/** + * Image view representation. + */ +struct crocus_image_view { + struct pipe_image_view base; + struct isl_view view; +}; + +/** + * Gallium CSO for surfaces (framebuffer attachments). + * + * A view of a surface that can be bound to a color render target or + * depth/stencil attachment. + */ +struct crocus_surface { + struct pipe_surface base; + struct isl_view view; + struct isl_view read_view; + struct isl_surf surf; + union isl_color_value clear_color; + + struct pipe_resource *align_res; +}; + +/** + * Transfer object - information about a buffer mapping. + */ +struct crocus_transfer { + struct pipe_transfer base; + struct pipe_debug_callback *dbg; + void *buffer; + void *ptr; + + /** A linear staging resource for GPU-based copy_region transfers. */ + struct pipe_resource *staging; + struct blorp_context *blorp; + struct crocus_batch *batch; + + bool dest_had_defined_contents; + bool has_swizzling; + + void (*unmap)(struct crocus_transfer *); +}; + +/** + * Unwrap a pipe_resource to get the underlying crocus_bo (for convenience). 
+ */ +static inline struct crocus_bo * +crocus_resource_bo(struct pipe_resource *p_res) +{ + struct crocus_resource *res = (void *) p_res; + return res->bo; +} + +static inline uint32_t +crocus_mocs(const struct crocus_bo *bo, + const struct isl_device *dev) +{ + return isl_mocs(dev, 0, bo && crocus_bo_is_external(bo)); +} + +struct crocus_format_info crocus_format_for_usage(const struct intel_device_info *, + enum pipe_format pf, + isl_surf_usage_flags_t usage); + +struct pipe_resource *crocus_resource_get_separate_stencil(struct pipe_resource *); + +void crocus_get_depth_stencil_resources(const struct intel_device_info *devinfo, + struct pipe_resource *res, + struct crocus_resource **out_z, + struct crocus_resource **out_s); +bool crocus_resource_set_clear_color(struct crocus_context *ice, + struct crocus_resource *res, + union isl_color_value color); +union isl_color_value +crocus_resource_get_clear_color(const struct crocus_resource *res); + +void crocus_init_screen_resource_functions(struct pipe_screen *pscreen); + +void crocus_dirty_for_history(struct crocus_context *ice, + struct crocus_resource *res); +uint32_t crocus_flush_bits_for_history(struct crocus_resource *res); + +void crocus_flush_and_dirty_for_history(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_resource *res, + uint32_t extra_flags, + const char *reason); + +unsigned crocus_get_num_logical_layers(const struct crocus_resource *res, + unsigned level); + +void crocus_resource_disable_aux(struct crocus_resource *res); + +#define INTEL_REMAINING_LAYERS UINT32_MAX +#define INTEL_REMAINING_LEVELS UINT32_MAX + +void +crocus_hiz_exec(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_resource *res, + unsigned int level, unsigned int start_layer, + unsigned int num_layers, enum isl_aux_op op, + bool update_clear_depth); + +/** + * Prepare a miptree for access + * + * This function should be called prior to any access to miptree in order to + * perform any needed resolves. + * + * \param[in] start_level The first mip level to be accessed + * + * \param[in] num_levels The number of miplevels to be accessed or + * INTEL_REMAINING_LEVELS to indicate every level + * above start_level will be accessed + * + * \param[in] start_layer The first array slice or 3D layer to be accessed + * + * \param[in] num_layers The number of array slices or 3D layers be + * accessed or INTEL_REMAINING_LAYERS to indicate + * every layer above start_layer will be accessed + * + * \param[in] aux_supported Whether or not the access will support the + * miptree's auxiliary compression format; this + * must be false for uncompressed miptrees + * + * \param[in] fast_clear_supported Whether or not the access will support + * fast clears in the miptree's auxiliary + * compression format + */ +void +crocus_resource_prepare_access(struct crocus_context *ice, + struct crocus_resource *res, + uint32_t start_level, uint32_t num_levels, + uint32_t start_layer, uint32_t num_layers, + enum isl_aux_usage aux_usage, + bool fast_clear_supported); + +/** + * Complete a write operation + * + * This function should be called after any operation writes to a miptree. + * This will update the miptree's compression state so that future resolves + * happen correctly. Technically, this function can be called before the + * write occurs but the caller must ensure that they don't interlace + * crocus_resource_prepare_access and crocus_resource_finish_write calls to + * overlapping layer/level ranges. 
+ * + * \param[in] level The mip level that was written + * + * \param[in] start_layer The first array slice or 3D layer written + * + * \param[in] num_layers The number of array slices or 3D layers + * written or INTEL_REMAINING_LAYERS to indicate + * every layer above start_layer was written + * + * \param[in] written_with_aux Whether or not the write was done with + * auxiliary compression enabled + */ +void +crocus_resource_finish_write(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t num_layers, + enum isl_aux_usage aux_usage); + +/** Get the auxiliary compression state of a miptree slice */ +enum isl_aux_state +crocus_resource_get_aux_state(const struct crocus_resource *res, + uint32_t level, uint32_t layer); + +/** + * Set the auxiliary compression state of a miptree slice range + * + * This function directly sets the auxiliary compression state of a slice + * range of a miptree. It only modifies data structures and does not do any + * resolves. This should only be called by code which directly performs + * compression operations such as fast clears and resolves. Most code should + * use crocus_resource_prepare_access or crocus_resource_finish_write. + */ +void +crocus_resource_set_aux_state(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t num_layers, + enum isl_aux_state aux_state); + +/** + * Prepare a miptree for raw access + * + * This helper prepares the miptree for access that knows nothing about any + * sort of compression whatsoever. This is useful when mapping the surface or + * using it with the blitter. + */ +static inline void +crocus_resource_access_raw(struct crocus_context *ice, + struct crocus_resource *res, + uint32_t level, uint32_t layer, + uint32_t num_layers, + bool write) +{ + crocus_resource_prepare_access(ice, res, level, 1, layer, num_layers, + ISL_AUX_USAGE_NONE, false); + if (write) { + crocus_resource_finish_write(ice, res, level, layer, num_layers, + ISL_AUX_USAGE_NONE); + } +} + +void +crocus_resource_get_image_offset(struct crocus_resource *res, + uint32_t level, uint32_t z, + uint32_t *x, uint32_t *y); +static inline enum isl_aux_usage +crocus_resource_texture_aux_usage(const struct crocus_resource *res) +{ + return res->aux.usage == ISL_AUX_USAGE_MCS ? 
ISL_AUX_USAGE_MCS : ISL_AUX_USAGE_NONE; +} + +void crocus_resource_prepare_texture(struct crocus_context *ice, + struct crocus_resource *res, + enum isl_format view_format, + uint32_t start_level, uint32_t num_levels, + uint32_t start_layer, uint32_t num_layers); + +static inline bool +crocus_resource_unfinished_aux_import(struct crocus_resource *res) +{ + return res->base.next != NULL && res->mod_info && + res->mod_info->aux_usage != ISL_AUX_USAGE_NONE; +} + +void crocus_resource_finish_aux_import(struct pipe_screen *pscreen, + struct crocus_resource *res); + +bool crocus_has_invalid_primary(const struct crocus_resource *res, + unsigned start_level, unsigned num_levels, + unsigned start_layer, unsigned num_layers); + +void crocus_resource_check_level_layer(const struct crocus_resource *res, + uint32_t level, uint32_t layer); + +bool crocus_resource_level_has_hiz(const struct crocus_resource *res, + uint32_t level); +bool crocus_has_color_unresolved(const struct crocus_resource *res, + unsigned start_level, unsigned num_levels, + unsigned start_layer, unsigned num_layers); + +enum isl_aux_usage crocus_resource_render_aux_usage(struct crocus_context *ice, + struct crocus_resource *res, + enum isl_format render_fmt, + bool blend_enabled, + bool draw_aux_disabled); +void crocus_resource_prepare_render(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t layer_count, + enum isl_aux_usage aux_usage); +void crocus_resource_finish_render(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t layer_count, + enum isl_aux_usage aux_usage); +#endif diff --git a/src/gallium/drivers/crocus/crocus_screen.c b/src/gallium/drivers/crocus/crocus_screen.c new file mode 100644 index 00000000000..d5331d66730 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_screen.c @@ -0,0 +1,829 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_screen.c + * + * Screen related driver hooks and capability lists. + * + * A program may use multiple rendering contexts (crocus_context), but + * they all share a common screen (crocus_screen). Global driver state + * can be stored in the screen; it may be accessed by multiple threads. 
+ */ + +#include +#include +#include +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/debug.h" +#include "util/u_inlines.h" +#include "util/format/u_format.h" +#include "util/u_transfer_helper.h" +#include "util/u_upload_mgr.h" +#include "util/ralloc.h" +#include "util/xmlconfig.h" +#include "drm-uapi/i915_drm.h" +#include "crocus_context.h" +#include "crocus_defines.h" +#include "crocus_fence.h" +#include "crocus_pipe.h" +#include "crocus_resource.h" +#include "crocus_screen.h" +#include "intel/compiler/brw_compiler.h" +#include "intel/common/intel_gem.h" +#include "intel/common/intel_l3_config.h" +#include "crocus_monitor.h" + +#define genX_call(devinfo, func, ...) \ + switch ((devinfo)->verx10) { \ + case 75: \ + gfx75_##func(__VA_ARGS__); \ + break; \ + case 70: \ + gfx7_##func(__VA_ARGS__); \ + break; \ + case 60: \ + gfx6_##func(__VA_ARGS__); \ + break; \ + case 50: \ + gfx5_##func(__VA_ARGS__); \ + break; \ + case 45: \ + gfx45_##func(__VA_ARGS__); \ + break; \ + case 40: \ + gfx4_##func(__VA_ARGS__); \ + break; \ + default: \ + unreachable("Unknown hardware generation"); \ + } + +static void +crocus_flush_frontbuffer(struct pipe_screen *_screen, + struct pipe_context *_pipe, + struct pipe_resource *resource, + unsigned level, unsigned layer, + void *context_private, struct pipe_box *box) +{ +} + +static const char * +crocus_get_vendor(struct pipe_screen *pscreen) +{ + return "Intel"; +} + +static const char * +crocus_get_device_vendor(struct pipe_screen *pscreen) +{ + return "Intel"; +} + +static const char * +crocus_get_name(struct pipe_screen *pscreen) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + static char buf[128]; + + const char *name = intel_get_device_name(screen->pci_id); + + if (!name) + name = "Intel Unknown"; + + snprintf(buf, sizeof(buf), "Mesa %s", name); + return buf; +} + +static uint64_t +get_aperture_size(int fd) +{ + struct drm_i915_gem_get_aperture aperture = {}; + intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture); + return aperture.aper_size; +} + +static int +crocus_get_param(struct pipe_screen *pscreen, enum pipe_cap param) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + const struct intel_device_info *devinfo = &screen->devinfo; + + switch (param) { + case PIPE_CAP_NPOT_TEXTURES: + case PIPE_CAP_ANISOTROPIC_FILTER: + case PIPE_CAP_POINT_SPRITE: + case PIPE_CAP_OCCLUSION_QUERY: + case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: + case PIPE_CAP_BLEND_EQUATION_SEPARATE: + case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: + case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: + case PIPE_CAP_VERTEX_SHADER_SATURATE: + case PIPE_CAP_PRIMITIVE_RESTART: + case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX: + case PIPE_CAP_INDEP_BLEND_ENABLE: + case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + case PIPE_CAP_DEPTH_CLIP_DISABLE: + case PIPE_CAP_TGSI_INSTANCEID: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + case PIPE_CAP_CONDITIONAL_RENDER: + case PIPE_CAP_TEXTURE_BARRIER: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: + case PIPE_CAP_START_INSTANCE: + case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + case PIPE_CAP_FORCE_PERSAMPLE_INTERP: + case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case 
PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: + case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: + case PIPE_CAP_ACCELERATED: + case PIPE_CAP_UMA: + case PIPE_CAP_CLIP_HALFZ: + case PIPE_CAP_TGSI_TEXCOORD: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_POLYGON_OFFSET_CLAMP: + case PIPE_CAP_TGSI_TEX_TXF_LZ: + case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: + case PIPE_CAP_CLEAR_TEXTURE: + case PIPE_CAP_TGSI_VOTE: + case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: + case PIPE_CAP_TEXTURE_GATHER_SM5: + case PIPE_CAP_TGSI_ARRAY_COMPONENTS: + case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: + case PIPE_CAP_NIR_COMPACT_ARRAYS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: + case PIPE_CAP_CS_DERIVED_SYSTEM_VALUES_SUPPORTED: + case PIPE_CAP_FENCE_SIGNAL: + case PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION: + return true; + case PIPE_CAP_INT64: + case PIPE_CAP_INT64_DIVMOD: + case PIPE_CAP_TGSI_BALLOT: + case PIPE_CAP_PACKED_UNIFORMS: + case PIPE_CAP_GL_CLAMP: + return false; + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + return devinfo->ver <= 5; + case PIPE_CAP_TEXTURE_QUERY_LOD: + case PIPE_CAP_QUERY_TIME_ELAPSED: + return devinfo->ver >= 5; + case PIPE_CAP_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: + case PIPE_CAP_TGSI_CLOCK: + case PIPE_CAP_TGSI_TXQS: + case PIPE_CAP_COMPUTE: + case PIPE_CAP_SAMPLER_VIEW_TARGET: + case PIPE_CAP_SHADER_SAMPLES_IDENTICAL: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: + case PIPE_CAP_GL_SPIRV: + case PIPE_CAP_GL_SPIRV_VARIABLE_POINTERS: + case PIPE_CAP_COMPUTE_SHADER_DERIVATIVES: + case PIPE_CAP_DOUBLES: + return devinfo->ver >= 7; + case PIPE_CAP_QUERY_BUFFER_OBJECT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: + return devinfo->is_haswell; + case PIPE_CAP_CULL_DISTANCE: + case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE: + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_SAMPLE_SHADING: + case PIPE_CAP_CUBE_MAP_ARRAY: + case PIPE_CAP_QUERY_SO_OVERFLOW: + case PIPE_CAP_TEXTURE_MULTISAMPLE: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + case PIPE_CAP_QUERY_TIMESTAMP: + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + case PIPE_CAP_INDEP_BLEND_FUNC: + case PIPE_CAP_TEXTURE_SHADOW_LOD: + case PIPE_CAP_LOAD_CONSTBUF: + case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_CLEAR_SCISSORED: + return devinfo->ver >= 6; + case PIPE_CAP_FBFETCH: + return devinfo->verx10 >= 45 ? BRW_MAX_DRAW_BUFFERS : 0; + case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: + return devinfo->ver >= 6 ? 1 : 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return BRW_MAX_DRAW_BUFFERS; + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: + if (devinfo->ver >= 7) + return 16384; + else + return 8192; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + if (devinfo->ver >= 7) + return CROCUS_MAX_MIPLEVELS; /* 16384x16384 */ + else + return CROCUS_MAX_MIPLEVELS - 1; /* 8192x8192 */ + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 12; /* 2048x2048 */ + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + return (devinfo->ver >= 6) ? 4 : 0; + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return devinfo->ver >= 7 ? 
2048 : 512; + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + return BRW_MAX_SOL_BINDINGS / CROCUS_MAX_SOL_BUFFERS; + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return BRW_MAX_SOL_BINDINGS; + case PIPE_CAP_GLSL_FEATURE_LEVEL: { + if (devinfo->is_haswell) + return 460; + else if (devinfo->ver >= 7) + return 420; + else if (devinfo->ver >= 6) + return 330; + return 120; + } + case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: + return devinfo->ver < 6 ? 120 : 130; + + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + /* 3DSTATE_CONSTANT_XS requires the start of UBOs to be 32B aligned */ + return 32; + case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: + return CROCUS_MAP_BUFFER_ALIGNMENT; + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: + /* Choose a cacheline (64 bytes) so that we can safely have the CPU and + * GPU writing the same SSBO on non-coherent systems (Atom CPUs). With + * UBOs, the GPU never writes, so there's no problem. For an SSBO, the + * GPU and the CPU can be updating disjoint regions of the buffer + * simultaneously and that will break if the regions overlap the same + * cacheline. + */ + return devinfo->ver >= 7 ? 64 : 0; + case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: + return devinfo->ver >= 7 ? (1 << 27) : 0; + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + return 16; // XXX: u_screen says 256 is the minimum value... + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + return true; + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + return CROCUS_MAX_TEXTURE_BUFFER_SIZE; + case PIPE_CAP_MAX_VIEWPORTS: + return devinfo->ver >= 6 ? 16 : 1; + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + return devinfo->ver >= 6 ? 256 : 0; + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return devinfo->ver >= 6 ? 1024 : 0; + case PIPE_CAP_MAX_GS_INVOCATIONS: + return devinfo->ver >= 7 ? 32 : 1; + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + if (devinfo->ver >= 7) + return 4; + else if (devinfo->ver == 6) + return 1; + else + return 0; + case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + if (devinfo->ver >= 7) + return -32; + else if (devinfo->ver == 6) + return -8; + else + return 0; + case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: + if (devinfo->ver >= 7) + return 31; + else if (devinfo->ver == 6) + return 7; + else + return 0; + case PIPE_CAP_MAX_VERTEX_STREAMS: + return devinfo->ver >= 7 ? 4 : 1; + case PIPE_CAP_VENDOR_ID: + return 0x8086; + case PIPE_CAP_DEVICE_ID: + return screen->pci_id; + case PIPE_CAP_VIDEO_MEMORY: { + /* Once a batch uses more than 75% of the maximum mappable size, we + * assume that there's some fragmentation, and we start doing extra + * flushing, etc. That's the big cliff apps will care about. + */ + const unsigned gpu_mappable_megabytes = + (screen->aperture_bytes * 3 / 4) / (1024 * 1024); + + const long system_memory_pages = sysconf(_SC_PHYS_PAGES); + const long system_page_size = sysconf(_SC_PAGE_SIZE); + + if (system_memory_pages <= 0 || system_page_size <= 0) + return -1; + + const uint64_t system_memory_bytes = + (uint64_t) system_memory_pages * (uint64_t) system_page_size; + + const unsigned system_memory_megabytes = + (unsigned) (system_memory_bytes / (1024 * 1024)); + + return MIN2(system_memory_megabytes, gpu_mappable_megabytes); + } + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_MAX_VARYINGS: + return (screen->devinfo.ver >= 6) ? 32 : 16; + case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + /* AMD_pinned_memory assumes the flexibility of using client memory + * for any buffer (incl. 
vertex buffers) which rules out the prospect + * of using snooped buffers, as using snooped buffers without + * cognizance is likely to be detrimental to performance and require + * extensive checking in the driver for correctness, e.g. to prevent + * illegal snoop <-> snoop transfers. + */ + return devinfo->has_llc; + case PIPE_CAP_THROTTLE: + return screen->driconf.disable_throttling ? 0 : 1; + + case PIPE_CAP_CONTEXT_PRIORITY_MASK: + return PIPE_CONTEXT_PRIORITY_LOW | + PIPE_CONTEXT_PRIORITY_MEDIUM | + PIPE_CONTEXT_PRIORITY_HIGH; + + case PIPE_CAP_FRONTEND_NOOP: + return true; + // XXX: don't hardcode 00:00:02.0 PCI here + case PIPE_CAP_PCI_GROUP: + return 0; + case PIPE_CAP_PCI_BUS: + return 0; + case PIPE_CAP_PCI_DEVICE: + return 2; + case PIPE_CAP_PCI_FUNCTION: + return 0; + + default: + return u_pipe_screen_get_param_defaults(pscreen, param); + } + return 0; +} + +static float +crocus_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + const struct intel_device_info *devinfo = &screen->devinfo; + + switch (param) { + case PIPE_CAPF_MAX_LINE_WIDTH: + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + if (devinfo->ver >= 6) + return 7.375f; + else + return 7.0f; + + case PIPE_CAPF_MAX_POINT_WIDTH: + case PIPE_CAPF_MAX_POINT_WIDTH_AA: + return 255.0f; + + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return 16.0f; + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return 15.0f; + case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: + return 0.0f; + default: + unreachable("unknown param"); + } +} + +static int +crocus_get_shader_param(struct pipe_screen *pscreen, + enum pipe_shader_type p_stage, + enum pipe_shader_cap param) +{ + gl_shader_stage stage = stage_from_pipe(p_stage); + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (devinfo->ver < 6 && + p_stage != PIPE_SHADER_VERTEX && + p_stage != PIPE_SHADER_FRAGMENT) + return 0; + + if (devinfo->ver == 6 && + p_stage != PIPE_SHADER_VERTEX && + p_stage != PIPE_SHADER_FRAGMENT && + p_stage != PIPE_SHADER_GEOMETRY) + return 0; + + /* this is probably not totally correct.. but it's a start: */ + switch (param) { + case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: + return stage == MESA_SHADER_FRAGMENT ? 1024 : 16384; + case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: + return stage == MESA_SHADER_FRAGMENT ? 1024 : 0; + + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return UINT_MAX; + + case PIPE_SHADER_CAP_MAX_INPUTS: + if (stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_GEOMETRY) + return 16; /* Gen7 vec4 geom backend */ + return 32; + case PIPE_SHADER_CAP_MAX_OUTPUTS: + return 32; + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: + return 16 * 1024 * sizeof(float); + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + return devinfo->ver >= 6 ? 16 : 1; + case PIPE_SHADER_CAP_MAX_TEMPS: + return 256; /* GL_MAX_PROGRAM_TEMPORARIES_ARB */ + case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: + return 0; + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: + case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + /* Lie about these to avoid st/mesa's GLSL IR lowering of indirects, + * which we don't want.
Our compiler backend will check brw_compiler's + * options and call nir_lower_indirect_derefs appropriately anyway. + */ + return true; + case PIPE_SHADER_CAP_SUBROUTINES: + return 0; + case PIPE_SHADER_CAP_INTEGERS: + return 1; + case PIPE_SHADER_CAP_INT64_ATOMICS: + case PIPE_SHADER_CAP_FP16: + return 0; + case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: + case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: + return devinfo->is_haswell ? CROCUS_MAX_TEXTURE_SAMPLERS : 16; + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + if (devinfo->ver >= 7 && + (p_stage == PIPE_SHADER_FRAGMENT || + p_stage == PIPE_SHADER_COMPUTE)) + return CROCUS_MAX_TEXTURE_SAMPLERS; + return 0; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return devinfo->ver >= 7 ? (CROCUS_MAX_ABOS + CROCUS_MAX_SSBOS) : 0; + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: + return 0; + case PIPE_SHADER_CAP_PREFERRED_IR: + return PIPE_SHADER_IR_NIR; + case PIPE_SHADER_CAP_SUPPORTED_IRS: + return 1 << PIPE_SHADER_IR_NIR; + case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: + return 1; + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: + case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + case PIPE_SHADER_CAP_FP16_DERIVATIVES: + case PIPE_SHADER_CAP_INT16: + case PIPE_SHADER_CAP_GLSL_16BIT_CONSTS: + case PIPE_SHADER_CAP_FP16_CONST_BUFFERS: + return 0; + default: + unreachable("unknown shader param"); + } +} + +static int +crocus_get_compute_param(struct pipe_screen *pscreen, + enum pipe_shader_ir ir_type, + enum pipe_compute_cap param, + void *ret) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + const struct intel_device_info *devinfo = &screen->devinfo; + + const unsigned max_threads = MIN2(64, devinfo->max_cs_threads); + const uint32_t max_invocations = 32 * max_threads; + + if (devinfo->ver < 7) + return 0; +#define RET(x) do { \ + if (ret) \ + memcpy(ret, x, sizeof(x)); \ + return sizeof(x); \ +} while (0) + + switch (param) { + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + RET((uint32_t []){ 32 }); + + case PIPE_COMPUTE_CAP_IR_TARGET: + if (ret) + strcpy(ret, "gen"); + return 4; + + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + RET((uint64_t []) { 3 }); + + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + RET(((uint64_t []) { 65535, 65535, 65535 })); + + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + /* MaxComputeWorkGroupSize[0..2] */ + RET(((uint64_t []) {max_invocations, max_invocations, max_invocations})); + + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + /* MaxComputeWorkGroupInvocations */ + RET((uint64_t []) { max_invocations }); + + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: + /* MaxComputeSharedMemorySize */ + RET((uint64_t []) { 64 * 1024 }); + + case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: + RET((uint32_t []) { 1 }); + + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + RET((uint32_t []) { BRW_SUBGROUP_SIZE }); + + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + RET((uint64_t []) { max_invocations }); + + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: + case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: + case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: + + // XXX: I think these are for Clover... 
+ return 0; + + default: + unreachable("unknown compute param"); + } +} + +static uint64_t +crocus_get_timestamp(struct pipe_screen *pscreen) +{ + struct crocus_screen *screen = (struct crocus_screen *) pscreen; + const unsigned TIMESTAMP = 0x2358; + uint64_t result; + + crocus_reg_read(screen->bufmgr, TIMESTAMP | 1, &result); + + result = intel_device_info_timebase_scale(&screen->devinfo, result); + result &= (1ull << TIMESTAMP_BITS) - 1; + + return result; +} + +void +crocus_screen_destroy(struct crocus_screen *screen) +{ + u_transfer_helper_destroy(screen->base.transfer_helper); + crocus_bufmgr_unref(screen->bufmgr); + disk_cache_destroy(screen->disk_cache); + close(screen->winsys_fd); + ralloc_free(screen); +} + +static void +crocus_screen_unref(struct pipe_screen *pscreen) +{ + crocus_pscreen_unref(pscreen); +} + +static void +crocus_query_memory_info(struct pipe_screen *pscreen, + struct pipe_memory_info *info) +{ +} + +static const void * +crocus_get_compiler_options(struct pipe_screen *pscreen, + enum pipe_shader_ir ir, + enum pipe_shader_type pstage) +{ + struct crocus_screen *screen = (struct crocus_screen *) pscreen; + gl_shader_stage stage = stage_from_pipe(pstage); + assert(ir == PIPE_SHADER_IR_NIR); + + return screen->compiler->glsl_compiler_options[stage].NirOptions; +} + +static struct disk_cache * +crocus_get_disk_shader_cache(struct pipe_screen *pscreen) +{ + struct crocus_screen *screen = (struct crocus_screen *) pscreen; + return screen->disk_cache; +} + +static const struct intel_l3_config * +crocus_get_default_l3_config(const struct intel_device_info *devinfo, + bool compute) +{ + bool wants_dc_cache = true; + bool has_slm = compute; + const struct intel_l3_weights w = + intel_get_default_l3_weights(devinfo, wants_dc_cache, has_slm); + return intel_get_l3_config(devinfo, w); +} + +static void +crocus_shader_debug_log(void *data, const char *fmt, ...) +{ + struct pipe_debug_callback *dbg = data; + unsigned id = 0; + va_list args; + + if (!dbg->debug_message) + return; + + va_start(args, fmt); + dbg->debug_message(dbg->data, &id, PIPE_DEBUG_TYPE_SHADER_INFO, fmt, args); + va_end(args); +} + +static void +crocus_shader_perf_log(void *data, const char *fmt, ...) +{ + struct pipe_debug_callback *dbg = data; + unsigned id = 0; + va_list args; + va_start(args, fmt); + + if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { + va_list args_copy; + va_copy(args_copy, args); + vfprintf(stderr, fmt, args_copy); + va_end(args_copy); + } + + if (dbg->debug_message) { + dbg->debug_message(dbg->data, &id, PIPE_DEBUG_TYPE_PERF_INFO, fmt, args); + } + + va_end(args); +} + +static bool +crocus_detect_swizzling(struct crocus_screen *screen) +{ + /* Broadwell PRM says: + * + * "Before Gen8, there was a historical configuration control field to + * swizzle address bit[6] for in X/Y tiling modes. This was set in three + * different places: TILECTL[1:0], ARB_MODE[5:4], and + * DISP_ARB_CTL[14:13]. + * + * For Gen8 and subsequent generations, the swizzle fields are all + * reserved, and the CPU's memory controller performs all address + * swizzling modifications." 
+ */ + uint32_t tiling = I915_TILING_X; + uint32_t swizzle_mode = 0; + struct crocus_bo *buffer = + crocus_bo_alloc_tiled(screen->bufmgr, "swizzle test", 32768, + 0, tiling, 512, 0); + if (buffer == NULL) + return false; + + crocus_bo_get_tiling(buffer, &tiling, &swizzle_mode); + crocus_bo_unreference(buffer); + + return swizzle_mode != I915_BIT_6_SWIZZLE_NONE; +} + +struct pipe_screen * +crocus_screen_create(int fd, const struct pipe_screen_config *config) +{ + struct crocus_screen *screen = rzalloc(NULL, struct crocus_screen); + if (!screen) + return NULL; + + if (!intel_get_device_info_from_fd(fd, &screen->devinfo)) + return NULL; + screen->pci_id = screen->devinfo.chipset_id; + screen->no_hw = screen->devinfo.no_hw; + + if (screen->devinfo.ver >= 8) + return NULL; + + p_atomic_set(&screen->refcount, 1); + + screen->aperture_bytes = get_aperture_size(fd); + + if (getenv("INTEL_NO_HW") != NULL) + screen->no_hw = true; + + bool bo_reuse = false; + int bo_reuse_mode = driQueryOptioni(config->options, "bo_reuse"); + switch (bo_reuse_mode) { + case DRI_CONF_BO_REUSE_DISABLED: + break; + case DRI_CONF_BO_REUSE_ALL: + bo_reuse = true; + break; + } + + screen->bufmgr = crocus_bufmgr_get_for_fd(&screen->devinfo, fd, bo_reuse); + if (!screen->bufmgr) + return NULL; + screen->fd = crocus_bufmgr_get_fd(screen->bufmgr); + screen->winsys_fd = fd; + + screen->has_swizzling = crocus_detect_swizzling(screen); + brw_process_intel_debug_variable(); + + screen->driconf.dual_color_blend_by_location = + driQueryOptionb(config->options, "dual_color_blend_by_location"); + screen->driconf.disable_throttling = + driQueryOptionb(config->options, "disable_throttling"); + screen->driconf.always_flush_cache = + driQueryOptionb(config->options, "always_flush_cache"); + + screen->precompile = env_var_as_boolean("shader_precompile", true); + + isl_device_init(&screen->isl_dev, &screen->devinfo, + screen->has_swizzling); + + screen->compiler = brw_compiler_create(screen, &screen->devinfo); + screen->compiler->shader_debug_log = crocus_shader_debug_log; + screen->compiler->shader_perf_log = crocus_shader_perf_log; + screen->compiler->supports_pull_constants = false; + screen->compiler->supports_shader_constants = false; + screen->compiler->compact_params = false; + screen->compiler->constant_buffer_0_is_relative = true; + + if (screen->devinfo.ver == 7) { + screen->l3_config_3d = crocus_get_default_l3_config(&screen->devinfo, false); + screen->l3_config_cs = crocus_get_default_l3_config(&screen->devinfo, true); + } + + crocus_disk_cache_init(screen); + + slab_create_parent(&screen->transfer_pool, + sizeof(struct crocus_transfer), 64); + + screen->subslice_total = intel_device_info_subslice_total(&screen->devinfo); + assert(screen->subslice_total >= 1); + + struct pipe_screen *pscreen = &screen->base; + + crocus_init_screen_fence_functions(pscreen); + crocus_init_screen_resource_functions(pscreen); + + pscreen->destroy = crocus_screen_unref; + pscreen->get_name = crocus_get_name; + pscreen->get_vendor = crocus_get_vendor; + pscreen->get_device_vendor = crocus_get_device_vendor; + pscreen->get_param = crocus_get_param; + pscreen->get_shader_param = crocus_get_shader_param; + pscreen->get_compute_param = crocus_get_compute_param; + pscreen->get_paramf = crocus_get_paramf; + pscreen->get_compiler_options = crocus_get_compiler_options; + pscreen->get_disk_shader_cache = crocus_get_disk_shader_cache; + pscreen->is_format_supported = crocus_is_format_supported; + pscreen->context_create = crocus_create_context; + 
pscreen->flush_frontbuffer = crocus_flush_frontbuffer; + pscreen->get_timestamp = crocus_get_timestamp; + pscreen->query_memory_info = crocus_query_memory_info; + pscreen->get_driver_query_group_info = crocus_get_monitor_group_info; + pscreen->get_driver_query_info = crocus_get_monitor_info; + + genX_call(&screen->devinfo, init_screen_state, screen); + genX_call(&screen->devinfo, init_screen_query, screen); + return pscreen; +} diff --git a/src/gallium/drivers/crocus/crocus_screen.h b/src/gallium/drivers/crocus/crocus_screen.h new file mode 100644 index 00000000000..4d942eb8415 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_screen.h @@ -0,0 +1,253 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef CROCUS_SCREEN_H +#define CROCUS_SCREEN_H + +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "frontend/drm_driver.h" +#include "util/disk_cache.h" +#include "util/slab.h" +#include "util/u_screen.h" +#include "intel/dev/intel_device_info.h" +#include "intel/isl/isl.h" +#include "crocus_bufmgr.h" +#include "compiler/shader_enums.h" + +struct crocus_monitor_config; +struct crocus_resource; +struct crocus_context; +struct crocus_sampler_state; +struct brw_vue_map; +struct brw_tcs_prog_key; +struct brw_tes_prog_key; +struct brw_cs_prog_key; +struct brw_wm_prog_key; +struct brw_vs_prog_key; +struct brw_gs_prog_key; +struct shader_info; + +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) +#define WRITE_ONCE(x, v) *(volatile __typeof__(x) *)&(x) = (v) + +#define CROCUS_MAX_TEXTURE_SAMPLERS 32 +#define CROCUS_MAX_SOL_BUFFERS 4 +#define CROCUS_MAP_BUFFER_ALIGNMENT 64 + + +/** + * Virtual table for generation-specific (genxml) function calls. 
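As a rough illustration of how this per-generation dispatch works (a stand-alone sketch, not code from this patch: every type and function name below is hypothetical), each genxml compile of the state code provides its own hooks, a switch on verx10 picks the right set, and common code then calls through the table without knowing the hardware generation:

#include <stdio.h>

struct demo_screen;

/* Hook table: one implementation per hardware generation. */
struct demo_vtable {
   void (*init_render_context)(struct demo_screen *s);
};

struct demo_screen {
   int verx10;                  /* e.g. 45, 50, 60, 70, 75 */
   struct demo_vtable vtbl;
};

/* In the real driver these come from the same source compiled once per gen. */
static void gfx6_init_render_context(struct demo_screen *s) { (void)s; printf("gfx6 hooks\n"); }
static void gfx7_init_render_context(struct demo_screen *s) { (void)s; printf("gfx7 hooks\n"); }

/* Mirrors the genX_call()-style switch: select the per-gen implementations. */
static void demo_init_vtbl(struct demo_screen *s)
{
   switch (s->verx10) {
   case 60: s->vtbl.init_render_context = gfx6_init_render_context; break;
   case 70: s->vtbl.init_render_context = gfx7_init_render_context; break;
   default: break;   /* a real driver would reject unknown generations */
   }
}

int main(void)
{
   struct demo_screen s = { .verx10 = 70 };
   demo_init_vtbl(&s);
   s.vtbl.init_render_context(&s);   /* generation-agnostic caller */
   return 0;
}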
+ */ +struct crocus_vtable { + void (*destroy_state)(struct crocus_context *ice); + void (*init_render_context)(struct crocus_batch *batch); + void (*init_compute_context)(struct crocus_batch *batch); + void (*upload_render_state)(struct crocus_context *ice, + struct crocus_batch *batch, + const struct pipe_draw_info *draw, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *sc); + void (*update_surface_base_address)(struct crocus_batch *batch); + + void (*upload_compute_state)(struct crocus_context *ice, + struct crocus_batch *batch, + const struct pipe_grid_info *grid); + void (*rebind_buffer)(struct crocus_context *ice, + struct crocus_resource *res); + void (*resolve_conditional_render)(struct crocus_context *ice); + void (*emit_compute_predicate)(struct crocus_batch *batch); + void (*load_register_reg32)(struct crocus_batch *batch, uint32_t dst, + uint32_t src); + void (*load_register_reg64)(struct crocus_batch *batch, uint32_t dst, + uint32_t src); + void (*load_register_imm32)(struct crocus_batch *batch, uint32_t reg, + uint32_t val); + void (*load_register_imm64)(struct crocus_batch *batch, uint32_t reg, + uint64_t val); + void (*load_register_mem32)(struct crocus_batch *batch, uint32_t reg, + struct crocus_bo *bo, uint32_t offset); + void (*load_register_mem64)(struct crocus_batch *batch, uint32_t reg, + struct crocus_bo *bo, uint32_t offset); + void (*store_register_mem32)(struct crocus_batch *batch, uint32_t reg, + struct crocus_bo *bo, uint32_t offset, + bool predicated); + void (*store_register_mem64)(struct crocus_batch *batch, uint32_t reg, + struct crocus_bo *bo, uint32_t offset, + bool predicated); + void (*store_data_imm32)(struct crocus_batch *batch, + struct crocus_bo *bo, uint32_t offset, + uint32_t value); + void (*store_data_imm64)(struct crocus_batch *batch, + struct crocus_bo *bo, uint32_t offset, + uint64_t value); + void (*copy_mem_mem)(struct crocus_batch *batch, + struct crocus_bo *dst_bo, uint32_t dst_offset, + struct crocus_bo *src_bo, uint32_t src_offset, + unsigned bytes); + void (*emit_raw_pipe_control)(struct crocus_batch *batch, + const char *reason, uint32_t flags, + struct crocus_bo *bo, uint32_t offset, + uint64_t imm); + + void (*emit_mi_report_perf_count)(struct crocus_batch *batch, + struct crocus_bo *bo, + uint32_t offset_in_bytes, + uint32_t report_id); + + uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol, + const struct brw_vue_map *vue_map); + void (*populate_vs_key)(const struct crocus_context *ice, + const struct shader_info *info, + gl_shader_stage last_stage, + struct brw_vs_prog_key *key); + void (*populate_tcs_key)(const struct crocus_context *ice, + struct brw_tcs_prog_key *key); + void (*populate_tes_key)(const struct crocus_context *ice, + const struct shader_info *info, + gl_shader_stage last_stage, + struct brw_tes_prog_key *key); + void (*populate_gs_key)(const struct crocus_context *ice, + const struct shader_info *info, + gl_shader_stage last_stage, + struct brw_gs_prog_key *key); + void (*populate_fs_key)(const struct crocus_context *ice, + const struct shader_info *info, + struct brw_wm_prog_key *key); + void (*populate_cs_key)(const struct crocus_context *ice, + struct brw_cs_prog_key *key); + void (*lost_genx_state)(struct crocus_context *ice, struct crocus_batch *batch); + + void (*finish_batch)(struct crocus_batch *batch); /* haswell only */ + + void (*upload_urb_fence)(struct crocus_batch *batch); /* gen4/5 only */ + + bool 
(*blit_blt)(struct crocus_batch *batch, + const struct pipe_blit_info *info); + bool (*copy_region_blt)(struct crocus_batch *batch, + struct crocus_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct crocus_resource *src, + unsigned src_level, + const struct pipe_box *src_box); + bool (*calculate_urb_fence)(struct crocus_batch *batch, unsigned csize, + unsigned vsize, unsigned sfsize); + void (*batch_reset_dirty)(struct crocus_batch *batch); + unsigned (*translate_prim_type)(enum pipe_prim_type prim, uint8_t verts_per_patch); + + void (*update_so_strides)(struct crocus_context *ice, + uint16_t *strides); + + uint32_t (*get_so_offset)(struct pipe_stream_output_target *tgt); +}; + +struct crocus_screen { + struct pipe_screen base; + + uint32_t refcount; + + /** Global slab allocator for crocus_transfer_map objects */ + struct slab_parent_pool transfer_pool; + + /** drm device file descriptor, shared with bufmgr, do not close. */ + int fd; + + /** + * drm device file descriptor used for window system integration, owned + * by crocus_screen, can be a different DRM instance than fd. + */ + int winsys_fd; + + /** PCI ID for our GPU device */ + int pci_id; + + bool no_hw; + + struct crocus_vtable vtbl; + + /** Global program_string_id counter (see get_program_string_id()) */ + unsigned program_id; + + /** Precompile shaders at link time? (Can be disabled for debugging.) */ + bool precompile; + + /** driconf options and application workarounds */ + struct { + /** Dual color blend by location instead of index (for broken apps) */ + bool dual_color_blend_by_location; + bool disable_throttling; + bool always_flush_cache; + } driconf; + + unsigned subslice_total; + + uint64_t aperture_bytes; + + struct intel_device_info devinfo; + struct isl_device isl_dev; + struct crocus_bufmgr *bufmgr; + struct brw_compiler *compiler; + struct crocus_monitor_config *monitor_cfg; + bool has_swizzling; + + const struct intel_l3_config *l3_config_3d; + const struct intel_l3_config *l3_config_cs; + + struct disk_cache *disk_cache; +}; + +struct pipe_screen * +crocus_screen_create(int fd, const struct pipe_screen_config *config); + +void crocus_screen_destroy(struct crocus_screen *screen); + +UNUSED static inline struct pipe_screen * +crocus_pscreen_ref(struct pipe_screen *pscreen) +{ + struct crocus_screen *screen = (struct crocus_screen *) pscreen; + + p_atomic_inc(&screen->refcount); + return pscreen; +} + +UNUSED static inline void +crocus_pscreen_unref(struct pipe_screen *pscreen) +{ + struct crocus_screen *screen = (struct crocus_screen *) pscreen; + + if (p_atomic_dec_zero(&screen->refcount)) + crocus_screen_destroy(screen); +} + +bool +crocus_is_format_supported(struct pipe_screen *pscreen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, + unsigned usage); + +void crocus_disk_cache_init(struct crocus_screen *screen); + +#endif diff --git a/src/gallium/drivers/crocus/crocus_state.c b/src/gallium/drivers/crocus/crocus_state.c new file mode 100644 index 00000000000..7202140df02 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_state.c @@ -0,0 +1,8382 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute,
sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_state.c + * + * ============================= GENXML CODE ============================= + * [This file is compiled once per generation.] + * ======================================================================= + * + * This is the main state upload code. + * + * Gallium uses Constant State Objects, or CSOs, for most state. Large, + * complex, or highly reusable state can be created once, and bound and + * rebound multiple times. This is modeled with the pipe->create_*_state() + * and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is + * streamed out on the fly, via pipe->set_*_state() hooks. + * + * OpenGL involves frequently mutating context state, which is mirrored in + * core Mesa by highly mutable data structures. However, most applications + * typically draw the same things over and over - from frame to frame, most + * of the same objects are still visible and need to be redrawn. So, rather + * than inventing new state all the time, applications usually mutate to swap + * between known states that we've seen before. + * + * Gallium isolates us from this mutation by tracking API state, and + * distilling it into a set of Constant State Objects, or CSOs. Large, + * complex, or typically reusable state can be created once, then reused + * multiple times. Drivers can create and store their own associated data. + * This create/bind model corresponds to the pipe->create_*_state() and + * pipe->bind_*_state() driver hooks. + * + * Some state is cheap to create, or expected to be highly dynamic. Rather + * than creating and caching piles of CSOs for these, Gallium simply streams + * them out, via the pipe->set_*_state() driver hooks. + * + * To reduce draw time overhead, we try to compute as much state at create + * time as possible. Wherever possible, we translate the Gallium pipe state + * to 3DSTATE commands, and store those commands in the CSO. At draw time, + * we can simply memcpy them into a batch buffer. + * + * No hardware matches the abstraction perfectly, so some commands require + * information from multiple CSOs. In this case, we can store two copies + * of the packet (one in each CSO), and simply | together their DWords at + * draw time. Sometimes the second set is trivial (one or two fields), so + * we simply pack it at draw time. + * + * There are two main components in the file below. First, the CSO hooks + * create/bind/track state. The second are the draw-time upload functions, + * crocus_upload_render_state() and crocus_upload_compute_state(), which read + * the context state and emit the commands into the actual batch. 
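To make the "pack at create time, OR the DWords together at draw time" idea above concrete, here is a minimal stand-alone sketch (not code from this patch; the structures, field layout, and names are hypothetical, and the real driver uses the genxml pack macros and batch emission helpers instead):

#include <stdint.h>
#include <stdio.h>

#define DEMO_PACKET_DWORDS 2

/* Each state object stores a partially packed copy of the same hypothetical
 * hardware packet; fields it does not own are left as zero. */
struct demo_raster_cso     { uint32_t sf[DEMO_PACKET_DWORDS]; };
struct demo_viewport_state { uint32_t sf[DEMO_PACKET_DWORDS]; };

/* Draw time: because the two copies packed disjoint fields, merging them is
 * a simple OR of the DWords before they are copied into the batch. */
static void demo_emit_sf(const struct demo_raster_cso *cso,
                         const struct demo_viewport_state *vp)
{
   uint32_t dws[DEMO_PACKET_DWORDS];
   for (int i = 0; i < DEMO_PACKET_DWORDS; i++)
      dws[i] = cso->sf[i] | vp->sf[i];
   printf("demo packet: 0x%08x 0x%08x\n", dws[0], dws[1]);
}

int main(void)
{
   /* "Create time": each state object packs only the fields it knows about. */
   struct demo_raster_cso cso    = { .sf = { 0x00ff0000u, 0x00000000u } };
   struct demo_viewport_state vp = { .sf = { 0x00000042u, 0x00000001u } };

   /* "Draw time": cheap merge and emit. */
   demo_emit_sf(&cso, &vp);
   return 0;
}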
+ */ + +#include +#include + +#if HAVE_VALGRIND +#include +#include +#define VG(x) x +#ifdef DEBUG +#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x)) +#endif +#else +#define VG(x) +#endif + +#include "drm-uapi/i915_drm.h" +#include "intel/common/intel_l3_config.h" +#include "intel/common/intel_sample_positions.h" +#include "intel/compiler/brw_compiler.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "util/format/u_format.h" +#include "util/half_float.h" +#include "util/u_dual_blend.h" +#include "util/u_framebuffer.h" +#include "util/u_helpers.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_prim.h" +#include "util/u_transfer.h" +#include "util/u_upload_mgr.h" +#include "util/u_viewport.h" +#include "crocus_batch.h" +#include "crocus_context.h" +#include "crocus_defines.h" +#include "crocus_pipe.h" +#include "crocus_resource.h" + +#include "crocus_genx_macros.h" +#include "intel/common/intel_guardband.h" + +/** + * Statically assert that PIPE_* enums match the hardware packets. + * (As long as they match, we don't need to translate them.) + */ +UNUSED static void pipe_asserts() +{ +#define PIPE_ASSERT(x) STATIC_ASSERT((int)x) + + /* pipe_logicop happens to match the hardware. */ + PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR); + PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR); + PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED); + PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED); + PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE); + PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT); + PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR); + PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND); + PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND); + PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV); + PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP); + PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED); + PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY); + PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE); + PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR); + PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET); + + /* pipe_blend_func happens to match the hardware. 
*/ + PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE); + PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR); + PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA); + PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA); + PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR); + PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE); + PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR); + PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA); + PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR); + PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA); + PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO); + PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR); + PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA); + PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA); + PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR); + PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR); + PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA); + PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR); + PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA); + + /* pipe_blend_func happens to match the hardware. */ + PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD); + PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT); + PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT); + PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN); + PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX); + + /* pipe_stencil_op happens to match the hardware. */ + PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP); + PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO); + PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE); + PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT); + PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT); + PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR); + PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR); + PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT); + +#if GFX_VER >= 6 + /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */ + PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT); + PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT); +#endif +#undef PIPE_ASSERT +} + +static unsigned +translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch) +{ + static const unsigned map[] = { + [PIPE_PRIM_POINTS] = _3DPRIM_POINTLIST, + [PIPE_PRIM_LINES] = _3DPRIM_LINELIST, + [PIPE_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP, + [PIPE_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP, + [PIPE_PRIM_TRIANGLES] = _3DPRIM_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN, + [PIPE_PRIM_QUADS] = _3DPRIM_QUADLIST, + [PIPE_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP, + [PIPE_PRIM_POLYGON] = _3DPRIM_POLYGON, +#if GFX_VER >= 6 + [PIPE_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ, +#endif +#if GFX_VER >= 7 + [PIPE_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1, +#endif + }; + + return map[prim] + (prim == PIPE_PRIM_PATCHES ? 
verts_per_patch : 0); +} + +static unsigned +translate_compare_func(enum pipe_compare_func pipe_func) +{ + static const unsigned map[] = { + [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER, + [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS, + [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL, + [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL, + [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER, + [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL, + [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL, + [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS, + }; + return map[pipe_func]; +} + +static unsigned +translate_shadow_func(enum pipe_compare_func pipe_func) +{ + /* Gallium specifies the result of shadow comparisons as: + * + * 1 if ref <op> texel, + * 0 otherwise. + * + * The hardware does: + * + * 0 if texel <op> ref, + * 1 otherwise. + * + * So we need to flip the operator and also negate. + */ + static const unsigned map[] = { + [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS, + [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL, + [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL, + [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS, + [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL, + [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL, + [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER, + [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER, + }; + return map[pipe_func]; +} + +static unsigned +translate_cull_mode(unsigned pipe_face) +{ + static const unsigned map[4] = { + [PIPE_FACE_NONE] = CULLMODE_NONE, + [PIPE_FACE_FRONT] = CULLMODE_FRONT, + [PIPE_FACE_BACK] = CULLMODE_BACK, + [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH, + }; + return map[pipe_face]; +} + +#if GFX_VER >= 6 +static unsigned +translate_fill_mode(unsigned pipe_polymode) +{ + static const unsigned map[4] = { + [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID, + [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME, + [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT, + [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID, + }; + return map[pipe_polymode]; +} +#endif + +static unsigned +translate_mip_filter(enum pipe_tex_mipfilter pipe_mip) +{ + static const unsigned map[] = { + [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST, + [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR, + [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE, + }; + return map[pipe_mip]; +} + +static uint32_t +translate_wrap(unsigned pipe_wrap, bool either_nearest) +{ + static const unsigned map[] = { + [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP, + [PIPE_TEX_WRAP_CLAMP] = TCM_CLAMP_BORDER, + [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP, + [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER, + [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR, + [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE, + + /* These are unsupported. 
*/ + [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1, + [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1, + }; + if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest) + return TCM_CLAMP; + return map[pipe_wrap]; +} + +/** + * Equiv if brw_state_batch + */ +static uint32_t * +stream_state(struct crocus_batch *batch, + unsigned size, + unsigned alignment, + uint32_t *out_offset) +{ + uint32_t offset = ALIGN(batch->state.used, alignment); + + if (offset + size >= STATE_SZ && !batch->no_wrap) { + crocus_batch_flush(batch); + offset = ALIGN(batch->state.used, alignment); + } else if (offset + size >= batch->state.bo->size) { + const unsigned new_size = + MIN2(batch->state.bo->size + batch->state.bo->size / 2, + MAX_STATE_SIZE); + crocus_grow_buffer(batch, true, batch->state.used, new_size); + assert(offset + size < batch->state.bo->size); + } + + crocus_record_state_size(batch->state_sizes, offset, size); + + batch->state.used = offset + size; + *out_offset = offset; + + return (uint32_t *)batch->state.map + (offset >> 2); +} + +/** + * stream_state() + memcpy. + */ +static uint32_t +emit_state(struct crocus_batch *batch, const void *data, unsigned size, + unsigned alignment) +{ + unsigned offset = 0; + uint32_t *map = stream_state(batch, size, alignment, &offset); + + if (map) + memcpy(map, data, size); + + return offset; +} + +#if GFX_VER <= 5 +static void +upload_pipelined_state_pointers(struct crocus_batch *batch, + bool gs_active, uint32_t gs_offset, + uint32_t vs_offset, uint32_t sf_offset, + uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset) +{ +#if GFX_VER == 5 + /* Need to flush before changing clip max threads for errata. */ + crocus_emit_cmd(batch, GENX(MI_FLUSH), foo); +#endif + + crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) { + pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset); + pp.GSEnable = gs_active; + if (gs_active) + pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset); + pp.ClipEnable = true; + pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset); + pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset); + pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset); + pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset); + } +} + +#endif +/** + * Did field 'x' change between 'old_cso' and 'new_cso'? + * + * (If so, we may want to set some dirty flags.) + */ +#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x)) +#define cso_changed_memcmp(x) \ + (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0) + +static void +flush_before_state_base_change(struct crocus_batch *batch) +{ +#if GFX_VER >= 6 + /* Flush before emitting STATE_BASE_ADDRESS. + * + * This isn't documented anywhere in the PRM. However, it seems to be + * necessary prior to changing the surface state base adress. We've + * seen issues in Vulkan where we get GPU hangs when using multi-level + * command buffers which clear depth, reset state base address, and then + * go render stuff. + * + * Normally, in GL, we would trust the kernel to do sufficient stalls + * and flushes prior to executing our batch. However, it doesn't seem + * as if the kernel's flushing is always sufficient and we don't want to + * rely on it. + * + * We make this an end-of-pipe sync instead of a normal flush because we + * do not know the current status of the GPU. On Haswell at least, + * having a fast-clear operation in flight at the same time as a normal + * rendering operation can cause hangs. 
Since the kernel's flushing is + * insufficient, we need to ensure that any rendering operations from + * other processes are definitely complete before we try to do our own + * rendering. It's a bit of a big hammer but it appears to work. + */ + const unsigned dc_flush = + batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0; + crocus_emit_end_of_pipe_sync(batch, + "change STATE_BASE_ADDRESS (flushes)", + PIPE_CONTROL_RENDER_TARGET_FLUSH | + dc_flush | + PIPE_CONTROL_DEPTH_CACHE_FLUSH); +#endif +} + +static void +flush_after_state_base_change(struct crocus_batch *batch) +{ + /* After re-setting the surface state base address, we have to do some + * cache flusing so that the sampler engine will pick up the new + * SURFACE_STATE objects and binding tables. From the Broadwell PRM, + * Shared Function > 3D Sampler > State > State Caching (page 96): + * + * Coherency with system memory in the state cache, like the texture + * cache is handled partially by software. It is expected that the + * command stream or shader will issue Cache Flush operation or + * Cache_Flush sampler message to ensure that the L1 cache remains + * coherent with system memory. + * + * [...] + * + * Whenever the value of the Dynamic_State_Base_Addr, + * Surface_State_Base_Addr are altered, the L1 state cache must be + * invalidated to ensure the new surface or sampler state is fetched + * from system memory. + * + * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit + * which, according the PIPE_CONTROL instruction documentation in the + * Broadwell PRM: + * + * Setting this bit is independent of any other bit in this packet. + * This bit controls the invalidation of the L1 and L2 state caches + * at the top of the pipe i.e. at the parsing time. + * + * Unfortunately, experimentation seems to indicate that state cache + * invalidation through a PIPE_CONTROL does nothing whatsoever in + * regards to surface state and binding tables. In stead, it seems that + * invalidating the texture cache is what is actually needed. + * + * XXX: As far as we have been able to determine through + * experimentation, shows that flush the texture cache appears to be + * sufficient. The theory here is that all of the sampling/rendering + * units cache the binding table in the texture cache. However, we have + * yet to be able to actually confirm this. 
+ */ +#if GFX_VER >= 6 + crocus_emit_end_of_pipe_sync(batch, + "change STATE_BASE_ADDRESS (invalidates)", + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE); +#endif +} + +#if GFX_VER >= 6 +static void +crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg, + struct crocus_bo *bo, uint32_t offset, + bool predicated) +{ + crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) { + srm.RegisterAddress = reg; + srm.MemoryAddress = ggtt_bo(bo, offset); +#if GFX_VERx10 == 75 + srm.PredicateEnable = predicated; +#else + if (predicated) + unreachable("unsupported predication"); +#endif + } +} + +static void +crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg, + struct crocus_bo *bo, uint32_t offset, + bool predicated) +{ + crocus_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated); + crocus_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated); +} +#endif + +#if GFX_VER >= 7 +static void +_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val) +{ + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = reg; + lri.DataDWord = val; + } +} +#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v) + +#if GFX_VERx10 == 75 +static void +_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src) +{ + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) { + lrr.SourceRegisterAddress = src; + lrr.DestinationRegisterAddress = dst; + } +} + +static void +crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst, + uint32_t src) +{ + _crocus_emit_lrr(batch, dst, src); +} + +static void +crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst, + uint32_t src) +{ + _crocus_emit_lrr(batch, dst, src); + _crocus_emit_lrr(batch, dst + 4, src + 4); +} +#endif + +static void +crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg, + uint32_t val) +{ + _crocus_emit_lri(batch, reg, val); +} + +static void +crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg, + uint64_t val) +{ + _crocus_emit_lri(batch, reg + 0, val & 0xffffffff); + _crocus_emit_lri(batch, reg + 4, val >> 32); +} + +/** + * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer. + */ +static void +crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg, + struct crocus_bo *bo, uint32_t offset) +{ + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = reg; + lrm.MemoryAddress = ro_bo(bo, offset); + } +} + +/** + * Load a 64-bit value from a buffer into a MMIO register via + * two MI_LOAD_REGISTER_MEM commands. 
+ */ +static void +crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg, + struct crocus_bo *bo, uint32_t offset) +{ + crocus_load_register_mem32(batch, reg + 0, bo, offset + 0); + crocus_load_register_mem32(batch, reg + 4, bo, offset + 4); +} + +#if GFX_VERx10 == 75 +static void +crocus_store_data_imm32(struct crocus_batch *batch, + struct crocus_bo *bo, uint32_t offset, + uint32_t imm) +{ + crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = rw_bo(bo, offset); +#if GFX_VER >= 6 + sdi.ImmediateData = imm; +#endif + } +} + +static void +crocus_store_data_imm64(struct crocus_batch *batch, + struct crocus_bo *bo, uint32_t offset, + uint64_t imm) +{ + /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of + * 2 in genxml but it's actually variable length and we need 5 DWords. + */ + void *map = crocus_get_command_space(batch, 4 * 5); + _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) { + sdi.DWordLength = 5 - 2; + sdi.Address = rw_bo(bo, offset); +#if GFX_VER >= 6 + sdi.ImmediateData = imm; +#endif + } +} +#endif + +static void +crocus_copy_mem_mem(struct crocus_batch *batch, + struct crocus_bo *dst_bo, uint32_t dst_offset, + struct crocus_bo *src_bo, uint32_t src_offset, + unsigned bytes) +{ + assert(bytes % 4 == 0); + assert(dst_offset % 4 == 0); + assert(src_offset % 4 == 0); + +#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */ + for (unsigned i = 0; i < bytes; i += 4) { + crocus_load_register_mem32(batch, CROCUS_TEMP_REG, + src_bo, src_offset + i); + crocus_store_register_mem32(batch, CROCUS_TEMP_REG, + dst_bo, dst_offset + i, false); + } +} +#endif + +/** + * Gallium CSO for rasterizer state. + */ +struct crocus_rasterizer_state { + struct pipe_rasterizer_state cso; +#if GFX_VER >= 6 + uint32_t sf[GENX(3DSTATE_SF_length)]; + uint32_t clip[GENX(3DSTATE_CLIP_length)]; +#endif + uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)]; + + uint8_t num_clip_plane_consts; + bool fill_mode_point_or_line; +}; + +#if GFX_VER <= 5 +#define URB_VS 0 +#define URB_GS 1 +#define URB_CLP 2 +#define URB_SF 3 +#define URB_CS 4 + +static const struct { + uint32_t min_nr_entries; + uint32_t preferred_nr_entries; + uint32_t min_entry_size; + uint32_t max_entry_size; +} limits[URB_CS+1] = { + { 16, 32, 1, 5 }, /* vs */ + { 4, 8, 1, 5 }, /* gs */ + { 5, 10, 1, 5 }, /* clp */ + { 1, 8, 1, 12 }, /* sf */ + { 1, 4, 1, 32 } /* cs */ +}; + +static bool check_urb_layout(struct crocus_context *ice) +{ + ice->urb.vs_start = 0; + ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize; + ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize; + ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize; + ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize; + + return ice->urb.cs_start + ice->urb.nr_cs_entries * + ice->urb.csize <= ice->urb.size; +} + + +static bool +crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize, + unsigned vsize, unsigned sfsize) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + struct crocus_context *ice = batch->ice; + if (csize < limits[URB_CS].min_entry_size) + csize = limits[URB_CS].min_entry_size; + + if (vsize < limits[URB_VS].min_entry_size) + vsize = limits[URB_VS].min_entry_size; + + if (sfsize < limits[URB_SF].min_entry_size) + sfsize = limits[URB_SF].min_entry_size; + + if (ice->urb.vsize < vsize || + ice->urb.sfsize < sfsize || + ice->urb.csize < csize || + (ice->urb.constrained && 
(ice->urb.vsize > vsize || + ice->urb.sfsize > sfsize || + ice->urb.csize > csize))) { + + + ice->urb.csize = csize; + ice->urb.sfsize = sfsize; + ice->urb.vsize = vsize; + + ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries; + ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries; + ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries; + ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries; + ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries; + + ice->urb.constrained = 0; + + if (devinfo->ver == 5) { + ice->urb.nr_vs_entries = 128; + ice->urb.nr_sf_entries = 48; + if (check_urb_layout(ice)) { + goto done; + } else { + ice->urb.constrained = 1; + ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries; + ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries; + } + } else if (devinfo->is_g4x) { + ice->urb.nr_vs_entries = 64; + if (check_urb_layout(ice)) { + goto done; + } else { + ice->urb.constrained = 1; + ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries; + } + } + + if (!check_urb_layout(ice)) { + ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries; + ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries; + ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries; + ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries; + ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries; + + /* Mark us as operating with constrained nr_entries, so that next + * time we recalculate we'll resize the fences in the hope of + * escaping constrained mode and getting back to normal performance. + */ + ice->urb.constrained = 1; + + if (!check_urb_layout(ice)) { + /* This is impossible, given the maximal sizes of urb + * entries and the values for minimum nr of entries + * provided above. + */ + fprintf(stderr, "couldn't calculate URB layout!\n"); + exit(1); + } + + if (unlikely(INTEL_DEBUG & (DEBUG_URB|DEBUG_PERF))) + fprintf(stderr, "URB CONSTRAINED\n"); + } + +done: + if (unlikely(INTEL_DEBUG & DEBUG_URB)) + fprintf(stderr, + "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. 
%d\n", + ice->urb.vs_start, + ice->urb.gs_start, + ice->urb.clip_start, + ice->urb.sf_start, + ice->urb.cs_start, + ice->urb.size); + return true; + } + return false; +} + +static void +crocus_upload_urb_fence(struct crocus_batch *batch) +{ + uint32_t urb_fence[3]; + _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) { + urb.VSUnitURBReallocationRequest = 1; + urb.GSUnitURBReallocationRequest = 1; + urb.CLIPUnitURBReallocationRequest = 1; + urb.SFUnitURBReallocationRequest = 1; + urb.VFEUnitURBReallocationRequest = 1; + urb.CSUnitURBReallocationRequest = 1; + + urb.VSFence = batch->ice->urb.gs_start; + urb.GSFence = batch->ice->urb.clip_start; + urb.CLIPFence = batch->ice->urb.sf_start; + urb.SFFence = batch->ice->urb.cs_start; + urb.CSFence = batch->ice->urb.size; + } + + /* erratum: URB_FENCE must not cross a 64byte cacheline */ + if ((crocus_batch_bytes_used(batch) & 15) > 12) { + int pad = 16 - (crocus_batch_bytes_used(batch) & 15); + do { + *(uint32_t *)batch->command.map_next = 0; + batch->command.map_next += sizeof(uint32_t); + } while (--pad); + } + + crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3); +} + +static bool +calculate_curbe_offsets(struct crocus_batch *batch) +{ + struct crocus_context *ice = batch->ice; + + unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0; + unsigned total_regs; + + nr_fp_regs = 0; + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i]; + if (range->length == 0) + continue; + + /* ubo range tracks at 256-bit, we need 512-bit */ + nr_fp_regs += (range->length + 1) / 2; + } + + if (ice->state.cso_rast->cso.clip_plane_enable) { + unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable); + nr_clip_regs = (nr_planes * 4 + 15) / 16; + } + + nr_vp_regs = 0; + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i]; + if (range->length == 0) + continue; + + /* ubo range tracks at 256-bit, we need 512-bit */ + nr_vp_regs += (range->length + 1) / 2; + } + if (nr_vp_regs == 0) { + /* The pre-gen6 VS requires that some push constants get loaded no + * matter what, or the GPU would hang. + */ + nr_vp_regs = 1; + } + total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs; + + /* The CURBE allocation size is limited to 32 512-bit units (128 EU + * registers, or 1024 floats). See CS_URB_STATE in the gen4 or gen5 + * (volume 1, part 1) PRMs. + * + * Note that in brw_fs.cpp we're only loading up to 16 EU registers of + * values as push constants before spilling to pull constants, and in + * brw_vec4.cpp we're loading up to 32 registers of push constants. An EU + * register is 1/2 of one of these URB entry units, so that leaves us 16 EU + * regs for clip. 
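+ *
+ * Purely as an illustration (made-up sizes, not taken from any real
+ * workload): with nr_fp_regs = 4, nr_clip_regs = 2 and nr_vp_regs = 2,
+ * the layout computed below would be wm_start = 0, clip_start = 4,
+ * vs_start = 6 and total_size = 8 of the 32 available 512-bit units.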
+ */ + assert(total_regs <= 32); + + /* Lazy resize: + */ + if (nr_fp_regs > ice->curbe.wm_size || + nr_vp_regs > ice->curbe.vs_size || + nr_clip_regs != ice->curbe.clip_size || + (total_regs < ice->curbe.total_size / 4 && + ice->curbe.total_size > 16)) { + + GLuint reg = 0; + + /* Calculate a new layout: + */ + reg = 0; + ice->curbe.wm_start = reg; + ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs; + ice->curbe.clip_start = reg; + ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs; + ice->curbe.vs_start = reg; + ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs; + ice->curbe.total_size = reg; + + if (0) + fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n", + ice->curbe.wm_start, + ice->curbe.wm_size, + ice->curbe.clip_start, + ice->curbe.clip_size, + ice->curbe.vs_start, + ice->curbe.vs_size ); + return true; + } + return false; +} + +static void +upload_shader_consts(struct crocus_context *ice, + gl_shader_stage stage, + uint32_t *map, + unsigned start) +{ + struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; + struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; + uint32_t *cmap; + bool found = false; + unsigned offset = start * 16; + int total = 0; + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + + if (range->length == 0) + continue; + + unsigned block_index = crocus_bti_to_group_index( + &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block); + unsigned len = range->length * 8 * sizeof(float); + unsigned start = range->start * 8 * sizeof(float); + struct pipe_transfer *transfer; + + cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer, + ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len, + PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer); + if (cmap) + memcpy(&map[offset + (total * 8)], cmap, len); + pipe_buffer_unmap(&ice->ctx, transfer); + total += range->length; + found = true; + } + + if (stage == MESA_SHADER_VERTEX && !found) { + /* The pre-gen6 VS requires that some push constants get loaded no + * matter what, or the GPU would hang. 
+ */ + unsigned len = 16; + memset(&map[offset], 0, len); + } +} + +static const float fixed_plane[6][4] = { + { 0, 0, -1, 1 }, + { 0, 0, 1, 1 }, + { 0, -1, 0, 1 }, + { 0, 1, 0, 1 }, + {-1, 0, 0, 1 }, + { 1, 0, 0, 1 } +}; + +static void +gen4_upload_curbe(struct crocus_batch *batch) +{ + struct crocus_context *ice = batch->ice; + const unsigned sz = ice->curbe.total_size; + const unsigned buf_sz = sz * 16 * sizeof(float); + + if (sz == 0) + goto emit; + + uint32_t *map; + u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64, + &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map); + + /* fragment shader constants */ + if (ice->curbe.wm_size) { + upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start); + } + + /* clipper constants */ + if (ice->curbe.clip_size) { + unsigned offset = ice->curbe.clip_start * 16; + float *fmap = (float *)map; + unsigned i; + /* If any planes are going this way, send them all this way: + */ + for (i = 0; i < 6; i++) { + fmap[offset + i * 4 + 0] = fixed_plane[i][0]; + fmap[offset + i * 4 + 1] = fixed_plane[i][1]; + fmap[offset + i * 4 + 2] = fixed_plane[i][2]; + fmap[offset + i * 4 + 3] = fixed_plane[i][3]; + } + + unsigned mask = ice->state.cso_rast->cso.clip_plane_enable; + struct pipe_clip_state *cp = &ice->state.clip_planes; + while (mask) { + const int j = u_bit_scan(&mask); + fmap[offset + i * 4 + 0] = cp->ucp[j][0]; + fmap[offset + i * 4 + 1] = cp->ucp[j][1]; + fmap[offset + i * 4 + 2] = cp->ucp[j][2]; + fmap[offset + i * 4 + 3] = cp->ucp[j][3]; + i++; + } + } + + /* vertex shader constants */ + if (ice->curbe.vs_size) { + upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start); + } + if (0) { + for (int i = 0; i < sz*16; i+=4) { + float *f = (float *)map; + fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4, + f[i+0], f[i+1], f[i+2], f[i+3]); + } + } + +emit: + crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) { + if (ice->curbe.curbe_res) { + cb.BufferLength = ice->curbe.total_size - 1; + cb.Valid = 1; + cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset); + } + } + +#if GFX_VER == 4 && GFX_VERx10 != 45 + /* Work around a Broadwater/Crestline depth interpolator bug. The + * following sequence will cause GPU hangs: + * + * 1. Change state so that all depth related fields in CC_STATE are + * disabled, and in WM_STATE, only "PS Use Source Depth" is enabled. + * 2. Emit a CONSTANT_BUFFER packet. + * 3. Draw via 3DPRIMITIVE. + * + * The recommended workaround is to emit a non-pipelined state change after + * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline. + * + * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small), + * and always emit it when "PS Use Source Depth" is set. We could be more + * precise, but the additional complexity is probably not worth it. 
+ * + */ + const struct shader_info *fs_info = + crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT); + + if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) { + ice->state.global_depth_offset_clamp = 0; + crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp); + } +#endif +} +#endif + +#if GFX_VER == 7 + +#define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000 +#define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000 +#define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000 + +static void +setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL]; + const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] || + cfg->n[INTEL_L3P_ALL]; + const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] || + cfg->n[INTEL_L3P_ALL]; + const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] || + cfg->n[INTEL_L3P_ALL]; + const bool has_slm = cfg->n[INTEL_L3P_SLM]; + + /* According to the hardware docs, the L3 partitioning can only be changed + * while the pipeline is completely drained and the caches are flushed, + * which involves a first PIPE_CONTROL flush which stalls the pipeline... + */ + crocus_emit_pipe_control_flush(batch, "l3_config", + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_CS_STALL); + + /* ...followed by a second pipelined PIPE_CONTROL that initiates + * invalidation of the relevant caches. Note that because RO invalidation + * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL + * command is processed by the CS) we cannot combine it with the previous + * stalling flush as the hardware documentation suggests, because that + * would cause the CS to stall on previous rendering *after* RO + * invalidation and wouldn't prevent the RO caches from being polluted by + * concurrent rendering before the stall completes. This intentionally + * doesn't implement the SKL+ hardware workaround suggesting to enable CS + * stall on PIPE_CONTROLs with the texture cache invalidation bit set for + * GPGPU workloads because the previous and subsequent PIPE_CONTROLs + * already guarantee that there is no concurrent GPGPU kernel execution + * (see SKL HSD 2132585). + */ + crocus_emit_pipe_control_flush(batch, "l3 config", + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE); + + /* Now send a third stalling flush to make sure that invalidation is + * complete when the L3 configuration registers are modified. + */ + crocus_emit_pipe_control_flush(batch, "l3 config", + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_CS_STALL); + + + assert(!cfg->n[INTEL_L3P_ALL]); + + /* When enabled SLM only uses a portion of the L3 on half of the banks, + * the matching space on the remaining banks has to be allocated to a + * client (URB for all validated configurations) set to the + * lower-bandwidth 2-bank address hashing mode. + */ + const bool urb_low_bw = has_slm && !devinfo->is_baytrail; + assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]); + + /* Minimum number of ways that can be allocated to the URB. */ + const unsigned n0_urb = (devinfo->is_baytrail ? 
32 : 0); + assert(cfg->n[INTEL_L3P_URB] >= n0_urb); + + uint32_t l3sqcr1, l3cr2, l3cr3; + + crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) { + reg.ConvertDC_UC = !has_dc; + reg.ConvertIS_UC = !has_is; + reg.ConvertC_UC = !has_c; + reg.ConvertT_UC = !has_t; +#if GFX_VERx10 == 75 + reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT; +#else + reg.L3SQGeneralPriorityCreditInitialization = + devinfo->is_baytrail ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT; +#endif + reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT; + }; + + crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) { + reg.SLMEnable = has_slm; + reg.URBLowBandwidth = urb_low_bw; + reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb; +#if !(GFX_VERx10 == 75) + reg.ALLAllocation = cfg->n[INTEL_L3P_ALL]; +#endif + reg.ROAllocation = cfg->n[INTEL_L3P_RO]; + reg.DCAllocation = cfg->n[INTEL_L3P_DC]; + }; + + crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) { + reg.ISAllocation = cfg->n[INTEL_L3P_IS]; + reg.ISLowBandwidth = 0; + reg.CAllocation = cfg->n[INTEL_L3P_C]; + reg.CLowBandwidth = 0; + reg.TAllocation = cfg->n[INTEL_L3P_T]; + reg.TLowBandwidth = 0; + }; + + /* Set up the L3 partitioning. */ + crocus_emit_lri(batch, L3SQCREG1, l3sqcr1); + crocus_emit_lri(batch, L3CNTLREG2, l3cr2); + crocus_emit_lri(batch, L3CNTLREG3, l3cr3); + +#if GFX_VERSIONx10 == 75 + /* TODO: Fail screen creation if command parser version < 4 */ + uint32_t scratch1, chicken3; + crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) { + reg.L3AtomicDisable = !has_dc; + } + crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) { + reg.L3AtomicDisableMask = true; + reg.L3AtomicDisable = !has_dc; + } + crocus_emit_lri(batch, SCRATCH1, scratch1); + crocus_emit_lri(batch, CHICKEN3, chicken3); +#endif +} + +static void +emit_l3_state(struct crocus_batch *batch, bool compute) +{ + const struct intel_l3_config *const cfg = + compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d; + + setup_l3_config(batch, cfg); + if (unlikely(INTEL_DEBUG & DEBUG_L3)) { + intel_dump_l3_config(cfg, stderr); + } +} + +/** + * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set. + */ +static void +gen7_emit_cs_stall_flush(struct crocus_batch *batch) +{ + crocus_emit_pipe_control_write(batch, + "workaround", + PIPE_CONTROL_CS_STALL + | PIPE_CONTROL_WRITE_IMMEDIATE, + batch->ice->workaround_bo, + batch->ice->workaround_offset, 0); +} +#endif + +static void +emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline) +{ +#if GFX_VER >= 6 + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * "Project: DEVSNB+ + * + * Software must ensure all the write caches are flushed through a + * stalling PIPE_CONTROL command followed by another PIPE_CONTROL + * command to invalidate read only caches prior to programming + * MI_PIPELINE_SELECT command to change the Pipeline Select Mode." + */ + const unsigned dc_flush = + batch->screen->devinfo.ver >= 7 ? 
PIPE_CONTROL_DATA_CACHE_FLUSH : 0; + crocus_emit_pipe_control_flush(batch, + "workaround: PIPELINE_SELECT flushes (1/2)", + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + dc_flush | + PIPE_CONTROL_CS_STALL); + + crocus_emit_pipe_control_flush(batch, + "workaround: PIPELINE_SELECT flushes (2/2)", + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE | + PIPE_CONTROL_INSTRUCTION_INVALIDATE); +#else + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: PRE-DEVSNB + * + * Software must ensure the current pipeline is flushed via an + * MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT. + */ + crocus_emit_cmd(batch, GENX(MI_FLUSH), foo); +#endif + + crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) { + sel.PipelineSelection = pipeline; + } + +#if GFX_VER == 7 && !(GFX_VERx10 == 75) + if (pipeline == _3D) { + gen7_emit_cs_stall_flush(batch); + + crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) { + prim.PrimitiveTopologyType = _3DPRIM_POINTLIST; + }; + } +#endif +} + +/** + * The following diagram shows how we partition the URB: + * + * 16kB or 32kB Rest of the URB space + * __________-__________ _________________-_________________ + * / \ / \ + * +-------------------------------------------------------------+ + * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB | + * | Constants | Entries | + * +-------------------------------------------------------------+ + * + * Notably, push constants must be stored at the beginning of the URB + * space, while entries can be stored anywhere. Ivybridge and Haswell + * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3 + * doubles this (32kB). + * + * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and + * sized) in increments of 1kB. Haswell GT3 requires them to be located and + * sized in increments of 2kB. + * + * Currently we split the constant buffer space evenly among whatever stages + * are active. This is probably not ideal, but simple. + * + * Ivybridge GT1 and Haswell GT1 have 128kB of URB space. + * Ivybridge GT2 and Haswell GT2 have 256kB of URB space. + * Haswell GT3 has 512kB of URB space. + * + * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations", + * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS. + */ +#if GFX_VER == 7 +static void +crocus_alloc_push_constants(struct crocus_batch *batch) +{ +#if GFX_VERx10 == 75 + const unsigned push_constant_kb = batch->screen->devinfo.gt == 3 ? 32 : 16; +#else + const unsigned push_constant_kb = 16; +#endif + unsigned size_per_stage = push_constant_kb / 5; + + /* For now, we set a static partitioning of the push constant area, + * assuming that all stages could be in use. + * + * TODO: Try lazily allocating the HS/DS/GS sections as needed, and + * see if that improves performance by offering more space to + * the VS/FS when those aren't in use. Also, try dynamically + * enabling/disabling it like i965 does. This would be more + * stalls and may not actually help; we don't know yet. + */ + for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) { + crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) { + alloc._3DCommandSubOpcode = 18 + i; + alloc.ConstantBufferOffset = size_per_stage * i; + alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? 
(push_constant_kb - 4 * size_per_stage) : size_per_stage; + } + } + + /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS): + * + * A PIPE_CONTROL command with the CS Stall bit set must be programmed + * in the ring after this instruction. + * + * No such restriction exists for Haswell or Baytrail. + */ + if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail) + gen7_emit_cs_stall_flush(batch); +} +#endif + +/** + * Upload the initial GPU state for a render context. + * + * This sets some invariant state that needs to be programmed a particular + * way, but we never actually change. + */ +static void +crocus_init_render_context(struct crocus_batch *batch) +{ + UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo; + + emit_pipeline_select(batch, _3D); + + crocus_emit_cmd(batch, GENX(STATE_SIP), foo); + +#if GFX_VER == 7 + emit_l3_state(batch, false); +#endif +#if GFX_VER == 7 && GFX_VERx10 != 75 + crocus_emit_reg(batch, GENX(INSTPM), reg) { + reg.CONSTANT_BUFFERAddressOffsetDisable = true; + reg.CONSTANT_BUFFERAddressOffsetDisableMask = true; + } +#endif +#if GFX_VER >= 5 || GFX_VERx10 == 45 + /* Use the legacy AA line coverage computation. */ + crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo); +#endif + + /* No polygon stippling offsets are necessary. */ + /* TODO: may need to set an offset for origin-UL framebuffers */ + crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo); + +#if GFX_VER == 7 + crocus_alloc_push_constants(batch); +#endif +} + +#if GFX_VER == 7 +static void +crocus_init_compute_context(struct crocus_batch *batch) +{ + UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo; + + emit_pipeline_select(batch, GPGPU); + +#if GFX_VER == 7 + emit_l3_state(batch, true); +#endif +} +#endif + +/** + * Generation-specific context state (ice->state.genx->...). + * + * Most state can go in crocus_context directly, but these encode hardware + * packets which vary by generation. + */ +struct crocus_genx_state { + struct { +#if GFX_VER == 7 + struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES]; +#endif + } shaders[MESA_SHADER_STAGES]; +}; + +/** + * The pipe->set_blend_color() driver hook. + * + * This corresponds to our COLOR_CALC_STATE. + */ +static void +crocus_set_blend_color(struct pipe_context *ctx, + const struct pipe_blend_color *state) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + + /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */ + memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color)); +#if GFX_VER <= 5 + ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR; +#else + ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE; +#endif +} + +/** + * Gallium CSO for blend state (see pipe_blend_state). + */ +struct crocus_blend_state { + /** copy of BLEND_STATE */ + struct pipe_blend_state cso; + + /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */ + uint8_t blend_enables; + + /** Bitfield of whether color writes are enabled for RT[i] */ + uint8_t color_write_enables; + + /** Does RT[0] use dual color blending? */ + bool dual_color_blending; +}; + +#if GFX_VER >= 6 +static enum pipe_blendfactor +fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one) +{ + if (alpha_to_one) { + if (f == PIPE_BLENDFACTOR_SRC1_ALPHA) + return PIPE_BLENDFACTOR_ONE; + + if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA) + return PIPE_BLENDFACTOR_ZERO; + } + + return f; +} +#endif + +/** + * The pipe->create_blend_state() driver hook. 
+ * + * Translates a pipe_blend_state into crocus_blend_state. + */ +static void * +crocus_create_blend_state(struct pipe_context *ctx, + const struct pipe_blend_state *state) +{ + struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state)); + + cso->blend_enables = 0; + cso->color_write_enables = 0; + STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8); + + cso->cso = *state; + cso->dual_color_blending = util_blend_state_is_dual(state, 0); + for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) { + const struct pipe_rt_blend_state *rt = + &state->rt[state->independent_blend_enable ? i : 0]; + if (rt->blend_enable) + cso->blend_enables |= 1u << i; + if (rt->colormask) + cso->color_write_enables |= 1u << i; + } + return cso; +} + +/** + * The pipe->bind_blend_state() driver hook. + * + * Bind a blending CSO and flag related dirty bits. + */ +static void +crocus_bind_blend_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_blend_state *cso = state; + + ice->state.cso_blend = cso; + ice->state.blend_enables = cso ? cso->blend_enables : 0; + + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS; + ice->state.dirty |= CROCUS_DIRTY_WM; +#if GFX_VER >= 6 + ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE; +#endif +#if GFX_VER >= 7 + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS; +#endif + ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE; + ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; + ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND]; +} + +/** + * Return true if the FS writes to any color outputs which are not disabled + * via color masking. + */ +static bool +has_writeable_rt(const struct crocus_blend_state *cso_blend, + const struct shader_info *fs_info) +{ + if (!fs_info) + return false; + + unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0; + + if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR)) + rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1; + + return cso_blend->color_write_enables & rt_outputs; +} + +/** + * Gallium CSO for depth, stencil, and alpha testing state. + */ +struct crocus_depth_stencil_alpha_state { + struct pipe_depth_stencil_alpha_state cso; + + bool depth_writes_enabled; + bool stencil_writes_enabled; +}; + +/** + * The pipe->create_depth_stencil_alpha_state() driver hook. + * + * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha + * testing state since we need pieces of it in a variety of places. + */ +static void * +crocus_create_zsa_state(struct pipe_context *ctx, + const struct pipe_depth_stencil_alpha_state *state) +{ + struct crocus_depth_stencil_alpha_state *cso = + malloc(sizeof(struct crocus_depth_stencil_alpha_state)); + + bool two_sided_stencil = state->stencil[1].enabled; + cso->cso = *state; + + cso->depth_writes_enabled = state->depth_writemask; + cso->stencil_writes_enabled = + state->stencil[0].writemask != 0 || + (two_sided_stencil && state->stencil[1].writemask != 0); + + /* The state tracker needs to optimize away EQUAL writes for us. */ + assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask)); + + return cso; +} + +/** + * The pipe->bind_depth_stencil_alpha_state() driver hook. + * + * Bind a depth/stencil/alpha CSO and flag related dirty bits. 
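+ *
+ * Note that the cso_changed(x) checks below expand to
+ * (!old_cso || old_cso->x != new_cso->x), so every field is also treated
+ * as changed the first time a ZSA CSO is bound.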
+ */ +static void +crocus_bind_zsa_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa; + struct crocus_depth_stencil_alpha_state *new_cso = state; + + if (new_cso) { + if (cso_changed(cso.alpha_ref_value)) + ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE; + +#if GFX_VER >= 6 + if (cso_changed(cso.alpha_enabled)) + ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE; + + if (cso_changed(cso.alpha_func)) + ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE; +#endif + + if (cso_changed(depth_writes_enabled)) + ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; + + ice->state.depth_writes_enabled = new_cso->depth_writes_enabled; + ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled; + +#if GFX_VER <= 5 + ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE; +#endif + } + + ice->state.cso_zsa = new_cso; + ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT; +#if GFX_VER >= 6 + ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL; +#endif + ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA]; +} + +static float +get_line_width(const struct pipe_rasterizer_state *state) +{ + float line_width = state->line_width; + + /* From the OpenGL 4.4 spec: + * + * "The actual width of non-antialiased lines is determined by rounding + * the supplied width to the nearest integer, then clamping it to the + * implementation-dependent maximum non-antialiased line width." + */ + if (!state->multisample && !state->line_smooth) + line_width = roundf(state->line_width); + + if (!state->multisample && state->line_smooth && line_width < 1.5f) { + /* For 1 pixel line thickness or less, the general anti-aliasing + * algorithm gives up, and a garbage line is generated. Setting a + * Line Width of 0.0 specifies the rasterization of the "thinnest" + * (one-pixel-wide), non-antialiased lines. + * + * Lines rendered with zero Line Width are rasterized using the + * "Grid Intersection Quantization" rules as specified by the + * "Zero-Width (Cosmetic) Line Rasterization" section of the docs. + */ + line_width = 0.0f; + } + + return line_width; +} + +/** + * The pipe->create_rasterizer_state() driver hook. + */ +static void * +crocus_create_rasterizer_state(struct pipe_context *ctx, + const struct pipe_rasterizer_state *state) +{ + struct crocus_rasterizer_state *cso = + malloc(sizeof(struct crocus_rasterizer_state)); + + cso->fill_mode_point_or_line = + state->fill_front == PIPE_POLYGON_MODE_LINE || + state->fill_front == PIPE_POLYGON_MODE_POINT || + state->fill_back == PIPE_POLYGON_MODE_LINE || + state->fill_back == PIPE_POLYGON_MODE_POINT; + + if (state->clip_plane_enable != 0) + cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1; + else + cso->num_clip_plane_consts = 0; + + cso->cso = *state; + +#if GFX_VER >= 6 + float line_width = get_line_width(state); + + crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) { + sf.StatisticsEnable = true; + sf.AALineDistanceMode = AALINEDISTANCE_TRUE; + sf.LineEndCapAntialiasingRegionWidth = + state->line_smooth ? _10pixels : _05pixels; + sf.LastPixelEnable = state->line_last_pixel; + sf.LineWidth = line_width; + sf.PointWidthSource = state->point_size_per_vertex ? 
Vertex : State; + sf.PointWidth = state->point_size; + + if (state->flatshade_first) { + sf.TriangleFanProvokingVertexSelect = 1; + } else { + sf.TriangleStripListProvokingVertexSelect = 2; + sf.TriangleFanProvokingVertexSelect = 2; + sf.LineStripListProvokingVertexSelect = 1; + } + + sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way... + sf.CullMode = translate_cull_mode(state->cull_face); + + sf.ScissorRectangleEnable = true; + +#if GFX_VER == 6 + sf.AttributeSwizzleEnable = true; + if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) + sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT; + else + sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT; +#endif + +#if GFX_VER >= 6 + sf.GlobalDepthOffsetEnableSolid = state->offset_tri; + sf.GlobalDepthOffsetEnableWireframe = state->offset_line; + sf.GlobalDepthOffsetEnablePoint = state->offset_point; + sf.GlobalDepthOffsetConstant = state->offset_units * 2; + sf.GlobalDepthOffsetScale = state->offset_scale; + sf.GlobalDepthOffsetClamp = state->offset_clamp; + + sf.FrontFaceFillMode = translate_fill_mode(state->fill_front); + sf.BackFaceFillMode = translate_fill_mode(state->fill_back); +#endif + +#if GFX_VERx10 == 75 + sf.LineStippleEnable = state->line_stipple_enable; +#endif + } +#endif + +#if GFX_VER >= 6 + crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) { + /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from + * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB. + */ +#if GFX_VER >= 7 + cl.EarlyCullEnable = true; +#endif + +#if GFX_VER == 7 + cl.FrontWinding = state->front_ccw ? 1 : 0; + cl.CullMode = translate_cull_mode(state->cull_face); +#endif + cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable; + cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL; + cl.GuardbandClipTestEnable = true; + cl.ClipEnable = true; + cl.MinimumPointWidth = 0.125; + cl.MaximumPointWidth = 255.875; + cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far); + + if (state->flatshade_first) { + cl.TriangleFanProvokingVertexSelect = 1; + } else { + cl.TriangleStripListProvokingVertexSelect = 2; + cl.TriangleFanProvokingVertexSelect = 2; + cl.LineStripListProvokingVertexSelect = 1; + } + } +#endif + + /* Remap from 0..255 back to 1..256 */ + const unsigned line_stipple_factor = state->line_stipple_factor + 1; + + crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) { + if (state->line_stipple_enable) { + line.LineStipplePattern = state->line_stipple_pattern; + line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor; + line.LineStippleRepeatCount = line_stipple_factor; + } + } + + return cso; +} + +/** + * The pipe->bind_rasterizer_state() driver hook. + * + * Bind a rasterizer CSO and flag related dirty bits. 
+ */ +static void +crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_rasterizer_state *old_cso = ice->state.cso_rast; + struct crocus_rasterizer_state *new_cso = state; + + if (new_cso) { + /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */ + if (cso_changed_memcmp(line_stipple)) + ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE; +#if GFX_VER >= 6 + if (cso_changed(cso.half_pixel_center)) + ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE; + if (cso_changed(cso.scissor)) + ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT; +#else + if (cso_changed(cso.scissor)) + ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT; +#endif + + if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable)) + ice->state.dirty |= CROCUS_DIRTY_WM; + +#if GFX_VER >= 6 + if (cso_changed(cso.rasterizer_discard)) + ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP; + + if (cso_changed(cso.flatshade_first)) + ice->state.dirty |= CROCUS_DIRTY_STREAMOUT; +#endif + + if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) || + cso_changed(cso.clip_halfz)) + ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT; + +#if GFX_VER >= 7 + if (cso_changed(cso.sprite_coord_enable) || + cso_changed(cso.sprite_coord_mode) || + cso_changed(cso.light_twoside)) + ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE; +#endif +#if GFX_VER <= 5 + if (cso_changed(cso.clip_plane_enable)) + ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE; +#endif + } + + ice->state.cso_rast = new_cso; + ice->state.dirty |= CROCUS_DIRTY_RASTER; + ice->state.dirty |= CROCUS_DIRTY_CLIP; +#if GFX_VER <= 5 + ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG; + ice->state.dirty |= CROCUS_DIRTY_WM; +#endif +#if GFX_VER <= 6 + ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG; +#endif + ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER]; +} + +/** + * Return true if the given wrap mode requires the border color to exist. + * + * (We can skip uploading it if the sampler isn't going to use it.) + */ +static bool +wrap_mode_needs_border_color(unsigned wrap_mode) +{ + return wrap_mode == TCM_CLAMP_BORDER; +} + +/** + * Gallium CSO for sampler state. + */ +struct crocus_sampler_state { + struct pipe_sampler_state pstate; + union pipe_color_union border_color; + bool needs_border_color; + unsigned wrap_s; + unsigned wrap_t; + unsigned wrap_r; + unsigned mag_img_filter; + float min_lod; +}; + +/** + * The pipe->create_sampler_state() driver hook. + * + * We fill out SAMPLER_STATE (except for the border color pointer), and + * store that on the CPU. It doesn't make sense to upload it to a GPU + * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires + * all bound sampler states to be in contiguous memor. 
+ */ +static void * +crocus_create_sampler_state(struct pipe_context *ctx, + const struct pipe_sampler_state *state) +{ + struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state); + + if (!cso) + return NULL; + + STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST); + STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR); + + bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST || + state->mag_img_filter == PIPE_TEX_FILTER_NEAREST; + cso->wrap_s = translate_wrap(state->wrap_s, either_nearest); + cso->wrap_t = translate_wrap(state->wrap_t, either_nearest); + cso->wrap_r = translate_wrap(state->wrap_r, either_nearest); + + cso->pstate = *state; + + memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color)); + + cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) || + wrap_mode_needs_border_color(cso->wrap_t) || + wrap_mode_needs_border_color(cso->wrap_r); + + cso->min_lod = state->min_lod; + cso->mag_img_filter = state->mag_img_filter; + + // XXX: explain this code ported from ilo...I don't get it at all... + if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE && + state->min_lod > 0.0f) { + cso->min_lod = 0.0f; + cso->mag_img_filter = state->min_img_filter; + } + + return cso; +} + +/** + * The pipe->bind_sampler_states() driver hook. + */ +static void +crocus_bind_sampler_states(struct pipe_context *ctx, + enum pipe_shader_type p_stage, + unsigned start, unsigned count, + void **states) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + gl_shader_stage stage = stage_from_pipe(p_stage); + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + + assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS); + + bool dirty = false; + + for (int i = 0; i < count; i++) { + if (shs->samplers[start + i] != states[i]) { + shs->samplers[start + i] = states[i]; + dirty = true; + } + } + + if (dirty) { +#if GFX_VER <= 5 + if (p_stage == PIPE_SHADER_FRAGMENT) + ice->state.dirty |= CROCUS_DIRTY_WM; + else if (p_stage == PIPE_SHADER_VERTEX) + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS; +#endif + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage; + ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES]; + } +} + +enum samp_workaround { + SAMP_NORMAL, + SAMP_CUBE_CLAMP, + SAMP_CUBE_CUBE, + SAMP_T_WRAP, +}; + +static void +crocus_upload_sampler_state(struct crocus_batch *batch, + struct crocus_sampler_state *cso, + uint32_t border_color_offset, + enum samp_workaround samp_workaround, + uint32_t first_level, + void *map) +{ + struct pipe_sampler_state *state = &cso->pstate; + uint32_t wrap_s, wrap_t, wrap_r; + + wrap_s = cso->wrap_s; + wrap_t = cso->wrap_t; + wrap_r = cso->wrap_r; + + switch (samp_workaround) { + case SAMP_CUBE_CLAMP: + wrap_s = TCM_CLAMP; + wrap_t = TCM_CLAMP; + wrap_r = TCM_CLAMP; + break; + case SAMP_CUBE_CUBE: + wrap_s = TCM_CUBE; + wrap_t = TCM_CUBE; + wrap_r = TCM_CUBE; + break; + case SAMP_T_WRAP: + wrap_t = TCM_WRAP; + break; + default: + break; + } + + _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) { + samp.TCXAddressControlMode = wrap_s; + samp.TCYAddressControlMode = wrap_t; + samp.TCZAddressControlMode = wrap_r; + +#if GFX_VER >= 6 + samp.NonnormalizedCoordinateEnable = !state->normalized_coords; +#endif + samp.MinModeFilter = state->min_img_filter; + samp.MagModeFilter = cso->mag_img_filter; + samp.MipModeFilter = translate_mip_filter(state->min_mip_filter); + samp.MaximumAnisotropy = RATIO21; + + if (state->max_anisotropy >= 
2) { + if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) { + samp.MinModeFilter = MAPFILTER_ANISOTROPIC; +#if GFX_VER >= 7 + samp.AnisotropicAlgorithm = EWAApproximation; +#endif + } + + if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR) + samp.MagModeFilter = MAPFILTER_ANISOTROPIC; + + samp.MaximumAnisotropy = + MIN2((state->max_anisotropy - 2) / 2, RATIO161); + } + + /* Set address rounding bits if not using nearest filtering. */ + if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) { + samp.UAddressMinFilterRoundingEnable = true; + samp.VAddressMinFilterRoundingEnable = true; + samp.RAddressMinFilterRoundingEnable = true; + } + + if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) { + samp.UAddressMagFilterRoundingEnable = true; + samp.VAddressMagFilterRoundingEnable = true; + samp.RAddressMagFilterRoundingEnable = true; + } + + if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) + samp.ShadowFunction = translate_shadow_func(state->compare_func); + + const float hw_max_lod = GFX_VER >= 7 ? 14 : 13; + + samp.LODPreClampEnable = true; + samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod); + samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod); + samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15); + +#if GFX_VER == 6 + samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod); + samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter; +#endif + +#if GFX_VER < 6 + samp.BorderColorPointer = + ro_bo(batch->state.bo, border_color_offset); +#else + samp.BorderColorPointer = border_color_offset; +#endif + } +} + +static void +crocus_upload_border_color(struct crocus_batch *batch, + struct crocus_sampler_state *cso, + struct crocus_sampler_view *tex, + uint32_t *bc_offset) +{ + /* We may need to swizzle the border color for format faking. + * A/LA formats are faked as R/RG with 000R or R00G swizzles. + * This means we need to move the border color's A channel into + * the R or G channels so that those read swizzles will move it + * back into A. + */ + enum pipe_format internal_format = PIPE_FORMAT_NONE; + union pipe_color_union *color = &cso->border_color; + union pipe_color_union tmp; + if (tex) { + internal_format = tex->res->internal_format; + + if (util_format_is_alpha(internal_format)) { + unsigned char swz[4] = { + PIPE_SWIZZLE_0, PIPE_SWIZZLE_0, + PIPE_SWIZZLE_0, PIPE_SWIZZLE_W, + }; + util_format_apply_color_swizzle(&tmp, color, swz, true); + color = &tmp; + } else if (util_format_is_luminance_alpha(internal_format) && + internal_format != PIPE_FORMAT_L8A8_SRGB) { + unsigned char swz[4] = { + PIPE_SWIZZLE_X, PIPE_SWIZZLE_X, + PIPE_SWIZZLE_X, PIPE_SWIZZLE_W + }; + util_format_apply_color_swizzle(&tmp, color, swz, true); + color = &tmp; + } + } + bool is_integer_format = util_format_is_pure_integer(internal_format); + unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4; + const int sbc_align = (GFX_VERx10 == 75 && is_integer_format) ? 
512 : 32; + uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset); + + struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 }; + +#define ASSIGN(dst, src) \ + do { \ + dst = src; \ + } while (0) + +#define ASSIGNu16(dst, src) \ + do { \ + dst = (uint16_t)src; \ + } while (0) + +#define ASSIGNu8(dst, src) \ + do { \ + dst = (uint8_t)src; \ + } while (0) + +#define BORDER_COLOR_ATTR(macro, _color_type, src) \ + macro(state.BorderColor ## _color_type ## Red, src[0]); \ + macro(state.BorderColor ## _color_type ## Green, src[1]); \ + macro(state.BorderColor ## _color_type ## Blue, src[2]); \ + macro(state.BorderColor ## _color_type ## Alpha, src[3]); + +#if GFX_VERx10 == 75 + if (is_integer_format) { + const struct util_format_description *format_desc = + util_format_description(internal_format); + + /* From the Haswell PRM, "Command Reference: Structures", Page 36: + * "If any color channel is missing from the surface format, + * corresponding border color should be programmed as zero and if + * alpha channel is missing, corresponding Alpha border color should + * be programmed as 1." + */ + unsigned c[4] = { 0, 0, 0, 1 }; + for (int i = 0; i < 4; i++) { + if (format_desc->channel[i].size) + c[i] = color->ui[i]; + } + + switch (format_desc->channel[0].size) { + case 8: + /* Copy RGBA in order. */ + BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c); + break; + case 10: + /* R10G10B10A2_UINT is treated like a 16-bit format. */ + case 16: + BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c); + break; + case 32: + if (format_desc->channel[1].size && !format_desc->channel[2].size) { + /* Careful inspection of the tables reveals that for RG32 formats, + * the green channel needs to go where blue normally belongs. + */ + state.BorderColor32bitRed = c[0]; + state.BorderColor32bitBlue = c[1]; + state.BorderColor32bitAlpha = 1; + } else { + /* Copy RGBA in order. */ + BORDER_COLOR_ATTR(ASSIGN, 32bit, c); + } + break; + default: + assert(!"Invalid number of bits per channel in integer format."); + break; + } + } else { + BORDER_COLOR_ATTR(ASSIGN, Float, color->f); + } +#elif GFX_VER == 5 || GFX_VER == 6 + BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f); + BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f); + BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f); + +#define MESA_FLOAT_TO_HALF(dst, src) \ + dst = _mesa_float_to_half(src); + + BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f); + +#undef MESA_FLOAT_TO_HALF + + state.BorderColorSnorm8Red = state.BorderColorSnorm16Red >> 8; + state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8; + state.BorderColorSnorm8Blue = state.BorderColorSnorm16Blue >> 8; + state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8; + + BORDER_COLOR_ATTR(ASSIGN, Float, color->f); + +#elif GFX_VER == 4 + BORDER_COLOR_ATTR(ASSIGN, , color->f); +#else + BORDER_COLOR_ATTR(ASSIGN, Float, color->f); +#endif + +#undef ASSIGN +#undef BORDER_COLOR_ATTR + + GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state); +} + +/** + * Upload the sampler states into a contiguous area of GPU memory, for + * for 3DSTATE_SAMPLER_STATE_POINTERS_*. + * + * Also fill out the border color state pointers. 
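+ *
+ * Border color state is only streamed out for samplers whose wrap modes
+ * actually need it (see needs_border_color, computed at CSO creation time).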
+ */ +static void +crocus_upload_sampler_states(struct crocus_context *ice, + struct crocus_batch *batch, gl_shader_stage stage) +{ + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + const struct shader_info *info = crocus_get_shader_info(ice, stage); + + /* We assume the state tracker will call pipe->bind_sampler_states() + * if the program's number of textures changes. + */ + unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0; + + if (!count) + return; + + /* Assemble the SAMPLER_STATEs into a contiguous table that lives + * in the dynamic state memory zone, so we can point to it via the + * 3DSTATE_SAMPLER_STATE_POINTERS_* commands. + */ + unsigned size = count * 4 * GENX(SAMPLER_STATE_length); + uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset); + + if (unlikely(!map)) + return; + + for (int i = 0; i < count; i++) { + struct crocus_sampler_state *state = shs->samplers[i]; + struct crocus_sampler_view *tex = shs->textures[i]; + + if (!state || !tex) { + memset(map, 0, 4 * GENX(SAMPLER_STATE_length)); + } else { + unsigned border_color_offset = 0; + if (state->needs_border_color) { + crocus_upload_border_color(batch, state, tex, &border_color_offset); + } + + enum samp_workaround wa = SAMP_NORMAL; + /* There's a bug in 1D texture sampling - it actually pays + * attention to the wrap_t value, though it should not. + * Override the wrap_t value here to GL_REPEAT to keep + * any nonexistent border pixels from floating in. + */ + if (tex->base.target == PIPE_TEXTURE_1D) + wa = SAMP_T_WRAP; + else if (tex->base.target == PIPE_TEXTURE_CUBE || + tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) { + /* Cube maps must use the same wrap mode for all three coordinate + * dimensions. Prior to Haswell, only CUBE and CLAMP are valid. + * + * Ivybridge and Baytrail seem to have problems with CUBE mode and + * integer formats. Fall back to CLAMP for now. + */ + if (state->pstate.seamless_cube_map && + !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format))) + wa = SAMP_CUBE_CUBE; + else + wa = SAMP_CUBE_CLAMP; + } + + uint32_t first_level = 0; + if (tex->base.target != PIPE_BUFFER) + first_level = tex->base.u.tex.first_level; + + crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map); + } + + map += GENX(SAMPLER_STATE_length); + } +} + +/** + * The pipe->create_sampler_view() driver hook. + */ +static struct pipe_sampler_view * +crocus_create_sampler_view(struct pipe_context *ctx, + struct pipe_resource *tex, + const struct pipe_sampler_view *tmpl) +{ + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view)); + + if (!isv) + return NULL; + + /* initialize base object */ + isv->base = *tmpl; + isv->base.context = ctx; + isv->base.texture = NULL; + pipe_reference_init(&isv->base.reference, 1); + pipe_resource_reference(&isv->base.texture, tex); + + if (util_format_is_depth_or_stencil(tmpl->format)) { + struct crocus_resource *zres, *sres; + const struct util_format_description *desc = + util_format_description(tmpl->format); + + crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres); + + tex = util_format_has_depth(desc) ? 
&zres->base : &sres->base; + + if (tex->format == PIPE_FORMAT_S8_UINT) + if (devinfo->ver == 7 && sres->shadow) + tex = &sres->shadow->base; + } + + isv->res = (struct crocus_resource *) tex; + + isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT; + + if (isv->base.target == PIPE_TEXTURE_CUBE || + isv->base.target == PIPE_TEXTURE_CUBE_ARRAY) + usage |= ISL_SURF_USAGE_CUBE_BIT; + + const struct crocus_format_info fmt = + crocus_format_for_usage(devinfo, tmpl->format, usage); + + enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a }; + crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz); + + /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */ + if (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT || + tmpl->format == PIPE_FORMAT_X24S8_UINT) { + isv->swizzle[0] = tmpl->swizzle_g; + isv->swizzle[1] = tmpl->swizzle_g; + isv->swizzle[2] = tmpl->swizzle_g; + isv->swizzle[3] = tmpl->swizzle_g; + } + + isv->clear_color = isv->res->aux.clear_color; + + isv->view = (struct isl_view) { + .format = fmt.fmt, +#if GFX_VERx10 >= 75 + .swizzle = (struct isl_swizzle) { + .r = pipe_to_isl_swizzle(isv->swizzle[0], false), + .g = pipe_to_isl_swizzle(isv->swizzle[1], false), + .b = pipe_to_isl_swizzle(isv->swizzle[2], false), + .a = pipe_to_isl_swizzle(isv->swizzle[3], false), + }, +#else + /* swizzling handled in shader code */ + .swizzle = ISL_SWIZZLE_IDENTITY, +#endif + .usage = usage, + }; + + /* Fill out SURFACE_STATE for this view. */ + if (tmpl->target != PIPE_BUFFER) { + isv->view.base_level = tmpl->u.tex.first_level; + isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1; + // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2? + isv->view.base_array_layer = tmpl->u.tex.first_layer; + isv->view.array_len = + tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1; + } +#if GFX_VER >= 6 + /* just create a second view struct for texture gather just in case */ + isv->gather_view = isv->view; + +#if GFX_VER >= 7 + if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT || + fmt.fmt == ISL_FORMAT_R32G32_SINT || + fmt.fmt == ISL_FORMAT_R32G32_UINT) { + isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD; +#if GFX_VERx10 >= 75 + isv->gather_view.swizzle = (struct isl_swizzle) { + .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75), + .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75), + .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75), + .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75), + }; +#endif + } +#endif +#if GFX_VER == 6 + /* Sandybridge's gather4 message is broken for integer formats. + * To work around this, we pretend the surface is UNORM for + * 8 or 16-bit formats, and emit shader instructions to recover + * the real INT/UINT value. For 32-bit formats, we pretend + * the surface is FLOAT, and simply reinterpret the resulting + * bits. + */ + switch (fmt.fmt) { + case ISL_FORMAT_R8_SINT: + case ISL_FORMAT_R8_UINT: + isv->gather_view.format = ISL_FORMAT_R8_UNORM; + break; + + case ISL_FORMAT_R16_SINT: + case ISL_FORMAT_R16_UINT: + isv->gather_view.format = ISL_FORMAT_R16_UNORM; + break; + + case ISL_FORMAT_R32_SINT: + case ISL_FORMAT_R32_UINT: + isv->gather_view.format = ISL_FORMAT_R32_FLOAT; + break; + + default: + break; + } +#endif +#endif + /* Fill out SURFACE_STATE for this view. 
*/ + if (tmpl->target != PIPE_BUFFER) { + if (crocus_resource_unfinished_aux_import(isv->res)) + crocus_resource_finish_aux_import(&screen->base, isv->res); + + } + + return &isv->base; +} + +static void +crocus_sampler_view_destroy(struct pipe_context *ctx, + struct pipe_sampler_view *state) +{ + struct crocus_sampler_view *isv = (void *) state; + pipe_resource_reference(&state->texture, NULL); + free(isv); +} + +/** + * The pipe->create_surface() driver hook. + * + * In Gallium nomenclature, "surfaces" are a view of a resource that + * can be bound as a render target or depth/stencil buffer. + */ +static struct pipe_surface * +crocus_create_surface(struct pipe_context *ctx, + struct pipe_resource *tex, + const struct pipe_surface *tmpl) +{ + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + isl_surf_usage_flags_t usage = 0; + if (tmpl->writable) + usage = ISL_SURF_USAGE_STORAGE_BIT; + else if (util_format_is_depth_or_stencil(tmpl->format)) + usage = ISL_SURF_USAGE_DEPTH_BIT; + else + usage = ISL_SURF_USAGE_RENDER_TARGET_BIT; + + const struct crocus_format_info fmt = + crocus_format_for_usage(devinfo, tmpl->format, usage); + + if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) && + !isl_format_supports_rendering(devinfo, fmt.fmt)) { + /* Framebuffer validation will reject this invalid case, but it + * hasn't had the opportunity yet. In the meantime, we need to + * avoid hitting ISL asserts about unsupported formats below. + */ + return NULL; + } + + struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface)); + struct pipe_surface *psurf = &surf->base; + struct crocus_resource *res = (struct crocus_resource *) tex; + + if (!surf) + return NULL; + + pipe_reference_init(&psurf->reference, 1); + pipe_resource_reference(&psurf->texture, tex); + psurf->context = ctx; + psurf->format = tmpl->format; + psurf->width = tex->width0; + psurf->height = tex->height0; + psurf->texture = tex; + psurf->u.tex.first_layer = tmpl->u.tex.first_layer; + psurf->u.tex.last_layer = tmpl->u.tex.last_layer; + psurf->u.tex.level = tmpl->u.tex.level; + + uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1; + + struct isl_view *view = &surf->view; + *view = (struct isl_view) { + .format = fmt.fmt, + .base_level = tmpl->u.tex.level, + .levels = 1, + .base_array_layer = tmpl->u.tex.first_layer, + .array_len = array_len, + .swizzle = ISL_SWIZZLE_IDENTITY, + .usage = usage, + }; + +#if GFX_VER >= 6 + struct isl_view *read_view = &surf->read_view; + *read_view = (struct isl_view) { + .format = fmt.fmt, + .base_level = tmpl->u.tex.level, + .levels = 1, + .base_array_layer = tmpl->u.tex.first_layer, + .array_len = array_len, + .swizzle = ISL_SWIZZLE_IDENTITY, + .usage = ISL_SURF_USAGE_TEXTURE_BIT, + }; +#endif + + surf->clear_color = res->aux.clear_color; + + /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */ + if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT | + ISL_SURF_USAGE_STENCIL_BIT)) + return psurf; + + if (!isl_format_is_compressed(res->surf.format)) { + if (crocus_resource_unfinished_aux_import(res)) + crocus_resource_finish_aux_import(&screen->base, res); + + memcpy(&surf->surf, &res->surf, sizeof(surf->surf)); + uint32_t temp_offset, temp_x, temp_y; + + isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level, + res->base.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer, + res->base.target == PIPE_TEXTURE_3D ? 
tmpl->u.tex.first_layer : 0, + &temp_offset, &temp_x, &temp_y); + if (!devinfo->has_surface_tile_offset && + (temp_x || temp_y)) { + /* Original gfx4 hardware couldn't draw to a non-tile-aligned + * destination. + */ + /* move to temp */ + struct pipe_resource wa_templ = (struct pipe_resource) { + .width0 = u_minify(res->base.width0, tmpl->u.tex.level), + .height0 = u_minify(res->base.height0, tmpl->u.tex.level), + .depth0 = 1, + .array_size = 1, + .format = res->base.format, + .target = PIPE_TEXTURE_2D, + .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW, + }; + surf->align_res = screen->base.resource_create(&screen->base, &wa_templ); + view->base_level = 0; + view->base_array_layer = 0; + view->array_len = 1; + struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res; + memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf)); + } + return psurf; + } + + /* The resource has a compressed format, which is not renderable, but we + * have a renderable view format. We must be attempting to upload blocks + * of compressed data via an uncompressed view. + * + * In this case, we can assume there are no auxiliary buffers, a single + * miplevel, and that the resource is single-sampled. Gallium may try + * and create an uncompressed view with multiple layers, however. + */ + assert(!isl_format_is_compressed(fmt.fmt)); + assert(res->surf.samples == 1); + assert(view->levels == 1); + + /* TODO: compressed pbo uploads aren't working here */ + return NULL; + + uint32_t offset_B = 0, tile_x_sa = 0, tile_y_sa = 0; + + if (view->base_level > 0) { + /* We can't rely on the hardware's miplevel selection with such + * a substantial lie about the format, so we select a single image + * using the Tile X/Y Offset fields. In this case, we can't handle + * multiple array slices. + * + * On Broadwell, HALIGN and VALIGN are specified in pixels and are + * hard-coded to align to exactly the block size of the compressed + * texture. This means that, when reinterpreted as a non-compressed + * texture, the tile offsets may be anything and we can't rely on + * X/Y Offset. + * + * Return NULL to force the state tracker to take fallback paths. + */ + // TODO: check if the gen7 check is right, originally gen8 + if (view->array_len > 1 || GFX_VER == 7) + return NULL; + + const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D; + isl_surf_get_image_surf(&screen->isl_dev, &res->surf, + view->base_level, + is_3d ? 0 : view->base_array_layer, + is_3d ? view->base_array_layer : 0, + &surf->surf, + &offset_B, &tile_x_sa, &tile_y_sa); + + /* We use address and tile offsets to access a single level/layer + * as a subimage, so reset level/layer so it doesn't offset again. + */ + view->base_array_layer = 0; + view->base_level = 0; + } else { + /* Level 0 doesn't require tile offsets, and the hardware can find + * array slices using QPitch even with the format override, so we + * can allow layers in this case. Copy the original ISL surface. + */ + memcpy(&surf->surf, &res->surf, sizeof(surf->surf)); + } + + /* Scale down the image dimensions by the block size. 
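+ * One texel of the uncompressed view covers one compressed block, so the
+ * level-0 dimensions and the tile offsets are divided by the block
+ * width/height below.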
*/ + const struct isl_format_layout *fmtl = + isl_format_get_layout(res->surf.format); + surf->surf.format = fmt.fmt; + surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf); + surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf); + tile_x_sa /= fmtl->bw; + tile_y_sa /= fmtl->bh; + + psurf->width = surf->surf.logical_level0_px.width; + psurf->height = surf->surf.logical_level0_px.height; + + return psurf; +} + +#if GFX_VER == 7 +static void +fill_default_image_param(struct brw_image_param *param) +{ + memset(param, 0, sizeof(*param)); + /* Set the swizzling shifts to all-ones to effectively disable swizzling -- + * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more + * detailed explanation of these parameters. + */ + param->swizzling[0] = 0xff; + param->swizzling[1] = 0xff; +} + +static void +fill_buffer_image_param(struct brw_image_param *param, + enum pipe_format pfmt, + unsigned size) +{ + const unsigned cpp = util_format_get_blocksize(pfmt); + + fill_default_image_param(param); + param->size[0] = size / cpp; + param->stride[0] = cpp; +} + +#endif + +/** + * The pipe->set_shader_images() driver hook. + */ +static void +crocus_set_shader_images(struct pipe_context *ctx, + enum pipe_shader_type p_stage, + unsigned start_slot, unsigned count, + unsigned unbind_num_trailing_slots, + const struct pipe_image_view *p_images) +{ +#if GFX_VER == 7 + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + gl_shader_stage stage = stage_from_pipe(p_stage); + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + struct crocus_genx_state *genx = ice->state.genx; + struct brw_image_param *image_params = genx->shaders[stage].image_param; + + shs->bound_image_views &= ~u_bit_consecutive(start_slot, count); + + for (unsigned i = 0; i < count; i++) { + struct crocus_image_view *iv = &shs->image[start_slot + i]; + + if (p_images && p_images[i].resource) { + const struct pipe_image_view *img = &p_images[i]; + struct crocus_resource *res = (void *) img->resource; + + util_copy_image_view(&iv->base, img); + + shs->bound_image_views |= 1 << (start_slot + i); + + res->bind_history |= PIPE_BIND_SHADER_IMAGE; + res->bind_stages |= 1 << stage; + + isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT; + struct crocus_format_info fmt = + crocus_format_for_usage(devinfo, img->format, usage); + + struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles); + if (img->shader_access & PIPE_IMAGE_ACCESS_READ) { + /* On Gen8, try to use typed surfaces reads (which support a + * limited number of formats), and if not possible, fall back + * to untyped reads. 
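+ * (On this gen that means: if no matching typed storage format exists,
+ * the image is demoted to ISL_FORMAT_RAW and read untyped.)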
+ */ + if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt)) + fmt.fmt = ISL_FORMAT_RAW; + else + fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt); + } + + if (res->base.target != PIPE_BUFFER) { + struct isl_view view = { + .format = fmt.fmt, + .base_level = img->u.tex.level, + .levels = 1, + .base_array_layer = img->u.tex.first_layer, + .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1, + .swizzle = swiz, + .usage = usage, + }; + + iv->view = view; + + isl_surf_fill_image_param(&screen->isl_dev, + &image_params[start_slot + i], + &res->surf, &view); + } else { + struct isl_view view = { + .format = fmt.fmt, + .swizzle = swiz, + .usage = usage, + }; + iv->view = view; + + util_range_add(&res->base, &res->valid_buffer_range, img->u.buf.offset, + img->u.buf.offset + img->u.buf.size); + fill_buffer_image_param(&image_params[start_slot + i], + img->format, img->u.buf.size); + } + } else { + pipe_resource_reference(&iv->base.resource, NULL); + fill_default_image_param(&image_params[start_slot + i]); + } + } + + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage; + ice->state.dirty |= + stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES + : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; + + /* Broadwell also needs brw_image_params re-uploaded */ + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage; + shs->sysvals_need_upload = true; +#endif +} + + +/** + * The pipe->set_sampler_views() driver hook. + */ +static void +crocus_set_sampler_views(struct pipe_context *ctx, + enum pipe_shader_type p_stage, + unsigned start, unsigned count, + unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + gl_shader_stage stage = stage_from_pipe(p_stage); + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + + shs->bound_sampler_views &= ~u_bit_consecutive(start, count); + + for (unsigned i = 0; i < count; i++) { + struct pipe_sampler_view *pview = views ? views[i] : NULL; + pipe_sampler_view_reference((struct pipe_sampler_view **) + &shs->textures[start + i], pview); + struct crocus_sampler_view *view = (void *) pview; + if (view) { + view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW; + view->res->bind_stages |= 1 << stage; + + shs->bound_sampler_views |= 1 << (start + i); + } + } + + ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage); + ice->state.dirty |= + stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES + : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; + ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES]; +} + +/** + * The pipe->set_tess_state() driver hook. 
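+ *
+ * Stores the default outer/inner tessellation levels; they reach the
+ * shaders as system values (see upload_sysvals).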
+ */
+static void
+crocus_set_tess_state(struct pipe_context *ctx,
+ const float default_outer_level[4],
+ const float default_inner_level[2])
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
+
+ memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
+ memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
+
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+ shs->sysvals_need_upload = true;
+}
+
+static void
+crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
+{
+ struct crocus_surface *surf = (void *) p_surf;
+ pipe_resource_reference(&p_surf->texture, NULL);
+
+ pipe_resource_reference(&surf->align_res, NULL);
+ free(surf);
+}
+
+static void
+crocus_set_clip_state(struct pipe_context *ctx,
+ const struct pipe_clip_state *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
+ struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
+ struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
+
+ memcpy(&ice->state.clip_planes, state, sizeof(*state));
+
+#if GFX_VER <= 5
+ ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
+#endif
+ ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
+ CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+ shs->sysvals_need_upload = true;
+ gshs->sysvals_need_upload = true;
+ tshs->sysvals_need_upload = true;
+}
+
+/**
+ * The pipe->set_polygon_stipple() driver hook.
+ */
+static void
+crocus_set_polygon_stipple(struct pipe_context *ctx,
+ const struct pipe_poly_stipple *state)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ memcpy(&ice->state.poly_stipple, state, sizeof(*state));
+ ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
+}
+
+/**
+ * The pipe->set_sample_mask() driver hook.
+ */
+static void
+crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+
+ /* We only support 8x MSAA, so we have 8 bits of sample mask.
+ * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
+ */
+ ice->state.sample_mask = sample_mask & 0xff;
+ ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
+}
+
+static void
+crocus_fill_scissor_rect(struct crocus_context *ice,
+ int idx,
+ struct pipe_scissor_state *ss)
+{
+ struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+ struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
+ const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
+ struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
+ .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
+ .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
+ .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
+ .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
+ };
+ if (cso_state->scissor) {
+ struct pipe_scissor_state *s = &ice->state.scissors[idx];
+ scissor.minx = MAX2(scissor.minx, s->minx);
+ scissor.miny = MAX2(scissor.miny, s->miny);
+ scissor.maxx = MIN2(scissor.maxx, s->maxx);
+ scissor.maxy = MIN2(scissor.maxy, s->maxy);
+ }
+ *ss = scissor;
+}
+
+/**
+ * The pipe->set_scissor_states() driver hook.
+ *
+ * This corresponds to our SCISSOR_RECT state structures.
It's an + * exact match, so we just store them, and memcpy them out later. + */ +static void +crocus_set_scissor_states(struct pipe_context *ctx, + unsigned start_slot, + unsigned num_scissors, + const struct pipe_scissor_state *rects) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + + for (unsigned i = 0; i < num_scissors; i++) { + if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) { + /* If the scissor was out of bounds and got clamped to 0 width/height + * at the bounds, the subtraction of 1 from maximums could produce a + * negative number and thus not clip anything. Instead, just provide + * a min > max scissor inside the bounds, which produces the expected + * no rendering. + */ + ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) { + .minx = 1, .maxx = 0, .miny = 1, .maxy = 0, + }; + } else { + ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) { + .minx = rects[i].minx, .miny = rects[i].miny, + .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1, + }; + } + } + +#if GFX_VER < 6 + ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */ +#else + ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT; +#endif + ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT; + +} + +/** + * The pipe->set_stencil_ref() driver hook. + * + * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time. + */ +static void +crocus_set_stencil_ref(struct pipe_context *ctx, + const struct pipe_stencil_ref ref) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + ice->state.stencil_ref = ref; + ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE; +} + +/** + * The pipe->set_viewport_states() driver hook. + * + * This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate + * the guardband yet, as we need the framebuffer dimensions, but we can + * at least fill out the rest. + */ +static void +crocus_set_viewport_states(struct pipe_context *ctx, + unsigned start_slot, + unsigned count, + const struct pipe_viewport_state *states) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + + memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count); + + ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT; + ice->state.dirty |= CROCUS_DIRTY_RASTER; +#if GFX_VER >= 6 + ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT; +#endif + + if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near || + !ice->state.cso_rast->cso.depth_clip_far)) + ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT; +} + +/** + * The pipe->set_framebuffer_state() driver hook. + * + * Sets the current draw FBO, including color render targets, depth, + * and stencil buffers. 
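+ *
+ * Mostly this computes the right dirty bits so the affected packets are
+ * re-emitted; it also records the HiZ aux usage for the new depth buffer,
+ * if any.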
+ */ +static void +crocus_set_framebuffer_state(struct pipe_context *ctx, + const struct pipe_framebuffer_state *state) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + struct pipe_framebuffer_state *cso = &ice->state.framebuffer; + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; +#if 0 + struct isl_device *isl_dev = &screen->isl_dev; + struct crocus_resource *zres; + struct crocus_resource *stencil_res; +#endif + + unsigned samples = util_framebuffer_get_num_samples(state); + unsigned layers = util_framebuffer_get_num_layers(state); + +#if GFX_VER >= 6 + if (cso->samples != samples) { + ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE; + ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK; + ice->state.dirty |= CROCUS_DIRTY_RASTER; +#if GFX_VERx10 == 75 + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS; +#endif + } +#endif + +#if GFX_VER >= 6 + if (cso->nr_cbufs != state->nr_cbufs) { + ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE; + } +#endif + + if ((cso->layers == 0) != (layers == 0)) { + ice->state.dirty |= CROCUS_DIRTY_CLIP; + } + + if (cso->width != state->width || cso->height != state->height) { + ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT; + ice->state.dirty |= CROCUS_DIRTY_RASTER; + ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE; +#if GFX_VER >= 6 + ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT; +#endif + } + + if (cso->zsbuf || state->zsbuf) { + ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER; + + /* update SF's depth buffer format */ + if (GFX_VER == 7 && cso->zsbuf) + ice->state.dirty |= CROCUS_DIRTY_RASTER; + } + + /* wm thread dispatch enable */ + ice->state.dirty |= CROCUS_DIRTY_WM; + util_copy_framebuffer_state(cso, state); + cso->samples = samples; + cso->layers = layers; + + if (cso->zsbuf) { + struct crocus_resource *zres; + struct crocus_resource *stencil_res; + enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE; + crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres, + &stencil_res); + if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) { + aux_usage = zres->aux.usage; + } + ice->state.hiz_usage = aux_usage; + } + + /* Render target change */ + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS; + + ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES; + + ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER]; +} + +/** + * The pipe->set_constant_buffer() driver hook. + * + * This uploads any constant data in user buffers, and references + * any UBO resources containing constant data. 
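+ *
+ * User buffers are copied into GPU memory via the context's
+ * const_uploader; resource-backed UBOs are simply referenced.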
+ */ +static void +crocus_set_constant_buffer(struct pipe_context *ctx, + enum pipe_shader_type p_stage, unsigned index, + bool take_ownership, + const struct pipe_constant_buffer *input) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + gl_shader_stage stage = stage_from_pipe(p_stage); + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + struct pipe_constant_buffer *cbuf = &shs->constbufs[index]; + + util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership); + + if (input && input->buffer_size && (input->buffer || input->user_buffer)) { + shs->bound_cbufs |= 1u << index; + + if (input->user_buffer) { + void *map = NULL; + pipe_resource_reference(&cbuf->buffer, NULL); + u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64, + &cbuf->buffer_offset, &cbuf->buffer, (void **) &map); + + if (!cbuf->buffer) { + /* Allocation was unsuccessful - just unbind */ + crocus_set_constant_buffer(ctx, p_stage, index, false, NULL); + return; + } + + assert(map); + memcpy(map, input->user_buffer, input->buffer_size); + } + cbuf->buffer_size = + MIN2(input->buffer_size, + crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset); + + struct crocus_resource *res = (void *) cbuf->buffer; + res->bind_history |= PIPE_BIND_CONSTANT_BUFFER; + res->bind_stages |= 1 << stage; + } else { + shs->bound_cbufs &= ~(1u << index); + } + + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage; +} + +static void +upload_sysvals(struct crocus_context *ice, + gl_shader_stage stage) +{ + UNUSED struct crocus_genx_state *genx = ice->state.genx; + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + + struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; + if (!shader || shader->num_system_values == 0) + return; + + assert(shader->num_cbufs > 0); + + unsigned sysval_cbuf_index = shader->num_cbufs - 1; + struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index]; + unsigned upload_size = shader->num_system_values * sizeof(uint32_t); + uint32_t *map = NULL; + + assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS); + u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64, + &cbuf->buffer_offset, &cbuf->buffer, (void **) &map); + + for (int i = 0; i < shader->num_system_values; i++) { + uint32_t sysval = shader->system_values[i]; + uint32_t value = 0; + + if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) { +#if GFX_VER == 7 + unsigned img = BRW_PARAM_IMAGE_IDX(sysval); + unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval); + struct brw_image_param *param = + &genx->shaders[stage].image_param[img]; + + assert(offset < sizeof(struct brw_image_param)); + value = ((uint32_t *) param)[offset]; +#endif + } else if (sysval == BRW_PARAM_BUILTIN_ZERO) { + value = 0; + } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) { + int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval); + int comp = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval); + value = fui(ice->state.clip_planes.ucp[plane][comp]); + } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) { + if (stage == MESA_SHADER_TESS_CTRL) { + value = ice->state.vertices_per_patch; + } else { + assert(stage == MESA_SHADER_TESS_EVAL); + const struct shader_info *tcs_info = + crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL); + if (tcs_info) + value = tcs_info->tess.tcs_vertices_out; + else + value = ice->state.vertices_per_patch; + } + } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X && + sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) { + unsigned i = sysval - 
BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X; + value = fui(ice->state.default_outer_level[i]); + } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) { + value = fui(ice->state.default_inner_level[0]); + } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) { + value = fui(ice->state.default_inner_level[1]); + } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X && + sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) { + unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X; + value = ice->state.last_block[i]; + } else { + assert(!"unhandled system value"); + } + + *map++ = value; + } + + cbuf->buffer_size = upload_size; + shs->sysvals_need_upload = false; +} + +/** + * The pipe->set_shader_buffers() driver hook. + * + * This binds SSBOs and ABOs. Unfortunately, we need to stream out + * SURFACE_STATE here, as the buffer offset may change each time. + */ +static void +crocus_set_shader_buffers(struct pipe_context *ctx, + enum pipe_shader_type p_stage, + unsigned start_slot, unsigned count, + const struct pipe_shader_buffer *buffers, + unsigned writable_bitmask) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + gl_shader_stage stage = stage_from_pipe(p_stage); + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + + unsigned modified_bits = u_bit_consecutive(start_slot, count); + + shs->bound_ssbos &= ~modified_bits; + shs->writable_ssbos &= ~modified_bits; + shs->writable_ssbos |= writable_bitmask << start_slot; + + for (unsigned i = 0; i < count; i++) { + if (buffers && buffers[i].buffer) { + struct crocus_resource *res = (void *) buffers[i].buffer; + struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i]; + pipe_resource_reference(&ssbo->buffer, &res->base); + ssbo->buffer_offset = buffers[i].buffer_offset; + ssbo->buffer_size = + MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset); + + shs->bound_ssbos |= 1 << (start_slot + i); + + res->bind_history |= PIPE_BIND_SHADER_BUFFER; + res->bind_stages |= 1 << stage; + + util_range_add(&res->base, &res->valid_buffer_range, ssbo->buffer_offset, + ssbo->buffer_offset + ssbo->buffer_size); + } else { + pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL); + } + } + + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage; +} + +static void +crocus_delete_state(struct pipe_context *ctx, void *state) +{ + free(state); +} + +/** + * The pipe->set_vertex_buffers() driver hook. + * + * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet. 
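+ *
+ * We also record each buffer's end address (with a small padding on
+ * parts other than Haswell/Baytrail) for use when emitting the packet.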
+ */
+static void
+crocus_set_vertex_buffers(struct pipe_context *ctx,
+ unsigned start_slot, unsigned count,
+ unsigned unbind_num_trailing_slots,
+ bool take_ownership,
+ const struct pipe_vertex_buffer *buffers)
+{
+ struct crocus_context *ice = (struct crocus_context *) ctx;
+ struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
+ const unsigned padding =
+ (!(GFX_VERx10 == 75) && !screen->devinfo.is_baytrail) * 2;
+ ice->state.bound_vertex_buffers &=
+ ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
+
+ util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
+ buffers, start_slot, count, unbind_num_trailing_slots,
+ take_ownership);
+
+ for (unsigned i = 0; i < count; i++) {
+ struct pipe_vertex_buffer *state =
+ &ice->state.vertex_buffers[start_slot + i];
+
+ if (!state->is_user_buffer && state->buffer.resource) {
+ struct crocus_resource *res = (void *)state->buffer.resource;
+ res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
+ }
+
+ uint32_t end = 0;
+ if (state->buffer.resource)
+ end = state->buffer.resource->width0 + padding;
+ ice->state.vb_end[start_slot + i] = end;
+ }
+ ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
+}
+
+#if !(GFX_VERx10 == 75)
+static uint8_t get_wa_flags(enum isl_format format)
+{
+ uint8_t wa_flags = 0;
+
+ switch (format) {
+ case ISL_FORMAT_R10G10B10A2_USCALED:
+ wa_flags = BRW_ATTRIB_WA_SCALE;
+ break;
+ case ISL_FORMAT_R10G10B10A2_SSCALED:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
+ break;
+ case ISL_FORMAT_R10G10B10A2_UNORM:
+ wa_flags = BRW_ATTRIB_WA_NORMALIZE;
+ break;
+ case ISL_FORMAT_R10G10B10A2_SNORM:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
+ break;
+ case ISL_FORMAT_R10G10B10A2_SINT:
+ wa_flags = BRW_ATTRIB_WA_SIGN;
+ break;
+ case ISL_FORMAT_B10G10R10A2_USCALED:
+ wa_flags = BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_SSCALED:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_UNORM:
+ wa_flags = BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_SNORM:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_SINT:
+ wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
+ break;
+ case ISL_FORMAT_B10G10R10A2_UINT:
+ wa_flags = BRW_ATTRIB_WA_BGRA;
+ break;
+ default:
+ break;
+ }
+ return wa_flags;
+}
+#endif
+
+/**
+ * Gallium CSO for vertex elements.
+ */
+struct crocus_vertex_element_state {
+ uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
+ uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
+ uint32_t step_rate[16];
+ uint8_t wa_flags[33];
+ unsigned count;
+};
+
+/**
+ * The pipe->create_vertex_elements() driver hook.
+ *
+ * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
+ * packet. The vertex_elements array is ready to be emitted at draw time
+ * if no EdgeFlag is needed. Otherwise we need information that is only
+ * available at draw time, so we also set up edgeflag_ve as an alternative
+ * last VERTEX_ELEMENT_STATE that can be used at draw time if we detect
+ * that EdgeFlag is needed by the Vertex Shader.
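+ *
+ * Formats that need the legacy vertex attribute workarounds have their
+ * BRW_ATTRIB_WA_* flags recorded in wa_flags (see get_wa_flags) so the
+ * values can be fixed up later.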
+ */ +static void * +crocus_create_vertex_elements(struct pipe_context *ctx, + unsigned count, + const struct pipe_vertex_element *state) +{ + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_vertex_element_state *cso = + malloc(sizeof(struct crocus_vertex_element_state)); + + cso->count = count; + + crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) { + ve.DWordLength = + 1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2; + } + + uint32_t *ve_pack_dest = &cso->vertex_elements[1]; + + if (count == 0) { + crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) { + ve.Valid = true; + ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT; + ve.Component0Control = VFCOMP_STORE_0; + ve.Component1Control = VFCOMP_STORE_0; + ve.Component2Control = VFCOMP_STORE_0; + ve.Component3Control = VFCOMP_STORE_1_FP; + } + } + + for (int i = 0; i < count; i++) { + const struct crocus_format_info fmt = + crocus_format_for_usage(devinfo, state[i].src_format, 0); + unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC, + VFCOMP_STORE_SRC, VFCOMP_STORE_SRC }; + enum isl_format actual_fmt = fmt.fmt; + +#if !(GFX_VERx10 == 75) + cso->wa_flags[i] = get_wa_flags(fmt.fmt); + + if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED || + fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED || + fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM || + fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM || + fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT || + fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED || + fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED || + fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM || + fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM || + fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT || + fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT) + actual_fmt = ISL_FORMAT_R10G10B10A2_UINT; + if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT) + actual_fmt = ISL_FORMAT_R8G8B8A8_SINT; + if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT) + actual_fmt = ISL_FORMAT_R8G8B8A8_UINT; + if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT) + actual_fmt = ISL_FORMAT_R16G16B16A16_SINT; + if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT) + actual_fmt = ISL_FORMAT_R16G16B16A16_UINT; +#endif + + cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor; + + switch (isl_format_get_num_channels(fmt.fmt)) { + case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH; + case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH; + case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH; + case 3: + comp[3] = isl_format_has_int_channel(fmt.fmt) ? 
VFCOMP_STORE_1_INT + : VFCOMP_STORE_1_FP; + break; + } + crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) { +#if GFX_VER >= 6 + ve.EdgeFlagEnable = false; +#endif + ve.VertexBufferIndex = state[i].vertex_buffer_index; + ve.Valid = true; + ve.SourceElementOffset = state[i].src_offset; + ve.SourceElementFormat = actual_fmt; + ve.Component0Control = comp[0]; + ve.Component1Control = comp[1]; + ve.Component2Control = comp[2]; + ve.Component3Control = comp[3]; +#if GFX_VER < 5 + ve.DestinationElementOffset = i * 4; +#endif + } + + ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length); + } + + /* An alternative version of the last VE and VFI is stored so it + * can be used at draw time in case Vertex Shader uses EdgeFlag + */ + if (count) { + const unsigned edgeflag_index = count - 1; + const struct crocus_format_info fmt = + crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0); + crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) { +#if GFX_VER >= 6 + ve.EdgeFlagEnable = true; +#endif + ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index; + ve.Valid = true; + ve.SourceElementOffset = state[edgeflag_index].src_offset; + ve.SourceElementFormat = fmt.fmt; + ve.Component0Control = VFCOMP_STORE_SRC; + ve.Component1Control = VFCOMP_STORE_0; + ve.Component2Control = VFCOMP_STORE_0; + ve.Component3Control = VFCOMP_STORE_0; + } + } + + return cso; +} + +/** + * The pipe->bind_vertex_elements_state() driver hook. + */ +static void +crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + + ice->state.cso_vertex_elements = state; + ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS; + ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS]; +} + +#if GFX_VER >= 6 +struct crocus_streamout_counter { + uint32_t offset_start; + uint32_t offset_end; + + uint64_t accum; +}; + +/** + * Gallium CSO for stream output (transform feedback) targets. + */ +struct crocus_stream_output_target { + struct pipe_stream_output_target base; + + /** Stride (bytes-per-vertex) during this transform feedback operation */ + uint16_t stride; + + /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? 
*/ + bool zeroed; + + struct crocus_resource *offset_res; + uint32_t offset_offset; + +#if GFX_VER == 6 + void *prim_map; + struct crocus_streamout_counter prev_count; + struct crocus_streamout_counter count; +#endif +}; + +#if GFX_VER >= 7 +static uint32_t +crocus_get_so_offset(struct pipe_stream_output_target *so) +{ + struct crocus_stream_output_target *tgt = (void *)so; + struct pipe_transfer *transfer; + struct pipe_box box; + uint32_t result; + u_box_1d(tgt->offset_offset, 4, &box); + void *val = so->context->buffer_map(so->context, &tgt->offset_res->base, + 0, PIPE_MAP_DIRECTLY, + &box, &transfer); + assert(val); + result = *(uint32_t *)val; + so->context->buffer_unmap(so->context, transfer); + + return result / tgt->stride; +} +#endif + +#if GFX_VER == 6 +static void +compute_vertices_written_so_far(struct crocus_context *ice, + struct crocus_stream_output_target *tgt, + struct crocus_streamout_counter *count, + uint64_t *svbi); + +static uint32_t +crocus_get_so_offset(struct pipe_stream_output_target *so) +{ + struct crocus_stream_output_target *tgt = (void *)so; + struct crocus_context *ice = (void *)so->context; + + uint64_t vert_written; + compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written); + return vert_written; +} +#endif + +/** + * The pipe->create_stream_output_target() driver hook. + * + * "Target" here refers to a destination buffer. We translate this into + * a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet + * know which buffer this represents, or whether we ought to zero the + * write-offsets, or append. Those are handled in the set() hook. + */ +static struct pipe_stream_output_target * +crocus_create_stream_output_target(struct pipe_context *ctx, + struct pipe_resource *p_res, + unsigned buffer_offset, + unsigned buffer_size) +{ + struct crocus_resource *res = (void *) p_res; + struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso)); + if (!cso) + return NULL; + + res->bind_history |= PIPE_BIND_STREAM_OUTPUT; + + pipe_reference_init(&cso->base.reference, 1); + pipe_resource_reference(&cso->base.buffer, p_res); + cso->base.buffer_offset = buffer_offset; + cso->base.buffer_size = buffer_size; + cso->base.context = ctx; + + util_range_add(&res->base, &res->valid_buffer_range, buffer_offset, + buffer_offset + buffer_size); +#if GFX_VER >= 7 + struct crocus_context *ice = (struct crocus_context *) ctx; + void *temp; + u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4, + &cso->offset_offset, + (struct pipe_resource **)&cso->offset_res, + &temp); +#endif + + return &cso->base; +} + +static void +crocus_stream_output_target_destroy(struct pipe_context *ctx, + struct pipe_stream_output_target *state) +{ + struct crocus_stream_output_target *cso = (void *) state; + + pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL); + pipe_resource_reference(&cso->base.buffer, NULL); + + free(cso); +} + +#define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288 +#define GEN7_SO_WRITE_OFFSET(n) (0x5280 + (n) * 4) + +#if GFX_VER == 6 +static void +aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt, + struct crocus_streamout_counter *counter) +{ + uint64_t *prim_counts = tgt->prim_map; + + if (crocus_batch_references(batch, tgt->offset_res->bo)) { + struct pipe_fence_handle *out_fence = NULL; + batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0); + batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX); + 
batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL); + } + + for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) { + counter->accum += prim_counts[i + 1] - prim_counts[i]; + } + tgt->count.offset_start = tgt->count.offset_end = 0; +} + +static void +crocus_stream_store_prims_written(struct crocus_batch *batch, + struct crocus_stream_output_target *tgt) +{ + if (!tgt->offset_res) { + u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4, + &tgt->offset_offset, + (struct pipe_resource **)&tgt->offset_res, + &tgt->prim_map); + tgt->count.offset_start = tgt->count.offset_end = 0; + } + + if (tgt->count.offset_end + 16 >= 4096) { + aggregate_stream_counter(batch, tgt, &tgt->prev_count); + aggregate_stream_counter(batch, tgt, &tgt->count); + } + + crocus_emit_mi_flush(batch); + crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN, + tgt->offset_res->bo, + tgt->count.offset_end + tgt->offset_offset, false); + tgt->count.offset_end += 8; +} + +static void +compute_vertices_written_so_far(struct crocus_context *ice, + struct crocus_stream_output_target *tgt, + struct crocus_streamout_counter *counter, + uint64_t *svbi) +{ + //TODO vertices per prim + aggregate_stream_counter(&ice->batches[0], tgt, counter); + + *svbi = counter->accum * ice->state.last_xfb_verts_per_prim; +} +#endif +/** + * The pipe->set_stream_output_targets() driver hook. + * + * At this point, we know which targets are bound to a particular index, + * and also whether we want to append or start over. We can finish the + * 3DSTATE_SO_BUFFER packets we started earlier. + */ +static void +crocus_set_stream_output_targets(struct pipe_context *ctx, + unsigned num_targets, + struct pipe_stream_output_target **targets, + const unsigned *offsets) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL }; + const bool active = num_targets > 0; + if (ice->state.streamout_active != active) { + ice->state.streamout_active = active; +#if GFX_VER >= 7 + ice->state.dirty |= CROCUS_DIRTY_STREAMOUT; +#else + ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG; +#endif + + /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because + * it's a non-pipelined command. If we're switching streamout on, we + * may have missed emitting it earlier, so do so now. (We're already + * taking a stall to update 3DSTATE_SO_BUFFERS anyway...) + */ + if (active) { +#if GFX_VER >= 7 + ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST; +#endif + } else { + uint32_t flush = 0; + for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + struct crocus_stream_output_target *tgt = + (void *) ice->state.so_target[i]; + if (tgt) { + struct crocus_resource *res = (void *) tgt->base.buffer; + + flush |= crocus_flush_bits_for_history(res); + crocus_dirty_for_history(ice, res); + } + } + crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER], + "make streamout results visible", flush); + } + } + + ice->state.so_targets = num_targets; + for (int i = 0; i < 4; i++) { + pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]); + pipe_so_target_reference(&ice->state.so_target[i], + i < num_targets ? 
targets[i] : NULL); + } + +#if GFX_VER == 6 + bool stored_num_prims = false; + for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + if (num_targets) { + struct crocus_stream_output_target *tgt = + (void *) ice->state.so_target[i]; + + if (!tgt) + continue; + if (offsets[i] == 0) { + // This means that we're supposed to ignore anything written to + // the buffer before. We can do this by just clearing out the + // count of writes to the prim count buffer. + tgt->count.offset_start = tgt->count.offset_end; + tgt->count.accum = 0; + ice->state.svbi = 0; + } else { + if (tgt->offset_res) { + compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi); + tgt->count.offset_start = tgt->count.offset_end; + } + } + + if (!stored_num_prims) { + crocus_stream_store_prims_written(batch, tgt); + stored_num_prims = true; + } + } else { + struct crocus_stream_output_target *tgt = + (void *) old_tgt[i]; + if (tgt) { + if (!stored_num_prims) { + crocus_stream_store_prims_written(batch, tgt); + stored_num_prims = true; + } + + if (tgt->offset_res) { + tgt->prev_count = tgt->count; + } + } + } + pipe_so_target_reference(&old_tgt[i], NULL); + } + +#else + for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + if (num_targets) { + struct crocus_stream_output_target *tgt = + (void *) ice->state.so_target[i]; + + if (offsets[i] == 0) + crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0); + else if (tgt) + crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i), + tgt->offset_res->bo, + tgt->offset_offset); + } else { + struct crocus_stream_output_target *tgt = + (void *) old_tgt[i]; + if (tgt) + crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i), + tgt->offset_res->bo, + tgt->offset_offset, false); + } + pipe_so_target_reference(&old_tgt[i], NULL); + } +#endif + /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */ + if (!active) + return; +#if GFX_VER >= 7 + ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS; +#elif GFX_VER == 6 + ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS; +#endif +} + +#endif + +#if GFX_VER >= 7 +/** + * An crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and + * 3DSTATE_STREAMOUT packets. + * + * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout + * hardware to record. We can create it entirely based on the shader, with + * no dynamic state dependencies. + * + * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and + * state-based settings. We capture the shader-related ones here, and merge + * the rest in at draw time. + */ +static uint32_t * +crocus_create_so_decl_list(const struct pipe_stream_output_info *info, + const struct brw_vue_map *vue_map) +{ + struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128]; + int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; + int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; + int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; + int max_decls = 0; + STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS); + + memset(so_decl, 0, sizeof(so_decl)); + + /* Construct the list of SO_DECLs to be emitted. The formatting of the + * command feels strange -- each dword pair contains a SO_DECL per stream. 
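+ * (Each SO_DECL_ENTRY packs one SO_DECL for every stream, so all four
+ * streams advance through the entry list in lockstep.)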
+ */ + for (unsigned i = 0; i < info->num_outputs; i++) { + const struct pipe_stream_output *output = &info->output[i]; + const int buffer = output->output_buffer; + const int varying = output->register_index; + const unsigned stream_id = output->stream; + assert(stream_id < MAX_VERTEX_STREAMS); + + buffer_mask[stream_id] |= 1 << buffer; + + assert(vue_map->varying_to_slot[varying] >= 0); + + /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[] + * array. Instead, it simply increments DstOffset for the following + * input by the number of components that should be skipped. + * + * Our hardware is unusual in that it requires us to program SO_DECLs + * for fake "hole" components, rather than simply taking the offset + * for each real varying. Each hole can have size 1, 2, 3, or 4; we + * program as many size = 4 holes as we can, then a final hole to + * accommodate the final 1, 2, or 3 remaining. + */ + int skip_components = output->dst_offset - next_offset[buffer]; + + while (skip_components > 0) { + so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) { + .HoleFlag = 1, + .OutputBufferSlot = output->output_buffer, + .ComponentMask = (1 << MIN2(skip_components, 4)) - 1, + }; + skip_components -= 4; + } + + next_offset[buffer] = output->dst_offset + output->num_components; + + so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) { + .OutputBufferSlot = output->output_buffer, + .RegisterIndex = vue_map->varying_to_slot[varying], + .ComponentMask = + ((1 << output->num_components) - 1) << output->start_component, + }; + + if (decls[stream_id] > max_decls) + max_decls = decls[stream_id]; + } + + unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls); + uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords); + uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length); + + crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) { + int urb_entry_read_offset = 0; + int urb_entry_read_length = (vue_map->num_slots + 1) / 2 - + urb_entry_read_offset; + + /* We always read the whole vertex. This could be reduced at some + * point by reading less and offsetting the register index in the + * SO_DECLs. + */ + sol.Stream0VertexReadOffset = urb_entry_read_offset; + sol.Stream0VertexReadLength = urb_entry_read_length - 1; + sol.Stream1VertexReadOffset = urb_entry_read_offset; + sol.Stream1VertexReadLength = urb_entry_read_length - 1; + sol.Stream2VertexReadOffset = urb_entry_read_offset; + sol.Stream2VertexReadLength = urb_entry_read_length - 1; + sol.Stream3VertexReadOffset = urb_entry_read_offset; + sol.Stream3VertexReadLength = urb_entry_read_length - 1; + + // TODO: Double-check that stride == 0 means no buffer. Probably this + // needs to go elsewhere, where the buffer enable stuff is actually + // known. 
+ sol.SOBufferEnable0 = !!info->stride[0]; + sol.SOBufferEnable1 = !!info->stride[1]; + sol.SOBufferEnable2 = !!info->stride[2]; + sol.SOBufferEnable3 = !!info->stride[3]; + } + + crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) { + list.DWordLength = 3 + 2 * max_decls - 2; + list.StreamtoBufferSelects0 = buffer_mask[0]; + list.StreamtoBufferSelects1 = buffer_mask[1]; + list.StreamtoBufferSelects2 = buffer_mask[2]; + list.StreamtoBufferSelects3 = buffer_mask[3]; + list.NumEntries0 = decls[0]; + list.NumEntries1 = decls[1]; + list.NumEntries2 = decls[2]; + list.NumEntries3 = decls[3]; + } + + for (int i = 0; i < max_decls; i++) { + crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) { + entry.Stream0Decl = so_decl[0][i]; + entry.Stream1Decl = so_decl[1][i]; + entry.Stream2Decl = so_decl[2][i]; + entry.Stream3Decl = so_decl[3][i]; + } + } + + return map; +} +#endif + +#if GFX_VER == 6 +static void +crocus_emit_so_svbi(struct crocus_context *ice) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + + unsigned max_vertex = 0xffffffff; + for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + struct crocus_stream_output_target *tgt = + (void *) ice->state.so_target[i]; + if (tgt) + max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride); + } + + crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) { + svbi.IndexNumber = 0; + svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */ + svbi.MaximumIndex = max_vertex; + } + + /* initialize the rest of the SVBI's to reasonable values so that we don't + * run out of room writing the regular data. + */ + for (int i = 1; i < 4; i++) { + crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) { + svbi.IndexNumber = i; + svbi.StreamedVertexBufferIndex = 0; + svbi.MaximumIndex = 0xffffffff; + } + } +} + +#endif + + +#if GFX_VER >= 6 +static bool +crocus_is_drawing_points(const struct crocus_context *ice) +{ + const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; + + if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT || + cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT) + return true; + + if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) { + const struct brw_gs_prog_data *gs_prog_data = + (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data; + return gs_prog_data->output_topology == _3DPRIM_POINTLIST; + } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) { + const struct brw_tes_prog_data *tes_data = + (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data; + return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT; + } else { + return ice->state.prim_mode == PIPE_PRIM_POINTS; + } +} +#endif + +#if GFX_VER >= 6 +static void +get_attr_override( + struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr, + const struct brw_vue_map *vue_map, + int urb_entry_read_offset, int fs_attr, + bool two_side_color, uint32_t *max_source_attr) +{ + /* Find the VUE slot for this attribute. */ + int slot = vue_map->varying_to_slot[fs_attr]; + + /* Viewport and Layer are stored in the VUE header. We need to override + * them to zero if earlier stages didn't write them, as GL requires that + * they read back as zero when not explicitly set. 
+ */ + if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) { + attr->ComponentOverrideX = true; + attr->ComponentOverrideW = true; + attr->ConstantSource = CONST_0000; + + if (!(vue_map->slots_valid & VARYING_BIT_LAYER)) + attr->ComponentOverrideY = true; + if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT)) + attr->ComponentOverrideZ = true; + + return; + } + + /* If there was only a back color written but not front, use back + * as the color instead of undefined + */ + if (slot == -1 && fs_attr == VARYING_SLOT_COL0) + slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0]; + if (slot == -1 && fs_attr == VARYING_SLOT_COL1) + slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1]; + + if (slot == -1) { + /* This attribute does not exist in the VUE--that means that the vertex + * shader did not write to it. This means that either: + * + * (a) This attribute is a texture coordinate, and it is going to be + * replaced with point coordinates (as a consequence of a call to + * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the + * hardware will ignore whatever attribute override we supply. + * + * (b) This attribute is read by the fragment shader but not written by + * the vertex shader, so its value is undefined. Therefore the + * attribute override we supply doesn't matter. + * + * (c) This attribute is gl_PrimitiveID, and it wasn't written by the + * previous shader stage. + * + * Note that we don't have to worry about the cases where the attribute + * is gl_PointCoord or is undergoing point sprite coordinate + * replacement, because in those cases, this function isn't called. + * + * In case (c), we need to program the attribute overrides so that the + * primitive ID will be stored in this slot. In every other case, the + * attribute override we supply doesn't matter. So just go ahead and + * program primitive ID in every case. + */ + attr->ComponentOverrideW = true; + attr->ComponentOverrideX = true; + attr->ComponentOverrideY = true; + attr->ComponentOverrideZ = true; + attr->ConstantSource = PRIM_ID; + return; + } + + /* Compute the location of the attribute relative to urb_entry_read_offset. + * Each increment of urb_entry_read_offset represents a 256-bit value, so + * it counts for two 128-bit VUE slots. + */ + int source_attr = slot - 2 * urb_entry_read_offset; + assert(source_attr >= 0 && source_attr < 32); + + /* If we are doing two-sided color, and the VUE slot following this one + * represents a back-facing color, then we need to instruct the SF unit to + * do back-facing swizzling. + */ + bool swizzling = two_side_color && + ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 && + vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) || + (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 && + vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)); + + /* Update max_source_attr. If swizzling, the SF will read this slot + 1. 
*/ + if (*max_source_attr < source_attr + swizzling) + *max_source_attr = source_attr + swizzling; + + attr->SourceAttribute = source_attr; + if (swizzling) + attr->SwizzleSelect = INPUTATTR_FACING; +} + +static void +calculate_attr_overrides( + const struct crocus_context *ice, + struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides, + uint32_t *point_sprite_enables, + uint32_t *urb_entry_read_length, + uint32_t *urb_entry_read_offset) +{ + const struct brw_wm_prog_data *wm_prog_data = (void *) + ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data; + const struct brw_vue_map *vue_map = ice->shaders.last_vue_map; + const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; + uint32_t max_source_attr = 0; + const struct shader_info *fs_info = + crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT); + + int first_slot = + brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map); + + /* Each URB offset packs two varying slots */ + assert(first_slot % 2 == 0); + *urb_entry_read_offset = first_slot / 2; + *point_sprite_enables = 0; + + for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) { + const int input_index = wm_prog_data->urb_setup[fs_attr]; + + if (input_index < 0) + continue; + + bool point_sprite = false; + if (crocus_is_drawing_points(ice)) { + if (fs_attr >= VARYING_SLOT_TEX0 && + fs_attr <= VARYING_SLOT_TEX7 && + cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0))) + point_sprite = true; + + if (fs_attr == VARYING_SLOT_PNTC) + point_sprite = true; + + if (point_sprite) + *point_sprite_enables |= 1U << input_index; + } + + struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 }; + if (!point_sprite) { + get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr, + cso_rast->cso.light_twoside, &max_source_attr); + } + + /* The hardware can only do the overrides on 16 overrides at a + * time, and the other up to 16 have to be lined up so that the + * input index = the output index. We'll need to do some + * tweaking to make sure that's the case. + */ + if (input_index < 16) + attr_overrides[input_index] = attribute; + else + assert(attribute.SourceAttribute == input_index); + } + + /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for + * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length": + * + * "This field should be set to the minimum length required to read the + * maximum source attribute. The maximum source attribute is indicated + * by the maximum value of the enabled Attribute # Source Attribute if + * Attribute Swizzle Enable is set, Number of Output Attributes-1 if + * enable is not set. + * read_length = ceiling((max_source_attr + 1) / 2) + * + * [errata] Corruption/Hang possible if length programmed larger than + * recommended" + * + * Similar text exists for Ivy Bridge. 
+ */ + *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2); +} +#endif + +#if GFX_VER == 7 +static void +crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice) +{ + const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; + const struct brw_wm_prog_data *wm_prog_data = (void *) + ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data; + + uint32_t urb_entry_read_length; + uint32_t urb_entry_read_offset; + uint32_t point_sprite_enables; + + crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) { + sbe.AttributeSwizzleEnable = true; + sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; + sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode; + + calculate_attr_overrides(ice, + sbe.Attribute, + &point_sprite_enables, + &urb_entry_read_length, + &urb_entry_read_offset); + sbe.VertexURBEntryReadOffset = urb_entry_read_offset; + sbe.VertexURBEntryReadLength = urb_entry_read_length; + sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs; + sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables; + } +} +#endif + +/* ------------------------------------------------------------------- */ + +/** + * Populate VS program key fields based on the current state. + */ +static void +crocus_populate_vs_key(const struct crocus_context *ice, + const struct shader_info *info, + gl_shader_stage last_stage, + struct brw_vs_prog_key *key) +{ + const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; + + if (info->clip_distance_array_size == 0 && + (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && + last_stage == MESA_SHADER_VERTEX) + key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; + +#if GFX_VER <= 5 + key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL || + cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL); + key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff; +#endif + + key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color; + +#if !(GFX_VERx10 == 75) + uint64_t inputs_read = info->inputs_read; + int ve_idx = 0; + while (inputs_read) { + int i = u_bit_scan64(&inputs_read); + key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx]; + ve_idx++; + } +#endif +} + +/** + * Populate TCS program key fields based on the current state. + */ +static void +crocus_populate_tcs_key(const struct crocus_context *ice, + struct brw_tcs_prog_key *key) +{ +} + +/** + * Populate TES program key fields based on the current state. + */ +static void +crocus_populate_tes_key(const struct crocus_context *ice, + const struct shader_info *info, + gl_shader_stage last_stage, + struct brw_tes_prog_key *key) +{ + const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; + + if (info->clip_distance_array_size == 0 && + (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && + last_stage == MESA_SHADER_TESS_EVAL) + key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; +} + +/** + * Populate GS program key fields based on the current state. 
+ */ +static void +crocus_populate_gs_key(const struct crocus_context *ice, + const struct shader_info *info, + gl_shader_stage last_stage, + struct brw_gs_prog_key *key) +{ + const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; + + if (info->clip_distance_array_size == 0 && + (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) && + last_stage == MESA_SHADER_GEOMETRY) + key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts; +} + +/** + * Populate FS program key fields based on the current state. + */ +static void +crocus_populate_fs_key(const struct crocus_context *ice, + const struct shader_info *info, + struct brw_wm_prog_key *key) +{ + struct crocus_screen *screen = (void *) ice->ctx.screen; + const struct pipe_framebuffer_state *fb = &ice->state.framebuffer; + const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa; + const struct crocus_rasterizer_state *rast = ice->state.cso_rast; + const struct crocus_blend_state *blend = ice->state.cso_blend; + +#if GFX_VER < 6 + uint32_t lookup = 0; + + if (info->fs.uses_discard || zsa->cso.alpha_enabled) + lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT; + + if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) + lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT; + + if (fb->zsbuf && zsa->cso.depth_enabled) { + lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT; + + if (zsa->cso.depth_writemask) + lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT; + + } + if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) { + lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT; + if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask) + lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT; + } + key->iz_lookup = lookup; + key->stats_wm = ice->state.stats_wm; +#endif + + uint32_t line_aa = BRW_WM_AA_NEVER; + if (rast->cso.line_smooth) { + int reduced_prim = u_reduced_prim(ice->state.prim_mode); + if (reduced_prim == PIPE_PRIM_LINES) + line_aa = BRW_WM_AA_ALWAYS; + else if (reduced_prim == PIPE_PRIM_TRIANGLES) { + if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) { + line_aa = BRW_WM_AA_SOMETIMES; + + if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE || + rast->cso.cull_face == PIPE_FACE_BACK) + line_aa = BRW_WM_AA_ALWAYS; + } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) { + line_aa = BRW_WM_AA_SOMETIMES; + + if (rast->cso.cull_face == PIPE_FACE_FRONT) + line_aa = BRW_WM_AA_ALWAYS; + } + } + } + key->line_aa = line_aa; + + key->nr_color_regions = fb->nr_cbufs; + + key->clamp_fragment_color = rast->cso.clamp_fragment_color; + + key->alpha_to_coverage = blend->cso.alpha_to_coverage; + + key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled; + + key->flat_shade = rast->cso.flatshade && + (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1)); + + key->persample_interp = rast->cso.force_persample_interp; + key->multisample_fbo = rast->cso.multisample && fb->samples > 1; + + key->ignore_sample_mask_out = !key->multisample_fbo; + key->coherent_fb_fetch = false; // TODO: needed? 
+ + key->force_dual_color_blend = + screen->driconf.dual_color_blend_by_location && + (blend->blend_enables & 1) && blend->dual_color_blending; + + /* TODO: Respect glHint for key->high_quality_derivatives */ + +#if GFX_VER <= 5 + if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) { + key->alpha_test_func = zsa->cso.alpha_func; + key->alpha_test_ref = zsa->cso.alpha_ref_value; + } +#endif +} + +static void +crocus_populate_cs_key(const struct crocus_context *ice, + struct brw_cs_prog_key *key) +{ +} + +#if GFX_VER == 4 +#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset); +#elif GFX_VER >= 5 +static uint64_t +KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader) +{ + return shader->offset; +} +#endif + +/* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable + * prefetching of binding tables in A0 and B0 steppings. XXX: Revisit + * this WA on C0 stepping. + * + * TODO: Fill out SamplerCount for prefetching? + */ + +#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \ + pkt.KernelStartPointer = KSP(ice, shader); \ + pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \ + pkt.FloatingPointMode = prog_data->use_alt_mode; \ + \ + pkt.DispatchGRFStartRegisterForURBData = \ + prog_data->dispatch_grf_start_reg; \ + pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \ + pkt.prefix##URBEntryReadOffset = 0; \ + \ + pkt.StatisticsEnable = true; \ + pkt.Enable = true; \ + \ + if (prog_data->total_scratch) { \ + struct crocus_bo *bo = \ + crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \ + pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \ + pkt.ScratchSpaceBasePointer = rw_bo(bo, 0); \ + } + +/* ------------------------------------------------------------------- */ +#if GFX_VER >= 6 +static const uint32_t push_constant_opcodes[] = { + [MESA_SHADER_VERTEX] = 21, + [MESA_SHADER_TESS_CTRL] = 25, /* HS */ + [MESA_SHADER_TESS_EVAL] = 26, /* DS */ + [MESA_SHADER_GEOMETRY] = 22, + [MESA_SHADER_FRAGMENT] = 23, + [MESA_SHADER_COMPUTE] = 0, +}; +#endif + +static void +emit_sized_null_surface(struct crocus_batch *batch, + unsigned width, unsigned height, + unsigned layers, unsigned levels, + unsigned minimum_array_element, + uint32_t *out_offset) +{ + struct isl_device *isl_dev = &batch->screen->isl_dev; + uint32_t *surf = stream_state(batch, isl_dev->ss.size, + isl_dev->ss.align, + out_offset); + //TODO gen 6 multisample crash + isl_null_fill_state(isl_dev, surf, + .size = isl_extent3d(width, height, layers), + .levels = levels, + .minimum_array_element = minimum_array_element); +} +static void +emit_null_surface(struct crocus_batch *batch, + uint32_t *out_offset) +{ + emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset); +} + +static void +emit_null_fb_surface(struct crocus_batch *batch, + struct crocus_context *ice, + uint32_t *out_offset) +{ + uint32_t width, height, layers, level, layer; + /* If set_framebuffer_state() was never called, fall back to 1x1x1 */ + if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) { + emit_null_surface(batch, out_offset); + return; + } + + struct pipe_framebuffer_state *cso = &ice->state.framebuffer; + width = MAX2(cso->width, 1); + height = MAX2(cso->height, 1); + layers = cso->layers ? 
cso->layers : 1; + level = 0; + layer = 0; + + if (cso->nr_cbufs == 0 && cso->zsbuf) { + width = cso->zsbuf->width; + height = cso->zsbuf->height; + level = cso->zsbuf->u.tex.level; + layer = cso->zsbuf->u.tex.first_layer; + } + emit_sized_null_surface(batch, width, height, + layers, level, layer, + out_offset); +} + +static void +emit_surface_state(struct crocus_batch *batch, + struct crocus_resource *res, + const struct isl_surf *in_surf, + bool adjust_surf, + struct isl_view *view, + bool writeable, + enum isl_aux_usage aux_usage, + bool blend_enable, + uint32_t write_disables, + uint32_t *surf_state, + uint32_t addr_offset) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + struct isl_device *isl_dev = &batch->screen->isl_dev; + uint32_t reloc = RELOC_32BIT; + uint32_t offset = res->offset, tile_x_sa = 0, tile_y_sa = 0; + + if (writeable) + reloc |= RELOC_WRITE; + + struct isl_surf surf = *in_surf; + if (adjust_surf) { + if (res->base.target == PIPE_TEXTURE_3D && view->array_len == 1) { + isl_surf_get_image_surf(isl_dev, in_surf, + view->base_level, 0, + view->base_array_layer, + &surf, &offset, + &tile_x_sa, &tile_y_sa); + view->base_array_layer = 0; + view->base_level = 0; + } else if (res->base.target == PIPE_TEXTURE_CUBE && devinfo->ver == 4) { + isl_surf_get_image_surf(isl_dev, in_surf, + view->base_level, view->base_array_layer, + 0, + &surf, &offset, + &tile_x_sa, &tile_y_sa); + view->base_array_layer = 0; + view->base_level = 0; + } else if (res->base.target == PIPE_TEXTURE_1D_ARRAY) + surf.dim = ISL_SURF_DIM_2D; + } + + union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } }; + struct crocus_bo *aux_bo = NULL; + uint32_t aux_offset = 0; + struct isl_surf *aux_surf = NULL; + if (aux_usage != ISL_AUX_USAGE_NONE) { + aux_surf = &res->aux.surf; + aux_offset = res->aux.offset; + aux_bo = res->aux.bo; + + clear_color = crocus_resource_get_clear_color(res); + } + + isl_surf_fill_state(isl_dev, surf_state, + .surf = &surf, + .view = view, + .address = crocus_state_reloc(batch, + addr_offset + isl_dev->ss.addr_offset, + res->bo, offset, reloc), + .aux_surf = aux_surf, + .aux_usage = aux_usage, + .aux_address = aux_offset, + .mocs = crocus_mocs(res->bo, isl_dev), + .clear_color = clear_color, + .use_clear_address = false, + .clear_address = 0, + .x_offset_sa = tile_x_sa, + .y_offset_sa = tile_y_sa, +#if GFX_VER <= 5 + .blend_enable = blend_enable, + .write_disables = write_disables, +#endif + ); + + if (aux_surf) { + /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the + * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits + * contain other control information. Since buffer addresses are always + * on 4k boundaries (and thus have their lower 12 bits zero), we can use + * an ordinary reloc to do the necessary address translation. + * + * FIXME: move to the point of assignment. 
+ */ + uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4); + *aux_addr = crocus_state_reloc(batch, + addr_offset + isl_dev->ss.aux_addr_offset, + aux_bo, *aux_addr, + reloc); + } + +} + +static uint32_t +emit_surface(struct crocus_batch *batch, + struct crocus_surface *surf, + enum isl_aux_usage aux_usage, + bool blend_enable, + uint32_t write_disables) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + struct isl_device *isl_dev = &batch->screen->isl_dev; + struct crocus_resource *res = (struct crocus_resource *)surf->base.texture; + struct isl_view *view = &surf->view; + uint32_t offset = 0; + enum pipe_texture_target target = res->base.target; + bool adjust_surf = false; + + if (devinfo->ver == 4 && target == PIPE_TEXTURE_CUBE) + adjust_surf = true; + + if (surf->align_res) + res = (struct crocus_resource *)surf->align_res; + + uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset); + + emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true, + aux_usage, blend_enable, + write_disables, + surf_state, offset); + return offset; +} + +static uint32_t +emit_rt_surface(struct crocus_batch *batch, + struct crocus_surface *surf, + enum isl_aux_usage aux_usage) +{ + struct isl_device *isl_dev = &batch->screen->isl_dev; + struct crocus_resource *res = (struct crocus_resource *)surf->base.texture; + struct isl_view *view = &surf->read_view; + uint32_t offset = 0; + uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset); + + emit_surface_state(batch, res, &surf->surf, true, view, false, + aux_usage, 0, false, + surf_state, offset); + return offset; +} + +static uint32_t +emit_grid(struct crocus_context *ice, + struct crocus_batch *batch) +{ + UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev; + uint32_t offset = 0; + struct crocus_state_ref *grid_ref = &ice->state.grid_size; + uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, + isl_dev->ss.align, &offset); + isl_buffer_fill_state(isl_dev, surf_state, + .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset, + crocus_resource_bo(grid_ref->res), + grid_ref->offset, + RELOC_32BIT), + .size_B = 12, + .format = ISL_FORMAT_RAW, + .stride_B = 1, + .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev)); + return offset; +} + +static uint32_t +emit_ubo_buffer(struct crocus_context *ice, + struct crocus_batch *batch, + struct pipe_constant_buffer *buffer) +{ + UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev; + uint32_t offset = 0; + + uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, + isl_dev->ss.align, &offset); + isl_buffer_fill_state(isl_dev, surf_state, + .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset, + crocus_resource_bo(buffer->buffer), + buffer->buffer_offset, + RELOC_32BIT), + .size_B = buffer->buffer_size, + .format = 0, + .swizzle = ISL_SWIZZLE_IDENTITY, + .stride_B = 1, + .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev)); + + return offset; +} + +static uint32_t +emit_ssbo_buffer(struct crocus_context *ice, + struct crocus_batch *batch, + struct pipe_shader_buffer *buffer, bool writeable) +{ + UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev; + uint32_t offset = 0; + uint32_t reloc = RELOC_32BIT; + + if (writeable) + reloc |= RELOC_WRITE; + uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, + isl_dev->ss.align, &offset); + isl_buffer_fill_state(isl_dev, surf_state, + .address = 
crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset, + crocus_resource_bo(buffer->buffer), + buffer->buffer_offset, + reloc), + .size_B = buffer->buffer_size, + .format = ISL_FORMAT_RAW, + .swizzle = ISL_SWIZZLE_IDENTITY, + .stride_B = 1, + .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev)); + + return offset; +} + +static uint32_t +emit_sampler_view(struct crocus_context *ice, + struct crocus_batch *batch, + bool for_gather, + struct crocus_sampler_view *isv) +{ + UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev; + uint32_t offset = 0; + + uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, + isl_dev->ss.align, &offset); + + if (isv->base.target == PIPE_BUFFER) { + const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format); + const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8; + unsigned final_size = + MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset, + CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp); + isl_buffer_fill_state(isl_dev, surf_state, + .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset, + isv->res->bo, + isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT), + .size_B = final_size, + .format = isv->view.format, + .swizzle = isv->view.swizzle, + .stride_B = cpp, + .mocs = crocus_mocs(isv->res->bo, isl_dev) + ); + } else { + enum isl_aux_usage aux_usage = + crocus_resource_texture_aux_usage(isv->res); + + emit_surface_state(batch, isv->res, &isv->res->surf, false, + for_gather ? &isv->gather_view : &isv->view, + false, aux_usage, false, + 0, surf_state, offset); + } + return offset; +} + +static uint32_t +emit_image_view(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_image_view *iv) +{ + UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev; + uint32_t offset = 0; + + struct crocus_resource *res = (struct crocus_resource *)iv->base.resource; + uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, + isl_dev->ss.align, &offset); + bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE; + uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0); + if (res->base.target == PIPE_BUFFER) { + const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format); + const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 
1 : fmtl->bpb / 8; + unsigned final_size = + MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset, + CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp); + isl_buffer_fill_state(isl_dev, surf_state, + .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset, + res->bo, + res->offset + iv->base.u.buf.offset, reloc), + .size_B = final_size, + .format = iv->view.format, + .swizzle = iv->view.swizzle, + .stride_B = cpp, + .mocs = crocus_mocs(res->bo, isl_dev) + ); + } else { + if (iv->view.format == ISL_FORMAT_RAW) { + isl_buffer_fill_state(isl_dev, surf_state, + .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset, + res->bo, + res->offset, reloc), + .size_B = res->bo->size - res->offset, + .format = iv->view.format, + .swizzle = iv->view.swizzle, + .stride_B = 1, + .mocs = crocus_mocs(res->bo, isl_dev), + ); + + + } else { + emit_surface_state(batch, res, + &res->surf, false, &iv->view, + write, 0, false, + 0, surf_state, offset); + } + } + + return offset; +} + +#if GFX_VER == 6 +static uint32_t +emit_sol_surface(struct crocus_batch *batch, + struct pipe_stream_output_info *so_info, + uint32_t idx) +{ + struct crocus_context *ice = batch->ice; + + if (idx >= so_info->num_outputs || !ice->state.streamout_active) + return 0; + const struct pipe_stream_output *output = &so_info->output[idx]; + const int buffer = output->output_buffer; + assert(output->stream == 0); + + struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer; + unsigned stride_dwords = so_info->stride[buffer]; + unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset; + + size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4; + unsigned num_vector_components = output->num_components; + unsigned num_elements; + /* FIXME: can we rely on core Mesa to ensure that the buffer isn't + * too big to map using a single binding table entry? + */ + // assert((size_dwords - offset_dwords) / stride_dwords + // <= BRW_MAX_NUM_BUFFER_ENTRIES); + + if (size_dwords > offset_dwords + num_vector_components) { + /* There is room for at least 1 transform feedback output in the buffer. + * Compute the number of additional transform feedback outputs the + * buffer has room for. + */ + num_elements = + (size_dwords - offset_dwords - num_vector_components); + } else { + /* There isn't even room for a single transform feedback output in the + * buffer. We can't configure the binding table entry to prevent output + * entirely; we'll have to rely on the geometry shader to detect + * overflow. But to minimize the damage in case of a bug, set up the + * binding table entry to just allow a single output. 
+ */ + num_elements = 0; + } + num_elements += stride_dwords; + + uint32_t surface_format; + switch (num_vector_components) { + case 1: + surface_format = ISL_FORMAT_R32_FLOAT; + break; + case 2: + surface_format = ISL_FORMAT_R32G32_FLOAT; + break; + case 3: + surface_format = ISL_FORMAT_R32G32B32_FLOAT; + break; + case 4: + surface_format = ISL_FORMAT_R32G32B32A32_FLOAT; + break; + default: + unreachable("Invalid vector size for transform feedback output"); + } + + UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev; + uint32_t offset = 0; + + uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, + isl_dev->ss.align, &offset); + isl_buffer_fill_state(isl_dev, surf_state, + .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset, + crocus_resource_bo(&buf->base), + offset_dwords * 4, RELOC_32BIT|RELOC_WRITE), + .size_B = num_elements * 4, + .stride_B = stride_dwords * 4, + .swizzle = ISL_SWIZZLE_IDENTITY, + .format = surface_format); + return offset; +} +#endif + +#define foreach_surface_used(index, group) \ + for (int index = 0; index < bt->sizes[group]; index++) \ + if (crocus_group_index_to_bti(bt, group, index) != \ + CROCUS_SURFACE_NOT_USED) + +static void +crocus_populate_binding_table(struct crocus_context *ice, + struct crocus_batch *batch, + gl_shader_stage stage, bool ff_gs) +{ + struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage]; + struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage]; + if (!shader) + return; + + struct crocus_binding_table *bt = &shader->bt; + int s = 0; + uint32_t *surf_offsets = shader->surf_offset; + + const struct shader_info *info = crocus_get_shader_info(ice, stage); + + if (stage == MESA_SHADER_FRAGMENT) { + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */ + if (cso_fb->nr_cbufs) { + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + uint32_t write_disables = 0; + bool blend_enable = false; +#if GFX_VER <= 5 + const struct pipe_rt_blend_state *rt = + &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0]; + write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8; + write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4; + write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2; + write_disables |= (rt->colormask & PIPE_MASK_B) ? 
0x0 : 0x1; + blend_enable = rt->blend_enable; +#endif + if (cso_fb->cbufs[i]) { + surf_offsets[s] = emit_surface(batch, + (struct crocus_surface *)cso_fb->cbufs[i], + ice->state.draw_aux_usage[i], + blend_enable, + write_disables); + } else { + emit_null_fb_surface(batch, ice, &surf_offsets[s]); + } + s++; + } + } else { + emit_null_fb_surface(batch, ice, &surf_offsets[s]); + s++; + } + + foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) { + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + if (cso_fb->cbufs[i]) { + surf_offsets[s++] = emit_rt_surface(batch, + (struct crocus_surface *)cso_fb->cbufs[i], + ice->state.draw_aux_usage[i]); + } + } + } + + if (stage == MESA_SHADER_COMPUTE) { + foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) { + surf_offsets[s] = emit_grid(ice, batch); + s++; + } + } + +#if GFX_VER == 6 + if (stage == MESA_SHADER_GEOMETRY) { + struct pipe_stream_output_info *so_info; + if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) + so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output; + else + so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output; + + foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) { + surf_offsets[s] = emit_sol_surface(batch, so_info, i); + s++; + } + } +#endif + + foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) { + struct crocus_sampler_view *view = shs->textures[i]; + if (view) + surf_offsets[s] = emit_sampler_view(ice, batch, false, view); + else + emit_null_surface(batch, &surf_offsets[s]); + s++; + } + + if (info && info->uses_texture_gather) { + foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) { + struct crocus_sampler_view *view = shs->textures[i]; + if (view) + surf_offsets[s] = emit_sampler_view(ice, batch, true, view); + else + emit_null_surface(batch, &surf_offsets[s]); + s++; + } + } + + foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) { + struct crocus_image_view *view = &shs->image[i]; + if (view->base.resource) + surf_offsets[s] = emit_image_view(ice, batch, view); + else + emit_null_surface(batch, &surf_offsets[s]); + s++; + } + foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) { + if (shs->constbufs[i].buffer) + surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]); + else + emit_null_surface(batch, &surf_offsets[s]); + s++; + } + foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) { + if (shs->ssbo[i].buffer) + surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i], + !!(shs->writable_ssbos & (1 << i))); + else + emit_null_surface(batch, &surf_offsets[s]); + s++; + } + +} +/* ------------------------------------------------------------------- */ +static uint32_t +crocus_upload_binding_table(struct crocus_context *ice, + struct crocus_batch *batch, + uint32_t *table, + uint32_t size) + +{ + if (size == 0) + return 0; + return emit_state(batch, table, size, 32); +} + +/** + * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address. + */ + +static void +crocus_update_surface_base_address(struct crocus_batch *batch) +{ + if (batch->state_base_address_emitted) + return; +#if GFX_VER >= 6 + uint32_t mocs = batch->screen->isl_dev.mocs.internal; +#endif + flush_before_state_base_change(batch); + + crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) { + + sba.SurfaceStateBaseAddressModifyEnable = true; + sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0); + +#if GFX_VER >= 5 + sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO! 
+#endif + + sba.GeneralStateBaseAddressModifyEnable = true; + sba.IndirectObjectBaseAddressModifyEnable = true; +#if GFX_VER >= 5 + sba.InstructionBaseAddressModifyEnable = true; +#endif + + sba.GeneralStateAccessUpperBoundModifyEnable = true; +#if GFX_VER >= 5 + sba.IndirectObjectAccessUpperBoundModifyEnable = true; + sba.InstructionAccessUpperBoundModifyEnable = true; +#endif +#if GFX_VER <= 5 + sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000); +#endif +#if GFX_VER >= 6 + /* The hardware appears to pay attention to the MOCS fields even + * if you don't set the "Address Modify Enable" bit for the base. + */ + sba.GeneralStateMOCS = mocs; + sba.StatelessDataPortAccessMOCS = mocs; + + sba.DynamicStateBaseAddressModifyEnable = true; + + sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0); + + /* Dynamic state upper bound. Although the documentation says that + * programming it to zero will cause it to be ignored, that is a lie. + * If this isn't programmed to a real bound, the sampler border color + * pointer is rejected, causing border color to mysteriously fail. + */ + sba.DynamicStateAccessUpperBoundModifyEnable = true; + sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000); +#endif + } + + flush_after_state_base_change(batch); + + /* According to section 3.6.1 of VOL1 of the 965 PRM, + * STATE_BASE_ADDRESS updates require a reissue of: + * + * 3DSTATE_PIPELINE_POINTERS + * 3DSTATE_BINDING_TABLE_POINTERS + * MEDIA_STATE_POINTERS + * + * and this continues through Ironlake. The Sandy Bridge PRM, vol + * 1 part 1 says that the folowing packets must be reissued: + * + * 3DSTATE_CC_POINTERS + * 3DSTATE_BINDING_TABLE_POINTERS + * 3DSTATE_SAMPLER_STATE_POINTERS + * 3DSTATE_VIEWPORT_STATE_POINTERS + * MEDIA_STATE_POINTERS + * + * Those are always reissued following SBA updates anyway (new + * batch time), except in the case of the program cache BO + * changing. Having a separate state flag makes the sequence more + * obvious. + */ +#if GFX_VER <= 5 + batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS; +#elif GFX_VER == 6 + batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS; +#endif + batch->state_base_address_emitted = true; +} + +static inline void +crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz, + bool window_space_position, float *zmin, float *zmax) +{ + if (window_space_position) { + *zmin = 0.f; + *zmax = 1.f; + return; + } + util_viewport_zmin_zmax(vp, halfz, zmin, zmax); +} + +struct push_bos { + struct { + struct crocus_address addr; + uint32_t length; + } buffers[4]; + int buffer_count; + uint32_t max_length; +}; + +#if GFX_VER >= 6 +static void +setup_constant_buffers(struct crocus_context *ice, + struct crocus_batch *batch, + int stage, + struct push_bos *push_bos) +{ + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; + struct brw_stage_prog_data *prog_data = (void *) shader->prog_data; + + uint32_t push_range_sum = 0; + + int n = 0; + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + + if (range->length == 0) + continue; + + push_range_sum += range->length; + + if (range->length > push_bos->max_length) + push_bos->max_length = range->length; + + /* Range block is a binding table index, map back to UBO index. 
*/ + unsigned block_index = crocus_bti_to_group_index( + &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block); + assert(block_index != CROCUS_SURFACE_NOT_USED); + + struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index]; + struct crocus_resource *res = (void *) cbuf->buffer; + + assert(cbuf->buffer_offset % 32 == 0); + + push_bos->buffers[n].length = range->length; + push_bos->buffers[n].addr = + res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset) + : ro_bo(batch->ice->workaround_bo, + batch->ice->workaround_offset); + n++; + } + + /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes: + * + * "The sum of all four read length fields must be less than or + * equal to the size of 64." + */ + assert(push_range_sum <= 64); + + push_bos->buffer_count = n; +} + +#if GFX_VER == 7 +static void +gen7_emit_vs_workaround_flush(struct crocus_batch *batch) +{ + ASSERTED const struct intel_device_info *devinfo = &batch->screen->devinfo; + + assert(devinfo->ver == 7); + crocus_emit_pipe_control_write(batch, + "vs workaround", + PIPE_CONTROL_WRITE_IMMEDIATE + | PIPE_CONTROL_DEPTH_STALL, + batch->ice->workaround_bo, + batch->ice->workaround_offset, 0); +} +#endif + +static void +emit_push_constant_packets(struct crocus_context *ice, + struct crocus_batch *batch, + int stage, + const struct push_bos *push_bos) +{ + struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; + struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL; + +#if GFX_VER == 7 + if (stage == MESA_SHADER_VERTEX) { + if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail) + gen7_emit_vs_workaround_flush(batch); + } +#endif + crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) { + pkt._3DCommandSubOpcode = push_constant_opcodes[stage]; +#if GFX_VER == 7 + if (prog_data) { + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the buffers in the highest slots. + * This way, slot 0 is only used if slot 3 is also used. 
+ */ + int n = push_bos->buffer_count; + assert(n <= 4); +#if GFX_VERx10 >= 75 + const unsigned shift = 4 - n; +#else + const unsigned shift = 0; +#endif + for (int i = 0; i < n; i++) { + pkt.ConstantBody.ReadLength[i + shift] = + push_bos->buffers[i].length; + pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr; + } + } +#else + if (prog_data) { + int n = push_bos->buffer_count; + assert (n <= 1); + if (n == 1) { + pkt.Buffer0Valid = true; + pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset; + pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1; + } + } +#endif + } +} + +#endif + +#if GFX_VER >= 6 +typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML; +#else +typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML; +#endif + +static inline void +set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds) +{ + struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa; + ds->DepthTestEnable = cso->cso.depth_enabled; + ds->DepthBufferWriteEnable = cso->cso.depth_writemask; + ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func); + + ds->StencilFailOp = cso->cso.stencil[0].fail_op; + ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op; + ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op; + ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func); + + ds->StencilTestMask = cso->cso.stencil[0].valuemask; + ds->StencilWriteMask = cso->cso.stencil[0].writemask; + + ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op; + ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op; + ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op; + ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func); + + ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask; + ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask; + ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled; + ds->StencilTestEnable = cso->cso.stencil[0].enabled; + ds->StencilBufferWriteEnable = + cso->cso.stencil[0].writemask != 0 || + (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0); +} + +static void +emit_vertex_buffer_state(struct crocus_batch *batch, + unsigned buffer_id, + struct crocus_bo *bo, + unsigned start_offset, + unsigned end_offset, + unsigned stride, + unsigned step_rate, + uint32_t **map) +{ + const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length); + _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) { + vb.BufferStartingAddress = ro_bo(bo, start_offset); + vb.VertexBufferIndex = buffer_id; + vb.BufferPitch = stride; +#if GFX_VER == 7 + vb.AddressModifyEnable = true; +#endif +#if GFX_VER >= 6 + vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev); +#endif + vb.BufferAccessType = step_rate ? 
INSTANCEDATA : VERTEXDATA; + vb.InstanceDataStepRate = step_rate; +#if GFX_VER >= 5 + vb.EndAddress = ro_bo(bo, end_offset - 1); +#endif + } + *map += vb_dwords; +} + +static bool +can_emit_logic_op(struct crocus_context *ice) +{ + /* all pre gen8 have logicop restricted to unorm */ + enum pipe_format pformat = PIPE_FORMAT_NONE; + for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) { + if (ice->state.framebuffer.cbufs[i]) { + pformat = ice->state.framebuffer.cbufs[i]->format; + break; + } + } + return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat)); +} + +#if GFX_VER >= 6 +static uint32_t +determine_sample_mask(struct crocus_context *ice) +{ + uint32_t num_samples = ice->state.framebuffer.samples; + + if (num_samples <= 1) + return 1; + + uint32_t fb_mask = (1 << num_samples) - 1; + return ice->state.sample_mask & fb_mask; +} +#endif + +static void +crocus_upload_dirty_render_state(struct crocus_context *ice, + struct crocus_batch *batch, + const struct pipe_draw_info *draw) +{ + uint64_t dirty = ice->state.dirty; + uint64_t stage_dirty = ice->state.stage_dirty; + + if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) && + !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER)) + return; + + if (dirty & CROCUS_DIRTY_VF_STATISTICS) { + crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) { + vf.StatisticsEnable = true; + } + } + +#if GFX_VER <= 5 + if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS | + CROCUS_STAGE_DIRTY_CONSTANTS_FS)) { + bool ret = calculate_curbe_offsets(batch); + if (ret) { + dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP; + stage_dirty |= CROCUS_STAGE_DIRTY_VS; + } + } + + if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) || + stage_dirty & CROCUS_STAGE_DIRTY_VS) { + bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size, + brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size, + ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size); + if (ret) + dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; + } +#endif + if (dirty & CROCUS_DIRTY_CC_VIEWPORT) { + const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; + uint32_t cc_vp_address; + + /* XXX: could avoid streaming for depth_clip [0,1] case. 
*/ + uint32_t *cc_vp_map = + stream_state(batch, + 4 * ice->state.num_viewports * + GENX(CC_VIEWPORT_length), 32, &cc_vp_address); + for (int i = 0; i < ice->state.num_viewports; i++) { + float zmin, zmax; + crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz, + ice->state.window_space_position, + &zmin, &zmax); + if (cso_rast->cso.depth_clip_near) + zmin = 0.0; + if (cso_rast->cso.depth_clip_far) + zmax = 1.0; + + crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) { + ccv.MinimumDepth = zmin; + ccv.MaximumDepth = zmax; + } + + cc_vp_map += GENX(CC_VIEWPORT_length); + } + +#if GFX_VER >= 7 + crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) { + ptr.CCViewportPointer = cc_vp_address; + } +#elif GFX_VER == 6 + crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) { + vp.CCViewportStateChange = 1; + vp.PointertoCC_VIEWPORT = cc_vp_address; + } +#else + ice->state.cc_vp_address = cc_vp_address; + dirty |= CROCUS_DIRTY_COLOR_CALC_STATE; +#endif + } + + if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) { + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; +#if GFX_VER == 7 + uint32_t sf_cl_vp_address; + uint32_t *vp_map = + stream_state(batch, + 4 * ice->state.num_viewports * + GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address); +#else + uint32_t *vp_map = + stream_state(batch, + 4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length), + 32, &ice->state.sf_vp_address); + uint32_t *clip_map = + stream_state(batch, + 4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length), + 32, &ice->state.clip_vp_address); +#endif + + for (unsigned i = 0; i < ice->state.num_viewports; i++) { + const struct pipe_viewport_state *state = &ice->state.viewports[i]; + float gb_xmin, gb_xmax, gb_ymin, gb_ymax; + + intel_calculate_guardband_size(cso_fb->width, cso_fb->height, + state->scale[0], state->scale[1], + state->translate[0], state->translate[1], + &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax); +#if GFX_VER == 7 + crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) +#else + crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp) +#endif + { + vp.ViewportMatrixElementm00 = state->scale[0]; + vp.ViewportMatrixElementm11 = state->scale[1]; + vp.ViewportMatrixElementm22 = state->scale[2]; + vp.ViewportMatrixElementm30 = state->translate[0]; + vp.ViewportMatrixElementm31 = state->translate[1]; + vp.ViewportMatrixElementm32 = state->translate[2]; +#if GFX_VER < 6 + struct pipe_scissor_state scissor; + crocus_fill_scissor_rect(ice, 0, &scissor); + vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx; + vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx; + vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny; + vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy; +#endif + +#if GFX_VER == 7 + vp.XMinClipGuardband = gb_xmin; + vp.XMaxClipGuardband = gb_xmax; + vp.YMinClipGuardband = gb_ymin; + vp.YMaxClipGuardband = gb_ymax; +#endif + } +#if GFX_VER < 7 + crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) { + clip.XMinClipGuardband = gb_xmin; + clip.XMaxClipGuardband = gb_xmax; + clip.YMinClipGuardband = gb_ymin; + clip.YMaxClipGuardband = gb_ymax; + } +#endif +#if GFX_VER == 7 + vp_map += GENX(SF_CLIP_VIEWPORT_length); +#else + vp_map += GENX(SF_VIEWPORT_length); + clip_map += GENX(CLIP_VIEWPORT_length); +#endif + } +#if GFX_VER == 7 + crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) { + ptr.SFClipViewportPointer = sf_cl_vp_address; + } +#elif GFX_VER == 6 + crocus_emit_cmd(batch, 
GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) { + vp.SFViewportStateChange = 1; + vp.CLIPViewportStateChange = 1; + vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address; + vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address; + } +#endif + } + +#if GFX_VER >= 6 + if (dirty & CROCUS_DIRTY_GEN6_URB) { +#if GFX_VER == 6 + bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL + || ice->shaders.ff_gs_prog; + + struct brw_vue_prog_data *vue_prog_data = + (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data; + const unsigned vs_size = vue_prog_data->urb_entry_size; + unsigned gs_size = vs_size; + if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) { + struct brw_vue_prog_data *gs_vue_prog_data = + (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data; + gs_size = gs_vue_prog_data->urb_entry_size; + } + + genX(upload_urb)(batch, vs_size, gs_present, gs_size); +#endif +#if GFX_VER == 7 + const struct intel_device_info *devinfo = &batch->screen->devinfo; + bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL; + bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL; + unsigned entry_size[4]; + + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + if (!ice->shaders.prog[i]) { + entry_size[i] = 1; + } else { + struct brw_vue_prog_data *vue_prog_data = + (void *) ice->shaders.prog[i]->prog_data; + entry_size[i] = vue_prog_data->urb_entry_size; + } + assert(entry_size[i] != 0); + } + + /* If we're just switching between programs with the same URB requirements, + * skip the rest of the logic. + */ + bool no_change = false; + if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] && + ice->urb.gs_present == gs_present && + ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] && + ice->urb.tess_present == tess_present && + ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] && + ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) { + no_change = true; + } + + if (!no_change) { + ice->urb.vsize = entry_size[MESA_SHADER_VERTEX]; + ice->urb.gs_present = gs_present; + ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY]; + ice->urb.tess_present = tess_present; + ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL]; + ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL]; + + unsigned entries[4]; + unsigned start[4]; + bool constrained; + intel_get_urb_config(devinfo, + batch->screen->l3_config_3d, + tess_present, + gs_present, + entry_size, + entries, start, NULL, &constrained); + + if (!(GFX_VERx10 == 75) && !devinfo->is_baytrail) + gen7_emit_vs_workaround_flush(batch); + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = start[i]; + urb.VSURBEntryAllocationSize = entry_size[i] - 1; + urb.VSNumberofURBEntries = entries[i]; + } + } + } +#endif + } + + if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) { + struct crocus_blend_state *cso_blend = ice->state.cso_blend; + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa; + + STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2); + + const int rt_dwords = + MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length); + + uint32_t blend_offset; + uint32_t *blend_map = + stream_state(batch, + 4 * rt_dwords, 64, &blend_offset); + + bool indep_alpha_blend = false; + for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) { + const struct pipe_rt_blend_state *rt = + &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? 
i : 0]; + + enum pipe_blendfactor src_rgb = + fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one); + enum pipe_blendfactor src_alpha = + fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one); + enum pipe_blendfactor dst_rgb = + fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one); + enum pipe_blendfactor dst_alpha = + fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one); + + if (rt->rgb_func != rt->alpha_func || + src_rgb != src_alpha || dst_rgb != dst_alpha) + indep_alpha_blend = true; + + crocus_pack_state(GENX(BLEND_STATE_ENTRY), blend_map, be) { + if (can_emit_logic_op(ice)) { + be.LogicOpEnable = cso_blend->cso.logicop_enable; + be.LogicOpFunction = cso_blend->cso.logicop_func; + } + + be.ColorClampRange = COLORCLAMP_RTFORMAT; + be.PreBlendColorClampEnable = true; + be.PostBlendColorClampEnable = true; + + if (i == 0) { + struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT]; + struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data; + be.ColorBufferBlendEnable = rt->blend_enable && + (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend); + } else + be.ColorBufferBlendEnable = rt->blend_enable; + + be.ColorBlendFunction = rt->rgb_func; + be.AlphaBlendFunction = rt->alpha_func; + be.SourceBlendFactor = (int) src_rgb; + be.SourceAlphaBlendFactor = (int) src_alpha; + be.DestinationBlendFactor = (int) dst_rgb; + be.DestinationAlphaBlendFactor = (int) dst_alpha; + + be.WriteDisableRed = !(rt->colormask & PIPE_MASK_R); + be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G); + be.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B); + be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A); + + be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage; + be.IndependentAlphaBlendEnable = indep_alpha_blend; + be.AlphaToOneEnable = cso_blend->cso.alpha_to_one; + be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage; + be.ColorDitherEnable = cso_blend->cso.dither; + + /* bl.AlphaTestEnable and bs.AlphaTestFunction are filled in later. */ + // Except they're not... fix that. 
Can't be done here since it needs + // to be conditional on non-integer RT's + be.AlphaTestEnable = cso_zsa->cso.alpha_enabled; + be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func); + } + blend_map += GENX(BLEND_STATE_ENTRY_length); + } + +#if GFX_VER < 7 + crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { + ptr.PointertoBLEND_STATE = blend_offset; + ptr.BLEND_STATEChange = true; + } +#else + crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) { + ptr.BlendStatePointer = blend_offset; + } +#endif + } +#endif + + if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) { + struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa; + UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend; + struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref; + uint32_t cc_offset; + void *cc_map = + stream_state(batch, + sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length), + 64, &cc_offset); +#if GFX_VER <= 5 + dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; + int blend_idx = 0; + + if (cso_blend->cso.independent_blend_enable) { + for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + if (cso_blend->cso.rt[i].blend_enable) { + blend_idx = i; + break; + } + } + } + const struct pipe_rt_blend_state *rt = &cso_blend->cso.rt[blend_idx]; +#endif + _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) { + cc.AlphaTestFormat = ALPHATEST_FLOAT32; + cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value; + +#if GFX_VER <= 5 + + set_depth_stencil_bits(ice, &cc); + + cc.ColorBufferBlendEnable = rt->blend_enable; + + if (cso_blend->cso.logicop_enable) { + if (can_emit_logic_op(ice)) { + cc.LogicOpEnable = cso_blend->cso.logicop_enable; + cc.LogicOpFunction = cso_blend->cso.logicop_func; + } + } + cc.ColorDitherEnable = cso_blend->cso.dither; + cc.ColorBlendFunction = rt->rgb_func; + cc.AlphaBlendFunction = rt->alpha_func; + cc.SourceBlendFactor = rt->rgb_src_factor; + cc.SourceAlphaBlendFactor = rt->alpha_src_factor; + cc.DestinationBlendFactor = rt->rgb_dst_factor; + cc.DestinationAlphaBlendFactor = rt->alpha_dst_factor; + + if (rt->rgb_func != rt->alpha_func || + rt->rgb_src_factor != rt->alpha_src_factor || + rt->rgb_dst_factor != rt->alpha_dst_factor) + cc.IndependentAlphaBlendEnable = true; + + if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) { + cc.AlphaTestEnable = cso->cso.alpha_enabled; + cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func); + } + cc.StatisticsEnable = ice->state.stats_wm ? 
1 : 0; + cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address); +#else + cc.AlphaTestFormat = ALPHATEST_FLOAT32; + cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value; + + cc.BlendConstantColorRed = ice->state.blend_color.color[0]; + cc.BlendConstantColorGreen = ice->state.blend_color.color[1]; + cc.BlendConstantColorBlue = ice->state.blend_color.color[2]; + cc.BlendConstantColorAlpha = ice->state.blend_color.color[3]; +#endif + cc.StencilReferenceValue = p_stencil_refs->ref_value[0]; + cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1]; + } + ice->shaders.cc_offset = cc_offset; +#if GFX_VER >= 6 + crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { + ptr.ColorCalcStatePointer = cc_offset; +#if GFX_VER != 7 + ptr.ColorCalcStatePointerValid = true; +#endif + } +#endif + } +#if GFX_VER <= 5 + if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) { + crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) { + blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0]; + blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1]; + blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2]; + blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3]; + } + } +#endif + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { + if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage))) + continue; + + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; + + if (!shader) + continue; + + if (shs->sysvals_need_upload) + upload_sysvals(ice, stage); + +#if GFX_VER <= 5 + dirty |= CROCUS_DIRTY_GEN4_CURBE; +#endif +#if GFX_VER >= 7 + struct push_bos push_bos = {}; + setup_constant_buffers(ice, batch, stage, &push_bos); + + emit_push_constant_packets(ice, batch, stage, &push_bos); +#endif + } + + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { + if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) { + if (ice->shaders.prog[stage]) { +#if GFX_VER <= 6 + dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS; +#endif + crocus_populate_binding_table(ice, batch, stage, false); + ice->shaders.prog[stage]->bind_bo_offset = + crocus_upload_binding_table(ice, batch, + ice->shaders.prog[stage]->surf_offset, + ice->shaders.prog[stage]->bt.size_bytes); + +#if GFX_VER == 7 + crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) { + ptr._3DCommandSubOpcode = 38 + stage; + ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset; + } +#endif +#if GFX_VER == 6 + } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) { + dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS; + crocus_populate_binding_table(ice, batch, stage, true); + ice->shaders.ff_gs_prog->bind_bo_offset = + crocus_upload_binding_table(ice, batch, + ice->shaders.ff_gs_prog->surf_offset, + ice->shaders.ff_gs_prog->bt.size_bytes); +#endif + } + } + } +#if GFX_VER <= 6 + if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) { + struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY]; + if (gs == NULL) + gs = ice->shaders.ff_gs_prog; + crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) { + ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset; + ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset; +#if GFX_VER == 6 + ptr.VSBindingTableChange = true; + ptr.PSBindingTableChange = true; + ptr.GSBindingTableChange = gs ? 
true : false; + ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0; +#endif + } + } +#endif + + bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS; + for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { + if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) || + !ice->shaders.prog[stage]) + continue; + + crocus_upload_sampler_states(ice, batch, stage); + + sampler_updates = true; + +#if GFX_VER >= 7 + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + + crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) { + ptr._3DCommandSubOpcode = 43 + stage; + ptr.PointertoVSSamplerState = shs->sampler_offset; + } +#endif + } + + if (sampler_updates) { +#if GFX_VER == 6 + struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX]; + struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY]; + struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT]; + crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) { + if (ice->shaders.prog[MESA_SHADER_VERTEX] && + (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS || + stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) { + ptr.VSSamplerStateChange = true; + ptr.PointertoVSSamplerState = shs_vs->sampler_offset; + } + if (ice->shaders.prog[MESA_SHADER_GEOMETRY] && + (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS || + stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) { + ptr.GSSamplerStateChange = true; + ptr.PointertoGSSamplerState = shs_gs->sampler_offset; + } + if (ice->shaders.prog[MESA_SHADER_FRAGMENT] && + (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS || + stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) { + ptr.PSSamplerStateChange = true; + ptr.PointertoPSSamplerState = shs_fs->sampler_offset; + } + } +#endif + } + +#if GFX_VER >= 6 + if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) { + crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) { + ms.PixelLocation = + ice->state.cso_rast->cso.half_pixel_center ? 
CENTER : UL_CORNER; + if (ice->state.framebuffer.samples > 0) + ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1; +#if GFX_VER == 6 + INTEL_SAMPLE_POS_4X(ms.Sample); +#elif GFX_VER == 7 + switch (ice->state.framebuffer.samples) { + case 1: + INTEL_SAMPLE_POS_1X(ms.Sample); + break; + case 2: + INTEL_SAMPLE_POS_2X(ms.Sample); + break; + case 4: + INTEL_SAMPLE_POS_4X(ms.Sample); + break; + case 8: + INTEL_SAMPLE_POS_8X(ms.Sample); + break; + default: + break; + } +#endif + } + } + + if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) { + crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) { + ms.SampleMask = determine_sample_mask(ice); + } + } +#endif + +#if GFX_VER >= 7 + struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT]; + if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) { + struct brw_stage_prog_data *prog_data = shader->prog_data; + struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data; + + crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) { + ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; + ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; + ps._32PixelDispatchEnable = wm_prog_data->dispatch_32; + + ps.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1); + ps.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2); + + ps.KernelStartPointer0 = KSP(ice, shader) + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0); + ps.KernelStartPointer1 = KSP(ice, shader) + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); + ps.KernelStartPointer2 = KSP(ice, shader) + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); + +#if GFX_VERx10 == 75 + ps.SampleMask = determine_sample_mask(ice); +#endif + // XXX: WABTPPrefetchDisable, see above, drop at C0 + ps.BindingTableEntryCount = shader->bt.size_bytes / 4; + ps.FloatingPointMode = prog_data->use_alt_mode; + ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1; + + ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0; + + ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; + ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending; + ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0); + /* From the documentation for this packet: + * "If the PS kernel does not need the Position XY Offsets to + * compute a Position Value, then this field should be programmed + * to POSOFFSET_NONE." + * + * "SW Recommendation: If the PS kernel needs the Position Offsets + * to compute a Position XY value, this field should match Position + * ZW Interpolation Mode to ensure a consistent position.xyzw + * computation." + * + * We only require XY sample offsets. So, this recommendation doesn't + * look useful at the moment. We might need this in future. + */ + ps.PositionXYOffsetSelect = + wm_prog_data->uses_pos_offset ? 
POSOFFSET_SAMPLE : POSOFFSET_NONE; + + if (wm_prog_data->base.total_scratch) { + struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT); + ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11; + ps.ScratchSpaceBasePointer = rw_bo(bo, 0); + } + } + } +#endif + +#if GFX_VER >= 7 + if (ice->state.streamout_active) { + if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) { + for (int i = 0; i < 4; i++) { + struct crocus_stream_output_target *tgt = + (void *) ice->state.so_target[i]; + + if (!tgt) { + crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) { + sob.SOBufferIndex = i; + } + continue; + } + struct crocus_resource *res = (void *) tgt->base.buffer; + uint32_t start = tgt->base.buffer_offset; + uint32_t end = ALIGN(start + tgt->base.buffer_size, 4); + crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) { + sob.SOBufferIndex = i; + + sob.SurfaceBaseAddress = rw_bo(res->bo, start); + sob.SurfacePitch = tgt->stride; + sob.SurfaceEndAddress = rw_bo(res->bo, end); + } + } + } + + if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) { + uint32_t *decl_list = + ice->state.streamout + GENX(3DSTATE_STREAMOUT_length); + crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2)); + } + + if (dirty & CROCUS_DIRTY_STREAMOUT) { + const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; + + uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)]; + crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) { + sol.SOFunctionEnable = true; + sol.SOStatisticsEnable = true; + + sol.RenderingDisable = cso_rast->cso.rasterizer_discard && + !ice->state.prims_generated_query_active; + sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING; + } + + assert(ice->state.streamout); + + crocus_emit_merge(batch, ice->state.streamout, dynamic_sol, + GENX(3DSTATE_STREAMOUT_length)); + } + } else { + if (dirty & CROCUS_DIRTY_STREAMOUT) { + crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol); + } + } +#endif +#if GFX_VER == 6 + if (ice->state.streamout_active) { + if (dirty & CROCUS_DIRTY_GEN6_SVBI) { + crocus_emit_so_svbi(ice); + } + } +#endif + + if (dirty & CROCUS_DIRTY_CLIP) { +#if GFX_VER < 6 + const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data; + struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso; + + uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset); + dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; + _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) { + clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog); + clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate; + clip.SingleProgramFlow = true; + clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1; + + clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length; + clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length; + + clip.DispatchGRFStartRegisterForURBData = 1; + clip.VertexURBEntryReadOffset = 0; + clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2; + + clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries; + clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1; + + if (batch->ice->urb.nr_clip_entries >= 10) { + /* Half of the URB entries go to each thread, and it has to be an + * even number. 
+ */ + assert(batch->ice->urb.nr_clip_entries % 2 == 0); + + /* Although up to 16 concurrent Clip threads are allowed on Ironlake, + * only 2 threads can output VUEs at a time. + */ + clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1; + } else { + assert(batch->ice->urb.nr_clip_entries >= 5); + clip.MaximumNumberofThreads = 1 - 1; + } + clip.VertexPositionSpace = VPOS_NDCSPACE; + clip.UserClipFlagsMustClipEnable = true; + clip.GuardbandClipTestEnable = true; + + clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address); + clip.ScreenSpaceViewportXMin = -1.0; + clip.ScreenSpaceViewportXMax = 1.0; + clip.ScreenSpaceViewportYMin = -1.0; + clip.ScreenSpaceViewportYMax = 1.0; + clip.ViewportXYClipTestEnable = true; + clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far); + +#if GFX_VER == 5 || GFX_VERx10 == 45 + clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable; +#else + /* Up to 6 actual clip flags, plus the 7th for the negative RHW + * workaround. + */ + clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40; +#endif + + clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL; + clip.GuardbandClipTestEnable = true; + + clip.ClipMode = clip_prog_data->clip_mode; +#if GFX_VERx10 == 45 + clip.NegativeWClipTestEnable = true; +#endif + } + +#else //if GFX_VER >= 6 + struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast; + const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data ); + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] || + ice->shaders.prog[MESA_SHADER_TESS_EVAL]; + bool points_or_lines = cso_rast->fill_mode_point_or_line || + (gs_or_tes ? 
ice->shaders.output_topology_is_points_or_lines + : ice->state.prim_is_points_or_lines); + uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)]; + crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) { + cl.StatisticsEnable = ice->state.statistics_counters_enabled; + if (cso_rast->cso.rasterizer_discard) + cl.ClipMode = CLIPMODE_REJECT_ALL; + else if (ice->state.window_space_position) + cl.ClipMode = CLIPMODE_ACCEPT_ALL; + else + cl.ClipMode = CLIPMODE_NORMAL; + + cl.PerspectiveDivideDisable = ice->state.window_space_position; + cl.ViewportXYClipTestEnable = !points_or_lines; + + cl.UserClipDistanceCullTestEnableBitmask = + brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask; + + if (wm_prog_data->barycentric_interp_modes & + BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) + cl.NonPerspectiveBarycentricEnable = true; + + cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1; + cl.MaximumVPIndex = ice->state.num_viewports - 1; + } + crocus_emit_merge(batch, cso_rast->clip, dynamic_clip, + ARRAY_SIZE(cso_rast->clip)); +#endif + } + + if (stage_dirty & CROCUS_STAGE_DIRTY_VS) { + struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX]; + const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data); + const struct brw_stage_prog_data *prog_data = &vue_prog_data->base; +#if GFX_VER == 7 + if (batch->screen->devinfo.is_ivybridge) + gen7_emit_vs_workaround_flush(batch); +#endif + + +#if GFX_VER == 6 + struct push_bos push_bos = {}; + setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos); + + emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos); +#endif +#if GFX_VER >= 6 + crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs) +#else + uint32_t *vs_ptr = stream_state(batch, + GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset); + dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; + _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs) +#endif + { + INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX); + + vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1; + +#if GFX_VER < 6 + vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1; + vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length; + vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2; + + vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0); + vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1; + + vs.MaximumNumberofThreads = + CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1; + vs.StatisticsEnable = false; + vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset); +#endif +#if GFX_VER == 5 + /* Force single program flow on Ironlake. We cannot reliably get + * all applications working without it. 
See: + * https://bugs.freedesktop.org/show_bug.cgi?id=29172 + * + * The most notable and reliably failing application is the Humus + * demo "CelShading" + */ + vs.SingleProgramFlow = true; + vs.SamplerCount = 0; /* hardware requirement */ + +#endif + } + +#if GFX_VER == 6 + crocus_emit_pipe_control_flush(batch, + "post VS const", + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE); +#endif + } + + if (stage_dirty & CROCUS_STAGE_DIRTY_GS) { + struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY]; + bool active = GFX_VER >= 6 && shader; +#if GFX_VER == 6 + struct push_bos push_bos = {}; + if (shader) + setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos); + + emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos); +#endif +#if GFX_VER >= 6 + crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs) +#else + uint32_t *gs_ptr = stream_state(batch, + GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset); + dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; + _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs) +#endif + { +#if GFX_VER >= 6 + if (active) { + const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data); + const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data); + const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base; + + INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY); +#if GFX_VER >= 7 + gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1; + gs.OutputTopology = gs_prog_data->output_topology; + gs.ControlDataHeaderSize = + gs_prog_data->control_data_header_size_hwords; + + gs.InstanceControl = gs_prog_data->invocations - 1; + gs.DispatchMode = vue_prog_data->dispatch_mode; + + gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; + + gs.ControlDataFormat = gs_prog_data->control_data_format; +#endif + + /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between + * Ivy Bridge and Haswell. + * + * On Ivy Bridge, setting this bit causes the vertices of a triangle + * strip to be delivered to the geometry shader in an order that does + * not strictly follow the OpenGL spec, but preserves triangle + * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then + * the geometry shader sees triangles: + * + * (1, 2, 3), (2, 4, 3), (3, 4, 5) + * + * (Clearing the bit is even worse, because it fails to preserve + * orientation). + * + * Triangle strips with adjacency always ordered in a way that preserves + * triangle orientation but does not strictly follow the OpenGL spec, + * regardless of the setting of this bit. + * + * On Haswell, both triangle strips and triangle strips with adjacency + * are always ordered in a way that preserves triangle orientation. + * Setting this bit causes the ordering to strictly follow the OpenGL + * spec. + * + * So in either case we want to set the bit. Unfortunately on Ivy + * Bridge this will get the order close to correct but not perfect. + */ + gs.ReorderMode = TRAILING; + gs.MaximumNumberofThreads = (batch->screen->devinfo.max_gs_threads - 1); + +#if GFX_VER < 7 + gs.SOStatisticsEnable = true; + if (gs_prog_data->num_transform_feedback_bindings) + gs.SVBIPayloadEnable = ice->state.streamout_active; + + /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it + * was previously done for gen6. + * + * TODO: test with both disabled to see if the HW is behaving + * as expected, like in gen7. 
+ */ + gs.SingleProgramFlow = true; + gs.VectorMaskEnable = true; +#endif + } +#endif +#if GFX_VER <= 6 + if (!active && ice->shaders.ff_gs_prog) { + const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data; + /* In gen6, transform feedback for the VS stage is done with an + * ad-hoc GS program. This function provides the needed 3DSTATE_GS + * for this. + */ + gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog); + gs.SingleProgramFlow = true; + gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1; + gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length; + +#if GFX_VER <= 5 + gs.GRFRegisterCount = + DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1; + /* BRW_NEW_URB_FENCE */ + gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries; + gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1; + gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0; + gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate; +#else + gs.Enable = true; + gs.VectorMaskEnable = true; + gs.SVBIPayloadEnable = true; + gs.SVBIPostIncrementEnable = true; + gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value; + gs.SOStatisticsEnable = true; + gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1; +#endif + } +#endif + if (!active && !ice->shaders.ff_gs_prog) { + gs.DispatchGRFStartRegisterForURBData = 1; +#if GFX_VER >= 7 + gs.IncludeVertexHandles = true; +#endif + } +#if GFX_VER >= 6 + gs.StatisticsEnable = true; +#endif +#if GFX_VER == 5 || GFX_VER == 6 + gs.RenderingEnabled = true; +#endif +#if GFX_VER <= 5 + gs.MaximumVPIndex = ice->state.num_viewports - 1; +#endif + } + } + +#if GFX_VER >= 7 + if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) { + struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL]; + + if (shader) { + const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data); + const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data); + const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base; + + crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) { + INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL); + hs.InstanceCount = tcs_prog_data->instances - 1; + hs.IncludeVertexHandles = true; + hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1; + } + } else { + crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs); + } + + } + + if (stage_dirty & CROCUS_STAGE_DIRTY_TES) { + struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL]; + if (shader) { + const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data); + const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data); + const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base; + + crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) { + te.Partitioning = tes_prog_data->partitioning; + te.OutputTopology = tes_prog_data->output_topology; + te.TEDomain = tes_prog_data->domain; + te.TEEnable = true; + te.MaximumTessellationFactorOdd = 63.0; + te.MaximumTessellationFactorNotOdd = 64.0; + }; + crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) { + INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL); + + ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1; + ds.ComputeWCoordinateEnable = + tes_prog_data->domain == BRW_TESS_DOMAIN_TRI; + }; + } else { + crocus_emit_cmd(batch, GENX(3DSTATE_TE), te); + crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds); + } + } 
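As an aside, the vertex ordering that the ReorderMode comment in the 3DSTATE_GS setup above describes is easy to model. The following standalone sketch (plain C with illustrative names only, not driver code) reproduces the (1, 2, 3), (2, 4, 3), (3, 4, 5) example from that comment for an Ivy Bridge-style TRAILING ordering, where even triangles follow the strip directly and odd triangles have their last two vertices swapped to keep the winding consistent:

#include <stdio.h>

/* Print the triangles a strip decomposes into under the TRAILING-style
 * ordering described above: even triangles in strip order, odd triangles
 * with the last two vertices swapped (orientation-preserving, but not the
 * strict OpenGL decomposition).
 */
static void
print_trailing_strip(const int *v, int num_vertices)
{
   for (int i = 0; i + 2 < num_vertices; i++) {
      if (i % 2 == 0)
         printf("(%d, %d, %d)\n", v[i], v[i + 1], v[i + 2]);
      else
         printf("(%d, %d, %d)\n", v[i], v[i + 2], v[i + 1]);
   }
}

int
main(void)
{
   /* The strip (1, 2, 3, 4, 5) prints (1, 2, 3), (2, 4, 3), (3, 4, 5),
    * matching the example in the comment above.
    */
   const int strip[] = { 1, 2, 3, 4, 5 };
   print_trailing_strip(strip, 5);
   return 0;
}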
+#endif + if (dirty & CROCUS_DIRTY_RASTER) { + +#if GFX_VER < 6 + const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data; + struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso; + uint32_t *sf_ptr = stream_state(batch, + GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset); + dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; + _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) { + sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog); + sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate; + sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1; + sf.DispatchGRFStartRegisterForURBData = 3; + sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET; + sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length; + sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1; + sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries; + sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT; + + sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address); + + sf.MaximumNumberofThreads = + MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1; + + sf.SpritePointEnable = cso_state->point_quad_rasterization; + sf.DestinationOriginHorizontalBias = 0.5; + sf.DestinationOriginVerticalBias = 0.5; + + sf.LastPixelEnable = cso_state->line_last_pixel; + sf.LineWidth = get_line_width(cso_state); + sf.PointWidth = cso_state->point_size; + sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State; +#if GFX_VERx10 == 45 || GFX_VER >= 5 + sf.AALineDistanceMode = AALINEDISTANCE_TRUE; +#endif + sf.ViewportTransformEnable = true; + sf.FrontWinding = cso_state->front_ccw ? 1 : 0; + sf.ScissorRectangleEnable = true; + sf.CullMode = translate_cull_mode(cso_state->cull_face); + + if (cso_state->flatshade_first) { + sf.TriangleFanProvokingVertexSelect = 1; + } else { + sf.TriangleStripListProvokingVertexSelect = 2; + sf.TriangleFanProvokingVertexSelect = 2; + sf.LineStripListProvokingVertexSelect = 1; + } + } +#else + struct crocus_rasterizer_state *cso = ice->state.cso_rast; + uint32_t dynamic_sf[GENX(3DSTATE_SF_length)]; + crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) { + sf.ViewportTransformEnable = !ice->state.window_space_position; + +#if GFX_VER == 6 + const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data); + uint32_t urb_entry_read_length; + uint32_t urb_entry_read_offset; + uint32_t point_sprite_enables; + calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables, + &urb_entry_read_length, + &urb_entry_read_offset); + sf.VertexURBEntryReadLength = urb_entry_read_length; + sf.VertexURBEntryReadOffset = urb_entry_read_offset; + sf.PointSpriteTextureCoordinateEnable = point_sprite_enables; + sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs; + sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; +#endif + +#if GFX_VER >= 6 + if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample) + sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; +#endif +#if GFX_VER == 7 + if (ice->state.framebuffer.zsbuf) { + struct crocus_resource *zres, *sres; + crocus_get_depth_stencil_resources(&batch->screen->devinfo, + ice->state.framebuffer.zsbuf->texture, + &zres, &sres); + /* ANV thinks that the stencil-ness doesn't matter, this is just + * about handling polygon offset scaling. + */ + sf.DepthBufferSurfaceFormat = zres ? 
isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM; + } +#endif + } + crocus_emit_merge(batch, cso->sf, dynamic_sf, + ARRAY_SIZE(dynamic_sf)); +#endif + } + + if (dirty & CROCUS_DIRTY_WM) { + struct crocus_rasterizer_state *cso = ice->state.cso_rast; + const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data); + UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF; + UNUSED const struct shader_info *fs_info = + crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT); + +#if GFX_VER == 6 + struct push_bos push_bos = {}; + setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos); + + emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos); +#endif +#if GFX_VER >= 6 + crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm) +#else + uint32_t *wm_ptr = stream_state(batch, + GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset); + + dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; + + _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm) +#endif + { +#if GFX_VER <= 6 + wm._8PixelDispatchEnable = wm_prog_data->dispatch_8; + wm._16PixelDispatchEnable = wm_prog_data->dispatch_16; + wm._32PixelDispatchEnable = wm_prog_data->dispatch_32; +#endif +#if GFX_VER == 4 + /* On gen4, we only have one shader kernel */ + if (brw_wm_state_has_ksp(wm, 0)) { + wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]); + wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); + wm.DispatchGRFStartRegisterForConstantSetupData0 = + wm_prog_data->base.dispatch_grf_start_reg; + } +#elif GFX_VER == 5 + wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); + wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); + wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); + + wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); + wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1); + wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2); + + wm.DispatchGRFStartRegisterForConstantSetupData0 = + wm_prog_data->base.dispatch_grf_start_reg; +#elif GFX_VER == 6 + wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); + wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); + wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); + + wm.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0); + wm.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1); + wm.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2); +#endif +#if GFX_VER <= 5 + wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length; + wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2; + wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2; + wm.SetupURBEntryReadOffset = 0; + wm.EarlyDepthTestEnable = true; + wm.LineAntialiasingRegionWidth = _05pixels; + wm.LineEndCapAntialiasingRegionWidth = _10pixels; + 
wm.DepthCoefficientURBReadOffset = 1; + + if (cso->cso.offset_tri) { + wm.GlobalDepthOffsetEnable = true; + + /* Something weird going on with legacy_global_depth_bias, + * offset_constant, scaling and MRD. This value passes glean + * but gives some odd results elsewere (eg. the + * quad-offset-units test). + */ + wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2; + wm.GlobalDepthOffsetScale = cso->cso.offset_scale; + } + wm.SamplerStatePointer = ro_bo(batch->state.bo, + ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset); +#endif + + wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ? + ice->state.statistics_counters_enabled : 0; + +#if GFX_VER >= 6 + wm.LineAntialiasingRegionWidth = _10pixels; + wm.LineEndCapAntialiasingRegionWidth = _05pixels; + + wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT; + wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes; +#endif +#if GFX_VER == 6 + wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend && + ice->state.cso_blend->dual_color_blending; + wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; + wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; + + /* From the SNB PRM, volume 2 part 1, page 281: + * "If the PS kernel does not need the Position XY Offsets + * to compute a Position XY value, then this field should be + * programmed to POSOFFSET_NONE." + * + * "SW Recommendation: If the PS kernel needs the Position Offsets + * to compute a Position XY value, this field should match Position + * ZW Interpolation Mode to ensure a consistent position.xyzw + * computation." + * We only require XY sample offsets. So, this recommendation doesn't + * look useful at the moment. We might need this in future. + */ + if (wm_prog_data->uses_pos_offset) + wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE; + else + wm.PositionXYOffsetSelect = POSOFFSET_NONE; +#endif + wm.LineStippleEnable = cso->cso.line_stipple_enable; + wm.PolygonStippleEnable = cso->cso.poly_stipple_enable; + +#if GFX_VER < 7 + if (wm_prog_data->base.use_alt_mode) + wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate; + wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4; + wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1; +#endif + +#if GFX_VER >= 6 + wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; + + struct pipe_framebuffer_state *fb = &ice->state.framebuffer; + if (fb->samples > 1) { + if (cso->cso.multisample) + wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; + else + wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; + + if (wm_prog_data->persample_dispatch) + wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; + else + wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL; + } else { + wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; + wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; + } +#endif + + wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; + + if (wm_prog_data->uses_kill || + ice->state.cso_zsa->cso.alpha_enabled || + ice->state.cso_blend->cso.alpha_to_coverage || + (GFX_VER >= 6 && wm_prog_data->uses_omask)) + wm.PixelShaderKillsPixel = true; + + if (has_writeable_rt(ice->state.cso_blend, fs_info) || + writes_depth || wm.PixelShaderKillsPixel || + (GFX_VER >= 6 && wm_prog_data->has_side_effects)) + wm.ThreadDispatchEnable = true; + +#if GFX_VER >= 7 + wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; + wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; +#else + if 
(wm_prog_data->base.total_scratch) { + struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, + MESA_SHADER_FRAGMENT); + wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11; + wm.ScratchSpaceBasePointer = rw_bo(bo, 0); + } + + wm.PixelShaderComputedDepth = writes_depth; + +#endif + /* The "UAV access enable" bits are unnecessary on HSW because they only + * seem to have an effect on the HW-assisted coherency mechanism which we + * don't need, and the rasterization-related UAV_ONLY flag and the + * DISPATCH_ENABLE bit can be set independently from it. + * C.f. gen8_upload_ps_extra(). + * + * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | + * _NEW_COLOR + */ +#if GFX_VERx10 == 75 + if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) && + wm_prog_data->has_side_effects) + wm.PSUAVonly = ON; +#endif + +#if GFX_VER >= 7 + /* BRW_NEW_FS_PROG_DATA */ + if (wm_prog_data->early_fragment_tests) + wm.EarlyDepthStencilControl = EDSC_PREPS; + else if (wm_prog_data->has_side_effects) + wm.EarlyDepthStencilControl = EDSC_PSEXEC; +#endif + }; + +#if GFX_VER <= 5 + if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) { + crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) { + clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp; + } + ice->state.global_depth_offset_clamp = cso->cso.offset_clamp; + } +#endif + } + +#if GFX_VER >= 7 + if (dirty & CROCUS_DIRTY_GEN7_SBE) { + crocus_emit_sbe(batch, ice); + } +#endif + +#if GFX_VER >= 6 + if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) { + uint32_t ds_offset; + void *ds_map = stream_state(batch, + sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length), + 64, &ds_offset); + _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) { + set_depth_stencil_bits(ice, &ds); + } + +#if GFX_VER == 6 + crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { + ptr.PointertoDEPTH_STENCIL_STATE = ds_offset; + ptr.DEPTH_STENCIL_STATEChange = true; + } +#else + crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) { + ptr.PointertoDEPTH_STENCIL_STATE = ds_offset; + } +#endif + } + + if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) { + /* Align to 64-byte boundary as per anv. 
*/ + uint32_t scissor_offset; + struct pipe_scissor_state *scissor_map = (void *) + stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports, + 64, &scissor_offset); + for (int i = 0; i < ice->state.num_viewports; i++) { + struct pipe_scissor_state scissor; + crocus_fill_scissor_rect(ice, i, &scissor); + scissor_map[i] = scissor; + } + + crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) { + ptr.ScissorRectPointer = scissor_offset; + } + } +#endif + + if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) { + struct isl_device *isl_dev = &batch->screen->isl_dev; +#if GFX_VER >= 6 + crocus_emit_depth_stall_flushes(batch); +#endif + void *batch_ptr; + struct crocus_resource *zres, *sres; + struct pipe_framebuffer_state *cso = &ice->state.framebuffer; + batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size); + + struct isl_view view = { + .base_level = 0, + .levels = 1, + .base_array_layer = 0, + .array_len = 1, + .swizzle = ISL_SWIZZLE_IDENTITY, + }; + struct isl_depth_stencil_hiz_emit_info info = { .view = &view }; + + if (cso->zsbuf) { + crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres); + struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf; + if (zsbuf->align_res) { + zres = (struct crocus_resource *)zsbuf->align_res; + } + view.base_level = cso->zsbuf->u.tex.level; + view.base_array_layer = cso->zsbuf->u.tex.first_layer; + view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1; + + if (zres) { + view.usage |= ISL_SURF_USAGE_DEPTH_BIT; + + info.depth_surf = &zres->surf; + info.depth_address = crocus_command_reloc(batch, + (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset, + zres->bo, 0, RELOC_32BIT); + + info.mocs = crocus_mocs(zres->bo, isl_dev); + view.format = zres->surf.format; + + if (crocus_resource_level_has_hiz(zres, view.base_level)) { + info.hiz_usage = zres->aux.usage; + info.hiz_surf = &zres->aux.surf; + uint32_t hiz_offset = 0; + +#if GFX_VER == 6 + /* HiZ surfaces on Sandy Bridge technically don't support + * mip-mapping. However, we can fake it by offsetting to the + * first slice of LOD0 in the HiZ surface. + */ + isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf, + view.base_level, 0, 0, + &hiz_offset, NULL, NULL); +#endif + info.hiz_address = crocus_command_reloc(batch, + (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset, + zres->aux.bo, zres->aux.offset + hiz_offset, + RELOC_32BIT); + info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0]; + } + } + +#if GFX_VER >= 6 + if (sres) { + view.usage |= ISL_SURF_USAGE_STENCIL_BIT; + info.stencil_aux_usage = sres->aux.usage; + info.stencil_surf = &sres->surf; + + uint32_t stencil_offset = 0; +#if GFX_VER == 6 + /* Stencil surfaces on Sandy Bridge technically don't support + * mip-mapping. However, we can fake it by offsetting to the + * first slice of LOD0 in the stencil surface. + */ + isl_surf_get_image_offset_B_tile_sa(&sres->surf, + view.base_level, 0, 0, + &stencil_offset, NULL, NULL); +#endif + + info.stencil_address = crocus_command_reloc(batch, + (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset, + sres->bo, stencil_offset, RELOC_32BIT); + if (!zres) { + view.format = sres->surf.format; + info.mocs = crocus_mocs(sres->bo, isl_dev); + } + } +#endif + } + isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info); + } + + /* TODO: Disable emitting this until something uses a stipple. 
*/ + if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) { + crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) { + for (int i = 0; i < 32; i++) { + poly.PatternRow[i] = ice->state.poly_stipple.stipple[i]; + } + } + } + + if (dirty & CROCUS_DIRTY_LINE_STIPPLE) { + struct crocus_rasterizer_state *cso = ice->state.cso_rast; + crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple)); + } + +#if GFX_VER <= 5 + if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) { + upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset, + ice->shaders.vs_offset, ice->shaders.sf_offset, + ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset); + crocus_upload_urb_fence(batch); + + crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) { + cs.NumberofURBEntries = ice->urb.nr_cs_entries; + cs.URBEntryAllocationSize = ice->urb.csize - 1; + } + dirty |= CROCUS_DIRTY_GEN4_CURBE; + } +#endif + if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) { + struct pipe_framebuffer_state *fb = &ice->state.framebuffer; + if (fb->width && fb->height) { + crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { + rect.ClippedDrawingRectangleXMax = fb->width - 1; + rect.ClippedDrawingRectangleYMax = fb->height - 1; + } + } + } + + if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) { + const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers); + const uint32_t count = user_count + + ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params; + uint32_t dynamic_bound = ice->state.bound_vertex_buffers; + + if (count) { + const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length); + + uint32_t *map = + crocus_get_command_space(batch, 4 * (1 + vb_dwords * count)); + _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) { + vb.DWordLength = (vb_dwords * count + 1) - 2; + } + map += 1; + + uint32_t bound = dynamic_bound; + int i; + while (bound) { + i = u_bit_scan(&bound); + struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i]; + struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource); + uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i]; + + emit_vertex_buffer_state(batch, i, bo, + buf->buffer_offset, + ice->state.vb_end[i], + buf->stride, + step_rate, + &map); + } + i = user_count; + if (ice->state.vs_uses_draw_params) { + struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res; + emit_vertex_buffer_state(batch, i++, + res->bo, + ice->draw.draw_params.offset, + ice->draw.draw_params.res->width0, + 0, 0, &map); + } + if (ice->state.vs_uses_derived_draw_params) { + struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res; + emit_vertex_buffer_state(batch, i++, + res->bo, + ice->draw.derived_draw_params.offset, + ice->draw.derived_draw_params.res->width0, + 0, 0, &map); + } + } + } + + if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) { + struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements; + const unsigned entries = MAX2(cso->count, 1); + if (!(ice->state.vs_needs_sgvs_element || + ice->state.vs_uses_derived_draw_params || + ice->state.vs_needs_edge_flag)) { + crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) * + (1 + entries * GENX(VERTEX_ELEMENT_STATE_length))); + } else { + uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)]; + const unsigned dyn_count = cso->count + + ice->state.vs_needs_sgvs_element + + ice->state.vs_uses_derived_draw_params; + + crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), + 
&dynamic_ves, ve) { + ve.DWordLength = + 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2; + } + memcpy(&dynamic_ves[1], &cso->vertex_elements[1], + (cso->count - ice->state.vs_needs_edge_flag) * + GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t)); + uint32_t *ve_pack_dest = + &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) * + GENX(VERTEX_ELEMENT_STATE_length)]; + + if (ice->state.vs_needs_sgvs_element) { + uint32_t base_ctrl = ice->state.vs_uses_draw_params ? + VFCOMP_STORE_SRC : VFCOMP_STORE_0; + crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) { + ve.Valid = true; + ve.VertexBufferIndex = + util_bitcount64(ice->state.bound_vertex_buffers); + ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT; + ve.Component0Control = base_ctrl; + ve.Component1Control = base_ctrl; + ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0; + ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0; +#if GFX_VER < 5 + ve.DestinationElementOffset = cso->count * 4; +#endif + } + ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length); + } + if (ice->state.vs_uses_derived_draw_params) { + crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) { + ve.Valid = true; + ve.VertexBufferIndex = + util_bitcount64(ice->state.bound_vertex_buffers) + + ice->state.vs_uses_draw_params; + ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT; + ve.Component0Control = VFCOMP_STORE_SRC; + ve.Component1Control = VFCOMP_STORE_SRC; + ve.Component2Control = VFCOMP_STORE_0; + ve.Component3Control = VFCOMP_STORE_0; +#if GFX_VER < 5 + ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4; +#endif + } + ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length); + } + if (ice->state.vs_needs_edge_flag) { + for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++) + ve_pack_dest[i] = cso->edgeflag_ve[i]; + } + + crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) * + (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length))); + } + } + +#if GFX_VERx10 == 75 + if (dirty & CROCUS_DIRTY_GEN75_VF) { + crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) { + if (draw->primitive_restart) { + vf.IndexedDrawCutIndexEnable = true; + vf.CutIndex = draw->restart_index; + } + } + } +#endif + +#if GFX_VER <= 5 + if (dirty & CROCUS_DIRTY_GEN4_CURBE) { + gen4_upload_curbe(batch); + } +#endif +} + +static void +crocus_upload_render_state(struct crocus_context *ice, + struct crocus_batch *batch, + const struct pipe_draw_info *draw, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *sc) +{ +#if GFX_VER == 7 + bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT; +#endif + bool emit_index = false; + batch->no_wrap = true; + + if (!batch->contains_draw) { + emit_index = true; + batch->contains_draw = true; + } + crocus_update_surface_base_address(batch); + + crocus_upload_dirty_render_state(ice, batch, draw); + + batch->no_wrap = false; + if (draw->index_size > 0) { + unsigned offset; + unsigned size; + + if (draw->has_user_indices) { + unsigned start_offset = draw->index_size * sc->start; + u_upload_data(ice->ctx.stream_uploader, 0, + sc->count * draw->index_size, 4, + (char *)draw->index.user + start_offset, + &offset, &ice->state.index_buffer.res); + offset -= start_offset; + size = start_offset + sc->count * draw->index_size; + emit_index = true; + } else { + struct crocus_resource *res = (void *) draw->index.resource; + res->bind_history |= 
PIPE_BIND_INDEX_BUFFER; + + if (ice->state.index_buffer.res != draw->index.resource) { + pipe_resource_reference(&ice->state.index_buffer.res, + draw->index.resource); + emit_index = true; + } + offset = 0; + size = draw->index.resource->width0; + } + + if (!emit_index && + (ice->state.index_buffer.size != size || + ice->state.index_buffer.index_size != draw->index_size || + ice->state.index_buffer.prim_restart != draw->primitive_restart)) + emit_index = true; + + if (emit_index) { + struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res); + + crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) { +#if !(GFX_VERx10 == 75) + ib.CutIndexEnable = draw->primitive_restart; +#endif + ib.IndexFormat = draw->index_size >> 1; + ib.BufferStartingAddress = ro_bo(bo, offset); + ib.BufferEndingAddress = ro_bo(bo, offset + size - 1); + } + ice->state.index_buffer.size = size; + ice->state.index_buffer.offset = offset; + ice->state.index_buffer.index_size = draw->index_size; + ice->state.index_buffer.prim_restart = draw->primitive_restart; + } + } + +#define _3DPRIM_END_OFFSET 0x2420 +#define _3DPRIM_START_VERTEX 0x2430 +#define _3DPRIM_VERTEX_COUNT 0x2434 +#define _3DPRIM_INSTANCE_COUNT 0x2438 +#define _3DPRIM_START_INSTANCE 0x243C +#define _3DPRIM_BASE_VERTEX 0x2440 + +#if GFX_VER == 7 + if (indirect && !indirect->count_from_stream_output) { + if (indirect->indirect_draw_count) { + use_predicate = true; + + struct crocus_bo *draw_count_bo = + crocus_resource_bo(indirect->indirect_draw_count); + unsigned draw_count_offset = + indirect->indirect_draw_count_offset; + + crocus_emit_pipe_control_flush(batch, + "ensure indirect draw buffer is flushed", + PIPE_CONTROL_FLUSH_ENABLE); + if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) { +#if GFX_VERx10 == 75 + struct mi_builder b; + mi_builder_init(&b, &batch->screen->devinfo, batch); + + /* comparison = draw id < draw count */ + struct mi_value comparison = + mi_ult(&b, mi_imm(drawid_offset), + mi_mem32(ro_bo(draw_count_bo, + draw_count_offset))); + + /* predicate = comparison & conditional rendering predicate */ + struct mi_value pred = mi_iand(&b, comparison, + mi_reg32(CS_GPR(15))); + + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + + unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV | + MI_PREDICATE_COMBINEOP_SET | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL; + + crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t)); +#endif + } else { + uint32_t mi_predicate; + + /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */ + crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset); + /* Upload the current draw count from the draw parameters buffer + * to MI_PREDICATE_SRC0. 
+ */ + crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, + draw_count_bo, draw_count_offset); + /* Zero the top 32-bits of MI_PREDICATE_SRC0 */ + crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0); + + if (drawid_offset == 0) { + mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV | + MI_PREDICATE_COMBINEOP_SET | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL; + } else { + /* While draw_index < draw_count the predicate's result will be + * (draw_index == draw_count) ^ TRUE = TRUE + * When draw_index == draw_count the result is + * (TRUE) ^ TRUE = FALSE + * After this all results will be: + * (FALSE) ^ FALSE = FALSE + */ + mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD | + MI_PREDICATE_COMBINEOP_XOR | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL; + } + crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t)); + } + } + +#if GFX_VER >= 7 + struct crocus_bo *bo = crocus_resource_bo(indirect->buffer); + assert(bo); + + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT; + lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0); + } + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT; + lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4); + } + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = _3DPRIM_START_VERTEX; + lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8); + } + if (draw->index_size) { + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = _3DPRIM_BASE_VERTEX; + lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12); + } + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = _3DPRIM_START_INSTANCE; + lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16); + } + } else { + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = _3DPRIM_START_INSTANCE; + lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12); + } + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = _3DPRIM_BASE_VERTEX; + lri.DataDWord = 0; + } + } +#endif + } else if (indirect && indirect->count_from_stream_output) { +#if GFX_VERx10 == 75 + struct crocus_stream_output_target *so = + (void *) indirect->count_from_stream_output; + + /* XXX: Replace with actual cache tracking */ + crocus_emit_pipe_control_flush(batch, + "draw count from stream output stall", + PIPE_CONTROL_CS_STALL); + + struct mi_builder b; + mi_builder_init(&b, &batch->screen->devinfo, batch); + + struct crocus_address addr = + ro_bo(crocus_resource_bo(&so->offset_res->base), so->offset_offset); + struct mi_value offset = + mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset); + + mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT), + mi_udiv32_imm(&b, offset, so->stride)); + + _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0); + _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0); + _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0); + _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count); +#endif + } +#else + assert(!indirect); +#endif + + crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) { + prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL; +#if GFX_VER == 7 + prim.PredicateEnable = use_predicate; +#endif + + prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, draw->vertices_per_patch); + if (indirect) { + // XXX Probably have to do something for gen6 here? 
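For reference, the fixed offsets read by the MI_LOAD_REGISTER_MEM sequence in the GFX_VER >= 7 indirect path above (offset + 0/4/8/12/16) correspond to the standard GL/Gallium indirect draw argument layouts. The sketch below (illustrative names, not driver types) spells out that assumed layout and how each field maps onto the _3DPRIM_* registers defined earlier:

#include <stddef.h>
#include <stdint.h>

/* Non-indexed draws: four dwords, start_instance at offset 12. */
struct indirect_draw_arrays_args {
   uint32_t count;          /* -> _3DPRIM_VERTEX_COUNT   (offset + 0)  */
   uint32_t instance_count; /* -> _3DPRIM_INSTANCE_COUNT (offset + 4)  */
   uint32_t start;          /* -> _3DPRIM_START_VERTEX   (offset + 8)  */
   uint32_t start_instance; /* -> _3DPRIM_START_INSTANCE (offset + 12) */
};

/* Indexed draws: five dwords, base_vertex at 12 and start_instance at 16. */
struct indirect_draw_elements_args {
   uint32_t count;          /* -> _3DPRIM_VERTEX_COUNT   (offset + 0)  */
   uint32_t instance_count; /* -> _3DPRIM_INSTANCE_COUNT (offset + 4)  */
   uint32_t first_index;    /* -> _3DPRIM_START_VERTEX   (offset + 8)  */
   int32_t  base_vertex;    /* -> _3DPRIM_BASE_VERTEX    (offset + 12) */
   uint32_t start_instance; /* -> _3DPRIM_START_INSTANCE (offset + 16) */
};

_Static_assert(offsetof(struct indirect_draw_elements_args, start_instance) == 16,
               "matches the offset + 16 load in the indexed path");

With IndirectParameterEnable set in the 3DPRIMITIVE packet below, the hardware sources its vertex, instance, and start values from those registers instead of the inline packet fields.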
+#if GFX_VER == 7 + prim.IndirectParameterEnable = true; +#endif + } else { +#if GFX_VER >= 5 + prim.StartInstanceLocation = draw->start_instance; +#endif + prim.InstanceCount = draw->instance_count; + prim.VertexCountPerInstance = sc->count; + + prim.StartVertexLocation = sc->start; + + if (draw->index_size) { + prim.BaseVertexLocation += sc->index_bias; + } + } + } +} + +#if GFX_VER == 7 + +static void +crocus_upload_compute_state(struct crocus_context *ice, + struct crocus_batch *batch, + const struct pipe_grid_info *grid) +{ + const uint64_t stage_dirty = ice->state.stage_dirty; + struct crocus_screen *screen = batch->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE]; + struct crocus_compiled_shader *shader = + ice->shaders.prog[MESA_SHADER_COMPUTE]; + struct brw_stage_prog_data *prog_data = shader->prog_data; + struct brw_cs_prog_data *cs_prog_data = (void *) prog_data; + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block); + + crocus_update_surface_base_address(batch); + if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload) + upload_sysvals(ice, MESA_SHADER_COMPUTE); + + if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) { + crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false); + ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset = + crocus_upload_binding_table(ice, batch, + ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset, + ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes); + } + + if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS) + crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE); + + if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) || + cs_prog_data->local_size[0] == 0 /* Variable local group size */) { + /* The MEDIA_VFE_STATE documentation for Gen8+ says: + * + * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless + * the only bits that are changed are scoreboard related: Scoreboard + * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For + * these scoreboard related states, a MEDIA_STATE_FLUSH is + * sufficient." + */ + crocus_emit_pipe_control_flush(batch, + "workaround: stall before MEDIA_VFE_STATE", + PIPE_CONTROL_CS_STALL); + + crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) { + if (prog_data->total_scratch) { + struct crocus_bo *bo = + crocus_get_scratch_space(ice, prog_data->total_scratch, + MESA_SHADER_COMPUTE); +#if GFX_VERx10 == 75 + /* Haswell's Per Thread Scratch Space is in the range [0, 10] + * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. + */ + vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12; +#else + /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB] + * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. 
+ */ + vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1; +#endif + vfe.ScratchSpaceBasePointer = rw_bo(bo, 0); + } + + vfe.MaximumNumberofThreads = + devinfo->max_cs_threads * screen->subslice_total - 1; + vfe.ResetGatewayTimer = + Resettingrelativetimerandlatchingtheglobaltimestamp; + vfe.BypassGatewayControl = true; + vfe.GPGPUMode = 1; + vfe.NumberofURBEntries = 0; + vfe.URBEntryAllocationSize = 0; + + vfe.CURBEAllocationSize = + ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads + + cs_prog_data->push.cross_thread.regs, 2); + } + } + + /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */ + if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) || + cs_prog_data->local_size[0] == 0 /* Variable local group size */) { + uint32_t curbe_data_offset = 0; + assert(cs_prog_data->push.cross_thread.dwords == 0 && + cs_prog_data->push.per_thread.dwords == 1 && + cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID); + const unsigned push_const_size = + brw_cs_push_const_total_size(cs_prog_data, dispatch.threads); + uint32_t *curbe_data_map = + stream_state(batch, + ALIGN(push_const_size, 64), 64, + &curbe_data_offset); + assert(curbe_data_map); + memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64)); + crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads, + curbe_data_map); + + crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) { + curbe.CURBETotalDataLength = ALIGN(push_const_size, 64); + curbe.CURBEDataStartAddress = curbe_data_offset; + } + } + + if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS | + CROCUS_STAGE_DIRTY_BINDINGS_CS | + CROCUS_STAGE_DIRTY_CONSTANTS_CS | + CROCUS_STAGE_DIRTY_CS)) { + uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; + const uint64_t ksp = KSP(ice,shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size); + crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) { + idd.KernelStartPointer = ksp; + idd.SamplerStatePointer = shs->sampler_offset; + idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset; + idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31); + idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads; + idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs; + idd.BarrierEnable = cs_prog_data->uses_barrier; + idd.SharedLocalMemorySize = encode_slm_size(GFX_VER, + prog_data->total_shared); +#if GFX_VERx10 >= 75 + idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs; +#endif + } + + crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) { + load.InterfaceDescriptorTotalLength = + GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); + load.InterfaceDescriptorDataStartAddress = + emit_state(batch, desc, sizeof(desc), 64); + } + } + +#define GPGPU_DISPATCHDIMX 0x2500 +#define GPGPU_DISPATCHDIMY 0x2504 +#define GPGPU_DISPATCHDIMZ 0x2508 + + if (grid->indirect) { + struct crocus_state_ref *grid_size = &ice->state.grid_size; + struct crocus_bo *bo = crocus_resource_bo(grid_size->res); + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = GPGPU_DISPATCHDIMX; + lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0); + } + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = GPGPU_DISPATCHDIMY; + lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4); + } + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { + lrm.RegisterAddress = GPGPU_DISPATCHDIMZ; + lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8); + } + + /* Clear 
upper 32-bits of SRC0 and all 64-bits of SRC1 */ + _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0); + crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0); + + /* Load compute_dispatch_indirect_x_size into SRC0 */ + crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0); + + /* predicate = (compute_dispatch_indirect_x_size == 0); */ + crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + }; + + /* Load compute_dispatch_indirect_y_size into SRC0 */ + crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4); + + /* predicate = (compute_dispatch_indirect_y_size == 0); */ + crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_OR; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + }; + + /* Load compute_dispatch_indirect_z_size into SRC0 */ + crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8); + + /* predicate = (compute_dispatch_indirect_z_size == 0); */ + crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_OR; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + }; + + /* predicate = !predicate; */ +#define COMPARE_FALSE 1 + crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_OR; + mip.CompareOperation = COMPARE_FALSE; + } + + } + + crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) { + ggw.IndirectParameterEnable = grid->indirect != NULL; + ggw.PredicateEnable = grid->indirect != NULL; + ggw.SIMDSize = dispatch.simd_size / 16; + ggw.ThreadDepthCounterMaximum = 0; + ggw.ThreadHeightCounterMaximum = 0; + ggw.ThreadWidthCounterMaximum = dispatch.threads - 1; + ggw.ThreadGroupIDXDimension = grid->grid[0]; + ggw.ThreadGroupIDYDimension = grid->grid[1]; + ggw.ThreadGroupIDZDimension = grid->grid[2]; + ggw.RightExecutionMask = dispatch.right_mask; + ggw.BottomExecutionMask = 0xffffffff; + } + + crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf); + + batch->contains_draw = true; +} + +#endif /* GFX_VER == 7 */ + +/** + * State module teardown. 
+ */ +static void +crocus_destroy_state(struct crocus_context *ice) +{ + + pipe_resource_reference(&ice->draw.draw_params.res, NULL); + pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL); + + for (int i = 0; i < 4; i++) { + pipe_so_target_reference(&ice->state.so_target[i], NULL); + } + + for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) { + pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL); + } + pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL); + + for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) { + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { + pipe_resource_reference(&shs->constbufs[i].buffer, NULL); + } + for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) { + pipe_resource_reference(&shs->image[i].base.resource, NULL); + } + for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) { + pipe_resource_reference(&shs->ssbo[i].buffer, NULL); + } + for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) { + pipe_sampler_view_reference((struct pipe_sampler_view **) + &shs->textures[i], NULL); + } + } + + pipe_resource_reference(&ice->state.grid_size.res, NULL); + + pipe_resource_reference(&ice->state.index_buffer.res, NULL); +} + +/* ------------------------------------------------------------------- */ + +static void +crocus_rebind_buffer(struct crocus_context *ice, + struct crocus_resource *res) +{ + struct pipe_context *ctx = &ice->ctx; + + assert(res->base.target == PIPE_BUFFER); + + /* Buffers can't be framebuffer attachments, nor display related, + * and we don't have upstream Clover support. + */ + assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL | + PIPE_BIND_RENDER_TARGET | + PIPE_BIND_BLENDABLE | + PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_CURSOR | + PIPE_BIND_COMPUTE_RESOURCE | + PIPE_BIND_GLOBAL))); + + if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) { + uint64_t bound_vbs = ice->state.bound_vertex_buffers; + while (bound_vbs) { + const int i = u_bit_scan64(&bound_vbs); + struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i]; + + if (!buffer->is_user_buffer && &res->base == buffer->buffer.resource) + ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS; + } + } + + if (res->bind_history & PIPE_BIND_INDEX_BUFFER) { + if (res->bo == crocus_resource_bo(ice->state.index_buffer.res)) + pipe_resource_reference(&ice->state.index_buffer.res, NULL); + } + /* There is no need to handle these: + * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw) + * - PIPE_BIND_QUERY_BUFFER (no persistent state references) + */ + + if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) { + /* XXX: be careful about resetting vs appending... 
*/ + assert(false); + } + + for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) { + struct crocus_shader_state *shs = &ice->state.shaders[s]; + enum pipe_shader_type p_stage = stage_to_pipe(s); + + if (!(res->bind_stages & (1 << s))) + continue; + + if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) { + /* Skip constant buffer 0, it's for regular uniforms, not UBOs */ + uint32_t bound_cbufs = shs->bound_cbufs & ~1u; + while (bound_cbufs) { + const int i = u_bit_scan(&bound_cbufs); + struct pipe_constant_buffer *cbuf = &shs->constbufs[i]; + + if (res->bo == crocus_resource_bo(cbuf->buffer)) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s; + } + } + } + + if (res->bind_history & PIPE_BIND_SHADER_BUFFER) { + uint32_t bound_ssbos = shs->bound_ssbos; + while (bound_ssbos) { + const int i = u_bit_scan(&bound_ssbos); + struct pipe_shader_buffer *ssbo = &shs->ssbo[i]; + + if (res->bo == crocus_resource_bo(ssbo->buffer)) { + struct pipe_shader_buffer buf = { + .buffer = &res->base, + .buffer_offset = ssbo->buffer_offset, + .buffer_size = ssbo->buffer_size, + }; + crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf, + (shs->writable_ssbos >> i) & 1); + } + } + } + + if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) { + uint32_t bound_sampler_views = shs->bound_sampler_views; + while (bound_sampler_views) { + const int i = u_bit_scan(&bound_sampler_views); + struct crocus_sampler_view *isv = shs->textures[i]; + struct crocus_bo *bo = isv->res->bo; + + if (res->bo == bo) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s; + } + } + } + + if (res->bind_history & PIPE_BIND_SHADER_IMAGE) { + uint32_t bound_image_views = shs->bound_image_views; + while (bound_image_views) { + const int i = u_bit_scan(&bound_image_views); + struct crocus_image_view *iv = &shs->image[i]; + struct crocus_bo *bo = crocus_resource_bo(iv->base.resource); + + if (res->bo == bo) + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s; + } + } + } +} + +/* ------------------------------------------------------------------- */ + +static unsigned +flags_to_post_sync_op(uint32_t flags) +{ + if (flags & PIPE_CONTROL_WRITE_IMMEDIATE) + return WriteImmediateData; + + if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) + return WritePSDepthCount; + + if (flags & PIPE_CONTROL_WRITE_TIMESTAMP) + return WriteTimestamp; + + return 0; +} + +/* + * Do the given flags have a Post Sync or LRI Post Sync operation? + */ +static enum pipe_control_flags +get_post_sync_flags(enum pipe_control_flags flags) +{ + flags &= PIPE_CONTROL_WRITE_IMMEDIATE | + PIPE_CONTROL_WRITE_DEPTH_COUNT | + PIPE_CONTROL_WRITE_TIMESTAMP | + PIPE_CONTROL_LRI_POST_SYNC_OP; + + /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with + * "LRI Post Sync Operation". So more than one bit set would be illegal. + */ + assert(util_bitcount(flags) <= 1); + + return flags; +} + +#define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE) + +/** + * Emit a series of PIPE_CONTROL commands, taking into account any + * workarounds necessary to actually accomplish the caller's request. + * + * Unless otherwise noted, spec quotations in this function come from: + * + * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming + * Restrictions for PIPE_CONTROL. + * + * You should not use this function directly. Use the helpers in + * crocus_pipe_control.c instead, which may split the pipe control further. 
+ */ +static void +crocus_emit_raw_pipe_control(struct crocus_batch *batch, + const char *reason, + uint32_t flags, + struct crocus_bo *bo, + uint32_t offset, + uint64_t imm) +{ + UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo; + enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags); + UNUSED enum pipe_control_flags non_lri_post_sync_flags = + post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP; + + /* Recursive PIPE_CONTROL workarounds -------------------------------- + * (http://knowyourmeme.com/memes/xzibit-yo-dawg) + * + * We do these first because we want to look at the original operation, + * rather than any workarounds we set. + */ + + /* "Flush Types" workarounds --------------------------------------------- + * We do these now because they may add post-sync operations or CS stalls. + */ + + if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) { + /* Hardware workaround: SNB B-Spec says: + * + * "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush + * Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is + * required." + */ + crocus_emit_post_sync_nonzero_flush(batch); + } + + if (!(GFX_VERx10 == 75) && (flags & PIPE_CONTROL_DEPTH_STALL)) { + /* Project: PRE-HSW / Argument: Depth Stall + * + * "The following bits must be clear: + * - Render Target Cache Flush Enable ([12] of DW1) + * - Depth Cache Flush Enable ([0] of DW1)" + */ + assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH))); + } + + if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) { + /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable): + * + * "This bit must be DISABLED for operations other than writing + * PS_DEPTH_COUNT." + * + * This seems like nonsense. An Ivybridge workaround requires us to + * emit a PIPE_CONTROL with a depth stall and write immediate post-sync + * operation. Gen8+ requires us to emit depth stalls and depth cache + * flushes together. So, it's hard to imagine this means anything other + * than "we originally intended this to be used for PS_DEPTH_COUNT". + * + * We ignore the supposed restriction and do nothing. + */ + } + + if (!(GFX_VERx10 == 75) && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) { + /* Project: PRE-HSW / Argument: Depth Cache Flush + * + * "Depth Stall must be clear ([13] of DW1)." + */ + assert(!(flags & PIPE_CONTROL_DEPTH_STALL)); + } + + if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_STALL_AT_SCOREBOARD)) { + /* From the PIPE_CONTROL instruction table, bit 12 and bit 1: + * + * "This bit must be DISABLED for End-of-pipe (Read) fences, + * PS_DEPTH_COUNT or TIMESTAMP queries." + * + * TODO: Implement end-of-pipe checking. + */ + assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT | + PIPE_CONTROL_WRITE_TIMESTAMP))); + } + + if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) { + /* From the PIPE_CONTROL instruction table, bit 1: + * + * "This bit is ignored if Depth Stall Enable is set. + * Further, the render cache is not flushed even if Write Cache + * Flush Enable bit is set." + * + * We assert that the caller doesn't do this combination, to try and + * prevent mistakes. It shouldn't hurt the GPU, though. + * + * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard" + * and "Render Target Flush" combo is explicitly required for BTI + * update workarounds. 
+ */ + assert(!(flags & (PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_RENDER_TARGET_FLUSH))); + } + + /* PIPE_CONTROL page workarounds ------------------------------------- */ + + if (GFX_VER == 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) { + /* From the PIPE_CONTROL page itself: + * + * "IVB, HSW, BDW + * Restriction: Pipe_control with CS-stall bit set must be issued + * before a pipe-control command that has the State Cache + * Invalidate bit set." + */ + flags |= PIPE_CONTROL_CS_STALL; + } + + if ((GFX_VERx10 == 75)) { + /* From the PIPE_CONTROL page itself: + * + * "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation: + * Prior to programming a PIPECONTROL command with any of the RO + * cache invalidation bit set, program a PIPECONTROL flush command + * with “CS stall” bit and “HDC Flush” bit set." + * + * TODO: Actually implement this. What's an HDC Flush? + */ + } + + if (flags & PIPE_CONTROL_FLUSH_LLC) { + /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC): + * + * "Project: ALL + * SW must always program Post-Sync Operation to "Write Immediate + * Data" when Flush LLC is set." + * + * For now, we just require the caller to do it. + */ + assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE); + } + + /* "Post-Sync Operation" workarounds -------------------------------- */ + + /* Project: All / Argument: Global Snapshot Count Reset [19] + * + * "This bit must not be exercised on any product. + * Requires stall bit ([20] of DW1) set." + * + * We don't use this, so we just assert that it isn't used. The + * PIPE_CONTROL instruction page indicates that they intended this + * as a debug feature and don't think it is useful in production, + * but it may actually be usable, should we ever want to. + */ + assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0); + + if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR | + PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) { + /* Project: All / Arguments: + * + * - Generic Media State Clear [16] + * - Indirect State Pointers Disable [16] + * + * "Requires stall bit ([20] of DW1) set." + * + * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media + * State Clear) says: + * + * "PIPECONTROL command with “Command Streamer Stall Enable” must be + * programmed prior to programming a PIPECONTROL command with "Media + * State Clear" set in GPGPU mode of operation" + * + * This is a subset of the earlier rule, so there's nothing to do. + */ + flags |= PIPE_CONTROL_CS_STALL; + } + + if (flags & PIPE_CONTROL_STORE_DATA_INDEX) { + /* Project: All / Argument: Store Data Index + * + * "Post-Sync Operation ([15:14] of DW1) must be set to something other + * than '0'." + * + * For now, we just assert that the caller does this. We might want to + * automatically add a write to the workaround BO... + */ + assert(non_lri_post_sync_flags != 0); + } + + if (flags & PIPE_CONTROL_SYNC_GFDT) { + /* Project: All / Argument: Sync GFDT + * + * "Post-Sync Operation ([15:14] of DW1) must be set to something other + * than '0' or 0x2520[13] must be set." + * + * For now, we just assert that the caller does this. + */ + assert(non_lri_post_sync_flags != 0); + } + + if (GFX_VER >= 6 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) { + /* Project: SNB, IVB, HSW / Argument: TLB inv + * + * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1) + * must be set to something other than '0'." + * + * For now, we just assert that the caller does this. 
+ */ + assert(non_lri_post_sync_flags != 0); + } + + if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) { + /* Project: IVB+ / Argument: TLB inv + * + * "Requires stall bit ([20] of DW1) set." + * + * Also, from the PIPE_CONTROL instruction table: + * + * "Project: SKL+ + * Post Sync Operation or CS stall must be set to ensure a TLB + * invalidation occurs. Otherwise no cycle will occur to the TLB + * cache to invalidate." + * + * This is not a subset of the earlier rule, so there's nothing to do. + */ + flags |= PIPE_CONTROL_CS_STALL; + } + + /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT: + * + * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with + * only read-cache-invalidate bit(s) set, must have a CS_STALL bit set." + * + * Note that the kernel does CS stalls between batches, so we only need + * to count them within a batch. We currently naively count every 4, and + * don't skip the ones with only read-cache-invalidate bits set. This + * may or may not be a problem... + */ + if (GFX_VER == 7 && !(GFX_VERx10 == 75)) { + if (flags & PIPE_CONTROL_CS_STALL) { + /* If we're doing a CS stall, reset the counter and carry on. */ + batch->pipe_controls_since_last_cs_stall = 0; + } + + /* If this is the fourth pipe control without a CS stall, do one now. */ + if (++batch->pipe_controls_since_last_cs_stall == 4) { + batch->pipe_controls_since_last_cs_stall = 0; + flags |= PIPE_CONTROL_CS_STALL; + } + } + + /* "Stall" workarounds ---------------------------------------------- + * These have to come after the earlier ones because we may have added + * some additional CS stalls above. + */ + + if (flags & PIPE_CONTROL_CS_STALL) { + /* Project: PRE-SKL, VLV, CHV + * + * "[All Stepping][All SKUs]: + * + * One of the following must also be set: + * + * - Render Target Cache Flush Enable ([12] of DW1) + * - Depth Cache Flush Enable ([0] of DW1) + * - Stall at Pixel Scoreboard ([1] of DW1) + * - Depth Stall ([13] of DW1) + * - Post-Sync Operation ([13] of DW1) + * - DC Flush Enable ([5] of DW1)" + * + * If we don't already have one of those bits set, we choose to add + * "Stall at Pixel Scoreboard". Some of the other bits require a + * CS stall as a workaround (see above), which would send us into + * an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard" + * appears to be safe, so we choose that. + */ + const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_WRITE_IMMEDIATE | + PIPE_CONTROL_WRITE_DEPTH_COUNT | + PIPE_CONTROL_WRITE_TIMESTAMP | + PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_DATA_CACHE_FLUSH; + if (!(flags & wa_bits)) + flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; + } + + /* Emit --------------------------------------------------------------- */ + + if (INTEL_DEBUG & DEBUG_PIPE_CONTROL) { + fprintf(stderr, + " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n", + (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "", + (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "", + (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "", + (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "", + (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "", + (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "", + (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "", + (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "", + (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "", + (flags & PIPE_CONTROL_DEPTH_STALL) ? 
"ZStall " : "", + (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "", + (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "", + (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "", + (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "", + (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "", + (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ? + "SnapRes" : "", + (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ? + "ISPDis" : "", + (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "", + (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "", + (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "", + imm, reason); + } + + crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) { +#if GFX_VER >= 7 + pc.LRIPostSyncOperation = NoLRIOperation; + pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE; + pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH; +#endif +#if GFX_VER >= 6 + pc.StoreDataIndex = 0; + pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL; + pc.GlobalSnapshotCountReset = + flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET; + pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE; + pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR; + pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD; + pc.RenderTargetCacheFlushEnable = + flags & PIPE_CONTROL_RENDER_TARGET_FLUSH; + pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH; + pc.StateCacheInvalidationEnable = + flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE; + pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE; + pc.ConstantCacheInvalidationEnable = + flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE; +#else + pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH; +#endif + pc.PostSyncOperation = flags_to_post_sync_op(flags); + pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL; + pc.InstructionCacheInvalidateEnable = + flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE; + pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE; +#if GFX_VER >= 5 || GFX_VERx10 == 45 + pc.IndirectStatePointersDisable = + flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE; +#endif +#if GFX_VER >= 6 + pc.TextureCacheInvalidationEnable = + flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; +#elif GFX_VER == 5 || GFX_VERx10 == 45 + pc.TextureCacheFlushEnable = + flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; +#endif + pc.Address = ggtt_bo(bo, offset); + if (GFX_VER < 7 && bo) + pc.DestinationAddressType = DAT_GGTT; + pc.ImmediateData = imm; + } +} + +#if GFX_VER == 6 +void +genX(upload_urb)(struct crocus_batch *batch, + unsigned vs_size, + bool gs_present, + unsigned gs_size) +{ + struct crocus_context *ice = batch->ice; + int nr_vs_entries, nr_gs_entries; + int total_urb_size = ice->urb.size * 1024; /* in bytes */ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + + /* Calculate how many entries fit in each stage's section of the URB */ + if (gs_present) { + nr_vs_entries = (total_urb_size/2) / (vs_size * 128); + nr_gs_entries = (total_urb_size/2) / (gs_size * 128); + } else { + nr_vs_entries = total_urb_size / (vs_size * 128); + nr_gs_entries = 0; + } + + /* Then clamp to the maximum allowed by the hardware */ + if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX]) + nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX]; + + if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY]) + nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY]; + + /* Finally, both must 
be a multiple of 4 (see 3DSTATE_URB in the PRM). */ + ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4); + ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4); + + assert(ice->urb.nr_vs_entries >= + devinfo->urb.min_entries[MESA_SHADER_VERTEX]); + assert(ice->urb.nr_vs_entries % 4 == 0); + assert(ice->urb.nr_gs_entries % 4 == 0); + assert(vs_size <= 5); + assert(gs_size <= 5); + + crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) { + urb.VSNumberofURBEntries = ice->urb.nr_vs_entries; + urb.VSURBEntryAllocationSize = vs_size - 1; + + urb.GSNumberofURBEntries = ice->urb.nr_gs_entries; + urb.GSURBEntryAllocationSize = gs_size - 1; + }; + /* From the PRM Volume 2 part 1, section 1.4.7: + * + * Because of a urb corruption caused by allocating a previous gsunit’s + * urb entry to vsunit software is required to send a "GS NULL + * Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus + * a dummy DRAW call before any case where VS will be taking over GS URB + * space. + * + * It is not clear exactly what this means ("URB fence" is a command that + * doesn't exist on Gen6). So for now we just do a full pipeline flush as + * a workaround. + */ + if (ice->urb.gs_present && !gs_present) + crocus_emit_mi_flush(batch); + ice->urb.gs_present = gs_present; +} +#endif + +static void +crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch) +{ +} + +static void +crocus_emit_mi_report_perf_count(struct crocus_batch *batch, + struct crocus_bo *bo, + uint32_t offset_in_bytes, + uint32_t report_id) +{ +#if GFX_VER >= 7 + crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) { + mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes); + mi_rpc.ReportID = report_id; + } +#endif +} + +/** + * From the PRM, Volume 2a: + * + * "Indirect State Pointers Disable + * + * At the completion of the post-sync operation associated with this pipe + * control packet, the indirect state pointers in the hardware are + * considered invalid; the indirect pointers are not saved in the context. + * If any new indirect state commands are executed in the command stream + * while the pipe control is pending, the new indirect state commands are + * preserved. + * + * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context + * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant + * commands are only considered as Indirect State Pointers. Once ISP is + * issued in a context, SW must initialize by programming push constant + * commands for all the shaders (at least to zero length) before attempting + * any rendering operation for the same context." + * + * 3DSTATE_CONSTANT_* packets are restored during a context restore, + * even though they point to a BO that has been already unreferenced at + * the end of the previous batch buffer. This has been fine so far since + * we are protected by these scratch page (every address not covered by + * a BO should be pointing to the scratch page). But on CNL, it is + * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_* + * instruction. + * + * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the + * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a + * context restore, so the mentioned hang doesn't happen. However, + * software must program push constant commands for all stages prior to + * rendering anything, so we flag them as dirty. 
+ * + * Finally, we also make sure to stall at pixel scoreboard to make sure the + * constants have been loaded into the EUs prior to disable the push constants + * so that it doesn't hang a previous 3DPRIMITIVE. + */ +#if GFX_VER >= 7 +static void +gen7_emit_isp_disable(struct crocus_batch *batch) +{ + crocus_emit_raw_pipe_control(batch, "isp disable", + PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_CS_STALL, + NULL, 0, 0); + crocus_emit_raw_pipe_control(batch, "isp disable", + PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE | + PIPE_CONTROL_CS_STALL, + NULL, 0, 0); + + struct crocus_context *ice = batch->ice; + ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS | + CROCUS_STAGE_DIRTY_CONSTANTS_TCS | + CROCUS_STAGE_DIRTY_CONSTANTS_TES | + CROCUS_STAGE_DIRTY_CONSTANTS_GS | + CROCUS_STAGE_DIRTY_CONSTANTS_FS); +} +#endif + +#if GFX_VER >= 7 +static void +crocus_state_finish_batch(struct crocus_batch *batch) +{ +#if GFX_VERx10 == 75 + if (batch->name == CROCUS_BATCH_RENDER) { + crocus_emit_mi_flush(batch); + crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { + ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset; + } + + crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_CS_STALL); + } +#endif + gen7_emit_isp_disable(batch); +} +#endif + +static void +crocus_batch_reset_dirty(struct crocus_batch *batch) +{ + /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch + * as the old state batch won't still be available. + */ + batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER | + CROCUS_DIRTY_COLOR_CALC_STATE; + + batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS; + + batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS; + + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS; + + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS; + batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS; + batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT; + +#if GFX_VER >= 6 + /* SCISSOR_STATE */ + batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE; + batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT; + batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL; + +#endif +#if GFX_VER <= 5 + /* dirty the SF state on gen4/5 */ + batch->ice->state.dirty |= CROCUS_DIRTY_RASTER; + batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE; + batch->ice->state.dirty |= CROCUS_DIRTY_CLIP; + batch->ice->state.dirty |= CROCUS_DIRTY_WM; +#endif +#if GFX_VER >= 7 + /* Streamout dirty */ + batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT; + batch->ice->state.dirty 
|= CROCUS_DIRTY_SO_DECL_LIST; + batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS; +#endif +} + +#if GFX_VERx10 == 75 +struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice) +{ + return &ice->state.cso_rast->cso; +} +#endif + +#if GFX_VER >= 6 +static void update_so_strides(struct crocus_context *ice, + uint16_t *strides) +{ + for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + struct crocus_stream_output_target *so = (void *)ice->state.so_target[i]; + if (so) + so->stride = strides[i] * sizeof(uint32_t); + } +} +#endif + +static void +crocus_set_frontend_noop(struct pipe_context *ctx, bool enable) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + + if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) { + ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER; + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER; + } + + if (ice->batch_count == 1) + return; + + if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) { + ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE; + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE; + } +} + +void +genX(init_screen_state)(struct crocus_screen *screen) +{ + assert(screen->devinfo.verx10 == GFX_VERx10); + screen->vtbl.destroy_state = crocus_destroy_state; + screen->vtbl.init_render_context = crocus_init_render_context; + screen->vtbl.upload_render_state = crocus_upload_render_state; +#if GFX_VER == 7 + screen->vtbl.init_compute_context = crocus_init_compute_context; + screen->vtbl.upload_compute_state = crocus_upload_compute_state; +#endif + screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control; + screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count; + screen->vtbl.rebind_buffer = crocus_rebind_buffer; +#if GFX_VERx10 == 75 + screen->vtbl.load_register_reg32 = crocus_load_register_reg32; + screen->vtbl.load_register_reg64 = crocus_load_register_reg64; + screen->vtbl.load_register_imm32 = crocus_load_register_imm32; + screen->vtbl.load_register_imm64 = crocus_load_register_imm64; + screen->vtbl.store_data_imm32 = crocus_store_data_imm32; + screen->vtbl.store_data_imm64 = crocus_store_data_imm64; +#endif +#if GFX_VER >= 7 + screen->vtbl.load_register_mem32 = crocus_load_register_mem32; + screen->vtbl.load_register_mem64 = crocus_load_register_mem64; + screen->vtbl.copy_mem_mem = crocus_copy_mem_mem; + screen->vtbl.create_so_decl_list = crocus_create_so_decl_list; +#endif + screen->vtbl.update_surface_base_address = crocus_update_surface_base_address; +#if GFX_VER >= 6 + screen->vtbl.store_register_mem32 = crocus_store_register_mem32; + screen->vtbl.store_register_mem64 = crocus_store_register_mem64; +#endif + screen->vtbl.populate_vs_key = crocus_populate_vs_key; + screen->vtbl.populate_tcs_key = crocus_populate_tcs_key; + screen->vtbl.populate_tes_key = crocus_populate_tes_key; + screen->vtbl.populate_gs_key = crocus_populate_gs_key; + screen->vtbl.populate_fs_key = crocus_populate_fs_key; + screen->vtbl.populate_cs_key = crocus_populate_cs_key; + screen->vtbl.lost_genx_state = crocus_lost_genx_state; +#if GFX_VER >= 7 + screen->vtbl.finish_batch = crocus_state_finish_batch; +#endif +#if GFX_VER <= 5 + screen->vtbl.upload_urb_fence = crocus_upload_urb_fence; + screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence; +#endif + screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty; + screen->vtbl.translate_prim_type = translate_prim_type; +#if GFX_VER >= 6 + screen->vtbl.update_so_strides = update_so_strides; + 
screen->vtbl.get_so_offset = crocus_get_so_offset; +#endif + + genX(init_blt)(screen); +} + +void +genX(init_state)(struct crocus_context *ice) +{ + struct pipe_context *ctx = &ice->ctx; + + ctx->create_blend_state = crocus_create_blend_state; + ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state; + ctx->create_rasterizer_state = crocus_create_rasterizer_state; + ctx->create_sampler_state = crocus_create_sampler_state; + ctx->create_sampler_view = crocus_create_sampler_view; + ctx->create_surface = crocus_create_surface; + ctx->create_vertex_elements_state = crocus_create_vertex_elements; + ctx->bind_blend_state = crocus_bind_blend_state; + ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state; + ctx->bind_sampler_states = crocus_bind_sampler_states; + ctx->bind_rasterizer_state = crocus_bind_rasterizer_state; + ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state; + ctx->delete_blend_state = crocus_delete_state; + ctx->delete_depth_stencil_alpha_state = crocus_delete_state; + ctx->delete_rasterizer_state = crocus_delete_state; + ctx->delete_sampler_state = crocus_delete_state; + ctx->delete_vertex_elements_state = crocus_delete_state; + ctx->set_blend_color = crocus_set_blend_color; + ctx->set_clip_state = crocus_set_clip_state; + ctx->set_constant_buffer = crocus_set_constant_buffer; + ctx->set_shader_buffers = crocus_set_shader_buffers; + ctx->set_shader_images = crocus_set_shader_images; + ctx->set_sampler_views = crocus_set_sampler_views; + ctx->set_tess_state = crocus_set_tess_state; + ctx->set_framebuffer_state = crocus_set_framebuffer_state; + ctx->set_polygon_stipple = crocus_set_polygon_stipple; + ctx->set_sample_mask = crocus_set_sample_mask; + ctx->set_scissor_states = crocus_set_scissor_states; + ctx->set_stencil_ref = crocus_set_stencil_ref; + ctx->set_vertex_buffers = crocus_set_vertex_buffers; + ctx->set_viewport_states = crocus_set_viewport_states; + ctx->sampler_view_destroy = crocus_sampler_view_destroy; + ctx->surface_destroy = crocus_surface_destroy; + ctx->draw_vbo = crocus_draw_vbo; + ctx->launch_grid = crocus_launch_grid; + + ctx->set_frontend_noop = crocus_set_frontend_noop; + +#if GFX_VER >= 6 + ctx->create_stream_output_target = crocus_create_stream_output_target; + ctx->stream_output_target_destroy = crocus_stream_output_target_destroy; + ctx->set_stream_output_targets = crocus_set_stream_output_targets; +#endif + + ice->state.dirty = ~0ull; + ice->state.stage_dirty = ~0ull; + + ice->state.statistics_counters_enabled = true; + + ice->state.sample_mask = 0xff; + ice->state.num_viewports = 1; + ice->state.prim_mode = PIPE_PRIM_MAX; + ice->state.genx = calloc(1, sizeof(struct crocus_genx_state)); + ice->draw.derived_params.drawid = -1; + + /* Default all scissor rectangles to be empty regions. 
*/ + for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) { + ice->state.scissors[i] = (struct pipe_scissor_state) { + .minx = 1, .maxx = 0, .miny = 1, .maxy = 0, + }; + } +} diff --git a/src/gallium/drivers/crocus/crocus_todo.txt b/src/gallium/drivers/crocus/crocus_todo.txt new file mode 100644 index 00000000000..1a6d3c9a710 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_todo.txt @@ -0,0 +1,16 @@ +Quick TODO list from what I can see: + +General: +Re-emit SURFACE_STATE_BASE_ADDRESS at the top of every batch + +Gen4: +rgb32 issue + +Gen5: +rgb32 issue + +Gen6: +vec4 push constants + +Gen7: + diff --git a/src/gallium/drivers/crocus/driinfo_crocus.h b/src/gallium/drivers/crocus/driinfo_crocus.h new file mode 100644 index 00000000000..829bf7f818c --- /dev/null +++ b/src/gallium/drivers/crocus/driinfo_crocus.h @@ -0,0 +1,11 @@ +// crocus specific driconf options + +DRI_CONF_SECTION_DEBUG + DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION(false) + DRI_CONF_DISABLE_THROTTLING(false) + DRI_CONF_ALWAYS_FLUSH_CACHE(false) +DRI_CONF_SECTION_END + +DRI_CONF_SECTION_PERFORMANCE + DRI_CONF_OPT_E(bo_reuse, 1, 0, 1, "Buffer object reuse",) +DRI_CONF_SECTION_END diff --git a/src/gallium/drivers/crocus/gen4_blorp_exec.h b/src/gallium/drivers/crocus/gen4_blorp_exec.h new file mode 100644 index 00000000000..bc19a1b39fc --- /dev/null +++ b/src/gallium/drivers/crocus/gen4_blorp_exec.h @@ -0,0 +1,190 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +static inline struct blorp_address +dynamic_state_address(struct blorp_batch *blorp_batch, uint32_t offset) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + + return (struct blorp_address) { + .buffer = batch->state.bo, + .offset = offset, + }; + +} + +static inline struct blorp_address +instruction_state_address(struct blorp_batch *blorp_batch, uint32_t offset) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + + return (struct blorp_address) { + .buffer = batch->ice->shaders.cache_bo, + .offset = offset, + }; +} + +static struct blorp_address +blorp_emit_vs_state(struct blorp_batch *blorp_batch) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + + uint32_t offset; + blorp_emit_dynamic(blorp_batch, GENX(VS_STATE), vs, 64, &offset) { + vs.Enable = false; + vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1; +#if GFX_VER == 5 + vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> 2; +#else + vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries; +#endif + } + + return dynamic_state_address(blorp_batch, offset); +} + +static struct blorp_address +blorp_emit_sf_state(struct blorp_batch *blorp_batch, + const struct blorp_params *params) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + const struct brw_sf_prog_data *prog_data = params->sf_prog_data; + + uint32_t offset; + blorp_emit_dynamic(blorp_batch, GENX(SF_STATE), sf, 64, &offset) { +#if GFX_VER == 4 + sf.KernelStartPointer = + instruction_state_address(blorp_batch, params->sf_prog_kernel); +#else + sf.KernelStartPointer = params->sf_prog_kernel; +#endif + sf.GRFRegisterCount = DIV_ROUND_UP(prog_data->total_grf, 16) - 1; + sf.VertexURBEntryReadLength = prog_data->urb_read_length; + sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET; + sf.DispatchGRFStartRegisterForURBData = 3; + sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1; + sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries; + +#if GFX_VER == 5 + sf.MaximumNumberofThreads = MIN2(48, batch->ice->urb.nr_sf_entries) - 1; +#else + sf.MaximumNumberofThreads = MIN2(24, batch->ice->urb.nr_sf_entries) - 1; +#endif + sf.ViewportTransformEnable = false; + + sf.CullMode = CULLMODE_NONE; + } + + return dynamic_state_address(blorp_batch, offset); +} + +static struct blorp_address +blorp_emit_wm_state(struct blorp_batch *blorp_batch, + const struct blorp_params *params) +{ + const struct brw_wm_prog_data *prog_data = params->wm_prog_data; + + uint32_t offset; + blorp_emit_dynamic(blorp_batch, GENX(WM_STATE), wm, 64, &offset) { + if (params->src.enabled) { + /* Iron Lake can't do sampler prefetch */ + wm.SamplerCount = (GFX_VER != 5); + wm.BindingTableEntryCount = 2; + uint32_t sampler = blorp_emit_sampler_state(blorp_batch); + wm.SamplerStatePointer = dynamic_state_address(blorp_batch, sampler); + } + + if (prog_data) { + wm.DispatchGRFStartRegisterForConstantSetupData0 = + prog_data->base.dispatch_grf_start_reg; + wm.SetupURBEntryReadLength = prog_data->num_varying_inputs * 2; + wm.SetupURBEntryReadOffset = 0; + + wm.DepthCoefficientURBReadOffset = 1; + wm.PixelShaderKillsPixel = prog_data->uses_kill; + wm.ThreadDispatchEnable = true; + wm.EarlyDepthTestEnable = true; + + wm._8PixelDispatchEnable = prog_data->dispatch_8; + wm._16PixelDispatchEnable = prog_data->dispatch_16; + wm._32PixelDispatchEnable = prog_data->dispatch_32; + +#if GFX_VER == 4 + wm.KernelStartPointer0 = + instruction_state_address(blorp_batch, params->wm_prog_kernel); + wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0); 
+#else + wm.KernelStartPointer0 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, wm, 0); + wm.KernelStartPointer1 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, wm, 1); + wm.KernelStartPointer2 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, wm, 2); + wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0); + wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(prog_data, wm, 1); + wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(prog_data, wm, 2); +#endif + } + + wm.MaximumNumberofThreads = + blorp_batch->blorp->compiler->devinfo->max_wm_threads - 1; + } + + return dynamic_state_address(blorp_batch, offset); +} + +static struct blorp_address +blorp_emit_color_calc_state(struct blorp_batch *blorp_batch) +{ + uint32_t cc_viewport = blorp_emit_cc_viewport(blorp_batch); + + uint32_t offset; + blorp_emit_dynamic(blorp_batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) { + cc.CCViewportStatePointer = dynamic_state_address(blorp_batch, cc_viewport); + } + + return dynamic_state_address(blorp_batch, offset); +} + +static void +blorp_emit_pipeline(struct blorp_batch *blorp_batch, + const struct blorp_params *params) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + + emit_urb_config(blorp_batch, params, NULL); + + blorp_emit(blorp_batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) { + pp.PointertoVSState = blorp_emit_vs_state(blorp_batch); + pp.GSEnable = false; + pp.ClipEnable = false; + pp.PointertoSFState = blorp_emit_sf_state(blorp_batch, params); + pp.PointertoWMState = blorp_emit_wm_state(blorp_batch, params); + pp.PointertoColorCalcState = blorp_emit_color_calc_state(blorp_batch); + } + + batch->screen->vtbl.upload_urb_fence(batch); + + blorp_emit(blorp_batch, GENX(CS_URB_STATE), curb); + blorp_emit(blorp_batch, GENX(CONSTANT_BUFFER), curb); +} diff --git a/src/gallium/drivers/crocus/meson.build b/src/gallium/drivers/crocus/meson.build new file mode 100644 index 00000000000..2bdb1f2cfb5 --- /dev/null +++ b/src/gallium/drivers/crocus/meson.build @@ -0,0 +1,90 @@ +# Copyright © 2017-2019 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +files_libcrocus = files( + 'gen4_blorp_exec.h', + 'driinfo_crocus.h', + 'crocus_batch.c', + 'crocus_batch.h', + 'crocus_blit.c', + 'crocus_bufmgr.c', + 'crocus_bufmgr.h', + 'crocus_clear.c', + 'crocus_context.c', + 'crocus_context.h', + 'crocus_draw.c', + 'crocus_fence.c', + 'crocus_fence.h', + 'crocus_fine_fence.c', + 'crocus_fine_fence.h', + 'crocus_formats.c', + 'crocus_genx_macros.h', + 'crocus_genx_protos.h', + 'crocus_monitor.c', + 'crocus_pipe.h', + 'crocus_pipe_control.c', + 'crocus_program.c', + 'crocus_program_cache.c', + 'crocus_resolve.c', + 'crocus_resource.c', + 'crocus_resource.h', + 'crocus_screen.c', + 'crocus_screen.h', + 'crocus_disk_cache.c', +) + +crocus_per_hw_ver_libs = [] +foreach v : ['40', '45', '50', '60', '70', '75'] + crocus_per_hw_ver_libs += static_library( + 'crocus_per_hw_ver@0@'.format(v), + ['crocus_blorp.c', 'crocus_query.c', 'crocus_state.c', 'crocus_blt.c', gen_xml_pack], + include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_intel], + c_args : [ + no_override_init_args, c_sse2_args, + '-DGFX_VERx10=@0@'.format(v), + ], + gnu_symbol_visibility : 'hidden', + dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], + ) +endforeach + +libcrocus = static_library( + 'crocus', + [files_libcrocus, gen_xml_pack], + include_directories : [ + inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_intel, + inc_gallium_drivers, + # these should not be necessary, but main/macros.h... + inc_mesa, inc_mapi + ], + c_args : [c_sse2_args], + cpp_args : [c_sse2_args], + gnu_symbol_visibility : 'hidden', + dependencies : [dep_libdrm, dep_valgrind, idep_genxml, idep_libintel_common, idep_nir_headers], + link_with : [ + crocus_per_hw_ver_libs, libintel_compiler, libintel_dev, libisl, + libblorp, libintel_perf + ], +) + +driver_crocus = declare_dependency( + compile_args : '-DGALLIUM_CROCUS', + link_with : [libcrocus, libcrocuswinsys], +) diff --git a/src/gallium/meson.build b/src/gallium/meson.build index 3b3bb07f1de..e64d7399ae1 100644 --- a/src/gallium/meson.build +++ b/src/gallium/meson.build @@ -129,6 +129,12 @@ if with_gallium_tegra else driver_tegra = declare_dependency() endif +if with_gallium_crocus + subdir('winsys/crocus/drm') + subdir('drivers/crocus') +else + driver_crocus = declare_dependency() +endif if with_gallium_iris subdir('winsys/iris/drm') subdir('drivers/iris') diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build index daef41613db..cc6c805641b 100644 --- a/src/gallium/targets/d3dadapter9/meson.build +++ b/src/gallium/targets/d3dadapter9/meson.build @@ -64,7 +64,7 @@ libgallium_nine = shared_library( dep_selinux, dep_libdrm, dep_llvm, dep_thread, idep_xmlconfig, idep_mesautil, idep_nir, driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau, - driver_i915, driver_svga, driver_iris + driver_i915, driver_svga, driver_iris, driver_crocus ], name_prefix : '', version : '.'.join(nine_version), diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build index 90b48bf508e..e4cc199b363 100644 --- a/src/gallium/targets/dri/meson.build +++ b/src/gallium/targets/dri/meson.build @@ -58,7 +58,7 @@ libgallium_dri = shared_library( driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv, driver_tegra, driver_i915, driver_svga, driver_virgl, driver_swr, driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12, - driver_asahi + driver_asahi, driver_crocus ], # Will be deleted during installation, 
see install_megadrivers.py install : true, @@ -98,6 +98,7 @@ foreach d : [[with_gallium_kmsro, [ [with_gallium_panfrost, 'panfrost_dri.so'], [with_gallium_etnaviv, 'etnaviv_dri.so'], [with_gallium_tegra, 'tegra_dri.so'], + [with_gallium_crocus, 'crocus_dri.so'], [with_gallium_iris, 'iris_dri.so'], [with_gallium_i915, 'i915_dri.so'], [with_gallium_r300, 'r300_dri.so'], diff --git a/src/gallium/targets/dri/target.c b/src/gallium/targets/dri/target.c index 9df8da61803..3c7c2325f17 100644 --- a/src/gallium/targets/dri/target.c +++ b/src/gallium/targets/dri/target.c @@ -42,6 +42,10 @@ DEFINE_LOADER_DRM_ENTRYPOINT(i915) DEFINE_LOADER_DRM_ENTRYPOINT(iris) #endif +#if defined(GALLIUM_CROCUS) +DEFINE_LOADER_DRM_ENTRYPOINT(crocus) +#endif + #if defined(GALLIUM_NOUVEAU) DEFINE_LOADER_DRM_ENTRYPOINT(nouveau) #endif diff --git a/src/gallium/winsys/crocus/drm/crocus_drm_public.h b/src/gallium/winsys/crocus/drm/crocus_drm_public.h new file mode 100644 index 00000000000..614543136be --- /dev/null +++ b/src/gallium/winsys/crocus/drm/crocus_drm_public.h @@ -0,0 +1,33 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef CROCUS_DRM_PUBLIC_H +#define CROCUS_DRM_PUBLIC_H + +struct pipe_screen; +struct pipe_screen_config; + +struct pipe_screen * +crocus_drm_screen_create(int drm_fd, const struct pipe_screen_config *config); + +#endif /* CROCUS_DRM_PUBLIC_H */ diff --git a/src/gallium/winsys/crocus/drm/crocus_drm_winsys.c b/src/gallium/winsys/crocus/drm/crocus_drm_winsys.c new file mode 100644 index 00000000000..ffeeba567ac --- /dev/null +++ b/src/gallium/winsys/crocus/drm/crocus_drm_winsys.c @@ -0,0 +1,39 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <fcntl.h> +#include <unistd.h> + +#include "util/os_file.h" + +#include "crocus_drm_public.h" +#include "crocus/crocus_screen.h" + +struct pipe_screen * +crocus_drm_screen_create(int fd, const struct pipe_screen_config *config) +{ + int newfd = os_dupfd_cloexec(fd); + if (newfd < 0) + return NULL; + return crocus_screen_create(newfd, config); +} diff --git a/src/gallium/winsys/crocus/drm/meson.build b/src/gallium/winsys/crocus/drm/meson.build new file mode 100644 index 00000000000..4e82fe52437 --- /dev/null +++ b/src/gallium/winsys/crocus/drm/meson.build @@ -0,0 +1,29 @@ +# Copyright © 2017 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libcrocuswinsys = static_library( + 'crocuswinsys', + files('crocus_drm_winsys.c'), + include_directories : [ + inc_src, inc_include, + inc_gallium, inc_gallium_aux, inc_gallium_drivers, + ], + gnu_symbol_visibility : 'hidden', +)
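
Usage note (not part of the patch): the winsys and driver libraries above are only built when the gallium-drivers meson option includes "crocus", which is what would set with_gallium_crocus and the GALLIUM_CROCUS define used in the target code. The option plumbing lives in the top-level meson_options.txt, outside this src/gallium-limited diff, so treat the configure sketch below as a hypothetical invocation under that assumption:

    # Hypothetical Mesa configure/build once the full series is in place;
    # assumes "crocus" is an accepted value of the gallium-drivers option.
    meson setup build/ -Dgallium-drivers=crocus,iris,swrast
    ninja -C build/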