Diffstat (limited to 'src/gallium')
47 files changed, 28494 insertions, 2 deletions
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c index 8147c3ca346..ca5bf121a88 100644 --- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c @@ -70,6 +70,7 @@ static const struct pipe_loader_ops pipe_loader_drm_ops; static const struct drm_driver_descriptor *driver_descriptors[] = { &i915_driver_descriptor, &iris_driver_descriptor, + &crocus_driver_descriptor, &nouveau_driver_descriptor, &r300_driver_descriptor, &r600_driver_descriptor, diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h index 6bab07a40e7..ff4621e1a88 100644 --- a/src/gallium/auxiliary/target-helpers/drm_helper.h +++ b/src/gallium/auxiliary/target-helpers/drm_helper.h @@ -112,6 +112,26 @@ DRM_DRIVER_DESCRIPTOR(iris, iris_driconf, ARRAY_SIZE(iris_driconf)) DRM_DRIVER_DESCRIPTOR_STUB(iris) #endif +#ifdef GALLIUM_CROCUS +#include "crocus/drm/crocus_drm_public.h" + +static struct pipe_screen * +pipe_crocus_create_screen(int fd, const struct pipe_screen_config *config) +{ + struct pipe_screen *screen; + + screen = crocus_drm_screen_create(fd, config); + return screen ? debug_screen_wrap(screen) : NULL; +} + +const driOptionDescription crocus_driconf[] = { + #include "crocus/driinfo_crocus.h" +}; +DRM_DRIVER_DESCRIPTOR(crocus, crocus_driconf, ARRAY_SIZE(crocus_driconf)) +#else +DRM_DRIVER_DESCRIPTOR_STUB(crocus) +#endif + #ifdef GALLIUM_NOUVEAU #include "nouveau/drm/nouveau_drm_public.h" diff --git a/src/gallium/auxiliary/target-helpers/drm_helper_public.h b/src/gallium/auxiliary/target-helpers/drm_helper_public.h index 5fd3084dfdb..478e72b8525 100644 --- a/src/gallium/auxiliary/target-helpers/drm_helper_public.h +++ b/src/gallium/auxiliary/target-helpers/drm_helper_public.h @@ -6,6 +6,7 @@ struct pipe_screen_config; extern const struct drm_driver_descriptor i915_driver_descriptor; extern const struct drm_driver_descriptor iris_driver_descriptor; +extern const struct drm_driver_descriptor crocus_driver_descriptor; extern const struct drm_driver_descriptor nouveau_driver_descriptor; extern const struct drm_driver_descriptor r300_driver_descriptor; extern const struct drm_driver_descriptor r600_driver_descriptor; diff --git a/src/gallium/drivers/crocus/crocus_batch.c b/src/gallium/drivers/crocus/crocus_batch.c new file mode 100644 index 00000000000..63cfe282de4 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_batch.c @@ -0,0 +1,1047 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_batch.c + * + * Batchbuffer and command submission module. + * + * Every API draw call results in a number of GPU commands, which we + * collect into a "batch buffer". Typically, many draw calls are grouped + * into a single batch to amortize command submission overhead. + * + * We submit batches to the kernel using the I915_GEM_EXECBUFFER2 ioctl. + * One critical piece of data is the "validation list", which contains a + * list of the buffer objects (BOs) which the commands in the GPU need. + * The kernel will make sure these are resident and pinned at the correct + * virtual memory address before executing our batch. If a BO is not in + * the validation list, it effectively does not exist, so take care. + */ + +#include "crocus_batch.h" +#include "crocus_bufmgr.h" +#include "crocus_context.h" +#include "crocus_fence.h" + +#include "drm-uapi/i915_drm.h" + +#include "intel/common/intel_gem.h" +#include "main/macros.h" +#include "util/hash_table.h" +#include "util/set.h" +#include "util/u_upload_mgr.h" + +#include <errno.h> +#include <xf86drm.h> + +#if HAVE_VALGRIND +#include <memcheck.h> +#include <valgrind.h> +#define VG(x) x +#else +#define VG(x) +#endif + +#define FILE_DEBUG_FLAG DEBUG_BUFMGR + +/* Terminating the batch takes either 4 bytes for MI_BATCH_BUFFER_END + * or 12 bytes for MI_BATCH_BUFFER_START (when chaining). Plus, we may + * need an extra 4 bytes to pad out to the nearest QWord. So reserve 16. + */ +#define BATCH_RESERVED(devinfo) ((devinfo)->is_haswell ? 32 : 16) + +static void crocus_batch_reset(struct crocus_batch *batch); + +static unsigned +num_fences(struct crocus_batch *batch) +{ + return util_dynarray_num_elements(&batch->exec_fences, + struct drm_i915_gem_exec_fence); +} + +/** + * Debugging code to dump the fence list, used by INTEL_DEBUG=submit. + */ +static void +dump_fence_list(struct crocus_batch *batch) +{ + fprintf(stderr, "Fence list (length %u): ", num_fences(batch)); + + util_dynarray_foreach(&batch->exec_fences, + struct drm_i915_gem_exec_fence, f) { + fprintf(stderr, "%s%u%s ", + (f->flags & I915_EXEC_FENCE_WAIT) ? "..." : "", + f->handle, + (f->flags & I915_EXEC_FENCE_SIGNAL) ? "!" : ""); + } + + fprintf(stderr, "\n"); +} + +/** + * Debugging code to dump the validation list, used by INTEL_DEBUG=submit. + */ +static void +dump_validation_list(struct crocus_batch *batch) +{ + fprintf(stderr, "Validation list (length %d):\n", batch->exec_count); + + for (int i = 0; i < batch->exec_count; i++) { + uint64_t flags = batch->validation_list[i].flags; + assert(batch->validation_list[i].handle == + batch->exec_bos[i]->gem_handle); + fprintf(stderr, + "[%2d]: %2d %-14s @ 0x%016llx (%" PRIu64 "B)\t %2d refs %s\n", i, + batch->validation_list[i].handle, batch->exec_bos[i]->name, + batch->validation_list[i].offset, batch->exec_bos[i]->size, + batch->exec_bos[i]->refcount, + (flags & EXEC_OBJECT_WRITE) ? " (write)" : ""); + } +} + +/** + * Return BO information to the batch decoder (for debugging). 
+ */ +static struct intel_batch_decode_bo +decode_get_bo(void *v_batch, bool ppgtt, uint64_t address) +{ + struct crocus_batch *batch = v_batch; + + for (int i = 0; i < batch->exec_count; i++) { + struct crocus_bo *bo = batch->exec_bos[i]; + /* The decoder zeroes out the top 16 bits, so we need to as well */ + uint64_t bo_address = bo->gtt_offset & (~0ull >> 16); + + if (address >= bo_address && address < bo_address + bo->size) { + return (struct intel_batch_decode_bo){ + .addr = address, + .size = bo->size, + .map = crocus_bo_map(batch->dbg, bo, MAP_READ) + + (address - bo_address), + }; + } + } + + return (struct intel_batch_decode_bo) { }; +} + +static unsigned +decode_get_state_size(void *v_batch, uint64_t address, + uint64_t base_address) +{ + struct crocus_batch *batch = v_batch; + + /* The decoder gives us offsets from a base address, which is not great. + * Binding tables are relative to surface state base address, and other + * state is relative to dynamic state base address. These could alias, + * but in practice it's unlikely because surface offsets are always in + * the [0, 64K) range, and we assign dynamic state addresses starting at + * the top of the 4GB range. We should fix this but it's likely good + * enough for now. + */ + unsigned size = (uintptr_t) + _mesa_hash_table_u64_search(batch->state_sizes, address - base_address); + + return size; +} + +/** + * Decode the current batch. + */ +static void +decode_batch(struct crocus_batch *batch) +{ + void *map = crocus_bo_map(batch->dbg, batch->exec_bos[0], MAP_READ); + intel_print_batch(&batch->decoder, map, batch->primary_batch_size, + batch->exec_bos[0]->gtt_offset, false); +} + +static void +init_reloc_list(struct crocus_reloc_list *rlist, int count) +{ + rlist->reloc_count = 0; + rlist->reloc_array_size = count; + rlist->relocs = malloc(rlist->reloc_array_size * + sizeof(struct drm_i915_gem_relocation_entry)); +} + +void +crocus_init_batch(struct crocus_context *ice, + enum crocus_batch_name name, + int priority) +{ + struct crocus_batch *batch = &ice->batches[name]; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct intel_device_info *devinfo = &screen->devinfo; + + batch->ice = ice; + batch->screen = screen; + batch->dbg = &ice->dbg; + batch->reset = &ice->reset; + batch->name = name; + batch->contains_fence_signal = false; + + if (devinfo->ver >= 7) { + batch->fine_fences.uploader = + u_upload_create(&ice->ctx, 4096, PIPE_BIND_CUSTOM, + PIPE_USAGE_STAGING, 0); + } + crocus_fine_fence_init(batch); + + batch->hw_ctx_id = crocus_create_hw_context(screen->bufmgr); + assert(batch->hw_ctx_id); + + crocus_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority); + + batch->valid_reloc_flags = EXEC_OBJECT_WRITE; + if (devinfo->ver == 6) + batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT; + + if (INTEL_DEBUG & DEBUG_BATCH) { + /* The shadow doesn't get relocs written so state decode fails. 
*/ + batch->use_shadow_copy = false; + } else + batch->use_shadow_copy = !devinfo->has_llc; + + util_dynarray_init(&batch->exec_fences, ralloc_context(NULL)); + util_dynarray_init(&batch->syncobjs, ralloc_context(NULL)); + + init_reloc_list(&batch->command.relocs, 250); + init_reloc_list(&batch->state.relocs, 250); + + batch->exec_count = 0; + batch->exec_array_size = 100; + batch->exec_bos = + malloc(batch->exec_array_size * sizeof(batch->exec_bos[0])); + batch->validation_list = + malloc(batch->exec_array_size * sizeof(batch->validation_list[0])); + + batch->cache.render = _mesa_hash_table_create(NULL, NULL, + _mesa_key_pointer_equal); + batch->cache.depth = _mesa_set_create(NULL, NULL, + _mesa_key_pointer_equal); + + memset(batch->other_batches, 0, sizeof(batch->other_batches)); + + for (int i = 0, j = 0; i < ice->batch_count; i++) { + if (i != name) + batch->other_batches[j++] = &ice->batches[i]; + } + + if (INTEL_DEBUG & DEBUG_BATCH) { + + batch->state_sizes = _mesa_hash_table_u64_create(NULL); + const unsigned decode_flags = + INTEL_BATCH_DECODE_FULL | + ((INTEL_DEBUG & DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) | + INTEL_BATCH_DECODE_OFFSETS | INTEL_BATCH_DECODE_FLOATS; + + intel_batch_decode_ctx_init(&batch->decoder, &screen->devinfo, stderr, + decode_flags, NULL, decode_get_bo, + decode_get_state_size, batch); + batch->decoder.max_vbo_decoded_lines = 32; + } + + crocus_batch_reset(batch); +} + +static struct drm_i915_gem_exec_object2 * +find_validation_entry(struct crocus_batch *batch, struct crocus_bo *bo) +{ + unsigned index = READ_ONCE(bo->index); + + if (index < batch->exec_count && batch->exec_bos[index] == bo) + return &batch->validation_list[index]; + + /* May have been shared between multiple active batches */ + for (index = 0; index < batch->exec_count; index++) { + if (batch->exec_bos[index] == bo) + return &batch->validation_list[index]; + } + + return NULL; +} + +static void +ensure_exec_obj_space(struct crocus_batch *batch, uint32_t count) +{ + while (batch->exec_count + count > batch->exec_array_size) { + batch->exec_array_size *= 2; + batch->exec_bos = realloc( + batch->exec_bos, batch->exec_array_size * sizeof(batch->exec_bos[0])); + batch->validation_list = + realloc(batch->validation_list, + batch->exec_array_size * sizeof(batch->validation_list[0])); + } +} + +static struct drm_i915_gem_exec_object2 * +crocus_use_bo(struct crocus_batch *batch, struct crocus_bo *bo, bool writable) +{ + assert(bo->bufmgr == batch->command.bo->bufmgr); + + if (bo == batch->ice->workaround_bo) + writable = false; + + struct drm_i915_gem_exec_object2 *existing_entry = + find_validation_entry(batch, bo); + + if (existing_entry) { + /* The BO is already in the validation list; mark it writable */ + if (writable) + existing_entry->flags |= EXEC_OBJECT_WRITE; + return existing_entry; + } + + if (bo != batch->command.bo && bo != batch->state.bo) { + /* This is the first time our batch has seen this BO. Before we use it, + * we may need to flush and synchronize with other batches. + */ + for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) { + + if (!batch->other_batches[b]) + continue; + struct drm_i915_gem_exec_object2 *other_entry = + find_validation_entry(batch->other_batches[b], bo); + + /* If the buffer is referenced by another batch, and either batch + * intends to write it, then flush the other batch and synchronize. + * + * Consider these cases: + * + * 1. They read, we read => No synchronization required. + * 2. 
They read, we write => Synchronize (they need the old value) + * 3. They write, we read => Synchronize (we need their new value) + * 4. They write, we write => Synchronize (order writes) + * + * The read/read case is very common, as multiple batches usually + * share a streaming state buffer or shader assembly buffer, and + * we want to avoid synchronizing in this case. + */ + if (other_entry && + ((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) { + crocus_batch_flush(batch->other_batches[b]); + crocus_batch_add_syncobj(batch, + batch->other_batches[b]->last_fence->syncobj, + I915_EXEC_FENCE_WAIT); + } + } + } + + /* Bump the ref count since the batch is now using this bo. */ + crocus_bo_reference(bo); + + ensure_exec_obj_space(batch, 1); + + batch->validation_list[batch->exec_count] = + (struct drm_i915_gem_exec_object2) { + .handle = bo->gem_handle, + .offset = bo->gtt_offset, + .flags = bo->kflags | (writable ? EXEC_OBJECT_WRITE : 0), + }; + + bo->index = batch->exec_count; + batch->exec_bos[batch->exec_count] = bo; + batch->aperture_space += bo->size; + + batch->exec_count++; + + return &batch->validation_list[batch->exec_count - 1]; +} + +static uint64_t +emit_reloc(struct crocus_batch *batch, + struct crocus_reloc_list *rlist, uint32_t offset, + struct crocus_bo *target, int32_t target_offset, + unsigned int reloc_flags) +{ + assert(target != NULL); + + bool writable = reloc_flags & RELOC_WRITE; + + struct drm_i915_gem_exec_object2 *entry = + crocus_use_bo(batch, target, writable); + + if (rlist->reloc_count == rlist->reloc_array_size) { + rlist->reloc_array_size *= 2; + rlist->relocs = realloc(rlist->relocs, + rlist->reloc_array_size * + sizeof(struct drm_i915_gem_relocation_entry)); + } + + if (reloc_flags & RELOC_32BIT) { + /* Restrict this buffer to the low 32 bits of the address space. + * + * Altering the validation list flags restricts it for this batch, + * but we also alter the BO's kflags to restrict it permanently + * (until the BO is destroyed and put back in the cache). Buffers + * may stay bound across batches, and we want keep it constrained. + */ + target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + + /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. 
*/ + reloc_flags &= ~RELOC_32BIT; + } + + if (reloc_flags) + entry->flags |= reloc_flags & batch->valid_reloc_flags; + + rlist->relocs[rlist->reloc_count++] = + (struct drm_i915_gem_relocation_entry) { + .offset = offset, + .delta = target_offset, + .target_handle = target->index, + .presumed_offset = entry->offset, + }; + + /* Using the old buffer offset, write in what the right data would be, in + * case the buffer doesn't move and we can short-circuit the relocation + * processing in the kernel + */ + return entry->offset + target_offset; +} + +uint64_t +crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset, + struct crocus_bo *target, uint32_t target_offset, + unsigned int reloc_flags) +{ + assert(batch_offset <= batch->command.bo->size - sizeof(uint32_t)); + + return emit_reloc(batch, &batch->command.relocs, batch_offset, + target, target_offset, reloc_flags); +} + +uint64_t +crocus_state_reloc(struct crocus_batch *batch, uint32_t state_offset, + struct crocus_bo *target, uint32_t target_offset, + unsigned int reloc_flags) +{ + assert(state_offset <= batch->state.bo->size - sizeof(uint32_t)); + + return emit_reloc(batch, &batch->state.relocs, state_offset, + target, target_offset, reloc_flags); +} + +static void +recreate_growing_buffer(struct crocus_batch *batch, + struct crocus_growing_bo *grow, + const char *name, unsigned size) +{ + struct crocus_screen *screen = batch->screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + grow->bo = crocus_bo_alloc(bufmgr, name, size); + grow->bo->kflags |= EXEC_OBJECT_CAPTURE; + grow->partial_bo = NULL; + grow->partial_bo_map = NULL; + grow->partial_bytes = 0; + if (batch->use_shadow_copy) + grow->map = realloc(grow->map, grow->bo->size); + else + grow->map = crocus_bo_map(NULL, grow->bo, MAP_READ | MAP_WRITE); + grow->map_next = grow->map; +} + +static void +create_batch(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + + recreate_growing_buffer(batch, &batch->command, + "command buffer", + BATCH_SZ + BATCH_RESERVED(&screen->devinfo)); + + crocus_use_bo(batch, batch->command.bo, false); + + recreate_growing_buffer(batch, &batch->state, + "state buffer", + STATE_SZ); + + batch->state.used = 1; + crocus_use_bo(batch, batch->state.bo, false); +} + +static void +crocus_batch_maybe_noop(struct crocus_batch *batch) +{ + /* We only insert the NOOP at the beginning of the batch. */ + assert(crocus_batch_bytes_used(batch) == 0); + + if (batch->noop_enabled) { + /* Emit MI_BATCH_BUFFER_END to prevent any further command to be + * executed. 
+ */ + uint32_t *map = batch->command.map_next; + + map[0] = (0xA << 23); + + batch->command.map_next += 4; + } +} + +static void +crocus_batch_reset(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + + crocus_bo_unreference(batch->command.bo); + crocus_bo_unreference(batch->state.bo); + batch->primary_batch_size = 0; + batch->contains_draw = false; + batch->contains_fence_signal = false; + batch->state_base_address_emitted = false; + batch->screen->vtbl.batch_reset_dirty(batch); + + create_batch(batch); + assert(batch->command.bo->index == 0); + + if (batch->state_sizes) + _mesa_hash_table_u64_clear(batch->state_sizes); + struct crocus_syncobj *syncobj = crocus_create_syncobj(screen); + crocus_batch_add_syncobj(batch, syncobj, I915_EXEC_FENCE_SIGNAL); + crocus_syncobj_reference(screen, &syncobj, NULL); + + crocus_cache_sets_clear(batch); +} + +void +crocus_batch_free(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + + if (batch->use_shadow_copy) { + free(batch->command.map); + free(batch->state.map); + } + + for (int i = 0; i < batch->exec_count; i++) { + crocus_bo_unreference(batch->exec_bos[i]); + } + + pipe_resource_reference(&batch->fine_fences.ref.res, NULL); + + free(batch->command.relocs.relocs); + free(batch->state.relocs.relocs); + free(batch->exec_bos); + free(batch->validation_list); + + ralloc_free(batch->exec_fences.mem_ctx); + + util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s) + crocus_syncobj_reference(screen, s, NULL); + ralloc_free(batch->syncobjs.mem_ctx); + + crocus_fine_fence_reference(batch->screen, &batch->last_fence, NULL); + if (batch_has_fine_fence(batch)) + u_upload_destroy(batch->fine_fences.uploader); + + crocus_bo_unreference(batch->command.bo); + batch->command.bo = NULL; + batch->command.map = NULL; + batch->command.map_next = NULL; + + crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id); + + _mesa_hash_table_destroy(batch->cache.render, NULL); + _mesa_set_destroy(batch->cache.depth, NULL); + + if (batch->state_sizes) { + _mesa_hash_table_u64_destroy(batch->state_sizes); + intel_batch_decode_ctx_finish(&batch->decoder); + } +} + +/** + * If we've chained to a secondary batch, or are getting near to the end, + * then flush. This should only be called between draws. + */ +void +crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate) +{ + if (batch->command.bo != batch->exec_bos[0] || + crocus_batch_bytes_used(batch) + estimate >= BATCH_SZ) { + crocus_batch_flush(batch); + } +} + +/** + * Finish copying the old batch/state buffer's contents to the new one + * after we tried to "grow" the buffer in an earlier operation. + */ +static void +finish_growing_bos(struct crocus_growing_bo *grow) +{ + struct crocus_bo *old_bo = grow->partial_bo; + if (!old_bo) + return; + + memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes); + + grow->partial_bo = NULL; + grow->partial_bo_map = NULL; + grow->partial_bytes = 0; + + crocus_bo_unreference(old_bo); +} + +void +crocus_grow_buffer(struct crocus_batch *batch, bool grow_state, + unsigned used, + unsigned new_size) +{ + struct crocus_screen *screen = batch->screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + struct crocus_growing_bo *grow = grow_state ? &batch->state : &batch->command; + struct crocus_bo *bo = grow->bo; + + if (grow->partial_bo) { + /* We've already grown once, and now we need to do it again. + * Finish our last grow operation so we can start a new one. 
+ * This should basically never happen. + */ + finish_growing_bos(grow); + } + + struct crocus_bo *new_bo = crocus_bo_alloc(bufmgr, bo->name, new_size); + + /* Copy existing data to the new larger buffer */ + grow->partial_bo_map = grow->map; + + if (batch->use_shadow_copy) { + /* We can't safely use realloc, as it may move the existing buffer, + * breaking existing pointers the caller may still be using. Just + * malloc a new copy and memcpy it like the normal BO path. + * + * Use bo->size rather than new_size because the bufmgr may have + * rounded up the size, and we want the shadow size to match. + */ + grow->map = malloc(new_bo->size); + } else { + grow->map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE); + } + /* Try to put the new BO at the same GTT offset as the old BO (which + * we're throwing away, so it doesn't need to be there). + * + * This guarantees that our relocations continue to work: values we've + * already written into the buffer, values we're going to write into the + * buffer, and the validation/relocation lists all will match. + * + * Also preserve kflags for EXEC_OBJECT_CAPTURE. + */ + new_bo->gtt_offset = bo->gtt_offset; + new_bo->index = bo->index; + new_bo->kflags = bo->kflags; + + /* Batch/state buffers are per-context, and if we've run out of space, + * we must have actually used them before, so...they will be in the list. + */ + assert(bo->index < batch->exec_count); + assert(batch->exec_bos[bo->index] == bo); + + /* Update the validation list to use the new BO. */ + batch->validation_list[bo->index].handle = new_bo->gem_handle; + /* Exchange the two BOs...without breaking pointers to the old BO. + * + * Consider this scenario: + * + * 1. Somebody calls brw_state_batch() to get a region of memory, and + * and then creates a brw_address pointing to brw->batch.state.bo. + * 2. They then call brw_state_batch() a second time, which happens to + * grow and replace the state buffer. They then try to emit a + * relocation to their first section of memory. + * + * If we replace the brw->batch.state.bo pointer at step 2, we would + * break the address created in step 1. They'd have a pointer to the + * old destroyed BO. Emitting a relocation would add this dead BO to + * the validation list...causing /both/ statebuffers to be in the list, + * and all kinds of disasters. + * + * This is not a contrived case - BLORP vertex data upload hits this. + * + * There are worse scenarios too. Fences for GL sync objects reference + * brw->batch.batch.bo. If we replaced the batch pointer when growing, + * we'd need to chase down every fence and update it to point to the + * new BO. Otherwise, it would refer to a "batch" that never actually + * gets submitted, and would fail to trigger. + * + * To work around both of these issues, we transmutate the buffers in + * place, making the existing struct brw_bo represent the new buffer, + * and "new_bo" represent the old BO. This is highly unusual, but it + * seems like a necessary evil. + * + * We also defer the memcpy of the existing batch's contents. Callers + * may make multiple brw_state_batch calls, and retain pointers to the + * old BO's map. We'll perform the memcpy in finish_growing_bo() when + * we finally submit the batch, at which point we've finished uploading + * state, and nobody should have any old references anymore. + * + * To do that, we keep a reference to the old BO in grow->partial_bo, + * and store the number of bytes to copy in grow->partial_bytes. 
We + * can monkey with the refcounts directly without atomics because these + * are per-context BOs and they can only be touched by this thread. + */ + assert(new_bo->refcount == 1); + new_bo->refcount = bo->refcount; + bo->refcount = 1; + + struct crocus_bo tmp; + memcpy(&tmp, bo, sizeof(struct crocus_bo)); + memcpy(bo, new_bo, sizeof(struct crocus_bo)); + memcpy(new_bo, &tmp, sizeof(struct crocus_bo)); + + grow->partial_bo = new_bo; /* the one reference of the OLD bo */ + grow->partial_bytes = used; +} + +static void +finish_seqno(struct crocus_batch *batch) +{ + struct crocus_fine_fence *sq = crocus_fine_fence_new(batch, CROCUS_FENCE_END); + if (!sq) + return; + + crocus_fine_fence_reference(batch->screen, &batch->last_fence, sq); + crocus_fine_fence_reference(batch->screen, &sq, NULL); +} + +/** + * Terminate a batch with MI_BATCH_BUFFER_END. + */ +static void +crocus_finish_batch(struct crocus_batch *batch) +{ + + batch->no_wrap = true; + if (batch->screen->vtbl.finish_batch) + batch->screen->vtbl.finish_batch(batch); + + finish_seqno(batch); + + /* Emit MI_BATCH_BUFFER_END to finish our batch. */ + uint32_t *map = batch->command.map_next; + + map[0] = (0xA << 23); + + batch->command.map_next += 4; + VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->command.map, crocus_batch_bytes_used(batch))); + + if (batch->command.bo == batch->exec_bos[0]) + batch->primary_batch_size = crocus_batch_bytes_used(batch); + batch->no_wrap = false; +} + +/** + * Replace our current GEM context with a new one (in case it got banned). + */ +static bool +replace_hw_ctx(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + + uint32_t new_ctx = crocus_clone_hw_context(bufmgr, batch->hw_ctx_id); + if (!new_ctx) + return false; + + crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id); + batch->hw_ctx_id = new_ctx; + + /* Notify the context that state must be re-initialized. */ + crocus_lost_context_state(batch); + + return true; +} + +enum pipe_reset_status +crocus_batch_check_for_reset(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + enum pipe_reset_status status = PIPE_NO_RESET; + struct drm_i915_reset_stats stats = { .ctx_id = batch->hw_ctx_id }; + + if (drmIoctl(screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats)) + DBG("DRM_IOCTL_I915_GET_RESET_STATS failed: %s\n", strerror(errno)); + + if (stats.batch_active != 0) { + /* A reset was observed while a batch from this hardware context was + * executing. Assume that this context was at fault. + */ + status = PIPE_GUILTY_CONTEXT_RESET; + } else if (stats.batch_pending != 0) { + /* A reset was observed while a batch from this context was in progress, + * but the batch was not executing. In this case, assume that the + * context was not at fault. + */ + status = PIPE_INNOCENT_CONTEXT_RESET; + } + + if (status != PIPE_NO_RESET) { + /* Our context is likely banned, or at least in an unknown state. + * Throw it away and start with a fresh context. Ideally this may + * catch the problem before our next execbuf fails with -EIO. + */ + replace_hw_ctx(batch); + } + + return status; +} + +/** + * Submit the batch to the GPU via execbuffer2. 
+ */ +static int +submit_batch(struct crocus_batch *batch) +{ + + if (batch->use_shadow_copy) { + void *bo_map = crocus_bo_map(batch->dbg, batch->command.bo, MAP_WRITE); + memcpy(bo_map, batch->command.map, crocus_batch_bytes_used(batch)); + + bo_map = crocus_bo_map(batch->dbg, batch->state.bo, MAP_WRITE); + memcpy(bo_map, batch->state.map, batch->state.used); + } + + crocus_bo_unmap(batch->command.bo); + crocus_bo_unmap(batch->state.bo); + + /* The requirement for using I915_EXEC_NO_RELOC are: + * + * The addresses written in the objects must match the corresponding + * reloc.gtt_offset which in turn must match the corresponding + * execobject.offset. + * + * Any render targets written to in the batch must be flagged with + * EXEC_OBJECT_WRITE. + * + * To avoid stalling, execobject.offset should match the current + * address of that object within the active context. + */ + /* Set statebuffer relocations */ + const unsigned state_index = batch->state.bo->index; + if (state_index < batch->exec_count && + batch->exec_bos[state_index] == batch->state.bo) { + struct drm_i915_gem_exec_object2 *entry = + &batch->validation_list[state_index]; + assert(entry->handle == batch->state.bo->gem_handle); + entry->relocation_count = batch->state.relocs.reloc_count; + entry->relocs_ptr = (uintptr_t)batch->state.relocs.relocs; + } + + /* Set batchbuffer relocations */ + struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0]; + assert(entry->handle == batch->command.bo->gem_handle); + entry->relocation_count = batch->command.relocs.reloc_count; + entry->relocs_ptr = (uintptr_t)batch->command.relocs.relocs; + + struct drm_i915_gem_execbuffer2 execbuf = { + .buffers_ptr = (uintptr_t)batch->validation_list, + .buffer_count = batch->exec_count, + .batch_start_offset = 0, + /* This must be QWord aligned. */ + .batch_len = ALIGN(batch->primary_batch_size, 8), + .flags = I915_EXEC_RENDER | + I915_EXEC_NO_RELOC | + I915_EXEC_BATCH_FIRST | + I915_EXEC_HANDLE_LUT, + .rsvd1 = batch->hw_ctx_id, /* rsvd1 is actually the context ID */ + }; + + if (num_fences(batch)) { + execbuf.flags |= I915_EXEC_FENCE_ARRAY; + execbuf.num_cliprects = num_fences(batch); + execbuf.cliprects_ptr = + (uintptr_t)util_dynarray_begin(&batch->exec_fences); + } + + int ret = 0; + if (!batch->screen->no_hw && + intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf)) + ret = -errno; + + for (int i = 0; i < batch->exec_count; i++) { + struct crocus_bo *bo = batch->exec_bos[i]; + + bo->idle = false; + bo->index = -1; + + /* Update brw_bo::gtt_offset */ + if (batch->validation_list[i].offset != bo->gtt_offset) { + DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n", + bo->gem_handle, bo->gtt_offset, + batch->validation_list[i].offset); + assert(!(bo->kflags & EXEC_OBJECT_PINNED)); + bo->gtt_offset = batch->validation_list[i].offset; + } + } + + return ret; +} + +static const char * +batch_name_to_string(enum crocus_batch_name name) +{ + const char *names[CROCUS_BATCH_COUNT] = { + [CROCUS_BATCH_RENDER] = "render", + [CROCUS_BATCH_COMPUTE] = "compute", + }; + return names[name]; +} + +/** + * Flush the batch buffer, submitting it to the GPU and resetting it so + * we're ready to emit the next batch. + * + * \param in_fence_fd is ignored if -1. Otherwise, this function takes + * ownership of the fd. + * + * \param out_fence_fd is ignored if NULL. Otherwise, the caller must + * take ownership of the returned fd. 
+ */ +void +_crocus_batch_flush(struct crocus_batch *batch, const char *file, int line) +{ + struct crocus_screen *screen = batch->screen; + + /* If a fence signals we need to flush it. */ + if (crocus_batch_bytes_used(batch) == 0 && !batch->contains_fence_signal) + return; + + assert(!batch->no_wrap); + crocus_finish_batch(batch); + + finish_growing_bos(&batch->command); + finish_growing_bos(&batch->state); + int ret = submit_batch(batch); + + if (unlikely(INTEL_DEBUG & + (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL))) { + int bytes_for_commands = crocus_batch_bytes_used(batch); + int second_bytes = 0; + if (batch->command.bo != batch->exec_bos[0]) { + second_bytes = bytes_for_commands; + bytes_for_commands += batch->primary_batch_size; + } + fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5d+%5db (%0.1f%%) " + "(cmds), %4d BOs (%0.1fMb aperture)," + " %4d command relocs, %4d state relocs\n", + file, line, batch_name_to_string(batch->name), batch->hw_ctx_id, + batch->primary_batch_size, second_bytes, + 100.0f * bytes_for_commands / BATCH_SZ, + batch->exec_count, + (float) batch->aperture_space / (1024 * 1024), + batch->command.relocs.reloc_count, + batch->state.relocs.reloc_count); + + if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT)) { + dump_fence_list(batch); + dump_validation_list(batch); + } + + if (INTEL_DEBUG & DEBUG_BATCH) { + decode_batch(batch); + } + } + + for (int i = 0; i < batch->exec_count; i++) { + struct crocus_bo *bo = batch->exec_bos[i]; + crocus_bo_unreference(bo); + } + + batch->command.relocs.reloc_count = 0; + batch->state.relocs.reloc_count = 0; + batch->exec_count = 0; + batch->aperture_space = 0; + + util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s) + crocus_syncobj_reference(screen, s, NULL); + util_dynarray_clear(&batch->syncobjs); + + util_dynarray_clear(&batch->exec_fences); + + if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) { + dbg_printf("waiting for idle\n"); + crocus_bo_wait_rendering(batch->command.bo); /* if execbuf failed; this is a nop */ + } + + /* Start a new batch buffer. */ + crocus_batch_reset(batch); + + /* EIO means our context is banned. In this case, try and replace it + * with a new logical context, and inform crocus_context that all state + * has been lost and needs to be re-initialized. If this succeeds, + * dubiously claim success... + */ + if (ret == -EIO && replace_hw_ctx(batch)) { + if (batch->reset->reset) { + /* Tell the state tracker the device is lost and it was our fault. */ + batch->reset->reset(batch->reset->data, PIPE_GUILTY_CONTEXT_RESET); + } + + ret = 0; + } + + if (ret < 0) { +#ifdef DEBUG + const bool color = INTEL_DEBUG & DEBUG_COLOR; + fprintf(stderr, "%scrocus: Failed to submit batchbuffer: %-80s%s\n", + color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : ""); +#endif + abort(); + } +} + +/** + * Does the current batch refer to the given BO? + * + * (In other words, is the BO in the current batch's validation list?) + */ +bool +crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo) +{ + return find_validation_entry(batch, bo) != NULL; +} + +/** + * Updates the state of the noop feature. Returns true if there was a noop + * transition that led to state invalidation. + */ +bool +crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable) +{ + if (batch->noop_enabled == noop_enable) + return 0; + + batch->noop_enabled = noop_enable; + + crocus_batch_flush(batch); + + /* If the batch was empty, flush had no effect, so insert our noop. 
*/ + if (crocus_batch_bytes_used(batch) == 0) + crocus_batch_maybe_noop(batch); + + /* We only need to update the entire state if we transition from noop -> + * not-noop. + */ + return !batch->noop_enabled; +} diff --git a/src/gallium/drivers/crocus/crocus_batch.h b/src/gallium/drivers/crocus/crocus_batch.h new file mode 100644 index 00000000000..fe6857d83ed --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_batch.h @@ -0,0 +1,325 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef CROCUS_BATCH_DOT_H +#define CROCUS_BATCH_DOT_H + +#include <stdbool.h> +#include <stdint.h> +#include <string.h> + +#include "util/u_dynarray.h" + +#include "common/intel_decoder.h" +#include "drm-uapi/i915_drm.h" + +#include "crocus_fence.h" +#include "crocus_fine_fence.h" + +#include "crocus_bufmgr.h" +/* The kernel assumes batchbuffers are smaller than 256kB. */ +#define MAX_BATCH_SIZE (256 * 1024) + +/* 3DSTATE_BINDING_TABLE_POINTERS has a U16 offset from Surface State Base + * Address, which means that we can't put binding tables beyond 64kB. This + * effectively limits the maximum statebuffer size to 64kB. + */ +#define MAX_STATE_SIZE (64 * 1024) + +/* Our target batch size - flush approximately at this point. */ +#define BATCH_SZ (20 * 1024) +#define STATE_SZ (16 * 1024) + +enum crocus_batch_name { + CROCUS_BATCH_RENDER, + CROCUS_BATCH_COMPUTE, +}; + +#define CROCUS_BATCH_COUNT 2 + +struct crocus_address { + struct crocus_bo *bo; + int32_t offset; + uint32_t reloc_flags; +}; + +struct crocus_reloc_list { + struct drm_i915_gem_relocation_entry *relocs; + int reloc_count; + int reloc_array_size; +}; + +struct crocus_growing_bo { + struct crocus_bo *bo; + void *map; + void *map_next; + struct crocus_bo *partial_bo; + void *partial_bo_map; + unsigned partial_bytes; + struct crocus_reloc_list relocs; + unsigned used; +}; + +struct crocus_batch { + struct crocus_context *ice; + struct crocus_screen *screen; + struct pipe_debug_callback *dbg; + struct pipe_device_reset_callback *reset; + + /** What batch is this? (e.g. CROCUS_BATCH_RENDER/COMPUTE) */ + enum crocus_batch_name name; + + /** buffers: command, state */ + struct crocus_growing_bo command, state; + + /** Size of the primary batch if we've moved on to a secondary. 
*/ + unsigned primary_batch_size; + + bool state_base_address_emitted; + uint8_t pipe_controls_since_last_cs_stall; + + uint32_t hw_ctx_id; + + uint32_t valid_reloc_flags; + + bool use_shadow_copy; + bool no_wrap; + + /** The validation list */ + struct drm_i915_gem_exec_object2 *validation_list; + struct crocus_bo **exec_bos; + int exec_count; + int exec_array_size; + + /** Whether INTEL_BLACKHOLE_RENDER is enabled in the batch (aka first + * instruction is a MI_BATCH_BUFFER_END). + */ + bool noop_enabled; + + /** + * A list of crocus_syncobjs associated with this batch. + * + * The first list entry will always be a signalling sync-point, indicating + * that this batch has completed. The others are likely to be sync-points + * to wait on before executing the batch. + */ + struct util_dynarray syncobjs; + + /** A list of drm_i915_exec_fences to have execbuf signal or wait on */ + struct util_dynarray exec_fences; + + /** The amount of aperture space (in bytes) used by all exec_bos */ + int aperture_space; + + struct { + /** Uploader to use for sequence numbers */ + struct u_upload_mgr *uploader; + + /** GPU buffer and CPU map where our seqno's will be written. */ + struct crocus_state_ref ref; + uint32_t *map; + + /** The sequence number to write the next time we add a fence. */ + uint32_t next; + } fine_fences; + + /** A seqno (and syncobj) for the last batch that was submitted. */ + struct crocus_fine_fence *last_fence; + + /** List of other batches which we might need to flush to use a BO */ + struct crocus_batch *other_batches[CROCUS_BATCH_COUNT - 1]; + + struct { + /** + * Set of struct brw_bo * that have been rendered to within this + * batchbuffer and would need flushing before being used from another + * cache domain that isn't coherent with it (i.e. the sampler). + */ + struct hash_table *render; + + /** + * Set of struct brw_bo * that have been used as a depth buffer within + * this batchbuffer and would need flushing before being used from + * another cache domain that isn't coherent with it (i.e. the sampler). + */ + struct set *depth; + } cache; + + struct intel_batch_decode_ctx decoder; + struct hash_table_u64 *state_sizes; + + /** Have we emitted any draw calls to this batch? */ + bool contains_draw; + + /** Batch contains fence signal operation. */ + bool contains_fence_signal; +}; + +static inline bool +batch_has_fine_fence(struct crocus_batch *batch) +{ + return !!batch->fine_fences.uploader; +} + +#define BATCH_HAS_FINE_FENCES(batch) (!!(batch)->fine_fences.uploader) +void crocus_init_batch(struct crocus_context *ctx, + enum crocus_batch_name name, + int priority); +void crocus_batch_free(struct crocus_batch *batch); +void crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate); + +void _crocus_batch_flush(struct crocus_batch *batch, const char *file, int line); +#define crocus_batch_flush(batch) _crocus_batch_flush((batch), __FILE__, __LINE__) + +bool crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo); + +bool crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable); + +#define RELOC_WRITE EXEC_OBJECT_WRITE +#define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT +/* Inverted meaning, but using the same bit...emit_reloc will flip it. 
*/ +#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS + +void crocus_use_pinned_bo(struct crocus_batch *batch, struct crocus_bo *bo, + bool writable); +uint64_t crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset, + struct crocus_bo *target, uint32_t target_offset, + unsigned int reloc_flags); +uint64_t crocus_state_reloc(struct crocus_batch *batch, uint32_t batch_offset, + struct crocus_bo *target, uint32_t target_offset, + unsigned int reloc_flags); + +enum pipe_reset_status crocus_batch_check_for_reset(struct crocus_batch *batch); + +void crocus_grow_buffer(struct crocus_batch *batch, bool grow_state, + unsigned used, unsigned new_size); + +static inline unsigned +crocus_batch_bytes_used(struct crocus_batch *batch) +{ + return batch->command.map_next - batch->command.map; +} + +/** + * Ensure the current command buffer has \param size bytes of space + * remaining. If not, this creates a secondary batch buffer and emits + * a jump from the primary batch to the start of the secondary. + * + * Most callers want crocus_get_command_space() instead. + */ +static inline void +crocus_require_command_space(struct crocus_batch *batch, unsigned size) +{ + const unsigned required_bytes = crocus_batch_bytes_used(batch) + size; + unsigned used = crocus_batch_bytes_used(batch); + if (required_bytes >= BATCH_SZ && !batch->no_wrap) { + crocus_batch_flush(batch); + } else if (used + size >= batch->command.bo->size) { + const unsigned new_size = + MIN2(batch->command.bo->size + batch->command.bo->size / 2, + MAX_BATCH_SIZE); + + crocus_grow_buffer(batch, false, used, new_size); + batch->command.map_next = (void *)batch->command.map + used; + assert(crocus_batch_bytes_used(batch) + size < batch->command.bo->size); + } +} + +/** + * Allocate space in the current command buffer, and return a pointer + * to the mapped area so the caller can write commands there. + * + * This should be called whenever emitting commands. + */ +static inline void * +crocus_get_command_space(struct crocus_batch *batch, unsigned bytes) +{ + crocus_require_command_space(batch, bytes); + void *map = batch->command.map_next; + batch->command.map_next += bytes; + return map; +} + +/** + * Helper to emit GPU commands - allocates space, copies them there. + */ +static inline void +crocus_batch_emit(struct crocus_batch *batch, const void *data, unsigned size) +{ + void *map = crocus_get_command_space(batch, size); + memcpy(map, data, size); +} + +/** + * Get a pointer to the batch's signalling syncobj. Does not refcount. + */ +static inline struct crocus_syncobj * +crocus_batch_get_signal_syncobj(struct crocus_batch *batch) +{ + /* The signalling syncobj is the first one in the list. */ + struct crocus_syncobj *syncobj = + ((struct crocus_syncobj **)util_dynarray_begin(&batch->syncobjs))[0]; + return syncobj; +} + +/** + * Take a reference to the batch's signalling syncobj. + * + * Callers can use this to wait for the the current batch under construction + * to complete (after flushing it). + */ +static inline void +crocus_batch_reference_signal_syncobj(struct crocus_batch *batch, + struct crocus_syncobj **out_syncobj) +{ + struct crocus_syncobj *syncobj = crocus_batch_get_signal_syncobj(batch); + crocus_syncobj_reference(batch->screen, out_syncobj, syncobj); +} + +/** + * Record the size of a piece of state for use in INTEL_DEBUG=bat printing. 
+ */ +static inline void +crocus_record_state_size(struct hash_table_u64 *ht, uint32_t offset_from_base, + uint32_t size) +{ + if (ht) { + _mesa_hash_table_u64_insert(ht, offset_from_base, + (void *)(uintptr_t)size); + } +} + +static inline bool +crocus_ptr_in_state_buffer(struct crocus_batch *batch, void *p) +{ + return (char *)p >= (char *)batch->state.map && + (char *)p < (char *)batch->state.map + batch->state.bo->size; +} + +static inline void +crocus_require_statebuffer_space(struct crocus_batch *batch, int size) +{ + if (batch->state.used + size >= STATE_SZ) + crocus_batch_flush(batch); +} +#endif diff --git a/src/gallium/drivers/crocus/crocus_blit.c b/src/gallium/drivers/crocus/crocus_blit.c new file mode 100644 index 00000000000..9cae82e3e2d --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_blit.c @@ -0,0 +1,836 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include <stdio.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_surface.h" +#include "util/ralloc.h" +#include "intel/blorp/blorp.h" +#include "crocus_context.h" +#include "crocus_resource.h" +#include "crocus_screen.h" + +void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond) +{ + util_blitter_save_vertex_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_VERTEX]); + util_blitter_save_tessctrl_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL]); + util_blitter_save_tesseval_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]); + util_blitter_save_geometry_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]); + util_blitter_save_so_targets(ice->blitter, ice->state.so_targets, + (struct pipe_stream_output_target**)ice->state.so_target); + util_blitter_save_vertex_buffer_slot(ice->blitter, ice->state.vertex_buffers); + util_blitter_save_vertex_elements(ice->blitter, (void *)ice->state.cso_vertex_elements); + if (op & CROCUS_SAVE_FRAGMENT_STATE) { + util_blitter_save_blend(ice->blitter, ice->state.cso_blend); + util_blitter_save_depth_stencil_alpha(ice->blitter, ice->state.cso_zsa); + util_blitter_save_stencil_ref(ice->blitter, &ice->state.stencil_ref); + util_blitter_save_fragment_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]); + util_blitter_save_sample_mask(ice->blitter, ice->state.sample_mask); + util_blitter_save_rasterizer(ice->blitter, ice->state.cso_rast); + util_blitter_save_scissor(ice->blitter, &ice->state.scissors[0]); + util_blitter_save_viewport(ice->blitter, &ice->state.viewports[0]); + util_blitter_save_fragment_constant_buffer_slot(ice->blitter, &ice->state.shaders[MESA_SHADER_FRAGMENT].constbufs[0]); + } + + if (!render_cond) + util_blitter_save_render_condition(ice->blitter, + (struct pipe_query *)ice->condition.query, + ice->condition.condition, + ice->condition.mode); + +// util_blitter_save_scissor(ice->blitter, &ice->scissors[0]); + if (op & CROCUS_SAVE_FRAMEBUFFER) + util_blitter_save_framebuffer(ice->blitter, &ice->state.framebuffer); + + if (op & CROCUS_SAVE_TEXTURES) { + util_blitter_save_fragment_sampler_states(ice->blitter, 1, (void **)ice->state.shaders[MESA_SHADER_FRAGMENT].samplers); + util_blitter_save_fragment_sampler_views(ice->blitter, 1, (struct pipe_sampler_view **)ice->state.shaders[MESA_SHADER_FRAGMENT].textures); + } +} + +/** + * Helper function for handling mirror image blits. + * + * If coord0 > coord1, swap them and return "true" (mirrored). 
+ */ +static bool +apply_mirror(float *coord0, float *coord1) +{ + if (*coord0 > *coord1) { + float tmp = *coord0; + *coord0 = *coord1; + *coord1 = tmp; + return true; + } + return false; +} + +/** + * Compute the number of pixels to clip for each side of a rect + * + * \param x0 The rect's left coordinate + * \param y0 The rect's bottom coordinate + * \param x1 The rect's right coordinate + * \param y1 The rect's top coordinate + * \param min_x The clipping region's left coordinate + * \param min_y The clipping region's bottom coordinate + * \param max_x The clipping region's right coordinate + * \param max_y The clipping region's top coordinate + * \param clipped_x0 The number of pixels to clip from the left side + * \param clipped_y0 The number of pixels to clip from the bottom side + * \param clipped_x1 The number of pixels to clip from the right side + * \param clipped_y1 The number of pixels to clip from the top side + * + * \return false if we clip everything away, true otherwise + */ +static inline bool +compute_pixels_clipped(float x0, float y0, float x1, float y1, + float min_x, float min_y, float max_x, float max_y, + float *clipped_x0, float *clipped_y0, + float *clipped_x1, float *clipped_y1) +{ + /* If we are going to clip everything away, stop. */ + if (!(min_x <= max_x && + min_y <= max_y && + x0 <= max_x && + y0 <= max_y && + min_x <= x1 && + min_y <= y1 && + x0 <= x1 && + y0 <= y1)) { + return false; + } + + if (x0 < min_x) + *clipped_x0 = min_x - x0; + else + *clipped_x0 = 0; + if (max_x < x1) + *clipped_x1 = x1 - max_x; + else + *clipped_x1 = 0; + + if (y0 < min_y) + *clipped_y0 = min_y - y0; + else + *clipped_y0 = 0; + if (max_y < y1) + *clipped_y1 = y1 - max_y; + else + *clipped_y1 = 0; + + return true; +} + +/** + * Clips a coordinate (left, right, top or bottom) for the src or dst rect + * (whichever requires the largest clip) and adjusts the coordinate + * for the other rect accordingly. + * + * \param mirror true if mirroring is required + * \param src the source rect coordinate (for example src_x0) + * \param dst0 the dst rect coordinate (for example dst_x0) + * \param dst1 the opposite dst rect coordinate (for example dst_x1) + * \param clipped_dst0 number of pixels to clip from the dst coordinate + * \param clipped_dst1 number of pixels to clip from the opposite dst coordinate + * \param scale the src vs dst scale involved for that coordinate + * \param is_left_or_bottom true if we are clipping the left or bottom sides + * of the rect. + */ +static void +clip_coordinates(bool mirror, + float *src, float *dst0, float *dst1, + float clipped_dst0, + float clipped_dst1, + float scale, + bool is_left_or_bottom) +{ + /* When clipping we need to add or subtract pixels from the original + * coordinates depending on whether we are acting on the left/bottom + * or right/top sides of the rect respectively. We assume we have to + * add them in the code below, and multiply by -1 when we should + * subtract. + */ + int mult = is_left_or_bottom ? 1 : -1; + + if (!mirror) { + *dst0 += clipped_dst0 * mult; + *src += clipped_dst0 * scale * mult; + } else { + *dst1 -= clipped_dst1 * mult; + *src += clipped_dst1 * scale * mult; + } +} + +/** + * Apply a scissor rectangle to blit coordinates. + * + * Returns true if the blit was entirely scissored away. 
+ */ +static bool +apply_blit_scissor(const struct pipe_scissor_state *scissor, + float *src_x0, float *src_y0, + float *src_x1, float *src_y1, + float *dst_x0, float *dst_y0, + float *dst_x1, float *dst_y1, + bool mirror_x, bool mirror_y) +{ + float clip_dst_x0, clip_dst_x1, clip_dst_y0, clip_dst_y1; + + /* Compute number of pixels to scissor away. */ + if (!compute_pixels_clipped(*dst_x0, *dst_y0, *dst_x1, *dst_y1, + scissor->minx, scissor->miny, + scissor->maxx, scissor->maxy, + &clip_dst_x0, &clip_dst_y0, + &clip_dst_x1, &clip_dst_y1)) + return true; + + // XXX: comments assume source clipping, which we don't do + + /* When clipping any of the two rects we need to adjust the coordinates + * in the other rect considering the scaling factor involved. To obtain + * the best precision we want to make sure that we only clip once per + * side to avoid accumulating errors due to the scaling adjustment. + * + * For example, if src_x0 and dst_x0 need both to be clipped we want to + * avoid the situation where we clip src_x0 first, then adjust dst_x0 + * accordingly but then we realize that the resulting dst_x0 still needs + * to be clipped, so we clip dst_x0 and adjust src_x0 again. Because we are + * applying scaling factors to adjust the coordinates in each clipping + * pass we lose some precision and that can affect the results of the + * blorp blit operation slightly. What we want to do here is detect the + * rect that we should clip first for each side so that when we adjust + * the other rect we ensure the resulting coordinate does not need to be + * clipped again. + * + * The code below implements this by comparing the number of pixels that + * we need to clip for each side of both rects considering the scales + * involved. For example, clip_src_x0 represents the number of pixels + * to be clipped for the src rect's left side, so if clip_src_x0 = 5, + * clip_dst_x0 = 4 and scale_x = 2 it means that we are clipping more + * from the dst rect so we should clip dst_x0 only and adjust src_x0. + * This is because clipping 4 pixels in the dst is equivalent to + * clipping 4 * 2 = 8 > 5 in the src. 
+ */ + + if (*src_x0 == *src_x1 || *src_y0 == *src_y1 + || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1) + return true; + + float scale_x = (float) (*src_x1 - *src_x0) / (*dst_x1 - *dst_x0); + float scale_y = (float) (*src_y1 - *src_y0) / (*dst_y1 - *dst_y0); + + /* Clip left side */ + clip_coordinates(mirror_x, src_x0, dst_x0, dst_x1, + clip_dst_x0, clip_dst_x1, scale_x, true); + + /* Clip right side */ + clip_coordinates(mirror_x, src_x1, dst_x1, dst_x0, + clip_dst_x1, clip_dst_x0, scale_x, false); + + /* Clip bottom side */ + clip_coordinates(mirror_y, src_y0, dst_y0, dst_y1, + clip_dst_y0, clip_dst_y1, scale_y, true); + + /* Clip top side */ + clip_coordinates(mirror_y, src_y1, dst_y1, dst_y0, + clip_dst_y1, clip_dst_y0, scale_y, false); + + /* Check for invalid bounds + * Can't blit for 0-dimensions + */ + return *src_x0 == *src_x1 || *src_y0 == *src_y1 + || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1; +} + +void +crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl, + struct isl_device *isl_dev, + struct blorp_surf *surf, + struct pipe_resource *p_res, + enum isl_aux_usage aux_usage, + unsigned level, + bool is_render_target) +{ + struct crocus_resource *res = (void *) p_res; + + assert(!crocus_resource_unfinished_aux_import(res)); + + if (isl_aux_usage_has_hiz(aux_usage) && + !crocus_resource_level_has_hiz(res, level)) + aux_usage = ISL_AUX_USAGE_NONE; + + *surf = (struct blorp_surf) { + .surf = &res->surf, + .addr = (struct blorp_address) { + .buffer = res->bo, + .offset = res->offset, + .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0, + .mocs = crocus_mocs(res->bo, isl_dev), + }, + .aux_usage = aux_usage, + }; + + if (aux_usage != ISL_AUX_USAGE_NONE) { + surf->aux_surf = &res->aux.surf; + surf->aux_addr = (struct blorp_address) { + .buffer = res->aux.bo, + .offset = res->aux.offset, + .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0, + .mocs = crocus_mocs(res->bo, isl_dev), + }; + surf->clear_color = + crocus_resource_get_clear_color(res); + } +} + +static void +tex_cache_flush_hack(struct crocus_batch *batch, + enum isl_format view_format, + enum isl_format surf_format) +{ + /* The WaSamplerCacheFlushBetweenRedescribedSurfaceReads workaround says: + * + * "Currently Sampler assumes that a surface would not have two + * different format associate with it. It will not properly cache + * the different views in the MT cache, causing a data corruption." + * + * We may need to handle this for texture views in general someday, but + * for now we handle it here, as it hurts copies and blits particularly + * badly because they ofter reinterpret formats. + * + * If the BO hasn't been referenced yet this batch, we assume that the + * texture cache doesn't contain any relevant data nor need flushing. + * + * Icelake (Gen11+) claims to fix this issue, but seems to still have + * issues with ASTC formats. 
+ */ + bool need_flush = view_format != surf_format; + if (!need_flush) + return; + + const char *reason = + "workaround: WaSamplerCacheFlushBetweenRedescribedSurfaceReads"; + + crocus_emit_pipe_control_flush(batch, reason, PIPE_CONTROL_CS_STALL); + crocus_emit_pipe_control_flush(batch, reason, + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE); +} + +static struct crocus_resource * +crocus_resource_for_aspect(const struct intel_device_info *devinfo, + struct pipe_resource *p_res, unsigned pipe_mask) +{ + if (pipe_mask == PIPE_MASK_S) { + struct crocus_resource *junk, *s_res; + crocus_get_depth_stencil_resources(devinfo, p_res, &junk, &s_res); + return s_res; + } else { + return (struct crocus_resource *)p_res; + } +} + +static enum pipe_format +pipe_format_for_aspect(enum pipe_format format, unsigned pipe_mask) +{ + if (pipe_mask == PIPE_MASK_S) { + return util_format_stencil_only(format); + } else if (pipe_mask == PIPE_MASK_Z) { + return util_format_get_depth_only(format); + } else { + return format; + } +} + +static void +crocus_u_blitter(struct crocus_context *ice, + const struct pipe_blit_info *info) +{ + struct pipe_blit_info dinfo = *info; + if (!util_format_has_alpha(dinfo.dst.resource->format)) + dinfo.mask &= ~PIPE_MASK_A; + crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable); + util_blitter_blit(ice->blitter, &dinfo); +} + +/** + * The pipe->blit() driver hook. + * + * This performs a blit between two surfaces, which copies data but may + * also perform format conversion, scaling, flipping, and so on. + */ +static void +crocus_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + enum blorp_batch_flags blorp_flags = 0; + + /* We don't support color masking. 
*/ + assert((info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA || + (info->mask & PIPE_MASK_RGBA) == 0); + + if (info->render_condition_enable) + if (!crocus_check_conditional_render(ice)) + return; + + if (devinfo->ver <= 5) { + if (!screen->vtbl.blit_blt(batch, info)) { + + if (!util_format_is_depth_or_stencil(info->src.resource->format) && + info->dst.resource->target != PIPE_TEXTURE_3D) + goto use_blorp; + + if (!util_blitter_is_blit_supported(ice->blitter, info)) { + if (util_format_is_depth_or_stencil(info->src.resource->format)) { + + struct pipe_blit_info depth_blit = *info; + depth_blit.mask = PIPE_MASK_Z; + crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable); + util_blitter_blit(ice->blitter, &depth_blit); + + struct pipe_surface *dst_view, dst_templ; + util_blitter_default_dst_texture(&dst_templ, info->dst.resource, info->dst.level, info->dst.box.z); + dst_view = ctx->create_surface(ctx, info->dst.resource, &dst_templ); + + crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable); + + util_blitter_clear_depth_stencil(ice->blitter, dst_view, PIPE_CLEAR_STENCIL, + 0, 0, info->dst.box.x, info->dst.box.y, + info->dst.box.width, info->dst.box.height); + crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable); + util_blitter_stencil_fallback(ice->blitter, + info->dst.resource, + info->dst.level, + &info->dst.box, + info->src.resource, + info->src.level, + &info->src.box, NULL); + + } + return; + } + + crocus_u_blitter(ice, info); + } + return; + } + + if (devinfo->ver == 6) { + if (info->src.resource->target == PIPE_TEXTURE_3D && + info->dst.resource->target == PIPE_TEXTURE_3D) { + crocus_u_blitter(ice, info); + return; + } + } + +use_blorp: + if (info->render_condition_enable) { + if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) + blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE; + } + + float src_x0 = info->src.box.x; + float src_x1 = info->src.box.x + info->src.box.width; + float src_y0 = info->src.box.y; + float src_y1 = info->src.box.y + info->src.box.height; + float dst_x0 = info->dst.box.x; + float dst_x1 = info->dst.box.x + info->dst.box.width; + float dst_y0 = info->dst.box.y; + float dst_y1 = info->dst.box.y + info->dst.box.height; + bool mirror_x = apply_mirror(&src_x0, &src_x1); + bool mirror_y = apply_mirror(&src_y0, &src_y1); + enum blorp_filter filter; + + if (info->scissor_enable) { + bool noop = apply_blit_scissor(&info->scissor, + &src_x0, &src_y0, &src_x1, &src_y1, + &dst_x0, &dst_y0, &dst_x1, &dst_y1, + mirror_x, mirror_y); + if (noop) + return; + } + + if (abs(info->dst.box.width) == abs(info->src.box.width) && + abs(info->dst.box.height) == abs(info->src.box.height)) { + if (info->src.resource->nr_samples > 1 && + info->dst.resource->nr_samples <= 1) { + /* The OpenGL ES 3.2 specification, section 16.2.1, says: + * + * "If the read framebuffer is multisampled (its effective + * value of SAMPLE_BUFFERS is one) and the draw framebuffer + * is not (its value of SAMPLE_BUFFERS is zero), the samples + * corresponding to each pixel location in the source are + * converted to a single sample before being written to the + * destination. The filter parameter is ignored. If the + * source formats are integer types or stencil values, a + * single sample’s value is selected for each pixel. 
If the + * source formats are floating-point or normalized types, + * the sample values for each pixel are resolved in an + * implementation-dependent manner. If the source formats + * are depth values, sample values are resolved in an + * implementation-dependent manner where the result will be + * between the minimum and maximum depth values in the pixel." + * + * When selecting a single sample, we always choose sample 0. + */ + if (util_format_is_depth_or_stencil(info->src.format) || + util_format_is_pure_integer(info->src.format)) { + filter = BLORP_FILTER_SAMPLE_0; + } else { + filter = BLORP_FILTER_AVERAGE; + } + } else { + /* The OpenGL 4.6 specification, section 18.3.1, says: + * + * "If the source and destination dimensions are identical, + * no filtering is applied." + * + * Using BLORP_FILTER_NONE will also handle the upsample case by + * replicating the one value in the source to all values in the + * destination. + */ + filter = BLORP_FILTER_NONE; + } + } else if (info->filter == PIPE_TEX_FILTER_LINEAR) { + filter = BLORP_FILTER_BILINEAR; + } else { + filter = BLORP_FILTER_NEAREST; + } + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); + + float src_z_step = (float)info->src.box.depth / (float)info->dst.box.depth; + + /* There is no interpolation to the pixel center during rendering, so + * add the 0.5 offset ourselves here. + */ + float depth_center_offset = 0; + if (info->src.resource->target == PIPE_TEXTURE_3D) + depth_center_offset = 0.5 / info->dst.box.depth * info->src.box.depth; + + /* Perform a blit for each aspect requested by the caller. PIPE_MASK_R is + * used to represent the color aspect. */ + unsigned aspect_mask = info->mask & (PIPE_MASK_R | PIPE_MASK_ZS); + while (aspect_mask) { + unsigned aspect = 1 << u_bit_scan(&aspect_mask); + + struct crocus_resource *src_res = + crocus_resource_for_aspect(devinfo, info->src.resource, aspect); + struct crocus_resource *dst_res = + crocus_resource_for_aspect(devinfo, info->dst.resource, aspect); + + enum pipe_format src_pfmt = + pipe_format_for_aspect(info->src.format, aspect); + enum pipe_format dst_pfmt = + pipe_format_for_aspect(info->dst.format, aspect); + + if (crocus_resource_unfinished_aux_import(src_res)) + crocus_resource_finish_aux_import(ctx->screen, src_res); + if (crocus_resource_unfinished_aux_import(dst_res)) + crocus_resource_finish_aux_import(ctx->screen, dst_res); + + struct crocus_format_info src_fmt = + crocus_format_for_usage(devinfo, src_pfmt, ISL_SURF_USAGE_TEXTURE_BIT); + enum isl_aux_usage src_aux_usage = + crocus_resource_texture_aux_usage(src_res); + + crocus_resource_prepare_texture(ice, src_res, src_fmt.fmt, + info->src.level, 1, info->src.box.z, + info->src.box.depth); + // crocus_emit_buffer_barrier_for(batch, src_res->bo, + // CROCUS_DOMAIN_OTHER_READ); + + struct crocus_format_info dst_fmt = + crocus_format_for_usage(devinfo, dst_pfmt, + ISL_SURF_USAGE_RENDER_TARGET_BIT); + enum isl_aux_usage dst_aux_usage = + crocus_resource_render_aux_usage(ice, dst_res, info->dst.level, + dst_fmt.fmt, false); + + struct blorp_surf src_surf, dst_surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf, + &src_res->base, src_aux_usage, + info->src.level, false); + crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf, + &dst_res->base, dst_aux_usage, + info->dst.level, true); + + crocus_resource_prepare_render(ice, dst_res, info->dst.level, + info->dst.box.z, info->dst.box.depth, + dst_aux_usage); + // 
crocus_emit_buffer_barrier_for(batch, dst_res->bo, + // CROCUS_DOMAIN_RENDER_WRITE); + + if (crocus_batch_references(batch, src_res->bo)) + tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format); + + if (dst_res->base.target == PIPE_BUFFER) { + util_range_add(&dst_res->base, &dst_res->valid_buffer_range, + dst_x0, dst_x1); + } + + struct isl_swizzle src_swiz = pipe_to_isl_swizzles(src_fmt.swizzles); + struct isl_swizzle dst_swiz = pipe_to_isl_swizzles(dst_fmt.swizzles); + + for (int slice = 0; slice < info->dst.box.depth; slice++) { + unsigned dst_z = info->dst.box.z + slice; + float src_z = info->src.box.z + slice * src_z_step + + depth_center_offset; + + crocus_batch_maybe_flush(batch, 1500); + + blorp_blit(&blorp_batch, + &src_surf, info->src.level, src_z, + src_fmt.fmt, src_swiz, + &dst_surf, info->dst.level, dst_z, + dst_fmt.fmt, dst_swiz, + src_x0, src_y0, src_x1, src_y1, + dst_x0, dst_y0, dst_x1, dst_y1, + filter, mirror_x, mirror_y); + + } + + tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format); + + crocus_resource_finish_render(ice, dst_res, info->dst.level, + info->dst.box.z, info->dst.box.depth, + dst_aux_usage); + } + + blorp_batch_finish(&blorp_batch); + + crocus_flush_and_dirty_for_history(ice, batch, (struct crocus_resource *) + info->dst.resource, + PIPE_CONTROL_RENDER_TARGET_FLUSH, + "cache history: post-blit"); +} + +static void +get_copy_region_aux_settings(struct crocus_resource *res, + enum isl_aux_usage *out_aux_usage, + bool is_render_target) +{ + switch (res->aux.usage) { + case ISL_AUX_USAGE_MCS: + /* A stencil resolve operation must be performed prior to doing resource + * copies or used by CPU. + * (see HSD 1209978162) + */ + if (is_render_target && isl_surf_usage_is_stencil(res->surf.usage)) { + *out_aux_usage = ISL_AUX_USAGE_NONE; + } else { + *out_aux_usage = res->aux.usage; + } + break; + default: + *out_aux_usage = ISL_AUX_USAGE_NONE; + break; + } +} + +/** + * Perform a GPU-based raw memory copy between compatible view classes. + * + * Does not perform any flushing - the new data may still be left in the + * render cache, and old data may remain in other caches. + * + * Wraps blorp_copy() and blorp_buffer_copy(). 
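+ *
+ * Callers are expected to handle any flushing afterwards; for example,
+ * crocus_resource_copy_region() follows up with
+ * crocus_flush_and_dirty_for_history().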
+ */ +void +crocus_copy_region(struct blorp_context *blorp, + struct crocus_batch *batch, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct blorp_batch blorp_batch; + struct crocus_context *ice = blorp->driver_ctx; + struct crocus_screen *screen = (void *) ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_resource *src_res = (void *) src; + struct crocus_resource *dst_res = (void *) dst; + + if (devinfo->ver <= 5) { + if (screen->vtbl.copy_region_blt(batch, dst_res, + dst_level, dstx, dsty, dstz, + src_res, src_level, src_box)) + return; + } + enum isl_aux_usage src_aux_usage, dst_aux_usage; + get_copy_region_aux_settings(src_res, &src_aux_usage, + false); + get_copy_region_aux_settings(dst_res, &dst_aux_usage, + true); + + if (crocus_batch_references(batch, src_res->bo)) + tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format); + + if (dst->target == PIPE_BUFFER) + util_range_add(&dst_res->base, &dst_res->valid_buffer_range, dstx, dstx + src_box->width); + + if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { + struct blorp_address src_addr = { + .buffer = crocus_resource_bo(src), .offset = src_box->x, + }; + struct blorp_address dst_addr = { + .buffer = crocus_resource_bo(dst), .offset = dstx, + .reloc_flags = EXEC_OBJECT_WRITE, + }; + + crocus_batch_maybe_flush(batch, 1500); + + blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); + blorp_buffer_copy(&blorp_batch, src_addr, dst_addr, src_box->width); + blorp_batch_finish(&blorp_batch); + } else { + // XXX: what about one surface being a buffer and not the other? + + struct blorp_surf src_surf, dst_surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf, + src, src_aux_usage, src_level, false); + crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf, + dst, dst_aux_usage, dst_level, true); + + crocus_resource_prepare_access(ice, src_res, src_level, 1, + src_box->z, src_box->depth, + src_aux_usage, false); + crocus_resource_prepare_access(ice, dst_res, dst_level, 1, + dstz, src_box->depth, + dst_aux_usage, false); + + blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); + + for (int slice = 0; slice < src_box->depth; slice++) { + crocus_batch_maybe_flush(batch, 1500); + + blorp_copy(&blorp_batch, &src_surf, src_level, src_box->z + slice, + &dst_surf, dst_level, dstz + slice, + src_box->x, src_box->y, dstx, dsty, + src_box->width, src_box->height); + } + blorp_batch_finish(&blorp_batch); + + crocus_resource_finish_write(ice, dst_res, dst_level, dstz, + src_box->depth, dst_aux_usage); + } + + tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format); +} + +static struct crocus_batch * +get_preferred_batch(struct crocus_context *ice, struct crocus_bo *bo) +{ + /* If the compute batch is already using this buffer, we'd prefer to + * continue queueing in the compute batch. + */ + if (crocus_batch_references(&ice->batches[CROCUS_BATCH_COMPUTE], bo)) + return &ice->batches[CROCUS_BATCH_COMPUTE]; + + /* Otherwise default to the render batch. */ + return &ice->batches[CROCUS_BATCH_RENDER]; +} + + +/** + * The pipe->resource_copy_region() driver hook. + * + * This implements ARB_copy_image semantics - a raw memory copy between + * compatible view classes. 
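+ *
+ * Depending on the copy, this takes one of three paths: MI_COPY_MEM_MEM for
+ * tiny (<= 16 byte, dword-aligned) buffer copies when the vtbl provides
+ * copy_mem_mem, util_resource_copy_region() for gen4/5 depth/stencil, and
+ * crocus_copy_region() (blorp) otherwise, with an extra stencil copy for
+ * packed depth/stencil destinations on gen6+.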
+ */ +static void +crocus_resource_copy_region(struct pipe_context *ctx, + struct pipe_resource *p_dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *p_src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_resource *src = (void *) p_src; + struct crocus_resource *dst = (void *) p_dst; + + if (crocus_resource_unfinished_aux_import(src)) + crocus_resource_finish_aux_import(ctx->screen, src); + if (crocus_resource_unfinished_aux_import(dst)) + crocus_resource_finish_aux_import(ctx->screen, dst); + + /* Use MI_COPY_MEM_MEM for tiny (<= 16 byte, % 4) buffer copies. */ + if (p_src->target == PIPE_BUFFER && p_dst->target == PIPE_BUFFER && + (src_box->width % 4 == 0) && src_box->width <= 16 && + screen->vtbl.copy_mem_mem) { + struct crocus_bo *dst_bo = crocus_resource_bo(p_dst); + batch = get_preferred_batch(ice, dst_bo); + crocus_batch_maybe_flush(batch, 24 + 5 * (src_box->width / 4)); + crocus_emit_pipe_control_flush(batch, + "stall for MI_COPY_MEM_MEM copy_region", + PIPE_CONTROL_CS_STALL); + screen->vtbl.copy_mem_mem(batch, dst_bo, dstx, crocus_resource_bo(p_src), + src_box->x, src_box->width); + return; + } + + if (devinfo->ver < 6 && util_format_is_depth_or_stencil(p_dst->format)) { + util_resource_copy_region(ctx, p_dst, dst_level, dstx, dsty, dstz, + p_src, src_level, src_box); + return; + } + crocus_copy_region(&ice->blorp, batch, p_dst, dst_level, dstx, dsty, dstz, + p_src, src_level, src_box); + + if (util_format_is_depth_and_stencil(p_dst->format) && + util_format_has_stencil(util_format_description(p_src->format)) && + devinfo->ver >= 6) { + struct crocus_resource *junk, *s_src_res, *s_dst_res; + crocus_get_depth_stencil_resources(devinfo, p_src, &junk, &s_src_res); + crocus_get_depth_stencil_resources(devinfo, p_dst, &junk, &s_dst_res); + + crocus_copy_region(&ice->blorp, batch, &s_dst_res->base, dst_level, dstx, + dsty, dstz, &s_src_res->base, src_level, src_box); + } + + crocus_flush_and_dirty_for_history(ice, batch, dst, + PIPE_CONTROL_RENDER_TARGET_FLUSH, + "cache history: post copy_region"); +} + +void +crocus_init_blit_functions(struct pipe_context *ctx) +{ + ctx->blit = crocus_blit; + ctx->resource_copy_region = crocus_resource_copy_region; +} diff --git a/src/gallium/drivers/crocus/crocus_blorp.c b/src/gallium/drivers/crocus/crocus_blorp.c new file mode 100644 index 00000000000..75f0078d535 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_blorp.c @@ -0,0 +1,399 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_blorp.c + * + * ============================= GENXML CODE ============================= + * [This file is compiled once per generation.] + * ======================================================================= + * + * GenX specific code for working with BLORP (blitting, resolves, clears + * on the 3D engine). This provides the driver-specific hooks needed to + * implement the BLORP API. + * + * See crocus_blit.c, crocus_clear.c, and so on. + */ + +#include <assert.h> + +#include "crocus_batch.h" +#include "crocus_resource.h" +#include "crocus_context.h" + +#include "util/u_upload_mgr.h" +#include "intel/common/intel_l3_config.h" + +#include "blorp/blorp_genX_exec.h" + +#if GFX_VER <= 5 +#include "gen4_blorp_exec.h" +#endif + +static uint32_t * +stream_state(struct crocus_batch *batch, + unsigned size, + unsigned alignment, + uint32_t *out_offset, + struct crocus_bo **out_bo) +{ + uint32_t offset = ALIGN(batch->state.used, alignment); + + if (offset + size >= STATE_SZ && !batch->no_wrap) { + crocus_batch_flush(batch); + offset = ALIGN(batch->state.used, alignment); + } else if (offset + size >= batch->state.bo->size) { + const unsigned new_size = + MIN2(batch->state.bo->size + batch->state.bo->size / 2, + MAX_STATE_SIZE); + crocus_grow_buffer(batch, true, batch->state.used, new_size); + assert(offset + size < batch->state.bo->size); + } + + crocus_record_state_size(batch->state_sizes, offset, size); + + batch->state.used = offset + size; + *out_offset = offset; + + /* If the caller has asked for a BO, we leave them the responsibility of + * adding bo->gtt_offset (say, by handing an address to genxml). If not, + * we assume they want the offset from a base address. 
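+    *
+    * For example, blorp_alloc_dynamic_state() passes NULL here because a
+    * base-relative offset is all it needs, while blorp_alloc_vertex_buffer()
+    * asks for the BO so it can build a complete blorp_address.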
+ */ + if (out_bo) + *out_bo = batch->state.bo; + + return (uint32_t *)batch->state.map + (offset >> 2); +} + +static void * +blorp_emit_dwords(struct blorp_batch *blorp_batch, unsigned n) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + return crocus_get_command_space(batch, n * sizeof(uint32_t)); +} + +static uint64_t +blorp_emit_reloc(struct blorp_batch *blorp_batch, UNUSED void *location, + struct blorp_address addr, uint32_t delta) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + uint32_t offset; + + if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) { + offset = (char *)location - (char *)batch->state.map; + return crocus_state_reloc(batch, offset, + addr.buffer, addr.offset + delta, + addr.reloc_flags); + } + + assert(!crocus_ptr_in_state_buffer(batch, location)); + + offset = (char *)location - (char *)batch->command.map; + return crocus_command_reloc(batch, offset, + addr.buffer, addr.offset + delta, + addr.reloc_flags); +} + +static void +blorp_surface_reloc(struct blorp_batch *blorp_batch, uint32_t ss_offset, + struct blorp_address addr, uint32_t delta) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + struct crocus_bo *bo = addr.buffer; + + uint64_t reloc_val = + crocus_state_reloc(batch, ss_offset, bo, addr.offset + delta, + addr.reloc_flags); + + void *reloc_ptr = (void *)batch->state.map + ss_offset; + *(uint32_t *)reloc_ptr = reloc_val; +} + +static uint64_t +blorp_get_surface_address(struct blorp_batch *blorp_batch, + struct blorp_address addr) +{ + /* We'll let blorp_surface_reloc write the address. */ + return 0ull; +} + +#if GFX_VER >= 7 +static struct blorp_address +blorp_get_surface_base_address(struct blorp_batch *blorp_batch) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + return (struct blorp_address) { + .buffer = batch->state.bo, + .offset = 0 + }; +} +#endif + +static void * +blorp_alloc_dynamic_state(struct blorp_batch *blorp_batch, + uint32_t size, + uint32_t alignment, + uint32_t *offset) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + + return stream_state(batch, size, alignment, offset, NULL); +} + +static void +blorp_alloc_binding_table(struct blorp_batch *blorp_batch, + unsigned num_entries, + unsigned state_size, + unsigned state_alignment, + uint32_t *bt_offset, + uint32_t *surface_offsets, + void **surface_maps) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + uint32_t *bt_map = stream_state(batch, num_entries * sizeof(uint32_t), 32, + bt_offset, NULL); + + for (unsigned i = 0; i < num_entries; i++) { + surface_maps[i] = stream_state(batch, + state_size, state_alignment, + &(surface_offsets)[i], NULL); + bt_map[i] = surface_offsets[i]; + } +} + +static void * +blorp_alloc_vertex_buffer(struct blorp_batch *blorp_batch, + uint32_t size, + struct blorp_address *addr) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + struct crocus_bo *bo; + uint32_t offset; + + void *map = stream_state(batch, size, 64, + &offset, &bo); + + *addr = (struct blorp_address) { + .buffer = bo, + .offset = offset, + .reloc_flags = RELOC_32BIT, +#if GFX_VER >= 7 + .mocs = crocus_mocs(bo, &batch->screen->isl_dev), +#endif + }; + + return map; +} + +/** + */ +static void +blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *blorp_batch, + const struct blorp_address *addrs, + UNUSED uint32_t *sizes, + unsigned num_vbs) +{ +} + +static struct blorp_address +blorp_get_workaround_address(struct blorp_batch *blorp_batch) +{ + struct crocus_batch *batch = 
blorp_batch->driver_batch; + + return (struct blorp_address) { + .buffer = batch->ice->workaround_bo, + .offset = batch->ice->workaround_offset, + }; +} + +static void +blorp_flush_range(UNUSED struct blorp_batch *blorp_batch, + UNUSED void *start, + UNUSED size_t size) +{ + /* All allocated states come from the batch which we will flush before we + * submit it. There's nothing for us to do here. + */ +} + +#if GFX_VER >= 7 +static const struct intel_l3_config * +blorp_get_l3_config(struct blorp_batch *blorp_batch) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; + return batch->screen->l3_config_3d; +} +#else /* GFX_VER < 7 */ +static void +blorp_emit_urb_config(struct blorp_batch *blorp_batch, + unsigned vs_entry_size, + UNUSED unsigned sf_entry_size) +{ + struct crocus_batch *batch = blorp_batch->driver_batch; +#if GFX_VER <= 5 + batch->screen->vtbl.calculate_urb_fence(batch, 0, vs_entry_size, sf_entry_size); +#else + genX(upload_urb)(batch, vs_entry_size, false, vs_entry_size); +#endif +} +#endif + +static void +crocus_blorp_exec(struct blorp_batch *blorp_batch, + const struct blorp_params *params) +{ + struct crocus_context *ice = blorp_batch->blorp->driver_ctx; + struct crocus_batch *batch = blorp_batch->driver_batch; + + /* Flush the sampler and render caches. We definitely need to flush the + * sampler cache so that we get updated contents from the render cache for + * the glBlitFramebuffer() source. Also, we are sometimes warned in the + * docs to flush the cache between reinterpretations of the same surface + * data with different formats, which blorp does for stencil and depth + * data. + */ + if (params->src.enabled) + crocus_cache_flush_for_read(batch, params->src.addr.buffer); + if (params->dst.enabled) { + crocus_cache_flush_for_render(batch, params->dst.addr.buffer, + params->dst.view.format, + params->dst.aux_usage); + } + if (params->depth.enabled) + crocus_cache_flush_for_depth(batch, params->depth.addr.buffer); + if (params->stencil.enabled) + crocus_cache_flush_for_depth(batch, params->stencil.addr.buffer); + + crocus_require_command_space(batch, 1400); + crocus_require_statebuffer_space(batch, 600); + batch->no_wrap = true; +#if GFX_VER == 6 + /* Emit workaround flushes when we switch from drawing to blorping. */ + crocus_emit_post_sync_nonzero_flush(batch); +#endif + +#if GFX_VER >= 6 + crocus_emit_depth_stall_flushes(batch); +#endif + + blorp_emit(blorp_batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { + rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1; + rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1; + } + + batch->screen->vtbl.update_surface_base_address(batch); + crocus_handle_always_flush_cache(batch); + + batch->contains_draw = true; + blorp_exec(blorp_batch, params); + + batch->no_wrap = false; + crocus_handle_always_flush_cache(batch); + + /* We've smashed all state compared to what the normal 3D pipeline + * rendering tracks for GL. 
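+    * Flag (almost) everything dirty so it gets re-emitted, except for the
+    * bits collected in skip_bits/skip_stage_bits below, which BLORP does
+    * not touch.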
+ */ + + uint64_t skip_bits = (CROCUS_DIRTY_POLYGON_STIPPLE | + CROCUS_DIRTY_GEN7_SO_BUFFERS | + CROCUS_DIRTY_SO_DECL_LIST | + CROCUS_DIRTY_LINE_STIPPLE | + CROCUS_ALL_DIRTY_FOR_COMPUTE | + CROCUS_DIRTY_GEN6_SCISSOR_RECT | + CROCUS_DIRTY_GEN75_VF | + CROCUS_DIRTY_SF_CL_VIEWPORT); + + uint64_t skip_stage_bits = (CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE | + CROCUS_STAGE_DIRTY_UNCOMPILED_VS | + CROCUS_STAGE_DIRTY_UNCOMPILED_TCS | + CROCUS_STAGE_DIRTY_UNCOMPILED_TES | + CROCUS_STAGE_DIRTY_UNCOMPILED_GS | + CROCUS_STAGE_DIRTY_UNCOMPILED_FS | + CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS | + CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS | + CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES | + CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS); + + if (!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) { + /* BLORP disabled tessellation, that's fine for the next draw */ + skip_stage_bits |= CROCUS_STAGE_DIRTY_TCS | + CROCUS_STAGE_DIRTY_TES | + CROCUS_STAGE_DIRTY_CONSTANTS_TCS | + CROCUS_STAGE_DIRTY_CONSTANTS_TES | + CROCUS_STAGE_DIRTY_BINDINGS_TCS | + CROCUS_STAGE_DIRTY_BINDINGS_TES; + } + + if (!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) { + /* BLORP disabled geometry shaders, that's fine for the next draw */ + skip_stage_bits |= CROCUS_STAGE_DIRTY_GS | + CROCUS_STAGE_DIRTY_CONSTANTS_GS | + CROCUS_STAGE_DIRTY_BINDINGS_GS; + } + + /* we can skip flagging CROCUS_DIRTY_DEPTH_BUFFER, if + * BLORP_BATCH_NO_EMIT_DEPTH_STENCIL is set. + */ + if (blorp_batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) + skip_bits |= CROCUS_DIRTY_DEPTH_BUFFER; + + if (!params->wm_prog_data) + skip_bits |= CROCUS_DIRTY_GEN6_BLEND_STATE; + + ice->state.dirty |= ~skip_bits; + ice->state.stage_dirty |= ~skip_stage_bits; + + ice->urb.vsize = 0; + ice->urb.gs_present = false; + ice->urb.gsize = 0; + ice->urb.tess_present = false; + ice->urb.hsize = 0; + ice->urb.dsize = 0; + + if (params->dst.enabled) { + crocus_render_cache_add_bo(batch, params->dst.addr.buffer, + params->dst.view.format, + params->dst.aux_usage); + } + if (params->depth.enabled) + crocus_depth_cache_add_bo(batch, params->depth.addr.buffer); + if (params->stencil.enabled) + crocus_depth_cache_add_bo(batch, params->stencil.addr.buffer); +} + +static void +blorp_measure_start(struct blorp_batch *blorp_batch, + const struct blorp_params *params) +{ +} + +void +genX(init_blorp)(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + + blorp_init(&ice->blorp, ice, &screen->isl_dev); + ice->blorp.compiler = screen->compiler; + ice->blorp.lookup_shader = crocus_blorp_lookup_shader; + ice->blorp.upload_shader = crocus_blorp_upload_shader; + ice->blorp.exec = crocus_blorp_exec; +} diff --git a/src/gallium/drivers/crocus/crocus_blt.c b/src/gallium/drivers/crocus/crocus_blt.c new file mode 100644 index 00000000000..d27891352bd --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_blt.c @@ -0,0 +1,337 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* blt command encoding for gen4/5 */ +#include "crocus_context.h" + +#include "crocus_genx_macros.h" +#include "crocus_genx_protos.h" +#include "crocus_resource.h" + +#define FILE_DEBUG_FLAG DEBUG_BLIT + +#if GFX_VER <= 5 + +static bool validate_blit_for_blt(struct crocus_batch *batch, + const struct pipe_blit_info *info) +{ + /* If the source and destination are the same size with no mirroring, + * the rectangles are within the size of the texture and there is no + * scissor, then we can probably use the blit engine. + */ + if (info->dst.box.width != info->src.box.width || + info->dst.box.height != info->src.box.height) + return false; + + if (info->scissor_enable) + return false; + + if (info->dst.box.height < 0 || info->src.box.height < 0) + return false; + + if (info->dst.box.depth > 1 || info->src.box.depth > 1) + return false; + + return true; +} + +static inline int crocus_resource_blt_pitch(struct crocus_resource *res) +{ + int pitch = res->surf.row_pitch_B; + if (res->surf.tiling != ISL_TILING_LINEAR) + pitch /= 4; + return pitch; +} + +static uint32_t +color_depth_for_cpp(int cpp) +{ + switch (cpp) { + case 4: return COLOR_DEPTH__32bit; + case 2: return COLOR_DEPTH__565; + case 1: return COLOR_DEPTH__8bit; + default: + unreachable("not reached"); + } +} + +static bool emit_copy_blt(struct crocus_batch *batch, + struct crocus_resource *src, + struct crocus_resource *dst, + unsigned cpp, + int32_t src_pitch, + unsigned src_offset, + int32_t dst_pitch, + unsigned dst_offset, + uint16_t src_x, uint16_t src_y, + uint16_t dst_x, uint16_t dst_y, + uint16_t w, uint16_t h) + +{ + uint32_t src_tile_w, src_tile_h; + uint32_t dst_tile_w, dst_tile_h; + int dst_y2 = dst_y + h; + int dst_x2 = dst_x + w; + + DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", + __func__, + src, src_pitch, src_offset, src_x, src_y, + dst, dst_pitch, dst_offset, dst_x, dst_y, w, h); + + isl_get_tile_dims(src->surf.tiling, cpp, &src_tile_w, &src_tile_h); + isl_get_tile_dims(dst->surf.tiling, cpp, &dst_tile_w, &dst_tile_h); + + /* For Tiled surfaces, the pitch has to be a multiple of the Tile width + * (X direction width of the Tile). This is ensured while allocating the + * buffer object. + */ + assert(src->surf.tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0); + assert(dst->surf.tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0); + + /* For big formats (such as floating point), do the copy using 16 or + * 32bpp and multiply the coordinates. + */ + if (cpp > 4) { + if (cpp % 4 == 2) { + dst_x *= cpp / 2; + dst_x2 *= cpp / 2; + src_x *= cpp / 2; + cpp = 2; + } else { + assert(cpp % 4 == 0); + dst_x *= cpp / 4; + dst_x2 *= cpp / 4; + src_x *= cpp / 4; + cpp = 4; + } + } + + /* For tiled source and destination, pitch value should be specified + * as a number of Dwords. 
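+    *
+    * e.g. a tiled surface with an 8192-byte row pitch is programmed with a
+    * pitch of 2048 dwords below.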
+ */ + if (dst->surf.tiling != ISL_TILING_LINEAR) + dst_pitch /= 4; + + if (src->surf.tiling != ISL_TILING_LINEAR) + src_pitch /= 4; + + assert(cpp <= 4); + crocus_emit_cmd(batch, GENX(XY_SRC_COPY_BLT), xyblt) { + xyblt.RasterOperation = 0xCC; + xyblt.DestinationTilingEnable = dst->surf.tiling != ISL_TILING_LINEAR; + xyblt.SourceTilingEnable = src->surf.tiling != ISL_TILING_LINEAR; + xyblt.SourceBaseAddress = ro_bo(src->bo, src_offset); + xyblt.DestinationBaseAddress = rw_bo(dst->bo, dst_offset); + xyblt.ColorDepth = color_depth_for_cpp(cpp); + xyblt._32bppByteMask = cpp == 4 ? 0x3 : 0x1; + xyblt.DestinationX1Coordinate = dst_x; + xyblt.DestinationY1Coordinate = dst_y; + xyblt.DestinationX2Coordinate = dst_x2; + xyblt.DestinationY2Coordinate = dst_y2; + xyblt.DestinationPitch = dst_pitch; + xyblt.SourceX1Coordinate = src_x; + xyblt.SourceY1Coordinate = src_y; + xyblt.SourcePitch = src_pitch; + }; + + crocus_emit_mi_flush(batch); + return true; +} + +static bool crocus_emit_blt(struct crocus_batch *batch, + struct crocus_resource *src, + struct crocus_resource *dst, + unsigned dst_level, + unsigned dst_x, unsigned dst_y, + unsigned dst_z, + unsigned src_level, + const struct pipe_box *src_box) +{ + const struct isl_format_layout *src_fmtl = isl_format_get_layout(src->surf.format); + unsigned src_cpp = src_fmtl->bpb / 8; + const struct isl_format_layout *dst_fmtl = isl_format_get_layout(dst->surf.format); + const unsigned dst_cpp = dst_fmtl->bpb / 8; + uint16_t src_x, src_y; + uint32_t src_image_x, src_image_y, dst_image_x, dst_image_y; + uint32_t src_width = src_box->width, src_height = src_box->height; + + /* gen4/5 can't handle Y tiled blits. */ + if (src->surf.tiling == ISL_TILING_Y0 || dst->surf.tiling == ISL_TILING_Y0) + return false; + + if (src->surf.format != dst->surf.format) + return false; + + if (src_cpp != dst_cpp) + return false; + + src_x = src_box->x; + src_y = src_box->y; + + assert(src_cpp == dst_cpp); + + crocus_resource_get_image_offset(src, src_level, src_box->z, &src_image_x, + &src_image_y); + if (util_format_is_compressed(src->base.format)) { + int bw = util_format_get_blockwidth(src->base.format); + int bh = util_format_get_blockheight(src->base.format); + assert(src_x % bw == 0); + assert(src_y % bh == 0); + src_x /= (int)bw; + src_y /= (int)bh; + src_width = DIV_ROUND_UP(src_width, (int)bw); + src_height = DIV_ROUND_UP(src_height, (int)bh); + } + + crocus_resource_get_image_offset(dst, dst_level, dst_z, &dst_image_x, + &dst_image_y); + if (util_format_is_compressed(dst->base.format)) { + int bw = util_format_get_blockwidth(dst->base.format); + int bh = util_format_get_blockheight(dst->base.format); + assert(dst_x % bw == 0); + assert(dst_y % bh == 0); + dst_x /= (int)bw; + dst_y /= (int)bh; + } + src_x += src_image_x; + src_y += src_image_y; + dst_x += dst_image_x; + dst_y += dst_image_y; + + /* According to the Ivy Bridge PRM, Vol1 Part4, section 1.2.1.2 (Graphics + * Data Size Limitations): + * + * The BLT engine is capable of transferring very large quantities of + * graphics data. Any graphics data read from and written to the + * destination is permitted to represent a number of pixels that + * occupies up to 65,536 scan lines and up to 32,768 bytes per scan line + * at the destination. The maximum number of pixels that may be + * represented per scan line’s worth of graphics data depends on the + * color depth. + * + * The blitter's pitch is a signed 16-bit integer, but measured in bytes + * for linear surfaces and DWords for tiled surfaces. 
So the maximum + * pitch is 32k linear and 128k tiled. + */ + if (crocus_resource_blt_pitch(src) >= 32768 || + crocus_resource_blt_pitch(dst) >= 32768) { + return false; + } + + /* We need to split the blit into chunks that each fit within the blitter's + * restrictions. We can't use a chunk size of 32768 because we need to + * ensure that src_tile_x + chunk_size fits. We choose 16384 because it's + * a nice round power of two, big enough that performance won't suffer, and + * small enough to guarantee everything fits. + */ + const uint32_t max_chunk_size = 16384; + + for (uint32_t chunk_x = 0; chunk_x < src_width; chunk_x += max_chunk_size) { + for (uint32_t chunk_y = 0; chunk_y < src_height; chunk_y += max_chunk_size) { + const uint32_t chunk_w = MIN2(max_chunk_size, src_width - chunk_x); + const uint32_t chunk_h = MIN2(max_chunk_size, src_height - chunk_y); + + ASSERTED uint32_t z_offset_el, array_offset; + uint32_t src_offset, src_tile_x, src_tile_y; + isl_tiling_get_intratile_offset_el(src->surf.tiling, + src_cpp * 8, src->surf.row_pitch_B, + src->surf.array_pitch_el_rows, + src_x + chunk_x, src_y + chunk_y, 0, 0, + &src_offset, + &src_tile_x, &src_tile_y, + &z_offset_el, &array_offset); + assert(z_offset_el == 0); + assert(array_offset == 0); + + uint32_t dst_offset, dst_tile_x, dst_tile_y; + isl_tiling_get_intratile_offset_el(dst->surf.tiling, + dst_cpp * 8, dst->surf.row_pitch_B, + dst->surf.array_pitch_el_rows, + dst_x + chunk_x, dst_y + chunk_y, 0, 0, + &dst_offset, + &dst_tile_x, &dst_tile_y, + &z_offset_el, &array_offset); + assert(z_offset_el == 0); + assert(array_offset == 0); + if (!emit_copy_blt(batch, src, dst, + src_cpp, src->surf.row_pitch_B, + src_offset, + dst->surf.row_pitch_B, dst_offset, + src_tile_x, src_tile_y, + dst_tile_x, dst_tile_y, + chunk_w, chunk_h)) { + return false; + } + } + } + return true; +} + +static bool crocus_blit_blt(struct crocus_batch *batch, + const struct pipe_blit_info *info) +{ + if (!validate_blit_for_blt(batch, info)) + return false; + + return crocus_emit_blt(batch, + (struct crocus_resource *)info->src.resource, + (struct crocus_resource *)info->dst.resource, + info->dst.level, + info->dst.box.x, + info->dst.box.y, + info->dst.box.z, + info->src.level, + &info->src.box); +} + + +static bool crocus_copy_region_blt(struct crocus_batch *batch, + struct crocus_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct crocus_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + if (dst->base.target == PIPE_BUFFER || src->base.target == PIPE_BUFFER) + return false; + return crocus_emit_blt(batch, + src, + dst, + dst_level, + dstx, dsty, dstz, + src_level, + src_box); +} +#endif + +void +genX(init_blt)(struct crocus_screen *screen) +{ +#if GFX_VER <= 5 + screen->vtbl.blit_blt = crocus_blit_blt; + screen->vtbl.copy_region_blt = crocus_copy_region_blt; +#else + screen->vtbl.blit_blt = NULL; + screen->vtbl.copy_region_blt = NULL; +#endif +} diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.c b/src/gallium/drivers/crocus/crocus_bufmgr.c new file mode 100644 index 00000000000..caca821cd7e --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_bufmgr.c @@ -0,0 +1,1689 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, 
publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_bufmgr.c + * + * The crocus buffer manager. + * + * XXX: write better comments + * - BOs + * - Explain BO cache + * - main interface to GEM in the kernel + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <xf86drm.h> +#include <util/u_atomic.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <stdbool.h> +#include <time.h> + +#include "errno.h" +#include "common/intel_clflush.h" +#include "dev/intel_debug.h" +#include "common/intel_gem.h" +#include "dev/intel_device_info.h" +#include "main/macros.h" +#include "util/debug.h" +#include "util/macros.h" +#include "util/hash_table.h" +#include "util/list.h" +#include "util/os_file.h" +#include "util/u_dynarray.h" +#include "util/vma.h" +#include "crocus_bufmgr.h" +#include "crocus_context.h" +#include "string.h" + +#include "drm-uapi/i915_drm.h" + +#ifdef HAVE_VALGRIND +#include <valgrind.h> +#include <memcheck.h> +#define VG(x) x +#else +#define VG(x) +#endif + +/** + * For debugging purposes, this returns a time in seconds. + */ +static double +get_time(void) +{ + struct timespec tp; + + clock_gettime(CLOCK_MONOTONIC, &tp); + + return tp.tv_sec + tp.tv_nsec / 1000000000.0; +} + +/* VALGRIND_FREELIKE_BLOCK unfortunately does not actually undo the earlier + * VALGRIND_MALLOCLIKE_BLOCK but instead leaves vg convinced the memory is + * leaked. All because it does not call VG(cli_free) from its + * VG_USERREQ__FREELIKE_BLOCK handler. Instead of treating the memory like + * and allocation, we mark it available for use upon mmapping and remove + * it upon unmapping. + */ +#define VG_DEFINED(ptr, size) VG(VALGRIND_MAKE_MEM_DEFINED(ptr, size)) +#define VG_NOACCESS(ptr, size) VG(VALGRIND_MAKE_MEM_NOACCESS(ptr, size)) + +#define PAGE_SIZE 4096 + +#define WARN_ONCE(cond, fmt...) do { \ + if (unlikely(cond)) { \ + static bool _warned = false; \ + if (!_warned) { \ + fprintf(stderr, "WARNING: "); \ + fprintf(stderr, fmt); \ + _warned = true; \ + } \ + } \ +} while (0) + +#define FILE_DEBUG_FLAG DEBUG_BUFMGR + +static inline int +atomic_add_unless(int *v, int add, int unless) +{ + int c, old; + c = p_atomic_read(v); + while (c != unless && (old = p_atomic_cmpxchg(v, c, c + add)) != c) + c = old; + return c == unless; +} + +struct bo_cache_bucket { + /** List of cached BOs. */ + struct list_head head; + + /** Size of this bucket, in bytes. */ + uint64_t size; +}; + +struct bo_export { + /** File descriptor associated with a handle export. 
*/ + int drm_fd; + + /** GEM handle in drm_fd */ + uint32_t gem_handle; + + struct list_head link; +}; + +struct crocus_bufmgr { + /** + * List into the list of bufmgr. + */ + struct list_head link; + + uint32_t refcount; + + int fd; + + mtx_t lock; + + /** Array of lists of cached gem objects of power-of-two sizes */ + struct bo_cache_bucket cache_bucket[14 * 4]; + int num_buckets; + time_t time; + + struct hash_table *name_table; + struct hash_table *handle_table; + + /** + * List of BOs which we've effectively freed, but are hanging on to + * until they're idle before closing and returning the VMA. + */ + struct list_head zombie_list; + + bool has_llc:1; + bool has_mmap_offset:1; + bool has_tiling_uapi:1; + bool bo_reuse:1; +}; + +static mtx_t global_bufmgr_list_mutex = _MTX_INITIALIZER_NP; +static struct list_head global_bufmgr_list = { + .next = &global_bufmgr_list, + .prev = &global_bufmgr_list, +}; + +static int bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode, + uint32_t stride); + +static void bo_free(struct crocus_bo *bo); + +static uint32_t +key_hash_uint(const void *key) +{ + return _mesa_hash_data(key, 4); +} + +static bool +key_uint_equal(const void *a, const void *b) +{ + return *((unsigned *) a) == *((unsigned *) b); +} + +static struct crocus_bo * +find_and_ref_external_bo(struct hash_table *ht, unsigned int key) +{ + struct hash_entry *entry = _mesa_hash_table_search(ht, &key); + struct crocus_bo *bo = entry ? entry->data : NULL; + + if (bo) { + assert(bo->external); + assert(!bo->reusable); + + /* Being non-reusable, the BO cannot be in the cache lists, but it + * may be in the zombie list if it had reached zero references, but + * we hadn't yet closed it...and then reimported the same BO. If it + * is, then remove it since it's now been resurrected. + */ + if (bo->head.prev || bo->head.next) + list_del(&bo->head); + + crocus_bo_reference(bo); + } + + return bo; +} + +/** + * This function finds the correct bucket fit for the input size. + * The function works with O(1) complexity when the requested size + * was queried instead of iterating the size through all the buckets. + */ +static struct bo_cache_bucket * +bucket_for_size(struct crocus_bufmgr *bufmgr, uint64_t size) +{ + /* Calculating the pages and rounding up to the page size. */ + const unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + + /* Row Bucket sizes clz((x-1) | 3) Row Column + * in pages stride size + * 0: 1 2 3 4 -> 30 30 30 30 4 1 + * 1: 5 6 7 8 -> 29 29 29 29 4 1 + * 2: 10 12 14 16 -> 28 28 28 28 8 2 + * 3: 20 24 28 32 -> 27 27 27 27 16 4 + */ + const unsigned row = 30 - __builtin_clz((pages - 1) | 3); + const unsigned row_max_pages = 4 << row; + + /* The '& ~2' is the special case for row 1. In row 1, max pages / + * 2 is 2, but the previous row maximum is zero (because there is + * no previous row). All row maximum sizes are power of 2, so that + * is the only case where that bit will be set. + */ + const unsigned prev_row_max_pages = (row_max_pages / 2) & ~2; + int col_size_log2 = row - 1; + col_size_log2 += (col_size_log2 < 0); + + const unsigned col = (pages - prev_row_max_pages + + ((1 << col_size_log2) - 1)) >> col_size_log2; + + /* Calculating the index based on the row and column. */ + const unsigned index = (row * 4) + (col - 1); + + return (index < bufmgr->num_buckets) ? 
+ &bufmgr->cache_bucket[index] : NULL; +} + + +int +crocus_bo_busy(struct crocus_bo *bo) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + struct drm_i915_gem_busy busy = { .handle = bo->gem_handle }; + + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_BUSY, &busy); + if (ret == 0) { + bo->idle = !busy.busy; + return busy.busy; + } + return false; +} + +int +crocus_bo_madvise(struct crocus_bo *bo, int state) +{ + struct drm_i915_gem_madvise madv = { + .handle = bo->gem_handle, + .madv = state, + .retained = 1, + }; + + intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_MADVISE, &madv); + + return madv.retained; +} + +static struct crocus_bo * +bo_calloc(void) +{ + struct crocus_bo *bo = calloc(1, sizeof(*bo)); + if (!bo) + return NULL; + + list_inithead(&bo->exports); + bo->hash = _mesa_hash_pointer(bo); + return bo; +} + +static struct crocus_bo * +alloc_bo_from_cache(struct crocus_bufmgr *bufmgr, + struct bo_cache_bucket *bucket, + uint32_t alignment, + unsigned flags) +{ + if (!bucket) + return NULL; + + struct crocus_bo *bo = NULL; + + list_for_each_entry_safe(struct crocus_bo, cur, &bucket->head, head) { + /* If the last BO in the cache is busy, there are no idle BOs. Bail, + * either falling back to a non-matching memzone, or if that fails, + * allocating a fresh buffer. + */ + if (crocus_bo_busy(cur)) + return NULL; + + list_del(&cur->head); + + /* Tell the kernel we need this BO. If it still exists, we're done! */ + if (crocus_bo_madvise(cur, I915_MADV_WILLNEED)) { + bo = cur; + break; + } + + /* This BO was purged, throw it out and keep looking. */ + bo_free(cur); + } + + if (!bo) + return NULL; + + /* Zero the contents if necessary. If this fails, fall back to + * allocating a fresh BO, which will always be zeroed by the kernel. + */ + if (flags & BO_ALLOC_ZEROED) { + void *map = crocus_bo_map(NULL, bo, MAP_WRITE | MAP_RAW); + if (map) { + memset(map, 0, bo->size); + } else { + bo_free(bo); + return NULL; + } + } + + return bo; +} + +static struct crocus_bo * +alloc_fresh_bo(struct crocus_bufmgr *bufmgr, uint64_t bo_size) +{ + struct crocus_bo *bo = bo_calloc(); + if (!bo) + return NULL; + + struct drm_i915_gem_create create = { .size = bo_size }; + + /* All new BOs we get from the kernel are zeroed, so we don't need to + * worry about that here. + */ + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CREATE, &create) != 0) { + free(bo); + return NULL; + } + + bo->gem_handle = create.handle; + bo->bufmgr = bufmgr; + bo->size = bo_size; + bo->idle = true; + bo->tiling_mode = I915_TILING_NONE; + bo->swizzle_mode = I915_BIT_6_SWIZZLE_NONE; + bo->stride = 0; + + /* Calling set_domain() will allocate pages for the BO outside of the + * struct mutex lock in the kernel, which is more efficient than waiting + * to create them during the first execbuf that uses the BO. + */ + struct drm_i915_gem_set_domain sd = { + .handle = bo->gem_handle, + .read_domains = I915_GEM_DOMAIN_CPU, + .write_domain = 0, + }; + + if (intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) { + bo_free(bo); + return NULL; + } + + return bo; +} + +static struct crocus_bo * +bo_alloc_internal(struct crocus_bufmgr *bufmgr, + const char *name, + uint64_t size, + uint32_t alignment, + unsigned flags, + uint32_t tiling_mode, + uint32_t stride) +{ + struct crocus_bo *bo; + unsigned int page_size = getpagesize(); + struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size); + + /* Round the size up to the bucket size, or if we don't have caching + * at this size, a multiple of the page size. 
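+    *
+    * For example, a 72 KiB (18 page) request gives:
+    *
+    *    row = 30 - clz((18 - 1) | 3) = 3
+    *    col = (18 - 16 + 3) >> 2     = 1
+    *
+    * which per the table in bucket_for_size() is the 20-page bucket, so
+    * bo_size becomes 80 KiB.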
+ */ + uint64_t bo_size = + bucket ? bucket->size : MAX2(ALIGN(size, page_size), page_size); + + mtx_lock(&bufmgr->lock); + + /* Get a buffer out of the cache if available. First, we try to find + * one with a matching memory zone so we can avoid reallocating VMA. + */ + bo = alloc_bo_from_cache(bufmgr, bucket, alignment, flags); + + mtx_unlock(&bufmgr->lock); + + if (!bo) { + bo = alloc_fresh_bo(bufmgr, bo_size); + if (!bo) + return NULL; + } + + if (bo_set_tiling_internal(bo, tiling_mode, stride)) + goto err_free; + + bo->name = name; + p_atomic_set(&bo->refcount, 1); + bo->reusable = bucket && bufmgr->bo_reuse; + bo->cache_coherent = bufmgr->has_llc; + bo->index = -1; + bo->kflags = 0; + + if ((flags & BO_ALLOC_COHERENT) && !bo->cache_coherent) { + struct drm_i915_gem_caching arg = { + .handle = bo->gem_handle, + .caching = 1, + }; + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) == 0) { + bo->cache_coherent = true; + bo->reusable = false; + } + } + + DBG("bo_create: buf %d (%s) %llub\n", bo->gem_handle, + bo->name, (unsigned long long) size); + + return bo; + +err_free: + bo_free(bo); + return NULL; +} + +struct crocus_bo * +crocus_bo_alloc(struct crocus_bufmgr *bufmgr, + const char *name, + uint64_t size) +{ + return bo_alloc_internal(bufmgr, name, size, 1, + 0, I915_TILING_NONE, 0); +} + +struct crocus_bo * +crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr, const char *name, + uint64_t size, uint32_t alignment, + uint32_t tiling_mode, uint32_t pitch, unsigned flags) +{ + return bo_alloc_internal(bufmgr, name, size, alignment, + flags, tiling_mode, pitch); +} + +struct crocus_bo * +crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr, const char *name, + void *ptr, size_t size) +{ + struct crocus_bo *bo; + + bo = bo_calloc(); + if (!bo) + return NULL; + + struct drm_i915_gem_userptr arg = { + .user_ptr = (uintptr_t)ptr, + .user_size = size, + }; + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_USERPTR, &arg)) + goto err_free; + bo->gem_handle = arg.handle; + + /* Check the buffer for validity before we try and use it in a batch */ + struct drm_i915_gem_set_domain sd = { + .handle = bo->gem_handle, + .read_domains = I915_GEM_DOMAIN_CPU, + }; + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd)) + goto err_close; + + bo->name = name; + bo->size = size; + bo->map_cpu = ptr; + + bo->bufmgr = bufmgr; + bo->kflags = 0; + + if (bo->gtt_offset == 0ull) + goto err_close; + + p_atomic_set(&bo->refcount, 1); + bo->userptr = true; + bo->cache_coherent = true; + bo->index = -1; + bo->idle = true; + + return bo; + +err_close: + intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &bo->gem_handle); +err_free: + free(bo); + return NULL; +} + +/** + * Returns a crocus_bo wrapping the given buffer object handle. + * + * This can be used when one application needs to pass a buffer object + * to another. + */ +struct crocus_bo * +crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr, + const char *name, unsigned int handle) +{ + struct crocus_bo *bo; + + /* At the moment most applications only have a few named bo. + * For instance, in a DRI client only the render buffers passed + * between X and the client are named. And since X returns the + * alternating names for the front/back buffer a linear search + * provides a sufficiently fast match. 
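+    *
+    * (crocus tracks these in bufmgr->name_table, so the lookup below is a
+    * hash table search via find_and_ref_external_bo() rather than a literal
+    * list walk.)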
+ */ + mtx_lock(&bufmgr->lock); + bo = find_and_ref_external_bo(bufmgr->name_table, handle); + if (bo) + goto out; + + struct drm_gem_open open_arg = { .name = handle }; + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_OPEN, &open_arg); + if (ret != 0) { + DBG("Couldn't reference %s handle 0x%08x: %s\n", + name, handle, strerror(errno)); + bo = NULL; + goto out; + } + /* Now see if someone has used a prime handle to get this + * object from the kernel before by looking through the list + * again for a matching gem_handle + */ + bo = find_and_ref_external_bo(bufmgr->handle_table, open_arg.handle); + if (bo) + goto out; + + bo = bo_calloc(); + if (!bo) + goto out; + + p_atomic_set(&bo->refcount, 1); + + bo->size = open_arg.size; + bo->gtt_offset = 0; + bo->bufmgr = bufmgr; + bo->gem_handle = open_arg.handle; + bo->name = name; + bo->global_name = handle; + bo->reusable = false; + bo->external = true; + bo->kflags = 0; + + _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); + _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo); + + struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle }; + ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling); + if (ret != 0) + goto err_unref; + + bo->tiling_mode = get_tiling.tiling_mode; + bo->swizzle_mode = get_tiling.swizzle_mode; + /* XXX stride is unknown */ + DBG("bo_create_from_handle: %d (%s)\n", handle, bo->name); + +out: + mtx_unlock(&bufmgr->lock); + return bo; + +err_unref: + bo_free(bo); + mtx_unlock(&bufmgr->lock); + return NULL; +} + +static void +bo_close(struct crocus_bo *bo) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (bo->external) { + struct hash_entry *entry; + + if (bo->global_name) { + entry = _mesa_hash_table_search(bufmgr->name_table, &bo->global_name); + _mesa_hash_table_remove(bufmgr->name_table, entry); + } + + entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle); + _mesa_hash_table_remove(bufmgr->handle_table, entry); + } + + /* Close this object */ + struct drm_gem_close close = { .handle = bo->gem_handle }; + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &close); + if (ret != 0) { + DBG("DRM_IOCTL_GEM_CLOSE %d failed (%s): %s\n", + bo->gem_handle, bo->name, strerror(errno)); + } + + free(bo); +} + +static void +bo_free(struct crocus_bo *bo) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (bo->map_cpu && !bo->userptr) { + VG_NOACCESS(bo->map_cpu, bo->size); + munmap(bo->map_cpu, bo->size); + } + if (bo->map_wc) { + VG_NOACCESS(bo->map_wc, bo->size); + munmap(bo->map_wc, bo->size); + } + if (bo->map_gtt) { + VG_NOACCESS(bo->map_gtt, bo->size); + munmap(bo->map_gtt, bo->size); + } + + if (bo->idle) { + bo_close(bo); + } else { + /* Defer closing the GEM BO and returning the VMA for reuse until the + * BO is idle. Just move it to the dead list for now. + */ + list_addtail(&bo->head, &bufmgr->zombie_list); + } +} + +/** Frees all cached buffers significantly older than @time. 
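+ * It also reaps the zombie list: BOs that were freed while still busy wait
+ * there until they are idle, at which point their GEM handles are closed.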
*/ +static void +cleanup_bo_cache(struct crocus_bufmgr *bufmgr, time_t time) +{ + int i; + + if (bufmgr->time == time) + return; + + for (i = 0; i < bufmgr->num_buckets; i++) { + struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i]; + + list_for_each_entry_safe(struct crocus_bo, bo, &bucket->head, head) { + if (time - bo->free_time <= 1) + break; + + list_del(&bo->head); + + bo_free(bo); + } + } + + list_for_each_entry_safe(struct crocus_bo, bo, &bufmgr->zombie_list, head) { + /* Stop once we reach a busy BO - all others past this point were + * freed more recently so are likely also busy. + */ + if (!bo->idle && crocus_bo_busy(bo)) + break; + + list_del(&bo->head); + bo_close(bo); + } + + bufmgr->time = time; +} + +static void +bo_unreference_final(struct crocus_bo *bo, time_t time) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + struct bo_cache_bucket *bucket; + + DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name); + + bucket = NULL; + if (bo->reusable) + bucket = bucket_for_size(bufmgr, bo->size); + /* Put the buffer into our internal cache for reuse if we can. */ + if (bucket && crocus_bo_madvise(bo, I915_MADV_DONTNEED)) { + bo->free_time = time; + bo->name = NULL; + + list_addtail(&bo->head, &bucket->head); + } else { + bo_free(bo); + } +} + +void +crocus_bo_unreference(struct crocus_bo *bo) +{ + if (bo == NULL) + return; + + assert(p_atomic_read(&bo->refcount) > 0); + + if (atomic_add_unless(&bo->refcount, -1, 1)) { + struct crocus_bufmgr *bufmgr = bo->bufmgr; + struct timespec time; + + clock_gettime(CLOCK_MONOTONIC, &time); + + mtx_lock(&bufmgr->lock); + + if (p_atomic_dec_zero(&bo->refcount)) { + bo_unreference_final(bo, time.tv_sec); + cleanup_bo_cache(bufmgr, time.tv_sec); + } + + mtx_unlock(&bufmgr->lock); + } +} + +static void +bo_wait_with_stall_warning(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, + const char *action) +{ + bool busy = dbg && !bo->idle; + double elapsed = unlikely(busy) ? -get_time() : 0.0; + + crocus_bo_wait_rendering(bo); + + if (unlikely(busy)) { + elapsed += get_time(); + if (elapsed > 1e-5) /* 0.01ms */ { + perf_debug(dbg, "%s a busy \"%s\" BO stalled and took %.03f ms.\n", + action, bo->name, elapsed * 1000); + } + } +} + +static void +print_flags(unsigned flags) +{ + if (flags & MAP_READ) + DBG("READ "); + if (flags & MAP_WRITE) + DBG("WRITE "); + if (flags & MAP_ASYNC) + DBG("ASYNC "); + if (flags & MAP_PERSISTENT) + DBG("PERSISTENT "); + if (flags & MAP_COHERENT) + DBG("COHERENT "); + if (flags & MAP_RAW) + DBG("RAW "); + DBG("\n"); +} + +static void * +crocus_bo_gem_mmap_legacy(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, bool wc) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + struct drm_i915_gem_mmap mmap_arg = { + .handle = bo->gem_handle, + .size = bo->size, + .flags = wc ? I915_MMAP_WC : 0, + }; + + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg); + if (ret != 0) { + DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + void *map = (void *) (uintptr_t) mmap_arg.addr_ptr; + + return map; +} + +static void * +crocus_bo_gem_mmap_offset(struct pipe_debug_callback *dbg, struct crocus_bo *bo, + bool wc) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + struct drm_i915_gem_mmap_offset mmap_arg = { + .handle = bo->gem_handle, + .flags = wc ? 
I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB, + }; + + /* Get the fake offset back */ + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &mmap_arg); + if (ret != 0) { + DBG("%s:%d: Error preparing buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + /* And map it */ + void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, + bufmgr->fd, mmap_arg.offset); + if (map == MAP_FAILED) { + DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + return map; +} + +static void * +crocus_bo_gem_mmap(struct pipe_debug_callback *dbg, struct crocus_bo *bo, bool wc) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (bufmgr->has_mmap_offset) + return crocus_bo_gem_mmap_offset(dbg, bo, wc); + else + return crocus_bo_gem_mmap_legacy(dbg, bo, wc); +} + +static void * +crocus_bo_map_cpu(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags) +{ + /* We disallow CPU maps for writing to non-coherent buffers, as the + * CPU map can become invalidated when a batch is flushed out, which + * can happen at unpredictable times. You should use WC maps instead. + */ + assert(bo->cache_coherent || !(flags & MAP_WRITE)); + + if (!bo->map_cpu) { + DBG("crocus_bo_map_cpu: %d (%s)\n", bo->gem_handle, bo->name); + + void *map = crocus_bo_gem_mmap(dbg, bo, false); + if (!map) { + return NULL; + } + + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_cpu, NULL, map)) { + VG_NOACCESS(map, bo->size); + munmap(map, bo->size); + } + } + assert(bo->map_cpu); + + DBG("crocus_bo_map_cpu: %d (%s) -> %p, ", bo->gem_handle, bo->name, + bo->map_cpu); + print_flags(flags); + + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(dbg, bo, "CPU mapping"); + } + + if (!bo->cache_coherent && !bo->bufmgr->has_llc) { + /* If we're reusing an existing CPU mapping, the CPU caches may + * contain stale data from the last time we read from that mapping. + * (With the BO cache, it might even be data from a previous buffer!) + * Even if it's a brand new mapping, the kernel may have zeroed the + * buffer via CPU writes. + * + * We need to invalidate those cachelines so that we see the latest + * contents, and so long as we only read from the CPU mmap we do not + * need to write those cachelines back afterwards. + * + * On LLC, the emprical evidence suggests that writes from the GPU + * that bypass the LLC (i.e. for scanout) do *invalidate* the CPU + * cachelines. (Other reads, such as the display engine, bypass the + * LLC entirely requiring us to keep dirty pixels for the scanout + * out of any cache.) + */ + intel_invalidate_range(bo->map_cpu, bo->size); + } + + return bo->map_cpu; +} + +static void * +crocus_bo_map_wc(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags) +{ + if (!bo->map_wc) { + DBG("crocus_bo_map_wc: %d (%s)\n", bo->gem_handle, bo->name); + + void *map = crocus_bo_gem_mmap(dbg, bo, true); + if (!map) { + return NULL; + } + + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) { + VG_NOACCESS(map, bo->size); + munmap(map, bo->size); + } + } + assert(bo->map_wc); + + DBG("crocus_bo_map_wc: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->map_wc); + print_flags(flags); + + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(dbg, bo, "WC mapping"); + } + + return bo->map_wc; +} + +/** + * Perform an uncached mapping via the GTT. 
+ * + * Write access through the GTT is not quite fully coherent. On low power + * systems especially, like modern Atoms, we can observe reads from RAM before + * the write via GTT has landed. A write memory barrier that flushes the Write + * Combining Buffer (i.e. sfence/mfence) is not sufficient to order the later + * read after the write as the GTT write suffers a small delay through the GTT + * indirection. The kernel uses an uncached mmio read to ensure the GTT write + * is ordered with reads (either by the GPU, WB or WC) and unconditionally + * flushes prior to execbuf submission. However, if we are not informing the + * kernel about our GTT writes, it will not flush before earlier access, such + * as when using the cmdparser. Similarly, we need to be careful if we should + * ever issue a CPU read immediately following a GTT write. + * + * Telling the kernel about write access also has one more important + * side-effect. Upon receiving notification about the write, it cancels any + * scanout buffering for FBC/PSR and friends. Later FBC/PSR is then flushed by + * either SW_FINISH or DIRTYFB. The presumption is that we never write to the + * actual scanout via a mmaping, only to a backbuffer and so all the FBC/PSR + * tracking is handled on the buffer exchange instead. + */ +static void * +crocus_bo_map_gtt(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + /* If we don't support get/set_tiling, there's no support for GTT mapping + * either (it won't do any de-tiling for us). + */ + assert(bufmgr->has_tiling_uapi); + + /* Get a mapping of the buffer if we haven't before. */ + if (bo->map_gtt == NULL) { + DBG("bo_map_gtt: mmap %d (%s)\n", bo->gem_handle, bo->name); + + struct drm_i915_gem_mmap_gtt mmap_arg = { .handle = bo->gem_handle }; + + /* Get the fake offset back... */ + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg); + if (ret != 0) { + DBG("%s:%d: Error preparing buffer map %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + /* and mmap it. */ + void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, + MAP_SHARED, bufmgr->fd, mmap_arg.offset); + if (map == MAP_FAILED) { + DBG("%s:%d: Error mapping buffer %d (%s): %s .\n", + __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno)); + return NULL; + } + + /* We don't need to use VALGRIND_MALLOCLIKE_BLOCK because Valgrind will + * already intercept this mmap call. However, for consistency between + * all the mmap paths, we mark the pointer as defined now and mark it + * as inaccessible afterwards. + */ + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->map_gtt, NULL, map)) { + VG_NOACCESS(map, bo->size); + munmap(map, bo->size); + } + } + assert(bo->map_gtt); + + DBG("bo_map_gtt: %d (%s) -> %p, ", bo->gem_handle, bo->name, bo->map_gtt); + print_flags(flags); + + if (!(flags & MAP_ASYNC)) { + bo_wait_with_stall_warning(dbg, bo, "GTT mapping"); + } + + return bo->map_gtt; +} + +static bool +can_map_cpu(struct crocus_bo *bo, unsigned flags) +{ + if (bo->cache_coherent) + return true; + + /* Even if the buffer itself is not cache-coherent (such as a scanout), on + * an LLC platform reads always are coherent (as they are performed via the + * central system agent). It is just the writes that we need to take special + * care to ensure that land in main memory and not stick in the CPU cache. 
+ */ + if (!(flags & MAP_WRITE) && bo->bufmgr->has_llc) + return true; + + /* If PERSISTENT or COHERENT are set, the mmapping needs to remain valid + * across batch flushes where the kernel will change cache domains of the + * bo, invalidating continued access to the CPU mmap on non-LLC device. + * + * Similarly, ASYNC typically means that the buffer will be accessed via + * both the CPU and the GPU simultaneously. Batches may be executed that + * use the BO even while it is mapped. While OpenGL technically disallows + * most drawing while non-persistent mappings are active, we may still use + * the GPU for blits or other operations, causing batches to happen at + * inconvenient times. + * + * If RAW is set, we expect the caller to be able to handle a WC buffer + * more efficiently than the involuntary clflushes. + */ + if (flags & (MAP_PERSISTENT | MAP_COHERENT | MAP_ASYNC | MAP_RAW)) + return false; + + return !(flags & MAP_WRITE); +} + +void * +crocus_bo_map(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags) +{ + if (bo->tiling_mode != I915_TILING_NONE && !(flags & MAP_RAW)) + return crocus_bo_map_gtt(dbg, bo, flags); + + void *map; + + if (can_map_cpu(bo, flags)) + map = crocus_bo_map_cpu(dbg, bo, flags); + else + map = crocus_bo_map_wc(dbg, bo, flags); + + /* Allow the attempt to fail by falling back to the GTT where necessary. + * + * Not every buffer can be mmaped directly using the CPU (or WC), for + * example buffers that wrap stolen memory or are imported from other + * devices. For those, we have little choice but to use a GTT mmapping. + * However, if we use a slow GTT mmapping for reads where we expected fast + * access, that order of magnitude difference in throughput will be clearly + * expressed by angry users. + * + * We skip MAP_RAW because we want to avoid map_gtt's fence detiling. + */ + if (!map && !(flags & MAP_RAW)) { + perf_debug(dbg, "Fallback GTT mapping for %s with access flags %x\n", + bo->name, flags); + map = crocus_bo_map_gtt(dbg, bo, flags); + } + + return map; +} + +/** Waits for all GPU rendering with the object to have completed. */ +void +crocus_bo_wait_rendering(struct crocus_bo *bo) +{ + /* We require a kernel recent enough for WAIT_IOCTL support. + * See intel_init_bufmgr() + */ + crocus_bo_wait(bo, -1); +} + +/** + * Waits on a BO for the given amount of time. + * + * @bo: buffer object to wait for + * @timeout_ns: amount of time to wait in nanoseconds. + * If value is less than 0, an infinite wait will occur. + * + * Returns 0 if the wait was successful ie. the last batch referencing the + * object has completed within the allotted time. Otherwise some negative return + * value describes the error. Of particular interest is -ETIME when the wait has + * failed to yield the desired result. + * + * Similar to crocus_bo_wait_rendering except a timeout parameter allows + * the operation to give up after a certain amount of time. Another subtle + * difference is the internal locking semantics are different (this variant does + * not hold the lock for the duration of the wait). This makes the wait subject + * to a larger userspace race window. + * + * The implementation shall wait until the object is no longer actively + * referenced within a batch buffer at the time of the call. The wait will + * not guarantee that the buffer is re-issued via another thread, or an flinked + * handle. Userspace must make sure this race does not occur if such precision + * is important. 
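+ *
+ * For illustration only (the 1 ms timeout is an arbitrary example value):
+ *
+ *    if (crocus_bo_wait(bo, 1000000) == -ETIME) {
+ *       ... the object is still busy after 1 ms ...
+ *    }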
+ * + * Note that some kernels have broken the inifite wait for negative values + * promise, upgrade to latest stable kernels if this is the case. + */ +int +crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + /* If we know it's idle, don't bother with the kernel round trip */ + if (bo->idle && !bo->external) + return 0; + + struct drm_i915_gem_wait wait = { + .bo_handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait); + if (ret != 0) + return -errno; + + bo->idle = true; + + return ret; +} + +static void +crocus_bufmgr_destroy(struct crocus_bufmgr *bufmgr) +{ + mtx_destroy(&bufmgr->lock); + + /* Free any cached buffer objects we were going to reuse */ + for (int i = 0; i < bufmgr->num_buckets; i++) { + struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i]; + + list_for_each_entry_safe(struct crocus_bo, bo, &bucket->head, head) { + list_del(&bo->head); + + bo_free(bo); + } + } + + /* Close any buffer objects on the dead list. */ + list_for_each_entry_safe(struct crocus_bo, bo, &bufmgr->zombie_list, head) { + list_del(&bo->head); + bo_close(bo); + } + + _mesa_hash_table_destroy(bufmgr->name_table, NULL); + _mesa_hash_table_destroy(bufmgr->handle_table, NULL); + + close(bufmgr->fd); + + free(bufmgr); +} + +static int +bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode, + uint32_t stride) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + struct drm_i915_gem_set_tiling set_tiling; + int ret; + + if (bo->global_name == 0 && + tiling_mode == bo->tiling_mode && stride == bo->stride) + return 0; + + memset(&set_tiling, 0, sizeof(set_tiling)); + do { + /* set_tiling is slightly broken and overwrites the + * input on the error path, so we have to open code + * drm_ioctl. + */ + set_tiling.handle = bo->gem_handle; + set_tiling.tiling_mode = tiling_mode; + set_tiling.stride = stride; + + ret = ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling); + } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); + if (ret == -1) + return -errno; + + bo->tiling_mode = set_tiling.tiling_mode; + bo->swizzle_mode = set_tiling.swizzle_mode; + bo->stride = set_tiling.stride; + return 0; +} + +int +crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode, + uint32_t *swizzle_mode) +{ + *tiling_mode = bo->tiling_mode; + *swizzle_mode = bo->swizzle_mode; + return 0; +} + +struct crocus_bo * +crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr, int prime_fd, + uint32_t tiling, uint32_t stride) +{ + uint32_t handle; + struct crocus_bo *bo; + + mtx_lock(&bufmgr->lock); + int ret = drmPrimeFDToHandle(bufmgr->fd, prime_fd, &handle); + if (ret) { + DBG("import_dmabuf: failed to obtain handle from fd: %s\n", + strerror(errno)); + mtx_unlock(&bufmgr->lock); + return NULL; + } + + /* + * See if the kernel has already returned this buffer to us. Just as + * for named buffers, we must not create two bo's pointing at the same + * kernel object + */ + bo = find_and_ref_external_bo(bufmgr->handle_table, handle); + if (bo) + goto out; + + bo = bo_calloc(); + if (!bo) + goto out; + + p_atomic_set(&bo->refcount, 1); + + /* Determine size of bo. The fd-to-handle ioctl really should + * return the size, but it doesn't. If we have kernel 3.12 or + * later, we can lseek on the prime fd to get the size. Older + * kernels will just fail, in which case we fall back to the + * provided (estimated or guess size). 
*/ + ret = lseek(prime_fd, 0, SEEK_END); + if (ret != -1) + bo->size = ret; + + bo->bufmgr = bufmgr; + bo->name = "prime"; + bo->reusable = false; + bo->external = true; + bo->kflags = 0; + bo->gem_handle = handle; + _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); + + struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle }; + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) + goto err; + + if (get_tiling.tiling_mode == tiling || tiling > I915_TILING_LAST) { + bo->tiling_mode = get_tiling.tiling_mode; + bo->swizzle_mode = get_tiling.swizzle_mode; + /* XXX stride is unknown */ + } else { + if (bo_set_tiling_internal(bo, tiling, stride)) { + goto err; + } + } + +out: + mtx_unlock(&bufmgr->lock); + return bo; + +err: + bo_free(bo); + mtx_unlock(&bufmgr->lock); + return NULL; +} + +static void +crocus_bo_make_external_locked(struct crocus_bo *bo) +{ + if (!bo->external) { + _mesa_hash_table_insert(bo->bufmgr->handle_table, &bo->gem_handle, bo); + bo->external = true; + bo->reusable = false; + } +} + +static void +crocus_bo_make_external(struct crocus_bo *bo) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (bo->external) { + assert(!bo->reusable); + return; + } + + mtx_lock(&bufmgr->lock); + crocus_bo_make_external_locked(bo); + mtx_unlock(&bufmgr->lock); +} + +int +crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + crocus_bo_make_external(bo); + + if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle, + DRM_CLOEXEC, prime_fd) != 0) + return -errno; + + return 0; +} + +uint32_t +crocus_bo_export_gem_handle(struct crocus_bo *bo) +{ + crocus_bo_make_external(bo); + + return bo->gem_handle; +} + +int +crocus_bo_flink(struct crocus_bo *bo, uint32_t *name) +{ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + + if (!bo->global_name) { + struct drm_gem_flink flink = { .handle = bo->gem_handle }; + + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_FLINK, &flink)) + return -errno; + + mtx_lock(&bufmgr->lock); + if (!bo->global_name) { + crocus_bo_make_external_locked(bo); + bo->global_name = flink.name; + _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo); + } + mtx_unlock(&bufmgr->lock); + } + + *name = bo->global_name; + return 0; +} + +int +crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd, + uint32_t *out_handle) +{ + /* Only add the new GEM handle to the list of export if it belongs to a + * different GEM device. Otherwise we might close the same buffer multiple + * times. + */ + struct crocus_bufmgr *bufmgr = bo->bufmgr; + int ret = os_same_file_description(drm_fd, bufmgr->fd); + WARN_ONCE(ret < 0, + "Kernel has no file descriptor comparison support: %s\n", + strerror(errno)); + if (ret == 0) { + *out_handle = crocus_bo_export_gem_handle(bo); + return 0; + } + + struct bo_export *export = calloc(1, sizeof(*export)); + if (!export) + return -ENOMEM; + + export->drm_fd = drm_fd; + + int dmabuf_fd = -1; + int err = crocus_bo_export_dmabuf(bo, &dmabuf_fd); + if (err) { + free(export); + return err; + } + + mtx_lock(&bufmgr->lock); + err = drmPrimeFDToHandle(drm_fd, dmabuf_fd, &export->gem_handle); + close(dmabuf_fd); + if (err) { + mtx_unlock(&bufmgr->lock); + free(export); + return err; + } + + bool found = false; + list_for_each_entry(struct bo_export, iter, &bo->exports, link) { + if (iter->drm_fd != drm_fd) + continue; + /* Here we assume that for a given DRM fd, we'll always get back the + * same GEM handle for a given buffer. 
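+ * (drmPrimeFDToHandle() on a DRM fd that has already imported this
+ * dma-buf hands back the existing GEM handle instead of creating a new
+ * one, which is what makes the assert below safe.)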
+ */ + assert(iter->gem_handle == export->gem_handle); + free(export); + export = iter; + found = true; + break; + } + if (!found) + list_addtail(&export->link, &bo->exports); + + mtx_unlock(&bufmgr->lock); + + *out_handle = export->gem_handle; + + return 0; +} + +static void +add_bucket(struct crocus_bufmgr *bufmgr, int size) +{ + unsigned int i = bufmgr->num_buckets; + + assert(i < ARRAY_SIZE(bufmgr->cache_bucket)); + + list_inithead(&bufmgr->cache_bucket[i].head); + bufmgr->cache_bucket[i].size = size; + bufmgr->num_buckets++; + + assert(bucket_for_size(bufmgr, size) == &bufmgr->cache_bucket[i]); + assert(bucket_for_size(bufmgr, size - 2048) == &bufmgr->cache_bucket[i]); + assert(bucket_for_size(bufmgr, size + 1) != &bufmgr->cache_bucket[i]); +} + +static void +init_cache_buckets(struct crocus_bufmgr *bufmgr) +{ + uint64_t size, cache_max_size = 64 * 1024 * 1024; + + /* OK, so power of two buckets was too wasteful of memory. + * Give 3 other sizes between each power of two, to hopefully + * cover things accurately enough. (The alternative is + * probably to just go for exact matching of sizes, and assume + * that for things like composited window resize the tiled + * width/height alignment and rounding of sizes to pages will + * get us useful cache hit rates anyway) + */ + add_bucket(bufmgr, PAGE_SIZE); + add_bucket(bufmgr, PAGE_SIZE * 2); + add_bucket(bufmgr, PAGE_SIZE * 3); + + /* Initialize the linked lists for BO reuse cache. */ + for (size = 4 * PAGE_SIZE; size <= cache_max_size; size *= 2) { + add_bucket(bufmgr, size); + + add_bucket(bufmgr, size + size * 1 / 4); + add_bucket(bufmgr, size + size * 2 / 4); + add_bucket(bufmgr, size + size * 3 / 4); + } +} + +uint32_t +crocus_create_hw_context(struct crocus_bufmgr *bufmgr) +{ + struct drm_i915_gem_context_create create = { }; + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create); + if (ret != 0) { + DBG("DRM_IOCTL_I915_GEM_CONTEXT_CREATE failed: %s\n", strerror(errno)); + return 0; + } + + /* Upon declaring a GPU hang, the kernel will zap the guilty context + * back to the default logical HW state and attempt to continue on to + * our next submitted batchbuffer. However, our render batches assume + * the previous GPU state is preserved, and only emit commands needed + * to incrementally change that state. In particular, we inherit the + * STATE_BASE_ADDRESS and PIPELINE_SELECT settings, which are critical. + * With default base addresses, our next batches will almost certainly + * cause more GPU hangs, leading to repeated hangs until we're banned + * or the machine is dead. + * + * Here we tell the kernel not to attempt to recover our context but + * immediately (on the next batchbuffer submission) report that the + * context is lost, and we will do the recovery ourselves. Ideally, + * we'll have two lost batches instead of a continual stream of hangs. + */ + struct drm_i915_gem_context_param p = { + .ctx_id = create.ctx_id, + .param = I915_CONTEXT_PARAM_RECOVERABLE, + .value = false, + }; + drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p); + + return create.ctx_id; +} + +static int +crocus_hw_context_get_priority(struct crocus_bufmgr *bufmgr, uint32_t ctx_id) +{ + struct drm_i915_gem_context_param p = { + .ctx_id = ctx_id, + .param = I915_CONTEXT_PARAM_PRIORITY, + }; + drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p); + return p.value; /* on error, return 0 i.e. 
default priority */ +} + +int +crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr, + uint32_t ctx_id, + int priority) +{ + struct drm_i915_gem_context_param p = { + .ctx_id = ctx_id, + .param = I915_CONTEXT_PARAM_PRIORITY, + .value = priority, + }; + int err; + + err = 0; + if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p)) + err = -errno; + + return err; +} + +uint32_t +crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id) +{ + uint32_t new_ctx = crocus_create_hw_context(bufmgr); + + if (new_ctx) { + int priority = crocus_hw_context_get_priority(bufmgr, ctx_id); + crocus_hw_context_set_priority(bufmgr, new_ctx, priority); + } + + return new_ctx; +} + +void +crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id) +{ + struct drm_i915_gem_context_destroy d = { .ctx_id = ctx_id }; + + if (ctx_id != 0 && + intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &d) != 0) { + fprintf(stderr, "DRM_IOCTL_I915_GEM_CONTEXT_DESTROY failed: %s\n", + strerror(errno)); + } +} + +int +crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset, uint64_t *result) +{ + struct drm_i915_reg_read reg_read = { .offset = offset }; + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_REG_READ, ®_read); + + *result = reg_read.val; + return ret; +} + +static int +gem_param(int fd, int name) +{ + int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */ + + struct drm_i915_getparam gp = { .param = name, .value = &v }; + if (intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp)) + return -1; + + return v; +} + +/** + * Initializes the GEM buffer manager, which uses the kernel to allocate, map, + * and manage map buffer objections. + * + * \param fd File descriptor of the opened DRM device. + */ +static struct crocus_bufmgr * +crocus_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse) +{ + struct crocus_bufmgr *bufmgr = calloc(1, sizeof(*bufmgr)); + if (bufmgr == NULL) + return NULL; + + /* Handles to buffer objects belong to the device fd and are not + * reference counted by the kernel. If the same fd is used by + * multiple parties (threads sharing the same screen bufmgr, or + * even worse the same device fd passed to multiple libraries) + * ownership of those handles is shared by those independent parties. + * + * Don't do this! Ensure that each library/bufmgr has its own device + * fd so that its namespace does not clash with another. 
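+ *
+ * (The dup below only gives the bufmgr its own reference to the device
+ * file, so it can outlive the fd the caller passed in; it does not give
+ * it a private GEM handle namespace.)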
+ */ + bufmgr->fd = os_dupfd_cloexec(fd); + + p_atomic_set(&bufmgr->refcount, 1); + + if (mtx_init(&bufmgr->lock, mtx_plain) != 0) { + free(bufmgr); + return NULL; + } + + list_inithead(&bufmgr->zombie_list); + + bufmgr->has_llc = devinfo->has_llc; + bufmgr->has_tiling_uapi = devinfo->has_tiling_uapi; + bufmgr->bo_reuse = bo_reuse; + bufmgr->has_mmap_offset = gem_param(fd, I915_PARAM_MMAP_GTT_VERSION) >= 4; + + init_cache_buckets(bufmgr); + + bufmgr->name_table = + _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal); + bufmgr->handle_table = + _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal); + + return bufmgr; +} + +static struct crocus_bufmgr * +crocus_bufmgr_ref(struct crocus_bufmgr *bufmgr) +{ + p_atomic_inc(&bufmgr->refcount); + return bufmgr; +} + +void +crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr) +{ + mtx_lock(&global_bufmgr_list_mutex); + if (p_atomic_dec_zero(&bufmgr->refcount)) { + list_del(&bufmgr->link); + crocus_bufmgr_destroy(bufmgr); + } + mtx_unlock(&global_bufmgr_list_mutex); +} + +/** + * Gets an already existing GEM buffer manager or create a new one. + * + * \param fd File descriptor of the opened DRM device. + */ +struct crocus_bufmgr * +crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd, bool bo_reuse) +{ + struct stat st; + + if (fstat(fd, &st)) + return NULL; + + struct crocus_bufmgr *bufmgr = NULL; + + mtx_lock(&global_bufmgr_list_mutex); + list_for_each_entry(struct crocus_bufmgr, iter_bufmgr, &global_bufmgr_list, link) { + struct stat iter_st; + if (fstat(iter_bufmgr->fd, &iter_st)) + continue; + + if (st.st_rdev == iter_st.st_rdev) { + assert(iter_bufmgr->bo_reuse == bo_reuse); + bufmgr = crocus_bufmgr_ref(iter_bufmgr); + goto unlock; + } + } + + bufmgr = crocus_bufmgr_create(devinfo, fd, bo_reuse); + if (bufmgr) + list_addtail(&bufmgr->link, &global_bufmgr_list); + + unlock: + mtx_unlock(&global_bufmgr_list_mutex); + + return bufmgr; +} + +int +crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr) +{ + return bufmgr->fd; +} diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.h b/src/gallium/drivers/crocus/crocus_bufmgr.h new file mode 100644 index 00000000000..8bb328fdeae --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_bufmgr.h @@ -0,0 +1,331 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef CROCUS_BUFMGR_H +#define CROCUS_BUFMGR_H + +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <sys/types.h> +#include "util/macros.h" +#include "util/u_atomic.h" +#include "util/list.h" +#include "pipe/p_defines.h" + +struct crocus_batch; +struct intel_device_info; +struct pipe_debug_callback; + +#define CROCUS_BINDER_SIZE (64 * 1024) +#define CROCUS_MAX_BINDERS 100 + +struct crocus_bo { + /** + * Size in bytes of the buffer object. + * + * The size may be larger than the size originally requested for the + * allocation, such as being aligned to page size. + */ + uint64_t size; + + /** Buffer manager context associated with this buffer object */ + struct crocus_bufmgr *bufmgr; + + /** The GEM handle for this buffer object. */ + uint32_t gem_handle; + + /** + * Virtual address of the buffer inside the PPGTT (Per-Process Graphics + * Translation Table). + * + * Although each hardware context has its own VMA, we assign BO's to the + * same address in all contexts, for simplicity. + */ + uint64_t gtt_offset; + + /** + * The validation list index for this buffer, or -1 when not in a batch. + * Note that a single buffer may be in multiple batches (contexts), and + * this is a global field, which refers to the last batch using the BO. + * It should not be considered authoritative, but can be used to avoid a + * linear walk of the validation list in the common case by guessing that + * exec_bos[bo->index] == bo and confirming whether that's the case. + * + * XXX: this is not ideal now that we have more than one batch per context, + * XXX: as the index will flop back and forth between the render index and + * XXX: compute index... + */ + unsigned index; + + /** + * Boolean of whether the GPU is definitely not accessing the buffer. + * + * This is only valid when reusable, since non-reusable + * buffers are those that have been shared with other + * processes, so we don't know their state. + */ + bool idle; + + int refcount; + const char *name; + + uint64_t kflags; + + /** + * Kenel-assigned global name for this object + * + * List contains both flink named and prime fd'd objects + */ + unsigned global_name; + + /** + * Current tiling mode + */ + uint32_t tiling_mode; + uint32_t swizzle_mode; + uint32_t stride; + + time_t free_time; + + /** Mapped address for the buffer, saved across map/unmap cycles */ + void *map_cpu; + /** GTT virtual address for the buffer, saved across map/unmap cycles */ + void *map_gtt; + /** WC CPU address for the buffer, saved across map/unmap cycles */ + void *map_wc; + + /** BO cache list */ + struct list_head head; + + /** List of GEM handle exports of this buffer (bo_export) */ + struct list_head exports; + + /** + * Boolean of whether this buffer can be re-used + */ + bool reusable; + + /** + * Boolean of whether this buffer has been shared with an external client. + */ + bool external; + + /** + * Boolean of whether this buffer is cache coherent + */ + bool cache_coherent; + + /** + * Boolean of whether this buffer points into user memory + */ + bool userptr; + + /** Pre-computed hash using _mesa_hash_pointer for cache tracking sets */ + uint32_t hash; +}; + +#define BO_ALLOC_ZEROED (1 << 0) +#define BO_ALLOC_COHERENT (1 << 1) + +/** + * Allocate a buffer object. + * + * Buffer objects are not necessarily initially mapped into CPU virtual + * address space or graphics device aperture. They must be mapped + * using crocus_bo_map() to be used by the CPU. 
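+ *
+ * A rough usage sketch (the name, size and dbg pointer are illustrative):
+ *
+ *    struct crocus_bo *bo = crocus_bo_alloc(bufmgr, "scratch", 4096);
+ *    void *map = crocus_bo_map(dbg, bo, MAP_WRITE);
+ *    if (map)
+ *       memset(map, 0, 4096);
+ *    crocus_bo_unreference(bo);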
+ */ +struct crocus_bo *crocus_bo_alloc(struct crocus_bufmgr *bufmgr, + const char *name, uint64_t size); + +/** + * Allocate a tiled buffer object. + * + * Alignment for tiled objects is set automatically; the 'flags' + * argument provides a hint about how the object will be used initially. + * + * Valid tiling formats are: + * I915_TILING_NONE + * I915_TILING_X + * I915_TILING_Y + */ +struct crocus_bo *crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr, + const char *name, uint64_t size, + uint32_t alignment, + uint32_t tiling_mode, uint32_t pitch, + unsigned flags); + +struct crocus_bo *crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr, + const char *name, void *ptr, + size_t size); + +/** Takes a reference on a buffer object */ +static inline void +crocus_bo_reference(struct crocus_bo *bo) +{ + p_atomic_inc(&bo->refcount); +} + +/** + * Releases a reference on a buffer object, freeing the data if + * no references remain. + */ +void crocus_bo_unreference(struct crocus_bo *bo); + +#define MAP_READ PIPE_MAP_READ +#define MAP_WRITE PIPE_MAP_WRITE +#define MAP_ASYNC PIPE_MAP_UNSYNCHRONIZED +#define MAP_PERSISTENT PIPE_MAP_PERSISTENT +#define MAP_COHERENT PIPE_MAP_COHERENT +/* internal */ +#define MAP_INTERNAL_MASK (0xff << 24) +#define MAP_RAW (0x01 << 24) + +#define MAP_FLAGS (MAP_READ | MAP_WRITE | MAP_ASYNC | \ + MAP_PERSISTENT | MAP_COHERENT | MAP_INTERNAL_MASK) + +/** + * Maps the buffer into userspace. + * + * This function will block waiting for any existing execution on the + * buffer to complete, first. The resulting mapping is returned. + */ +MUST_CHECK void *crocus_bo_map(struct pipe_debug_callback *dbg, + struct crocus_bo *bo, unsigned flags); + +/** + * Reduces the refcount on the userspace mapping of the buffer + * object. + */ +static inline int crocus_bo_unmap(struct crocus_bo *bo) { return 0; } + +/** + * Waits for rendering to an object by the GPU to have completed. + * + * This is not required for any access to the BO by bo_map, + * bo_subdata, etc. It is merely a way for the driver to implement + * glFinish. + */ +void crocus_bo_wait_rendering(struct crocus_bo *bo); + +/** + * Unref a buffer manager instance. + */ +void crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr); + +/** + * Get the current tiling (and resulting swizzling) mode for the bo. + * + * \param buf Buffer to get tiling mode for + * \param tiling_mode returned tiling mode + * \param swizzle_mode returned swizzling mode + */ +int crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode, + uint32_t *swizzle_mode); + +/** + * Create a visible name for a buffer which can be used by other apps + * + * \param buf Buffer to create a name for + * \param name Returned name + */ +int crocus_bo_flink(struct crocus_bo *bo, uint32_t *name); + +/** + * Is this buffer shared with external clients (exported)? + */ +static inline bool +crocus_bo_is_external(const struct crocus_bo *bo) +{ + return bo->external; +} + +/** + * Returns 1 if mapping the buffer for write could cause the process + * to block, due to the object being active in the GPU. + */ +int crocus_bo_busy(struct crocus_bo *bo); + +/** + * Specify the volatility of the buffer. + * \param bo Buffer to create a name for + * \param madv The purgeable status + * + * Use I915_MADV_DONTNEED to mark the buffer as purgeable, and it will be + * reclaimed under memory pressure. If you subsequently require the buffer, + * then you must pass I915_MADV_WILLNEED to mark the buffer as required. 
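+ *
+ * A sketch of the usual purgeable-buffer pattern (not a quote of this
+ * driver's cache code):
+ *
+ *    crocus_bo_madvise(bo, I915_MADV_DONTNEED);      // idle in a cache
+ *    ...
+ *    if (!crocus_bo_madvise(bo, I915_MADV_WILLNEED))
+ *       ... backing pages were discarded; reallocate instead of reusing ...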
+ * + * Returns 1 if the buffer was retained, or 0 if it was discarded whilst + * marked as I915_MADV_DONTNEED. + */ +int crocus_bo_madvise(struct crocus_bo *bo, int madv); + +/* drm_bacon_bufmgr_gem.c */ +struct crocus_bufmgr * +crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd, + bool bo_reuse); +int crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr); + +struct crocus_bo *crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr, + const char *name, + unsigned handle); + +int crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns); + +uint32_t crocus_create_hw_context(struct crocus_bufmgr *bufmgr); +uint32_t crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id); + +#define CROCUS_CONTEXT_LOW_PRIORITY ((I915_CONTEXT_MIN_USER_PRIORITY - 1) / 2) +#define CROCUS_CONTEXT_MEDIUM_PRIORITY (I915_CONTEXT_DEFAULT_PRIORITY) +#define CROCUS_CONTEXT_HIGH_PRIORITY ((I915_CONTEXT_MAX_USER_PRIORITY + 1) / 2) + +int crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr, + uint32_t ctx_id, int priority); + +void crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id); + +int crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd); +struct crocus_bo *crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr, + int prime_fd, uint32_t tiling, + uint32_t stride); + +/** + * Exports a bo as a GEM handle into a given DRM file descriptor + * \param bo Buffer to export + * \param drm_fd File descriptor where the new handle is created + * \param out_handle Pointer to store the new handle + * + * Returns 0 if the buffer was successfully exported, a non zero error code + * otherwise. + */ +int crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd, + uint32_t *out_handle); + +uint32_t crocus_bo_export_gem_handle(struct crocus_bo *bo); + +int crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset, + uint64_t *out); + +int drm_ioctl(int fd, unsigned long request, void *arg); + +#endif /* CROCUS_BUFMGR_H */ diff --git a/src/gallium/drivers/crocus/crocus_clear.c b/src/gallium/drivers/crocus/crocus_clear.c new file mode 100644 index 00000000000..1c56e23f794 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_clear.c @@ -0,0 +1,859 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include <stdio.h> +#include <errno.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_inlines.h" +#include "util/u_surface.h" +#include "util/format/u_format.h" +#include "util/u_upload_mgr.h" +#include "util/ralloc.h" +#include "crocus_context.h" +#include "crocus_resource.h" +#include "crocus_screen.h" +#include "intel/compiler/brw_compiler.h" +#include "util/format_srgb.h" + +static bool +crocus_is_color_fast_clear_compatible(struct crocus_context *ice, + enum isl_format format, + const union isl_color_value color) +{ + if (isl_format_has_int_channel(format)) { + perf_debug(&ice->dbg, "Integer fast clear not enabled for %s", + isl_format_get_name(format)); + return false; + } + + for (int i = 0; i < 4; i++) { + if (!isl_format_has_color_component(format, i)) { + continue; + } + + if (color.f32[i] != 0.0f && color.f32[i] != 1.0f) { + return false; + } + } + + return true; +} + +static bool +can_fast_clear_color(struct crocus_context *ice, + struct pipe_resource *p_res, + unsigned level, + const struct pipe_box *box, + bool render_condition_enabled, + enum isl_format format, + enum isl_format render_format, + union isl_color_value color) +{ + struct crocus_resource *res = (void *) p_res; + + if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + return false; + + if (!isl_aux_usage_has_fast_clears(res->aux.usage)) + return false; + + /* Check for partial clear */ + if (box->x > 0 || box->y > 0 || + box->width < minify(p_res->width0, level) || + box->height < minify(p_res->height0, level)) { + return false; + } + + /* Avoid conditional fast clears to maintain correct tracking of the aux + * state (see iris_resource_finish_write for more info). Note that partial + * fast clears (if they existed) would not pose a problem with conditional + * rendering. + */ + if (render_condition_enabled && + ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) { + return false; + } + + /* We store clear colors as floats or uints as needed. If there are + * texture views in play, the formats will not properly be respected + * during resolves because the resolve operations only know about the + * resource and not the renderbuffer. 
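+ *
+ * As a concrete example of the check below: clearing an
+ * R8G8B8A8_UNORM_SRGB view of an R8G8B8A8_UNORM resource is acceptable,
+ * since both collapse to the same linear format, whereas a view that
+ * resolves to a different linear format is rejected.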
+ */ + if (isl_format_srgb_to_linear(render_format) != + isl_format_srgb_to_linear(format)) { + return false; + } + + /* XXX: if (irb->mt->supports_fast_clear) + * see intel_miptree_create_for_dri_image() + */ + + if (!crocus_is_color_fast_clear_compatible(ice, format, color)) + return false; + + return true; +} + +static union isl_color_value +convert_fast_clear_color(struct crocus_context *ice, + struct crocus_resource *res, + enum isl_format render_format, + const union isl_color_value color) +{ + union isl_color_value override_color = color; + struct pipe_resource *p_res = (void *) res; + + const enum pipe_format format = p_res->format; + const struct util_format_description *desc = + util_format_description(format); + unsigned colormask = util_format_colormask(desc); + + if (util_format_is_intensity(format) || + util_format_is_luminance(format) || + util_format_is_luminance_alpha(format)) { + override_color.u32[1] = override_color.u32[0]; + override_color.u32[2] = override_color.u32[0]; + if (util_format_is_intensity(format)) + override_color.u32[3] = override_color.u32[0]; + } else { + for (int chan = 0; chan < 3; chan++) { + if (!(colormask & (1 << chan))) + override_color.u32[chan] = 0; + } + } + + if (util_format_is_unorm(format)) { + for (int i = 0; i < 4; i++) + override_color.f32[i] = CLAMP(override_color.f32[i], 0.0f, 1.0f); + } else if (util_format_is_snorm(format)) { + for (int i = 0; i < 4; i++) + override_color.f32[i] = CLAMP(override_color.f32[i], -1.0f, 1.0f); + } else if (util_format_is_pure_uint(format)) { + for (int i = 0; i < 4; i++) { + unsigned bits = util_format_get_component_bits( + format, UTIL_FORMAT_COLORSPACE_RGB, i); + if (bits < 32) { + uint32_t max = (1u << bits) - 1; + override_color.u32[i] = MIN2(override_color.u32[i], max); + } + } + } else if (util_format_is_pure_sint(format)) { + for (int i = 0; i < 4; i++) { + unsigned bits = util_format_get_component_bits( + format, UTIL_FORMAT_COLORSPACE_RGB, i); + if (bits < 32) { + int32_t max = (1 << (bits - 1)) - 1; + int32_t min = -(1 << (bits - 1)); + override_color.i32[i] = CLAMP(override_color.i32[i], min, max); + } + } + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT || + format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + /* these packed float formats only store unsigned values */ + for (int i = 0; i < 4; i++) + override_color.f32[i] = MAX2(override_color.f32[i], 0.0f); + } + + if (!(colormask & 1 << 3)) { + if (util_format_is_pure_integer(format)) + override_color.u32[3] = 1; + else + override_color.f32[3] = 1.0f; + } + + /* Handle linear to SRGB conversion */ + if (isl_format_is_srgb(render_format)) { + for (int i = 0; i < 3; i++) { + override_color.f32[i] = + util_format_linear_to_srgb_float(override_color.f32[i]); + } + } + + return override_color; +} + +static void +fast_clear_color(struct crocus_context *ice, + struct crocus_resource *res, + unsigned level, + const struct pipe_box *box, + enum isl_format format, + union isl_color_value color, + enum blorp_batch_flags blorp_flags) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + struct pipe_resource *p_res = (void *) res; + + color = convert_fast_clear_color(ice, res, format, color); + + bool color_changed = !!memcmp(&res->aux.clear_color, &color, + sizeof(color)); + + if (color_changed) { + /* If we are clearing to a new clear value, we need to resolve fast + * clears from other levels/layers first, since we can't have different + * levels/layers with different fast clear colors. 
+ */ + for (unsigned res_lvl = 0; res_lvl < res->surf.levels; res_lvl++) { + const unsigned level_layers = + crocus_get_num_logical_layers(res, res_lvl); + for (unsigned layer = 0; layer < level_layers; layer++) { + if (res_lvl == level && + layer >= box->z && + layer < box->z + box->depth) { + /* We're going to clear this layer anyway. Leave it alone. */ + continue; + } + + enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, res_lvl, layer); + + if (aux_state != ISL_AUX_STATE_CLEAR && + aux_state != ISL_AUX_STATE_PARTIAL_CLEAR && + aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) { + /* This slice doesn't have any fast-cleared bits. */ + continue; + } + + /* If we got here, then the level may have fast-clear bits that use + * the old clear value. We need to do a color resolve to get rid + * of their use of the clear color before we can change it. + * Fortunately, few applications ever change their clear color at + * different levels/layers, so this shouldn't happen often. + */ + crocus_resource_prepare_access(ice, res, + res_lvl, 1, layer, 1, + res->aux.usage, + false); + perf_debug(&ice->dbg, + "Resolving resource (%p) level %d, layer %d: color changing from " + "(%0.2f, %0.2f, %0.2f, %0.2f) to " + "(%0.2f, %0.2f, %0.2f, %0.2f)\n", + res, res_lvl, layer, + res->aux.clear_color.f32[0], + res->aux.clear_color.f32[1], + res->aux.clear_color.f32[2], + res->aux.clear_color.f32[3], + color.f32[0], color.f32[1], color.f32[2], color.f32[3]); + } + } + } + + crocus_resource_set_clear_color(ice, res, color); + + /* If the buffer is already in ISL_AUX_STATE_CLEAR, and the color hasn't + * changed, the clear is redundant and can be skipped. + */ + const enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, level, box->z); + if (!color_changed && box->depth == 1 && aux_state == ISL_AUX_STATE_CLEAR) + return; + + /* Ivybrigde PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)": + * + * "Any transition from any value in {Clear, Render, Resolve} to a + * different value in {Clear, Render, Resolve} requires end of pipe + * synchronization." + * + * In other words, fast clear ops are not properly synchronized with + * other drawing. We need to use a PIPE_CONTROL to ensure that the + * contents of the previous draw hit the render target before we resolve + * and again afterwards to ensure that the resolve is complete before we + * do any more regular drawing. + */ + crocus_emit_end_of_pipe_sync(batch, + "fast clear: pre-flush", + PIPE_CONTROL_RENDER_TARGET_FLUSH); + + /* If we reach this point, we need to fast clear to change the state to + * ISL_AUX_STATE_CLEAR, or to update the fast clear color (or both). + */ + blorp_flags |= color_changed ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR; + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + p_res, res->aux.usage, level, true); + + /* In newer gens (> 9), the hardware will do a linear -> sRGB conversion of + * the clear color during the fast clear, if the surface format is of sRGB + * type. We use the linear version of the surface format here to prevent + * that from happening, since we already do our own linear -> sRGB + * conversion in convert_fast_clear_color(). 
+ */ + blorp_fast_clear(&blorp_batch, &surf, isl_format_srgb_to_linear(format), + ISL_SWIZZLE_IDENTITY, + level, box->z, box->depth, + box->x, box->y, box->x + box->width, + box->y + box->height); + blorp_batch_finish(&blorp_batch); + crocus_emit_end_of_pipe_sync(batch, + "fast clear: post flush", + PIPE_CONTROL_RENDER_TARGET_FLUSH); + + crocus_resource_set_aux_state(ice, res, level, box->z, + box->depth, ISL_AUX_STATE_CLEAR); + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS; + return; +} + +static void +clear_color(struct crocus_context *ice, + struct pipe_resource *p_res, + unsigned level, + const struct pipe_box *box, + bool render_condition_enabled, + enum isl_format format, + struct isl_swizzle swizzle, + union isl_color_value color) +{ + struct crocus_resource *res = (void *) p_res; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + const struct intel_device_info *devinfo = &batch->screen->devinfo; + enum blorp_batch_flags blorp_flags = 0; + + if (render_condition_enabled) { + if (!crocus_check_conditional_render(ice)) + return; + + if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) + blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE; + } + + if (p_res->target == PIPE_BUFFER) + util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width); + + crocus_batch_maybe_flush(batch, 1500); + + bool can_fast_clear = can_fast_clear_color(ice, p_res, level, box, + render_condition_enabled, + res->surf.format, format, color); + if (can_fast_clear) { + fast_clear_color(ice, res, level, box, format, color, + blorp_flags); + return; + } + + bool color_write_disable[4] = { false, false, false, false }; + enum isl_aux_usage aux_usage = + crocus_resource_render_aux_usage(ice, res, format, + false, false); + + crocus_resource_prepare_render(ice, res, level, + box->z, box->depth, aux_usage); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + p_res, aux_usage, level, true); + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); + + if (!isl_format_supports_rendering(devinfo, format) && + isl_format_is_rgbx(format)) + format = isl_format_rgbx_to_rgba(format); + + blorp_clear(&blorp_batch, &surf, format, swizzle, + level, box->z, box->depth, box->x, box->y, + box->x + box->width, box->y + box->height, + color, color_write_disable); + + blorp_batch_finish(&blorp_batch); + crocus_flush_and_dirty_for_history(ice, batch, res, + PIPE_CONTROL_RENDER_TARGET_FLUSH, + "cache history: post color clear"); + + crocus_resource_finish_render(ice, res, level, + box->z, box->depth, aux_usage); +} + +static bool +can_fast_clear_depth(struct crocus_context *ice, + struct crocus_resource *res, + unsigned level, + const struct pipe_box *box, + bool render_condition_enabled, + float depth) +{ + struct pipe_resource *p_res = (void *) res; + struct pipe_context *ctx = (void *) ice; + struct crocus_screen *screen = (void *) ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (devinfo->ver < 6) + return false; + + if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + return false; + + /* Check for partial clears */ + if (box->x > 0 || box->y > 0 || + box->width < u_minify(p_res->width0, level) || + box->height < u_minify(p_res->height0, level)) { + return false; + } + + /* Avoid conditional fast clears to maintain correct tracking of the aux + * state (see iris_resource_finish_write for more info). 
Note that partial + * fast clears would not pose a problem with conditional rendering. + */ + if (render_condition_enabled && + ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) { + return false; + } + + if (!crocus_resource_level_has_hiz(res, level)) + return false; + + if (res->base.format == PIPE_FORMAT_Z16_UNORM) { + /* From the Sandy Bridge PRM, volume 2 part 1, page 314: + * + * "[DevSNB+]: Several cases exist where Depth Buffer Clear cannot be + * enabled (the legacy method of clearing must be performed): + * + * - DevSNB{W/A}]: When depth buffer format is D16_UNORM and the + * width of the map (LOD0) is not multiple of 16, fast clear + * optimization must be disabled. + */ + if (devinfo->ver == 6 && + (minify(res->surf.phys_level0_sa.width, + level) % 16) != 0) + return false; + } + return true; +} + +static void +fast_clear_depth(struct crocus_context *ice, + struct crocus_resource *res, + unsigned level, + const struct pipe_box *box, + float depth) +{ + struct pipe_resource *p_res = (void *) res; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + + /* Quantize the clear value to what can be stored in the actual depth + * buffer. This makes the following check more accurate because it now + * checks if the actual depth bits will match. It also prevents us from + * getting a too-accurate depth value during depth testing or when sampling + * with HiZ enabled. + */ + const unsigned nbits = p_res->format == PIPE_FORMAT_Z16_UNORM ? 16 : 24; + const uint32_t depth_max = (1 << nbits) - 1; + depth = p_res->format == PIPE_FORMAT_Z32_FLOAT ? depth : + (unsigned)(depth * depth_max) / (float)depth_max; + + bool update_clear_depth = false; + + /* If we're clearing to a new clear value, then we need to resolve any clear + * flags out of the HiZ buffer into the real depth buffer. + */ + if (res->aux.clear_color.f32[0] != depth) { + for (unsigned res_level = 0; res_level < res->surf.levels; res_level++) { + if (!crocus_resource_level_has_hiz(res, res_level)) + continue; + + const unsigned level_layers = + crocus_get_num_logical_layers(res, res_level); + for (unsigned layer = 0; layer < level_layers; layer++) { + if (res_level == level && + layer >= box->z && + layer < box->z + box->depth) { + /* We're going to clear this layer anyway. Leave it alone. */ + continue; + } + + enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, res_level, layer); + + if (aux_state != ISL_AUX_STATE_CLEAR && + aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) { + /* This slice doesn't have any fast-cleared bits. */ + continue; + } + + /* If we got here, then the level may have fast-clear bits that + * use the old clear value. We need to do a depth resolve to get + * rid of their use of the clear value before we can change it. + * Fortunately, few applications ever change their depth clear + * value so this shouldn't happen often. + */ + crocus_hiz_exec(ice, batch, res, res_level, layer, 1, + ISL_AUX_OP_FULL_RESOLVE, false); + crocus_resource_set_aux_state(ice, res, res_level, layer, 1, + ISL_AUX_STATE_RESOLVED); + } + } + const union isl_color_value clear_value = { .f32 = {depth, } }; + crocus_resource_set_clear_color(ice, res, clear_value); + update_clear_depth = true; + } + + for (unsigned l = 0; l < box->depth; l++) { + enum isl_aux_state aux_state = + crocus_resource_level_has_hiz(res, level) ? 
+ crocus_resource_get_aux_state(res, level, box->z + l) : + ISL_AUX_STATE_AUX_INVALID; + if (update_clear_depth || aux_state != ISL_AUX_STATE_CLEAR) { + if (aux_state == ISL_AUX_STATE_CLEAR) { + perf_debug(&ice->dbg, "Performing HiZ clear just to update the " + "depth clear value\n"); + } + crocus_hiz_exec(ice, batch, res, level, + box->z + l, 1, ISL_AUX_OP_FAST_CLEAR, + update_clear_depth); + } + } + + crocus_resource_set_aux_state(ice, res, level, box->z, box->depth, + ISL_AUX_STATE_CLEAR); + ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER; +} + +static void +clear_depth_stencil(struct crocus_context *ice, + struct pipe_resource *p_res, + unsigned level, + const struct pipe_box *box, + bool render_condition_enabled, + bool clear_depth, + bool clear_stencil, + float depth, + uint8_t stencil) +{ + struct crocus_resource *res = (void *) p_res; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + enum blorp_batch_flags blorp_flags = 0; + + if (render_condition_enabled) { + if (!crocus_check_conditional_render(ice)) + return; + + if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) + blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE; + } + + crocus_batch_maybe_flush(batch, 1500); + + struct crocus_resource *z_res; + struct crocus_resource *stencil_res; + struct blorp_surf z_surf; + struct blorp_surf stencil_surf; + + crocus_get_depth_stencil_resources(&batch->screen->devinfo, p_res, &z_res, &stencil_res); + if (z_res && clear_depth && + can_fast_clear_depth(ice, z_res, level, box, render_condition_enabled, + depth)) { + fast_clear_depth(ice, z_res, level, box, depth); + crocus_flush_and_dirty_for_history(ice, batch, res, 0, + "cache history: post fast Z clear"); + clear_depth = false; + z_res = NULL; + } + + /* At this point, we might have fast cleared the depth buffer. So if there's + * no stencil clear pending, return early. + */ + if (!(clear_depth || (clear_stencil && stencil_res))) { + return; + } + + if (clear_depth && z_res) { + const enum isl_aux_usage aux_usage = + crocus_resource_render_aux_usage(ice, z_res, level, z_res->surf.format, + false); + crocus_resource_prepare_render(ice, z_res, level, box->z, box->depth, + aux_usage); + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, + &z_surf, &z_res->base, aux_usage, + level, true); + } + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags); + + uint8_t stencil_mask = clear_stencil && stencil_res ? 0xff : 0; + if (stencil_mask) { + crocus_resource_prepare_access(ice, stencil_res, level, 1, box->z, + box->depth, stencil_res->aux.usage, false); + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, + &stencil_surf, &stencil_res->base, + stencil_res->aux.usage, level, true); + } + + blorp_clear_depth_stencil(&blorp_batch, &z_surf, &stencil_surf, + level, box->z, box->depth, + box->x, box->y, + box->x + box->width, + box->y + box->height, + clear_depth && z_res, depth, + stencil_mask, stencil); + + blorp_batch_finish(&blorp_batch); + crocus_flush_and_dirty_for_history(ice, batch, res, 0, + "cache history: post slow ZS clear"); + + if (clear_depth && z_res) { + crocus_resource_finish_render(ice, z_res, level, + box->z, box->depth, z_surf.aux_usage); + } + + if (stencil_mask) { + crocus_resource_finish_write(ice, stencil_res, level, box->z, box->depth, + stencil_res->aux.usage); + } +} + +/** + * The pipe->clear() driver hook. 
+ * + * This clears buffers attached to the current draw framebuffer. + */ +static void +crocus_clear(struct pipe_context *ctx, + unsigned buffers, + const struct pipe_scissor_state *scissor_state, + const union pipe_color_union *p_color, + double depth, + unsigned stencil) +{ + struct crocus_context *ice = (void *) ctx; + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + struct crocus_screen *screen = (void *) ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + assert(buffers != 0); + + struct pipe_box box = { + .width = cso_fb->width, + .height = cso_fb->height, + }; + + if (scissor_state) { + box.x = scissor_state->minx; + box.y = scissor_state->miny; + box.width = MIN2(box.width, scissor_state->maxx - scissor_state->minx); + box.height = MIN2(box.height, scissor_state->maxy - scissor_state->miny); + } + + if (buffers & PIPE_CLEAR_DEPTHSTENCIL) { + if (devinfo->ver < 6) { + crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE, true); + util_blitter_clear(ice->blitter, cso_fb->width, cso_fb->height, + util_framebuffer_get_num_layers(cso_fb), + buffers & PIPE_CLEAR_DEPTHSTENCIL, p_color, depth, stencil, false); + } else { + struct pipe_surface *psurf = cso_fb->zsbuf; + box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1; + box.z = psurf->u.tex.first_layer; + + clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box, true, + buffers & PIPE_CLEAR_DEPTH, + buffers & PIPE_CLEAR_STENCIL, + depth, stencil); + } + buffers &= ~PIPE_CLEAR_DEPTHSTENCIL; + } + + if (buffers & PIPE_CLEAR_COLOR) { + /* pipe_color_union and isl_color_value are interchangeable */ + union isl_color_value *color = (void *) p_color; + + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + if (buffers & (PIPE_CLEAR_COLOR0 << i)) { + struct pipe_surface *psurf = cso_fb->cbufs[i]; + struct crocus_surface *isurf = (void *) psurf; + box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1, + box.z = psurf->u.tex.first_layer, + + clear_color(ice, psurf->texture, psurf->u.tex.level, &box, + true, isurf->view.format, isurf->view.swizzle, + *color); + } + } + } +} + +/** + * The pipe->clear_texture() driver hook. + * + * This clears the given texture resource. 
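+ *
+ * On hardware older than Gen6 this simply defers to util_clear_texture().
+ * Otherwise, depth/stencil values are unpacked and routed through
+ * clear_depth_stencil(), while color values are unpacked into an
+ * isl_color_value for clear_color(), substituting a bit-compatible UINT
+ * format when the surface format itself is not renderable.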
+ */ +static void +crocus_clear_texture(struct pipe_context *ctx, + struct pipe_resource *p_res, + unsigned level, + const struct pipe_box *box, + const void *data) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_resource *res = (void *) p_res; + + if (devinfo->ver < 6) { + util_clear_texture(ctx, p_res, + level, box, data); + return; + } + + if (crocus_resource_unfinished_aux_import(res)) + crocus_resource_finish_aux_import(ctx->screen, res); + + if (util_format_is_depth_or_stencil(p_res->format)) { + const struct util_format_unpack_description *fmt_unpack = + util_format_unpack_description(p_res->format); + + float depth = 0.0; + uint8_t stencil = 0; + + if (fmt_unpack->unpack_z_float) + fmt_unpack->unpack_z_float(&depth, 0, data, 0, 1, 1); + + if (fmt_unpack->unpack_s_8uint) + fmt_unpack->unpack_s_8uint(&stencil, 0, data, 0, 1, 1); + + clear_depth_stencil(ice, p_res, level, box, true, true, true, + depth, stencil); + } else { + union isl_color_value color; + struct crocus_resource *res = (void *) p_res; + enum isl_format format = res->surf.format; + + if (!isl_format_supports_rendering(devinfo, format)) { + const struct isl_format_layout *fmtl = isl_format_get_layout(format); + // XXX: actually just get_copy_format_for_bpb from BLORP + // XXX: don't cut and paste this + switch (fmtl->bpb) { + case 8: format = ISL_FORMAT_R8_UINT; break; + case 16: format = ISL_FORMAT_R8G8_UINT; break; + case 24: format = ISL_FORMAT_R8G8B8_UINT; break; + case 32: format = ISL_FORMAT_R8G8B8A8_UINT; break; + case 48: format = ISL_FORMAT_R16G16B16_UINT; break; + case 64: format = ISL_FORMAT_R16G16B16A16_UINT; break; + case 96: format = ISL_FORMAT_R32G32B32_UINT; break; + case 128: format = ISL_FORMAT_R32G32B32A32_UINT; break; + default: + unreachable("Unknown format bpb"); + } + + /* No aux surfaces for non-renderable surfaces */ + assert(res->aux.usage == ISL_AUX_USAGE_NONE); + } + + isl_color_value_unpack(&color, format, data); + + clear_color(ice, p_res, level, box, true, format, + ISL_SWIZZLE_IDENTITY, color); + } +} + +/** + * The pipe->clear_render_target() driver hook. + * + * This clears the given render target surface. + */ +static void +crocus_clear_render_target(struct pipe_context *ctx, + struct pipe_surface *psurf, + const union pipe_color_union *p_color, + unsigned dst_x, unsigned dst_y, + unsigned width, unsigned height, + bool render_condition_enabled) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_surface *isurf = (void *) psurf; + struct pipe_box box = { + .x = dst_x, + .y = dst_y, + .z = psurf->u.tex.first_layer, + .width = width, + .height = height, + .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1 + }; + + /* pipe_color_union and isl_color_value are interchangeable */ + union isl_color_value *color = (void *) p_color; + + clear_color(ice, psurf->texture, psurf->u.tex.level, &box, + render_condition_enabled, + isurf->view.format, isurf->view.swizzle, *color); +} + +/** + * The pipe->clear_depth_stencil() driver hook. + * + * This clears the given depth/stencil surface. 
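+ *
+ * Note: this hook is currently a stub; it returns immediately, and the
+ * u_blitter-based implementation below is compiled out under #if 0.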
+ */ +static void +crocus_clear_depth_stencil(struct pipe_context *ctx, + struct pipe_surface *psurf, + unsigned flags, + double depth, + unsigned stencil, + unsigned dst_x, unsigned dst_y, + unsigned width, unsigned height, + bool render_condition_enabled) +{ + return; +#if 0 + struct crocus_context *ice = (void *) ctx; + struct pipe_box box = { + .x = dst_x, + .y = dst_y, + .z = psurf->u.tex.first_layer, + .width = width, + .height = height, + .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1 + }; + uint32_t blit_flags = 0; + + assert(util_format_is_depth_or_stencil(psurf->texture->format)); + + crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE); + util_blitter_clear(ice->blitter, width, height, + 1, flags, NULL, depth, stencil, render_condition_enabled); +#if 0 + clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box, + render_condition_enabled, + flags & PIPE_CLEAR_DEPTH, flags & PIPE_CLEAR_STENCIL, + depth, stencil); +#endif +#endif +} + +void +crocus_init_clear_functions(struct pipe_context *ctx) +{ + ctx->clear = crocus_clear; + ctx->clear_texture = crocus_clear_texture; + ctx->clear_render_target = crocus_clear_render_target; + ctx->clear_depth_stencil = crocus_clear_depth_stencil; +} diff --git a/src/gallium/drivers/crocus/crocus_context.c b/src/gallium/drivers/crocus/crocus_context.c new file mode 100644 index 00000000000..cd8a54d6d34 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_context.c @@ -0,0 +1,336 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include <stdio.h> +#include <time.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "util/ralloc.h" +#include "util/u_inlines.h" +#include "util/format/u_format.h" +#include "util/u_upload_mgr.h" +#include "drm-uapi/i915_drm.h" +#include "crocus_context.h" +#include "crocus_resource.h" +#include "crocus_screen.h" +#include "common/intel_defines.h" +#include "common/intel_sample_positions.h" + +/** + * The pipe->set_debug_callback() driver hook. 
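+ *
+ * Stores the given callback in ice->dbg so perf_debug() can report messages
+ * back to the state tracker; passing NULL clears it again.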
+ */ +static void +crocus_set_debug_callback(struct pipe_context *ctx, + const struct pipe_debug_callback *cb) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + if (cb) + ice->dbg = *cb; + else + memset(&ice->dbg, 0, sizeof(ice->dbg)); +} + +static bool +crocus_init_identifier_bo(struct crocus_context *ice) +{ + void *bo_map; + + bo_map = crocus_bo_map(NULL, ice->workaround_bo, MAP_READ | MAP_WRITE); + if (!bo_map) + return false; + + ice->workaround_bo->kflags |= EXEC_OBJECT_CAPTURE; + ice->workaround_offset = ALIGN( + intel_debug_write_identifiers(bo_map, 4096, "Crocus") + 8, 8); + + crocus_bo_unmap(ice->workaround_bo); + + return true; +} + +/** + * Called from the batch module when it detects a GPU hang. + * + * In this case, we've lost our GEM context, and can't rely on any existing + * state on the GPU. We must mark everything dirty and wipe away any saved + * assumptions about the last known state of the GPU. + */ +void +crocus_lost_context_state(struct crocus_batch *batch) +{ + /* The batch module doesn't have an crocus_context, because we want to + * avoid introducing lots of layering violations. Unfortunately, here + * we do need to inform the context of batch catastrophe. We know the + * batch is one of our context's, so hackily claw our way back. + */ + struct crocus_context *ice = batch->ice; + struct crocus_screen *screen = batch->screen; + if (batch->name == CROCUS_BATCH_RENDER) { + screen->vtbl.init_render_context(batch); + } else if (batch->name == CROCUS_BATCH_COMPUTE) { + screen->vtbl.init_compute_context(batch); + } else { + unreachable("unhandled batch reset"); + } + + ice->state.dirty = ~0ull; + memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid)); + batch->state_base_address_emitted = false; + screen->vtbl.lost_genx_state(ice, batch); +} + +static enum pipe_reset_status +crocus_get_device_reset_status(struct pipe_context *ctx) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + enum pipe_reset_status worst_reset = PIPE_NO_RESET; + + /* Check the reset status of each batch's hardware context, and take the + * worst status (if one was guilty, proclaim guilt). + */ + for (int i = 0; i < ice->batch_count; i++) { + /* This will also recreate the hardware contexts as necessary, so any + * future queries will show no resets. We only want to report once. 
+ */ + enum pipe_reset_status batch_reset = + crocus_batch_check_for_reset(&ice->batches[i]); + + if (batch_reset == PIPE_NO_RESET) + continue; + + if (worst_reset == PIPE_NO_RESET) { + worst_reset = batch_reset; + } else { + /* GUILTY < INNOCENT < UNKNOWN */ + worst_reset = MIN2(worst_reset, batch_reset); + } + } + + if (worst_reset != PIPE_NO_RESET && ice->reset.reset) + ice->reset.reset(ice->reset.data, worst_reset); + + return worst_reset; +} + +static void +crocus_set_device_reset_callback(struct pipe_context *ctx, + const struct pipe_device_reset_callback *cb) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + if (cb) + ice->reset = *cb; + else + memset(&ice->reset, 0, sizeof(ice->reset)); +} + +static void +crocus_get_sample_position(struct pipe_context *ctx, + unsigned sample_count, + unsigned sample_index, + float *out_value) +{ + union { + struct { + float x[16]; + float y[16]; + } a; + struct { + float _0XOffset, _1XOffset, _2XOffset, _3XOffset, + _4XOffset, _5XOffset, _6XOffset, _7XOffset, + _8XOffset, _9XOffset, _10XOffset, _11XOffset, + _12XOffset, _13XOffset, _14XOffset, _15XOffset; + float _0YOffset, _1YOffset, _2YOffset, _3YOffset, + _4YOffset, _5YOffset, _6YOffset, _7YOffset, + _8YOffset, _9YOffset, _10YOffset, _11YOffset, + _12YOffset, _13YOffset, _14YOffset, _15YOffset; + } v; + } u; + switch (sample_count) { + case 1: INTEL_SAMPLE_POS_1X(u.v._); break; + case 2: INTEL_SAMPLE_POS_2X(u.v._); break; + case 4: INTEL_SAMPLE_POS_4X(u.v._); break; + case 8: INTEL_SAMPLE_POS_8X(u.v._); break; + case 16: INTEL_SAMPLE_POS_16X(u.v._); break; + default: unreachable("invalid sample count"); + } + + out_value[0] = u.a.x[sample_index]; + out_value[1] = u.a.y[sample_index]; +} + +/** + * Destroy a context, freeing any associated memory. + */ +static void +crocus_destroy_context(struct pipe_context *ctx) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + if (ctx->stream_uploader) + u_upload_destroy(ctx->stream_uploader); + + if (ice->blitter) + util_blitter_destroy(ice->blitter); + screen->vtbl.destroy_state(ice); + crocus_destroy_program_cache(ice); + u_upload_destroy(ice->query_buffer_uploader); + + crocus_bo_unreference(ice->workaround_bo); + + slab_destroy_child(&ice->transfer_pool); + + crocus_batch_free(&ice->batches[CROCUS_BATCH_RENDER]); + if (ice->batches[CROCUS_BATCH_COMPUTE].ice) + crocus_batch_free(&ice->batches[CROCUS_BATCH_COMPUTE]); + + ralloc_free(ice); +} + +#define genX_call(devinfo, func, ...) \ + switch ((devinfo)->verx10) { \ + case 75: \ + gfx75_##func(__VA_ARGS__); \ + break; \ + case 70: \ + gfx7_##func(__VA_ARGS__); \ + break; \ + case 60: \ + gfx6_##func(__VA_ARGS__); \ + break; \ + case 50: \ + gfx5_##func(__VA_ARGS__); \ + break; \ + case 45: \ + gfx45_##func(__VA_ARGS__); \ + break; \ + case 40: \ + gfx4_##func(__VA_ARGS__); \ + break; \ + default: \ + unreachable("Unknown hardware generation"); \ + } + +/** + * Create a context. + * + * This is where each context begins. 
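+ *
+ * Sets up the pipe_context hooks, the stream/constant uploaders, the
+ * program cache, the workaround BO, and the per-generation (genX) state,
+ * BLORP, and query vtables, then creates the render batch and, on Gen7+,
+ * a separate compute batch.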
+ */ +struct pipe_context * +crocus_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags) +{ + struct crocus_screen *screen = (struct crocus_screen*)pscreen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_context *ice = rzalloc(NULL, struct crocus_context); + + if (!ice) + return NULL; + + struct pipe_context *ctx = &ice->ctx; + + ctx->screen = pscreen; + ctx->priv = priv; + + ctx->stream_uploader = u_upload_create_default(ctx); + if (!ctx->stream_uploader) { + free(ctx); + return NULL; + } + ctx->const_uploader = ctx->stream_uploader; + + ctx->destroy = crocus_destroy_context; + ctx->set_debug_callback = crocus_set_debug_callback; + ctx->set_device_reset_callback = crocus_set_device_reset_callback; + ctx->get_device_reset_status = crocus_get_device_reset_status; + ctx->get_sample_position = crocus_get_sample_position; + + ice->shaders.urb_size = devinfo->urb.size; + + crocus_init_context_fence_functions(ctx); + crocus_init_blit_functions(ctx); + crocus_init_clear_functions(ctx); + crocus_init_program_functions(ctx); + crocus_init_resource_functions(ctx); + crocus_init_flush_functions(ctx); + + crocus_init_program_cache(ice); + + slab_create_child(&ice->transfer_pool, &screen->transfer_pool); + + ice->query_buffer_uploader = + u_upload_create(ctx, 4096, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, + 0); + + ice->workaround_bo = + crocus_bo_alloc(screen->bufmgr, "workaround", 4096); + if (!ice->workaround_bo) + return NULL; + + if (!crocus_init_identifier_bo(ice)) + return NULL; + + genX_call(devinfo, init_state, ice); + genX_call(devinfo, init_blorp, ice); + genX_call(devinfo, init_query, ice); + + ice->blitter = util_blitter_create(&ice->ctx); + if (ice->blitter == NULL) + return NULL; + int priority = 0; + if (flags & PIPE_CONTEXT_HIGH_PRIORITY) + priority = INTEL_CONTEXT_HIGH_PRIORITY; + if (flags & PIPE_CONTEXT_LOW_PRIORITY) + priority = INTEL_CONTEXT_LOW_PRIORITY; + + ice->batch_count = devinfo->ver >= 7 ? CROCUS_BATCH_COUNT : 1; + for (int i = 0; i < ice->batch_count; i++) { + crocus_init_batch(ice, (enum crocus_batch_name) i, + priority); + } + + ice->urb.size = devinfo->urb.size; + screen->vtbl.init_render_context(&ice->batches[CROCUS_BATCH_RENDER]); + if (ice->batch_count > 1) + screen->vtbl.init_compute_context(&ice->batches[CROCUS_BATCH_COMPUTE]); + + return ctx; +} + +bool +crocus_sw_check_cond_render(struct crocus_context *ice) +{ + struct crocus_query *q = ice->condition.query; + union pipe_query_result result; + + bool wait = ice->condition.mode == PIPE_RENDER_COND_WAIT || + ice->condition.mode == PIPE_RENDER_COND_BY_REGION_WAIT; + if (!q) + return true; + + bool ret = ice->ctx.get_query_result(&ice->ctx, (void *)q, wait, &result); + if (!ret) + return true; + + return ice->condition.condition ? 
result.u64 == 0 : result.u64 != 0; +} diff --git a/src/gallium/drivers/crocus/crocus_context.h b/src/gallium/drivers/crocus/crocus_context.h new file mode 100644 index 00000000000..8d6e43d80f6 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_context.h @@ -0,0 +1,955 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef CROCUS_CONTEXT_H +#define CROCUS_CONTEXT_H + +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/u_debug.h" +#include "intel/blorp/blorp.h" +#include "intel/dev/intel_debug.h" +#include "intel/compiler/brw_compiler.h" +#include "crocus_batch.h" +#include "crocus_fence.h" +#include "crocus_resource.h" +#include "crocus_screen.h" +#include "util/u_blitter.h" + +struct crocus_bo; +struct crocus_context; +struct blorp_batch; +struct blorp_params; + +#define CROCUS_MAX_TEXTURE_BUFFER_SIZE (1 << 27) +#define CROCUS_MAX_TEXTURE_SAMPLERS 32 +/* CROCUS_MAX_ABOS and CROCUS_MAX_SSBOS must be the same. */ +#define CROCUS_MAX_ABOS 16 +#define CROCUS_MAX_SSBOS 16 +#define CROCUS_MAX_VIEWPORTS 16 +#define CROCUS_MAX_CLIP_PLANES 8 + +enum crocus_param_domain { + BRW_PARAM_DOMAIN_BUILTIN = 0, + BRW_PARAM_DOMAIN_IMAGE, +}; + +enum { + DRI_CONF_BO_REUSE_DISABLED, + DRI_CONF_BO_REUSE_ALL +}; + +#define BRW_PARAM(domain, val) (BRW_PARAM_DOMAIN_##domain << 24 | (val)) +#define BRW_PARAM_DOMAIN(param) ((uint32_t)(param) >> 24) +#define BRW_PARAM_VALUE(param) ((uint32_t)(param) & 0x00ffffff) +#define BRW_PARAM_IMAGE(idx, offset) BRW_PARAM(IMAGE, ((idx) << 8) | (offset)) +#define BRW_PARAM_IMAGE_IDX(value) (BRW_PARAM_VALUE(value) >> 8) +#define BRW_PARAM_IMAGE_OFFSET(value)(BRW_PARAM_VALUE(value) & 0xf) + +/** + * Dirty flags. When state changes, we flag some combination of these + * to indicate that particular GPU commands need to be re-emitted. + * + * Each bit typically corresponds to a single 3DSTATE_* command packet, but + * in rare cases they map to a group of related packets that need to be + * emitted together. + * + * See crocus_upload_render_state(). 
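+ *
+ * For example, a typical state change handler just ORs in the relevant
+ * bits:
+ *
+ *    ice->state.dirty |= CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
+ *
+ * and the next draw re-emits only the packets whose bits are set.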
+ */
+#define CROCUS_DIRTY_COLOR_CALC_STATE (1ull << 0)
+#define CROCUS_DIRTY_POLYGON_STIPPLE (1ull << 1)
+#define CROCUS_DIRTY_CC_VIEWPORT (1ull << 2)
+#define CROCUS_DIRTY_SF_CL_VIEWPORT (1ull << 3)
+#define CROCUS_DIRTY_RASTER (1ull << 4)
+#define CROCUS_DIRTY_CLIP (1ull << 5)
+#define CROCUS_DIRTY_LINE_STIPPLE (1ull << 6)
+#define CROCUS_DIRTY_VERTEX_ELEMENTS (1ull << 7)
+#define CROCUS_DIRTY_VERTEX_BUFFERS (1ull << 8)
+#define CROCUS_DIRTY_DRAWING_RECTANGLE (1ull << 9)
+#define CROCUS_DIRTY_GEN6_URB (1ull << 10)
+#define CROCUS_DIRTY_DEPTH_BUFFER (1ull << 11)
+#define CROCUS_DIRTY_WM (1ull << 12)
+#define CROCUS_DIRTY_SO_DECL_LIST (1ull << 13)
+#define CROCUS_DIRTY_STREAMOUT (1ull << 14)
+#define CROCUS_DIRTY_GEN4_CONSTANT_COLOR (1ull << 15)
+#define CROCUS_DIRTY_GEN4_CURBE (1ull << 16)
+#define CROCUS_DIRTY_GEN4_URB_FENCE (1ull << 17)
+#define CROCUS_DIRTY_GEN5_PIPELINED_POINTERS (1ull << 18)
+#define CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS (1ull << 19)
+#define CROCUS_DIRTY_GEN6_BLEND_STATE (1ull << 20)
+#define CROCUS_DIRTY_GEN6_SCISSOR_RECT (1ull << 21)
+#define CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL (1ull << 22)
+#define CROCUS_DIRTY_GEN6_MULTISAMPLE (1ull << 23)
+#define CROCUS_DIRTY_GEN6_SAMPLE_MASK (1ull << 24)
+#define CROCUS_DIRTY_GEN7_SBE (1ull << 25)
+#define CROCUS_DIRTY_GEN7_L3_CONFIG (1ull << 26)
+#define CROCUS_DIRTY_GEN7_SO_BUFFERS (1ull << 27)
+#define CROCUS_DIRTY_GEN75_VF (1ull << 28)
+#define CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES (1ull << 29)
+#define CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES (1ull << 30)
+#define CROCUS_DIRTY_VF_STATISTICS (1ull << 31)
+#define CROCUS_DIRTY_GEN4_CLIP_PROG (1ull << 32)
+#define CROCUS_DIRTY_GEN4_SF_PROG (1ull << 33)
+#define CROCUS_DIRTY_GEN4_FF_GS_PROG (1ull << 34)
+#define CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS (1ull << 35)
+#define CROCUS_DIRTY_GEN6_SVBI (1ull << 36)
+
+#define CROCUS_ALL_DIRTY_FOR_COMPUTE (CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES)
+
+#define CROCUS_ALL_DIRTY_FOR_RENDER (~CROCUS_ALL_DIRTY_FOR_COMPUTE)
+
+/**
+ * Per-stage dirty flags. When state changes, we flag some combination of
+ * these to indicate that particular GPU commands need to be re-emitted.
+ * Unlike the CROCUS_DIRTY_* flags these are shader stage-specific and can be
+ * indexed by shifting the mask by the shader stage index.
+ *
+ * See crocus_upload_render_state().
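+ *
+ * As an illustration (assuming the define order matches gl_shader_stage,
+ * as it does for the CONSTANTS and BINDINGS groups), the constants bit for
+ * a given stage can be derived as:
+ *
+ *    uint64_t bit = 1ull << (CROCUS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS + stage);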
+ */ +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS (1ull << 0) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS (1ull << 1) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES (1ull << 2) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS (1ull << 3) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS (1ull << 4) +#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS (1ull << 5) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_VS (1ull << 6) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_TCS (1ull << 7) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_TES (1ull << 8) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_GS (1ull << 9) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_FS (1ull << 10) +#define CROCUS_STAGE_DIRTY_UNCOMPILED_CS (1ull << 11) +#define CROCUS_STAGE_DIRTY_VS (1ull << 12) +#define CROCUS_STAGE_DIRTY_TCS (1ull << 13) +#define CROCUS_STAGE_DIRTY_TES (1ull << 14) +#define CROCUS_STAGE_DIRTY_GS (1ull << 15) +#define CROCUS_STAGE_DIRTY_FS (1ull << 16) +#define CROCUS_STAGE_DIRTY_CS (1ull << 17) +#define CROCUS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS 18 +#define CROCUS_STAGE_DIRTY_CONSTANTS_VS (1ull << 18) +#define CROCUS_STAGE_DIRTY_CONSTANTS_TCS (1ull << 19) +#define CROCUS_STAGE_DIRTY_CONSTANTS_TES (1ull << 20) +#define CROCUS_STAGE_DIRTY_CONSTANTS_GS (1ull << 21) +#define CROCUS_STAGE_DIRTY_CONSTANTS_FS (1ull << 22) +#define CROCUS_STAGE_DIRTY_CONSTANTS_CS (1ull << 23) +#define CROCUS_STAGE_DIRTY_BINDINGS_VS (1ull << 24) +#define CROCUS_STAGE_DIRTY_BINDINGS_TCS (1ull << 25) +#define CROCUS_STAGE_DIRTY_BINDINGS_TES (1ull << 26) +#define CROCUS_STAGE_DIRTY_BINDINGS_GS (1ull << 27) +#define CROCUS_STAGE_DIRTY_BINDINGS_FS (1ull << 28) +#define CROCUS_STAGE_DIRTY_BINDINGS_CS (1ull << 29) + +#define CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE (CROCUS_STAGE_DIRTY_CS | \ + CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS | \ + CROCUS_STAGE_DIRTY_UNCOMPILED_CS | \ + CROCUS_STAGE_DIRTY_CONSTANTS_CS | \ + CROCUS_STAGE_DIRTY_BINDINGS_CS) + +#define CROCUS_ALL_STAGE_DIRTY_FOR_RENDER (~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE) + +#define CROCUS_ALL_STAGE_DIRTY_BINDINGS (CROCUS_STAGE_DIRTY_BINDINGS_VS | \ + CROCUS_STAGE_DIRTY_BINDINGS_TCS | \ + CROCUS_STAGE_DIRTY_BINDINGS_TES | \ + CROCUS_STAGE_DIRTY_BINDINGS_GS | \ + CROCUS_STAGE_DIRTY_BINDINGS_FS | \ + CROCUS_STAGE_DIRTY_BINDINGS_CS) + +#define CROCUS_RENDER_STAGE_DIRTY_CONSTANTS (CROCUS_STAGE_DIRTY_CONSTANTS_VS | \ + CROCUS_STAGE_DIRTY_CONSTANTS_TCS | \ + CROCUS_STAGE_DIRTY_CONSTANTS_TES | \ + CROCUS_STAGE_DIRTY_CONSTANTS_GS | \ + CROCUS_STAGE_DIRTY_CONSTANTS_FS) + +/** + * Non-orthogonal state (NOS) dependency flags. + * + * Shader programs may depend on non-orthogonal state. These flags are + * used to indicate that a shader's key depends on the state provided by + * a certain Gallium CSO. Changing any CSOs marked as a dependency will + * cause the driver to re-compute the shader key, possibly triggering a + * shader recompile. + */ +enum crocus_nos_dep { + CROCUS_NOS_FRAMEBUFFER, + CROCUS_NOS_DEPTH_STENCIL_ALPHA, + CROCUS_NOS_RASTERIZER, + CROCUS_NOS_BLEND, + CROCUS_NOS_LAST_VUE_MAP, + CROCUS_NOS_TEXTURES, + CROCUS_NOS_VERTEX_ELEMENTS, + CROCUS_NOS_COUNT, +}; + +struct crocus_depth_stencil_alpha_state; + +/** + * Cache IDs for the in-memory program cache (ice->shaders.cache). 
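+ *
+ * The first entries alias the gl_shader_stage values, so API shaders can be
+ * looked up directly by stage; the trailing IDs cover internal programs
+ * (BLORP kernels and the fixed-function SF/clip/GS programs).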
+ */ +enum crocus_program_cache_id { + CROCUS_CACHE_VS = MESA_SHADER_VERTEX, + CROCUS_CACHE_TCS = MESA_SHADER_TESS_CTRL, + CROCUS_CACHE_TES = MESA_SHADER_TESS_EVAL, + CROCUS_CACHE_GS = MESA_SHADER_GEOMETRY, + CROCUS_CACHE_FS = MESA_SHADER_FRAGMENT, + CROCUS_CACHE_CS = MESA_SHADER_COMPUTE, + CROCUS_CACHE_BLORP, + CROCUS_CACHE_SF, + CROCUS_CACHE_CLIP, + CROCUS_CACHE_FF_GS, +}; + +/** @{ + * + * Defines for PIPE_CONTROL operations, which trigger cache flushes, + * synchronization, pipelined memory writes, and so on. + * + * The bits here are not the actual hardware values. The actual fields + * move between various generations, so we just have flags for each + * potential operation, and use genxml to encode the actual packet. + */ +enum pipe_control_flags +{ + PIPE_CONTROL_FLUSH_LLC = (1 << 1), + PIPE_CONTROL_LRI_POST_SYNC_OP = (1 << 2), + PIPE_CONTROL_STORE_DATA_INDEX = (1 << 3), + PIPE_CONTROL_CS_STALL = (1 << 4), + PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET = (1 << 5), + PIPE_CONTROL_SYNC_GFDT = (1 << 6), + PIPE_CONTROL_TLB_INVALIDATE = (1 << 7), + PIPE_CONTROL_MEDIA_STATE_CLEAR = (1 << 8), + PIPE_CONTROL_WRITE_IMMEDIATE = (1 << 9), + PIPE_CONTROL_WRITE_DEPTH_COUNT = (1 << 10), + PIPE_CONTROL_WRITE_TIMESTAMP = (1 << 11), + PIPE_CONTROL_DEPTH_STALL = (1 << 12), + PIPE_CONTROL_RENDER_TARGET_FLUSH = (1 << 13), + PIPE_CONTROL_INSTRUCTION_INVALIDATE = (1 << 14), + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE = (1 << 15), + PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE = (1 << 16), + PIPE_CONTROL_NOTIFY_ENABLE = (1 << 17), + PIPE_CONTROL_FLUSH_ENABLE = (1 << 18), + PIPE_CONTROL_DATA_CACHE_FLUSH = (1 << 19), + PIPE_CONTROL_VF_CACHE_INVALIDATE = (1 << 20), + PIPE_CONTROL_CONST_CACHE_INVALIDATE = (1 << 21), + PIPE_CONTROL_STATE_CACHE_INVALIDATE = (1 << 22), + PIPE_CONTROL_STALL_AT_SCOREBOARD = (1 << 23), + PIPE_CONTROL_DEPTH_CACHE_FLUSH = (1 << 24), + PIPE_CONTROL_TILE_CACHE_FLUSH = (1 << 25), +}; + +#define PIPE_CONTROL_CACHE_FLUSH_BITS \ + (PIPE_CONTROL_DEPTH_CACHE_FLUSH | \ + PIPE_CONTROL_DATA_CACHE_FLUSH | \ + PIPE_CONTROL_RENDER_TARGET_FLUSH) + +#define PIPE_CONTROL_CACHE_INVALIDATE_BITS \ + (PIPE_CONTROL_STATE_CACHE_INVALIDATE | \ + PIPE_CONTROL_CONST_CACHE_INVALIDATE | \ + PIPE_CONTROL_VF_CACHE_INVALIDATE | \ + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \ + PIPE_CONTROL_INSTRUCTION_INVALIDATE) + +enum crocus_predicate_state { + /* The first two states are used if we can determine whether to draw + * without having to look at the values in the query object buffer. This + * will happen if there is no conditional render in progress, if the query + * object is already completed or if something else has already added + * samples to the preliminary result. + */ + CROCUS_PREDICATE_STATE_RENDER, + CROCUS_PREDICATE_STATE_DONT_RENDER, + + /* In this case whether to draw or not depends on the result of an + * MI_PREDICATE command so the predicate enable bit needs to be checked. + */ + CROCUS_PREDICATE_STATE_USE_BIT, + /* In this case, either MI_PREDICATE doesn't exist or we lack the + * necessary kernel features to use it. Stall for the query result. + */ + CROCUS_PREDICATE_STATE_STALL_FOR_QUERY, +}; + +/** @} */ + +/** + * An uncompiled, API-facing shader. This is the Gallium CSO for shaders. + * It primarily contains the NIR for the shader. + * + * Each API-facing shader can be compiled into multiple shader variants, + * based on non-orthogonal state dependencies, recorded in the shader key. + * + * See crocus_compiled_shader, which represents a compiled shader variant. 
+ */ +struct crocus_uncompiled_shader { + struct nir_shader *nir; + + struct pipe_stream_output_info stream_output; + + /* A SHA1 of the serialized NIR for the disk cache. */ + unsigned char nir_sha1[20]; + + unsigned program_id; + + /** Bitfield of (1 << CROCUS_NOS_*) flags. */ + unsigned nos; + + /** Have any shader variants been compiled yet? */ + bool compiled_once; + + /** Should we use ALT mode for math? Useful for ARB programs. */ + bool use_alt_mode; + + bool needs_edge_flag; + + /** Constant data scraped from the shader by nir_opt_large_constants */ + struct pipe_resource *const_data; + + /** Surface state for const_data */ + struct crocus_state_ref const_data_state; +}; + +enum crocus_surface_group { + CROCUS_SURFACE_GROUP_RENDER_TARGET, + CROCUS_SURFACE_GROUP_RENDER_TARGET_READ, + CROCUS_SURFACE_GROUP_SOL, + CROCUS_SURFACE_GROUP_CS_WORK_GROUPS, + CROCUS_SURFACE_GROUP_TEXTURE, + CROCUS_SURFACE_GROUP_TEXTURE_GATHER, + CROCUS_SURFACE_GROUP_IMAGE, + CROCUS_SURFACE_GROUP_UBO, + CROCUS_SURFACE_GROUP_SSBO, + + CROCUS_SURFACE_GROUP_COUNT, +}; + +enum { + /* Invalid value for a binding table index. */ + CROCUS_SURFACE_NOT_USED = 0xa0a0a0a0, +}; + +struct crocus_binding_table { + uint32_t size_bytes; + + /** Number of surfaces in each group, before compacting. */ + uint32_t sizes[CROCUS_SURFACE_GROUP_COUNT]; + + /** Initial offset of each group. */ + uint32_t offsets[CROCUS_SURFACE_GROUP_COUNT]; + + /** Mask of surfaces used in each group. */ + uint64_t used_mask[CROCUS_SURFACE_GROUP_COUNT]; +}; + +/** + * A compiled shader variant, containing a pointer to the GPU assembly, + * as well as program data and other packets needed by state upload. + * + * There can be several crocus_compiled_shader variants per API-level shader + * (crocus_uncompiled_shader), due to state-based recompiles (brw_*_prog_key). + */ +struct crocus_compiled_shader { + /** Reference to the uploaded assembly. */ + uint32_t offset; + + /* asm size in map */ + uint32_t map_size; + + /** The program data (owned by the program cache hash table) */ + struct brw_stage_prog_data *prog_data; + uint32_t prog_data_size; + + /** A list of system values to be uploaded as uniforms. */ + enum brw_param_builtin *system_values; + unsigned num_system_values; + + /** Number of constbufs expected by the shader. */ + unsigned num_cbufs; + + /** + * Derived 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets + * (the VUE-based information for transform feedback outputs). + */ + uint32_t *streamout; + + struct crocus_binding_table bt; + + uint32_t bind_bo_offset; + uint32_t surf_offset[128];//TODO +}; + +/** + * API context state that is replicated per shader stage. + */ +struct crocus_shader_state { + /** Uniform Buffers */ + struct pipe_constant_buffer constbufs[PIPE_MAX_CONSTANT_BUFFERS]; + + bool sysvals_need_upload; + + /** Shader Storage Buffers */ + struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS]; + + /** Shader Storage Images (image load store) */ + struct crocus_image_view image[PIPE_MAX_SHADER_IMAGES]; + + struct crocus_sampler_state *samplers[CROCUS_MAX_TEXTURE_SAMPLERS]; + struct crocus_sampler_view *textures[CROCUS_MAX_TEXTURE_SAMPLERS]; + + /** Bitfield of which constant buffers are bound (non-null). */ + uint32_t bound_cbufs; + + /** Bitfield of which image views are bound (non-null). */ + uint32_t bound_image_views; + + /** Bitfield of which sampler views are bound (non-null). */ + uint32_t bound_sampler_views; + + /** Bitfield of which shader storage buffers are bound (non-null). 
*/ + uint32_t bound_ssbos; + + /** Bitfield of which shader storage buffers are writable. */ + uint32_t writable_ssbos; + + uint32_t sampler_offset; +}; + +/** + * The API context (derived from pipe_context). + * + * Most driver state is tracked here. + */ +struct crocus_context { + struct pipe_context ctx; + + /** A debug callback for KHR_debug output. */ + struct pipe_debug_callback dbg; + + /** A device reset status callback for notifying that the GPU is hosed. */ + struct pipe_device_reset_callback reset; + + /** Slab allocator for crocus_transfer_map objects. */ + struct slab_child_pool transfer_pool; + + struct blorp_context blorp; + + int batch_count; + struct crocus_batch batches[CROCUS_BATCH_COUNT]; + + struct u_upload_mgr *query_buffer_uploader; + + struct blitter_context *blitter; + + struct { + struct { + /** + * Either the value of BaseVertex for indexed draw calls or the value + * of the argument <first> for non-indexed draw calls. + */ + int firstvertex; + int baseinstance; + } params; + + /** + * Are the above values the ones stored in the draw_params buffer? + * If so, we can compare them against new values to see if anything + * changed. If not, we need to assume they changed. + */ + bool params_valid; + + /** + * Resource and offset that stores draw_parameters from the indirect + * buffer or to the buffer that stures the previous values for non + * indirect draws. + */ + struct crocus_state_ref draw_params; + + struct { + /** + * The value of DrawID. This always comes in from it's own vertex + * buffer since it's not part of the indirect draw parameters. + */ + int drawid; + + /** + * Stores if an indexed or non-indexed draw (~0/0). Useful to + * calculate BaseVertex as an AND of firstvertex and is_indexed_draw. + */ + int is_indexed_draw; + } derived_params; + + /** + * Resource and offset used for GL_ARB_shader_draw_parameters which + * contains parameters that are not present in the indirect buffer as + * drawid and is_indexed_draw. They will go in their own vertex element. + */ + struct crocus_state_ref derived_draw_params; + } draw; + + struct { + struct crocus_uncompiled_shader *uncompiled[MESA_SHADER_STAGES]; + struct crocus_compiled_shader *prog[MESA_SHADER_STAGES]; + struct brw_vue_map *last_vue_map; + + struct crocus_bo *cache_bo; + uint32_t cache_next_offset; + void *cache_bo_map; + struct hash_table *cache; + + unsigned urb_size; + + /* gen 4/5 clip/sf progs */ + struct crocus_compiled_shader *clip_prog; + struct crocus_compiled_shader *sf_prog; + /* gen4/5 prims, gen6 streamout */ + struct crocus_compiled_shader *ff_gs_prog; + uint32_t clip_offset; + uint32_t sf_offset; + uint32_t wm_offset; + uint32_t vs_offset; + uint32_t gs_offset; + uint32_t cc_offset; + + /** Is a GS or TES outputting points or lines? */ + bool output_topology_is_points_or_lines; + + /* Track last VS URB entry size */ + unsigned last_vs_entry_size; + + /** + * Scratch buffers for various sizes and stages. + * + * Indexed by the "Per-Thread Scratch Space" field's 4-bit encoding, + * and shader stage. 
+ */ + struct crocus_bo *scratch_bos[1 << 4][MESA_SHADER_STAGES]; + } shaders; + + struct { + struct crocus_query *query; + bool condition; + enum pipe_render_cond_flag mode; + } condition; + + struct intel_perf_context *perf_ctx; + + struct { + uint64_t dirty; + uint64_t stage_dirty; + uint64_t stage_dirty_for_nos[CROCUS_NOS_COUNT]; + + unsigned num_viewports; + unsigned sample_mask; + struct crocus_blend_state *cso_blend; + struct crocus_rasterizer_state *cso_rast; + struct crocus_depth_stencil_alpha_state *cso_zsa; + struct crocus_vertex_element_state *cso_vertex_elements; + struct pipe_blend_color blend_color; + struct pipe_poly_stipple poly_stipple; + struct pipe_viewport_state viewports[CROCUS_MAX_VIEWPORTS]; + struct pipe_scissor_state scissors[CROCUS_MAX_VIEWPORTS]; + struct pipe_stencil_ref stencil_ref; + struct pipe_framebuffer_state framebuffer; + struct pipe_clip_state clip_planes; + + float default_outer_level[4]; + float default_inner_level[2]; + + /** Bitfield of which vertex buffers are bound (non-null). */ + uint32_t bound_vertex_buffers; + struct pipe_vertex_buffer vertex_buffers[16]; + uint32_t vb_end[16]; + + bool primitive_restart; + unsigned cut_index; + enum pipe_prim_type prim_mode:8; + bool prim_is_points_or_lines; + uint8_t vertices_per_patch; + + bool window_space_position; + + /** The last compute group size */ + uint32_t last_block[3]; + + /** The last compute grid size */ + uint32_t last_grid[3]; + /** Reference to the BO containing the compute grid size */ + struct crocus_state_ref grid_size; + + /** + * Array of aux usages for drawing, altered to account for any + * self-dependencies from resources bound for sampling and rendering. + */ + enum isl_aux_usage draw_aux_usage[BRW_MAX_DRAW_BUFFERS]; + + /** Aux usage of the fb's depth buffer (which may or may not exist). */ + enum isl_aux_usage hiz_usage; + + /** Bitfield of whether color blending is enabled for RT[i] */ + uint8_t blend_enables; + + /** Are depth writes enabled? (Depth buffer may or may not exist.) */ + bool depth_writes_enabled; + + /** Are stencil writes enabled? (Stencil buffer may or may not exist.) */ + bool stencil_writes_enabled; + + /** GenX-specific current state */ + struct crocus_genx_state *genx; + + struct crocus_shader_state shaders[MESA_SHADER_STAGES]; + + /** Do vertex shader uses shader draw parameters ? */ + bool vs_uses_draw_params; + bool vs_uses_derived_draw_params; + bool vs_needs_sgvs_element; + bool vs_uses_vertexid; + bool vs_uses_instanceid; + + /** Do vertex shader uses edge flag ? */ + bool vs_needs_edge_flag; + + struct pipe_stream_output_target *so_target[PIPE_MAX_SO_BUFFERS]; + bool streamout_active; + int so_targets; + + bool statistics_counters_enabled; + + /** Current conditional rendering mode */ + enum crocus_predicate_state predicate; + bool predicate_supported; + + /** + * Query BO with a MI_PREDICATE_RESULT snapshot calculated on the + * render context that needs to be uploaded to the compute context. + */ + struct crocus_bo *compute_predicate; + + /** Is a PIPE_QUERY_PRIMITIVES_GENERATED query active? */ + bool prims_generated_query_active; + + /** 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets */ + uint32_t *streamout; + + /** + * Resources containing streamed state which our render context + * currently points to. Used to re-add these to the validation + * list when we start a new batch and haven't resubmitted commands. 
+ */ + struct { + struct pipe_resource *res; + uint32_t offset; + uint32_t size; + uint32_t index_size; + bool prim_restart; + } index_buffer; + + uint32_t sf_vp_address; + uint32_t clip_vp_address; + uint32_t cc_vp_address; + + uint32_t stats_wm; + float global_depth_offset_clamp; + + uint32_t last_xfb_verts_per_prim; + uint64_t svbi; + } state; + + /* BRW_NEW_URB_ALLOCATIONS: + */ + struct { + uint32_t vsize; /* vertex size plus header in urb registers */ + uint32_t gsize; /* GS output size in urb registers */ + uint32_t hsize; /* Tessellation control output size in urb registers */ + uint32_t dsize; /* Tessellation evaluation output size in urb registers */ + uint32_t csize; /* constant buffer size in urb registers */ + uint32_t sfsize; /* setup data size in urb registers */ + + bool constrained; + + uint32_t nr_vs_entries; + uint32_t nr_hs_entries; + uint32_t nr_ds_entries; + uint32_t nr_gs_entries; + uint32_t nr_clip_entries; + uint32_t nr_sf_entries; + uint32_t nr_cs_entries; + + uint32_t vs_start; + uint32_t hs_start; + uint32_t ds_start; + uint32_t gs_start; + uint32_t clip_start; + uint32_t sf_start; + uint32_t cs_start; + /** + * URB size in the current configuration. The units this is expressed + * in are somewhat inconsistent, see intel_device_info::urb::size. + * + * FINISHME: Represent the URB size consistently in KB on all platforms. + */ + uint32_t size; + + /* True if the most recently sent _3DSTATE_URB message allocated + * URB space for the GS. + */ + bool gs_present; + + /* True if the most recently sent _3DSTATE_URB message allocated + * URB space for the HS and DS. + */ + bool tess_present; + } urb; + + /* GEN4/5 curbe */ + struct { + unsigned wm_start; + unsigned wm_size; + unsigned clip_start; + unsigned clip_size; + unsigned vs_start; + unsigned vs_size; + unsigned total_size; + + struct crocus_resource *curbe_res; + unsigned curbe_offset; + } curbe; + + /** + * A buffer containing a marker + description of the driver. This buffer is + * added to all execbufs syscalls so that we can identify the driver that + * generated a hang by looking at the content of the buffer in the error + * state. It is also used for hardware workarounds that require scratch + * writes or reads from some unimportant memory. To avoid overriding the + * debug data, use the workaround_address field for workarounds. + */ + struct crocus_bo *workaround_bo; + unsigned workaround_offset; +}; + +#define perf_debug(dbg, ...) 
do { \ + if (INTEL_DEBUG & DEBUG_PERF) \ + dbg_printf(__VA_ARGS__); \ + if (unlikely(dbg)) \ + pipe_debug_message(dbg, PERF_INFO, __VA_ARGS__); \ +} while(0) + + +struct pipe_context * +crocus_create_context(struct pipe_screen *screen, void *priv, unsigned flags); + +void crocus_lost_context_state(struct crocus_batch *batch); + +void crocus_init_blit_functions(struct pipe_context *ctx); +void crocus_init_clear_functions(struct pipe_context *ctx); +void crocus_init_program_functions(struct pipe_context *ctx); +void crocus_init_resource_functions(struct pipe_context *ctx); +bool crocus_update_compiled_shaders(struct crocus_context *ice); +void crocus_update_compiled_compute_shader(struct crocus_context *ice); +void crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data, + unsigned threads, uint32_t *dst); + + +/* crocus_blit.c */ +enum crocus_blitter_op +{ + CROCUS_SAVE_TEXTURES = 1, + CROCUS_SAVE_FRAMEBUFFER = 2, + CROCUS_SAVE_FRAGMENT_STATE = 4, + CROCUS_DISABLE_RENDER_COND = 8, +}; +void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond); + +void crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl, + struct isl_device *isl_dev, + struct blorp_surf *surf, + struct pipe_resource *p_res, + enum isl_aux_usage aux_usage, + unsigned level, + bool is_render_target); +void crocus_copy_region(struct blorp_context *blorp, + struct crocus_batch *batch, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box); + +/* crocus_draw.c */ +void crocus_draw_vbo(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws); +void crocus_launch_grid(struct pipe_context *, const struct pipe_grid_info *); + +/* crocus_pipe_control.c */ + +void crocus_emit_pipe_control_flush(struct crocus_batch *batch, + const char *reason, uint32_t flags); +void crocus_emit_pipe_control_write(struct crocus_batch *batch, + const char *reason, uint32_t flags, + struct crocus_bo *bo, uint32_t offset, + uint64_t imm); +void crocus_emit_mi_flush(struct crocus_batch *batch); +void crocus_emit_depth_stall_flushes(struct crocus_batch *batch); +void crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch); +void crocus_emit_end_of_pipe_sync(struct crocus_batch *batch, + const char *reason, uint32_t flags); +void crocus_flush_all_caches(struct crocus_batch *batch); + +#define crocus_handle_always_flush_cache(batch) \ + if (unlikely(batch->screen->driconf.always_flush_cache)) \ + crocus_flush_all_caches(batch); + +void crocus_init_flush_functions(struct pipe_context *ctx); + +/* crocus_program.c */ +const struct shader_info *crocus_get_shader_info(const struct crocus_context *ice, + gl_shader_stage stage); +struct crocus_bo *crocus_get_scratch_space(struct crocus_context *ice, + unsigned per_thread_scratch, + gl_shader_stage stage); +uint32_t crocus_group_index_to_bti(const struct crocus_binding_table *bt, + enum crocus_surface_group group, + uint32_t index); +uint32_t crocus_bti_to_group_index(const struct crocus_binding_table *bt, + enum crocus_surface_group group, + uint32_t bti); + +/* crocus_disk_cache.c */ + +void crocus_disk_cache_store(struct disk_cache *cache, + const struct crocus_uncompiled_shader *ish, + const struct crocus_compiled_shader *shader, + void *map, + const void *prog_key, + 
uint32_t prog_key_size); +struct crocus_compiled_shader * +crocus_disk_cache_retrieve(struct crocus_context *ice, + const struct crocus_uncompiled_shader *ish, + const void *prog_key, + uint32_t prog_key_size); + +/* crocus_program_cache.c */ + +void crocus_init_program_cache(struct crocus_context *ice); +void crocus_destroy_program_cache(struct crocus_context *ice); +void crocus_print_program_cache(struct crocus_context *ice); +struct crocus_compiled_shader *crocus_find_cached_shader(struct crocus_context *ice, + enum crocus_program_cache_id, + uint32_t key_size, + const void *key); +struct crocus_compiled_shader *crocus_upload_shader(struct crocus_context *ice, + enum crocus_program_cache_id, + uint32_t key_size, + const void *key, + const void *assembly, + uint32_t asm_size, + struct brw_stage_prog_data *, + uint32_t prog_data_size, + uint32_t *streamout, + enum brw_param_builtin *sysv, + unsigned num_system_values, + unsigned num_cbufs, + const struct crocus_binding_table *bt); +const void *crocus_find_previous_compile(const struct crocus_context *ice, + enum crocus_program_cache_id cache_id, + unsigned program_string_id); +bool crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch, + const void *key, + uint32_t key_size, + uint32_t *kernel_out, + void *prog_data_out); +bool crocus_blorp_upload_shader(struct blorp_batch *blorp_batch, + uint32_t stage, + const void *key, uint32_t key_size, + const void *kernel, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + uint32_t *kernel_out, + void *prog_data_out); + +/* crocus_resolve.c */ + +void crocus_predraw_resolve_inputs(struct crocus_context *ice, + struct crocus_batch *batch, + bool *draw_aux_buffer_disabled, + gl_shader_stage stage, + bool consider_framebuffer); +void crocus_predraw_resolve_framebuffer(struct crocus_context *ice, + struct crocus_batch *batch, + bool *draw_aux_buffer_disabled); +void crocus_postdraw_update_resolve_tracking(struct crocus_context *ice, + struct crocus_batch *batch); +void crocus_cache_sets_clear(struct crocus_batch *batch); +void crocus_flush_depth_and_render_caches(struct crocus_batch *batch); +void crocus_cache_flush_for_read(struct crocus_batch *batch, struct crocus_bo *bo); +void crocus_cache_flush_for_render(struct crocus_batch *batch, + struct crocus_bo *bo, + enum isl_format format, + enum isl_aux_usage aux_usage); +void crocus_render_cache_add_bo(struct crocus_batch *batch, + struct crocus_bo *bo, + enum isl_format format, + enum isl_aux_usage aux_usage); +void crocus_cache_flush_for_depth(struct crocus_batch *batch, struct crocus_bo *bo); +void crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo); +int crocus_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info); +int crocus_get_driver_query_group_info(struct pipe_screen *pscreen, + unsigned index, + struct pipe_driver_query_group_info *info); + +struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ctx); + +bool crocus_sw_check_cond_render(struct crocus_context *ice); +static inline bool crocus_check_conditional_render(struct crocus_context *ice) +{ + if (ice->state.predicate == CROCUS_PREDICATE_STATE_STALL_FOR_QUERY) + return crocus_sw_check_cond_render(ice); + return ice->state.predicate != CROCUS_PREDICATE_STATE_DONT_RENDER; +} + +#ifdef genX +# include "crocus_genx_protos.h" +#else +# define genX(x) gfx4_##x +# include "crocus_genx_protos.h" +# undef genX +# define genX(x) gfx45_##x +# include 
"crocus_genx_protos.h" +# undef genX +# define genX(x) gfx5_##x +# include "crocus_genx_protos.h" +# undef genX +# define genX(x) gfx6_##x +# include "crocus_genx_protos.h" +# undef genX +# define genX(x) gfx7_##x +# include "crocus_genx_protos.h" +# undef genX +# define genX(x) gfx75_##x +# include "crocus_genx_protos.h" +# undef genX +#endif + +#endif diff --git a/src/gallium/drivers/crocus/crocus_defines.h b/src/gallium/drivers/crocus/crocus_defines.h new file mode 100644 index 00000000000..a634d0746b0 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_defines.h @@ -0,0 +1,58 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef CROCUS_DEFINES_H +#define CROCUS_DEFINES_H + +/** + * @file crocus_defines.h + * + * Random hardware #defines that we're not using GENXML for. + */ + +#define MI_PREDICATE (0xC << 23) +# define MI_PREDICATE_LOADOP_KEEP (0 << 6) +# define MI_PREDICATE_LOADOP_LOAD (2 << 6) +# define MI_PREDICATE_LOADOP_LOADINV (3 << 6) +# define MI_PREDICATE_COMBINEOP_SET (0 << 3) +# define MI_PREDICATE_COMBINEOP_AND (1 << 3) +# define MI_PREDICATE_COMBINEOP_OR (2 << 3) +# define MI_PREDICATE_COMBINEOP_XOR (3 << 3) +# define MI_PREDICATE_COMPAREOP_TRUE (0 << 0) +# define MI_PREDICATE_COMPAREOP_FALSE (1 << 0) +# define MI_PREDICATE_COMPAREOP_SRCS_EQUAL (2 << 0) +# define MI_PREDICATE_COMPAREOP_DELTAS_EQUAL (3 << 0) + +/* Predicate registers */ +#define MI_PREDICATE_SRC0 0x2400 +#define MI_PREDICATE_SRC1 0x2408 +#define MI_PREDICATE_DATA 0x2410 +#define MI_PREDICATE_RESULT 0x2418 +#define MI_PREDICATE_RESULT_1 0x241C +#define MI_PREDICATE_RESULT_2 0x2214 + +#define CS_GPR(n) (0x2600 + (n) * 8) + +/* The number of bits in our TIMESTAMP queries. 
*/ +#define TIMESTAMP_BITS 36 + +#endif diff --git a/src/gallium/drivers/crocus/crocus_disk_cache.c b/src/gallium/drivers/crocus/crocus_disk_cache.c new file mode 100644 index 00000000000..c84d043fbc8 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_disk_cache.c @@ -0,0 +1,263 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_disk_cache.c + * + * Functions for interacting with the on-disk shader cache. + */ + +#include <stdio.h> +#include <stdint.h> +#include <assert.h> +#include <string.h> + +#include "compiler/nir/nir.h" +#include "util/blob.h" +#include "util/build_id.h" +#include "util/disk_cache.h" +#include "util/mesa-sha1.h" + +#include "crocus_context.h" + +static bool debug = false; + +/** + * Compute a disk cache key for the given uncompiled shader and NOS key. + */ +static void +crocus_disk_cache_compute_key(struct disk_cache *cache, + const struct crocus_uncompiled_shader *ish, + const void *orig_prog_key, + uint32_t prog_key_size, + cache_key cache_key) +{ + /* Create a copy of the program key with program_string_id zeroed out. + * It's essentially random data which we don't want to include in our + * hashing and comparisons. We'll set a proper value on a cache hit. + */ + union brw_any_prog_key prog_key; + memcpy(&prog_key, orig_prog_key, prog_key_size); + prog_key.base.program_string_id = 0; + + uint8_t data[sizeof(prog_key) + sizeof(ish->nir_sha1)]; + uint32_t data_size = prog_key_size + sizeof(ish->nir_sha1); + + memcpy(data, ish->nir_sha1, sizeof(ish->nir_sha1)); + memcpy(data + sizeof(ish->nir_sha1), &prog_key, prog_key_size); + + disk_cache_compute_key(cache, data, data_size, cache_key); +} + +/** + * Store the given compiled shader in the disk cache. + * + * This should only be called on newly compiled shaders. No checking is + * done to prevent repeated stores of the same shader. 
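+ *
+ * The cache key mixes the shader's NIR SHA-1 with the program key (with
+ * program_string_id zeroed out); see crocus_disk_cache_compute_key() above.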
+ */ +void +crocus_disk_cache_store(struct disk_cache *cache, + const struct crocus_uncompiled_shader *ish, + const struct crocus_compiled_shader *shader, + void *map, + const void *prog_key, + uint32_t prog_key_size) +{ +#ifdef ENABLE_SHADER_CACHE + if (!cache) + return; + + gl_shader_stage stage = ish->nir->info.stage; + const struct brw_stage_prog_data *prog_data = shader->prog_data; + + cache_key cache_key; + crocus_disk_cache_compute_key(cache, ish, prog_key, prog_key_size, cache_key); + + if (debug) { + char sha1[41]; + _mesa_sha1_format(sha1, cache_key); + fprintf(stderr, "[mesa disk cache] storing %s\n", sha1); + } + + struct blob blob; + blob_init(&blob); + + /* We write the following data to the cache blob: + * + * 1. Prog data (must come first because it has the assembly size) + * 2. Assembly code + * 3. Number of entries in the system value array + * 4. System value array + * 5. Legacy param array (only used for compute workgroup ID) + * 6. Binding table + */ + blob_write_bytes(&blob, shader->prog_data, brw_prog_data_size(stage)); + blob_write_bytes(&blob, map + shader->offset, shader->prog_data->program_size); + blob_write_bytes(&blob, &shader->num_system_values, sizeof(unsigned)); + blob_write_bytes(&blob, shader->system_values, + shader->num_system_values * sizeof(enum brw_param_builtin)); + blob_write_bytes(&blob, prog_data->param, + prog_data->nr_params * sizeof(uint32_t)); + blob_write_bytes(&blob, &shader->bt, sizeof(shader->bt)); + + disk_cache_put(cache, cache_key, blob.data, blob.size, NULL); + blob_finish(&blob); +#endif +} + +/** + * Search for a compiled shader in the disk cache. If found, upload it + * to the in-memory program cache so we can use it. + */ +struct crocus_compiled_shader * +crocus_disk_cache_retrieve(struct crocus_context *ice, + const struct crocus_uncompiled_shader *ish, + const void *prog_key, + uint32_t key_size) +{ +#ifdef ENABLE_SHADER_CACHE + struct crocus_screen *screen = (void *) ice->ctx.screen; + struct disk_cache *cache = screen->disk_cache; + gl_shader_stage stage = ish->nir->info.stage; + + if (!cache) + return NULL; + + cache_key cache_key; + crocus_disk_cache_compute_key(cache, ish, prog_key, key_size, cache_key); + + if (debug) { + char sha1[41]; + _mesa_sha1_format(sha1, cache_key); + fprintf(stderr, "[mesa disk cache] retrieving %s: ", sha1); + } + + size_t size; + void *buffer = disk_cache_get(screen->disk_cache, cache_key, &size); + + if (debug) + fprintf(stderr, "%s\n", buffer ? 
"found" : "missing"); + + if (!buffer) + return NULL; + + const uint32_t prog_data_size = brw_prog_data_size(stage); + + struct brw_stage_prog_data *prog_data = ralloc_size(NULL, prog_data_size); + const void *assembly; + uint32_t num_system_values; + uint32_t *system_values = NULL; + uint32_t *so_decls = NULL; + + struct blob_reader blob; + blob_reader_init(&blob, buffer, size); + blob_copy_bytes(&blob, prog_data, prog_data_size); + assembly = blob_read_bytes(&blob, prog_data->program_size); + num_system_values = blob_read_uint32(&blob); + if (num_system_values) { + system_values = + ralloc_array(NULL, enum brw_param_builtin, num_system_values); + blob_copy_bytes(&blob, system_values, + num_system_values * sizeof(enum brw_param_builtin)); + } + + prog_data->param = NULL; + prog_data->pull_param = NULL; + assert(prog_data->nr_pull_params == 0); + + if (prog_data->nr_params) { + prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params); + blob_copy_bytes(&blob, prog_data->param, + prog_data->nr_params * sizeof(uint32_t)); + } + + struct crocus_binding_table bt; + blob_copy_bytes(&blob, &bt, sizeof(bt)); + + if ((stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_EVAL || + stage == MESA_SHADER_GEOMETRY) && screen->devinfo.ver > 6) { + struct brw_vue_prog_data *vue_prog_data = (void *) prog_data; + so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output, + &vue_prog_data->vue_map); + } + + /* System values and uniforms are stored in constant buffer 0, the + * user-facing UBOs are indexed by one. So if any constant buffer is + * needed, the constant buffer 0 will be needed, so account for it. + */ + unsigned num_cbufs = ish->nir->info.num_ubos; + + if (num_cbufs || ish->nir->num_uniforms) + num_cbufs++; + + if (num_system_values) + num_cbufs++; + + /* Upload our newly read shader to the in-memory program cache and + * return it to the caller. + */ + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, stage, key_size, prog_key, assembly, + prog_data->program_size, + prog_data, prog_data_size, so_decls, system_values, + num_system_values, num_cbufs, &bt); + + free(buffer); + + return shader; +#else + return NULL; +#endif +} + +/** + * Initialize the on-disk shader cache. 
+ */ +void +crocus_disk_cache_init(struct crocus_screen *screen) +{ +#ifdef ENABLE_SHADER_CACHE + if (INTEL_DEBUG & DEBUG_DISK_CACHE_DISABLE_MASK) + return; + + /* array length = print length + nul char + 1 extra to verify it's unused */ + char renderer[13]; + UNUSED int len = + snprintf(renderer, sizeof(renderer), "crocus_%04x", screen->pci_id); + assert(len == sizeof(renderer) - 2); + + const struct build_id_note *note = + build_id_find_nhdr_for_addr(crocus_disk_cache_init); + assert(note && build_id_length(note) == 20); /* sha1 */ + + const uint8_t *id_sha1 = build_id_data(note); + assert(id_sha1); + + char timestamp[41]; + _mesa_sha1_format(timestamp, id_sha1); + + const uint64_t driver_flags = + brw_get_compiler_config_value(screen->compiler); + screen->disk_cache = disk_cache_create(renderer, timestamp, driver_flags); +#endif +} diff --git a/src/gallium/drivers/crocus/crocus_draw.c b/src/gallium/drivers/crocus/crocus_draw.c new file mode 100644 index 00000000000..119c5571ae1 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_draw.c @@ -0,0 +1,511 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_draw.c + * + * The main driver hooks for drawing and launching compute shaders. + */ + +#include <stdio.h> +#include <errno.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_draw.h" +#include "util/u_inlines.h" +#include "util/u_transfer.h" +#include "util/u_upload_mgr.h" +#include "intel/compiler/brw_compiler.h" +#include "intel/compiler/brw_eu_defines.h" +#include "crocus_context.h" +#include "crocus_defines.h" +#include "util/u_prim_restart.h" +#include "indices/u_primconvert.h" +#include "util/u_prim.h" + +static bool +prim_is_points_or_lines(enum pipe_prim_type mode) +{ + /* We don't need to worry about adjacency - it can only be used with + * geometry shaders, and we don't care about this info when GS is on. 
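+ *
+ * The answer only drives the XY Clip enables (via CROCUS_DIRTY_CLIP in
+ * crocus_update_draw_info() below).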
+ */ + return mode == PIPE_PRIM_POINTS || + mode == PIPE_PRIM_LINES || + mode == PIPE_PRIM_LINE_LOOP || + mode == PIPE_PRIM_LINE_STRIP; +} + +static bool +can_cut_index_handle_restart_index(struct crocus_context *ice, + const struct pipe_draw_info *draw) +{ + switch (draw->index_size) { + case 1: + return draw->restart_index == 0xff; + case 2: + return draw->restart_index == 0xffff; + case 4: + return draw->restart_index == 0xffffffff; + default: + unreachable("illegal index size\n"); + } + + return false; +} + +static bool +can_cut_index_handle_prim(struct crocus_context *ice, + const struct pipe_draw_info *draw) +{ + struct crocus_screen *screen = (struct crocus_screen*)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + /* Haswell can do it all. */ + if (devinfo->is_haswell) + return true; + + if (!can_cut_index_handle_restart_index(ice, draw)) + return false; + + switch (draw->mode) { + case PIPE_PRIM_POINTS: + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_LINES_ADJACENCY: + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + case PIPE_PRIM_TRIANGLES_ADJACENCY: + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + return true; + default: + break; + } + return false; +} + +/** + * Record the current primitive mode and restart information, flagging + * related packets as dirty if necessary. + * + * This must be called before updating compiled shaders, because the patch + * information informs the TCS key. + */ +static void +crocus_update_draw_info(struct crocus_context *ice, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count_bias *draw) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + enum pipe_prim_type mode = info->mode; + + if (screen->devinfo.ver < 6) { + /* Slight optimization to avoid the GS program when not needed: + */ + struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice); + if (mode == PIPE_PRIM_QUAD_STRIP && !rs_state->flatshade && + rs_state->fill_front == PIPE_POLYGON_MODE_FILL && + rs_state->fill_back == PIPE_POLYGON_MODE_FILL) + mode = PIPE_PRIM_TRIANGLE_STRIP; + if (mode == PIPE_PRIM_QUADS && + draw->count == 4 && + !rs_state->flatshade && + rs_state->fill_front == PIPE_POLYGON_MODE_FILL && + rs_state->fill_back == PIPE_POLYGON_MODE_FILL) + mode = PIPE_PRIM_TRIANGLE_FAN; + } + + if (ice->state.prim_mode != mode) { + ice->state.prim_mode = mode; + + if (screen->devinfo.ver < 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG; + if (screen->devinfo.ver <= 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG; + + if (screen->devinfo.ver >= 7) + ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE; + + /* For XY Clip enables */ + bool points_or_lines = prim_is_points_or_lines(mode); + if (points_or_lines != ice->state.prim_is_points_or_lines) { + ice->state.prim_is_points_or_lines = points_or_lines; + ice->state.dirty |= CROCUS_DIRTY_CLIP; + } + } + + if (info->mode == PIPE_PRIM_PATCHES && + ice->state.vertices_per_patch != info->vertices_per_patch) { + ice->state.vertices_per_patch = info->vertices_per_patch; + + /* This is needed for key->input_vertices */ + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_TCS; + + /* Flag constants dirty for gl_PatchVerticesIn if needed. 
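+ * gl_PatchVerticesIn reaches the TCS as a system value, so when the
+ * patch size changes the system value buffer must be re-uploaded.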
*/ + const struct shader_info *tcs_info = + crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL); + if (tcs_info && + BITSET_TEST(tcs_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS; + ice->state.shaders[MESA_SHADER_TESS_CTRL].sysvals_need_upload = true; + } + } + + const unsigned cut_index = info->primitive_restart ? info->restart_index : + ice->state.cut_index; + if (ice->state.primitive_restart != info->primitive_restart || + ice->state.cut_index != cut_index) { + if (screen->devinfo.is_haswell) + ice->state.dirty |= CROCUS_DIRTY_GEN75_VF; + ice->state.primitive_restart = info->primitive_restart; + ice->state.cut_index = info->restart_index; + } +} + +/** + * Update shader draw parameters, flagging VF packets as dirty if necessary. + */ +static void +crocus_update_draw_parameters(struct crocus_context *ice, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draw) +{ + bool changed = false; + + if (ice->state.vs_uses_draw_params) { + struct crocus_state_ref *draw_params = &ice->draw.draw_params; + + if (indirect && indirect->buffer) { + pipe_resource_reference(&draw_params->res, indirect->buffer); + draw_params->offset = + indirect->offset + (info->index_size ? 12 : 8); + + changed = true; + ice->draw.params_valid = false; + } else { + int firstvertex = info->index_size ? draw->index_bias : draw->start; + + if (!ice->draw.params_valid || + ice->draw.params.firstvertex != firstvertex || + ice->draw.params.baseinstance != info->start_instance) { + + changed = true; + ice->draw.params.firstvertex = firstvertex; + ice->draw.params.baseinstance = info->start_instance; + ice->draw.params_valid = true; + + u_upload_data(ice->ctx.stream_uploader, 0, + sizeof(ice->draw.params), 4, &ice->draw.params, + &draw_params->offset, &draw_params->res); + } + } + } + + if (ice->state.vs_uses_derived_draw_params) { + struct crocus_state_ref *derived_params = &ice->draw.derived_draw_params; + int is_indexed_draw = info->index_size ? 
-1 : 0; + + if (ice->draw.derived_params.drawid != drawid_offset || + ice->draw.derived_params.is_indexed_draw != is_indexed_draw) { + + changed = true; + ice->draw.derived_params.drawid = drawid_offset; + ice->draw.derived_params.is_indexed_draw = is_indexed_draw; + + u_upload_data(ice->ctx.stream_uploader, 0, + sizeof(ice->draw.derived_params), 4, + &ice->draw.derived_params, &derived_params->offset, + &derived_params->res); + } + } + + if (changed) { + ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS | + CROCUS_DIRTY_VERTEX_ELEMENTS; + } +} + +static void +crocus_indirect_draw_vbo(struct crocus_context *ice, + const struct pipe_draw_info *dinfo, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *dindirect, + const struct pipe_draw_start_count_bias *draws) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + struct pipe_draw_info info = *dinfo; + struct pipe_draw_indirect_info indirect = *dindirect; + const struct intel_device_info *devinfo = &batch->screen->devinfo; + + if (devinfo->is_haswell && indirect.indirect_draw_count && + ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) { + /* Upload MI_PREDICATE_RESULT to GPR15.*/ + screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT); + } + + uint64_t orig_dirty = ice->state.dirty; + uint64_t orig_stage_dirty = ice->state.stage_dirty; + + for (int i = 0; i < indirect.draw_count; i++) { + crocus_batch_maybe_flush(batch, 1500); + crocus_require_statebuffer_space(batch, 2400); + + crocus_update_draw_parameters(ice, &info, drawid_offset + i, &indirect, draws); + + screen->vtbl.upload_render_state(ice, batch, &info, drawid_offset + i, &indirect, draws); + + ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER; + ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER; + + indirect.offset += indirect.stride; + } + + if (devinfo->is_haswell && indirect.indirect_draw_count && + ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) { + /* Restore MI_PREDICATE_RESULT. */ + screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15)); + } + + /* Put this back for post-draw resolves, we'll clear it again after. */ + ice->state.dirty = orig_dirty; + ice->state.stage_dirty = orig_stage_dirty; +} + +static void +crocus_simple_draw_vbo(struct crocus_context *ice, + const struct pipe_draw_info *draw, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *sc) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + + crocus_batch_maybe_flush(batch, 1500); + crocus_require_statebuffer_space(batch, 2400); + + crocus_update_draw_parameters(ice, draw, drawid_offset, indirect, sc); + + screen->vtbl.upload_render_state(ice, batch, draw, drawid_offset, indirect, sc); +} + +static void +crocus_draw_vbo_get_vertex_count(struct pipe_context *ctx, + const struct pipe_draw_info *info_in, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect) +{ + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + struct pipe_draw_info info = *info_in; + struct pipe_draw_start_count_bias draw; + + uint32_t val = screen->vtbl.get_so_offset(indirect->count_from_stream_output); + + draw.start = 0; + draw.count = val; + ctx->draw_vbo(ctx, &info, drawid_offset, NULL, &draw, 1); +} + +/** + * The pipe->draw_vbo() driver hook. Performs a draw on the GPU. 
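+ *
+ * Roughly: split multi-draws, honor conditional rendering, work around
+ * unsupported primitive restart and stream-output-sourced vertex counts,
+ * record the draw info, update the compiled shaders, perform pre-draw
+ * resolves, and finally emit render state via the indirect or simple path.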
+ */ +void +crocus_draw_vbo(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + if (num_draws > 1) { + util_draw_multi(ctx, info, drawid_offset, indirect, draws, num_draws); + return; + } + + if (!indirect && (!draws[0].count || !info->instance_count)) + return; + + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_screen *screen = (struct crocus_screen*)ice->ctx.screen; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + + if (!crocus_check_conditional_render(ice)) + return; + + if (info->primitive_restart && !can_cut_index_handle_prim(ice, info)) { + util_draw_vbo_without_prim_restart(ctx, info, drawid_offset, + indirect, draws); + return; + } + + if (indirect && indirect->count_from_stream_output && + !screen->devinfo.is_haswell) { + crocus_draw_vbo_get_vertex_count(ctx, info, drawid_offset, indirect); + return; + } + + /** + * The hardware is capable of removing dangling vertices on its own; however, + * prior to Gen6, we sometimes convert quads into trifans (and quad strips + * into tristrips), since pre-Gen6 hardware requires a GS to render quads. + * This function manually trims dangling vertices from a draw call involving + * quads so that those dangling vertices won't get drawn when we convert to + * trifans/tristrips. + */ + if (screen->devinfo.ver < 6) { + if (info->mode == PIPE_PRIM_QUADS || info->mode == PIPE_PRIM_QUAD_STRIP) { + bool trim = u_trim_pipe_prim(info->mode, (unsigned *)&draws[0].count); + if (!trim) + return; + } + } + + /* We can't safely re-emit 3DSTATE_SO_BUFFERS because it may zero the + * write offsets, changing the behavior. + */ + if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) { + ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER & ~CROCUS_DIRTY_GEN7_SO_BUFFERS; + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER; + } + + /* Emit Sandybridge workaround flushes on every primitive, for safety. 
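+ * (This is the post-sync non-zero PIPE_CONTROL workaround; see
+ * crocus_emit_post_sync_nonzero_flush.)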
*/ + if (screen->devinfo.ver == 6) + crocus_emit_post_sync_nonzero_flush(batch); + + crocus_update_draw_info(ice, info, draws); + + if (!crocus_update_compiled_shaders(ice)) + return; + + if (ice->state.dirty & CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES) { + bool draw_aux_buffer_disabled[BRW_MAX_DRAW_BUFFERS] = { }; + for (gl_shader_stage stage = 0; stage < MESA_SHADER_COMPUTE; stage++) { + if (ice->shaders.prog[stage]) + crocus_predraw_resolve_inputs(ice, batch, draw_aux_buffer_disabled, + stage, true); + } + crocus_predraw_resolve_framebuffer(ice, batch, draw_aux_buffer_disabled); + } + + crocus_handle_always_flush_cache(batch); + + if (indirect && indirect->buffer) + crocus_indirect_draw_vbo(ice, info, drawid_offset, indirect, draws); + else + crocus_simple_draw_vbo(ice, info, drawid_offset, indirect, draws); + + crocus_handle_always_flush_cache(batch); + + crocus_postdraw_update_resolve_tracking(ice, batch); + + ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER; + ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER; +} + +static void +crocus_update_grid_size_resource(struct crocus_context *ice, + const struct pipe_grid_info *grid) +{ + struct crocus_state_ref *grid_ref = &ice->state.grid_size; + const struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_COMPUTE]; + bool grid_needs_surface = shader->bt.used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS]; + + if (grid->indirect) { + pipe_resource_reference(&grid_ref->res, grid->indirect); + grid_ref->offset = grid->indirect_offset; + + /* Zero out the grid size so that the next non-indirect grid launch will + * re-upload it properly. + */ + memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid)); + } else if (memcmp(ice->state.last_grid, grid->grid, sizeof(grid->grid)) != 0) { + memcpy(ice->state.last_grid, grid->grid, sizeof(grid->grid)); + u_upload_data(ice->ctx.const_uploader, 0, sizeof(grid->grid), 4, + grid->grid, &grid_ref->offset, &grid_ref->res); + } + + /* Skip surface upload if we don't need it or we already have one */ + if (!grid_needs_surface) + return; + + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_CS; +} + + +void +crocus_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_COMPUTE]; + struct crocus_screen *screen = batch->screen; + + if (!crocus_check_conditional_render(ice)) + return; + + if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) { + ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE; + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE; + } + + /* We can't do resolves on the compute engine, so awkwardly, we have to + * do them on the render batch... 
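+ * (Hence the explicit CROCUS_BATCH_RENDER below rather than the compute
+ * batch this function is otherwise filling.)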
+ */ + if (ice->state.dirty & CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES) { + crocus_predraw_resolve_inputs(ice, &ice->batches[CROCUS_BATCH_RENDER], NULL, + MESA_SHADER_COMPUTE, false); + } + + crocus_batch_maybe_flush(batch, 1500); + crocus_require_statebuffer_space(batch, 2500); + crocus_update_compiled_compute_shader(ice); + + if (memcmp(ice->state.last_block, grid->block, sizeof(grid->block)) != 0) { + memcpy(ice->state.last_block, grid->block, sizeof(grid->block)); + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS; + ice->state.shaders[MESA_SHADER_COMPUTE].sysvals_need_upload = true; + } + + crocus_update_grid_size_resource(ice, grid); + + if (ice->state.compute_predicate) { + screen->vtbl.emit_compute_predicate(batch); + ice->state.compute_predicate = NULL; + } + + crocus_handle_always_flush_cache(batch); + + screen->vtbl.upload_compute_state(ice, batch, grid); + + crocus_handle_always_flush_cache(batch); + + ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_COMPUTE; + ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE; + + /* Note: since compute shaders can't access the framebuffer, there's + * no need to call crocus_postdraw_update_resolve_tracking. + */ +} diff --git a/src/gallium/drivers/crocus/crocus_fence.c b/src/gallium/drivers/crocus/crocus_fence.c new file mode 100644 index 00000000000..fdff24b2dd4 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_fence.c @@ -0,0 +1,571 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_fence.c + * + * Fences for driver and IPC serialisation, scheduling and synchronisation. + */ + +#include "util/u_inlines.h" +#include "intel/common/intel_gem.h" + +#include "crocus_batch.h" +#include "crocus_bufmgr.h" +#include "crocus_context.h" +#include "crocus_fence.h" +#include "crocus_screen.h" + +static uint32_t +gem_syncobj_create(int fd, uint32_t flags) +{ + struct drm_syncobj_create args = { + .flags = flags, + }; + + intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_CREATE, &args); + + return args.handle; +} + +static void +gem_syncobj_destroy(int fd, uint32_t handle) +{ + struct drm_syncobj_destroy args = { + .handle = handle, + }; + + intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_DESTROY, &args); +} + +/** + * Make a new sync-point. 
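+ *
+ * This wraps a fresh DRM syncobj (DRM_IOCTL_SYNCOBJ_CREATE) in a small
+ * refcounted crocus_syncobj, which batches and fences can share via
+ * crocus_syncobj_reference().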
+ */ +struct crocus_syncobj * +crocus_create_syncobj(struct crocus_screen *screen) +{ + struct crocus_syncobj *syncobj = malloc(sizeof(*syncobj)); + + if (!syncobj) + return NULL; + + syncobj->handle = gem_syncobj_create(screen->fd, 0); + assert(syncobj->handle); + + pipe_reference_init(&syncobj->ref, 1); + + return syncobj; +} + +void +crocus_syncobj_destroy(struct crocus_screen *screen, + struct crocus_syncobj *syncobj) +{ + gem_syncobj_destroy(screen->fd, syncobj->handle); + free(syncobj); +} + +/** + * Add a sync-point to the batch, with the given flags. + * + * \p flags One of I915_EXEC_FENCE_WAIT or I915_EXEC_FENCE_SIGNAL. + */ +void +crocus_batch_add_syncobj(struct crocus_batch *batch, + struct crocus_syncobj *syncobj, unsigned flags) +{ + struct drm_i915_gem_exec_fence *fence = + util_dynarray_grow(&batch->exec_fences, struct drm_i915_gem_exec_fence, 1); + + *fence = (struct drm_i915_gem_exec_fence){ + .handle = syncobj->handle, + .flags = flags, + }; + + struct crocus_syncobj **store = + util_dynarray_grow(&batch->syncobjs, struct crocus_syncobj *, 1); + + *store = NULL; + crocus_syncobj_reference(batch->screen, store, syncobj); +} + +/** + * Walk through a batch's dependencies (any I915_EXEC_FENCE_WAIT syncobjs) + * and unreference any which have already passed. + * + * Sometimes the compute batch is seldom used, and accumulates references + * to stale render batches that are no longer of interest, so we can free + * those up. + */ +static void +clear_stale_syncobjs(struct crocus_batch *batch) +{ + struct crocus_screen *screen = batch->screen; + + int n = util_dynarray_num_elements(&batch->syncobjs, struct crocus_syncobj *); + + assert(n == util_dynarray_num_elements(&batch->exec_fences, + struct drm_i915_gem_exec_fence)); + + /* Skip the first syncobj, as it's the signalling one. */ + for (int i = n - 1; i > 1; i--) { + struct crocus_syncobj **syncobj = + util_dynarray_element(&batch->syncobjs, struct crocus_syncobj *, i); + struct drm_i915_gem_exec_fence *fence = + util_dynarray_element(&batch->exec_fences, + struct drm_i915_gem_exec_fence, i); + assert(fence->flags & I915_EXEC_FENCE_WAIT); + + if (crocus_wait_syncobj(&screen->base, *syncobj, 0)) + continue; + + /* This sync object has already passed, there's no need to continue + * marking it as a dependency; we can stop holding on to the reference. + */ + crocus_syncobj_reference(screen, syncobj, NULL); + + /* Remove it from the lists; move the last element here. 
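+ * (Standard unordered removal: pop the tail element and overwrite the
+ * current slot, so nothing needs to be shifted.)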
*/ + struct crocus_syncobj **nth_syncobj = + util_dynarray_pop_ptr(&batch->syncobjs, struct crocus_syncobj *); + struct drm_i915_gem_exec_fence *nth_fence = + util_dynarray_pop_ptr(&batch->exec_fences, + struct drm_i915_gem_exec_fence); + + if (syncobj != nth_syncobj) { + *syncobj = *nth_syncobj; + memcpy(fence, nth_fence, sizeof(*fence)); + } + } +} + +/* ------------------------------------------------------------------- */ + +struct pipe_fence_handle { + struct pipe_reference ref; + + struct pipe_context *unflushed_ctx; + + struct crocus_fine_fence *fine[CROCUS_BATCH_COUNT]; +}; + +static void +crocus_fence_destroy(struct pipe_screen *p_screen, + struct pipe_fence_handle *fence) +{ + struct crocus_screen *screen = (struct crocus_screen *)p_screen; + + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) + crocus_fine_fence_reference(screen, &fence->fine[i], NULL); + + free(fence); +} + +static void +crocus_fence_reference(struct pipe_screen *p_screen, + struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) +{ + if (pipe_reference(&(*dst)->ref, &src->ref)) + crocus_fence_destroy(p_screen, *dst); + + *dst = src; +} + +bool +crocus_wait_syncobj(struct pipe_screen *p_screen, + struct crocus_syncobj *syncobj, int64_t timeout_nsec) +{ + if (!syncobj) + return false; + + struct crocus_screen *screen = (struct crocus_screen *)p_screen; + struct drm_syncobj_wait args = { + .handles = (uintptr_t)&syncobj->handle, + .count_handles = 1, + .timeout_nsec = timeout_nsec, + }; + return intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args); +} + +static void +crocus_fence_flush(struct pipe_context *ctx, + struct pipe_fence_handle **out_fence, unsigned flags) +{ + struct crocus_screen *screen = (void *)ctx->screen; + struct crocus_context *ice = (struct crocus_context *)ctx; + + const bool deferred = flags & PIPE_FLUSH_DEFERRED; + + if (!deferred) { + for (unsigned i = 0; i < ice->batch_count; i++) + crocus_batch_flush(&ice->batches[i]); + } + + if (!out_fence) + return; + + struct pipe_fence_handle *fence = calloc(1, sizeof(*fence)); + if (!fence) + return; + + pipe_reference_init(&fence->ref, 1); + + if (deferred) + fence->unflushed_ctx = ctx; + + for (unsigned b = 0; b < ice->batch_count; b++) { + struct crocus_batch *batch = &ice->batches[b]; + + if (deferred && crocus_batch_bytes_used(batch) > 0) { + struct crocus_fine_fence *fine = + crocus_fine_fence_new(batch, CROCUS_FENCE_BOTTOM_OF_PIPE); + crocus_fine_fence_reference(screen, &fence->fine[b], fine); + crocus_fine_fence_reference(screen, &fine, NULL); + } else { + /* This batch has no commands queued up (perhaps we just flushed, + * or all the commands are on the other batch). Wait for the last + * syncobj on this engine - unless it's already finished by now. + */ + if (crocus_fine_fence_signaled(batch->last_fence)) + continue; + + crocus_fine_fence_reference(screen, &fence->fine[b], + batch->last_fence); + } + } + + crocus_fence_reference(ctx->screen, out_fence, NULL); + *out_fence = fence; +} + +static void +crocus_fence_await(struct pipe_context *ctx, struct pipe_fence_handle *fence) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + /* Unflushed fences from the same context are no-ops. 
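+ * Whatever work they refer to is already queued in our own batches,
+ * ahead of anything we record next, so there is nothing to wait for.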
*/ + if (ctx && ctx == fence->unflushed_ctx) + return; + + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + if (crocus_fine_fence_signaled(fine)) + continue; + + for (unsigned b = 0; b < ice->batch_count; b++) { + struct crocus_batch *batch = &ice->batches[b]; + + /* We're going to make any future work in this batch wait for our + * fence to have gone by. But any currently queued work doesn't + * need to wait. Flush the batch now, so it can happen sooner. + */ + crocus_batch_flush(batch); + + /* Before adding a new reference, clean out any stale ones. */ + clear_stale_syncobjs(batch); + + crocus_batch_add_syncobj(batch, fine->syncobj, I915_EXEC_FENCE_WAIT); + } + } +} + +#define NSEC_PER_SEC (1000 * USEC_PER_SEC) +#define USEC_PER_SEC (1000 * MSEC_PER_SEC) +#define MSEC_PER_SEC (1000) + +static uint64_t +gettime_ns(void) +{ + struct timespec current; + clock_gettime(CLOCK_MONOTONIC, ¤t); + return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec; +} + +static uint64_t +rel2abs(uint64_t timeout) +{ + if (timeout == 0) + return 0; + + uint64_t current_time = gettime_ns(); + uint64_t max_timeout = (uint64_t)INT64_MAX - current_time; + + timeout = MIN2(max_timeout, timeout); + + return current_time + timeout; +} + +static bool +crocus_fence_finish(struct pipe_screen *p_screen, struct pipe_context *ctx, + struct pipe_fence_handle *fence, uint64_t timeout) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_screen *screen = (struct crocus_screen *)p_screen; + + /* If we created the fence with PIPE_FLUSH_DEFERRED, we may not have + * flushed yet. Check if our syncobj is the current batch's signalling + * syncobj - if so, we haven't flushed and need to now. + * + * The Gallium docs mention that a flush will occur if \p ctx matches + * the context the fence was created with. It may be NULL, so we check + * that it matches first. + */ + if (ctx && ctx == fence->unflushed_ctx) { + for (unsigned i = 0; i < ice->batch_count; i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + if (crocus_fine_fence_signaled(fine)) + continue; + + if (fine->syncobj == crocus_batch_get_signal_syncobj(&ice->batches[i])) + crocus_batch_flush(&ice->batches[i]); + } + + /* The fence is no longer deferred. */ + fence->unflushed_ctx = NULL; + } + + unsigned int handle_count = 0; + uint32_t handles[ARRAY_SIZE(fence->fine)]; + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + if (crocus_fine_fence_signaled(fine)) + continue; + + handles[handle_count++] = fine->syncobj->handle; + } + + if (handle_count == 0) + return true; + + struct drm_syncobj_wait args = { + .handles = (uintptr_t)handles, + .count_handles = handle_count, + .timeout_nsec = rel2abs(timeout), + .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL + }; + if (fence->unflushed_ctx) { + /* This fence had a deferred flush from another context. We can't + * safely flush it here, because the context might be bound to a + * different thread, and poking at its internals wouldn't be safe. + * + * Instead, use the WAIT_FOR_SUBMIT flag to block and hope that + * another thread submits the work. + */ + args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT; + } + return intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args) == 0; +} + +#ifndef SYNC_IOC_MAGIC +/* duplicated from linux/sync_file.h to avoid build-time dependency + * on new (v4.7) kernel headers. 
Once distro's are mostly using + * something newer than v4.7 drop this and #include <linux/sync_file.h> + * instead. + */ +struct sync_merge_data { + char name[32]; + __s32 fd2; + __s32 fence; + __u32 flags; + __u32 pad; +}; + +#define SYNC_IOC_MAGIC '>' +#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data) +#endif + +static int +sync_merge_fd(int sync_fd, int new_fd) +{ + if (sync_fd == -1) + return new_fd; + + if (new_fd == -1) + return sync_fd; + + struct sync_merge_data args = { + .name = "crocus fence", + .fd2 = new_fd, + .fence = -1, + }; + + intel_ioctl(sync_fd, SYNC_IOC_MERGE, &args); + close(new_fd); + close(sync_fd); + + return args.fence; +} + +static int +crocus_fence_get_fd(struct pipe_screen *p_screen, + struct pipe_fence_handle *fence) +{ + struct crocus_screen *screen = (struct crocus_screen *)p_screen; + int fd = -1; + + /* Deferred fences aren't supported. */ + if (fence->unflushed_ctx) + return -1; + + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + if (crocus_fine_fence_signaled(fine)) + continue; + + struct drm_syncobj_handle args = { + .handle = fine->syncobj->handle, + .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE, + .fd = -1, + }; + + intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args); + fd = sync_merge_fd(fd, args.fd); + } + + if (fd == -1) { + /* Our fence has no syncobj's recorded. This means that all of the + * batches had already completed, their syncobj's had been signalled, + * and so we didn't bother to record them. But we're being asked to + * export such a fence. So export a dummy already-signalled syncobj. + */ + struct drm_syncobj_handle args = { + .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE, + .fd = -1, + }; + + args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED); + intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args); + gem_syncobj_destroy(screen->fd, args.handle); + return args.fd; + } + + return fd; +} + +static void +crocus_fence_create_fd(struct pipe_context *ctx, struct pipe_fence_handle **out, + int fd, enum pipe_fd_type type) +{ + assert(type == PIPE_FD_TYPE_NATIVE_SYNC || type == PIPE_FD_TYPE_SYNCOBJ); + + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + struct drm_syncobj_handle args = { + .fd = fd, + }; + + if (type == PIPE_FD_TYPE_NATIVE_SYNC) { + args.flags = DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE; + args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED); + } + + if (intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args) == -1) { + fprintf(stderr, "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE failed: %s\n", + strerror(errno)); + if (type == PIPE_FD_TYPE_NATIVE_SYNC) + gem_syncobj_destroy(screen->fd, args.handle); + *out = NULL; + return; + } + + struct crocus_syncobj *syncobj = malloc(sizeof(*syncobj)); + if (!syncobj) { + *out = NULL; + return; + } + syncobj->handle = args.handle; + pipe_reference_init(&syncobj->ref, 1); + + struct crocus_fine_fence *fine = calloc(1, sizeof(*fine)); + if (!fine) { + free(syncobj); + *out = NULL; + return; + } + + static const uint32_t zero = 0; + + /* Fences work in terms of crocus_fine_fence, but we don't actually have a + * seqno for an imported fence. So, create a fake one which always + * returns as 'not signaled' so we fall back to using the sync object. 
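+ * (seqno is set to UINT32_MAX and map points at a constant zero, so
+ * crocus_fine_fence_signaled() always sees 0 < UINT32_MAX and reports
+ * "not signaled", leaving the syncobj as the only wait mechanism.)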
+ */ + fine->seqno = UINT32_MAX; + fine->map = &zero; + fine->syncobj = syncobj; + fine->flags = CROCUS_FENCE_END; + pipe_reference_init(&fine->reference, 1); + + struct pipe_fence_handle *fence = calloc(1, sizeof(*fence)); + if (!fence) { + free(fine); + free(syncobj); + *out = NULL; + return; + } + pipe_reference_init(&fence->ref, 1); + fence->fine[0] = fine; + + *out = fence; +} + +static void +crocus_fence_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + if (ctx == fence->unflushed_ctx) + return; + + for (unsigned b = 0; b < ice->batch_count; b++) { + for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) { + struct crocus_fine_fence *fine = fence->fine[i]; + + /* already signaled fence skipped */ + if (crocus_fine_fence_signaled(fine)) + continue; + + ice->batches[b].contains_fence_signal = true; + crocus_batch_add_syncobj(&ice->batches[b], fine->syncobj, + I915_EXEC_FENCE_SIGNAL); + } + } +} + +void +crocus_init_screen_fence_functions(struct pipe_screen *screen) +{ + screen->fence_reference = crocus_fence_reference; + screen->fence_finish = crocus_fence_finish; + screen->fence_get_fd = crocus_fence_get_fd; +} + +void +crocus_init_context_fence_functions(struct pipe_context *ctx) +{ + ctx->flush = crocus_fence_flush; + ctx->create_fence_fd = crocus_fence_create_fd; + ctx->fence_server_sync = crocus_fence_await; + ctx->fence_server_signal = crocus_fence_signal; +} diff --git a/src/gallium/drivers/crocus/crocus_fence.h b/src/gallium/drivers/crocus/crocus_fence.h new file mode 100644 index 00000000000..ef2eff5259b --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_fence.h @@ -0,0 +1,60 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef CROCUS_FENCE_H +#define CROCUS_FENCE_H + +#include "util/u_inlines.h" + +struct pipe_screen; +struct crocus_screen; +struct crocus_batch; + +struct crocus_syncobj { + struct pipe_reference ref; + uint32_t handle; +}; + +void crocus_init_context_fence_functions(struct pipe_context *ctx); +void crocus_init_screen_fence_functions(struct pipe_screen *screen); + +struct crocus_syncobj *crocus_create_syncobj(struct crocus_screen *screen); +void crocus_syncobj_destroy(struct crocus_screen *, struct crocus_syncobj *); +void crocus_batch_add_syncobj(struct crocus_batch *batch, + struct crocus_syncobj *syncobj, + unsigned flags); +bool crocus_wait_syncobj(struct pipe_screen *screen, + struct crocus_syncobj *syncobj, + int64_t timeout_nsec); +static inline void +crocus_syncobj_reference(struct crocus_screen *screen, + struct crocus_syncobj **dst, + struct crocus_syncobj *src) +{ + if (pipe_reference(&(*dst)->ref, &src->ref)) + crocus_syncobj_destroy(screen, *dst); + + *dst = src; +} + +#endif diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.c b/src/gallium/drivers/crocus/crocus_fine_fence.c new file mode 100644 index 00000000000..9bb8a9673e3 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_fine_fence.c @@ -0,0 +1,85 @@ +#include "crocus_context.h" +#include "crocus_fine_fence.h" +#include "util/u_upload_mgr.h" + +static void +crocus_fine_fence_reset(struct crocus_batch *batch) +{ + u_upload_alloc(batch->fine_fences.uploader, + 0, sizeof(uint64_t), sizeof(uint64_t), + &batch->fine_fences.ref.offset, &batch->fine_fences.ref.res, + (void **)&batch->fine_fences.map); + WRITE_ONCE(*batch->fine_fences.map, 0); + batch->fine_fences.next++; +} + +void +crocus_fine_fence_init(struct crocus_batch *batch) +{ + batch->fine_fences.ref.res = NULL; + batch->fine_fences.next = 0; + if (batch_has_fine_fence(batch)) + crocus_fine_fence_reset(batch); +} + +static uint32_t +crocus_fine_fence_next(struct crocus_batch *batch) +{ + if (!batch_has_fine_fence(batch)) + return UINT32_MAX; + + uint32_t seqno = batch->fine_fences.next++; + + if (batch->fine_fences.next == 0) + crocus_fine_fence_reset(batch); + + return seqno; +} + +void +crocus_fine_fence_destroy(struct crocus_screen *screen, + struct crocus_fine_fence *fine) +{ + crocus_syncobj_reference(screen, &fine->syncobj, NULL); + pipe_resource_reference(&fine->ref.res, NULL); + free(fine); +} + +struct crocus_fine_fence * +crocus_fine_fence_new(struct crocus_batch *batch, unsigned flags) +{ + struct crocus_fine_fence *fine = calloc(1, sizeof(*fine)); + if (!fine) + return NULL; + + pipe_reference_init(&fine->reference, 1); + + fine->seqno = crocus_fine_fence_next(batch); + + crocus_syncobj_reference(batch->screen, &fine->syncobj, + crocus_batch_get_signal_syncobj(batch)); + + if (!batch_has_fine_fence(batch)) + return fine; + pipe_resource_reference(&fine->ref.res, batch->fine_fences.ref.res); + fine->ref.offset = batch->fine_fences.ref.offset; + fine->map = batch->fine_fences.map; + fine->flags = flags; + + unsigned pc; + if (flags & CROCUS_FENCE_TOP_OF_PIPE) { + pc = PIPE_CONTROL_WRITE_IMMEDIATE | PIPE_CONTROL_CS_STALL; + } else { + pc = PIPE_CONTROL_WRITE_IMMEDIATE | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_TILE_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DATA_CACHE_FLUSH; + } + crocus_emit_pipe_control_write(batch, "fence: fine", pc, + crocus_resource_bo(fine->ref.res), + fine->ref.offset, + fine->seqno); + + return fine; +} diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.h 
b/src/gallium/drivers/crocus/crocus_fine_fence.h new file mode 100644 index 00000000000..ad6f02a945a --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_fine_fence.h @@ -0,0 +1,109 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef CROCUS_FINE_FENCE_DOT_H +#define CROCUS_FINE_FENCE_DOT_H + +#include <stdbool.h> +#include <stdint.h> + +#include "crocus_screen.h" +#include "crocus_resource.h" + +/** + * A lightweight sequence number fence. + * + * We emit PIPE_CONTROLs inside a batch (possibly in the middle) + * which update a monotonically increasing, 32-bit counter. We + * can then check if that moment has passed by either: + * + * 1. Checking on the CPU by snooping on the DWord via a coherent map + * + * 2. Blocking on the GPU with MI_SEMAPHORE_WAIT from a second batch + * (relying on mid-batch preemption to switch GPU execution to the + * batch that writes it). + */ +struct crocus_fine_fence { + struct pipe_reference reference; + + /** Buffer where the seqno lives */ + struct crocus_state_ref ref; + + /** Coherent CPU map of the buffer containing the seqno DWord. */ + const uint32_t *map; + + /** + * A drm_syncobj pointing which will be signaled at the end of the + * batch which writes this seqno. This can be used to block until + * the seqno has definitely passed (but may wait longer than necessary). + */ + struct crocus_syncobj *syncobj; + +#define CROCUS_FENCE_BOTTOM_OF_PIPE 0x0 /**< Written by bottom-of-pipe flush */ +#define CROCUS_FENCE_TOP_OF_PIPE 0x1 /**< Written by top-of-pipe flush */ +#define CROCUS_FENCE_END 0x2 /**< Written at the end of a batch */ + + /** Information about the type of flush involved (see CROCUS_FENCE_*) */ + uint32_t flags; + + /** + * Sequence number expected to be written by the flush we inserted + * when creating this fence. The crocus_fine_fence is 'signaled' when *@map + * (written by the flush on the GPU) is greater-than-or-equal to @seqno. 
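+ *
+ * In other words, the fence counts as signaled once
+ *
+ *    READ_ONCE(*map) >= seqno
+ *
+ * holds, which is exactly what crocus_fine_fence_signaled() checks below.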
+ */ + uint32_t seqno; +}; + +void crocus_fine_fence_init(struct crocus_batch *batch); + +struct crocus_fine_fence *crocus_fine_fence_new(struct crocus_batch *batch, + unsigned flags); + +void crocus_fine_fence_destroy(struct crocus_screen *screen, + struct crocus_fine_fence *sq); + +static inline void +crocus_fine_fence_reference(struct crocus_screen *screen, + struct crocus_fine_fence **dst, + struct crocus_fine_fence *src) +{ + if (pipe_reference(&(*dst)->reference, &src->reference)) + crocus_fine_fence_destroy(screen, *dst); + + *dst = src; +} + +/** + * Return true if this seqno has passed. + * + * NULL is considered signaled. + */ +static inline bool +crocus_fine_fence_signaled(const struct crocus_fine_fence *sq) +{ + if (sq && !sq->map) + return false; + return !sq || (READ_ONCE(*sq->map) >= sq->seqno); +} + +#endif diff --git a/src/gallium/drivers/crocus/crocus_formats.c b/src/gallium/drivers/crocus/crocus_formats.c new file mode 100644 index 00000000000..31762643bdc --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_formats.c @@ -0,0 +1,576 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_formats.c + * + * Converts Gallium formats (PIPE_FORMAT_*) to hardware ones (ISL_FORMAT_*). + * Provides information about which formats support what features. + */ + +#include "util/bitscan.h" +#include "util/macros.h" +#include "util/format/u_format.h" + +#include "crocus_resource.h" +#include "crocus_screen.h" + +static enum isl_format +crocus_isl_format_for_pipe_format(enum pipe_format pf) +{ + static const enum isl_format table[PIPE_FORMAT_COUNT] = { + [0 ... 
PIPE_FORMAT_COUNT-1] = ISL_FORMAT_UNSUPPORTED, + + [PIPE_FORMAT_B8G8R8A8_UNORM] = ISL_FORMAT_B8G8R8A8_UNORM, + [PIPE_FORMAT_B8G8R8X8_UNORM] = ISL_FORMAT_B8G8R8X8_UNORM, + [PIPE_FORMAT_B5G5R5A1_UNORM] = ISL_FORMAT_B5G5R5A1_UNORM, + [PIPE_FORMAT_B4G4R4A4_UNORM] = ISL_FORMAT_B4G4R4A4_UNORM, + [PIPE_FORMAT_B5G6R5_UNORM] = ISL_FORMAT_B5G6R5_UNORM, + [PIPE_FORMAT_R10G10B10A2_UNORM] = ISL_FORMAT_R10G10B10A2_UNORM, + + [PIPE_FORMAT_Z16_UNORM] = ISL_FORMAT_R16_UNORM, + [PIPE_FORMAT_Z32_UNORM] = ISL_FORMAT_R32_UNORM, + [PIPE_FORMAT_Z32_FLOAT] = ISL_FORMAT_R32_FLOAT, + + /* We translate the combined depth/stencil formats to depth only here */ + [PIPE_FORMAT_Z24_UNORM_S8_UINT] = ISL_FORMAT_R24_UNORM_X8_TYPELESS, + [PIPE_FORMAT_Z24X8_UNORM] = ISL_FORMAT_R24_UNORM_X8_TYPELESS, + [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = ISL_FORMAT_R32_FLOAT, + + [PIPE_FORMAT_S8_UINT] = ISL_FORMAT_R8_UINT, + [PIPE_FORMAT_X24S8_UINT] = ISL_FORMAT_R8_UINT, + [PIPE_FORMAT_X32_S8X24_UINT] = ISL_FORMAT_R8_UINT, + + [PIPE_FORMAT_R64_FLOAT] = ISL_FORMAT_R64_FLOAT, + [PIPE_FORMAT_R64G64_FLOAT] = ISL_FORMAT_R64G64_FLOAT, + [PIPE_FORMAT_R64G64B64_FLOAT] = ISL_FORMAT_R64G64B64_FLOAT, + [PIPE_FORMAT_R64G64B64A64_FLOAT] = ISL_FORMAT_R64G64B64A64_FLOAT, + [PIPE_FORMAT_R32_FLOAT] = ISL_FORMAT_R32_FLOAT, + [PIPE_FORMAT_R32G32_FLOAT] = ISL_FORMAT_R32G32_FLOAT, + [PIPE_FORMAT_R32G32B32_FLOAT] = ISL_FORMAT_R32G32B32_FLOAT, + [PIPE_FORMAT_R32G32B32A32_FLOAT] = ISL_FORMAT_R32G32B32A32_FLOAT, + [PIPE_FORMAT_R32_UNORM] = ISL_FORMAT_R32_UNORM, + [PIPE_FORMAT_R32G32_UNORM] = ISL_FORMAT_R32G32_UNORM, + [PIPE_FORMAT_R32G32B32_UNORM] = ISL_FORMAT_R32G32B32_UNORM, + [PIPE_FORMAT_R32G32B32A32_UNORM] = ISL_FORMAT_R32G32B32A32_UNORM, + [PIPE_FORMAT_R32_USCALED] = ISL_FORMAT_R32_USCALED, + [PIPE_FORMAT_R32G32_USCALED] = ISL_FORMAT_R32G32_USCALED, + [PIPE_FORMAT_R32G32B32_USCALED] = ISL_FORMAT_R32G32B32_USCALED, + [PIPE_FORMAT_R32G32B32A32_USCALED] = ISL_FORMAT_R32G32B32A32_USCALED, + [PIPE_FORMAT_R32_SNORM] = ISL_FORMAT_R32_SNORM, + [PIPE_FORMAT_R32G32_SNORM] = ISL_FORMAT_R32G32_SNORM, + [PIPE_FORMAT_R32G32B32_SNORM] = ISL_FORMAT_R32G32B32_SNORM, + [PIPE_FORMAT_R32G32B32A32_SNORM] = ISL_FORMAT_R32G32B32A32_SNORM, + [PIPE_FORMAT_R32_SSCALED] = ISL_FORMAT_R32_SSCALED, + [PIPE_FORMAT_R32G32_SSCALED] = ISL_FORMAT_R32G32_SSCALED, + [PIPE_FORMAT_R32G32B32_SSCALED] = ISL_FORMAT_R32G32B32_SSCALED, + [PIPE_FORMAT_R32G32B32A32_SSCALED] = ISL_FORMAT_R32G32B32A32_SSCALED, + [PIPE_FORMAT_R16_UNORM] = ISL_FORMAT_R16_UNORM, + [PIPE_FORMAT_R16G16_UNORM] = ISL_FORMAT_R16G16_UNORM, + [PIPE_FORMAT_R16G16B16_UNORM] = ISL_FORMAT_R16G16B16_UNORM, + [PIPE_FORMAT_R16G16B16A16_UNORM] = ISL_FORMAT_R16G16B16A16_UNORM, + [PIPE_FORMAT_R16_USCALED] = ISL_FORMAT_R16_USCALED, + [PIPE_FORMAT_R16G16_USCALED] = ISL_FORMAT_R16G16_USCALED, + [PIPE_FORMAT_R16G16B16_USCALED] = ISL_FORMAT_R16G16B16_USCALED, + [PIPE_FORMAT_R16G16B16A16_USCALED] = ISL_FORMAT_R16G16B16A16_USCALED, + [PIPE_FORMAT_R16_SNORM] = ISL_FORMAT_R16_SNORM, + [PIPE_FORMAT_R16G16_SNORM] = ISL_FORMAT_R16G16_SNORM, + [PIPE_FORMAT_R16G16B16_SNORM] = ISL_FORMAT_R16G16B16_SNORM, + [PIPE_FORMAT_R16G16B16A16_SNORM] = ISL_FORMAT_R16G16B16A16_SNORM, + [PIPE_FORMAT_R16_SSCALED] = ISL_FORMAT_R16_SSCALED, + [PIPE_FORMAT_R16G16_SSCALED] = ISL_FORMAT_R16G16_SSCALED, + [PIPE_FORMAT_R16G16B16_SSCALED] = ISL_FORMAT_R16G16B16_SSCALED, + [PIPE_FORMAT_R16G16B16A16_SSCALED] = ISL_FORMAT_R16G16B16A16_SSCALED, + [PIPE_FORMAT_R8_UNORM] = ISL_FORMAT_R8_UNORM, + [PIPE_FORMAT_R8G8_UNORM] = ISL_FORMAT_R8G8_UNORM, + [PIPE_FORMAT_R8G8B8_UNORM] = 
ISL_FORMAT_R8G8B8_UNORM, + [PIPE_FORMAT_R8G8B8A8_UNORM] = ISL_FORMAT_R8G8B8A8_UNORM, + [PIPE_FORMAT_R8_USCALED] = ISL_FORMAT_R8_USCALED, + [PIPE_FORMAT_R8G8_USCALED] = ISL_FORMAT_R8G8_USCALED, + [PIPE_FORMAT_R8G8B8_USCALED] = ISL_FORMAT_R8G8B8_USCALED, + [PIPE_FORMAT_R8G8B8A8_USCALED] = ISL_FORMAT_R8G8B8A8_USCALED, + [PIPE_FORMAT_R8_SNORM] = ISL_FORMAT_R8_SNORM, + [PIPE_FORMAT_R8G8_SNORM] = ISL_FORMAT_R8G8_SNORM, + [PIPE_FORMAT_R8G8B8_SNORM] = ISL_FORMAT_R8G8B8_SNORM, + [PIPE_FORMAT_R8G8B8A8_SNORM] = ISL_FORMAT_R8G8B8A8_SNORM, + [PIPE_FORMAT_R8_SSCALED] = ISL_FORMAT_R8_SSCALED, + [PIPE_FORMAT_R8G8_SSCALED] = ISL_FORMAT_R8G8_SSCALED, + [PIPE_FORMAT_R8G8B8_SSCALED] = ISL_FORMAT_R8G8B8_SSCALED, + [PIPE_FORMAT_R8G8B8A8_SSCALED] = ISL_FORMAT_R8G8B8A8_SSCALED, + [PIPE_FORMAT_R32_FIXED] = ISL_FORMAT_R32_SFIXED, + [PIPE_FORMAT_R32G32_FIXED] = ISL_FORMAT_R32G32_SFIXED, + [PIPE_FORMAT_R32G32B32_FIXED] = ISL_FORMAT_R32G32B32_SFIXED, + [PIPE_FORMAT_R32G32B32A32_FIXED] = ISL_FORMAT_R32G32B32A32_SFIXED, + [PIPE_FORMAT_R16_FLOAT] = ISL_FORMAT_R16_FLOAT, + [PIPE_FORMAT_R16G16_FLOAT] = ISL_FORMAT_R16G16_FLOAT, + [PIPE_FORMAT_R16G16B16_FLOAT] = ISL_FORMAT_R16G16B16_FLOAT, + [PIPE_FORMAT_R16G16B16A16_FLOAT] = ISL_FORMAT_R16G16B16A16_FLOAT, + + [PIPE_FORMAT_R8G8B8_SRGB] = ISL_FORMAT_R8G8B8_UNORM_SRGB, + [PIPE_FORMAT_B8G8R8A8_SRGB] = ISL_FORMAT_B8G8R8A8_UNORM_SRGB, + [PIPE_FORMAT_B8G8R8X8_SRGB] = ISL_FORMAT_B8G8R8X8_UNORM_SRGB, + [PIPE_FORMAT_R8G8B8A8_SRGB] = ISL_FORMAT_R8G8B8A8_UNORM_SRGB, + + [PIPE_FORMAT_DXT1_RGB] = ISL_FORMAT_BC1_UNORM, + [PIPE_FORMAT_DXT1_RGBA] = ISL_FORMAT_BC1_UNORM, + [PIPE_FORMAT_DXT3_RGBA] = ISL_FORMAT_BC2_UNORM, + [PIPE_FORMAT_DXT5_RGBA] = ISL_FORMAT_BC3_UNORM, + + [PIPE_FORMAT_DXT1_SRGB] = ISL_FORMAT_BC1_UNORM_SRGB, + [PIPE_FORMAT_DXT1_SRGBA] = ISL_FORMAT_BC1_UNORM_SRGB, + [PIPE_FORMAT_DXT3_SRGBA] = ISL_FORMAT_BC2_UNORM_SRGB, + [PIPE_FORMAT_DXT5_SRGBA] = ISL_FORMAT_BC3_UNORM_SRGB, + + [PIPE_FORMAT_RGTC1_UNORM] = ISL_FORMAT_BC4_UNORM, + [PIPE_FORMAT_RGTC1_SNORM] = ISL_FORMAT_BC4_SNORM, + [PIPE_FORMAT_RGTC2_UNORM] = ISL_FORMAT_BC5_UNORM, + [PIPE_FORMAT_RGTC2_SNORM] = ISL_FORMAT_BC5_SNORM, + + [PIPE_FORMAT_R10G10B10A2_USCALED] = ISL_FORMAT_R10G10B10A2_USCALED, + [PIPE_FORMAT_R11G11B10_FLOAT] = ISL_FORMAT_R11G11B10_FLOAT, + [PIPE_FORMAT_R9G9B9E5_FLOAT] = ISL_FORMAT_R9G9B9E5_SHAREDEXP, + [PIPE_FORMAT_R1_UNORM] = ISL_FORMAT_R1_UNORM, + [PIPE_FORMAT_R10G10B10X2_USCALED] = ISL_FORMAT_R10G10B10X2_USCALED, + [PIPE_FORMAT_B10G10R10A2_UNORM] = ISL_FORMAT_B10G10R10A2_UNORM, + [PIPE_FORMAT_R8G8B8X8_UNORM] = ISL_FORMAT_R8G8B8X8_UNORM, + + [PIPE_FORMAT_I8_UNORM] = ISL_FORMAT_R8_UNORM, + [PIPE_FORMAT_I16_UNORM] = ISL_FORMAT_R16_UNORM, + [PIPE_FORMAT_I8_SNORM] = ISL_FORMAT_R8_SNORM, + [PIPE_FORMAT_I16_SNORM] = ISL_FORMAT_R16_SNORM, + [PIPE_FORMAT_I16_FLOAT] = ISL_FORMAT_R16_FLOAT, + [PIPE_FORMAT_I32_FLOAT] = ISL_FORMAT_R32_FLOAT, + + [PIPE_FORMAT_L8_UINT] = ISL_FORMAT_L8_UINT, + [PIPE_FORMAT_L8_UNORM] = ISL_FORMAT_L8_UNORM, + [PIPE_FORMAT_L8_SNORM] = ISL_FORMAT_R8_SNORM, + [PIPE_FORMAT_L8_SINT] = ISL_FORMAT_L8_SINT, + [PIPE_FORMAT_L16_UNORM] = ISL_FORMAT_L16_UNORM, + [PIPE_FORMAT_L16_SNORM] = ISL_FORMAT_R16_SNORM, + [PIPE_FORMAT_L16_FLOAT] = ISL_FORMAT_L16_FLOAT, + [PIPE_FORMAT_L32_FLOAT] = ISL_FORMAT_L32_FLOAT, + + [PIPE_FORMAT_A8_UNORM] = ISL_FORMAT_A8_UNORM, + [PIPE_FORMAT_A16_UNORM] = ISL_FORMAT_A16_UNORM, + [PIPE_FORMAT_A16_FLOAT] = ISL_FORMAT_A16_FLOAT, + [PIPE_FORMAT_A32_FLOAT] = ISL_FORMAT_A32_FLOAT, + + [PIPE_FORMAT_L8A8_UNORM] = ISL_FORMAT_L8A8_UNORM, + [PIPE_FORMAT_L16A16_UNORM] = 
ISL_FORMAT_L16A16_UNORM, + [PIPE_FORMAT_L16A16_FLOAT] = ISL_FORMAT_L16A16_FLOAT, + [PIPE_FORMAT_L32A32_FLOAT] = ISL_FORMAT_L32A32_FLOAT, + + /* Sadly, we have to use luminance[-alpha] formats for sRGB decoding. */ + [PIPE_FORMAT_R8_SRGB] = ISL_FORMAT_L8_UNORM_SRGB, + [PIPE_FORMAT_L8_SRGB] = ISL_FORMAT_L8_UNORM_SRGB, + [PIPE_FORMAT_L8A8_SRGB] = ISL_FORMAT_L8A8_UNORM_SRGB, + + [PIPE_FORMAT_R10G10B10A2_SSCALED] = ISL_FORMAT_R10G10B10A2_SSCALED, + [PIPE_FORMAT_R10G10B10A2_SNORM] = ISL_FORMAT_R10G10B10A2_SNORM, + + [PIPE_FORMAT_B10G10R10A2_USCALED] = ISL_FORMAT_B10G10R10A2_USCALED, + [PIPE_FORMAT_B10G10R10A2_SSCALED] = ISL_FORMAT_B10G10R10A2_SSCALED, + [PIPE_FORMAT_B10G10R10A2_SNORM] = ISL_FORMAT_B10G10R10A2_SNORM, + + [PIPE_FORMAT_R8_UINT] = ISL_FORMAT_R8_UINT, + [PIPE_FORMAT_R8G8_UINT] = ISL_FORMAT_R8G8_UINT, + [PIPE_FORMAT_R8G8B8_UINT] = ISL_FORMAT_R8G8B8_UINT, + [PIPE_FORMAT_R8G8B8A8_UINT] = ISL_FORMAT_R8G8B8A8_UINT, + + [PIPE_FORMAT_R8_SINT] = ISL_FORMAT_R8_SINT, + [PIPE_FORMAT_R8G8_SINT] = ISL_FORMAT_R8G8_SINT, + [PIPE_FORMAT_R8G8B8_SINT] = ISL_FORMAT_R8G8B8_SINT, + [PIPE_FORMAT_R8G8B8A8_SINT] = ISL_FORMAT_R8G8B8A8_SINT, + + [PIPE_FORMAT_R16_UINT] = ISL_FORMAT_R16_UINT, + [PIPE_FORMAT_R16G16_UINT] = ISL_FORMAT_R16G16_UINT, + [PIPE_FORMAT_R16G16B16_UINT] = ISL_FORMAT_R16G16B16_UINT, + [PIPE_FORMAT_R16G16B16A16_UINT] = ISL_FORMAT_R16G16B16A16_UINT, + + [PIPE_FORMAT_R16_SINT] = ISL_FORMAT_R16_SINT, + [PIPE_FORMAT_R16G16_SINT] = ISL_FORMAT_R16G16_SINT, + [PIPE_FORMAT_R16G16B16_SINT] = ISL_FORMAT_R16G16B16_SINT, + [PIPE_FORMAT_R16G16B16A16_SINT] = ISL_FORMAT_R16G16B16A16_SINT, + + [PIPE_FORMAT_R32_UINT] = ISL_FORMAT_R32_UINT, + [PIPE_FORMAT_R32G32_UINT] = ISL_FORMAT_R32G32_UINT, + [PIPE_FORMAT_R32G32B32_UINT] = ISL_FORMAT_R32G32B32_UINT, + [PIPE_FORMAT_R32G32B32A32_UINT] = ISL_FORMAT_R32G32B32A32_UINT, + + [PIPE_FORMAT_R32_SINT] = ISL_FORMAT_R32_SINT, + [PIPE_FORMAT_R32G32_SINT] = ISL_FORMAT_R32G32_SINT, + [PIPE_FORMAT_R32G32B32_SINT] = ISL_FORMAT_R32G32B32_SINT, + [PIPE_FORMAT_R32G32B32A32_SINT] = ISL_FORMAT_R32G32B32A32_SINT, + + [PIPE_FORMAT_B10G10R10A2_UINT] = ISL_FORMAT_B10G10R10A2_UINT, + + [PIPE_FORMAT_ETC1_RGB8] = ISL_FORMAT_ETC1_RGB8, + + [PIPE_FORMAT_R8G8B8X8_SRGB] = ISL_FORMAT_R8G8B8X8_UNORM_SRGB, + [PIPE_FORMAT_B10G10R10X2_UNORM] = ISL_FORMAT_B10G10R10X2_UNORM, + [PIPE_FORMAT_R16G16B16X16_UNORM] = ISL_FORMAT_R16G16B16X16_UNORM, + [PIPE_FORMAT_R16G16B16X16_FLOAT] = ISL_FORMAT_R16G16B16X16_FLOAT, + [PIPE_FORMAT_R32G32B32X32_FLOAT] = ISL_FORMAT_R32G32B32X32_FLOAT, + + [PIPE_FORMAT_R10G10B10A2_UINT] = ISL_FORMAT_R10G10B10A2_UINT, + + [PIPE_FORMAT_B5G6R5_SRGB] = ISL_FORMAT_B5G6R5_UNORM_SRGB, + + [PIPE_FORMAT_BPTC_RGBA_UNORM] = ISL_FORMAT_BC7_UNORM, + [PIPE_FORMAT_BPTC_SRGBA] = ISL_FORMAT_BC7_UNORM_SRGB, + [PIPE_FORMAT_BPTC_RGB_FLOAT] = ISL_FORMAT_BC6H_SF16, + [PIPE_FORMAT_BPTC_RGB_UFLOAT] = ISL_FORMAT_BC6H_UF16, + + [PIPE_FORMAT_ETC2_RGB8] = ISL_FORMAT_ETC2_RGB8, + [PIPE_FORMAT_ETC2_SRGB8] = ISL_FORMAT_ETC2_SRGB8, + [PIPE_FORMAT_ETC2_RGB8A1] = ISL_FORMAT_ETC2_RGB8_PTA, + [PIPE_FORMAT_ETC2_SRGB8A1] = ISL_FORMAT_ETC2_SRGB8_PTA, + [PIPE_FORMAT_ETC2_RGBA8] = ISL_FORMAT_ETC2_EAC_RGBA8, + [PIPE_FORMAT_ETC2_SRGBA8] = ISL_FORMAT_ETC2_EAC_SRGB8_A8, + [PIPE_FORMAT_ETC2_R11_UNORM] = ISL_FORMAT_EAC_R11, + [PIPE_FORMAT_ETC2_R11_SNORM] = ISL_FORMAT_EAC_SIGNED_R11, + [PIPE_FORMAT_ETC2_RG11_UNORM] = ISL_FORMAT_EAC_RG11, + [PIPE_FORMAT_ETC2_RG11_SNORM] = ISL_FORMAT_EAC_SIGNED_RG11, + + [PIPE_FORMAT_FXT1_RGB] = ISL_FORMAT_FXT1, + [PIPE_FORMAT_FXT1_RGBA] = ISL_FORMAT_FXT1, + + [PIPE_FORMAT_ASTC_4x4] = 
ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16, + [PIPE_FORMAT_ASTC_5x4] = ISL_FORMAT_ASTC_LDR_2D_5X4_FLT16, + [PIPE_FORMAT_ASTC_5x5] = ISL_FORMAT_ASTC_LDR_2D_5X5_FLT16, + [PIPE_FORMAT_ASTC_6x5] = ISL_FORMAT_ASTC_LDR_2D_6X5_FLT16, + [PIPE_FORMAT_ASTC_6x6] = ISL_FORMAT_ASTC_LDR_2D_6X6_FLT16, + [PIPE_FORMAT_ASTC_8x5] = ISL_FORMAT_ASTC_LDR_2D_8X5_FLT16, + [PIPE_FORMAT_ASTC_8x6] = ISL_FORMAT_ASTC_LDR_2D_8X6_FLT16, + [PIPE_FORMAT_ASTC_8x8] = ISL_FORMAT_ASTC_LDR_2D_8X8_FLT16, + [PIPE_FORMAT_ASTC_10x5] = ISL_FORMAT_ASTC_LDR_2D_10X5_FLT16, + [PIPE_FORMAT_ASTC_10x6] = ISL_FORMAT_ASTC_LDR_2D_10X6_FLT16, + [PIPE_FORMAT_ASTC_10x8] = ISL_FORMAT_ASTC_LDR_2D_10X8_FLT16, + [PIPE_FORMAT_ASTC_10x10] = ISL_FORMAT_ASTC_LDR_2D_10X10_FLT16, + [PIPE_FORMAT_ASTC_12x10] = ISL_FORMAT_ASTC_LDR_2D_12X10_FLT16, + [PIPE_FORMAT_ASTC_12x12] = ISL_FORMAT_ASTC_LDR_2D_12X12_FLT16, + + [PIPE_FORMAT_ASTC_4x4_SRGB] = ISL_FORMAT_ASTC_LDR_2D_4X4_U8SRGB, + [PIPE_FORMAT_ASTC_5x4_SRGB] = ISL_FORMAT_ASTC_LDR_2D_5X4_U8SRGB, + [PIPE_FORMAT_ASTC_5x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_5X5_U8SRGB, + [PIPE_FORMAT_ASTC_6x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_6X5_U8SRGB, + [PIPE_FORMAT_ASTC_6x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_6X6_U8SRGB, + [PIPE_FORMAT_ASTC_8x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X5_U8SRGB, + [PIPE_FORMAT_ASTC_8x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X6_U8SRGB, + [PIPE_FORMAT_ASTC_8x8_SRGB] = ISL_FORMAT_ASTC_LDR_2D_8X8_U8SRGB, + [PIPE_FORMAT_ASTC_10x5_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X5_U8SRGB, + [PIPE_FORMAT_ASTC_10x6_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X6_U8SRGB, + [PIPE_FORMAT_ASTC_10x8_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X8_U8SRGB, + [PIPE_FORMAT_ASTC_10x10_SRGB] = ISL_FORMAT_ASTC_LDR_2D_10X10_U8SRGB, + [PIPE_FORMAT_ASTC_12x10_SRGB] = ISL_FORMAT_ASTC_LDR_2D_12X10_U8SRGB, + [PIPE_FORMAT_ASTC_12x12_SRGB] = ISL_FORMAT_ASTC_LDR_2D_12X12_U8SRGB, + + [PIPE_FORMAT_A1B5G5R5_UNORM] = ISL_FORMAT_A1B5G5R5_UNORM, + + /* We support these so that we know the API expects no alpha channel. + * Otherwise, the state tracker would just give us a format with alpha + * and we wouldn't know to override the swizzle to 1. 
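+ *
+ * The override itself happens in crocus_format_for_usage(), which forces
+ * the alpha swizzle to PIPE_SWIZZLE_1 whenever the pipe format lacks
+ * alpha but the chosen ISL format has it.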
+ */ + [PIPE_FORMAT_R16G16B16X16_UINT] = ISL_FORMAT_R16G16B16A16_UINT, + [PIPE_FORMAT_R16G16B16X16_SINT] = ISL_FORMAT_R16G16B16A16_SINT, + [PIPE_FORMAT_R32G32B32X32_UINT] = ISL_FORMAT_R32G32B32A32_UINT, + [PIPE_FORMAT_R32G32B32X32_SINT] = ISL_FORMAT_R32G32B32A32_SINT, + [PIPE_FORMAT_R10G10B10X2_SNORM] = ISL_FORMAT_R10G10B10A2_SNORM, + }; + assert(pf < PIPE_FORMAT_COUNT); + return table[pf]; +} + +static enum isl_format +get_render_format(enum pipe_format pformat, enum isl_format def_format) +{ + switch (pformat) { + case PIPE_FORMAT_A16_UNORM: return ISL_FORMAT_R16_UNORM; + case PIPE_FORMAT_A16_FLOAT: return ISL_FORMAT_R16_FLOAT; + case PIPE_FORMAT_A32_FLOAT: return ISL_FORMAT_R32_FLOAT; + + case PIPE_FORMAT_I8_UNORM: return ISL_FORMAT_R8_UNORM; + case PIPE_FORMAT_I16_UNORM: return ISL_FORMAT_R16_UNORM; + case PIPE_FORMAT_I16_FLOAT: return ISL_FORMAT_R16_FLOAT; + case PIPE_FORMAT_I32_FLOAT: return ISL_FORMAT_R32_FLOAT; + + case PIPE_FORMAT_L8_UNORM: return ISL_FORMAT_R8_UNORM; + case PIPE_FORMAT_L8_UINT: return ISL_FORMAT_R8_UINT; + case PIPE_FORMAT_L8_SINT: return ISL_FORMAT_R8_SINT; + case PIPE_FORMAT_L16_UNORM: return ISL_FORMAT_R16_UNORM; + case PIPE_FORMAT_L16_FLOAT: return ISL_FORMAT_R16_FLOAT; + case PIPE_FORMAT_L32_FLOAT: return ISL_FORMAT_R32_FLOAT; + + case PIPE_FORMAT_L8A8_UNORM: return ISL_FORMAT_R8G8_UNORM; + case PIPE_FORMAT_L16A16_UNORM: return ISL_FORMAT_R16G16_UNORM; + case PIPE_FORMAT_L16A16_FLOAT: return ISL_FORMAT_R16G16_FLOAT; + case PIPE_FORMAT_L32A32_FLOAT: return ISL_FORMAT_R32G32_FLOAT; + + default: + return def_format; + } +} + +struct crocus_format_info +crocus_format_for_usage(const struct intel_device_info *devinfo, + enum pipe_format pformat, + isl_surf_usage_flags_t usage) +{ + struct crocus_format_info info = { crocus_isl_format_for_pipe_format(pformat), + { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W } }; + + if (info.fmt == ISL_FORMAT_UNSUPPORTED) + return info; + + if (pformat == PIPE_FORMAT_A8_UNORM) { + info.fmt = ISL_FORMAT_A8_UNORM; + } + + if (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) + info.fmt = get_render_format(pformat, info.fmt); + if (devinfo->ver < 6) { + if (pformat == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + info.fmt = ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS; + if (pformat == PIPE_FORMAT_X32_S8X24_UINT) + info.fmt = ISL_FORMAT_X32_TYPELESS_G8X24_UINT; + if (pformat == PIPE_FORMAT_X24S8_UINT) + info.fmt = ISL_FORMAT_X24_TYPELESS_G8_UINT; + } + + const struct isl_format_layout *fmtl = isl_format_get_layout(info.fmt); + + if (util_format_is_snorm(pformat)) { + if (util_format_is_intensity(pformat)) { + info.swizzles[0] = PIPE_SWIZZLE_X; + info.swizzles[1] = PIPE_SWIZZLE_X; + info.swizzles[2] = PIPE_SWIZZLE_X; + info.swizzles[3] = PIPE_SWIZZLE_X; + } else if (util_format_is_luminance(pformat)) { + info.swizzles[0] = PIPE_SWIZZLE_X; + info.swizzles[1] = PIPE_SWIZZLE_X; + info.swizzles[2] = PIPE_SWIZZLE_X; + info.swizzles[3] = PIPE_SWIZZLE_1; + } else if (util_format_is_luminance_alpha(pformat)) { + info.swizzles[0] = PIPE_SWIZZLE_X; + info.swizzles[1] = PIPE_SWIZZLE_X; + info.swizzles[2] = PIPE_SWIZZLE_X; + info.swizzles[3] = PIPE_SWIZZLE_Y; + } else if (util_format_is_alpha(pformat)) { + info.swizzles[0] = PIPE_SWIZZLE_0; + info.swizzles[1] = PIPE_SWIZZLE_0; + info.swizzles[2] = PIPE_SWIZZLE_0; + info.swizzles[3] = PIPE_SWIZZLE_X; + } + } + + /* When faking RGBX pipe formats with RGBA ISL formats, override alpha. 
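+    * For example, the table above maps PIPE_FORMAT_R16G16B16X16_UINT to
+    * ISL_FORMAT_R16G16B16A16_UINT, so the alpha select is forced to 1 here.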
+    */
+   if (!util_format_has_alpha(pformat) && fmtl->channels.a.type != ISL_VOID) {
+      info.swizzles[0] = PIPE_SWIZZLE_X;
+      info.swizzles[1] = PIPE_SWIZZLE_Y;
+      info.swizzles[2] = PIPE_SWIZZLE_Z;
+      info.swizzles[3] = PIPE_SWIZZLE_1;
+   }
+
+   /* We choose RGBA over RGBX for rendering when the hardware doesn't support
+    * rendering to RGBX. However, when this internal override is used on Gen9+,
+    * fast clears don't work correctly.
+    *
+    * i965 fixes this by pretending to not support RGBX formats, and the higher
+    * layers of Mesa pick the RGBA format instead. Gallium doesn't work that
+    * way, and might choose a different format, like BGRX instead of RGBX,
+    * which will also cause problems when sampling from a surface fast cleared
+    * as RGBX. So we always choose RGBA instead of RGBX explicitly here.
+    */
+   if (isl_format_is_rgbx(info.fmt) &&
+       !isl_format_supports_rendering(devinfo, info.fmt) &&
+       (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)) {
+      info.fmt = isl_format_rgbx_to_rgba(info.fmt);
+      info.swizzles[0] = PIPE_SWIZZLE_X;
+      info.swizzles[1] = PIPE_SWIZZLE_Y;
+      info.swizzles[2] = PIPE_SWIZZLE_Z;
+      info.swizzles[3] = PIPE_SWIZZLE_1;
+   }
+
+   return info;
+}
+
+/**
+ * The pscreen->is_format_supported() driver hook.
+ *
+ * Returns true if the given format is supported for the given usage
+ * (PIPE_BIND_*) and sample count.
+ */
+bool
+crocus_is_format_supported(struct pipe_screen *pscreen,
+                           enum pipe_format pformat,
+                           enum pipe_texture_target target,
+                           unsigned sample_count, unsigned storage_sample_count,
+                           unsigned usage)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   if (!util_is_power_of_two_or_zero(sample_count))
+      return false;
+   if (devinfo->ver >= 7) {
+      if (sample_count > 8 || sample_count == 2)
+         return false;
+   } else if (devinfo->ver == 6) {
+      if (sample_count > 4 || sample_count == 2)
+         return false;
+   } else if (sample_count > 1) {
+      return false;
+   }
+
+   if (pformat == PIPE_FORMAT_NONE)
+      return true;
+
+   enum isl_format format = crocus_isl_format_for_pipe_format(pformat);
+
+   if (format == ISL_FORMAT_UNSUPPORTED)
+      return false;
+
+   /* No stencil texturing prior to Haswell. */
+   if (!devinfo->is_haswell) {
+      if (pformat == PIPE_FORMAT_S8_UINT ||
+          pformat == PIPE_FORMAT_X24S8_UINT ||
+          pformat == PIPE_FORMAT_S8X24_UINT ||
+          pformat == PIPE_FORMAT_X32_S8X24_UINT)
+         return false;
+   }
+
+   const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+   const bool is_integer = isl_format_has_int_channel(format);
+   bool supported = true;
+
+   if (sample_count > 1)
+      supported &= isl_format_supports_multisampling(devinfo, format);
+
+   if (usage & PIPE_BIND_DEPTH_STENCIL) {
+      supported &= format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS ||
+                   format == ISL_FORMAT_R32_FLOAT ||
+                   format == ISL_FORMAT_R24_UNORM_X8_TYPELESS ||
+                   format == ISL_FORMAT_R16_UNORM ||
+                   format == ISL_FORMAT_R8_UINT;
+   }
+
+   if (usage & PIPE_BIND_RENDER_TARGET) {
+      /* Alpha and luminance-alpha formats other than A8_UNORM are not
+       * renderable.
+       *
+       * For BLORP, we can apply the swizzle in the shader. But for
+       * general rendering, this would mean recompiling the shader, which
+       * we'd like to avoid doing. So we mark these formats non-renderable.
+       *
+       * We do support A8_UNORM as it's required and is renderable.
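+       *
+       * PIPE_FORMAT_L8A8_UNORM, for instance, falls into this category and
+       * is rejected by the check below.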
+       */
+      if (pformat != PIPE_FORMAT_A8_UNORM &&
+          (util_format_is_alpha(pformat) ||
+           util_format_is_luminance_alpha(pformat)))
+         supported = false;
+
+      enum isl_format rt_format = format;
+
+      if (isl_format_is_rgbx(format) &&
+          !isl_format_supports_rendering(devinfo, format))
+         rt_format = isl_format_rgbx_to_rgba(format);
+
+      supported &= isl_format_supports_rendering(devinfo, rt_format);
+
+      if (!is_integer)
+         supported &= isl_format_supports_alpha_blending(devinfo, rt_format);
+   }
+
+   if (usage & PIPE_BIND_SHADER_IMAGE) {
+      /* Dataport doesn't support compression, and we can't resolve an MCS
+       * compressed surface. (Buffer images may have sample count of 0.)
+       */
+      supported &= sample_count == 0;
+
+      supported &= isl_format_supports_typed_writes(devinfo, format);
+      supported &= isl_has_matching_typed_storage_image_format(devinfo, format);
+   }
+
+   if (usage & PIPE_BIND_SAMPLER_VIEW) {
+      supported &= isl_format_supports_sampling(devinfo, format);
+      bool ignore_filtering = false;
+
+      if (is_integer)
+         ignore_filtering = true;
+
+      /* Gen4 can't actually filter these formats, but we advertise them
+       * anyway rather than requiring filtering support.
+       */
+      if (devinfo->ver < 5 && (format == ISL_FORMAT_R32G32B32A32_FLOAT ||
+                               format == ISL_FORMAT_R24_UNORM_X8_TYPELESS ||
+                               format == ISL_FORMAT_R32_FLOAT ||
+                               format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS))
+         ignore_filtering = true;
+      if (!ignore_filtering)
+         supported &= isl_format_supports_filtering(devinfo, format);
+
+      /* Don't advertise 3-component RGB formats for non-buffer textures.
+       * This ensures that they are renderable from an API perspective since
+       * the state tracker will fall back to RGBA or RGBX, which are
+       * renderable. We want to render internally for copies and blits,
+       * even if the application doesn't.
+       *
+       * Buffer textures don't need to be renderable, so we support real RGB.
+       * This is useful for PBO upload, and 32-bit RGB support is mandatory.
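+       *
+       * (24, 48 and 96 bits per block are the 8-, 16- and 32-bit
+       * three-channel RGB formats filtered out below.)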
+ */ + if (target != PIPE_BUFFER) + supported &= fmtl->bpb != 24 && fmtl->bpb != 48 && fmtl->bpb != 96; + } + + if (usage & PIPE_BIND_VERTEX_BUFFER) { + supported &= isl_format_supports_vertex_fetch(devinfo, format); + + if (!devinfo->is_haswell) { + /* W/A: Pre-Haswell, the hardware doesn't really support the formats + * we'd like to use here, so upload everything as UINT and fix it in + * the shader + */ + if (format == ISL_FORMAT_R10G10B10A2_UNORM || + format == ISL_FORMAT_B10G10R10A2_UNORM || + format == ISL_FORMAT_R10G10B10A2_SNORM || + format == ISL_FORMAT_B10G10R10A2_SNORM || + format == ISL_FORMAT_R10G10B10A2_USCALED || + format == ISL_FORMAT_B10G10R10A2_USCALED || + format == ISL_FORMAT_R10G10B10A2_SSCALED || + format == ISL_FORMAT_B10G10R10A2_SSCALED) + supported = true; + + if (format == ISL_FORMAT_R8G8B8_SINT || + format == ISL_FORMAT_R8G8B8_UINT || + format == ISL_FORMAT_R16G16B16_SINT || + format == ISL_FORMAT_R16G16B16_UINT) + supported = true; + } + } + + if (usage & PIPE_BIND_INDEX_BUFFER) { + supported &= format == ISL_FORMAT_R8_UINT || + format == ISL_FORMAT_R16_UINT || + format == ISL_FORMAT_R32_UINT; + } + + return supported; +} diff --git a/src/gallium/drivers/crocus/crocus_genx_macros.h b/src/gallium/drivers/crocus/crocus_genx_macros.h new file mode 100644 index 00000000000..a0309513ed2 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_genx_macros.h @@ -0,0 +1,164 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * Macro and function definitions needed in order to use genxml. + * + * This should only be included in sources compiled per-generation. 
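+ *
+ * As a rough usage sketch (illustrative only; PIPE_CONTROL is just an
+ * arbitrary example command), the emit helpers defined here are used as:
+ *
+ *    crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
+ *       pc.CommandStreamerStallEnable = true;
+ *    }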
+ */ + +#include "crocus_batch.h" + +#include "genxml/gen_macros.h" + +#define __gen_address_type struct crocus_address +#define __gen_user_data struct crocus_batch +#define __gen_combine_address crocus_combine_address + +static inline void * +__gen_get_batch_dwords(struct crocus_batch *batch, unsigned dwords) +{ + return crocus_get_command_space(batch, dwords * sizeof(uint32_t)); +} + +static inline struct crocus_address +__gen_address_offset(struct crocus_address addr, uint64_t offset) +{ + addr.offset += offset; + return addr; +} + +static uint64_t +__gen_combine_address(struct crocus_batch *batch, void *location, + struct crocus_address addr, uint32_t delta) +{ + uint32_t offset = (char *)location - (char *)batch->command.map; + + if (addr.bo == NULL) { + return addr.offset + delta; + } else { + if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) { + offset = (char *) location - (char *) batch->state.map; + return crocus_state_reloc(batch, offset, addr.bo, + addr.offset + delta, + addr.reloc_flags); + } + + assert(!crocus_ptr_in_state_buffer(batch, location)); + + offset = (char *) location - (char *) batch->command.map; + return crocus_command_reloc(batch, offset, addr.bo, + addr.offset + delta, + addr.reloc_flags); + } +} + +#define __gen_address_type struct crocus_address +#define __gen_user_data struct crocus_batch + +#define __genxml_cmd_length(cmd) cmd ## _length +#define __genxml_cmd_length_bias(cmd) cmd ## _length_bias +#define __genxml_cmd_header(cmd) cmd ## _header +#define __genxml_cmd_pack(cmd) cmd ## _pack +#define __genxml_reg_num(cmd) cmd ## _num + +#include "genxml/genX_pack.h" +#include "genxml/gen_macros.h" +#include "genxml/genX_bits.h" + +/* CS_GPR(15) is reserved for combining conditional rendering predicates + * with GL_ARB_indirect_parameters draw number predicates. 
+ */ +#define MI_BUILDER_NUM_ALLOC_GPRS 15 +#include "common/mi_builder.h" + +#define _crocus_pack_command(batch, cmd, dst, name) \ + for (struct cmd name = { __genxml_cmd_header(cmd) }, \ + *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \ + ({ __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name); \ + _dst = NULL; \ + })) + +#define crocus_pack_command(cmd, dst, name) \ + _crocus_pack_command(NULL, cmd, dst, name) + +#define _crocus_pack_state(batch, cmd, dst, name) \ + for (struct cmd name = {}, \ + *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \ + __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name), \ + _dst = NULL) + +#define crocus_pack_state(cmd, dst, name) \ + _crocus_pack_state(NULL, cmd, dst, name) + +#define crocus_emit_cmd(batch, cmd, name) \ + _crocus_pack_command(batch, cmd, __gen_get_batch_dwords(batch, __genxml_cmd_length(cmd)), name) + +#define crocus_emit_merge(batch, dwords0, dwords1, num_dwords) \ + do { \ + uint32_t *dw = __gen_get_batch_dwords(batch, num_dwords); \ + for (uint32_t i = 0; i < num_dwords; i++) \ + dw[i] = (dwords0)[i] | (dwords1)[i]; \ + VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, num_dwords)); \ + } while (0) + +#define crocus_emit_reg(batch, reg, name) \ + for (struct reg name = {}, *_cont = (struct reg *)1; _cont != NULL; \ + ({ \ + uint32_t _dw[__genxml_cmd_length(reg)]; \ + __genxml_cmd_pack(reg)(NULL, _dw, &name); \ + for (unsigned i = 0; i < __genxml_cmd_length(reg); i++) { \ + crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { \ + lri.RegisterOffset = __genxml_reg_num(reg); \ + lri.DataDWord = _dw[i]; \ + } \ + } \ + _cont = NULL; \ + })) + + +/** + * crocus_address constructor helpers: + * + * When using these to construct a CSO, pass NULL for \p bo, and manually + * pin the BO later. Otherwise, genxml's address handling will add the + * BO to the current batch's validation list at CSO creation time, rather + * than at draw time as desired. + */ + +UNUSED static struct crocus_address +ro_bo(struct crocus_bo *bo, uint64_t offset) +{ + return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT }; +} + +UNUSED static struct crocus_address +rw_bo(struct crocus_bo *bo, uint64_t offset) +{ + return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT | RELOC_WRITE }; +} + +UNUSED static struct crocus_address +ggtt_bo(struct crocus_bo *bo, uint64_t offset) +{ + return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT }; +} diff --git a/src/gallium/drivers/crocus/crocus_genx_protos.h b/src/gallium/drivers/crocus/crocus_genx_protos.h new file mode 100644 index 00000000000..ba6798f991e --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_genx_protos.h @@ -0,0 +1,56 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* GenX-specific function declarations. + * + * Don't include this directly, it will be included by crocus_context.h. + * + * NOTE: This header can be included multiple times, from the same file. + */ + +/* crocus_state.c */ +void genX(init_state)(struct crocus_context *ice); +void genX(init_screen_state)(struct crocus_screen *screen); +void genX(upload_urb)(struct crocus_batch *batch, + unsigned vs_size, + bool gs_present, + unsigned gs_size); +void genX(emit_hashing_mode)(struct crocus_context *ice, + struct crocus_batch *batch, + unsigned width, unsigned height, + unsigned scale); + +/* crocus_blorp.c */ +void genX(init_blorp)(struct crocus_context *ice); + +/* crocus_query.c */ +void genX(init_query)(struct crocus_context *ice); +void genX(init_screen_query)(struct crocus_screen *screen); +void genX(math_add32_gpr0)(struct crocus_context *ice, + struct crocus_batch *batch, + uint32_t x); +void genX(math_div32_gpr0)(struct crocus_context *ice, + struct crocus_batch *batch, + uint32_t D); + +/* crocus_blt.c */ +void genX(init_blt)(struct crocus_screen *screen); diff --git a/src/gallium/drivers/crocus/crocus_monitor.c b/src/gallium/drivers/crocus/crocus_monitor.c new file mode 100644 index 00000000000..c0465f22875 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_monitor.c @@ -0,0 +1,484 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "crocus_monitor.h" + +#include <xf86drm.h> + +#include "crocus_screen.h" +#include "crocus_context.h" + +#include "perf/intel_perf.h" +#include "perf/intel_perf_query.h" +#include "perf/intel_perf_regs.h" + +struct crocus_monitor_object { + int num_active_counters; + int *active_counters; + + size_t result_size; + unsigned char *result_buffer; + + struct intel_perf_query_object *query; +}; + +int +crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info) +{ + const struct crocus_screen *screen = (struct crocus_screen *)pscreen; + assert(screen->monitor_cfg); + if (!screen->monitor_cfg) + return 0; + + const struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg; + + if (!info) { + /* return the number of metrics */ + return monitor_cfg->num_counters; + } + + const struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg; + const int group = monitor_cfg->counters[index].group; + const int counter_index = monitor_cfg->counters[index].counter; + struct intel_perf_query_counter *counter = + &perf_cfg->queries[group].counters[counter_index]; + + info->group_id = group; + info->name = counter->name; + info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index; + + if (counter->type == INTEL_PERF_COUNTER_TYPE_THROUGHPUT) + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE; + else + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE; + switch (counter->data_type) { + case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32: + case INTEL_PERF_COUNTER_DATA_TYPE_UINT32: + info->type = PIPE_DRIVER_QUERY_TYPE_UINT; + info->max_value.u32 = 0; + break; + case INTEL_PERF_COUNTER_DATA_TYPE_UINT64: + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; + info->max_value.u64 = 0; + break; + case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT: + case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE: + info->type = PIPE_DRIVER_QUERY_TYPE_FLOAT; + info->max_value.u64 = -1; + break; + default: + assert(false); + break; + } + + /* indicates that this is an OA query, not a pipeline statistics query */ + info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; + return 1; +} + +typedef void (*bo_unreference_t)(void *); +typedef void *(*bo_map_t)(void *, void *, unsigned flags); +typedef void (*bo_unmap_t)(void *); +typedef void (*emit_mi_report_t)(void *, void *, uint32_t, uint32_t); +typedef void (*emit_mi_flush_t)(void *); +typedef void (*capture_frequency_stat_register_t)(void *, void *, + uint32_t ); +typedef void (*store_register_mem64_t)(void *ctx, void *bo, + uint32_t reg, uint32_t offset); +typedef bool (*batch_references_t)(void *batch, void *bo); +typedef void (*bo_wait_rendering_t)(void *bo); +typedef int (*bo_busy_t)(void *bo); + +static void * +crocus_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size) +{ + return crocus_bo_alloc(bufmgr, name, size); +} + +#if 0 +static void +crocus_monitor_emit_mi_flush(struct crocus_context *ice) +{ + const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_VF_CACHE_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CS_STALL; + crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER], + "OA metrics", flags); +} +#endif + +static void +crocus_monitor_emit_mi_report_perf_count(void *c, + void *bo, + uint32_t offset_in_bytes, + uint32_t report_id) +{ + struct crocus_context *ice = c; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + 
struct crocus_screen *screen = batch->screen; + screen->vtbl.emit_mi_report_perf_count(batch, bo, offset_in_bytes, report_id); +} + +static void +crocus_monitor_batchbuffer_flush(void *c, const char *file, int line) +{ + struct crocus_context *ice = c; + _crocus_batch_flush(&ice->batches[CROCUS_BATCH_RENDER], __FILE__, __LINE__); +} + +#if 0 +static void +crocus_monitor_capture_frequency_stat_register(void *ctx, + void *bo, + uint32_t bo_offset) +{ + struct crocus_context *ice = ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + ice->vtbl.store_register_mem32(batch, GEN9_RPSTAT0, bo, bo_offset, false); +} + +static void +crocus_monitor_store_register_mem64(void *ctx, void *bo, + uint32_t reg, uint32_t offset) +{ + struct crocus_context *ice = ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + ice->vtbl.store_register_mem64(batch, reg, bo, offset, false); +} +#endif + +static bool +crocus_monitor_init_metrics(struct crocus_screen *screen) +{ + struct crocus_monitor_config *monitor_cfg = + rzalloc(screen, struct crocus_monitor_config); + struct intel_perf_config *perf_cfg = NULL; + if (unlikely(!monitor_cfg)) + goto allocation_error; + perf_cfg = intel_perf_new(monitor_cfg); + if (unlikely(!perf_cfg)) + goto allocation_error; + + monitor_cfg->perf_cfg = perf_cfg; + + perf_cfg->vtbl.bo_alloc = crocus_oa_bo_alloc; + perf_cfg->vtbl.bo_unreference = (bo_unreference_t)crocus_bo_unreference; + perf_cfg->vtbl.bo_map = (bo_map_t)crocus_bo_map; + perf_cfg->vtbl.bo_unmap = (bo_unmap_t)crocus_bo_unmap; + + perf_cfg->vtbl.emit_mi_report_perf_count = + (emit_mi_report_t)crocus_monitor_emit_mi_report_perf_count; + perf_cfg->vtbl.batchbuffer_flush = crocus_monitor_batchbuffer_flush; + perf_cfg->vtbl.batch_references = (batch_references_t)crocus_batch_references; + perf_cfg->vtbl.bo_wait_rendering = + (bo_wait_rendering_t)crocus_bo_wait_rendering; + perf_cfg->vtbl.bo_busy = (bo_busy_t)crocus_bo_busy; + + intel_perf_init_metrics(perf_cfg, &screen->devinfo, screen->fd, false, false); + screen->monitor_cfg = monitor_cfg; + + /* a gallium "group" is equivalent to a gen "query" + * a gallium "query" is equivalent to a gen "query_counter" + * + * Each gen_query supports a specific number of query_counters. To + * allocate the array of crocus_monitor_counter, we need an upper bound + * (ignoring duplicate query_counters). + */ + int gen_query_counters_count = 0; + for (int gen_query_id = 0; + gen_query_id < perf_cfg->n_queries; + ++gen_query_id) { + gen_query_counters_count += perf_cfg->queries[gen_query_id].n_counters; + } + + monitor_cfg->counters = rzalloc_size(monitor_cfg, + sizeof(struct crocus_monitor_counter) * + gen_query_counters_count); + if (unlikely(!monitor_cfg->counters)) + goto allocation_error; + + int crocus_monitor_id = 0; + for (int group = 0; group < perf_cfg->n_queries; ++group) { + for (int counter = 0; + counter < perf_cfg->queries[group].n_counters; + ++counter) { + /* Check previously identified metrics to filter out duplicates. The + * user is not helped by having the same metric available in several + * groups. (n^2 algorithm). 
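+          * Duplicates are detected by comparing counter names against every
+          * earlier group; only the first occurrence gets a monitor slot.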
+ */ + bool duplicate = false; + for (int existing_group = 0; + existing_group < group && !duplicate; + ++existing_group) { + for (int existing_counter = 0; + existing_counter < perf_cfg->queries[existing_group].n_counters && !duplicate; + ++existing_counter) { + const char *current_name = + perf_cfg->queries[group].counters[counter].name; + const char *existing_name = + perf_cfg->queries[existing_group].counters[existing_counter].name; + if (strcmp(current_name, existing_name) == 0) { + duplicate = true; + } + } + } + if (duplicate) + continue; + monitor_cfg->counters[crocus_monitor_id].group = group; + monitor_cfg->counters[crocus_monitor_id].counter = counter; + ++crocus_monitor_id; + } + } + monitor_cfg->num_counters = crocus_monitor_id; + return monitor_cfg->num_counters; + +allocation_error: + if (monitor_cfg) + free(monitor_cfg->counters); + free(perf_cfg); + free(monitor_cfg); + return false; +} + +int +crocus_get_monitor_group_info(struct pipe_screen *pscreen, + unsigned group_index, + struct pipe_driver_query_group_info *info) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + if (!screen->monitor_cfg) { + if (!crocus_monitor_init_metrics(screen)) + return 0; + } + + const struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg; + const struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg; + + if (!info) { + /* return the count that can be queried */ + return perf_cfg->n_queries; + } + + if (group_index >= perf_cfg->n_queries) { + /* out of range */ + return 0; + } + + struct intel_perf_query_info *query = &perf_cfg->queries[group_index]; + + info->name = query->name; + info->max_active_queries = query->n_counters; + info->num_queries = query->n_counters; + + return 1; +} + +static void +crocus_init_monitor_ctx(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen; + struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg; + + ice->perf_ctx = intel_perf_new_context(ice); + if (unlikely(!ice->perf_ctx)) + return; + + struct intel_perf_context *perf_ctx = ice->perf_ctx; + struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg; + intel_perf_init_context(perf_ctx, + perf_cfg, + ice, + ice, + screen->bufmgr, + &screen->devinfo, + ice->batches[CROCUS_BATCH_RENDER].hw_ctx_id, + screen->fd); +} + +/* entry point for GenPerfMonitorsAMD */ +struct crocus_monitor_object * +crocus_create_monitor_object(struct crocus_context *ice, + unsigned num_queries, + unsigned *query_types) +{ + struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen; + struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg; + struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg; + struct intel_perf_query_object *query_obj = NULL; + + /* initialize perf context if this has not already been done. This + * function is the first entry point that carries the gl context. 
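+    *
+    * The overall flow (sketch) is: crocus_create_monitor_object() ->
+    * crocus_begin_monitor() -> draw -> crocus_end_monitor() ->
+    * crocus_get_monitor_result().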
+ */ + if (ice->perf_ctx == NULL) { + crocus_init_monitor_ctx(ice); + } + struct intel_perf_context *perf_ctx = ice->perf_ctx; + + assert(num_queries > 0); + int query_index = query_types[0] - PIPE_QUERY_DRIVER_SPECIFIC; + assert(query_index <= monitor_cfg->num_counters); + const int group = monitor_cfg->counters[query_index].group; + + struct crocus_monitor_object *monitor = + calloc(1, sizeof(struct crocus_monitor_object)); + if (unlikely(!monitor)) + goto allocation_failure; + + monitor->num_active_counters = num_queries; + monitor->active_counters = calloc(num_queries, sizeof(int)); + if (unlikely(!monitor->active_counters)) + goto allocation_failure; + + for (int i = 0; i < num_queries; ++i) { + unsigned current_query = query_types[i]; + unsigned current_query_index = current_query - PIPE_QUERY_DRIVER_SPECIFIC; + + /* all queries must be in the same group */ + assert(current_query_index <= monitor_cfg->num_counters); + assert(monitor_cfg->counters[current_query_index].group == group); + monitor->active_counters[i] = + monitor_cfg->counters[current_query_index].counter; + } + + /* create the intel_perf_query */ + query_obj = intel_perf_new_query(perf_ctx, group); + if (unlikely(!query_obj)) + goto allocation_failure; + + monitor->query = query_obj; + monitor->result_size = perf_cfg->queries[group].data_size; + monitor->result_buffer = calloc(1, monitor->result_size); + if (unlikely(!monitor->result_buffer)) + goto allocation_failure; + + return monitor; + +allocation_failure: + if (monitor) { + free(monitor->active_counters); + free(monitor->result_buffer); + } + free(query_obj); + free(monitor); + return NULL; +} + +void +crocus_destroy_monitor_object(struct pipe_context *ctx, + struct crocus_monitor_object *monitor) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + intel_perf_delete_query(ice->perf_ctx, monitor->query); + free(monitor->result_buffer); + monitor->result_buffer = NULL; + free(monitor->active_counters); + monitor->active_counters = NULL; + free(monitor); +} + +bool +crocus_begin_monitor(struct pipe_context *ctx, + struct crocus_monitor_object *monitor) +{ + struct crocus_context *ice = (void *) ctx; + struct intel_perf_context *perf_ctx = ice->perf_ctx; + + return intel_perf_begin_query(perf_ctx, monitor->query); +} + +bool +crocus_end_monitor(struct pipe_context *ctx, + struct crocus_monitor_object *monitor) +{ + struct crocus_context *ice = (void *) ctx; + struct intel_perf_context *perf_ctx = ice->perf_ctx; + + intel_perf_end_query(perf_ctx, monitor->query); + return true; +} + +bool +crocus_get_monitor_result(struct pipe_context *ctx, + struct crocus_monitor_object *monitor, + bool wait, + union pipe_numeric_type_union *result) +{ + struct crocus_context *ice = (void *) ctx; + struct intel_perf_context *perf_ctx = ice->perf_ctx; + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + + bool monitor_ready = + intel_perf_is_query_ready(perf_ctx, monitor->query, batch); + + if (!monitor_ready) { + if (!wait) + return false; + intel_perf_wait_query(perf_ctx, monitor->query, batch); + } + + assert(intel_perf_is_query_ready(perf_ctx, monitor->query, batch)); + + unsigned bytes_written; + intel_perf_get_query_data(perf_ctx, monitor->query, batch, + monitor->result_size, + (unsigned*) monitor->result_buffer, + &bytes_written); + if (bytes_written != monitor->result_size) + return false; + + /* copy metrics into the batch result */ + for (int i = 0; i < monitor->num_active_counters; ++i) { + int current_counter = monitor->active_counters[i]; + 
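+      /* Pull each active counter out of the raw result buffer at its
+       * recorded offset and convert it to the pipe result type.
+       */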
const struct intel_perf_query_info *info = + intel_perf_query_info(monitor->query); + const struct intel_perf_query_counter *counter = + &info->counters[current_counter]; + assert(intel_perf_query_counter_get_size(counter)); + switch (counter->data_type) { + case INTEL_PERF_COUNTER_DATA_TYPE_UINT64: + result[i].u64 = *(uint64_t*)(monitor->result_buffer + counter->offset); + break; + case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT: + result[i].f = *(float*)(monitor->result_buffer + counter->offset); + break; + case INTEL_PERF_COUNTER_DATA_TYPE_UINT32: + case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32: + result[i].u64 = *(uint32_t*)(monitor->result_buffer + counter->offset); + break; + case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE: { + double v = *(double*)(monitor->result_buffer + counter->offset); + result[i].f = v; + break; + } + default: + unreachable("unexpected counter data type"); + } + } + return true; +} diff --git a/src/gallium/drivers/crocus/crocus_monitor.h b/src/gallium/drivers/crocus/crocus_monitor.h new file mode 100644 index 00000000000..3335c8860e2 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_monitor.h @@ -0,0 +1,72 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef CROCUS_MONITOR_H +#define CROCUS_MONITOR_H + +#include "pipe/p_screen.h" + +struct crocus_monitor_counter { + int group; + int counter; +}; + +struct crocus_monitor_config { + struct intel_perf_config *perf_cfg; + + /* gallium requires an index for each counter */ + int num_counters; + struct crocus_monitor_counter *counters; +}; + +int crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info); +int crocus_get_monitor_group_info(struct pipe_screen *pscreen, + unsigned index, + struct pipe_driver_query_group_info *info); + +struct crocus_context; +struct crocus_screen; + +struct crocus_monitor_object * +crocus_create_monitor_object(struct crocus_context *ice, + unsigned num_queries, + unsigned *query_types); + +struct pipe_query; +void crocus_destroy_monitor_object(struct pipe_context *ctx, + struct crocus_monitor_object *monitor); + +bool +crocus_begin_monitor(struct pipe_context *ctx, + struct crocus_monitor_object *monitor); +bool +crocus_end_monitor(struct pipe_context *ctx, + struct crocus_monitor_object *monitor); + +bool +crocus_get_monitor_result(struct pipe_context *ctx, + struct crocus_monitor_object *monitor, + bool wait, + union pipe_numeric_type_union *result); + +#endif diff --git a/src/gallium/drivers/crocus/crocus_pipe.h b/src/gallium/drivers/crocus/crocus_pipe.h new file mode 100644 index 00000000000..71b12d08e16 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_pipe.h @@ -0,0 +1,74 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef CROCUS_PIPE_H +#define CROCUS_PIPE_H + +#include "pipe/p_defines.h" +#include "compiler/shader_enums.h" + +static inline gl_shader_stage +stage_from_pipe(enum pipe_shader_type pstage) +{ + static const gl_shader_stage stages[PIPE_SHADER_TYPES] = { + [PIPE_SHADER_VERTEX] = MESA_SHADER_VERTEX, + [PIPE_SHADER_TESS_CTRL] = MESA_SHADER_TESS_CTRL, + [PIPE_SHADER_TESS_EVAL] = MESA_SHADER_TESS_EVAL, + [PIPE_SHADER_GEOMETRY] = MESA_SHADER_GEOMETRY, + [PIPE_SHADER_FRAGMENT] = MESA_SHADER_FRAGMENT, + [PIPE_SHADER_COMPUTE] = MESA_SHADER_COMPUTE, + }; + return stages[pstage]; +} + +static inline enum pipe_shader_type +stage_to_pipe(gl_shader_stage stage) +{ + static const enum pipe_shader_type pstages[MESA_SHADER_STAGES] = { + [MESA_SHADER_VERTEX] = PIPE_SHADER_VERTEX, + [MESA_SHADER_TESS_CTRL] = PIPE_SHADER_TESS_CTRL, + [MESA_SHADER_TESS_EVAL] = PIPE_SHADER_TESS_EVAL, + [MESA_SHADER_GEOMETRY] = PIPE_SHADER_GEOMETRY, + [MESA_SHADER_FRAGMENT] = PIPE_SHADER_FRAGMENT, + [MESA_SHADER_COMPUTE] = PIPE_SHADER_COMPUTE, + }; + return pstages[stage]; +} + +/** + * Convert an swizzle enumeration (i.e. PIPE_SWIZZLE_X) to one of the HW's + * "Shader Channel Select" enumerations (i.e. SCS_RED). The mappings are + * + * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE + * 0 1 2 3 4 5 + * 4 5 6 7 0 1 + * SCS_RED, SCS_GREEN, SCS_BLUE, SCS_ALPHA, SCS_ZERO, SCS_ONE + * + * which is simply adding 4 then modding by 8 (or anding with 7). + */ +static inline enum isl_channel_select +pipe_swizzle_to_isl_channel(enum pipe_swizzle swizzle) +{ + return (swizzle + 4) & 7; +} + +#endif diff --git a/src/gallium/drivers/crocus/crocus_pipe_control.c b/src/gallium/drivers/crocus/crocus_pipe_control.c new file mode 100644 index 00000000000..7a9625c61ed --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_pipe_control.c @@ -0,0 +1,368 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_pipe_control.c + * + * PIPE_CONTROL is the main flushing and synchronization primitive on Intel + * GPUs. It can invalidate caches, stall until rendering reaches various + * stages of completion, write to memory, and other things. In a way, it's + * a swiss army knife command - it has all kinds of capabilities, but some + * significant limitations as well. + * + * Unfortunately, it's notoriously complicated and difficult to use. Many + * sub-commands can't be used together. 
Some are meant to be used at the + * top of the pipeline (invalidating caches before drawing), while some are + * meant to be used at the end (stalling or flushing after drawing). + * + * Also, there's a list of restrictions a mile long, which vary by generation. + * Do this before doing that, or suffer the consequences (usually a GPU hang). + * + * This file contains helpers for emitting them safely. You can simply call + * crocus_emit_pipe_control_flush() with the desired operations (as logical + * PIPE_CONTROL_* bits), and it will take care of splitting it into multiple + * PIPE_CONTROL commands as necessary. The per-generation workarounds are + * applied in crocus_emit_raw_pipe_control() in crocus_state.c. + */ + +#include "crocus_context.h" +#include "util/hash_table.h" +#include "util/set.h" + +/** + * Emit a PIPE_CONTROL with various flushing flags. + * + * The caller is responsible for deciding what flags are appropriate for the + * given generation. + */ +void +crocus_emit_pipe_control_flush(struct crocus_batch *batch, + const char *reason, + uint32_t flags) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + + if (devinfo->ver >= 6 && + (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) && + (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) { + /* A pipe control command with flush and invalidate bits set + * simultaneously is an inherently racy operation on Gen6+ if the + * contents of the flushed caches were intended to become visible from + * any of the invalidated caches. Split it in two PIPE_CONTROLs, the + * first one should stall the pipeline to make sure that the flushed R/W + * caches are coherent with memory once the specified R/O caches are + * invalidated. On pre-Gen6 hardware the (implicit) R/O cache + * invalidation seems to happen at the bottom of the pipeline together + * with any write cache flush, so this shouldn't be a concern. In order + * to ensure a full stall, we do an end-of-pipe sync. + */ + crocus_emit_end_of_pipe_sync(batch, reason, + flags & PIPE_CONTROL_CACHE_FLUSH_BITS); + flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL); + } + + batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, NULL, 0, 0); +} + +/** + * Emit a PIPE_CONTROL that writes to a buffer object. + * + * \p flags should contain one of the following items: + * - PIPE_CONTROL_WRITE_IMMEDIATE + * - PIPE_CONTROL_WRITE_TIMESTAMP + * - PIPE_CONTROL_WRITE_DEPTH_COUNT + */ +void +crocus_emit_pipe_control_write(struct crocus_batch *batch, + const char *reason, uint32_t flags, + struct crocus_bo *bo, uint32_t offset, + uint64_t imm) +{ + batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, bo, offset, imm); +} + +/** + * Restriction [DevSNB, DevIVB]: + * + * Prior to changing Depth/Stencil Buffer state (i.e. any combination of + * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER, + * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall + * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth + * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by + * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set), + * unless SW can otherwise guarantee that the pipeline from WM onwards is + * already flushed (e.g., via a preceding MI_FLUSH). 
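+ *
+ * crocus_emit_depth_stall_flushes() below implements exactly that
+ * stall / depth-cache-flush / stall sequence.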
+ */ +void +crocus_emit_depth_stall_flushes(struct crocus_batch *batch) +{ + UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo; + + assert(devinfo->ver >= 6); + + crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_STALL); + crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_CACHE_FLUSH); + crocus_emit_pipe_control_flush(batch, "depth stall", PIPE_CONTROL_DEPTH_STALL); +} + +/* + * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization": + * + * Write synchronization is a special case of end-of-pipe + * synchronization that requires that the render cache and/or depth + * related caches are flushed to memory, where the data will become + * globally visible. This type of synchronization is required prior to + * SW (CPU) actually reading the result data from memory, or initiating + * an operation that will use as a read surface (such as a texture + * surface) a previous render target and/or depth/stencil buffer + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Exercising the write cache flush bits (Render Target Cache Flush + * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only + * ensures the write caches are flushed and doesn't guarantee the data + * is globally visible. + * + * SW can track the completion of the end-of-pipe-synchronization by + * using "Notify Enable" and "PostSync Operation - Write Immediate + * Data" in the PIPE_CONTROL command. + */ +void +crocus_emit_end_of_pipe_sync(struct crocus_batch *batch, + const char *reason, uint32_t flags) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + + if (devinfo->ver >= 6) { + /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": + * + * "The most common action to perform upon reaching a synchronization + * point is to write a value out to memory. An immediate value + * (included with the synchronization command) may be written." + * + * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": + * + * "In case the data flushed out by the render engine is to be read + * back in to the render engine in coherent manner, then the render + * engine has to wait for the fence completion before accessing the + * flushed data. This can be achieved by following means on various + * products: PIPE_CONTROL command with CS Stall and the required + * write caches flushed with Post-Sync-Operation as Write Immediate + * Data. + * + * Example: + * - Workload-1 (3D/GPGPU/MEDIA) + * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate + * Data, Required Write Cache Flush bits set) + * - Workload-2 (Can use the data produce or output by Workload-1) + */ + crocus_emit_pipe_control_write(batch, reason, + flags | PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_WRITE_IMMEDIATE, + batch->ice->workaround_bo, + batch->ice->workaround_offset, 0); + + if (batch->screen->devinfo.is_haswell) { +#define GEN7_3DPRIM_START_INSTANCE 0x243C + batch->screen->vtbl.load_register_mem32(batch, GEN7_3DPRIM_START_INSTANCE, + batch->ice->workaround_bo, + batch->ice->workaround_offset); + } + } else { + /* On gen4-5, a regular pipe control seems to suffice. */ + crocus_emit_pipe_control_flush(batch, reason, flags); + } +} + +/* Emit a pipelined flush to either flush render and texture cache for + * reading from a FBO-drawn texture, or flush so that frontbuffer + * render appears on the screen in DRI1. + * + * This is also used for the always_flush_cache driconf debug option. 
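+ *
+ * On Gen4/5 this only flushes the render target cache; the extra
+ * invalidate and flush bits below are added on Gen6+.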
+ */ +void +crocus_emit_mi_flush(struct crocus_batch *batch) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH; + if (devinfo->ver >= 6) { + flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_VF_CACHE_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CS_STALL; + } + crocus_emit_pipe_control_flush(batch, "mi flush", flags); +} + +/** + * Emits a PIPE_CONTROL with a non-zero post-sync operation, for + * implementing two workarounds on gen6. From section 1.4.7.1 + * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: + * + * [DevSNB-C+{W/A}] Before any depth stall flush (including those + * produced by non-pipelined state commands), software needs to first + * send a PIPE_CONTROL with no bits set except Post-Sync Operation != + * 0. + * + * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable + * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. + * + * And the workaround for these two requires this workaround first: + * + * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent + * BEFORE the pipe-control with a post-sync op and no write-cache + * flushes. + * + * And this last workaround is tricky because of the requirements on + * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM + * volume 2 part 1: + * + * "1 of the following must also be set: + * - Render Target Cache Flush Enable ([12] of DW1) + * - Depth Cache Flush Enable ([0] of DW1) + * - Stall at Pixel Scoreboard ([1] of DW1) + * - Depth Stall ([13] of DW1) + * - Post-Sync Operation ([13] of DW1) + * - Notify Enable ([8] of DW1)" + * + * The cache flushes require the workaround flush that triggered this + * one, so we can't use it. Depth stall would trigger the same. + * Post-sync nonzero is what triggered this second workaround, so we + * can't use that one either. Notify enable is IRQs, which aren't + * really our business. That leaves only stall at scoreboard. + */ +void +crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch) +{ + crocus_emit_pipe_control_flush(batch, "nonzero", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_STALL_AT_SCOREBOARD); + + crocus_emit_pipe_control_write(batch, "nonzero", + PIPE_CONTROL_WRITE_IMMEDIATE, + batch->ice->workaround_bo, + batch->ice->workaround_offset, 0); +} + +/** + * Flush and invalidate all caches (for debugging purposes). 
+ */ +void +crocus_flush_all_caches(struct crocus_batch *batch) +{ + crocus_emit_pipe_control_flush(batch, "debug: flush all caches", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_VF_CACHE_INVALIDATE | + PIPE_CONTROL_INSTRUCTION_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE); +} + +static void +crocus_texture_barrier(struct pipe_context *ctx, unsigned flags) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_batch *render_batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_batch *compute_batch = &ice->batches[CROCUS_BATCH_COMPUTE]; + const struct intel_device_info *devinfo = &render_batch->screen->devinfo; + + if (devinfo->ver < 6) { + crocus_emit_mi_flush(render_batch); + return; + } + + if (render_batch->contains_draw) { + crocus_batch_maybe_flush(render_batch, 48); + crocus_emit_pipe_control_flush(render_batch, + "API: texture barrier (1/2)", + (flags == 1 ? PIPE_CONTROL_DEPTH_CACHE_FLUSH : 0) | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_CS_STALL); + crocus_emit_pipe_control_flush(render_batch, + "API: texture barrier (2/2)", + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE); + } + + if (compute_batch->contains_draw) { + crocus_batch_maybe_flush(compute_batch, 48); + crocus_emit_pipe_control_flush(compute_batch, + "API: texture barrier (1/2)", + PIPE_CONTROL_CS_STALL); + crocus_emit_pipe_control_flush(compute_batch, + "API: texture barrier (2/2)", + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE); + } +} + +static void +crocus_memory_barrier(struct pipe_context *ctx, unsigned flags) +{ + struct crocus_context *ice = (void *) ctx; + unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL; + const struct intel_device_info *devinfo = &ice->batches[0].screen->devinfo; + + assert(devinfo->ver == 7); + + if (flags & (PIPE_BARRIER_VERTEX_BUFFER | + PIPE_BARRIER_INDEX_BUFFER | + PIPE_BARRIER_INDIRECT_BUFFER)) { + bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + } + + if (flags & PIPE_BARRIER_CONSTANT_BUFFER) { + bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE; + } + + if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_FRAMEBUFFER)) { + bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_RENDER_TARGET_FLUSH; + } + + /* Typed surface messages are handled by the render cache on IVB, so we + * need to flush it too. 
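+    * (On Haswell they go through the data cache instead, which the base
+    * flags above already flush, so no extra flush is needed there.)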
+ */ + if (!devinfo->is_haswell) + bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH; + + for (int i = 0; i < ice->batch_count; i++) { + if (ice->batches[i].contains_draw) { + crocus_batch_maybe_flush(&ice->batches[i], 24); + crocus_emit_pipe_control_flush(&ice->batches[i], "API: memory barrier", + bits); + } + } +} + +void +crocus_init_flush_functions(struct pipe_context *ctx) +{ + ctx->memory_barrier = crocus_memory_barrier; + ctx->texture_barrier = crocus_texture_barrier; +} diff --git a/src/gallium/drivers/crocus/crocus_program.c b/src/gallium/drivers/crocus/crocus_program.c new file mode 100644 index 00000000000..fb8216b71ab --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_program.c @@ -0,0 +1,3171 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_program.c + * + * This file contains the driver interface for compiling shaders. + * + * See crocus_program_cache.c for the in-memory program cache where the + * compiled shaders are stored. + */ + +#include <stdio.h> +#include <errno.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_atomic.h" +#include "util/u_upload_mgr.h" +#include "util/debug.h" +#include "util/u_prim.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_serialize.h" +#include "intel/compiler/brw_compiler.h" +#include "intel/compiler/brw_nir.h" +#include "crocus_context.h" +#include "nir/tgsi_to_nir.h" + +#define KEY_INIT_NO_ID() \ + .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \ + .base.tex.swizzles[0 ... 
MAX_SAMPLERS - 1] = 0x688, \ + .base.tex.compressed_multisample_layout_mask = ~0 +#define KEY_INIT() .base.program_string_id = ish->program_id, KEY_INIT_NO_ID() + +static void +crocus_sanitize_tex_key(struct brw_sampler_prog_key_data *key) +{ + key->gather_channel_quirk_mask = 0; + for (unsigned s = 0; s < MAX_SAMPLERS; s++) { + key->swizzles[s] = SWIZZLE_NOOP; + key->gfx6_gather_wa[s] = 0; + } +} + +static uint32_t +crocus_get_texture_swizzle(const struct crocus_context *ice, + const struct crocus_sampler_view *t) +{ + uint32_t swiz = 0; + + for (int i = 0; i < 4; i++) { + swiz |= t->swizzle[i] << (i * 3); + } + return swiz; +} + +static inline bool can_push_ubo(const struct intel_device_info *devinfo) +{ + /* push works for everyone except SNB at the moment */ + return devinfo->ver != 6; +} + +static uint8_t +gfx6_gather_workaround(enum pipe_format pformat) +{ + switch (pformat) { + case PIPE_FORMAT_R8_SINT: return WA_SIGN | WA_8BIT; + case PIPE_FORMAT_R8_UINT: return WA_8BIT; + case PIPE_FORMAT_R16_SINT: return WA_SIGN | WA_16BIT; + case PIPE_FORMAT_R16_UINT: return WA_16BIT; + default: + /* Note that even though PIPE_FORMAT_R32_SINT and + * PIPE_FORMAT_R32_UINThave format overrides in + * the surface state, there is no shader w/a required. + */ + return 0; + } +} + +static const unsigned crocus_gfx6_swizzle_for_offset[4] = { + BRW_SWIZZLE4(0, 1, 2, 3), + BRW_SWIZZLE4(1, 2, 3, 3), + BRW_SWIZZLE4(2, 3, 3, 3), + BRW_SWIZZLE4(3, 3, 3, 3) +}; + +static void +gfx6_gs_xfb_setup(const struct pipe_stream_output_info *so_info, + struct brw_gs_prog_data *gs_prog_data) +{ + /* Make sure that the VUE slots won't overflow the unsigned chars in + * prog_data->transform_feedback_bindings[]. + */ + STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256); + + /* Make sure that we don't need more binding table entries than we've + * set aside for use in transform feedback. (We shouldn't, since we + * set aside enough binding table entries to have one per component). 
+ */ + assert(so_info->num_outputs <= BRW_MAX_SOL_BINDINGS); + + gs_prog_data->num_transform_feedback_bindings = so_info->num_outputs; + for (unsigned i = 0; i < so_info->num_outputs; i++) { + gs_prog_data->transform_feedback_bindings[i] = + so_info->output[i].register_index; + gs_prog_data->transform_feedback_swizzles[i] = + crocus_gfx6_swizzle_for_offset[so_info->output[i].start_component]; + } +} + +static void +gfx6_ff_gs_xfb_setup(const struct pipe_stream_output_info *so_info, + struct brw_ff_gs_prog_key *key) +{ + key->num_transform_feedback_bindings = so_info->num_outputs; + for (unsigned i = 0; i < so_info->num_outputs; i++) { + key->transform_feedback_bindings[i] = + so_info->output[i].register_index; + key->transform_feedback_swizzles[i] = + crocus_gfx6_swizzle_for_offset[so_info->output[i].start_component]; + } +} + +static void +crocus_populate_sampler_prog_key_data(struct crocus_context *ice, + const struct intel_device_info *devinfo, + gl_shader_stage stage, + struct crocus_uncompiled_shader *ish, + bool uses_texture_gather, + struct brw_sampler_prog_key_data *key) +{ + uint32_t mask = ish->nir->info.textures_used[0]; + + while (mask) { + const int s = u_bit_scan(&mask); + + struct crocus_sampler_view *texture = ice->state.shaders[stage].textures[s]; + key->swizzles[s] = SWIZZLE_NOOP; + key->scale_factors[s] = 0.0f; + + if (!texture) + continue; + if (texture->base.target == PIPE_BUFFER) + continue; + if (!devinfo->is_haswell) { + key->swizzles[s] = crocus_get_texture_swizzle(ice, texture); + } + + /* gather4 for RG32* is broken in multiple ways on Gen7. */ + if (devinfo->ver == 7 && uses_texture_gather) { + switch (texture->base.format) { + case PIPE_FORMAT_R32G32_UINT: + case PIPE_FORMAT_R32G32_SINT: { + /* We have to override the format to R32G32_FLOAT_LD. + * This means that SCS_ALPHA and SCS_ONE will return 0x3f8 + * (1.0) rather than integer 1. This needs shader hacks. + * + * On Ivybridge, we whack W (alpha) to ONE in our key's + * swizzle. On Haswell, we look at the original texture + * swizzle, and use XYZW with channels overridden to ONE, + * leaving normal texture swizzling to SCS. + */ + unsigned src_swizzle = key->swizzles[s]; + for (int i = 0; i < 4; i++) { + unsigned src_comp = GET_SWZ(src_swizzle, i); + if (src_comp == SWIZZLE_ONE || src_comp == SWIZZLE_W) { + key->swizzles[i] &= ~(0x7 << (3 * i)); + key->swizzles[i] |= SWIZZLE_ONE << (3 * i); + } + } + } + FALLTHROUGH; + case PIPE_FORMAT_R32G32_FLOAT: + /* The channel select for green doesn't work - we have to + * request blue. Haswell can use SCS for this, but Ivybridge + * needs a shader workaround. 
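+ *
+ * The quirk bit recorded below is consumed later in
+ * crocus_setup_binding_table(), which rewrites the tg4 component
+ * from green (1) to blue (2) for the affected samplers.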
+ */ + if (!devinfo->is_haswell) + key->gather_channel_quirk_mask |= 1 << s; + break; + default: + break; + } + } + if (devinfo->ver == 6 && uses_texture_gather) { + key->gfx6_gather_wa[s] = gfx6_gather_workaround(texture->base.format); + } + } +} + +static void +crocus_lower_swizzles(struct nir_shader *nir, + const struct brw_sampler_prog_key_data *key_tex) +{ + struct nir_lower_tex_options tex_options = { 0 }; + uint32_t mask = nir->info.textures_used[0]; + + while (mask) { + const int s = u_bit_scan(&mask); + + if (key_tex->swizzles[s] == SWIZZLE_NOOP) + continue; + + tex_options.swizzle_result |= (1 << s); + for (unsigned c = 0; c < 4; c++) + tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c); + } + if (tex_options.swizzle_result) + nir_lower_tex(nir, &tex_options); +} + +static unsigned +get_new_program_id(struct crocus_screen *screen) +{ + return p_atomic_inc_return(&screen->program_id); +} + +static nir_ssa_def * +get_aoa_deref_offset(nir_builder *b, + nir_deref_instr *deref, + unsigned elem_size) +{ + unsigned array_size = elem_size; + nir_ssa_def *offset = nir_imm_int(b, 0); + + while (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + + /* This level's element size is the previous level's array size */ + nir_ssa_def *index = nir_ssa_for_src(b, deref->arr.index, 1); + assert(deref->arr.index.ssa); + offset = nir_iadd(b, offset, + nir_imul(b, index, nir_imm_int(b, array_size))); + + deref = nir_deref_instr_parent(deref); + assert(glsl_type_is_array(deref->type)); + array_size *= glsl_get_length(deref->type); + } + + /* Accessing an invalid surface index with the dataport can result in a + * hang. According to the spec "if the index used to select an individual + * element is negative or greater than or equal to the size of the array, + * the results of the operation are undefined but may not lead to + * termination" -- which is one of the possible outcomes of the hang. + * Clamp the index to prevent access outside of the array bounds. 
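+ *
+ * For example, with "imgs" declared as a 2x3 array of images and
+ * elem_size 1, imgs[i][j] yields offset = j + i * 3, clamped to 5
+ * (the last valid element).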
+ */ + return nir_umin(b, offset, nir_imm_int(b, array_size - elem_size)); +} + +static void +crocus_lower_storage_image_derefs(nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_load_raw_intel: + case nir_intrinsic_image_deref_store_raw_intel: { + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + + b.cursor = nir_before_instr(&intrin->instr); + nir_ssa_def *index = + nir_iadd(&b, nir_imm_int(&b, var->data.driver_location), + get_aoa_deref_offset(&b, deref, 1)); + nir_rewrite_image_intrinsic(intrin, index, false); + break; + } + + default: + break; + } + } + } +} + +// XXX: need unify_interfaces() at link time... + +/** + * Undo nir_lower_passthrough_edgeflags but keep the inputs_read flag. + */ +static bool +crocus_fix_edge_flags(nir_shader *nir) +{ + if (nir->info.stage != MESA_SHADER_VERTEX) { + nir_shader_preserve_all_metadata(nir); + return false; + } + + nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_out, + VARYING_SLOT_EDGE); + if (!var) { + nir_shader_preserve_all_metadata(nir); + return false; + } + + var->data.mode = nir_var_shader_temp; + nir->info.outputs_written &= ~VARYING_BIT_EDGE; + nir->info.inputs_read &= ~VERT_BIT_EDGEFLAG; + nir_fixup_deref_modes(nir); + + nir_foreach_function(f, nir) { + if (f->impl) { + nir_metadata_preserve(f->impl, nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs | + nir_metadata_loop_analysis); + } else { + nir_metadata_preserve(f->impl, nir_metadata_all); + } + } + + return true; +} + +/** + * Fix an uncompiled shader's stream output info. + * + * Core Gallium stores output->register_index as a "slot" number, where + * slots are assigned consecutively to all outputs in info->outputs_written. + * This naive packing of outputs doesn't work for us - we too have slots, + * but the layout is defined by the VUE map, which we won't have until we + * compile a specific shader variant. So, we remap these and simply store + * VARYING_SLOT_* in our copy's output->register_index fields. + * + * We also fix up VARYING_SLOT_{LAYER,VIEWPORT,PSIZ} to select the Y/Z/W + * components of our VUE header. See brw_vue_map.c for the layout. 
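+ *
+ * For example, if only POS, PSIZ and VAR0 are written, Gallium's
+ * slots 0/1/2 map back to VARYING_SLOT_POS/PSIZ/VAR0, so a stream
+ * output recorded against slot 2 ends up pointing at VARYING_SLOT_VAR0.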
+ */ +static void +update_so_info(struct pipe_stream_output_info *so_info, + uint64_t outputs_written) +{ + uint8_t reverse_map[64] = {}; + unsigned slot = 0; + while (outputs_written) { + reverse_map[slot++] = u_bit_scan64(&outputs_written); + } + + for (unsigned i = 0; i < so_info->num_outputs; i++) { + struct pipe_stream_output *output = &so_info->output[i]; + + /* Map Gallium's condensed "slots" back to real VARYING_SLOT_* enums */ + output->register_index = reverse_map[output->register_index]; + + /* The VUE header contains three scalar fields packed together: + * - gl_PointSize is stored in VARYING_SLOT_PSIZ.w + * - gl_Layer is stored in VARYING_SLOT_PSIZ.y + * - gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z + */ + switch (output->register_index) { + case VARYING_SLOT_LAYER: + assert(output->num_components == 1); + output->register_index = VARYING_SLOT_PSIZ; + output->start_component = 1; + break; + case VARYING_SLOT_VIEWPORT: + assert(output->num_components == 1); + output->register_index = VARYING_SLOT_PSIZ; + output->start_component = 2; + break; + case VARYING_SLOT_PSIZ: + assert(output->num_components == 1); + output->start_component = 3; + break; + } + + //info->outputs_written |= 1ull << output->register_index; + } +} + +static void +setup_vec4_image_sysval(uint32_t *sysvals, uint32_t idx, + unsigned offset, unsigned n) +{ + assert(offset % sizeof(uint32_t) == 0); + + for (unsigned i = 0; i < n; ++i) + sysvals[i] = BRW_PARAM_IMAGE(idx, offset / sizeof(uint32_t) + i); + + for (unsigned i = n; i < 4; ++i) + sysvals[i] = BRW_PARAM_BUILTIN_ZERO; +} + +/** + * Associate NIR uniform variables with the prog_data->param[] mechanism + * used by the backend. Also, decide which UBOs we'd like to push in an + * ideal situation (though the backend can reduce this). + */ +static void +crocus_setup_uniforms(const struct brw_compiler *compiler, + void *mem_ctx, + nir_shader *nir, + struct brw_stage_prog_data *prog_data, + enum brw_param_builtin **out_system_values, + unsigned *out_num_system_values, + unsigned *out_num_cbufs) +{ + UNUSED const struct intel_device_info *devinfo = compiler->devinfo; + + const unsigned CROCUS_MAX_SYSTEM_VALUES = + PIPE_MAX_SHADER_IMAGES * BRW_IMAGE_PARAM_SIZE; + enum brw_param_builtin *system_values = + rzalloc_array(mem_ctx, enum brw_param_builtin, CROCUS_MAX_SYSTEM_VALUES); + unsigned num_system_values = 0; + + unsigned patch_vert_idx = -1; + unsigned ucp_idx[CROCUS_MAX_CLIP_PLANES]; + unsigned img_idx[PIPE_MAX_SHADER_IMAGES]; + unsigned variable_group_size_idx = -1; + memset(ucp_idx, -1, sizeof(ucp_idx)); + memset(img_idx, -1, sizeof(img_idx)); + + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, impl); + + b.cursor = nir_before_block(nir_start_block(impl)); + nir_ssa_def *temp_ubo_name = nir_ssa_undef(&b, 1, 32); + nir_ssa_def *temp_const_ubo_name = NULL; + + /* Turn system value intrinsics into uniforms */ + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + nir_ssa_def *offset; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_constant: { + /* This one is special because it reads from the shader constant + * data and not cbuf0 which gallium uploads for us. 
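+ *
+ * The UBO index used here (temp_const_ubo_name) is only a
+ * placeholder; it is patched to the final constant buffer slot at
+ * the end of this function, once num_cbufs is known.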
+ */ + b.cursor = nir_before_instr(instr); + nir_ssa_def *offset = + nir_iadd_imm(&b, nir_ssa_for_src(&b, intrin->src[0], 1), + nir_intrinsic_base(intrin)); + + if (temp_const_ubo_name == NULL) + temp_const_ubo_name = nir_imm_int(&b, 0); + + nir_intrinsic_instr *load_ubo = + nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ubo); + load_ubo->num_components = intrin->num_components; + load_ubo->src[0] = nir_src_for_ssa(temp_const_ubo_name); + load_ubo->src[1] = nir_src_for_ssa(offset); + nir_intrinsic_set_align(load_ubo, 4, 0); + nir_intrinsic_set_range_base(load_ubo, 0); + nir_intrinsic_set_range(load_ubo, ~0); + nir_ssa_dest_init(&load_ubo->instr, &load_ubo->dest, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, + intrin->dest.ssa.name); + nir_builder_instr_insert(&b, &load_ubo->instr); + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + &load_ubo->dest.ssa); + nir_instr_remove(&intrin->instr); + continue; + } + case nir_intrinsic_load_user_clip_plane: { + unsigned ucp = nir_intrinsic_ucp_id(intrin); + + if (ucp_idx[ucp] == -1) { + ucp_idx[ucp] = num_system_values; + num_system_values += 4; + } + + for (int i = 0; i < 4; i++) { + system_values[ucp_idx[ucp] + i] = + BRW_PARAM_BUILTIN_CLIP_PLANE(ucp, i); + } + + b.cursor = nir_before_instr(instr); + offset = nir_imm_int(&b, ucp_idx[ucp] * sizeof(uint32_t)); + break; + } + case nir_intrinsic_load_patch_vertices_in: + if (patch_vert_idx == -1) + patch_vert_idx = num_system_values++; + + system_values[patch_vert_idx] = + BRW_PARAM_BUILTIN_PATCH_VERTICES_IN; + + b.cursor = nir_before_instr(instr); + offset = nir_imm_int(&b, patch_vert_idx * sizeof(uint32_t)); + break; + case nir_intrinsic_image_deref_load_param_intel: { + assert(devinfo->ver < 9); + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + + if (img_idx[var->data.binding] == -1) { + /* GL only allows arrays of arrays of images. 
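+ *
+ * A binding declared as, say, a 2x3 array of images therefore
+ * expands to 6 consecutive BRW_IMAGE_PARAM blocks of system
+ * values here.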
*/ + assert(glsl_type_is_image(glsl_without_array(var->type))); + unsigned num_images = MAX2(1, glsl_get_aoa_size(var->type)); + + for (int i = 0; i < num_images; i++) { + const unsigned img = var->data.binding + i; + + img_idx[img] = num_system_values; + num_system_values += BRW_IMAGE_PARAM_SIZE; + + uint32_t *img_sv = &system_values[img_idx[img]]; + + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_OFFSET_OFFSET, img, + offsetof(struct brw_image_param, offset), 2); + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_SIZE_OFFSET, img, + offsetof(struct brw_image_param, size), 3); + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_STRIDE_OFFSET, img, + offsetof(struct brw_image_param, stride), 4); + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_TILING_OFFSET, img, + offsetof(struct brw_image_param, tiling), 3); + setup_vec4_image_sysval( + img_sv + BRW_IMAGE_PARAM_SWIZZLING_OFFSET, img, + offsetof(struct brw_image_param, swizzling), 2); + } + } + + b.cursor = nir_before_instr(instr); + offset = nir_iadd(&b, + get_aoa_deref_offset(&b, deref, BRW_IMAGE_PARAM_SIZE * 4), + nir_imm_int(&b, img_idx[var->data.binding] * 4 + + nir_intrinsic_base(intrin) * 16)); + break; + } + case nir_intrinsic_load_workgroup_size: { + assert(nir->info.workgroup_size_variable); + if (variable_group_size_idx == -1) { + variable_group_size_idx = num_system_values; + num_system_values += 3; + for (int i = 0; i < 3; i++) { + system_values[variable_group_size_idx + i] = + BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i; + } + } + + b.cursor = nir_before_instr(instr); + offset = nir_imm_int(&b, + variable_group_size_idx * sizeof(uint32_t)); + break; + } + default: + continue; + } + + unsigned comps = nir_intrinsic_dest_components(intrin); + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(nir, nir_intrinsic_load_ubo); + load->num_components = comps; + load->src[0] = nir_src_for_ssa(temp_ubo_name); + load->src[1] = nir_src_for_ssa(offset); + nir_intrinsic_set_align(load, 4, 0); + nir_intrinsic_set_range_base(load, 0); + nir_intrinsic_set_range(load, ~0); + nir_ssa_dest_init(&load->instr, &load->dest, comps, 32, NULL); + nir_builder_instr_insert(&b, &load->instr); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + &load->dest.ssa); + nir_instr_remove(instr); + } + } + + nir_validate_shader(nir, "before remapping"); + + /* Uniforms are stored in constant buffer 0, the + * user-facing UBOs are indexed by one. So if any constant buffer is + * needed, the constant buffer 0 will be needed, so account for it. + */ + unsigned num_cbufs = nir->info.num_ubos; + if (num_cbufs || nir->num_uniforms) + num_cbufs++; + + /* Place the new params in a new cbuf. 
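+ * The system values become constant buffer index num_cbufs below,
+ * and any NIR constant data (temp_const_ubo_name) is then placed
+ * after them as the very last cbuf.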
*/ + if (num_system_values > 0) { + unsigned sysval_cbuf_index = num_cbufs; + num_cbufs++; + + system_values = reralloc(mem_ctx, system_values, enum brw_param_builtin, + num_system_values); + + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); + + if (load->intrinsic != nir_intrinsic_load_ubo) + continue; + + b.cursor = nir_before_instr(instr); + + assert(load->src[0].is_ssa); + + if (load->src[0].ssa == temp_ubo_name) { + nir_ssa_def *imm = nir_imm_int(&b, sysval_cbuf_index); + nir_instr_rewrite_src(instr, &load->src[0], + nir_src_for_ssa(imm)); + } + } + } + + /* We need to fold the new iadds for brw_nir_analyze_ubo_ranges */ + nir_opt_constant_folding(nir); + } else { + ralloc_free(system_values); + system_values = NULL; + } + + assert(num_cbufs < PIPE_MAX_CONSTANT_BUFFERS); + nir_validate_shader(nir, "after remap"); + + /* We don't use params[] but gallium leaves num_uniforms set. We use this + * to detect when cbuf0 exists but we don't need it anymore when we get + * here. Instead, zero it out so that the back-end doesn't get confused + * when nr_params * 4 != num_uniforms != nr_params * 4. + */ + nir->num_uniforms = 0; + + /* Constant loads (if any) need to go at the end of the constant buffers so + * we need to know num_cbufs before we can lower to them. + */ + if (temp_const_ubo_name != NULL) { + nir_load_const_instr *const_ubo_index = + nir_instr_as_load_const(temp_const_ubo_name->parent_instr); + assert(const_ubo_index->def.bit_size == 32); + const_ubo_index->value[0].u32 = num_cbufs; + } + + *out_system_values = system_values; + *out_num_system_values = num_system_values; + *out_num_cbufs = num_cbufs; +} + +static const char *surface_group_names[] = { + [CROCUS_SURFACE_GROUP_RENDER_TARGET] = "render target", + [CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = "non-coherent render target read", + [CROCUS_SURFACE_GROUP_SOL] = "streamout", + [CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = "CS work groups", + [CROCUS_SURFACE_GROUP_TEXTURE] = "texture", + [CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = "texture gather", + [CROCUS_SURFACE_GROUP_UBO] = "ubo", + [CROCUS_SURFACE_GROUP_SSBO] = "ssbo", + [CROCUS_SURFACE_GROUP_IMAGE] = "image", +}; + +static void +crocus_print_binding_table(FILE *fp, const char *name, + const struct crocus_binding_table *bt) +{ + STATIC_ASSERT(ARRAY_SIZE(surface_group_names) == CROCUS_SURFACE_GROUP_COUNT); + + uint32_t total = 0; + uint32_t compacted = 0; + + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) { + uint32_t size = bt->sizes[i]; + total += size; + if (size) + compacted += util_bitcount64(bt->used_mask[i]); + } + + if (total == 0) { + fprintf(fp, "Binding table for %s is empty\n\n", name); + return; + } + + if (total != compacted) { + fprintf(fp, "Binding table for %s " + "(compacted to %u entries from %u entries)\n", + name, compacted, total); + } else { + fprintf(fp, "Binding table for %s (%u entries)\n", name, total); + } + + uint32_t entry = 0; + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) { + uint64_t mask = bt->used_mask[i]; + while (mask) { + int index = u_bit_scan64(&mask); + fprintf(fp, " [%u] %s #%d\n", entry++, surface_group_names[i], index); + } + } + fprintf(fp, "\n"); +} + +enum { + /* Max elements in a surface group. */ + SURFACE_GROUP_MAX_ELEMENTS = 64, +}; + +/** + * Map a <group, index> pair to a binding table index. 
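+ *
+ * Groups are compacted: only surfaces whose bit is set in used_mask
+ * receive an entry, so the BTI is the group's offset plus the number
+ * of used surfaces below the requested index.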
+ * + * For example: <UBO, 5> => binding table index 12 + */ +uint32_t +crocus_group_index_to_bti(const struct crocus_binding_table *bt, + enum crocus_surface_group group, uint32_t index) +{ + assert(index < bt->sizes[group]); + uint64_t mask = bt->used_mask[group]; + uint64_t bit = 1ull << index; + if (bit & mask) { + return bt->offsets[group] + util_bitcount64((bit - 1) & mask); + } else { + return CROCUS_SURFACE_NOT_USED; + } +} + +/** + * Map a binding table index back to a <group, index> pair. + * + * For example: binding table index 12 => <UBO, 5> + */ +uint32_t +crocus_bti_to_group_index(const struct crocus_binding_table *bt, + enum crocus_surface_group group, uint32_t bti) +{ + uint64_t used_mask = bt->used_mask[group]; + assert(bti >= bt->offsets[group]); + + uint32_t c = bti - bt->offsets[group]; + while (used_mask) { + int i = u_bit_scan64(&used_mask); + if (c == 0) + return i; + c--; + } + + return CROCUS_SURFACE_NOT_USED; +} + +static void +rewrite_src_with_bti(nir_builder *b, struct crocus_binding_table *bt, + nir_instr *instr, nir_src *src, + enum crocus_surface_group group) +{ + assert(bt->sizes[group] > 0); + + b->cursor = nir_before_instr(instr); + nir_ssa_def *bti; + if (nir_src_is_const(*src)) { + uint32_t index = nir_src_as_uint(*src); + bti = nir_imm_intN_t(b, crocus_group_index_to_bti(bt, group, index), + src->ssa->bit_size); + } else { + /* Indirect usage makes all the surfaces of the group to be available, + * so we can just add the base. + */ + assert(bt->used_mask[group] == BITFIELD64_MASK(bt->sizes[group])); + bti = nir_iadd_imm(b, src->ssa, bt->offsets[group]); + } + nir_instr_rewrite_src(instr, src, nir_src_for_ssa(bti)); +} + +static void +mark_used_with_src(struct crocus_binding_table *bt, nir_src *src, + enum crocus_surface_group group) +{ + assert(bt->sizes[group] > 0); + + if (nir_src_is_const(*src)) { + uint64_t index = nir_src_as_uint(*src); + assert(index < bt->sizes[group]); + bt->used_mask[group] |= 1ull << index; + } else { + /* There's an indirect usage, we need all the surfaces. */ + bt->used_mask[group] = BITFIELD64_MASK(bt->sizes[group]); + } +} + +static bool +skip_compacting_binding_tables(void) +{ + static int skip = -1; + if (skip < 0) + skip = env_var_as_boolean("INTEL_DISABLE_COMPACT_BINDING_TABLE", false); + return skip; +} + +/** + * Set up the binding table indices and apply to the shader. + */ +static void +crocus_setup_binding_table(const struct intel_device_info *devinfo, + struct nir_shader *nir, + struct crocus_binding_table *bt, + unsigned num_render_targets, + unsigned num_system_values, + unsigned num_cbufs, + const struct brw_sampler_prog_key_data *key) +{ + const struct shader_info *info = &nir->info; + + memset(bt, 0, sizeof(*bt)); + + /* Set the sizes for each surface group. For some groups, we already know + * upfront how many will be used, so mark them. + */ + if (info->stage == MESA_SHADER_FRAGMENT) { + bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET] = num_render_targets; + /* All render targets used. 
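+ * They are marked up front because the render target count comes
+ * from the key (nr_color_regions), not from scanning the shader.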
*/ + bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET] = + BITFIELD64_MASK(num_render_targets); + + /* Setup render target read surface group in order to support non-coherent + * framebuffer fetch on Gfx7 + */ + if (devinfo->ver >= 6 && info->outputs_read) { + bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = num_render_targets; + bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = + BITFIELD64_MASK(num_render_targets); + } + } else if (info->stage == MESA_SHADER_COMPUTE) { + bt->sizes[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1; + } else if (info->stage == MESA_SHADER_GEOMETRY) { + /* In gfx6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform + * feedback surfaces. + */ + if (devinfo->ver == 6) { + bt->sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS; + bt->used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1; + } + } + + bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE] = BITSET_LAST_BIT(info->textures_used); + bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE] = info->textures_used[0]; + + if (info->uses_texture_gather) { + bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = BITSET_LAST_BIT(info->textures_used); + bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = info->textures_used[0]; + } + + bt->sizes[CROCUS_SURFACE_GROUP_IMAGE] = info->num_images; + + /* Allocate an extra slot in the UBO section for NIR constants. + * Binding table compaction will remove it if unnecessary. + * + * We don't include them in crocus_compiled_shader::num_cbufs because + * they are uploaded separately from shs->constbufs[], but from a shader + * point of view, they're another UBO (at the end of the section). + */ + bt->sizes[CROCUS_SURFACE_GROUP_UBO] = num_cbufs + 1; + + bt->sizes[CROCUS_SURFACE_GROUP_SSBO] = info->num_ssbos; + + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) + assert(bt->sizes[i] <= SURFACE_GROUP_MAX_ELEMENTS); + + /* Mark surfaces used for the cases we don't have the information available + * upfront. 
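+ *
+ * For example, a load_ubo with a constant block index marks just that
+ * UBO, while an indirect index marks the whole group as used (see
+ * mark_used_with_src above).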
+ */ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_foreach_block (block, impl) { + nir_foreach_instr (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_num_workgroups: + bt->used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1; + break; + + case nir_intrinsic_load_output: + if (devinfo->ver >= 6) { + mark_used_with_src(bt, &intrin->src[0], + CROCUS_SURFACE_GROUP_RENDER_TARGET_READ); + } + break; + + case nir_intrinsic_image_size: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_image_load_raw_intel: + case nir_intrinsic_image_store_raw_intel: + mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_IMAGE); + break; + + case nir_intrinsic_load_ubo: + mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_UBO); + break; + + case nir_intrinsic_store_ssbo: + mark_used_with_src(bt, &intrin->src[1], CROCUS_SURFACE_GROUP_SSBO); + break; + + case nir_intrinsic_get_ssbo_size: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_ssbo_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmax: + case nir_intrinsic_ssbo_atomic_fcomp_swap: + case nir_intrinsic_load_ssbo: + mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_SSBO); + break; + + default: + break; + } + } + } + + /* When disable we just mark everything as used. */ + if (unlikely(skip_compacting_binding_tables())) { + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) + bt->used_mask[i] = BITFIELD64_MASK(bt->sizes[i]); + } + + /* Calculate the offsets and the binding table size based on the used + * surfaces. After this point, the functions to go between "group indices" + * and binding table indices can be used. + */ + uint32_t next = 0; + for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) { + if (bt->used_mask[i] != 0) { + bt->offsets[i] = next; + next += util_bitcount64(bt->used_mask[i]); + } + } + bt->size_bytes = next * 4; + + if (unlikely(INTEL_DEBUG & DEBUG_BT)) { + crocus_print_binding_table(stderr, gl_shader_stage_name(info->stage), bt); + } + + /* Apply the binding table indices. The backend compiler is not expected + * to change those, as we haven't set any of the *_start entries in brw + * binding_table. 
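+ *
+ * For example, a tex instruction with texture_index 3 is rewritten to
+ * the compacted BTI from crocus_group_index_to_bti(), and gfx6 tg4
+ * gathers additionally get the 8/16-bit scale, convert and optional
+ * sign-extend fixup emitted right after the tex.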
+ */ + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block (block, impl) { + nir_foreach_instr (instr, block) { + if (instr->type == nir_instr_type_tex) { + nir_tex_instr *tex = nir_instr_as_tex(instr); + bool is_gather = tex->op == nir_texop_tg4; + + /* rewrite the tg4 component from green to blue before replacing the + texture index */ + if (devinfo->ver == 7 && !devinfo->is_haswell) { + if (tex->component == 1) + if (key->gather_channel_quirk_mask & (1 << tex->texture_index)) + tex->component = 2; + } + + if (is_gather && devinfo->ver == 6 && key->gfx6_gather_wa[tex->texture_index]) { + b.cursor = nir_after_instr(instr); + enum gfx6_gather_sampler_wa wa = key->gfx6_gather_wa[tex->texture_index]; + int width = (wa & WA_8BIT) ? 8 : 16; + + nir_ssa_def *val = nir_fmul_imm(&b, &tex->dest.ssa, (1 << width) - 1); + val = nir_f2u32(&b, val); + if (wa & WA_SIGN) { + val = nir_ishl(&b, val, nir_imm_int(&b, 32 - width)); + val = nir_ishr(&b, val, nir_imm_int(&b, 32 - width)); + } + nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, val, val->parent_instr); + } + + tex->texture_index = + crocus_group_index_to_bti(bt, is_gather ? CROCUS_SURFACE_GROUP_TEXTURE_GATHER : CROCUS_SURFACE_GROUP_TEXTURE, + tex->texture_index); + continue; + } + + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_image_size: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_image_load_raw_intel: + case nir_intrinsic_image_store_raw_intel: + rewrite_src_with_bti(&b, bt, instr, &intrin->src[0], + CROCUS_SURFACE_GROUP_IMAGE); + break; + + case nir_intrinsic_load_ubo: + rewrite_src_with_bti(&b, bt, instr, &intrin->src[0], + CROCUS_SURFACE_GROUP_UBO); + break; + + case nir_intrinsic_store_ssbo: + rewrite_src_with_bti(&b, bt, instr, &intrin->src[1], + CROCUS_SURFACE_GROUP_SSBO); + break; + + case nir_intrinsic_load_output: + if (devinfo->ver >= 6) { + rewrite_src_with_bti(&b, bt, instr, &intrin->src[0], + CROCUS_SURFACE_GROUP_RENDER_TARGET_READ); + } + break; + + case nir_intrinsic_get_ssbo_size: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_ssbo_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmax: + case nir_intrinsic_ssbo_atomic_fcomp_swap: + case nir_intrinsic_load_ssbo: + rewrite_src_with_bti(&b, bt, instr, &intrin->src[0], + CROCUS_SURFACE_GROUP_SSBO); + break; + + default: + break; + } + } + } +} + +static void +crocus_debug_recompile(struct crocus_context *ice, + struct shader_info *info, + const struct brw_base_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen; + const struct brw_compiler *c = screen->compiler; + + if (!info) + return; + + 
c->shader_perf_log(&ice->dbg, "Recompiling %s shader for program %s: %s\n", + _mesa_shader_stage_to_string(info->stage), + info->name ? info->name : "(no identifier)", + info->label ? info->label : ""); + + const void *old_key = + crocus_find_previous_compile(ice, info->stage, key->program_string_id); + + brw_debug_key_recompile(c, &ice->dbg, info->stage, old_key, key); +} + +/** + * Get the shader for the last enabled geometry stage. + * + * This stage is the one which will feed stream output and the rasterizer. + */ +static gl_shader_stage +last_vue_stage(struct crocus_context *ice) +{ + if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) + return MESA_SHADER_GEOMETRY; + + if (ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) + return MESA_SHADER_TESS_EVAL; + + return MESA_SHADER_VERTEX; +} + +static GLbitfield64 +crocus_vs_outputs_written(struct crocus_context *ice, + const struct brw_vs_prog_key *key, + GLbitfield64 user_varyings) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + GLbitfield64 outputs_written = user_varyings; + + if (devinfo->ver < 6) { + + if (key->copy_edgeflag) + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE); + + /* Put dummy slots into the VUE for the SF to put the replaced + * point sprite coords in. We shouldn't need these dummy slots, + * which take up precious URB space, but it would mean that the SF + * doesn't get nice aligned pairs of input coords into output + * coords, which would be a pain to handle. + */ + for (unsigned i = 0; i < 8; i++) { + if (key->point_coord_replace & (1 << i)) + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i); + } + + /* if back colors are written, allocate slots for front colors too */ + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0)) + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0); + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1)) + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1); + } + + /* In order for legacy clipping to work, we need to populate the clip + * distance varying slots whenever clipping is enabled, even if the vertex + * shader doesn't write to gl_ClipDistance. + */ + if (key->nr_userclip_plane_consts > 0) { + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0); + outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); + } + + return outputs_written; +} + +/* + * If no edgeflags come from the user, gen4/5 + * require giving the clip shader a default edgeflag. + * + * This will always be 1.0. + */ +static void +crocus_lower_default_edgeflags(struct nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, impl); + + b.cursor = nir_after_cf_list(&b.impl->body); + nir_variable *var = nir_variable_create(nir, nir_var_shader_out, + glsl_float_type(), + "edgeflag"); + var->data.location = VARYING_SLOT_EDGE; + nir_store_var(&b, var, nir_imm_float(&b, 1.0), 0x1); +} + +/** + * Compile a vertex shader, and upload the assembly. 
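+ *
+ * On gen4/5, if the vertex shader reads no edge flag attribute, a
+ * constant 1.0 edge flag output is added during compilation (see
+ * crocus_lower_default_edgeflags above) so the clip code always has one.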
+ */ +static struct crocus_compiled_shader * +crocus_compile_vs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_vs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + const struct intel_device_info *devinfo = &screen->devinfo; + void *mem_ctx = ralloc_context(NULL); + struct brw_vs_prog_data *vs_prog_data = + rzalloc(mem_ctx, struct brw_vs_prog_data); + struct brw_vue_prog_data *vue_prog_data = &vs_prog_data->base; + struct brw_stage_prog_data *prog_data = &vue_prog_data->base; + enum brw_param_builtin *system_values; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + if (key->nr_userclip_plane_consts) { + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true, + false, NULL); + nir_lower_io_to_temporaries(nir, impl, true, false); + nir_lower_global_vars_to_local(nir); + nir_lower_vars_to_ssa(nir); + nir_shader_gather_info(nir, impl); + } + + prog_data->use_alt_mode = ish->use_alt_mode; + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + + crocus_lower_swizzles(nir, &key->base.tex); + + if (devinfo->ver <= 5 && + !(nir->info.inputs_read & BITFIELD64_BIT(VERT_ATTRIB_EDGEFLAG))) + crocus_lower_default_edgeflags(nir); + + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + uint64_t outputs_written = + crocus_vs_outputs_written(ice, key, nir->info.outputs_written); + brw_compute_vue_map(devinfo, + &vue_prog_data->vue_map, outputs_written, + nir->info.separate_shader, /* pos slots */ 1); + + /* Don't tell the backend about our clip plane constants, we've already + * lowered them in NIR and we don't want it doing it again. + */ + struct brw_vs_prog_key key_no_ucp = *key; + key_no_ucp.nr_userclip_plane_consts = 0; + key_no_ucp.copy_edgeflag = false; + crocus_sanitize_tex_key(&key_no_ucp.base.tex); + + struct brw_compile_vs_params params = { + .nir = nir, + .key = &key_no_ucp, + .prog_data = vs_prog_data, + .edgeflag_is_last = devinfo->ver < 6, + .log_data = &ice->dbg, + }; + const unsigned *program = + brw_compile_vs(compiler, mem_ctx, ¶ms); + if (program == NULL) { + dbg_printf("Failed to compile vertex shader: %s\n", params.error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + uint32_t *so_decls = NULL; + if (devinfo->ver > 6) + so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output, + &vue_prog_data->vue_map); + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_VS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*vs_prog_data), so_decls, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current vertex shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. 
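+ *
+ * The in-memory program cache is consulted first, then the on-disk
+ * shader cache; a fresh compile only happens when both miss.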
+ */ +static void +crocus_update_compiled_vs(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_VERTEX]; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_vs_prog_key key = { KEY_INIT() }; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_VERTEX, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + screen->vtbl.populate_vs_key(ice, &ish->nir->info, last_vue_stage(ice), &key); + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_VS]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_VS, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_vs(ice, ish, &key); + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_VS] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS | + CROCUS_STAGE_DIRTY_BINDINGS_VS | + CROCUS_STAGE_DIRTY_CONSTANTS_VS; + shs->sysvals_need_upload = true; + + const struct brw_vs_prog_data *vs_prog_data = + (void *) shader->prog_data; + const bool uses_draw_params = vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance; + const bool uses_derived_draw_params = vs_prog_data->uses_drawid || + vs_prog_data->uses_is_indexed_draw; + const bool needs_sgvs_element = uses_draw_params || + vs_prog_data->uses_instanceid || + vs_prog_data->uses_vertexid; + + if (ice->state.vs_uses_draw_params != uses_draw_params || + ice->state.vs_uses_derived_draw_params != uses_derived_draw_params || + ice->state.vs_needs_edge_flag != ish->needs_edge_flag || + ice->state.vs_uses_vertexid != vs_prog_data->uses_vertexid || + ice->state.vs_uses_instanceid != vs_prog_data->uses_instanceid) { + ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS | + CROCUS_DIRTY_VERTEX_ELEMENTS; + } + ice->state.vs_uses_draw_params = uses_draw_params; + ice->state.vs_uses_derived_draw_params = uses_derived_draw_params; + ice->state.vs_needs_sgvs_element = needs_sgvs_element; + ice->state.vs_needs_edge_flag = ish->needs_edge_flag; + ice->state.vs_uses_vertexid = vs_prog_data->uses_vertexid; + ice->state.vs_uses_instanceid = vs_prog_data->uses_instanceid; + } +} + +/** + * Get the shader_info for a given stage, or NULL if the stage is disabled. + */ +const struct shader_info * +crocus_get_shader_info(const struct crocus_context *ice, gl_shader_stage stage) +{ + const struct crocus_uncompiled_shader *ish = ice->shaders.uncompiled[stage]; + + if (!ish) + return NULL; + + const nir_shader *nir = ish->nir; + return &nir->info; +} + +/** + * Get the union of TCS output and TES input slots. + * + * TCS and TES need to agree on a common URB entry layout. In particular, + * the data for all patch vertices is stored in a single URB entry (unlike + * GS which has one entry per input vertex). This means that per-vertex + * array indexing needs a stride. + * + * SSO requires locations to match, but doesn't require the number of + * outputs/inputs to match (in fact, the TCS often has extra outputs). + * So, we need to take the extra step of unifying these on the fly. 
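+ *
+ * For example, an extra TCS per-vertex output that the TES never reads
+ * still lands in the union below, so both stages compute the same URB
+ * entry stride.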
+ */ +static void +get_unified_tess_slots(const struct crocus_context *ice, + uint64_t *per_vertex_slots, + uint32_t *per_patch_slots) +{ + const struct shader_info *tcs = + crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL); + const struct shader_info *tes = + crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL); + + *per_vertex_slots = tes->inputs_read; + *per_patch_slots = tes->patch_inputs_read; + + if (tcs) { + *per_vertex_slots |= tcs->outputs_written; + *per_patch_slots |= tcs->patch_outputs_written; + } +} + +/** + * Compile a tessellation control shader, and upload the assembly. + */ +static struct crocus_compiled_shader * +crocus_compile_tcs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_tcs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + const struct nir_shader_compiler_options *options = + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].NirOptions; + void *mem_ctx = ralloc_context(NULL); + struct brw_tcs_prog_data *tcs_prog_data = + rzalloc(mem_ctx, struct brw_tcs_prog_data); + struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; + struct brw_stage_prog_data *prog_data = &vue_prog_data->base; + const struct intel_device_info *devinfo = &screen->devinfo; + enum brw_param_builtin *system_values = NULL; + unsigned num_system_values = 0; + unsigned num_cbufs = 0; + + nir_shader *nir; + + struct crocus_binding_table bt; + + if (ish) { + nir = nir_shader_clone(mem_ctx, ish->nir); + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + + crocus_lower_swizzles(nir, &key->base.tex); + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + } else { + nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, options, key); + + /* Reserve space for passing the default tess levels as constants. */ + num_cbufs = 1; + num_system_values = 8; + system_values = + rzalloc_array(mem_ctx, enum brw_param_builtin, num_system_values); + prog_data->param = rzalloc_array(mem_ctx, uint32_t, num_system_values); + prog_data->nr_params = num_system_values; + + if (key->tes_primitive_mode == GL_QUADS) { + for (int i = 0; i < 4; i++) + system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; + + system_values[3] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X; + system_values[2] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y; + } else if (key->tes_primitive_mode == GL_TRIANGLES) { + for (int i = 0; i < 3; i++) + system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; + + system_values[4] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X; + } else { + assert(key->tes_primitive_mode == GL_ISOLINES); + system_values[7] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y; + system_values[6] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X; + } + + /* Manually setup the TCS binding table. 
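+ * A single UBO surface is enough here: the passthrough shader only
+ * reads the default tess levels, which are supplied as system values.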
*/ + memset(&bt, 0, sizeof(bt)); + bt.sizes[CROCUS_SURFACE_GROUP_UBO] = 1; + bt.used_mask[CROCUS_SURFACE_GROUP_UBO] = 1; + bt.size_bytes = 4; + + prog_data->ubo_ranges[0].length = 1; + } + + struct brw_tcs_prog_key key_clean = *key; + crocus_sanitize_tex_key(&key_clean.base.tex); + char *error_str = NULL; + const unsigned *program = + brw_compile_tcs(compiler, &ice->dbg, mem_ctx, &key_clean, tcs_prog_data, nir, + -1, NULL, &error_str); + if (program == NULL) { + dbg_printf("Failed to compile control shader: %s\n", error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish) { + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + } + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_TCS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*tcs_prog_data), NULL, + system_values, num_system_values, + num_cbufs, &bt); + + if (ish) + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current tessellation control shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. + */ +static void +crocus_update_compiled_tcs(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL]; + struct crocus_uncompiled_shader *tcs = + ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL]; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + const struct shader_info *tes_info = + crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL); + struct brw_tcs_prog_key key = { + KEY_INIT_NO_ID(), + .base.program_string_id = tcs ? tcs->program_id : 0, + .tes_primitive_mode = tes_info->tess.primitive_mode, + .input_vertices = ice->state.vertices_per_patch, + .quads_workaround = tes_info->tess.primitive_mode == GL_QUADS && + tes_info->tess.spacing == TESS_SPACING_EQUAL, + }; + + if (tcs && tcs->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_CTRL, tcs, + tcs->nir->info.uses_texture_gather, &key.base.tex); + get_unified_tess_slots(ice, &key.outputs_written, + &key.patch_outputs_written); + screen->vtbl.populate_tcs_key(ice, &key); + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TCS]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_TCS, sizeof(key), &key); + + if (tcs && !shader) + shader = crocus_disk_cache_retrieve(ice, tcs, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_tcs(ice, tcs, &key); + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_TCS] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TCS | + CROCUS_STAGE_DIRTY_BINDINGS_TCS | + CROCUS_STAGE_DIRTY_CONSTANTS_TCS; + shs->sysvals_need_upload = true; + } +} + +/** + * Compile a tessellation evaluation shader, and upload the assembly. 
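+ *
+ * Unlike the VS, the TES input layout is not taken from the NIR alone;
+ * it is built with brw_compute_tess_vue_map() from the unified TCS/TES
+ * slots stored in the key.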
+ */ +static struct crocus_compiled_shader * +crocus_compile_tes(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_tes_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx = ralloc_context(NULL); + struct brw_tes_prog_data *tes_prog_data = + rzalloc(mem_ctx, struct brw_tes_prog_data); + struct brw_vue_prog_data *vue_prog_data = &tes_prog_data->base; + struct brw_stage_prog_data *prog_data = &vue_prog_data->base; + enum brw_param_builtin *system_values; + const struct intel_device_info *devinfo = &screen->devinfo; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + if (key->nr_userclip_plane_consts) { + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true, + false, NULL); + nir_lower_io_to_temporaries(nir, impl, true, false); + nir_lower_global_vars_to_local(nir); + nir_lower_vars_to_ssa(nir); + nir_shader_gather_info(nir, impl); + } + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + crocus_lower_swizzles(nir, &key->base.tex); + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + struct brw_vue_map input_vue_map; + brw_compute_tess_vue_map(&input_vue_map, key->inputs_read, + key->patch_inputs_read); + + struct brw_tes_prog_key key_clean = *key; + crocus_sanitize_tex_key(&key_clean.base.tex); + char *error_str = NULL; + const unsigned *program = + brw_compile_tes(compiler, &ice->dbg, mem_ctx, &key_clean, &input_vue_map, + tes_prog_data, nir, -1, NULL, &error_str); + if (program == NULL) { + dbg_printf("Failed to compile evaluation shader: %s\n", error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + uint32_t *so_decls = NULL; + if (devinfo->ver > 6) + so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output, + &vue_prog_data->vue_map); + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_TES, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*tes_prog_data), so_decls, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current tessellation evaluation shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. 
+ */ +static void +crocus_update_compiled_tes(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_EVAL]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]; + struct brw_tes_prog_key key = { KEY_INIT() }; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_EVAL, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + get_unified_tess_slots(ice, &key.inputs_read, &key.patch_inputs_read); + screen->vtbl.populate_tes_key(ice, &ish->nir->info, last_vue_stage(ice), &key); + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TES]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_TES, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_tes(ice, ish, &key); + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_TES] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TES | + CROCUS_STAGE_DIRTY_BINDINGS_TES | + CROCUS_STAGE_DIRTY_CONSTANTS_TES; + shs->sysvals_need_upload = true; + } + + /* TODO: Could compare and avoid flagging this. */ + const struct shader_info *tes_info = &ish->nir->info; + if (BITSET_TEST(tes_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES; + ice->state.shaders[MESA_SHADER_TESS_EVAL].sysvals_need_upload = true; + } +} + +/** + * Compile a geometry shader, and upload the assembly. + */ +static struct crocus_compiled_shader * +crocus_compile_gs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_gs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + const struct intel_device_info *devinfo = &screen->devinfo; + void *mem_ctx = ralloc_context(NULL); + struct brw_gs_prog_data *gs_prog_data = + rzalloc(mem_ctx, struct brw_gs_prog_data); + struct brw_vue_prog_data *vue_prog_data = &gs_prog_data->base; + struct brw_stage_prog_data *prog_data = &vue_prog_data->base; + enum brw_param_builtin *system_values; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + if (key->nr_userclip_plane_consts) { + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + nir_lower_clip_gs(nir, (1 << key->nr_userclip_plane_consts) - 1, false, + NULL); + nir_lower_io_to_temporaries(nir, impl, true, false); + nir_lower_global_vars_to_local(nir); + nir_lower_vars_to_ssa(nir); + nir_shader_gather_info(nir, impl); + } + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + crocus_lower_swizzles(nir, &key->base.tex); + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + brw_compute_vue_map(devinfo, + &vue_prog_data->vue_map, nir->info.outputs_written, + nir->info.separate_shader, /* pos slots */ 1); + + if (devinfo->ver == 6) + gfx6_gs_xfb_setup(&ish->stream_output, gs_prog_data); + struct brw_gs_prog_key key_clean = *key; + 
crocus_sanitize_tex_key(&key_clean.base.tex); + + char *error_str = NULL; + const unsigned *program = + brw_compile_gs(compiler, &ice->dbg, mem_ctx, &key_clean, gs_prog_data, nir, + -1, NULL, &error_str); + if (program == NULL) { + dbg_printf("Failed to compile geometry shader: %s\n", error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + uint32_t *so_decls = NULL; + if (devinfo->ver > 6) + so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output, + &vue_prog_data->vue_map); + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_GS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*gs_prog_data), so_decls, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current geometry shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. + */ +static void +crocus_update_compiled_gs(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_GEOMETRY]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]; + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_GS]; + struct crocus_compiled_shader *shader = NULL; + + if (ish) { + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_gs_prog_key key = { KEY_INIT() }; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_GEOMETRY, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + screen->vtbl.populate_gs_key(ice, &ish->nir->info, last_vue_stage(ice), &key); + + shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_GS, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_gs(ice, ish, &key); + } + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_GS] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS | + CROCUS_STAGE_DIRTY_BINDINGS_GS | + CROCUS_STAGE_DIRTY_CONSTANTS_GS; + shs->sysvals_need_upload = true; + } +} + +/** + * Compile a fragment (pixel) shader, and upload the assembly. 
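+ *
+ * FS outputs are lowered to intrinsics before the binding table is set
+ * up so that any load_output (framebuffer fetch) can be mapped to the
+ * render-target-read surface group, and at least one (null) render
+ * target slot is always allocated even with no color buffers bound.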
+ */ +static struct crocus_compiled_shader * +crocus_compile_fs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_wm_prog_key *key, + struct brw_vue_map *vue_map) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx = ralloc_context(NULL); + struct brw_wm_prog_data *fs_prog_data = + rzalloc(mem_ctx, struct brw_wm_prog_data); + struct brw_stage_prog_data *prog_data = &fs_prog_data->base; + enum brw_param_builtin *system_values; + const struct intel_device_info *devinfo = &screen->devinfo; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + prog_data->use_alt_mode = ish->use_alt_mode; + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + + /* Lower output variables to load_output intrinsics before setting up + * binding tables, so crocus_setup_binding_table can map any load_output + * intrinsics to CROCUS_SURFACE_GROUP_RENDER_TARGET_READ on Gen8 for + * non-coherent framebuffer fetches. + */ + brw_nir_lower_fs_outputs(nir); + + /* lower swizzles before binding table */ + crocus_lower_swizzles(nir, &key->base.tex); + int null_rts = 1; + + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, + MAX2(key->nr_color_regions, null_rts), + num_system_values, num_cbufs, + &key->base.tex); + + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + struct brw_wm_prog_key key_clean = *key; + crocus_sanitize_tex_key(&key_clean.base.tex); + + struct brw_compile_fs_params params = { + .nir = nir, + .key = &key_clean, + .prog_data = fs_prog_data, + + .allow_spilling = true, + .vue_map = vue_map, + + .log_data = &ice->dbg, + }; + const unsigned *program = + brw_compile_fs(compiler, mem_ctx, ¶ms); + if (program == NULL) { + dbg_printf("Failed to compile fragment shader: %s\n", params.error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_FS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*fs_prog_data), NULL, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +/** + * Update the current fragment shader variant. + * + * Fill out the key, look in the cache, compile and bind if needed. 
+ */ +static void +crocus_update_compiled_fs(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_FRAGMENT]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]; + struct brw_wm_prog_key key = { KEY_INIT() }; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_FRAGMENT, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + screen->vtbl.populate_fs_key(ice, &ish->nir->info, &key); + + if (ish->nos & (1ull << CROCUS_NOS_LAST_VUE_MAP)) + key.input_slots_valid = ice->shaders.last_vue_map->slots_valid; + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_FS]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_FS, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if (!shader) + shader = crocus_compile_fs(ice, ish, &key, ice->shaders.last_vue_map); + + if (old != shader) { + // XXX: only need to flag CLIP if barycentric has NONPERSPECTIVE + // toggles. might be able to avoid flagging SBE too. + ice->shaders.prog[CROCUS_CACHE_FS] = shader; + ice->state.dirty |= CROCUS_DIRTY_WM; + /* gen4 clip/sf rely on fs prog_data */ + if (devinfo->ver < 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG; + else + ice->state.dirty |= CROCUS_DIRTY_CLIP; + if (devinfo->ver == 6) + ice->state.dirty |= CROCUS_DIRTY_RASTER; + if (devinfo->ver >= 7) + ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS | + CROCUS_STAGE_DIRTY_BINDINGS_FS | + CROCUS_STAGE_DIRTY_CONSTANTS_FS; + shs->sysvals_need_upload = true; + } +} + +/** + * Update the last enabled stage's VUE map. + * + * When the shader feeding the rasterizer's output interface changes, we + * need to re-emit various packets. + */ +static void +update_last_vue_map(struct crocus_context *ice, + struct brw_stage_prog_data *prog_data) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_vue_prog_data *vue_prog_data = (void *) prog_data; + struct brw_vue_map *vue_map = &vue_prog_data->vue_map; + struct brw_vue_map *old_map = ice->shaders.last_vue_map; + const uint64_t changed_slots = + (old_map ? old_map->slots_valid : 0ull) ^ vue_map->slots_valid; + + if (changed_slots & VARYING_BIT_VIEWPORT) { + ice->state.num_viewports = + (vue_map->slots_valid & VARYING_BIT_VIEWPORT) ? 
CROCUS_MAX_VIEWPORTS : 1; + ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT | + CROCUS_DIRTY_CC_VIEWPORT; + if (devinfo->ver < 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG; + + if (devinfo->ver <= 6) + ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG; + + if (devinfo->ver >= 6) + ice->state.dirty |= CROCUS_DIRTY_CLIP | + CROCUS_DIRTY_GEN6_SCISSOR_RECT;; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS | + ice->state.stage_dirty_for_nos[CROCUS_NOS_LAST_VUE_MAP]; + } + + if (changed_slots || (old_map && old_map->separate != vue_map->separate)) { + ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS; + } + + ice->shaders.last_vue_map = &vue_prog_data->vue_map; +} + +static void +crocus_update_pull_constant_descriptors(struct crocus_context *ice, + gl_shader_stage stage) +{ + struct crocus_compiled_shader *shader = ice->shaders.prog[stage]; + + if (!shader || !shader->prog_data->has_ubo_pull) + return; + + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + bool any_new_descriptors = + shader->num_system_values > 0 && shs->sysvals_need_upload; + + unsigned bound_cbufs = shs->bound_cbufs; + + while (bound_cbufs) { + const int i = u_bit_scan(&bound_cbufs); + struct pipe_constant_buffer *cbuf = &shs->constbufs[i]; + if (cbuf->buffer) { + any_new_descriptors = true; + } + } + + if (any_new_descriptors) + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage; +} + +/** + * Get the prog_data for a given stage, or NULL if the stage is disabled. + */ +static struct brw_vue_prog_data * +get_vue_prog_data(struct crocus_context *ice, gl_shader_stage stage) +{ + if (!ice->shaders.prog[stage]) + return NULL; + + return (void *) ice->shaders.prog[stage]->prog_data; +} + +static struct crocus_compiled_shader * +crocus_compile_clip(struct crocus_context *ice, struct brw_clip_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx; + unsigned program_size; + mem_ctx = ralloc_context(NULL); + + struct brw_clip_prog_data *clip_prog_data = + rzalloc(mem_ctx, struct brw_clip_prog_data); + + const unsigned *program = brw_compile_clip(compiler, mem_ctx, key, clip_prog_data, + ice->shaders.last_vue_map, &program_size); + + if (program == NULL) { + dbg_printf("failed to compile clip shader\n"); + ralloc_free(mem_ctx); + return false; + } + struct crocus_binding_table bt; + memset(&bt, 0, sizeof(bt)); + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_CLIP, sizeof(*key), key, program, + program_size, + (struct brw_stage_prog_data *)clip_prog_data, sizeof(*clip_prog_data), + NULL, NULL, 0, 0, &bt); + ralloc_free(mem_ctx); + return shader; +} +static void +crocus_update_compiled_clip(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct brw_clip_prog_key key; + struct crocus_compiled_shader *old = ice->shaders.clip_prog; + memset(&key, 0, sizeof(key)); + + const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data); + if (wm_prog_data) { + key.contains_flat_varying = wm_prog_data->contains_flat_varying; + key.contains_noperspective_varying = + wm_prog_data->contains_noperspective_varying; + memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode)); + } + + key.primitive = u_reduced_prim(ice->state.prim_mode); + key.attrs = 
ice->shaders.last_vue_map->slots_valid; + + struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice); + key.pv_first = rs_state->flatshade_first; + + if (rs_state->clip_plane_enable) + key.nr_userclip = util_logbase2(rs_state->clip_plane_enable) + 1; + + if (screen->devinfo.ver == 5) + key.clip_mode = BRW_CLIP_MODE_KERNEL_CLIP; + else + key.clip_mode = BRW_CLIP_MODE_NORMAL; + + if (key.primitive == PIPE_PRIM_TRIANGLES) { + if (rs_state->cull_face == PIPE_FACE_FRONT_AND_BACK) + key.clip_mode = BRW_CLIP_MODE_REJECT_ALL; + else { + uint32_t fill_front = BRW_CLIP_FILL_MODE_CULL; + uint32_t fill_back = BRW_CLIP_FILL_MODE_CULL; + uint32_t offset_front = 0; + uint32_t offset_back = 0; + + if (!(rs_state->cull_face & PIPE_FACE_FRONT)) { + switch (rs_state->fill_front) { + case PIPE_POLYGON_MODE_FILL: + fill_front = BRW_CLIP_FILL_MODE_FILL; + offset_front = 0; + break; + case PIPE_POLYGON_MODE_LINE: + fill_front = BRW_CLIP_FILL_MODE_LINE; + offset_front = rs_state->offset_line; + break; + case PIPE_POLYGON_MODE_POINT: + fill_front = BRW_CLIP_FILL_MODE_POINT; + offset_front = rs_state->offset_point; + break; + } + } + + if (!(rs_state->cull_face & PIPE_FACE_BACK)) { + switch (rs_state->fill_back) { + case PIPE_POLYGON_MODE_FILL: + fill_back = BRW_CLIP_FILL_MODE_FILL; + offset_back = 0; + break; + case PIPE_POLYGON_MODE_LINE: + fill_back = BRW_CLIP_FILL_MODE_LINE; + offset_back = rs_state->offset_line; + break; + case PIPE_POLYGON_MODE_POINT: + fill_back = BRW_CLIP_FILL_MODE_POINT; + offset_back = rs_state->offset_point; + break; + } + } + + if (rs_state->fill_back != PIPE_POLYGON_MODE_FILL || + rs_state->fill_front != PIPE_POLYGON_MODE_FILL) { + key.do_unfilled = 1; + + /* Most cases the fixed function units will handle. Cases where + * one or more polygon faces are unfilled will require help: + */ + key.clip_mode = BRW_CLIP_MODE_CLIP_NON_REJECTED; + + if (offset_back || offset_front) { + double mrd = 0.0; + if (ice->state.framebuffer.zsbuf) + mrd = util_get_depth_format_mrd(util_format_description(ice->state.framebuffer.zsbuf->format)); + key.offset_units = rs_state->offset_units * mrd * 2; + key.offset_factor = rs_state->offset_scale * mrd; + key.offset_clamp = rs_state->offset_clamp * mrd; + } + + if (!(rs_state->front_ccw ^ rs_state->bottom_edge_rule)) { + key.fill_ccw = fill_front; + key.fill_cw = fill_back; + key.offset_ccw = offset_front; + key.offset_cw = offset_back; + if (rs_state->light_twoside && + key.fill_cw != BRW_CLIP_FILL_MODE_CULL) + key.copy_bfc_cw = 1; + } else { + key.fill_cw = fill_front; + key.fill_ccw = fill_back; + key.offset_cw = offset_front; + key.offset_ccw = offset_back; + if (rs_state->light_twoside && + key.fill_ccw != BRW_CLIP_FILL_MODE_CULL) + key.copy_bfc_ccw = 1; + } + } + } + } + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_CLIP, sizeof(key), &key); + + if (!shader) + shader = crocus_compile_clip(ice, &key); + + if (old != shader) { + ice->state.dirty |= CROCUS_DIRTY_CLIP; + ice->shaders.clip_prog = shader; + } +} + +static struct crocus_compiled_shader * +crocus_compile_sf(struct crocus_context *ice, struct brw_sf_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx; + unsigned program_size; + mem_ctx = ralloc_context(NULL); + + struct brw_sf_prog_data *sf_prog_data = + rzalloc(mem_ctx, struct brw_sf_prog_data); + + const unsigned *program = brw_compile_sf(compiler, mem_ctx, key, sf_prog_data, + 
ice->shaders.last_vue_map, &program_size); + + if (program == NULL) { + dbg_printf("failed to compile sf shader\n"); + ralloc_free(mem_ctx); + return false; + } + + struct crocus_binding_table bt; + memset(&bt, 0, sizeof(bt)); + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_SF, sizeof(*key), key, program, + program_size, + (struct brw_stage_prog_data *)sf_prog_data, sizeof(*sf_prog_data), + NULL, NULL, 0, 0, &bt); + ralloc_free(mem_ctx); + return shader; +} + +static void +crocus_update_compiled_sf(struct crocus_context *ice) +{ + struct brw_sf_prog_key key; + struct crocus_compiled_shader *old = ice->shaders.sf_prog; + memset(&key, 0, sizeof(key)); + + key.attrs = ice->shaders.last_vue_map->slots_valid; + + switch (u_reduced_prim(ice->state.prim_mode)) { + case GL_TRIANGLES: + default: + if (key.attrs & BITFIELD64_BIT(VARYING_SLOT_EDGE)) + key.primitive = BRW_SF_PRIM_UNFILLED_TRIS; + else + key.primitive = BRW_SF_PRIM_TRIANGLES; + break; + case GL_LINES: + key.primitive = BRW_SF_PRIM_LINES; + break; + case GL_POINTS: + key.primitive = BRW_SF_PRIM_POINTS; + break; + } + + struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice); + key.userclip_active = rs_state->clip_plane_enable != 0; + const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data); + if (wm_prog_data) { + key.contains_flat_varying = wm_prog_data->contains_flat_varying; + memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode)); + } + + key.do_twoside_color = rs_state->light_twoside; + + key.do_point_sprite = rs_state->point_quad_rasterization; + if (key.do_point_sprite) { + key.point_sprite_coord_replace = rs_state->sprite_coord_enable & 0xff; + if (rs_state->sprite_coord_enable & (1 << 8)) + key.do_point_coord = 1; + if (wm_prog_data && wm_prog_data->urb_setup[VARYING_SLOT_PNTC] != -1) + key.do_point_coord = 1; + } + + key.sprite_origin_lower_left = rs_state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT; + + if (key.do_twoside_color) { + key.frontface_ccw = rs_state->front_ccw; + } + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_SF, sizeof(key), &key); + + if (!shader) + shader = crocus_compile_sf(ice, &key); + + if (old != shader) { + ice->state.dirty |= CROCUS_DIRTY_RASTER; + ice->shaders.sf_prog = shader; + } +} + +static struct crocus_compiled_shader * +crocus_compile_ff_gs(struct crocus_context *ice, struct brw_ff_gs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct brw_compiler *compiler = screen->compiler; + void *mem_ctx; + unsigned program_size; + mem_ctx = ralloc_context(NULL); + + struct brw_ff_gs_prog_data *ff_gs_prog_data = + rzalloc(mem_ctx, struct brw_ff_gs_prog_data); + + const unsigned *program = brw_compile_ff_gs_prog(compiler, mem_ctx, key, ff_gs_prog_data, + ice->shaders.last_vue_map, &program_size); + + if (program == NULL) { + dbg_printf("failed to compile sf shader\n"); + ralloc_free(mem_ctx); + return false; + } + + struct crocus_binding_table bt; + memset(&bt, 0, sizeof(bt)); + + if (screen->devinfo.ver == 6) { + bt.sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS; + bt.used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1; + + bt.size_bytes = BRW_MAX_SOL_BINDINGS * 4; + } + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_FF_GS, sizeof(*key), key, program, + program_size, + (struct brw_stage_prog_data *)ff_gs_prog_data, sizeof(*ff_gs_prog_data), + 
NULL, NULL, 0, 0, &bt); + ralloc_free(mem_ctx); + return shader; +} + +static void +crocus_update_compiled_ff_gs(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_ff_gs_prog_key key; + struct crocus_compiled_shader *old = ice->shaders.ff_gs_prog; + memset(&key, 0, sizeof(key)); + + assert(devinfo->ver < 7); + + key.attrs = ice->shaders.last_vue_map->slots_valid; + + key.primitive = screen->vtbl.translate_prim_type(ice->state.prim_mode, 0); + + struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice); + key.pv_first = rs_state->flatshade_first; + + if (key.primitive == _3DPRIM_QUADLIST && !rs_state->flatshade) { + /* Provide consistenbbbbbt primitive order with brw_set_prim's + * optimization of single quads to trifans. + */ + key.pv_first = true; + } + + if (devinfo->ver >= 6) { + key.need_gs_prog = ice->state.streamout_active; + if (key.need_gs_prog) { + struct crocus_uncompiled_shader *vs = + ice->shaders.uncompiled[MESA_SHADER_VERTEX]; + gfx6_ff_gs_xfb_setup(&vs->stream_output, + &key); + } + } else { + key.need_gs_prog = (key.primitive == _3DPRIM_QUADLIST || + key.primitive == _3DPRIM_QUADSTRIP || + key.primitive == _3DPRIM_LINELOOP); + } + + struct crocus_compiled_shader *shader = NULL; + if (key.need_gs_prog) { + shader = crocus_find_cached_shader(ice, CROCUS_CACHE_FF_GS, + sizeof(key), &key); + if (!shader) + shader = crocus_compile_ff_gs(ice, &key); + } + if (old != shader) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS; + if (!!old != !!shader) + ice->state.dirty |= CROCUS_DIRTY_GEN6_URB; + ice->shaders.ff_gs_prog = shader; + if (shader) { + const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data; + ice->state.last_xfb_verts_per_prim = gs_prog_data->svbi_postincrement_value; + } + } +} + +// XXX: crocus_compiled_shaders are space-leaking :( +// XXX: do remember to unbind them if deleting them. + +/** + * Update the current shader variants for the given state. + * + * This should be called on every draw call to ensure that the correct + * shaders are bound. It will also flag any dirty state triggered by + * swapping out those shaders. 
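+ *
+ * Rough order: TCS/TES (enabled or cleared as a pair), VS, GS, the
+ * last-VUE-stage bookkeeping, FS, the Gen4-6 fixed-function GS fallback,
+ * and the Gen4/5 CLIP and SF programs; afterwards the URB configuration
+ * and pull-constant descriptors are re-checked.
+ *
+ * Returns false if no vertex shader is bound.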
+ */ +bool +crocus_update_compiled_shaders(struct crocus_context *ice) +{ + struct crocus_screen *screen = (void *) ice->ctx.screen; + const uint64_t stage_dirty = ice->state.stage_dirty; + + struct brw_vue_prog_data *old_prog_datas[4]; + if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) { + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) + old_prog_datas[i] = get_vue_prog_data(ice, i); + } + + if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_TCS | + CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) { + struct crocus_uncompiled_shader *tes = + ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]; + if (tes) { + crocus_update_compiled_tcs(ice); + crocus_update_compiled_tes(ice); + } else { + ice->shaders.prog[CROCUS_CACHE_TCS] = NULL; + ice->shaders.prog[CROCUS_CACHE_TES] = NULL; + ice->state.stage_dirty |= + CROCUS_STAGE_DIRTY_TCS | CROCUS_STAGE_DIRTY_TES | + CROCUS_STAGE_DIRTY_BINDINGS_TCS | CROCUS_STAGE_DIRTY_BINDINGS_TES | + CROCUS_STAGE_DIRTY_CONSTANTS_TCS | CROCUS_STAGE_DIRTY_CONSTANTS_TES; + } + } + + if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_VS) + crocus_update_compiled_vs(ice); + if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_GS) + crocus_update_compiled_gs(ice); + + if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_GS | + CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) { + const struct crocus_compiled_shader *gs = + ice->shaders.prog[MESA_SHADER_GEOMETRY]; + const struct crocus_compiled_shader *tes = + ice->shaders.prog[MESA_SHADER_TESS_EVAL]; + + bool points_or_lines = false; + + if (gs) { + const struct brw_gs_prog_data *gs_prog_data = (void *) gs->prog_data; + points_or_lines = + gs_prog_data->output_topology == _3DPRIM_POINTLIST || + gs_prog_data->output_topology == _3DPRIM_LINESTRIP; + } else if (tes) { + const struct brw_tes_prog_data *tes_data = (void *) tes->prog_data; + points_or_lines = + tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_LINE || + tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT; + } + + if (ice->shaders.output_topology_is_points_or_lines != points_or_lines) { + /* Outbound to XY Clip enables */ + ice->shaders.output_topology_is_points_or_lines = points_or_lines; + ice->state.dirty |= CROCUS_DIRTY_CLIP; + } + } + + if (!ice->shaders.prog[MESA_SHADER_VERTEX]) + return false; + + gl_shader_stage last_stage = last_vue_stage(ice); + struct crocus_compiled_shader *shader = ice->shaders.prog[last_stage]; + struct crocus_uncompiled_shader *ish = ice->shaders.uncompiled[last_stage]; + update_last_vue_map(ice, shader->prog_data); + if (ice->state.streamout != shader->streamout) { + ice->state.streamout = shader->streamout; + ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST | CROCUS_DIRTY_STREAMOUT; + } + + if (ice->state.streamout_active) { + screen->vtbl.update_so_strides(ice, ish->stream_output.stride); + } + + /* use ice->state version as last_vue_map can dirty this bit */ + if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_FS) + crocus_update_compiled_fs(ice); + + if (screen->devinfo.ver <= 6) { + if (ice->state.dirty & CROCUS_DIRTY_GEN4_FF_GS_PROG && + !ice->shaders.prog[MESA_SHADER_GEOMETRY]) + crocus_update_compiled_ff_gs(ice); + } + + if (screen->devinfo.ver < 6) { + if (ice->state.dirty & CROCUS_DIRTY_GEN4_CLIP_PROG) + crocus_update_compiled_clip(ice); + if (ice->state.dirty & CROCUS_DIRTY_GEN4_SF_PROG) + crocus_update_compiled_sf(ice); + } + + + /* Changing shader interfaces may require a URB configuration. 
*/ + if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) { + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + struct brw_vue_prog_data *old = old_prog_datas[i]; + struct brw_vue_prog_data *new = get_vue_prog_data(ice, i); + if (!!old != !!new || + (new && new->urb_entry_size != old->urb_entry_size)) { + ice->state.dirty |= CROCUS_DIRTY_GEN6_URB; + break; + } + } + } + + if (ice->state.stage_dirty & CROCUS_RENDER_STAGE_DIRTY_CONSTANTS) { + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_FRAGMENT; i++) { + if (ice->state.stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << i)) + crocus_update_pull_constant_descriptors(ice, i); + } + } + return true; +} + +static struct crocus_compiled_shader * +crocus_compile_cs(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + const struct brw_cs_prog_key *key) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct brw_compiler *compiler = screen->compiler; + void *mem_ctx = ralloc_context(NULL); + struct brw_cs_prog_data *cs_prog_data = + rzalloc(mem_ctx, struct brw_cs_prog_data); + struct brw_stage_prog_data *prog_data = &cs_prog_data->base; + enum brw_param_builtin *system_values; + const struct intel_device_info *devinfo = &screen->devinfo; + unsigned num_system_values; + unsigned num_cbufs; + + nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); + + NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics); + + crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); + crocus_lower_swizzles(nir, &key->base.tex); + struct crocus_binding_table bt; + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + + struct brw_compile_cs_params params = { + .nir = nir, + .key = key, + .prog_data = cs_prog_data, + .log_data = &ice->dbg, + }; + + const unsigned *program = + brw_compile_cs(compiler, mem_ctx, ¶ms); + if (program == NULL) { + dbg_printf("Failed to compile compute shader: %s\n", params.error_str); + ralloc_free(mem_ctx); + return false; + } + + if (ish->compiled_once) { + crocus_debug_recompile(ice, &nir->info, &key->base); + } else { + ish->compiled_once = true; + } + + struct crocus_compiled_shader *shader = + crocus_upload_shader(ice, CROCUS_CACHE_CS, sizeof(*key), key, program, + prog_data->program_size, + prog_data, sizeof(*cs_prog_data), NULL, + system_values, num_system_values, + num_cbufs, &bt); + + crocus_disk_cache_store(screen->disk_cache, ish, shader, + ice->shaders.cache_bo_map, + key, sizeof(*key)); + + ralloc_free(mem_ctx); + return shader; +} + +static void +crocus_update_compiled_cs(struct crocus_context *ice) +{ + struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE]; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_COMPUTE]; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_cs_prog_key key = { KEY_INIT() }; + + if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) + crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_COMPUTE, ish, + ish->nir->info.uses_texture_gather, &key.base.tex); + screen->vtbl.populate_cs_key(ice, &key); + + struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_CS]; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_CS, sizeof(key), &key); + + if (!shader) + shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)); + + if 
(!shader) + shader = crocus_compile_cs(ice, ish, &key); + + if (old != shader) { + ice->shaders.prog[CROCUS_CACHE_CS] = shader; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS | + CROCUS_STAGE_DIRTY_BINDINGS_CS | + CROCUS_STAGE_DIRTY_CONSTANTS_CS; + shs->sysvals_need_upload = true; + } +} + +void +crocus_update_compiled_compute_shader(struct crocus_context *ice) +{ + if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_CS) + crocus_update_compiled_cs(ice); + + if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) + crocus_update_pull_constant_descriptors(ice, MESA_SHADER_COMPUTE); +} + +void +crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data, + unsigned threads, + uint32_t *dst) +{ + assert(brw_cs_push_const_total_size(cs_prog_data, threads) > 0); + assert(cs_prog_data->push.cross_thread.size == 0); + assert(cs_prog_data->push.per_thread.dwords == 1); + assert(cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID); + for (unsigned t = 0; t < threads; t++) + dst[8 * t] = t; +} + +/** + * Allocate scratch BOs as needed for the given per-thread size and stage. + */ +struct crocus_bo * +crocus_get_scratch_space(struct crocus_context *ice, + unsigned per_thread_scratch, + gl_shader_stage stage) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct crocus_bufmgr *bufmgr = screen->bufmgr; + const struct intel_device_info *devinfo = &screen->devinfo; + + unsigned encoded_size = ffs(per_thread_scratch) - 11; + assert(encoded_size < (1 << 16)); + + struct crocus_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage]; + + unsigned subslice_total = screen->subslice_total; + subslice_total = 4 * devinfo->num_slices; + // assert(subslice_total >= screen->subslice_total); + + if (!*bop) { + unsigned scratch_ids_per_subslice = devinfo->max_cs_threads; + + uint32_t max_threads[] = { + [MESA_SHADER_VERTEX] = devinfo->max_vs_threads, + [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads, + [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads, + [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads, + [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads, + [MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslice_total, + }; + + uint32_t size = per_thread_scratch * max_threads[stage]; + + *bop = crocus_bo_alloc(bufmgr, "scratch", size); + } + + return *bop; +} + +/* ------------------------------------------------------------------- */ + +/** + * The pipe->create_[stage]_state() driver hooks. + * + * Performs basic NIR preprocessing, records any state dependencies, and + * returns an crocus_uncompiled_shader as the Gallium CSO. + * + * Actual shader compilation to assembly happens later, at first use. 
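+ * If screen->precompile is set, a variant with a default key is built
+ * (or pulled from the disk cache) immediately as well.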
+ */ +static void * +crocus_create_uncompiled_shader(struct pipe_context *ctx, + nir_shader *nir, + const struct pipe_stream_output_info *so_info) +{ + struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_uncompiled_shader *ish = + calloc(1, sizeof(struct crocus_uncompiled_shader)); + if (!ish) + return NULL; + + if (devinfo->ver >= 6) + NIR_PASS(ish->needs_edge_flag, nir, crocus_fix_edge_flags); + else + ish->needs_edge_flag = false; + + brw_preprocess_nir(screen->compiler, nir, NULL); + + NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo, false); + NIR_PASS_V(nir, crocus_lower_storage_image_derefs); + + nir_sweep(nir); + + ish->program_id = get_new_program_id(screen); + ish->nir = nir; + if (so_info) { + memcpy(&ish->stream_output, so_info, sizeof(*so_info)); + update_so_info(&ish->stream_output, nir->info.outputs_written); + } + + /* Save this now before potentially dropping nir->info.name */ + if (nir->info.name && strncmp(nir->info.name, "ARB", 3) == 0) + ish->use_alt_mode = true; + + if (screen->disk_cache) { + /* Serialize the NIR to a binary blob that we can hash for the disk + * cache. Drop unnecessary information (like variable names) + * so the serialized NIR is smaller, and also to let us detect more + * isomorphic shaders when hashing, increasing cache hits. + */ + struct blob blob; + blob_init(&blob); + nir_serialize(&blob, nir, true); + _mesa_sha1_compute(blob.data, blob.size, ish->nir_sha1); + blob_finish(&blob); + } + + return ish; +} + +static struct crocus_uncompiled_shader * +crocus_create_shader_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct nir_shader *nir; + + if (state->type == PIPE_SHADER_IR_TGSI) + nir = tgsi_to_nir(state->tokens, ctx->screen, false); + else + nir = state->ir.nir; + + return crocus_create_uncompiled_shader(ctx, nir, &state->stream_output); +} + +static void * +crocus_create_vs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + /* User clip planes or gen5 sprite coord enable */ + if (ish->nir->info.clip_distance_array_size == 0 || + screen->devinfo.ver <= 5) + ish->nos |= (1ull << CROCUS_NOS_RASTERIZER); + + if (!screen->devinfo.is_haswell) + ish->nos |= (1ull << CROCUS_NOS_VERTEX_ELEMENTS); + + if (screen->precompile) { + struct brw_vs_prog_key key = { KEY_INIT() }; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_vs(ice, ish, &key); + } + + return ish; +} + +static void * +crocus_create_tcs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + struct shader_info *info = &ish->nir->info; + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + if (screen->precompile) { + const unsigned _GL_TRIANGLES = 0x0004; + struct brw_tcs_prog_key key = { + KEY_INIT(), + // XXX: make sure the linker fills this out from the TES... + .tes_primitive_mode = + info->tess.primitive_mode ? 
info->tess.primitive_mode + : _GL_TRIANGLES, + .outputs_written = info->outputs_written, + .patch_outputs_written = info->patch_outputs_written, + }; + + key.input_vertices = info->tess.tcs_vertices_out; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_tcs(ice, ish, &key); + } + + return ish; +} + +static void * +crocus_create_tes_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + struct shader_info *info = &ish->nir->info; + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + /* User clip planes */ + if (ish->nir->info.clip_distance_array_size == 0) + ish->nos |= (1ull << CROCUS_NOS_RASTERIZER); + + if (screen->precompile) { + struct brw_tes_prog_key key = { + KEY_INIT(), + // XXX: not ideal, need TCS output/TES input unification + .inputs_read = info->inputs_read, + .patch_inputs_read = info->patch_inputs_read, + }; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_tes(ice, ish, &key); + } + + return ish; +} + +static void * +crocus_create_gs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + /* User clip planes */ + if (ish->nir->info.clip_distance_array_size == 0) + ish->nos |= (1ull << CROCUS_NOS_RASTERIZER); + + if (screen->precompile) { + struct brw_gs_prog_key key = { KEY_INIT() }; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_gs(ice, ish, &key); + } + + return ish; +} + +static void * +crocus_create_fs_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state); + struct shader_info *info = &ish->nir->info; + + ish->nos |= (1ull << CROCUS_NOS_FRAMEBUFFER) | + (1ull << CROCUS_NOS_DEPTH_STENCIL_ALPHA) | + (1ull << CROCUS_NOS_RASTERIZER) | + (1ull << CROCUS_NOS_TEXTURES) | + (1ull << CROCUS_NOS_BLEND); + + /* The program key needs the VUE map if there are > 16 inputs or gen4/5 */ + if (screen->devinfo.ver < 6 || util_bitcount64(ish->nir->info.inputs_read & + BRW_FS_VARYING_INPUT_MASK) > 16) { + ish->nos |= (1ull << CROCUS_NOS_LAST_VUE_MAP); + } + + if (screen->precompile) { + const uint64_t color_outputs = info->outputs_written & + ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) | + BITFIELD64_BIT(FRAG_RESULT_STENCIL) | + BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + + bool can_rearrange_varyings = + screen->devinfo.ver > 6 && util_bitcount64(info->inputs_read & BRW_FS_VARYING_INPUT_MASK) <= 16; + + const struct intel_device_info *devinfo = &screen->devinfo; + struct brw_wm_prog_key key = { + KEY_INIT(), + .nr_color_regions = util_bitcount(color_outputs), + .coherent_fb_fetch = false, + .input_slots_valid = + can_rearrange_varyings ? 
0 : info->inputs_read | VARYING_BIT_POS, + }; + + struct brw_vue_map vue_map; + if (devinfo->ver < 6) { + brw_compute_vue_map(devinfo, &vue_map, + info->inputs_read | VARYING_BIT_POS, + false, /* pos slots */ 1); + } + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_fs(ice, ish, &key, &vue_map); + } + + return ish; +} + +static void * +crocus_create_compute_state(struct pipe_context *ctx, + const struct pipe_compute_state *state) +{ + assert(state->ir_type == PIPE_SHADER_IR_NIR); + + struct crocus_context *ice = (void *) ctx; + struct crocus_screen *screen = (void *) ctx->screen; + struct crocus_uncompiled_shader *ish = + crocus_create_uncompiled_shader(ctx, (void *) state->prog, NULL); + + ish->nos |= (1ull << CROCUS_NOS_TEXTURES); + // XXX: disallow more than 64KB of shared variables + + if (screen->precompile) { + struct brw_cs_prog_key key = { KEY_INIT() }; + + if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key))) + crocus_compile_cs(ice, ish, &key); + } + + return ish; +} + +/** + * The pipe->delete_[stage]_state() driver hooks. + * + * Frees the crocus_uncompiled_shader. + */ +static void +crocus_delete_shader_state(struct pipe_context *ctx, void *state, gl_shader_stage stage) +{ + struct crocus_uncompiled_shader *ish = state; + struct crocus_context *ice = (void *) ctx; + + if (ice->shaders.uncompiled[stage] == ish) { + ice->shaders.uncompiled[stage] = NULL; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage; + } + + if (ish->const_data) { + pipe_resource_reference(&ish->const_data, NULL); + pipe_resource_reference(&ish->const_data_state.res, NULL); + } + + ralloc_free(ish->nir); + free(ish); +} + +static void +crocus_delete_vs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_VERTEX); +} + +static void +crocus_delete_tcs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_CTRL); +} + +static void +crocus_delete_tes_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_EVAL); +} + +static void +crocus_delete_gs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_GEOMETRY); +} + +static void +crocus_delete_fs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_FRAGMENT); +} + +static void +crocus_delete_cs_state(struct pipe_context *ctx, void *state) +{ + crocus_delete_shader_state(ctx, state, MESA_SHADER_COMPUTE); +} + +/** + * The pipe->bind_[stage]_state() driver hook. + * + * Binds an uncompiled shader as the current one for a particular stage. + * Updates dirty tracking to account for the shader's NOS. + */ +static void +bind_shader_state(struct crocus_context *ice, + struct crocus_uncompiled_shader *ish, + gl_shader_stage stage) +{ + uint64_t dirty_bit = CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage; + const uint64_t nos = ish ? ish->nos : 0; + + const struct shader_info *old_info = crocus_get_shader_info(ice, stage); + const struct shader_info *new_info = ish ? &ish->nir->info : NULL; + + if ((old_info ? BITSET_LAST_BIT(old_info->textures_used) : 0) != + (new_info ? 
BITSET_LAST_BIT(new_info->textures_used) : 0)) { + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage; + } + + ice->shaders.uncompiled[stage] = ish; + ice->state.stage_dirty |= dirty_bit; + + /* Record that CSOs need to mark CROCUS_DIRTY_UNCOMPILED_XS when they change + * (or that they no longer need to do so). + */ + for (int i = 0; i < CROCUS_NOS_COUNT; i++) { + if (nos & (1 << i)) + ice->state.stage_dirty_for_nos[i] |= dirty_bit; + else + ice->state.stage_dirty_for_nos[i] &= ~dirty_bit; + } +} + +static void +crocus_bind_vs_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + struct crocus_uncompiled_shader *new_ish = state; + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (new_ish && + ice->state.window_space_position != + new_ish->nir->info.vs.window_space_position) { + ice->state.window_space_position = + new_ish->nir->info.vs.window_space_position; + + ice->state.dirty |= CROCUS_DIRTY_CLIP | + CROCUS_DIRTY_RASTER | + CROCUS_DIRTY_CC_VIEWPORT; + } + + if (devinfo->ver == 6) { + ice->state.stage_dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG; + } + + bind_shader_state((void *) ctx, state, MESA_SHADER_VERTEX); +} + +static void +crocus_bind_tcs_state(struct pipe_context *ctx, void *state) +{ + bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_CTRL); +} + +static void +crocus_bind_tes_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + /* Enabling/disabling optional stages requires a URB reconfiguration. */ + if (!!state != !!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) + ice->state.dirty |= CROCUS_DIRTY_GEN6_URB; + + bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_EVAL); +} + +static void +crocus_bind_gs_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *)ctx; + + /* Enabling/disabling optional stages requires a URB reconfiguration. 
*/ + if (!!state != !!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) + ice->state.dirty |= CROCUS_DIRTY_GEN6_URB; + + bind_shader_state((void *) ctx, state, MESA_SHADER_GEOMETRY); +} + +static void +crocus_bind_fs_state(struct pipe_context *ctx, void *state) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + struct crocus_uncompiled_shader *old_ish = + ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]; + struct crocus_uncompiled_shader *new_ish = state; + + const unsigned color_bits = + BITFIELD64_BIT(FRAG_RESULT_COLOR) | + BITFIELD64_RANGE(FRAG_RESULT_DATA0, BRW_MAX_DRAW_BUFFERS); + + /* Fragment shader outputs influence HasWriteableRT */ + if (!old_ish || !new_ish || + (old_ish->nir->info.outputs_written & color_bits) != + (new_ish->nir->info.outputs_written & color_bits)) + ice->state.dirty |= CROCUS_DIRTY_WM; + + bind_shader_state((void *) ctx, state, MESA_SHADER_FRAGMENT); +} + +static void +crocus_bind_cs_state(struct pipe_context *ctx, void *state) +{ + bind_shader_state((void *) ctx, state, MESA_SHADER_COMPUTE); +} + +void +crocus_init_program_functions(struct pipe_context *ctx) +{ + ctx->create_vs_state = crocus_create_vs_state; + ctx->create_tcs_state = crocus_create_tcs_state; + ctx->create_tes_state = crocus_create_tes_state; + ctx->create_gs_state = crocus_create_gs_state; + ctx->create_fs_state = crocus_create_fs_state; + ctx->create_compute_state = crocus_create_compute_state; + + ctx->delete_vs_state = crocus_delete_vs_state; + ctx->delete_tcs_state = crocus_delete_tcs_state; + ctx->delete_tes_state = crocus_delete_tes_state; + ctx->delete_gs_state = crocus_delete_gs_state; + ctx->delete_fs_state = crocus_delete_fs_state; + ctx->delete_compute_state = crocus_delete_cs_state; + + ctx->bind_vs_state = crocus_bind_vs_state; + ctx->bind_tcs_state = crocus_bind_tcs_state; + ctx->bind_tes_state = crocus_bind_tes_state; + ctx->bind_gs_state = crocus_bind_gs_state; + ctx->bind_fs_state = crocus_bind_fs_state; + ctx->bind_compute_state = crocus_bind_cs_state; +} diff --git a/src/gallium/drivers/crocus/crocus_program_cache.c b/src/gallium/drivers/crocus/crocus_program_cache.c new file mode 100644 index 00000000000..d2d4b821754 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_program_cache.c @@ -0,0 +1,347 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_program_cache.c + * + * The in-memory program cache. 
This is basically a hash table mapping + * API-specified shaders and a state key to a compiled variant. It also + * takes care of uploading shader assembly into a BO for use on the GPU. + */ + +#include <stdio.h> +#include <errno.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_atomic.h" +#include "util/u_upload_mgr.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "intel/compiler/brw_compiler.h" +#include "intel/compiler/brw_eu.h" +#include "intel/compiler/brw_nir.h" +#include "crocus_context.h" +#include "crocus_resource.h" + +struct keybox { + uint16_t size; + enum crocus_program_cache_id cache_id; + uint8_t data[0]; +}; + +static struct keybox * +make_keybox(void *mem_ctx, enum crocus_program_cache_id cache_id, + const void *key, uint32_t key_size) +{ + struct keybox *keybox = + ralloc_size(mem_ctx, sizeof(struct keybox) + key_size); + + keybox->cache_id = cache_id; + keybox->size = key_size; + memcpy(keybox->data, key, key_size); + + return keybox; +} + +static uint32_t +keybox_hash(const void *void_key) +{ + const struct keybox *key = void_key; + return _mesa_hash_data(&key->cache_id, key->size + sizeof(key->cache_id)); +} + +static bool +keybox_equals(const void *void_a, const void *void_b) +{ + const struct keybox *a = void_a, *b = void_b; + if (a->size != b->size) + return false; + + return memcmp(a->data, b->data, a->size) == 0; +} + +struct crocus_compiled_shader * +crocus_find_cached_shader(struct crocus_context *ice, + enum crocus_program_cache_id cache_id, + uint32_t key_size, const void *key) +{ + struct keybox *keybox = make_keybox(NULL, cache_id, key, key_size); + struct hash_entry *entry = + _mesa_hash_table_search(ice->shaders.cache, keybox); + + ralloc_free(keybox); + + return entry ? entry->data : NULL; +} + +const void * +crocus_find_previous_compile(const struct crocus_context *ice, + enum crocus_program_cache_id cache_id, + unsigned program_string_id) +{ + hash_table_foreach(ice->shaders.cache, entry) { + const struct keybox *keybox = entry->key; + const struct brw_base_prog_key *key = (const void *)keybox->data; + if (keybox->cache_id == cache_id && + key->program_string_id == program_string_id) { + return keybox->data; + } + } + + return NULL; +} + +/** + * Look for an existing entry in the cache that has identical assembly code. + * + * This is useful for programs generating shaders at runtime, where multiple + * distinct shaders (from an API perspective) may compile to the same assembly + * in our backend. This saves space in the program cache buffer. 
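+ *
+ * Matching is a straight memcmp of the uploaded assembly bytes, so only
+ * byte-identical programs are deduplicated.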
+ */ +static const struct crocus_compiled_shader * +find_existing_assembly(struct hash_table *cache, void *map, + const void *assembly, unsigned assembly_size) +{ + hash_table_foreach (cache, entry) { + const struct crocus_compiled_shader *existing = entry->data; + + if (existing->map_size != assembly_size) + continue; + + if (memcmp(map + existing->offset, assembly, assembly_size) == 0) + return existing; + } + return NULL; +} + +static void +crocus_cache_new_bo(struct crocus_context *ice, + uint32_t new_size) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + struct crocus_bo *new_bo; + new_bo = crocus_bo_alloc(screen->bufmgr, "program cache", new_size); + + void *map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE | + MAP_ASYNC | MAP_PERSISTENT); + + if (ice->shaders.cache_next_offset != 0) { + memcpy(map, ice->shaders.cache_bo_map, ice->shaders.cache_next_offset); + } + + crocus_bo_unmap(ice->shaders.cache_bo); + crocus_bo_unreference(ice->shaders.cache_bo); + ice->shaders.cache_bo = new_bo; + ice->shaders.cache_bo_map = map; + + if (screen->devinfo.ver == 4) { + /* reemit all shaders on GEN4 only. */ + ice->state.dirty |= CROCUS_DIRTY_CLIP | CROCUS_DIRTY_RASTER | + CROCUS_DIRTY_WM; + } + ice->batches[CROCUS_BATCH_RENDER].state_base_address_emitted = false; + ice->batches[CROCUS_BATCH_COMPUTE].state_base_address_emitted = false; + /* unset state base address */ +} + +static uint32_t +crocus_alloc_item_data(struct crocus_context *ice, uint32_t size) +{ + if (ice->shaders.cache_next_offset + size > ice->shaders.cache_bo->size) { + uint32_t new_size = ice->shaders.cache_bo->size * 2; + while (ice->shaders.cache_next_offset + size > new_size) + new_size *= 2; + + crocus_cache_new_bo(ice, new_size); + } + uint32_t offset = ice->shaders.cache_next_offset; + + /* Programs are always 64-byte aligned, so set up the next one now */ + ice->shaders.cache_next_offset = ALIGN(offset + size, 64); + return offset; +} + +struct crocus_compiled_shader * +crocus_upload_shader(struct crocus_context *ice, + enum crocus_program_cache_id cache_id, uint32_t key_size, + const void *key, const void *assembly, uint32_t asm_size, + struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, uint32_t *streamout, + enum brw_param_builtin *system_values, + unsigned num_system_values, unsigned num_cbufs, + const struct crocus_binding_table *bt) +{ + struct hash_table *cache = ice->shaders.cache; + struct crocus_compiled_shader *shader = + rzalloc_size(cache, sizeof(struct crocus_compiled_shader)); + const struct crocus_compiled_shader *existing = find_existing_assembly( + cache, ice->shaders.cache_bo_map, assembly, asm_size); + + /* If we can find a matching prog in the cache already, then reuse the + * existing stuff without creating new copy into the underlying buffer + * object. This is notably useful for programs generating shaders at + * runtime, where multiple shaders may compile to the same thing in our + * backend. 
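+    *
+    * Either way, the new crocus_compiled_shader owns its own prog_data,
+    * streamout, and system-values allocations; only the assembly bytes
+    * in the cache BO are shared.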
+ */ + if (existing) { + shader->offset = existing->offset; + shader->map_size = existing->map_size; + } else { + shader->offset = crocus_alloc_item_data(ice, asm_size); + shader->map_size = asm_size; + + memcpy(ice->shaders.cache_bo_map + shader->offset, assembly, asm_size); + } + + shader->prog_data = prog_data; + shader->prog_data_size = prog_data_size; + shader->streamout = streamout; + shader->system_values = system_values; + shader->num_system_values = num_system_values; + shader->num_cbufs = num_cbufs; + shader->bt = *bt; + + ralloc_steal(shader, shader->prog_data); + if (prog_data_size > 16) { + ralloc_steal(shader->prog_data, prog_data->param); + ralloc_steal(shader->prog_data, prog_data->pull_param); + } + ralloc_steal(shader, shader->streamout); + ralloc_steal(shader, shader->system_values); + + struct keybox *keybox = make_keybox(shader, cache_id, key, key_size); + _mesa_hash_table_insert(ice->shaders.cache, keybox, shader); + + return shader; +} + +bool +crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch, const void *key, + uint32_t key_size, uint32_t *kernel_out, + void *prog_data_out) +{ + struct blorp_context *blorp = blorp_batch->blorp; + struct crocus_context *ice = blorp->driver_ctx; + struct crocus_compiled_shader *shader = + crocus_find_cached_shader(ice, CROCUS_CACHE_BLORP, key_size, key); + + if (!shader) + return false; + + *kernel_out = shader->offset; + *((void **)prog_data_out) = shader->prog_data; + + return true; +} + +bool +crocus_blorp_upload_shader(struct blorp_batch *blorp_batch, uint32_t stage, + const void *key, uint32_t key_size, + const void *kernel, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data_templ, + uint32_t prog_data_size, uint32_t *kernel_out, + void *prog_data_out) +{ + struct blorp_context *blorp = blorp_batch->blorp; + struct crocus_context *ice = blorp->driver_ctx; + + struct brw_stage_prog_data *prog_data = ralloc_size(NULL, prog_data_size); + memcpy(prog_data, prog_data_templ, prog_data_size); + + struct crocus_binding_table bt; + memset(&bt, 0, sizeof(bt)); + + struct crocus_compiled_shader *shader = crocus_upload_shader( + ice, CROCUS_CACHE_BLORP, key_size, key, kernel, kernel_size, prog_data, + prog_data_size, NULL, NULL, 0, 0, &bt); + + *kernel_out = shader->offset; + *((void **)prog_data_out) = shader->prog_data; + + return true; +} + +void +crocus_init_program_cache(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + ice->shaders.cache = + _mesa_hash_table_create(ice, keybox_hash, keybox_equals); + + ice->shaders.cache_bo = + crocus_bo_alloc(screen->bufmgr, "program_cache", 16384); + ice->shaders.cache_bo_map = + crocus_bo_map(NULL, ice->shaders.cache_bo, + MAP_READ | MAP_WRITE | MAP_ASYNC | MAP_PERSISTENT); +} + +void +crocus_destroy_program_cache(struct crocus_context *ice) +{ + for (int i = 0; i < MESA_SHADER_STAGES; i++) { + ice->shaders.prog[i] = NULL; + } + + if (ice->shaders.cache_bo) { + crocus_bo_unmap(ice->shaders.cache_bo); + crocus_bo_unreference(ice->shaders.cache_bo); + ice->shaders.cache_bo_map = NULL; + ice->shaders.cache_bo = NULL; + } + + ralloc_free(ice->shaders.cache); +} + +static const char * +cache_name(enum crocus_program_cache_id cache_id) +{ + if (cache_id == CROCUS_CACHE_BLORP) + return "BLORP"; + + if (cache_id == CROCUS_CACHE_SF) + return "SF"; + + if (cache_id == CROCUS_CACHE_CLIP) + return "CLIP"; + + if (cache_id == CROCUS_CACHE_FF_GS) + return "FF_GS"; + + return _mesa_shader_stage_to_string(cache_id); +} + +void 
+crocus_print_program_cache(struct crocus_context *ice) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + hash_table_foreach(ice->shaders.cache, entry) { + const struct keybox *keybox = entry->key; + struct crocus_compiled_shader *shader = entry->data; + fprintf(stderr, "%s:\n", cache_name(keybox->cache_id)); + brw_disassemble(devinfo, ice->shaders.cache_bo_map + shader->offset, 0, + shader->prog_data->program_size, NULL, stderr); + } +} diff --git a/src/gallium/drivers/crocus/crocus_query.c b/src/gallium/drivers/crocus/crocus_query.c new file mode 100644 index 00000000000..14ba9fbce59 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_query.c @@ -0,0 +1,996 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_query.c + * + * ============================= GENXML CODE ============================= + * [This file is compiled once per generation.] + * ======================================================================= + * + * Query object support. This allows measuring various simple statistics + * via counters on the GPU. We use GenX code for MI_MATH calculations. + */ + +#include <stdio.h> +#include <errno.h> +#include "perf/intel_perf.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_inlines.h" +#include "util/u_upload_mgr.h" +#include "crocus_context.h" +#include "crocus_defines.h" +#include "crocus_fence.h" +#include "crocus_monitor.h" +#include "crocus_resource.h" +#include "crocus_screen.h" + +#include "crocus_genx_macros.h" + +#if GFX_VER == 6 +// TOOD: Add these to genxml? 
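+// Gen6 exposes a single SO stream, so the (n) argument is ignored; the
+// Gen7 path below pulls the per-stream register offsets from genxml.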
+#define SO_PRIM_STORAGE_NEEDED(n) (0x2280) +#define SO_NUM_PRIMS_WRITTEN(n) (0x2288) + +// TODO: remove HS/DS/CS +#define GFX6_IA_VERTICES_COUNT_num 0x2310 +#define GFX6_IA_PRIMITIVES_COUNT_num 0x2318 +#define GFX6_VS_INVOCATION_COUNT_num 0x2320 +#define GFX6_HS_INVOCATION_COUNT_num 0x2300 +#define GFX6_DS_INVOCATION_COUNT_num 0x2308 +#define GFX6_GS_INVOCATION_COUNT_num 0x2328 +#define GFX6_GS_PRIMITIVES_COUNT_num 0x2330 +#define GFX6_CL_INVOCATION_COUNT_num 0x2338 +#define GFX6_CL_PRIMITIVES_COUNT_num 0x2340 +#define GFX6_PS_INVOCATION_COUNT_num 0x2348 +#define GFX6_CS_INVOCATION_COUNT_num 0x2290 +#define GFX6_PS_DEPTH_COUNT_num 0x2350 + +#elif GFX_VER == 7 +#define SO_PRIM_STORAGE_NEEDED(n) (GENX(SO_PRIM_STORAGE_NEEDED0_num) + (n) * 8) +#define SO_NUM_PRIMS_WRITTEN(n) (GENX(SO_NUM_PRIMS_WRITTEN0_num) + (n) * 8) +#endif + +struct crocus_query { + enum pipe_query_type type; + int index; + + bool ready; + + bool stalled; + + uint64_t result; + + struct crocus_state_ref query_state_ref; + struct crocus_query_snapshots *map; + struct crocus_syncobj *syncobj; + + int batch_idx; + + struct crocus_monitor_object *monitor; + + /* Fence for PIPE_QUERY_GPU_FINISHED. */ + struct pipe_fence_handle *fence; +}; + +struct crocus_query_snapshots { + /** crocus_render_condition's saved MI_PREDICATE_RESULT value. */ + uint64_t predicate_result; + + /** Have the start/end snapshots landed? */ + uint64_t snapshots_landed; + + /** Starting and ending counter snapshots */ + uint64_t start; + uint64_t end; +}; + +struct crocus_query_so_overflow { + uint64_t predicate_result; + uint64_t snapshots_landed; + + struct { + uint64_t prim_storage_needed[2]; + uint64_t num_prims[2]; + } stream[4]; +}; + +#if GFX_VERx10 == 75 +static struct mi_value +query_mem64(struct crocus_query *q, uint32_t offset) +{ + return mi_mem64(rw_bo(crocus_resource_bo(q->query_state_ref.res), + q->query_state_ref.offset + offset)); +} +#endif + +/** + * Is this type of query written by PIPE_CONTROL? + */ +static bool +crocus_is_query_pipelined(struct crocus_query *q) +{ + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIMESTAMP_DISJOINT: + case PIPE_QUERY_TIME_ELAPSED: + return true; + + default: + return false; + } +} + +static void +mark_available(struct crocus_context *ice, struct crocus_query *q) +{ +#if GFX_VERx10 == 75 + struct crocus_batch *batch = &ice->batches[q->batch_idx]; + struct crocus_screen *screen = batch->screen; + unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE; + unsigned offset = offsetof(struct crocus_query_snapshots, snapshots_landed); + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); + offset += q->query_state_ref.offset; + + if (!crocus_is_query_pipelined(q)) { + screen->vtbl.store_data_imm64(batch, bo, offset, true); + } else { + /* Order available *after* the query results. */ + flags |= PIPE_CONTROL_FLUSH_ENABLE; + crocus_emit_pipe_control_write(batch, "query: mark available", + flags, bo, offset, true); + } +#endif +} + +/** + * Write PS_DEPTH_COUNT to q->(dest) via a PIPE_CONTROL. 
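+ *
+ * The same helper also handles timestamp writes; what actually gets
+ * written is selected by the PIPE_CONTROL flags the caller passes in.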
+ */ +static void +crocus_pipelined_write(struct crocus_batch *batch, + struct crocus_query *q, + enum pipe_control_flags flags, + unsigned offset) +{ + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); + + crocus_emit_pipe_control_write(batch, "query: pipelined snapshot write", + flags, + bo, offset, 0ull); +} + +static void +write_value(struct crocus_context *ice, struct crocus_query *q, unsigned offset) +{ + struct crocus_batch *batch = &ice->batches[q->batch_idx]; +#if GFX_VER >= 6 + struct crocus_screen *screen = batch->screen; + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); +#endif + + if (!crocus_is_query_pipelined(q)) { + crocus_emit_pipe_control_flush(batch, + "query: non-pipelined snapshot write", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_STALL_AT_SCOREBOARD); + q->stalled = true; + } + + switch (q->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q, + PIPE_CONTROL_WRITE_DEPTH_COUNT | + PIPE_CONTROL_DEPTH_STALL, + offset); + break; + case PIPE_QUERY_TIME_ELAPSED: + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIMESTAMP_DISJOINT: + crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q, + PIPE_CONTROL_WRITE_TIMESTAMP, + offset); + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: +#if GFX_VER >= 6 + screen->vtbl.store_register_mem64(batch, + q->index == 0 ? + GENX(CL_INVOCATION_COUNT_num) : + SO_PRIM_STORAGE_NEEDED(q->index), + bo, offset, false); +#endif + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: +#if GFX_VER >= 6 + screen->vtbl.store_register_mem64(batch, + SO_NUM_PRIMS_WRITTEN(q->index), + bo, offset, false); +#endif + break; + case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: { +#if GFX_VER >= 6 + static const uint32_t index_to_reg[] = { + GENX(IA_VERTICES_COUNT_num), + GENX(IA_PRIMITIVES_COUNT_num), + GENX(VS_INVOCATION_COUNT_num), + GENX(GS_INVOCATION_COUNT_num), + GENX(GS_PRIMITIVES_COUNT_num), + GENX(CL_INVOCATION_COUNT_num), + GENX(CL_PRIMITIVES_COUNT_num), + GENX(PS_INVOCATION_COUNT_num), + GENX(HS_INVOCATION_COUNT_num), + GENX(DS_INVOCATION_COUNT_num), + GENX(CS_INVOCATION_COUNT_num), + }; + uint32_t reg = index_to_reg[q->index]; + +#if GFX_VER == 6 + /* Gfx6 GS code counts full primitives, that is, it won't count individual + * triangles in a triangle strip. Use CL_INVOCATION_COUNT for that. + */ + if (q->index == PIPE_STAT_QUERY_GS_PRIMITIVES) + reg = GENX(CL_INVOCATION_COUNT_num); +#endif + + screen->vtbl.store_register_mem64(batch, reg, bo, offset, false); +#endif + break; + } + default: + assert(false); + } +} + +#if GFX_VER >= 6 +static void +write_overflow_values(struct crocus_context *ice, struct crocus_query *q, bool end) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_screen *screen = batch->screen; + uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 
1 : 4; + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); + uint32_t offset = q->query_state_ref.offset; + crocus_emit_pipe_control_flush(batch, + "query: write SO overflow snapshots", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_STALL_AT_SCOREBOARD); + for (uint32_t i = 0; i < count; i++) { + int s = q->index + i; + int g_idx = offset + offsetof(struct crocus_query_so_overflow, + stream[s].num_prims[end]); + int w_idx = offset + offsetof(struct crocus_query_so_overflow, + stream[s].prim_storage_needed[end]); + screen->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s), + bo, g_idx, false); + screen->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s), + bo, w_idx, false); + } +} +#endif +static uint64_t +crocus_raw_timestamp_delta(uint64_t time0, uint64_t time1) +{ + if (time0 > time1) { + return (1ULL << TIMESTAMP_BITS) + time1 - time0; + } else { + return time1 - time0; + } +} + +static bool +stream_overflowed(struct crocus_query_so_overflow *so, int s) +{ + return (so->stream[s].prim_storage_needed[1] - + so->stream[s].prim_storage_needed[0]) != + (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]); +} + +static void +calculate_result_on_cpu(const struct intel_device_info *devinfo, + struct crocus_query *q) +{ + switch (q->type) { + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + q->result = q->map->end != q->map->start; + break; + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* The timestamp is the single starting snapshot. */ + q->result = intel_device_info_timebase_scale(devinfo, q->map->start); + q->result &= (1ull << TIMESTAMP_BITS) - 1; + break; + case PIPE_QUERY_TIME_ELAPSED: + q->result = crocus_raw_timestamp_delta(q->map->start, q->map->end); + q->result = intel_device_info_timebase_scale(devinfo, q->result); + q->result &= (1ull << TIMESTAMP_BITS) - 1; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + q->result = stream_overflowed((void *) q->map, q->index); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + q->result = false; + for (int i = 0; i < MAX_VERTEX_STREAMS; i++) + q->result |= stream_overflowed((void *) q->map, i); + break; + case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: + q->result = q->map->end - q->map->start; + + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if (GFX_VER == 7 && devinfo->is_haswell && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS) + q->result /= 4; + break; + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + default: + q->result = q->map->end - q->map->start; + break; + } + + q->ready = true; +} + +#if GFX_VERx10 == 75 +/** + * Calculate the streamout overflow for stream \p idx: + * + * (num_prims[1] - num_prims[0]) - (storage_needed[1] - storage_needed[0]) + */ +static struct mi_value +calc_overflow_for_stream(struct mi_builder *b, + struct crocus_query *q, + int idx) +{ +#define C(counter, i) query_mem64(q, \ + offsetof(struct crocus_query_so_overflow, stream[idx].counter[i])) + + return mi_isub(b, mi_isub(b, C(num_prims, 1), C(num_prims, 0)), + mi_isub(b, C(prim_storage_needed, 1), + C(prim_storage_needed, 0))); +#undef C +} + +/** + * Calculate whether any stream has overflowed. 
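+ *
+ * This is just an OR-reduction (mi_ior) of calc_overflow_for_stream()
+ * over all MAX_VERTEX_STREAMS streams, i.e. roughly:
+ *
+ *    result = 0;
+ *    for (s = 0; s < MAX_VERTEX_STREAMS; s++)
+ *       result |= (num_prims_delta(s) - storage_needed_delta(s));
+ *
+ * so the value is non-zero iff at least one stream dropped primitives.
+ * (num_prims_delta/storage_needed_delta are shorthands for the start/end
+ * differences read back via query_mem64() above.)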
+ */ +static struct mi_value +calc_overflow_any_stream(struct mi_builder *b, struct crocus_query *q) +{ + struct mi_value stream_result[MAX_VERTEX_STREAMS]; + for (int i = 0; i < MAX_VERTEX_STREAMS; i++) + stream_result[i] = calc_overflow_for_stream(b, q, i); + + struct mi_value result = stream_result[0]; + for (int i = 1; i < MAX_VERTEX_STREAMS; i++) + result = mi_ior(b, result, stream_result[i]); + + return result; +} + + +static bool +query_is_boolean(enum pipe_query_type type) +{ + switch (type) { + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + return true; + default: + return false; + } +} + +/** + * Calculate the result using MI_MATH. + */ +static struct mi_value +calculate_result_on_gpu(const struct intel_device_info *devinfo, + struct mi_builder *b, + struct crocus_query *q) +{ + struct mi_value result; + struct mi_value start_val = + query_mem64(q, offsetof(struct crocus_query_snapshots, start)); + struct mi_value end_val = + query_mem64(q, offsetof(struct crocus_query_snapshots, end)); + + switch (q->type) { + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result = calc_overflow_for_stream(b, q, q->index); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + result = calc_overflow_any_stream(b, q); + break; + case PIPE_QUERY_TIMESTAMP: { + /* TODO: This discards any fractional bits of the timebase scale. + * We would need to do a bit of fixed point math on the CS ALU, or + * launch an actual shader to calculate this with full precision. + */ + uint32_t scale = 1000000000ull / devinfo->timestamp_frequency; + result = mi_iand(b, mi_imm((1ull << 36) - 1), + mi_imul_imm(b, start_val, scale)); + break; + } + case PIPE_QUERY_TIME_ELAPSED: { + /* TODO: This discards fractional bits (see above). 
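+ * For example, a hypothetical 19.2 MHz timestamp clock would give
+ * scale = 1000000000 / 19200000 = 52 here instead of 52.083..., roughly
+ * a 0.16% error over long intervals.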
*/ + uint32_t scale = 1000000000ull / devinfo->timestamp_frequency; + result = mi_imul_imm(b, mi_isub(b, end_val, start_val), scale); + break; + } + default: + result = mi_isub(b, end_val, start_val); + break; + } + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if (GFX_VER == 7 && devinfo->is_haswell && + q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE && + q->index == PIPE_STAT_QUERY_PS_INVOCATIONS) + result = mi_ushr32_imm(b, result, 2); + + if (query_is_boolean(q->type)) + result = mi_iand(b, mi_nz(b, result), mi_imm(1)); + + return result; +} +#endif + +static struct pipe_query * +crocus_create_query(struct pipe_context *ctx, + unsigned query_type, + unsigned index) +{ + struct crocus_query *q = calloc(1, sizeof(struct crocus_query)); + + q->type = query_type; + q->index = index; + q->monitor = NULL; + + if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE && + q->index == PIPE_STAT_QUERY_CS_INVOCATIONS) + q->batch_idx = CROCUS_BATCH_COMPUTE; + else + q->batch_idx = CROCUS_BATCH_RENDER; + return (struct pipe_query *) q; +} + +static struct pipe_query * +crocus_create_batch_query(struct pipe_context *ctx, + unsigned num_queries, + unsigned *query_types) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = calloc(1, sizeof(struct crocus_query)); + if (unlikely(!q)) + return NULL; + q->type = PIPE_QUERY_DRIVER_SPECIFIC; + q->index = -1; + q->monitor = crocus_create_monitor_object(ice, num_queries, query_types); + if (unlikely(!q->monitor)) { + free(q); + return NULL; + } + + return (struct pipe_query *) q; +} + +static void +crocus_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query) +{ + struct crocus_query *query = (void *) p_query; + struct crocus_screen *screen = (void *) ctx->screen; + if (query->monitor) { + crocus_destroy_monitor_object(ctx, query->monitor); + query->monitor = NULL; + } else { + crocus_syncobj_reference(screen, &query->syncobj, NULL); + screen->base.fence_reference(ctx->screen, &query->fence, NULL); + } + free(query); +} + + +static bool +crocus_begin_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + + if (q->monitor) + return crocus_begin_monitor(ctx, q->monitor); + + void *ptr = NULL; + uint32_t size; + + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) + size = sizeof(struct crocus_query_so_overflow); + else + size = sizeof(struct crocus_query_snapshots); + + u_upload_alloc(ice->query_buffer_uploader, 0, + size, size, &q->query_state_ref.offset, + &q->query_state_ref.res, &ptr); + + if (!crocus_resource_bo(q->query_state_ref.res)) + return false; + + q->map = ptr; + if (!q->map) + return false; + + q->result = 0ull; + q->ready = false; + WRITE_ONCE(q->map->snapshots_landed, false); + + if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) { + ice->state.prims_generated_query_active = true; + ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP; + } + +#if GFX_VER <= 5 + if (q->type == PIPE_QUERY_OCCLUSION_COUNTER || + q->type == PIPE_QUERY_OCCLUSION_PREDICATE) { + ice->state.stats_wm++; + ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE; + } +#endif +#if GFX_VER >= 6 + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) + write_overflow_values(ice, q, false); + else +#endif + write_value(ice, q, + q->query_state_ref.offset + + offsetof(struct crocus_query_snapshots, start)); + + return 
true; +} + +static bool +crocus_end_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + + if (q->monitor) + return crocus_end_monitor(ctx, q->monitor); + + if (q->type == PIPE_QUERY_GPU_FINISHED) { + ctx->flush(ctx, &q->fence, PIPE_FLUSH_DEFERRED); + return true; + } + + struct crocus_batch *batch = &ice->batches[q->batch_idx]; + + if (q->type == PIPE_QUERY_TIMESTAMP) { + crocus_begin_query(ctx, query); + crocus_batch_reference_signal_syncobj(batch, &q->syncobj); + mark_available(ice, q); + return true; + } + +#if GFX_VER <= 5 + if (q->type == PIPE_QUERY_OCCLUSION_COUNTER || + q->type == PIPE_QUERY_OCCLUSION_PREDICATE) { + ice->state.stats_wm--; + ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE; + } +#endif + if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) { + ice->state.prims_generated_query_active = false; + ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP; + } + +#if GFX_VER >= 6 + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) + write_overflow_values(ice, q, true); + else +#endif + write_value(ice, q, + q->query_state_ref.offset + + offsetof(struct crocus_query_snapshots, end)); + + crocus_batch_reference_signal_syncobj(batch, &q->syncobj); + mark_available(ice, q); + + return true; +} + +/** + * See if the snapshots have landed for a query, and if so, compute the + * result and mark it ready. Does not flush (unlike crocus_get_query_result). + */ +static void +crocus_check_query_no_flush(struct crocus_context *ice, struct crocus_query *q) +{ + struct crocus_screen *screen = (void *) ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (!q->ready && READ_ONCE(q->map->snapshots_landed)) { + calculate_result_on_cpu(devinfo, q); + } +} + +static bool +crocus_get_query_result(struct pipe_context *ctx, + struct pipe_query *query, + bool wait, + union pipe_query_result *result) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + + if (q->monitor) + return crocus_get_monitor_result(ctx, q->monitor, wait, result->batch); + + struct crocus_screen *screen = (void *) ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; + + if (unlikely(screen->no_hw)) { + result->u64 = 0; + return true; + } + + if (!q->ready) { + struct crocus_batch *batch = &ice->batches[q->batch_idx]; + if (q->syncobj == crocus_batch_get_signal_syncobj(batch)) + crocus_batch_flush(batch); + +#if GFX_VERx10 == 75 + while (!READ_ONCE(q->map->snapshots_landed)) { + if (wait) + crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX); + else + return false; + } + assert(READ_ONCE(q->map->snapshots_landed)); +#else + if (wait) + crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX); +#endif + calculate_result_on_cpu(devinfo, q); + } + + assert(q->ready); + + result->u64 = q->result; + + return true; +} + +#if GFX_VER == 7 +static void +crocus_get_query_result_resource(struct pipe_context *ctx, + struct pipe_query *query, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *p_res, + unsigned offset) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + struct crocus_batch *batch = &ice->batches[q->batch_idx]; + struct crocus_screen *screen = batch->screen; + const struct intel_device_info *devinfo = &batch->screen->devinfo; + struct crocus_resource *res = (void 
*) p_res; + struct crocus_bo *query_bo = crocus_resource_bo(q->query_state_ref.res); + struct crocus_bo *dst_bo = crocus_resource_bo(p_res); + unsigned snapshots_landed_offset = + offsetof(struct crocus_query_snapshots, snapshots_landed); + + res->bind_history |= PIPE_BIND_QUERY_BUFFER; + + if (index == -1) { + /* They're asking for the availability of the result. If we still + * have commands queued up which produce the result, submit them + * now so that progress happens. Either way, copy the snapshots + * landed field to the destination resource. + */ + if (q->syncobj == crocus_batch_get_signal_syncobj(batch)) + crocus_batch_flush(batch); + + screen->vtbl.copy_mem_mem(batch, dst_bo, offset, + query_bo, snapshots_landed_offset, + result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8); + return; + } + + if (!q->ready && READ_ONCE(q->map->snapshots_landed)) { + /* The final snapshots happen to have landed, so let's just compute + * the result on the CPU now... + */ + calculate_result_on_cpu(devinfo, q); + } + + if (q->ready) { + /* We happen to have the result on the CPU, so just copy it. */ + if (result_type <= PIPE_QUERY_TYPE_U32) { + screen->vtbl.store_data_imm32(batch, dst_bo, offset, q->result); + } else { + screen->vtbl.store_data_imm64(batch, dst_bo, offset, q->result); + } + + /* Make sure the result lands before they use bind the QBO elsewhere + * and use the result. + */ + // XXX: Why? i965 doesn't do this. + crocus_emit_pipe_control_flush(batch, + "query: unknown QBO flushing hack", + PIPE_CONTROL_CS_STALL); + return; + } + +#if GFX_VERx10 == 75 + bool predicated = !wait && !q->stalled; + + struct mi_builder b; + mi_builder_init(&b, &batch->screen->devinfo, batch); + + struct mi_value result = calculate_result_on_gpu(devinfo, &b, q); + struct mi_value dst = + result_type <= PIPE_QUERY_TYPE_U32 ? 
mi_mem32(rw_bo(dst_bo, offset)) + : mi_mem64(rw_bo(dst_bo, offset)); + + if (predicated) { + mi_store(&b, mi_reg32(MI_PREDICATE_RESULT), + mi_mem64(ro_bo(query_bo, snapshots_landed_offset))); + mi_store_if(&b, dst, result); + } else { + mi_store(&b, dst, result); + } +#endif +} +#endif + +static void +crocus_set_active_query_state(struct pipe_context *ctx, bool enable) +{ + struct crocus_context *ice = (void *) ctx; + + if (ice->state.statistics_counters_enabled == enable) + return; + + // XXX: most packets aren't paying attention to this yet, because it'd + // have to be done dynamically at draw time, which is a pain + ice->state.statistics_counters_enabled = enable; + ice->state.dirty |= CROCUS_DIRTY_CLIP | + CROCUS_DIRTY_RASTER | + CROCUS_DIRTY_STREAMOUT | + CROCUS_DIRTY_WM; + ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS | + CROCUS_STAGE_DIRTY_TCS | + CROCUS_STAGE_DIRTY_TES | + CROCUS_STAGE_DIRTY_VS; +} + +static void +set_predicate_enable(struct crocus_context *ice, bool value) +{ + if (value) + ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER; + else + ice->state.predicate = CROCUS_PREDICATE_STATE_DONT_RENDER; +} + +#if GFX_VER == 7 +static void +set_predicate_for_result(struct crocus_context *ice, + struct crocus_query *q, + bool inverted) +{ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res); + +#if GFX_VERx10 != 75 + /* IVB doesn't have enough MI for this */ + if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { + ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY; + return; + } +#endif + + /* The CPU doesn't have the query result yet; use hardware predication */ + ice->state.predicate = CROCUS_PREDICATE_STATE_USE_BIT; + + /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */ + crocus_emit_pipe_control_flush(batch, + "conditional rendering: set predicate", + PIPE_CONTROL_FLUSH_ENABLE); + q->stalled = true; + +#if GFX_VERx10 != 75 + struct crocus_screen *screen = batch->screen; + screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo, + q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, start)); + screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo, + q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, end)); + + uint32_t mi_predicate = MI_PREDICATE | MI_PREDICATE_COMBINEOP_SET | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL; + if (inverted) + mi_predicate |= MI_PREDICATE_LOADOP_LOAD; + else + mi_predicate |= MI_PREDICATE_LOADOP_LOADINV; + crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t)); +#else + struct mi_builder b; + mi_builder_init(&b, &batch->screen->devinfo, batch); + + struct mi_value result; + + switch (q->type) { + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result = calc_overflow_for_stream(&b, q, q->index); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + result = calc_overflow_any_stream(&b, q); + break; + default: { + /* PIPE_QUERY_OCCLUSION_* */ + struct mi_value start = + query_mem64(q, offsetof(struct crocus_query_snapshots, start)); + struct mi_value end = + query_mem64(q, offsetof(struct crocus_query_snapshots, end)); + result = mi_isub(&b, end, start); + break; + } + } + + result = inverted ? mi_z(&b, result) : mi_nz(&b, result); + result = mi_iand(&b, result, mi_imm(1)); + + /* We immediately set the predicate on the render batch, as all the + * counters come from 3D operations. 
However, we may need to predicate + * a compute dispatch, which executes in a different GEM context and has + * a different MI_PREDICATE_RESULT register. So, we save the result to + * memory and reload it in crocus_launch_grid. + */ + mi_value_ref(&b, result); + + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), result); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + + unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV | + MI_PREDICATE_COMBINEOP_SET | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL; + + crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t)); + mi_store(&b, query_mem64(q, offsetof(struct crocus_query_snapshots, + predicate_result)), result); +#endif + ice->state.compute_predicate = bo; +} +#endif + +static void +crocus_render_condition(struct pipe_context *ctx, + struct pipe_query *query, + bool condition, + enum pipe_render_cond_flag mode) +{ + struct crocus_context *ice = (void *) ctx; + struct crocus_query *q = (void *) query; + + /* The old condition isn't relevant; we'll update it if necessary */ + ice->state.compute_predicate = NULL; + ice->condition.query = q; + ice->condition.condition = condition; + ice->condition.mode = mode; + + if (!q) { + ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER; + return; + } + + crocus_check_query_no_flush(ice, q); + + if (q->result || q->ready) { + set_predicate_enable(ice, (q->result != 0) ^ condition); + } else { + if (mode == PIPE_RENDER_COND_NO_WAIT || + mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) { + perf_debug(&ice->dbg, "Conditional rendering demoted from " + "\"no wait\" to \"wait\"."); + } +#if GFX_VER == 7 + set_predicate_for_result(ice, q, condition); +#else + ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY; +#endif + } +} + +static void +crocus_resolve_conditional_render(struct crocus_context *ice) +{ + struct pipe_context *ctx = (void *) ice; + struct crocus_query *q = ice->condition.query; + struct pipe_query *query = (void *) q; + union pipe_query_result result; + + if (ice->state.predicate != CROCUS_PREDICATE_STATE_USE_BIT) + return; + + assert(q); + + crocus_get_query_result(ctx, query, true, &result); + set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition); +} + +#if GFX_VER >= 7 +static void +crocus_emit_compute_predicate(struct crocus_batch *batch) +{ + struct crocus_context *ice = batch->ice; + struct crocus_screen *screen = batch->screen; + screen->vtbl.load_register_mem32(batch, MI_PREDICATE_SRC0, + ice->state.compute_predicate, 0); + screen->vtbl.load_register_imm32(batch, MI_PREDICATE_SRC1, 0); + unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV | + MI_PREDICATE_COMBINEOP_SET | + MI_PREDICATE_COMPAREOP_SRCS_EQUAL; + + crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t)); +} +#endif + +void +genX(init_screen_query)(struct crocus_screen *screen) +{ + screen->vtbl.resolve_conditional_render = crocus_resolve_conditional_render; +#if GFX_VER >= 7 + screen->vtbl.emit_compute_predicate = crocus_emit_compute_predicate; +#endif +} + +void +genX(init_query)(struct crocus_context *ice) +{ + struct pipe_context *ctx = &ice->ctx; + + ctx->create_query = crocus_create_query; + ctx->create_batch_query = crocus_create_batch_query; + ctx->destroy_query = crocus_destroy_query; + ctx->begin_query = crocus_begin_query; + ctx->end_query = crocus_end_query; + ctx->get_query_result = crocus_get_query_result; +#if GFX_VER == 7 + ctx->get_query_result_resource = crocus_get_query_result_resource; +#endif + ctx->set_active_query_state = crocus_set_active_query_state; + 
ctx->render_condition = crocus_render_condition; + +} diff --git a/src/gallium/drivers/crocus/crocus_resolve.c b/src/gallium/drivers/crocus/crocus_resolve.c new file mode 100644 index 00000000000..a38eb4a94a7 --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_resolve.c @@ -0,0 +1,1061 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_resolve.c + * + * This file handles resolve tracking for main and auxiliary surfaces. + * + * It also handles our cache tracking. We have sets for the render cache, + * depth cache, and so on. If a BO is in a cache's set, then it may have + * data in that cache. The helpers take care of emitting flushes for + * render-to-texture, format reinterpretation issues, and other situations. + */ + +#include "util/hash_table.h" +#include "util/set.h" +#include "crocus_context.h" +#include "compiler/nir/nir.h" + +#define FILE_DEBUG_FLAG DEBUG_BLORP + +static void +crocus_update_stencil_shadow(struct crocus_context *ice, + struct crocus_resource *res); +/** + * Disable auxiliary buffers if a renderbuffer is also bound as a texture + * or shader image. This causes a self-dependency, where both rendering + * and sampling may concurrently read or write the CCS buffer, causing + * incorrect pixels. + */ +static bool +disable_rb_aux_buffer(struct crocus_context *ice, + bool *draw_aux_buffer_disabled, + struct crocus_resource *tex_res, + unsigned min_level, unsigned num_levels, + const char *usage) +{ + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + bool found = false; + + /* We only need to worry about fast clears. */ + if (tex_res->aux.usage != ISL_AUX_USAGE_CCS_D) + return false; + + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + struct crocus_surface *surf = (void *) cso_fb->cbufs[i]; + if (!surf) + continue; + + struct crocus_resource *rb_res = (void *) surf->base.texture; + + if (rb_res->bo == tex_res->bo && + surf->base.u.tex.level >= min_level && + surf->base.u.tex.level < min_level + num_levels) { + found = draw_aux_buffer_disabled[i] = true; + } + } + + if (found) { + perf_debug(&ice->dbg, + "Disabling CCS because a renderbuffer is also bound %s.\n", + usage); + } + + return found; +} + +static void +resolve_sampler_views(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_shader_state *shs, + const struct shader_info *info, + bool *draw_aux_buffer_disabled, + bool consider_framebuffer) +{ + uint32_t views = info ? 
(shs->bound_sampler_views & info->textures_used[0]) : 0; + + while (views) { + const int i = u_bit_scan(&views); + struct crocus_sampler_view *isv = shs->textures[i]; + + if (isv->res->base.target != PIPE_BUFFER) { + if (consider_framebuffer) { + disable_rb_aux_buffer(ice, draw_aux_buffer_disabled, isv->res, + isv->view.base_level, isv->view.levels, + "for sampling"); + } + + crocus_resource_prepare_texture(ice, isv->res, isv->view.format, + isv->view.base_level, isv->view.levels, + isv->view.base_array_layer, + isv->view.array_len); + } + + crocus_cache_flush_for_read(batch, isv->res->bo); + + if (batch->screen->devinfo.ver >= 7 && + (isv->base.format == PIPE_FORMAT_X24S8_UINT || + isv->base.format == PIPE_FORMAT_X32_S8X24_UINT || + isv->base.format == PIPE_FORMAT_S8_UINT)) { + struct crocus_resource *zres, *sres; + crocus_get_depth_stencil_resources(&batch->screen->devinfo, isv->base.texture, &zres, &sres); + crocus_update_stencil_shadow(ice, sres); + crocus_cache_flush_for_read(batch, sres->shadow->bo); + } + } +} + +static void +resolve_image_views(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_shader_state *shs, + bool *draw_aux_buffer_disabled, + bool consider_framebuffer) +{ + /* TODO: Consider images used by program */ + uint32_t views = shs->bound_image_views; + + while (views) { + const int i = u_bit_scan(&views); + struct pipe_image_view *pview = &shs->image[i].base; + struct crocus_resource *res = (void *) pview->resource; + + if (res->base.target != PIPE_BUFFER) { + if (consider_framebuffer) { + disable_rb_aux_buffer(ice, draw_aux_buffer_disabled, + res, pview->u.tex.level, 1, + "as a shader image"); + } + + unsigned num_layers = + pview->u.tex.last_layer - pview->u.tex.first_layer + 1; + + /* The data port doesn't understand any compression */ + crocus_resource_prepare_access(ice, res, + pview->u.tex.level, 1, + pview->u.tex.first_layer, num_layers, + ISL_AUX_USAGE_NONE, false); + } + + crocus_cache_flush_for_read(batch, res->bo); + } +} + +static void +crocus_update_align_res(struct crocus_batch *batch, + struct crocus_surface *surf, + bool copy_to_wa) +{ + struct crocus_screen *screen = (struct crocus_screen *)batch->screen; + struct pipe_blit_info info = { 0 }; + + info.src.resource = copy_to_wa ? surf->base.texture : surf->align_res; + info.src.level = copy_to_wa ? surf->base.u.tex.level : 0; + u_box_2d_zslice(0, 0, copy_to_wa ? surf->base.u.tex.first_layer : 0, + u_minify(surf->base.texture->width0, surf->base.u.tex.level), + u_minify(surf->base.texture->height0, surf->base.u.tex.level), &info.src.box); + info.src.format = surf->base.texture->format; + info.dst.resource = copy_to_wa ? surf->align_res : surf->base.texture; + info.dst.level = copy_to_wa ? 0 : surf->base.u.tex.level; + info.dst.box = info.src.box; + info.dst.box.z = copy_to_wa ? 0 : surf->base.u.tex.first_layer; + info.dst.format = surf->base.texture->format; + info.mask = util_format_is_depth_or_stencil(surf->base.texture->format) ? PIPE_MASK_ZS : PIPE_MASK_RGBA; + info.filter = 0; + if (!screen->vtbl.blit_blt(batch, &info)) { + assert(0); + } +} + +/** + * \brief Resolve buffers before drawing. + * + * Resolve the depth buffer's HiZ buffer, resolve the depth buffer of each + * enabled depth texture, and flush the render cache for any dirty textures. 
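+ *
+ * In practice this walks the stage's bound sampler and image views, and
+ * only when that stage's bindings are dirty (see the
+ * CROCUS_STAGE_DIRTY_BINDINGS_* check below); the framebuffer itself is
+ * handled separately by crocus_predraw_resolve_framebuffer().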
+ */ +void +crocus_predraw_resolve_inputs(struct crocus_context *ice, + struct crocus_batch *batch, + bool *draw_aux_buffer_disabled, + gl_shader_stage stage, + bool consider_framebuffer) +{ + struct crocus_shader_state *shs = &ice->state.shaders[stage]; + const struct shader_info *info = crocus_get_shader_info(ice, stage); + + uint64_t stage_dirty = (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage) | + (consider_framebuffer ? CROCUS_STAGE_DIRTY_BINDINGS_FS : 0); + + if (ice->state.stage_dirty & stage_dirty) { + resolve_sampler_views(ice, batch, shs, info, draw_aux_buffer_disabled, + consider_framebuffer); + resolve_image_views(ice, batch, shs, draw_aux_buffer_disabled, + consider_framebuffer); + } +} + +void +crocus_predraw_resolve_framebuffer(struct crocus_context *ice, + struct crocus_batch *batch, + bool *draw_aux_buffer_disabled) +{ + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + struct crocus_screen *screen = (void *) ice->ctx.screen; + struct intel_device_info *devinfo = &screen->devinfo; + struct crocus_uncompiled_shader *ish = + ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]; + const nir_shader *nir = ish->nir; + + if (ice->state.dirty & CROCUS_DIRTY_DEPTH_BUFFER) { + struct pipe_surface *zs_surf = cso_fb->zsbuf; + + if (zs_surf) { + struct crocus_resource *z_res, *s_res; + crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res); + unsigned num_layers = + zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1; + + if (z_res) { + crocus_resource_prepare_render(ice, z_res, + zs_surf->u.tex.level, + zs_surf->u.tex.first_layer, + num_layers, ice->state.hiz_usage); + crocus_cache_flush_for_depth(batch, z_res->bo); + + if (((struct crocus_surface *)zs_surf)->align_res) { + crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, true); + } + } + + if (s_res) { + crocus_cache_flush_for_depth(batch, s_res->bo); + } + } + } + + if (nir->info.outputs_read != 0) { + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + if (cso_fb->cbufs[i]) { + struct crocus_surface *surf = (void *) cso_fb->cbufs[i]; + struct crocus_resource *res = (void *) cso_fb->cbufs[i]->texture; + + crocus_resource_prepare_texture(ice, res, surf->view.format, + surf->view.base_level, 1, + surf->view.base_array_layer, + surf->view.array_len); + } + } + } + + if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS) { + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + struct crocus_surface *surf = (void *) cso_fb->cbufs[i]; + if (!surf) + continue; + + struct crocus_resource *res = (void *) surf->base.texture; + + if (surf->align_res) + crocus_update_align_res(batch, surf, true); + + enum isl_aux_usage aux_usage = + crocus_resource_render_aux_usage(ice, res, surf->view.format, + ice->state.blend_enables & (1u << i), + draw_aux_buffer_disabled[i]); + + if (ice->state.draw_aux_usage[i] != aux_usage) { + ice->state.draw_aux_usage[i] = aux_usage; + /* XXX: Need to track which bindings to make dirty */ + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS; + } + + crocus_resource_prepare_render(ice, res, surf->view.base_level, + surf->view.base_array_layer, + surf->view.array_len, + aux_usage); + + crocus_cache_flush_for_render(batch, res->bo, surf->view.format, + aux_usage); + } + } +} + +/** + * \brief Call this after drawing to mark which buffers need resolving + * + * If the depth buffer was written to and if it has an accompanying HiZ + * buffer, then mark that it needs a depth resolve. 
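+ * (This marking only happens when depth/stencil writes were actually
+ * enabled for the draw; see the checks below.)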
+ * + * If the color buffer is a multisample window system buffer, then + * mark that it needs a downsample. + * + * Also mark any render targets which will be textured as needing a render + * cache flush. + */ +void +crocus_postdraw_update_resolve_tracking(struct crocus_context *ice, + struct crocus_batch *batch) +{ + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + struct crocus_screen *screen = (void *) ice->ctx.screen; + struct intel_device_info *devinfo = &screen->devinfo; + // XXX: front buffer drawing? + + bool may_have_resolved_depth = + ice->state.dirty & (CROCUS_DIRTY_DEPTH_BUFFER | + CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL); + + struct pipe_surface *zs_surf = cso_fb->zsbuf; + if (zs_surf) { + struct crocus_resource *z_res, *s_res; + crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res); + unsigned num_layers = + zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1; + + if (z_res) { + if (may_have_resolved_depth && ice->state.depth_writes_enabled) { + crocus_resource_finish_render(ice, z_res, zs_surf->u.tex.level, + zs_surf->u.tex.first_layer, num_layers, + ice->state.hiz_usage); + } + + if (ice->state.depth_writes_enabled) + crocus_depth_cache_add_bo(batch, z_res->bo); + + if (((struct crocus_surface *)zs_surf)->align_res) { + crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, false); + } + } + + if (s_res) { + if (may_have_resolved_depth && ice->state.stencil_writes_enabled) { + crocus_resource_finish_write(ice, s_res, zs_surf->u.tex.level, + zs_surf->u.tex.first_layer, num_layers, + s_res->aux.usage); + } + + if (ice->state.stencil_writes_enabled) + crocus_depth_cache_add_bo(batch, s_res->bo); + } + } + + bool may_have_resolved_color = + ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS; + + for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) { + struct crocus_surface *surf = (void *) cso_fb->cbufs[i]; + if (!surf) + continue; + + if (surf->align_res) + crocus_update_align_res(batch, surf, false); + struct crocus_resource *res = (void *) surf->base.texture; + enum isl_aux_usage aux_usage = ice->state.draw_aux_usage[i]; + + crocus_render_cache_add_bo(batch, res->bo, surf->view.format, + aux_usage); + + if (may_have_resolved_color) { + union pipe_surface_desc *desc = &surf->base.u; + unsigned num_layers = + desc->tex.last_layer - desc->tex.first_layer + 1; + crocus_resource_finish_render(ice, res, desc->tex.level, + desc->tex.first_layer, num_layers, + aux_usage); + } + } +} + +/** + * Clear the cache-tracking sets. + */ +void +crocus_cache_sets_clear(struct crocus_batch *batch) +{ + hash_table_foreach(batch->cache.render, render_entry) + _mesa_hash_table_remove(batch->cache.render, render_entry); + + set_foreach(batch->cache.depth, depth_entry) + _mesa_set_remove(batch->cache.depth, depth_entry); +} + +/** + * Emits an appropriate flush for a BO if it has been rendered to within the + * same batchbuffer as a read that's about to be emitted. + * + * The GPU has separate, incoherent caches for the render cache and the + * sampler cache, along with other caches. Usually data in the different + * caches don't interact (e.g. we don't render to our driver-generated + * immediate constant data), but for render-to-texture in FBOs we definitely + * do. When a batchbuffer is flushed, the kernel will ensure that everything + * necessary is flushed before another use of that BO, but for reuse from + * different caches within a batchbuffer, it's all our responsibility. 
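+ *
+ * As a rough sketch of the pattern: crocus_render_cache_add_bo() and
+ * crocus_depth_cache_add_bo() record writes, and before a read the
+ * caller does
+ *
+ *    crocus_cache_flush_for_read(batch, bo);
+ *
+ * which flushes and invalidates only if the BO appears in either set.
+ * The sets are cleared after each flush, so a BO costs at most one
+ * flush per batch between writes.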
+ */ +void +crocus_flush_depth_and_render_caches(struct crocus_batch *batch) +{ + const struct intel_device_info *devinfo = &batch->screen->devinfo; + if (devinfo->ver >= 6) { + crocus_emit_pipe_control_flush(batch, + "cache tracker: render-to-texture", + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_CS_STALL); + + crocus_emit_pipe_control_flush(batch, + "cache tracker: render-to-texture", + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE); + } else { + crocus_emit_mi_flush(batch); + } + + crocus_cache_sets_clear(batch); +} + +void +crocus_cache_flush_for_read(struct crocus_batch *batch, + struct crocus_bo *bo) +{ + if (_mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo) || + _mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo)) + crocus_flush_depth_and_render_caches(batch); +} + +static void * +format_aux_tuple(enum isl_format format, enum isl_aux_usage aux_usage) +{ + return (void *)(uintptr_t)((uint32_t)format << 8 | aux_usage); +} + +void +crocus_cache_flush_for_render(struct crocus_batch *batch, + struct crocus_bo *bo, + enum isl_format format, + enum isl_aux_usage aux_usage) +{ + if (_mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo)) + crocus_flush_depth_and_render_caches(batch); + + /* Check to see if this bo has been used by a previous rendering operation + * but with a different format or aux usage. If it has, flush the render + * cache so we ensure that it's only in there with one format or aux usage + * at a time. + * + * Even though it's not obvious, this can easily happen in practice. + * Suppose a client is blending on a surface with sRGB encode enabled on + * gen9. This implies that you get AUX_USAGE_CCS_D at best. If the client + * then disables sRGB decode and continues blending we will flip on + * AUX_USAGE_CCS_E without doing any sort of resolve in-between (this is + * perfectly valid since CCS_E is a subset of CCS_D). However, this means + * that we have fragments in-flight which are rendering with UNORM+CCS_E + * and other fragments in-flight with SRGB+CCS_D on the same surface at the + * same time and the pixel scoreboard and color blender are trying to sort + * it all out. This ends badly (i.e. GPU hangs). + * + * To date, we have never observed GPU hangs or even corruption to be + * associated with switching the format, only the aux usage. However, + * there are comments in various docs which indicate that the render cache + * isn't 100% resilient to format changes. We may as well be conservative + * and flush on format changes too. We can always relax this later if we + * find it to be a performance problem. + */ + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo); + if (entry && entry->data != format_aux_tuple(format, aux_usage)) + crocus_flush_depth_and_render_caches(batch); +} + +void +crocus_render_cache_add_bo(struct crocus_batch *batch, + struct crocus_bo *bo, + enum isl_format format, + enum isl_aux_usage aux_usage) +{ +#ifndef NDEBUG + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo); + if (entry) { + /* Otherwise, someone didn't do a flush_for_render and that would be + * very bad indeed. 
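+ * (i.e. the BO is already tracked with a different format/aux-usage
+ * tuple; crocus_cache_flush_for_render() would have flushed and
+ * removed it first.)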
+ */ + assert(entry->data == format_aux_tuple(format, aux_usage)); + } +#endif + + _mesa_hash_table_insert_pre_hashed(batch->cache.render, bo->hash, bo, + format_aux_tuple(format, aux_usage)); +} + +void +crocus_cache_flush_for_depth(struct crocus_batch *batch, + struct crocus_bo *bo) +{ + if (_mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo)) + crocus_flush_depth_and_render_caches(batch); +} + +void +crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo) +{ + _mesa_set_add_pre_hashed(batch->cache.depth, bo->hash, bo); +} + +static void +crocus_resolve_color(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_resource *res, + unsigned level, unsigned layer, + enum isl_aux_op resolve_op) +{ + struct crocus_screen *screen = batch->screen; + DBG("%s to res %p level %u layer %u\n", __func__, res, level, layer); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + &res->base, res->aux.usage, level, true); + + crocus_batch_maybe_flush(batch, 1500); + + /* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)": + * + * "Any transition from any value in {Clear, Render, Resolve} to a + * different value in {Clear, Render, Resolve} requires end of pipe + * synchronization." + * + * In other words, fast clear ops are not properly synchronized with + * other drawing. We need to use a PIPE_CONTROL to ensure that the + * contents of the previous draw hit the render target before we resolve + * and again afterwards to ensure that the resolve is complete before we + * do any more regular drawing. + */ + crocus_emit_end_of_pipe_sync(batch, "color resolve: pre-flush", + PIPE_CONTROL_RENDER_TARGET_FLUSH); + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); + blorp_ccs_resolve(&blorp_batch, &surf, level, layer, 1, + isl_format_srgb_to_linear(res->surf.format), + resolve_op); + blorp_batch_finish(&blorp_batch); + + /* See comment above */ + crocus_emit_end_of_pipe_sync(batch, "color resolve: post-flush", + PIPE_CONTROL_RENDER_TARGET_FLUSH); +} + +static void +crocus_mcs_partial_resolve(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_resource *res, + uint32_t start_layer, + uint32_t num_layers) +{ + struct crocus_screen *screen = batch->screen; + + DBG("%s to res %p layers %u-%u\n", __func__, res, + start_layer, start_layer + num_layers - 1); + + assert(isl_aux_usage_has_mcs(res->aux.usage)); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + &res->base, res->aux.usage, 0, true); + + struct blorp_batch blorp_batch; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0); + blorp_mcs_partial_resolve(&blorp_batch, &surf, + isl_format_srgb_to_linear(res->surf.format), + start_layer, num_layers); + blorp_batch_finish(&blorp_batch); +} + +/** + * Perform a HiZ or depth resolve operation. 
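+ *
+ * \p op is one of ISL_AUX_OP_FULL_RESOLVE (a depth resolve, writing the
+ * HiZ-compressed contents back to the main depth surface),
+ * ISL_AUX_OP_AMBIGUATE (a HiZ ambiguate, which roughly invalidates the
+ * HiZ data so the main surface is authoritative), or
+ * ISL_AUX_OP_FAST_CLEAR (a depth clear); partial resolves are not
+ * meaningful for HiZ.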
+ * + * For an overview of HiZ ops, see the following sections of the Sandy Bridge + * PRM, Volume 1, Part 2: + * - 7.5.3.1 Depth Buffer Clear + * - 7.5.3.2 Depth Buffer Resolve + * - 7.5.3.3 Hierarchical Depth Buffer Resolve + */ +void +crocus_hiz_exec(struct crocus_context *ice, + struct crocus_batch *batch, + struct crocus_resource *res, + unsigned int level, unsigned int start_layer, + unsigned int num_layers, enum isl_aux_op op, + bool update_clear_depth) +{ + struct crocus_screen *screen = batch->screen; + const struct intel_device_info *devinfo = &batch->screen->devinfo; + assert(crocus_resource_level_has_hiz(res, level)); + assert(op != ISL_AUX_OP_NONE); + UNUSED const char *name = NULL; + + switch (op) { + case ISL_AUX_OP_FULL_RESOLVE: + name = "depth resolve"; + break; + case ISL_AUX_OP_AMBIGUATE: + name = "hiz ambiguate"; + break; + case ISL_AUX_OP_FAST_CLEAR: + name = "depth clear"; + break; + case ISL_AUX_OP_PARTIAL_RESOLVE: + case ISL_AUX_OP_NONE: + unreachable("Invalid HiZ op"); + } + + DBG("%s %s to res %p level %d layers %d-%d\n", + __func__, name, res, level, start_layer, start_layer + num_layers - 1); + + /* The following stalls and flushes are only documented to be required + * for HiZ clear operations. However, they also seem to be required for + * resolve operations. + * + * From the Ivybridge PRM, volume 2, "Depth Buffer Clear": + * + * "If other rendering operations have preceded this clear, a + * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit + * enabled must be issued before the rectangle primitive used for + * the depth buffer clear operation." + * + * Same applies for Gen8 and Gen9. + * + * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1 + * PIPE_CONTROL, Depth Cache Flush Enable: + * + * "This bit must not be set when Depth Stall Enable bit is set in + * this packet." + * + * This is confirmed to hold for real, Haswell gets immediate gpu hangs. + * + * Therefore issue two pipe control flushes, one for cache flush and + * another for depth stall. + */ + if (devinfo->ver == 6) { + /* From the Sandy Bridge PRM, volume 2 part 1, page 313: + * + * "If other rendering operations have preceded this clear, a + * PIPE_CONTROL with write cache flush enabled and Z-inhibit + * disabled must be issued before the rectangle primitive used for + * the depth buffer clear operation. + */ + crocus_emit_pipe_control_flush(batch, + "hiz op: pre-flushes (1)", + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_CS_STALL); + } else if (devinfo->ver >= 7) { + crocus_emit_pipe_control_flush(batch, + "hiz op: pre-flushes (1/2)", + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_CS_STALL); + crocus_emit_pipe_control_flush(batch, "hiz op: pre-flushes (2/2)", + PIPE_CONTROL_DEPTH_STALL); + } + + assert(isl_aux_usage_has_hiz(res->aux.usage) && res->aux.bo); + + crocus_batch_maybe_flush(batch, 1500); + + struct blorp_surf surf; + crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf, + &res->base, res->aux.usage, level, true); + + struct blorp_batch blorp_batch; + enum blorp_batch_flags flags = 0; + flags |= update_clear_depth ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR; + blorp_batch_init(&ice->blorp, &blorp_batch, batch, flags); + blorp_hiz_op(&blorp_batch, &surf, level, start_layer, num_layers, op); + blorp_batch_finish(&blorp_batch); + + /* The following stalls and flushes are only documented to be required + * for HiZ clear operations. However, they also seem to be required for + * resolve operations. 
+ * + * From the Broadwell PRM, volume 7, "Depth Buffer Clear": + * + * "Depth buffer clear pass using any of the methods (WM_STATE, + * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a + * PIPE_CONTROL command with DEPTH_STALL bit and Depth FLUSH bits + * "set" before starting to render. DepthStall and DepthFlush are + * not needed between consecutive depth clear passes nor is it + * required if the depth clear pass was done with + * 'full_surf_clear' bit set in the 3DSTATE_WM_HZ_OP." + * + * TODO: Such as the spec says, this could be conditional. + */ + if (devinfo->ver == 6) { + /* From the Sandy Bridge PRM, volume 2 part 1, page 314: + * + * "DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be + * followed by a PIPE_CONTROL command with DEPTH_STALL bit set + * and Then followed by Depth FLUSH' + */ + crocus_emit_pipe_control_flush(batch, + "hiz op: post-flushes (1/2)", + PIPE_CONTROL_DEPTH_STALL); + + crocus_emit_pipe_control_flush(batch, + "hiz op: post-flushes (2/2)", + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_CS_STALL); + } +} + +/** + * Does the resource's slice have hiz enabled? + */ +bool +crocus_resource_level_has_hiz(const struct crocus_resource *res, uint32_t level) +{ + crocus_resource_check_level_layer(res, level, 0); + return res->aux.has_hiz & 1 << level; +} + +static bool +crocus_resource_level_has_aux(const struct crocus_resource *res, uint32_t level) +{ + if (isl_aux_usage_has_hiz(res->aux.usage)) + return crocus_resource_level_has_hiz(res, level); + else + return level < res->aux.surf.levels; +} + +/** \brief Assert that the level and layer are valid for the resource. */ +void +crocus_resource_check_level_layer(UNUSED const struct crocus_resource *res, + UNUSED uint32_t level, UNUSED uint32_t layer) +{ + assert(level < res->surf.levels); + assert(layer < util_num_layers(&res->base, level)); +} + +static inline uint32_t +miptree_level_range_length(const struct crocus_resource *res, + uint32_t start_level, uint32_t num_levels) +{ + assert(start_level < res->surf.levels); + + if (num_levels == INTEL_REMAINING_LAYERS) + num_levels = res->surf.levels; + + /* Check for overflow */ + assert(start_level + num_levels >= start_level); + assert(start_level + num_levels <= res->surf.levels); + + return num_levels; +} + +static inline uint32_t +miptree_layer_range_length(const struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t num_layers) +{ + assert(level <= res->base.last_level); + + const uint32_t total_num_layers = crocus_get_num_logical_layers(res, level); + assert(start_layer < total_num_layers); + if (num_layers == INTEL_REMAINING_LAYERS) + num_layers = total_num_layers - start_layer; + /* Check for overflow */ + assert(start_layer + num_layers >= start_layer); + assert(start_layer + num_layers <= total_num_layers); + + return num_layers; +} + +bool +crocus_has_invalid_primary(const struct crocus_resource *res, + unsigned start_level, unsigned num_levels, + unsigned start_layer, unsigned num_layers) +{ + if (!res->aux.bo) + return false; + + /* Clamp the level range to fit the resource */ + num_levels = miptree_level_range_length(res, start_level, num_levels); + + for (uint32_t l = 0; l < num_levels; l++) { + const uint32_t level = start_level + l; + if (!crocus_resource_level_has_aux(res, level)) + continue; + + const uint32_t level_layers = + miptree_layer_range_length(res, level, start_layer, num_layers); + for (unsigned a = 0; a < level_layers; a++) { + enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, level, 
start_layer + a); + if (!isl_aux_state_has_valid_primary(aux_state)) + return true; + } + } + + return false; +} + +void +crocus_resource_prepare_access(struct crocus_context *ice, + struct crocus_resource *res, + uint32_t start_level, uint32_t num_levels, + uint32_t start_layer, uint32_t num_layers, + enum isl_aux_usage aux_usage, + bool fast_clear_supported) +{ + if (!res->aux.bo) + return; + + /* We can't do resolves on the compute engine, so awkwardly, we have to + * do them on the render batch... + */ + struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER]; + + const uint32_t clamped_levels = + miptree_level_range_length(res, start_level, num_levels); + for (uint32_t l = 0; l < clamped_levels; l++) { + const uint32_t level = start_level + l; + if (!crocus_resource_level_has_aux(res, level)) + continue; + + const uint32_t level_layers = + miptree_layer_range_length(res, level, start_layer, num_layers); + for (uint32_t a = 0; a < level_layers; a++) { + const uint32_t layer = start_layer + a; + const enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, level, layer); + const enum isl_aux_op aux_op = + isl_aux_prepare_access(aux_state, aux_usage, fast_clear_supported); + + /* Prepare the aux buffer for a conditional or unconditional access. + * A conditional access is handled by assuming that the access will + * not evaluate to a no-op. If the access does in fact occur, the aux + * will be in the required state. If it does not, no data is lost + * because the aux_op performed is lossless. + */ + if (aux_op == ISL_AUX_OP_NONE) { + /* Nothing to do here. */ + } else if (isl_aux_usage_has_mcs(res->aux.usage)) { + assert(aux_op == ISL_AUX_OP_PARTIAL_RESOLVE); + crocus_mcs_partial_resolve(ice, batch, res, layer, 1); + } else if (isl_aux_usage_has_hiz(res->aux.usage)) { + crocus_hiz_exec(ice, batch, res, level, layer, 1, aux_op, false); + } else if (res->aux.usage == ISL_AUX_USAGE_STC_CCS) { + unreachable("crocus doesn't resolve STC_CCS resources"); + } else { + assert(isl_aux_usage_has_ccs(res->aux.usage)); + crocus_resolve_color(ice, batch, res, level, layer, aux_op); + } + + const enum isl_aux_state new_state = + isl_aux_state_transition_aux_op(aux_state, res->aux.usage, aux_op); + crocus_resource_set_aux_state(ice, res, level, layer, 1, new_state); + } + } +} + +void +crocus_resource_finish_write(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t num_layers, + enum isl_aux_usage aux_usage) +{ + if (res->base.format == PIPE_FORMAT_S8_UINT) + res->shadow_needs_update = true; + + if (!crocus_resource_level_has_aux(res, level)) + return; + + const uint32_t level_layers = + miptree_layer_range_length(res, level, start_layer, num_layers); + + for (uint32_t a = 0; a < level_layers; a++) { + const uint32_t layer = start_layer + a; + const enum isl_aux_state aux_state = + crocus_resource_get_aux_state(res, level, layer); + + /* Transition the aux state for a conditional or unconditional write. A + * conditional write is handled by assuming that the write applies to + * only part of the render target. This prevents the new state from + * losing the types of compression that might exist in the current state + * (e.g. CLEAR). If the write evaluates to a no-op, the state will still + * be able to communicate when resolves are necessary (but it may + * falsely communicate this as well). 
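+ *
+ * For example (assuming the usual ISL transition rules): a partial
+ * compressed write to a layer currently in the CLEAR state generally
+ * moves it to COMPRESSED_CLEAR rather than COMPRESSED_NO_CLEAR, so
+ * later reads still know the fast-clear value may need to be resolved
+ * away.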
+ */ + const enum isl_aux_state new_aux_state = + isl_aux_state_transition_write(aux_state, aux_usage, false); + + crocus_resource_set_aux_state(ice, res, level, layer, 1, new_aux_state); + } +} + +enum isl_aux_state +crocus_resource_get_aux_state(const struct crocus_resource *res, + uint32_t level, uint32_t layer) +{ + crocus_resource_check_level_layer(res, level, layer); + assert(crocus_resource_level_has_aux(res, level)); + + return res->aux.state[level][layer]; +} + +void +crocus_resource_set_aux_state(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t num_layers, + enum isl_aux_state aux_state) +{ + assert(crocus_resource_level_has_aux(res, level)); + + num_layers = miptree_layer_range_length(res, level, start_layer, num_layers); + for (unsigned a = 0; a < num_layers; a++) { + if (res->aux.state[level][start_layer + a] != aux_state) { + res->aux.state[level][start_layer + a] = aux_state; + ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES | + CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES; + /* XXX: Need to track which bindings to make dirty */ + ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS; + } + } +} + +static bool +isl_formats_are_fast_clear_compatible(enum isl_format a, enum isl_format b) +{ + /* On gen8 and earlier, the hardware was only capable of handling 0/1 clear + * values so sRGB curve application was a no-op for all fast-clearable + * formats. + * + * On gen9+, the hardware supports arbitrary clear values. For sRGB clear + * values, the hardware interprets the floats, not as what would be + * returned from the sampler (or written by the shader), but as being + * between format conversion and sRGB curve application. This means that + * we can switch between sRGB and UNORM without having to whack the clear + * color. + */ + return isl_format_srgb_to_linear(a) == isl_format_srgb_to_linear(b); +} + +void +crocus_resource_prepare_texture(struct crocus_context *ice, + struct crocus_resource *res, + enum isl_format view_format, + uint32_t start_level, uint32_t num_levels, + uint32_t start_layer, uint32_t num_layers) +{ + enum isl_aux_usage aux_usage = + crocus_resource_texture_aux_usage(res); + + bool clear_supported = aux_usage != ISL_AUX_USAGE_NONE; + + /* Clear color is specified as ints or floats and the conversion is done by + * the sampler. If we have a texture view, we would have to perform the + * clear color conversion manually. Just disable clear color. 
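+ *
+ * The isl_formats_are_fast_clear_compatible() check below only rejects
+ * views whose linear base format differs; an sRGB <-> linear view keeps
+ * its fast-clear support, since 0/1 clear values are unaffected by the
+ * sRGB curve on these generations.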
+ */ + if (!isl_formats_are_fast_clear_compatible(res->surf.format, view_format)) + clear_supported = false; + + crocus_resource_prepare_access(ice, res, start_level, num_levels, + start_layer, num_layers, + aux_usage, clear_supported); +} + +enum isl_aux_usage +crocus_resource_render_aux_usage(struct crocus_context *ice, + struct crocus_resource *res, + enum isl_format render_format, + bool blend_enabled, + bool draw_aux_disabled) +{ + struct crocus_screen *screen = (void *) ice->ctx.screen; + struct intel_device_info *devinfo = &screen->devinfo; + + if (draw_aux_disabled) + return ISL_AUX_USAGE_NONE; + + switch (res->aux.usage) { + case ISL_AUX_USAGE_MCS: + return res->aux.usage; + + case ISL_AUX_USAGE_CCS_D: + /* Otherwise, we try to fall back to CCS_D */ + if (isl_format_supports_ccs_d(devinfo, render_format)) + return ISL_AUX_USAGE_CCS_D; + + return ISL_AUX_USAGE_NONE; + + default: + return ISL_AUX_USAGE_NONE; + } +} + +void +crocus_resource_prepare_render(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t layer_count, + enum isl_aux_usage aux_usage) +{ + crocus_resource_prepare_access(ice, res, level, 1, start_layer, + layer_count, aux_usage, + aux_usage != ISL_AUX_USAGE_NONE); +} + +void +crocus_resource_finish_render(struct crocus_context *ice, + struct crocus_resource *res, uint32_t level, + uint32_t start_layer, uint32_t layer_count, + enum isl_aux_usage aux_usage) +{ + crocus_resource_finish_write(ice, res, level, start_layer, layer_count, + aux_usage); +} + +static void +crocus_update_stencil_shadow(struct crocus_context *ice, + struct crocus_resource *res) +{ + struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen; + UNUSED const struct intel_device_info *devinfo = &screen->devinfo; + assert(devinfo->ver >= 7); + + if (!res->shadow_needs_update) + return; + + struct pipe_box box; + for (unsigned level = 0; level <= res->base.last_level; level++) { + u_box_2d(0, 0, + u_minify(res->base.width0, level), + u_minify(res->base.height0, level), &box); + const unsigned depth = res->base.target == PIPE_TEXTURE_3D ? + u_minify(res->base.depth0, level) : res->base.array_size; + + for (unsigned layer = 0; layer < depth; layer++) { + box.z = layer; + ice->ctx.resource_copy_region(&ice->ctx, + &res->shadow->base, level, 0, 0, layer, + &res->base, level, &box); + } + } + res->shadow_needs_update = false; +} diff --git a/src/gallium/drivers/crocus/crocus_resource.c b/src/gallium/drivers/crocus/crocus_resource.c new file mode 100644 index 00000000000..b5bf5a42e1a --- /dev/null +++ b/src/gallium/drivers/crocus/crocus_resource.c @@ -0,0 +1,1946 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * @file crocus_resource.c + * + * Resources are images, buffers, and other objects used by the GPU. + * + * XXX: explain resources + */ + +#include <stdio.h> +#include <errno.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/os_memory.h" +#include "util/u_cpu_detect.h" +#include "util/u_inlines.h" +#include "util/format/u_format.h" +#include "util/u_threaded_context.h" +#include "util/u_transfer.h" +#include "util/u_transfer_helper.h" +#include "util/u_upload_mgr.h" +#include "util/ralloc.h" +#include "crocus_batch.h" +#include "crocus_context.h" +#include "crocus_resource.h" +#include "crocus_screen.h" +#include "intel/dev/intel_debug.h" +#include "isl/isl.h" +#include "drm-uapi/drm_fourcc.h" +#include "drm-uapi/i915_drm.h" + +enum modifier_priority { + MODIFIER_PRIORITY_INVALID = 0, + MODIFIER_PRIORITY_LINEAR, + MODIFIER_PRIORITY_X, + MODIFIER_PRIORITY_Y, + MODIFIER_PRIORITY_Y_CCS, +}; + +static const uint64_t priority_to_modifier[] = { + [MODIFIER_PRIORITY_INVALID] = DRM_FORMAT_MOD_INVALID, + [MODIFIER_PRIORITY_LINEAR] = DRM_FORMAT_MOD_LINEAR, + [MODIFIER_PRIORITY_X] = I915_FORMAT_MOD_X_TILED, + [MODIFIER_PRIORITY_Y] = I915_FORMAT_MOD_Y_TILED, + [MODIFIER_PRIORITY_Y_CCS] = I915_FORMAT_MOD_Y_TILED_CCS, +}; + +static bool +modifier_is_supported(const struct intel_device_info *devinfo, + enum pipe_format pfmt, uint64_t modifier) +{ + /* XXX: do something real */ + switch (modifier) { + case I915_FORMAT_MOD_Y_TILED_CCS: + return false; + case I915_FORMAT_MOD_Y_TILED: + return devinfo->ver >= 6; + case I915_FORMAT_MOD_X_TILED: + case DRM_FORMAT_MOD_LINEAR: + return true; + case DRM_FORMAT_MOD_INVALID: + default: + return false; + } +} + +static uint64_t +select_best_modifier(struct intel_device_info *devinfo, enum pipe_format pfmt, + const uint64_t *modifiers, + int count) +{ + enum modifier_priority prio = MODIFIER_PRIORITY_INVALID; + + for (int i = 0; i < count; i++) { + if (!modifier_is_supported(devinfo, pfmt, modifiers[i])) + continue; + + switch (modifiers[i]) { + case I915_FORMAT_MOD_Y_TILED_CCS: + prio = MAX2(prio, MODIFIER_PRIORITY_Y_CCS); + break; + case I915_FORMAT_MOD_Y_TILED: + prio = MAX2(prio, MODIFIER_PRIORITY_Y); + break; + case I915_FORMAT_MOD_X_TILED: + prio = MAX2(prio, MODIFIER_PRIORITY_X); + break; + case DRM_FORMAT_MOD_LINEAR: + prio = MAX2(prio, MODIFIER_PRIORITY_LINEAR); + break; + case DRM_FORMAT_MOD_INVALID: + default: + break; + } + } + + return priority_to_modifier[prio]; +} + +static enum isl_surf_dim +crocus_target_to_isl_surf_dim(enum pipe_texture_target target) +{ + switch (target) { + case PIPE_BUFFER: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return ISL_SURF_DIM_1D; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_CUBE_ARRAY: + return ISL_SURF_DIM_2D; + case PIPE_TEXTURE_3D: + return ISL_SURF_DIM_3D; + case PIPE_MAX_TEXTURE_TYPES: + break; + } + unreachable("invalid texture type"); +} + +static void +crocus_query_dmabuf_modifiers(struct pipe_screen *pscreen, + enum pipe_format pfmt, + int max, + uint64_t *modifiers, + unsigned int *external_only, + int *count) +{ + struct crocus_screen *screen = (void *) 
pscreen; + const struct intel_device_info *devinfo = &screen->devinfo; + + uint64_t all_modifiers[] = { + DRM_FORMAT_MOD_LINEAR, + I915_FORMAT_MOD_X_TILED, + I915_FORMAT_MOD_Y_TILED, + I915_FORMAT_MOD_Y_TILED_CCS, + }; + + int supported_mods = 0; + + for (int i = 0; i < ARRAY_SIZE(all_modifiers); i++) { + if (!modifier_is_supported(devinfo, pfmt, all_modifiers[i])) + continue; + + if (supported_mods < max) { + if (modifiers) + modifiers[supported_mods] = all_modifiers[i]; + + if (external_only) + external_only[supported_mods] = util_format_is_yuv(pfmt); + } + + supported_mods++; + } + + *count = supported_mods; +} + +static isl_surf_usage_flags_t +pipe_bind_to_isl_usage(unsigned bindings) +{ + isl_surf_usage_flags_t usage = 0; + + if (bindings & PIPE_BIND_RENDER_TARGET) + usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT; + + if (bindings & PIPE_BIND_SAMPLER_VIEW) + usage |= ISL_SURF_USAGE_TEXTURE_BIT; + + if (bindings & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SHADER_BUFFER)) + usage |= ISL_SURF_USAGE_STORAGE_BIT; + + if (bindings & PIPE_BIND_DISPLAY_TARGET) + usage |= ISL_SURF_USAGE_DISPLAY_BIT; + + return usage; +} + +struct pipe_resource * +crocus_resource_get_separate_stencil(struct pipe_resource *p_res) +{ + /* For packed depth-stencil, we treat depth as the primary resource + * and store S8 as the "second plane" resource. + */ + if (p_res->next && p_res->next->format == PIPE_FORMAT_S8_UINT) + return p_res->next; + + return NULL; + +} + +static void +crocus_resource_set_separate_stencil(struct pipe_resource *p_res, + struct pipe_resource *stencil) +{ + assert(util_format_has_depth(util_format_description(p_res->format))); + pipe_resource_reference(&p_res->next, stencil); +} + +void +crocus_get_depth_stencil_resources(const struct intel_device_info *devinfo, + struct pipe_resource *res, + struct crocus_resource **out_z, + struct crocus_resource **out_s) +{ + if (!res) { + *out_z = NULL; + *out_s = NULL; + return; + } + + /* gen4/5 only supports packed ds */ + if (devinfo->ver < 6) { + *out_z = (void *)res; + *out_s = (void *)res; + return; + } + + if (res->format != PIPE_FORMAT_S8_UINT) { + *out_z = (void *) res; + *out_s = (void *) crocus_resource_get_separate_stencil(res); + } else { + *out_z = NULL; + *out_s = (void *) res; + } +} + +void +crocus_resource_disable_aux(struct crocus_resource *res) +{ + crocus_bo_unreference(res->aux.bo); + free(res->aux.state); + + res->aux.usage = ISL_AUX_USAGE_NONE; + res->aux.has_hiz = 0; + res->aux.surf.size_B = 0; + res->aux.surf.levels = 0; + res->aux.bo = NULL; + res->aux.extra_aux.surf.size_B = 0; + res->aux.state = NULL; +} + +static void +crocus_resource_destroy(struct pipe_screen *screen, + struct pipe_resource *resource) +{ + struct crocus_resource *res = (struct crocus_resource *)resource; + + if (resource->target == PIPE_BUFFER) + util_range_destroy(&res->valid_buffer_range); + + if (res->shadow) + pipe_resource_reference((struct pipe_resource **)&res->shadow, NULL); + crocus_resource_disable_aux(res); + + crocus_bo_unreference(res->bo); + crocus_pscreen_unref(res->orig_screen); + free(res); +} + +static struct crocus_resource * +crocus_alloc_resource(struct pipe_screen *pscreen, + const struct pipe_resource *templ) +{ + struct crocus_resource *res = calloc(1, sizeof(struct crocus_resource)); + if (!res) + return NULL; + + res->base = *templ; + res->base.screen = pscreen; + res->orig_screen = crocus_pscreen_ref(pscreen); + pipe_reference_init(&res->base.reference, 1); + + if (templ->target == PIPE_BUFFER) + util_range_init(&res->valid_buffer_range); 
+ + return res; +} + +unsigned +crocus_get_num_logical_layers(const struct crocus_resource *res, unsigned level) +{ + if (res->surf.dim == ISL_SURF_DIM_3D) + return minify(res->surf.logical_level0_px.depth, level); + else + return res->surf.logical_level0_px.array_len; +} + +static enum isl_aux_state ** +create_aux_state_map(struct crocus_resource *res, enum isl_aux_state initial) +{ + assert(res->aux.state == NULL); + + uint32_t total_slices = 0; + for (uint32_t level = 0; level < res->surf.levels; level++) + total_slices += crocus_get_num_logical_layers(res, level); + + const size_t per_level_array_size = + res->surf.levels * sizeof(enum isl_aux_state *); + + /* We're going to allocate a single chunk of data for both the per-level + * reference array and the arrays of aux_state. This makes cleanup + * significantly easier. + */ + const size_t total_size = + per_level_array_size + total_slices * sizeof(enum isl_aux_state); + + void *data = malloc(total_size); + if (!data) + return NULL; + + enum isl_aux_state **per_level_arr = data; + enum isl_aux_state *s = data + per_level_array_size; + for (uint32_t level = 0; level < res->surf.levels; level++) { + per_level_arr[level] = s; + const unsigned level_layers = crocus_get_num_logical_layers(res, level); + for (uint32_t a = 0; a < level_layers; a++) + *(s++) = initial; + } + assert((void *)s == data + total_size); + + return per_level_arr; +} + +/** + * Configure aux for the resource, but don't allocate it. For images which + * might be shared with modifiers, we must allocate the image and aux data in + * a single bo. + * + * Returns false on unexpected error (e.g. allocation failed, or invalid + * configuration result). + */ +static bool +crocus_resource_configure_aux(struct crocus_screen *screen, + struct crocus_resource *res, bool imported, + uint64_t *aux_size_B, + uint32_t *alloc_flags) +{ + const struct intel_device_info *devinfo = &screen->devinfo; + + /* Try to create the auxiliary surfaces allowed by the modifier or by + * the user if no modifier is specified. + */ + assert(!res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE); + + const bool has_mcs = devinfo->ver >= 7 && !res->mod_info && + isl_surf_get_mcs_surf(&screen->isl_dev, &res->surf, &res->aux.surf); + + const bool has_hiz = devinfo->ver >= 6 && !res->mod_info && + !(INTEL_DEBUG & DEBUG_NO_HIZ) && + isl_surf_get_hiz_surf(&screen->isl_dev, &res->surf, &res->aux.surf); + + const bool has_ccs = + ((devinfo->ver >= 7 && !res->mod_info && !(INTEL_DEBUG & DEBUG_NO_RBC)) || + (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE)) && + isl_surf_get_ccs_surf(&screen->isl_dev, &res->surf, &res->aux.surf, + &res->aux.extra_aux.surf, 0); + + /* Having both HIZ and MCS is impossible. */ + assert(!has_mcs || !has_hiz); + + /* Ensure aux surface creation for MCS_CCS and HIZ_CCS is correct. 
*/ + if (has_ccs && (has_mcs || has_hiz)) { + assert(res->aux.extra_aux.surf.size_B > 0 && + res->aux.extra_aux.surf.usage & ISL_SURF_USAGE_CCS_BIT); + assert(res->aux.surf.size_B > 0 && + res->aux.surf.usage & + (ISL_SURF_USAGE_HIZ_BIT | ISL_SURF_USAGE_MCS_BIT)); + } + + if (res->mod_info && has_ccs) { + res->aux.usage = res->mod_info->aux_usage; + } else if (has_mcs) { + res->aux.usage = ISL_AUX_USAGE_MCS; + } else if (has_hiz) { + res->aux.usage = ISL_AUX_USAGE_HIZ; + } else if (has_ccs) { + if (isl_format_supports_ccs_d(devinfo, res->surf.format)) + res->aux.usage = ISL_AUX_USAGE_CCS_D; + } + + enum isl_aux_state initial_state = ISL_AUX_STATE_AUX_INVALID; + *aux_size_B = 0; + *alloc_flags = 0; + assert(!res->aux.bo); + + switch (res->aux.usage) { + case ISL_AUX_USAGE_NONE: + /* Having no aux buffer is only okay if there's no modifier with aux. */ + res->aux.surf.levels = 0; + return !res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE; + case ISL_AUX_USAGE_HIZ: + initial_state = ISL_AUX_STATE_AUX_INVALID; + break; + case ISL_AUX_USAGE_MCS: + /* The Ivybridge PRM, Vol 2 Part 1 p326 says: + * + * "When MCS buffer is enabled and bound to MSRT, it is required + * that it is cleared prior to any rendering." + * + * Since we only use the MCS buffer for rendering, we just clear it + * immediately on allocation. The clear value for MCS buffers is all + * 1's, so we simply memset it to 0xff. + */ + initial_state = ISL_AUX_STATE_CLEAR; + break; + case ISL_AUX_USAGE_CCS_D: + /* When CCS_E is used, we need to ensure that the CCS starts off in + * a valid state. From the Sky Lake PRM, "MCS Buffer for Render + * Target(s)": + * + * "If Software wants to enable Color Compression without Fast + * clear, Software needs to initialize MCS with zeros." + * + * A CCS value of 0 indicates that the corresponding block is in the + * pass-through state which is what we want. + * + * For CCS_D, do the same thing. On Gen9+, this avoids having any + * undefined bits in the aux buffer. + */ + if (imported) + initial_state = + isl_drm_modifier_get_default_aux_state(res->mod_info->modifier); + else + initial_state = ISL_AUX_STATE_PASS_THROUGH; + *alloc_flags |= BO_ALLOC_ZEROED; + break; + default: + unreachable("non-crocus aux"); + } + + /* Create the aux_state for the auxiliary buffer. */ + res->aux.state = create_aux_state_map(res, initial_state); + if (!res->aux.state) + return false; + + /* Increase the aux offset if the main and aux surfaces will share a BO. */ + res->aux.offset = + !res->mod_info || res->mod_info->aux_usage == res->aux.usage ? + ALIGN(res->surf.size_B, res->aux.surf.alignment_B) : 0; + uint64_t size = res->aux.surf.size_B; + + /* Allocate space in the buffer for storing the CCS. */ + if (res->aux.extra_aux.surf.size_B > 0) { + const uint64_t padded_aux_size = + ALIGN(size, res->aux.extra_aux.surf.alignment_B); + res->aux.extra_aux.offset = res->aux.offset + padded_aux_size; + size = padded_aux_size + res->aux.extra_aux.surf.size_B; + } + + /* Allocate space in the buffer for storing the clear color. On modern + * platforms (gen > 9), we can read it directly from such buffer. + * + * On gen <= 9, we are going to store the clear color on the buffer + * anyways, and copy it back to the surface state during state emission. + * + * Also add some padding to make sure the fast clear color state buffer + * starts at a 4K alignment. We believe that 256B might be enough, but due + * to lack of testing we will leave this as 4K for now. 
+ */ + size = ALIGN(size, 4096); + *aux_size_B = size; + + if (isl_aux_usage_has_hiz(res->aux.usage)) { + for (unsigned level = 0; level < res->surf.levels; ++level) { + uint32_t width = u_minify(res->surf.phys_level0_sa.width, level); + uint32_t height = u_minify(res->surf.phys_level0_sa.height, level); + + /* Disable HiZ for LOD > 0 unless the width/height are 8x4 aligned. + * For LOD == 0, we can grow the dimensions to make it work. + */ + if (!devinfo->is_haswell || + (level == 0 || ((width & 7) == 0 && (height & 3) == 0))) + res->aux.has_hiz |= 1 << level; + } + } + + return true; +} + +/** + * Initialize the aux buffer contents. + * + * Returns false on unexpected error (e.g. mapping a BO failed). + */ +static bool +crocus_resource_init_aux_buf(struct crocus_resource *res, uint32_t alloc_flags) +{ + if (!(alloc_flags & BO_ALLOC_ZEROED)) { + void *map = crocus_bo_map(NULL, res->aux.bo, MAP_WRITE | MAP_RAW); + + if (!map) + return false; + + if (crocus_resource_get_aux_state(res, 0, 0) != ISL_AUX_STATE_AUX_INVALID) { + uint8_t memset_value = isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0; + memset((char*)map + res->aux.offset, memset_value, + res->aux.surf.size_B); + } + + /* Bspec section titled: MCS/CCS Buffers for Render Target(s) states: + * - If Software wants to enable Color Compression without Fast clear, + * Software needs to initialize MCS with zeros. + * - Lossless compression and CCS initialized to all F (using HW Fast + * Clear or SW direct Clear) + * + * We think the first bullet point above is referring to the CCS aux + * surface. Since we initialize the MCS in the clear state, we also + * initialize the CCS in the clear state (via SW direct clear) to keep + * the two in sync. + */ + memset((char*)map + res->aux.extra_aux.offset, + isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0, + res->aux.extra_aux.surf.size_B); + + crocus_bo_unmap(res->aux.bo); + } + + return true; +} + +/** + * Allocate the initial aux surface for a resource based on aux.usage + * + * Returns false on unexpected error (e.g. allocation failed, or invalid + * configuration result). + */ +static bool +crocus_resource_alloc_separate_aux(struct crocus_screen *screen, + struct crocus_resource *res) +{ + uint32_t alloc_flags; + uint64_t size; + if (!crocus_resource_configure_aux(screen, res, false, &size, &alloc_flags)) + return false; + + if (size == 0) + return true; + + /* Allocate the auxiliary buffer. ISL has a stricter set of alignment rules + * than the drm allocator. Therefore, one can pass the ISL dimensions in terms + * of bytes instead of trying to recalculate based on different format + * block sizes.
+ */ + res->aux.bo = crocus_bo_alloc_tiled(screen->bufmgr, "aux buffer", size, 4096, + isl_tiling_to_i915_tiling(res->aux.surf.tiling), + res->aux.surf.row_pitch_B, alloc_flags); + if (!res->aux.bo) { + return false; + } + + if (!crocus_resource_init_aux_buf(res, alloc_flags)) + return false; + + return true; +} + +void +crocus_resource_finish_aux_import(struct pipe_screen *pscreen, + struct crocus_resource *res) +{ + struct crocus_screen *screen = (struct crocus_screen *)pscreen; + assert(crocus_resource_unfinished_aux_import(res)); + assert(!res->mod_info->supports_clear_color); + + struct crocus_resource *aux_res = (void *) res->base.next; + assert(aux_res->aux.surf.row_pitch_B && aux_res->aux.offset && + aux_res->aux.bo); + + assert(res->bo == aux_res->aux.bo); + crocus_bo_reference(aux_res->aux.bo); + res->aux.bo = aux_res->aux.bo; + + res->aux.offset = aux_res->aux.offset; + + assert(res->bo->size >= (res->aux.offset + res->aux.surf.size_B)); + assert(aux_res->aux.surf.row_pitch_B == res->aux.surf.row_pitch_B); + + crocus_resource_destroy(&screen->base, res->base.next); + res->base.next = NULL; +}
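The single-allocation trick used by create_aux_state_map() in the hunk above (one malloc'd chunk holding the per-level pointer array followed by the flat array of per-layer aux states, so a single free() releases both) can be illustrated outside the driver. The following is a minimal, self-contained sketch; the enum name, level count, and layer counts are invented for illustration and are not part of the commit.

/* Illustrative sketch of the create_aux_state_map() layout, with
 * stand-in types and hard-coded example dimensions.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

enum aux_state { AUX_INVALID, AUX_CLEAR, AUX_PASS_THROUGH };

int
main(void)
{
   const unsigned levels = 3;
   const unsigned layers_per_level[] = { 8, 4, 2 };   /* invented example values */

   unsigned total_slices = 0;
   for (unsigned l = 0; l < levels; l++)
      total_slices += layers_per_level[l];

   /* One chunk: the per-level pointer array first, then every per-layer state. */
   const size_t ptr_array_size = levels * sizeof(enum aux_state *);
   const size_t total_size =
      ptr_array_size + total_slices * sizeof(enum aux_state);

   void *data = malloc(total_size);
   assert(data);

   enum aux_state **per_level = data;
   enum aux_state *s = (enum aux_state *)((char *)data + ptr_array_size);
   for (unsigned l = 0; l < levels; l++) {
      per_level[l] = s;
      for (unsigned a = 0; a < layers_per_level[l]; a++)
         *(s++) = AUX_INVALID;
   }

   per_level[1][3] = AUX_CLEAR;
   printf("level 1, layer 3 -> state %d\n", per_level[1][3]);

   free(per_level);   /* one free() releases both the pointers and the states */
   return 0;
}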
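Similarly, the offset arithmetic in crocus_resource_configure_aux() packs the primary aux surface, the optional extra CCS surface, and the 4K-aligned clear-color padding into one allocation. A hedged sketch of that math, using invented sizes and a generic power-of-two align-up macro rather than the driver's own ALIGN() helper, might look like:

/* Illustrative sketch only; all sizes and alignments are invented example values. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_POT(v, a) (((v) + (a) - 1) & ~((uint64_t)(a) - 1))

int
main(void)
{
   const uint64_t main_surf_size_B = 1u << 20;  /* primary surface */
   const uint64_t aux_surf_size_B = 64 * 1024;  /* HiZ or MCS surface */
   const uint64_t ccs_surf_size_B = 16 * 1024;  /* extra CCS surface */
   const uint64_t aux_align_B = 4096;
   const uint64_t ccs_align_B = 4096;

   /* Aux data follows the main surface when both share one BO. */
   const uint64_t aux_offset = ALIGN_POT(main_surf_size_B, aux_align_B);
   uint64_t size = aux_surf_size_B;

   /* The extra CCS surface is appended after the padded primary aux surface. */
   const uint64_t padded_aux_size = ALIGN_POT(size, ccs_align_B);
   const uint64_t ccs_offset = aux_offset + padded_aux_size;
   size = padded_aux_size + ccs_surf_size_B;

   /* Pad so the clear-color data that follows starts at a 4K boundary. */
   size = ALIGN_POT(size, 4096);

   printf("aux at +%" PRIu64 ", extra CCS at +%" PRIu64 ", aux allocation %" PRIu64 " B\n",
          aux_offset, ccs_offset, size);
   return 0;
}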