author     Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>    2021-02-16 02:37:40 +0100
committer  Marge Bot <eric+marge@anholt.net>              2021-06-21 21:23:51 +0000
commit     8dfb240b1f063307aa5e53fb1bd0865105eef986 (patch)
tree       287df9026062c83942e60ce47831cbdb53cc1e4f /src/compiler
parent     02c5dc8035b8b03d1abc74074767303951fd0a5b (diff)
nir: Add raytracing shader call lowering pass.
This is essentially a copy of Jason's pass.
Changes:
- Instead of all the Intel-specific lowering, introduce generic rt_{execute_callable,trace_ray,resume} intrinsics
- Add the ability to use scratch intrinsics directly.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10339>
Diffstat (limited to 'src/compiler')
-rw-r--r--   src/compiler/Makefile.sources              |    1
-rw-r--r--   src/compiler/nir/meson.build               |    1
-rw-r--r--   src/compiler/nir/nir.h                     |    8
-rw-r--r--   src/compiler/nir/nir_lower_shader_calls.c  | 1121
4 files changed, 1131 insertions, 0 deletions
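
For reference, a minimal sketch of how a driver could invoke the new entry point declared in nir.h. The surrounding function, the chosen address format, and the stack alignment value are illustrative assumptions, not part of this commit:

/* Hypothetical driver-side call site, illustration only. */
#include "nir.h"

static void
lower_raygen_calls(nir_shader *shader, void *mem_ctx)
{
   nir_shader **resume_shaders = NULL;
   uint32_t num_resume_shaders = 0;

   /* Spills live SSA values around trace_ray/execute_callable, rewrites the
    * calls into rt_trace_ray/rt_execute_callable followed by rt_resume, and
    * returns one resume shader per call site. */
   if (nir_lower_shader_calls(shader,
                              nir_address_format_64bit_global, /* scratch as global memory */
                              16,                               /* stack_alignment: assumed value */
                              &resume_shaders,
                              &num_resume_shaders,
                              mem_ctx)) {
      for (uint32_t i = 0; i < num_resume_shaders; i++) {
         /* Hand resume_shaders[i] to the backend compiler here; each one
          * picks up execution after the corresponding shader call. */
      }
   }
}

Each lowered call and its resume are annotated with call_idx and stack_size, which is intended to be enough for the backend to implement its own stack management.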
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources index 6a758e725a2..69561c99c14 100644 --- a/src/compiler/Makefile.sources +++ b/src/compiler/Makefile.sources @@ -298,6 +298,7 @@ NIR_FILES = \ nir/nir_lower_returns.c \ nir/nir_lower_samplers.c \ nir/nir_lower_scratch.c \ + nir/nir_lower_shader_calls.c \ nir/nir_lower_ssbo.c \ nir/nir_lower_subgroups.c \ nir/nir_lower_system_values.c \ diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build index 8209516d732..229dacdb59d 100644 --- a/src/compiler/nir/meson.build +++ b/src/compiler/nir/meson.build @@ -186,6 +186,7 @@ files_libnir = files( 'nir_lower_returns.c', 'nir_lower_samplers.c', 'nir_lower_scratch.c', + 'nir_lower_shader_calls.c', 'nir_lower_ssbo.c', 'nir_lower_subgroups.c', 'nir_lower_system_values.c', diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index ca123a51a61..6f7732fd8e6 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -4626,6 +4626,14 @@ bool nir_lower_explicit_io(nir_shader *shader, nir_variable_mode modes, nir_address_format); +bool +nir_lower_shader_calls(nir_shader *shader, + nir_address_format address_format, + unsigned stack_alignment, + nir_shader ***resume_shaders_out, + uint32_t *num_resume_shaders_out, + void *mem_ctx); + nir_src *nir_get_io_offset_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_vertex_index_src(nir_intrinsic_instr *instr); nir_src *nir_get_shader_call_payload_src(nir_intrinsic_instr *call); diff --git a/src/compiler/nir/nir_lower_shader_calls.c b/src/compiler/nir/nir_lower_shader_calls.c new file mode 100644 index 00000000000..cc0f5aa7859 --- /dev/null +++ b/src/compiler/nir/nir_lower_shader_calls.c @@ -0,0 +1,1121 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "nir.h" +#include "nir_builder.h" +#include "nir_phi_builder.h" +#include "util/u_math.h" + +static bool +move_system_values_to_top(nir_shader *shader) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + + bool progress = false; + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + /* These intrinsics not only can't be re-materialized but aren't + * preserved when moving to the continuation shader. We have to move + * them to the top to ensure they get spilled as needed. 
+ */ + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_shader_record_ptr: + case nir_intrinsic_load_btd_local_arg_addr_intel: + nir_instr_remove(instr); + nir_instr_insert(nir_before_cf_list(&impl->body), instr); + progress = true; + break; + + default: + break; + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); + } + + return progress; +} + +static bool +instr_is_shader_call(nir_instr *instr) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_trace_ray || + intrin->intrinsic == nir_intrinsic_report_ray_intersection || + intrin->intrinsic == nir_intrinsic_execute_callable; +} + +struct bitset { + BITSET_WORD *set; + unsigned size; +}; + +static struct bitset +bitset_create(void *mem_ctx, unsigned size) +{ + return (struct bitset) { + .set = rzalloc_array(mem_ctx, BITSET_WORD, BITSET_WORDS(size)), + .size = size, + }; +} + +static bool +src_is_in_bitset(nir_src *src, void *_set) +{ + struct bitset *set = _set; + assert(src->is_ssa); + + /* Any SSA values which were added after we generated liveness information + * are things generated by this pass and, while most of it is arithmetic + * which we could re-materialize, we don't need to because it's only used + * for a single load/store and so shouldn't cross any shader calls. + */ + if (src->ssa->index >= set->size) + return false; + + return BITSET_TEST(set->set, src->ssa->index); +} + +static void +add_ssa_def_to_bitset(nir_ssa_def *def, struct bitset *set) +{ + if (def->index >= set->size) + return; + + BITSET_SET(set->set, def->index); +} + +static bool +can_remat_instr(nir_instr *instr, struct bitset *remat) +{ + /* Set of all values which are trivially re-materializable and we shouldn't + * ever spill them. This includes: + * + * - Undef values + * - Constants + * - Uniforms (UBO or push constant) + * - ALU combinations of any of the above + * - Derefs which are either complete or casts of any of the above + * + * Because this pass rewrites things in-order and phis are always turned + * into register writes, We can use "is it SSA?" to answer the question + * "can my source be re-materialized?". + */ + switch (instr->type) { + case nir_instr_type_alu: + if (!nir_instr_as_alu(instr)->dest.dest.is_ssa) + return false; + + return nir_foreach_src(instr, src_is_in_bitset, remat); + + case nir_instr_type_deref: + return nir_foreach_src(instr, src_is_in_bitset, remat); + + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_vulkan_resource_index: + case nir_intrinsic_vulkan_resource_reindex: + case nir_intrinsic_load_vulkan_descriptor: + case nir_intrinsic_load_push_constant: + /* These intrinsics don't need to be spilled as long as they don't + * depend on any spilled values. 
+ */ + return nir_foreach_src(instr, src_is_in_bitset, remat); + + case nir_intrinsic_load_scratch_base_ptr: + case nir_intrinsic_load_ray_launch_id: + case nir_intrinsic_load_btd_dss_id_intel: + case nir_intrinsic_load_btd_global_arg_addr_intel: + case nir_intrinsic_load_btd_resume_sbt_addr_intel: + case nir_intrinsic_load_ray_base_mem_addr_intel: + case nir_intrinsic_load_ray_hw_stack_size_intel: + case nir_intrinsic_load_ray_sw_stack_size_intel: + case nir_intrinsic_load_ray_num_dss_rt_stacks_intel: + case nir_intrinsic_load_ray_hit_sbt_addr_intel: + case nir_intrinsic_load_ray_hit_sbt_stride_intel: + case nir_intrinsic_load_ray_miss_sbt_addr_intel: + case nir_intrinsic_load_ray_miss_sbt_stride_intel: + case nir_intrinsic_load_callable_sbt_addr_intel: + case nir_intrinsic_load_callable_sbt_stride_intel: + /* Notably missing from the above list is btd_local_arg_addr_intel. + * This is because the resume shader will have a different local + * argument pointer because it has a different BSR. Any access of + * the original shader's local arguments needs to be preserved so + * that pointer has to be saved on the stack. + * + * TODO: There may be some system values we want to avoid + * re-materializing as well but we have to be very careful + * to ensure that it's a system value which cannot change + * across a shader call. + */ + return true; + + default: + return false; + } + } + + case nir_instr_type_ssa_undef: + case nir_instr_type_load_const: + return true; + + default: + return false; + } +} + +static bool +can_remat_ssa_def(nir_ssa_def *def, struct bitset *remat) +{ + return can_remat_instr(def->parent_instr, remat); +} + +static nir_ssa_def * +remat_ssa_def(nir_builder *b, nir_ssa_def *def) +{ + nir_instr *clone = nir_instr_clone(b->shader, def->parent_instr); + nir_builder_instr_insert(b, clone); + return nir_instr_ssa_def(clone); +} + +struct pbv_array { + struct nir_phi_builder_value **arr; + unsigned len; +}; + +static struct nir_phi_builder_value * +get_phi_builder_value_for_def(nir_ssa_def *def, + struct pbv_array *pbv_arr) +{ + if (def->index >= pbv_arr->len) + return NULL; + + return pbv_arr->arr[def->index]; +} + +static nir_ssa_def * +get_phi_builder_def_for_src(nir_src *src, struct pbv_array *pbv_arr, + nir_block *block) +{ + assert(src->is_ssa); + + struct nir_phi_builder_value *pbv = + get_phi_builder_value_for_def(src->ssa, pbv_arr); + if (pbv == NULL) + return NULL; + + return nir_phi_builder_value_get_block_def(pbv, block); +} + +static bool +rewrite_instr_src_from_phi_builder(nir_src *src, void *_pbv_arr) +{ + nir_block *block; + if (src->parent_instr->type == nir_instr_type_phi) { + nir_phi_src *phi_src = exec_node_data(nir_phi_src, src, src); + block = phi_src->pred; + } else { + block = src->parent_instr->block; + } + + nir_ssa_def *new_def = get_phi_builder_def_for_src(src, _pbv_arr, block); + if (new_def != NULL) + nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(new_def)); + return true; +} + +static nir_ssa_def * +spill_fill(nir_builder *before, nir_builder *after, nir_ssa_def *def, unsigned offset, + nir_address_format address_format, unsigned stack_alignment) +{ + const unsigned comp_size = def->bit_size / 8; + + switch(address_format) { + case nir_address_format_32bit_offset: + nir_store_scratch(before, def, nir_imm_int(before, offset), + .align_mul = MIN2(comp_size, stack_alignment), .write_mask = ~0); + def = nir_load_scratch(after, def->num_components, def->bit_size, + nir_imm_int(after, offset), .align_mul = MIN2(comp_size, 
stack_alignment)); + break; + case nir_address_format_64bit_global: { + nir_ssa_def *addr = nir_iadd_imm(before, nir_load_scratch_base_ptr(before, 1, 64, 1), offset); + nir_store_global(before, addr, MIN2(comp_size, stack_alignment), def, ~0); + addr = nir_iadd_imm(after, nir_load_scratch_base_ptr(after, 1, 64, 1), offset); + def = nir_load_global(after, addr, MIN2(comp_size, stack_alignment), + def->num_components, def->bit_size); + break; + } + default: + unreachable("Unimplemented address format"); + } + return def; +} + +static void +spill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls, + nir_address_format address_format, + unsigned stack_alignment) +{ + /* TODO: If a SSA def is filled more than once, we probably want to just + * spill it at the LCM of the fill sites so we avoid unnecessary + * extra spills + * + * TODO: If a SSA def is defined outside a loop but live through some call + * inside the loop, we probably want to spill outside the loop. We + * may also want to fill outside the loop if it's not used in the + * loop. + * + * TODO: Right now, we only re-materialize things if their immediate + * sources are things which we filled. We probably want to expand + * that to re-materialize things whose sources are things we can + * re-materialize from things we filled. We may want some DAG depth + * heuristic on this. + */ + + /* This happens per-shader rather than per-impl because we mess with + * nir_shader::scratch_size. + */ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + + nir_metadata_require(impl, nir_metadata_live_ssa_defs | + nir_metadata_dominance | + nir_metadata_block_index); + + void *mem_ctx = ralloc_context(shader); + + const unsigned num_ssa_defs = impl->ssa_alloc; + const unsigned live_words = BITSET_WORDS(num_ssa_defs); + struct bitset trivial_remat = bitset_create(mem_ctx, num_ssa_defs); + + /* Array of all live SSA defs which are spill candidates */ + nir_ssa_def **spill_defs = + rzalloc_array(mem_ctx, nir_ssa_def *, num_ssa_defs); + + /* For each spill candidate, an array of every time it's defined by a fill, + * indexed by call instruction index. + */ + nir_ssa_def ***fill_defs = + rzalloc_array(mem_ctx, nir_ssa_def **, num_ssa_defs); + + /* For each call instruction, the liveness set at the call */ + const BITSET_WORD **call_live = + rzalloc_array(mem_ctx, const BITSET_WORD *, num_calls); + + /* For each call instruction, the block index of the block it lives in */ + uint32_t *call_block_indices = rzalloc_array(mem_ctx, uint32_t, num_calls); + + /* Walk the call instructions and fetch the liveness set and block index + * for each one. We need to do this before we start modifying the shader + * so that liveness doesn't complain that it's been invalidated. Don't + * worry, we'll be very careful with our live sets. :-) + */ + unsigned call_idx = 0; + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (!instr_is_shader_call(instr)) + continue; + + call_block_indices[call_idx] = block->index; + + /* The objective here is to preserve values around shader call + * instructions. Therefore, we use the live set after the + * instruction as the set of things we want to preserve. Because + * none of our shader call intrinsics return anything, we don't have + * to worry about spilling over a return value. + * + * TODO: This isn't quite true for report_intersection. 
+ */ + call_live[call_idx] = + nir_get_live_ssa_defs(nir_after_instr(instr), mem_ctx); + + call_idx++; + } + } + + nir_builder before, after; + nir_builder_init(&before, impl); + nir_builder_init(&after, impl); + + call_idx = 0; + unsigned max_scratch_size = shader->scratch_size; + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + nir_ssa_def *def = nir_instr_ssa_def(instr); + if (def != NULL) { + if (can_remat_ssa_def(def, &trivial_remat)) { + add_ssa_def_to_bitset(def, &trivial_remat); + } else { + spill_defs[def->index] = def; + } + } + + if (!instr_is_shader_call(instr)) + continue; + + const BITSET_WORD *live = call_live[call_idx]; + + /* Make a copy of trivial_remat that we'll update as we crawl through + * the live SSA defs and unspill them. + */ + struct bitset remat = bitset_create(mem_ctx, num_ssa_defs); + memcpy(remat.set, trivial_remat.set, live_words * sizeof(BITSET_WORD)); + + /* Before the two builders are always separated by the call + * instruction, it won't break anything to have two of them. + */ + before.cursor = nir_before_instr(instr); + after.cursor = nir_after_instr(instr); + + unsigned offset = shader->scratch_size; + for (unsigned w = 0; w < live_words; w++) { + BITSET_WORD spill_mask = live[w] & ~trivial_remat.set[w]; + while (spill_mask) { + int i = u_bit_scan(&spill_mask); + assert(i >= 0); + unsigned index = w * BITSET_WORDBITS + i; + assert(index < num_ssa_defs); + + nir_ssa_def *def = spill_defs[index]; + if (can_remat_ssa_def(def, &remat)) { + /* If this SSA def is re-materializable or based on other + * things we've already spilled, re-materialize it rather + * than spilling and filling. Anything which is trivially + * re-materializable won't even get here because we take + * those into account in spill_mask above. + */ + def = remat_ssa_def(&after, def); + } else { + bool is_bool = def->bit_size == 1; + if (is_bool) + def = nir_b2b32(&before, def); + + const unsigned comp_size = def->bit_size / 8; + offset = ALIGN(offset, comp_size); + + def = spill_fill(&before, &after, def, offset, + address_format,stack_alignment); + + if (is_bool) + def = nir_b2b1(&after, def); + + offset += def->num_components * comp_size; + } + + /* Mark this SSA def as available in the remat set so that, if + * some other SSA def we need is computed based on it, we can + * just re-compute instead of fetching from memory. + */ + BITSET_SET(remat.set, index); + + /* For now, we just make a note of this new SSA def. We'll + * fix things up with the phi builder as a second pass. + */ + if (fill_defs[index] == NULL) { + fill_defs[index] = + rzalloc_array(mem_ctx, nir_ssa_def *, num_calls); + } + fill_defs[index][call_idx] = def; + } + } + + nir_builder *b = &before; + + max_scratch_size = MAX2(max_scratch_size, offset); + + /* First thing on the called shader's stack is the resume address + * followed by a pointer to the payload. + */ + nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr); + + /* Lower to generic intrinsics with information about the stack & resume shader. 
*/ + switch (call->intrinsic) { + case nir_intrinsic_trace_ray: { + nir_rt_trace_ray(b, call->src[0].ssa, call->src[1].ssa, + call->src[2].ssa, call->src[3].ssa, + call->src[4].ssa, call->src[5].ssa, + call->src[6].ssa, call->src[7].ssa, + call->src[8].ssa, call->src[9].ssa, + call->src[10].ssa, + .call_idx = call_idx, .stack_size = offset); + break; + } + + case nir_intrinsic_report_ray_intersection: + unreachable("Any-hit shaders must be inlined"); + + case nir_intrinsic_execute_callable: { + nir_rt_execute_callable(b, call->src[0].ssa, call->src[1].ssa, .call_idx = call_idx, .stack_size = offset); + break; + } + + default: + unreachable("Invalid shader call instruction"); + } + + nir_rt_resume(b, .call_idx = call_idx, .stack_size = offset); + + nir_instr_remove(&call->instr); + + call_idx++; + } + } + assert(call_idx == num_calls); + shader->scratch_size = max_scratch_size; + + struct nir_phi_builder *pb = nir_phi_builder_create(impl); + struct pbv_array pbv_arr = { + .arr = rzalloc_array(mem_ctx, struct nir_phi_builder_value *, + num_ssa_defs), + .len = num_ssa_defs, + }; + + const unsigned block_words = BITSET_WORDS(impl->num_blocks); + BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words); + + /* Go through and set up phi builder values for each spillable value which + * we ever needed to spill at any point. + */ + for (unsigned index = 0; index < num_ssa_defs; index++) { + if (fill_defs[index] == NULL) + continue; + + nir_ssa_def *def = spill_defs[index]; + + memset(def_blocks, 0, block_words * sizeof(BITSET_WORD)); + BITSET_SET(def_blocks, def->parent_instr->block->index); + for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) { + if (fill_defs[index][call_idx] != NULL) + BITSET_SET(def_blocks, call_block_indices[call_idx]); + } + + pbv_arr.arr[index] = nir_phi_builder_add_value(pb, def->num_components, + def->bit_size, def_blocks); + } + + /* Walk the shader one more time and rewrite SSA defs as needed using the + * phi builder. + */ + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + nir_ssa_def *def = nir_instr_ssa_def(instr); + if (def != NULL) { + struct nir_phi_builder_value *pbv = + get_phi_builder_value_for_def(def, &pbv_arr); + if (pbv != NULL) + nir_phi_builder_value_set_block_def(pbv, block, def); + } + + if (instr->type == nir_instr_type_phi) + continue; + + nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &pbv_arr); + + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); + if (resume->intrinsic != nir_intrinsic_rt_resume) + continue; + + call_idx = nir_intrinsic_call_idx(resume); + + /* Technically, this is the wrong place to add the fill defs to the + * phi builder values because we haven't seen any of the load_scratch + * instructions for this call yet. However, we know based on how we + * emitted them that no value ever gets used until after the load + * instruction has been emitted so this should be safe. If we ever + * fail validation due this it likely means a bug in our spilling + * code and not the phi re-construction code here. 
+ */ + for (unsigned index = 0; index < num_ssa_defs; index++) { + if (fill_defs[index] && fill_defs[index][call_idx]) { + nir_phi_builder_value_set_block_def(pbv_arr.arr[index], block, + fill_defs[index][call_idx]); + } + } + } + + nir_if *following_if = nir_block_get_following_if(block); + if (following_if) { + nir_ssa_def *new_def = + get_phi_builder_def_for_src(&following_if->condition, + &pbv_arr, block); + if (new_def != NULL) + nir_if_rewrite_condition(following_if, nir_src_for_ssa(new_def)); + } + + /* Handle phi sources that source from this block. We have to do this + * as a separate pass because the phi builder assumes that uses and + * defs are processed in an order that respects dominance. When we have + * loops, a phi source may be a back-edge so we have to handle it as if + * it were one of the last instructions in the predecessor block. + */ + nir_foreach_phi_src_leaving_block(block, + rewrite_instr_src_from_phi_builder, + &pbv_arr); + } + + nir_phi_builder_finish(pb); + + ralloc_free(mem_ctx); + + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); +} + +static nir_instr * +find_resume_instr(nir_function_impl *impl, unsigned call_idx) +{ + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); + if (resume->intrinsic != nir_intrinsic_rt_resume) + continue; + + if (nir_intrinsic_call_idx(resume) == call_idx) + return &resume->instr; + } + } + unreachable("Couldn't find resume instruction"); +} + +/* Walk the CF tree and duplicate the contents of every loop, one half runs on + * resume and the other half is for any post-resume loop iterations. We are + * careful in our duplication to ensure that resume_instr is in the resume + * half of the loop though a copy of resume_instr will remain in the other + * half as well in case the same shader call happens twice. + */ +static bool +duplicate_loop_bodies(nir_function_impl *impl, nir_instr *resume_instr) +{ + nir_register *resume_reg = NULL; + for (nir_cf_node *node = resume_instr->block->cf_node.parent; + node->type != nir_cf_node_function; node = node->parent) { + if (node->type != nir_cf_node_loop) + continue; + + nir_loop *loop = nir_cf_node_as_loop(node); + + if (resume_reg == NULL) { + /* We only create resume_reg if we encounter a loop. This way we can + * avoid re-validating the shader and calling ssa_to_regs in the case + * where it's just if-ladders. + */ + resume_reg = nir_local_reg_create(impl); + resume_reg->num_components = 1; + resume_reg->bit_size = 1; + + nir_builder b; + nir_builder_init(&b, impl); + + /* Initialize resume to true */ + b.cursor = nir_before_cf_list(&impl->body); + nir_store_reg(&b, resume_reg, nir_imm_true(&b), 1); + + /* Set resume to false right after the resume instruction */ + b.cursor = nir_after_instr(resume_instr); + nir_store_reg(&b, resume_reg, nir_imm_false(&b), 1); + } + + /* Before we go any further, make sure that everything which exits the + * loop or continues around to the top of the loop does so through + * registers. We're about to duplicate the loop body and we'll have + * serious trouble if we don't do this. 
+ */ + nir_convert_loop_to_lcssa(loop); + nir_lower_phis_to_regs_block(nir_loop_first_block(loop)); + nir_lower_phis_to_regs_block( + nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node))); + + nir_cf_list cf_list; + nir_cf_list_extract(&cf_list, &loop->body); + + nir_if *_if = nir_if_create(impl->function->shader); + _if->condition = nir_src_for_reg(resume_reg); + nir_cf_node_insert(nir_after_cf_list(&loop->body), &_if->cf_node); + + nir_cf_list clone; + nir_cf_list_clone(&clone, &cf_list, &loop->cf_node, NULL); + + /* Insert the clone in the else and the original in the then so that + * the resume_instr remains valid even after the duplication. + */ + nir_cf_reinsert(&cf_list, nir_before_cf_list(&_if->then_list)); + nir_cf_reinsert(&clone, nir_before_cf_list(&_if->else_list)); + } + + if (resume_reg != NULL) + nir_metadata_preserve(impl, nir_metadata_none); + + return resume_reg != NULL; +} + +static bool +cf_node_contains_instr(nir_cf_node *node, nir_instr *instr) +{ + for (nir_cf_node *n = &instr->block->cf_node; n != NULL; n = n->parent) { + if (n == node) + return true; + } + + return false; +} + +static void +rewrite_phis_to_pred(nir_block *block, nir_block *pred) +{ + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_phi) + break; + + nir_phi_instr *phi = nir_instr_as_phi(instr); + + ASSERTED bool found = false; + nir_foreach_phi_src(phi_src, phi) { + if (phi_src->pred == pred) { + found = true; + assert(phi_src->src.is_ssa); + nir_ssa_def_rewrite_uses(&phi->dest.ssa, phi_src->src.ssa); + break; + } + } + assert(found); + } +} + +/** Flattens if ladders leading up to a resume + * + * Given a resume_instr, this function flattens any if ladders leading to the + * resume instruction and deletes any code that cannot be encountered on a + * direct path to the resume instruction. This way we get, for the most part, + * straight-line control-flow up to the resume instruction. + * + * While we do this flattening, we also move any code which is in the remat + * set up to the top of the function or to the top of the resume portion of + * the current loop. We don't worry about control-flow as we do this because + * phis will never be in the remat set (see can_remat_instr) and so nothing + * control-dependent will ever need to be re-materialized. It is possible + * that this algorithm will preserve too many instructions by moving them to + * the top but we leave that for DCE to clean up. Any code not in the remat + * set is deleted because it's either unused in the continuation or else + * unspilled from a previous continuation and the unspill code is after the + * resume instruction. + * + * If, for instance, we have something like this: + * + * // block 0 + * if (cond1) { + * // block 1 + * } else { + * // block 2 + * if (cond2) { + * // block 3 + * resume; + * if (cond3) { + * // block 4 + * } + * } else { + * // block 5 + * } + * } + * + * then we know, because we know the resume instruction had to be encoutered, + * that cond1 = false and cond2 = true and we lower as follows: + * + * // block 0 + * // block 2 + * // block 3 + * resume; + * if (cond3) { + * // block 4 + * } + * + * As you can see, the code in blocks 1 and 5 was removed because there is no + * path from the start of the shader to the resume instruction which execute + * blocks 1 or 5. Any remat code from blocks 0, 2, and 3 is preserved and + * moved to the top. 
If the resume instruction is inside a loop then we know + * a priori that it is of the form + * + * loop { + * if (resume) { + * // Contents containing resume_instr + * } else { + * // Second copy of contents + * } + * } + * + * In this case, we only descend into the first half of the loop. The second + * half is left alone as that portion is only ever executed after the resume + * instruction. + */ +static bool +flatten_resume_if_ladder(nir_function_impl *impl, + nir_instr *cursor, + struct exec_list *child_list, + bool child_list_contains_cursor, + nir_instr *resume_instr, + struct bitset *remat) +{ + nir_shader *shader = impl->function->shader; + nir_cf_list cf_list; + + /* If our child list contains the cursor instruction then we start out + * before the cursor instruction. We need to know this so that we can skip + * moving instructions which are already before the cursor. + */ + bool before_cursor = child_list_contains_cursor; + + nir_cf_node *resume_node = NULL; + foreach_list_typed_safe(nir_cf_node, child, node, child_list) { + switch (child->type) { + case nir_cf_node_block: { + nir_block *block = nir_cf_node_as_block(child); + nir_foreach_instr_safe(instr, block) { + if (instr == cursor) { + assert(nir_cf_node_is_first(&block->cf_node)); + assert(before_cursor); + before_cursor = false; + continue; + } + + if (instr == resume_instr) + goto found_resume; + + if (!before_cursor && can_remat_instr(instr, remat)) { + nir_instr_remove(instr); + nir_instr_insert(nir_before_instr(cursor), instr); + + nir_ssa_def *def = nir_instr_ssa_def(instr); + BITSET_SET(remat->set, def->index); + } + } + break; + } + + case nir_cf_node_if: { + assert(!before_cursor); + nir_if *_if = nir_cf_node_as_if(child); + if (flatten_resume_if_ladder(impl, cursor, &_if->then_list, + false, resume_instr, remat)) { + resume_node = child; + rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)), + nir_if_last_then_block(_if)); + goto found_resume; + } + + if (flatten_resume_if_ladder(impl, cursor, &_if->else_list, + false, resume_instr, remat)) { + resume_node = child; + rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)), + nir_if_last_else_block(_if)); + goto found_resume; + } + break; + } + + case nir_cf_node_loop: { + assert(!before_cursor); + nir_loop *loop = nir_cf_node_as_loop(child); + + if (cf_node_contains_instr(&loop->cf_node, resume_instr)) { + /* Thanks to our loop body duplication pass, every level of loop + * containing the resume instruction contains exactly three nodes: + * two blocks and an if. We don't want to lower away this if + * because it's the resume selection if. The resume half is + * always the then_list so that's what we want to flatten. + */ + nir_block *header = nir_loop_first_block(loop); + nir_if *_if = nir_cf_node_as_if(nir_cf_node_next(&header->cf_node)); + + /* We want to place anything re-materialized from inside the loop + * at the top of the resume half of the loop. 
+ */ + nir_instr *loop_cursor = + &nir_intrinsic_instr_create(shader, nir_intrinsic_nop)->instr; + nir_instr_insert(nir_before_cf_list(&_if->then_list), loop_cursor); + + ASSERTED bool found = + flatten_resume_if_ladder(impl, loop_cursor, &_if->then_list, + true, resume_instr, remat); + assert(found); + resume_node = child; + goto found_resume; + } else { + ASSERTED bool found = + flatten_resume_if_ladder(impl, cursor, &loop->body, + false, resume_instr, remat); + assert(!found); + } + break; + } + + case nir_cf_node_function: + unreachable("Unsupported CF node type"); + } + } + assert(!before_cursor); + + /* If we got here, we didn't find the resume node or instruction. */ + return false; + +found_resume: + /* If we got here then we found either the resume node or the resume + * instruction in this CF list. + */ + if (resume_node) { + /* If the resume instruction is buried in side one of our children CF + * nodes, resume_node now points to that child. + */ + if (resume_node->type == nir_cf_node_if) { + /* Thanks to the recursive call, all of the interesting contents of + * resume_node have been copied before the cursor. We just need to + * copy the stuff after resume_node. + */ + nir_cf_extract(&cf_list, nir_after_cf_node(resume_node), + nir_after_cf_list(child_list)); + } else { + /* The loop contains its own cursor and still has useful stuff in it. + * We want to move everything after and including the loop to before + * the cursor. + */ + assert(resume_node->type == nir_cf_node_loop); + nir_cf_extract(&cf_list, nir_before_cf_node(resume_node), + nir_after_cf_list(child_list)); + } + } else { + /* If we found the resume instruction in one of our blocks, grab + * everything after it in the entire list (not just the one block), and + * place it before the cursor instr. + */ + nir_cf_extract(&cf_list, nir_after_instr(resume_instr), + nir_after_cf_list(child_list)); + } + nir_cf_reinsert(&cf_list, nir_before_instr(cursor)); + + if (!resume_node) { + /* We want the resume to be the first "interesting" instruction */ + nir_instr_remove(resume_instr); + nir_instr_insert(nir_before_cf_list(&impl->body), resume_instr); + } + + /* We've copied everything interesting out of this CF list to before the + * cursor. Delete everything else. + */ + if (child_list_contains_cursor) { + nir_cf_extract(&cf_list, nir_after_instr(cursor), + nir_after_cf_list(child_list)); + } else { + nir_cf_list_extract(&cf_list, child_list); + } + nir_cf_delete(&cf_list); + + return true; +} + +static nir_instr * +lower_resume(nir_shader *shader, int call_idx) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + + nir_instr *resume_instr = find_resume_instr(impl, call_idx); + + if (duplicate_loop_bodies(impl, resume_instr)) { + nir_validate_shader(shader, "after duplicate_loop_bodies in " + "brw_nir_lower_shader_calls"); + /* If we duplicated the bodies of any loops, run regs_to_ssa to get rid + * of all those pesky registers we just added. + */ + NIR_PASS_V(shader, nir_lower_regs_to_ssa); + } + + /* Re-index nir_ssa_def::index. We don't care about actual liveness in + * this pass but, so we can use the same helpers as the spilling pass, we + * need to make sure that live_index is something sane. It's used + * constantly for determining if an SSA value has been added since the + * start of the pass. + */ + nir_index_ssa_defs(impl); + + void *mem_ctx = ralloc_context(shader); + + /* Used to track which things may have been assumed to be re-materialized + * by the spilling pass and which we shouldn't delete. 
+ */ + struct bitset remat = bitset_create(mem_ctx, impl->ssa_alloc); + + /* Create a nop instruction to use as a cursor as we extract and re-insert + * stuff into the CFG. + */ + nir_instr *cursor = + &nir_intrinsic_instr_create(shader, nir_intrinsic_nop)->instr; + nir_instr_insert(nir_before_cf_list(&impl->body), cursor); + + ASSERTED bool found = + flatten_resume_if_ladder(impl, cursor, &impl->body, + true, resume_instr, &remat); + assert(found); + + ralloc_free(mem_ctx); + + nir_validate_shader(shader, "after flatten_resume_if_ladder in " + "brw_nir_lower_shader_calls"); + + nir_metadata_preserve(impl, nir_metadata_none); + + return resume_instr; +} + +static void +replace_resume_with_halt(nir_shader *shader, nir_instr *keep) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_foreach_block_safe(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr == keep) + continue; + + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); + if (resume->intrinsic != nir_intrinsic_rt_resume) + continue; + + /* If this is some other resume, then we've kicked off a ray or + * bindless thread and we don't want to go any further in this + * shader. Insert a halt so that NIR will delete any instructions + * dominated by this call instruction including the scratch_load + * instructions we inserted. + */ + nir_cf_list cf_list; + nir_cf_extract(&cf_list, nir_after_instr(&resume->instr), + nir_after_block(block)); + nir_cf_delete(&cf_list); + b.cursor = nir_instr_remove(&resume->instr); + nir_jump(&b, nir_jump_halt); + break; + } + } +} + +/** Lower shader call instructions to split shaders. + * + * Shader calls can be split into an initial shader and a series of "resume" + * shaders. When the shader is first invoked, it is the initial shader which + * is executed. At any point in the initial shader or any one of the resume + * shaders, a shader call operation may be performed. The possible shader call + * operations are: + * + * - trace_ray + * - report_ray_intersection + * - execute_callable + * + * When a shader call operation is performed, we push all live values to the + * stack,call rt_trace_ray/rt_execute_callable and then kill the shader. Once + * the operation we invoked is complete, a callee shader will return execution + * to the respective resume shader. The resume shader pops the contents off + * the stack and picks up where the calling shader left off. + * + * Stack management is assumed to be done after this pass. Call + * instructions and their resumes get annotated with stack information that + * should be enough for the backend to implement proper stack management. + */ +bool +nir_lower_shader_calls(nir_shader *shader, + nir_address_format address_format, + unsigned stack_alignment, + nir_shader ***resume_shaders_out, + uint32_t *num_resume_shaders_out, + void *mem_ctx) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + + nir_builder b; + nir_builder_init(&b, impl); + + int num_calls = 0; + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr_is_shader_call(instr)) + num_calls++; + } + } + + if (num_calls == 0) { + nir_shader_preserve_all_metadata(shader); + *num_resume_shaders_out = 0; + return false; + } + + /* Some intrinsics not only can't be re-materialized but aren't preserved + * when moving to the continuation shader. 
We have to move them to the top + * to ensure they get spilled as needed. + */ + { + bool progress = false; + NIR_PASS(progress, shader, move_system_values_to_top); + if (progress) + NIR_PASS(progress, shader, nir_opt_cse); + } + + NIR_PASS_V(shader, spill_ssa_defs_and_lower_shader_calls, + num_calls, address_format, stack_alignment); + + /* Make N copies of our shader */ + nir_shader **resume_shaders = ralloc_array(mem_ctx, nir_shader *, num_calls); + for (unsigned i = 0; i < num_calls; i++) + resume_shaders[i] = nir_shader_clone(mem_ctx, shader); + + replace_resume_with_halt(shader, NULL); + for (unsigned i = 0; i < num_calls; i++) { + nir_instr *resume_instr = lower_resume(resume_shaders[i], i); + replace_resume_with_halt(resume_shaders[i], resume_instr); + } + + *resume_shaders_out = resume_shaders; + *num_resume_shaders_out = num_calls; + + return true; +} |
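
As an aside, the per-call scratch layout used by the spilling half of the pass is simple: each spilled value is placed at the running offset aligned to its component size, and the final offset becomes that call's stack_size. A standalone illustration of that accounting (the starting offset and the spilled values are made up, not taken from any real shader):

/* Illustration of the spill-slot packing done in
 * spill_ssa_defs_and_lower_shader_calls(); not part of the patch. */
#include <stdio.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

int main(void)
{
   unsigned offset = 20;                       /* pretend shader->scratch_size was 20 */

   /* A vec3 of 32-bit values live across the call. */
   unsigned comp_size = 32 / 8;
   offset = ALIGN(offset, comp_size);          /* -> 20 */
   printf("vec3 spill slot at byte %u\n", offset);
   offset += 3 * comp_size;                    /* -> 32 */

   /* One 64-bit pointer live across the call. */
   comp_size = 64 / 8;
   offset = ALIGN(offset, comp_size);          /* -> 32 */
   printf("u64 spill slot at byte %u\n", offset);
   offset += 1 * comp_size;                    /* -> 40 */

   printf("stack_size recorded on this call: %u\n", offset);
   return 0;
}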