Diffstat (limited to 'src/intel/compiler/brw_mesh.cpp')
-rw-r--r-- | src/intel/compiler/brw_mesh.cpp | 1606 |
1 files changed, 1606 insertions, 0 deletions
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp new file mode 100644 index 00000000000..ca8aa9a759c --- /dev/null +++ b/src/intel/compiler/brw_mesh.cpp @@ -0,0 +1,1606 @@ +/* + * Copyright © 2021 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <list> +#include <vector> +#include "brw_compiler.h" +#include "brw_fs.h" +#include "brw_nir.h" +#include "brw_private.h" +#include "compiler/nir/nir_builder.h" +#include "dev/intel_debug.h" + +#include <memory> + +using namespace brw; + +static bool +brw_nir_lower_load_uniforms_filter(const nir_instr *instr, + UNUSED const void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_load_uniform; +} + +static nir_def * +brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr, + UNUSED void *data) +{ + assert(instr->type == nir_instr_type_intrinsic); + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + assert(intrin->intrinsic == nir_intrinsic_load_uniform); + + /* Read the first few 32-bit scalars from InlineData. */ + if (nir_src_is_const(intrin->src[0]) && + intrin->def.bit_size == 32 && + intrin->def.num_components == 1) { + unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]); + unsigned off_dw = off / 4; + if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) { + off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW; + return nir_load_mesh_inline_data_intel(b, 32, off_dw); + } + } + + return brw_nir_load_global_const(b, intrin, + nir_load_mesh_inline_data_intel(b, 64, 0), 0); +} + +static bool +brw_nir_lower_load_uniforms(nir_shader *nir) +{ + return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter, + brw_nir_lower_load_uniforms_impl, NULL); +} + +static inline int +type_size_scalar_dwords(const struct glsl_type *type, bool bindless) +{ + return glsl_count_dword_slots(type, bindless); +} + +/* TODO(mesh): Make this a common function. */ +static void +shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) + ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size * (length == 3 ? 
4 : length); +} + +static bool +brw_nir_lower_launch_mesh_workgroups_instr(nir_builder *b, + nir_intrinsic_instr *intrin, + void *data) +{ + if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *local_invocation_index = nir_load_local_invocation_index(b); + + /* Make sure that the mesh workgroup size is taken from the first invocation + * (nir_intrinsic_launch_mesh_workgroups requirement) + */ + nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0); + nir_if *if_stmt = nir_push_if(b, cmp); + { + /* TUE header contains 4 words: + * + * - Word 0 for Task Count. + * + * - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a + * 3D dispatch into the 1D dispatch supported by HW. + */ + nir_def *x = nir_channel(b, intrin->src[0].ssa, 0); + nir_def *y = nir_channel(b, intrin->src[0].ssa, 1); + nir_def *z = nir_channel(b, intrin->src[0].ssa, 2); + nir_def *task_count = nir_imul(b, x, nir_imul(b, y, z)); + nir_def *tue_header = nir_vec4(b, task_count, x, y, z); + nir_store_task_payload(b, tue_header, nir_imm_int(b, 0)); + } + nir_pop_if(b, if_stmt); + + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +brw_nir_lower_launch_mesh_workgroups(nir_shader *nir) +{ + return nir_shader_intrinsics_pass(nir, + brw_nir_lower_launch_mesh_workgroups_instr, + nir_metadata_none, + NULL); +} + +static void +brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map) +{ + memset(map, 0, sizeof(*map)); + + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, + type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32); + + /* From bspec: "It is suggested that SW reserve the 16 bytes following the + * TUE Header, and therefore start the SW-defined data structure at 32B + * alignment. This allows the TUE Header to always be written as 32 bytes + * with 32B alignment, the most optimal write performance case." + */ + map->per_task_data_start_dw = 8; + + /* Lowering to explicit types will start offsets from task_payload_size, so + * set it to start after the header. + */ + nir->info.task_payload_size = map->per_task_data_start_dw * 4; + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, + nir_var_mem_task_payload, shared_type_info); + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_task_payload, nir_address_format_32bit_offset); + + map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8); +} + +static void +brw_print_tue_map(FILE *fp, const struct brw_tue_map *map) +{ + fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw); +} + +static bool +brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b, + nir_intrinsic_instr *intrin, + void *data) +{ + switch (intrin->intrinsic) { + case nir_intrinsic_store_task_payload: + case nir_intrinsic_load_task_payload: { + nir_src *offset_src = nir_get_io_offset_src(intrin); + + if (nir_src_is_const(*offset_src)) + assert(nir_src_as_uint(*offset_src) % 4 == 0); + + b->cursor = nir_before_instr(&intrin->instr); + + /* Regular I/O uses dwords while explicit I/O used for task payload uses + * bytes. Normalize it to dwords. + * + * TODO(mesh): Figure out how to handle 8-bit, 16-bit. 
+ */ + + nir_def *offset = nir_ishr_imm(b, offset_src->ssa, 2); + nir_src_rewrite(offset_src, offset); + + unsigned base = nir_intrinsic_base(intrin); + assert(base % 4 == 0); + nir_intrinsic_set_base(intrin, base / 4); + + return true; + } + + default: + return false; + } +} + +static bool +brw_nir_adjust_task_payload_offsets(nir_shader *nir) +{ + return nir_shader_intrinsics_pass(nir, + brw_nir_adjust_task_payload_offsets_instr, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} + +void +brw_nir_adjust_payload(nir_shader *shader) +{ + /* Adjustment of task payload offsets must be performed *after* last pass + * which interprets them as bytes, because it changes their unit. + */ + bool adjusted = false; + NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets); + if (adjusted) /* clean up the mess created by offset adjustments */ + NIR_PASS(_, shader, nir_opt_constant_folding); +} + +static bool +brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b, + nir_intrinsic_instr *intrin, + void *data) +{ + if (intrin->intrinsic != nir_intrinsic_launch_mesh_workgroups) + return false; + + /* nir_lower_task_shader uses "range" as task payload size. */ + unsigned range = nir_intrinsic_range(intrin); + /* This will avoid special case in nir_lower_task_shader dealing with + * not vec4-aligned payload when payload_in_shared workaround is enabled. + */ + nir_intrinsic_set_range(intrin, ALIGN(range, 16)); + + return true; +} + +static bool +brw_nir_align_launch_mesh_workgroups(nir_shader *nir) +{ + return nir_shader_intrinsics_pass(nir, + brw_nir_align_launch_mesh_workgroups_instr, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} + +const unsigned * +brw_compile_task(const struct brw_compiler *compiler, + struct brw_compile_task_params *params) +{ + struct nir_shader *nir = params->base.nir; + const struct brw_task_prog_key *key = params->key; + struct brw_task_prog_data *prog_data = params->prog_data; + const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TASK); + + brw_nir_lower_tue_outputs(nir, &prog_data->map); + + NIR_PASS(_, nir, brw_nir_align_launch_mesh_workgroups); + + nir_lower_task_shader_options lower_ts_opt = { + .payload_to_shared_for_atomics = true, + .payload_to_shared_for_small_types = true, + /* The actual payload data starts after the TUE header and padding, + * so skip those when copying. 
+ */ + .payload_offset_in_bytes = prog_data->map.per_task_data_start_dw * 4, + }; + NIR_PASS(_, nir, nir_lower_task_shader, lower_ts_opt); + + NIR_PASS(_, nir, brw_nir_lower_launch_mesh_workgroups); + + prog_data->base.base.stage = MESA_SHADER_TASK; + prog_data->base.base.total_shared = nir->info.shared_size; + prog_data->base.base.total_scratch = 0; + + prog_data->base.local_size[0] = nir->info.workgroup_size[0]; + prog_data->base.local_size[1] = nir->info.workgroup_size[1]; + prog_data->base.local_size[2] = nir->info.workgroup_size[2]; + + prog_data->uses_drawid = + BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID); + + brw_simd_selection_state simd_state{ + .devinfo = compiler->devinfo, + .prog_data = &prog_data->base, + .required_width = brw_required_dispatch_width(&nir->info), + }; + + std::unique_ptr<fs_visitor> v[3]; + + for (unsigned simd = 0; simd < 3; simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8 << simd; + + nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir); + brw_nir_apply_key(shader, compiler, &key->base, dispatch_width); + + NIR_PASS(_, shader, brw_nir_lower_load_uniforms); + NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); + + brw_postprocess_nir(shader, compiler, debug_enabled, + key->base.robust_flags); + + v[simd] = std::make_unique<fs_visitor>(compiler, ¶ms->base, + &key->base, + &prog_data->base.base, + shader, dispatch_width, + params->base.stats != NULL, + debug_enabled); + + if (prog_data->base.prog_mask) { + unsigned first = ffs(prog_data->base.prog_mask) - 1; + v[simd]->import_uniforms(v[first].get()); + } + + const bool allow_spilling = !brw_simd_any_compiled(simd_state); + if (v[simd]->run_task(allow_spilling)) + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); + else + simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); + } + + int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + params->base.error_str = + ralloc_asprintf(params->base.mem_ctx, + "Can't compile shader: " + "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n", + simd_state.error[0], simd_state.error[1], + simd_state.error[2]); + return NULL; + } + + fs_visitor *selected = v[selected_simd].get(); + prog_data->base.prog_mask = 1 << selected_simd; + + if (unlikely(debug_enabled)) { + fprintf(stderr, "Task Output "); + brw_print_tue_map(stderr, &prog_data->map); + } + + fs_generator g(compiler, ¶ms->base, &prog_data->base.base, + MESA_SHADER_TASK); + if (unlikely(debug_enabled)) { + g.enable_debug(ralloc_asprintf(params->base.mem_ctx, + "%s task shader %s", + nir->info.label ? nir->info.label + : "unnamed", + nir->info.name)); + } + + g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats, + selected->performance_analysis.require(), params->base.stats); + g.add_const_data(nir->constant_data, nir->constant_data_size); + return g.get_assembly(); +} + +static void +brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map) +{ + if (!map) + return; + + nir->info.task_payload_size = map->per_task_data_start_dw * 4; + + bool progress = false; + + NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types, + nir_var_mem_task_payload, shared_type_info); + + if (progress) { + /* The types for Task Output and Mesh Input should match, so their sizes + * should also match. 
+ */ + assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8)); + } else { + /* Mesh doesn't read any input, to make it clearer set the + * task_payload_size to zero instead of keeping an incomplete size that + * just includes the header. + */ + nir->info.task_payload_size = 0; + } + + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload, + nir_address_format_32bit_offset); +} + +/* Attribute types. Flat attributes have to be a separate class because + * flat and interpolated attributes can't share the same vec4 slot + * (see 3DSTATE_SBE.ConstantInterpolationEnable). + */ +enum { + PRIM, /* per primitive */ + VERT, /* per vertex interpolated */ + VERT_FLAT, /* per vertex flat */ +}; + +struct attr_desc { + int location; + const struct glsl_type *type; + unsigned dwords; + unsigned slots; +}; + +struct attr_type_info { + /* order of attributes, negative values are holes */ + std::list<struct attr_desc> *order; + + /* attributes after which there's hole of size equal to array index */ + std::list<int> holes[5]; +}; + +static void +brw_mue_assign_position(const struct attr_desc *attr, + struct brw_mue_map *map, + unsigned start_dw) +{ + bool is_array = glsl_type_is_array(attr->type); + int location = attr->location; + unsigned remaining = attr->dwords; + + for (unsigned slot = 0; slot < attr->slots; ++slot) { + map->start_dw[location + slot] = start_dw; + + unsigned sz; + + if (is_array) { + assert(attr->dwords % attr->slots == 0); + sz = attr->dwords / attr->slots; + } else { + sz = MIN2(remaining, 4); + } + + map->len_dw[location + slot] = sz; + start_dw += sz; + remaining -= sz; + } +} + +static nir_variable * +brw_nir_find_complete_variable_with_location(nir_shader *shader, + nir_variable_mode mode, + int location) +{ + nir_variable *best_var = NULL; + unsigned last_size = 0; + + nir_foreach_variable_with_modes(var, shader, mode) { + if (var->data.location != location) + continue; + + unsigned new_size = glsl_count_dword_slots(var->type, false); + if (new_size > last_size) { + best_var = var; + last_size = new_size; + } + } + + return best_var; +} + +static unsigned +brw_sum_size(const std::list<struct attr_desc> &orders) +{ + unsigned sz = 0; + for (auto it = orders.cbegin(); it != orders.cend(); ++it) + sz += (*it).dwords; + return sz; +} + +/* Finds order of outputs which require minimum size, without splitting + * of URB read/write messages (which operate on vec4-aligned memory). + */ +static void +brw_compute_mue_layout(const struct brw_compiler *compiler, + std::list<struct attr_desc> *orders, + uint64_t outputs_written, + struct nir_shader *nir, + bool *pack_prim_data_into_header, + bool *pack_vert_data_into_header) +{ + const struct shader_info *info = &nir->info; + + struct attr_type_info data[3]; + + if ((compiler->mesh.mue_header_packing & 1) == 0) + *pack_prim_data_into_header = false; + if ((compiler->mesh.mue_header_packing & 2) == 0) + *pack_vert_data_into_header = false; + + for (unsigned i = PRIM; i <= VERT_FLAT; ++i) + data[i].order = &orders[i]; + + /* If packing into header is enabled, add a hole of size 4 and add + * a virtual location to keep the algorithm happy (it expects holes + * to be preceded by some location). We'll remove those virtual + * locations at the end. 
+ */ + const gl_varying_slot virtual_header_location = VARYING_SLOT_POS; + assert((outputs_written & BITFIELD64_BIT(virtual_header_location)) == 0); + + struct attr_desc d; + d.location = virtual_header_location; + d.type = NULL; + d.dwords = 0; + d.slots = 0; + + struct attr_desc h; + h.location = -1; + h.type = NULL; + h.dwords = 4; + h.slots = 0; + + if (*pack_prim_data_into_header) { + orders[PRIM].push_back(d); + orders[PRIM].push_back(h); + data[PRIM].holes[4].push_back(virtual_header_location); + } + + if (*pack_vert_data_into_header) { + orders[VERT].push_back(d); + orders[VERT].push_back(h); + data[VERT].holes[4].push_back(virtual_header_location); + } + + u_foreach_bit64(location, outputs_written) { + if ((BITFIELD64_BIT(location) & outputs_written) == 0) + continue; + + /* At this point there are both complete and split variables as + * outputs. We need the complete variable to compute the required + * size. + */ + nir_variable *var = + brw_nir_find_complete_variable_with_location(nir, + nir_var_shader_out, + location); + + d.location = location; + d.type = brw_nir_get_var_type(nir, var); + d.dwords = glsl_count_dword_slots(d.type, false); + d.slots = glsl_count_attribute_slots(d.type, false); + + struct attr_type_info *type_data; + + if (BITFIELD64_BIT(location) & info->per_primitive_outputs) + type_data = &data[PRIM]; + else if (var->data.interpolation == INTERP_MODE_FLAT) + type_data = &data[VERT_FLAT]; + else + type_data = &data[VERT]; + + std::list<struct attr_desc> *order = type_data->order; + std::list<int> *holes = type_data->holes; + + outputs_written &= ~BITFIELD64_RANGE(location, d.slots); + + /* special case to use hole of size 4 */ + if (d.dwords == 4 && !holes[4].empty()) { + holes[4].pop_back(); + + assert(order->front().location == virtual_header_location); + order->pop_front(); + + assert(order->front().location == -1); + assert(order->front().dwords == 4); + order->front() = d; + + continue; + } + + int mod = d.dwords % 4; + if (mod == 0) { + order->push_back(d); + continue; + } + + h.location = -1; + h.type = NULL; + h.dwords = 4 - mod; + h.slots = 0; + + if (!compiler->mesh.mue_compaction) { + order->push_back(d); + order->push_back(h); + continue; + } + + if (d.dwords > 4) { + order->push_back(d); + order->push_back(h); + holes[h.dwords].push_back(location); + continue; + } + + assert(d.dwords < 4); + + unsigned found = 0; + /* try to find the smallest hole big enough to hold this attribute */ + for (unsigned sz = d.dwords; sz <= 4; sz++){ + if (!holes[sz].empty()) { + found = sz; + break; + } + } + + /* append at the end if not found */ + if (found == 0) { + order->push_back(d); + order->push_back(h); + holes[h.dwords].push_back(location); + + continue; + } + + assert(found <= 4); + assert(!holes[found].empty()); + int after_loc = holes[found].back(); + holes[found].pop_back(); + + bool inserted_back = false; + + for (auto it = order->begin(); it != order->end(); ++it) { + if ((*it).location != after_loc) + continue; + + ++it; + /* must be a hole */ + assert((*it).location < 0); + /* and it must be big enough */ + assert(d.dwords <= (*it).dwords); + + if (d.dwords == (*it).dwords) { + /* exact size, just replace */ + *it = d; + } else { + /* inexact size, shrink hole */ + (*it).dwords -= d.dwords; + /* and insert new attribute before it */ + order->insert(it, d); + + /* Insert shrunk hole in a spot so that the order of attributes + * is preserved. 
+ */ + std::list<int> &hole_list = holes[(*it).dwords]; + std::list<int>::iterator insert_before = hole_list.end(); + + for (auto it2 = hole_list.begin(); it2 != hole_list.end(); ++it2) { + if ((*it2) >= (int)location) { + insert_before = it2; + break; + } + } + + hole_list.insert(insert_before, location); + } + + inserted_back = true; + break; + } + + assert(inserted_back); + } + + if (*pack_prim_data_into_header) { + if (orders[PRIM].front().location == virtual_header_location) + orders[PRIM].pop_front(); + + if (!data[PRIM].holes[4].empty()) { + *pack_prim_data_into_header = false; + + assert(orders[PRIM].front().location == -1); + assert(orders[PRIM].front().dwords == 4); + orders[PRIM].pop_front(); + } + + if (*pack_prim_data_into_header) { + unsigned sz = brw_sum_size(orders[PRIM]); + + if (sz % 8 == 0 || sz % 8 > 4) + *pack_prim_data_into_header = false; + } + } + + if (*pack_vert_data_into_header) { + if (orders[VERT].front().location == virtual_header_location) + orders[VERT].pop_front(); + + if (!data[VERT].holes[4].empty()) { + *pack_vert_data_into_header = false; + + assert(orders[VERT].front().location == -1); + assert(orders[VERT].front().dwords == 4); + orders[VERT].pop_front(); + } + + if (*pack_vert_data_into_header) { + unsigned sz = brw_sum_size(orders[VERT]) + + brw_sum_size(orders[VERT_FLAT]); + + if (sz % 8 == 0 || sz % 8 > 4) + *pack_vert_data_into_header = false; + } + } + + + if (INTEL_DEBUG(DEBUG_MESH)) { + fprintf(stderr, "MUE attribute order:\n"); + for (unsigned i = PRIM; i <= VERT_FLAT; ++i) { + if (!orders[i].empty()) + fprintf(stderr, "%d: ", i); + for (auto it = orders[i].cbegin(); it != orders[i].cend(); ++it) { + fprintf(stderr, "%d(%d) ", (*it).location, (*it).dwords); + } + if (!orders[i].empty()) + fprintf(stderr, "\n"); + } + } +} + +/* Mesh URB Entry consists of an initial section + * + * - Primitive Count + * - Primitive Indices (from 0 to Max-1) + * - Padding to 32B if needed + * + * optionally followed by a section for per-primitive data, + * in which each primitive (from 0 to Max-1) gets + * + * - Primitive Header (e.g. ViewportIndex) + * - Primitive Custom Attributes + * + * then followed by a section for per-vertex data + * + * - Vertex Header (e.g. Position) + * - Vertex Custom Attributes + * + * Each per-element section has a pitch and a starting offset. All the + * individual attributes offsets in start_dw are considering the first entry + * of the section (i.e. where the Position for first vertex, or ViewportIndex + * for first primitive). Attributes for other elements are calculated using + * the pitch. + */ +static void +brw_compute_mue_map(const struct brw_compiler *compiler, + struct nir_shader *nir, struct brw_mue_map *map, + enum brw_mesh_index_format index_format, bool compact_mue) +{ + memset(map, 0, sizeof(*map)); + + memset(&map->start_dw[0], -1, sizeof(map->start_dw)); + memset(&map->len_dw[0], 0, sizeof(map->len_dw)); + + unsigned vertices_per_primitive = + mesa_vertices_per_prim(nir->info.mesh.primitive_type); + + map->max_primitives = nir->info.mesh.max_primitives_out; + map->max_vertices = nir->info.mesh.max_vertices_out; + + uint64_t outputs_written = nir->info.outputs_written; + + /* One dword for primitives count then K extra dwords for each primitive. 
*/ + switch (index_format) { + case BRW_INDEX_FORMAT_U32: + map->per_primitive_indices_dw = vertices_per_primitive; + break; + case BRW_INDEX_FORMAT_U888X: + map->per_primitive_indices_dw = 1; + break; + default: + unreachable("invalid index format"); + } + + map->per_primitive_start_dw = ALIGN(map->per_primitive_indices_dw * + map->max_primitives + 1, 8); + + /* Assign initial section. */ + if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) { + map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0; + map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 1; + outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT); + } + if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) { + map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1; + map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] = + map->per_primitive_indices_dw * map->max_primitives; + outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES); + } + + const uint64_t per_primitive_header_bits = + BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) | + BITFIELD64_BIT(VARYING_SLOT_LAYER) | + BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) | + BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE); + + const uint64_t per_vertex_header_bits = + BITFIELD64_BIT(VARYING_SLOT_PSIZ) | + BITFIELD64_BIT(VARYING_SLOT_POS) | + BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) | + BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); + + std::list<struct attr_desc> orders[3]; + uint64_t regular_outputs = outputs_written & + ~(per_primitive_header_bits | per_vertex_header_bits); + + /* packing into prim header is possible only if prim header is present */ + map->user_data_in_primitive_header = compact_mue && + (outputs_written & per_primitive_header_bits) != 0; + + /* Packing into vert header is always possible, but we allow it only + * if full vec4 is available (so point size is not used) and there's + * nothing between it and normal vertex data (so no clip distances). + */ + map->user_data_in_vertex_header = compact_mue && + (outputs_written & per_vertex_header_bits) == + BITFIELD64_BIT(VARYING_SLOT_POS); + + if (outputs_written & per_primitive_header_bits) { + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) { + map->start_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = + map->per_primitive_start_dw + 0; + map->len_dw[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 1; + } + + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER)) { + map->start_dw[VARYING_SLOT_LAYER] = + map->per_primitive_start_dw + 1; /* RTAIndex */ + map->len_dw[VARYING_SLOT_LAYER] = 1; + } + + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)) { + map->start_dw[VARYING_SLOT_VIEWPORT] = + map->per_primitive_start_dw + 2; + map->len_dw[VARYING_SLOT_VIEWPORT] = 1; + } + + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE)) { + map->start_dw[VARYING_SLOT_CULL_PRIMITIVE] = + map->per_primitive_start_dw + 3; + map->len_dw[VARYING_SLOT_CULL_PRIMITIVE] = 1; + } + + map->per_primitive_header_size_dw = 8; + outputs_written &= ~per_primitive_header_bits; + } else { + map->per_primitive_header_size_dw = 0; + } + + map->per_primitive_data_size_dw = 0; + + /* For fast linked libraries, we can't pack the MUE, as the fragment shader + * will be compiled without access to the MUE map and won't be able to find + * out where everything is. + * Instead, keep doing things as we did before the packing, just laying out + * everything in varying order, which is how the FS will expect them. 
+ */ + if (compact_mue) { + brw_compute_mue_layout(compiler, orders, regular_outputs, nir, + &map->user_data_in_primitive_header, + &map->user_data_in_vertex_header); + + unsigned start_dw = map->per_primitive_start_dw; + if (map->user_data_in_primitive_header) + start_dw += 4; /* first 4 dwords are used */ + else + start_dw += map->per_primitive_header_size_dw; + unsigned header_used_dw = 0; + + for (auto it = orders[PRIM].cbegin(); it != orders[PRIM].cend(); ++it) { + int location = (*it).location; + if (location < 0) { + start_dw += (*it).dwords; + if (map->user_data_in_primitive_header && header_used_dw < 4) + header_used_dw += (*it).dwords; + else + map->per_primitive_data_size_dw += (*it).dwords; + assert(header_used_dw <= 4); + continue; + } + + assert(map->start_dw[location] == -1); + + assert(location == VARYING_SLOT_PRIMITIVE_ID || + location >= VARYING_SLOT_VAR0); + + brw_mue_assign_position(&*it, map, start_dw); + + start_dw += (*it).dwords; + if (map->user_data_in_primitive_header && header_used_dw < 4) + header_used_dw += (*it).dwords; + else + map->per_primitive_data_size_dw += (*it).dwords; + assert(header_used_dw <= 4); + outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots); + } + } else { + unsigned start_dw = map->per_primitive_start_dw + + map->per_primitive_header_size_dw; + + uint64_t per_prim_outputs = outputs_written & nir->info.per_primitive_outputs; + while (per_prim_outputs) { + uint64_t location = ffsll(per_prim_outputs) - 1; + + assert(map->start_dw[location] == -1); + assert(location == VARYING_SLOT_PRIMITIVE_ID || + location >= VARYING_SLOT_VAR0); + + nir_variable *var = + brw_nir_find_complete_variable_with_location(nir, + nir_var_shader_out, + location); + struct attr_desc d; + d.location = location; + d.type = brw_nir_get_var_type(nir, var); + d.dwords = glsl_count_dword_slots(d.type, false); + d.slots = glsl_count_attribute_slots(d.type, false); + + brw_mue_assign_position(&d, map, start_dw); + + map->per_primitive_data_size_dw += ALIGN(d.dwords, 4); + start_dw += ALIGN(d.dwords, 4); + + per_prim_outputs &= ~BITFIELD64_RANGE(location, d.slots); + } + } + + map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw + + map->per_primitive_data_size_dw, 8); + + map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw + + map->per_primitive_pitch_dw * + map->max_primitives, 8); + + /* TODO(mesh): Multiview. 
*/ + unsigned fixed_header_size = 8; + map->per_vertex_header_size_dw = ALIGN(fixed_header_size + + nir->info.clip_distance_array_size + + nir->info.cull_distance_array_size, 8); + + if (outputs_written & per_vertex_header_bits) { + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_PSIZ)) { + map->start_dw[VARYING_SLOT_PSIZ] = map->per_vertex_start_dw + 3; + map->len_dw[VARYING_SLOT_PSIZ] = 1; + } + + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_POS)) { + map->start_dw[VARYING_SLOT_POS] = map->per_vertex_start_dw + 4; + map->len_dw[VARYING_SLOT_POS] = 4; + } + + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) { + map->start_dw[VARYING_SLOT_CLIP_DIST0] = + map->per_vertex_start_dw + fixed_header_size + 0; + map->len_dw[VARYING_SLOT_CLIP_DIST0] = 4; + } + + if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) { + map->start_dw[VARYING_SLOT_CLIP_DIST1] = + map->per_vertex_start_dw + fixed_header_size + 4; + map->len_dw[VARYING_SLOT_CLIP_DIST1] = 4; + } + + outputs_written &= ~per_vertex_header_bits; + } + + /* cull distances should be lowered earlier */ + assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST0))); + assert(!(outputs_written & BITFIELD64_BIT(VARYING_SLOT_CULL_DIST1))); + + map->per_vertex_data_size_dw = 0; + + /* For fast linked libraries, we can't pack the MUE, as the fragment shader + * will be compiled without access to the MUE map and won't be able to find + * out where everything is. + * Instead, keep doing things as we did before the packing, just laying out + * everything in varying order, which is how the FS will expect them. + */ + if (compact_mue) { + unsigned start_dw = map->per_vertex_start_dw; + if (!map->user_data_in_vertex_header) + start_dw += map->per_vertex_header_size_dw; + + unsigned header_used_dw = 0; + for (unsigned type = VERT; type <= VERT_FLAT; ++type) { + for (auto it = orders[type].cbegin(); it != orders[type].cend(); ++it) { + int location = (*it).location; + if (location < 0) { + start_dw += (*it).dwords; + if (map->user_data_in_vertex_header && header_used_dw < 4) { + header_used_dw += (*it).dwords; + assert(header_used_dw <= 4); + if (header_used_dw == 4) + start_dw += 4; /* jump over gl_position */ + } else { + map->per_vertex_data_size_dw += (*it).dwords; + } + continue; + } + + assert(map->start_dw[location] == -1); + + assert(location >= VARYING_SLOT_VAR0); + + brw_mue_assign_position(&*it, map, start_dw); + + start_dw += (*it).dwords; + if (map->user_data_in_vertex_header && header_used_dw < 4) { + header_used_dw += (*it).dwords; + assert(header_used_dw <= 4); + if (header_used_dw == 4) + start_dw += 4; /* jump over gl_position */ + } else { + map->per_vertex_data_size_dw += (*it).dwords; + } + outputs_written &= ~BITFIELD64_RANGE(location, (*it).slots); + } + } + } else { + unsigned start_dw = map->per_vertex_start_dw + + map->per_vertex_header_size_dw; + + uint64_t per_vertex_outputs = outputs_written & ~nir->info.per_primitive_outputs; + while (per_vertex_outputs) { + uint64_t location = ffsll(per_vertex_outputs) - 1; + + assert(map->start_dw[location] == -1); + assert(location >= VARYING_SLOT_VAR0); + + nir_variable *var = + brw_nir_find_complete_variable_with_location(nir, + nir_var_shader_out, + location); + struct attr_desc d; + d.location = location; + d.type = brw_nir_get_var_type(nir, var); + d.dwords = glsl_count_dword_slots(d.type, false); + d.slots = glsl_count_attribute_slots(d.type, false); + + brw_mue_assign_position(&d, map, start_dw); + + map->per_vertex_data_size_dw += 
ALIGN(d.dwords, 4); + start_dw += ALIGN(d.dwords, 4); + + per_vertex_outputs &= ~BITFIELD64_RANGE(location, d.slots); + } + } + + map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw + + map->per_vertex_data_size_dw, 8); + + map->size_dw = + map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices; + + assert(map->size_dw % 8 == 0); +} + +static void +brw_print_mue_map(FILE *fp, const struct brw_mue_map *map, struct nir_shader *nir) +{ + fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n", + map->size_dw, map->max_primitives, map->max_vertices); + fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_COUNT\n", + map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT], + map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] + + map->len_dw[VARYING_SLOT_PRIMITIVE_COUNT] - 1); + fprintf(fp, " <%4d, %4d>: VARYING_SLOT_PRIMITIVE_INDICES\n", + map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES], + map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] + + map->len_dw[VARYING_SLOT_PRIMITIVE_INDICES] - 1); + + fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n", + map->per_primitive_start_dw, + map->per_primitive_header_size_dw, + map->per_primitive_data_size_dw, + map->per_primitive_pitch_dw); + + for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { + if (map->start_dw[i] < 0) + continue; + + const unsigned offset = map->start_dw[i]; + const unsigned len = map->len_dw[i]; + + if (offset < map->per_primitive_start_dw || + offset >= map->per_primitive_start_dw + map->per_primitive_pitch_dw) + continue; + + const char *name = + gl_varying_slot_name_for_stage((gl_varying_slot)i, + MESA_SHADER_MESH); + + fprintf(fp, " <%4d, %4d>: %s (%d)\n", offset, offset + len - 1, + name, i); + } + + fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n", + map->per_vertex_start_dw, + map->per_vertex_header_size_dw, + map->per_vertex_data_size_dw, + map->per_vertex_pitch_dw); + + for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { + if (map->start_dw[i] < 0) + continue; + + const unsigned offset = map->start_dw[i]; + const unsigned len = map->len_dw[i]; + + if (offset < map->per_vertex_start_dw || + offset >= map->per_vertex_start_dw + map->per_vertex_pitch_dw) + continue; + + nir_variable *var = + nir_find_variable_with_location(nir, nir_var_shader_out, i); + bool flat = var->data.interpolation == INTERP_MODE_FLAT; + + const char *name = + gl_varying_slot_name_for_stage((gl_varying_slot)i, + MESA_SHADER_MESH); + + fprintf(fp, " <%4d, %4d>: %s (%d)%s\n", offset, offset + len - 1, + name, i, flat ? " (flat)" : ""); + } + + fprintf(fp, "\n"); +} + +static void +brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map) +{ + nir_foreach_shader_out_variable(var, nir) { + int location = var->data.location; + assert(location >= 0); + assert(map->start_dw[location] != -1); + var->data.driver_location = map->start_dw[location]; + } + + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, + type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32); +} + +static void +brw_nir_initialize_mue(nir_shader *nir, + const struct brw_mue_map *map, + unsigned dispatch_width) +{ + assert(map->per_primitive_header_size_dw > 0); + + nir_builder b; + nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); + b = nir_builder_at(nir_before_impl(entrypoint)); + + nir_def *dw_off = nir_imm_int(&b, 0); + nir_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0); + + /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? 
*/ + + assert(!nir->info.workgroup_size_variable); + const unsigned workgroup_size = nir->info.workgroup_size[0] * + nir->info.workgroup_size[1] * + nir->info.workgroup_size[2]; + + /* Invocations from a single workgroup will cooperate in zeroing MUE. */ + + /* How many prims each invocation needs to cover without checking its index? */ + unsigned prims_per_inv = map->max_primitives / workgroup_size; + + /* Zero first 4 dwords of MUE Primitive Header: + * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask. + */ + + nir_def *local_invocation_index = nir_load_local_invocation_index(&b); + + /* Zero primitive headers distanced by workgroup_size, starting from + * invocation index. + */ + for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) { + nir_def *prim = nir_iadd_imm(&b, local_invocation_index, + prim_in_inv * workgroup_size); + + nir_store_per_primitive_output(&b, zerovec, prim, dw_off, + .base = (int)map->per_primitive_start_dw, + .write_mask = WRITEMASK_XYZW, + .component = 0, + .src_type = nir_type_uint32); + } + + /* How many prims are left? */ + unsigned remaining = map->max_primitives % workgroup_size; + + if (remaining) { + /* Zero "remaining" primitive headers starting from the last one covered + * by the loop above + workgroup_size. + */ + nir_def *cmp = nir_ilt_imm(&b, local_invocation_index, remaining); + nir_if *if_stmt = nir_push_if(&b, cmp); + { + nir_def *prim = nir_iadd_imm(&b, local_invocation_index, + prims_per_inv * workgroup_size); + + nir_store_per_primitive_output(&b, zerovec, prim, dw_off, + .base = (int)map->per_primitive_start_dw, + .write_mask = WRITEMASK_XYZW, + .component = 0, + .src_type = nir_type_uint32); + } + nir_pop_if(&b, if_stmt); + } + + /* If there's more than one subgroup, then we need to wait for all of them + * to finish initialization before we can proceed. Otherwise some subgroups + * may start filling MUE before other finished initializing. + */ + if (workgroup_size > dispatch_width) { + nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, + NIR_MEMORY_ACQ_REL, nir_var_shader_out); + } + + if (remaining) { + nir_metadata_preserve(entrypoint, nir_metadata_none); + } else { + nir_metadata_preserve(entrypoint, nir_metadata_block_index | + nir_metadata_dominance); + } +} + +static void +brw_nir_adjust_offset(nir_builder *b, nir_intrinsic_instr *intrin, uint32_t pitch) +{ + nir_src *index_src = nir_get_io_arrayed_index_src(intrin); + nir_src *offset_src = nir_get_io_offset_src(intrin); + + b->cursor = nir_before_instr(&intrin->instr); + nir_def *offset = + nir_iadd(b, + offset_src->ssa, + nir_imul_imm(b, index_src->ssa, pitch)); + nir_src_rewrite(offset_src, offset); +} + +static bool +brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b, + nir_intrinsic_instr *intrin, + void *data) +{ + const struct brw_mue_map *map = (const struct brw_mue_map *) data; + + /* Remap per_vertex and per_primitive offsets using the extra source and + * the pitch. 
+ */ + switch (intrin->intrinsic) { + case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_store_per_vertex_output: + brw_nir_adjust_offset(b, intrin, map->per_vertex_pitch_dw); + + return true; + + case nir_intrinsic_load_per_primitive_output: + case nir_intrinsic_store_per_primitive_output: { + struct nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); + uint32_t pitch; + if (sem.location == VARYING_SLOT_PRIMITIVE_INDICES) + pitch = map->per_primitive_indices_dw; + else + pitch = map->per_primitive_pitch_dw; + + brw_nir_adjust_offset(b, intrin, pitch); + + return true; + } + + default: + return false; + } +} + +static bool +brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map) +{ + return nir_shader_intrinsics_pass(nir, + brw_nir_adjust_offset_for_arrayed_indices_instr, + nir_metadata_block_index | + nir_metadata_dominance, + (void *)map); +} + +struct index_packing_state { + unsigned vertices_per_primitive; + nir_variable *original_prim_indices; + nir_variable *packed_prim_indices; +}; + +static bool +brw_can_pack_primitive_indices(nir_shader *nir, struct index_packing_state *state) +{ + /* can single index fit into one byte of U888X format? */ + if (nir->info.mesh.max_vertices_out > 255) + return false; + + state->vertices_per_primitive = + mesa_vertices_per_prim(nir->info.mesh.primitive_type); + /* packing point indices doesn't help */ + if (state->vertices_per_primitive == 1) + return false; + + state->original_prim_indices = + nir_find_variable_with_location(nir, + nir_var_shader_out, + VARYING_SLOT_PRIMITIVE_INDICES); + /* no indices = no changes to the shader, but it's still worth it, + * because less URB space will be used + */ + if (!state->original_prim_indices) + return true; + + ASSERTED const struct glsl_type *type = state->original_prim_indices->type; + assert(glsl_type_is_array(type)); + assert(glsl_type_is_vector(glsl_without_array(type))); + assert(glsl_without_array(type)->vector_elements == state->vertices_per_primitive); + + nir_foreach_function_impl(impl, nir) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic != nir_intrinsic_store_deref) { + /* any unknown deref operation on primitive indices -> don't pack */ + unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs; + for (unsigned i = 0; i < num_srcs; i++) { + nir_deref_instr *deref = nir_src_as_deref(intrin->src[i]); + if (!deref) + continue; + nir_variable *var = nir_deref_instr_get_variable(deref); + + if (var == state->original_prim_indices) + return false; + } + + continue; + } + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (!deref) + continue; + + nir_variable *var = nir_deref_instr_get_variable(deref); + if (var != state->original_prim_indices) + continue; + + if (deref->deref_type != nir_deref_type_array) + return false; /* unknown chain of derefs */ + + nir_deref_instr *var_deref = nir_src_as_deref(deref->parent); + if (!var_deref || var_deref->deref_type != nir_deref_type_var) + return false; /* unknown chain of derefs */ + + assert (var_deref->var == state->original_prim_indices); + + unsigned write_mask = nir_intrinsic_write_mask(intrin); + + /* If only some components are written, then we can't easily pack. 
+ * In theory we could, by loading current dword value, bitmasking + * one byte and storing back the whole dword, but it would be slow + * and could actually decrease performance. TODO: reevaluate this + * once there will be something hitting this. + */ + if (write_mask != BITFIELD_MASK(state->vertices_per_primitive)) + return false; + } + } + } + + return true; +} + +static bool +brw_pack_primitive_indices_instr(nir_builder *b, nir_intrinsic_instr *intrin, + void *data) +{ + if (intrin->intrinsic != nir_intrinsic_store_deref) + return false; + + nir_deref_instr *array_deref = nir_src_as_deref(intrin->src[0]); + if (!array_deref || array_deref->deref_type != nir_deref_type_array) + return false; + + nir_deref_instr *var_deref = nir_src_as_deref(array_deref->parent); + if (!var_deref || var_deref->deref_type != nir_deref_type_var) + return false; + + struct index_packing_state *state = + (struct index_packing_state *)data; + + nir_variable *var = var_deref->var; + + if (var != state->original_prim_indices) + return false; + + unsigned vertices_per_primitive = state->vertices_per_primitive; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_deref_instr *new_var_deref = + nir_build_deref_var(b, state->packed_prim_indices); + nir_deref_instr *new_array_deref = + nir_build_deref_array(b, new_var_deref, array_deref->arr.index.ssa); + + nir_src *data_src = &intrin->src[1]; + nir_def *data_def = + data_src->ssa; + + nir_def *new_data = + nir_ior(b, nir_ishl_imm(b, nir_channel(b, data_def, 0), 0), + nir_ishl_imm(b, nir_channel(b, data_def, 1), 8)); + + if (vertices_per_primitive >= 3) { + new_data = + nir_ior(b, new_data, + nir_ishl_imm(b, nir_channel(b, data_def, 2), 16)); + } + + nir_build_store_deref(b, &new_array_deref->def, new_data); + + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +brw_pack_primitive_indices(nir_shader *nir, void *data) +{ + struct index_packing_state *state = (struct index_packing_state *)data; + + const struct glsl_type *new_type = + glsl_array_type(glsl_uint_type(), + nir->info.mesh.max_primitives_out, + 0); + + state->packed_prim_indices = + nir_variable_create(nir, nir_var_shader_out, + new_type, "gl_PrimitiveIndicesPacked"); + state->packed_prim_indices->data.location = VARYING_SLOT_PRIMITIVE_INDICES; + state->packed_prim_indices->data.interpolation = INTERP_MODE_NONE; + state->packed_prim_indices->data.per_primitive = 1; + + return nir_shader_intrinsics_pass(nir, brw_pack_primitive_indices_instr, + nir_metadata_block_index | + nir_metadata_dominance, + data); +} + +const unsigned * +brw_compile_mesh(const struct brw_compiler *compiler, + struct brw_compile_mesh_params *params) +{ + struct nir_shader *nir = params->base.nir; + const struct brw_mesh_prog_key *key = params->key; + struct brw_mesh_prog_data *prog_data = params->prog_data; + const bool debug_enabled = brw_should_print_shader(nir, DEBUG_MESH); + + prog_data->base.base.stage = MESA_SHADER_MESH; + prog_data->base.base.total_shared = nir->info.shared_size; + prog_data->base.base.total_scratch = 0; + + prog_data->base.local_size[0] = nir->info.workgroup_size[0]; + prog_data->base.local_size[1] = nir->info.workgroup_size[1]; + prog_data->base.local_size[2] = nir->info.workgroup_size[2]; + + prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1; + prog_data->cull_distance_mask = + ((1 << nir->info.cull_distance_array_size) - 1) << + nir->info.clip_distance_array_size; + prog_data->primitive_type = nir->info.mesh.primitive_type; + + struct index_packing_state 
index_packing_state = {}; + if (brw_can_pack_primitive_indices(nir, &index_packing_state)) { + if (index_packing_state.original_prim_indices) + NIR_PASS(_, nir, brw_pack_primitive_indices, &index_packing_state); + prog_data->index_format = BRW_INDEX_FORMAT_U888X; + } else { + prog_data->index_format = BRW_INDEX_FORMAT_U32; + } + + prog_data->uses_drawid = + BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID); + + brw_nir_lower_tue_inputs(nir, params->tue_map); + + brw_compute_mue_map(compiler, nir, &prog_data->map, + prog_data->index_format, key->compact_mue); + brw_nir_lower_mue_outputs(nir, &prog_data->map); + + brw_simd_selection_state simd_state{ + .devinfo = compiler->devinfo, + .prog_data = &prog_data->base, + .required_width = brw_required_dispatch_width(&nir->info), + }; + + std::unique_ptr<fs_visitor> v[3]; + + for (int simd = 0; simd < 3; simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8 << simd; + + nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir); + + /* + * When Primitive Header is enabled, we may not generates writes to all + * fields, so let's initialize everything. + */ + if (prog_data->map.per_primitive_header_size_dw > 0) + NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width); + + brw_nir_apply_key(shader, compiler, &key->base, dispatch_width); + + NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map); + /* Load uniforms can do a better job for constants, so fold before it. */ + NIR_PASS(_, shader, nir_opt_constant_folding); + NIR_PASS(_, shader, brw_nir_lower_load_uniforms); + + NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); + + brw_postprocess_nir(shader, compiler, debug_enabled, + key->base.robust_flags); + + v[simd] = std::make_unique<fs_visitor>(compiler, ¶ms->base, + &key->base, + &prog_data->base.base, + shader, dispatch_width, + params->base.stats != NULL, + debug_enabled); + + if (prog_data->base.prog_mask) { + unsigned first = ffs(prog_data->base.prog_mask) - 1; + v[simd]->import_uniforms(v[first].get()); + } + + const bool allow_spilling = !brw_simd_any_compiled(simd_state); + if (v[simd]->run_mesh(allow_spilling)) + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); + else + simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg); + } + + int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + params->base.error_str = + ralloc_asprintf(params->base.mem_ctx, + "Can't compile shader: " + "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n", + simd_state.error[0], simd_state.error[1], + simd_state.error[2]); + return NULL; + } + + fs_visitor *selected = v[selected_simd].get(); + prog_data->base.prog_mask = 1 << selected_simd; + + if (unlikely(debug_enabled)) { + if (params->tue_map) { + fprintf(stderr, "Mesh Input "); + brw_print_tue_map(stderr, params->tue_map); + } + fprintf(stderr, "Mesh Output "); + brw_print_mue_map(stderr, &prog_data->map, nir); + } + + fs_generator g(compiler, ¶ms->base, &prog_data->base.base, + MESA_SHADER_MESH); + if (unlikely(debug_enabled)) { + g.enable_debug(ralloc_asprintf(params->base.mem_ctx, + "%s mesh shader %s", + nir->info.label ? 
nir->info.label + : "unnamed", + nir->info.name)); + } + + g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats, + selected->performance_analysis.require(), params->base.stats); + g.add_const_data(nir->constant_data, nir->constant_data_size); + return g.get_assembly(); +}
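For orientation when reading the diff above, here is a minimal illustrative sketch (not part of the commit) of the task-to-mesh payload (TUE) layout that brw_nir_lower_tue_outputs and the launch_mesh_workgroups lowering agree on: four header dwords written by the launch intrinsic, a 16-byte reserved pad suggested by the bspec, and SW-defined per-task data starting at dword 8. The struct and its name are hypothetical; the actual code only records the offsets.

#include <cstdint>

/* Hypothetical layout struct; the real code records per_task_data_start_dw = 8
 * in brw_tue_map and writes the header dwords through NIR store_task_payload
 * intrinsics rather than through a C struct. */
struct tue_layout_sketch {
   uint32_t task_count;       /* dword 0: x * y * z mesh workgroups to launch */
   uint32_t dispatch_dim[3];  /* dwords 1-3: 3D dispatch mapped onto the HW 1D dispatch */
   uint32_t reserved[4];      /* dwords 4-7: bspec-suggested pad, keeps user data 32B-aligned */
   /* SW-defined per-task payload follows, i.e. at dword 8
    * (brw_tue_map::per_task_data_start_dw). */
};

static_assert(sizeof(tue_layout_sketch) == 8 * sizeof(uint32_t),
              "per-task user data starts at dword 8 / byte 32");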