Diffstat (limited to 'src/panfrost/util')
-rw-r--r--  src/panfrost/util/lcra.c                        | 287
-rw-r--r--  src/panfrost/util/lcra.h                        | 119
-rw-r--r--  src/panfrost/util/meson.build                   |  10
-rw-r--r--  src/panfrost/util/nir_mod_helpers.c             | 128
-rw-r--r--  src/panfrost/util/pan_collect_varyings.c        | 190
-rw-r--r--  src/panfrost/util/pan_ir.c                      | 180
-rw-r--r--  src/panfrost/util/pan_ir.h                      | 564
-rw-r--r--  src/panfrost/util/pan_liveness.c                | 144
-rw-r--r--  src/panfrost/util/pan_lower_64bit_intrin.c      |  78
-rw-r--r--  src/panfrost/util/pan_lower_framebuffer.c       | 862
-rw-r--r--  src/panfrost/util/pan_lower_framebuffer.h       |  16
-rw-r--r--  src/panfrost/util/pan_lower_helper_invocation.c |  30
-rw-r--r--  src/panfrost/util/pan_lower_image_index.c       |  55
-rw-r--r--  src/panfrost/util/pan_lower_image_ms.c          |  72
-rw-r--r--  src/panfrost/util/pan_lower_sample_position.c   |  51
-rw-r--r--  src/panfrost/util/pan_lower_store_component.c   |  98
-rw-r--r--  src/panfrost/util/pan_lower_writeout.c          | 306
-rw-r--r--  src/panfrost/util/pan_lower_xfb.c               | 105
-rw-r--r--  src/panfrost/util/pan_sysval.c                  | 162
19 files changed, 1835 insertions, 1622 deletions
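Most of the churn in the hunks below is a mechanical reindentation of the Panfrost utility code to three-space clang-format style, alongside the file additions and removals listed above. As orientation for the lcra.c hunks that open the diff, here is a minimal, hypothetical sketch of how a backend drives the LCRA API; `try_allocate` and all of its constants are invented for illustration and are not part of this commit:

#include "lcra.h"

/* Hypothetical driver loop (illustrative only): allocate equations for four
 * nodes in one register class, constrain them, then solve, falling back to
 * spilling on failure. */
static bool
try_allocate(void)
{
   struct lcra_state *l = lcra_alloc_equations(4, 1);

   /* One 16-byte class starting at register 0; class_start[] and
    * class_size[] are public fields the backend fills in directly. */
   l->class_start[0] = 0;
   l->class_size[0] = 16;

   for (unsigned i = 0; i < 4; ++i) {
      /* Each node is a 4-byte value, 4-byte aligned (log2 = 2), bounded to
       * the 16-byte window, and spillable at unit cost. */
      lcra_set_alignment(l, i, 2, 16);
      lcra_restrict_range(l, i, 4);
      lcra_set_node_spill_cost(l, i, 1);
   }

   /* Nodes 0 and 1 are simultaneously live in all four components */
   lcra_add_node_interference(l, 0, 0xF, 1, 0xF);

   bool ok = lcra_solve(l);

   if (!ok) {
      /* Out of registers: pick the cheapest node in the failing class,
       * rewrite the program to spill it, and retry (rewrite omitted). */
      signed spill = lcra_get_best_spill_node(l);
      (void)spill;
   }

   lcra_free(l);
   return ok;
}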
diff --git a/src/panfrost/util/lcra.c b/src/panfrost/util/lcra.c index 45cff37e61f..00585c646a6 100644 --- a/src/panfrost/util/lcra.c +++ b/src/panfrost/util/lcra.c @@ -24,14 +24,14 @@ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> */ -#include <stdio.h> +#include "lcra.h" #include <assert.h> +#include <limits.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> -#include <limits.h> #include "util/macros.h" #include "util/u_math.h" -#include "lcra.h" /* This module is the reference implementation of "Linearly Constrained * Register Allocation". The paper is available in PDF form @@ -40,161 +40,168 @@ */ struct lcra_state * -lcra_alloc_equations( - unsigned node_count, unsigned class_count) +lcra_alloc_equations(unsigned node_count, unsigned class_count) { - struct lcra_state *l = calloc(1, sizeof(*l)); + struct lcra_state *l = calloc(1, sizeof(*l)); - l->node_count = node_count; - l->class_count = class_count; + l->node_count = node_count; + l->class_count = class_count; - l->alignment = calloc(sizeof(l->alignment[0]), node_count); - l->linear = calloc(sizeof(l->linear[0]), node_count * node_count); - l->modulus = calloc(sizeof(l->modulus[0]), node_count); - l->class = calloc(sizeof(l->class[0]), node_count); - l->class_start = calloc(sizeof(l->class_start[0]), class_count); - l->class_disjoint = calloc(sizeof(l->class_disjoint[0]), class_count * class_count); - l->class_size = calloc(sizeof(l->class_size[0]), class_count); - l->spill_cost = calloc(sizeof(l->spill_cost[0]), node_count); - l->solutions = calloc(sizeof(l->solutions[0]), node_count); + l->alignment = calloc(sizeof(l->alignment[0]), node_count); + l->linear = calloc(sizeof(l->linear[0]), node_count * node_count); + l->modulus = calloc(sizeof(l->modulus[0]), node_count); + l->class = calloc(sizeof(l->class[0]), node_count); + l->class_start = calloc(sizeof(l->class_start[0]), class_count); + l->class_disjoint = + calloc(sizeof(l->class_disjoint[0]), class_count * class_count); + l->class_size = calloc(sizeof(l->class_size[0]), class_count); + l->spill_cost = calloc(sizeof(l->spill_cost[0]), node_count); + l->solutions = calloc(sizeof(l->solutions[0]), node_count); - memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); + memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count); - return l; + return l; } void lcra_free(struct lcra_state *l) { - if (!l) - return; - - free(l->alignment); - free(l->linear); - free(l->modulus); - free(l->class); - free(l->class_start); - free(l->class_disjoint); - free(l->class_size); - free(l->spill_cost); - free(l->solutions); - - free(l); + if (!l) + return; + + free(l->alignment); + free(l->linear); + free(l->modulus); + free(l->class); + free(l->class_start); + free(l->class_disjoint); + free(l->class_size); + free(l->spill_cost); + free(l->solutions); + + free(l); } void -lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2, unsigned bound) +lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2, + unsigned bound) { - l->alignment[node] = (align_log2 + 1) | (bound << 16); + l->alignment[node] = (align_log2 + 1) | (bound << 16); } void lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2) { - l->class_disjoint[(c1 * l->class_count) + c2] = true; - l->class_disjoint[(c2 * l->class_count) + c1] = true; + l->class_disjoint[(c1 * l->class_count) + c2] = true; + l->class_disjoint[(c2 * l->class_count) + c1] = true; } void lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len) { - if (node < 
l->node_count && l->alignment[node]) { - unsigned BA = l->alignment[node]; - unsigned alignment = (BA & 0xffff) - 1; - unsigned bound = BA >> 16; - l->modulus[node] = DIV_ROUND_UP(bound - len + 1, 1 << alignment); - } + if (node < l->node_count && l->alignment[node]) { + unsigned BA = l->alignment[node]; + unsigned alignment = (BA & 0xffff) - 1; + unsigned bound = BA >> 16; + l->modulus[node] = DIV_ROUND_UP(bound - len + 1, 1 << alignment); + } } void -lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j) +lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, + unsigned j, unsigned cmask_j) { - if (i == j) - return; + if (i == j) + return; - if (l->class_disjoint[(l->class[i] * l->class_count) + l->class[j]]) - return; + if (l->class_disjoint[(l->class[i] * l->class_count) + l->class[j]]) + return; - uint32_t constraint_fw = 0; - uint32_t constraint_bw = 0; + uint32_t constraint_fw = 0; + uint32_t constraint_bw = 0; - for (unsigned D = 0; D < 16; ++D) { - if (cmask_i & (cmask_j << D)) { - constraint_bw |= (1 << (15 + D)); - constraint_fw |= (1 << (15 - D)); - } + for (unsigned D = 0; D < 16; ++D) { + if (cmask_i & (cmask_j << D)) { + constraint_bw |= (1 << (15 + D)); + constraint_fw |= (1 << (15 - D)); + } - if (cmask_i & (cmask_j >> D)) { - constraint_fw |= (1 << (15 + D)); - constraint_bw |= (1 << (15 - D)); - } - } + if (cmask_i & (cmask_j >> D)) { + constraint_fw |= (1 << (15 + D)); + constraint_bw |= (1 << (15 - D)); + } + } - l->linear[j * l->node_count + i] |= constraint_fw; - l->linear[i * l->node_count + j] |= constraint_bw; + l->linear[j * l->node_count + i] |= constraint_fw; + l->linear[i * l->node_count + j] |= constraint_bw; } static bool lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i) { - unsigned *row = &l->linear[i * l->node_count]; - signed constant = solutions[i]; + unsigned *row = &l->linear[i * l->node_count]; + signed constant = solutions[i]; - for (unsigned j = 0; j < l->node_count; ++j) { - if (solutions[j] == ~0) continue; + for (unsigned j = 0; j < l->node_count; ++j) { + if (solutions[j] == ~0) + continue; - signed lhs = solutions[j] - constant; + signed lhs = solutions[j] - constant; - if (lhs < -15 || lhs > 15) - continue; + if (lhs < -15 || lhs > 15) + continue; - if (row[j] & (1 << (lhs + 15))) - return false; - } + if (row[j] & (1 << (lhs + 15))) + return false; + } - return true; + return true; } bool lcra_solve(struct lcra_state *l) { - for (unsigned step = 0; step < l->node_count; ++step) { - if (l->solutions[step] != ~0) continue; - if (l->alignment[step] == 0) continue; - - unsigned _class = l->class[step]; - unsigned class_start = l->class_start[_class]; - - unsigned BA = l->alignment[step]; - unsigned shift = (BA & 0xffff) - 1; - unsigned bound = BA >> 16; - - unsigned P = bound >> shift; - unsigned Q = l->modulus[step]; - unsigned r_max = l->class_size[_class]; - unsigned k_max = r_max >> shift; - unsigned m_max = k_max / P; - bool succ = false; - - for (unsigned m = 0; m < m_max; ++m) { - for (unsigned n = 0; n < Q; ++n) { - l->solutions[step] = ((m * P + n) << shift) + class_start; - succ = lcra_test_linear(l, l->solutions, step); - - if (succ) break; - } - - if (succ) break; - } - - /* Out of registers - prepare to spill */ - if (!succ) { - l->spill_class = l->class[step]; - return false; - } - } - - return true; + for (unsigned step = 0; step < l->node_count; ++step) { + if (l->solutions[step] != ~0) + continue; + if (l->alignment[step] == 0) 
+ continue; + + unsigned _class = l->class[step]; + unsigned class_start = l->class_start[_class]; + + unsigned BA = l->alignment[step]; + unsigned shift = (BA & 0xffff) - 1; + unsigned bound = BA >> 16; + + unsigned P = bound >> shift; + unsigned Q = l->modulus[step]; + unsigned r_max = l->class_size[_class]; + unsigned k_max = r_max >> shift; + unsigned m_max = k_max / P; + bool succ = false; + + for (unsigned m = 0; m < m_max; ++m) { + for (unsigned n = 0; n < Q; ++n) { + l->solutions[step] = ((m * P + n) << shift) + class_start; + succ = lcra_test_linear(l, l->solutions, step); + + if (succ) + break; + } + + if (succ) + break; + } + + /* Out of registers - prepare to spill */ + if (!succ) { + l->spill_class = l->class[step]; + return false; + } + } + + return true; } /* Register spilling is implemented with a cost-benefit system. Costs are set @@ -203,45 +210,47 @@ lcra_solve(struct lcra_state *l) void lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost) { - if (node < l->node_count) - l->spill_cost[node] = cost; + if (node < l->node_count) + l->spill_cost[node] = cost; } static unsigned lcra_count_constraints(struct lcra_state *l, unsigned i) { - unsigned count = 0; - unsigned *constraints = &l->linear[i * l->node_count]; + unsigned count = 0; + unsigned *constraints = &l->linear[i * l->node_count]; - for (unsigned j = 0; j < l->node_count; ++j) - count += util_bitcount(constraints[j]); + for (unsigned j = 0; j < l->node_count; ++j) + count += util_bitcount(constraints[j]); - return count; + return count; } signed lcra_get_best_spill_node(struct lcra_state *l) { - /* If there are no constraints on a node, do not pick it to spill under - * any circumstance, or else we would hang rather than fail RA */ - float best_benefit = 0.0; - signed best_node = -1; - - for (unsigned i = 0; i < l->node_count; ++i) { - /* Find spillable nodes */ - if (l->class[i] != l->spill_class) continue; - if (l->spill_cost[i] < 0) continue; - - /* Adapted from Chaitin's heuristic */ - float constraints = lcra_count_constraints(l, i); - float cost = (l->spill_cost[i] + 1); - float benefit = constraints / cost; - - if (benefit > best_benefit) { - best_benefit = benefit; - best_node = i; - } - } - - return best_node; + /* If there are no constraints on a node, do not pick it to spill under + * any circumstance, or else we would hang rather than fail RA */ + float best_benefit = 0.0; + signed best_node = -1; + + for (unsigned i = 0; i < l->node_count; ++i) { + /* Find spillable nodes */ + if (l->class[i] != l->spill_class) + continue; + if (l->spill_cost[i] < 0) + continue; + + /* Adapted from Chaitin's heuristic */ + float constraints = lcra_count_constraints(l, i); + float cost = (l->spill_cost[i] + 1); + float benefit = constraints / cost; + + if (benefit > best_benefit) { + best_benefit = benefit; + best_node = i; + } + } + + return best_node; } diff --git a/src/panfrost/util/lcra.h b/src/panfrost/util/lcra.h index fd47fdc3543..0b1ed13400f 100644 --- a/src/panfrost/util/lcra.h +++ b/src/panfrost/util/lcra.h @@ -31,78 +31,71 @@ #include <stdint.h> struct lcra_state { - unsigned node_count; - - /* Alignment for node in log2(bytes)+1. Since alignment must be - * non-negative power-of-two, the elements are strictly positive - * integers. Zero is the sentinel for a missing node. In upper word, - * bound. */ - unsigned *alignment; - - /* Linear constraints imposed. Nested array sized upfront, organized as - * linear[node_left][node_right]. 
That is, calculate indices as: - * - * Each element is itself a bit field denoting whether (c_j - c_i) bias - * is present or not, including negative biases. - * - * Note for Midgard, there are 16 components so the bias is in range - * [-15, 15] so encoded by 32-bit field. */ - - uint32_t *linear; - - /* Per node max modulus constraints */ - uint8_t *modulus; - - /* Classes allow nodes to be partitioned with a starting register. - * Classes cannot interfere; that is, they are true partitions in the - * usual sense of the word. class_count is the number of classes. - * class[] is indexed by a node to get the mapped class. class_start is - * biased to all solutions in the class. */ - - unsigned class_count; - unsigned *class; - unsigned *class_start; - unsigned *class_size; - bool *class_disjoint; - - /* Before solving, forced registers; after solving, solutions. */ - unsigned *solutions; - - /* For register spilling, the costs to spill nodes (as set by the user) - * are in spill_cost[], negative if a node is unspillable. Internally, - * spill_class specifies which class to spill (whichever class failed - * to allocate) */ - - signed *spill_cost; - unsigned spill_class; + unsigned node_count; + + /* Alignment for node in log2(bytes)+1. Since alignment must be + * non-negative power-of-two, the elements are strictly positive + * integers. Zero is the sentinel for a missing node. In upper word, + * bound. */ + unsigned *alignment; + + /* Linear constraints imposed. Nested array sized upfront, organized as + * linear[node_left][node_right]. That is, calculate indices as: + * + * Each element is itself a bit field denoting whether (c_j - c_i) bias + * is present or not, including negative biases. + * + * Note for Midgard, there are 16 components so the bias is in range + * [-15, 15] so encoded by 32-bit field. */ + + uint32_t *linear; + + /* Per node max modulus constraints */ + uint8_t *modulus; + + /* Classes allow nodes to be partitioned with a starting register. + * Classes cannot interfere; that is, they are true partitions in the + * usual sense of the word. class_count is the number of classes. + * class[] is indexed by a node to get the mapped class. class_start is + * biased to all solutions in the class. */ + + unsigned class_count; + unsigned *class; + unsigned *class_start; + unsigned *class_size; + bool *class_disjoint; + + /* Before solving, forced registers; after solving, solutions. */ + unsigned *solutions; + + /* For register spilling, the costs to spill nodes (as set by the user) + * are in spill_cost[], negative if a node is unspillable. 
Internally, + * spill_class specifies which class to spill (whichever class failed + * to allocate) */ + + signed *spill_cost; + unsigned spill_class; }; -struct lcra_state * -lcra_alloc_equations( - unsigned node_count, unsigned class_count); +struct lcra_state *lcra_alloc_equations(unsigned node_count, + unsigned class_count); -void -lcra_free(struct lcra_state *l); +void lcra_free(struct lcra_state *l); -void -lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2); +void lcra_set_disjoint_class(struct lcra_state *l, unsigned c1, unsigned c2); -void -lcra_set_alignment(struct lcra_state *l, unsigned node, unsigned align_log2, unsigned bound); +void lcra_set_alignment(struct lcra_state *l, unsigned node, + unsigned align_log2, unsigned bound); -void -lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len); +void lcra_restrict_range(struct lcra_state *l, unsigned node, unsigned len); -void -lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j); +void lcra_add_node_interference(struct lcra_state *l, unsigned i, + unsigned cmask_i, unsigned j, unsigned cmask_j); -bool -lcra_solve(struct lcra_state *l); +bool lcra_solve(struct lcra_state *l); -void -lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost); +void lcra_set_node_spill_cost(struct lcra_state *l, unsigned node, signed cost); -signed -lcra_get_best_spill_node(struct lcra_state *l); +signed lcra_get_best_spill_node(struct lcra_state *l); #endif diff --git a/src/panfrost/util/meson.build b/src/panfrost/util/meson.build index 75f9024eafb..d2d730fa587 100644 --- a/src/panfrost/util/meson.build +++ b/src/panfrost/util/meson.build @@ -22,22 +22,24 @@ libpanfrost_util_files = files( 'lcra.c', 'lcra.h', - 'nir_mod_helpers.c', + 'pan_collect_varyings.c', 'pan_ir.c', 'pan_ir.h', 'pan_liveness.c', 'pan_lower_framebuffer.c', 'pan_lower_helper_invocation.c', + 'pan_lower_image_ms.c', + 'pan_lower_image_index.c', 'pan_lower_sample_position.c', + 'pan_lower_store_component.c', 'pan_lower_writeout.c', - 'pan_lower_64bit_intrin.c', - 'pan_sysval.c', + 'pan_lower_xfb.c', ) libpanfrost_util = static_library( 'panfrost_util', [libpanfrost_util_files], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw], + include_directories : [inc_include, inc_src, inc_panfrost_hw], dependencies: [idep_nir], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', diff --git a/src/panfrost/util/nir_mod_helpers.c b/src/panfrost/util/nir_mod_helpers.c deleted file mode 100644 index 2fe7b4fabe5..00000000000 --- a/src/panfrost/util/nir_mod_helpers.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (C) 2020 Collabora, Ltd. - * Copyright (C) 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Authors: - * Alyssa Rosenzweig <alyssa@collabora.com> - * Jason Ekstrand (jason@jlekstrand.net) - * - */ - -#include "nir.h" -#include "pan_ir.h" - -/* Check if a given ALU source is the result of a particular componentwise 1-op - * ALU source (principally fneg or fabs). If so, return true and rewrite the - * source to be the argument, respecting swizzles as needed. If not (or it - * cannot be proven), return false and leave the source untouched. -*/ - -bool -pan_has_source_mod(nir_alu_src *src, nir_op op) -{ - if (!src->src.is_ssa || src->src.ssa->parent_instr->type != nir_instr_type_alu) - return false; - - nir_alu_instr *alu = nir_instr_as_alu(src->src.ssa->parent_instr); - - if (alu->op != op) - return false; - - /* This only works for unary ops */ - assert(nir_op_infos[op].num_inputs == 1); - - /* If the copied source is not SSA, moving it might not be valid */ - if (!alu->src[0].src.is_ssa) - return false; - - /* Okay - we've found the modifier we wanted. Let's construct the new ALU - * src. In a scalar world, this is just psrc, but for vector archs we need - * to respect the swizzle, so we compose. - */ - - nir_alu_src nsrc = { - .src = alu->src[0].src, - }; - - for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; ++i) { - /* (a o b)(i) = a(b(i)) ... swizzle composition is intense. */ - nsrc.swizzle[i] = alu->src[0].swizzle[src->swizzle[i]]; - } - - *src = nsrc; - return true; -} - -/* Check if a given instruction's result will be fed into a - * componentwise 1-op ALU instruction (principally fsat without - * swizzles). If so, return true and rewrite the destination. The - * backend will need to track the new destinations to avoid - * incorrect double-emits. */ - -bool -pan_has_dest_mod(nir_dest **odest, nir_op op) -{ - /* This only works for unary ops */ - assert(nir_op_infos[op].num_inputs == 1); - - /* If not SSA, this might not be legal */ - nir_dest *dest = *odest; - if (!dest->is_ssa) - return false; - - /* Check the uses. 
We want a single use, with the op `op` */ - if (!list_is_empty(&dest->ssa.if_uses)) - return false; - - if (!list_is_singular(&dest->ssa.uses)) - return false; - - nir_src *use = list_first_entry(&dest->ssa.uses, nir_src, use_link); - nir_instr *parent = use->parent_instr; - - /* Check if the op is `op` */ - if (parent->type != nir_instr_type_alu) - return false; - - nir_alu_instr *alu = nir_instr_as_alu(parent); - if (alu->op != op) - return false; - - /* We can't do expansions without a move in the middle */ - unsigned nr_components = nir_dest_num_components(alu->dest.dest); - - if (nir_dest_num_components(*dest) != nr_components) - return false; - - /* We don't handle swizzles here, so check for the identity */ - for (unsigned i = 0; i < nr_components; ++i) { - if (alu->src[0].swizzle[i] != i) - return false; - } - - if (!alu->dest.dest.is_ssa) - return false; - - /* Otherwise, we're good */ - *odest = &alu->dest.dest; - return true; -} diff --git a/src/panfrost/util/pan_collect_varyings.c b/src/panfrost/util/pan_collect_varyings.c new file mode 100644 index 00000000000..0134ecfb67b --- /dev/null +++ b/src/panfrost/util/pan_collect_varyings.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * Copyright (C) 2019-2022 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +static enum pipe_format +varying_format(nir_alu_type t, unsigned ncomps) +{ + assert(ncomps >= 1 && ncomps <= 4); + +#define VARYING_FORMAT(ntype, nsz, ptype, psz) \ + { \ + .type = nir_type_##ntype##nsz, .formats = { \ + PIPE_FORMAT_R##psz##_##ptype, \ + PIPE_FORMAT_R##psz##G##psz##_##ptype, \ + PIPE_FORMAT_R##psz##G##psz##B##psz##_##ptype, \ + PIPE_FORMAT_R##psz##G##psz##B##psz##A##psz##_##ptype, \ + } \ + } + + static const struct { + nir_alu_type type; + enum pipe_format formats[4]; + } conv[] = { + VARYING_FORMAT(float, 32, FLOAT, 32), + VARYING_FORMAT(uint, 32, UINT, 32), + VARYING_FORMAT(float, 16, FLOAT, 16), + }; +#undef VARYING_FORMAT + + assert(ncomps > 0 && ncomps <= ARRAY_SIZE(conv[0].formats)); + + for (unsigned i = 0; i < ARRAY_SIZE(conv); i++) { + if (conv[i].type == t) + return conv[i].formats[ncomps - 1]; + } + + unreachable("Invalid type"); +} + +struct slot_info { + nir_alu_type type; + unsigned count; + unsigned index; +}; + +static bool +walk_varyings(UNUSED nir_builder *b, nir_instr *instr, void *data) +{ + struct slot_info *slots = data; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + unsigned count; + + /* Only consider intrinsics that access varyings */ + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + if (b->shader->info.stage != MESA_SHADER_VERTEX) + return false; + + count = nir_src_num_components(intr->src[0]); + break; + + case nir_intrinsic_load_input: + case nir_intrinsic_load_interpolated_input: + if (b->shader->info.stage != MESA_SHADER_FRAGMENT) + return false; + + count = intr->def.num_components; + break; + + default: + return false; + } + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + + if (sem.no_varying) + return false; + + /* In a fragment shader, flat shading is lowered to load_input but + * interpolation is lowered to load_interpolated_input, so we can check + * the intrinsic to distinguish. + * + * In a vertex shader, we consider everything flat, as the information + * will not contribute to the final linked varyings -- flatness is used + * only to determine the type, and the GL linker uses the type from the + * fragment shader instead. + */ + bool flat = (intr->intrinsic != nir_intrinsic_load_interpolated_input); + nir_alu_type type = flat ? nir_type_uint : nir_type_float; + + /* Demote interpolated float varyings to fp16 where possible. We do not + * demote flat varyings, including integer varyings, due to various + * issues with the Midgard hardware behaviour and TGSI shaders, as well + * as having no demonstrable benefit in practice. + */ + if (type == nir_type_float && sem.medium_precision) + type |= 16; + else + type |= 32; + + /* Count currently contains the number of components accessed by this + * intrinsics. However, we may be accessing a fractional location, + * indicating by the NIR component. Add that in. The final value be the + * maximum (component + count), an upper bound on the number of + * components possibly used. 
+ */ + count += nir_intrinsic_component(intr); + + /* Consider each slot separately */ + for (unsigned offset = 0; offset < sem.num_slots; ++offset) { + unsigned location = sem.location + offset; + unsigned index = pan_res_handle_get_index(nir_intrinsic_base(intr)) + offset; + + if (slots[location].type) { + assert(slots[location].type == type); + assert(slots[location].index == index); + } else { + slots[location].type = type; + slots[location].index = index; + } + + slots[location].count = MAX2(slots[location].count, count); + } + + return false; +} + +void +pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info) +{ + if (s->info.stage != MESA_SHADER_VERTEX && + s->info.stage != MESA_SHADER_FRAGMENT) + return; + + struct slot_info slots[64] = {0}; + nir_shader_instructions_pass(s, walk_varyings, nir_metadata_all, slots); + + struct pan_shader_varying *varyings = (s->info.stage == MESA_SHADER_VERTEX) + ? info->varyings.output + : info->varyings.input; + + unsigned count = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(slots); ++i) { + if (!slots[i].type) + continue; + + enum pipe_format format = varying_format(slots[i].type, slots[i].count); + assert(format != PIPE_FORMAT_NONE); + + unsigned index = slots[i].index; + count = MAX2(count, index + 1); + + varyings[index].location = i; + varyings[index].format = format; + } + + if (s->info.stage == MESA_SHADER_VERTEX) + info->varyings.output_count = count; + else + info->varyings.input_count = count; +} diff --git a/src/panfrost/util/pan_ir.c b/src/panfrost/util/pan_ir.c index c469274933f..8524e08bc84 100644 --- a/src/panfrost/util/pan_ir.c +++ b/src/panfrost/util/pan_ir.c @@ -32,73 +32,66 @@ uint16_t pan_to_bytemask(unsigned bytes, unsigned mask) { - switch (bytes) { - case 0: - assert(mask == 0); - return 0; - - case 8: - return mask; - - case 16: { - unsigned space = - (mask & 0x1) | - ((mask & 0x2) << (2 - 1)) | - ((mask & 0x4) << (4 - 2)) | - ((mask & 0x8) << (6 - 3)) | - ((mask & 0x10) << (8 - 4)) | - ((mask & 0x20) << (10 - 5)) | - ((mask & 0x40) << (12 - 6)) | - ((mask & 0x80) << (14 - 7)); - - return space | (space << 1); - } - - case 32: { - unsigned space = - (mask & 0x1) | - ((mask & 0x2) << (4 - 1)) | - ((mask & 0x4) << (8 - 2)) | - ((mask & 0x8) << (12 - 3)); - - return space | (space << 1) | (space << 2) | (space << 3); - } - - case 64: { - unsigned A = (mask & 0x1) ? 0xFF : 0x00; - unsigned B = (mask & 0x2) ? 0xFF : 0x00; - return A | (B << 8); - } - - default: - unreachable("Invalid register mode"); - } + switch (bytes) { + case 0: + assert(mask == 0); + return 0; + + case 8: + return mask; + + case 16: { + unsigned space = + (mask & 0x1) | ((mask & 0x2) << (2 - 1)) | ((mask & 0x4) << (4 - 2)) | + ((mask & 0x8) << (6 - 3)) | ((mask & 0x10) << (8 - 4)) | + ((mask & 0x20) << (10 - 5)) | ((mask & 0x40) << (12 - 6)) | + ((mask & 0x80) << (14 - 7)); + + return space | (space << 1); + } + + case 32: { + unsigned space = (mask & 0x1) | ((mask & 0x2) << (4 - 1)) | + ((mask & 0x4) << (8 - 2)) | ((mask & 0x8) << (12 - 3)); + + return space | (space << 1) | (space << 2) | (space << 3); + } + + case 64: { + unsigned A = (mask & 0x1) ? 0xFF : 0x00; + unsigned B = (mask & 0x2) ? 
0xFF : 0x00; + return A | (B << 8); + } + + default: + unreachable("Invalid register mode"); + } } void pan_block_add_successor(pan_block *block, pan_block *successor) { - assert(block); - assert(successor); - - /* Cull impossible edges */ - if (block->unconditional_jumps) - return; - - for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { - if (block->successors[i]) { - if (block->successors[i] == successor) - return; - else - continue; - } - - block->successors[i] = successor; - _mesa_set_add(successor->predecessors, block); - return; - } - - unreachable("Too many successors"); + assert(block); + assert(successor); + + /* Cull impossible edges */ + if (block->unconditional_jumps) + return; + + for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { + if (block->successors[i]) { + if (block->successors[i] == successor) + return; + else + continue; + } + + block->successors[i] = successor; + _mesa_set_add(successor->predecessors, block); + return; + } + + unreachable("Too many successors"); } /* Prints a NIR ALU type in Bifrost-style ".f32" ".i8" etc */ @@ -106,45 +99,42 @@ pan_block_add_successor(pan_block *block, pan_block *successor) void pan_print_alu_type(nir_alu_type t, FILE *fp) { - unsigned size = nir_alu_type_get_type_size(t); - nir_alu_type base = nir_alu_type_get_base_type(t); - - switch (base) { - case nir_type_int: - fprintf(fp, ".i"); - break; - case nir_type_uint: - fprintf(fp, ".u"); - break; - case nir_type_bool: - fprintf(fp, ".b"); - break; - case nir_type_float: - fprintf(fp, ".f"); - break; - default: - fprintf(fp, ".unknown"); - break; - } - - fprintf(fp, "%u", size); + unsigned size = nir_alu_type_get_type_size(t); + nir_alu_type base = nir_alu_type_get_base_type(t); + + switch (base) { + case nir_type_int: + fprintf(fp, ".i"); + break; + case nir_type_uint: + fprintf(fp, ".u"); + break; + case nir_type_bool: + fprintf(fp, ".b"); + break; + case nir_type_float: + fprintf(fp, ".f"); + break; + default: + fprintf(fp, ".unknown"); + break; + } + + fprintf(fp, "%u", size); } /* Could optimize with a better data structure if anyone cares, TODO: profile */ unsigned -pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs) +pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, + unsigned offs) { - struct panfrost_ubo_word word = { - .ubo = ubo, - .offset = offs - }; + struct panfrost_ubo_word word = {.ubo = ubo, .offset = offs}; - for (unsigned i = 0; i < push->count; ++i) { - if (memcmp(push->words + i, &word, sizeof(word)) == 0) - return i; - } - - unreachable("UBO not pushed"); + for (unsigned i = 0; i < push->count; ++i) { + if (memcmp(push->words + i, &word, sizeof(word)) == 0) + return i; + } + unreachable("UBO not pushed"); } diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 1ff240b3070..7b24fb6922e 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -26,326 +26,420 @@ #include <stdint.h> #include "compiler/nir/nir.h" -#include "util/u_dynarray.h" #include "util/hash_table.h" +#include "util/u_dynarray.h" -/* Define the general compiler entry point */ - -#define MAX_SYSVAL_COUNT 32 - -/* Allow 2D of sysval IDs, while allowing nonparametric sysvals to equal - * their class for equal comparison */ - -#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type) -#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff) -#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16) - -/* Define some common types. 
We start at one for easy indexing of hash - * tables internal to the compiler */ - -enum { - PAN_SYSVAL_VIEWPORT_SCALE = 1, - PAN_SYSVAL_VIEWPORT_OFFSET = 2, - PAN_SYSVAL_TEXTURE_SIZE = 3, - PAN_SYSVAL_SSBO = 4, - PAN_SYSVAL_NUM_WORK_GROUPS = 5, - PAN_SYSVAL_SAMPLER = 7, - PAN_SYSVAL_LOCAL_GROUP_SIZE = 8, - PAN_SYSVAL_WORK_DIM = 9, - PAN_SYSVAL_IMAGE_SIZE = 10, - PAN_SYSVAL_SAMPLE_POSITIONS = 11, - PAN_SYSVAL_MULTISAMPLED = 12, - PAN_SYSVAL_RT_CONVERSION = 13, - PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS = 14, - PAN_SYSVAL_DRAWID = 15, -}; +/* Indices for named (non-XFB) varyings that are present. These are packed + * tightly so they correspond to a bitfield present (P) indexed by (1 << + * PAN_VARY_*). This has the nice property that you can lookup the buffer index + * of a given special field given a shift S by: + * + * idx = popcount(P & ((1 << S) - 1)) + * + * That is... look at all of the varyings that come earlier and count them, the + * count is the new index since plus one. Likewise, the total number of special + * buffers required is simply popcount(P) + */ + +enum pan_special_varying { + PAN_VARY_GENERAL = 0, + PAN_VARY_POSITION = 1, + PAN_VARY_PSIZ = 2, + PAN_VARY_PNTCOORD = 3, + PAN_VARY_FACE = 4, + PAN_VARY_FRAGCOORD = 5, -#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ - ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0)) + /* Keep last */ + PAN_VARY_MAX, +}; -#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f) -#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) -#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) +/* Maximum number of attribute descriptors required for varyings. These include + * up to MAX_VARYING source level varyings plus a descriptor each non-GENERAL + * special varying */ +#define PAN_MAX_VARYINGS (MAX_VARYING + PAN_VARY_MAX - 1) /* Special attribute slots for vertex builtins. Sort of arbitrary but let's be * consistent with the blob so we can compare traces easier. */ -enum { - PAN_VERTEX_ID = 16, - PAN_INSTANCE_ID = 17, - PAN_MAX_ATTRIBUTE -}; +enum { PAN_VERTEX_ID = 16, PAN_INSTANCE_ID = 17, PAN_MAX_ATTRIBUTE }; -struct panfrost_sysvals { - /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ - unsigned sysvals[MAX_SYSVAL_COUNT]; - unsigned sysval_count; -}; - -/* Technically Midgard could go up to 92 in a pathological case but we don't - * take advantage of that. Likewise Bifrost's FAU encoding can address 128 - * words but actual implementations (G72, G76) are capped at 64 */ - -#define PAN_MAX_PUSH 64 +/* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64-bits each. + * In practice, the maximum number of FAU slots is limited by implementation. + * All known Bifrost and Valhall devices limit to 64 FAU slots. Therefore the + * maximum number of 32-bit words is 128, since there are 2 words per FAU slot. + * + * Midgard can push at most 92 words, so this bound suffices. The Midgard + * compiler pushes less than this, as Midgard uses register-mapped uniforms + * instead of FAU, preventing large numbers of uniforms to be pushed for + * nontrivial programs. + */ +#define PAN_MAX_PUSH 128 /* Architectural invariants (Midgard and Bifrost): UBO must be <= 2^16 bytes so * an offset to a word must be < 2^16. 
There are less than 2^8 UBOs */ struct panfrost_ubo_word { - uint16_t ubo; - uint16_t offset; + uint16_t ubo; + uint16_t offset; }; struct panfrost_ubo_push { - unsigned count; - struct panfrost_ubo_word words[PAN_MAX_PUSH]; + unsigned count; + struct panfrost_ubo_word words[PAN_MAX_PUSH]; }; /* Helper for searching the above. Note this is O(N) to the number of pushed * constants, do not run in the draw call hot path */ -unsigned -pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs); - -struct hash_table_u64 * -panfrost_init_sysvals(struct panfrost_sysvals *sysvals, void *memctx); - -unsigned -pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, - struct panfrost_sysvals *sysvals, - int sysval); - -int -panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest); +unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, + unsigned offs); struct panfrost_compile_inputs { - unsigned gpu_id; - bool is_blend, is_blit; - struct { - unsigned rt; - unsigned nr_samples; - uint64_t bifrost_blend_desc; - } blend; - unsigned sysval_ubo; - bool shaderdb; - bool no_ubo_to_push; - - enum pipe_format rt_formats[8]; - unsigned nr_cbufs; + struct util_debug_callback *debug; + + unsigned gpu_id; + bool is_blend, is_blit; + struct { + unsigned nr_samples; + uint64_t bifrost_blend_desc; + } blend; + bool no_idvs; + bool no_ubo_to_push; + + /* Used on Valhall. + * + * Bit mask of special desktop-only varyings (e.g VARYING_SLOT_TEX0) + * written by the previous stage (fragment shader) or written by this + * stage (vertex shader). Bits are slots from gl_varying_slot. + * + * For modern APIs (GLES or VK), this should be 0. + */ + uint32_t fixed_varying_mask; + + union { + struct { + uint32_t rt_conv[8]; + } bifrost; + }; }; struct pan_shader_varying { - gl_varying_slot location; - enum pipe_format format; + gl_varying_slot location; + enum pipe_format format; }; struct bifrost_shader_blend_info { - nir_alu_type type; - uint32_t return_offset; + nir_alu_type type; + uint32_t return_offset; - /* mali_bifrost_register_file_format corresponding to nir_alu_type */ - unsigned format; + /* mali_bifrost_register_file_format corresponding to nir_alu_type */ + unsigned format; }; -struct bifrost_shader_info { - struct bifrost_shader_blend_info blend[8]; - nir_alu_type blend_src1_type; - bool wait_6, wait_7; +/* + * Unpacked form of a v7 message preload descriptor, produced by the compiler's + * message preload optimization. By splitting out this struct, the compiler does + * not need to know about data structure packing, avoiding a dependency on + * GenXML. + */ +struct bifrost_message_preload { + /* Whether to preload this message */ + bool enabled; + + /* Varying to load from */ + unsigned varying_index; + + /* Register type, FP32 otherwise */ + bool fp16; - /* Packed, preloaded message descriptors */ - uint16_t messages[2]; + /* Number of components, ignored if texturing */ + unsigned num_components; + + /* If texture is set, performs a texture instruction according to + * texture_index, skip, and zero_lod. If texture is unset, only the + * varying load is performed. + */ + bool texture, skip, zero_lod; + unsigned texture_index; +}; + +struct bifrost_shader_info { + struct bifrost_shader_blend_info blend[8]; + nir_alu_type blend_src1_type; + bool wait_6, wait_7; + struct bifrost_message_preload messages[2]; + + /* Whether any flat varyings are loaded. 
This may disable optimizations + * that change the provoking vertex, since that would load incorrect + * values for flat varyings. + */ + bool uses_flat_shading; }; struct midgard_shader_info { - unsigned first_tag; + unsigned first_tag; }; struct pan_shader_info { - gl_shader_stage stage; - unsigned work_reg_count; - unsigned tls_size; - unsigned wls_size; - - union { - struct { - bool reads_frag_coord; - bool reads_point_coord; - bool reads_face; - bool helper_invocations; - bool can_discard; - bool writes_depth; - bool writes_stencil; - bool writes_coverage; - bool sidefx; - bool reads_sample_id; - bool reads_sample_pos; - bool reads_sample_mask_in; - bool reads_helper_invocation; - bool sample_shading; - bool early_fragment_tests; - bool can_early_z, can_fpk; - BITSET_WORD outputs_read; - BITSET_WORD outputs_written; - } fs; - - struct { - bool writes_point_size; - } vs; - }; - - bool separable; - bool contains_barrier; - bool writes_global; - uint64_t outputs_written; - - unsigned sampler_count; - unsigned texture_count; - unsigned ubo_count; - unsigned attribute_count; - - struct { - unsigned input_count; - struct pan_shader_varying input[MAX_VARYING]; - unsigned output_count; - struct pan_shader_varying output[MAX_VARYING]; - } varyings; - - struct panfrost_sysvals sysvals; - - /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access - * Uniforms (Bifrost) */ - struct panfrost_ubo_push push; - - uint32_t ubo_mask; - - union { - struct bifrost_shader_info bifrost; - struct midgard_shader_info midgard; - }; + gl_shader_stage stage; + unsigned work_reg_count; + unsigned tls_size; + unsigned wls_size; + + /* Bit mask of preloaded registers */ + uint64_t preload; + + union { + struct { + bool reads_frag_coord; + bool reads_point_coord; + bool reads_face; + bool can_discard; + bool writes_depth; + bool writes_stencil; + bool writes_coverage; + bool sidefx; + bool sample_shading; + bool early_fragment_tests; + bool can_early_z, can_fpk; + bool untyped_color_outputs; + BITSET_WORD outputs_read; + BITSET_WORD outputs_written; + } fs; + + struct { + bool writes_point_size; + + /* If the primary shader writes point size, the Valhall + * driver may need a variant that does not write point + * size. Offset to such a shader in the program binary. + * + * Zero if no such variant is required. + * + * Only used with IDVS on Valhall. + */ + unsigned no_psiz_offset; + + /* Set if Index-Driven Vertex Shading is in use */ + bool idvs; + + /* If IDVS is used, whether a varying shader is used */ + bool secondary_enable; + + /* If a varying shader is used, the varying shader's + * offset in the program binary + */ + unsigned secondary_offset; + + /* If IDVS is in use, number of work registers used by + * the varying shader + */ + unsigned secondary_work_reg_count; + + /* If IDVS is in use, bit mask of preloaded registers + * used by the varying shader + */ + uint64_t secondary_preload; + } vs; + + struct { + /* Is it legal to merge workgroups? This is true if the + * shader uses neither barriers nor shared memory. This + * requires caution: if the API allows specifying shared + * memory at launch time (instead of compile time), that + * memory will not be accounted for by the compiler. + * + * Used by the Valhall hardware. + */ + bool allow_merging_workgroups; + } cs; + }; + + /* Does the shader contains a barrier? or (for fragment shaders) does it + * require helper invocations, which demand the same ordering guarantees + * of the hardware? 
These notions are unified in the hardware, so we + * unify them here as well. + */ + bool contains_barrier; + bool separable; + bool writes_global; + uint64_t outputs_written; + + /* Floating point controls that the driver should try to honour */ + bool ftz_fp16, ftz_fp32; + + unsigned sampler_count; + unsigned texture_count; + unsigned ubo_count; + unsigned attributes_read_count; + unsigned attribute_count; + unsigned attributes_read; + + struct { + unsigned input_count; + struct pan_shader_varying input[PAN_MAX_VARYINGS]; + unsigned output_count; + struct pan_shader_varying output[PAN_MAX_VARYINGS]; + } varyings; + + /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access + * Uniforms (Bifrost) */ + struct panfrost_ubo_push push; + + uint32_t ubo_mask; + + union { + struct bifrost_shader_info bifrost; + struct midgard_shader_info midgard; + }; }; typedef struct pan_block { - /* Link to next block. Must be first for mir_get_block */ - struct list_head link; + /* Link to next block. Must be first for mir_get_block */ + struct list_head link; - /* List of instructions emitted for the current block */ - struct list_head instructions; + /* List of instructions emitted for the current block */ + struct list_head instructions; - /* Index of the block in source order */ - unsigned name; + /* Index of the block in source order */ + unsigned name; - /* Control flow graph */ - struct pan_block *successors[2]; - struct set *predecessors; - bool unconditional_jumps; + /* Control flow graph */ + struct pan_block *successors[2]; + struct set *predecessors; + bool unconditional_jumps; - /* In liveness analysis, these are live masks (per-component) for - * indices for the block. Scalar compilers have the luxury of using - * simple bit fields, but for us, liveness is a vector idea. */ - uint16_t *live_in; - uint16_t *live_out; + /* In liveness analysis, these are live masks (per-component) for + * indices for the block. Scalar compilers have the luxury of using + * simple bit fields, but for us, liveness is a vector idea. */ + uint16_t *live_in; + uint16_t *live_out; } pan_block; struct pan_instruction { - struct list_head link; + struct list_head link; }; -#define pan_foreach_instr_in_block_rev(block, v) \ - list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, link) - -#define pan_foreach_successor(blk, v) \ - pan_block *v; \ - pan_block **_v; \ - for (_v = (pan_block **) &blk->successors[0], \ - v = *_v; \ - v != NULL && _v < (pan_block **) &blk->successors[2]; \ - _v++, v = *_v) \ - -#define pan_foreach_predecessor(blk, v) \ - struct set_entry *_entry_##v; \ - struct pan_block *v; \ - for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \ - v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL); \ - _entry_##v != NULL; \ - _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \ - v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL)) +#define pan_foreach_instr_in_block_rev(block, v) \ + list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, \ + link) + +#define pan_foreach_successor(blk, v) \ + pan_block *v; \ + pan_block **_v; \ + for (_v = (pan_block **)&blk->successors[0], v = *_v; \ + v != NULL && _v < (pan_block **)&blk->successors[2]; _v++, v = *_v) + +#define pan_foreach_predecessor(blk, v) \ + struct set_entry *_entry_##v; \ + struct pan_block *v; \ + for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \ + v = (struct pan_block *)(_entry_##v ? 
_entry_##v->key : NULL); \ + _entry_##v != NULL; \ + _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \ + v = (struct pan_block *)(_entry_##v ? _entry_##v->key : NULL)) static inline pan_block * pan_exit_block(struct list_head *blocks) { - pan_block *last = list_last_entry(blocks, pan_block, link); - assert(!last->successors[0] && !last->successors[1]); - return last; + pan_block *last = list_last_entry(blocks, pan_block, link); + assert(!last->successors[0] && !last->successors[1]); + return last; } typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max); -void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask); -void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask); +void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, + uint16_t mask); +void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, + uint16_t mask); bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max); -void pan_compute_liveness(struct list_head *blocks, - unsigned temp_count, - pan_liveness_update callback); +void pan_compute_liveness(struct list_head *blocks, unsigned temp_count, + pan_liveness_update callback); void pan_free_liveness(struct list_head *blocks); -uint16_t -pan_to_bytemask(unsigned bytes, unsigned mask); +uint16_t pan_to_bytemask(unsigned bytes, unsigned mask); void pan_block_add_successor(pan_block *block, pan_block *successor); /* IR indexing */ #define PAN_IS_REG (1) -static inline unsigned -pan_ssa_index(nir_ssa_def *ssa) -{ - /* Off-by-one ensures BIR_NO_ARG is skipped */ - return ((ssa->index + 1) << 1) | 0; -} - -static inline unsigned -pan_src_index(nir_src *src) -{ - if (src->is_ssa) - return pan_ssa_index(src->ssa); - else { - assert(!src->reg.indirect); - return (src->reg.reg->index << 1) | PAN_IS_REG; - } -} - -static inline unsigned -pan_dest_index(nir_dest *dst) -{ - if (dst->is_ssa) - return pan_ssa_index(&dst->ssa); - else { - assert(!dst->reg.indirect); - return (dst->reg.reg->index << 1) | PAN_IS_REG; - } -} - /* IR printing helpers */ void pan_print_alu_type(nir_alu_type t, FILE *fp); -/* Until it can be upstreamed.. */ -bool pan_has_source_mod(nir_alu_src *src, nir_op op); -bool pan_has_dest_mod(nir_dest **dest, nir_op op); - /* NIR passes to do some backend-specific lowering */ #define PAN_WRITEOUT_C 1 #define PAN_WRITEOUT_Z 2 #define PAN_WRITEOUT_S 4 +#define PAN_WRITEOUT_2 8 -bool pan_nir_reorder_writeout(nir_shader *nir); bool pan_nir_lower_zs_store(nir_shader *nir); +bool pan_nir_lower_store_component(nir_shader *shader); +bool pan_nir_lower_image_ms(nir_shader *shader); bool pan_nir_lower_64bit_intrin(nir_shader *shader); bool pan_lower_helper_invocation(nir_shader *shader); bool pan_lower_sample_pos(nir_shader *shader); +bool pan_lower_xfb(nir_shader *nir); + +bool pan_lower_image_index(nir_shader *shader, unsigned vs_img_attrib_offset); + +void pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info); + +/* + * Helper returning the subgroup size. Generally, this is equal to the number of + * threads in a warp. For Midgard (including warping models), this returns 1, as + * subgroups are not supported. + */ +static inline unsigned +pan_subgroup_size(unsigned arch) +{ + if (arch >= 9) + return 16; + else if (arch >= 7) + return 8; + else if (arch >= 6) + return 4; + else + return 1; +} + +/* + * Helper extracting the table from a given handle of Valhall descriptor model. 
+ */ +static inline unsigned +pan_res_handle_get_table(unsigned handle) +{ + unsigned table = handle >> 24; + + assert(table < 64); + return table; +} + +/* + * Helper returning the index from a given handle of Valhall descriptor model. + */ +static inline unsigned +pan_res_handle_get_index(unsigned handle) +{ + return handle & BITFIELD_MASK(24); +} + +/* + * Helper creating an handle for Valhall descriptor model. + */ +static inline unsigned +pan_res_handle(unsigned table, unsigned index) +{ + assert(table < 64); + assert(index < (1u << 24)); + + return (table << 24) | index; +} #endif diff --git a/src/panfrost/util/pan_liveness.c b/src/panfrost/util/pan_liveness.c index 0ec9652b59d..e299bc29d75 100644 --- a/src/panfrost/util/pan_liveness.c +++ b/src/panfrost/util/pan_liveness.c @@ -21,10 +21,10 @@ * SOFTWARE. */ -#include "pan_ir.h" -#include "util/u_memory.h" #include "util/list.h" #include "util/set.h" +#include "util/u_memory.h" +#include "pan_ir.h" /* Routines for liveness analysis. Liveness is tracked per byte per node. Per * byte granularity is necessary for proper handling of int8 */ @@ -32,28 +32,28 @@ void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask) { - if (node >= max) - return; + if (node >= max) + return; - live[node] |= mask; + live[node] |= mask; } void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask) { - if (node >= max) - return; + if (node >= max) + return; - live[node] &= ~mask; + live[node] &= ~mask; } bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max) { - if (node >= max) - return false; + if (node >= max) + return false; - return live[node]; + return live[node]; } /* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */ @@ -61,10 +61,10 @@ pan_liveness_get(uint16_t *live, unsigned node, uint16_t max) static void liveness_block_live_out(pan_block *blk, unsigned temp_count) { - pan_foreach_successor(blk, succ) { - for (unsigned i = 0; i < temp_count; ++i) - blk->live_out[i] |= succ->live_in[i]; - } + pan_foreach_successor(blk, succ) { + for (unsigned i = 0; i < temp_count; ++i) + blk->live_out[i] |= succ->live_in[i]; + } } /* Liveness analysis is a backwards-may dataflow analysis pass. Within a block, @@ -72,32 +72,30 @@ liveness_block_live_out(pan_block *blk, unsigned temp_count) * returns whether progress was made. 
*/ static bool -liveness_block_update( - pan_block *blk, unsigned temp_count, - pan_liveness_update callback) +liveness_block_update(pan_block *blk, unsigned temp_count, + pan_liveness_update callback) { - bool progress = false; + bool progress = false; - liveness_block_live_out(blk, temp_count); + liveness_block_live_out(blk, temp_count); - uint16_t *live = ralloc_array(blk, uint16_t, temp_count); - memcpy(live, blk->live_out, temp_count * sizeof(uint16_t)); + uint16_t *live = ralloc_array(blk, uint16_t, temp_count); + memcpy(live, blk->live_out, temp_count * sizeof(uint16_t)); - pan_foreach_instr_in_block_rev(blk, ins) - callback(live, (void *) ins, temp_count); + pan_foreach_instr_in_block_rev(blk, ins) + callback(live, (void *)ins, temp_count); - /* To figure out progress, diff live_in */ + /* To figure out progress, diff live_in */ - for (unsigned i = 0; (i < temp_count) && !progress; ++i) - progress |= (blk->live_in[i] != live[i]); + for (unsigned i = 0; (i < temp_count) && !progress; ++i) + progress |= (blk->live_in[i] != live[i]); - ralloc_free(blk->live_in); - blk->live_in = live; + ralloc_free(blk->live_in); + blk->live_in = live; - return progress; + return progress; } - /* Globally, liveness analysis uses a fixed-point algorithm based on a * worklist. We initialize a work list with the exit block. We iterate the work * list to compute live_in from live_out for each block on the work list, @@ -105,70 +103,66 @@ liveness_block_update( */ void -pan_compute_liveness( - struct list_head *blocks, - unsigned temp_count, - pan_liveness_update callback) +pan_compute_liveness(struct list_head *blocks, unsigned temp_count, + pan_liveness_update callback) { - /* Set of pan_block */ - struct set *work_list = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + /* Set of pan_block */ + struct set *work_list = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - struct set *visited = _mesa_set_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + struct set *visited = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - /* Free any previous liveness, and allocate */ + /* Free any previous liveness, and allocate */ - pan_free_liveness(blocks); + pan_free_liveness(blocks); - list_for_each_entry(pan_block, block, blocks, link) { - block->live_in = rzalloc_array(block, uint16_t, temp_count); - block->live_out = rzalloc_array(block, uint16_t, temp_count); - } + list_for_each_entry(pan_block, block, blocks, link) { + block->live_in = rzalloc_array(block, uint16_t, temp_count); + block->live_out = rzalloc_array(block, uint16_t, temp_count); + } - /* Initialize the work list with the exit block */ - struct set_entry *cur; + /* Initialize the work list with the exit block */ + struct set_entry *cur; - cur = _mesa_set_add(work_list, pan_exit_block(blocks)); + cur = _mesa_set_add(work_list, pan_exit_block(blocks)); - /* Iterate the work list */ + /* Iterate the work list */ - do { - /* Pop off a block */ - pan_block *blk = (struct pan_block *) cur->key; - _mesa_set_remove(work_list, cur); + do { + /* Pop off a block */ + pan_block *blk = (struct pan_block *)cur->key; + _mesa_set_remove(work_list, cur); - /* Update its liveness information */ - bool progress = liveness_block_update(blk, temp_count, callback); + /* Update its liveness information */ + bool progress = liveness_block_update(blk, temp_count, callback); - /* If we made progress, we need to process the predecessors */ + /* If we made progress, we need to process the 
predecessors */ - if (progress || !_mesa_set_search(visited, blk)) { - pan_foreach_predecessor(blk, pred) - _mesa_set_add(work_list, pred); - } + if (progress || !_mesa_set_search(visited, blk)) { + pan_foreach_predecessor(blk, pred) + _mesa_set_add(work_list, pred); + } - _mesa_set_add(visited, blk); - } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); + _mesa_set_add(visited, blk); + } while ((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); - _mesa_set_destroy(visited, NULL); - _mesa_set_destroy(work_list, NULL); + _mesa_set_destroy(visited, NULL); + _mesa_set_destroy(work_list, NULL); } void pan_free_liveness(struct list_head *blocks) { - list_for_each_entry(pan_block, block, blocks, link) { - if (block->live_in) - ralloc_free(block->live_in); + list_for_each_entry(pan_block, block, blocks, link) { + if (block->live_in) + ralloc_free(block->live_in); - if (block->live_out) - ralloc_free(block->live_out); + if (block->live_out) + ralloc_free(block->live_out); - block->live_in = NULL; - block->live_out = NULL; - } + block->live_in = NULL; + block->live_out = NULL; + } } diff --git a/src/panfrost/util/pan_lower_64bit_intrin.c b/src/panfrost/util/pan_lower_64bit_intrin.c deleted file mode 100644 index 7c4edcfa9d7..00000000000 --- a/src/panfrost/util/pan_lower_64bit_intrin.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (C) 2020 Icecream95 <ixn@disroot.org> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "pan_ir.h" -#include "compiler/nir/nir_builder.h" - -/* OpenCL uses 64-bit types for some intrinsic functions, including - * global_invocation_id(). This could be worked around during conversion to - * MIR, except that global_invocation_id is a vec3, and the 128-bit registers - * on Midgard can only hold a 64-bit vec2. - * Rather than attempting to add hacky 64-bit vec3 support, convert these - * intrinsics to 32-bit and add a cast back to 64-bit, and rely on NIR not - * vectorizing back to vec3. 
- */ - -static bool -nir_lower_64bit_intrin_instr(nir_builder *b, nir_instr *instr, void *data) -{ - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { - case nir_intrinsic_load_global_invocation_id: - case nir_intrinsic_load_global_invocation_id_zero_base: - case nir_intrinsic_load_workgroup_id: - case nir_intrinsic_load_num_workgroups: - break; - - default: - return false; - } - - if (nir_dest_bit_size(intr->dest) != 64) - return false; - - b->cursor = nir_after_instr(instr); - - assert(intr->dest.is_ssa); - intr->dest.ssa.bit_size = 32; - - nir_ssa_def *conv = nir_u2u64(b, &intr->dest.ssa); - - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, conv, - conv->parent_instr); - - return true; -} - -bool -pan_nir_lower_64bit_intrin(nir_shader *shader) -{ - return nir_shader_instructions_pass(shader, - nir_lower_64bit_intrin_instr, - nir_metadata_block_index | nir_metadata_dominance, - NULL); -} diff --git a/src/panfrost/util/pan_lower_framebuffer.c b/src/panfrost/util/pan_lower_framebuffer.c index 71795fd9416..96e24683b25 100644 --- a/src/panfrost/util/pan_lower_framebuffer.c +++ b/src/panfrost/util/pan_lower_framebuffer.c @@ -47,12 +47,11 @@ * smallest precision necessary to store the pixel losslessly. */ +#include "pan_lower_framebuffer.h" #include "compiler/nir/nir.h" #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_format_convert.h" #include "util/format/u_format.h" -#include "pan_lower_framebuffer.h" -#include "panfrost-quirks.h" /* Determines the unpacked type best suiting a given format, so the rest of the * pipeline may be adjusted accordingly */ @@ -60,86 +59,54 @@ nir_alu_type pan_unpacked_type_for_format(const struct util_format_description *desc) { - int c = util_format_get_first_non_void_channel(desc->format); - - if (c == -1) - unreachable("Void format not renderable"); - - bool large = (desc->channel[c].size > 16); - bool large_norm = (desc->channel[c].size > 8); - bool bit8 = (desc->channel[c].size == 8); - assert(desc->channel[c].size <= 32); - - if (desc->channel[c].normalized) - return large_norm ? nir_type_float32 : nir_type_float16; - - switch (desc->channel[c].type) { - case UTIL_FORMAT_TYPE_UNSIGNED: - return bit8 ? nir_type_uint8 : - large ? nir_type_uint32 : nir_type_uint16; - case UTIL_FORMAT_TYPE_SIGNED: - return bit8 ? nir_type_int8 : - large ? nir_type_int32 : nir_type_int16; - case UTIL_FORMAT_TYPE_FLOAT: - return large ? nir_type_float32 : nir_type_float16; - default: - unreachable("Format not renderable"); - } + int c = util_format_get_first_non_void_channel(desc->format); + + if (c == -1) + unreachable("Void format not renderable"); + + bool large = (desc->channel[c].size > 16); + bool large_norm = (desc->channel[c].size > 8); + bool bit8 = (desc->channel[c].size == 8); + assert(desc->channel[c].size <= 32); + + if (desc->channel[c].normalized) + return large_norm ? nir_type_float32 : nir_type_float16; + + switch (desc->channel[c].type) { + case UTIL_FORMAT_TYPE_UNSIGNED: + return bit8 ? nir_type_uint8 : large ? nir_type_uint32 : nir_type_uint16; + case UTIL_FORMAT_TYPE_SIGNED: + return bit8 ? nir_type_int8 : large ? nir_type_int32 : nir_type_int16; + case UTIL_FORMAT_TYPE_FLOAT: + return large ? 
nir_type_float32 : nir_type_float16; + default: + unreachable("Format not renderable"); + } } -static enum pan_format_class -pan_format_class_load(const struct util_format_description *desc, unsigned quirks) +static bool +pan_is_format_native(const struct util_format_description *desc, + bool broken_ld_special, bool is_store) { - /* Pure integers can be loaded via EXT_framebuffer_fetch and should be - * handled as a raw load with a size conversion (it's cheap). Likewise, - * since float framebuffers are internally implemented as raw (i.e. - * integer) framebuffers with blend shaders to go back and forth, they - * should be s/w as well */ - - if (util_format_is_pure_integer(desc->format) || util_format_is_float(desc->format)) - return PAN_FORMAT_SOFTWARE; - - /* Check if we can do anything better than software architecturally */ - if (quirks & MIDGARD_NO_TYPED_BLEND_LOADS) { - return (quirks & NO_BLEND_PACKS) - ? PAN_FORMAT_SOFTWARE : PAN_FORMAT_PACK; - } - - /* Some formats are missing as typed on some GPUs but have unpacks */ - if (quirks & MIDGARD_MISSING_LOADS) { - switch (desc->format) { - case PIPE_FORMAT_R11G11B10_FLOAT: - return PAN_FORMAT_PACK; - default: - return PAN_FORMAT_NATIVE; - } - } - - /* Otherwise, we can do native */ - return PAN_FORMAT_NATIVE; -} + if (is_store || broken_ld_special) + return false; -static enum pan_format_class -pan_format_class_store(const struct util_format_description *desc, unsigned quirks) -{ - /* Check if we can do anything better than software architecturally */ - if (quirks & MIDGARD_NO_TYPED_BLEND_STORES) { - return (quirks & NO_BLEND_PACKS) - ? PAN_FORMAT_SOFTWARE : PAN_FORMAT_PACK; - } + if (util_format_is_pure_integer(desc->format) || + util_format_is_float(desc->format)) + return false; - return PAN_FORMAT_NATIVE; -} + /* Some formats are missing as typed but have unpacks */ + if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT) + return false; -/* Convenience method */ + if (desc->is_array) { + int c = util_format_get_first_non_void_channel(desc->format); + assert(c >= 0); + if (desc->channel[c].size > 8) + return false; + } -static enum pan_format_class -pan_format_class(const struct util_format_description *desc, unsigned quirks, bool is_store) -{ - if (is_store) - return pan_format_class_store(desc, quirks); - else - return pan_format_class_load(desc, quirks); + return true; } /* Software packs/unpacks, by format class. Packs take in the pixel value typed @@ -147,430 +114,523 @@ pan_format_class(const struct util_format_description *desc, unsigned quirks, bo * suitable for storing (with components replicated to fill). Unpacks do the * reverse but cannot rely on replication. 
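
A minimal scalar model of the pack-side replication convention (it mirrors the pan_replicate() helper below, with no NIR types involved):

#include <stdint.h>

/* Fill a 4-wide output by repeating the source components in order,
 * e.g. a 2-component source (a, b) becomes (a, b, a, b). */
static inline void
replicate4(uint32_t out[4], const uint32_t *in, unsigned num_components)
{
   for (unsigned i = 0; i < 4; ++i)
      out[i] = in[i % num_components];
}
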
*/ -static nir_ssa_def * -pan_replicate(nir_builder *b, nir_ssa_def *v, unsigned num_components) +static nir_def * +pan_replicate(nir_builder *b, nir_def *v, unsigned num_components) { - nir_ssa_def *replicated[4]; - - for (unsigned i = 0; i < 4; ++i) - replicated[i] = nir_channel(b, v, i % num_components); + nir_def *replicated[4]; - return nir_vec(b, replicated, 4); -} + for (unsigned i = 0; i < 4; ++i) + replicated[i] = nir_channel(b, v, i % num_components); -static nir_ssa_def * -pan_unpack_pure_32(nir_builder *b, nir_ssa_def *pack, unsigned num_components) -{ - return nir_channels(b, pack, (1 << num_components) - 1); + return nir_vec(b, replicated, 4); } /* Pure x16 formats are x16 unpacked, so it's similar, but we need to pack * upper/lower halves of course */ -static nir_ssa_def * -pan_pack_pure_16(nir_builder *b, nir_ssa_def *v, unsigned num_components) +static nir_def * +pan_pack_pure_16(nir_builder *b, nir_def *v, unsigned num_components) { - nir_ssa_def *v4 = pan_replicate(b, v, num_components); + nir_def *v4 = pan_replicate(b, v, num_components); - nir_ssa_def *lo = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 0)); - nir_ssa_def *hi = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 2)); + nir_def *lo = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 0)); + nir_def *hi = nir_pack_32_2x16(b, nir_channels(b, v4, 0x3 << 2)); - return nir_vec4(b, lo, hi, lo, hi); + return nir_vec4(b, lo, hi, lo, hi); } -static nir_ssa_def * -pan_unpack_pure_16(nir_builder *b, nir_ssa_def *pack, unsigned num_components) +static nir_def * +pan_unpack_pure_16(nir_builder *b, nir_def *pack, unsigned num_components) { - nir_ssa_def *unpacked[4]; + nir_def *unpacked[4]; + + assert(num_components <= 4); + + for (unsigned i = 0; i < num_components; i += 2) { + nir_def *halves = nir_unpack_32_2x16(b, nir_channel(b, pack, i >> 1)); - assert(num_components <= 4); + unpacked[i + 0] = nir_channel(b, halves, 0); + unpacked[i + 1] = nir_channel(b, halves, 1); + } - for (unsigned i = 0; i < num_components; i += 2) { - nir_ssa_def *halves = - nir_unpack_32_2x16(b, nir_channel(b, pack, i >> 1)); + return nir_pad_vec4(b, nir_vec(b, unpacked, num_components)); +} + +static nir_def * +pan_pack_reorder(nir_builder *b, const struct util_format_description *desc, + nir_def *v) +{ + unsigned swizzle[4] = {0, 1, 2, 3}; - unpacked[i + 0] = nir_channel(b, halves, 0); - unpacked[i + 1] = nir_channel(b, halves, 1); - } + for (unsigned i = 0; i < v->num_components; i++) { + if (desc->swizzle[i] <= PIPE_SWIZZLE_W) + swizzle[i] = desc->swizzle[i]; + } - return nir_pad_vec4(b, nir_vec(b, unpacked, num_components)); + return nir_swizzle(b, v, swizzle, v->num_components); } -static nir_ssa_def * -pan_replicate_4(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_unpack_reorder(nir_builder *b, const struct util_format_description *desc, + nir_def *v) { - return nir_vec4(b, v, v, v, v); + unsigned swizzle[4] = {0, 1, 2, 3}; + + for (unsigned i = 0; i < v->num_components; i++) { + if (desc->swizzle[i] <= PIPE_SWIZZLE_W) + swizzle[desc->swizzle[i]] = i; + } + + return nir_swizzle(b, v, swizzle, v->num_components); } -static nir_ssa_def * -pan_pack_pure_8(nir_builder *b, nir_ssa_def *v, unsigned num_components) +static nir_def * +pan_pack_pure_8(nir_builder *b, nir_def *v, unsigned num_components) { - return pan_replicate_4(b, nir_pack_32_4x8(b, pan_replicate(b, v, num_components))); + return nir_replicate( + b, nir_pack_32_4x8(b, pan_replicate(b, v, num_components)), 4); } -static nir_ssa_def * -pan_unpack_pure_8(nir_builder *b, 
nir_ssa_def *pack, unsigned num_components) +static nir_def * +pan_unpack_pure_8(nir_builder *b, nir_def *pack, unsigned num_components) { - nir_ssa_def *unpacked = nir_unpack_32_4x8(b, nir_channel(b, pack, 0)); - return nir_channels(b, unpacked, (1 << num_components) - 1); + nir_def *unpacked = nir_unpack_32_4x8(b, nir_channel(b, pack, 0)); + return nir_trim_vector(b, unpacked, num_components); } -/* For <= 8-bits per channel, UNORM formats are packed like UNORM 8, with - * zeroes spacing out each component as needed */ +static nir_def * +pan_fsat(nir_builder *b, nir_def *v, bool is_signed) +{ + if (is_signed) + return nir_fsat_signed_mali(b, v); + else + return nir_fsat(b, v); +} -static nir_ssa_def * -pan_pack_unorm(nir_builder *b, nir_ssa_def *v, - unsigned x, unsigned y, unsigned z, unsigned w) +static float +norm_scale(bool snorm, unsigned bits) { - /* If a channel has N bits, 1.0 is encoded as 2^N - 1 */ - nir_ssa_def *scales = nir_imm_vec4_16(b, - (1 << x) - 1, (1 << y) - 1, - (1 << z) - 1, (1 << w) - 1); + if (snorm) + return (1 << (bits - 1)) - 1; + else + return (1 << bits) - 1; +} - /* If a channel has N bits, we pad out to the byte by (8 - N) bits */ - nir_ssa_def *shifts = nir_imm_ivec4(b, 8 - x, 8 - y, 8 - z, 8 - w); +/* For <= 8-bits per channel, [U,S]NORM formats are packed like [U,S]NORM 8, + * with zeroes spacing out each component as needed */ - nir_ssa_def *f = nir_fmul(b, nir_fsat(b, nir_pad_vec4(b, v)), scales); - nir_ssa_def *u8 = nir_f2u8(b, nir_fround_even(b, f)); - nir_ssa_def *s = nir_ishl(b, u8, shifts); - nir_ssa_def *repl = nir_pack_32_4x8(b, s); +static nir_def * +pan_pack_norm(nir_builder *b, nir_def *v, unsigned x, unsigned y, unsigned z, + unsigned w, bool is_signed) +{ + /* If a channel has N bits, 1.0 is encoded as 2^N - 1 for UNORMs and + * 2^(N-1) - 1 for SNORMs */ + nir_def *scales = + is_signed ? nir_imm_vec4_16(b, (1 << (x - 1)) - 1, (1 << (y - 1)) - 1, + (1 << (z - 1)) - 1, (1 << (w - 1)) - 1) + : nir_imm_vec4_16(b, (1 << x) - 1, (1 << y) - 1, (1 << z) - 1, + (1 << w) - 1); + + /* If a channel has N bits, we pad out to the byte by (8 - N) bits */ + nir_def *shifts = nir_imm_ivec4(b, 8 - x, 8 - y, 8 - z, 8 - w); + nir_def *clamped = pan_fsat(b, nir_pad_vec4(b, v), is_signed); + + nir_def *f = nir_fmul(b, clamped, scales); + nir_def *u8 = nir_f2u8(b, nir_fround_even(b, f)); + nir_def *s = nir_ishl(b, u8, shifts); + nir_def *repl = nir_pack_32_4x8(b, s); + + return nir_replicate(b, repl, 4); +} - return pan_replicate_4(b, repl); +static nir_def * +pan_pack_unorm(nir_builder *b, nir_def *v, unsigned x, unsigned y, unsigned z, + unsigned w) +{ + return pan_pack_norm(b, v, x, y, z, w, false); } /* RGB10_A2 is packed in the tilebuffer as the bottom 3 bytes being the top * 8-bits of RGB and the top byte being RGBA as 2-bits packed. 
As imirkin * pointed out, this means free conversion to RGBX8 */ -static nir_ssa_def * -pan_pack_unorm_1010102(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_pack_unorm_1010102(nir_builder *b, nir_def *v) { - nir_ssa_def *scale = nir_imm_vec4(b, 1023.0, 1023.0, 1023.0, 3.0); - nir_ssa_def *s = nir_f2u32(b, nir_fround_even(b, nir_fmul(b, nir_fsat(b, v), scale))); + nir_def *scale = nir_imm_vec4(b, 1023.0, 1023.0, 1023.0, 3.0); + nir_def *s = + nir_f2u32(b, nir_fround_even(b, nir_fmul(b, nir_fsat(b, v), scale))); - nir_ssa_def *top8 = nir_ushr(b, s, nir_imm_ivec4(b, 0x2, 0x2, 0x2, 0x2)); - nir_ssa_def *top8_rgb = nir_pack_32_4x8(b, nir_u2u8(b, top8)); + nir_def *top8 = nir_ushr(b, s, nir_imm_ivec4(b, 0x2, 0x2, 0x2, 0x2)); + nir_def *top8_rgb = nir_pack_32_4x8(b, nir_u2u8(b, top8)); - nir_ssa_def *bottom2 = nir_iand(b, s, nir_imm_ivec4(b, 0x3, 0x3, 0x3, 0x3)); + nir_def *bottom2 = nir_iand(b, s, nir_imm_ivec4(b, 0x3, 0x3, 0x3, 0x3)); - nir_ssa_def *top = - nir_ior(b, - nir_ior(b, - nir_ishl(b, nir_channel(b, bottom2, 0), nir_imm_int(b, 24 + 0)), - nir_ishl(b, nir_channel(b, bottom2, 1), nir_imm_int(b, 24 + 2))), - nir_ior(b, - nir_ishl(b, nir_channel(b, bottom2, 2), nir_imm_int(b, 24 + 4)), - nir_ishl(b, nir_channel(b, bottom2, 3), nir_imm_int(b, 24 + 6)))); + nir_def *top = + nir_ior(b, + nir_ior(b, nir_ishl_imm(b, nir_channel(b, bottom2, 0), 24 + 0), + nir_ishl_imm(b, nir_channel(b, bottom2, 1), 24 + 2)), + nir_ior(b, nir_ishl_imm(b, nir_channel(b, bottom2, 2), 24 + 4), + nir_ishl_imm(b, nir_channel(b, bottom2, 3), 24 + 6))); - nir_ssa_def *p = nir_ior(b, top, top8_rgb); - return pan_replicate_4(b, p); + nir_def *p = nir_ior(b, top, top8_rgb); + return nir_replicate(b, p, 4); } /* On the other hand, the pure int RGB10_A2 is identical to the spec */ -static nir_ssa_def * -pan_pack_uint_1010102(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_pack_int_1010102(nir_builder *b, nir_def *v, bool is_signed) { - nir_ssa_def *shift = nir_ishl(b, nir_u2u32(b, v), - nir_imm_ivec4(b, 0, 10, 20, 30)); + v = nir_u2u32(b, v); + + /* Clamp the values */ + if (is_signed) { + v = nir_imin(b, v, nir_imm_ivec4(b, 511, 511, 511, 1)); + v = nir_imax(b, v, nir_imm_ivec4(b, -512, -512, -512, -2)); + } else { + v = nir_umin(b, v, nir_imm_ivec4(b, 1023, 1023, 1023, 3)); + } - nir_ssa_def *p = nir_ior(b, - nir_ior(b, nir_channel(b, shift, 0), nir_channel(b, shift, 1)), - nir_ior(b, nir_channel(b, shift, 2), nir_channel(b, shift, 3))); + v = nir_ishl(b, v, nir_imm_ivec4(b, 0, 10, 20, 30)); + v = nir_ior(b, nir_ior(b, nir_channel(b, v, 0), nir_channel(b, v, 1)), + nir_ior(b, nir_channel(b, v, 2), nir_channel(b, v, 3))); - return pan_replicate_4(b, p); + return nir_replicate(b, v, 4); } -static nir_ssa_def * -pan_unpack_uint_1010102(nir_builder *b, nir_ssa_def *packed) +static nir_def * +pan_unpack_int_1010102(nir_builder *b, nir_def *packed, bool is_signed) { - nir_ssa_def *chan = nir_channel(b, packed, 0); + nir_def *v = nir_replicate(b, nir_channel(b, packed, 0), 4); - nir_ssa_def *shift = nir_ushr(b, pan_replicate_4(b, chan), - nir_imm_ivec4(b, 0, 10, 20, 30)); + /* Left shift all components so the sign bit is on the MSB, and + * can be extended by ishr(). The ishl()+[u,i]shr() combination + * sets all unused bits to 0 without requiring a mask. 
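
A scalar model of that shift pair, worked for the green channel (bits 19:10) of a packed RGB10_A2 word; plain C, nothing Mali-specific:

#include <stdint.h>

static inline int32_t
unpack_g10_sint(uint32_t w)
{
   /* bits 19:10 -> bits 31:22, then the arithmetic shift sign-extends */
   return (int32_t)(w << 12) >> 22;
}

static inline uint32_t
unpack_g10_uint(uint32_t w)
{
   /* same shifts, but the logical shift zero-extends instead */
   return (w << 12) >> 22;
}
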
+ */ + v = nir_ishl(b, v, nir_imm_ivec4(b, 22, 12, 2, 0)); - nir_ssa_def *mask = nir_iand(b, shift, - nir_imm_ivec4(b, 0x3ff, 0x3ff, 0x3ff, 0x3)); + if (is_signed) + v = nir_ishr(b, v, nir_imm_ivec4(b, 22, 22, 22, 30)); + else + v = nir_ushr(b, v, nir_imm_ivec4(b, 22, 22, 22, 30)); - return nir_i2i16(b, mask); + return nir_i2i16(b, v); } /* NIR means we can *finally* catch a break */ -static nir_ssa_def * -pan_pack_r11g11b10(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_pack_r11g11b10(nir_builder *b, nir_def *v) { - return pan_replicate_4(b, nir_format_pack_11f11f10f(b, - nir_f2f32(b, v))); + return nir_replicate(b, nir_format_pack_11f11f10f(b, nir_f2f32(b, v)), 4); } -static nir_ssa_def * -pan_unpack_r11g11b10(nir_builder *b, nir_ssa_def *v) +static nir_def * +pan_unpack_r11g11b10(nir_builder *b, nir_def *v) { - nir_ssa_def *f32 = nir_format_unpack_11f11f10f(b, nir_channel(b, v, 0)); - nir_ssa_def *f16 = nir_f2fmp(b, f32); - - /* Extend to vec4 with alpha */ - nir_ssa_def *components[4] = { - nir_channel(b, f16, 0), - nir_channel(b, f16, 1), - nir_channel(b, f16, 2), - nir_imm_float16(b, 1.0) - }; - - return nir_vec(b, components, 4); + nir_def *f32 = nir_format_unpack_11f11f10f(b, nir_channel(b, v, 0)); + nir_def *f16 = nir_f2fmp(b, f32); + + /* Extend to vec4 with alpha */ + nir_def *components[4] = {nir_channel(b, f16, 0), nir_channel(b, f16, 1), + nir_channel(b, f16, 2), nir_imm_float16(b, 1.0)}; + + return nir_vec(b, components, 4); } /* Wrapper around sRGB conversion */ -static nir_ssa_def * -pan_linear_to_srgb(nir_builder *b, nir_ssa_def *linear) +static nir_def * +pan_linear_to_srgb(nir_builder *b, nir_def *linear) { - nir_ssa_def *rgb = nir_channels(b, linear, 0x7); + nir_def *rgb = nir_trim_vector(b, linear, 3); - /* TODO: fp16 native conversion */ - nir_ssa_def *srgb = nir_f2fmp(b, - nir_format_linear_to_srgb(b, nir_f2f32(b, rgb))); + /* TODO: fp16 native conversion */ + nir_def *srgb = + nir_f2fmp(b, nir_format_linear_to_srgb(b, nir_f2f32(b, rgb))); - nir_ssa_def *comp[4] = { - nir_channel(b, srgb, 0), - nir_channel(b, srgb, 1), - nir_channel(b, srgb, 2), - nir_channel(b, linear, 3), - }; + nir_def *comp[4] = { + nir_channel(b, srgb, 0), + nir_channel(b, srgb, 1), + nir_channel(b, srgb, 2), + nir_channel(b, linear, 3), + }; - return nir_vec(b, comp, 4); + return nir_vec(b, comp, 4); +} + +static nir_def * +pan_unpack_pure(nir_builder *b, nir_def *packed, unsigned size, unsigned nr) +{ + switch (size) { + case 32: + return nir_trim_vector(b, packed, nr); + case 16: + return pan_unpack_pure_16(b, packed, nr); + case 8: + return pan_unpack_pure_8(b, packed, nr); + default: + unreachable("Unrenderable size"); + } } /* Generic dispatches for un/pack regardless of format */ -static nir_ssa_def * -pan_unpack(nir_builder *b, - const struct util_format_description *desc, - nir_ssa_def *packed) +static nir_def * +pan_unpack(nir_builder *b, const struct util_format_description *desc, + nir_def *packed) { - if (desc->is_array) { - int c = util_format_get_first_non_void_channel(desc->format); - assert(c >= 0); - struct util_format_channel_description d = desc->channel[c]; - - if (d.size == 32 || d.size == 16) { - assert(!d.normalized); - assert(d.type == UTIL_FORMAT_TYPE_FLOAT || d.pure_integer); - - return d.size == 32 ? 
pan_unpack_pure_32(b, packed, desc->nr_channels) : - pan_unpack_pure_16(b, packed, desc->nr_channels); - } else if (d.size == 8) { - assert(d.pure_integer); - return pan_unpack_pure_8(b, packed, desc->nr_channels); - } else { - unreachable("Unrenderable size"); - } - } - - switch (desc->format) { - case PIPE_FORMAT_R10G10B10A2_UINT: - return pan_unpack_uint_1010102(b, packed); - case PIPE_FORMAT_R11G11B10_FLOAT: - return pan_unpack_r11g11b10(b, packed); - default: - break; - } - - fprintf(stderr, "%s\n", desc->name); - unreachable("Unknown format"); + if (desc->is_array) { + int c = util_format_get_first_non_void_channel(desc->format); + assert(c >= 0); + struct util_format_channel_description d = desc->channel[c]; + nir_def *unpacked = pan_unpack_pure(b, packed, d.size, desc->nr_channels); + + /* Normalized formats are unpacked as integers. We need to + * convert to float for the final result. + */ + if (d.normalized) { + bool snorm = desc->is_snorm; + unsigned float_sz = (d.size <= 8 ? 16 : 32); + float multiplier = norm_scale(snorm, d.size); + + nir_def *as_float = snorm ? nir_i2fN(b, unpacked, float_sz) + : nir_u2fN(b, unpacked, float_sz); + + return nir_fmul_imm(b, as_float, 1.0 / multiplier); + } else { + return unpacked; + } + } + + switch (desc->format) { + case PIPE_FORMAT_R10G10B10A2_UINT: + case PIPE_FORMAT_B10G10R10A2_UINT: + return pan_unpack_int_1010102(b, packed, false); + case PIPE_FORMAT_R10G10B10A2_SINT: + case PIPE_FORMAT_B10G10R10A2_SINT: + return pan_unpack_int_1010102(b, packed, true); + case PIPE_FORMAT_R11G11B10_FLOAT: + return pan_unpack_r11g11b10(b, packed); + default: + break; + } + + fprintf(stderr, "%s\n", desc->name); + unreachable("Unknown format"); } -static nir_ssa_def * -pan_pack(nir_builder *b, - const struct util_format_description *desc, - nir_ssa_def *unpacked) +static nir_def *pan_pack(nir_builder *b, + const struct util_format_description *desc, + nir_def * unpacked) { - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) - unpacked = pan_linear_to_srgb(b, unpacked); - - if (util_format_is_unorm8(desc)) - return pan_pack_unorm(b, unpacked, 8, 8, 8, 8); - - if (desc->is_array) { - int c = util_format_get_first_non_void_channel(desc->format); - assert(c >= 0); - struct util_format_channel_description d = desc->channel[c]; - - if (d.size == 32 || d.size == 16) { - assert(!d.normalized); - assert(d.type == UTIL_FORMAT_TYPE_FLOAT || d.pure_integer); - - return d.size == 32 ? 
- pan_replicate(b, unpacked, desc->nr_channels) : - pan_pack_pure_16(b, unpacked, desc->nr_channels); - } else if (d.size == 8) { - assert(d.pure_integer); - return pan_pack_pure_8(b, unpacked, desc->nr_channels); - } else { - unreachable("Unrenderable size"); - } - } - - switch (desc->format) { - case PIPE_FORMAT_B4G4R4A4_UNORM: - case PIPE_FORMAT_B4G4R4X4_UNORM: - case PIPE_FORMAT_A4R4_UNORM: - case PIPE_FORMAT_R4A4_UNORM: - case PIPE_FORMAT_A4B4G4R4_UNORM: - case PIPE_FORMAT_R4G4B4A4_UNORM: - return pan_pack_unorm(b, unpacked, 4, 4, 4, 4); - case PIPE_FORMAT_B5G5R5A1_UNORM: - case PIPE_FORMAT_R5G5B5A1_UNORM: - return pan_pack_unorm(b, unpacked, 5, 6, 5, 1); - case PIPE_FORMAT_R5G6B5_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - return pan_pack_unorm(b, unpacked, 5, 6, 5, 0); - case PIPE_FORMAT_R10G10B10A2_UNORM: - case PIPE_FORMAT_B10G10R10A2_UNORM: - return pan_pack_unorm_1010102(b, unpacked); - case PIPE_FORMAT_R10G10B10A2_UINT: - return pan_pack_uint_1010102(b, unpacked); - case PIPE_FORMAT_R11G11B10_FLOAT: - return pan_pack_r11g11b10(b, unpacked); - default: - break; - } - - fprintf(stderr, "%s\n", desc->name); - unreachable("Unknown format"); + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + unpacked = pan_linear_to_srgb(b, unpacked); + + if (desc->is_array) { + int c = util_format_get_first_non_void_channel(desc->format); + assert(c >= 0); + struct util_format_channel_description d = desc->channel[c]; + + /* Pure formats are packed as-is */ + nir_def *raw = unpacked; + + /* Normalized formats get normalized first */ + if (d.normalized) { + bool snorm = desc->is_snorm; + float multiplier = norm_scale(snorm, d.size); + nir_def *clamped = pan_fsat(b, unpacked, snorm); + nir_def *normed = nir_fmul_imm(b, clamped, multiplier); + + raw = nir_f2uN(b, normed, d.size); + } + + /* Pack the raw format */ + switch (d.size) { + case 32: + return pan_replicate(b, raw, desc->nr_channels); + case 16: + return pan_pack_pure_16(b, raw, desc->nr_channels); + case 8: + return pan_pack_pure_8(b, raw, desc->nr_channels); + default: + unreachable("Unrenderable size"); + } + } + + switch (desc->format) { + case PIPE_FORMAT_B4G4R4A4_UNORM: + case PIPE_FORMAT_B4G4R4X4_UNORM: + case PIPE_FORMAT_A4R4_UNORM: + case PIPE_FORMAT_R4A4_UNORM: + case PIPE_FORMAT_A4B4G4R4_UNORM: + case PIPE_FORMAT_R4G4B4A4_UNORM: + return pan_pack_unorm(b, unpacked, 4, 4, 4, 4); + case PIPE_FORMAT_B5G5R5A1_UNORM: + case PIPE_FORMAT_R5G5B5A1_UNORM: + return pan_pack_unorm(b, unpacked, 5, 6, 5, 1); + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_B5G6R5_UNORM: + return pan_pack_unorm(b, unpacked, 5, 6, 5, 0); + case PIPE_FORMAT_R10G10B10A2_UNORM: + case PIPE_FORMAT_B10G10R10A2_UNORM: + return pan_pack_unorm_1010102(b, unpacked); + case PIPE_FORMAT_R10G10B10A2_UINT: + case PIPE_FORMAT_B10G10R10A2_UINT: + return pan_pack_int_1010102(b, unpacked, false); + case PIPE_FORMAT_R10G10B10A2_SINT: + case PIPE_FORMAT_B10G10R10A2_SINT: + return pan_pack_int_1010102(b, unpacked, true); + case PIPE_FORMAT_R11G11B10_FLOAT: + return pan_pack_r11g11b10(b, unpacked); + default: + break; + } + + fprintf(stderr, "%s\n", desc->name); + unreachable("Unknown format"); } static void -pan_lower_fb_store(nir_shader *shader, - nir_builder *b, - nir_intrinsic_instr *intr, - const struct util_format_description *desc, - unsigned quirks) +pan_lower_fb_store(nir_builder *b, nir_intrinsic_instr *intr, + const struct util_format_description *desc, + bool reorder_comps, unsigned nr_samples) { - /* For stores, add conversion before */ - nir_ssa_def *unpacked = 
nir_ssa_for_src(b, intr->src[1], 4); - nir_ssa_def *packed = pan_pack(b, desc, unpacked); + /* For stores, add conversion before */ + nir_def *unpacked = intr->src[0].ssa; + unpacked = nir_pad_vec4(b, unpacked); - nir_store_raw_output_pan(b, packed); + /* Re-order the components */ + if (reorder_comps) + unpacked = pan_pack_reorder(b, desc, unpacked); + + nir_def *packed = pan_pack(b, desc, unpacked); + + /* We have to split writeout in 128 bit chunks */ + unsigned iterations = DIV_ROUND_UP(desc->block.bits * nr_samples, 128); + + for (unsigned s = 0; s < iterations; ++s) { + nir_store_raw_output_pan(b, packed, + .io_semantics = nir_intrinsic_io_semantics(intr), + .base = s); + } } -static nir_ssa_def * +static nir_def * pan_sample_id(nir_builder *b, int sample) { - return (sample >= 0) ? nir_imm_int(b, sample) : nir_load_sample_id(b); + return (sample >= 0) ? nir_imm_int(b, sample) : nir_load_sample_id(b); } static void -pan_lower_fb_load(nir_shader *shader, - nir_builder *b, - nir_intrinsic_instr *intr, - const struct util_format_description *desc, - unsigned base, int sample, unsigned quirks) +pan_lower_fb_load(nir_builder *b, nir_intrinsic_instr *intr, + const struct util_format_description *desc, + bool reorder_comps, int sample) +{ + nir_def *packed = + nir_load_raw_output_pan(b, 4, 32, pan_sample_id(b, sample), + .io_semantics = nir_intrinsic_io_semantics(intr)); + + /* Convert the raw value */ + nir_def *unpacked = pan_unpack(b, desc, packed); + + /* Convert to the size of the load intrinsic. + * + * We can assume that the type will match with the framebuffer format: + * + * Page 170 of the PDF of the OpenGL ES 3.0.6 spec says: + * + * If [UNORM or SNORM, convert to fixed-point]; otherwise no type + * conversion is applied. If the values written by the fragment shader + * do not match the format(s) of the corresponding color buffer(s), + * the result is undefined. + */ + + unsigned bits = intr->def.bit_size; + + nir_alu_type src_type = + nir_alu_type_get_base_type(pan_unpacked_type_for_format(desc)); + + unpacked = nir_convert_to_bit_size(b, unpacked, src_type, bits); + unpacked = nir_resize_vector(b, unpacked, intr->def.num_components); + + /* Reorder the components */ + if (reorder_comps) + unpacked = pan_unpack_reorder(b, desc, unpacked); + + nir_def_rewrite_uses_after(&intr->def, unpacked, &intr->instr); +} + +struct inputs { + const enum pipe_format *rt_fmts; + uint8_t raw_fmt_mask; + bool is_blend; + bool broken_ld_special; + unsigned nr_samples; +}; + +static bool +lower(nir_builder *b, nir_instr *instr, void *data) { - nir_ssa_def *packed = - nir_load_raw_output_pan(b, 4, 32, pan_sample_id(b, sample), - .base = base); - - /* Convert the raw value */ - nir_ssa_def *unpacked = pan_unpack(b, desc, packed); - - /* Convert to the size of the load intrinsic. - * - * We can assume that the type will match with the framebuffer format: - * - * Page 170 of the PDF of the OpenGL ES 3.0.6 spec says: - * - * If [UNORM or SNORM, convert to fixed-point]; otherwise no type - * conversion is applied. If the values written by the fragment shader - * do not match the format(s) of the corresponding color buffer(s), - * the result is undefined. 
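
A worked instance of the conversion step that follows (the RGBA16F choice is just an example; the calls are the ones this pass uses):

/* Example: RGBA16F render target, 32-bit load in the shader. The unpacked
 * value is a vec4 of f16 (pan_unpacked_type_for_format() -> nir_type_float16),
 * so it is widened to the intrinsic's bit size before rewriting uses: */
unpacked = nir_convert_to_bit_size(b, unpacked, nir_type_float, 32);
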
- */ - - unsigned bits = nir_dest_bit_size(intr->dest); - - nir_alu_type src_type = nir_alu_type_get_base_type( - pan_unpacked_type_for_format(desc)); - - unpacked = nir_convert_to_bit_size(b, unpacked, src_type, bits); - unpacked = nir_pad_vector(b, unpacked, nir_dest_num_components(intr->dest)); - - nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, unpacked, &intr->instr); + struct inputs *inputs = data; + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + bool is_load = intr->intrinsic == nir_intrinsic_load_output; + bool is_store = intr->intrinsic == nir_intrinsic_store_output; + + if (!(is_load || (is_store && inputs->is_blend))) + return false; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location < FRAG_RESULT_DATA0) + return false; + + unsigned rt = sem.location - FRAG_RESULT_DATA0; + if (inputs->rt_fmts[rt] == PIPE_FORMAT_NONE) + return false; + + const struct util_format_description *desc = + util_format_description(inputs->rt_fmts[rt]); + + /* Don't lower */ + if (pan_is_format_native(desc, inputs->broken_ld_special, is_store)) + return false; + + /* EXT_shader_framebuffer_fetch requires per-sample loads. MSAA blend + * shaders are not yet handled, so for now always load sample 0. + */ + int sample = inputs->is_blend ? 0 : -1; + bool reorder_comps = inputs->raw_fmt_mask & BITFIELD_BIT(rt); + + if (is_store) { + b->cursor = nir_before_instr(instr); + pan_lower_fb_store(b, intr, desc, reorder_comps, inputs->nr_samples); + } else { + b->cursor = nir_after_instr(instr); + pan_lower_fb_load(b, intr, desc, reorder_comps, sample); + } + + nir_instr_remove(instr); + return true; } bool pan_lower_framebuffer(nir_shader *shader, const enum pipe_format *rt_fmts, - bool is_blend, unsigned quirks) + uint8_t raw_fmt_mask, unsigned blend_shader_nr_samples, + bool broken_ld_special) { - if (shader->info.stage != MESA_SHADER_FRAGMENT) - return false; - - bool progress = false; - - nir_foreach_function(func, shader) { - nir_foreach_block(block, func->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - bool is_load = intr->intrinsic == nir_intrinsic_load_deref; - bool is_store = intr->intrinsic == nir_intrinsic_store_deref; - - if (!(is_load || (is_store && is_blend))) - continue; - - nir_variable *var = nir_intrinsic_get_var(intr, 0); - - if (var->data.mode != nir_var_shader_out) - continue; - - if (var->data.location < FRAG_RESULT_DATA0) - continue; - - unsigned base = var->data.driver_location; - unsigned rt = var->data.location - FRAG_RESULT_DATA0; - - if (rt_fmts[rt] == PIPE_FORMAT_NONE) - continue; - - const struct util_format_description *desc = - util_format_description(rt_fmts[rt]); - - enum pan_format_class fmt_class = - pan_format_class(desc, quirks, is_store); - - /* Don't lower */ - if (fmt_class == PAN_FORMAT_NATIVE) - continue; - - /* EXT_shader_framebuffer_fetch requires - * per-sample loads. - * MSAA blend shaders are not yet handled, so - * for now always load sample 0. */ - int sample = is_blend ? 
0 : -1; - - nir_builder b; - nir_builder_init(&b, func->impl); - - if (is_store) { - b.cursor = nir_before_instr(instr); - pan_lower_fb_store(shader, &b, intr, desc, quirks); - } else { - b.cursor = nir_after_instr(instr); - pan_lower_fb_load(shader, &b, intr, desc, base, sample, quirks); - } - - nir_instr_remove(instr); - - progress = true; - } - } - - nir_metadata_preserve(func->impl, nir_metadata_block_index | - nir_metadata_dominance); - } - - return progress; + assert(shader->info.stage == MESA_SHADER_FRAGMENT); + + return nir_shader_instructions_pass( + shader, lower, nir_metadata_block_index | nir_metadata_dominance, + &(struct inputs){ + .rt_fmts = rt_fmts, + .raw_fmt_mask = raw_fmt_mask, + .nr_samples = blend_shader_nr_samples, + .is_blend = blend_shader_nr_samples > 0, + .broken_ld_special = broken_ld_special, + }); } diff --git a/src/panfrost/util/pan_lower_framebuffer.h b/src/panfrost/util/pan_lower_framebuffer.h index 5491cd346b1..bf6509175bb 100644 --- a/src/panfrost/util/pan_lower_framebuffer.h +++ b/src/panfrost/util/pan_lower_framebuffer.h @@ -30,18 +30,12 @@ #include "compiler/nir/nir.h" #include "util/format/u_format.h" -/* NATIVE formats can use a typed load/store. PACK formats cannot but can use a - * typed pack/unpack instruction. SOFTWARE formats are lowered */ - -enum pan_format_class { - PAN_FORMAT_NATIVE, - PAN_FORMAT_PACK, - PAN_FORMAT_SOFTWARE -}; - -nir_alu_type pan_unpacked_type_for_format(const struct util_format_description *desc); +nir_alu_type +pan_unpacked_type_for_format(const struct util_format_description *desc); bool pan_lower_framebuffer(nir_shader *shader, const enum pipe_format *rt_fmts, - bool is_blend, unsigned quirks); + uint8_t raw_fmt_mask, + unsigned blend_shader_nr_samples, + bool broken_ld_special); #endif diff --git a/src/panfrost/util/pan_lower_helper_invocation.c b/src/panfrost/util/pan_lower_helper_invocation.c index 23a37a15dd3..d4d1614f0f4 100644 --- a/src/panfrost/util/pan_lower_helper_invocation.c +++ b/src/panfrost/util/pan_lower_helper_invocation.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* Lower gl_HelperInvocation to (gl_SampleMaskIn == 0), this depends on * architectural details but is required for correct operation with @@ -30,29 +30,25 @@ * way to implement load_sample_id_no_per_sample. 
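
The whole lowering reduces to one equivalence, sketched here in plain C terms (sample_mask_in stands for the gl_SampleMaskIn value the pass reads):

/* Helper invocations never generate coverage, so their incoming sample
 * mask is empty; real invocations always cover at least one sample. */
bool helper_invocation = (sample_mask_in == 0);
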
*/ static bool -pan_lower_helper_invocation_instr(nir_builder *b, nir_instr *instr, void *data) +pan_lower_helper_invocation_instr(nir_builder *b, nir_intrinsic_instr *intr, + void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_helper_invocation) - return false; + if (intr->intrinsic != nir_intrinsic_load_helper_invocation) + return false; - b->cursor = nir_before_instr(instr); + b->cursor = nir_before_instr(&intr->instr); - nir_ssa_def *mask = nir_load_sample_mask_in(b); - nir_ssa_def *eq = nir_ieq(b, mask, nir_imm_int(b, 0)); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, eq); + nir_def *mask = nir_load_sample_mask_in(b); + nir_def *eq = nir_ieq_imm(b, mask, 0); + nir_def_rewrite_uses(&intr->def, eq); - return true; + return true; } bool pan_lower_helper_invocation(nir_shader *shader) { - return nir_shader_instructions_pass(shader, - pan_lower_helper_invocation_instr, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_intrinsics_pass( + shader, pan_lower_helper_invocation_instr, + nir_metadata_block_index | nir_metadata_dominance, NULL); } diff --git a/src/panfrost/util/pan_lower_image_index.c b/src/panfrost/util/pan_lower_image_index.c new file mode 100644 index 00000000000..4f3655af6e4 --- /dev/null +++ b/src/panfrost/util/pan_lower_image_index.c @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2024 Collabora, Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +/* Vertex shader gets passed image attribute descriptors through the + * vertex attribute descriptor array. This forces us to apply an offset + * to all image access to get the actual attribute offset. + * + * The gallium driver emits the vertex attributes on each draw, and puts + * image attributes right after the vertex attributes, which implies passing + * vs_img_attrib_offset = util_bitcount64(nir->info.inputs_read). + * + * The Vulkan driver, on the other hand, uses + * VkVertexInputAttributeDescription to build a table of attributes passed + * to the shader. While there's no reason for the app to define more + * attributes than it actually uses in the vertex shader, it doesn't seem + * to be disallowed either. Not to mention that vkCmdSetVertexInputEXT() + * allows one to dynamically change the vertex input configuration, and + * possibly pass more attributes than referenced by the vertex shader bound to + * the command buffer at draw time. Of course, we could carry this information + * at the pipeline level, and re-emit the attribute array, but emitting only + * when the vertex input configuration is flagged dirty is simpler. + * In order for this to work, we use a fixed image attribute offset. 
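
A worked example of the gallium case described above (the attribute names are illustrative only):

/* A vertex shader reading POSITION and TEXCOORD0 has two bits set in
 * nir->info.inputs_read, so: */
unsigned vs_img_attrib_offset = util_bitcount64(nir->info.inputs_read); /* 2 */
/* ...and the pass rewrites image index 0 -> attribute 2, 1 -> 3, etc. */
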
+ */ +static bool +lower_image_intr(struct nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic != nir_intrinsic_image_load && + intr->intrinsic != nir_intrinsic_image_store) + return false; + + unsigned img_attr_offset = *(unsigned *)data; + nir_def *index = intr->src[0].ssa; + + b->cursor = nir_before_instr(&intr->instr); + + index = nir_iadd_imm(b, index, img_attr_offset); + nir_src_rewrite(&intr->src[0], index); + return true; +} + +bool +pan_lower_image_index(nir_shader *shader, unsigned vs_img_attrib_offset) +{ + if (shader->info.stage != MESA_SHADER_VERTEX) + return false; + + return nir_shader_intrinsics_pass( + shader, lower_image_intr, + nir_metadata_block_index | nir_metadata_dominance, &vs_img_attrib_offset); +} diff --git a/src/panfrost/util/pan_lower_image_ms.c b/src/panfrost/util/pan_lower_image_ms.c new file mode 100644 index 00000000000..0296cf839cd --- /dev/null +++ b/src/panfrost/util/pan_lower_image_ms.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2024 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors (Collabora): + * Eric Smith <eric.smith@collabora.com> + */ + +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +static bool +nir_lower_image_ms(nir_builder *b, nir_intrinsic_instr *intr, + UNUSED void *data) +{ + switch (intr->intrinsic) { + case nir_intrinsic_image_load: + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_store: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_texel_address: + break; + default: + return false; + } + + if (nir_intrinsic_image_dim(intr) != GLSL_SAMPLER_DIM_MS) + return false; + + b->cursor = nir_before_instr(&intr->instr); + + nir_def *coord = intr->src[1].ssa; + nir_def *index = intr->src[2].ssa; + + /* image2DMS is treated by panfrost as if it were a 3D image, so + * the sample index is in src[2]. 
We need to put this into the coordinates + * in the Z component + */ + nir_src_rewrite(&intr->src[1], + nir_vector_insert_imm(b, coord, + nir_channel(b, index, 0), + 2) ); + nir_intrinsic_set_image_dim(intr, GLSL_SAMPLER_DIM_3D); + return true; +} + +bool +pan_nir_lower_image_ms(nir_shader *shader) +{ + return nir_shader_intrinsics_pass( + shader, nir_lower_image_ms, + nir_metadata_block_index | nir_metadata_dominance, NULL); +} diff --git a/src/panfrost/util/pan_lower_sample_position.c b/src/panfrost/util/pan_lower_sample_position.c index 12a0c47bbff..1bf4efd3de7 100644 --- a/src/panfrost/util/pan_lower_sample_position.c +++ b/src/panfrost/util/pan_lower_sample_position.c @@ -21,8 +21,8 @@ * SOFTWARE. */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* Sample positions are supplied in a packed 8:8 fixed-point vec2 format in GPU * memory indexed by the sample. We lower in NIR to take advantage of possible @@ -33,43 +33,38 @@ * it's a pretty trivial difference */ static bool -pan_lower_sample_pos_impl(struct nir_builder *b, - nir_instr *instr, UNUSED void *data) +pan_lower_sample_pos_impl(struct nir_builder *b, nir_intrinsic_instr *intr, + UNUSED void *data) { - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_sample_pos) - return false; + if (intr->intrinsic != nir_intrinsic_load_sample_pos) + return false; - b->cursor = nir_before_instr(instr); + b->cursor = nir_before_instr(&intr->instr); - /* Elements are 4 bytes */ - nir_ssa_def *addr = nir_iadd(b, - nir_load_sample_positions_pan(b), - nir_u2u64(b, nir_imul_imm(b, nir_load_sample_id(b), 4))); + /* Elements are 4 bytes */ + nir_def *addr = + nir_iadd(b, nir_load_sample_positions_pan(b), + nir_u2u64(b, nir_imul_imm(b, nir_load_sample_id(b), 4))); - /* Decode 8:8 fixed-point */ - nir_ssa_def *raw = nir_load_global(b, addr, 2, 2, 16); - nir_ssa_def *decoded = nir_fmul_imm(b, nir_i2f16(b, raw), 1.0 / 256.0); + /* Decode 8:8 fixed-point */ + nir_def *raw = nir_load_global(b, addr, 2, 2, 16); + nir_def *decoded = nir_fmul_imm(b, nir_i2f16(b, raw), 1.0 / 256.0); - /* Make NIR validator happy */ - if (decoded->bit_size != nir_dest_bit_size(intr->dest)) - decoded = nir_f2fN(b, decoded, nir_dest_bit_size(intr->dest)); + /* Make NIR validator happy */ + if (decoded->bit_size != intr->def.bit_size) + decoded = nir_f2fN(b, decoded, intr->def.bit_size); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, decoded); - return true; + nir_def_rewrite_uses(&intr->def, decoded); + return true; } bool pan_lower_sample_pos(nir_shader *shader) { - if (shader->info.stage != MESA_SHADER_FRAGMENT) - return false; + if (shader->info.stage != MESA_SHADER_FRAGMENT) + return false; - return nir_shader_instructions_pass(shader, - pan_lower_sample_pos_impl, - nir_metadata_block_index | nir_metadata_dominance, - NULL); + return nir_shader_intrinsics_pass( + shader, pan_lower_sample_pos_impl, + nir_metadata_block_index | nir_metadata_dominance, NULL); } diff --git a/src/panfrost/util/pan_lower_store_component.c b/src/panfrost/util/pan_lower_store_component.c new file mode 100644 index 00000000000..d591aa9234d --- /dev/null +++ b/src/panfrost/util/pan_lower_store_component.c @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2020-2022 Collabora Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors (Collabora): + * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> + */ + +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +/* + * If the shader packs multiple varyings into the same location with different + * location_frac, we'll need to lower to a single varying store that collects + * all of the channels together. This is because the varying instruction on + * Midgard and Bifrost is slot-based, writing out an entire vec4 slot at a time. + */ +static bool +lower_store_component(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + struct hash_table_u64 *slots = data; + unsigned component = nir_intrinsic_component(intr); + nir_src *slot_src = nir_get_io_offset_src(intr); + uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr); + + nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot); + unsigned mask = (prev ? 
nir_intrinsic_write_mask(prev) : 0); + + nir_def *value = intr->src[0].ssa; + b->cursor = nir_before_instr(&intr->instr); + + nir_def *undef = nir_undef(b, 1, value->bit_size); + nir_def *channels[4] = {undef, undef, undef, undef}; + + /* Copy old */ + u_foreach_bit(i, mask) { + assert(prev != NULL); + nir_def *prev_ssa = prev->src[0].ssa; + channels[i] = nir_channel(b, prev_ssa, i); + } + + /* Copy new */ + unsigned new_mask = nir_intrinsic_write_mask(intr); + mask |= (new_mask << component); + + u_foreach_bit(i, new_mask) { + assert(component + i < 4); + channels[component + i] = nir_channel(b, value, i); + } + + intr->num_components = util_last_bit(mask); + nir_src_rewrite(&intr->src[0], nir_vec(b, channels, intr->num_components)); + + nir_intrinsic_set_component(intr, 0); + nir_intrinsic_set_write_mask(intr, mask); + + if (prev) { + _mesa_hash_table_u64_remove(slots, slot); + nir_instr_remove(&prev->instr); + } + + _mesa_hash_table_u64_insert(slots, slot, intr); + return false; +} + +bool +pan_nir_lower_store_component(nir_shader *s) +{ + assert(s->info.stage == MESA_SHADER_VERTEX); + + struct hash_table_u64 *stores = _mesa_hash_table_u64_create(NULL); + bool progress = nir_shader_intrinsics_pass( + s, lower_store_component, + nir_metadata_block_index | nir_metadata_dominance, stores); + _mesa_hash_table_u64_destroy(stores); + return progress; +} diff --git a/src/panfrost/util/pan_lower_writeout.c b/src/panfrost/util/pan_lower_writeout.c index 7c298208515..eb528ce3bf3 100644 --- a/src/panfrost/util/pan_lower_writeout.c +++ b/src/panfrost/util/pan_lower_writeout.c @@ -22,8 +22,8 @@ * SOFTWARE. */ -#include "pan_ir.h" #include "compiler/nir/nir_builder.h" +#include "pan_ir.h" /* Midgard can write all of color, depth and stencil in a single writeout * operation, so we merge depth/stencil stores with color stores. @@ -33,224 +33,158 @@ * +ZS_EMIT with respect to +ATEST and +BLEND, as well as combining * depth/stencil stores into a single +ZS_EMIT op. 
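
Schematically, the rewrite this file performs (intrinsic names as in the patch; operand layout per pan_nir_emit_combined_store() below):

/* before:                                after:
 *   store_output(color, RT0)               store_combined_output_pan(
 *   store_output(z, FRAG_RESULT_DEPTH)        color, color_src1,
 *   store_output(s, FRAG_RESULT_STENCIL)      z, s, dual_src_color)
 *                                          with component =
 *                                            PAN_WRITEOUT_C | _Z | _S */
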
*/ -bool -pan_nir_lower_zs_store(nir_shader *nir) -{ - if (nir->info.stage != MESA_SHADER_FRAGMENT) - return false; - - nir_variable *z_var = NULL, *s_var = NULL; - - nir_foreach_shader_out_variable(var, nir) { - if (var->data.location == FRAG_RESULT_DEPTH) - z_var = var; - else if (var->data.location == FRAG_RESULT_STENCIL) - s_var = var; - } - - if (!z_var && !s_var) - return false; - - bool progress = false; - - nir_foreach_function(function, nir) { - if (!function->impl) continue; - - nir_intrinsic_instr *z_store = NULL, *s_store = NULL; - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - - if (z_var && nir_intrinsic_base(intr) == z_var->data.driver_location) { - assert(!z_store); - z_store = intr; - } - - if (s_var && nir_intrinsic_base(intr) == s_var->data.driver_location) { - assert(!s_store); - s_store = intr; - } - } - } - - if (!z_store && !s_store) continue; - - bool replaced = false; - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; - - const nir_variable *var = nir_find_variable_with_driver_location(nir, nir_var_shader_out, nir_intrinsic_base(intr)); - assert(var); - - if (var->data.location < FRAG_RESULT_DATA0) - continue; - if (var->data.index) - continue; - - assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); - - nir_builder b; - nir_builder_init(&b, function->impl); - - assert(!z_store || z_store->instr.block == instr->block); - assert(!s_store || s_store->instr.block == instr->block); - b.cursor = nir_after_block_before_jump(instr->block); - - nir_intrinsic_instr *combined_store; - combined_store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_combined_output_pan); - - combined_store->num_components = intr->src[0].ssa->num_components; - - nir_intrinsic_set_base(combined_store, nir_intrinsic_base(intr)); - nir_intrinsic_set_src_type(combined_store, nir_intrinsic_src_type(intr)); - - unsigned writeout = PAN_WRITEOUT_C; - if (z_store) - writeout |= PAN_WRITEOUT_Z; - if (s_store) - writeout |= PAN_WRITEOUT_S; - - nir_intrinsic_set_component(combined_store, writeout); +/* + * Get the type to report for a piece of a combined store, given the store it + * is combining from. If there is no store to render target #0, a dummy <0.0, + * 0.0, 0.0, 0.0> write is used, so report a matching float32 type. + */ +static nir_alu_type +pan_nir_rt_store_type(nir_intrinsic_instr *store) +{ + return store ? nir_intrinsic_src_type(store) : nir_type_float32; +} - struct nir_ssa_def *zero = nir_imm_int(&b, 0); +static void +pan_nir_emit_combined_store(nir_builder *b, nir_intrinsic_instr *rt0_store, + unsigned writeout, nir_intrinsic_instr **stores) +{ + nir_intrinsic_instr *intr = nir_intrinsic_instr_create( + b->shader, nir_intrinsic_store_combined_output_pan); - struct nir_ssa_def *src[4] = { - intr->src[0].ssa, - intr->src[1].ssa, - z_store ? z_store->src[0].ssa : zero, - s_store ? s_store->src[0].ssa : zero, - }; + intr->num_components = rt0_store ? 
rt0_store->src[0].ssa->num_components : 4; - for (int i = 0; i < 4; ++i) - combined_store->src[i] = nir_src_for_ssa(src[i]); + if (rt0_store) + nir_intrinsic_set_io_semantics(intr, + nir_intrinsic_io_semantics(rt0_store)); + nir_intrinsic_set_src_type(intr, pan_nir_rt_store_type(rt0_store)); + nir_intrinsic_set_dest_type(intr, pan_nir_rt_store_type(stores[2])); + nir_intrinsic_set_component(intr, writeout); - nir_builder_instr_insert(&b, &combined_store->instr); + nir_def *zero = nir_imm_int(b, 0); + nir_def *zero4 = nir_imm_ivec4(b, 0, 0, 0, 0); - nir_instr_remove(instr); + nir_def *src[] = { + rt0_store ? rt0_store->src[0].ssa : zero4, + rt0_store ? rt0_store->src[1].ssa : zero, + stores[0] ? stores[0]->src[0].ssa : zero, + stores[1] ? stores[1]->src[0].ssa : zero, + stores[2] ? stores[2]->src[0].ssa : zero4, + }; - replaced = true; - } - } + for (int i = 0; i < ARRAY_SIZE(src); ++i) + intr->src[i] = nir_src_for_ssa(src[i]); - /* Insert a store to the depth RT (0xff) if needed */ - if (!replaced) { - nir_builder b; - nir_builder_init(&b, function->impl); + nir_builder_instr_insert(b, &intr->instr); +} +bool +pan_nir_lower_zs_store(nir_shader *nir) +{ + bool progress = false; - nir_block *block = NULL; - if (z_store && s_store) - assert(z_store->instr.block == s_store->instr.block); + if (nir->info.stage != MESA_SHADER_FRAGMENT) + return false; - if (z_store) - block = z_store->instr.block; - else - block = s_store->instr.block; + nir_foreach_function_impl(impl, nir) { + nir_intrinsic_instr *stores[3] = {NULL}; + unsigned writeout = 0; - b.cursor = nir_after_block_before_jump(block); + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - nir_intrinsic_instr *combined_store; - combined_store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_combined_output_pan); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; - combined_store->num_components = 4; + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location == FRAG_RESULT_DEPTH) { + stores[0] = intr; + writeout |= PAN_WRITEOUT_Z; + } else if (sem.location == FRAG_RESULT_STENCIL) { + stores[1] = intr; + writeout |= PAN_WRITEOUT_S; + } else if (sem.dual_source_blend_index) { + assert(!stores[2]); /* there should be only 1 source for dual blending */ + stores[2] = intr; + writeout |= PAN_WRITEOUT_2; + } + } + } - unsigned base; - if (z_store) - base = nir_intrinsic_base(z_store); - else - base = nir_intrinsic_base(s_store); - nir_intrinsic_set_base(combined_store, base); - nir_intrinsic_set_src_type(combined_store, nir_type_float32); + if (!writeout) + continue; - unsigned writeout = 0; - if (z_store) - writeout |= PAN_WRITEOUT_Z; - if (s_store) - writeout |= PAN_WRITEOUT_S; + nir_block *common_block = NULL; - nir_intrinsic_set_component(combined_store, writeout); + /* Ensure all stores are in the same block */ + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (!stores[i]) + continue; - struct nir_ssa_def *zero = nir_imm_int(&b, 0); + nir_block *block = stores[i]->instr.block; - struct nir_ssa_def *src[4] = { - nir_imm_vec4(&b, 0, 0, 0, 0), - zero, - z_store ? z_store->src[0].ssa : zero, - s_store ? 
s_store->src[0].ssa : zero, - }; + if (common_block) + assert(common_block == block); + else + common_block = block; + } - for (int i = 0; i < 4; ++i) - combined_store->src[i] = nir_src_for_ssa(src[i]); + bool replaced = false; - nir_builder_instr_insert(&b, &combined_store->instr); - } + nir_foreach_block(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - if (z_store) - nir_instr_remove(&z_store->instr); + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; - if (s_store) - nir_instr_remove(&s_store->instr); + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); - progress = true; - } + if (sem.location < FRAG_RESULT_DATA0) + continue; - return progress; -} + if (sem.dual_source_blend_index) + continue; -/* Real writeout stores, which break execution, need to be moved to after - * dual-source stores, which are just standard register writes. */ -bool -pan_nir_reorder_writeout(nir_shader *nir) -{ - bool progress = false; + assert(nir_src_is_const(intr->src[1]) && "no indirect outputs"); - nir_foreach_function(function, nir) { - if (!function->impl) continue; + nir_builder b = + nir_builder_at(nir_after_block_before_jump(instr->block)); - nir_foreach_block(block, function->impl) { - nir_instr *last_writeout = NULL; + /* Trying to write depth twice results in the + * wrong blend shader being executed on + * Midgard */ + unsigned this_store = PAN_WRITEOUT_C | (replaced ? 0 : writeout); - nir_foreach_instr_reverse_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; + pan_nir_emit_combined_store(&b, intr, this_store, stores); - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; + nir_instr_remove(instr); - const nir_variable *var = nir_find_variable_with_driver_location(nir, nir_var_shader_out, nir_intrinsic_base(intr)); + replaced = true; + } + } - if (var->data.index) { - if (!last_writeout) - last_writeout = instr; - continue; - } + /* Insert a store to the depth RT (0xff) if needed */ + if (!replaced) { + nir_builder b = + nir_builder_at(nir_after_block_before_jump(common_block)); - if (!last_writeout) - continue; + pan_nir_emit_combined_store(&b, NULL, writeout, stores); + } - /* This is a real store, so move it to after dual-source stores */ - exec_node_remove(&instr->node); - exec_node_insert_after(&last_writeout->node, &instr->node); + for (unsigned i = 0; i < ARRAY_SIZE(stores); ++i) { + if (stores[i]) + nir_instr_remove(&stores[i]->instr); + } - progress = true; - } - } - } + nir_metadata_preserve(impl, + nir_metadata_block_index | nir_metadata_dominance); + progress = true; + } - return progress; + return progress; } diff --git a/src/panfrost/util/pan_lower_xfb.c b/src/panfrost/util/pan_lower_xfb.c new file mode 100644 index 00000000000..3fc96b2e380 --- /dev/null +++ b/src/panfrost/util/pan_lower_xfb.c @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2022 Collabora Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compiler/nir/nir_builder.h" +#include "pan_ir.h" + +static void +lower_xfb_output(nir_builder *b, nir_intrinsic_instr *intr, + unsigned start_component, unsigned num_components, + unsigned buffer, unsigned offset_words) +{ + assert(buffer < MAX_XFB_BUFFERS); + assert(nir_intrinsic_component(intr) == 0); // TODO + + /* Transform feedback info in units of words, convert to bytes. */ + uint16_t stride = b->shader->info.xfb_stride[buffer] * 4; + assert(stride != 0); + + uint16_t offset = offset_words * 4; + + nir_def *index = nir_iadd( + b, nir_imul(b, nir_load_instance_id(b), nir_load_num_vertices(b)), + nir_load_vertex_id_zero_base(b)); + + BITSET_SET(b->shader->info.system_values_read, + SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); + + nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer); + nir_def *addr = nir_iadd( + b, buf, + nir_u2u64(b, nir_iadd_imm(b, nir_imul_imm(b, index, stride), offset))); + + nir_def *src = intr->src[0].ssa; + nir_def *value = + nir_channels(b, src, BITFIELD_MASK(num_components) << start_component); + nir_store_global(b, addr, 4, value, BITFIELD_MASK(num_components)); +} + +static bool +lower_xfb(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) +{ + /* In transform feedback programs, vertex ID becomes zero-based, so apply + * that lowering even on Valhall. + */ + if (intr->intrinsic == nir_intrinsic_load_vertex_id) { + b->cursor = nir_instr_remove(&intr->instr); + + nir_def *repl = + nir_iadd(b, nir_load_vertex_id_zero_base(b), nir_load_first_vertex(b)); + + nir_def_rewrite_uses(&intr->def, repl); + return true; + } + + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + bool progress = false; + + b->cursor = nir_before_instr(&intr->instr); + + for (unsigned i = 0; i < 2; ++i) { + nir_io_xfb xfb = + i ? 
nir_intrinsic_io_xfb2(intr) : nir_intrinsic_io_xfb(intr); + for (unsigned j = 0; j < 2; ++j) { + if (!xfb.out[j].num_components) + continue; + + lower_xfb_output(b, intr, i * 2 + j, xfb.out[j].num_components, + xfb.out[j].buffer, xfb.out[j].offset); + progress = true; + } + } + + nir_instr_remove(&intr->instr); + return progress; +} + +bool +pan_lower_xfb(nir_shader *nir) +{ + return nir_shader_intrinsics_pass( + nir, lower_xfb, nir_metadata_block_index | nir_metadata_dominance, NULL); +} diff --git a/src/panfrost/util/pan_sysval.c b/src/panfrost/util/pan_sysval.c deleted file mode 100644 index 80a509f5b8b..00000000000 --- a/src/panfrost/util/pan_sysval.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (C) 2020 Collabora Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors (Collabora): - * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> - */ - -#include "pan_ir.h" -#include "compiler/nir/nir_builder.h" - -/* TODO: ssbo_size */ -static int -panfrost_sysval_for_ssbo(nir_intrinsic_instr *instr) -{ - nir_src index = instr->src[0]; - assert(nir_src_is_const(index)); - uint32_t uindex = nir_src_as_uint(index); - - return PAN_SYSVAL(SSBO, uindex); -} - -static int -panfrost_sysval_for_sampler(nir_intrinsic_instr *instr) -{ - /* TODO: indirect samplers !!! 
*/
-        nir_src index = instr->src[0];
-        assert(nir_src_is_const(index));
-        uint32_t uindex = nir_src_as_uint(index);
-
-        return PAN_SYSVAL(SAMPLER, uindex);
-}
-
-static int
-panfrost_sysval_for_image_size(nir_intrinsic_instr *instr)
-{
-        nir_src index = instr->src[0];
-        assert(nir_src_is_const(index));
-
-        bool is_array = nir_intrinsic_image_array(instr);
-        uint32_t uindex = nir_src_as_uint(index);
-        unsigned dim = nir_intrinsic_dest_components(instr) - is_array;
-
-        return PAN_SYSVAL(IMAGE_SIZE, PAN_TXS_SYSVAL_ID(uindex, dim, is_array));
-}
-
-static unsigned
-panfrost_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr)
-{
-        switch (instr->intrinsic) {
-        case nir_intrinsic_load_viewport_scale:
-                return PAN_SYSVAL_VIEWPORT_SCALE;
-        case nir_intrinsic_load_viewport_offset:
-                return PAN_SYSVAL_VIEWPORT_OFFSET;
-        case nir_intrinsic_load_num_workgroups:
-                return PAN_SYSVAL_NUM_WORK_GROUPS;
-        case nir_intrinsic_load_workgroup_size:
-                return PAN_SYSVAL_LOCAL_GROUP_SIZE;
-        case nir_intrinsic_load_work_dim:
-                return PAN_SYSVAL_WORK_DIM;
-        case nir_intrinsic_load_sample_positions_pan:
-                return PAN_SYSVAL_SAMPLE_POSITIONS;
-        case nir_intrinsic_load_first_vertex:
-        case nir_intrinsic_load_base_vertex:
-        case nir_intrinsic_load_base_instance:
-                return PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS;
-        case nir_intrinsic_load_draw_id:
-                return PAN_SYSVAL_DRAWID;
-        case nir_intrinsic_load_ssbo_address:
-        case nir_intrinsic_get_ssbo_size:
-                return panfrost_sysval_for_ssbo(instr);
-        case nir_intrinsic_load_sampler_lod_parameters_pan:
-                return panfrost_sysval_for_sampler(instr);
-        case nir_intrinsic_image_size:
-                return panfrost_sysval_for_image_size(instr);
-        default:
-                return ~0;
-        }
-}
-
-int
-panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest)
-{
-        nir_intrinsic_instr *intr;
-        nir_dest *dst = NULL;
-        nir_tex_instr *tex;
-        unsigned sysval = ~0;
-
-        switch (instr->type) {
-        case nir_instr_type_intrinsic:
-                intr = nir_instr_as_intrinsic(instr);
-                sysval = panfrost_nir_sysval_for_intrinsic(intr);
-                dst = &intr->dest;
-                break;
-        case nir_instr_type_tex:
-                tex = nir_instr_as_tex(instr);
-                if (tex->op != nir_texop_txs)
-                        break;
-
-                sysval = PAN_SYSVAL(TEXTURE_SIZE,
-                                    PAN_TXS_SYSVAL_ID(tex->texture_index,
-                                                      nir_tex_instr_dest_size(tex) -
-                                                      (tex->is_array ? 1 : 0),
-                                                      tex->is_array));
-                dst = &tex->dest;
-                break;
-        default:
-                break;
-        }
-
-        if (dest && dst)
-                *dest = *dst;
-
-        return sysval;
-}
-
-unsigned
-pan_lookup_sysval(struct hash_table_u64 *sysval_to_id,
-                  struct panfrost_sysvals *sysvals,
-                  int sysval)
-{
-        /* Try to lookup */
-
-        void *cached = _mesa_hash_table_u64_search(sysval_to_id, sysval);
-
-        if (cached)
-                return ((uintptr_t) cached) - 1;
-
-        /* Else assign */
-
-        unsigned id = sysvals->sysval_count++;
-        assert(id < MAX_SYSVAL_COUNT);
-        _mesa_hash_table_u64_insert(sysval_to_id, sysval, (void *) ((uintptr_t) id + 1));
-        sysvals->sysvals[id] = sysval;
-
-        return id;
-}
-
-struct hash_table_u64 *
-panfrost_init_sysvals(struct panfrost_sysvals *sysvals, void *memctx)
-{
-        sysvals->sysval_count = 0;
-        return _mesa_hash_table_u64_create(memctx);
-}
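
For reference, the store address built by lower_xfb_output() in the new pan_lower_xfb.c is buffer_base + (instance_id * num_vertices + vertex_id) * stride + offset, computed in 32 bits and only then widened for the 64-bit global store. Below is a minimal host-side sketch of the same arithmetic; xfb_store_address and its parameter names are hypothetical, not Mesa symbols (the comments note which NIR intrinsic each value stands in for).

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring lower_xfb_output(): the linear
 * (vertex, instance) index is scaled by the buffer stride and biased by
 * the output's byte offset in 32 bits, then widened and added to the
 * 64-bit buffer base, matching the nir_u2u64/nir_iadd order above. */
static uint64_t
xfb_store_address(uint64_t buffer_base,  /* nir_load_xfb_address(.base = buffer) */
                  uint32_t vertex_id,    /* nir_load_vertex_id_zero_base */
                  uint32_t instance_id,  /* nir_load_instance_id */
                  uint32_t num_vertices, /* nir_load_num_vertices */
                  uint32_t stride,       /* info->xfb_stride[buffer] * 4, in bytes */
                  uint32_t offset)       /* offset_words * 4, in bytes */
{
   uint32_t index = instance_id * num_vertices + vertex_id;
   return buffer_base + (uint64_t)(index * stride + offset);
}

int
main(void)
{
   /* Vertex 2 of instance 1 in a 3-vertex draw, 16-byte stride, capture
    * at word 1: (1 * 3 + 2) * 16 + 4 = 84 bytes past the buffer base. */
   printf("0x%llx\n",
          (unsigned long long)xfb_store_address(0x1000, 2, 1, 3, 16, 4));
   return 0;
}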
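
The removed pan_lookup_sysval() also shows a pattern worth noting: a pointer-valued hash table returns NULL on a miss, so slot IDs are stored biased by one to keep ID 0 distinguishable from "not found". A self-contained sketch of that bias trick follows, using a toy array-backed map rather than Mesa's hash_table_u64 API; all names below are illustrative.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_SLOTS 16

/* Toy stand-in for a pointer-valued map; not Mesa code. */
struct toy_map {
   uint64_t keys[MAX_SLOTS];
   void *values[MAX_SLOTS]; /* NULL means "no entry" */
   unsigned count;
};

static void *
toy_search(const struct toy_map *m, uint64_t key)
{
   for (unsigned i = 0; i < m->count; ++i)
      if (m->keys[i] == key)
         return m->values[i];
   return NULL;
}

static unsigned
toy_lookup_or_assign(struct toy_map *m, uint64_t key)
{
   void *cached = toy_search(m, key);
   if (cached)
      return (unsigned)((uintptr_t)cached - 1); /* undo the +1 bias */

   assert(m->count < MAX_SLOTS);
   unsigned id = m->count;
   m->keys[m->count] = key;
   m->values[m->count++] = (void *)(uintptr_t)(id + 1); /* bias: id 0 != NULL */
   return id;
}

int
main(void)
{
   struct toy_map m = {0};
   unsigned a = toy_lookup_or_assign(&m, 0xABCD); /* fresh entry: 0 */
   unsigned b = toy_lookup_or_assign(&m, 0x1234); /* fresh entry: 1 */
   unsigned c = toy_lookup_or_assign(&m, 0xABCD); /* cache hit:   0 */
   printf("%u %u %u\n", a, b, c);                 /* prints "0 1 0" */
   return 0;
}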