Diffstat (limited to 'src/broadcom/compiler')
-rw-r--r--  src/broadcom/compiler/meson.build                            |   13
-rw-r--r--  src/broadcom/compiler/nir_to_vir.c                           | 1994
-rw-r--r--  src/broadcom/compiler/qpu_schedule.c                         | 1158
-rw-r--r--  src/broadcom/compiler/qpu_validate.c                         |  102
-rw-r--r--  src/broadcom/compiler/v3d33_tex.c                            |  195
-rw-r--r--  src/broadcom/compiler/v3d33_vpm_setup.c                      |   75
-rw-r--r--  src/broadcom/compiler/v3d_compiler.h                         |  309
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_image_load_store.c      |  352
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_io.c                     |  221
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_line_smooth.c            |   84
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c    |  260
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_logic_ops.c              |  153
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c  |  167
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_scratch.c                |   83
-rw-r--r--  src/broadcom/compiler/v3d_nir_lower_txf_ms.c                 |   33
-rw-r--r--  src/broadcom/compiler/v3d_packing.c                          |   50
-rw-r--r--  src/broadcom/compiler/v3d_tex.c (renamed from src/broadcom/compiler/v3d40_tex.c) | 202
-rw-r--r--  src/broadcom/compiler/vir.c                                  |  745
-rw-r--r--  src/broadcom/compiler/vir_dump.c                             |   16
-rw-r--r--  src/broadcom/compiler/vir_live_variables.c                   |   23
-rw-r--r--  src/broadcom/compiler/vir_opt_constant_alu.c                 |    3
-rw-r--r--  src/broadcom/compiler/vir_opt_copy_propagate.c               |   97
-rw-r--r--  src/broadcom/compiler/vir_opt_dead_code.c                    |   25
-rw-r--r--  src/broadcom/compiler/vir_opt_redundant_flags.c              |    9
-rw-r--r--  src/broadcom/compiler/vir_opt_small_immediates.c             |   26
-rw-r--r--  src/broadcom/compiler/vir_register_allocate.c                | 1349
-rw-r--r--  src/broadcom/compiler/vir_to_qpu.c                           |  202
27 files changed, 5299 insertions, 2647 deletions
diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build
index 95156140ad9..d5aafb3879e 100644
--- a/src/broadcom/compiler/meson.build
+++ b/src/broadcom/compiler/meson.build
@@ -32,23 +32,22 @@ libbroadcom_compiler_files = files(
'vir_to_qpu.c',
'qpu_schedule.c',
'qpu_validate.c',
- 'v3d33_tex.c',
- 'v3d40_tex.c',
- 'v3d33_vpm_setup.c',
+ 'v3d_tex.c',
'v3d_compiler.h',
'v3d_nir_lower_io.c',
'v3d_nir_lower_image_load_store.c',
'v3d_nir_lower_line_smooth.c',
+ 'v3d_nir_lower_load_store_bitsize.c',
'v3d_nir_lower_logic_ops.c',
- 'v3d_nir_lower_robust_buffer_access.c',
'v3d_nir_lower_scratch.c',
'v3d_nir_lower_txf_ms.c',
+ 'v3d_packing.c',
)
libbroadcom_compiler = static_library(
- ['broadcom_compiler', v3d_xml_pack],
- libbroadcom_compiler_files,
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom],
+ 'broadcom_compiler',
+ [libbroadcom_compiler_files, v3d_xml_pack],
+ include_directories : [inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_broadcom],
c_args : [no_override_init_args],
gnu_symbol_visibility : 'hidden',
dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index d0a89f1a7d4..acc62a092f2 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -38,7 +38,7 @@
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
-#include "cle/v3d_packet_v41_pack.h"
+#include "cle/v3d_packet_v42_pack.h"
#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
@@ -164,7 +164,7 @@ vir_emit_thrsw(struct v3d_compile *c)
c->last_thrsw->qpu.sig.thrsw = true;
c->last_thrsw_at_top_level = !c->in_control_flow;
- /* We need to lock the scoreboard before any tlb acess happens. If this
+ /* We need to lock the scoreboard before any tlb access happens. If this
* thread switch comes after we have emitted a tlb load, then it means
* that we can't lock on the last thread switch any more.
*/
@@ -187,6 +187,28 @@ v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
}
static uint32_t
+v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr)
+{
+ nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
+ switch (atomic_op) {
+ case nir_atomic_op_iadd:
+ return instr->intrinsic == nir_intrinsic_ssbo_atomic ?
+ v3d_get_op_for_atomic_add(instr, 2) :
+ v3d_get_op_for_atomic_add(instr, 1);
+ case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN;
+ case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
+ case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX;
+ case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX;
+ case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC;
+ case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC;
+ case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT;
+ case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
+ case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+ default: unreachable("unknown atomic op");
+ }
+}
+
+static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
{
switch (instr->intrinsic) {
@@ -195,41 +217,21 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
case nir_intrinsic_load_uniform:
case nir_intrinsic_load_shared:
case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_global_2x32:
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_shared:
case nir_intrinsic_store_scratch:
+ case nir_intrinsic_store_global_2x32:
return V3D_TMU_OP_REGULAR;
- case nir_intrinsic_ssbo_atomic_add:
- return v3d_get_op_for_atomic_add(instr, 2);
- case nir_intrinsic_shared_atomic_add:
- return v3d_get_op_for_atomic_add(instr, 1);
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_shared_atomic_imin:
- return V3D_TMU_OP_WRITE_SMIN;
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_shared_atomic_umin:
- return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_shared_atomic_imax:
- return V3D_TMU_OP_WRITE_SMAX;
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_shared_atomic_umax:
- return V3D_TMU_OP_WRITE_UMAX;
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_shared_atomic_and:
- return V3D_TMU_OP_WRITE_AND_READ_INC;
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_shared_atomic_or:
- return V3D_TMU_OP_WRITE_OR_READ_DEC;
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_shared_atomic_xor:
- return V3D_TMU_OP_WRITE_XOR_READ_NOT;
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_shared_atomic_exchange:
- return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
- case nir_intrinsic_ssbo_atomic_comp_swap:
- case nir_intrinsic_shared_atomic_comp_swap:
- return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+
+ case nir_intrinsic_ssbo_atomic:
+ case nir_intrinsic_ssbo_atomic_swap:
+ case nir_intrinsic_shared_atomic:
+ case nir_intrinsic_shared_atomic_swap:
+ case nir_intrinsic_global_atomic_2x32:
+ case nir_intrinsic_global_atomic_swap_2x32:
+ return v3d_general_tmu_op_for_atomic(instr);
+
default:
unreachable("unknown intrinsic op");
}
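
(Side note on the unified atomic intrinsics handled above: the operation now lives in the nir_atomic_op index rather than in the intrinsic name, and only the SSBO variants keep a leading block-index source, which is why v3d_get_op_for_atomic_add() is called with source 2 for SSBO atomics and source 1 otherwise. A minimal sketch of that source layout, assuming the standard NIR source ordering; this helper is illustrative and not part of the patch.)

```c
/* Hypothetical helper, for illustration only: index of the data operand
 * for the unified NIR atomic intrinsics dispatched above.
 */
static unsigned
atomic_data_src_index(const nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_ssbo_atomic:
        case nir_intrinsic_ssbo_atomic_swap:
                return 2;   /* src[0] = block index, src[1] = offset */
        case nir_intrinsic_shared_atomic:
        case nir_intrinsic_shared_atomic_swap:
        case nir_intrinsic_global_atomic_2x32:
        case nir_intrinsic_global_atomic_swap_2x32:
                return 1;   /* src[0] = offset / 2x32 address */
        default:
                unreachable("not an atomic intrinsic");
        }
}
```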
@@ -270,13 +272,13 @@ ntq_flush_tmu(struct v3d_compile *c)
bool emitted_tmuwt = false;
for (int i = 0; i < c->tmu.flush_count; i++) {
if (c->tmu.flush[i].component_mask > 0) {
- nir_dest *dest = c->tmu.flush[i].dest;
- assert(dest);
+ nir_def *def = c->tmu.flush[i].def;
+ assert(def);
for (int j = 0; j < 4; j++) {
if (c->tmu.flush[i].component_mask & (1 << j)) {
- ntq_store_dest(c, dest, j,
- vir_MOV(c, vir_LDTMU(c)));
+ ntq_store_def(c, def, j,
+ vir_MOV(c, vir_LDTMU(c)));
}
}
} else if (!emitted_tmuwt) {
@@ -292,12 +294,12 @@ ntq_flush_tmu(struct v3d_compile *c)
/**
* Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
- * is reponsible for ensuring that doing this doesn't overflow the TMU fifos,
+ * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
* and more specifically, the output fifo, since that can't stall.
*/
void
ntq_add_pending_tmu_flush(struct v3d_compile *c,
- nir_dest *dest,
+ nir_def *def,
uint32_t component_mask)
{
const uint32_t num_components = util_bitcount(component_mask);
@@ -305,13 +307,18 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c,
if (num_components > 0) {
c->tmu.output_fifo_size += num_components;
- if (!dest->is_ssa)
- _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
+
+ nir_intrinsic_instr *store = nir_store_reg_for_def(def);
+ if (store != NULL) {
+ nir_def *reg = store->src[1].ssa;
+ _mesa_set_add(c->tmu.outstanding_regs, reg);
+ }
}
- c->tmu.flush[c->tmu.flush_count].dest = dest;
+ c->tmu.flush[c->tmu.flush_count].def = def;
c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
c->tmu.flush_count++;
+ c->tmu.total_count++;
if (c->disable_tmu_pipelining)
ntq_flush_tmu(c);
@@ -342,6 +349,7 @@ emit_tmu_general_store_writes(struct v3d_compile *c,
uint32_t base_const_offset,
uint32_t *writemask,
uint32_t *const_offset,
+ uint32_t *type_size,
uint32_t *tmu_writes)
{
struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
@@ -371,7 +379,9 @@ emit_tmu_general_store_writes(struct v3d_compile *c,
/* Update the offset for the TMU write based on the
* the first component we are writing.
*/
- *const_offset = base_const_offset + first_component * 4;
+ *type_size = nir_src_bit_size(instr->src[0]) / 8;
+ *const_offset =
+ base_const_offset + first_component * (*type_size);
/* Clear these components from the writemask */
uint32_t written_mask =
@@ -433,6 +443,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
int offset_src,
struct qreg base_offset,
uint32_t const_offset,
+ uint32_t dest_components,
uint32_t *tmu_writes)
{
if (mode == MODE_COUNT) {
@@ -478,6 +489,8 @@ emit_tmu_general_address_write(struct v3d_compile *c,
if (vir_in_nonuniform_control_flow(c))
vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
+ tmu->ldtmu_count = dest_components;
}
/**
@@ -486,7 +499,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
*/
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
- bool is_shared_or_scratch)
+ bool is_shared_or_scratch, bool is_global)
{
uint32_t tmu_op = v3d_general_tmu_op(instr);
@@ -495,25 +508,32 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
* amount to add/sub, as that is implicit.
*/
bool atomic_add_replaced =
- ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
- instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+ (instr->intrinsic == nir_intrinsic_ssbo_atomic ||
+ instr->intrinsic == nir_intrinsic_shared_atomic ||
+ instr->intrinsic == nir_intrinsic_global_atomic_2x32) &&
+ nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
(tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
- tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
+ tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC);
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
instr->intrinsic == nir_intrinsic_store_scratch ||
- instr->intrinsic == nir_intrinsic_store_shared);
+ instr->intrinsic == nir_intrinsic_store_shared ||
+ instr->intrinsic == nir_intrinsic_store_global_2x32);
bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
instr->intrinsic == nir_intrinsic_load_ubo ||
instr->intrinsic == nir_intrinsic_load_ssbo ||
instr->intrinsic == nir_intrinsic_load_scratch ||
- instr->intrinsic == nir_intrinsic_load_shared);
+ instr->intrinsic == nir_intrinsic_load_shared ||
+ instr->intrinsic == nir_intrinsic_load_global_2x32);
if (!is_load)
c->tmu_dirty_rcl = true;
- bool has_index = !is_shared_or_scratch;
+ if (is_global)
+ c->has_global_address = true;
+
+ bool has_index = !is_shared_or_scratch && !is_global;
int offset_src;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
@@ -522,6 +542,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
instr->intrinsic == nir_intrinsic_load_ubo ||
instr->intrinsic == nir_intrinsic_load_scratch ||
instr->intrinsic == nir_intrinsic_load_shared ||
+ instr->intrinsic == nir_intrinsic_load_global_2x32 ||
atomic_add_replaced) {
offset_src = 0 + has_index;
} else if (is_store) {
@@ -542,13 +563,11 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
v3d_unit_data_create(0, const_offset));
const_offset = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
- uint32_t index = nir_src_as_uint(instr->src[0]);
- /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
- * shifted up by 1 (0 is gallium's constant buffer 0).
+ /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0
+ * is gallium's constant buffer 0 in GL and push constants
+ * in Vulkan)).
*/
- if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
- index++;
-
+ uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
base_offset =
vir_uniform(c, QUNIFORM_UBO_ADDR,
v3d_unit_data_create(index, const_offset));
@@ -565,10 +584,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
base_offset = c->cs_shared_offset;
const_offset += nir_intrinsic_base(instr);
}
+ } else if (is_global) {
+ /* Global load/store intrinsics use global addresses, so the
+ * offset is the target address and we don't need to add it
+ * to a base offset.
+ */
+ base_offset = vir_uniform_ui(c, 0);
} else {
+ uint32_t idx = is_store ? 1 : 0;
base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
- nir_src_as_uint(instr->src[is_store ?
- 1 : 0]));
+ nir_src_comp_as_uint(instr->src[idx], 0));
}
/* We are ready to emit TMU register writes now, but before we actually
@@ -588,16 +613,21 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
assert(mode == MODE_COUNT || tmu_writes > 0);
+ uint32_t type_size = 4;
+
if (is_store) {
emit_tmu_general_store_writes(c, mode, instr,
base_const_offset,
&writemask,
&const_offset,
+ &type_size,
&tmu_writes);
} else if (!is_load && !atomic_add_replaced) {
- emit_tmu_general_atomic_writes(c, mode, instr,
- tmu_op, has_index,
- &tmu_writes);
+ emit_tmu_general_atomic_writes(c, mode, instr,
+ tmu_op, has_index,
+ &tmu_writes);
+ } else if (is_load) {
+ type_size = instr->def.bit_size / 8;
}
/* For atomics we use 32bit except for CMPXCHG, that we need
@@ -618,17 +648,40 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
V3D_TMU_OP_TYPE_ATOMIC;
+ /* Only load per-quad if we can be certain that all
+ * lines in the quad are active. Notice that demoted
+ * invocations, unlike terminated ones, are still
+ * active: we want to skip memory writes for them but
+ * loads should still work.
+ */
uint32_t perquad =
- is_load && !vir_in_nonuniform_control_flow(c)
- ? GENERAL_TMU_LOOKUP_PER_QUAD
- : GENERAL_TMU_LOOKUP_PER_PIXEL;
+ is_load && !vir_in_nonuniform_control_flow(c) &&
+ ((c->s->info.stage == MESA_SHADER_FRAGMENT &&
+ c->s->info.fs.needs_quad_helper_invocations &&
+ !c->emitted_discard) ||
+ c->s->info.uses_wide_subgroup_intrinsics) ?
+ GENERAL_TMU_LOOKUP_PER_QUAD :
+ GENERAL_TMU_LOOKUP_PER_PIXEL;
config = 0xffffff00 | tmu_op << 3 | perquad;
if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
} else if (is_atomic || num_components == 1) {
- config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+ switch (type_size) {
+ case 4:
+ config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+ break;
+ case 2:
+ config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI;
+ break;
+ case 1:
+ config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI;
+ break;
+ default:
+ unreachable("Unsupported bitsize");
+ }
} else {
+ assert(type_size == 4);
config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
num_components - 2;
}
@@ -637,7 +690,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
emit_tmu_general_address_write(c, mode, instr, config,
dynamic_src, offset_src,
base_offset, const_offset,
- &tmu_writes);
+ dest_components, &tmu_writes);
assert(tmu_writes > 0);
if (mode == MODE_COUNT) {
@@ -660,7 +713,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
*/
const uint32_t component_mask =
(1 << dest_components) - 1;
- ntq_add_pending_tmu_flush(c, &instr->dest,
+ ntq_add_pending_tmu_flush(c, &instr->def,
component_mask);
}
}
@@ -673,7 +726,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
}
static struct qreg *
-ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
+ntq_init_ssa_def(struct v3d_compile *c, nir_def *def)
{
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
def->num_components);
@@ -717,8 +770,8 @@ is_ldunif_signal(const struct v3d_qpu_sig *sig)
* its destination to be the NIR reg's destination
*/
void
-ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
- struct qreg result)
+ntq_store_def(struct v3d_compile *c, nir_def *def, int chan,
+ struct qreg result)
{
struct qinst *last_inst = NULL;
if (!list_is_empty(&c->cur_block->instructions))
@@ -731,23 +784,25 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
assert(result.file == QFILE_TEMP && last_inst &&
(last_inst == c->defs[result.index] || is_reused_uniform));
- if (dest->is_ssa) {
- assert(chan < dest->ssa.num_components);
+ nir_intrinsic_instr *store = nir_store_reg_for_def(def);
+ if (store == NULL) {
+ assert(chan < def->num_components);
struct qreg *qregs;
struct hash_entry *entry =
- _mesa_hash_table_search(c->def_ht, &dest->ssa);
+ _mesa_hash_table_search(c->def_ht, def);
if (entry)
qregs = entry->data;
else
- qregs = ntq_init_ssa_def(c, &dest->ssa);
+ qregs = ntq_init_ssa_def(c, def);
qregs[chan] = result;
} else {
- nir_register *reg = dest->reg.reg;
- assert(dest->reg.base_offset == 0);
- assert(reg->num_array_elems == 0);
+ nir_def *reg = store->src[1].ssa;
+ ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
+ assert(nir_intrinsic_base(store) == 0);
+ assert(nir_intrinsic_num_array_elems(decl) == 0);
struct hash_entry *entry =
_mesa_hash_table_search(c->def_ht, reg);
struct qreg *qregs = entry->data;
@@ -802,7 +857,9 @@ struct qreg
ntq_get_src(struct v3d_compile *c, nir_src src, int i)
{
struct hash_entry *entry;
- if (src.is_ssa) {
+
+ nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
+ if (load == NULL) {
assert(i < src.ssa->num_components);
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
@@ -811,10 +868,11 @@ ntq_get_src(struct v3d_compile *c, nir_src src, int i)
entry = _mesa_hash_table_search(c->def_ht, src.ssa);
}
} else {
- nir_register *reg = src.reg.reg;
- assert(reg->num_array_elems == 0);
- assert(src.reg.base_offset == 0);
- assert(i < reg->num_components);
+ nir_def *reg = load->src[0].ssa;
+ ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
+ assert(nir_intrinsic_base(load) == 0);
+ assert(nir_intrinsic_num_array_elems(decl) == 0);
+ assert(i < nir_intrinsic_num_components(decl));
if (_mesa_set_search(c->tmu.outstanding_regs, reg))
ntq_flush_tmu(c);
@@ -830,13 +888,8 @@ static struct qreg
ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
unsigned src)
{
- assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
- unsigned chan = ffs(instr->dest.write_mask) - 1;
struct qreg r = ntq_get_src(c, instr->src[src].src,
- instr->src[src].swizzle[chan]);
-
- assert(!instr->src[src].abs);
- assert(!instr->src[src].negate);
+ instr->src[src].swizzle[0]);
return r;
};
@@ -876,6 +929,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
case GLSL_SAMPLER_DIM_3D:
case GLSL_SAMPLER_DIM_CUBE:
case GLSL_SAMPLER_DIM_BUF:
+ case GLSL_SAMPLER_DIM_EXTERNAL:
/* Don't minify the array size. */
if (!(instr->is_array && i == dest_size - 1)) {
size = ntq_minify(c, size, lod);
@@ -890,7 +944,7 @@ ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
unreachable("Bad sampler type");
}
- ntq_store_dest(c, &instr->dest, i, size);
+ ntq_store_def(c, &instr->def, i, size);
}
}
@@ -905,12 +959,12 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
*/
switch (instr->op) {
case nir_texop_query_levels:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
return;
case nir_texop_texture_samples:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
return;
case nir_texop_txs:
ntq_emit_txs(c, instr);
@@ -919,10 +973,7 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
break;
}
- if (c->devinfo->ver >= 40)
- v3d40_vir_emit_tex(c, instr);
- else
- v3d33_vir_emit_tex(c, instr);
+ v3d_vir_emit_tex(c, instr);
}
static struct qreg
@@ -963,44 +1014,43 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
static struct qreg
emit_smooth_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg w, struct qreg r5)
+ struct qreg vary, struct qreg w, struct qreg c_reg)
{
- return vir_FADD(c, vir_FMUL(c, vary, w), r5);
+ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
}
static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
- return vir_FADD(c, vir_MOV(c, vary), r5);
+ return vir_FADD(c, vir_MOV(c, vary), c_reg);
}
static struct qreg
emit_flat_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
vir_MOV_dest(c, c->undef, vary);
- return vir_MOV(c, r5);
+ return vir_MOV(c, c_reg);
}
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
int8_t input_idx, uint8_t swizzle, int array_index)
{
- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ struct qreg c_reg; /* C coefficient */
+
+ if (c->devinfo->has_accumulators)
+ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ else
+ c_reg = vir_reg(QFILE_REG, 0);
struct qinst *ldvary = NULL;
struct qreg vary;
- if (c->devinfo->ver >= 41) {
- ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
- c->undef, c->undef);
- ldvary->qpu.sig.ldvary = true;
- vary = vir_emit_def(c, ldvary);
- } else {
- vir_NOP(c)->qpu.sig.ldvary = true;
- vary = r3;
- }
+ ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
+ c->undef, c->undef);
+ ldvary->qpu.sig.ldvary = true;
+ vary = vir_emit_def(c, ldvary);
/* Store the input value before interpolation so we can implement
* GLSL's interpolateAt functions if the shader uses them.
@@ -1008,7 +1058,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (input_idx >= 0) {
assert(var);
c->interp[input_idx].vp = vary;
- c->interp[input_idx].C = vir_MOV(c, r5);
+ c->interp[input_idx].C = vir_MOV(c, c_reg);
c->interp[input_idx].mode = var->data.interpolation;
}
@@ -1018,7 +1068,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
*/
if (!var) {
assert(input_idx < 0);
- return emit_smooth_varying(c, vary, c->payload_w, r5);
+ return emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
int i = c->num_inputs++;
@@ -1033,20 +1083,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (var->data.centroid) {
BITSET_SET(c->centroid_flags, i);
result = emit_smooth_varying(c, vary,
- c->payload_w_centroid, r5);
+ c->payload_w_centroid, c_reg);
} else {
- result = emit_smooth_varying(c, vary, c->payload_w, r5);
+ result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
break;
case INTERP_MODE_NOPERSPECTIVE:
BITSET_SET(c->noperspective_flags, i);
- result = emit_noperspective_varying(c, vary, r5);
+ result = emit_noperspective_varying(c, vary, c_reg);
break;
case INTERP_MODE_FLAT:
BITSET_SET(c->flat_shade_flags, i);
- result = emit_flat_varying(c, vary, r5);
+ result = emit_flat_varying(c, vary, c_reg);
break;
default:
@@ -1163,16 +1213,6 @@ ntq_emit_comparison(struct v3d_compile *c,
vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
break;
- case nir_op_i2b32:
- vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
- cond_invert = true;
- break;
-
- case nir_op_f2b32:
- vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
- cond_invert = true;
- break;
-
default:
return false;
}
@@ -1188,7 +1228,7 @@ ntq_emit_comparison(struct v3d_compile *c,
static struct nir_alu_instr *
ntq_get_alu_parent(nir_src src)
{
- if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
+ if (src.ssa->parent_instr->type != nir_instr_type_alu)
return NULL;
nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
if (!instr)
@@ -1199,7 +1239,7 @@ ntq_get_alu_parent(nir_src src)
* src.
*/
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
- if (!instr->src[i].src.is_ssa)
+ if (nir_load_reg_for_def(instr->src[i].src.ssa))
return NULL;
}
@@ -1242,12 +1282,78 @@ ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
return result;
}
+static struct qreg
+ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond)
+{
+ struct qreg result =
+ vir_MOV(c, vir_SEL(c, cond,
+ vir_uniform_ui(c, 1),
+ vir_uniform_ui(c, 0)));
+ c->flags_temp = result.index;
+ c->flags_cond = cond;
+ return result;
+}
+
+static struct qreg
+f2f16_rtz(struct v3d_compile *c, struct qreg f32)
+{
+ /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding
+ * method and seems to be using RTE by default, so we need to implement
+ * RTZ rounding in software.
+ */
+ struct qreg rf16 = vir_FMOV(c, f32);
+ vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L);
+
+ struct qreg rf32 = vir_FMOV(c, rf16);
+ vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L);
+
+ struct qreg f32_abs = vir_FMOV(c, f32);
+ vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS);
+
+ struct qreg rf32_abs = vir_FMOV(c, rf32);
+ vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS);
+
+ vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs),
+ V3D_QPU_PF_PUSHN);
+ return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
+ vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16));
+}
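
(For readers unfamiliar with the trick in f2f16_rtz() above, here is a scalar model of the same RTZ-from-RTE fixup. It assumes hypothetical software helpers f32_to_f16_rte()/f16_to_f32() standing in for the hardware pack/unpack paths, and glosses over NaN/Inf handling; it is a sketch of the technique, not driver code.)

```c
#include <math.h>    /* fabsf */
#include <stdint.h>

static uint16_t
f32_to_f16_rtz_model(float f)
{
        uint16_t h = f32_to_f16_rte(f); /* hardware default rounding (RTE) */
        float r = f16_to_f32(h);        /* round-trip back to f32 */

        /* If the round-trip magnitude grew, RTE rounded away from zero;
         * stepping the f16 bit pattern down by one moves it back toward
         * zero, which is what RTZ would have produced.
         */
        if (fabsf(r) > fabsf(f))
                h -= 1;
        return h;
}
```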
+
+/**
+ * Takes the result value of a signed integer width conversion from a smaller
+ * type to a larger type and if needed, it applies sign extension to it.
+ */
+static struct qreg
+sign_extend(struct v3d_compile *c,
+ struct qreg value,
+ uint32_t src_bit_size,
+ uint32_t dst_bit_size)
+{
+ assert(src_bit_size < dst_bit_size);
+
+ struct qreg tmp = vir_MOV(c, value);
+
+ /* Do we need to sign-extend? */
+ uint32_t sign_mask = 1 << (src_bit_size - 1);
+ struct qinst *sign_check =
+ vir_AND_dest(c, vir_nop_reg(),
+ tmp, vir_uniform_ui(c, sign_mask));
+ vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ);
+
+ /* If so, fill in leading sign bits */
+ uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) &
+ ((1ull << dst_bit_size) - 1);
+ struct qinst *extend_inst =
+ vir_OR_dest(c, tmp, tmp,
+ vir_uniform_ui(c, extend_bits));
+ vir_set_cond(extend_inst, V3D_QPU_COND_IFNA);
+
+ return tmp;
+}
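
(A scalar equivalent of the flag-based sequence in sign_extend() above may help: mask to the source width, test the sign bit, and OR in the leading bits only when it is set. Illustration only, mirroring the masks used by the VIR code.)

```c
#include <stdint.h>

static uint32_t
sign_extend_model(uint32_t value, unsigned src_bits, unsigned dst_bits)
{
        uint32_t sign_mask = 1u << (src_bits - 1);
        uint32_t extend_bits = ~((1u << src_bits) - 1) &
                               (uint32_t)((1ull << dst_bits) - 1);

        /* Same effect as the IFNA-conditional OR above: fill the leading
         * bits only when the sign bit of the narrow value is set.
         */
        return (value & sign_mask) ? (value | extend_bits) : value;
}
```

For example, sign_extend_model(0xff, 8, 16) yields 0xffff, while 0x7f stays 0x007f.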
+
static void
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
{
- /* This should always be lowered to ALU operations for V3D. */
- assert(!instr->dest.saturate);
-
/* Vectors are special in that they have non-scalarized writemasks,
* and just take the first swizzle channel for each argument in order
* into each writemask channel.
@@ -1260,8 +1366,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
srcs[i] = ntq_get_src(c, instr->src[i].src,
instr->src[i].swizzle[0]);
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
- ntq_store_dest(c, &instr->dest.dest, i,
- vir_MOV(c, srcs[i]));
+ ntq_store_def(c, &instr->def, i,
+ vir_MOV(c, srcs[i]));
return;
}
@@ -1327,6 +1433,94 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
break;
+ case nir_op_f2f16:
+ case nir_op_f2f16_rtne:
+ assert(nir_src_bit_size(instr->src[0].src) == 32);
+ result = vir_FMOV(c, src[0]);
+ vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
+ break;
+
+ case nir_op_f2f16_rtz:
+ assert(nir_src_bit_size(instr->src[0].src) == 32);
+ result = f2f16_rtz(c, src[0]);
+ break;
+
+ case nir_op_f2f32:
+ assert(nir_src_bit_size(instr->src[0].src) == 16);
+ result = vir_FMOV(c, src[0]);
+ vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
+ break;
+
+ case nir_op_i2i16: {
+ uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ assert(bit_size == 32 || bit_size == 8);
+ if (bit_size == 32) {
+ /* We don't have integer pack/unpack methods for
+ * converting between 16-bit and 32-bit, so we implement
+ * the conversion manually by truncating the src.
+ */
+ result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
+ } else {
+ struct qreg tmp = vir_AND(c, src[0],
+ vir_uniform_ui(c, 0xff));
+ result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16));
+ }
+ break;
+ }
+
+ case nir_op_u2u16: {
+ uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ assert(bit_size == 32 || bit_size == 8);
+
+ /* We don't have integer pack/unpack methods for converting
+ * between 16-bit and 32-bit, so we implement the conversion
+ * manually by truncating the src. For the 8-bit case, we
+ * want to make sure we don't copy garbage from any of the
+ * 24 MSB bits.
+ */
+ if (bit_size == 32)
+ result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
+ else
+ result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
+ break;
+ }
+
+ case nir_op_i2i8:
+ case nir_op_u2u8:
+ assert(nir_src_bit_size(instr->src[0].src) == 32 ||
+ nir_src_bit_size(instr->src[0].src) == 16);
+ /* We don't have integer pack/unpack methods for converting
+ * between 8-bit and 32-bit, so we implement the conversion
+ * manually by truncating the src.
+ */
+ result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
+ break;
+
+ case nir_op_u2u32: {
+ uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ assert(bit_size == 16 || bit_size == 8);
+
+ /* We don't have a native 8-bit/16-bit MOV so we copy all 32 bits
+ * from the src but we make sure to clear any garbage bits that
+ * may be present in the invalid src bits.
+ */
+ uint32_t mask = (1 << bit_size) - 1;
+ result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
+ break;
+ }
+
+ case nir_op_i2i32: {
+ uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+ assert(bit_size == 16 || bit_size == 8);
+
+ uint32_t mask = (1 << bit_size) - 1;
+ struct qreg tmp = vir_AND(c, src[0],
+ vir_uniform_ui(c, mask));
+
+ result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32));
+ break;
+ }
+
case nir_op_iadd:
result = vir_ADD(c, src[0], src[1]);
break;
@@ -1390,8 +1584,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
break;
}
- case nir_op_i2b32:
- case nir_op_f2b32:
case nir_op_feq32:
case nir_op_fneu32:
case nir_op_fge32:
@@ -1485,13 +1677,35 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_uadd_carry:
vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
V3D_QPU_PF_PUSHC);
- result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
+ result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
+ break;
+
+ case nir_op_usub_borrow:
+ vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]),
+ V3D_QPU_PF_PUSHC);
+ result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
break;
case nir_op_pack_half_2x16_split:
result = vir_VFPACK(c, src[0], src[1]);
break;
+ case nir_op_pack_2x32_to_2x16_v3d:
+ result = vir_VPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_pack_32_to_r11g11b10_v3d:
+ result = vir_V11FPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_pack_uint_32_to_r10g10b10a2_v3d:
+ result = vir_V10PACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_pack_4x16_to_4x8_v3d:
+ result = vir_V8PACK(c, src[0], src[1]);
+ break;
+
case nir_op_unpack_half_2x16_split_x:
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
@@ -1502,26 +1716,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
break;
- case nir_op_fquantize2f16: {
- /* F32 -> F16 -> F32 conversion */
- struct qreg tmp = vir_FMOV(c, src[0]);
- vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L);
- tmp = vir_FMOV(c, tmp);
- vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L);
+ case nir_op_pack_2x16_to_unorm_2x8_v3d:
+ result = vir_VFTOUNORM8(c, src[0]);
+ break;
- /* Check for denorm */
- struct qreg abs_src = vir_FMOV(c, src[0]);
- vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS);
- struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14));
- vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold),
- V3D_QPU_PF_PUSHC);
+ case nir_op_pack_2x16_to_snorm_2x8_v3d:
+ result = vir_VFTOSNORM8(c, src[0]);
+ break;
- /* Return +/-0 for denorms */
- struct qreg zero =
- vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000));
- result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
+ case nir_op_pack_2x16_to_unorm_2x10_v3d:
+ result = vir_VFTOUNORM10LO(c, src[0]);
+ break;
+
+ case nir_op_pack_2x16_to_unorm_10_2_v3d:
+ result = vir_VFTOUNORM10HI(c, src[0]);
+ break;
+
+ case nir_op_f2unorm_16_v3d:
+ result = vir_FTOUNORM16(c, src[0]);
+ break;
+
+ case nir_op_f2snorm_16_v3d:
+ result = vir_FTOSNORM16(c, src[0]);
break;
- }
default:
fprintf(stderr, "unknown NIR ALU inst: ");
@@ -1530,17 +1747,12 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
abort();
}
- /* We have a scalar result, so the instruction should only have a
- * single channel written to.
- */
- assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
- ntq_store_dest(c, &instr->dest.dest,
- ffs(instr->dest.write_mask) - 1, result);
+ ntq_store_def(c, &instr->def, 0, result);
}
/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
* specifier. They come from a register that's preloaded with 0xffffffff
- * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low
+ * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
* 8 bits are shifted off the bottom and 0xff shifted in from the top.
*/
#define TLB_TYPE_F16_COLOR (3 << 6)
@@ -1670,15 +1882,6 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
static void
emit_frag_end(struct v3d_compile *c)
{
- /* If the shader has no non-TLB side effects and doesn't write Z
- * we can promote it to enabling early_fragment_tests even
- * if the user didn't.
- */
- if (c->output_position_index == -1 &&
- !(c->s->info.num_images || c->s->info.num_ssbos)) {
- c->s->info.fs.early_fragment_tests = true;
- }
-
if (c->output_sample_mask_index != -1) {
vir_SETMSF_dest(c, vir_nop_reg(),
vir_AND(c,
@@ -1703,55 +1906,75 @@ emit_frag_end(struct v3d_compile *c)
}
struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
- if (c->output_position_index != -1 &&
- !c->s->info.fs.early_fragment_tests) {
- struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
- c->outputs[c->output_position_index]);
- uint8_t tlb_specifier = TLB_TYPE_DEPTH;
- if (c->devinfo->ver >= 42) {
- tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
- TLB_SAMPLE_MODE_PER_PIXEL);
- } else
- tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
+ /* If the shader has no non-TLB side effects and doesn't write Z
+ * we can promote it to enabling early_fragment_tests even
+ * if the user didn't.
+ */
+ if (c->output_position_index == -1 &&
+ !(c->s->info.num_images || c->s->info.num_ssbos) &&
+ !c->s->info.fs.uses_discard &&
+ !c->s->info.fs.uses_demote &&
+ !c->fs_key->sample_alpha_to_coverage &&
+ c->output_sample_mask_index == -1 &&
+ has_any_tlb_color_write) {
+ c->s->info.fs.early_fragment_tests = true;
+ }
- inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
- tlb_specifier |
- 0xffffff00);
+ /* By default, Z buffer writes are implicit using the Z values produced
+ * from FEP (Z value produced from rasterization). When this is not
+ * desirable (shader writes Z explicitly, has discards, etc) we need
+ * to let the hardware know by setting c->writes_z to true, in which
+ * case we always need to write a Z value from the QPU, even if it is
+ * just the passthrough Z value produced from FEP.
+ *
+ * Also, from the V3D 4.2 spec:
+ *
+ * "If a shader performs a Z read the “Fragment shader does Z writes”
+ * bit in the shader record must be enabled to ensure deterministic
+ * results"
+ *
+ * So if c->reads_z is set we always need to write Z, even if it is
+ * a passthrough from the Z value produced from FEP.
+ */
+ if (!c->s->info.fs.early_fragment_tests || c->reads_z) {
c->writes_z = true;
- } else if (c->s->info.fs.uses_discard ||
- !c->s->info.fs.early_fragment_tests ||
- c->fs_key->sample_alpha_to_coverage ||
- !has_any_tlb_color_write) {
- /* Emit passthrough Z if it needed to be delayed until shader
- * end due to potential discards.
- *
- * Since (single-threaded) fragment shaders always need a TLB
- * write, emit passthrouh Z if we didn't have any color
- * buffers and flag us as potentially discarding, so that we
- * can use Z as the TLB write.
- */
- c->s->info.fs.uses_discard = true;
-
- struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
- vir_nop_reg());
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
+ struct qinst *inst;
+
+ if (c->output_position_index != -1) {
+ /* Shader writes to gl_FragDepth, use that */
+ inst = vir_MOV_dest(c, tlbu_reg,
+ c->outputs[c->output_position_index]);
+
+ tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
+ TLB_SAMPLE_MODE_PER_PIXEL);
+ } else {
+ /* Shader doesn't write to gl_FragDepth, take Z from
+ * FEP.
+ */
+ c->writes_z_from_fep = true;
+ inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg());
- if (c->devinfo->ver >= 42) {
/* The spec says the PER_PIXEL flag is ignored for
* invariant writes, but the simulator demands it.
*/
tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
TLB_SAMPLE_MODE_PER_PIXEL);
- } else {
- tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
+
+ /* Since (single-threaded) fragment shaders always need
+ * a TLB write, if we don't have any we emit a
+ * passthrough Z and flag us as potentially discarding,
+ * so that we can use Z as the required TLB write.
+ */
+ if (!has_any_tlb_color_write)
+ c->s->info.fs.uses_discard = true;
}
- inst->uniform = vir_get_uniform_index(c,
- QUNIFORM_CONSTANT,
+ inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
tlb_specifier |
0xffffff00);
- c->writes_z = true;
+ inst->is_tlb_z_write = true;
}
/* XXX: Performance improvement: Merge Z write and color writes TLB
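
(To summarize the reworked emit_frag_end() logic above, here is a condensed sketch of the Z-write decision. Illustration only; it reuses the field names from the hunk rather than defining anything new in the driver.)

```c
/* Sketch: when does the fragment shader need to write Z to the TLB? */
if (c->s->info.fs.early_fragment_tests && !c->reads_z) {
        /* Implicit Z from FEP is enough; no QPU Z write. */
} else {
        c->writes_z = true;
        if (c->output_position_index != -1) {
                /* Shader writes gl_FragDepth: per-pixel depth type. */
        } else {
                /* Passthrough Z from FEP: invariant depth type. */
                c->writes_z_from_fep = true;
                if (!has_any_tlb_color_write)
                        c->s->info.fs.uses_discard = true;
        }
}
```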
@@ -1767,7 +1990,6 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c,
struct qreg vpm_index,
bool uniform_vpm_index)
{
- assert(c->devinfo->ver >= 40);
if (uniform_vpm_index)
vir_STVPMV(c, vpm_index, val);
else
@@ -1777,13 +1999,8 @@ vir_VPM_WRITE_indirect(struct v3d_compile *c,
static void
vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
{
- if (c->devinfo->ver >= 40) {
- vir_VPM_WRITE_indirect(c, val,
- vir_uniform_ui(c, vpm_index), true);
- } else {
- /* XXX: v3d33_vir_vpm_write_setup(c); */
- vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
- }
+ vir_VPM_WRITE_indirect(c, val,
+ vir_uniform_ui(c, vpm_index), true);
}
static void
@@ -1791,7 +2008,7 @@ emit_vert_end(struct v3d_compile *c)
{
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
*/
- if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
+ if (c->devinfo->ver == 42)
vir_VPMWT(c);
}
@@ -1800,7 +2017,7 @@ emit_geom_end(struct v3d_compile *c)
{
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
*/
- if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
+ if (c->devinfo->ver == 42)
vir_VPMWT(c);
}
@@ -1812,8 +2029,11 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
nir_intrinsic_instr *high,
void *data)
{
- /* Our backend is 32-bit only at present */
- if (bit_size != 32)
+ /* TMU general access only supports 32-bit vectors */
+ if (bit_size > 32)
+ return false;
+
+ if ((bit_size == 8 || bit_size == 16) && num_components > 1)
return false;
if (align_mul % 4 != 0 || align_offset % 4 != 0)
@@ -1843,7 +2063,29 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
do {
progress = false;
- NIR_PASS_V(s, nir_lower_vars_to_ssa);
+ NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp);
+ NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp);
+ NIR_PASS(progress, s, nir_opt_deref);
+
+ NIR_PASS(progress, s, nir_lower_vars_to_ssa);
+ if (!s->info.var_copies_lowered) {
+ /* Only run this pass if nir_lower_var_copies was not called
+ * yet. That would lower away any copy_deref instructions and we
+ * don't want to introduce any more.
+ */
+ NIR_PASS(progress, s, nir_opt_find_array_copies);
+ }
+
+ NIR_PASS(progress, s, nir_opt_copy_prop_vars);
+ NIR_PASS(progress, s, nir_opt_dead_write_vars);
+ NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all);
+
+ NIR_PASS(progress, s, nir_remove_dead_variables,
+ (nir_variable_mode)(nir_var_function_temp |
+ nir_var_shader_temp |
+ nir_var_mem_shared),
+ NULL);
+
NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
NIR_PASS(progress, s, nir_copy_prop);
@@ -1851,10 +2093,39 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
NIR_PASS(progress, s, nir_opt_dce);
NIR_PASS(progress, s, nir_opt_dead_cf);
NIR_PASS(progress, s, nir_opt_cse);
- NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
+ NIR_PASS(progress, s, nir_opt_intrinsics);
+ NIR_PASS(progress, s, nir_opt_idiv_const, 32);
+ NIR_PASS(progress, s, nir_lower_alu);
+
+ if (nir_opt_loop(s)) {
+ progress = true;
+ NIR_PASS(progress, s, nir_copy_prop);
+ NIR_PASS(progress, s, nir_opt_dce);
+ }
+
+ NIR_PASS(progress, s, nir_opt_conditional_discard);
+
+ NIR_PASS(progress, s, nir_opt_remove_phis);
+ NIR_PASS(progress, s, nir_opt_if, false);
+ if (c && !c->disable_gcm) {
+ bool local_progress = false;
+ NIR_PASS(local_progress, s, nir_opt_gcm, false);
+ c->gcm_progress |= local_progress;
+ progress |= local_progress;
+ }
+
+ /* Note that vectorization may undo the load/store scalarization
+ * pass we run for non 32-bit TMU general load/store by
+ * converting, for example, 2 consecutive 16-bit loads into a
+ * single 32-bit load. This is fine (and desirable) as long as
+ * the resulting 32-bit load meets 32-bit alignment requirements,
+ * which mem_vectorize_callback() should be enforcing.
+ */
nir_load_store_vectorize_options vectorize_opts = {
.modes = nir_var_mem_ssbo | nir_var_mem_ubo |
nir_var_mem_push_const | nir_var_mem_shared |
@@ -1862,7 +2133,24 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
.callback = mem_vectorize_callback,
.robust_modes = 0,
};
- NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
+ bool vectorize_progress = false;
+
+
+ /* This requires that we have called
+ * nir_lower_vars_to_explicit_types / nir_lower_explicit_io
+ * first, which we may not have done yet if we call here too
+ * early during NIR pre-processing. We can detect this because
+ * in that case we won't have a compile object.
+ */
+ if (c) {
+ NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize,
+ &vectorize_opts);
+ if (vectorize_progress) {
+ NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
+ NIR_PASS(progress, s, nir_lower_pack);
+ progress = true;
+ }
+ }
if (lower_flrp != 0) {
bool lower_flrp_progress = false;
@@ -1895,10 +2183,8 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
nir_move_options sink_opts =
nir_move_const_undef | nir_move_comparisons | nir_move_copies |
- nir_move_load_ubo;
+ nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform;
NIR_PASS(progress, s, nir_opt_sink, sink_opts);
-
- NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
}
static int
@@ -1915,27 +2201,9 @@ ntq_emit_vpm_read(struct v3d_compile *c,
uint32_t *remaining,
uint32_t vpm_index)
{
- struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
-
- if (c->devinfo->ver >= 40 ) {
- return vir_LDVPMV_IN(c,
- vir_uniform_ui(c,
- (*num_components_queued)++));
- }
-
- if (*num_components_queued != 0) {
- (*num_components_queued)--;
- return vir_MOV(c, vpm);
- }
-
- uint32_t num_components = MIN2(*remaining, 32);
-
- v3d33_vir_vpm_read_setup(c, num_components);
-
- *num_components_queued = num_components - 1;
- *remaining -= num_components;
-
- return vir_MOV(c, vpm);
+ return vir_LDVPMV_IN(c,
+ vir_uniform_ui(c,
+ (*num_components_queued)++));
}
static void
@@ -2005,31 +2273,8 @@ ntq_setup_vs_inputs(struct v3d_compile *c)
}
/* The actual loads will happen directly in nir_intrinsic_load_input
- * on newer versions.
*/
- if (c->devinfo->ver >= 40)
- return;
-
- for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
- resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
- (loc + 1) * 4);
-
- for (int i = 0; i < c->vattr_sizes[loc]; i++) {
- c->inputs[loc * 4 + i] =
- ntq_emit_vpm_read(c,
- &vpm_components_queued,
- &num_components,
- loc * 4 + i);
-
- }
- }
-
- if (c->devinfo->ver >= 40) {
- assert(vpm_components_queued == num_components);
- } else {
- assert(vpm_components_queued == 0);
- assert(num_components == 0);
- }
+ return;
}
static bool
@@ -2058,14 +2303,14 @@ ntq_setup_gs_inputs(struct v3d_compile *c)
*/
assert(glsl_type_is_array(var->type));
const struct glsl_type *type = glsl_get_array_element(var->type);
- unsigned array_len = MAX2(glsl_get_length(type), 1);
+ unsigned var_len = glsl_count_vec4_slots(type, false, false);
unsigned loc = var->data.driver_location;
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
- (loc + array_len) * 4);
+ (loc + var_len) * 4);
if (var->data.compact) {
- for (unsigned j = 0; j < array_len; j++) {
+ for (unsigned j = 0; j < var_len; j++) {
unsigned input_idx = c->num_inputs++;
unsigned loc_frac = var->data.location_frac + j;
unsigned loc = var->data.location + loc_frac / 4;
@@ -2076,8 +2321,10 @@ ntq_setup_gs_inputs(struct v3d_compile *c)
continue;
}
- for (unsigned j = 0; j < array_len; j++) {
- unsigned num_elements = glsl_get_vector_elements(type);
+ for (unsigned j = 0; j < var_len; j++) {
+ unsigned num_elements =
+ glsl_type_is_struct(glsl_without_array(type)) ?
+ 4 : glsl_get_vector_elements(type);
for (unsigned k = 0; k < num_elements; k++) {
unsigned chan = var->data.location_frac + k;
unsigned input_idx = c->num_inputs++;
@@ -2124,7 +2371,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
} else if (var->data.compact) {
for (int j = 0; j < var_len; j++)
emit_compact_fragment_input(c, loc, var, j);
- } else if (glsl_type_is_struct(var->type)) {
+ } else if (glsl_type_is_struct(glsl_without_array(var->type))) {
for (int j = 0; j < var_len; j++) {
emit_fragment_input(c, loc, var, j, 4);
}
@@ -2143,12 +2390,9 @@ ntq_setup_outputs(struct v3d_compile *c)
return;
nir_foreach_shader_out_variable(var, c->s) {
- unsigned array_len = MAX2(glsl_get_length(var->type), 1);
+ assert(glsl_type_is_vector_or_scalar(var->type));
unsigned loc = var->data.driver_location * 4;
- assert(array_len == 1);
- (void)array_len;
-
for (int i = 0; i < 4 - var->data.location_frac; i++) {
add_output(c, loc + var->data.location_frac + i,
var->data.location,
@@ -2157,15 +2401,17 @@ ntq_setup_outputs(struct v3d_compile *c)
switch (var->data.location) {
case FRAG_RESULT_COLOR:
- c->output_color_var[0] = var;
- c->output_color_var[1] = var;
- c->output_color_var[2] = var;
- c->output_color_var[3] = var;
+ for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
+ c->output_color_var[i] = var;
break;
case FRAG_RESULT_DATA0:
case FRAG_RESULT_DATA1:
case FRAG_RESULT_DATA2:
case FRAG_RESULT_DATA3:
+ case FRAG_RESULT_DATA4:
+ case FRAG_RESULT_DATA5:
+ case FRAG_RESULT_DATA6:
+ case FRAG_RESULT_DATA7:
c->output_color_var[var->data.location -
FRAG_RESULT_DATA0] = var;
break;
@@ -2185,17 +2431,19 @@ ntq_setup_outputs(struct v3d_compile *c)
* Each nir_register gets a struct qreg per 32-bit component being stored.
*/
static void
-ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
+ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl)
{
- foreach_list_typed(nir_register, nir_reg, node, list) {
- unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
+ nir_foreach_reg_decl(decl, impl) {
+ unsigned num_components = nir_intrinsic_num_components(decl);
+ unsigned array_len = nir_intrinsic_num_array_elems(decl);
+ array_len = MAX2(array_len, 1);
struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
- array_len *
- nir_reg->num_components);
+ array_len * num_components);
+ nir_def *nir_reg = &decl->def;
_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
- for (int i = 0; i < array_len * nir_reg->num_components; i++)
+ for (int i = 0; i < array_len * num_components; i++)
qregs[i] = vir_get_temp(c);
}
}
@@ -2222,23 +2470,23 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
assert(nir_src_as_uint(instr->src[1]) == 0);
- ntq_store_dest(c, &instr->dest, 0,
+ ntq_store_def(c, &instr->def, 0,
vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
if (instr->num_components > 1) {
- ntq_store_dest(c, &instr->dest, 1,
- vir_uniform(c,
- instr->num_components == 2 && is_array ?
- QUNIFORM_IMAGE_ARRAY_SIZE :
- QUNIFORM_IMAGE_HEIGHT,
- image_index));
+ ntq_store_def(c, &instr->def, 1,
+ vir_uniform(c,
+ instr->num_components == 2 && is_array ?
+ QUNIFORM_IMAGE_ARRAY_SIZE :
+ QUNIFORM_IMAGE_HEIGHT,
+ image_index));
}
if (instr->num_components > 2) {
- ntq_store_dest(c, &instr->dest, 2,
- vir_uniform(c,
- is_array ?
- QUNIFORM_IMAGE_ARRAY_SIZE :
- QUNIFORM_IMAGE_DEPTH,
- image_index));
+ ntq_store_def(c, &instr->def, 2,
+ vir_uniform(c,
+ is_array ?
+ QUNIFORM_IMAGE_ARRAY_SIZE :
+ QUNIFORM_IMAGE_DEPTH,
+ image_index));
}
}
@@ -2263,16 +2511,14 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
*
* To fix that, we make sure we always emit a thread switch before the
* first tlb color read. If that happens to be the last thread switch
- * we emit, then everything is fine, but otherwsie, if any code after
+ * we emit, then everything is fine, but otherwise, if any code after
* this point needs to emit additional thread switches, then we will
* switch the strategy to locking the scoreboard on the first thread
* switch instead -- see vir_emit_thrsw().
*/
if (!c->emitted_tlb_load) {
- if (!c->last_thrsw_at_top_level) {
- assert(c->devinfo->ver >= 41);
+ if (!c->last_thrsw_at_top_level)
vir_emit_thrsw(c);
- }
c->emitted_tlb_load = true;
}
@@ -2371,27 +2617,96 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
assert(color_reads_for_sample[component].file != QFILE_NULL);
- ntq_store_dest(c, &instr->dest, 0,
- vir_MOV(c, color_reads_for_sample[component]));
+ ntq_store_def(c, &instr->def, 0,
+ vir_MOV(c, color_reads_for_sample[component]));
+}
+
+static bool
+ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr);
+
+static bool
+try_emit_uniform(struct v3d_compile *c,
+ int offset,
+ int num_components,
+ nir_def *def,
+ enum quniform_contents contents)
+{
+ /* Even though ldunif is strictly 32-bit we can still use it
+ * to load scalar 8-bit/16-bit uniforms so long as their offset
+ * is 32-bit aligned. In this case, ldunif would still load
+ * 32-bit into the destination with the 8-bit/16-bit uniform
+ * data in the LSB and garbage in the MSB, but that is fine
+ * because we should only be accessing the valid bits of the
+ * destination.
+ *
+ * FIXME: if in the future we improve our register allocator to
+ * pack 2 16-bit variables in the MSB and LSB of the same
+ * register then this optimization would not be valid as is,
+ * since the load clobbers the MSB.
+ */
+ if (offset % 4 != 0)
+ return false;
+
+ /* We need dwords */
+ offset = offset / 4;
+
+ for (int i = 0; i < num_components; i++) {
+ ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i));
+ }
+
+ return true;
}
static void
ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
+ /* We scalarize general TMU access for anything that is not 32-bit. */
+ assert(instr->def.bit_size == 32 ||
+ instr->num_components == 1);
+
+ /* Try to emit ldunif if possible, otherwise fallback to general TMU */
if (nir_src_is_const(instr->src[0])) {
int offset = (nir_intrinsic_base(instr) +
nir_src_as_uint(instr->src[0]));
- assert(offset % 4 == 0);
- /* We need dwords */
- offset = offset / 4;
- for (int i = 0; i < instr->num_components; i++) {
- ntq_store_dest(c, &instr->dest, i,
- vir_uniform(c, QUNIFORM_UNIFORM,
- offset + i));
+
+ if (try_emit_uniform(c, offset, instr->num_components,
+ &instr->def, QUNIFORM_UNIFORM)) {
+ return;
+ }
+ }
+
+ if (!ntq_emit_load_unifa(c, instr)) {
+ ntq_emit_tmu_general(c, instr, false, false);
+ c->has_general_tmu_load = true;
+ }
+}
+
+static bool
+ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ if (c->compiler->max_inline_uniform_buffers <= 0)
+ return false;
+
+ /* Regular UBOs start after inline UBOs */
+ uint32_t index = nir_src_as_uint(instr->src[0]);
+ if (index >= c->compiler->max_inline_uniform_buffers)
+ return false;
+
+ /* We scalarize general TMU access for anything that is not 32-bit */
+ assert(instr->def.bit_size == 32 ||
+ instr->num_components == 1);
+
+ if (nir_src_is_const(instr->src[1])) {
+ int offset = nir_src_as_uint(instr->src[1]);
+ if (try_emit_uniform(c, offset, instr->num_components,
+ &instr->def,
+ QUNIFORM_INLINE_UBO_0 + index)) {
+ return true;
}
- } else {
- ntq_emit_tmu_general(c, instr, false);
}
+
+ /* Fallback to regular UBO load */
+ return false;
}
static void
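
(A tiny model of the offset handling in try_emit_uniform() above, illustration only: ldunif slots are 32-bit wide, so a byte offset is only usable when it is 4-byte aligned, in which case it becomes a dword index; 8-bit and 16-bit scalars still work because only the LSBs of the loaded dword are consumed. The helper name below is hypothetical.)

```c
#include <stdbool.h>

static bool
uniform_byte_offset_to_slot(int byte_offset, int *slot)
{
        if (byte_offset % 4 != 0)
                return false;           /* fall back to a general TMU load */
        *slot = byte_offset / 4;        /* ldunif index, in dwords */
        return true;
}
```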
@@ -2411,7 +2726,7 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
unsigned offset =
nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
- if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
/* Emit the LDVPM directly now, rather than at the top
* of the shader like we did for V3D 3.x (which needs
* vpmsetup when not just taking the next offset).
@@ -2433,19 +2748,38 @@ ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
SYSTEM_VALUE_VERTEX_ID)) {
index++;
}
- for (int i = 0; i < offset; i++)
- index += c->vattr_sizes[i];
+
+ for (int i = 0; i < offset; i++) {
+ /* GFXH-1602: if any builtins (vid, iid, etc) are read then
+ * attribute 0 must be active (size > 0). When we hit this,
+ * the driver is expected to program attribute 0 to have a
+ * size of 1, so here we need to add that.
+ */
+ if (i == 0 && c->vs_key->is_coord &&
+ c->vattr_sizes[i] == 0 && index > 0) {
+ index++;
+ } else {
+ index += c->vattr_sizes[i];
+ }
+ }
+
index += nir_intrinsic_component(instr);
for (int i = 0; i < instr->num_components; i++) {
struct qreg vpm_offset = vir_uniform_ui(c, index++);
- ntq_store_dest(c, &instr->dest, i,
- vir_LDVPMV_IN(c, vpm_offset));
+ ntq_store_def(c, &instr->def, i,
+ vir_LDVPMV_IN(c, vpm_offset));
}
} else {
for (int i = 0; i < instr->num_components; i++) {
int comp = nir_intrinsic_component(instr) + i;
- ntq_store_dest(c, &instr->dest, i,
- vir_MOV(c, c->inputs[offset * 4 + comp]));
+ struct qreg input = c->inputs[offset * 4 + comp];
+ ntq_store_def(c, &instr->def, i, vir_MOV(c, input));
+
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT &&
+ input.file == c->payload_z.file &&
+ input.index == c->payload_z.index) {
+ c->reads_z = true;
+ }
}
}
}
@@ -2610,18 +2944,18 @@ ntq_get_barycentric_centroid(struct v3d_compile *c,
/* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
struct qreg F = vir_uniform_ui(c, 0);
struct qreg T = vir_uniform_ui(c, ~0);
- struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1);
+ struct qreg s0 = vir_AND(c, sample_mask, i1);
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
- s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
- struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2);
+ s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
+ struct qreg s1 = vir_AND(c, sample_mask, i2);
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
- s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
- struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4);
+ s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
+ struct qreg s2 = vir_AND(c, sample_mask, i4);
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
- s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
- struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8);
+ s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
+ struct qreg s3 = vir_AND(c, sample_mask, i8);
vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
- s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
+ s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
/* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
struct qreg sample_idx = i3;
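
(The hunk above drops the XOR from the per-sample tests and flips the select condition; a scalar model shows the two formulations are equivalent. Illustration only: PUSHZ sets the flag when the written value is zero, IFA selects when the flag is set, IFNA when it is clear.)

```c
#include <stdint.h>

static uint32_t
old_sel(uint32_t mask, uint32_t bit, uint32_t T, uint32_t F)
{
        uint32_t v = (mask & bit) ^ bit;   /* zero iff the bit is set */
        return v == 0 ? T : F;             /* PUSHZ + IFA */
}

static uint32_t
new_sel(uint32_t mask, uint32_t bit, uint32_t T, uint32_t F)
{
        uint32_t v = mask & bit;           /* non-zero iff the bit is set */
        return v != 0 ? T : F;             /* PUSHZ + IFNA */
}
```

Both return T exactly when the sample bit is present in the mask, so the AND-only form saves one operation per sample.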
@@ -2708,28 +3042,142 @@ emit_ldunifa(struct v3d_compile *c, struct qreg *result)
c->current_unifa_offset += 4;
}
-static void
-ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
+/* Checks if the value of a nir src is derived from a nir register */
+static bool
+nir_src_derived_from_reg(nir_src src)
+{
+ nir_def *def = src.ssa;
+ if (nir_load_reg_for_def(def))
+ return true;
+
+ nir_instr *parent = def->parent_instr;
+ switch (parent->type) {
+ case nir_instr_type_alu: {
+ nir_alu_instr *alu = nir_instr_as_alu(parent);
+ int num_srcs = nir_op_infos[alu->op].num_inputs;
+ for (int i = 0; i < num_srcs; i++) {
+ if (nir_src_derived_from_reg(alu->src[i].src))
+ return true;
+ }
+ return false;
+ }
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
+ int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs;
+ for (int i = 0; i < num_srcs; i++) {
+ if (nir_src_derived_from_reg(intr->src[i]))
+ return true;
+ }
+ return false;
+ }
+ case nir_instr_type_load_const:
+ case nir_instr_type_undef:
+ return false;
+ default:
+                /* By default we assume it may come from a register; the
+                 * above cases should be able to handle the majority of
+                 * situations, though.
+ */
+ return true;
+ };
+}
+
+static bool
+ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
+ assert(instr->intrinsic == nir_intrinsic_load_ubo ||
+ instr->intrinsic == nir_intrinsic_load_ssbo ||
+ instr->intrinsic == nir_intrinsic_load_uniform);
+
+ bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
+ bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
+ bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo;
+
/* Every ldunifa auto-increments the unifa address by 4 bytes, so our
* current unifa offset is 4 bytes ahead of the offset of the last load.
*/
static const int32_t max_unifa_skip_dist =
MAX_UNIFA_SKIP_DISTANCE - 4;
- bool dynamic_src = !nir_src_is_const(instr->src[1]);
- uint32_t const_offset =
- dynamic_src ? 0 : nir_src_as_uint(instr->src[1]);
+ /* We can only use unifa if the offset is uniform */
+ nir_src offset = is_uniform ? instr->src[0] : instr->src[1];
+ if (nir_src_is_divergent(offset))
+ return false;
- /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
- * shifted up by 1 (0 is gallium's constant buffer 0).
+ /* Emitting loads from unifa may not be safe under non-uniform control
+ * flow. It seems the address that is used to write to the unifa
+ * register is taken from the first lane and if that lane is disabled
+ * by control flow then the value we read may be bogus and lead to
+ * invalid memory accesses on follow-up ldunifa instructions. However,
+         * ntq_store_def only emits conditional writes for nir registers, so
+         * as long as we can be certain that the offset isn't derived from a
+         * load_reg we should be fine.
+ *
+ * The following CTS test can be used to trigger the problem, which
+         * causes a GMP violation in the simulator without this check:
+ * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int
*/
- uint32_t index = nir_src_as_uint(instr->src[0]);
- if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
+ if (vir_in_nonuniform_control_flow(c) &&
+ nir_src_derived_from_reg(offset)) {
+ return false;
+ }
+
+ /* We can only use unifa with SSBOs if they are read-only. Otherwise
+ * ldunifa won't see the shader writes to that address (possibly
+ * because ldunifa doesn't read from the L2T cache).
+ */
+ if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE))
+ return false;
+
+ /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms
+         * that may have been written to scratch using the TMU.
+ */
+ bool dynamic_src = !nir_src_is_const(offset);
+ if (is_uniform && dynamic_src && c->s->scratch_size > 0)
+ return false;
+
+ uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset);
+ if (is_uniform)
+ const_offset += nir_intrinsic_base(instr);
+
+ /* ldunifa is a 32-bit load instruction so we can only use it with
+ * 32-bit aligned addresses. We always produce 32-bit aligned addresses
+ * except for types smaller than 32-bit, so in these cases we can only
+ * use ldunifa if we can verify alignment, which we can only do for
+ * loads with a constant offset.
+ */
+ uint32_t bit_size = instr->def.bit_size;
+ uint32_t value_skips = 0;
+ if (bit_size < 32) {
+ if (dynamic_src) {
+ return false;
+ } else if (const_offset % 4 != 0) {
+ /* If we are loading from an unaligned offset, fix
+ * alignment and skip over unused elements in result.
+ */
+ value_skips = (const_offset % 4) / (bit_size / 8);
+ const_offset &= ~0x3;
+ }
+ }
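+
+        /* For example, a constant 16-bit load at byte offset 6 is rebased to
+         * offset 4 with value_skips = 1, so the unpacking loop further below
+         * discards the first 16-bit half of the ldunifa result.
+         */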
+
+ assert((bit_size == 32 && value_skips == 0) ||
+ (bit_size == 16 && value_skips <= 1) ||
+ (bit_size == 8 && value_skips <= 3));
+
+ /* Both Vulkan and OpenGL reserve index 0 for uniforms / push
+ * constants.
+ */
+ uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]);
+
+ /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use
+ * index 0 for Gallium's constant buffer (GL) or push constants
+ * (Vulkan).
+ */
+ if (is_ubo)
index++;
/* We can only keep track of the last unifa address we used with
- * constant offset loads. If the new load targets the same UBO and
+ * constant offset loads. If the new load targets the same buffer and
* is close enough to the previous load, we can skip the unifa register
* write by emitting dummy ldunifa instructions to update the unifa
* address.
@@ -2739,6 +3187,7 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
if (dynamic_src) {
c->current_unifa_block = NULL;
} else if (c->cur_block == c->current_unifa_block &&
+ c->current_unifa_is_ubo == !is_ssbo &&
c->current_unifa_index == index &&
c->current_unifa_offset <= const_offset &&
c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
@@ -2746,32 +3195,98 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
} else {
c->current_unifa_block = c->cur_block;
+ c->current_unifa_is_ubo = !is_ssbo;
c->current_unifa_index = index;
c->current_unifa_offset = const_offset;
}
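+
+        /* For example, if the last constant-offset load from this buffer left
+         * the unifa address at byte 8 and this load starts at byte 16, we can
+         * skip the unifa write and instead emit (16 - 8) / 4 = 2 dummy ldunifa
+         * instructions below to advance the address.
+         */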
if (!skip_unifa) {
- struct qreg base_offset =
+ struct qreg base_offset = !is_ssbo ?
vir_uniform(c, QUNIFORM_UBO_ADDR,
- v3d_unit_data_create(index, const_offset));
+ v3d_unit_data_create(index, const_offset)) :
+ vir_uniform(c, QUNIFORM_SSBO_OFFSET, index);
struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
if (!dynamic_src) {
- vir_MOV_dest(c, unifa, base_offset);
+ if (!is_ssbo) {
+ /* Avoid the extra MOV to UNIFA by making
+ * ldunif load directly into it. We can't
+ * do this if we have not actually emitted
+ * ldunif and are instead reusing a previous
+ * one.
+ */
+ struct qinst *inst =
+ (struct qinst *)c->cur_block->instructions.prev;
+ if (inst == c->defs[base_offset.index]) {
+ inst->dst = unifa;
+ c->defs[base_offset.index] = NULL;
+ } else {
+ vir_MOV_dest(c, unifa, base_offset);
+ }
+ } else {
+ vir_ADD_dest(c, unifa, base_offset,
+ vir_uniform_ui(c, const_offset));
+ }
} else {
vir_ADD_dest(c, unifa, base_offset,
- ntq_get_src(c, instr->src[1], 0));
+ ntq_get_src(c, offset, 0));
}
} else {
for (int i = 0; i < ldunifa_skips; i++)
emit_ldunifa(c, NULL);
}
- for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) {
+ uint32_t num_components = nir_intrinsic_dest_components(instr);
+ for (uint32_t i = 0; i < num_components; ) {
struct qreg data;
emit_ldunifa(c, &data);
- ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));
+
+ if (bit_size == 32) {
+ assert(value_skips == 0);
+ ntq_store_def(c, &instr->def, i, vir_MOV(c, data));
+ i++;
+ } else {
+ assert((bit_size == 16 && value_skips <= 1) ||
+ (bit_size == 8 && value_skips <= 3));
+
+ /* If we have any values to skip, shift to the first
+ * valid value in the ldunifa result.
+ */
+ if (value_skips > 0) {
+ data = vir_SHR(c, data,
+ vir_uniform_ui(c, bit_size *
+ value_skips));
+ }
+
+                        /* Check how many valid components we have,
+                         * discounting the components we were asked to skip.
+ */
+ uint32_t valid_count = (32 / bit_size) - value_skips;
+ assert((bit_size == 16 && valid_count <= 2) ||
+ (bit_size == 8 && valid_count <= 4));
+ assert(valid_count > 0);
+
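+                        /* E.g. a 4-component 8-bit load with no skips pulls
+                         * all four bytes out of a single ldunifa result:
+                         * AND with 0xff, then SHR by 8 between components.
+                         */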
+ /* Process the valid components */
+ do {
+ struct qreg tmp;
+ uint32_t mask = (1 << bit_size) - 1;
+ tmp = vir_AND(c, vir_MOV(c, data),
+ vir_uniform_ui(c, mask));
+ ntq_store_def(c, &instr->def, i,
+ vir_MOV(c, tmp));
+ i++;
+ valid_count--;
+
+ /* Shift to next component */
+ if (i < num_components && valid_count > 0) {
+ data = vir_SHR(c, data,
+ vir_uniform_ui(c, bit_size));
+ }
+ } while (i < num_components && valid_count > 0);
+ }
}
+
+ return true;
}
static inline struct qreg
@@ -2781,187 +3296,273 @@ emit_load_local_invocation_index(struct v3d_compile *c)
vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
}
-/* Various subgroup operations rely on the A flags, so this helper ensures that
- * A flags represents currently active lanes in the subgroup.
+/* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
+ * fragment shaders a lane is considered active if any sample flags are set
+ * for *any* lane in the same quad. However, we still need to ensure that
+ * terminated lanes (OpTerminate) are not included. Further, we also need to
+ * exclude lanes that are disabled because of non-uniform control
+ * flow.
*/
-static void
-set_a_flags_for_subgroup(struct v3d_compile *c)
+static enum v3d_qpu_cond
+setup_subgroup_control_flow_condition(struct v3d_compile *c)
{
- /* MSF returns 0 for disabled lanes in compute shaders so
- * PUSHZ will set A=1 for disabled lanes. We want the inverse
- * of this but we don't have any means to negate the A flags
- * directly, but we can do it by repeating the same operation
- * with NORZ (A = ~A & ~Z).
+ assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
+ c->s->info.stage == MESA_SHADER_COMPUTE);
+
+ enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
+
+ /* We need to make sure that terminated lanes in fragment shaders are
+         * not included. We can identify these lanes by comparing the initial
+         * sample mask with the current one. This fixes:
+ * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
*/
- assert(c->s->info.stage == MESA_SHADER_COMPUTE);
- vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
- vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ);
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
+ vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
+ vir_NOT(c, vir_XOR(c, c->start_msf,
+ vir_MSF(c)))),
+ V3D_QPU_PF_PUSHZ);
+ cond = V3D_QPU_COND_IFNA;
+ }
- /* If we are under non-uniform control flow we also need to
- * AND the A flags with the current execute mask.
+        /* If we are in non-uniform control flow, update the condition to
+ * also limit lanes to those in the current execution mask.
*/
if (vir_in_nonuniform_control_flow(c)) {
- const uint32_t bidx = c->cur_block->index;
- vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),
- c->execute,
- vir_uniform_ui(c, bidx)),
- V3D_QPU_UF_ANDZ);
+ if (cond == V3D_QPU_COND_IFNA) {
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_UF_NORNZ);
+ } else {
+ assert(cond == V3D_QPU_COND_NONE);
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
+ cond = V3D_QPU_COND_IFA;
}
+
+ return cond;
+}
+
+static void
+emit_compute_barrier(struct v3d_compile *c)
+{
+ /* Ensure we flag the use of the control barrier. NIR's
+ * gather info pass usually takes care of this, but that
+         * requires that we call that pass after any other pass that
+         * may emit a control barrier, so this is safer.
+ */
+ c->s->info.uses_control_barrier = true;
+
+ /* Emit a TSY op to get all invocations in the workgroup
+ * (actually supergroup) to block until the last
+ * invocation reaches the TSY op.
+ */
+ vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB));
+}
+
+static void
+emit_barrier(struct v3d_compile *c)
+{
+ struct qreg eidx = vir_EIDX(c);
+
+        /* The config for the TSY op should be set up like this:
+ * - Lane 0: Quorum
+ * - Lane 2: TSO id
+ * - Lane 3: TSY opcode
+ */
+
+ /* Lane 0: we want to synchronize across one subgroup. Here we write to
+ * all lanes unconditionally and will overwrite other lanes below.
+ */
+ struct qreg tsy_conf = vir_uniform_ui(c, 1);
+
+        /* Lane 2: TSO id. We choose a general purpose TSO (id=0..63) using
+         * the current QPU index and thread index to ensure we get a unique
+         * one for this group of invocations in this core.
+ */
+ struct qreg tso_id =
+ vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f));
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id);
+
+ /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */
+ struct qreg tsy_op = vir_uniform_ui(c, 16);
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)),
+ V3D_QPU_PF_PUSHZ);
+ vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op);
+
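+        /* At this point tsy_conf holds the quorum (1) in lane 0, the TSO id
+         * in lane 2 and the opcode in lane 3; whatever is left in the other
+         * lanes is presumably ignored by the TSY unit.
+         */
+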
+ /* Emit TSY sync */
+ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf);
}
static void
ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
switch (instr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ break; /* Ignore these */
+
case nir_intrinsic_load_uniform:
ntq_emit_load_uniform(c, instr);
break;
+ case nir_intrinsic_load_global_2x32:
+ ntq_emit_tmu_general(c, instr, false, true);
+ c->has_general_tmu_load = true;
+ break;
+
case nir_intrinsic_load_ubo:
- if (!nir_src_is_divergent(instr->src[1]))
- ntq_emit_load_ubo_unifa(c, instr);
- else
- ntq_emit_tmu_general(c, instr, false);
- break;
-
- case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_ssbo_atomic_comp_swap:
+ if (ntq_emit_inline_ubo_load(c, instr))
+ break;
+ FALLTHROUGH;
case nir_intrinsic_load_ssbo:
+ if (!ntq_emit_load_unifa(c, instr)) {
+ ntq_emit_tmu_general(c, instr, false, false);
+ c->has_general_tmu_load = true;
+ }
+ break;
+
case nir_intrinsic_store_ssbo:
- ntq_emit_tmu_general(c, instr, false);
- break;
-
- case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_shared_atomic_imin:
- case nir_intrinsic_shared_atomic_umin:
- case nir_intrinsic_shared_atomic_imax:
- case nir_intrinsic_shared_atomic_umax:
- case nir_intrinsic_shared_atomic_and:
- case nir_intrinsic_shared_atomic_or:
- case nir_intrinsic_shared_atomic_xor:
- case nir_intrinsic_shared_atomic_exchange:
- case nir_intrinsic_shared_atomic_comp_swap:
- case nir_intrinsic_load_shared:
+ case nir_intrinsic_ssbo_atomic:
+ case nir_intrinsic_ssbo_atomic_swap:
+ ntq_emit_tmu_general(c, instr, false, false);
+ break;
+
+ case nir_intrinsic_store_global_2x32:
+ case nir_intrinsic_global_atomic_2x32:
+ case nir_intrinsic_global_atomic_swap_2x32:
+ ntq_emit_tmu_general(c, instr, false, true);
+ break;
+
+ case nir_intrinsic_shared_atomic:
+ case nir_intrinsic_shared_atomic_swap:
case nir_intrinsic_store_shared:
- case nir_intrinsic_load_scratch:
case nir_intrinsic_store_scratch:
- ntq_emit_tmu_general(c, instr, true);
+ ntq_emit_tmu_general(c, instr, true, false);
+ break;
+
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_shared:
+ ntq_emit_tmu_general(c, instr, true, false);
+ c->has_general_tmu_load = true;
break;
- case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
- case nir_intrinsic_image_atomic_add:
- case nir_intrinsic_image_atomic_imin:
- case nir_intrinsic_image_atomic_umin:
- case nir_intrinsic_image_atomic_imax:
- case nir_intrinsic_image_atomic_umax:
- case nir_intrinsic_image_atomic_and:
- case nir_intrinsic_image_atomic_or:
- case nir_intrinsic_image_atomic_xor:
- case nir_intrinsic_image_atomic_exchange:
- case nir_intrinsic_image_atomic_comp_swap:
- v3d40_vir_emit_image_load_store(c, instr);
+ case nir_intrinsic_image_atomic:
+ case nir_intrinsic_image_atomic_swap:
+ v3d_vir_emit_image_load_store(c, instr);
+ break;
+
+ case nir_intrinsic_image_load:
+ v3d_vir_emit_image_load_store(c, instr);
+ /* Not really a general TMU load, but we only use this flag
+ * for NIR scheduling and we do schedule these under the same
+ * policy as general TMU.
+ */
+ c->has_general_tmu_load = true;
break;
case nir_intrinsic_get_ssbo_size:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
- nir_src_comp_as_uint(instr->src[0], 0)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
+ nir_src_comp_as_uint(instr->src[0], 0)));
break;
case nir_intrinsic_get_ubo_size:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
- nir_src_comp_as_uint(instr->src[0], 0)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
+ nir_src_comp_as_uint(instr->src[0], 0)));
break;
case nir_intrinsic_load_user_clip_plane:
for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
- ntq_store_dest(c, &instr->dest, i,
- vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
- nir_intrinsic_ucp_id(instr) *
- 4 + i));
+ ntq_store_def(c, &instr->def, i,
+ vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
+ nir_intrinsic_ucp_id(instr) *
+ 4 + i));
}
break;
case nir_intrinsic_load_viewport_x_scale:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
break;
case nir_intrinsic_load_viewport_y_scale:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
break;
case nir_intrinsic_load_viewport_z_scale:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
break;
case nir_intrinsic_load_viewport_z_offset:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
break;
case nir_intrinsic_load_line_coord:
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x));
break;
case nir_intrinsic_load_line_width:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
break;
case nir_intrinsic_load_aa_line_width:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
break;
case nir_intrinsic_load_sample_mask_in:
- ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
+ ntq_store_def(c, &instr->def, 0, vir_MSF(c));
break;
case nir_intrinsic_load_helper_invocation:
vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
- ntq_store_dest(c, &instr->dest, 0, qdest);
+ ntq_store_def(c, &instr->def, 0, qdest);
break;
case nir_intrinsic_load_front_face:
/* The register contains 0 (front) or 1 (back), and we need to
* turn it into a NIR bool where true means front.
*/
- ntq_store_dest(c, &instr->dest, 0,
- vir_ADD(c,
- vir_uniform_ui(c, -1),
- vir_REVF(c)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_ADD(c,
+ vir_uniform_ui(c, -1),
+ vir_REVF(c)));
break;
case nir_intrinsic_load_base_instance:
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid));
break;
case nir_intrinsic_load_instance_id:
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid));
break;
case nir_intrinsic_load_vertex_id:
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid));
+ break;
+
+ case nir_intrinsic_load_draw_id:
+ ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0));
break;
case nir_intrinsic_load_tlb_color_v3d:
vir_emit_tlb_color_read(c, instr);
break;
+ case nir_intrinsic_load_fep_w_v3d:
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w));
+ break;
+
case nir_intrinsic_load_input:
ntq_emit_load_input(c, instr);
break;
@@ -2978,7 +3579,19 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_emit_image_size(c, instr);
break;
+ /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which
+ * is intended to match the semantics of GLSL's discard) should
+ * terminate the invocation immediately. Our implementation doesn't
+ * do that. What we do is actually a demote by removing the invocations
+ * from the sample mask. Maybe we could be more strict and force an
+ * early termination by emitting a (maybe conditional) jump to the
+ * end section of the fragment shader for affected invocations.
+ */
case nir_intrinsic_discard:
+ case nir_intrinsic_terminate:
+ c->emitted_discard = true;
+ FALLTHROUGH;
+ case nir_intrinsic_demote:
ntq_flush_tmu(c);
if (vir_in_nonuniform_control_flow(c)) {
@@ -2993,7 +3606,11 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
break;
- case nir_intrinsic_discard_if: {
+ case nir_intrinsic_discard_if:
+ case nir_intrinsic_terminate_if:
+ c->emitted_discard = true;
+ FALLTHROUGH;
+ case nir_intrinsic_demote_if: {
ntq_flush_tmu(c);
enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
@@ -3011,102 +3628,79 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
vir_uniform_ui(c, 0)), cond);
-
break;
}
- case nir_intrinsic_memory_barrier:
- case nir_intrinsic_memory_barrier_buffer:
- case nir_intrinsic_memory_barrier_image:
- case nir_intrinsic_memory_barrier_shared:
- case nir_intrinsic_memory_barrier_tcs_patch:
- case nir_intrinsic_group_memory_barrier:
- /* We don't do any instruction scheduling of these NIR
- * instructions between each other, so we just need to make
- * sure that the TMU operations before the barrier are flushed
+ case nir_intrinsic_barrier:
+ /* Ensure that the TMU operations before the barrier are flushed
* before the ones after the barrier.
*/
ntq_flush_tmu(c);
- break;
-
- case nir_intrinsic_control_barrier:
- /* Emit a TSY op to get all invocations in the workgroup
- * (actually supergroup) to block until the last invocation
- * reaches the TSY op.
- */
- ntq_flush_tmu(c);
- if (c->devinfo->ver >= 42) {
- vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
- V3D_QPU_WADDR_SYNCB));
- } else {
- struct qinst *sync =
- vir_BARRIERID_dest(c,
- vir_reg(QFILE_MAGIC,
- V3D_QPU_WADDR_SYNCU));
- sync->uniform =
- vir_get_uniform_index(c, QUNIFORM_CONSTANT,
- 0xffffff00 |
- V3D_TSY_WAIT_INC_CHECK);
+ if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
+ if (c->s->info.stage == MESA_SHADER_COMPUTE)
+ emit_compute_barrier(c);
+ else
+ emit_barrier(c);
+ /* The blocking of a TSY op only happens at the next
+ * thread switch. No texturing may be outstanding at the
+ * time of a TSY blocking operation.
+ */
+ vir_emit_thrsw(c);
}
-
- /* The blocking of a TSY op only happens at the next thread
- * switch. No texturing may be outstanding at the time of a
- * TSY blocking operation.
- */
- vir_emit_thrsw(c);
break;
case nir_intrinsic_load_num_workgroups:
for (int i = 0; i < 3; i++) {
- ntq_store_dest(c, &instr->dest, i,
- vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
- i));
+ ntq_store_def(c, &instr->def, i,
+ vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
+ i));
}
break;
case nir_intrinsic_load_workgroup_id: {
struct qreg x = vir_AND(c, c->cs_payload[0],
vir_uniform_ui(c, 0xffff));
+ ntq_store_def(c, &instr->def, 0, x);
struct qreg y = vir_SHR(c, c->cs_payload[0],
vir_uniform_ui(c, 16));
+ ntq_store_def(c, &instr->def, 1, y);
struct qreg z = vir_AND(c, c->cs_payload[1],
vir_uniform_ui(c, 0xffff));
+ ntq_store_def(c, &instr->def, 2, z);
+ break;
+ }
- /* We only support dispatch base in Vulkan */
- if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {
- x = vir_ADD(c, x,
- vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0));
- y = vir_ADD(c, y,
- vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1));
- z = vir_ADD(c, z,
- vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2));
- }
+ case nir_intrinsic_load_base_workgroup_id: {
+ struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0);
+ ntq_store_def(c, &instr->def, 0, x);
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x));
- ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y));
- ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z));
+ struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1);
+ ntq_store_def(c, &instr->def, 1, y);
+
+ struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2);
+ ntq_store_def(c, &instr->def, 2, z);
break;
}
case nir_intrinsic_load_local_invocation_index:
- ntq_store_dest(c, &instr->dest, 0,
- emit_load_local_invocation_index(c));
+ ntq_store_def(c, &instr->def, 0,
+ emit_load_local_invocation_index(c));
break;
case nir_intrinsic_load_subgroup_id: {
/* This is basically the batch index, which is the Local
                 * Invocation Index divided by the SIMD width.
*/
- STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));
+ STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0);
const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
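+                /* E.g. with a 16-channel SIMD width this works out to a
+                 * right shift by 4.
+                 */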
struct qreg lii = emit_load_local_invocation_index(c);
- ntq_store_dest(c, &instr->dest, 0,
- vir_SHR(c, lii,
- vir_uniform_ui(c, divide_shift)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_SHR(c, lii,
+ vir_uniform_ui(c, divide_shift)));
break;
}
@@ -3143,8 +3737,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
struct qreg col = ntq_get_src(c, instr->src[0], 0);
for (int i = 0; i < instr->num_components; i++) {
struct qreg row = vir_uniform_ui(c, row_idx++);
- ntq_store_dest(c, &instr->dest, i,
- vir_LDVPMG_IN(c, row, col));
+ ntq_store_def(c, &instr->def, i,
+ vir_LDVPMG_IN(c, row, col));
}
break;
}
@@ -3160,47 +3754,47 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
* using ldvpm(v,d)_in (See Table 71).
*/
assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
- ntq_store_dest(c, &instr->dest, 0,
- vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
break;
}
case nir_intrinsic_load_invocation_id:
- ntq_store_dest(c, &instr->dest, 0, vir_IID(c));
+ ntq_store_def(c, &instr->def, 0, vir_IID(c));
break;
case nir_intrinsic_load_fb_layers_v3d:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
break;
case nir_intrinsic_load_sample_id:
- ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c));
+ ntq_store_def(c, &instr->def, 0, vir_SAMPID(c));
break;
case nir_intrinsic_load_sample_pos:
- ntq_store_dest(c, &instr->dest, 0,
- vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
- ntq_store_dest(c, &instr->dest, 1,
- vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
+ ntq_store_def(c, &instr->def, 0,
+ vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
+ ntq_store_def(c, &instr->def, 1,
+ vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
break;
case nir_intrinsic_load_barycentric_at_offset:
- ntq_store_dest(c, &instr->dest, 0,
- vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
- ntq_store_dest(c, &instr->dest, 1,
- vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
+ ntq_store_def(c, &instr->def, 1,
+ vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
break;
case nir_intrinsic_load_barycentric_pixel:
- ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
- ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
+ ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
+ ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
break;
case nir_intrinsic_load_barycentric_at_sample: {
if (!c->fs_key->msaa) {
- ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
- ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
+ ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
+ ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
return;
}
@@ -3208,8 +3802,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
- ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
+ ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
break;
}
@@ -3219,18 +3813,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
struct qreg offset_y =
vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
- ntq_store_dest(c, &instr->dest, 0,
- vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
- ntq_store_dest(c, &instr->dest, 1,
- vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
+ ntq_store_def(c, &instr->def, 0,
+ vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
+ ntq_store_def(c, &instr->def, 1,
+ vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
break;
}
case nir_intrinsic_load_barycentric_centroid: {
struct qreg offset_x, offset_y;
ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
- ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
- ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
+ ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
break;
}
@@ -3249,8 +3843,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
*/
if (!c->fs_key->msaa ||
c->interp[input_idx].vp.file == QFILE_NULL) {
- ntq_store_dest(c, &instr->dest, i,
- vir_MOV(c, c->inputs[input_idx]));
+ ntq_store_def(c, &instr->def, i,
+ vir_MOV(c, c->inputs[input_idx]));
continue;
}
@@ -3268,30 +3862,150 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_emit_load_interpolated_input(c, p, C,
offset_x, offset_y,
interp_mode);
- ntq_store_dest(c, &instr->dest, i, result);
+ ntq_store_def(c, &instr->def, i, result);
}
break;
}
case nir_intrinsic_load_subgroup_size:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform_ui(c, V3D_CHANNELS));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform_ui(c, V3D_CHANNELS));
break;
case nir_intrinsic_load_subgroup_invocation:
- ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
+ ntq_store_def(c, &instr->def, 0, vir_EIDX(c));
break;
case nir_intrinsic_elect: {
- set_a_flags_for_subgroup(c);
- struct qreg first = vir_FLAFIRST(c);
+ struct qreg first;
+ if (vir_in_nonuniform_control_flow(c)) {
+ /* Sets A=1 for lanes enabled in the execution mask */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ /* Updates A ANDing with lanes enabled in MSF */
+ vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()),
+ V3D_QPU_UF_ANDNZ);
+ first = vir_FLAFIRST(c);
+ } else {
+ /* Sets A=1 for inactive lanes */
+ vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()),
+ V3D_QPU_PF_PUSHZ);
+ first = vir_FLNAFIRST(c);
+ }
- /* Produce a boolean result from Flafirst */
+ /* Produce a boolean result */
vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
first, vir_uniform_ui(c, 1)),
V3D_QPU_PF_PUSHZ);
struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
- ntq_store_dest(c, &instr->dest, 0, result);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_ballot: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
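+                /* Predicate the ballot with the subgroup condition so that
+                 * terminated or flow-disabled lanes don't contribute.
+                 */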
+ vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
+ break;
+ }
+
+ case nir_intrinsic_read_invocation: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ struct qreg index = ntq_get_src(c, instr->src[1], 0);
+ struct qreg res = vir_SHUFFLE(c, value, index);
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
+ break;
+ }
+
+ case nir_intrinsic_read_first_invocation: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
+ break;
+ }
+
+ case nir_intrinsic_shuffle: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ struct qreg indices = ntq_get_src(c, instr->src[1], 0);
+ struct qreg res = vir_SHUFFLE(c, value, indices);
+ ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
+ break;
+ }
+
+ case nir_intrinsic_vote_feq:
+ case nir_intrinsic_vote_ieq: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
+ vir_ALLEQ_dest(c, res, value) :
+ vir_ALLFEQ_dest(c, res, value),
+ cond);
+
+ /* Produce boolean result */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_vote_all: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+ /* We want to check if 'all lanes are equal (alleq != 0) and
+ * their value is True (value != 0)'.
+ *
+ * The first MOV.pushz generates predicate for 'alleq == 0'.
+ * The second MOV.NORZ generates predicate for:
+         * '!(alleq == 0) & !(value == 0)'.
+ */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+ V3D_QPU_UF_NORZ);
+ struct qreg result =
+ ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
+ ntq_store_def(c, &instr->def, 0, result);
+ break;
+ }
+
+ case nir_intrinsic_vote_any: {
+ assert(c->devinfo->ver >= 71);
+ struct qreg value = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
+ struct qreg res = vir_get_temp(c);
+ vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
+
+        /* We want to check 'not (all lanes are equal (alleq != 0) and
+         * their value is False (value == 0))'.
+         *
+         * The first MOV.pushz generates predicate for 'alleq == 0'.
+         * The second MOV.NORNZ generates predicate for:
+         * '!(alleq == 0) & (value == 0)'.
+         * The IFNA condition negates the predicate when evaluated:
+         * '!(!(alleq == 0) & (value == 0))'
+ */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
+ V3D_QPU_UF_NORNZ);
+ struct qreg result =
+ ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
+ ntq_store_def(c, &instr->def, 0, result);
break;
}
@@ -3300,8 +4014,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_view_index:
- ntq_store_dest(c, &instr->dest, 0,
- vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
+ ntq_store_def(c, &instr->def, 0,
+ vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
break;
default:
@@ -3329,6 +4043,36 @@ ntq_activate_execute_for_block(struct v3d_compile *c)
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
}
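+/* Returns true if the block contains at most a couple of trivial ALU, undef
+ * or load_const instructions (register access intrinsics don't count), so
+ * predicating it is likely cheaper than branching around it.
+ */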
+static bool
+is_cheap_block(nir_block *block)
+{
+ int32_t cost = 3;
+ nir_foreach_instr(instr, block) {
+ switch (instr->type) {
+ case nir_instr_type_alu:
+ case nir_instr_type_undef:
+ case nir_instr_type_load_const:
+ if (--cost <= 0)
+ return false;
+ break;
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ continue;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
static void
ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
{
@@ -3473,15 +4217,27 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
c->execute,
vir_uniform_ui(c, else_block->index));
- /* Jump to ELSE if nothing is active for THEN, otherwise fall
- * through.
+ /* Set the flags for taking the THEN block */
+ vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+
+ /* Jump to ELSE if nothing is active for THEN (unless THEN block is
+ * so small it won't pay off), otherwise fall through.
*/
- vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
- vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
- vir_link_blocks(c->cur_block, else_block);
+ bool is_cheap = exec_list_is_singular(&if_stmt->then_list) &&
+ is_cheap_block(nir_if_first_then_block(if_stmt));
+ if (!is_cheap) {
+ vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
+ vir_link_blocks(c->cur_block, else_block);
+ }
vir_link_blocks(c->cur_block, then_block);
- /* Process the THEN block. */
+ /* Process the THEN block.
+ *
+ * Notice we don't call ntq_activate_execute_for_block here on purpose:
+ * c->execute is already set up to be 0 for lanes that must take the
+ * THEN block.
+ */
vir_set_emit_block(c, then_block);
ntq_emit_cf_list(c, &if_stmt->then_list);
@@ -3495,13 +4251,19 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, after_block->index));
- /* If everything points at ENDIF, then jump there immediately. */
- vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
- c->execute,
- vir_uniform_ui(c, after_block->index)),
- V3D_QPU_PF_PUSHZ);
- vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
- vir_link_blocks(c->cur_block, after_block);
+ /* If everything points at ENDIF, then jump there immediately
+ * (unless ELSE block is so small it won't pay off).
+ */
+ bool is_cheap = exec_list_is_singular(&if_stmt->else_list) &&
+ is_cheap_block(nir_else_block);
+ if (!is_cheap) {
+ vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
+ c->execute,
+ vir_uniform_ui(c, after_block->index)),
+ V3D_QPU_PF_PUSHZ);
+ vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
+ vir_link_blocks(c->cur_block, after_block);
+ }
vir_link_blocks(c->cur_block, else_block);
vir_set_emit_block(c, else_block);
@@ -3605,7 +4367,7 @@ ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
ntq_emit_load_const(c, nir_instr_as_load_const(instr));
break;
- case nir_instr_type_ssa_undef:
+ case nir_instr_type_undef:
unreachable("Should've been lowered by nir_lower_undef_to_zero");
break;
@@ -3699,7 +4461,6 @@ ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
static void
ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
{
-
c->loop_cont_block = vir_new_block(c);
c->loop_break_block = vir_new_block(c);
@@ -3719,6 +4480,25 @@ ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
static void
ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
{
+ assert(!nir_loop_has_continue_construct(loop));
+
+ /* Disable flags optimization for loop conditions. The problem here is
+ * that we can have code like this:
+ *
+ * // block_0
+ * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2
+ * loop {
+ * // block_1
+ * if ssa_9 {
+ *
+ * In this example we emit flags to compute ssa_9 and the optimization
+         * will skip regenerating them for the loop condition in the
+ * loop continue block (block_1). However, this is not safe after the
+ * first iteration because the loop body can stomp the flags if it has
+ * any conditionals.
+ */
+ c->flags_temp = -1;
+
bool was_in_control_flow = c->in_control_flow;
c->in_control_flow = true;
@@ -3777,7 +4557,7 @@ ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
static void
ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
{
- ntq_setup_registers(c, &impl->registers);
+ ntq_setup_registers(c, impl);
ntq_emit_cf_list(c, &impl->body);
}
@@ -3786,7 +4566,12 @@ nir_to_vir(struct v3d_compile *c)
{
switch (c->s->info.stage) {
case MESA_SHADER_FRAGMENT:
- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->start_msf = vir_MSF(c);
+ if (c->devinfo->ver < 71)
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ else
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
+
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
@@ -3799,25 +4584,16 @@ nir_to_vir(struct v3d_compile *c)
emit_fragment_varying(c, NULL, -1, 0, 0);
}
- if (c->fs_key->is_points &&
- (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
+ if (c->fs_key->is_points && program_reads_point_coord(c)) {
c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
c->uses_implicit_point_line_varyings = true;
} else if (c->fs_key->is_lines &&
- (c->devinfo->ver < 40 ||
- BITSET_TEST(c->s->info.system_values_read,
+ (BITSET_TEST(c->s->info.system_values_read,
SYSTEM_VALUE_LINE_COORD))) {
c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
c->uses_implicit_point_line_varyings = true;
}
-
- c->force_per_sample_msaa =
- c->s->info.fs.uses_sample_qualifier ||
- BITSET_TEST(c->s->info.system_values_read,
- SYSTEM_VALUE_SAMPLE_ID) ||
- BITSET_TEST(c->s->info.system_values_read,
- SYSTEM_VALUE_SAMPLE_POS);
break;
case MESA_SHADER_COMPUTE:
/* Set up the TSO for barriers, assuming we do some. */
@@ -3826,8 +4602,13 @@ nir_to_vir(struct v3d_compile *c)
V3D_QPU_WADDR_SYNC));
}
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ if (c->devinfo->ver == 42) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ } else if (c->devinfo->ver >= 71) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ }
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
@@ -3889,7 +4670,7 @@ nir_to_vir(struct v3d_compile *c)
/* Find the main function and emit the body. */
nir_foreach_function(function, c->s) {
- assert(strcmp(function->name, "main") == 0);
+ assert(function->is_entrypoint);
assert(function->impl);
ntq_emit_impl(c, function->impl);
}
@@ -3932,25 +4713,12 @@ vir_emit_last_thrsw(struct v3d_compile *c,
{
*restore_last_thrsw = c->last_thrsw;
- /* On V3D before 4.1, we need a TMU op to be outstanding when thread
- * switching, so disable threads if we didn't do any TMU ops (each of
- * which would have emitted a THRSW).
- */
- if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
- c->threads = 1;
- if (c->last_thrsw)
- vir_remove_thrsw(c);
- *restore_last_thrsw = NULL;
- }
-
/* If we're threaded and the last THRSW was in conditional code, then
* we need to emit another one so that we can flag it as the last
* thrsw.
*/
- if (c->last_thrsw && !c->last_thrsw_at_top_level) {
- assert(c->devinfo->ver >= 41);
+ if (c->last_thrsw && !c->last_thrsw_at_top_level)
vir_emit_thrsw(c);
- }
/* If we're threaded, then we need to mark the last THRSW instruction
* so we can emit a pair of them at QPU emit time.
@@ -3958,10 +4726,8 @@ vir_emit_last_thrsw(struct v3d_compile *c,
* For V3D 4.x, we can spawn the non-fragment shaders already in the
* post-last-THRSW state, so we can skip this.
*/
- if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
- assert(c->devinfo->ver >= 41);
+ if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT)
vir_emit_thrsw(c);
- }
/* If we have not inserted a last thread switch yet, do it now to ensure
* any potential spilling we do happens before this. If we don't spill
@@ -4006,8 +4772,8 @@ vir_check_payload_w(struct v3d_compile *c)
vir_for_each_inst_inorder(inst, c) {
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_REG &&
- inst->src[i].index == 0) {
+ if (inst->src[i].file == c->payload_w.file &&
+ inst->src[i].index == c->payload_w.index) {
c->uses_center_w = true;
return;
}
@@ -4018,8 +4784,8 @@ vir_check_payload_w(struct v3d_compile *c)
void
v3d_nir_to_vir(struct v3d_compile *c)
{
- if (V3D_DEBUG & (V3D_DEBUG_NIR |
- v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
+ if (V3D_DBG(NIR) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
fprintf(stderr, "%s prog %d/%d NIR:\n",
vir_get_stage_name(c),
c->program_id, c->variant_id);
@@ -4053,8 +4819,8 @@ v3d_nir_to_vir(struct v3d_compile *c)
unreachable("bad stage");
}
- if (V3D_DEBUG & (V3D_DEBUG_VIR |
- v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
+ if (V3D_DBG(VIR) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
vir_get_stage_name(c),
c->program_id, c->variant_id);
@@ -4075,8 +4841,8 @@ v3d_nir_to_vir(struct v3d_compile *c)
* instructions until the results are needed.
*/
- if (V3D_DEBUG & (V3D_DEBUG_VIR |
- v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
+ if (V3D_DBG(VIR) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
fprintf(stderr, "%s prog %d/%d VIR:\n",
vir_get_stage_name(c),
c->program_id, c->variant_id);
@@ -4087,19 +4853,17 @@ v3d_nir_to_vir(struct v3d_compile *c)
/* Attempt to allocate registers for the temporaries. If we fail,
* reduce thread count and try again.
*/
- int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
+ int min_threads = 2;
struct qpu_reg *temp_registers;
while (true) {
- bool spilled;
- temp_registers = v3d_register_allocate(c, &spilled);
- if (spilled)
- continue;
-
- if (temp_registers)
+ temp_registers = v3d_register_allocate(c);
+ if (temp_registers) {
+ assert(c->spills + c->fills <= c->max_tmu_spills);
break;
+ }
if (c->threads == min_threads &&
- (V3D_DEBUG & V3D_DEBUG_RA)) {
+ V3D_DBG(RA)) {
fprintf(stderr,
"Failed to register allocate using %s\n",
c->fallback_scheduler ? "the fallback scheduler:" :
@@ -4116,18 +4880,20 @@ v3d_nir_to_vir(struct v3d_compile *c)
}
if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
- if (V3D_DEBUG & V3D_DEBUG_PERF) {
+ if (V3D_DBG(PERF)) {
fprintf(stderr,
- "Failed to register allocate %s at "
- "%d threads.\n", vir_get_stage_name(c),
- c->threads);
+ "Failed to register allocate %s "
+ "prog %d/%d at %d threads.\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id, c->threads);
}
c->compilation_result =
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
return;
}
- c->spill_count = 0;
+ c->spills = 0;
+ c->fills = 0;
c->threads /= 2;
if (c->threads == 1)
@@ -4141,8 +4907,8 @@ v3d_nir_to_vir(struct v3d_compile *c)
vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock);
if (c->spills &&
- (V3D_DEBUG & (V3D_DEBUG_VIR |
- v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
+ (V3D_DBG(VIR) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
vir_get_stage_name(c),
c->program_id, c->variant_id);
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index c559814b9ea..ba76ac87e1e 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -85,6 +85,7 @@ struct schedule_state {
struct schedule_node *last_unif;
struct schedule_node *last_rtop;
struct schedule_node *last_unifa;
+ struct schedule_node *last_setmsf;
enum direction dir;
/* Estimated cycle when the current instruction would start. */
uint32_t time;
@@ -97,7 +98,7 @@ add_dep(struct schedule_state *state,
bool write)
{
bool write_after_read = !write && state->dir == R;
- void *edge_data = (void *)(uintptr_t)write_after_read;
+ uintptr_t edge_data = write_after_read;
if (!before || !after)
return;
@@ -136,12 +137,14 @@ qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (inst->alu.add.magic_write &&
+ if (inst->alu.add.op != V3D_QPU_A_NOP &&
+ inst->alu.add.magic_write &&
(inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
return true;
- if (inst->alu.mul.magic_write &&
+ if (inst->alu.mul.op != V3D_QPU_M_NOP &&
+ inst->alu.mul.magic_write &&
(inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
return true;
@@ -153,12 +156,13 @@ static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
enum v3d_qpu_mux mux)
{
+ assert(state->devinfo->ver < 71);
switch (mux) {
case V3D_QPU_MUX_A:
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
break;
case V3D_QPU_MUX_B:
- if (!n->inst->qpu.sig.small_imm) {
+ if (!n->inst->qpu.sig.small_imm_b) {
add_read_dep(state,
state->last_rf[n->inst->qpu.raddr_b], n);
}
@@ -169,6 +173,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
}
}
+
+static void
+process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
+ uint8_t raddr, bool is_small_imm)
+{
+ assert(state->devinfo->ver >= 71);
+
+ if (!is_small_imm)
+ add_read_dep(state, state->last_rf[raddr], n);
+}
+
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
@@ -188,9 +203,6 @@ tmu_write_is_sequence_terminator(uint32_t waddr)
static bool
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
{
- if (devinfo->ver < 40)
- return false;
-
if (tmu_write_is_sequence_terminator(waddr))
return false;
@@ -253,8 +265,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
break;
case V3D_QPU_WADDR_UNIFA:
- if (state->devinfo->ver >= 40)
- add_write_dep(state, &state->last_unifa, n);
+ add_write_dep(state, &state->last_unifa, n);
break;
case V3D_QPU_WADDR_NOP:
@@ -283,6 +294,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* If the input and output segments are shared, then all VPM reads to
* a location need to happen before all writes. We handle this by
* serializing all VPM operations for now.
+ *
+         * FIXME: we are assuming that the segments are shared. That is
+         * correct right now because we only use shared segments, but
+         * technically they can be configured separately.
*/
bool separate_vpm_segment = false;
@@ -303,15 +318,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* XXX: LOAD_IMM */
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
- process_mux_deps(state, n, inst->alu.add.a);
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
- process_mux_deps(state, n, inst->alu.add.b);
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.a.raddr,
+ inst->sig.small_imm_a);
+ }
+ }
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.b.raddr,
+ inst->sig.small_imm_b);
+ }
+ }
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
- process_mux_deps(state, n, inst->alu.mul.a);
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
- process_mux_deps(state, n, inst->alu.mul.b);
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.a.raddr,
+ inst->sig.small_imm_c);
+ }
+ }
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.b.raddr,
+ inst->sig.small_imm_d);
+ }
+ }
switch (inst->alu.add.op) {
case V3D_QPU_A_VPMSETUP:
@@ -340,13 +379,24 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
case V3D_QPU_A_MSF:
add_read_dep(state, state->last_tlb, n);
+ add_read_dep(state, state->last_setmsf, n);
break;
case V3D_QPU_A_SETMSF:
+ add_write_dep(state, &state->last_setmsf, n);
+ add_write_dep(state, &state->last_tmu_write, n);
+ FALLTHROUGH;
case V3D_QPU_A_SETREVF:
add_write_dep(state, &state->last_tlb, n);
break;
+ case V3D_QPU_A_BALLOT:
+ case V3D_QPU_A_BCASTF:
+ case V3D_QPU_A_ALLEQ:
+ case V3D_QPU_A_ALLFEQ:
+ add_read_dep(state, state->last_setmsf, n);
+ break;
+
default:
break;
}
@@ -384,6 +434,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_write_dep(state, &state->last_r[4], n);
if (v3d_qpu_writes_r5(devinfo, inst))
add_write_dep(state, &state->last_r[5], n);
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
+ add_write_dep(state, &state->last_rf[0], n);
/* If we add any more dependencies here we should consider whether we
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
@@ -492,9 +544,16 @@ struct choose_scoreboard {
int last_thrsw_tick;
int last_branch_tick;
int last_setmsf_tick;
- bool tlb_locked;
+ bool first_thrsw_emitted;
+ bool last_thrsw_emitted;
bool fixup_ldvary;
int ldvary_count;
+ int pending_ldtmu_count;
+ bool first_ldtmu_after_thrsw;
+
+ /* V3D 7.x */
+ int last_implicit_rf0_write_tick;
+ bool has_rf0_flops_conflict;
};
static bool
@@ -519,7 +578,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
}
static bool
-reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+reads_too_soon(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ switch (raddr) {
+ case 0: /* ldvary delayed write of C coefficient to rf0 */
+ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool
+reads_too_soon_after_write(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
struct qinst *qinst)
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
@@ -531,24 +607,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
if (inst->alu.add.op != V3D_QPU_A_NOP) {
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
+ return true;
+ }
}
}
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
+ return true;
+ }
}
}
@@ -572,45 +668,83 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
v3d_qpu_writes_r4(devinfo, inst))
return true;
+ if (devinfo->ver == 42)
+ return false;
+
+ /* Don't schedule anything that writes rf0 right after ldvary, since
+ * that would clash with the ldvary's delayed rf0 write (the exception
+ * is another ldvary, since its implicit rf0 write would also have
+ * one cycle of delay and would not clash).
+ */
+ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
+ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !inst->sig.ldvary))) {
+ return true;
+ }
+
return false;
}
static bool
-pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
+scoreboard_is_locked(struct choose_scoreboard *scoreboard,
+ bool lock_scoreboard_on_first_thrsw)
+{
+ if (lock_scoreboard_on_first_thrsw) {
+ return scoreboard->first_thrsw_emitted &&
+ scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
+ }
+
+ return scoreboard->last_thrsw_emitted &&
+ scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
+}
+
+static bool
+pixel_scoreboard_too_soon(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct v3d_qpu_instr *inst)
{
- return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
+ return qpu_inst_is_tlb(inst) &&
+ !scoreboard_is_locked(scoreboard,
+ c->lock_scoreboard_on_first_thrsw);
}
static bool
-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
uint32_t waddr) {
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
- inst->raddr_a == waddr)
- return true;
+ if (devinfo->ver < 71) {
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+ inst->raddr_a == waddr)
+ return true;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
- !inst->sig.small_imm && (inst->raddr_b == waddr))
- return true;
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+ return true;
+ } else {
+ if (v3d71_qpu_reads_raddr(inst, waddr))
+ return true;
+ }
return false;
}
static bool
-mux_read_stalls(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst)
+read_stalls(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
{
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
- qpu_instruction_uses_rf(inst,
+ qpu_instruction_uses_rf(devinfo, inst,
scoreboard->last_stallable_sfu_reg);
}
/* We define a max schedule priority to allow negative priorities as result of
- * substracting this max when an instruction stalls. So instructions that
+ * subtracting this max when an instruction stalls. So instructions that
* stall have lower priority than regular instructions. */
#define MAX_SCHEDULE_PRIORITY 16
@@ -628,19 +762,32 @@ get_instruction_priority(const struct v3d_device_info *devinfo,
return next_score;
next_score++;
+ /* Empirical testing shows that using priorities to hide latency of
+ * TMU operations when scheduling QPU leads to slightly worse
+ * performance, even at 2 threads. We think this is because the thread
+         * switching is already quite effective at hiding latency, and NIR
+         * scheduling (and possibly TMU pipelining too) is sufficient to hide
+ * TMU latency, so piling up on that here doesn't provide any benefits
+ * and instead may cause us to postpone critical paths that depend on
+ * the TMU results.
+ */
+#if 0
/* Schedule texture read results collection late to hide latency. */
if (v3d_qpu_waits_on_tmu(inst))
return next_score;
next_score++;
+#endif
/* Default score for things that aren't otherwise special. */
baseline_score = next_score;
next_score++;
+#if 0
/* Schedule texture read setup early to hide their latency better. */
if (v3d_qpu_writes_tmu(devinfo, inst))
return next_score;
next_score++;
+#endif
/* We should increase the maximum if we assert here */
assert(next_score < MAX_SCHEDULE_PRIORITY);
@@ -648,48 +795,59 @@ get_instruction_priority(const struct v3d_device_info *devinfo,
return baseline_score;
}
-static bool
-qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo,
- enum v3d_qpu_waddr waddr)
-{
- return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) ||
- v3d_qpu_magic_waddr_is_sfu(waddr) ||
- v3d_qpu_magic_waddr_is_tlb(waddr) ||
- v3d_qpu_magic_waddr_is_vpm(waddr) ||
- v3d_qpu_magic_waddr_is_tsy(waddr));
-}
+enum {
+ V3D_PERIPHERAL_VPM_READ = (1 << 0),
+ V3D_PERIPHERAL_VPM_WRITE = (1 << 1),
+ V3D_PERIPHERAL_VPM_WAIT = (1 << 2),
+ V3D_PERIPHERAL_SFU = (1 << 3),
+ V3D_PERIPHERAL_TMU_WRITE = (1 << 4),
+ V3D_PERIPHERAL_TMU_READ = (1 << 5),
+ V3D_PERIPHERAL_TMU_WAIT = (1 << 6),
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
+ V3D_PERIPHERAL_TSY = (1 << 8),
+ V3D_PERIPHERAL_TLB_READ = (1 << 9),
+ V3D_PERIPHERAL_TLB_WRITE = (1 << 10),
+};
-static bool
-qpu_accesses_peripheral(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *inst)
+static uint32_t
+qpu_peripherals(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst)
{
- if (v3d_qpu_uses_vpm(inst))
- return true;
+ uint32_t result = 0;
+ if (v3d_qpu_reads_vpm(inst))
+ result |= V3D_PERIPHERAL_VPM_READ;
+ if (v3d_qpu_writes_vpm(inst))
+ result |= V3D_PERIPHERAL_VPM_WRITE;
+ if (v3d_qpu_waits_vpm(inst))
+ result |= V3D_PERIPHERAL_VPM_WAIT;
+
+ if (v3d_qpu_writes_tmu(devinfo, inst))
+ result |= V3D_PERIPHERAL_TMU_WRITE;
+ if (inst->sig.ldtmu)
+ result |= V3D_PERIPHERAL_TMU_READ;
+ if (inst->sig.wrtmuc)
+ result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;
+
if (v3d_qpu_uses_sfu(inst))
- return true;
+ result |= V3D_PERIPHERAL_SFU;
+
+ if (v3d_qpu_reads_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_READ;
+ if (v3d_qpu_writes_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_WRITE;
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
- qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) {
- return true;
+ v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
+ result |= V3D_PERIPHERAL_TSY;
}
if (inst->alu.add.op == V3D_QPU_A_TMUWT)
- return true;
-
- if (inst->alu.mul.op != V3D_QPU_M_NOP &&
- inst->alu.mul.magic_write &&
- qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) {
- return true;
- }
+ result |= V3D_PERIPHERAL_TMU_WAIT;
}
- return (inst->sig.ldvpm ||
- inst->sig.ldtmu ||
- inst->sig.ldtlb ||
- inst->sig.ldtlbu ||
- inst->sig.wrtmuc);
+ return result;
}
static bool
@@ -697,30 +855,82 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *a,
const struct v3d_qpu_instr *b)
{
- const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a);
- const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b);
+ const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
+ const uint32_t b_peripherals = qpu_peripherals(devinfo, b);
/* We can always do one peripheral access per instruction. */
- if (!a_uses_peripheral || !b_uses_peripheral)
+ if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
return true;
- if (devinfo->ver < 41)
+ /* V3D 4.x can't do more than one peripheral access except in a
+ * few cases:
+ */
+ if (devinfo->ver == 42) {
+ /* WRTMUC signal with TMU register write (other than tmuc). */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ }
+
+ /* TMU read with VPM read/write. */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+
return false;
+ }
- /* V3D 4.1 and later allow TMU read along with a VPM read or write, and
- * WRTMUC with a TMU magic register write (other than tmuc).
- */
- if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) ||
- (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) {
- return true;
+ /* V3D 7.x can't have more than one of these restricted peripherals */
+ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG |
+ V3D_PERIPHERAL_TSY |
+ V3D_PERIPHERAL_TLB_READ |
+ V3D_PERIPHERAL_SFU |
+ V3D_PERIPHERAL_VPM_READ |
+ V3D_PERIPHERAL_VPM_WRITE;
+
+ const uint32_t a_restricted = a_peripherals & restricted;
+ const uint32_t b_restricted = b_peripherals & restricted;
+ if (a_restricted && b_restricted) {
+ /* WRTMUC signal with TMU register write (other than tmuc) is
+ * allowed though.
+ */
+ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
+ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
+ return false;
+ }
}
- if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
- (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) {
- return true;
+ /* Only one TMU read per instruction */
+ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
+ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
+ return false;
}
- return false;
+ /* Only one TLB access per instruction */
+ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ)) &&
+ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ))) {
+ return false;
+ }
+
+ return true;
}
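/* Illustrative pairs for the classification above (not part of the patch):
 * an ldtmu paired with a VPM read yields TMU_READ + VPM_READ, which V3D 4.2
 * accepts through its explicit exception, while a wrtmuc signal paired with
 * a write to tmuc itself is rejected because the wrtmuc exception requires
 * the TMU write to target something other than tmuc.
 */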
/* Compute a bitmask of which rf registers are used between
@@ -736,42 +946,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
uint64_t raddrs_used = 0;
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
raddrs_used |= (1ll << a->raddr_a);
- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
+ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
raddrs_used |= (1ll << a->raddr_b);
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
raddrs_used |= (1ll << b->raddr_a);
- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
+ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
raddrs_used |= (1ll << b->raddr_b);
return raddrs_used;
}
-/* Take two instructions and attempt to merge their raddr fields
- * into one merged instruction. Returns false if the two instructions
- * access more than two different rf registers between them, or more
- * than one rf register and one small immediate.
+/* Takes two instructions and attempts to merge their raddr fields (including
+ * small immediates) into one merged instruction. For V3D 4.x, returns false
+ * if the two instructions access more than two different rf registers between
+ * them, or more than one rf register and one small immediate. For 7.x returns
+ * false if both instructions use small immediates.
*/
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *add_instr,
- const struct v3d_qpu_instr *mul_instr)
+ const struct v3d_qpu_instr *mul_instr,
+ const struct v3d_device_info *devinfo)
{
+ if (devinfo->ver >= 71) {
+ assert(add_instr->sig.small_imm_a +
+ add_instr->sig.small_imm_b <= 1);
+ assert(add_instr->sig.small_imm_c +
+ add_instr->sig.small_imm_d == 0);
+ assert(mul_instr->sig.small_imm_a +
+ mul_instr->sig.small_imm_b == 0);
+ assert(mul_instr->sig.small_imm_c +
+ mul_instr->sig.small_imm_d <= 1);
+
+ result->sig.small_imm_a = add_instr->sig.small_imm_a;
+ result->sig.small_imm_b = add_instr->sig.small_imm_b;
+ result->sig.small_imm_c = mul_instr->sig.small_imm_c;
+ result->sig.small_imm_d = mul_instr->sig.small_imm_d;
+
+ return (result->sig.small_imm_a +
+ result->sig.small_imm_b +
+ result->sig.small_imm_c +
+ result->sig.small_imm_d) <= 1;
+ }
+
+ assert(devinfo->ver == 42);
+
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
int naddrs = util_bitcount64(raddrs_used);
if (naddrs > 2)
return false;
- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
+ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
if (naddrs > 1)
return false;
- if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
+ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
if (add_instr->raddr_b != mul_instr->raddr_b)
return false;
- result->sig.small_imm = true;
- result->raddr_b = add_instr->sig.small_imm ?
+ result->sig.small_imm_b = true;
+ result->raddr_b = add_instr->sig.small_imm_b ?
add_instr->raddr_b : mul_instr->raddr_b;
}
@@ -782,23 +1017,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
raddrs_used &= ~(1ll << raddr_a);
result->raddr_a = raddr_a;
- if (!result->sig.small_imm) {
+ if (!result->sig.small_imm_b) {
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
raddr_a == add_instr->raddr_b) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_B)
- result->alu.add.a = V3D_QPU_MUX_A;
- if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
+ result->alu.add.a.mux = V3D_QPU_MUX_A;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_A;
+ result->alu.add.b.mux = V3D_QPU_MUX_A;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
raddr_a == mul_instr->raddr_b) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
- result->alu.mul.a = V3D_QPU_MUX_A;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
+ result->alu.mul.a.mux = V3D_QPU_MUX_A;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_A;
+ result->alu.mul.b.mux = V3D_QPU_MUX_A;
}
}
}
@@ -809,20 +1044,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
result->raddr_b = raddr_b;
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
raddr_b == add_instr->raddr_a) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_A)
- result->alu.add.a = V3D_QPU_MUX_B;
- if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
+ result->alu.add.a.mux = V3D_QPU_MUX_B;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_B;
+ result->alu.add.b.mux = V3D_QPU_MUX_B;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
raddr_b == mul_instr->raddr_a) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
- result->alu.mul.a = V3D_QPU_MUX_B;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
+ result->alu.mul.a.mux = V3D_QPU_MUX_B;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_B;
+ result->alu.mul.b.mux = V3D_QPU_MUX_B;
}
}
@@ -855,7 +1090,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op)
}
static void
-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst)
{
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
assert(inst->alu.add.op != V3D_QPU_A_NOP);
@@ -871,6 +1107,87 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
inst->flags.ac = V3D_QPU_COND_NONE;
inst->flags.apf = V3D_QPU_PF_NONE;
inst->flags.auf = V3D_QPU_UF_NONE;
+
+ inst->alu.mul.output_pack = inst->alu.add.output_pack;
+
+ inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
+ inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
+ inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ if (devinfo->ver >= 71) {
+ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
+ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
+ if (inst->sig.small_imm_a) {
+ inst->sig.small_imm_c = true;
+ inst->sig.small_imm_a = false;
+ } else if (inst->sig.small_imm_b) {
+ inst->sig.small_imm_d = true;
+ inst->sig.small_imm_b = false;
+ }
+ }
+}
+
+static bool
+can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ case V3D_QPU_M_FMOV:
+ return devinfo->ver >= 71;
+ default:
+ return false;
+ }
+}
+
+static enum v3d_qpu_mul_op
+mul_op_as_add_op(enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ return V3D_QPU_A_MOV;
+ case V3D_QPU_M_FMOV:
+ return V3D_QPU_A_FMOV;
+ default:
+ unreachable("unexpected mov opcode");
+ }
+}
+
+static void
+qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
+{
+ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
+ assert(inst->alu.mul.op != V3D_QPU_M_NOP);
+ assert(inst->alu.add.op == V3D_QPU_A_NOP);
+
+ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
+ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
+ inst->alu.mul.op = V3D_QPU_M_NOP;
+
+ inst->flags.ac = inst->flags.mc;
+ inst->flags.apf = inst->flags.mpf;
+ inst->flags.auf = inst->flags.muf;
+ inst->flags.mc = V3D_QPU_COND_NONE;
+ inst->flags.mpf = V3D_QPU_PF_NONE;
+ inst->flags.muf = V3D_QPU_UF_NONE;
+
+ inst->alu.add.output_pack = inst->alu.mul.output_pack;
+ inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
+ inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
+ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
+ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
+ if (inst->sig.small_imm_c) {
+ inst->sig.small_imm_a = true;
+ inst->sig.small_imm_c = false;
+ } else if (inst->sig.small_imm_d) {
+ inst->sig.small_imm_b = true;
+ inst->sig.small_imm_d = false;
+ }
}
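/* Illustrative example (not part of the patch): on V3D 7.x, if one
 * instruction only uses the mul ALU (say an fmul) and the other is a plain
 * mov also issued on the mul ALU, they could not be merged as-is. Converting
 * the mov from M_MOV to A_MOV with the helper above frees the mul ALU so the
 * pair can be packed into one QPU instruction, with flags and small
 * immediates migrated from the mul to the add side as shown.
 */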
static bool
@@ -909,20 +1226,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(b->alu.add.op)) {
mul_inst = *b;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge.alu.mul = mul_inst.alu.mul;
- merge.flags.mc = b->flags.ac;
- merge.flags.mpf = b->flags.apf;
- merge.flags.muf = b->flags.auf;
+ merge.flags.mc = mul_inst.flags.mc;
+ merge.flags.mpf = mul_inst.flags.mpf;
+ merge.flags.muf = mul_inst.flags.muf;
add_instr = a;
mul_instr = &mul_inst;
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(a->alu.add.op)) {
mul_inst = *a;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge = mul_inst;
merge.alu.add = b->alu.add;
@@ -938,22 +1255,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
}
}
+ struct v3d_qpu_instr add_inst;
if (b->alu.mul.op != V3D_QPU_M_NOP) {
- if (a->alu.mul.op != V3D_QPU_M_NOP)
- return false;
- merge.alu.mul = b->alu.mul;
+ if (a->alu.mul.op == V3D_QPU_M_NOP) {
+ merge.alu.mul = b->alu.mul;
- merge.flags.mc = b->flags.mc;
- merge.flags.mpf = b->flags.mpf;
- merge.flags.muf = b->flags.muf;
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
- mul_instr = b;
- add_instr = a;
+ mul_instr = b;
+ add_instr = a;
+ }
+ /* If a's mul op is used but its add op is not, then see if we
+ * can convert either a's mul op or b's mul op to an add op
+ * so we can merge.
+ */
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+ add_inst = *b;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge.alu.add = add_inst.alu.add;
+
+ merge.flags.ac = add_inst.flags.ac;
+ merge.flags.apf = add_inst.flags.apf;
+ merge.flags.auf = add_inst.flags.auf;
+
+ mul_instr = a;
+ add_instr = &add_inst;
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+ add_inst = *a;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge = add_inst;
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = &add_inst;
+ } else {
+ return false;
+ }
}
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+         * they have restrictions on the number of raddrs that can be addressed
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
+ * but we are still limited to a single small immediate per instruction.
+ */
if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
- return false;
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
}
merge.sig.thrsw |= b->sig.thrsw;
@@ -964,7 +1321,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
merge.sig.ldtmu |= b->sig.ldtmu;
merge.sig.ldvary |= b->sig.ldvary;
merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm |= b->sig.small_imm;
merge.sig.ldtlb |= b->sig.ldtlb;
merge.sig.ldtlbu |= b->sig.ldtlbu;
merge.sig.ucb |= b->sig.ucb;
@@ -1047,24 +1403,25 @@ retry:
* regfile A or B that was written to by the previous
* instruction."
*/
- if (reads_too_soon_after_write(scoreboard, n->inst))
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
- /* "A scoreboard wait must not occur in the first two
- * instructions of a fragment shader. This is either the
- * explicit Wait for Scoreboard signal or an implicit wait
- * with the first tile-buffer read or write instruction."
+ /* "Before doing a TLB access a scoreboard wait must have been
+ * done. This happens either on the first or last thread
+ * switch, depending on a setting (scb_wait_on_first_thrsw) in
+ * the shader state."
*/
- if (pixel_scoreboard_too_soon(scoreboard, inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* ldunif and ldvary both write r5, but ldunif does so a tick
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
+ /* ldunif and ldvary both write the same register (r5 for v42
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
+ * If the ldvary's register wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
- * r5 in the same tick.
+ * the register in the same tick.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1131,24 +1488,54 @@ retry:
continue;
}
- /* Don't merge in something that will lock the TLB.
- * Hopwefully what we have in inst will release some
- * other instructions, allowing us to delay the
- * TLB-locking instruction until later.
+ /* Don't merge TLB instructions before we have acquired
+ * the scoreboard lock.
*/
- if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+ if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* When we succesfully pair up an ldvary we then try
+ /* When we successfully pair up an ldvary we then try
* to merge it into the previous instruction if
* possible to improve pipelining. Don't pick up the
* ldvary now if the follow-up fixup would place
* it in the delay slots of a thrsw, which is not
* allowed and would prevent the fixup from being
- * successul.
+ * successful. In V3D 7.x we can allow this to happen
+ * as long as it is not the last delay slot.
*/
- if (inst->sig.ldvary &&
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
+ if (inst->sig.ldvary) {
+ if (c->devinfo->ver == 42 &&
+ scoreboard->last_thrsw_tick + 2 >=
+ scoreboard->tick - 1) {
+ continue;
+ }
+ if (c->devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 ==
+ scoreboard->tick - 1) {
+ continue;
+ }
+ }
+
+ /* We can emit a new tmu lookup with a previous ldtmu
+ * if doing this would free just enough space in the
+                          * TMU output fifo so we don't overflow; however, this
+ * is only safe if the ldtmu cannot stall.
+ *
+ * A ldtmu can stall if it is not the first following a
+ * thread switch and corresponds to the first word of a
+ * read request.
+ *
+ * FIXME: For now we forbid pairing up a new lookup
+ * with a previous ldtmu that is not the first after a
+ * thrsw if that could overflow the TMU output fifo
+ * regardless of whether the ldtmu is reading the first
+ * word of a TMU result or not, since we don't track
+ * this aspect in the compiler yet.
+ */
+ if (prev_inst->inst->qpu.sig.ldtmu &&
+ !scoreboard->first_ldtmu_after_thrsw &&
+ (scoreboard->pending_ldtmu_count +
+ n->inst->ldtmu_count > 16 / c->threads)) {
continue;
}
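/* Worked example for the check above (illustrative, not part of the patch),
 * assuming the 16-word TMU output fifo per QPU that the "16 / c->threads"
 * expression encodes: a 2-thread shader gets 8 words and a 4-thread shader
 * gets 4, so with pending_ldtmu_count == 7 at 2 threads, a new lookup that
 * needs 2 more ldtmu reads cannot be merged here.
 */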
@@ -1161,7 +1548,7 @@ retry:
int prio = get_instruction_priority(c->devinfo, inst);
- if (mux_read_stalls(scoreboard, inst)) {
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
/* Don't merge an instruction that stalls */
if (prev_inst)
continue;
@@ -1225,7 +1612,7 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
{
if (v3d_qpu_magic_waddr_is_sfu(waddr))
scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
- else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
+ else if (waddr == V3D_QPU_WADDR_UNIFA)
scoreboard->last_unifa_write_tick = scoreboard->tick;
}
@@ -1240,10 +1627,87 @@ update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
}
static void
+update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+ const struct qinst *inst)
+{
+        /* Track if we have seen any ldtmu after the last thread switch */
+ if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
+ scoreboard->first_ldtmu_after_thrsw = true;
+
+ /* Track the number of pending ldtmu instructions for outstanding
+ * TMU lookups.
+ */
+ scoreboard->pending_ldtmu_count += inst->ldtmu_count;
+ if (inst->qpu.sig.ldtmu) {
+ assert(scoreboard->pending_ldtmu_count > 0);
+ scoreboard->pending_ldtmu_count--;
+ scoreboard->first_ldtmu_after_thrsw = false;
+ }
+}
+
+static void
+set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
+ v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ scoreboard->has_rf0_flops_conflict = true;
+ }
+}
+
+static void
+update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return;
+
+ /* Thread switch restrictions:
+ *
+ * At the point of a thread switch or thread end (when the actual
+ * thread switch or thread end happens, not when the signalling
+ * instruction is processed):
+ *
+ * - If the most recent write to rf0 was from a ldunif, ldunifa, or
+ * ldvary instruction in which another signal also wrote to the
+ * register file, and the final instruction of the thread section
+ * contained a signal which wrote to the register file, then the
+ * value of rf0 is undefined at the start of the new section
+ *
+ * Here we use the scoreboard to track if our last rf0 implicit write
+ * happens at the same time that another signal writes the register
+ * file (has_rf0_flops_conflict). We will use that information when
+ * scheduling thrsw instructions to avoid putting anything in their
+ * last delay slot which has a signal that writes to the register file.
+ */
+
+ /* Reset tracking if we have an explicit rf0 write or we are starting
+ * a new thread section.
+ */
+ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
+ scoreboard->last_implicit_rf0_write_tick = -10;
+ scoreboard->has_rf0_flops_conflict = false;
+ }
+
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
+ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
+ scoreboard->tick + 1 : scoreboard->tick;
+ }
+
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+}
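/* Illustrative timeline for the tracking above (not part of the patch):
 * an ldvary at tick N lands its implicit rf0 write at tick N+1, so
 * last_implicit_rf0_write_tick is recorded as N+1; if the instruction at
 * that tick also carries a signal that writes the register file, we flag
 * has_rf0_flops_conflict. Three ticks after a thrsw (the start of a new
 * thread section) or any explicit rf0 write resets the tracking.
 */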
+
+static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst,
+ const struct qinst *qinst,
const struct v3d_device_info *devinfo)
{
+ const struct v3d_qpu_instr *inst = &qinst->qpu;
+
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
return;
@@ -1271,11 +1735,18 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
}
}
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) {
+ update_scoreboard_for_magic_waddr(scoreboard,
+ inst->sig_addr,
+ devinfo);
+ }
+
if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick;
- if (qpu_inst_is_tlb(inst))
- scoreboard->tlb_locked = true;
+ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
+
+ update_scoreboard_tmu_tracking(scoreboard, qinst);
}
static void
@@ -1352,23 +1823,25 @@ instruction_latency(const struct v3d_device_info *devinfo,
after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
return latency;
- if (before_inst->alu.add.magic_write) {
+ if (v3d_qpu_instr_is_sfu(before_inst))
+ return 2;
+
+ if (before_inst->alu.add.op != V3D_QPU_A_NOP &&
+ before_inst->alu.add.magic_write) {
latency = MAX2(latency,
magic_waddr_latency(devinfo,
before_inst->alu.add.waddr,
after_inst));
}
- if (before_inst->alu.mul.magic_write) {
+ if (before_inst->alu.mul.op != V3D_QPU_M_NOP &&
+ before_inst->alu.mul.magic_write) {
latency = MAX2(latency,
magic_waddr_latency(devinfo,
before_inst->alu.mul.waddr,
after_inst));
}
- if (v3d_qpu_instr_is_sfu(before_inst))
- return 2;
-
return latency;
}
@@ -1437,7 +1910,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
{
list_addtail(&inst->link, &block->instructions);
- update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
+ update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
c->qpu_inst_count++;
scoreboard->tick++;
}
@@ -1464,16 +1937,13 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
- /* Only TLB Z writes are prohibited in the last slot, but we don't
- * have those flagged so prohibit all TLB ops for now.
- */
- if (slot == 2 && qpu_inst_is_tlb(inst))
+ if (slot == 2 && qinst->is_tlb_z_write)
return false;
if (slot > 0 && qinst->uniform != ~0)
return false;
- if (v3d_qpu_uses_vpm(inst))
+ if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst))
return false;
if (inst->sig.ldvary)
@@ -1481,36 +1951,64 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
/* GFXH-1625: TMUWT not allowed in the final instruction. */
- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
+ if (c->devinfo->ver == 42 && slot == 2 &&
+ inst->alu.add.op == V3D_QPU_A_TMUWT) {
return false;
+ }
- /* No writing physical registers at the end. */
- if (!inst->alu.add.magic_write ||
- !inst->alu.mul.magic_write) {
- return false;
+ if (c->devinfo->ver == 42) {
+ /* No writing physical registers at the end. */
+ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
+ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
+ if ((!add_is_nop && !inst->alu.add.magic_write) ||
+ (!mul_is_nop && !inst->alu.mul.magic_write)) {
+ return false;
+ }
+
+ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ return false;
+ }
}
- if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* The thread end instruction must not write to the
+ * register file via the add/mul ALUs.
+ */
+ if (slot == 0 &&
+ (!inst->alu.add.magic_write ||
+ !inst->alu.mul.magic_write)) {
+ return false;
+ }
+ }
- /* RF0-2 might be overwritten during the delay slots by
- * fragment shader setup.
- */
- if (inst->raddr_a < 3 &&
- (inst->alu.add.a == V3D_QPU_MUX_A ||
- inst->alu.add.b == V3D_QPU_MUX_A ||
- inst->alu.mul.a == V3D_QPU_MUX_A ||
- inst->alu.mul.b == V3D_QPU_MUX_A)) {
- return false;
+ if (c->devinfo->ver == 42) {
+ /* RF0-2 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
+ return false;
+
+ if (inst->raddr_b < 3 &&
+ !inst->sig.small_imm_b &&
+ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
+ return false;
+ }
}
- if (inst->raddr_b < 3 &&
- !inst->sig.small_imm &&
- (inst->alu.add.a == V3D_QPU_MUX_B ||
- inst->alu.add.b == V3D_QPU_MUX_B ||
- inst->alu.mul.a == V3D_QPU_MUX_B ||
- inst->alu.mul.b == V3D_QPU_MUX_B)) {
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* RF2-3 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (v3d71_qpu_reads_raddr(inst, 2) ||
+ v3d71_qpu_reads_raddr(inst, 3)) {
+ return false;
+ }
+
+ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
+ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
+ return false;
+ }
}
}
@@ -1526,6 +2024,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
*/
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
uint32_t slot)
{
@@ -1533,15 +2032,19 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* thread. The simulator complains for safety, though it
* would only occur for dead code in our case.
*/
- if (slot > 0 &&
- qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
- (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
- v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
- return false;
+ if (slot > 0) {
+ if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
+ return false;
+ if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu))
+ return false;
}
- if (slot > 0 && qinst->qpu.sig.ldvary)
- return false;
+ if (qinst->qpu.sig.ldvary) {
+ if (c->devinfo->ver == 42 && slot > 0)
+ return false;
+ if (c->devinfo->ver >= 71 && slot == 2)
+ return false;
+ }
/* unifa and the following 3 instructions can't overlap a
* thread switch/end. The docs further clarify that this means
@@ -1560,6 +2063,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
return false;
+ /* See comment when we set has_rf0_flops_conflict for details */
+ if (c->devinfo->ver >= 71 &&
+ slot == 2 &&
+ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
+ !qinst->qpu.sig_magic) {
+ if (scoreboard->has_rf0_flops_conflict)
+ return false;
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
+ return false;
+ }
+
return true;
}
@@ -1579,7 +2093,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
assert(slot <= 2);
/* We merge thrsw instructions back into the instruction stream
- * manually, so any instructions scheduled after a thrsw shold be
+ * manually, so any instructions scheduled after a thrsw should be
* in the actual delay slots and not in the same slot as the thrsw.
*/
assert(slot >= 1);
@@ -1592,7 +2106,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* also apply to instructions scheduled after the thrsw that we want
* to place in its delay slots.
*/
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
return false;
/* TLB access is disallowed until scoreboard wait is executed, which
@@ -1648,6 +2162,14 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_flags(&qinst->qpu))
return false;
+ /* TSY sync ops materialize at the point of the next thread switch,
+ * therefore, if we have a TSY sync right after a thread switch, we
+ * cannot place it in its delay slots, or we would be moving the sync
+ * to the thrsw before it instead.
+ */
+ if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
+ return false;
+
return true;
}
@@ -1656,15 +2178,11 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
struct qinst *qinst, int instructions_in_sequence,
bool is_thrend)
{
- /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
- if (scoreboard->last_thrsw_tick + 3 >
- scoreboard->tick - instructions_in_sequence) {
- return false;
- }
-
for (int slot = 0; slot < instructions_in_sequence; slot++) {
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
+ qinst, slot)) {
return false;
+ }
if (is_thrend &&
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
@@ -1714,26 +2232,77 @@ emit_thrsw(struct v3d_compile *c,
/* Find how far back into previous instructions we can put the THRSW. */
int slots_filled = 0;
+ int invalid_sig_count = 0;
+ int invalid_seq_count = 0;
+ bool last_thrsw_after_invalid_ok = false;
struct qinst *merge_inst = NULL;
vir_for_each_inst_rev(prev_inst, block) {
- struct v3d_qpu_sig sig = prev_inst->qpu.sig;
- sig.thrsw = true;
- uint32_t packed_sig;
-
- if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
+ /* No emitting our thrsw while the previous thrsw hasn't
+ * happened yet.
+ */
+ if (scoreboard->last_thrsw_tick + 3 >
+ scoreboard->tick - (slots_filled + 1)) {
break;
+ }
+
if (!valid_thrsw_sequence(c, scoreboard,
prev_inst, slots_filled + 1,
is_thrend)) {
- break;
+ /* Even if the current sequence isn't valid, we may
+ * be able to get a valid sequence by trying to move the
+ * thrsw earlier, so keep going.
+ */
+ invalid_seq_count++;
+ goto cont_block;
+ }
+
+ struct v3d_qpu_sig sig = prev_inst->qpu.sig;
+ sig.thrsw = true;
+ uint32_t packed_sig;
+ if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
+ /* If we can't merge the thrsw here because of signal
+                         * incompatibility, keep going; we might be able to
+ * merge it in an earlier instruction.
+ */
+ invalid_sig_count++;
+ goto cont_block;
}
+ /* For last thrsw we need 2 consecutive slots that are
+ * thrsw compatible, so if we have previously jumped over
+ * an incompatible signal, flag that we have found the first
+ * valid slot here and keep going.
+ */
+ if (inst->is_last_thrsw && invalid_sig_count > 0 &&
+ !last_thrsw_after_invalid_ok) {
+ last_thrsw_after_invalid_ok = true;
+ invalid_sig_count++;
+ goto cont_block;
+ }
+
+ /* We can merge the thrsw in this instruction */
+ last_thrsw_after_invalid_ok = false;
+ invalid_sig_count = 0;
+ invalid_seq_count = 0;
merge_inst = prev_inst;
+
+cont_block:
if (++slots_filled == 3)
break;
}
+ /* If we jumped over a signal incompatibility and did not manage to
+ * merge the thrsw in the end, we need to adjust slots filled to match
+ * the last valid merge point.
+ */
+ assert((invalid_sig_count == 0 && invalid_seq_count == 0) ||
+ slots_filled >= invalid_sig_count + invalid_seq_count);
+ if (invalid_sig_count > 0)
+ slots_filled -= invalid_sig_count;
+ if (invalid_seq_count > 0)
+ slots_filled -= invalid_seq_count;
+
bool needs_free = false;
if (merge_inst) {
merge_inst->qpu.sig.thrsw = true;
@@ -1747,6 +2316,8 @@ emit_thrsw(struct v3d_compile *c,
merge_inst = inst;
}
+ scoreboard->first_thrsw_emitted = true;
+
/* If we're emitting the last THRSW (other than program end), then
* signal that to the HW by emitting two THRSWs in a row.
*/
@@ -1758,6 +2329,7 @@ emit_thrsw(struct v3d_compile *c,
struct qinst *second_inst =
(struct qinst *)merge_inst->link.next;
second_inst->qpu.sig.thrsw = true;
+ scoreboard->last_thrsw_emitted = true;
}
/* Make sure the thread end executes within the program lifespan */
@@ -1811,10 +2383,11 @@ emit_branch(struct v3d_compile *c,
assert(scoreboard->last_branch_tick + 3 < branch_tick);
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
+ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
* setmsf.
*/
bool is_safe_msf_branch =
+ c->devinfo->ver >= 71 ||
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
@@ -1851,6 +2424,14 @@ emit_branch(struct v3d_compile *c,
break;
}
+ /* Do not move up a branch if it can disrupt an ldvary sequence
+ * as that can cause stomping of the r5 register.
+ */
+ if (scoreboard->last_ldvary_tick + 2 >=
+ branch_tick - slots_filled) {
+ break;
+ }
+
/* Can't move a conditional branch before the instruction
* that writes the flags for its condition.
*/
@@ -1890,46 +2471,72 @@ emit_branch(struct v3d_compile *c,
}
static bool
-alu_reads_register(struct v3d_qpu_instr *inst,
+alu_reads_register(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst,
bool add, bool magic, uint32_t index)
{
uint32_t num_src;
- enum v3d_qpu_mux mux_a, mux_b;
-
- if (add) {
+ if (add)
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
- mux_a = inst->alu.add.a;
- mux_b = inst->alu.add.b;
- } else {
+ else
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- mux_a = inst->alu.mul.a;
- mux_b = inst->alu.mul.b;
- }
- for (int i = 0; i < num_src; i++) {
- if (magic) {
- if (i == 0 && mux_a == index)
- return true;
- if (i == 1 && mux_b == index)
- return true;
+ if (devinfo->ver == 42) {
+ enum v3d_qpu_mux mux_a, mux_b;
+ if (add) {
+ mux_a = inst->alu.add.a.mux;
+ mux_b = inst->alu.add.b.mux;
} else {
- if (i == 0 && mux_a == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 0 && mux_a == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
+ mux_a = inst->alu.mul.a.mux;
+ mux_b = inst->alu.mul.b.mux;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (magic) {
+ if (i == 0 && mux_a == index)
+ return true;
+ if (i == 1 && mux_b == index)
+ return true;
+ } else {
+ if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
}
}
+
+ return false;
+ }
+
+ assert(devinfo->ver >= 71);
+ assert(!magic);
+
+ uint32_t raddr_a, raddr_b;
+ if (add) {
+ raddr_a = inst->alu.add.a.raddr;
+ raddr_b = inst->alu.add.b.raddr;
+ } else {
+ raddr_a = inst->alu.mul.a.raddr;
+ raddr_b = inst->alu.mul.b.raddr;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (i == 0 && raddr_a == index)
+ return true;
+ if (i == 1 && raddr_b == index)
+ return true;
}
return false;
@@ -1964,7 +2571,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
struct qblock *block,
struct v3d_qpu_instr *inst)
{
- /* We only call this if we have successfuly merged an ldvary into a
+ const struct v3d_device_info *devinfo = c->devinfo;
+
+ /* We only call this if we have successfully merged an ldvary into a
* previous instruction.
*/
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
@@ -1976,9 +2585,20 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
* the ldvary destination, if it does, then moving the ldvary before
* it would overwrite it.
*/
- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
return false;
- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
+ return false;
+
+ /* The implicit ldvary destination may not be written to by a signal
+ * in the instruction following ldvary. Since we are planning to move
+ * ldvary to the previous instruction, this means we need to check if
+ * the current instruction has any other signal that could create this
+ * conflict. The only other signal that can write to the implicit
+ * ldvary destination that is compatible with ldvary in the same
+ * instruction is ldunif.
+ */
+ if (inst->sig.ldunif)
return false;
/* The previous instruction can't write to the same destination as the
@@ -2003,7 +2623,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
}
/* The previous instruction cannot have a conflicting signal */
- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
+ return false;
+
+ uint32_t sig;
+ struct v3d_qpu_sig new_sig = prev->qpu.sig;
+ new_sig.ldvary = true;
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
return false;
/* The previous instruction cannot use flags since ldvary uses the
@@ -2016,9 +2642,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
/* We can't put an ldvary in the delay slots of a thrsw. We should've
* prevented this when pairing up the ldvary with another instruction
- * and flagging it for a fixup.
+ * and flagging it for a fixup. In V3D 7.x this is limited only to the
+ * second delay slot.
*/
- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+ assert((devinfo->ver == 42 &&
+ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
+ (devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
/* Move the ldvary to the previous instruction and remove it from the
* current one.
@@ -2032,14 +2662,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
inst->sig_magic = false;
inst->sig_addr = 0;
- /* By moving ldvary to the previous instruction we make it update
- * r5 in the current one, so nothing else in it should write r5.
+ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+ if (devinfo->ver >= 71) {
+ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+ }
+
+ /* By moving ldvary to the previous instruction we make it update r5
+ * (rf0 for ver >= 71) in the current one, so nothing else in it
+ * should write this register.
+ *
+         * This should've been prevented by our dependency tracking, which
* would not allow ldvary to be paired up with an instruction that
- * writes r5 (since our dependency tracking doesn't know that the
- * ldvary write r5 happens in the next instruction).
+ * writes r5/rf0 (since our dependency tracking doesn't know that the
+ * ldvary write to r5/rf0 happens in the next instruction).
*/
- assert(!v3d_qpu_writes_r5(c->devinfo, inst));
+ assert(!v3d_qpu_writes_r5(devinfo, inst));
+ assert(devinfo->ver == 42 ||
+ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
return true;
}
@@ -2102,6 +2743,9 @@ schedule_instructions(struct v3d_compile *c,
merge->inst->uniform;
}
+ chosen->inst->ldtmu_count +=
+ merge->inst->ldtmu_count;
+
if (debug) {
fprintf(stderr, "t=%4d: merging: ",
time);
@@ -2127,7 +2771,7 @@ schedule_instructions(struct v3d_compile *c,
}
}
}
- if (mux_read_stalls(scoreboard, inst))
+ if (read_stalls(c->devinfo, scoreboard, inst))
c->qpu_inst_stalled_count++;
}
@@ -2351,6 +2995,8 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_branch_tick = -10;
scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
+ scoreboard.first_ldtmu_after_thrsw = true;
+        scoreboard.last_implicit_rf0_write_tick = -10;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index ec9ed66650c..538b247e3e0 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state {
int last_sfu_write;
int last_branch_ip;
int last_thrsw_ip;
+ int first_tlb_z_write;
/* Set when we've found the last-THRSW signal, or if we were started
* in single-segment mode.
@@ -110,11 +111,58 @@ static void
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
{
const struct v3d_device_info *devinfo = state->c->devinfo;
+
+ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
+ state->first_tlb_z_write = state->ip;
+
const struct v3d_qpu_instr *inst = &qinst->qpu;
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
+ fail_instr(state, "Implicit branch MSF read after TLB Z write");
+ }
+
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return;
+ if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write) {
+ fail_instr(state, "SETMSF after TLB Z write");
+ }
+
+ if (state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->alu.add.op == V3D_QPU_A_MSF) {
+ fail_instr(state, "MSF read after TLB Z write");
+ }
+
+ if (devinfo->ver < 71) {
+ if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
+ inst->sig.small_imm_d) {
+ fail_instr(state, "small imm a/c/d added after V3D 7.1");
+ }
+ } else {
+ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
+ !vir_is_add(qinst)) {
+ fail_instr(state, "small imm a/b used but no ADD inst");
+ }
+ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
+ !vir_is_mul(qinst)) {
+ fail_instr(state, "small imm c/d used but no MUL inst");
+ }
+ if (inst->sig.small_imm_a + inst->sig.small_imm_b +
+ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
+ fail_instr(state, "only one small immediate can be "
+ "enabled per instruction");
+ }
+ }
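        /* Examples of what the checks above enforce on V3D 7.x (illustrative
         * only, not part of the patch): small_imm_a with a real ADD op and
         * small_imm_c with a real MUL op are each fine on their own, but
         * small_imm_b without an ADD op, or small_imm_a together with
         * small_imm_c, are rejected.
         */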
+
/* LDVARY writes r5 two instructions later and LDUNIF writes
* r5 one instruction later, which is illegal to have
* together.
@@ -128,7 +176,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
*
* FIXME: This would not check correctly for V3D 4.2 versions lower
* than V3D 4.2.14, but that is not a real issue because the simulator
- * will still catch this, and we are not really targetting any such
+ * will still catch this, and we are not really targeting any such
* versions anyway.
*/
if (state->c->devinfo->ver < 42) {
@@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
"SFU write started during THRSW delay slots ");
}
- if (inst->sig.ldvary)
- fail_instr(state, "LDVARY during THRSW delay slots");
+ if (inst->sig.ldvary) {
+ if (devinfo->ver == 42)
+ fail_instr(state, "LDVARY during THRSW delay slots");
+ if (devinfo->ver >= 71 &&
+ state->ip - state->last_thrsw_ip == 2) {
+ fail_instr(state, "LDVARY in 2nd THRSW delay slot");
+ }
+ }
}
(void)qpu_magic_waddr_matches; /* XXX */
@@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
vpm_writes +
tlb_writes +
tsy_writes +
- inst->sig.ldtmu +
+ (devinfo->ver == 42 ? inst->sig.ldtmu : 0) +
inst->sig.ldtlb +
inst->sig.ldvpm +
inst->sig.ldtlbu > 1) {
@@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if ((inst->alu.add.op != V3D_QPU_A_NOP &&
!inst->alu.add.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver == 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "ADD RF write at THREND");
+ }
+ if (inst->alu.add.waddr == 2 ||
+ inst->alu.add.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
!inst->alu.mul.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver == 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "MUL RF write at THREND");
+ }
+
+ if (inst->alu.mul.waddr == 2 ||
+ inst->alu.mul.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
!inst->sig_magic) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver == 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71 &&
+ (inst->sig_addr == 2 ||
+ inst->sig_addr == 3)) {
+ fail_instr(state, "RF2-3 write after THREND");
+ }
}
/* GFXH-1625: No TMUWT in the last instruction */
@@ -312,7 +397,7 @@ qpu_validate(struct v3d_compile *c)
* keep compiling the validation code to make sure it doesn't get
* broken.
*/
-#ifndef DEBUG
+#if !MESA_DEBUG
return;
#endif
@@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c)
.last_sfu_write = -10,
.last_thrsw_ip = -10,
.last_branch_ip = -10,
+ .first_tlb_z_write = INT_MAX,
.ip = 0,
.last_thrsw_found = !c->last_thrsw,
diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c
deleted file mode 100644
index b933635f6fe..00000000000
--- a/src/broadcom/compiler/v3d33_tex.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright © 2016-2018 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "v3d_compiler.h"
-
-/* We don't do any address packing. */
-#define __gen_user_data void
-#define __gen_address_type uint32_t
-#define __gen_address_offset(reloc) (*reloc)
-#define __gen_emit_reloc(cl, reloc)
-#include "cle/v3d_packet_v33_pack.h"
-
-void
-v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
-{
- /* FIXME: We don't bother implementing pipelining for texture reads
- * for any pre 4.x hardware. It should be straight forward to do but
- * we are not really testing or even targetting this hardware at
- * present.
- */
- ntq_flush_tmu(c);
-
- unsigned unit = instr->texture_index;
-
- struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = {
- V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header,
-
- .fetch_sample_mode = instr->op == nir_texop_txf,
- };
-
- struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 p1_unpacked = {
- };
-
- switch (instr->sampler_dim) {
- case GLSL_SAMPLER_DIM_1D:
- if (instr->is_array)
- p0_unpacked.lookup_type = TEXTURE_1D_ARRAY;
- else
- p0_unpacked.lookup_type = TEXTURE_1D;
- break;
- case GLSL_SAMPLER_DIM_2D:
- case GLSL_SAMPLER_DIM_RECT:
- if (instr->is_array)
- p0_unpacked.lookup_type = TEXTURE_2D_ARRAY;
- else
- p0_unpacked.lookup_type = TEXTURE_2D;
- break;
- case GLSL_SAMPLER_DIM_3D:
- p0_unpacked.lookup_type = TEXTURE_3D;
- break;
- case GLSL_SAMPLER_DIM_CUBE:
- p0_unpacked.lookup_type = TEXTURE_CUBE_MAP;
- break;
- default:
- unreachable("Bad sampler type");
- }
-
- struct qreg coords[5];
- int next_coord = 0;
- for (unsigned i = 0; i < instr->num_srcs; i++) {
- switch (instr->src[i].src_type) {
- case nir_tex_src_coord:
- for (int j = 0; j < instr->coord_components; j++) {
- coords[next_coord++] =
- ntq_get_src(c, instr->src[i].src, j);
- }
- if (instr->coord_components < 2)
- coords[next_coord++] = vir_uniform_f(c, 0.5);
- break;
- case nir_tex_src_bias:
- coords[next_coord++] =
- ntq_get_src(c, instr->src[i].src, 0);
-
- p0_unpacked.bias_supplied = true;
- break;
- case nir_tex_src_lod:
- coords[next_coord++] =
- vir_FADD(c,
- ntq_get_src(c, instr->src[i].src, 0),
- vir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL,
- unit));
-
- if (instr->op != nir_texop_txf &&
- instr->op != nir_texop_tg4) {
- p0_unpacked.disable_autolod_use_bias_only = true;
- }
- break;
- case nir_tex_src_comparator:
- coords[next_coord++] =
- ntq_get_src(c, instr->src[i].src, 0);
-
- p0_unpacked.shadow = true;
- break;
-
- case nir_tex_src_offset: {
- p0_unpacked.texel_offset_for_s_coordinate =
- nir_src_comp_as_int(instr->src[i].src, 0);
-
- if (instr->coord_components >= 2)
- p0_unpacked.texel_offset_for_t_coordinate =
- nir_src_comp_as_int(instr->src[i].src, 1);
-
- if (instr->coord_components >= 3)
- p0_unpacked.texel_offset_for_r_coordinate =
- nir_src_comp_as_int(instr->src[i].src, 2);
- break;
- }
-
- default:
- unreachable("unknown texture source");
- }
- }
-
- /* Limit the number of channels returned to both how many the NIR
- * instruction writes and how many the instruction could produce.
- */
- p1_unpacked.return_words_of_texture_data =
- instr->dest.is_ssa ?
- nir_ssa_def_components_read(&instr->dest.ssa) :
- (1 << instr->dest.reg.reg->num_components) - 1;
-
- uint32_t p0_packed;
- V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL,
- (uint8_t *)&p0_packed,
- &p0_unpacked);
-
- uint32_t p1_packed;
- V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1_pack(NULL,
- (uint8_t *)&p1_packed,
- &p1_unpacked);
- /* Load unit number into the address field, which will be be used by
- * the driver to decide which texture to put in the actual address
- * field.
- */
- p1_packed |= unit << 5;
-
- /* There is no native support for GL texture rectangle coordinates, so
- * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0,
- * 1]).
- */
- if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
- coords[0] = vir_FMUL(c, coords[0],
- vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X,
- unit));
- coords[1] = vir_FMUL(c, coords[1],
- vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y,
- unit));
- }
-
- int texture_u[] = {
- vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed),
- vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed),
- };
-
- for (int i = 0; i < next_coord; i++) {
- struct qreg dst;
-
- if (i == next_coord - 1)
- dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL);
- else
- dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU);
-
- struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]);
-
- if (i < 2)
- tmu->uniform = texture_u[i];
- }
-
- vir_emit_thrsw(c);
-
- for (int i = 0; i < 4; i++) {
- if (p1_unpacked.return_words_of_texture_data & (1 << i))
- ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
- }
-}
diff --git a/src/broadcom/compiler/v3d33_vpm_setup.c b/src/broadcom/compiler/v3d33_vpm_setup.c
deleted file mode 100644
index 8bce67dfae9..00000000000
--- a/src/broadcom/compiler/v3d33_vpm_setup.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright © 2016-2018 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "v3d_compiler.h"
-
-/* We don't do any address packing. */
-#define __gen_user_data void
-#define __gen_address_type uint32_t
-#define __gen_address_offset(reloc) (*reloc)
-#define __gen_emit_reloc(cl, reloc)
-#include "broadcom/cle/v3d_packet_v33_pack.h"
-
-void
-v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components)
-{
- struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = {
- V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header,
-
- .horiz = true,
- .laned = false,
- /* If the field is 0, that means a read count of 32. */
- .num = num_components & 31,
- .segs = true,
- .stride = 1,
- .size = VPM_SETUP_SIZE_32_BIT,
- .addr = c->num_inputs,
- };
-
- uint32_t packed;
- V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL,
- (uint8_t *)&packed,
- &unpacked);
- vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
-
-void
-v3d33_vir_vpm_write_setup(struct v3d_compile *c)
-{
- uint32_t packed;
- struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = {
- V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header,
-
- .horiz = true,
- .laned = false,
- .segs = true,
- .stride = 1,
- .size = VPM_SETUP_SIZE_32_BIT,
- .addr = 0,
- };
-
- V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL,
- (uint8_t *)&packed,
- &unpacked);
- vir_VPMSETUP(c, vir_uniform_ui(c, packed));
-}
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 0c1419661d3..12aaacdc14a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -31,6 +31,7 @@
#include <stdint.h>
#include <string.h>
+#include "util/blend.h"
#include "util/macros.h"
#include "common/v3d_debug.h"
#include "common/v3d_device_info.h"
@@ -40,7 +41,6 @@
#include "util/u_math.h"
#include "qpu/qpu_instr.h"
-#include "pipe/p_state.h"
/**
* Maximum number of outstanding TMU operations we can queue for execution.
@@ -87,7 +87,7 @@ enum qfile {
/** A physical register, such as the W coordinate payload. */
QFILE_REG,
- /** One of the regsiters for fixed function interactions. */
+ /** One of the registers for fixed function interactions. */
QFILE_MAGIC,
/**
@@ -97,12 +97,6 @@ enum qfile {
QFILE_TEMP,
/**
- * VPM reads use this with an index value to say what part of the VPM
- * is being read.
- */
- QFILE_VPM,
-
- /**
* Stores an immediate value in the index field that will be used
* directly by qpu_load_imm().
*/
@@ -169,6 +163,19 @@ struct qinst {
* otherwise.
*/
int uniform;
+
+ /* If this is a TLB Z write */
+ bool is_tlb_z_write;
+
+ /* If this is a retiring TMU instruction (the last in a lookup sequence),
+ * the number of ldtmu instructions required to read the results.
+ */
+ uint32_t ldtmu_count;
+
+ /* Position of this instruction in the program. Filled in during
+ * register allocation.
+ */
+ int32_t ip;
};
enum quniform_contents {
@@ -330,6 +337,19 @@ enum quniform_contents {
* Current value of gl_ViewIndex for Multiview rendering.
*/
QUNIFORM_VIEW_INDEX,
+
+ /**
+ * Inline uniform buffers
+ */
+ QUNIFORM_INLINE_UBO_0,
+ QUNIFORM_INLINE_UBO_1,
+ QUNIFORM_INLINE_UBO_2,
+ QUNIFORM_INLINE_UBO_3,
+
+ /**
+ * Current value of DrawIndex for Multidraw
+ */
+ QUNIFORM_DRAW_ID,
};
static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
@@ -369,13 +389,7 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
return slot.slot_and_component & 3;
}
-enum v3d_execution_environment {
- V3D_ENVIRONMENT_OPENGL = 0,
- V3D_ENVIRONMENT_VULKAN,
-};
-
struct v3d_key {
- void *shader_state;
struct {
uint8_t swizzle[4];
} tex[V3D_MAX_TEXTURE_SAMPLERS];
@@ -388,9 +402,9 @@ struct v3d_key {
uint8_t num_samplers_used;
uint8_t ucp_enables;
bool is_last_geometry_stage;
- bool robust_buffer_access;
-
- enum v3d_execution_environment environment;
+ bool robust_uniform_access;
+ bool robust_storage_access;
+ bool robust_image_access;
};
struct v3d_fs_key {
@@ -400,7 +414,6 @@ struct v3d_fs_key {
bool line_smoothing;
bool point_coord_upper_left;
bool msaa;
- bool sample_coverage;
bool sample_alpha_to_coverage;
bool sample_alpha_to_one;
/* Mask of which color render targets are present. */
@@ -419,14 +432,12 @@ struct v3d_fs_key {
*/
struct {
enum pipe_format format;
- const uint8_t *swizzle;
+ uint8_t swizzle[4];
} color_fmt[V3D_MAX_DRAW_BUFFERS];
- uint8_t logicop_func;
+ enum pipe_logicop logicop_func;
uint32_t point_sprite_mask;
- struct pipe_rt_blend_state blend;
-
/* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios:
*
* - If there is a geometry shader, then gl_PrimitiveID must be written
@@ -468,7 +479,7 @@ struct v3d_vs_key {
bool clamp_color;
};
-/** A basic block of VIR intructions. */
+/** A basic block of VIR instructions. */
struct qblock {
struct list_head link;
@@ -566,6 +577,7 @@ enum v3d_compilation_result {
*/
struct v3d_compiler {
const struct v3d_device_info *devinfo;
+ uint32_t max_inline_uniform_buffers;
struct ra_regs *regs;
struct ra_class *reg_class_any[3];
struct ra_class *reg_class_r5[3];
@@ -584,6 +596,19 @@ struct v3d_interp_input {
unsigned mode; /* interpolation mode */
};
+struct v3d_ra_node_info {
+ struct {
+ uint32_t priority;
+ uint8_t class_bits;
+ bool is_program_end;
+ bool unused;
+
+ /* V3D 7.x */
+ bool is_ldunif_dst;
+ } *info;
+ uint32_t alloc_count;
+};
+
struct v3d_compile {
const struct v3d_device_info *devinfo;
nir_shader *s;
@@ -596,7 +621,7 @@ struct v3d_compile {
void *debug_output_data;
/**
- * Mapping from nir_register * or nir_ssa_def * to array of struct
+ * Mapping from nir_register * or nir_def * to array of struct
* qreg for the values.
*/
struct hash_table *def_ht;
@@ -615,11 +640,12 @@ struct v3d_compile {
uint32_t output_fifo_size;
struct {
- nir_dest *dest;
+ nir_def *def;
uint8_t num_components;
uint8_t component_mask;
} flush[MAX_TMU_QUEUE_SIZE];
uint32_t flush_count;
+ uint32_t total_count;
} tmu;
/**
@@ -652,16 +678,13 @@ struct v3d_compile {
bool uses_center_w;
bool writes_z;
+ bool writes_z_from_fep;
+ bool reads_z;
bool uses_implicit_point_line_varyings;
/* True if a fragment shader reads gl_PrimitiveID */
bool fs_uses_primitive_id;
- /* If the fragment shader does anything that requires to force
- * per-sample MSAA, such as reading gl_SampleID.
- */
- bool force_per_sample_msaa;
-
/* Whether we are using the fallback scheduler. This will be set after
* register allocation has failed once.
*/
@@ -681,6 +704,11 @@ struct v3d_compile {
bool disable_constant_ubo_load_sorting;
bool sorted_any_ubo_loads;
+ /* Moves UBO/SSBO loads right before their first user (nir_opt_move).
+ * This can reduce register pressure.
+ */
+ bool move_buffer_loads;
+
/* Emits ldunif for each new uniform, even if the uniform was already
* emitted in the same block. Useful to compile shaders with high
* register pressure or to disable the optimization during uniform
@@ -692,6 +720,19 @@ struct v3d_compile {
bool disable_loop_unrolling;
bool unrolled_any_loops;
+ /* Disables nir_opt_gcm to reduce register pressure. */
+ bool disable_gcm;
+
+ /* Whether calling nir_opt_gcm made any progress. Used to skip
+ * further rebuilds when possible.
+ */
+ bool gcm_progress;
+
+ /* Disables scheduling of general TMU loads (and unfiltered image load).
+ */
+ bool disable_general_tmu_sched;
+ bool has_general_tmu_load;
+
/* Minimum number of threads we are willing to use to register allocate
* a shader with the current compilation strategy. This only prevents
* us from lowering the thread count to register allocate successfully,
@@ -705,7 +746,9 @@ struct v3d_compile {
* strategies that can reduce register pressure and hopefully reduce or
* eliminate TMU spills in the shader.
*/
- bool tmu_spilling_allowed;
+ uint32_t max_tmu_spills;
+
+ uint32_t compile_strategy_idx;
/* The UBO index and block used with the last unifa load, as well as the
* current unifa offset *after* emitting that load. This is used to skip
@@ -715,6 +758,7 @@ struct v3d_compile {
struct qblock *current_unifa_block;
int32_t current_unifa_index;
uint32_t current_unifa_offset;
+ bool current_unifa_is_ubo;
/* State for whether we're executing on each channel currently. 0 if
* yes, otherwise a block number + 1 that the channel jumped to.
@@ -749,6 +793,11 @@ struct v3d_compile {
struct qreg cs_shared_offset;
int local_invocation_index_bits;
+ /* Starting value of the sample mask in a fragment shader. We use
+ * this to identify lanes that have been terminated/discarded.
+ */
+ struct qreg start_msf;
+
/* If the shader uses subgroup functionality */
bool has_subgroups;
@@ -761,14 +810,27 @@ struct v3d_compile {
uint32_t spill_size;
/* Shader-db stats */
uint32_t spills, fills, loops;
+
+ /* Whether we are in the process of spilling registers for
+ * register allocation
+ */
+ bool spilling;
+
/**
* Register spilling's per-thread base address, shared between each
- * spill/fill's addressing calculations.
+ * spill/fill's addressing calculations (also used for scratch
+ * access).
*/
struct qreg spill_base;
+
/* Bit vector of which temps may be spilled */
BITSET_WORD *spillable;
+ /* Used during register allocation */
+ int thread_index;
+ struct v3d_ra_node_info nodes;
+ struct ra_graph *g;
+
/**
* Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
*
@@ -799,11 +861,16 @@ struct v3d_compile {
uint32_t uniform_array_size;
uint32_t num_uniforms;
uint32_t output_position_index;
- nir_variable *output_color_var[4];
+ nir_variable *output_color_var[V3D_MAX_DRAW_BUFFERS];
uint32_t output_sample_mask_index;
struct qreg undef;
uint32_t num_temps;
+ /* Number of temps in the program right before we spill a new temp. We
+ * use this to know which temps existed before a spill and which were
+ * added with the spill itself.
+ */
+ uint32_t spill_start_num_temps;
struct vir_cursor cursor;
struct list_head blocks;
@@ -848,12 +915,16 @@ struct v3d_compile {
bool emitted_tlb_load;
bool lock_scoreboard_on_first_thrsw;
- /* Total number of spilled registers in the program */
- uint32_t spill_count;
-
enum v3d_compilation_result compilation_result;
bool tmu_dirty_rcl;
+ bool has_global_address;
+
+ /* If we have processed a discard/terminate instruction. This may
+ * cause some lanes to be inactive even during uniform control
+ * flow.
+ */
+ bool emitted_discard;
};
struct v3d_uniform_list {
@@ -866,6 +937,13 @@ struct v3d_prog_data {
struct v3d_uniform_list uniforms;
uint32_t spill_size;
+ uint32_t tmu_spills;
+ uint32_t tmu_fills;
+ uint32_t tmu_count;
+
+ uint32_t qpu_read_stalls;
+
+ uint8_t compile_strategy_idx;
uint8_t threads;
@@ -877,6 +955,8 @@ struct v3d_prog_data {
bool tmu_dirty_rcl;
bool has_control_barrier;
+
+ bool has_global_address;
};
struct v3d_vs_prog_data {
@@ -964,10 +1044,15 @@ struct v3d_fs_prog_data {
uint8_t num_inputs;
bool writes_z;
+ bool writes_z_from_fep;
bool disable_ez;
bool uses_center_w;
bool uses_implicit_point_line_varyings;
bool lock_scoreboard_on_first_thrsw;
+
+ /* If the fragment shader does anything that requires forcing
+ * per-sample MSAA, such as reading gl_SampleID.
+ */
bool force_per_sample_msaa;
};
@@ -998,6 +1083,10 @@ v3d_compute_vpm_config(struct v3d_device_info *devinfo,
struct v3d_gs_prog_data *gs,
struct vpm_config *vpm_cfg_bin,
struct vpm_config *vpm_cfg);
+void
+v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo,
+ uint32_t *p1_packed,
+ bool unnormalized_coordinates);
static inline bool
vir_has_uniform(struct qinst *inst)
@@ -1005,7 +1094,8 @@ vir_has_uniform(struct qinst *inst)
return inst->uniform != ~0;
}
-const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
+const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo,
+ uint32_t max_inline_uniform_buffers);
void v3d_compiler_free(const struct v3d_compiler *compiler);
void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);
@@ -1066,15 +1156,14 @@ bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
-void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
- struct qreg result);
+void ntq_store_def(struct v3d_compile *c, nir_def *def, int chan,
+ struct qreg result);
bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components);
-void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
+void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_def *def,
uint32_t component_mask);
void ntq_flush_tmu(struct v3d_compile *c);
void vir_emit_thrsw(struct v3d_compile *c);
@@ -1095,32 +1184,27 @@ bool vir_opt_redundant_flags(struct v3d_compile *c);
bool vir_opt_small_immediates(struct v3d_compile *c);
bool vir_opt_vpm(struct v3d_compile *c);
bool vir_opt_constant_alu(struct v3d_compile *c);
-void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
-void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
-void v3d_nir_lower_line_smooth(nir_shader *shader);
-void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
-void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c);
-void v3d_nir_lower_scratch(nir_shader *s);
-void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
-void v3d_nir_lower_image_load_store(nir_shader *s);
-void vir_lower_uniforms(struct v3d_compile *c);
-
-void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
-void v3d33_vir_vpm_write_setup(struct v3d_compile *c);
-void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
-void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
-void v3d40_vir_emit_image_load_store(struct v3d_compile *c,
- nir_intrinsic_instr *instr);
+bool v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
+bool v3d_nir_lower_line_smooth(nir_shader *shader);
+bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
+bool v3d_nir_lower_scratch(nir_shader *s);
+bool v3d_nir_lower_txf_ms(nir_shader *s);
+bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c);
+bool v3d_nir_lower_load_store_bitsize(nir_shader *s);
+
+void v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
+void v3d_vir_emit_image_load_store(struct v3d_compile *c,
+ nir_intrinsic_instr *instr);
void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
void qpu_validate(struct v3d_compile *c);
-struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled);
+struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
bool vir_init_reg_sets(struct v3d_compiler *compiler);
int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str);
-bool v3d_gl_format_is_return_32(GLenum format);
+bool v3d_gl_format_is_return_32(enum pipe_format format);
uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src);
@@ -1220,28 +1304,35 @@ vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \
#define VIR_SFU(name) \
static inline struct qreg \
vir_##name(struct v3d_compile *c, struct qreg a) \
-{ \
- if (c->devinfo->ver >= 41) { \
- return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \
- c->undef, \
- a, c->undef)); \
- } else { \
- vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
- return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
- } \
+{ \
+ return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \
+ c->undef, \
+ a, c->undef)); \
} \
static inline struct qinst * \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \
struct qreg a) \
{ \
- if (c->devinfo->ver >= 41) { \
- return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
- dest, \
- a, c->undef)); \
- } else { \
- vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
- return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
- } \
+ return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
+ dest, \
+ a, c->undef)); \
+}
+
+#define VIR_SFU2(name) \
+static inline struct qreg \
+vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \
+{ \
+ return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name, \
+ c->undef, \
+ a, b)); \
+} \
+static inline struct qinst * \
+vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \
+ struct qreg a, struct qreg b) \
+{ \
+ return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
+ dest, \
+ a, b)); \
}
#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
@@ -1343,6 +1434,28 @@ VIR_SFU(LOG)
VIR_SFU(SIN)
VIR_SFU(RSQRT2)
+VIR_SFU(BALLOT)
+VIR_SFU(BCASTF)
+VIR_SFU(ALLEQ)
+VIR_SFU(ALLFEQ)
+VIR_SFU2(ROTQ)
+VIR_SFU2(ROT)
+VIR_SFU2(SHUFFLE)
+
+VIR_A_ALU2(VPACK)
+VIR_A_ALU2(V8PACK)
+VIR_A_ALU2(V10PACK)
+VIR_A_ALU2(V11FPACK)
+
+VIR_M_ALU1(FTOUNORM16)
+VIR_M_ALU1(FTOSNORM16)
+
+VIR_M_ALU1(VFTOUNORM8)
+VIR_M_ALU1(VFTOSNORM8)
+
+VIR_M_ALU1(VFTOUNORM10LO)
+VIR_M_ALU1(VFTOUNORM10HI)
+
static inline struct qinst *
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
struct qreg dest, struct qreg src)
@@ -1372,16 +1485,11 @@ vir_NOP(struct v3d_compile *c)
static inline struct qreg
vir_LDTMU(struct v3d_compile *c)
{
- if (c->devinfo->ver >= 41) {
- struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef,
- c->undef, c->undef);
- ldtmu->qpu.sig.ldtmu = true;
-
- return vir_emit_def(c, ldtmu);
- } else {
- vir_NOP(c)->qpu.sig.ldtmu = true;
- return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
- }
+ struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef,
+ c->undef, c->undef);
+ ldtmu->qpu.sig.ldtmu = true;
+
+ return vir_emit_def(c, ldtmu);
}
static inline struct qreg
@@ -1394,7 +1502,6 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1)
static inline struct qreg
vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
{
- assert(c->devinfo->ver >= 41); /* XXX */
assert((config & 0xffffff00) == 0xffffff00);
struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
@@ -1407,38 +1514,12 @@ vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
static inline struct qreg
vir_TLB_COLOR_READ(struct v3d_compile *c)
{
- assert(c->devinfo->ver >= 41); /* XXX */
-
struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
c->undef, c->undef);
ldtlb->qpu.sig.ldtlb = true;
return vir_emit_def(c, ldtlb);
}
-/*
-static inline struct qreg
-vir_LOAD_IMM(struct v3d_compile *c, uint32_t val)
-{
- return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef,
- vir_reg(QFILE_LOAD_IMM, val), c->undef));
-}
-
-static inline struct qreg
-vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val)
-{
- return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef,
- vir_reg(QFILE_LOAD_IMM, val),
- c->undef));
-}
-static inline struct qreg
-vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val)
-{
- return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef,
- vir_reg(QFILE_LOAD_IMM, val),
- c->undef));
-}
-*/
-
static inline struct qinst *
vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
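A minimal caller-side sketch of the updated v3d_compiler_init() entry point follows; the inline-UBO count is an arbitrary placeholder and create_compiler_example() is a hypothetical helper, neither taken from this patch:

#include "compiler/v3d_compiler.h"

static const struct v3d_compiler *
create_compiler_example(const struct v3d_device_info *devinfo)
{
        /* 4 is a hypothetical driver limit for inline uniform buffers. */
        return v3d_compiler_init(devinfo, 4);
}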
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
index 2706432d5ef..9a651bfc6a7 100644
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -40,9 +40,20 @@
* calculations and load/store using the TMU general memory access path.
*/
+static const unsigned bits_8[4] = {8, 8, 8, 8};
+static const unsigned bits_16[4] = {16, 16, 16, 16};
+static const unsigned bits_1010102[4] = {10, 10, 10, 2};
+
bool
v3d_gl_format_is_return_32(enum pipe_format format)
{
+ /* We can get a NONE format in Vulkan because we support the
+ * shaderStorageImageReadWithoutFormat feature. We consider these to
+ * always use 32-bit precision.
+ */
+ if (format == PIPE_FORMAT_NONE)
+ return true;
+
const struct util_format_description *desc =
util_format_description(format);
const struct util_format_channel_description *chan = &desc->channel[0];
@@ -52,15 +63,17 @@ v3d_gl_format_is_return_32(enum pipe_format format)
/* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a
* 32-bit SSA value, with as many channels as necessary to store all the bits
+ *
+ * This is the generic helper, using all common nir operations.
*/
-static nir_ssa_def *
-pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
+static nir_def *
+pack_bits(nir_builder *b, nir_def *color, const unsigned *bits,
int num_components, bool mask)
{
- nir_ssa_def *results[4];
+ nir_def *results[4];
int offset = 0;
for (int i = 0; i < num_components; i++) {
- nir_ssa_def *chan = nir_channel(b, color, i);
+ nir_def *chan = nir_channel(b, color, i);
/* Channels being stored shouldn't cross a 32-bit boundary. */
assert((offset & ~31) == ((offset + bits[i] - 1) & ~31));
@@ -84,10 +97,187 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
}
-static void
-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
+/* Utility wrapper: half_2x16_split maps to vfpack, and sometimes it is
+ * just easier to read vfpack in the code, especially when using the PRM as
+ * a reference.
+ */
+static inline nir_def *
+nir_vfpack(nir_builder *b, nir_def *p1, nir_def *p2)
+{
+ return nir_pack_half_2x16_split(b, p1, p2);
+}
+
+static inline nir_def *
+pack_11f11f10f(nir_builder *b, nir_def *color)
+{
+ nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ nir_def *undef = nir_undef(b, 1, color->bit_size);
+ nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
+
+ return nir_pack_32_to_r11g11b10_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_r10g10b10a2_uint(nir_builder *b, nir_def *color)
+{
+ nir_def *p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ nir_def *p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+
+ return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_r10g10b10a2_unorm(nir_builder *b, nir_def *color)
+{
+ nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ p1 = nir_pack_2x16_to_unorm_2x10_v3d(b, p1);
+
+ nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = nir_pack_2x16_to_unorm_10_2_v3d(b, p2);
+
+ return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
+}
+
+enum hw_conversion {
+ NONE,
+ TO_SNORM,
+ TO_UNORM
+};
+
+static inline nir_def *
+pack_8bit(nir_builder *b, nir_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ /* Note that usually you should not use this method (which relies on
+ * custom packing) for 1 component if we are not doing any
+ * conversion. But we also support that case, and let the caller
+ * decide which method to use.
+ */
+ nir_def *p1;
+ nir_def *p2;
+
+ if (conversion == NONE) {
+ p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ } else {
+ p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ p1 = (conversion == TO_UNORM) ?
+ nir_pack_2x16_to_unorm_2x8_v3d(b, p1) :
+ nir_pack_2x16_to_snorm_2x8_v3d(b, p1);
+ }
+ if (num_components == 4) {
+ if (conversion == NONE) {
+ p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ } else {
+ p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = (conversion == TO_UNORM) ?
+ nir_pack_2x16_to_unorm_2x8_v3d(b, p2) :
+ nir_pack_2x16_to_snorm_2x8_v3d(b, p2);
+ }
+ } else {
+ /* Using an undef here would be more correct. But for this
+ * case we are getting worse shader-db values with some CTS
+ * tests, so we just reuse the first packing.
+ */
+ p2 = p1;
+ }
+
+ return nir_pack_4x16_to_4x8_v3d(b, p1, p2);
+}
+
+static inline nir_def *
+pack_16bit(nir_builder *b, nir_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ nir_def *results[2] = {0};
+ nir_def *channels[4] = {0};
+
+ for (unsigned i = 0; i < num_components; i++) {
+ channels[i] = nir_channel(b, color, i);
+ switch (conversion) {
+ case TO_SNORM:
+ channels[i] = nir_f2snorm_16_v3d(b, channels[i]);
+ break;
+ case TO_UNORM:
+ channels[i] = nir_f2unorm_16_v3d(b, channels[i]);
+ break;
+ default:
+ /* Note that usually you should not use this method
+ * (which relies on custom packing) if we are not doing
+ * any conversion. But we also support that case, and
+ * let the caller decide which method to use.
+ */
+ break;
+ }
+ }
+
+ switch (num_components) {
+ case 1:
+ results[0] = channels[0];
+ break;
+ case 4:
+ results[1] = nir_pack_2x32_to_2x16_v3d(b, channels[2], channels[3]);
+ FALLTHROUGH;
+ case 2:
+ results[0] = nir_pack_2x32_to_2x16_v3d(b, channels[0], channels[1]);
+ break;
+ default:
+ unreachable("Invalid number of components");
+ }
+
+ return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
+}
+
+static inline nir_def *
+pack_xbit(nir_builder *b, nir_def *color,
+ unsigned num_components,
+ const struct util_format_channel_description *r_chan)
+{
+ bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
+ enum hw_conversion conversion = NONE;
+ if (r_chan->normalized) {
+ conversion =
+ (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
+ }
+
+ switch (r_chan->size) {
+ case 8:
+ if (conversion == NONE && num_components < 2)
+ return pack_bits(b, color, bits_8, num_components, pack_mask);
+ else
+ return pack_8bit(b, color, num_components, conversion);
+ break;
+ case 16:
+ /* pack_mask implies that the generic packing method would
+ * need to include extra operations to handle negative values,
+ * so in that case, even without a conversion, it is better to
+ * use the packing based on custom hw operations.
+ */
+ if (conversion == NONE && !pack_mask)
+ return pack_bits(b, color, bits_16, num_components, pack_mask);
+ else
+ return pack_16bit(b, color, num_components, conversion);
+ break;
+ default:
+ unreachable("unrecognized bits");
+ }
+}
+
+static bool
+v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr)
{
enum pipe_format format = nir_intrinsic_format(instr);
+ assert(format != PIPE_FORMAT_NONE);
const struct util_format_description *desc =
util_format_description(format);
const struct util_format_channel_description *r_chan = &desc->channel[0];
@@ -95,10 +285,10 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *color = nir_channels(b,
- nir_ssa_for_src(b, instr->src[3], 4),
- (1 << num_components) - 1);
- nir_ssa_def *formatted = NULL;
+ nir_def *color = nir_trim_vector(b,
+ instr->src[3].ssa,
+ num_components);
+ nir_def *formatted = NULL;
if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
formatted = nir_format_pack_11f11f10f(b, color);
@@ -110,9 +300,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
*/
formatted = color;
} else {
- static const unsigned bits_8[4] = {8, 8, 8, 8};
- static const unsigned bits_16[4] = {16, 16, 16, 16};
- static const unsigned bits_1010102[4] = {10, 10, 10, 2};
const unsigned *bits;
switch (r_chan->size) {
@@ -132,11 +319,13 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
bool pack_mask = false;
if (r_chan->pure_integer &&
r_chan->type == UTIL_FORMAT_TYPE_SIGNED) {
- formatted = nir_format_clamp_sint(b, color, bits);
+ /* We don't need to do any conversion or clamping in this case */
+ formatted = color;
pack_mask = true;
} else if (r_chan->pure_integer &&
r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) {
- formatted = nir_format_clamp_uint(b, color, bits);
+ /* We don't need to do any conversion or clamping in this case */
+ formatted = color;
} else if (r_chan->normalized &&
r_chan->type == UTIL_FORMAT_TYPE_SIGNED) {
formatted = nir_format_float_to_snorm(b, color, bits);
@@ -154,75 +343,116 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
pack_mask);
}
- nir_instr_rewrite_src(&instr->instr, &instr->src[3],
- nir_src_for_ssa(formatted));
+ nir_src_rewrite(&instr->src[3], formatted);
instr->num_components = formatted->num_components;
+
+ return true;
}
-static void
+
+static bool
+v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ enum pipe_format format = nir_intrinsic_format(instr);
+ assert(format != PIPE_FORMAT_NONE);
+ const struct util_format_description *desc =
+ util_format_description(format);
+ const struct util_format_channel_description *r_chan = &desc->channel[0];
+ unsigned num_components = util_format_get_nr_components(format);
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_def *color =
+ nir_trim_vector(b, instr->src[3].ssa, num_components);
+ nir_def *formatted = NULL;
+ if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ formatted = nir_format_pack_r9g9b9e5(b, color);
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ formatted = pack_11f11f10f(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) {
+ formatted = pack_r10g10b10a2_uint(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) {
+ formatted = pack_r10g10b10a2_unorm(b, color);
+ } else if (r_chan->size == 32) {
+ /* For 32-bit formats, we just have to move the vector
+ * across (possibly reducing the number of channels).
+ */
+ formatted = color;
+ } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) {
+ assert(r_chan->size == 16);
+ formatted = nir_format_float_to_half(b, color);
+ formatted = pack_bits(b, formatted, bits_16, num_components,
+ false);
+ } else {
+ assert(r_chan->size == 8 || r_chan->size == 16);
+ formatted = pack_xbit(b, color, num_components, r_chan);
+ }
+
+ nir_src_rewrite(&instr->src[3], formatted);
+ instr->num_components = formatted->num_components;
+
+ return true;
+}
+
+static bool
v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
{
static const unsigned bits16[] = {16, 16, 16, 16};
enum pipe_format format = nir_intrinsic_format(instr);
if (v3d_gl_format_is_return_32(format))
- return;
+ return false;
b->cursor = nir_after_instr(&instr->instr);
- assert(instr->dest.is_ssa);
- nir_ssa_def *result = &instr->dest.ssa;
+ nir_def *result = &instr->def;
if (util_format_is_pure_uint(format)) {
result = nir_format_unpack_uint(b, result, bits16, 4);
} else if (util_format_is_pure_sint(format)) {
result = nir_format_unpack_sint(b, result, bits16, 4);
} else {
- nir_ssa_def *rg = nir_channel(b, result, 0);
- nir_ssa_def *ba = nir_channel(b, result, 1);
- result = nir_vec4(b,
- nir_unpack_half_2x16_split_x(b, rg),
- nir_unpack_half_2x16_split_y(b, rg),
- nir_unpack_half_2x16_split_x(b, ba),
- nir_unpack_half_2x16_split_y(b, ba));
+ nir_def *rg = nir_channel(b, result, 0);
+ nir_def *ba = nir_channel(b, result, 1);
+ result = nir_vec4(b,
+ nir_unpack_half_2x16_split_x(b, rg),
+ nir_unpack_half_2x16_split_y(b, rg),
+ nir_unpack_half_2x16_split_x(b, ba),
+ nir_unpack_half_2x16_split_y(b, ba));
}
- nir_ssa_def_rewrite_uses_after(&instr->dest.ssa, result,
+ nir_def_rewrite_uses_after(&instr->def, result,
result->parent_instr);
+
+ return true;
}
-void
-v3d_nir_lower_image_load_store(nir_shader *s)
+static bool
+v3d_nir_lower_image_load_store_cb(nir_builder *b,
+ nir_intrinsic_instr *intr,
+ void *_state)
{
- nir_foreach_function(function, s) {
- if (!function->impl)
- continue;
-
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *intr =
- nir_instr_as_intrinsic(instr);
-
- switch (intr->intrinsic) {
- case nir_intrinsic_image_load:
- v3d_nir_lower_image_load(&b, intr);
- break;
- case nir_intrinsic_image_store:
- v3d_nir_lower_image_store(&b, intr);
- break;
- default:
- break;
- }
- }
- }
+ struct v3d_compile *c = (struct v3d_compile *) _state;
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_image_load:
+ return v3d_nir_lower_image_load(b, intr);
+ case nir_intrinsic_image_store:
+ if (c->devinfo->ver >= 71)
+ return v3d_nir_lower_image_store_v71(b, intr);
+ else
+ return v3d_nir_lower_image_store_v42(b, intr);
+ break;
+ default:
+ return false;
}
+
+ return false;
+}
+
+bool
+v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
+{
+ return nir_shader_intrinsics_pass(s,
+ v3d_nir_lower_image_load_store_cb,
+ nir_metadata_block_index |
+ nir_metadata_dominance, c);
}
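As a hedged illustration of what the generic pack_bits() helper above computes for an RGBA8 image store, the same masking and shifting can be written with plain integer ops; this standalone C snippet is not part of the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned bits[4] = {8, 8, 8, 8};
        const uint32_t color[4] = {0x12, 0x34, 0x56, 0x78};

        uint32_t packed = 0;
        unsigned offset = 0;
        for (int i = 0; i < 4; i++) {
                /* Mask each channel to its bit width and OR it in at the
                 * current bit offset, as pack_bits() does with NIR ops. */
                packed |= (color[i] & ((1u << bits[i]) - 1)) << offset;
                offset += bits[i];
        }
        printf("packed = 0x%08x\n", packed); /* prints 0x78563412 */
        return 0;
}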
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
index 895b1a39163..55e2e4f2e11 100644
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -24,8 +24,6 @@
#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"
-#include "util/u_helpers.h"
-
/**
* Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
* intrinsics into something amenable to the V3D architecture.
@@ -64,7 +62,7 @@ struct v3d_nir_lower_io_state {
BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_ANY_STAGE_INPUTS)];
- nir_ssa_def *pos[4];
+ nir_def *pos[4];
};
static void
@@ -72,8 +70,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
struct v3d_nir_lower_io_state *state);
static void
-v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
- nir_ssa_def *chan)
+v3d_nir_store_output(nir_builder *b, int base, nir_def *offset,
+ nir_def *chan)
{
if (offset) {
/* When generating the VIR instruction, the base and the offset
@@ -90,29 +88,6 @@ v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *offset,
nir_store_output(b, chan, offset, .base = base, .write_mask = 0x1, .component = 0);
}
-/* Convert the uniform offset to bytes. If it happens to be a constant,
- * constant-folding will clean up the shift for us.
- */
-static void
-v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
- nir_intrinsic_instr *intr)
-{
- /* On SPIR-V/Vulkan we are already getting our offsets in
- * bytes.
- */
- if (c->key->environment == V3D_ENVIRONMENT_VULKAN)
- return;
-
- b->cursor = nir_before_instr(&intr->instr);
-
- nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16);
-
- nir_instr_rewrite_src(&intr->instr,
- &intr->src[0],
- nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
- nir_imm_int(b, 4))));
-}
-
static int
v3d_varying_slot_vpm_offset(struct v3d_compile *c, unsigned location, unsigned component)
{
@@ -159,14 +134,13 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
/* If this is a geometry shader we need to emit our outputs
* to the current vertex offset in the VPM.
*/
- nir_ssa_def *offset_reg =
+ nir_def *offset_reg =
c->s->info.stage == MESA_SHADER_GEOMETRY ?
nir_load_var(b, state->gs.output_offset_var) : NULL;
int start_comp = nir_intrinsic_component(intr);
unsigned location = nir_intrinsic_io_semantics(intr).location;
- nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
- intr->num_components);
+ nir_def *src = intr->src[0].ssa;
/* Save off the components of the position for the setup of VPM inputs
* read by fixed function HW.
*/
@@ -184,8 +158,8 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
if (location == VARYING_SLOT_LAYER) {
assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
- nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
- header = nir_iand(b, header, nir_imm_int(b, 0xff00ffff));
+ nir_def *header = nir_load_var(b, state->gs.header_var);
+ header = nir_iand_imm(b, header, 0xff00ffff);
/* From the GLES 3.2 spec:
*
@@ -205,24 +179,26 @@ v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
* to 0 in that case (we always allocate tile state for at
* least one layer).
*/
- nir_ssa_def *fb_layers = nir_load_fb_layers_v3d(b, 32);
- nir_ssa_def *cond = nir_ige(b, src, fb_layers);
- nir_ssa_def *layer_id =
+ nir_def *fb_layers = nir_load_fb_layers_v3d(b, 32);
+ nir_def *cond = nir_ige(b, src, fb_layers);
+ nir_def *layer_id =
nir_bcsel(b, cond,
nir_imm_int(b, 0),
- nir_ishl(b, src, nir_imm_int(b, 16)));
+ nir_ishl_imm(b, src, 16));
header = nir_ior(b, header, layer_id);
nir_store_var(b, state->gs.header_var, header, 0x1);
}
/* Scalarize outputs if it hasn't happened already, since we want to
- * schedule each VPM write individually. We can skip any outut
+ * schedule each VPM write individually. We can skip any output
* components not read by the FS.
*/
for (int i = 0; i < intr->num_components; i++) {
int vpm_offset =
v3d_varying_slot_vpm_offset(c, location, start_comp + i);
+ if (!(nir_intrinsic_write_mask(intr) & (1 << i)))
+ continue;
if (vpm_offset == -1)
continue;
@@ -261,9 +237,9 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
{
b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *header = nir_load_var(b, state->gs.header_var);
- nir_ssa_def *header_offset = nir_load_var(b, state->gs.header_offset_var);
- nir_ssa_def *output_offset = nir_load_var(b, state->gs.output_offset_var);
+ nir_def *header = nir_load_var(b, state->gs.header_var);
+ nir_def *header_offset = nir_load_var(b, state->gs.header_offset_var);
+ nir_def *output_offset = nir_load_var(b, state->gs.output_offset_var);
/* Emit fixed function outputs */
v3d_nir_emit_ff_vpm_outputs(c, b, state);
@@ -273,13 +249,13 @@ v3d_nir_lower_emit_vertex(struct v3d_compile *c, nir_builder *b,
/* Update VPM offset for next vertex output data and header */
output_offset =
- nir_iadd(b, output_offset,
- nir_imm_int(b, state->gs.output_vertex_data_size));
+ nir_iadd_imm(b, output_offset,
+ state->gs.output_vertex_data_size);
- header_offset = nir_iadd(b, header_offset, nir_imm_int(b, 1));
+ header_offset = nir_iadd_imm(b, header_offset, 1);
/* Reset the New Primitive bit */
- header = nir_iand(b, header, nir_imm_int(b, 0xfffffffe));
+ header = nir_iand_imm(b, header, 0xfffffffe);
nir_store_var(b, state->gs.output_offset_var, output_offset, 0x1);
nir_store_var(b, state->gs.header_offset_var, header_offset, 0x1);
@@ -304,7 +280,7 @@ v3d_nir_lower_end_primitive(struct v3d_compile *c, nir_builder *b,
* doesn't provide means to do that, so we need to apply the swizzle in the
* vertex shader.
*
- * This is required at least in Vulkan to support madatory vertex attribute
+ * This is required at least in Vulkan to support mandatory vertex attribute
* format VK_FORMAT_B8G8R8A8_UNORM.
*/
static void
@@ -327,59 +303,6 @@ v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b,
nir_intrinsic_set_component(instr, (comp + 2) % 4);
}
-/* Sometimes the origin of gl_PointCoord is in the upper left rather than the
- * lower left so we need to flip it.
- *
- * This is needed for Vulkan, Gallium uses lower_wpos_pntc.
- */
-static void
-v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b,
- nir_intrinsic_instr *intr)
-{
- assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
-
- /* Gallium uses lower_wpos_pntc */
- if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
- return;
-
- b->cursor = nir_after_instr(&intr->instr);
-
- int comp = nir_intrinsic_component(intr);
-
- nir_variable *input_var =
- nir_find_variable_with_driver_location(c->s,
- nir_var_shader_in,
- nir_intrinsic_base(intr));
-
- if (input_var && util_varying_is_point_coord(input_var->data.location,
- c->fs_key->point_sprite_mask)) {
- assert(intr->num_components == 1);
-
- nir_ssa_def *result = &intr->dest.ssa;
-
- switch (comp) {
- case 0:
- case 1:
- if (!c->fs_key->is_points)
- result = nir_imm_float(b, 0.0);
- break;
- case 2:
- result = nir_imm_float(b, 0.0);
- break;
- case 3:
- result = nir_imm_float(b, 1.0);
- break;
- }
- if (c->fs_key->point_coord_upper_left && comp == 1)
- result = nir_fsub(b, nir_imm_float(b, 1.0), result);
- if (result != &intr->dest.ssa) {
- nir_ssa_def_rewrite_uses_after(&intr->dest.ssa,
- result,
- result->parent_instr);
- }
- }
-}
-
static void
v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
struct nir_instr *instr,
@@ -393,12 +316,6 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
case nir_intrinsic_load_input:
if (c->s->info.stage == MESA_SHADER_VERTEX)
v3d_nir_lower_vertex_input(c, b, intr);
- else if (c->s->info.stage == MESA_SHADER_FRAGMENT)
- v3d_nir_lower_fragment_input(c, b, intr);
- break;
-
- case nir_intrinsic_load_uniform:
- v3d_nir_lower_uniform(c, b, intr);
break;
case nir_intrinsic_store_output:
@@ -558,16 +475,16 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
/* If this is a geometry shader we need to emit our fixed function
* outputs to the current vertex offset in the VPM.
*/
- nir_ssa_def *offset_reg =
+ nir_def *offset_reg =
c->s->info.stage == MESA_SHADER_GEOMETRY ?
nir_load_var(b, state->gs.output_offset_var) : NULL;
for (int i = 0; i < 4; i++) {
if (!state->pos[i])
- state->pos[i] = nir_ssa_undef(b, 1, 32);
+ state->pos[i] = nir_undef(b, 1, 32);
}
- nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]);
+ nir_def *rcp_wc = nir_frcp(b, state->pos[3]);
if (state->pos_vpm_offset != -1) {
for (int i = 0; i < 4; i++) {
@@ -578,8 +495,8 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
if (state->vp_vpm_offset != -1) {
for (int i = 0; i < 2; i++) {
- nir_ssa_def *pos;
- nir_ssa_def *scale;
+ nir_def *pos;
+ nir_def *scale;
pos = state->pos[i];
if (i == 0)
scale = nir_load_viewport_x_scale(b);
@@ -598,14 +515,18 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
* The correct fix for this as recommended by Broadcom
* is to convert to .8 fixed-point with ffloor().
*/
- pos = nir_f2i32(b, nir_ffloor(b, pos));
- v3d_nir_store_output(b, state->vp_vpm_offset + i,
- offset_reg, pos);
+ if (c->devinfo->ver == 42)
+ pos = nir_f2i32(b, nir_ffloor(b, pos));
+ else
+ pos = nir_f2i32(b, nir_fround_even(b, pos));
+
+ v3d_nir_store_output(b, state->vp_vpm_offset + i,
+ offset_reg, pos);
}
}
if (state->zs_vpm_offset != -1) {
- nir_ssa_def *z = state->pos[2];
+ nir_def *z = state->pos[2];
z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
z = nir_fmul(b, z, rcp_wc);
z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
@@ -679,21 +600,22 @@ emit_gs_vpm_output_header_prolog(struct v3d_compile *c, nir_builder *b,
* have a variable just to keep track of the number of vertices we
* emitted and instead we can just compute it here from the header
* offset variable by removing the one generic header slot that always
- * goes at the begining of out header.
+ * goes at the beginning of our header.
*/
- nir_ssa_def *header_offset =
+ nir_def *header_offset =
nir_load_var(b, state->gs.header_offset_var);
- nir_ssa_def *vertex_count =
- nir_isub(b, header_offset, nir_imm_int(b, 1));
- nir_ssa_def *header =
- nir_ior(b, nir_imm_int(b, state->gs.output_header_size),
- nir_ishl(b, vertex_count,
- nir_imm_int(b, VERTEX_COUNT_OFFSET)));
+ nir_def *vertex_count =
+ nir_iadd_imm(b, header_offset, -1);
+ nir_def *header =
+ nir_ior_imm(b,
+ nir_ishl_imm(b, vertex_count,
+ VERTEX_COUNT_OFFSET),
+ state->gs.output_header_size);
v3d_nir_store_output(b, 0, NULL, header);
}
-void
+bool
v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
{
struct v3d_nir_lower_io_state state = { 0 };
@@ -713,36 +635,39 @@ v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
unreachable("Unsupported shader stage");
}
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- if (c->s->info.stage == MESA_SHADER_GEOMETRY)
- emit_gs_prolog(c, &b, function->impl, &state);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block)
- v3d_nir_lower_io_instr(c, &b, instr,
- &state);
- }
-
- nir_block *last = nir_impl_last_block(function->impl);
- b.cursor = nir_after_block(last);
- if (s->info.stage == MESA_SHADER_VERTEX) {
- v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
- } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
- emit_gs_vpm_output_header_prolog(c, &b, &state);
- }
-
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ nir_foreach_function_impl(impl, s) {
+ nir_builder b = nir_builder_create(impl);
+
+ if (c->s->info.stage == MESA_SHADER_GEOMETRY)
+ emit_gs_prolog(c, &b, impl, &state);
+
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block)
+ v3d_nir_lower_io_instr(c, &b, instr,
+ &state);
}
+
+ nir_block *last = nir_impl_last_block(impl);
+ b.cursor = nir_after_block(last);
+ if (s->info.stage == MESA_SHADER_VERTEX) {
+ v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
+ } else if (s->info.stage == MESA_SHADER_GEOMETRY) {
+ emit_gs_vpm_output_header_prolog(c, &b, &state);
+ }
+
+ nir_metadata_preserve(impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
}
if (s->info.stage == MESA_SHADER_VERTEX ||
s->info.stage == MESA_SHADER_GEOMETRY) {
v3d_nir_lower_io_update_output_var_base(c, &state);
}
+
+ /* It is really unlikely that we don't get progress here, and fully
+ * filtering when not would make code more complex, but we are still
+ * interested on getting this lowering going through NIR_PASS
+ */
+ return true;
}
diff --git a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
index 8f6e7d4e648..05b5224bc52 100644
--- a/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
+++ b/src/broadcom/compiler/v3d_nir_lower_line_smooth.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2020 Raspberry Pi
+ * Copyright © 2020 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -42,25 +42,23 @@ lower_line_smooth_intrinsic(struct lower_line_smooth_state *state,
{
b->cursor = nir_before_instr(&intr->instr);
- nir_ssa_def *one = nir_imm_float(b, 1.0f);
+ nir_def *one = nir_imm_float(b, 1.0f);
- nir_ssa_def *coverage = nir_load_var(b, state->coverage);
+ nir_def *coverage = nir_load_var(b, state->coverage);
- nir_ssa_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage),
+ nir_def *new_val = nir_fmul(b, nir_vec4(b, one, one, one, coverage),
intr->src[0].ssa);
- nir_instr_rewrite_src(&intr->instr,
- &intr->src[0],
- nir_src_for_ssa(new_val));
+ nir_src_rewrite(&intr->src[0], new_val);
}
-static void
+static bool
lower_line_smooth_func(struct lower_line_smooth_state *state,
nir_function_impl *impl)
{
- nir_builder b;
+ bool progress = false;
- nir_builder_init(&b, impl);
+ nir_builder b = nir_builder_create(impl);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
@@ -72,58 +70,66 @@ lower_line_smooth_func(struct lower_line_smooth_state *state,
if (intr->intrinsic != nir_intrinsic_store_output ||
nir_intrinsic_base(intr) != 0 ||
- intr->num_components != 4 ||
- !intr->src[0].is_ssa)
+ intr->num_components != 4)
continue;
lower_line_smooth_intrinsic(state, &b, intr);
+ progress = true;
}
}
+
+ return progress;
}
static void
initialise_coverage_var(struct lower_line_smooth_state *state,
nir_function_impl *impl)
{
- nir_builder b;
-
- nir_builder_init(&b, impl);
+ nir_builder b = nir_builder_at(nir_before_impl(impl));
- b.cursor = nir_before_block(nir_start_block(impl));
+ nir_def *line_width = nir_load_line_width(&b);
- nir_ssa_def *line_width = nir_load_line_width(&b);
+ nir_def *real_line_width = nir_load_aa_line_width(&b);
- nir_ssa_def *real_line_width = nir_load_aa_line_width(&b);
-
- /* The line coord varies from 0.0 to 1.0 across the width of the line */
- nir_ssa_def *line_coord = nir_load_line_coord(&b);
+ /* According to the PRM, the line coord varies from 0.0 to 1.0 across
+ * the width of the line. But actually, when a perspective projection
+ * is used, it is also applied to the line coords, so the values end
+ * up being between [min_coord, 1], based on the Wc coordinate. We
+ * need to re-map the values to be between [0.0, 1.0].
+ */
+ nir_def *line_coord = nir_load_line_coord(&b);
+ nir_def *wc = nir_load_fep_w_v3d(&b, 32);
+ nir_def *min_coord_val = nir_fsub(&b, nir_imm_float(&b, 1.0f), wc);
+ nir_def *normalized_line_coord = nir_fdiv(&b,
+ nir_fsub(&b, line_coord, min_coord_val),
+ nir_fsub_imm(&b, 1.0, min_coord_val));
/* fabs(line_coord - 0.5) * real_line_width */
- nir_ssa_def *pixels_from_center =
+ nir_def *pixels_from_center =
nir_fmul(&b, real_line_width,
- nir_fabs(&b, nir_fsub(&b, line_coord,
+ nir_fabs(&b, nir_fsub(&b, normalized_line_coord,
nir_imm_float(&b, 0.5f))));
/* 0.5 - 1/√2 * (pixels_from_center - line_width * 0.5) */
- nir_ssa_def *coverage =
+ nir_def *coverage =
nir_fsub(&b,
nir_imm_float(&b, 0.5f),
nir_fmul(&b,
nir_imm_float(&b, 1.0f / M_SQRT2),
nir_fsub(&b, pixels_from_center,
- nir_fmul(&b,
- line_width,
- nir_imm_float(&b, 0.5f)))));
+ nir_fmul_imm(&b,
+ line_width,
+ 0.5f))));
/* Discard fragments that aren’t covered at all by the line */
- nir_ssa_def *outside = nir_fge(&b, nir_imm_float(&b, 0.0f), coverage);
+ nir_def *outside = nir_fle_imm(&b, coverage, 0.0f);
nir_discard_if(&b, outside);
/* Clamp to at most 1.0. If it was less than 0.0 then the fragment will
* be discarded so we don’t need to handle that.
*/
- nir_ssa_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f));
+ nir_def *clamped = nir_fmin(&b, coverage, nir_imm_float(&b, 1.0f));
nir_store_var(&b, state->coverage, clamped, 0x1 /* writemask */);
}
@@ -140,9 +146,11 @@ make_coverage_var(nir_shader *s)
return var;
}
-void
+bool
v3d_nir_lower_line_smooth(nir_shader *s)
{
+ bool progress = false;
+
assert(s->info.stage == MESA_SHADER_FRAGMENT);
struct lower_line_smooth_state state = {
@@ -150,10 +158,20 @@ v3d_nir_lower_line_smooth(nir_shader *s)
.coverage = make_coverage_var(s),
};
- nir_foreach_function(function, s) {
+ nir_foreach_function_with_impl(function, impl, s) {
if (function->is_entrypoint)
- initialise_coverage_var(&state, function->impl);
+ initialise_coverage_var(&state, impl);
+
+ progress |= lower_line_smooth_func(&state, impl);
- lower_line_smooth_func(&state, function->impl);
+ if (progress) {
+ nir_metadata_preserve(impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
+ } else {
+ nir_metadata_preserve(impl, nir_metadata_all);
+ }
}
+
+ return progress;
}
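As a hedged numeric check of the smooth-line coverage formula used above, coverage = 0.5 - (1/sqrt(2)) * (pixels_from_center - line_width * 0.5); the line widths below are made-up values, and the snippet is not part of the patch:

#include <math.h>
#include <stdio.h>

int main(void)
{
        const float line_width = 1.0f;      /* application line width */
        const float real_line_width = 3.0f; /* width grown for the AA filter */

        for (int i = 0; i <= 4; i++) {
                float line_coord = i * 0.25f;
                float pixels_from_center =
                        real_line_width * fabsf(line_coord - 0.5f);
                float coverage = 0.5f - (float)(1.0 / sqrt(2.0)) *
                                 (pixels_from_center - line_width * 0.5f);
                printf("line_coord %.2f -> coverage %.3f%s\n", line_coord,
                       fminf(coverage, 1.0f),
                       coverage <= 0.0f ? " (discarded)" : "");
        }
        return 0;
}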
diff --git a/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
new file mode 100644
index 00000000000..0caf5dbc92c
--- /dev/null
+++ b/src/broadcom/compiler/v3d_nir_lower_load_store_bitsize.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright © 2021 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/v3d_compiler.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * The V3D TMU unit can only do 32-bit general vector access so for anything
+ * else we need to split vector load/store instructions to scalar.
+ *
+ * Note that a vectorization pass after this lowering may be able to
+ * re-vectorize some of these using 32-bit load/store instructions instead,
+ * which we do support.
+ */
+
+static int
+value_src(nir_intrinsic_op intrinsic)
+{
+ switch (intrinsic) {
+ case nir_intrinsic_store_ssbo:
+ case nir_intrinsic_store_scratch:
+ case nir_intrinsic_store_global_2x32:
+ return 0;
+ default:
+ unreachable("Unsupported intrinsic");
+ }
+}
+
+static int
+offset_src(nir_intrinsic_op intrinsic)
+{
+ switch (intrinsic) {
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_shared:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_global_2x32:
+ return 0;
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_store_scratch:
+ case nir_intrinsic_store_global_2x32:
+ return 1;
+ case nir_intrinsic_store_ssbo:
+ return 2;
+ default:
+ unreachable("Unsupported intrinsic");
+ }
+}
+
+static nir_intrinsic_instr *
+init_scalar_intrinsic(nir_builder *b,
+ nir_intrinsic_instr *intr,
+ uint32_t component,
+ nir_def *offset,
+ uint32_t bit_size,
+ nir_def **scalar_offset)
+{
+
+ nir_intrinsic_instr *new_intr =
+ nir_intrinsic_instr_create(b->shader, intr->intrinsic);
+
+ nir_intrinsic_copy_const_indices(new_intr, intr);
+
+ const int offset_units = bit_size / 8;
+ assert(offset_units >= 1);
+
+ if (nir_intrinsic_has_align_mul(intr)) {
+ assert(nir_intrinsic_has_align_offset(intr));
+ unsigned align_mul = nir_intrinsic_align_mul(intr);
+ unsigned align_off = nir_intrinsic_align_offset(intr);
+
+ align_off += offset_units * component;
+ align_off = align_off % align_mul;
+
+ nir_intrinsic_set_align(new_intr, align_mul, align_off);
+ }
+
+ *scalar_offset = offset;
+ unsigned offset_adj = offset_units * component;
+ if (nir_intrinsic_has_base(intr)) {
+ nir_intrinsic_set_base(
+ new_intr, nir_intrinsic_base(intr) + offset_adj);
+ } else {
+ *scalar_offset =
+ nir_iadd(b, offset,
+ nir_imm_intN_t(b, offset_adj,
+ offset->bit_size));
+ }
+
+ new_intr->num_components = 1;
+
+ return new_intr;
+}
+
+static bool
+lower_load_bitsize(nir_builder *b,
+ nir_intrinsic_instr *intr)
+{
+ uint32_t bit_size = intr->def.bit_size;
+ if (bit_size == 32)
+ return false;
+
+ /* No need to split if it is already scalar */
+ int num_comp = nir_intrinsic_dest_components(intr);
+ if (num_comp <= 1)
+ return false;
+
+ b->cursor = nir_before_instr(&intr->instr);
+
+ /* For global 2x32 we ignore Y component because it must be zero */
+ unsigned offset_idx = offset_src(intr->intrinsic);
+ nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);
+
+ /* Split the vector load into multiple scalar loads */
+ nir_def *dest_components[4] = { NULL };
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+ for (int component = 0; component < num_comp; component++) {
+ nir_def *scalar_offset;
+ nir_intrinsic_instr *new_intr =
+ init_scalar_intrinsic(b, intr, component, offset,
+ bit_size, &scalar_offset);
+
+ for (unsigned i = 0; i < info->num_srcs; i++) {
+ if (i == offset_idx) {
+ nir_def *final_offset;
+ final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ?
+ scalar_offset :
+ nir_vec2(b, scalar_offset,
+ nir_imm_int(b, 0));
+ new_intr->src[i] = nir_src_for_ssa(final_offset);
+ } else {
+ new_intr->src[i] = intr->src[i];
+ }
+ }
+
+ nir_def_init(&new_intr->instr, &new_intr->def, 1,
+ bit_size);
+ dest_components[component] = &new_intr->def;
+
+ nir_builder_instr_insert(b, &new_intr->instr);
+ }
+
+ nir_def *new_dst = nir_vec(b, dest_components, num_comp);
+ nir_def_rewrite_uses(&intr->def, new_dst);
+
+ nir_instr_remove(&intr->instr);
+ return true;
+}
+
+static bool
+lower_store_bitsize(nir_builder *b,
+ nir_intrinsic_instr *intr)
+{
+ /* No need to split if it is already scalar */
+ int value_idx = value_src(intr->intrinsic);
+ int num_comp = nir_intrinsic_src_components(intr, value_idx);
+ if (num_comp <= 1)
+ return false;
+
+ /* No need to split if it is 32-bit */
+ if (nir_src_bit_size(intr->src[value_idx]) == 32)
+ return false;
+
+ nir_def *value = intr->src[value_idx].ssa;
+
+ b->cursor = nir_before_instr(&intr->instr);
+
+        /* For global 2x32 we ignore the Y component because it must be zero */
+ unsigned offset_idx = offset_src(intr->intrinsic);
+ nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);
+
+        /* Split the vector store into multiple scalar stores */
+ const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+ unsigned wrmask = nir_intrinsic_write_mask(intr);
+ while (wrmask) {
+ unsigned component = ffs(wrmask) - 1;
+
+ nir_def *scalar_offset;
+ nir_intrinsic_instr *new_intr =
+ init_scalar_intrinsic(b, intr, component, offset,
+ value->bit_size, &scalar_offset);
+
+ nir_intrinsic_set_write_mask(new_intr, 0x1);
+
+ for (unsigned i = 0; i < info->num_srcs; i++) {
+ if (i == value_idx) {
+ nir_def *scalar_value =
+ nir_channels(b, value, 1 << component);
+ new_intr->src[i] = nir_src_for_ssa(scalar_value);
+ } else if (i == offset_idx) {
+ nir_def *final_offset;
+ final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ?
+ scalar_offset :
+ nir_vec2(b, scalar_offset,
+ nir_imm_int(b, 0));
+ new_intr->src[i] = nir_src_for_ssa(final_offset);
+ } else {
+ new_intr->src[i] = intr->src[i];
+ }
+ }
+
+ nir_builder_instr_insert(b, &new_intr->instr);
+
+ wrmask &= ~(1 << component);
+ }
+
+ nir_instr_remove(&intr->instr);
+ return true;
+}
+
+static bool
+lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr,
+ void *data)
+{
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_uniform:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_global_2x32:
+ return lower_load_bitsize(b, intr);
+
+ case nir_intrinsic_store_ssbo:
+ case nir_intrinsic_store_scratch:
+ case nir_intrinsic_store_global_2x32:
+ return lower_store_bitsize(b, intr);
+
+ default:
+ return false;
+ }
+}
+
+bool
+v3d_nir_lower_load_store_bitsize(nir_shader *s)
+{
+ return nir_shader_intrinsics_pass(s, lower_load_store_bitsize,
+ nir_metadata_block_index |
+ nir_metadata_dominance,
+ NULL);
+}
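
For reference, here is a minimal standalone sketch (illustrative, not part of the patch) of the per-component offset and alignment arithmetic that init_scalar_intrinsic() performs when splitting a sub-32-bit vector access into scalars:

#include <assert.h>
#include <stdio.h>

/* Byte offset and alignment of the Nth scalar component of a sub-32-bit
 * vector access, mirroring the arithmetic in init_scalar_intrinsic(). */
static void
scalar_component_layout(unsigned bit_size, unsigned component,
                        unsigned align_mul, unsigned align_off,
                        unsigned *offset_adj, unsigned *new_align_off)
{
        const unsigned offset_units = bit_size / 8;
        assert(offset_units >= 1);
        *offset_adj = offset_units * component;
        *new_align_off = (align_off + *offset_adj) % align_mul;
}

int
main(void)
{
        /* Example: a 16-bit vec4 load with align_mul = 8, align_offset = 0. */
        for (unsigned c = 0; c < 4; c++) {
                unsigned adj, align;
                scalar_component_layout(16, c, 8, 0, &adj, &align);
                printf("component %u: +%u bytes, align_offset %u\n",
                       c, adj, align);
        }
        return 0;
}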
diff --git a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
index 11782c7348f..4affb79a7e2 100644
--- a/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
+++ b/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
@@ -36,8 +36,8 @@
#include "v3d_compiler.h"
-typedef nir_ssa_def *(*nir_pack_func)(nir_builder *b, nir_ssa_def *c);
-typedef nir_ssa_def *(*nir_unpack_func)(nir_builder *b, nir_ssa_def *c);
+typedef nir_def *(*nir_pack_func)(nir_builder *b, nir_def *c);
+typedef nir_def *(*nir_unpack_func)(nir_builder *b, nir_def *c);
static bool
logicop_depends_on_dst_color(int logicop_func)
@@ -53,9 +53,9 @@ logicop_depends_on_dst_color(int logicop_func)
}
}
-static nir_ssa_def *
+static nir_def *
v3d_logicop(nir_builder *b, int logicop_func,
- nir_ssa_def *src, nir_ssa_def *dst)
+ nir_def *src, nir_def *dst)
{
switch (logicop_func) {
case PIPE_LOGICOP_CLEAR:
@@ -96,8 +96,8 @@ v3d_logicop(nir_builder *b, int logicop_func,
}
}
-static nir_ssa_def *
-v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
+static nir_def *
+v3d_nir_get_swizzled_channel(nir_builder *b, nir_def **srcs, int swiz)
{
switch (swiz) {
default:
@@ -116,57 +116,57 @@ v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
}
}
-static nir_ssa_def *
-v3d_nir_swizzle_and_pack(nir_builder *b, nir_ssa_def **chans,
+static nir_def *
+v3d_nir_swizzle_and_pack(nir_builder *b, nir_def **chans,
const uint8_t *swiz, nir_pack_func pack_func)
{
- nir_ssa_def *c[4];
+ nir_def *c[4];
for (int i = 0; i < 4; i++)
c[i] = v3d_nir_get_swizzled_channel(b, chans, swiz[i]);
return pack_func(b, nir_vec4(b, c[0], c[1], c[2], c[3]));
}
-static nir_ssa_def *
-v3d_nir_unpack_and_swizzle(nir_builder *b, nir_ssa_def *packed,
+static nir_def *
+v3d_nir_unpack_and_swizzle(nir_builder *b, nir_def *packed,
const uint8_t *swiz, nir_unpack_func unpack_func)
{
- nir_ssa_def *unpacked = unpack_func(b, packed);
+ nir_def *unpacked = unpack_func(b, packed);
- nir_ssa_def *unpacked_chans[4];
+ nir_def *unpacked_chans[4];
for (int i = 0; i < 4; i++)
unpacked_chans[i] = nir_channel(b, unpacked, i);
- nir_ssa_def *c[4];
+ nir_def *c[4];
for (int i = 0; i < 4; i++)
c[i] = v3d_nir_get_swizzled_channel(b, unpacked_chans, swiz[i]);
return nir_vec4(b, c[0], c[1], c[2], c[3]);
}
-static nir_ssa_def *
-pack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
+static nir_def *
+pack_unorm_rgb10a2(nir_builder *b, nir_def *c)
{
static const unsigned bits[4] = { 10, 10, 10, 2 };
- nir_ssa_def *unorm = nir_format_float_to_unorm(b, c, bits);
+ nir_def *unorm = nir_format_float_to_unorm(b, c, bits);
- nir_ssa_def *chans[4];
+ nir_def *chans[4];
for (int i = 0; i < 4; i++)
chans[i] = nir_channel(b, unorm, i);
- nir_ssa_def *result = nir_mov(b, chans[0]);
+ nir_def *result = nir_mov(b, chans[0]);
int offset = bits[0];
for (int i = 1; i < 4; i++) {
- nir_ssa_def *shifted_chan =
- nir_ishl(b, chans[i], nir_imm_int(b, offset));
+ nir_def *shifted_chan =
+ nir_ishl_imm(b, chans[i], offset);
result = nir_ior(b, result, shifted_chan);
offset += bits[i];
}
return result;
}
-static nir_ssa_def *
-unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
+static nir_def *
+unpack_unorm_rgb10a2(nir_builder *b, nir_def *c)
{
static const unsigned bits[4] = { 10, 10, 10, 2 };
const unsigned masks[4] = { BITFIELD_MASK(bits[0]),
@@ -174,11 +174,11 @@ unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
BITFIELD_MASK(bits[2]),
BITFIELD_MASK(bits[3]) };
- nir_ssa_def *chans[4];
+ nir_def *chans[4];
for (int i = 0; i < 4; i++) {
- nir_ssa_def *unorm = nir_iand(b, c, nir_imm_int(b, masks[i]));
+ nir_def *unorm = nir_iand_imm(b, c, masks[i]);
chans[i] = nir_format_unorm_to_float(b, unorm, &bits[i]);
- c = nir_ushr(b, c, nir_imm_int(b, bits[i]));
+ c = nir_ushr_imm(b, c, bits[i]);
}
return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]);
@@ -201,13 +201,13 @@ v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt)
}
}
-static nir_ssa_def *
+static nir_def *
v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample)
{
uint32_t num_components =
util_format_get_nr_components(c->fs_key->color_fmt[rt].format);
- nir_ssa_def *color[4];
+ nir_def *color[4];
for (int i = 0; i < 4; i++) {
if (i < num_components) {
color[i] =
@@ -222,71 +222,68 @@ v3d_nir_get_tlb_color(nir_builder *b, struct v3d_compile *c, int rt, int sample)
return nir_vec4(b, color[0], color[1], color[2], color[3]);
}
-static nir_ssa_def *
+static nir_def *
v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b,
- nir_ssa_def **src_chans, nir_ssa_def **dst_chans,
+ nir_def **src_chans, nir_def **dst_chans,
int rt, int sample)
{
const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt);
- nir_ssa_def *op_res[4];
+ nir_def *op_res[4];
for (int i = 0; i < 4; i++) {
- nir_ssa_def *src = src_chans[i];
- nir_ssa_def *dst =
+ nir_def *src = src_chans[i];
+ nir_def *dst =
v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]);
op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst);
- /* In Vulkan we configure our integer RTs to clamp, so we need
- * to ignore result bits that don't fit in the destination RT
- * component size.
+ /* We configure our integer RTs to clamp, so we need to ignore
+ * result bits that don't fit in the destination RT component
+ * size.
*/
- if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {
- uint32_t bits =
- util_format_get_component_bits(
- c->fs_key->color_fmt[rt].format,
- UTIL_FORMAT_COLORSPACE_RGB, i);
- if (bits > 0 && bits < 32) {
- nir_ssa_def *mask =
- nir_imm_int(b, (1u << bits) - 1);
- op_res[i] = nir_iand(b, op_res[i], mask);
- }
+ uint32_t bits =
+ util_format_get_component_bits(
+ c->fs_key->color_fmt[rt].format,
+ UTIL_FORMAT_COLORSPACE_RGB, i);
+ if (bits > 0 && bits < 32) {
+ op_res[i] =
+ nir_iand_imm(b, op_res[i], (1u << bits) - 1);
}
}
- nir_ssa_def *r[4];
+ nir_def *r[4];
for (int i = 0; i < 4; i++)
r[i] = v3d_nir_get_swizzled_channel(b, op_res, fmt_swz[i]);
return nir_vec4(b, r[0], r[1], r[2], r[3]);
}
-static nir_ssa_def *
+static nir_def *
v3d_emit_logic_op_unorm(struct v3d_compile *c, nir_builder *b,
- nir_ssa_def **src_chans, nir_ssa_def **dst_chans,
+ nir_def **src_chans, nir_def **dst_chans,
int rt, int sample,
nir_pack_func pack_func, nir_unpack_func unpack_func)
{
static const uint8_t src_swz[4] = { 0, 1, 2, 3 };
- nir_ssa_def *packed_src =
+ nir_def *packed_src =
v3d_nir_swizzle_and_pack(b, src_chans, src_swz, pack_func);
const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt);
- nir_ssa_def *packed_dst =
+ nir_def *packed_dst =
v3d_nir_swizzle_and_pack(b, dst_chans, fmt_swz, pack_func);
- nir_ssa_def *packed_result =
+ nir_def *packed_result =
v3d_logicop(b, c->fs_key->logicop_func, packed_src, packed_dst);
return v3d_nir_unpack_and_swizzle(b, packed_result, fmt_swz, unpack_func);
}
-static nir_ssa_def *
+static nir_def *
v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b,
- nir_ssa_def *src, int rt, int sample)
+ nir_def *src, int rt, int sample)
{
- nir_ssa_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample);
+ nir_def *dst = v3d_nir_get_tlb_color(b, c, rt, sample);
- nir_ssa_def *src_chans[4], *dst_chans[4];
+ nir_def *src_chans[4], *dst_chans[4];
for (unsigned i = 0; i < 4; i++) {
src_chans[i] = nir_channel(b, src, i);
dst_chans[i] = nir_channel(b, dst, i);
@@ -309,7 +306,7 @@ v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b,
static void
v3d_emit_ms_output(nir_builder *b,
- nir_ssa_def *color, nir_src *offset,
+ nir_def *color, nir_src *offset,
nir_alu_type type, int rt, int sample)
{
nir_store_tlb_sample_color_v3d(b, color, nir_imm_int(b, rt), .base = sample, .component = 0, .src_type = type);
@@ -321,7 +318,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
nir_intrinsic_instr *intr,
int rt)
{
- nir_ssa_def *frag_color = intr->src[0].ssa;
+ nir_def *frag_color = intr->src[0].ssa;
const int logic_op = c->fs_key->logicop_func;
@@ -331,7 +328,7 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
nir_src *offset = &intr->src[1];
nir_alu_type type = nir_intrinsic_src_type(intr);
for (int i = 0; i < V3D_MAX_SAMPLES; i++) {
- nir_ssa_def *sample =
+ nir_def *sample =
v3d_nir_emit_logic_op(c, b, frag_color, rt, i);
v3d_emit_ms_output(b, sample, offset, type, rt, i);
@@ -339,11 +336,10 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
nir_instr_remove(&intr->instr);
} else {
- nir_ssa_def *result =
+ nir_def *result =
v3d_nir_emit_logic_op(c, b, frag_color, rt, 0);
- nir_instr_rewrite_src(&intr->instr, &intr->src[0],
- nir_src_for_ssa(result));
+ nir_src_rewrite(&intr->src[0], result);
intr->num_components = result->num_components;
}
}
@@ -351,6 +347,8 @@ v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
static bool
v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c)
{
+ bool progress = false;
+
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
@@ -384,35 +382,40 @@ v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c)
continue;
}
- nir_function_impl *impl =
- nir_cf_node_get_function(&block->cf_node);
- nir_builder b;
- nir_builder_init(&b, impl);
- b.cursor = nir_before_instr(&intr->instr);
+ nir_builder b = nir_builder_at(nir_before_instr(&intr->instr));
v3d_nir_lower_logic_op_instr(c, &b, intr, rt);
+
+ progress = true;
}
}
- return true;
+ return progress;
}
-void
+bool
v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c)
{
+ bool progress = false;
+
/* Nothing to do if logic op is 'copy src to dst' or if logic ops are
* disabled (we set the logic op to copy in that case).
*/
if (c->fs_key->logicop_func == PIPE_LOGICOP_COPY)
- return;
+ return false;
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_foreach_block(block, function->impl)
- v3d_nir_lower_logic_ops_block(block, c);
+ nir_foreach_function_impl(impl, s) {
+ nir_foreach_block(block, impl)
+ progress |= v3d_nir_lower_logic_ops_block(block, c);
- nir_metadata_preserve(function->impl,
+ if (progress) {
+ nir_metadata_preserve(impl,
nir_metadata_block_index |
nir_metadata_dominance);
+ } else {
+ nir_metadata_preserve(impl,
+ nir_metadata_all);
}
}
+
+ return progress;
}
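
As an aside (not part of the patch), the result clamping that replaces the Vulkan-only path above reduces to a per-component mask; a standalone sketch:

#include <stdio.h>

/* After applying the logic op, v3d_emit_logic_op_raw() masks each result to
 * the bit width of the render-target component, since the integer RTs are
 * configured to clamp. The masking boils down to this: */
static unsigned
mask_to_component_bits(unsigned value, unsigned bits)
{
        if (bits > 0 && bits < 32)
                value &= (1u << bits) - 1;
        return value;
}

int
main(void)
{
        unsigned src = 0x1ff, dst = 0x0f0;
        unsigned res = src ^ dst; /* PIPE_LOGICOP_XOR on one channel */
        printf("masked to 8 bits: 0x%x\n", mask_to_component_bits(res, 8));
        return 0;
}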
diff --git a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c b/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c
deleted file mode 100644
index 40f1cc23b1a..00000000000
--- a/src/broadcom/compiler/v3d_nir_lower_robust_buffer_access.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright © 2020 Raspberry Pi
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "compiler/v3d_compiler.h"
-#include "compiler/nir/nir_builder.h"
-
-static void
-rewrite_offset(nir_builder *b,
- nir_intrinsic_instr *instr,
- uint32_t buffer_idx,
- uint32_t offset_src,
- nir_intrinsic_op buffer_size_op)
-{
- b->cursor = nir_before_instr(&instr->instr);
-
- /* Get size of the buffer */
- nir_intrinsic_instr *size =
- nir_intrinsic_instr_create(b->shader, buffer_size_op);
- size->src[0] = nir_src_for_ssa(nir_imm_int(b, buffer_idx));
- nir_ssa_dest_init(&size->instr, &size->dest, 1, 32, NULL);
- nir_builder_instr_insert(b, &size->instr);
-
- /* All out TMU accesses are 32-bit aligned */
- nir_ssa_def *aligned_buffer_size =
- nir_iand(b, &size->dest.ssa, nir_imm_int(b, 0xfffffffc));
-
- /* Rewrite offset */
- nir_ssa_def *offset =
- nir_umin(b, instr->src[offset_src].ssa, aligned_buffer_size);
- nir_instr_rewrite_src(&instr->instr, &instr->src[offset_src],
- nir_src_for_ssa(offset));
-}
-
-static void
-lower_load(struct v3d_compile *c,
- nir_builder *b,
- nir_intrinsic_instr *instr)
-{
- uint32_t index = nir_src_comp_as_uint(instr->src[0], 0);
-
- nir_intrinsic_op op;
- if (instr->intrinsic == nir_intrinsic_load_ubo) {
- op = nir_intrinsic_get_ubo_size;
- if (c->key->environment == V3D_ENVIRONMENT_VULKAN)
- index--;
- } else {
- op = nir_intrinsic_get_ssbo_size;
- }
-
- rewrite_offset(b, instr, index, 1, op);
-}
-
-static void
-lower_store(struct v3d_compile *c,
- nir_builder *b,
- nir_intrinsic_instr *instr)
-{
- uint32_t index = nir_src_comp_as_uint(instr->src[1], 0);
- rewrite_offset(b, instr, index, 2, nir_intrinsic_get_ssbo_size);
-}
-
-static void
-lower_atomic(struct v3d_compile *c,
- nir_builder *b,
- nir_intrinsic_instr *instr)
-{
- uint32_t index = nir_src_comp_as_uint(instr->src[0], 0);
- rewrite_offset(b, instr, index, 1, nir_intrinsic_get_ssbo_size);
-}
-
-static void
-lower_shared(struct v3d_compile *c,
- nir_builder *b,
- nir_intrinsic_instr *instr)
-{
- b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *aligned_size =
- nir_imm_int(b, c->s->info.shared_size & 0xfffffffc);
- nir_ssa_def *offset = nir_umin(b, instr->src[0].ssa, aligned_size);
- nir_instr_rewrite_src(&instr->instr, &instr->src[0],
- nir_src_for_ssa(offset));
-}
-
-static void
-lower_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr)
-{
- if (instr->type != nir_instr_type_intrinsic)
- return;
- nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
- switch (intr->intrinsic) {
- case nir_intrinsic_load_ubo:
- case nir_intrinsic_load_ssbo:
- lower_load(c, b, intr);
- break;
- case nir_intrinsic_store_ssbo:
- lower_store(c, b, intr);
- break;
- case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_ssbo_atomic_comp_swap:
- lower_atomic(c, b, intr);
- break;
- case nir_intrinsic_load_shared:
- case nir_intrinsic_shared_atomic_add:
- case nir_intrinsic_shared_atomic_imin:
- case nir_intrinsic_shared_atomic_umin:
- case nir_intrinsic_shared_atomic_imax:
- case nir_intrinsic_shared_atomic_umax:
- case nir_intrinsic_shared_atomic_and:
- case nir_intrinsic_shared_atomic_or:
- case nir_intrinsic_shared_atomic_xor:
- case nir_intrinsic_shared_atomic_exchange:
- case nir_intrinsic_shared_atomic_comp_swap:
- lower_shared(c, b, intr);
- break;
- default:
- break;
- }
-}
-
-void
-v3d_nir_lower_robust_buffer_access(nir_shader *s, struct v3d_compile *c)
-{
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block)
- lower_instr(c, &b, instr);
- }
-
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
- }
- }
-}
diff --git a/src/broadcom/compiler/v3d_nir_lower_scratch.c b/src/broadcom/compiler/v3d_nir_lower_scratch.c
index 893b6f6ae28..93ed1bb6e26 100644
--- a/src/broadcom/compiler/v3d_nir_lower_scratch.c
+++ b/src/broadcom/compiler/v3d_nir_lower_scratch.c
@@ -34,11 +34,11 @@
* writemasks in the process.
*/
-static nir_ssa_def *
+static nir_def *
v3d_nir_scratch_offset(nir_builder *b, nir_intrinsic_instr *instr)
{
bool is_store = instr->intrinsic == nir_intrinsic_store_scratch;
- nir_ssa_def *offset = nir_ssa_for_src(b, instr->src[is_store ? 1 : 0], 1);
+ nir_def *offset = instr->src[is_store ? 1 : 0].ssa;
assert(nir_intrinsic_align_mul(instr) >= 4);
assert(nir_intrinsic_align_offset(instr) == 0);
@@ -55,18 +55,18 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr)
{
b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *offset = v3d_nir_scratch_offset(b,instr);
+ nir_def *offset = v3d_nir_scratch_offset(b,instr);
- nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS];
+ nir_def *chans[NIR_MAX_VEC_COMPONENTS];
for (int i = 0; i < instr->num_components; i++) {
- nir_ssa_def *chan_offset =
+ nir_def *chan_offset =
nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4);
nir_intrinsic_instr *chan_instr =
nir_intrinsic_instr_create(b->shader, instr->intrinsic);
chan_instr->num_components = 1;
- nir_ssa_dest_init(&chan_instr->instr, &chan_instr->dest, 1,
- instr->dest.ssa.bit_size, NULL);
+ nir_def_init(&chan_instr->instr, &chan_instr->def, 1,
+ instr->def.bit_size);
chan_instr->src[0] = nir_src_for_ssa(chan_offset);
@@ -74,11 +74,11 @@ v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr)
nir_builder_instr_insert(b, &chan_instr->instr);
- chans[i] = &chan_instr->dest.ssa;
+ chans[i] = &chan_instr->def;
}
- nir_ssa_def *result = nir_vec(b, chans, instr->num_components);
- nir_ssa_def_rewrite_uses(&instr->dest.ssa, result);
+ nir_def *result = nir_vec(b, chans, instr->num_components);
+ nir_def_rewrite_uses(&instr->def, result);
nir_instr_remove(&instr->instr);
}
@@ -87,15 +87,14 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr)
{
b->cursor = nir_before_instr(&instr->instr);
- nir_ssa_def *offset = v3d_nir_scratch_offset(b, instr);
- nir_ssa_def *value = nir_ssa_for_src(b, instr->src[0],
- instr->num_components);
+ nir_def *offset = v3d_nir_scratch_offset(b, instr);
+ nir_def *value = instr->src[0].ssa;
for (int i = 0; i < instr->num_components; i++) {
if (!(nir_intrinsic_write_mask(instr) & (1 << i)))
continue;
- nir_ssa_def *chan_offset =
+ nir_def *chan_offset =
nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4);
nir_intrinsic_instr *chan_instr =
@@ -115,39 +114,29 @@ v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr)
nir_instr_remove(&instr->instr);
}
-void
-v3d_nir_lower_scratch(nir_shader *s)
+static bool
+v3d_nir_lower_scratch_cb(nir_builder *b,
+ nir_intrinsic_instr *intr,
+ void *_state)
{
- nir_foreach_function(function, s) {
- if (!function->impl)
- continue;
-
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *intr =
- nir_instr_as_intrinsic(instr);
-
- switch (intr->intrinsic) {
- case nir_intrinsic_load_scratch:
- v3d_nir_lower_load_scratch(&b, intr);
- break;
- case nir_intrinsic_store_scratch:
- v3d_nir_lower_store_scratch(&b, intr);
- break;
- default:
- break;
- }
- }
- }
-
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_scratch:
+ v3d_nir_lower_load_scratch(b, intr);
+ return true;
+ case nir_intrinsic_store_scratch:
+ v3d_nir_lower_store_scratch(b, intr);
+ return true;
+ default:
+ return false;
}
+
+ return false;
+}
+
+bool
+v3d_nir_lower_scratch(nir_shader *s)
+{
+ return nir_shader_intrinsics_pass(s, v3d_nir_lower_scratch_cb,
+ nir_metadata_block_index |
+ nir_metadata_dominance, NULL);
}
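
A standalone sketch (not part of the patch) of the per-channel scratch addressing this pass relies on, assuming the 16-lane QPU width that V3D_CHANNELS encodes:

#include <stdio.h>

#define V3D_CHANNELS 16 /* assumed QPU SIMD width */

/* Scratch is laid out with all channels for component 0, then all channels
 * for component 1, and so on, so each scalar access adds
 * V3D_CHANNELS * 4 bytes per vector component. */
static unsigned
scratch_channel_offset(unsigned base_offset, unsigned component)
{
        return base_offset + V3D_CHANNELS * component * 4;
}

int
main(void)
{
        for (unsigned i = 0; i < 4; i++)
                printf("component %u -> byte offset %u\n",
                       i, scratch_channel_offset(0, i));
        return 0;
}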
diff --git a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
index d79969374d5..e78c3cb9e3e 100644
--- a/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
+++ b/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
@@ -32,25 +32,21 @@
* 2x2 quad.
*/
-#define V3D_MAX_SAMPLES 4
-
-static nir_ssa_def *
+static nir_def *
v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data)
{
nir_tex_instr *instr = nir_instr_as_tex(in_instr);
b->cursor = nir_before_instr(&instr->instr);
- int coord_index = nir_tex_instr_src_index(instr, nir_tex_src_coord);
- int sample_index = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
- nir_ssa_def *coord = instr->src[coord_index].src.ssa;
- nir_ssa_def *sample = instr->src[sample_index].src.ssa;
+ nir_def *coord = nir_steal_tex_src(instr, nir_tex_src_coord);
+ nir_def *sample = nir_steal_tex_src(instr, nir_tex_src_ms_index);
- nir_ssa_def *one = nir_imm_int(b, 1);
- nir_ssa_def *x = nir_iadd(b,
+ nir_def *one = nir_imm_int(b, 1);
+ nir_def *x = nir_iadd(b,
nir_ishl(b, nir_channel(b, coord, 0), one),
nir_iand(b, sample, one));
- nir_ssa_def *y = nir_iadd(b,
+ nir_def *y = nir_iadd(b,
nir_ishl(b, nir_channel(b, coord, 1), one),
nir_iand(b, nir_ushr(b, sample, one), one));
if (instr->is_array)
@@ -58,10 +54,7 @@ v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data)
else
coord = nir_vec2(b, x, y);
- nir_instr_rewrite_src(&instr->instr,
- &instr->src[nir_tex_src_coord].src,
- nir_src_for_ssa(coord));
- nir_tex_instr_remove_src(instr, sample_index);
+ nir_tex_instr_add_src(instr, nir_tex_src_coord, coord);
instr->op = nir_texop_txf;
instr->sampler_dim = GLSL_SAMPLER_DIM_2D;
@@ -75,11 +68,11 @@ v3d_nir_lower_txf_ms_filter(const nir_instr *instr, const void *data)
nir_instr_as_tex(instr)->op == nir_texop_txf_ms);
}
-void
-v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c)
+bool
+v3d_nir_lower_txf_ms(nir_shader *s)
{
- nir_shader_lower_instructions(s,
- v3d_nir_lower_txf_ms_filter,
- v3d_nir_lower_txf_ms_instr,
- NULL);
+ return nir_shader_lower_instructions(s,
+ v3d_nir_lower_txf_ms_filter,
+ v3d_nir_lower_txf_ms_instr,
+ NULL);
}
diff --git a/src/broadcom/compiler/v3d_packing.c b/src/broadcom/compiler/v3d_packing.c
new file mode 100644
index 00000000000..46643edd5e6
--- /dev/null
+++ b/src/broadcom/compiler/v3d_packing.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2023 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3d_compiler.h"
+
+#define __gen_user_data void
+#define __gen_address_type uint32_t
+#define __gen_address_offset(reloc) (*reloc)
+#define __gen_emit_reloc(cl, reloc)
+#define __gen_unpack_address(cl, s, e) (__gen_unpack_uint(cl, s, e) << (31 - (e - s)))
+#include "cle/v3d_packet_v42_pack.h"
+
+
+/* Typically, this function would wrap calls to a version-specific variant,
+ * but as TMU_CONFIG_PARAMETER_1 doesn't change between v42 and v71, we can
+ * assume that p1_packed is the same struct and use the same code for both.
+ */
+void
+v3d_pack_unnormalized_coordinates(struct v3d_device_info *devinfo,
+ uint32_t *p1_packed,
+ bool unnormalized_coordinates)
+{
+ assert(devinfo->ver == 71 || devinfo->ver == 42);
+
+ struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked;
+ V3D42_TMU_CONFIG_PARAMETER_1_unpack((uint8_t *)p1_packed, &p1_unpacked);
+ p1_unpacked.unnormalized_coordinates = unnormalized_coordinates;
+ V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL, (uint8_t *)p1_packed,
+ &p1_unpacked);
+}
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d_tex.c
index 7bebfe95552..643c73c4e58 100644
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d_tex.c
@@ -28,27 +28,29 @@
#define __gen_address_type uint32_t
#define __gen_address_offset(reloc) (*reloc)
#define __gen_emit_reloc(cl, reloc)
-#include "cle/v3d_packet_v41_pack.h"
+#include "cle/v3d_packet_v42_pack.h"
-static inline void
+static inline struct qinst *
vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
{
/* XXX perf: We should figure out how to merge ALU operations
* producing the val with this MOV, when possible.
*/
- vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
+ return vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
}
-static inline void
+static inline struct qinst *
vir_TMU_WRITE_or_count(struct v3d_compile *c,
enum v3d_qpu_waddr waddr,
struct qreg val,
uint32_t *tmu_writes)
{
- if (tmu_writes)
+ if (tmu_writes) {
(*tmu_writes)++;
- else
- vir_TMU_WRITE(c, waddr, val);
+ return NULL;
+ } else {
+ return vir_TMU_WRITE(c, waddr, val);
+ }
}
static void
@@ -59,11 +61,11 @@ vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data
inst->uniform = vir_get_uniform_index(c, contents, data);
}
-static const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = {
+static const struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = {
.per_pixel_mask_enable = true,
};
-static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
+static const struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
.op = V3D_TMU_OP_REGULAR,
};
@@ -84,7 +86,7 @@ handle_tex_src(struct v3d_compile *c,
nir_tex_instr *instr,
unsigned src_idx,
unsigned non_array_components,
- struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
+ struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
struct qreg *s_out,
unsigned *tmu_writes)
{
@@ -199,7 +201,7 @@ handle_tex_src(struct v3d_compile *c,
static void
vir_tex_handle_srcs(struct v3d_compile *c,
nir_tex_instr *instr,
- struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
+ struct V3D42_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
struct qreg *s,
unsigned *tmu_writes)
{
@@ -222,31 +224,62 @@ get_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr)
}
void
-v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
+v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
- assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42);
-
unsigned texture_idx = instr->texture_index;
- unsigned sampler_idx = instr->sampler_index;
- struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
+        /* For instructions that don't have a sampler (e.g. txf) we bind
+ * default sampler state via the backend_flags to handle precision.
+ */
+ unsigned sampler_idx = nir_tex_instr_need_sampler(instr) ?
+ instr->sampler_index : instr->backend_flags;
+
+ /* Even if the texture operation doesn't need a sampler by
+ * itself, we still need to add the sampler configuration
+ * parameter if the output is 32 bit
+ */
+ assert(sampler_idx < c->key->num_samplers_used);
+ bool output_type_32_bit =
+ c->key->sampler[sampler_idx].return_size == 32;
+
+ struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
};
/* Limit the number of channels returned to both how many the NIR
* instruction writes and how many the instruction could produce.
*/
- p0_unpacked.return_words_of_texture_data =
- instr->dest.is_ssa ?
- nir_ssa_def_components_read(&instr->dest.ssa) :
- (1 << instr->dest.reg.reg->num_components) - 1;
+ nir_intrinsic_instr *store = nir_store_reg_for_def(&instr->def);
+ if (store == NULL) {
+ p0_unpacked.return_words_of_texture_data =
+ nir_def_components_read(&instr->def);
+ } else {
+ nir_def *reg = store->src[1].ssa;
+ nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
+ unsigned reg_num_components =
+ nir_intrinsic_num_components(decl);
+
+ /* For the non-ssa case we don't have a full equivalent to
+ * nir_def_components_read. This is a problem for the 16
+ * bit case. nir_lower_tex will not change the destination as
+ * nir_tex_instr_dest_size will still return 4. The driver is
+ * just expected to not store on other channels, so we
+ * manually ensure that here.
+ */
+ uint32_t num_components = output_type_32_bit ?
+ MIN2(reg_num_components, 4) :
+ MIN2(reg_num_components, 2);
+
+ p0_unpacked.return_words_of_texture_data = (1 << num_components) - 1;
+ }
assert(p0_unpacked.return_words_of_texture_data != 0);
- struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
+ struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
.op = V3D_TMU_OP_REGULAR,
.gather_mode = instr->op == nir_texop_tg4,
.gather_component = instr->component,
.coefficient_mode = instr->op == nir_texop_txd,
- .disable_autolod = instr->op == nir_texop_tg4
+ .disable_autolod = instr->op == nir_texop_tg4,
+ .lod_query = instr->op == nir_texop_lod,
};
const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr);
@@ -270,22 +303,15 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);
uint32_t p0_packed;
- V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
(uint8_t *)&p0_packed,
&p0_unpacked);
uint32_t p2_packed;
- V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
(uint8_t *)&p2_packed,
&p2_unpacked);
- /* We manually set the LOD Query bit (see
- * V3D42_TMU_CONFIG_PARAMETER_2) as right now is the only V42 specific
- * feature over V41 we are using
- */
- if (instr->op == nir_texop_lod)
- p2_packed |= 1UL << 24;
-
/* Load texture_idx number into the high bits of the texture address field,
         * which will be used by the driver to decide which texture to put
* in the actual address field.
@@ -294,14 +320,6 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);
- /* Even if the texture operation doesn't need a sampler by
- * itself, we still need to add the sampler configuration
- * parameter if the output is 32 bit
- */
- bool output_type_32_bit =
- c->key->sampler[sampler_idx].return_size == 32 &&
- !instr->is_shadow;
-
/* p1 is optional, but we can skip it only if p2 can be skipped too */
bool needs_p2_config =
(instr->op == nir_texop_lod ||
@@ -313,7 +331,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
output_type_32_bit;
if (non_default_p1_config) {
- struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
+ struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
.output_type_32_bit = output_type_32_bit,
.unnormalized_coordinates = (instr->sampler_dim ==
@@ -330,7 +348,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
p0_unpacked.return_words_of_texture_data < (1 << 2));
uint32_t p1_packed;
- V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
(uint8_t *)&p1_packed,
&p1_unpacked);
@@ -358,7 +376,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
* address
*/
uint32_t p1_packed_default;
- V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
(uint8_t *)&p1_packed_default,
&p1_unpacked_default);
vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default);
@@ -368,48 +386,54 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
/* Emit retiring TMU write */
+ struct qinst *retiring;
if (instr->op == nir_texop_txf) {
assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
} else if (instr->op == nir_texop_txl) {
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s);
} else {
- vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
+ retiring = vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
}
- ntq_add_pending_tmu_flush(c, &instr->dest,
+ retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
+ ntq_add_pending_tmu_flush(c, &instr->def,
p0_unpacked.return_words_of_texture_data);
}
static uint32_t
-v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
+v3d_image_atomic_tmu_op(nir_intrinsic_instr *instr)
+{
+ nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
+ switch (atomic_op) {
+ case nir_atomic_op_iadd: return v3d_get_op_for_atomic_add(instr, 3);
+ case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN;
+ case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
+ case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX;
+ case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX;
+ case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC;
+ case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC;
+ case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT;
+ case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
+ case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+ default: unreachable("unknown atomic op");
+ }
+}
+
+static uint32_t
+v3d_image_load_store_tmu_op(nir_intrinsic_instr *instr)
{
switch (instr->intrinsic) {
case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
return V3D_TMU_OP_REGULAR;
- case nir_intrinsic_image_atomic_add:
- return v3d_get_op_for_atomic_add(instr, 3);
- case nir_intrinsic_image_atomic_imin:
- return V3D_TMU_OP_WRITE_SMIN;
- case nir_intrinsic_image_atomic_umin:
- return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
- case nir_intrinsic_image_atomic_imax:
- return V3D_TMU_OP_WRITE_SMAX;
- case nir_intrinsic_image_atomic_umax:
- return V3D_TMU_OP_WRITE_UMAX;
- case nir_intrinsic_image_atomic_and:
- return V3D_TMU_OP_WRITE_AND_READ_INC;
- case nir_intrinsic_image_atomic_or:
- return V3D_TMU_OP_WRITE_OR_READ_DEC;
- case nir_intrinsic_image_atomic_xor:
- return V3D_TMU_OP_WRITE_XOR_READ_NOT;
- case nir_intrinsic_image_atomic_exchange:
- return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
- case nir_intrinsic_image_atomic_comp_swap:
- return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+
+ case nir_intrinsic_image_atomic:
+ case nir_intrinsic_image_atomic_swap:
+ return v3d_image_atomic_tmu_op(instr);
+
default:
unreachable("unknown image intrinsic");
};
@@ -427,7 +451,7 @@ v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
* which is why we always call ntq_get_src() even if we are only interested in
* register write counts.
*/
-static void
+static struct qinst *
vir_image_emit_register_writes(struct v3d_compile *c,
nir_intrinsic_instr *instr,
bool atomic_add_replaced,
@@ -480,7 +504,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
}
/* Second atomic argument */
- if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap) {
+ if (instr->intrinsic == nir_intrinsic_image_atomic_swap &&
+ nir_intrinsic_atomic_op(instr) == nir_atomic_op_cmpxchg) {
struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0);
vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0,
tmu_writes);
@@ -494,7 +519,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
V3D_QPU_PF_PUSHZ);
}
- vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
+ struct qinst *retiring =
+ vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes);
if (!tmu_writes && vir_in_nonuniform_control_flow(c) &&
instr->intrinsic != nir_intrinsic_image_load) {
@@ -502,6 +528,8 @@ vir_image_emit_register_writes(struct v3d_compile *c,
(struct qinst *)c->cur_block->instructions.prev;
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
}
+
+ return retiring;
}
static unsigned
@@ -516,21 +544,21 @@ get_required_image_tmu_writes(struct v3d_compile *c,
}
void
-v3d40_vir_emit_image_load_store(struct v3d_compile *c,
- nir_intrinsic_instr *instr)
+v3d_vir_emit_image_load_store(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
{
unsigned format = nir_intrinsic_format(instr);
unsigned unit = nir_src_as_uint(instr->src[0]);
- struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
+ struct V3D42_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
};
- struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
+ struct V3D42_TMU_CONFIG_PARAMETER_1 p1_unpacked = {
.per_pixel_mask_enable = true,
.output_type_32_bit = v3d_gl_format_is_return_32(format),
};
- struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 };
+ struct V3D42_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 };
/* Limit the number of channels returned to both how many the NIR
* instruction writes and how many the instruction could produce.
@@ -542,19 +570,20 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
p0_unpacked.return_words_of_texture_data =
(1 << instr_return_channels) - 1;
- p2_unpacked.op = v3d40_image_load_store_tmu_op(instr);
+ p2_unpacked.op = v3d_image_load_store_tmu_op(instr);
        /* If we were able to replace atomic_add with an inc/dec, then we
         * can do things slightly differently, like not loading the
* amount to add/sub, as that is implicit.
*/
bool atomic_add_replaced =
- (instr->intrinsic == nir_intrinsic_image_atomic_add &&
- (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC ||
- p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC));
+ instr->intrinsic == nir_intrinsic_image_atomic &&
+ nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
+ (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC ||
+ p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC);
uint32_t p0_packed;
- V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_0_pack(NULL,
(uint8_t *)&p0_packed,
&p0_unpacked);
@@ -565,12 +594,12 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
p0_packed |= unit << 24;
uint32_t p1_packed;
- V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_1_pack(NULL,
(uint8_t *)&p1_packed,
&p1_unpacked);
uint32_t p2_packed;
- V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL,
+ V3D42_TMU_CONFIG_PARAMETER_2_pack(NULL,
(uint8_t *)&p2_packed,
&p2_unpacked);
@@ -599,8 +628,9 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)))
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
- vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
-
- ntq_add_pending_tmu_flush(c, &instr->dest,
+ struct qinst *retiring =
+ vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
+ retiring->ldtmu_count = p0_unpacked.return_words_of_texture_data;
+ ntq_add_pending_tmu_flush(c, &instr->def,
p0_unpacked.return_words_of_texture_data);
}
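
A minimal sketch (illustrative, not from the patch) of the return-word mask computed in the non-SSA path of v3d_vir_emit_tex() above; the cap of 2 for 16-bit output reflects that two half-precision components fit in one 32-bit return word:

#include <stdio.h>

static unsigned
return_words_mask(unsigned reg_num_components, int output_type_32_bit)
{
        unsigned cap = output_type_32_bit ? 4 : 2;
        unsigned n = reg_num_components < cap ? reg_num_components : cap;
        return (1u << n) - 1;
}

int
main(void)
{
        printf("vec4, 32-bit returns: 0x%x\n", return_words_mask(4, 1));
        printf("vec4, 16-bit returns: 0x%x\n", return_words_mask(4, 0));
        return 0;
}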
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 27869a35a3b..c59a8aac434 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -23,7 +23,6 @@
#include "broadcom/common/v3d_device_info.h"
#include "v3d_compiler.h"
-#include "util/u_prim.h"
#include "compiler/nir/nir_schedule.h"
#include "compiler/nir/nir_builder.h"
@@ -89,7 +88,7 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
* pointer, so each read has a side effect (we don't care for ldunif
* because we reconstruct the uniform stream buffer after compiling
* with the surviving uniforms), so allowing DCE to remove
- * one would break follow-up loads. We could fix this by emiting a
+ * one would break follow-up loads. We could fix this by emitting a
* unifa for each ldunifa, but each unifa requires 3 delay slots
* before a ldunifa, so that would be quite expensive.
*/
@@ -113,10 +112,10 @@ vir_is_raw_mov(struct qinst *inst)
return false;
}
- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
+ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
@@ -156,30 +155,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
- for (int i = 0; i < vir_get_nsrc(inst); i++) {
- switch (inst->src[i].file) {
- case QFILE_VPM:
- return true;
- default:
- break;
- }
- }
-
- if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
- inst->qpu.sig.ldtlb ||
- inst->qpu.sig.ldtlbu ||
- inst->qpu.sig.ldvpm)) {
- return true;
- }
-
- return false;
-}
+ if (!devinfo->has_accumulators)
+ return false;
-bool
-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
-{
switch (inst->dst.file) {
case QFILE_MAGIC:
switch (inst->dst.index) {
@@ -195,9 +176,6 @@ vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
break;
}
- if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
- return true;
-
return false;
}
@@ -209,15 +187,15 @@ vir_set_unpack(struct qinst *inst, int src,
if (vir_is_add(inst)) {
if (src == 0)
- inst->qpu.alu.add.a_unpack = unpack;
+ inst->qpu.alu.add.a.unpack = unpack;
else
- inst->qpu.alu.add.b_unpack = unpack;
+ inst->qpu.alu.add.b.unpack = unpack;
} else {
assert(vir_is_mul(inst));
if (src == 0)
- inst->qpu.alu.mul.a_unpack = unpack;
+ inst->qpu.alu.mul.a.unpack = unpack;
else
- inst->qpu.alu.mul.b_unpack = unpack;
+ inst->qpu.alu.mul.b.unpack = unpack;
}
}
@@ -369,6 +347,8 @@ vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct q
inst->src[1] = src1;
inst->uniform = ~0;
+ inst->ip = -1;
+
return inst;
}
@@ -385,6 +365,8 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q
inst->src[1] = src1;
inst->uniform = ~0;
+ inst->ip = -1;
+
return inst;
}
@@ -404,12 +386,16 @@ vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
inst->dst = vir_nop_reg();
inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
+ inst->ip = -1;
+
return inst;
}
static void
vir_emit(struct v3d_compile *c, struct qinst *inst)
{
+ inst->ip = -1;
+
switch (c->cursor.mode) {
case vir_cursor_add:
list_add(&inst->link, c->cursor.link);
@@ -509,13 +495,15 @@ vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
}
const struct v3d_compiler *
-v3d_compiler_init(const struct v3d_device_info *devinfo)
+v3d_compiler_init(const struct v3d_device_info *devinfo,
+ uint32_t max_inline_uniform_buffers)
{
struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
if (!compiler)
return NULL;
compiler->devinfo = devinfo;
+ compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
if (!vir_init_reg_sets(compiler)) {
ralloc_free(compiler);
@@ -531,6 +519,19 @@ v3d_compiler_free(const struct v3d_compiler *compiler)
ralloc_free((void *)compiler);
}
+struct v3d_compiler_strategy {
+ const char *name;
+ uint32_t max_threads;
+ uint32_t min_threads;
+ bool disable_general_tmu_sched;
+ bool disable_gcm;
+ bool disable_loop_unrolling;
+ bool disable_ubo_load_sorting;
+ bool move_buffer_loads;
+ bool disable_tmu_pipelining;
+ uint32_t max_tmu_spills;
+};
+
static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
struct v3d_key *key,
@@ -539,12 +540,8 @@ vir_compile_init(const struct v3d_compiler *compiler,
void *debug_output_data),
void *debug_output_data,
int program_id, int variant_id,
- uint32_t max_threads,
- uint32_t min_threads_for_reg_alloc,
- bool tmu_spilling_allowed,
- bool disable_loop_unrolling,
- bool disable_constant_ubo_load_sorting,
- bool disable_tmu_pipelining,
+ uint32_t compile_strategy_idx,
+ const struct v3d_compiler_strategy *strategy,
bool fallback_scheduler)
{
struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
@@ -554,17 +551,22 @@ vir_compile_init(const struct v3d_compiler *compiler,
c->key = key;
c->program_id = program_id;
c->variant_id = variant_id;
- c->threads = max_threads;
+ c->compile_strategy_idx = compile_strategy_idx;
+ c->threads = strategy->max_threads;
c->debug_output = debug_output;
c->debug_output_data = debug_output_data;
c->compilation_result = V3D_COMPILATION_SUCCEEDED;
- c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
- c->tmu_spilling_allowed = tmu_spilling_allowed;
+ c->min_threads_for_reg_alloc = strategy->min_threads;
+ c->max_tmu_spills = strategy->max_tmu_spills;
c->fallback_scheduler = fallback_scheduler;
- c->disable_tmu_pipelining = disable_tmu_pipelining;
- c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
- c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
- ? true : disable_loop_unrolling;
+ c->disable_general_tmu_sched = strategy->disable_general_tmu_sched;
+ c->disable_tmu_pipelining = strategy->disable_tmu_pipelining;
+ c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting;
+ c->move_buffer_loads = strategy->move_buffer_loads;
+ c->disable_gcm = strategy->disable_gcm;
+ c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL)
+ ? true : strategy->disable_loop_unrolling;
+
s = nir_shader_clone(c, s);
c->s = s;
@@ -590,17 +592,107 @@ type_size_vec4(const struct glsl_type *type, bool bindless)
return glsl_count_attribute_slots(type, false);
}
+static enum nir_lower_tex_packing
+lower_tex_packing_cb(const nir_tex_instr *tex, const void *data)
+{
+ struct v3d_compile *c = (struct v3d_compile *) data;
+
+ int sampler_index = nir_tex_instr_need_sampler(tex) ?
+ tex->sampler_index : tex->backend_flags;
+
+ assert(sampler_index < c->key->num_samplers_used);
+ return c->key->sampler[sampler_index].return_size == 16 ?
+ nir_lower_tex_packing_16 : nir_lower_tex_packing_none;
+}
+
+static bool
+v3d_nir_lower_null_pointers_cb(nir_builder *b,
+ nir_intrinsic_instr *intr,
+ void *_state)
+{
+ uint32_t buffer_src_idx;
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_ssbo:
+ buffer_src_idx = 0;
+ break;
+ case nir_intrinsic_store_ssbo:
+ buffer_src_idx = 1;
+ break;
+ default:
+ return false;
+ }
+
+        /* If the index is constant we are good */
+ nir_src *src = &intr->src[buffer_src_idx];
+ if (nir_src_is_const(*src))
+ return false;
+
+ /* Otherwise, see if it comes from a bcsel including a null pointer */
+ if (src->ssa->parent_instr->type != nir_instr_type_alu)
+ return false;
+
+ nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr);
+ if (alu->op != nir_op_bcsel)
+ return false;
+
+ /* A null pointer is specified using block index 0xffffffff */
+ int32_t null_src_idx = -1;
+ for (int i = 1; i < 3; i++) {
+        /* FIXME: since we are running this before optimization, maybe we
+         * also need to handle the case where we have a bcsel chain that
+         * we need to recurse into?
+ */
+ if (!nir_src_is_const(alu->src[i].src))
+ continue;
+ if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff)
+ continue;
+
+ /* One of the bcsel srcs is a null pointer reference */
+ null_src_idx = i;
+ break;
+ }
+
+ if (null_src_idx < 0)
+ return false;
+
+ assert(null_src_idx == 1 || null_src_idx == 2);
+ int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1;
+
+ /* Rewrite the null pointer reference so we use the same buffer index
+ * as the other bcsel branch. This will allow optimization to remove
+ * the bcsel and we should then end up with a constant buffer index
+ * like we need.
+ */
+ b->cursor = nir_before_instr(&alu->instr);
+ nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa);
+ nir_src_rewrite(&alu->src[null_src_idx].src, copy);
+
+ return true;
+}
+
+static bool
+v3d_nir_lower_null_pointers(nir_shader *s)
+{
+ return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb,
+ nir_metadata_block_index |
+ nir_metadata_dominance, NULL);
+}
+
static void
v3d_lower_nir(struct v3d_compile *c)
{
struct nir_lower_tex_options tex_options = {
.lower_txd = true,
+ .lower_tg4_offsets = true,
.lower_tg4_broadcom_swizzle = true,
.lower_rect = false, /* XXX: Use this on V3D 3.x */
.lower_txp = ~0,
/* Apply swizzles to all samplers. */
.swizzle_result = ~0,
+ .lower_invalid_implicit_lod = true,
};
/* Lower the format swizzle and (for 32-bit returns)
@@ -612,38 +704,35 @@ v3d_lower_nir(struct v3d_compile *c)
tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
}
- assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler));
- for (int i = 0; i < c->key->num_samplers_used; i++) {
- if (c->key->sampler[i].return_size == 16) {
- tex_options.lower_tex_packing[i] =
- nir_lower_tex_packing_16;
- }
- }
-
- /* CS textures may not have return_size reflecting the shadow state. */
- nir_foreach_uniform_variable(var, c->s) {
- const struct glsl_type *type = glsl_without_array(var->type);
- unsigned array_len = MAX2(glsl_get_length(var->type), 1);
+ tex_options.lower_tex_packing_cb = lower_tex_packing_cb;
+ tex_options.lower_tex_packing_data = c;
- if (!glsl_type_is_sampler(type) ||
- !glsl_sampler_type_is_shadow(type))
- continue;
+ NIR_PASS(_, c->s, nir_lower_tex, &tex_options);
+ NIR_PASS(_, c->s, nir_lower_system_values);
- for (int i = 0; i < array_len; i++) {
- tex_options.lower_tex_packing[var->data.binding + i] =
- nir_lower_tex_packing_16;
- }
+ if (c->s->info.zero_initialize_shared_memory &&
+ c->s->info.shared_size > 0) {
+ /* All our BOs allocate full pages, so the underlying allocation
+ * for shared memory will always be a multiple of 4KB. This
+ * ensures that we can do an exact number of full chunk_size
+ * writes to initialize the memory independently of the actual
+ * shared_size used by the shader, which is a requirement of
+ * the initialization pass.
+ */
+ const unsigned chunk_size = 16; /* max single store size */
+ NIR_PASS(_, c->s, nir_zero_initialize_shared_memory,
+ align(c->s->info.shared_size, chunk_size), chunk_size);
}
- NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
- NIR_PASS_V(c->s, nir_lower_system_values);
- NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL);
+ NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL);
- NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
- nir_var_function_temp,
- 0,
- glsl_get_natural_size_align_bytes);
- NIR_PASS_V(c->s, v3d_nir_lower_scratch);
+ NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
+ nir_var_function_temp,
+ 0,
+ glsl_get_natural_size_align_bytes);
+ NIR_PASS(_, c->s, nir_lower_is_helper_invocation);
+ NIR_PASS(_, c->s, v3d_nir_lower_scratch);
+ NIR_PASS(_, c->s, v3d_nir_lower_null_pointers);
}
static void
@@ -711,6 +800,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
/* Set us up for shared input/output segments. This is apparently
* necessary for our VCM setup to avoid varying corruption.
+ *
+         * FIXME: initial testing on V3D 7.1 seems to work fine when using
+         * separate segments, so we could reevaluate this in the future if
+         * there is any advantage to using separate segments.
*/
prog_data->separate_segments = false;
prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
@@ -807,13 +900,14 @@ v3d_fs_set_prog_data(struct v3d_compile *c,
{
v3d_set_fs_prog_data_inputs(c, prog_data);
prog_data->writes_z = c->writes_z;
+ prog_data->writes_z_from_fep = c->writes_z_from_fep;
prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
prog_data->uses_center_w = c->uses_center_w;
prog_data->uses_implicit_point_line_varyings =
c->uses_implicit_point_line_varyings;
prog_data->lock_scoreboard_on_first_thrsw =
c->lock_scoreboard_on_first_thrsw;
- prog_data->force_per_sample_msaa = c->force_per_sample_msaa;
+ prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading;
prog_data->uses_pid = c->fs_uses_primitive_id;
}
@@ -837,8 +931,14 @@ v3d_set_prog_data(struct v3d_compile *c,
prog_data->threads = c->threads;
prog_data->single_seg = !c->last_thrsw;
prog_data->spill_size = c->spill_size;
+ prog_data->tmu_spills = c->spills;
+ prog_data->tmu_fills = c->fills;
+ prog_data->tmu_count = c->tmu.total_count;
+ prog_data->qpu_read_stalls = c->qpu_inst_stalled_count;
+ prog_data->compile_strategy_idx = c->compile_strategy_idx;
prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
prog_data->has_control_barrier = c->s->info.uses_control_barrier;
+ prog_data->has_global_address = c->has_global_address;
v3d_set_prog_data_uniforms(c, prog_data);
@@ -882,32 +982,32 @@ v3d_nir_lower_vs_early(struct v3d_compile *c)
/* Split our I/O vars and dead code eliminate the unused
* components.
*/
- NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
- nir_var_shader_in | nir_var_shader_out);
+ NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
+ nir_var_shader_in | nir_var_shader_out);
uint64_t used_outputs[4] = {0};
for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
used_outputs[comp] |= 1ull << slot;
}
- NIR_PASS_V(c->s, nir_remove_unused_io_vars,
- nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
- NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
+ NIR_PASS(_, c->s, nir_remove_unused_io_vars,
+ nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
+ NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
v3d_optimize_nir(c, c->s);
- NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
+ NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
/* This must go before nir_lower_io */
if (c->vs_key->per_vertex_point_size)
- NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
+ NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
- NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
- type_size_vec4,
- (nir_lower_io_options)0);
+ NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+ type_size_vec4,
+ (nir_lower_io_options)0);
/* clean up nir_lower_io's deref_var remains and do a constant folding pass
* on the code it generated.
*/
- NIR_PASS_V(c->s, nir_opt_dce);
- NIR_PASS_V(c->s, nir_opt_constant_folding);
+ NIR_PASS(_, c->s, nir_opt_dce);
+ NIR_PASS(_, c->s, nir_opt_constant_folding);
}
static void
@@ -916,29 +1016,32 @@ v3d_nir_lower_gs_early(struct v3d_compile *c)
/* Split our I/O vars and dead code eliminate the unused
* components.
*/
- NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
- nir_var_shader_in | nir_var_shader_out);
+ NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
+ nir_var_shader_in | nir_var_shader_out);
uint64_t used_outputs[4] = {0};
for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
used_outputs[comp] |= 1ull << slot;
}
- NIR_PASS_V(c->s, nir_remove_unused_io_vars,
- nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
- NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
+ NIR_PASS(_, c->s, nir_remove_unused_io_vars,
+ nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
+ NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
v3d_optimize_nir(c, c->s);
- NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
+ NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
/* This must go before nir_lower_io */
if (c->gs_key->per_vertex_point_size)
- NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
+ NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
- NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
- type_size_vec4,
- (nir_lower_io_options)0);
- /* clean up nir_lower_io's deref_var remains */
- NIR_PASS_V(c->s, nir_opt_dce);
+ NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+ type_size_vec4,
+ (nir_lower_io_options)0);
+ /* clean up nir_lower_io's deref_var remains and do a constant folding pass
+ * on the code it generated.
+ */
+ NIR_PASS(_, c->s, nir_opt_dce);
+ NIR_PASS(_, c->s, nir_opt_constant_folding);
}
static void
@@ -977,11 +1080,11 @@ v3d_nir_lower_fs_early(struct v3d_compile *c)
if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
v3d_fixup_fs_output_types(c);
- NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);
+ NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c);
if (c->fs_key->line_smoothing) {
- v3d_nir_lower_line_smooth(c->s);
- NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
+ NIR_PASS(_, c->s, v3d_nir_lower_line_smooth);
+ NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
/* The lowering pass can introduce new sysval reads */
nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
}
@@ -991,26 +1094,26 @@ static void
v3d_nir_lower_gs_late(struct v3d_compile *c)
{
if (c->key->ucp_enables) {
- NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
- false, NULL);
+ NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables,
+ true, NULL);
}
/* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
- NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
+ NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
if (c->key->ucp_enables) {
- NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
- false, false, NULL);
+ NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables,
+ false, true, NULL);
NIR_PASS_V(c->s, nir_lower_io_to_scalar,
- nir_var_shader_out);
+ nir_var_shader_out, NULL, NULL);
}
/* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
- NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
+ NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
static void
@@ -1024,9 +1127,9 @@ v3d_nir_lower_fs_late(struct v3d_compile *c)
* are using.
*/
if (c->key->ucp_enables)
- NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
+ NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
- NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
+ NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
}
static uint32_t
@@ -1107,6 +1210,69 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
return false;
}
+static unsigned
+v3d_instr_delay_cb(nir_instr *instr, void *data)
+{
+ struct v3d_compile *c = (struct v3d_compile *) data;
+
+ switch (instr->type) {
+ case nir_instr_type_undef:
+ case nir_instr_type_load_const:
+ case nir_instr_type_alu:
+ case nir_instr_type_deref:
+ case nir_instr_type_jump:
+ case nir_instr_type_parallel_copy:
+ case nir_instr_type_call:
+ case nir_instr_type_phi:
+ return 1;
+
+ /* We should not use very large delays for TMU instructions. Typically,
+ * thread switches will be sufficient to hide all or most of the latency,
+ * so we typically only need a little bit of extra room. If we over-estimate
+ * the latency here we may end up unnecessarily delaying the critical path in
+ * the shader, which would have a negative effect in performance, so here
+ * we are trying to strike a balance based on empirical testing.
+ */
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ if (!c->disable_general_tmu_sched) {
+ switch (intr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ return 0;
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_load_shared:
+ case nir_intrinsic_image_load:
+ return 3;
+ case nir_intrinsic_load_ubo:
+ if (nir_src_is_divergent(intr->src[1]))
+ return 3;
+ FALLTHROUGH;
+ default:
+ return 1;
+ }
+ } else {
+ switch (intr->intrinsic) {
+ case nir_intrinsic_decl_reg:
+ case nir_intrinsic_load_reg:
+ case nir_intrinsic_store_reg:
+ return 0;
+ default:
+ return 1;
+ }
+ }
+ break;
+ }
+
+ case nir_instr_type_tex:
+ return 5;
+ }
+
+ return 0;
+}
+
static bool
should_split_wrmask(const nir_instr *instr, const void *data)
{
@@ -1197,7 +1363,7 @@ v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
* reference offset, since otherwise we would not be able to
* skip the unifa write for them. See ntq_emit_load_ubo_unifa.
*/
- if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
+ if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE)
continue;
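The cast added above matters because ref_offset and offset are unsigned, so their difference wraps around instead of going negative. A minimal standalone sketch of the behavior (the values are illustrative, not taken from the pass):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        uint32_t ref_offset = 4, offset = 12;

        /* Unsigned subtraction wraps: 4 - 12 becomes 4294967288. */
        printf("unsigned difference: %u\n", ref_offset - offset);

        /* Converting to int first (on the two's complement targets Mesa
         * supports) recovers -8, so abs() yields the skip distance the
         * comparison above expects. */
        printf("signed distance: %d\n", abs((int)(ref_offset - offset)));
        return 0;
}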
/* We will move this load if its offset is smaller than ref's
@@ -1349,16 +1515,14 @@ v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
static bool
v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
{
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_foreach_block(block, function->impl) {
- c->sorted_any_ubo_loads |=
- v3d_nir_sort_constant_ubo_loads_block(c, block);
- }
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
+ nir_foreach_function_impl(impl, s) {
+ nir_foreach_block(block, impl) {
+ c->sorted_any_ubo_loads |=
+ v3d_nir_sort_constant_ubo_loads_block(c, block);
}
+ nir_metadata_preserve(impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
}
return c->sorted_any_ubo_loads;
}
@@ -1376,8 +1540,8 @@ lower_load_num_subgroups(struct v3d_compile *c,
DIV_ROUND_UP(c->s->info.workgroup_size[0] *
c->s->info.workgroup_size[1] *
c->s->info.workgroup_size[2], V3D_CHANNELS);
- nir_ssa_def *result = nir_imm_int(b, num_subgroups);
- nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
+ nir_def *result = nir_imm_int(b, num_subgroups);
+ nir_def_rewrite_uses(&intr->def, result);
nir_instr_remove(&intr->instr);
}
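As a worked example of the fold above, assuming V3D_CHANNELS is 16 (16-wide QPU execution): an 8x8x1 workgroup has 64 invocations, so load_num_subgroups becomes the constant DIV_ROUND_UP(64, 16) = 4. The same arithmetic as a standalone sketch:

#include <assert.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        const uint32_t channels = 16;       /* assumed value of V3D_CHANNELS */
        const uint32_t wg[3] = { 8, 8, 1 }; /* example workgroup size */
        assert(DIV_ROUND_UP(wg[0] * wg[1] * wg[2], channels) == 4);
        return 0;
}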
@@ -1404,6 +1568,36 @@ lower_subgroup_intrinsics(struct v3d_compile *c,
case nir_intrinsic_load_subgroup_size:
case nir_intrinsic_load_subgroup_invocation:
case nir_intrinsic_elect:
+ case nir_intrinsic_ballot:
+ case nir_intrinsic_inverse_ballot:
+ case nir_intrinsic_ballot_bitfield_extract:
+ case nir_intrinsic_ballot_bit_count_reduce:
+ case nir_intrinsic_ballot_find_lsb:
+ case nir_intrinsic_ballot_find_msb:
+ case nir_intrinsic_ballot_bit_count_exclusive:
+ case nir_intrinsic_ballot_bit_count_inclusive:
+ case nir_intrinsic_reduce:
+ case nir_intrinsic_inclusive_scan:
+ case nir_intrinsic_exclusive_scan:
+ case nir_intrinsic_read_invocation:
+ case nir_intrinsic_read_first_invocation:
+ case nir_intrinsic_load_subgroup_eq_mask:
+ case nir_intrinsic_load_subgroup_ge_mask:
+ case nir_intrinsic_load_subgroup_gt_mask:
+ case nir_intrinsic_load_subgroup_le_mask:
+ case nir_intrinsic_load_subgroup_lt_mask:
+ case nir_intrinsic_shuffle:
+ case nir_intrinsic_shuffle_xor:
+ case nir_intrinsic_shuffle_up:
+ case nir_intrinsic_shuffle_down:
+ case nir_intrinsic_vote_all:
+ case nir_intrinsic_vote_any:
+ case nir_intrinsic_vote_feq:
+ case nir_intrinsic_vote_ieq:
+ case nir_intrinsic_quad_broadcast:
+ case nir_intrinsic_quad_swap_horizontal:
+ case nir_intrinsic_quad_swap_vertical:
+ case nir_intrinsic_quad_swap_diagonal:
c->has_subgroups = true;
break;
default:
@@ -1418,18 +1612,15 @@ static bool
v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
{
bool progress = false;
- nir_foreach_function(function, s) {
- if (function->impl) {
- nir_builder b;
- nir_builder_init(&b, function->impl);
+ nir_foreach_function_impl(impl, s) {
+ nir_builder b = nir_builder_create(impl);
- nir_foreach_block(block, function->impl)
- progress |= lower_subgroup_intrinsics(c, block, &b);
+ nir_foreach_block(block, impl)
+ progress |= lower_subgroup_intrinsics(c, block, &b);
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
- }
+ nir_metadata_preserve(impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
}
return progress;
}
@@ -1483,30 +1674,54 @@ v3d_attempt_compile(struct v3d_compile *c)
break;
}
- NIR_PASS_V(c->s, v3d_nir_lower_io, c);
- NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
- NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
+ NIR_PASS(_, c->s, v3d_nir_lower_io, c);
+ NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
+ NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
+
+ NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
nir_lower_idiv_options idiv_options = {
- .imprecise_32bit_lowering = true,
.allow_fp16 = true,
};
- NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
-
- if (c->key->robust_buffer_access) {
- /* v3d_nir_lower_robust_buffer_access assumes constant buffer
- * indices on ubo/ssbo intrinsics so run copy propagation and
- * constant folding passes before we run the lowering to warrant
- * this. We also want to run the lowering before v3d_optimize to
- * clean-up redundant get_buffer_size calls produced in the pass.
- */
- NIR_PASS_V(c->s, nir_copy_prop);
- NIR_PASS_V(c->s, nir_opt_constant_folding);
- NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
+ NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options);
+ NIR_PASS(_, c->s, nir_lower_alu);
+
+ if (c->key->robust_uniform_access || c->key->robust_storage_access ||
+ c->key->robust_image_access) {
+ /* nir_lower_robust_access assumes constant buffer
+ * indices on ubo/ssbo intrinsics so run copy propagation and
+ * constant folding passes before we run the lowering to warrant
+ * this. We also want to run the lowering before v3d_optimize to
+ * clean-up redundant get_buffer_size calls produced in the pass.
+ */
+ NIR_PASS(_, c->s, nir_copy_prop);
+ NIR_PASS(_, c->s, nir_opt_constant_folding);
+
+ nir_lower_robust_access_options opts = {
+ .lower_image = c->key->robust_image_access,
+ .lower_ssbo = c->key->robust_storage_access,
+ .lower_ubo = c->key->robust_uniform_access,
+ };
+
+ NIR_PASS(_, c->s, nir_lower_robust_access, &opts);
}
- NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
+ NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
- NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c);
+ NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize);
+
+ NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);
+
+ const nir_lower_subgroups_options subgroup_opts = {
+ .subgroup_size = V3D_CHANNELS,
+ .ballot_components = 1,
+ .ballot_bit_size = 32,
+ .lower_to_scalar = true,
+ .lower_inverse_ballot = true,
+ .lower_subgroup_masks = true,
+ .lower_relative_shuffle = true,
+ .lower_quad = true,
+ };
+ NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts);
v3d_optimize_nir(c, c->s);
@@ -1519,25 +1734,25 @@ v3d_attempt_compile(struct v3d_compile *c)
while (more_late_algebraic) {
more_late_algebraic = false;
NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
- NIR_PASS_V(c->s, nir_opt_constant_folding);
- NIR_PASS_V(c->s, nir_copy_prop);
- NIR_PASS_V(c->s, nir_opt_dce);
- NIR_PASS_V(c->s, nir_opt_cse);
+ NIR_PASS(_, c->s, nir_opt_constant_folding);
+ NIR_PASS(_, c->s, nir_copy_prop);
+ NIR_PASS(_, c->s, nir_opt_dce);
+ NIR_PASS(_, c->s, nir_opt_cse);
}
- NIR_PASS_V(c->s, nir_lower_bool_to_int32);
- nir_convert_to_lcssa(c->s, true, true);
+ NIR_PASS(_, c->s, nir_lower_bool_to_int32);
+ NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true);
NIR_PASS_V(c->s, nir_divergence_analysis);
- NIR_PASS_V(c->s, nir_convert_from_ssa, true);
+ NIR_PASS(_, c->s, nir_convert_from_ssa, true);
struct nir_schedule_options schedule_options = {
/* Schedule for about half our register space, to enable more
* shaders to hit 4 threads.
*/
- .threshold = 24,
+ .threshold = c->threads == 4 ? 24 : 48,
/* Vertex shaders share the same memory for inputs and outputs,
- * fragement and geometry shaders do not.
+ * fragment and geometry shaders do not.
*/
.stages_with_shared_io_memory =
(((1 << MESA_ALL_SHADER_STAGES) - 1) &
@@ -1548,11 +1763,22 @@ v3d_attempt_compile(struct v3d_compile *c)
.intrinsic_cb = v3d_intrinsic_dependency_cb,
.intrinsic_cb_data = c,
+
+ .instr_delay_cb = v3d_instr_delay_cb,
+ .instr_delay_cb_data = c,
};
NIR_PASS_V(c->s, nir_schedule, &schedule_options);
if (!c->disable_constant_ubo_load_sorting)
- NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);
+ NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c);
+
+ const nir_move_options buffer_opts = c->move_buffer_loads ?
+ (nir_move_load_ubo | nir_move_load_ssbo) : 0;
+ NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform |
+ nir_move_const_undef |
+ buffer_opts);
+
+ NIR_PASS_V(c->s, nir_trivialize_registers);
v3d_nir_to_vir(c);
}
@@ -1611,32 +1837,28 @@ int v3d_shaderdb_dump(struct v3d_compile *c,
* register allocation to any particular thread count). This is fine
* because v3d_nir_to_vir will cap this to the actual minimum.
*/
-struct v3d_compiler_strategy {
- const char *name;
- uint32_t max_threads;
- uint32_t min_threads;
- bool disable_loop_unrolling;
- bool disable_ubo_load_sorting;
- bool disable_tmu_pipelining;
- bool tmu_spilling_allowed;
-} static const strategies[] = {
- /*0*/ { "default", 4, 4, false, false, false, false },
- /*1*/ { "disable loop unrolling", 4, 4, true, false, false, false },
- /*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false },
- /*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false },
- /*4*/ { "lower thread count", 2, 1, false, false, false, false },
- /*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false },
- /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false },
- /*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true },
- /*8*/ { "fallback scheduler", 2, 1, true, true, true, true }
+static const struct v3d_compiler_strategy strategies[] = {
+ /*0*/ { "default", 4, 4, false, false, false, false, false, false, 0 },
+ /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, false, 0 },
+ /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, false, 0 },
+ /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, false, 0 },
+ /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, false, 0 },
+ /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, false, true, 0 },
+ /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, false, -1 },
+ /*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, false, -1 },
+ /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, false, -1 },
+ /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, false, -1 },
+ /*10*/ { "Move buffer loads (2t)", 2, 1, true, true, true, true, true, false, -1 },
+ /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, true, -1 },
+ /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, true, -1 }
};
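The strategy struct itself now lives in v3d_compiler.h, so the columns above have to be read against the controls used elsewhere in this patch: the booleans toggle the general-TMU-scheduling, GCM, loop-unrolling, UBO-load-sorting, buffer-load-moving and TMU-pipelining behaviors (the exact field names and their order are assumptions here, since the header is not shown), and the trailing integer is max_tmu_spills, where 0 forbids TMU spilling and -1 appears to lift the cap for the 2-thread fallbacks.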
/**
* If a particular optimization didn't make any progress during a compile
- * attempt disabling it alone won't allow us to compile the shader successfuly,
+ * attempt disabling it alone won't allow us to compile the shader successfully,
* since we'll end up with the same code. Detect these scenarios so we can
* avoid wasting time with useless compiles. We should also consider if the
- * strategy changes other aspects of the compilation process though, like
+ * strategy changes other aspects of the compilation process though, like
* spilling, and not skip it in that case.
*/
static bool
@@ -1649,31 +1871,55 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
assert(idx > 0);
/* Don't skip a strategy that changes spilling behavior */
- if (strategies[idx].tmu_spilling_allowed !=
- strategies[idx - 1].tmu_spilling_allowed) {
+ if (strategies[idx].max_tmu_spills !=
+ strategies[idx - 1].max_tmu_spills) {
return false;
}
switch (idx) {
- /* Loop unrolling: skip if we didn't unroll any loops */
+ /* General TMU sched.: skip if we didn't emit any TMU loads */
case 1:
- case 5:
+ case 7:
+ return !c->has_general_tmu_load;
+ /* Global code motion: skip if nir_opt_gcm didn't make any progress */
+ case 2:
+ case 8:
+ return !c->gcm_progress;
+ /* Loop unrolling: skip if we didn't unroll any loops */
+ case 3:
+ case 9:
return !c->unrolled_any_loops;
/* UBO load sorting: skip if we didn't sort any loads */
- case 2:
- case 6:
+ case 4:
return !c->sorted_any_ubo_loads;
+ /* Move buffer loads: we assume any shader with difficult RA
+ * most likely has UBO / SSBO loads so we never try to skip.
+ * For now, we only try this for 2-thread compiles since it
+ * is expected to impact instruction counts and latency.
+ */
+ case 10:
+ assert(c->threads < 4);
+ return false;
/* TMU pipelining: skip if we didn't pipeline any TMU ops */
- case 3:
- case 7:
+ case 5:
+ case 11:
return !c->pipelined_any_tmu;
/* Lower thread count: skip if we already tried less than 4 threads */
- case 4:
+ case 6:
return c->threads < 4;
default:
return false;
};
}
+
+static inline void
+set_best_compile(struct v3d_compile **best, struct v3d_compile *c)
+{
+ if (*best)
+ vir_compile_destroy(*best);
+ *best = c;
+}
+
uint64_t *v3d_compile(const struct v3d_compiler *compiler,
struct v3d_key *key,
struct v3d_prog_data **out_prog_data,
@@ -1685,58 +1931,106 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
uint32_t *final_assembly_size)
{
struct v3d_compile *c = NULL;
- for (int i = 0; i < ARRAY_SIZE(strategies); i++) {
+
+ uint32_t best_spill_fill_count = UINT32_MAX;
+ struct v3d_compile *best_c = NULL;
+ for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) {
/* Fallback strategy */
- if (i > 0) {
+ if (strat > 0) {
assert(c);
- if (skip_compile_strategy(c, i))
+ if (skip_compile_strategy(c, strat))
continue;
char *debug_msg;
int ret = asprintf(&debug_msg,
- "Falling back to strategy '%s' for %s",
- strategies[i].name,
- vir_get_stage_name(c));
+ "Falling back to strategy '%s' "
+ "for %s prog %d/%d",
+ strategies[strat].name,
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id);
if (ret >= 0) {
- if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
+ if (V3D_DBG(PERF))
fprintf(stderr, "%s\n", debug_msg);
c->debug_output(debug_msg, c->debug_output_data);
free(debug_msg);
}
- vir_compile_destroy(c);
+ if (c != best_c)
+ vir_compile_destroy(c);
}
c = vir_compile_init(compiler, key, s,
debug_output, debug_output_data,
program_id, variant_id,
- strategies[i].max_threads,
- strategies[i].min_threads,
- strategies[i].tmu_spilling_allowed,
- strategies[i].disable_loop_unrolling,
- strategies[i].disable_ubo_load_sorting,
- strategies[i].disable_tmu_pipelining,
- i == ARRAY_SIZE(strategies) - 1);
+ strat, &strategies[strat],
+ strat == ARRAY_SIZE(strategies) - 1);
v3d_attempt_compile(c);
- if (i >= ARRAY_SIZE(strategies) - 1 ||
- c->compilation_result !=
- V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) {
+ /* Broken shader or driver bug */
+ if (c->compilation_result == V3D_COMPILATION_FAILED)
break;
+
+ /* If we compiled without spills, choose this.
+ * Otherwise if this is a 4-thread compile, choose this (these
+ * have a very low cap on the allowed TMU spills so we assume
+ * it will be better than a 2-thread compile without spills).
+ * Otherwise, keep going while tracking the strategy with the
+ * lowest spill count.
+ */
+ if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
+ if (c->spills == 0 ||
+ strategies[strat].min_threads == 4 ||
+ V3D_DBG(OPT_COMPILE_TIME)) {
+ set_best_compile(&best_c, c);
+ break;
+ } else if (c->spills + c->fills <
+ best_spill_fill_count) {
+ set_best_compile(&best_c, c);
+ best_spill_fill_count = c->spills + c->fills;
+ }
+
+ if (V3D_DBG(PERF)) {
+ char *debug_msg;
+ int ret = asprintf(&debug_msg,
+ "Compiled %s prog %d/%d with %d "
+ "spills and %d fills. Will try "
+ "more strategies.",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id,
+ c->spills, c->fills);
+ if (ret >= 0) {
+ fprintf(stderr, "%s\n", debug_msg);
+ c->debug_output(debug_msg, c->debug_output_data);
+ free(debug_msg);
+ }
+ }
}
+
+ /* Only try next strategy if we failed to register allocate
+ * or we had to spill.
+ */
+ assert(c->compilation_result ==
+ V3D_COMPILATION_FAILED_REGISTER_ALLOCATION ||
+ c->spills > 0);
}
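A condensed, illustrative sketch of the selection policy described in the comments inside the loop above (it ignores the debug override and works on plain values rather than struct v3d_compile):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Returns true when an attempt should replace the best compile so far; the
 * first two cases also end the search in the real code. */
static bool keep_as_best(bool succeeded, uint32_t spills, uint32_t fills,
                         uint32_t min_threads, uint32_t best_spill_fill_count)
{
        if (!succeeded)
                return false;
        /* No spills at all, or a 4-thread compile with its tight spill cap. */
        if (spills == 0 || min_threads == 4)
                return true;
        /* Otherwise only keep it if it spills/fills less than the best so far. */
        return spills + fills < best_spill_fill_count;
}

int main(void)
{
        assert(keep_as_best(true, 0, 0, 2, UINT32_MAX));  /* clean 2-thread compile */
        assert(keep_as_best(true, 3, 1, 4, UINT32_MAX));  /* 4-thread, capped spills */
        assert(!keep_as_best(true, 5, 5, 2, 8));          /* spills more than best */
        return 0;
}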
- if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) &&
+ /* If the best strategy was not the last, choose that */
+ if (best_c && c != best_c)
+ set_best_compile(&c, best_c);
+
+ if (V3D_DBG(PERF) &&
c->compilation_result !=
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
c->spills > 0) {
char *debug_msg;
int ret = asprintf(&debug_msg,
- "Compiled %s with %d spills and %d fills",
+ "Compiled %s prog %d/%d with %d "
+ "spills and %d fills",
vir_get_stage_name(c),
+ c->program_id, c->variant_id,
c->spills, c->fills);
fprintf(stderr, "%s\n", debug_msg);
@@ -1747,8 +2041,12 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
}
if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
- fprintf(stderr, "Failed to compile %s with any strategy.\n",
- vir_get_stage_name(c));
+ fprintf(stderr, "Failed to compile %s prog %d/%d "
+ "with any strategy.\n",
+ vir_get_stage_name(c), c->program_id, c->variant_id);
+
+ vir_compile_destroy(c);
+ return NULL;
}
struct v3d_prog_data *prog_data;
@@ -1762,8 +2060,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
char *shaderdb;
int ret = v3d_shaderdb_dump(c, &shaderdb);
if (ret >= 0) {
- if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
- fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
+ if (V3D_DBG(SHADERDB))
+ fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb);
c->debug_output(shaderdb, c->debug_output_data);
free(shaderdb);
@@ -1872,8 +2170,11 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
struct qinst *prev_inst = NULL;
assert(c->cur_block);
-#ifdef DEBUG
- /* Check if the current instruction is part of the current block */
+#if MESA_DEBUG
+ /* We can only reuse a uniform if it was emitted in the same block,
+ * so callers must make sure the current instruction is being emitted
+ * in the current block.
+ */
bool found = false;
vir_for_each_inst(inst, c->cur_block) {
if (&inst->link == c->cursor.link) {
@@ -1882,7 +2183,7 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
}
}
- assert(found || list_is_empty(&c->cur_block->instructions));
+ assert(found || &c->cur_block->instructions == c->cursor.link);
#endif
list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
@@ -1900,6 +2201,12 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
if (!prev_inst)
return false;
+ /* Only reuse the ldunif result if it was written to a temp register,
+ * otherwise there may be special restrictions (for example, ldunif
+ * may write directly to unifa, which is a write-only register).
+ */
+ if (prev_inst->dst.file != QFILE_TEMP)
+ return false;
list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
&c->cur_block->instructions, link) {
diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
index 5c47bbdc1b0..631eeee52ab 100644
--- a/src/broadcom/compiler/vir_dump.c
+++ b/src/broadcom/compiler/vir_dump.c
@@ -182,11 +182,6 @@ vir_print_reg(struct v3d_compile *c, const struct qinst *inst,
break;
}
- case QFILE_VPM:
- fprintf(stderr, "vpm%d.%d",
- reg.index / 4, reg.index % 4);
- break;
-
case QFILE_TEMP:
fprintf(stderr, "t%d", reg.index);
break;
@@ -197,9 +192,6 @@ static void
vir_dump_sig_addr(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr)
{
- if (devinfo->ver < 41)
- return;
-
if (!instr->sig_magic)
fprintf(stderr, ".rf%d", instr->sig_addr);
else {
@@ -270,8 +262,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
- unpack[0] = instr->alu.add.a_unpack;
- unpack[1] = instr->alu.add.b_unpack;
+ unpack[0] = instr->alu.add.a.unpack;
+ unpack[1] = instr->alu.add.b.unpack;
} else {
fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
@@ -282,8 +274,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
- unpack[0] = instr->alu.mul.a_unpack;
- unpack[1] = instr->alu.mul.b_unpack;
+ unpack[0] = instr->alu.mul.a.unpack;
+ unpack[1] = instr->alu.mul.b.unpack;
}
for (int i = 0; i < nsrc; i++) {
diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c
index 2fd6430a0f4..d1f44aa9cf7 100644
--- a/src/broadcom/compiler/vir_live_variables.c
+++ b/src/broadcom/compiler/vir_live_variables.c
@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c)
flags_inst = NULL;
}
- /* Payload registers: r0/1/2 contain W, centroid W,
- * and Z at program start. Register allocation will
- * force their nodes to R0/1/2.
+ /* Payload registers: for fragment shaders, W,
+ * centroid W, and Z will be initialized in r0/1/2
+ * until v42, or r1/r2/r3 since v71.
+ *
+ * For compute shaders, payload is in r0/r2 up to v42,
+ * r2/r3 since v71.
+ *
+ * Register allocation will force their nodes to those
+ * registers.
*/
if (inst->src[0].file == QFILE_REG) {
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
+ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0;
+ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2;
+ if (inst->src[0].index >= min_payload_r &&
+ inst->src[0].index <= max_payload_r) {
c->temp_start[inst->dst.index] = 0;
- break;
}
}
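A standalone sketch of the corrected payload-range test, with ver standing in for c->devinfo->ver and the register numbers taken from the comment above:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool is_payload_reg(int ver, uint32_t index)
{
        uint32_t min_payload_r = ver >= 71 ? 1 : 0;
        uint32_t max_payload_r = ver >= 71 ? 3 : 2;
        return index >= min_payload_r && index <= max_payload_r;
}

int main(void)
{
        /* v4.2: payload lives in r0/r1/r2 */
        assert(is_payload_reg(42, 0) && is_payload_reg(42, 2));
        assert(!is_payload_reg(42, 3));
        /* v7.1: payload lives in rf1/rf2/rf3 */
        assert(!is_payload_reg(71, 0));
        assert(is_payload_reg(71, 1) && is_payload_reg(71, 3));
        return 0;
}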
@@ -306,6 +311,8 @@ vir_calculate_live_intervals(struct v3d_compile *c)
vir_for_each_block(block, c) {
ralloc_free(block->def);
+ ralloc_free(block->defin);
+ ralloc_free(block->defout);
ralloc_free(block->use);
ralloc_free(block->live_in);
ralloc_free(block->live_out);
diff --git a/src/broadcom/compiler/vir_opt_constant_alu.c b/src/broadcom/compiler/vir_opt_constant_alu.c
index 483646f882e..dc4c8a65026 100644
--- a/src/broadcom/compiler/vir_opt_constant_alu.c
+++ b/src/broadcom/compiler/vir_opt_constant_alu.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2021 Raspberry Pi
+ * Copyright © 2021 Raspberry Pi Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -155,6 +155,7 @@ vir_opt_constant_alu(struct v3d_compile *c)
{
bool progress = false;
vir_for_each_block(block, c) {
+ c->cur_block = block;
vir_for_each_inst_safe(inst, block) {
progress = try_opt_constant_alu(c, inst) || progress;
}
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
index c5bb6112173..611c4693ed3 100644
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -35,7 +35,7 @@
#include "v3d_compiler.h"
static bool
-is_copy_mov(struct qinst *inst)
+is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
{
if (!inst)
return false;
@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
return false;
}
- switch (inst->src[0].file) {
- case QFILE_MAGIC:
- /* No copy propagating from R3/R4/R5 -- the MOVs from those
- * are there to register allocate values produced into R3/4/5
- * to other regs (though hopefully r3/4/5).
- */
- switch (inst->src[0].index) {
- case V3D_QPU_WADDR_R3:
- case V3D_QPU_WADDR_R4:
- case V3D_QPU_WADDR_R5:
- return false;
+ if (devinfo->ver == 42) {
+ switch (inst->src[0].file) {
+ case QFILE_MAGIC:
+ /* No copy propagating from R3/R4/R5 -- the MOVs from
+ * those are there to register allocate values produced
+ * into R3/4/5 to other regs (though hopefully r3/4/5).
+ */
+ switch (inst->src[0].index) {
+ case V3D_QPU_WADDR_R3:
+ case V3D_QPU_WADDR_R4:
+ case V3D_QPU_WADDR_R5:
+ return false;
+ default:
+ break;
+ }
+ break;
+
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ case 0:
+ case 1:
+ case 2:
+ /* MOVs from rf0/1/2 are only to track the live
+ * intervals for W/centroid W/Z.
+ */
+ return false;
+ }
+ break;
+
default:
break;
}
- break;
-
- case QFILE_REG:
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- /* MOVs from rf0/1/2 are only to track the live
+ } else {
+ assert(devinfo->ver >= 71);
+ switch (inst->src[0].file) {
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ /* MOVs from rf1/2/3 are only to track the live
* intervals for W/centroid W/Z.
+ *
+ * Note: rf0 can be implicitly written by ldvary
+ * (no temp involved), so it is not an SSA value and
+ * could clash with writes to other temps that are
+ * also allocated to rf0. In theory, that would mean
+ * that we can't copy propagate from it, but we handle
+ * this at register allocation time, preventing temps
+ * from being allocated to rf0 while the rf0 value from
+ * ldvary is still live.
*/
- return false;
- }
- break;
+ case 1:
+ case 2:
+ case 3:
+ return false;
+ }
+ break;
- default:
- break;
+ default:
+ break;
+ }
}
return true;
@@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan)
if (vir_is_add(inst)) {
if (chan == 0)
- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE;
} else {
if (chan == 0)
- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE;
}
}
@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
*/
struct qinst *mov = movs[inst->src[i].index];
if (!mov) {
- if (!is_copy_mov(c->defs[inst->src[i].index]))
+ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index]))
continue;
mov = c->defs[inst->src[i].index];
@@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
continue;
/* these ops can't represent abs. */
- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
+ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_VFPACK:
case V3D_QPU_A_FROUND:
@@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
inst->src[i] = mov->src[0];
if (vir_has_unpack(mov, 0)) {
- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
+ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack;
vir_set_unpack(inst, i, unpack);
}
@@ -238,12 +267,14 @@ vir_opt_copy_propagate(struct v3d_compile *c)
*/
memset(movs, 0, sizeof(struct qinst *) * c->num_temps);
+ c->cur_block = block;
vir_for_each_inst(inst, block) {
+
progress = try_copy_prop(c, inst, movs) || progress;
apply_kills(c, movs, inst);
- if (is_copy_mov(inst))
+ if (is_copy_mov(c->devinfo, inst))
movs[inst->dst.index] = inst;
}
}
diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c
index 64c762c88db..fd1af944427 100644
--- a/src/broadcom/compiler/vir_opt_dead_code.c
+++ b/src/broadcom/compiler/vir_opt_dead_code.c
@@ -52,21 +52,10 @@ dce(struct v3d_compile *c, struct qinst *inst)
}
static bool
-has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst)
-{
- for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_VPM)
- return true;
- }
-
- return false;
-}
-
-static bool
can_write_to_null(struct v3d_compile *c, struct qinst *inst)
{
/* The SFU instructions must write to a physical register. */
- if (c->devinfo->ver >= 41 && v3d_qpu_uses_sfu(&inst->qpu))
+ if (v3d_qpu_uses_sfu(&inst->qpu))
return false;
return true;
@@ -149,30 +138,25 @@ check_first_ldunifa(struct v3d_compile *c,
}
static bool
-increment_unifa_address(struct v3d_compile *c, struct qblock *block, struct qinst *unifa)
+increment_unifa_address(struct v3d_compile *c, struct qinst *unifa)
{
- struct qblock *current_block = c->cur_block;
if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
unifa->qpu.alu.mul.op == V3D_QPU_M_MOV) {
c->cursor = vir_after_inst(unifa);
- c->cur_block = block;
struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
vir_ADD_dest(c, unifa_reg, unifa->src[0], vir_uniform_ui(c, 4u));
vir_remove_instruction(c, unifa);
- c->cur_block = current_block;
return true;
}
if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
unifa->qpu.alu.add.op == V3D_QPU_A_ADD) {
c->cursor = vir_after_inst(unifa);
- c->cur_block = block;
struct qreg unifa_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
struct qreg tmp =
vir_ADD(c, unifa->src[1], vir_uniform_ui(c, 4u));
vir_ADD_dest(c, unifa_reg, unifa->src[0], tmp);
vir_remove_instruction(c, unifa);
- c->cur_block = current_block;
return true;
}
@@ -200,7 +184,7 @@ vir_opt_dead_code(struct v3d_compile *c)
vir_for_each_block(block, c) {
struct qinst *last_flags_write = NULL;
-
+ c->cur_block = block;
vir_for_each_inst_safe(inst, block) {
/* If this instruction reads the flags, we can't
* remove the flags generation for it.
@@ -246,7 +230,6 @@ vir_opt_dead_code(struct v3d_compile *c)
}
if (v3d_qpu_writes_flags(&inst->qpu) ||
- has_nonremovable_reads(c, inst) ||
(is_ldunifa && !is_first_ldunifa && !is_last_ldunifa)) {
/* If we can't remove the instruction, but we
* don't need its destination value, just
@@ -276,7 +259,7 @@ vir_opt_dead_code(struct v3d_compile *c)
*/
if (is_first_ldunifa) {
assert(unifa);
- if (!increment_unifa_address(c, block, unifa))
+ if (!increment_unifa_address(c, unifa))
continue;
}
diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c
index 4609ef9c361..6b61ed6a39a 100644
--- a/src/broadcom/compiler/vir_opt_redundant_flags.c
+++ b/src/broadcom/compiler/vir_opt_redundant_flags.c
@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
a->qpu.flags.mpf != b->qpu.flags.mpf ||
a->qpu.alu.add.op != b->qpu.alu.add.op ||
a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
+ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack ||
+ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack ||
a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
+ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack ||
+ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack ||
a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
return false;
}
@@ -99,6 +99,7 @@ vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block)
struct qinst *last_flags = NULL;
bool progress = false;
+ c->cur_block = block;
vir_for_each_inst(inst, block) {
if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
inst->qpu.flags.auf != V3D_QPU_UF_NONE ||
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
index 47d7722968d..56f0bf20706 100644
--- a/src/broadcom/compiler/vir_opt_small_immediates.c
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c)
/* The small immediate value sits in the raddr B field, so we
* can't have 2 small immediates in one instruction (unless
* they're the same value, but that should be optimized away
- * elsewhere).
+ * elsewhere). Since 7.x we can encode small immediates in
+ * any raddr field, but each instruction can still only use
+ * one.
*/
bool uses_small_imm = false;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
*/
struct v3d_qpu_sig new_sig = inst->qpu.sig;
uint32_t sig_packed;
- new_sig.small_imm = true;
+ if (c->devinfo->ver == 42) {
+ new_sig.small_imm_b = true;
+ } else {
+ if (vir_is_add(inst)) {
+ if (i == 0)
+ new_sig.small_imm_a = true;
+ else
+ new_sig.small_imm_b = true;
+ } else {
+ if (i == 0)
+ new_sig.small_imm_c = true;
+ else
+ new_sig.small_imm_d = true;
+ }
+ }
+
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
continue;
@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c)
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
- inst->qpu.sig.small_imm = true;
+ inst->qpu.sig.small_imm_a = new_sig.small_imm_a;
+ inst->qpu.sig.small_imm_b = new_sig.small_imm_b;
+ inst->qpu.sig.small_imm_c = new_sig.small_imm_c;
+ inst->qpu.sig.small_imm_d = new_sig.small_imm_d;
inst->qpu.raddr_b = packed;
inst->src[i].file = QFILE_SMALL_IMM;
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 08698b4ece1..53e84840899 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -26,12 +26,100 @@
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"
-#define QPU_R(i) { .magic = false, .index = i }
-
#define ACC_INDEX 0
#define ACC_COUNT 6
-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
-#define PHYS_COUNT 64
+
+/* RA nodes used to track RF registers with implicit writes */
+#define IMPLICIT_RF_COUNT 1
+
+#define PHYS_COUNT 64
+
+static uint8_t
+get_phys_index(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return ACC_INDEX + ACC_COUNT;
+ else
+ return 0;
+}
+
+/* ACC as accumulator */
+#define CLASS_BITS_PHYS (1 << 0)
+#define CLASS_BITS_ACC (1 << 1)
+#define CLASS_BITS_R5 (1 << 4)
+
+static uint8_t
+get_class_bit_any(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
+ else
+ return CLASS_BITS_PHYS;
+}
+
+static uint8_t
+filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
+{
+ if (!devinfo->has_accumulators) {
+ assert(class_bits & CLASS_BITS_PHYS);
+ class_bits = CLASS_BITS_PHYS;
+ }
+ return class_bits;
+}
+
+static inline uint32_t
+temp_to_node(struct v3d_compile *c, uint32_t temp)
+{
+ return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
+}
+
+static inline uint32_t
+node_to_temp(struct v3d_compile *c, uint32_t node)
+{
+ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
+ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
+ return node - (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
+}
+
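The mapping above reserves the first graph nodes for fixed registers (accumulators on v4.2, the implicit-write RF node on v7.1+) and appends temps after them; a minimal sketch using the constants defined earlier in this file:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define ACC_COUNT         6
#define IMPLICIT_RF_COUNT 1

static uint32_t temp_to_node(bool has_accumulators, uint32_t temp)
{
        return temp + (has_accumulators ? ACC_COUNT : IMPLICIT_RF_COUNT);
}

int main(void)
{
        assert(temp_to_node(true, 0) == 6);  /* v4.2: temp 0 follows acc0..acc5 */
        assert(temp_to_node(false, 0) == 1); /* v7.1: temp 0 follows the implicit rf node */
        return 0;
}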
+static inline uint8_t
+get_temp_class_bits(struct v3d_compile *c,
+ uint32_t temp)
+{
+ return c->nodes.info[temp_to_node(c, temp)].class_bits;
+}
+
+static inline void
+set_temp_class_bits(struct v3d_compile *c,
+ uint32_t temp, uint8_t class_bits)
+{
+ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
+}
+
+static struct ra_class *
+choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
+{
+ if (class_bits == CLASS_BITS_PHYS) {
+ return c->compiler->reg_class_phys[c->thread_index];
+ } else if (class_bits == (CLASS_BITS_R5)) {
+ assert(c->devinfo->has_accumulators);
+ return c->compiler->reg_class_r5[c->thread_index];
+ } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
+ assert(c->devinfo->has_accumulators);
+ return c->compiler->reg_class_phys_or_acc[c->thread_index];
+ } else {
+ assert(class_bits == get_class_bit_any(c->devinfo));
+ return c->compiler->reg_class_any[c->thread_index];
+ }
+}
+
+static inline struct ra_class *
+choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
+{
+ assert(temp < c->num_temps && temp < c->nodes.alloc_count);
+ return choose_reg_class(c, get_temp_class_bits(c, temp));
+}
static inline bool
qinst_writes_tmu(const struct v3d_device_info *devinfo,
@@ -46,23 +134,22 @@ static bool
is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
struct qinst *inst, struct qblock *block)
{
- if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
- inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
- return true;
- }
-
- if (!inst->qpu.sig.ldtmu)
+ /* Only tmuwt and ldtmu can finish TMU sequences */
+ bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+ inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
+ bool is_ldtmu = inst->qpu.sig.ldtmu;
+ if (!is_tmuwt && !is_ldtmu)
return false;
+ /* Check if this is the last tmuwt or ldtmu in the sequence */
list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
&block->instructions, link) {
- if (scan_inst->qpu.sig.ldtmu)
- return false;
+ is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+ scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
+ is_ldtmu = scan_inst->qpu.sig.ldtmu;
- if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
- inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
- return true;
- }
+ if (is_tmuwt || is_ldtmu)
+ return false;
if (qinst_writes_tmu(devinfo, scan_inst))
return true;
@@ -79,11 +166,101 @@ vir_is_mov_uniform(struct v3d_compile *c, int temp)
return def && def->qpu.sig.ldunif;
}
+static bool
+can_reconstruct_inst(struct qinst *inst)
+{
+ assert(inst);
+
+ if (vir_is_add(inst)) {
+ switch (inst->qpu.alu.add.op) {
+ case V3D_QPU_A_FXCD:
+ case V3D_QPU_A_FYCD:
+ case V3D_QPU_A_XCD:
+ case V3D_QPU_A_YCD:
+ case V3D_QPU_A_IID:
+ case V3D_QPU_A_EIDX:
+ case V3D_QPU_A_TIDX:
+ case V3D_QPU_A_SAMPID:
+ /* No need to check input unpacks because none of these
+ * opcodes read sources. FXCD,FYCD have pack variants.
+ */
+ return inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
+ inst->qpu.flags.auf == V3D_QPU_UF_NONE &&
+ inst->qpu.flags.apf == V3D_QPU_PF_NONE &&
+ inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE;
+ default:
+ return false;
+ }
+ }
+
+ return false;
+}
+
+static bool
+can_reconstruct_temp(struct v3d_compile *c, int temp)
+{
+ struct qinst *def = c->defs[temp];
+ return def && can_reconstruct_inst(def);
+}
+
+static struct qreg
+reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op)
+{
+ struct qreg dest;
+ switch (op) {
+ case V3D_QPU_A_FXCD:
+ dest = vir_FXCD(c);
+ break;
+ case V3D_QPU_A_FYCD:
+ dest = vir_FYCD(c);
+ break;
+ case V3D_QPU_A_XCD:
+ dest = vir_XCD(c);
+ break;
+ case V3D_QPU_A_YCD:
+ dest = vir_YCD(c);
+ break;
+ case V3D_QPU_A_IID:
+ dest = vir_IID(c);
+ break;
+ case V3D_QPU_A_EIDX:
+ dest = vir_EIDX(c);
+ break;
+ case V3D_QPU_A_TIDX:
+ dest = vir_TIDX(c);
+ break;
+ case V3D_QPU_A_SAMPID:
+ dest = vir_SAMPID(c);
+ break;
+ default:
+ unreachable("Unexpected opcode for reconstruction");
+ }
+
+ return dest;
+}
+
+enum temp_spill_type {
+ SPILL_TYPE_UNIFORM,
+ SPILL_TYPE_RECONSTRUCT,
+ SPILL_TYPE_TMU
+};
+
+static enum temp_spill_type
+get_spill_type_for_temp(struct v3d_compile *c, int temp)
+{
+ if (vir_is_mov_uniform(c, temp))
+ return SPILL_TYPE_UNIFORM;
+
+ if (can_reconstruct_temp(c, temp))
+ return SPILL_TYPE_RECONSTRUCT;
+
+ return SPILL_TYPE_TMU;
+}
+
static int
-v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
- uint32_t *temp_to_node)
+v3d_choose_spill_node(struct v3d_compile *c)
{
- const float tmu_scale = 5;
+ const float tmu_scale = 10;
float block_scale = 1.0;
float spill_costs[c->num_temps];
bool in_tmu_operation = false;
@@ -99,7 +276,8 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
* starting output writes.
*/
bool no_spilling =
- c->threads > 1 && started_last_seg;
+ (c->threads > 1 && started_last_seg) ||
+ (c->max_tmu_spills == 0);
/* Discourage spilling of TMU operations */
for (int i = 0; i < vir_get_nsrc(inst); i++) {
@@ -107,7 +285,10 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
continue;
int temp = inst->src[i].index;
- if (vir_is_mov_uniform(c, temp)) {
+ enum temp_spill_type spill_type =
+ get_spill_type_for_temp(c, temp);
+
+ if (spill_type != SPILL_TYPE_TMU) {
spill_costs[temp] += block_scale;
} else if (!no_spilling) {
float tmu_op_scale = in_tmu_operation ?
@@ -122,11 +303,11 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
if (inst->dst.file == QFILE_TEMP) {
int temp = inst->dst.index;
+ enum temp_spill_type spill_type =
+ get_spill_type_for_temp(c, temp);
- if (vir_is_mov_uniform(c, temp)) {
- /* We just rematerialize the unform
- * later.
- */
+ if (spill_type != SPILL_TYPE_TMU) {
+ /* We just rematerialize it later */
} else if (!no_spilling) {
spill_costs[temp] += (block_scale *
tmu_scale);
@@ -147,10 +328,6 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
if (inst->is_last_thrsw)
started_last_seg = true;
- if (v3d_qpu_writes_vpm(&inst->qpu) ||
- v3d_qpu_uses_tlb(&inst->qpu))
- started_last_seg = true;
-
/* Track when we're in between a TMU setup and the
* final LDTMU or TMUWT from that TMU setup. We
* penalize spills during that time.
@@ -163,12 +340,53 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
}
}
+ /* We always emit a "last thrsw" to ensure all our spilling occurs
+ * before the last thread section. See vir_emit_last_thrsw.
+ */
+ assert(started_last_seg);
+
for (unsigned i = 0; i < c->num_temps; i++) {
- if (BITSET_TEST(c->spillable, i))
- ra_set_node_spill_cost(g, temp_to_node[i], spill_costs[i]);
+ if (BITSET_TEST(c->spillable, i)) {
+ ra_set_node_spill_cost(c->g, temp_to_node(c, i),
+ spill_costs[i]);
+ }
}
- return ra_get_best_spill_node(g);
+ return ra_get_best_spill_node(c->g);
+}
+
+static void
+ensure_nodes(struct v3d_compile *c)
+{
+ if (c->num_temps < c->nodes.alloc_count)
+ return;
+
+ c->nodes.alloc_count *= 2;
+ c->nodes.info = reralloc_array_size(c,
+ c->nodes.info,
+ sizeof(c->nodes.info[0]),
+ c->nodes.alloc_count +
+ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
+}
+
+/* Creates the interference node for a new temp. We use this to keep the node
+ * list updated during the spilling process, which generates new temps/nodes.
+ */
+static void
+add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
+{
+ ensure_nodes(c);
+
+ int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
+ assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
+ node == temp + IMPLICIT_RF_COUNT);
+
+ /* We fill the node priority after we are done inserting spills */
+ c->nodes.info[node].class_bits = class_bits;
+ c->nodes.info[node].priority = 0;
+ c->nodes.info[node].is_ldunif_dst = false;
+ c->nodes.info[node].is_program_end = false;
+ c->nodes.info[node].unused = false;
}
/* The spill offset for this thread takes a bit of setup, so do it once at
@@ -206,79 +424,224 @@ v3d_setup_spill_base(struct v3d_compile *c)
vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
/* Make sure that we don't spill the spilling setup instructions. */
- for (int i = start_num_temps; i < c->num_temps; i++)
+ for (int i = start_num_temps; i < c->num_temps; i++) {
BITSET_CLEAR(c->spillable, i);
+ /* If we are spilling, update the RA map with the temps added
+ * by the spill setup. Our spill_base register can never be an
+ * accumulator because it is used for TMU spill/fill and thus
+ * needs to persist across thread switches.
+ */
+ if (c->spilling) {
+ int temp_class = CLASS_BITS_PHYS;
+ if (c->devinfo->has_accumulators &&
+ i != c->spill_base.index) {
+ temp_class |= CLASS_BITS_ACC;
+ }
+ add_node(c, i, temp_class);
+ }
+ }
+
/* Restore the current block. */
c->cur_block = current_block;
c->cursor = vir_after_block(c->cur_block);
}
-static struct qinst *
-v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
+/**
+ * Computes the address for a spill/fill sequence and completes the spill/fill
+ * sequence by emitting the following code:
+ *
+ * ldunif.spill_offset
+ * add tmua spill_base spill_offset
+ * thrsw
+ *
+ * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
+ * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
+ *
+ * The parameter 'ip' represents the ip at which the spill/fill is happening.
+ * This is used to disallow accumulators on temps that cross this ip boundary
+ * due to the new thrsw introduced in the sequence above.
+ */
+static void
+v3d_emit_spill_tmua(struct v3d_compile *c,
+ uint32_t spill_offset,
+ enum v3d_qpu_cond cond,
+ int32_t ip,
+ struct qreg *fill_dst)
{
- return vir_ADD_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
- c->spill_base, vir_uniform_ui(c, spill_offset));
-}
+ assert(ip >= 0);
+
+ /* Load a uniform with the spill offset and add it to the spill base
+ * to obtain the TMUA address. It can be of class ANY because we know
+ * we are consuming it immediately without thrsw in between.
+ */
+ assert(c->disable_ldunif_opt);
+ struct qreg offset = vir_uniform_ui(c, spill_offset);
+ add_node(c, offset.index, get_class_bit_any(c->devinfo));
+ /* We always enable per-quad on spills/fills to ensure we spill
+ * any channels involved with helper invocations.
+ */
+ struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+ struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
+ inst->qpu.flags.ac = cond;
+ inst->ldtmu_count = 1;
+ inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+ 0xffffff7f); /* per-quad */
+
+ vir_emit_thrsw(c);
+
+ /* If this is for a spill, emit a TMUWT otherwise a LDTMU to load the
+ * result of the fill. The TMUWT temp is not really read, the ldtmu
+ * temp will be used immediately so just like the uniform above we
+ * can allow accumulators.
+ */
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ if (!fill_dst) {
+ struct qreg dst = vir_TMUWT(c);
+ assert(dst.file == QFILE_TEMP);
+ add_node(c, dst.index, temp_class);
+ } else {
+ *fill_dst = vir_LDTMU(c);
+ assert(fill_dst->file == QFILE_TEMP);
+ add_node(c, fill_dst->index, temp_class);
+ }
+
+ /* Temps across the thread switch we injected can't be assigned to
+ * accumulators.
+ *
+ * Fills inject code before ip, so anything that starts at ip or later
+ * is not affected by the thrsw. Something that ends at ip will be
+ * affected though.
+ *
+ * Spills inject code after ip, so anything that starts strictly later
+ * than ip is not affected (the temp starting at ip is usually the
+ * spilled temp except for postponed spills). Something that ends at ip
+ * won't be affected either.
+ */
+ for (int i = 0; i < c->spill_start_num_temps; i++) {
+ bool thrsw_cross = fill_dst ?
+ c->temp_start[i] < ip && c->temp_end[i] >= ip :
+ c->temp_start[i] <= ip && c->temp_end[i] > ip;
+ if (thrsw_cross) {
+ ra_set_node_class(c->g, temp_to_node(c, i),
+ choose_reg_class(c, CLASS_BITS_PHYS));
+ }
+ }
+}
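A compact sketch of the crossing test from the comment above; a fill injects its code before ip and a spill after it, which is why the comparisons are asymmetric:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool crosses_injected_thrsw(bool is_fill, int32_t start, int32_t end,
                                   int32_t ip)
{
        return is_fill ? (start < ip && end >= ip)
                       : (start <= ip && end > ip);
}

int main(void)
{
        /* Fill at ip 10: temps starting at or after ip are unaffected, but a
         * temp that ends at ip crosses the new thrsw. */
        assert(!crosses_injected_thrsw(true, 10, 12, 10));
        assert(crosses_injected_thrsw(true, 5, 10, 10));

        /* Spill at ip 10: temps starting strictly after ip, or ending at ip,
         * are unaffected. */
        assert(!crosses_injected_thrsw(false, 11, 15, 10));
        assert(!crosses_injected_thrsw(false, 5, 10, 10));
        return 0;
}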
static void
-v3d_emit_tmu_spill(struct v3d_compile *c, struct qinst *inst,
- struct qinst *position, uint32_t spill_offset)
+v3d_emit_tmu_spill(struct v3d_compile *c,
+ struct qinst *inst,
+ struct qreg spill_temp,
+ struct qinst *position,
+ uint32_t ip,
+ uint32_t spill_offset)
{
assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
+ assert(inst->dst.file == QFILE_TEMP);
c->cursor = vir_after_inst(position);
- inst->dst = vir_get_temp(c);
+
enum v3d_qpu_cond cond = vir_get_cond(inst);
+
+ /* If inst and position don't match, this is a postponed spill,
+ * in which case we have already allocated the temp for the spill
+ * and we should use that, otherwise create a new temp with the
+ * same register class bits as the original.
+ */
+ if (inst == position) {
+ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
+ inst->dst = vir_get_temp(c);
+ add_node(c, inst->dst.index, class_bits);
+ } else {
+ inst->dst = spill_temp;
+
+ /* If this is a postponed spill the register being spilled may
+ * have been written more than once including conditional
+ * writes, so ignore predication on the spill instruction and
+ * always spill the full register.
+ */
+ cond = V3D_QPU_COND_NONE;
+ }
+
struct qinst *tmp =
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
inst->dst);
tmp->qpu.flags.mc = cond;
- tmp = v3d_emit_spill_tmua(c, spill_offset);
- tmp->qpu.flags.ac = cond;
- vir_emit_thrsw(c);
- vir_TMUWT(c);
+
+ v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL);
+
c->spills++;
c->tmu_dirty_rcl = true;
}
+static inline bool
+interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
+{
+ return !(t0_start >= t1_end || t1_start >= t0_end);
+}
+
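interferes() is reused below when rebuilding the interference graph after a spill; a minimal standalone sketch of its overlap semantics (live ranges that merely touch at an endpoint do not interfere):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool interferes(int32_t t0_start, int32_t t0_end,
                       int32_t t1_start, int32_t t1_end)
{
        return !(t0_start >= t1_end || t1_start >= t0_end);
}

int main(void)
{
        assert(interferes(2, 6, 5, 9));   /* overlap on [5, 6) */
        assert(!interferes(2, 5, 5, 9));  /* back-to-back live ranges */
        assert(!interferes(8, 9, 2, 5));  /* disjoint live ranges */
        return 0;
}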
static void
-v3d_spill_reg(struct v3d_compile *c, int spill_temp)
+v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
+ int spill_temp)
{
- c->spill_count++;
+ c->spill_start_num_temps = c->num_temps;
+ c->spilling = true;
- bool is_uniform = vir_is_mov_uniform(c, spill_temp);
+ enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp);
uint32_t spill_offset = 0;
-
- if (!is_uniform) {
+ if (spill_type == SPILL_TYPE_TMU) {
spill_offset = c->spill_size;
c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
- if (spill_offset == 0)
+ if (spill_offset == 0) {
v3d_setup_spill_base(c);
+
+ /* Don't allocate our spill base to rf0 to avoid
+ * conflicts with instructions doing implicit writes
+ * to that register.
+ */
+ if (!c->devinfo->has_accumulators) {
+ ra_add_node_interference(
+ c->g,
+ temp_to_node(c, c->spill_base.index),
+ implicit_rf_nodes[0]);
+ }
+ }
}
struct qinst *last_thrsw = c->last_thrsw;
assert(last_thrsw && last_thrsw->is_last_thrsw);
- int start_num_temps = c->num_temps;
-
int uniform_index = ~0;
- if (is_uniform) {
+ if (spill_type == SPILL_TYPE_UNIFORM) {
struct qinst *orig_unif = c->defs[spill_temp];
uniform_index = orig_unif->uniform;
}
+ enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP;
+ if (spill_type == SPILL_TYPE_RECONSTRUCT) {
+ struct qinst *orig_def = c->defs[spill_temp];
+ assert(vir_is_add(orig_def));
+ reconstruct_op = orig_def->qpu.alu.add.op;
+ }
+
+ uint32_t spill_node = temp_to_node(c, spill_temp);
+
/* We must disable the ldunif optimization if we are spilling uniforms */
bool had_disable_ldunif_opt = c->disable_ldunif_opt;
c->disable_ldunif_opt = true;
struct qinst *start_of_tmu_sequence = NULL;
struct qinst *postponed_spill = NULL;
+ struct qreg postponed_spill_temp = { 0 };
vir_for_each_block(block, c) {
vir_for_each_inst_safe(inst, block) {
+ int32_t ip = inst->ip;
+
/* Track when we're in between a TMU setup and the final
* LDTMU or TMUWT from that TMU setup. We can't spill/fill any
* temps during that time, because that involves inserting a
@@ -289,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
if (postponed_spill) {
v3d_emit_tmu_spill(c, postponed_spill,
- inst, spill_offset);
+ postponed_spill_temp,
+ inst, ip, spill_offset);
}
start_of_tmu_sequence = NULL;
@@ -302,49 +666,103 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
}
/* fills */
+ int filled_src = -1;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_TEMP ||
inst->src[i].index != spill_temp) {
continue;
}
+ if (filled_src >= 0) {
+ inst->src[i] = inst->src[filled_src];
+ continue;
+ }
+
c->cursor = vir_before_inst(inst);
- if (is_uniform) {
+ if (spill_type == SPILL_TYPE_UNIFORM) {
struct qreg unif =
vir_uniform(c,
c->uniform_contents[uniform_index],
c->uniform_data[uniform_index]);
inst->src[i] = unif;
+ /* We are using the uniform in the
+ * instruction immediately after, so
+ * we can use any register class for it.
+ */
+ add_node(c, unif.index,
+ get_class_bit_any(c->devinfo));
+ } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
+ struct qreg temp =
+ reconstruct_temp(c, reconstruct_op);
+ inst->src[i] = temp;
+ /* We are using the temp in the
+ * instruction immediately after so we
+ * can use ACC.
+ */
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
+ CLASS_BITS_ACC);
+ add_node(c, temp.index, temp_class);
} else {
- /* If we have a postponed spill, we don't need
- * a fill as the temp would not have been
- * spilled yet.
+ /* If we have a postponed spill, we
+ * don't need a fill as the temp would
+ * not have been spilled yet; however,
+ * we need to update the temp index.
*/
- if (postponed_spill)
- continue;
- if (start_of_tmu_sequence)
- c->cursor = vir_before_inst(start_of_tmu_sequence);
-
- v3d_emit_spill_tmua(c, spill_offset);
- vir_emit_thrsw(c);
- inst->src[i] = vir_LDTMU(c);
- c->fills++;
+ if (postponed_spill) {
+ inst->src[i] =
+ postponed_spill_temp;
+ } else {
+ int32_t fill_ip = ip;
+ if (start_of_tmu_sequence) {
+ c->cursor = vir_before_inst(start_of_tmu_sequence);
+ fill_ip = start_of_tmu_sequence->ip;
+ }
+
+ v3d_emit_spill_tmua(c, spill_offset,
+ V3D_QPU_COND_NONE,
+ fill_ip, &inst->src[i]);
+ c->fills++;
+ }
}
+
+ filled_src = i;
}
/* spills */
if (inst->dst.file == QFILE_TEMP &&
inst->dst.index == spill_temp) {
- if (is_uniform) {
+ if (spill_type != SPILL_TYPE_TMU) {
c->cursor.link = NULL;
vir_remove_instruction(c, inst);
} else {
- if (start_of_tmu_sequence)
+ /* If we are in the middle of a TMU
+ * sequence, we postpone the actual
+                                         * spill until we have finished it. We
+                                         * still need to replace the spill temp
+ * with a new temp though.
+ */
+ if (start_of_tmu_sequence) {
+ if (postponed_spill) {
+ postponed_spill->dst =
+ postponed_spill_temp;
+ }
+ if (!postponed_spill ||
+ vir_get_cond(inst) == V3D_QPU_COND_NONE) {
+ postponed_spill_temp =
+ vir_get_temp(c);
+ add_node(c,
+ postponed_spill_temp.index,
+ c->nodes.info[spill_node].class_bits);
+ }
postponed_spill = inst;
- else
- v3d_emit_tmu_spill(c, inst, inst,
+ } else {
+ v3d_emit_tmu_spill(c, inst,
+ postponed_spill_temp,
+ inst, ip,
spill_offset);
+ }
}
}
}
@@ -358,21 +776,64 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
/* Don't allow spilling of our spilling instructions. There's no way
* they can help get things colored.
*/
- for (int i = start_num_temps; i < c->num_temps; i++)
+ for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
BITSET_CLEAR(c->spillable, i);
+ /* Reset interference for spilled node */
+ ra_set_node_spill_cost(c->g, spill_node, 0);
+ ra_reset_node_interference(c->g, spill_node);
+ BITSET_CLEAR(c->spillable, spill_temp);
+
+ /* Rebuild program ips */
+ int32_t ip = 0;
+ vir_for_each_inst_inorder(inst, c)
+ inst->ip = ip++;
+
+ /* Rebuild liveness */
+ vir_calculate_live_intervals(c);
+
+ /* Add interferences for the new spilled temps and update interferences
+ * for c->spill_base (since we may have modified its liveness). Also,
+         * update node priorities based on the new liveness data.
+ */
+        uint32_t sb_temp = c->spill_base.index;
+ uint32_t sb_node = temp_to_node(c, sb_temp);
+ for (uint32_t i = 0; i < c->num_temps; i++) {
+ if (c->temp_end[i] == -1)
+ continue;
+
+ uint32_t node_i = temp_to_node(c, i);
+ c->nodes.info[node_i].priority =
+ c->temp_end[i] - c->temp_start[i];
+
+ for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
+ j < c->num_temps; j++) {
+ if (interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[j], c->temp_end[j])) {
+ uint32_t node_j = temp_to_node(c, j);
+ ra_add_node_interference(c->g, node_i, node_j);
+ }
+ }
+
+ if (spill_type == SPILL_TYPE_TMU) {
+ if (i != sb_temp &&
+ interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[sb_temp], c->temp_end[sb_temp])) {
+ ra_add_node_interference(c->g, node_i, sb_node);
+ }
+ }
+ }
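
For reference while reading the liveness rebuild above: the interferes() helper is defined elsewhere in this file, but the open-coded overlap test that this series removes from v3d_register_allocate() further down suggests it is just a live-range overlap check. A minimal sketch, assuming that definition:

static inline bool
interferes(int32_t t0_start, int32_t t0_end,
           int32_t t1_start, int32_t t1_end)
{
        /* Two live ranges overlap unless one ends before the other starts. */
        return !(t0_start >= t1_end || t1_start >= t0_end);
}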
+
c->disable_ldunif_opt = had_disable_ldunif_opt;
+ c->spilling = false;
}
-struct node_to_temp_map {
- uint32_t temp;
- uint32_t priority;
-};
-
struct v3d_ra_select_callback_data {
+ uint32_t phys_index;
uint32_t next_acc;
uint32_t next_phys;
- struct node_to_temp_map *map;
+ struct v3d_ra_node_info *nodes;
+ const struct v3d_device_info *devinfo;
};
/* Choosing accumulators improves chances of merging QPU instructions
@@ -384,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
int priority)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Favor accumulators if we have less than this number of physical
* registers. Accumulators have more restrictions (like being
* invalidated through thrsw), so running out of physical registers
@@ -393,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
static const int available_rf_threshold = 5;
int available_rf = 0;
for (int i = 0; i < PHYS_COUNT; i++) {
- if (BITSET_TEST(regs, PHYS_INDEX + i))
+ if (BITSET_TEST(regs, v3d_ra->phys_index + i))
available_rf++;
if (available_rf >= available_rf_threshold)
break;
@@ -419,6 +883,19 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
unsigned int *out)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
+ /* Choose r5 for our ldunifs if possible (nobody else can load to that
+ * reg, and it keeps the QPU cond field free from being occupied by
+ * ldunifrf).
+ */
+ int r5 = ACC_INDEX + 5;
+ if (BITSET_TEST(regs, r5)) {
+ *out = r5;
+ return true;
+ }
+
/* Round-robin through our accumulators to give post-RA instruction
* selection more options.
*/
@@ -438,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ unsigned int node,
BITSET_WORD *regs,
unsigned int *out)
{
+ /* If this node is for an unused temp, ignore. */
+ if (v3d_ra->nodes->info[node].unused) {
+ *out = 0;
+ return true;
+ }
+
+ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
+ * so we can avoid turning them into ldunifrf (which uses the
+ * cond field to encode the dst and would prevent merge with
+ * instructions that use cond flags).
+ */
+ if (v3d_ra->nodes->info[node].is_ldunif_dst &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ assert(v3d_ra->devinfo->ver >= 71);
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
+        /* The last 3 instructions in a shader can't use some specific registers
+         * (usually early rf registers, depending on the v3d version), so try
+         * to avoid allocating these registers to temps used by the last
+         * instructions in the shader.
+ */
+ const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ? 3 : 4;
+ if (v3d_ra->nodes->info[node].is_program_end &&
+ v3d_ra->next_phys < safe_rf_start) {
+ v3d_ra->next_phys = safe_rf_start;
+ }
+
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
- int phys = PHYS_INDEX + phys_off;
+
+ /* Try to keep rf0 available for ldunif in 7.x (see above). */
+ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
+ continue;
+
+ int phys = v3d_ra->phys_index + phys_off;
if (BITSET_TEST(regs, phys)) {
v3d_ra->next_phys = phys_off + 1;
@@ -452,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
}
}
+ /* If we couldn't allocate, do try to assign rf0 if it is available. */
+ if (v3d_ra->devinfo->ver >= 71 &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ v3d_ra->next_phys = 1;
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
return false;
}
@@ -459,22 +979,14 @@ static unsigned int
v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
struct v3d_ra_select_callback_data *v3d_ra = data;
- int r5 = ACC_INDEX + 5;
-
- /* Choose r5 for our ldunifs if possible (nobody else can load to that
- * reg, and it keeps the QPU cond field free from being occupied by
- * ldunifrf).
- */
- if (BITSET_TEST(regs, r5))
- return r5;
unsigned int reg;
- if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) &&
+ if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) &&
v3d_ra_select_accum(v3d_ra, regs, &reg)) {
return reg;
}
- if (v3d_ra_select_rf(v3d_ra, regs, &reg))
+ if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
return reg;
/* If we ran out of physical registers try to assign an accumulator
@@ -492,9 +1004,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
/* Allocate up to 3 regfile classes, for the ways the physical
* register file can be divided up for fragment shader threading.
*/
- int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
+ int max_thread_index = 2;
+ uint8_t phys_index = get_phys_index(compiler->devinfo);
- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
+ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
false);
if (!compiler->regs)
return false;
@@ -502,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
for (int threads = 0; threads < max_thread_index; threads++) {
compiler->reg_class_any[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_r5[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_phys_or_acc[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
+ if (compiler->devinfo->has_accumulators) {
+ compiler->reg_class_r5[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ compiler->reg_class_phys_or_acc[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ }
compiler->reg_class_phys[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- for (int i = PHYS_INDEX;
- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ /* Init physical regs */
+ for (int i = phys_index;
+ i < phys_index + (PHYS_COUNT >> threads); i++) {
+ if (compiler->devinfo->has_accumulators)
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->reg_class_phys[threads], i);
ra_class_add_reg(compiler->reg_class_any[threads], i);
}
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
- ra_class_add_reg(compiler->reg_class_any[threads], i);
+ /* Init accumulator regs */
+ if (compiler->devinfo->has_accumulators) {
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ ra_class_add_reg(compiler->reg_class_any[threads], i);
+ }
+ /* r5 can only store a single 32-bit value, so not much can
+ * use it.
+ */
+ ra_class_add_reg(compiler->reg_class_r5[threads],
+ ACC_INDEX + 5);
+ ra_class_add_reg(compiler->reg_class_any[threads],
+ ACC_INDEX + 5);
}
- /* r5 can only store a single 32-bit value, so not much can
- * use it.
- */
- ra_class_add_reg(compiler->reg_class_r5[threads],
- ACC_INDEX + 5);
- ra_class_add_reg(compiler->reg_class_any[threads],
- ACC_INDEX + 5);
}
ra_set_finalize(compiler->regs, NULL);
@@ -534,52 +1054,220 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
return true;
}
-static int
-node_to_temp_priority(const void *in_a, const void *in_b)
+static inline bool
+tmu_spilling_allowed(struct v3d_compile *c)
{
- const struct node_to_temp_map *a = in_a;
- const struct node_to_temp_map *b = in_b;
-
- return a->priority - b->priority;
+ return c->spills + c->fills < c->max_tmu_spills;
}
-/**
- * Computes the number of registers to spill in a batch after a register
- * allocation failure.
- */
-static uint32_t
-get_spill_batch_size(struct v3d_compile *c)
-{
- /* Allow up to 10 spills in batches of 1 in any case to avoid any chance of
- * over-spilling if the program requires few spills to compile.
- */
- if (c->spill_count < 10)
- return 1;
-
- /* If we have to spill more than that we assume performance is not going to
- * be great and we shift focus to batching spills to cut down compile
- * time at the expense of over-spilling.
- */
- return 20;
-}
-
-/* Don't emit spills using the TMU until we've dropped thread count first. We,
- * may also disable spilling when certain optimizations that are known to
- * increase register pressure are active so we favor recompiling with
- * optimizations disabled instead of spilling.
- */
-static inline bool
-tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
+static void
+update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+ int *acc_nodes,
+ int *implicit_rf_nodes,
+ int last_ldvary_ip,
+ struct qinst *inst)
{
- return thread_index == 0 && c->tmu_spilling_allowed;
+ int32_t ip = inst->ip;
+ assert(ip >= 0);
+
+ /* If the instruction writes r4 (and optionally moves its
+ * result to a temp), nothing else can be stored in r4 across
+ * it.
+ */
+ if (vir_writes_r4_implicitly(c->devinfo, inst)) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ acc_nodes[4]);
+ }
+ }
+ }
+
+ /* If any instruction writes to a physical register implicitly
+ * nothing else can write the same register across it.
+ */
+ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
+ if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+ switch (inst->qpu.alu.add.op) {
+ case V3D_QPU_A_LDVPMV_IN:
+ case V3D_QPU_A_LDVPMV_OUT:
+ case V3D_QPU_A_LDVPMD_IN:
+ case V3D_QPU_A_LDVPMD_OUT:
+ case V3D_QPU_A_LDVPMP:
+ case V3D_QPU_A_LDVPMG_IN:
+ case V3D_QPU_A_LDVPMG_OUT: {
+ /* LDVPMs only store to temps (the MA flag
+ * decides whether the LDVPM is in or out)
+ */
+ assert(inst->dst.file == QFILE_TEMP);
+ set_temp_class_bits(c, inst->dst.index,
+ CLASS_BITS_PHYS);
+ break;
+ }
+
+ case V3D_QPU_A_RECIP:
+ case V3D_QPU_A_RSQRT:
+ case V3D_QPU_A_EXP:
+ case V3D_QPU_A_LOG:
+ case V3D_QPU_A_SIN:
+ case V3D_QPU_A_RSQRT2: {
+ /* The SFU instructions write directly to the
+ * phys regfile.
+ */
+ assert(inst->dst.file == QFILE_TEMP);
+ set_temp_class_bits(c, inst->dst.index,
+ CLASS_BITS_PHYS);
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+
+ if (inst->src[0].file == QFILE_REG) {
+ switch (inst->src[0].index) {
+ case 0:
+ /* V3D 7.x doesn't use rf0 for thread payload */
+ if (c->devinfo->ver >= 71)
+ break;
+ else
+ FALLTHROUGH;
+ case 1:
+ case 2:
+ case 3: {
+ /* Payload setup instructions: Force allocate
+ * the dst to the given register (so the MOV
+ * will disappear).
+ */
+ assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
+ assert(inst->dst.file == QFILE_TEMP);
+ uint32_t node = temp_to_node(c, inst->dst.index);
+ ra_set_node_reg(c->g, node,
+ get_phys_index(c->devinfo) +
+ inst->src[0].index);
+ break;
+ }
+ }
+ }
+
+ /* Don't allocate rf0 to temps that cross ranges where we have
+ * live implicit rf0 writes from ldvary. We can identify these
+ * by tracking the last ldvary instruction and explicit reads
+ * of rf0.
+ */
+ if (c->devinfo->ver >= 71 &&
+ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
+ (vir_get_nsrc(inst) > 1 &&
+ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip &&
+ c->temp_end[i] > last_ldvary_ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ /* Only a ldunif gets to write to R5, which only has a
+ * single 32-bit channel of storage.
+         * NOTE: ldunifa is subject to the same restriction; however,
+         * going by shader-db it is best to keep r5 exclusive to ldunif,
+         * probably because ldunif usually has a shorter lifespan, allowing
+         * for more accumulator reuse and QPU merges.
+ * more accumulator reuse and QPU merges.
+ */
+ if (c->devinfo->has_accumulators) {
+ if (!inst->qpu.sig.ldunif) {
+ uint8_t class_bits =
+ get_temp_class_bits(c, inst->dst.index) &
+ ~CLASS_BITS_R5;
+ set_temp_class_bits(c, inst->dst.index,
+ class_bits);
+
+ }
+ } else {
+ /* Make sure we don't allocate the ldvary's
+ * destination to rf0, since it would clash
+ * with its implicit write to that register.
+ */
+ if (inst->qpu.sig.ldvary) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, inst->dst.index),
+ implicit_rf_nodes[0]);
+ }
+ /* Flag dst temps from ldunif(a) instructions
+ * so we can try to assign rf0 to them and avoid
+ * converting these to ldunif(a)rf.
+ */
+ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
+ const uint32_t dst_n =
+ temp_to_node(c, inst->dst.index);
+ c->nodes.info[dst_n].is_ldunif_dst = true;
+ }
+ }
+ }
+
+ /* All accumulators are invalidated across a thread switch. */
+ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ set_temp_class_bits(c, i,
+ CLASS_BITS_PHYS);
+ }
+ }
+ }
}
-#define CLASS_BIT_PHYS (1 << 0)
-#define CLASS_BIT_ACC (1 << 1)
-#define CLASS_BIT_R5 (1 << 4)
-#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \
- CLASS_BIT_ACC | \
- CLASS_BIT_R5)
+static void
+flag_program_end_nodes(struct v3d_compile *c)
+{
+ /* Only look for registers used in this many instructions */
+ uint32_t last_set_count = 6;
+
+ struct qblock *last_block = vir_exit_block(c);
+ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ continue;
+
+ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->dst.index);
+ c->nodes.info[node].is_program_end = true;
+ }
+
+ if (--last_set_count == 0)
+ break;
+ }
+}
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
@@ -587,24 +1275,37 @@ tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
* The return value should be freed by the caller.
*/
struct qpu_reg *
-v3d_register_allocate(struct v3d_compile *c, bool *spilled)
+v3d_register_allocate(struct v3d_compile *c)
{
- uint32_t UNUSED start_num_temps = c->num_temps;
- struct node_to_temp_map map[c->num_temps];
- uint32_t temp_to_node[c->num_temps];
- uint8_t class_bits[c->num_temps];
int acc_nodes[ACC_COUNT];
+ int implicit_rf_nodes[IMPLICIT_RF_COUNT];
+
+ unsigned num_ra_nodes = c->num_temps;
+ if (c->devinfo->has_accumulators)
+ num_ra_nodes += ARRAY_SIZE(acc_nodes);
+ else
+ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
+
+ c->nodes = (struct v3d_ra_node_info) {
+ .alloc_count = c->num_temps,
+ .info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
+ num_ra_nodes),
+ };
+
+ uint32_t phys_index = get_phys_index(c->devinfo);
+
struct v3d_ra_select_callback_data callback_data = {
+ .phys_index = phys_index,
.next_acc = 0,
/* Start at RF3, to try to keep the TLB writes from using
- * RF0-2.
+ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
+ * using RF2-3.
*/
- .next_phys = 3,
- .map = map,
+ .next_phys = c->devinfo->ver == 42 ? 3 : 4,
+ .nodes = &c->nodes,
+ .devinfo = c->devinfo,
};
- *spilled = false;
-
vir_calculate_live_intervals(c);
/* Convert 1, 2, 4 threads to 0, 1, 2 index.
@@ -612,257 +1313,163 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
* V3D 4.x has double the physical register space, so 64 physical regs
* are available at both 1x and 2x threading, and 4x has 32.
*/
- int thread_index = ffs(c->threads) - 1;
- if (c->devinfo->ver >= 40) {
- if (thread_index >= 1)
- thread_index--;
- }
+ c->thread_index = ffs(c->threads) - 1;
+ if (c->thread_index >= 1)
+ c->thread_index--;
- struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
- c->num_temps +
- ARRAY_SIZE(acc_nodes));
- ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);
+ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
+ ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread
* switches. We could represent these as classes for the nodes to
* live in, but the classes take up a lot of memory to set up, so we
- * don't want to make too many.
+ * don't want to make too many. We use the same mechanism on platforms
+ * without accumulators that can have implicit writes to phys regs.
*/
- for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
- acc_nodes[i] = c->num_temps + i;
- ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
- }
-
- for (uint32_t i = 0; i < c->num_temps; i++) {
- map[i].temp = i;
- map[i].priority = c->temp_end[i] - c->temp_start[i];
- }
- qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
- for (uint32_t i = 0; i < c->num_temps; i++) {
- temp_to_node[map[i].temp] = i;
+ for (uint32_t i = 0; i < num_ra_nodes; i++) {
+ c->nodes.info[i].is_ldunif_dst = false;
+ c->nodes.info[i].is_program_end = false;
+ c->nodes.info[i].unused = false;
+ c->nodes.info[i].priority = 0;
+ c->nodes.info[i].class_bits = 0;
+ if (c->devinfo->has_accumulators && i < ACC_COUNT) {
+ acc_nodes[i] = i;
+ ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
+ } else if (!c->devinfo->has_accumulators &&
+ i < ARRAY_SIZE(implicit_rf_nodes)) {
+ implicit_rf_nodes[i] = i;
+ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
+ } else {
+ uint32_t t = node_to_temp(c, i);
+ c->nodes.info[i].priority =
+ c->temp_end[t] - c->temp_start[t];
+ c->nodes.info[i].class_bits =
+ get_class_bit_any(c->devinfo);
+ }
}
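
The loop above places the fixed accumulator or implicit-rf nodes in the first slots of the interference graph, with temps following, which implies temp_to_node()/node_to_temp() are plain offset conversions. A sketch under that assumption (the real helpers live elsewhere in this file):

static inline uint32_t
temp_to_node(struct v3d_compile *c, uint32_t temp)
{
        uint32_t fixed_nodes = c->devinfo->has_accumulators ?
                ACC_COUNT : IMPLICIT_RF_COUNT;
        return temp + fixed_nodes;
}

static inline uint32_t
node_to_temp(struct v3d_compile *c, uint32_t node)
{
        uint32_t fixed_nodes = c->devinfo->has_accumulators ?
                ACC_COUNT : IMPLICIT_RF_COUNT;
        assert(node >= fixed_nodes);
        return node - fixed_nodes;
}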
- /* Figure out our register classes and preallocated registers. We
- * start with any temp being able to be in any file, then instructions
- * incrementally remove bits that the temp definitely can't be in.
+ /* Walk the instructions adding register class restrictions and
+ * interferences.
*/
- memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));
-
int ip = 0;
+ int last_ldvary_ip = -1;
vir_for_each_inst_inorder(inst, c) {
- /* If the instruction writes r3/r4 (and optionally moves its
- * result to a temp), nothing else can be stored in r3/r4 across
- * it.
+ inst->ip = ip++;
+
+                /* ldunif(a) always writes to a temporary, so we have
+ * liveness info available to decide if rf0 is
+ * available for them, however, ldvary is different:
+ * it always writes to rf0 directly so we don't have
+ * liveness information for its implicit rf0 write.
+ *
+ * That means the allocator may assign rf0 to a temp
+ * that is defined while an implicit rf0 write from
+ * ldvary is still live. We fix that by manually
+ * tracking rf0 live ranges from ldvary instructions.
*/
- if (vir_writes_r3(c->devinfo, inst)) {
- for (int i = 0; i < c->num_temps; i++) {
- if (c->temp_start[i] < ip &&
- c->temp_end[i] > ip) {
- ra_add_node_interference(g,
- temp_to_node[i],
- acc_nodes[3]);
- }
- }
- }
- if (vir_writes_r4(c->devinfo, inst)) {
- for (int i = 0; i < c->num_temps; i++) {
- if (c->temp_start[i] < ip &&
- c->temp_end[i] > ip) {
- ra_add_node_interference(g,
- temp_to_node[i],
- acc_nodes[4]);
- }
- }
- }
-
- if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
- switch (inst->qpu.alu.add.op) {
- case V3D_QPU_A_LDVPMV_IN:
- case V3D_QPU_A_LDVPMV_OUT:
- case V3D_QPU_A_LDVPMD_IN:
- case V3D_QPU_A_LDVPMD_OUT:
- case V3D_QPU_A_LDVPMP:
- case V3D_QPU_A_LDVPMG_IN:
- case V3D_QPU_A_LDVPMG_OUT:
- /* LDVPMs only store to temps (the MA flag
- * decides whether the LDVPM is in or out)
- */
- assert(inst->dst.file == QFILE_TEMP);
- class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
- break;
-
- case V3D_QPU_A_RECIP:
- case V3D_QPU_A_RSQRT:
- case V3D_QPU_A_EXP:
- case V3D_QPU_A_LOG:
- case V3D_QPU_A_SIN:
- case V3D_QPU_A_RSQRT2:
- /* The SFU instructions write directly to the
- * phys regfile.
- */
- assert(inst->dst.file == QFILE_TEMP);
- class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
- break;
-
- default:
- break;
- }
- }
+ if (inst->qpu.sig.ldvary)
+ last_ldvary_ip = ip;
- if (inst->src[0].file == QFILE_REG) {
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- case 3:
- /* Payload setup instructions: Force allocate
- * the dst to the given register (so the MOV
- * will disappear).
- */
- assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
- assert(inst->dst.file == QFILE_TEMP);
- ra_set_node_reg(g,
- temp_to_node[inst->dst.index],
- PHYS_INDEX +
- inst->src[0].index);
- break;
- }
- }
-
- if (inst->dst.file == QFILE_TEMP) {
- /* Only a ldunif gets to write to R5, which only has a
- * single 32-bit channel of storage.
- */
- if (!inst->qpu.sig.ldunif) {
- class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
- } else {
- /* Until V3D 4.x, we could only load a uniform
- * to r5, so we'll need to spill if uniform
- * loads interfere with each other.
- */
- if (c->devinfo->ver < 40) {
- class_bits[inst->dst.index] &=
- CLASS_BIT_R5;
- }
- }
- }
-
- if (inst->qpu.sig.thrsw) {
- /* All accumulators are invalidated across a thread
- * switch.
- */
- for (int i = 0; i < c->num_temps; i++) {
- if (c->temp_start[i] < ip && c->temp_end[i] > ip)
- class_bits[i] &= CLASS_BIT_PHYS;
- }
- }
-
- ip++;
+ update_graph_and_reg_classes_for_inst(c, acc_nodes,
+ implicit_rf_nodes,
+ last_ldvary_ip, inst);
}
+ /* Flag the nodes that are used in the last instructions of the program
+ * (there are some registers that cannot be used in the last 3
+ * instructions). We only do this for fragment shaders, because the idea
+         * is that by avoiding this conflict we may be able to emit the last
+         * thread switch earlier in some cases. In non-fragment shaders this
+         * won't happen because the last instructions are always VPM stores
+ * with a small immediate, which conflicts with other signals,
+ * preventing us from ever moving the thrsw earlier.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+ flag_program_end_nodes(c);
+
+ /* Set the register classes for all our temporaries in the graph */
for (uint32_t i = 0; i < c->num_temps; i++) {
- if (class_bits[i] == CLASS_BIT_PHYS) {
- ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_phys[thread_index]);
- } else if (class_bits[i] == (CLASS_BIT_R5)) {
- ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_r5[thread_index]);
- } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
- ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_phys_or_acc[thread_index]);
- } else {
- assert(class_bits[i] == CLASS_BITS_ANY);
- ra_set_node_class(g, temp_to_node[i],
- c->compiler->reg_class_any[thread_index]);
- }
+ ra_set_node_class(c->g, temp_to_node(c, i),
+ choose_reg_class_for_temp(c, i));
}
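
choose_reg_class_for_temp() takes over the explicit class selection that the removed lines above performed inline. A plausible sketch, reconstructed from that removed mapping and the per-node class_bits (the helper body and its exact return type are assumptions):

static struct ra_class *
choose_reg_class_for_temp_sketch(struct v3d_compile *c, uint32_t temp)
{
        uint8_t class_bits = c->nodes.info[temp_to_node(c, temp)].class_bits;

        if (class_bits == CLASS_BITS_PHYS)
                return c->compiler->reg_class_phys[c->thread_index];
        else if (class_bits == CLASS_BITS_R5)
                return c->compiler->reg_class_r5[c->thread_index];
        else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC))
                return c->compiler->reg_class_phys_or_acc[c->thread_index];

        assert(class_bits == get_class_bit_any(c->devinfo));
        return c->compiler->reg_class_any[c->thread_index];
}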
+ /* Add register interferences based on liveness data */
for (uint32_t i = 0; i < c->num_temps; i++) {
+ /* And while we are here, let's also flag nodes for
+ * unused temps.
+ */
+ if (c->temp_start[i] > c->temp_end[i])
+ c->nodes.info[temp_to_node(c, i)].unused = true;
+
for (uint32_t j = i + 1; j < c->num_temps; j++) {
- if (!(c->temp_start[i] >= c->temp_end[j] ||
- c->temp_start[j] >= c->temp_end[i])) {
- ra_add_node_interference(g,
- temp_to_node[i],
- temp_to_node[j]);
+ if (interferes(c->temp_start[i], c->temp_end[i],
+ c->temp_start[j], c->temp_end[j])) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ temp_to_node(c, j));
}
}
}
- /* Debug code to force a bit of register spilling, for running across
- * conformance tests to make sure that spilling works.
+ /* Debug option to force a bit of TMU spilling, for running
+ * across conformance tests to make sure that spilling works.
*/
- int force_register_spills = 0;
- if (c->spill_size <
- V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
- int node = v3d_choose_spill_node(c, g, temp_to_node);
- if (node != -1) {
- v3d_spill_reg(c, map[node].temp);
- ralloc_free(g);
- *spilled = true;
- return NULL;
+ const int force_register_spills = 0;
+ if (force_register_spills > 0)
+ c->max_tmu_spills = UINT32_MAX;
+
+ struct qpu_reg *temp_registers = NULL;
+ while (true) {
+ if (c->spill_size <
+ V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
+ int node = v3d_choose_spill_node(c);
+ uint32_t temp = node_to_temp(c, node);
+ if (node != -1) {
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
+ continue;
+ }
}
- }
-
- bool ok = ra_allocate(g);
- if (!ok) {
- const uint32_t spill_batch_size = get_spill_batch_size(c);
-
- for (uint32_t i = 0; i < spill_batch_size; i++) {
- int node = v3d_choose_spill_node(c, g, temp_to_node);
- if (node == -1)
- break;
-
- /* TMU spills inject thrsw signals that invalidate
- * accumulators, so we can't batch them.
- */
- bool is_uniform = vir_is_mov_uniform(c, map[node].temp);
- if (i > 0 && !is_uniform)
- break;
- if (is_uniform || tmu_spilling_allowed(c, thread_index)) {
- v3d_spill_reg(c, map[node].temp);
-
- /* Ask the outer loop to call back in. */
- *spilled = true;
+ if (ra_allocate(c->g))
+ break;
- /* See comment above about batching TMU spills.
- */
- if (!is_uniform) {
- assert(i == 0);
- break;
- }
- } else {
- break;
- }
+ /* Failed allocation, try to spill */
+ int node = v3d_choose_spill_node(c);
+ if (node == -1)
+ goto spill_fail;
+
+ uint32_t temp = node_to_temp(c, node);
+ enum temp_spill_type spill_type =
+ get_spill_type_for_temp(c, temp);
+ if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
+ if (c->spills + c->fills > c->max_tmu_spills)
+ goto spill_fail;
+ } else {
+ goto spill_fail;
}
-
- ralloc_free(g);
- return NULL;
}
- /* Ensure that we are not accessing temp_to_node out of bounds. We
- * should never trigger this assertion because `c->num_temps` only
- * grows when we spill, in which case we return early and don't get
- * here.
- */
- assert(start_num_temps == c->num_temps);
- struct qpu_reg *temp_registers = calloc(c->num_temps,
- sizeof(*temp_registers));
-
+ /* Allocation was successful, build the 'temp -> reg' map */
+ temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
for (uint32_t i = 0; i < c->num_temps; i++) {
- int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
- if (ra_reg < PHYS_INDEX) {
+ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
+ if (ra_reg < phys_index) {
temp_registers[i].magic = true;
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
ra_reg - ACC_INDEX);
} else {
temp_registers[i].magic = false;
- temp_registers[i].index = ra_reg - PHYS_INDEX;
+ temp_registers[i].index = ra_reg - phys_index;
}
}
- ralloc_free(g);
-
+spill_fail:
+ ralloc_free(c->nodes.info);
+ c->nodes.info = NULL;
+ c->nodes.alloc_count = 0;
+ ralloc_free(c->g);
+ c->g = NULL;
return temp_registers;
}
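
With the spill loop now handled inside v3d_register_allocate() (the old bool *spilled out-parameter is gone), callers only need to check the returned map. A hypothetical usage sketch; the retry policy in the comments is illustrative, not the driver's actual code:

static bool
run_register_allocation_example(struct v3d_compile *c)
{
        struct qpu_reg *temp_registers = v3d_register_allocate(c);
        if (!temp_registers) {
                /* RA failed even with any allowed TMU spilling; a driver
                 * would typically retry at a lower thread count or with
                 * pressure-increasing optimizations disabled. */
                return false;
        }

        v3d_vir_to_qpu(c, temp_registers);

        /* Per the comment above v3d_register_allocate(), the caller owns
         * and frees the returned array. */
        free(temp_registers);
        return true;
}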
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index aa33545420e..605c3e4c7d5 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -45,12 +45,6 @@ qpu_magic(enum v3d_qpu_waddr waddr)
return reg;
}
-static inline struct qpu_reg
-qpu_acc(int acc)
-{
- return qpu_magic(V3D_QPU_WADDR_R0 + acc);
-}
-
struct v3d_qpu_instr
v3d_qpu_nop(void)
{
@@ -92,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst)
return q;
}
+static void
+v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
+{
+        /* If we have a small immediate, move it from instr->raddr_b to the
+         * corresponding raddr.
+ */
+ if (src.smimm) {
+ assert(instr->sig.small_imm_a || instr->sig.small_imm_b ||
+ instr->sig.small_imm_c || instr->sig.small_imm_d);
+ *raddr = instr->raddr_b;
+ return;
+ }
+
+ assert(!src.magic);
+ *raddr = src.index;
+}
+
/**
* Allocates the src register (accumulator or register file) into the RADDR
* fields of the instruction.
*/
static void
-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+v3d42_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
{
if (src.smimm) {
- assert(instr->sig.small_imm);
+ assert(instr->sig.small_imm_b);
*mux = V3D_QPU_MUX_B;
return;
}
@@ -112,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
return;
}
- if (instr->alu.add.a != V3D_QPU_MUX_A &&
- instr->alu.add.b != V3D_QPU_MUX_A &&
- instr->alu.mul.a != V3D_QPU_MUX_A &&
- instr->alu.mul.b != V3D_QPU_MUX_A) {
+ if (instr->alu.add.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.add.b.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.b.mux != V3D_QPU_MUX_A) {
instr->raddr_a = src.index;
*mux = V3D_QPU_MUX_A;
} else {
if (instr->raddr_a == src.index) {
*mux = V3D_QPU_MUX_A;
} else {
- assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
- instr->alu.add.b == V3D_QPU_MUX_B &&
- instr->alu.mul.a == V3D_QPU_MUX_B &&
- instr->alu.mul.b == V3D_QPU_MUX_B) ||
+ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.add.b.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.b.mux == V3D_QPU_MUX_B) ||
src.index == instr->raddr_b);
instr->raddr_b = src.index;
@@ -134,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
}
}
-static bool
-is_no_op_mov(struct qinst *qinst)
+/*
+ * The main purpose of the following wrapper is to make calling set_src
+ * cleaner. That is why it receives both mux and raddr pointers; only the
+ * one relevant to the device version is filled in.
+ */
+static void
+set_src(struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux *mux,
+ uint8_t *raddr,
+ struct qpu_reg src,
+ const struct v3d_device_info *devinfo)
{
- static const struct v3d_qpu_sig no_sig = {0};
-
- /* Make sure it's just a lone MOV. */
- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
- qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
- return false;
- }
+ if (devinfo->ver < 71)
+ return v3d42_set_src(instr, mux, src);
+ else
+ return v3d71_set_src(instr, raddr, src);
+}
- /* Check if it's a MOV from a register to itself. */
+static bool
+v3d42_mov_src_and_dst_equal(struct qinst *qinst)
+{
enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
if (qinst->qpu.alu.mul.magic_write) {
if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
return false;
- if (qinst->qpu.alu.mul.a !=
+ if (qinst->qpu.alu.mul.a.mux !=
V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
return false;
}
} else {
int raddr;
- switch (qinst->qpu.alu.mul.a) {
+ switch (qinst->qpu.alu.mul.a.mux) {
case V3D_QPU_MUX_A:
raddr = qinst->qpu.raddr_a;
break;
@@ -174,10 +192,61 @@ is_no_op_mov(struct qinst *qinst)
return false;
}
+ return true;
+}
+
+static bool
+v3d71_mov_src_and_dst_equal(struct qinst *qinst)
+{
+ if (qinst->qpu.alu.mul.magic_write)
+ return false;
+
+ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+ int raddr;
+
+ raddr = qinst->qpu.alu.mul.a.raddr;
+ if (raddr != waddr)
+ return false;
+
+ return true;
+}
+
+static bool
+mov_src_and_dst_equal(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return v3d42_mov_src_and_dst_equal(qinst);
+ else
+ return v3d71_mov_src_and_dst_equal(qinst);
+}
+
+
+static bool
+is_no_op_mov(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ static const struct v3d_qpu_sig no_sig = {0};
+
+ /* Make sure it's just a lone MOV. We only check for M_MOV. Although
+         * for V3D 7.x there is also A_MOV, we don't need to check for it as
+         * we always emit using M_MOV. We could use A_MOV later in the
+         * schedule to improve performance.
+ */
+ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
+ qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
+ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
+ return false;
+ }
+
+ if (!mov_src_and_dst_equal(qinst, devinfo))
+ return false;
+
/* No packing or flags updates, or we need to execute the
* instruction.
*/
- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
@@ -193,8 +262,6 @@ v3d_generate_code_block(struct v3d_compile *c,
struct qblock *block,
struct qpu_reg *temp_registers)
{
- int last_vpm_read_index = -1;
-
vir_for_each_inst_safe(qinst, block) {
#if 0
fprintf(stderr, "translating qinst to qpu: ");
@@ -202,8 +269,6 @@ v3d_generate_code_block(struct v3d_compile *c,
fprintf(stderr, "\n");
#endif
- struct qinst *temp;
-
if (vir_has_uniform(qinst))
c->num_uniforms++;
@@ -219,8 +284,14 @@ v3d_generate_code_block(struct v3d_compile *c,
src[i] = qpu_magic(qinst->src[i].index);
break;
case QFILE_NULL:
+ /* QFILE_NULL is an undef, so we can load
+                                 * anything. Use a reg that doesn't have
+                                 * scheduling restrictions.
+ */
+ src[i] = qpu_reg(5);
+ break;
case QFILE_LOAD_IMM:
- src[i] = qpu_acc(0);
+ assert(!"not reached");
break;
case QFILE_TEMP:
src[i] = temp_registers[index];
@@ -228,18 +299,6 @@ v3d_generate_code_block(struct v3d_compile *c,
case QFILE_SMALL_IMM:
src[i].smimm = true;
break;
-
- case QFILE_VPM:
- assert((int)qinst->src[i].index >=
- last_vpm_read_index);
- (void)last_vpm_read_index;
- last_vpm_read_index = qinst->src[i].index;
-
- temp = new_qpu_nop_before(qinst);
- temp->qpu.sig.ldvpm = true;
-
- src[i] = qpu_acc(3);
- break;
}
}
@@ -261,10 +320,6 @@ v3d_generate_code_block(struct v3d_compile *c,
dst = temp_registers[qinst->dst.index];
break;
- case QFILE_VPM:
- dst = qpu_magic(V3D_QPU_WADDR_VPM);
- break;
-
case QFILE_SMALL_IMM:
case QFILE_LOAD_IMM:
assert(!"not reached");
@@ -276,10 +331,15 @@ v3d_generate_code_block(struct v3d_compile *c,
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
- if (!dst.magic ||
- dst.index != V3D_QPU_WADDR_R5) {
- assert(c->devinfo->ver >= 40);
+ bool use_rf;
+ if (c->devinfo->has_accumulators) {
+ use_rf = !dst.magic ||
+ dst.index != V3D_QPU_WADDR_R5;
+ } else {
+ use_rf = dst.magic || dst.index != 0;
+ }
+ if (use_rf) {
if (qinst->qpu.sig.ldunif) {
qinst->qpu.sig.ldunif = false;
qinst->qpu.sig.ldunifrf = true;
@@ -299,13 +359,18 @@ v3d_generate_code_block(struct v3d_compile *c,
qinst->qpu.sig_magic = dst.magic;
} else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.a, src[0]);
+ &qinst->qpu.alu.add.a.mux,
+ &qinst->qpu.alu.add.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.b, src[1]);
+ &qinst->qpu.alu.add.b.mux,
+ &qinst->qpu.alu.add.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.add.waddr = dst.index;
@@ -313,17 +378,21 @@ v3d_generate_code_block(struct v3d_compile *c,
} else {
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.a, src[0]);
+ &qinst->qpu.alu.mul.a.mux,
+ &qinst->qpu.alu.mul.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.b, src[1]);
+ &qinst->qpu.alu.mul.b.mux,
+ &qinst->qpu.alu.mul.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.mul.waddr = dst.index;
qinst->qpu.alu.mul.magic_write = dst.magic;
- if (is_no_op_mov(qinst)) {
+ if (is_no_op_mov(qinst, c->devinfo)) {
vir_remove_instruction(c, qinst);
continue;
}
@@ -378,11 +447,7 @@ v3d_dump_qpu(struct v3d_compile *c)
const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]);
fprintf(stderr, "0x%016"PRIx64" %s", c->qpu_insts[i], str);
- /* We can only do this on 4.x, because we're not tracking TMU
- * implicit uniforms here on 3.x.
- */
- if (c->devinfo->ver >= 40 &&
- reads_uniform(c->devinfo, c->qpu_insts[i])) {
+ if (reads_uniform(c->devinfo, c->qpu_insts[i])) {
fprintf(stderr, " (");
vir_dump_uniform(c->uniform_contents[next_uniform],
c->uniform_data[next_uniform]);
@@ -394,8 +459,7 @@ v3d_dump_qpu(struct v3d_compile *c)
}
/* Make sure our dumping lined up. */
- if (c->devinfo->ver >= 40)
- assert(next_uniform == c->num_uniforms);
+ assert(next_uniform == c->num_uniforms);
fprintf(stderr, "\n");
}
@@ -431,8 +495,8 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
}
assert(i == c->qpu_inst_count);
- if (V3D_DEBUG & (V3D_DEBUG_QPU |
- v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
+ if (V3D_DBG(QPU) ||
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
v3d_dump_qpu(c);
}