diff options
Diffstat (limited to 'src/panfrost/util/pan_ir.h')
-rw-r--r-- | src/panfrost/util/pan_ir.h | 564 |
1 files changed, 329 insertions, 235 deletions
diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 1ff240b3070..7b24fb6922e 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -26,326 +26,420 @@ #include <stdint.h> #include "compiler/nir/nir.h" -#include "util/u_dynarray.h" #include "util/hash_table.h" +#include "util/u_dynarray.h" -/* Define the general compiler entry point */ - -#define MAX_SYSVAL_COUNT 32 - -/* Allow 2D of sysval IDs, while allowing nonparametric sysvals to equal - * their class for equal comparison */ - -#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type) -#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff) -#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16) - -/* Define some common types. We start at one for easy indexing of hash - * tables internal to the compiler */ - -enum { - PAN_SYSVAL_VIEWPORT_SCALE = 1, - PAN_SYSVAL_VIEWPORT_OFFSET = 2, - PAN_SYSVAL_TEXTURE_SIZE = 3, - PAN_SYSVAL_SSBO = 4, - PAN_SYSVAL_NUM_WORK_GROUPS = 5, - PAN_SYSVAL_SAMPLER = 7, - PAN_SYSVAL_LOCAL_GROUP_SIZE = 8, - PAN_SYSVAL_WORK_DIM = 9, - PAN_SYSVAL_IMAGE_SIZE = 10, - PAN_SYSVAL_SAMPLE_POSITIONS = 11, - PAN_SYSVAL_MULTISAMPLED = 12, - PAN_SYSVAL_RT_CONVERSION = 13, - PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS = 14, - PAN_SYSVAL_DRAWID = 15, -}; +/* Indices for named (non-XFB) varyings that are present. These are packed + * tightly so they correspond to a bitfield present (P) indexed by (1 << + * PAN_VARY_*). This has the nice property that you can lookup the buffer index + * of a given special field given a shift S by: + * + * idx = popcount(P & ((1 << S) - 1)) + * + * That is... look at all of the varyings that come earlier and count them, the + * count is the new index.
Likewise, the total number of special + * buffers required is simply popcount(P) + */ + +enum pan_special_varying { + PAN_VARY_GENERAL = 0, + PAN_VARY_POSITION = 1, + PAN_VARY_PSIZ = 2, + PAN_VARY_PNTCOORD = 3, + PAN_VARY_FACE = 4, + PAN_VARY_FRAGCOORD = 5, -#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \ - ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0)) + /* Keep last */ + PAN_VARY_MAX, +}; -#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f) -#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3) -#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9)) +/* Maximum number of attribute descriptors required for varyings. These include + * up to MAX_VARYING source level varyings plus a descriptor each non-GENERAL + * special varying */ +#define PAN_MAX_VARYINGS (MAX_VARYING + PAN_VARY_MAX - 1) /* Special attribute slots for vertex builtins. Sort of arbitrary but let's be * consistent with the blob so we can compare traces easier. */ -enum { - PAN_VERTEX_ID = 16, - PAN_INSTANCE_ID = 17, - PAN_MAX_ATTRIBUTE -}; +enum { PAN_VERTEX_ID = 16, PAN_INSTANCE_ID = 17, PAN_MAX_ATTRIBUTE }; -struct panfrost_sysvals { - /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ - unsigned sysvals[MAX_SYSVAL_COUNT]; - unsigned sysval_count; -}; - -/* Technically Midgard could go up to 92 in a pathological case but we don't - * take advantage of that. Likewise Bifrost's FAU encoding can address 128 - * words but actual implementations (G72, G76) are capped at 64 */ - -#define PAN_MAX_PUSH 64 +/* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64-bits each. + * In practice, the maximum number of FAU slots is limited by implementation. + * All known Bifrost and Valhall devices limit to 64 FAU slots. Therefore the + * maximum number of 32-bit words is 128, since there are 2 words per FAU slot. + * + * Midgard can push at most 92 words, so this bound suffices. 
The Midgard + * compiler pushes less than this, as Midgard uses register-mapped uniforms + * instead of FAU, preventing large numbers of uniforms to be pushed for + * nontrivial programs. + */ +#define PAN_MAX_PUSH 128 /* Architectural invariants (Midgard and Bifrost): UBO must be <= 2^16 bytes so * an offset to a word must be < 2^16. There are less than 2^8 UBOs */ struct panfrost_ubo_word { - uint16_t ubo; - uint16_t offset; + uint16_t ubo; + uint16_t offset; }; struct panfrost_ubo_push { - unsigned count; - struct panfrost_ubo_word words[PAN_MAX_PUSH]; + unsigned count; + struct panfrost_ubo_word words[PAN_MAX_PUSH]; }; /* Helper for searching the above. Note this is O(N) to the number of pushed * constants, do not run in the draw call hot path */ -unsigned -pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs); - -struct hash_table_u64 * -panfrost_init_sysvals(struct panfrost_sysvals *sysvals, void *memctx); - -unsigned -pan_lookup_sysval(struct hash_table_u64 *sysval_to_id, - struct panfrost_sysvals *sysvals, - int sysval); - -int -panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest); +unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, + unsigned offs); struct panfrost_compile_inputs { - unsigned gpu_id; - bool is_blend, is_blit; - struct { - unsigned rt; - unsigned nr_samples; - uint64_t bifrost_blend_desc; - } blend; - unsigned sysval_ubo; - bool shaderdb; - bool no_ubo_to_push; - - enum pipe_format rt_formats[8]; - unsigned nr_cbufs; + struct util_debug_callback *debug; + + unsigned gpu_id; + bool is_blend, is_blit; + struct { + unsigned nr_samples; + uint64_t bifrost_blend_desc; + } blend; + bool no_idvs; + bool no_ubo_to_push; + + /* Used on Valhall. + * + * Bit mask of special desktop-only varyings (e.g VARYING_SLOT_TEX0) + * written by the previous stage (fragment shader) or written by this + * stage (vertex shader). Bits are slots from gl_varying_slot. 
+ * + * For modern APIs (GLES or VK), this should be 0. + */ + uint32_t fixed_varying_mask; + + union { + struct { + uint32_t rt_conv[8]; + } bifrost; + }; }; struct pan_shader_varying { - gl_varying_slot location; - enum pipe_format format; + gl_varying_slot location; + enum pipe_format format; }; struct bifrost_shader_blend_info { - nir_alu_type type; - uint32_t return_offset; + nir_alu_type type; + uint32_t return_offset; - /* mali_bifrost_register_file_format corresponding to nir_alu_type */ - unsigned format; + /* mali_bifrost_register_file_format corresponding to nir_alu_type */ + unsigned format; }; -struct bifrost_shader_info { - struct bifrost_shader_blend_info blend[8]; - nir_alu_type blend_src1_type; - bool wait_6, wait_7; +/* + * Unpacked form of a v7 message preload descriptor, produced by the compiler's + * message preload optimization. By splitting out this struct, the compiler does + * not need to know about data structure packing, avoiding a dependency on + * GenXML. + */ +struct bifrost_message_preload { + /* Whether to preload this message */ + bool enabled; + + /* Varying to load from */ + unsigned varying_index; + + /* Register type, FP32 otherwise */ + bool fp16; - /* Packed, preloaded message descriptors */ - uint16_t messages[2]; + /* Number of components, ignored if texturing */ + unsigned num_components; + + /* If texture is set, performs a texture instruction according to + * texture_index, skip, and zero_lod. If texture is unset, only the + * varying load is performed. + */ + bool texture, skip, zero_lod; + unsigned texture_index; +}; + +struct bifrost_shader_info { + struct bifrost_shader_blend_info blend[8]; + nir_alu_type blend_src1_type; + bool wait_6, wait_7; + struct bifrost_message_preload messages[2]; + + /* Whether any flat varyings are loaded. This may disable optimizations + * that change the provoking vertex, since that would load incorrect + * values for flat varyings. 
+ */ + bool uses_flat_shading; }; struct midgard_shader_info { - unsigned first_tag; + unsigned first_tag; }; struct pan_shader_info { - gl_shader_stage stage; - unsigned work_reg_count; - unsigned tls_size; - unsigned wls_size; - - union { - struct { - bool reads_frag_coord; - bool reads_point_coord; - bool reads_face; - bool helper_invocations; - bool can_discard; - bool writes_depth; - bool writes_stencil; - bool writes_coverage; - bool sidefx; - bool reads_sample_id; - bool reads_sample_pos; - bool reads_sample_mask_in; - bool reads_helper_invocation; - bool sample_shading; - bool early_fragment_tests; - bool can_early_z, can_fpk; - BITSET_WORD outputs_read; - BITSET_WORD outputs_written; - } fs; - - struct { - bool writes_point_size; - } vs; - }; - - bool separable; - bool contains_barrier; - bool writes_global; - uint64_t outputs_written; - - unsigned sampler_count; - unsigned texture_count; - unsigned ubo_count; - unsigned attribute_count; - - struct { - unsigned input_count; - struct pan_shader_varying input[MAX_VARYING]; - unsigned output_count; - struct pan_shader_varying output[MAX_VARYING]; - } varyings; - - struct panfrost_sysvals sysvals; - - /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access - * Uniforms (Bifrost) */ - struct panfrost_ubo_push push; - - uint32_t ubo_mask; - - union { - struct bifrost_shader_info bifrost; - struct midgard_shader_info midgard; - }; + gl_shader_stage stage; + unsigned work_reg_count; + unsigned tls_size; + unsigned wls_size; + + /* Bit mask of preloaded registers */ + uint64_t preload; + + union { + struct { + bool reads_frag_coord; + bool reads_point_coord; + bool reads_face; + bool can_discard; + bool writes_depth; + bool writes_stencil; + bool writes_coverage; + bool sidefx; + bool sample_shading; + bool early_fragment_tests; + bool can_early_z, can_fpk; + bool untyped_color_outputs; + BITSET_WORD outputs_read; + BITSET_WORD outputs_written; + } fs; + + struct { + bool writes_point_size; + + /* If 
the primary shader writes point size, the Valhall + * driver may need a variant that does not write point + * size. Offset to such a shader in the program binary. + * + * Zero if no such variant is required. + * + * Only used with IDVS on Valhall. + */ + unsigned no_psiz_offset; + + /* Set if Index-Driven Vertex Shading is in use */ + bool idvs; + + /* If IDVS is used, whether a varying shader is used */ + bool secondary_enable; + + /* If a varying shader is used, the varying shader's + * offset in the program binary + */ + unsigned secondary_offset; + + /* If IDVS is in use, number of work registers used by + * the varying shader + */ + unsigned secondary_work_reg_count; + + /* If IDVS is in use, bit mask of preloaded registers + * used by the varying shader + */ + uint64_t secondary_preload; + } vs; + + struct { + /* Is it legal to merge workgroups? This is true if the + * shader uses neither barriers nor shared memory. This + * requires caution: if the API allows specifying shared + * memory at launch time (instead of compile time), that + * memory will not be accounted for by the compiler. + * + * Used by the Valhall hardware. + */ + bool allow_merging_workgroups; + } cs; + }; + + /* Does the shader contain a barrier? or (for fragment shaders) does it + * require helper invocations, which demand the same ordering guarantees + * of the hardware? These notions are unified in the hardware, so we + * unify them here as well.
+ */ + bool contains_barrier; + bool separable; + bool writes_global; + uint64_t outputs_written; + + /* Floating point controls that the driver should try to honour */ + bool ftz_fp16, ftz_fp32; + + unsigned sampler_count; + unsigned texture_count; + unsigned ubo_count; + unsigned attributes_read_count; + unsigned attribute_count; + unsigned attributes_read; + + struct { + unsigned input_count; + struct pan_shader_varying input[PAN_MAX_VARYINGS]; + unsigned output_count; + struct pan_shader_varying output[PAN_MAX_VARYINGS]; + } varyings; + + /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access + * Uniforms (Bifrost) */ + struct panfrost_ubo_push push; + + uint32_t ubo_mask; + + union { + struct bifrost_shader_info bifrost; + struct midgard_shader_info midgard; + }; }; typedef struct pan_block { - /* Link to next block. Must be first for mir_get_block */ - struct list_head link; + /* Link to next block. Must be first for mir_get_block */ + struct list_head link; - /* List of instructions emitted for the current block */ - struct list_head instructions; + /* List of instructions emitted for the current block */ + struct list_head instructions; - /* Index of the block in source order */ - unsigned name; + /* Index of the block in source order */ + unsigned name; - /* Control flow graph */ - struct pan_block *successors[2]; - struct set *predecessors; - bool unconditional_jumps; + /* Control flow graph */ + struct pan_block *successors[2]; + struct set *predecessors; + bool unconditional_jumps; - /* In liveness analysis, these are live masks (per-component) for - * indices for the block. Scalar compilers have the luxury of using - * simple bit fields, but for us, liveness is a vector idea. */ - uint16_t *live_in; - uint16_t *live_out; + /* In liveness analysis, these are live masks (per-component) for + * indices for the block. Scalar compilers have the luxury of using + * simple bit fields, but for us, liveness is a vector idea. 
*/ + uint16_t *live_in; + uint16_t *live_out; } pan_block; struct pan_instruction { - struct list_head link; + struct list_head link; }; -#define pan_foreach_instr_in_block_rev(block, v) \ - list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, link) - -#define pan_foreach_successor(blk, v) \ - pan_block *v; \ - pan_block **_v; \ - for (_v = (pan_block **) &blk->successors[0], \ - v = *_v; \ - v != NULL && _v < (pan_block **) &blk->successors[2]; \ - _v++, v = *_v) \ - -#define pan_foreach_predecessor(blk, v) \ - struct set_entry *_entry_##v; \ - struct pan_block *v; \ - for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \ - v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL); \ - _entry_##v != NULL; \ - _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \ - v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL)) +#define pan_foreach_instr_in_block_rev(block, v) \ + list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, \ + link) + +#define pan_foreach_successor(blk, v) \ + pan_block *v; \ + pan_block **_v; \ + for (_v = (pan_block **)&blk->successors[0], v = *_v; \ + v != NULL && _v < (pan_block **)&blk->successors[2]; _v++, v = *_v) + +#define pan_foreach_predecessor(blk, v) \ + struct set_entry *_entry_##v; \ + struct pan_block *v; \ + for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \ + v = (struct pan_block *)(_entry_##v ? _entry_##v->key : NULL); \ + _entry_##v != NULL; \ + _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \ + v = (struct pan_block *)(_entry_##v ? 
_entry_##v->key : NULL)) static inline pan_block * pan_exit_block(struct list_head *blocks) { - pan_block *last = list_last_entry(blocks, pan_block, link); - assert(!last->successors[0] && !last->successors[1]); - return last; + pan_block *last = list_last_entry(blocks, pan_block, link); + assert(!last->successors[0] && !last->successors[1]); + return last; } typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max); -void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask); -void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask); +void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, + uint16_t mask); +void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, + uint16_t mask); bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max); -void pan_compute_liveness(struct list_head *blocks, - unsigned temp_count, - pan_liveness_update callback); +void pan_compute_liveness(struct list_head *blocks, unsigned temp_count, + pan_liveness_update callback); void pan_free_liveness(struct list_head *blocks); -uint16_t -pan_to_bytemask(unsigned bytes, unsigned mask); +uint16_t pan_to_bytemask(unsigned bytes, unsigned mask); void pan_block_add_successor(pan_block *block, pan_block *successor); /* IR indexing */ #define PAN_IS_REG (1) -static inline unsigned -pan_ssa_index(nir_ssa_def *ssa) -{ - /* Off-by-one ensures BIR_NO_ARG is skipped */ - return ((ssa->index + 1) << 1) | 0; -} - -static inline unsigned -pan_src_index(nir_src *src) -{ - if (src->is_ssa) - return pan_ssa_index(src->ssa); - else { - assert(!src->reg.indirect); - return (src->reg.reg->index << 1) | PAN_IS_REG; - } -} - -static inline unsigned -pan_dest_index(nir_dest *dst) -{ - if (dst->is_ssa) - return pan_ssa_index(&dst->ssa); - else { - assert(!dst->reg.indirect); - return (dst->reg.reg->index << 1) | PAN_IS_REG; - } -} - /* IR printing helpers */ void pan_print_alu_type(nir_alu_type t, FILE *fp); -/* Until it can 
be upstreamed.. */ -bool pan_has_source_mod(nir_alu_src *src, nir_op op); -bool pan_has_dest_mod(nir_dest **dest, nir_op op); - /* NIR passes to do some backend-specific lowering */ #define PAN_WRITEOUT_C 1 #define PAN_WRITEOUT_Z 2 #define PAN_WRITEOUT_S 4 +#define PAN_WRITEOUT_2 8 -bool pan_nir_reorder_writeout(nir_shader *nir); bool pan_nir_lower_zs_store(nir_shader *nir); +bool pan_nir_lower_store_component(nir_shader *shader); +bool pan_nir_lower_image_ms(nir_shader *shader); bool pan_nir_lower_64bit_intrin(nir_shader *shader); bool pan_lower_helper_invocation(nir_shader *shader); bool pan_lower_sample_pos(nir_shader *shader); +bool pan_lower_xfb(nir_shader *nir); + +bool pan_lower_image_index(nir_shader *shader, unsigned vs_img_attrib_offset); + +void pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info); + +/* + * Helper returning the subgroup size. Generally, this is equal to the number of + * threads in a warp. For Midgard (including warping models), this returns 1, as + * subgroups are not supported. + */ +static inline unsigned +pan_subgroup_size(unsigned arch) +{ + if (arch >= 9) + return 16; + else if (arch >= 7) + return 8; + else if (arch >= 6) + return 4; + else + return 1; +} + +/* + * Helper extracting the table from a given handle of the Valhall descriptor model. + */ +static inline unsigned +pan_res_handle_get_table(unsigned handle) +{ + unsigned table = handle >> 24; + + assert(table < 64); + return table; +} + +/* + * Helper returning the index from a given handle of the Valhall descriptor model. + */ +static inline unsigned +pan_res_handle_get_index(unsigned handle) +{ + return handle & BITFIELD_MASK(24); +} + +/* + * Helper creating a handle for the Valhall descriptor model. + */ +static inline unsigned +pan_res_handle(unsigned table, unsigned index) +{ + assert(table < 64); + assert(index < (1u << 24)); + + return (table << 24) | index; +} #endif |