1 files changed, 156 insertions, 0 deletions
diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 47ebfdc9105..4f8209e5d76 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -2585,6 +2585,25 @@ bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
 }
 
 /* Array indices are specified as 32-bit uints, need to convert. In .z component from NIR */
+static bi_index
+bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
+{
+        /* For (u)int we can just passthrough */
+        nir_alu_type base = nir_alu_type_get_base_type(T);
+        if (base == nir_type_int || base == nir_type_uint)
+                return idx;
+
+        /* Otherwise we convert */
+        assert(T == nir_type_float32);
+
+        /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
+         * Texel Selection") defines the layer to be taken from clamp(RNE(r),
+         * 0, dt - 1). So we use round RTE, clamping is handled at the data
+         * structure level */
+
+        return bi_f32_to_u32(b, idx, BI_ROUND_NONE);
+}
+
 static unsigned
 bi_emit_array_index(bi_context *ctx, unsigned idx, nir_alu_type T, unsigned *c)
 {
@@ -2624,6 +2643,28 @@ bi_emit_array_index(bi_context *ctx, unsigned idx, nir_alu_type T, unsigned *c)
  * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
  */
 
+static bi_index
+bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
+{
+        /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
+         * the max LOD (16 since we cap at 2^16 texture dimensions), and
+         * preferably small to minimize precision loss */
+        const float max_lod = 16.0;
+
+        bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
+                        fp16 ? bi_half(lod, false) : lod,
+                        bi_imm_f32(1.0f / max_lod), bi_zero(), BI_ROUND_NONE);
+
+        fsat->clamp = BI_CLAMP_CLAMP_M1_1;
+
+        bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
+                        bi_zero(), BI_ROUND_NONE);
+
+        return bi_mkvec_v2i16(b,
+                        bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false),
+                        bi_imm_u16(0));
+}
+
 static unsigned
 bi_emit_lod_88(bi_context *ctx, unsigned lod, bool fp16)
 {
@@ -2693,6 +2734,12 @@ bi_emit_lod_88(bi_context *ctx, unsigned lod, bool fp16)
  * TODO: Cube face.
  */
 
+static bi_index
+bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
+{
+        return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
+}
+
 static unsigned
 bi_emit_lod_cube(bi_context *ctx, unsigned lod)
 {
@@ -2724,6 +2771,36 @@ bi_emit_lod_cube(bi_context *ctx, unsigned lod)
  * the bits we need and return that to be passed as a staging register. Else we
  * return 0 to avoid allocating a data register when everything is zero. */
 
+static bi_index
+bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
+{
+        bi_index dest = bi_zero();
+
+        int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
+        if (offs_idx >= 0 &&
+            (!nir_src_is_const(instr->src[offs_idx].src) ||
+             nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
+                unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
+                bi_index idx = bi_src_index(&instr->src[offs_idx].src);
+                dest = bi_mkvec_v4i8(b, 
+                                (nr > 0) ? bi_byte(bi_word(idx, 0), 0) : bi_imm_u8(0),
+                                (nr > 1) ? bi_byte(bi_word(idx, 1), 0) : bi_imm_u8(0),
+                                (nr > 2) ? bi_byte(bi_word(idx, 2), 0) : bi_imm_u8(0),
+                                bi_imm_u8(0));
+        }
+
+        int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
+        if (ms_idx >= 0 &&
+            (!nir_src_is_const(instr->src[ms_idx].src) ||
+             nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
+                dest = bi_lshift_or_i32(b,
+                                bi_src_index(&instr->src[ms_idx].src), dest,
+                                bi_imm_u8(24));
+        }
+
+        return dest;
+}
+
 static unsigned
 bi_emit_tex_offset_ms_index(bi_context *ctx, nir_tex_instr *instr)
 {
@@ -2788,6 +2865,67 @@ bi_emit_tex_offset_ms_index(bi_context *ctx, nir_tex_instr *instr)
 }
 
 static void
+bi_emit_cube_coord(bi_builder *b, bi_index coord,
+                    bi_index *face, bi_index *s, bi_index *t)
+{
+        /* Compute max { |x|, |y|, |z| } */
+        bi_index cubeface1 = bi_cubeface1(b, coord,
+                        bi_word(coord, 1), bi_word(coord, 2));
+
+        /* Calculate packed exponent / face / infinity. In reality this reads
+         * the destination from cubeface1 but that's handled by lowering */
+        bi_instr *cubeface2 = bi_cubeface1_to(b, bi_temp(b->shader), coord,
+                        bi_word(coord, 1), bi_word(coord, 2));
+        cubeface2->op = BI_OPCODE_CUBEFACE2; /* XXX: DEEP VOODOO */
+
+        /* Select coordinates */
+
+        bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord,
+                        cubeface2->dest[0]);
+
+        bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2),
+                        cubeface2->dest[0]);
+
+        /* The OpenGL ES specification requires us to transform an input vector
+         * (x, y, z) to the coordinate, given the selected S/T:
+         *
+         * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
+         *
+         * We implement (s shown, t similar) in a form friendlier to FMA
+         * instructions, and clamp coordinates at the end for correct
+         * NaN/infinity handling:
+         *
+         * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
+         *
+         * Take the reciprocal of max{x, y, z}
+         */
+
+        bi_index rcp = bi_frcp_f32(b, cubeface1);
+
+        /* Calculate 0.5 * (1.0 / max{x, y, z}) */
+        bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_zero(),
+                        BI_ROUND_NONE);
+
+        /* Transform the coordinates */
+        *s = bi_temp(b->shader);
+        *t = bi_temp(b->shader);
+
+        bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f),
+                        BI_ROUND_NONE);
+        bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f),
+                        BI_ROUND_NONE);
+
+        S->clamp = BI_CLAMP_CLAMP_0_1;
+        T->clamp = BI_CLAMP_CLAMP_0_1;
+
+        /* Cube face is stored in bit[29:31], we don't apply the shift here
+         * because the TEXS_CUBE and TEXC instructions expect the face index to
+         * be at this position.
+         */
+        *face = cubeface2->dest[0];
+}
+
+static void
 bi_lower_cube_coord(bi_context *ctx, unsigned coord,
                     unsigned *face, unsigned *s, unsigned *t)
 {
@@ -2909,6 +3047,24 @@ bi_lower_cube_coord(bi_context *ctx, unsigned coord,
         *t = fma3.dest;
 }
 
+/* Emits a cube map descriptor, returning lower 32-bits and putting upper
+ * 32-bits in passed pointer t */
+
+static bi_index
+bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
+{
+        bi_index face, s;
+        bi_emit_cube_coord(b, coord, &face, &s, t);
+
+        bi_index and1 = bi_lshift_and_i32(b, face, bi_imm_u32(0xe0000000),
+                        bi_imm_u8(0));
+
+        bi_index and2 = bi_lshift_and_i32(b, s, bi_imm_u32(0x1fffffff),
+                        bi_imm_u8(0));
+
+        return bi_lshift_or_i32(b, and1, and2, bi_imm_u8(0));
+}
+
 static void
 texc_pack_cube_coord(bi_context *ctx, unsigned coord,
                      unsigned *face_s, unsigned *t)