summaryrefslogtreecommitdiff
path: root/src/mesa
diff options
context:
space:
mode:
authorMike Blumenkrantz <michael.blumenkrantz@gmail.com>2021-08-05 15:28:52 -0400
committerMike Blumenkrantz <michael.blumenkrantz@gmail.com>2021-11-18 08:00:07 -0500
commite7b95619596167025d26f07efb04dc5bfd5e9aee (patch)
treea9c128cd87b0df57e8cbabb51408cf893bbdec7b /src/mesa
parented65b5e8399fc59fe011381338cc06bc3f7ec5d6 (diff)
gallium: implement compute pbo download
this reworks PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER into an enum as PIPE_CAP_TEXTURE_TRANSFER_MODES, enabling drivers to choose a (sometimes) faster, compute-based download mechanism based on a new pipe_screen hook compute pbo download is implemented using shaders with a prolog to convert the input format to generic rgb float values, then an epilog to convert to the output value. the prolog and epilog are determined based on a vec4 of packed ubo data which is dynamically updated based on the API usage currently, the only known limitations are: * GL_ARB_texture_cube_map_array is broken somehow (and disabled) * AMD hardware somehow can't do depth readback? otherwise it should work for every possible case Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Acked-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11984>
Diffstat (limited to 'src/mesa')
-rw-r--r--src/mesa/meson.build1
-rw-r--r--src/mesa/state_tracker/st_cb_texture.c34
-rw-r--r--src/mesa/state_tracker/st_context.c9
-rw-r--r--src/mesa/state_tracker/st_context.h2
-rw-r--r--src/mesa/state_tracker/st_pbo.c9
-rw-r--r--src/mesa/state_tracker/st_pbo.h6
-rw-r--r--src/mesa/state_tracker/st_pbo_compute.c1110
7 files changed, 1153 insertions, 18 deletions
diff --git a/src/mesa/meson.build b/src/mesa/meson.build
index 7d945a7712c..4cdf0f41f1e 100644
--- a/src/mesa/meson.build
+++ b/src/mesa/meson.build
@@ -587,6 +587,7 @@ files_libmesa_gallium = files(
'state_tracker/st_nir_lower_builtin.c',
'state_tracker/st_nir_lower_tex_src_plane.c',
'state_tracker/st_pbo.c',
+ 'state_tracker/st_pbo_compute.c',
'state_tracker/st_pbo.h',
'state_tracker/st_program.c',
'state_tracker/st_program.h',
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 273ac249fea..599d2099d6f 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -51,6 +51,7 @@
#include "state_tracker/st_debug.h"
#include "state_tracker/st_context.h"
#include "state_tracker/st_cb_bitmap.h"
+#include "state_tracker/st_cb_drawpixels.h"
#include "state_tracker/st_cb_fbo.h"
#include "state_tracker/st_cb_flush.h"
#include "state_tracker/st_cb_texture.h"
@@ -2518,31 +2519,31 @@ st_GetTexSubImage(struct gl_context * ctx,
if (!st->prefer_blit_based_texture_transfer &&
!_mesa_is_format_compressed(texImage->TexFormat)) {
- /* Try to avoid the fallback if we're doing texture decompression here */
- goto fallback;
+ /* Try to avoid the non_blit_transfer if we're doing texture decompression here */
+ goto non_blit_transfer;
}
/* Handle non-finalized textures. */
if (!stImage->pt || stImage->pt != stObj->pt || !src) {
- goto fallback;
+ goto cpu_transfer;
}
/* XXX Fallback to _mesa_GetTexImage_sw for depth-stencil formats
* due to an incomplete stencil blit implementation in some drivers. */
if (format == GL_DEPTH_STENCIL || format == GL_STENCIL_INDEX) {
- goto fallback;
+ goto non_blit_transfer;
}
/* If the base internal format and the texture format don't match, we have
* to fall back to _mesa_GetTexImage_sw. */
if (texImage->_BaseFormat !=
_mesa_get_format_base_format(texImage->TexFormat)) {
- goto fallback;
+ goto non_blit_transfer;
}
src_format = st_pbo_get_src_format(screen, stObj->surface_based ? stObj->surface_format : src->format, src);
if (src_format == PIPE_FORMAT_NONE)
- goto fallback;
+ goto non_blit_transfer;
if (format == GL_DEPTH_COMPONENT || format == GL_DEPTH_STENCIL)
bind = PIPE_BIND_DEPTH_STENCIL;
@@ -2552,7 +2553,7 @@ st_GetTexSubImage(struct gl_context * ctx,
dst_format = st_pbo_get_dst_format(ctx, pipe_target, src_format, util_format_is_compressed(src->format),
format, type, bind);
if (dst_format == PIPE_FORMAT_NONE)
- goto fallback;
+ goto non_blit_transfer;
if (st->pbo.download_enabled && ctx->Pack.BufferObj) {
if (try_pbo_download(st, texImage,
@@ -2567,11 +2568,11 @@ st_GetTexSubImage(struct gl_context * ctx,
* in which case the memcpy-based fast path will be used. */
if (_mesa_format_matches_format_and_type(texImage->TexFormat, format,
type, ctx->Pack.SwapBytes, NULL))
- goto fallback;
+ goto non_blit_transfer;
dst = create_dst_texture(ctx, dst_format, pipe_target, width, height, depth, gl_target, bind);
if (!dst)
- goto fallback;
+ goto non_blit_transfer;
/* From now on, we need the gallium representation of dimensions. */
if (gl_target == GL_TEXTURE_1D_ARRAY) {
@@ -2612,12 +2613,17 @@ st_GetTexSubImage(struct gl_context * ctx,
depth, format, type, pixels, texImage);
pipe_resource_reference(&dst, NULL);
-fallback:
- if (!done) {
- _mesa_GetTexSubImage_sw(ctx, xoffset, yoffset, zoffset,
- width, height, depth,
- format, type, pixels, texImage);
+non_blit_transfer:
+ if (done)
+ return;
+ if (st->allow_compute_based_texture_transfer) {
+ if (st_GetTexSubImage_shader(ctx, xoffset, yoffset, zoffset, width, height, depth, format, type, pixels, texImage))
+ return;
}
+cpu_transfer:
+ _mesa_GetTexSubImage_sw(ctx, xoffset, yoffset, zoffset,
+ width, height, depth,
+ format, type, pixels, texImage);
}
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index b6fe8f83ab9..16cdac1737e 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -625,6 +625,11 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
st_init_atoms(st);
st_init_clear(st);
+ {
+ enum pipe_texture_transfer_mode val = screen->get_param(screen, PIPE_CAP_TEXTURE_TRANSFER_MODES);
+ st->prefer_blit_based_texture_transfer = (val & PIPE_TEXTURE_TRANSFER_BLIT) != 0;
+ st->allow_compute_based_texture_transfer = (val & PIPE_TEXTURE_TRANSFER_COMPUTE) != 0;
+ }
st_init_pbo_helpers(st);
/* Choose texture target for glDrawPixels, glBitmap, renderbuffers */
@@ -689,10 +694,6 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
st->has_astc_5x5_ldr =
screen->is_format_supported(screen, PIPE_FORMAT_ASTC_5x5_SRGB,
PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW);
- {
- enum pipe_texture_transfer_mode val = screen->get_param(screen, PIPE_CAP_TEXTURE_TRANSFER_MODES);
- st->prefer_blit_based_texture_transfer = (val & PIPE_TEXTURE_TRANSFER_BLIT) == PIPE_TEXTURE_TRANSFER_BLIT;
- }
st->force_persample_in_shader =
screen->get_param(screen, PIPE_CAP_SAMPLE_SHADING) &&
!screen->get_param(screen, PIPE_CAP_FORCE_PERSAMPLE_INTERP);
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index f79c7506cf9..4f2a02054b4 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -147,6 +147,7 @@ struct st_context
boolean has_astc_2d_ldr;
boolean has_astc_5x5_ldr;
boolean prefer_blit_based_texture_transfer;
+ boolean allow_compute_based_texture_transfer;
boolean force_persample_in_shader;
boolean has_shareable_shaders;
boolean has_half_float_packing;
@@ -326,6 +327,7 @@ struct st_context
void *gs;
void *upload_fs[5][2];
void *download_fs[5][PIPE_MAX_TEXTURE_TYPES][2];
+ struct hash_table *shaders;
bool upload_enabled;
bool download_enabled;
bool rgba_only;
diff --git a/src/mesa/state_tracker/st_pbo.c b/src/mesa/state_tracker/st_pbo.c
index 403d47e91c7..66ffb54428a 100644
--- a/src/mesa/state_tracker/st_pbo.c
+++ b/src/mesa/state_tracker/st_pbo.c
@@ -661,6 +661,9 @@ st_init_pbo_helpers(struct st_context *st)
/* Rasterizer state */
memset(&st->pbo.raster, 0, sizeof(struct pipe_rasterizer_state));
st->pbo.raster.half_pixel_center = 1;
+
+ if (st->allow_compute_based_texture_transfer)
+ st->pbo.shaders = _mesa_hash_table_create_u32_keys(NULL);
}
void
@@ -697,4 +700,10 @@ st_destroy_pbo_helpers(struct st_context *st)
st->pipe->delete_vs_state(st->pipe, st->pbo.vs);
st->pbo.vs = NULL;
}
+
+ if (st->pbo.shaders) {
+ hash_table_foreach(st->pbo.shaders, entry)
+ st->pipe->delete_compute_state(st->pipe, entry->data);
+ _mesa_hash_table_destroy(st->pbo.shaders, NULL);
+ }
}
diff --git a/src/mesa/state_tracker/st_pbo.h b/src/mesa/state_tracker/st_pbo.h
index 15df574193d..b1eafc8fd0a 100644
--- a/src/mesa/state_tracker/st_pbo.h
+++ b/src/mesa/state_tracker/st_pbo.h
@@ -110,6 +110,12 @@ st_pbo_get_download_fs(struct st_context *st, enum pipe_texture_target target,
enum pipe_format dst_format,
bool need_layer);
+bool
+st_GetTexSubImage_shader(struct gl_context * ctx,
+ GLint xoffset, GLint yoffset, GLint zoffset,
+ GLsizei width, GLsizei height, GLint depth,
+ GLenum format, GLenum type, void * pixels,
+ struct gl_texture_image *texImage);
enum pipe_format
st_pbo_get_dst_format(struct gl_context *ctx, enum pipe_texture_target target,
diff --git a/src/mesa/state_tracker/st_pbo_compute.c b/src/mesa/state_tracker/st_pbo_compute.c
new file mode 100644
index 00000000000..5a33c1f197b
--- /dev/null
+++ b/src/mesa/state_tracker/st_pbo_compute.c
@@ -0,0 +1,1110 @@
+/*
+ * Copyright © 2021 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
+ */
+
+#include <stdbool.h>
+#include "main/image.h"
+#include "main/pbo.h"
+
+#include "state_tracker/st_nir.h"
+#include "state_tracker/st_format.h"
+#include "state_tracker/st_pbo.h"
+#include "state_tracker/st_texture.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
+#include "compiler/glsl/gl_nir.h"
+#include "util/u_sampler.h"
+
+#define BGR_FORMAT(NAME) \
+ {{ \
+ [0] = PIPE_FORMAT_##NAME##_SNORM, \
+ [1] = PIPE_FORMAT_##NAME##_SINT, \
+ }, \
+ { \
+ [0] = PIPE_FORMAT_##NAME##_UNORM, \
+ [1] = PIPE_FORMAT_##NAME##_UINT, \
+ }}
+
+#define FORMAT(NAME, NAME16, NAME32) \
+ {{ \
+ [1] = PIPE_FORMAT_##NAME##_SNORM, \
+ [2] = PIPE_FORMAT_##NAME16##_SNORM, \
+ [4] = PIPE_FORMAT_##NAME32##_SNORM, \
+ }, \
+ { \
+ [1] = PIPE_FORMAT_##NAME##_UNORM, \
+ [2] = PIPE_FORMAT_##NAME16##_UNORM, \
+ [4] = PIPE_FORMAT_##NAME32##_UNORM, \
+ }}
+
+/* don't try these at home */
+static enum pipe_format
+get_convert_format(struct gl_context *ctx,
+ enum pipe_format src_format,
+ GLenum format, GLenum type,
+ bool *need_bgra_swizzle)
+{
+ struct st_context *st = st_context(ctx);
+ GLint bpp = _mesa_bytes_per_pixel(format, type);
+ if (_mesa_is_depth_format(format) ||
+ format == GL_GREEN_INTEGER ||
+ format == GL_BLUE_INTEGER) {
+ switch (bpp) {
+ case 1:
+ return _mesa_is_type_unsigned(type) ? PIPE_FORMAT_R8_UINT : PIPE_FORMAT_R8_SINT;
+ case 2:
+ return _mesa_is_type_unsigned(type) ? PIPE_FORMAT_R16_UINT : PIPE_FORMAT_R16_SINT;
+ case 4:
+ return _mesa_is_type_unsigned(type) ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R32_SINT;
+ }
+ }
+ mesa_format mformat = _mesa_tex_format_from_format_and_type(ctx, format, type);
+ enum pipe_format pformat = st_mesa_format_to_pipe_format(st, mformat);
+ if (!pformat) {
+ GLint dst_components = _mesa_components_in_format(format);
+ bpp /= dst_components;
+ if (format == GL_BGR || format == GL_BGRA) {
+ pformat = st_pbo_get_dst_format(ctx, PIPE_TEXTURE_2D, src_format, false, format == GL_BGR ? GL_RGB : GL_RGBA, type, 0);
+ if (!pformat)
+ pformat = get_convert_format(ctx, src_format, format == GL_BGR ? GL_RGB : GL_RGBA, type, need_bgra_swizzle);
+ assert(pformat);
+ *need_bgra_swizzle = true;
+ } else if (format == GL_BGR_INTEGER || format == GL_BGRA_INTEGER) {
+ pformat = st_pbo_get_dst_format(ctx, PIPE_TEXTURE_2D, src_format, false, format == GL_BGR_INTEGER ? GL_RGB_INTEGER : GL_RGBA_INTEGER, type, 0);
+ if (!pformat)
+ pformat = get_convert_format(ctx, src_format, format == GL_BGR_INTEGER ? GL_RGB_INTEGER : GL_RGBA_INTEGER, type, need_bgra_swizzle);
+ assert(pformat);
+ *need_bgra_swizzle = true;
+ } else {
+ /* [signed,unsigned][bpp] */
+ enum pipe_format rgb[5][2][5] = {
+ [1] = FORMAT(R8, R16, R32),
+ [2] = FORMAT(R8G8, R16G16, R32G32),
+ [3] = FORMAT(R8G8B8, R16G16B16, R32G32B32),
+ [4] = FORMAT(R8G8B8A8, R16G16B16A16, R32G32B32A32),
+ };
+ pformat = rgb[dst_components][_mesa_is_type_unsigned(type)][bpp];
+ }
+ assert(util_format_get_nr_components(pformat) == dst_components);
+ }
+ assert(pformat);
+ return pformat;
+}
+#undef BGR_FORMAT
+#undef FORMAT
+
+
+struct pbo_shader_data {
+ nir_ssa_def *offset;
+ nir_ssa_def *range;
+ nir_ssa_def *invert;
+ nir_ssa_def *blocksize;
+ nir_ssa_def *alignment;
+ nir_ssa_def *dst_bit_size;
+ nir_ssa_def *channels;
+ nir_ssa_def *normalized;
+ nir_ssa_def *integer;
+ nir_ssa_def *clamp_uint;
+ nir_ssa_def *r11g11b10_or_sint;
+ nir_ssa_def *r9g9b9e5;
+ nir_ssa_def *bits1;
+ nir_ssa_def *bits2;
+ nir_ssa_def *bits3;
+ nir_ssa_def *bits4;
+ nir_ssa_def *swap;
+ nir_ssa_def *bits; //vec4
+};
+
+
+/* must be under 16bytes / sizeof(vec4) / 128 bits) */
+struct pbo_data {
+ union {
+ struct {
+ struct {
+ uint16_t x, y;
+ };
+ struct {
+ uint16_t width, height;
+ };
+ struct {
+ uint16_t depth;
+ uint8_t invert : 1;
+ uint8_t blocksize : 7;
+
+ uint8_t clamp_uint : 1;
+ uint8_t r11g11b10_or_sint : 1;
+ uint8_t r9g9b9e5 : 1;
+ uint8_t swap : 1;
+ uint16_t alignment : 2;
+ uint8_t dst_bit_size : 2; //8, 16, 32, 64
+ };
+
+ struct {
+ uint8_t channels : 2;
+ uint8_t bits1 : 6;
+ uint8_t normalized : 1;
+ uint8_t integer : 1;
+ uint8_t bits2 : 6;
+ uint8_t bits3 : 6;
+ uint8_t pad1 : 2;
+ uint8_t bits4 : 6;
+ uint8_t pad2 : 2;
+ };
+ };
+ float vec[4];
+ };
+};
+
+
+#define STRUCT_OFFSET(name) (offsetof(struct pbo_data, name) * 8)
+
+#define STRUCT_BLOCK(offset, ...) \
+ do { \
+ assert(offset % 8 == 0); \
+ nir_ssa_def *block##offset = nir_u2u32(b, nir_extract_bits(b, &ubo_load, 1, (offset), 1, 8)); \
+ __VA_ARGS__ \
+ } while (0)
+#define STRUCT_MEMBER(blockoffset, name, offset, size, op, clamp) \
+ do { \
+ assert(offset + size <= 8); \
+ nir_ssa_def *val = nir_iand_imm(b, block##blockoffset, u_bit_consecutive(offset, size)); \
+ if (offset) \
+ val = nir_ushr_imm(b, val, offset); \
+ sd->name = op; \
+ if (clamp) \
+ sd->name = nir_umin(b, sd->name, nir_imm_int(b, clamp)); \
+ } while (0)
+#define STRUCT_MEMBER_SHIFTED_2BIT(blockoffset, name, offset, shift, clamp) \
+ STRUCT_MEMBER(blockoffset, name, offset, 2, nir_ishl(b, nir_imm_int(b, shift), val), clamp)
+
+#define STRUCT_MEMBER_BOOL(blockoffset, name, offset) \
+ STRUCT_MEMBER(blockoffset, name, offset, 1, nir_ieq_imm(b, val, 1), 0)
+
+/* this function extracts the conversion data from pbo_data using the
+ * size annotations for each grouping. data is compacted into bitfields,
+ * so bitwise operations must be used to "unpack" everything
+ */
+static void
+init_pbo_shader_data(nir_builder *b, struct pbo_shader_data *sd)
+{
+ nir_variable *ubo = nir_variable_create(b->shader, nir_var_uniform, glsl_uvec4_type(), "offset");
+ nir_ssa_def *ubo_load = nir_load_var(b, ubo);
+
+ sd->offset = nir_umin(b, nir_u2u32(b, nir_extract_bits(b, &ubo_load, 1, STRUCT_OFFSET(x), 2, 16)), nir_imm_int(b, 65535));
+ sd->range = nir_umin(b, nir_u2u32(b, nir_extract_bits(b, &ubo_load, 1, STRUCT_OFFSET(width), 3, 16)), nir_imm_int(b, 65535));
+
+ STRUCT_BLOCK(80,
+ STRUCT_MEMBER_BOOL(80, invert, 0);
+ STRUCT_MEMBER(80, blocksize, 1, 7, nir_iadd_imm(b, val, 1), 128);
+ );
+
+ STRUCT_BLOCK(88,
+ STRUCT_MEMBER_BOOL(88, clamp_uint, 0);
+ STRUCT_MEMBER_BOOL(88, r11g11b10_or_sint, 1);
+ STRUCT_MEMBER_BOOL(88, r9g9b9e5, 2);
+ STRUCT_MEMBER_BOOL(88, swap, 3);
+ STRUCT_MEMBER_SHIFTED_2BIT(88, alignment, 4, 1, 8);
+ STRUCT_MEMBER_SHIFTED_2BIT(88, dst_bit_size, 6, 8, 64);
+ );
+
+ STRUCT_BLOCK(96,
+ STRUCT_MEMBER(96, channels, 0, 2, nir_iadd_imm(b, val, 1), 4);
+ STRUCT_MEMBER(96, bits1, 2, 6, val, 32);
+ );
+
+ STRUCT_BLOCK(104,
+ STRUCT_MEMBER_BOOL(104, normalized, 0);
+ STRUCT_MEMBER_BOOL(104, integer, 1);
+ STRUCT_MEMBER(104, bits2, 2, 6, val, 32);
+ );
+
+
+ STRUCT_BLOCK(112,
+ STRUCT_MEMBER(112, bits3, 0, 6, val, 32);
+ );
+
+ STRUCT_BLOCK(120,
+ STRUCT_MEMBER(120, bits4, 0, 6, val, 32);
+ );
+ sd->bits = nir_vec4(b, sd->bits1, sd->bits2, sd->bits3, sd->bits4);
+
+ /* clamp swap in the shader to enable better optimizing */
+ /* TODO?
+ sd->swap = nir_bcsel(b, nir_ior(b,
+ nir_ieq_imm(b, sd->blocksize, 8),
+ nir_bcsel(b,
+ nir_ieq_imm(b, sd->bits1, 8),
+ nir_bcsel(b,
+ nir_uge(b, sd->channels, nir_imm_int(b, 2)),
+ nir_bcsel(b,
+ nir_uge(b, sd->channels, nir_imm_int(b, 3)),
+ nir_bcsel(b,
+ nir_ieq(b, sd->channels, nir_imm_int(b, 4)),
+ nir_ball(b, nir_ieq(b, sd->bits, nir_imm_ivec4(b, 8, 8, 8, 8))),
+ nir_ball(b, nir_ieq(b, nir_channels(b, sd->bits, 7), nir_imm_ivec3(b, 8, 8, 8)))),
+ nir_ball(b, nir_ieq(b, nir_channels(b, sd->bits, 3), nir_imm_ivec2(b, 8, 8)))),
+ nir_imm_bool(b, 0)),
+ nir_imm_bool(b, 0))),
+ nir_imm_bool(b, 0),
+ sd->swap);
+ */
+}
+
+static unsigned
+fill_pbo_data(struct pbo_data *pd, enum pipe_format src_format, enum pipe_format dst_format, bool swap)
+{
+ unsigned bits[4] = {0};
+ bool weird_packed = false;
+ const struct util_format_description *dst_desc = util_format_description(dst_format);
+ bool is_8bit = true;
+
+ for (unsigned c = 0; c < 4; c++) {
+ bits[c] = dst_desc->channel[c].size;
+ if (c < dst_desc->nr_channels) {
+ weird_packed |= bits[c] != bits[0] || bits[c] % 8 != 0;
+ if (bits[c] != 8)
+ is_8bit = false;
+ }
+ }
+
+ if (is_8bit || dst_desc->block.bits == 8)
+ swap = false;
+
+ unsigned dst_bit_size = 0;
+ if (weird_packed) {
+ dst_bit_size = dst_desc->block.bits;
+ } else {
+ dst_bit_size = dst_desc->block.bits / dst_desc->nr_channels;
+ }
+ assert(dst_bit_size);
+ assert(dst_bit_size <= 64);
+
+ pd->dst_bit_size = dst_bit_size >> 4;
+ pd->channels = dst_desc->nr_channels - 1;
+ pd->normalized = dst_desc->is_unorm || dst_desc->is_snorm;
+ pd->clamp_uint = dst_desc->is_unorm ||
+ (util_format_is_pure_sint(dst_format) &&
+ !util_format_is_pure_sint(src_format) &&
+ !util_format_is_snorm(src_format)) ||
+ util_format_is_pure_uint(dst_format);
+ pd->integer = util_format_is_pure_uint(dst_format) || util_format_is_pure_sint(dst_format);
+ pd->r11g11b10_or_sint = dst_format == PIPE_FORMAT_R11G11B10_FLOAT || util_format_is_pure_sint(dst_format);
+ pd->r9g9b9e5 = dst_format == PIPE_FORMAT_R9G9B9E5_FLOAT;
+ pd->bits1 = bits[0];
+ pd->bits2 = bits[1];
+ pd->bits3 = bits[2];
+ pd->bits4 = bits[3];
+ pd->swap = swap;
+
+ return weird_packed ? 1 : dst_desc->nr_channels;
+}
+
+static nir_ssa_def *
+get_buffer_offset(nir_builder *b, nir_ssa_def *coord, struct pbo_shader_data *sd)
+{
+/* from _mesa_image_offset():
+ offset = topOfImage
+ + (skippixels + column) * bytes_per_pixel
+ + (skiprows + row) * bytes_per_row
+ + (skipimages + img) * bytes_per_image;
+ */
+ nir_ssa_def *bytes_per_row = nir_imul(b, nir_channel(b, sd->range, 0), sd->blocksize);
+ bytes_per_row = nir_bcsel(b, nir_ult(b, sd->alignment, nir_imm_int(b, 2)),
+ bytes_per_row,
+ nir_iand(b,
+ nir_isub(b, nir_iadd(b, bytes_per_row, sd->alignment), nir_imm_int(b, 1)),
+ nir_inot(b, nir_isub(b, sd->alignment, nir_imm_int(b, 1)))));
+ nir_ssa_def *bytes_per_image = nir_imul(b, bytes_per_row, nir_channel(b, sd->range, 1));
+ bytes_per_row = nir_bcsel(b, sd->invert,
+ nir_isub(b, nir_imm_int(b, 0), bytes_per_row),
+ bytes_per_row);
+ return nir_iadd(b,
+ nir_imul(b, nir_channel(b, coord, 0), sd->blocksize),
+ nir_iadd(b,
+ nir_imul(b, nir_channel(b, coord, 1), bytes_per_row),
+ nir_imul(b, nir_channel(b, coord, 2), bytes_per_image)));
+}
+
+static inline void
+write_ssbo(nir_builder *b, nir_ssa_def *pixel, nir_ssa_def *buffer_offset)
+{
+ nir_store_ssbo(b, pixel, nir_imm_zero(b, 1, 32), buffer_offset,
+ .align_mul = pixel->bit_size / 8,
+ .write_mask = (1 << pixel->num_components) - 1);
+}
+
+static void
+write_conversion(nir_builder *b, nir_ssa_def *pixel, nir_ssa_def *buffer_offset, struct pbo_shader_data *sd)
+{
+ nir_push_if(b, nir_ilt(b, sd->dst_bit_size, nir_imm_int(b, 32)));
+ nir_push_if(b, nir_ieq_imm(b, sd->dst_bit_size, 16));
+ write_ssbo(b, nir_u2u16(b, pixel), buffer_offset);
+ nir_push_else(b, NULL);
+ write_ssbo(b, nir_u2u8(b, pixel), buffer_offset);
+ nir_pop_if(b, NULL);
+ nir_push_else(b, NULL);
+ write_ssbo(b, pixel, buffer_offset);
+ nir_pop_if(b, NULL);
+}
+
+static nir_ssa_def *
+swap2(nir_builder *b, nir_ssa_def *src)
+{
+ /* dst[i] = (src[i] >> 8) | ((src[i] << 8) & 0xff00); */
+ return nir_ior(b,
+ nir_ushr_imm(b, src, 8),
+ nir_iand_imm(b, nir_ishl(b, src, nir_imm_int(b, 8)), 0xff00));
+}
+
+static nir_ssa_def *
+swap4(nir_builder *b, nir_ssa_def *src)
+{
+ /* a = (b >> 24) | ((b >> 8) & 0xff00) | ((b << 8) & 0xff0000) | ((b << 24) & 0xff000000); */
+ return nir_ior(b,
+ /* (b >> 24) */
+ nir_ushr_imm(b, src, 24),
+ nir_ior(b,
+ /* ((b >> 8) & 0xff00) */
+ nir_iand(b, nir_ushr_imm(b, src, 8), nir_imm_int(b, 0xff00)),
+ nir_ior(b,
+ /* ((b << 8) & 0xff0000) */
+ nir_iand(b, nir_ishl(b, src, nir_imm_int(b, 8)), nir_imm_int(b, 0xff0000)),
+ /* ((b << 24) & 0xff000000) */
+ nir_iand(b, nir_ishl(b, src, nir_imm_int(b, 24)), nir_imm_int(b, 0xff000000)))));
+}
+
+/* explode the cf to handle channel counts in the shader */
+static void
+grab_components(nir_builder *b, nir_ssa_def *pixel, nir_ssa_def *buffer_offset, struct pbo_shader_data *sd, bool weird_packed)
+{
+ if (weird_packed) {
+ nir_push_if(b, nir_ieq_imm(b, sd->bits1, 32));
+ write_conversion(b, nir_channels(b, pixel, 3), buffer_offset, sd);
+ nir_push_else(b, NULL);
+ write_conversion(b, nir_channel(b, pixel, 0), buffer_offset, sd);
+ nir_pop_if(b, NULL);
+ } else {
+ nir_push_if(b, nir_ieq_imm(b, sd->channels, 1));
+ write_conversion(b, nir_channel(b, pixel, 0), buffer_offset, sd);
+ nir_push_else(b, NULL);
+ nir_push_if(b, nir_ieq_imm(b, sd->channels, 2));
+ write_conversion(b, nir_channels(b, pixel, (1 << 2) - 1), buffer_offset, sd);
+ nir_push_else(b, NULL);
+ nir_push_if(b, nir_ieq_imm(b, sd->channels, 3));
+ write_conversion(b, nir_channels(b, pixel, (1 << 3) - 1), buffer_offset, sd);
+ nir_push_else(b, NULL);
+ write_conversion(b, nir_channels(b, pixel, (1 << 4) - 1), buffer_offset, sd);
+ nir_pop_if(b, NULL);
+ nir_pop_if(b, NULL);
+ nir_pop_if(b, NULL);
+ }
+}
+
+/* if byteswap is enabled, handle that and then write the components */
+static void
+handle_swap(nir_builder *b, nir_ssa_def *pixel, nir_ssa_def *buffer_offset,
+ struct pbo_shader_data *sd, unsigned num_components, bool weird_packed)
+{
+ nir_push_if(b, sd->swap); {
+ nir_push_if(b, nir_ieq_imm(b, nir_udiv_imm(b, sd->blocksize, num_components), 2)); {
+ /* this is a single high/low swap per component */
+ nir_ssa_def *components[4];
+ for (unsigned i = 0; i < 4; i++)
+ components[i] = swap2(b, nir_channel(b, pixel, i));
+ nir_ssa_def *v = nir_vec(b, components, 4);
+ grab_components(b, v, buffer_offset, sd, weird_packed);
+ } nir_push_else(b, NULL); {
+ /* this is a pair of high/low swaps for each half of the component */
+ nir_ssa_def *components[4];
+ for (unsigned i = 0; i < 4; i++)
+ components[i] = swap4(b, nir_channel(b, pixel, i));
+ nir_ssa_def *v = nir_vec(b, components, 4);
+ grab_components(b, v, buffer_offset, sd, weird_packed);
+ } nir_pop_if(b, NULL);
+ } nir_push_else(b, NULL); {
+ /* swap disabled */
+ grab_components(b, pixel, buffer_offset, sd, weird_packed);
+ } nir_pop_if(b, NULL);
+}
+
+static nir_ssa_def *
+check_for_weird_packing(nir_builder *b, struct pbo_shader_data *sd, unsigned component)
+{
+ nir_ssa_def *c = nir_channel(b, sd->bits, component - 1);
+
+ return nir_bcsel(b,
+ nir_ige(b, sd->channels, nir_imm_int(b, component)),
+ nir_ior(b,
+ nir_ine(b, c, sd->bits1),
+ nir_ine(b, nir_imod(b, c, nir_imm_int(b, 8)), nir_imm_int(b, 0))),
+ nir_imm_bool(b, 0));
+}
+
+/* convenience function for clamping signed integers */
+static inline nir_ssa_def *
+nir_imin_imax(nir_builder *build, nir_ssa_def *src, nir_ssa_def *clamp_to_min, nir_ssa_def *clamp_to_max)
+{
+ return nir_imax(build, nir_imin(build, src, clamp_to_min), clamp_to_max);
+}
+
+static inline nir_ssa_def *
+nir_format_float_to_unorm_with_factor(nir_builder *b, nir_ssa_def *f, nir_ssa_def *factor)
+{
+ /* Clamp to the range [0, 1] */
+ f = nir_fsat(b, f);
+
+ return nir_f2u32(b, nir_fround_even(b, nir_fmul(b, f, factor)));
+}
+
+static inline nir_ssa_def *
+nir_format_float_to_snorm_with_factor(nir_builder *b, nir_ssa_def *f, nir_ssa_def *factor)
+{
+ /* Clamp to the range [-1, 1] */
+ f = nir_fmin(b, nir_fmax(b, f, nir_imm_float(b, -1)), nir_imm_float(b, 1));
+
+ return nir_f2i32(b, nir_fround_even(b, nir_fmul(b, f, factor)));
+}
+
+static nir_ssa_def *
+clamp_and_mask(nir_builder *b, nir_ssa_def *src, nir_ssa_def *channels)
+{
+ nir_ssa_def *one = nir_imm_ivec4(b, 1, 0, 0, 0);
+ nir_ssa_def *two = nir_imm_ivec4(b, 1, 1, 0, 0);
+ nir_ssa_def *three = nir_imm_ivec4(b, 1, 1, 1, 0);
+ nir_ssa_def *four = nir_imm_ivec4(b, 1, 1, 1, 1);
+ /* avoid underflow by clamping to channel count */
+ src = nir_bcsel(b,
+ nir_ieq(b, channels, one),
+ nir_isub(b, src, one),
+ nir_bcsel(b,
+ nir_ieq_imm(b, channels, 2),
+ nir_isub(b, src, two),
+ nir_bcsel(b,
+ nir_ieq_imm(b, channels, 3),
+ nir_isub(b, src, three),
+ nir_isub(b, src, four))));
+
+ return nir_mask(b, src, 32);
+}
+
+static void
+convert_swap_write(nir_builder *b, nir_ssa_def *pixel, nir_ssa_def *buffer_offset,
+ unsigned num_components,
+ struct pbo_shader_data *sd)
+{
+
+ nir_ssa_def *weird_packed = nir_ior(b,
+ nir_ior(b,
+ check_for_weird_packing(b, sd, 4),
+ check_for_weird_packing(b, sd, 3)),
+ check_for_weird_packing(b, sd, 2));
+ if (num_components == 1) {
+ nir_push_if(b, weird_packed);
+ nir_push_if(b, sd->r11g11b10_or_sint);
+ handle_swap(b, nir_pad_vec4(b, nir_format_pack_11f11f10f(b, pixel)), buffer_offset, sd, 1, true);
+ nir_push_else(b, NULL);
+ nir_push_if(b, sd->r9g9b9e5);
+ handle_swap(b, nir_pad_vec4(b, nir_format_pack_r9g9b9e5(b, pixel)), buffer_offset, sd, 1, true);
+ nir_push_else(b, NULL);
+ nir_push_if(b, nir_ieq_imm(b, sd->bits1, 32)); { //PIPE_FORMAT_Z32_FLOAT_S8X24_UINT
+ nir_ssa_def *pack[2];
+ pack[0] = nir_format_pack_uint_unmasked_ssa(b, nir_channel(b, pixel, 0), nir_channel(b, sd->bits, 0));
+ pack[1] = nir_format_pack_uint_unmasked_ssa(b, nir_channels(b, pixel, 6), nir_channels(b, sd->bits, 6));
+ handle_swap(b, nir_pad_vec4(b, nir_vec2(b, pack[0], pack[1])), buffer_offset, sd, 2, true);
+ } nir_push_else(b, NULL);
+ handle_swap(b, nir_pad_vec4(b, nir_format_pack_uint_unmasked_ssa(b, pixel, sd->bits)), buffer_offset, sd, 1, true);
+ nir_pop_if(b, NULL);
+ nir_pop_if(b, NULL);
+ nir_pop_if(b, NULL);
+ nir_push_else(b, NULL);
+ handle_swap(b, pixel, buffer_offset, sd, num_components, false);
+ nir_pop_if(b, NULL);
+ } else {
+ nir_push_if(b, weird_packed);
+ handle_swap(b, pixel, buffer_offset, sd, num_components, true);
+ nir_push_else(b, NULL);
+ handle_swap(b, pixel, buffer_offset, sd, num_components, false);
+ nir_pop_if(b, NULL);
+ }
+}
+
+static void
+do_shader_conversion(nir_builder *b, nir_ssa_def *pixel,
+ unsigned num_components,
+ nir_ssa_def *coord, struct pbo_shader_data *sd)
+{
+ nir_ssa_def *buffer_offset = get_buffer_offset(b, coord, sd);
+
+ nir_ssa_def *signed_bit_mask = clamp_and_mask(b, sd->bits, sd->channels);
+
+#define CONVERT_SWAP_WRITE(PIXEL) \
+ convert_swap_write(b, PIXEL, buffer_offset, num_components, sd);
+ nir_push_if(b, sd->normalized);
+ nir_push_if(b, sd->clamp_uint); //unorm
+ CONVERT_SWAP_WRITE(nir_format_float_to_unorm_with_factor(b, pixel, nir_u2f32(b, nir_mask(b, sd->bits, 32))));
+ nir_push_else(b, NULL);
+ CONVERT_SWAP_WRITE(nir_format_float_to_snorm_with_factor(b, pixel, nir_u2f32(b, signed_bit_mask)));
+ nir_pop_if(b, NULL);
+ nir_push_else(b, NULL);
+ nir_push_if(b, sd->integer);
+ nir_push_if(b, sd->r11g11b10_or_sint); //sint
+ nir_push_if(b, sd->clamp_uint); //uint -> sint
+ CONVERT_SWAP_WRITE(nir_umin(b, pixel, signed_bit_mask));
+ nir_push_else(b, NULL);
+ CONVERT_SWAP_WRITE(nir_imin_imax(b, pixel, signed_bit_mask, nir_isub(b, nir_ineg(b, signed_bit_mask), nir_imm_int(b, 1))));
+ nir_pop_if(b, NULL);
+ nir_push_else(b, NULL);
+ nir_push_if(b, sd->clamp_uint); //uint
+ /* nir_format_clamp_uint */
+ CONVERT_SWAP_WRITE(nir_umin(b, pixel, nir_mask(b, sd->bits, 32)));
+ nir_pop_if(b, NULL);
+ nir_pop_if(b, NULL);
+ nir_push_else(b, NULL);
+ nir_push_if(b, nir_ieq_imm(b, sd->bits1, 16)); //half
+ CONVERT_SWAP_WRITE(nir_format_float_to_half(b, pixel));
+ nir_push_else(b, NULL);
+ CONVERT_SWAP_WRITE(pixel);
+ nir_pop_if(b, NULL);
+ nir_pop_if(b, NULL);
+}
+
+static void *
+create_conversion_shader(struct st_context *st, enum pipe_texture_target target, unsigned num_components)
+{
+ const nir_shader_compiler_options *options = st_get_nir_compiler_options(st, MESA_SHADER_COMPUTE);
+ nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "%s", "convert");
+ b.shader->info.workgroup_size[0] = target != PIPE_TEXTURE_1D ? 8 : 64;
+ b.shader->info.workgroup_size[1] = target != PIPE_TEXTURE_1D ? 8 : 1;
+
+ b.shader->info.workgroup_size[2] = 1;
+ b.shader->info.textures_used[0] = 1;
+ b.shader->info.num_ssbos = 1;
+ b.shader->num_uniforms = 2;
+ nir_variable_create(b.shader, nir_var_mem_ssbo, glsl_array_type(glsl_float_type(), 0, 4), "ssbo");
+ nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, st_pbo_sampler_type_for_target(target, ST_PBO_CONVERT_FLOAT), "sampler");
+ unsigned coord_components = glsl_get_sampler_coordinate_components(sampler->type);
+ sampler->data.explicit_binding = 1;
+
+ struct pbo_shader_data sd;
+ init_pbo_shader_data(&b, &sd);
+
+ nir_ssa_def *bsize = nir_imm_ivec4(&b,
+ b.shader->info.workgroup_size[0],
+ b.shader->info.workgroup_size[1],
+ b.shader->info.workgroup_size[2],
+ 0);
+ nir_ssa_def *wid = nir_load_workgroup_id(&b, 32);
+ nir_ssa_def *iid = nir_load_local_invocation_id(&b);
+ nir_ssa_def *tile = nir_imul(&b, wid, bsize);
+ nir_ssa_def *global_id = nir_iadd(&b, tile, iid);
+ nir_ssa_def *start = nir_iadd(&b, global_id, sd.offset);
+
+ nir_ssa_def *coord = nir_channels(&b, start, (1<<coord_components)-1);
+ nir_ssa_def *max = nir_iadd(&b, sd.offset, sd.range);
+ nir_push_if(&b, nir_ball(&b, nir_ilt(&b, coord, nir_channels(&b, max, (1<<coord_components)-1))));
+ nir_tex_instr *txf = nir_tex_instr_create(b.shader, 3);
+ txf->is_array = glsl_sampler_type_is_array(sampler->type);
+ txf->op = nir_texop_txf;
+ txf->sampler_dim = glsl_get_sampler_dim(sampler->type);
+ txf->dest_type = nir_type_float32;
+ txf->coord_components = coord_components;
+ txf->texture_index = 0;
+ txf->sampler_index = 0;
+ txf->src[0].src_type = nir_tex_src_coord;
+ txf->src[0].src = nir_src_for_ssa(coord);
+ txf->src[1].src_type = nir_tex_src_lod;
+ txf->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0));
+ txf->src[2].src_type = nir_tex_src_texture_deref;
+ nir_deref_instr *sampler_deref = nir_build_deref_var(&b, sampler);
+ txf->src[2].src = nir_src_for_ssa(&sampler_deref->dest.ssa);
+
+ nir_ssa_dest_init(&txf->instr, &txf->dest, 4, 32, NULL);
+ nir_builder_instr_insert(&b, &txf->instr);
+
+ /* pass the grid offset as the coord to get the zero-indexed buffer offset */
+ do_shader_conversion(&b, &txf->dest.ssa, num_components, global_id, &sd);
+
+ nir_pop_if(&b, NULL);
+
+ nir_validate_shader(b.shader, NULL);
+ st_nir_opts(b.shader);
+ return st_nir_finish_builtin_shader(st, b.shader);
+}
+
+/* Compute the inverse of a format swizzle: out[component] tells which input
+ * channel must be read to produce that component. Slots that no input
+ * channel maps to are left as PIPE_SWIZZLE_0.
+ */
+static void
+invert_swizzle(uint8_t *out, const uint8_t *in)
+{
+   /* default every output slot to zero so unmapped components read as 0 */
+   for (unsigned slot = 0; slot < 4; ++slot)
+      out[slot] = PIPE_SWIZZLE_0;
+
+   /* walk the forward swizzle and record where each channel came from */
+   for (unsigned chan = 0; chan < 4; ++chan) {
+      uint8_t sw = in[chan];
+
+      /* the arithmetic below relies on X being the zero enum value */
+      assert(PIPE_SWIZZLE_X == 0);
+      /* PIPE_SWIZZLE_0/1/NONE don't select a real component; skip them */
+      if (sw <= PIPE_SWIZZLE_W)
+         out[sw - PIPE_SWIZZLE_X] = PIPE_SWIZZLE_X + chan;
+   }
+}
+
+/* Build the hash-table key for a cached conversion shader: the low 3 bits
+ * encode the texture target, the remaining bits the output component count.
+ */
+static uint32_t
+compute_shader_key(enum pipe_texture_target target, unsigned num_components)
+{
+   /* dense per-target ids; UINT8_MAX marks targets that must have been
+    * remapped before reaching this point (see get_target_from_texture)
+    */
+   static const uint8_t target_id[] = {
+      [PIPE_BUFFER] = UINT8_MAX,
+      [PIPE_TEXTURE_1D] = 1,
+      [PIPE_TEXTURE_2D] = 2,
+      [PIPE_TEXTURE_3D] = 3,
+      [PIPE_TEXTURE_CUBE] = 4,
+      [PIPE_TEXTURE_RECT] = UINT8_MAX,
+      [PIPE_TEXTURE_1D_ARRAY] = 5,
+      [PIPE_TEXTURE_2D_ARRAY] = 6,
+      [PIPE_TEXTURE_CUBE_ARRAY] = UINT8_MAX,
+   };
+   assert(target < ARRAY_SIZE(target_id));
+   assert(target_id[target] != UINT8_MAX);
+   return target_id[target] | (num_components << 3);
+}
+
+/* Number of image dimensions (as used by _mesa_image_address and friends)
+ * for a given view target: 1D is one, 3D and 2D-arrays are three,
+ * everything else (2D, rect, 1D-array, cube-as-2D-array view) is two.
+ */
+static unsigned
+get_dim_from_target(enum pipe_texture_target target)
+{
+   if (target == PIPE_TEXTURE_1D)
+      return 1;
+   if (target == PIPE_TEXTURE_3D || target == PIPE_TEXTURE_2D_ARRAY)
+      return 3;
+   return 2;
+}
+
+/* Pick the sampler-view target used to sample from 'src': RECT collapses to
+ * plain 2D, and both cube variants are viewed as 2D arrays so faces can be
+ * addressed as layers; all other targets are used as-is.
+ */
+static enum pipe_texture_target
+get_target_from_texture(struct pipe_resource *src)
+{
+   switch (src->target) {
+   case PIPE_TEXTURE_RECT:
+      return PIPE_TEXTURE_2D;
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return PIPE_TEXTURE_2D_ARRAY;
+   default:
+      return src->target;
+   }
+}
+
+/* Flags that force a particular sampler-view swizzle when sampling, used to
+ * emulate base formats (luminance/alpha/intensity/...) the driver may not
+ * support natively and to select or reorder components for the readback
+ * format. The low values are mutually exclusive modes; GREEN/BLUE/BGRA are
+ * or'd in as additional component-selection bits.
+ */
+enum swizzle_clamp {
+   /* force component selection for named format */
+   SWIZZLE_CLAMP_LUMINANCE = 1,
+   SWIZZLE_CLAMP_ALPHA = 2,
+   SWIZZLE_CLAMP_LUMINANCE_ALPHA = 3,
+   SWIZZLE_CLAMP_INTENSITY = 4,
+   SWIZZLE_CLAMP_RGBX = 5,
+
+   /* select only 1 component */
+   SWIZZLE_CLAMP_GREEN = 8,
+   SWIZZLE_CLAMP_BLUE = 16,
+
+   /* reverse ordering for format emulation */
+   SWIZZLE_CLAMP_BGRA = 32,
+};
+
+/* Read back a texture (sub)image via the compute conversion shader.
+ *
+ * Uploads the conversion parameters as a constant buffer, binds the source
+ * texture as a sampler view (with any emulation swizzle applied), binds a
+ * freshly allocated staging buffer as an SSBO, then launches a grid covering
+ * width x height x depth so the shader converts each texel to 'dst_format'
+ * and writes it at its packed buffer offset.
+ *
+ * Returns the staging buffer on success (caller owns the reference and must
+ * map and unreference it), or NULL on failure. Compute state is restored and
+ * the relevant st dirty bits are flagged on both paths.
+ */
+static struct pipe_resource *
+download_texture_compute(struct st_context *st,
+                         const struct gl_pixelstore_attrib *pack,
+                         GLint xoffset, GLint yoffset, GLint zoffset,
+                         GLsizei width, GLsizei height, GLint depth,
+                         unsigned level, unsigned layer,
+                         GLenum format, GLenum type,
+                         enum pipe_format src_format,
+                         enum pipe_texture_target view_target,
+                         struct pipe_resource *src,
+                         enum pipe_format dst_format,
+                         enum swizzle_clamp swizzle_clamp)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_screen *screen = st->screen;
+   struct pipe_resource *dst = NULL;
+   unsigned dim = get_dim_from_target(view_target);
+
+   /* clamp 3d offsets based on slice */
+   if (view_target == PIPE_TEXTURE_3D)
+      zoffset += layer;
+
+   unsigned num_components = 0;
+   /* Upload constants */
+   {
+      struct pipe_constant_buffer cb;
+      assert(view_target != PIPE_TEXTURE_1D_ARRAY || !yoffset);
+      struct pbo_data pd = {
+         .x = xoffset,
+         .y = yoffset,
+         .width = width, .height = height, .depth = depth,
+         .invert = pack->Invert,
+         /* biased by -1 to fit the bitfield */
+         .blocksize = util_format_get_blocksize(dst_format) - 1,
+         /* log2 of the effective pack alignment */
+         .alignment = ffs(MAX2(pack->Alignment, 1)) - 1,
+      };
+      num_components = fill_pbo_data(&pd, src_format, dst_format, pack->SwapBytes == 1);
+
+      cb.buffer = NULL;
+      cb.user_buffer = &pd;
+      cb.buffer_offset = 0;
+      cb.buffer_size = sizeof(pd);
+
+      pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &cb);
+   }
+
+   uint32_t hash_key = compute_shader_key(view_target, num_components);
+   assert(hash_key != 0);
+
+   /* lazily create and cache one conversion shader per (target, components) */
+   struct hash_entry *he = _mesa_hash_table_search(st->pbo.shaders, (void*)(uintptr_t)hash_key);
+   void *cs;
+   if (!he) {
+      cs = create_conversion_shader(st, view_target, num_components);
+      he = _mesa_hash_table_insert(st->pbo.shaders, (void*)(uintptr_t)hash_key, cs);
+   }
+   cs = he->data;
+   assert(cs);
+   struct cso_context *cso = st->cso_context;
+
+   cso_save_compute_state(cso, CSO_BIT_COMPUTE_SHADER | CSO_BIT_COMPUTE_SAMPLERS);
+   cso_set_compute_shader_handle(cso, cs);
+
+   /* Set up the sampler_view */
+   {
+      struct pipe_sampler_view templ;
+      struct pipe_sampler_view *sampler_view;
+      struct pipe_sampler_state sampler = {0};
+      const struct pipe_sampler_state *samplers[1] = {&sampler};
+      const struct util_format_description *desc = util_format_description(dst_format);
+
+      u_sampler_view_default_template(&templ, src, src_format);
+      if (util_format_is_depth_or_stencil(dst_format)) {
+         /* depth/stencil readback always replicates the X component */
+         templ.swizzle_r = PIPE_SWIZZLE_X;
+         templ.swizzle_g = PIPE_SWIZZLE_X;
+         templ.swizzle_b = PIPE_SWIZZLE_X;
+         templ.swizzle_a = PIPE_SWIZZLE_X;
+      } else {
+         uint8_t invswizzle[4];
+         const uint8_t *swizzle;
+
+         /* these swizzle output bits require explicit component selection/ordering */
+         if (swizzle_clamp & SWIZZLE_CLAMP_GREEN) {
+            for (unsigned i = 0; i < 4; i++)
+               invswizzle[i] = PIPE_SWIZZLE_Y;
+         } else if (swizzle_clamp & SWIZZLE_CLAMP_BLUE) {
+            for (unsigned i = 0; i < 4; i++)
+               invswizzle[i] = PIPE_SWIZZLE_Z;
+         } else {
+            if (swizzle_clamp & SWIZZLE_CLAMP_BGRA) {
+               /* emulate BGR(A) ordering by borrowing a BGR format's swizzle */
+               if (util_format_get_nr_components(dst_format) == 3)
+                  swizzle = util_format_description(PIPE_FORMAT_B8G8R8_UNORM)->swizzle;
+               else
+                  swizzle = util_format_description(PIPE_FORMAT_B8G8R8A8_UNORM)->swizzle;
+            } else {
+               swizzle = desc->swizzle;
+            }
+            invert_swizzle(invswizzle, swizzle);
+         }
+         swizzle_clamp &= ~(SWIZZLE_CLAMP_BGRA | SWIZZLE_CLAMP_GREEN | SWIZZLE_CLAMP_BLUE);
+
+         /* these swizzle input modes clamp unused components to 0 and (sometimes) alpha to 1 */
+         switch (swizzle_clamp) {
+         case SWIZZLE_CLAMP_LUMINANCE:
+            if (util_format_is_luminance(dst_format))
+               break;
+            for (unsigned i = 0; i < 4; i++) {
+               if (invswizzle[i] != PIPE_SWIZZLE_X)
+                  invswizzle[i] = invswizzle[i] == PIPE_SWIZZLE_W ? PIPE_SWIZZLE_1 : PIPE_SWIZZLE_0;
+            }
+            break;
+         case SWIZZLE_CLAMP_ALPHA:
+            for (unsigned i = 0; i < 4; i++) {
+               if (invswizzle[i] != PIPE_SWIZZLE_W)
+                  invswizzle[i] = PIPE_SWIZZLE_0;
+            }
+            break;
+         case SWIZZLE_CLAMP_LUMINANCE_ALPHA:
+            if (util_format_is_luminance_alpha(dst_format))
+               break;
+            for (unsigned i = 0; i < 4; i++) {
+               if (invswizzle[i] != PIPE_SWIZZLE_X && invswizzle[i] != PIPE_SWIZZLE_W)
+                  invswizzle[i] = PIPE_SWIZZLE_0;
+            }
+            break;
+         case SWIZZLE_CLAMP_INTENSITY:
+            for (unsigned i = 0; i < 4; i++) {
+               if (invswizzle[i] == PIPE_SWIZZLE_W)
+                  invswizzle[i] = PIPE_SWIZZLE_1;
+               else if (invswizzle[i] != PIPE_SWIZZLE_X)
+                  invswizzle[i] = PIPE_SWIZZLE_0;
+            }
+            break;
+         case SWIZZLE_CLAMP_RGBX:
+            for (unsigned i = 0; i < 4; i++) {
+               if (invswizzle[i] == PIPE_SWIZZLE_W)
+                  invswizzle[i] = PIPE_SWIZZLE_1;
+            }
+            break;
+         default: break;
+         }
+         templ.swizzle_r = invswizzle[0];
+         templ.swizzle_g = invswizzle[1];
+         templ.swizzle_b = invswizzle[2];
+         templ.swizzle_a = invswizzle[3];
+      }
+      templ.target = view_target;
+      templ.u.tex.first_level = level;
+      templ.u.tex.last_level = level;
+
+      /* array textures expect to have array index provided */
+      if (view_target != PIPE_TEXTURE_3D && src->array_size) {
+         templ.u.tex.first_layer = layer;
+         if (view_target == PIPE_TEXTURE_1D_ARRAY) {
+            /* for 1D arrays the y offset/extent select the layer range */
+            templ.u.tex.first_layer += yoffset;
+            templ.u.tex.last_layer = templ.u.tex.first_layer + height - 1;
+         } else {
+            templ.u.tex.first_layer += zoffset;
+            templ.u.tex.last_layer = templ.u.tex.first_layer + depth - 1;
+         }
+      }
+
+      sampler_view = pipe->create_sampler_view(pipe, src, &templ);
+      if (sampler_view == NULL)
+         goto fail;
+
+      pipe->set_sampler_views(pipe, PIPE_SHADER_COMPUTE, 0, 1, 0, false,
+                              &sampler_view);
+      st->state.num_sampler_views[PIPE_SHADER_COMPUTE] =
+         MAX2(st->state.num_sampler_views[PIPE_SHADER_COMPUTE], 1);
+
+      pipe_sampler_view_reference(&sampler_view, NULL);
+
+      cso_set_samplers(cso, PIPE_SHADER_COMPUTE, 1, samplers);
+   }
+
+   /* Set up destination buffer */
+   unsigned img_stride = _mesa_image_image_stride(pack, width, height, format, type);
+   unsigned buffer_size = (depth + (dim == 3 ? pack->SkipImages : 0)) * img_stride;
+   {
+      dst = pipe_buffer_create(screen, PIPE_BIND_SHADER_BUFFER, PIPE_USAGE_STAGING, buffer_size);
+      if (!dst)
+         goto fail;
+
+      struct pipe_shader_buffer buffer;
+      memset(&buffer, 0, sizeof(buffer));
+      buffer.buffer = dst;
+      buffer.buffer_size = buffer_size;
+
+      pipe->set_shader_buffers(pipe, PIPE_SHADER_COMPUTE, 0, 1, &buffer, 0x1);
+   }
+
+   /* 8x8 tiles for 2D/3D sources, 64x1 for 1D; last_block handles the
+    * partial edge tiles
+    */
+   struct pipe_grid_info info = { 0 };
+   info.block[0] = src->target != PIPE_TEXTURE_1D ? 8 : 64;
+   info.block[1] = src->target != PIPE_TEXTURE_1D ? 8 : 1;
+   info.last_block[0] = width % info.block[0];
+   info.last_block[1] = height % info.block[1];
+   info.block[2] = 1;
+   info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
+   info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
+   info.grid[2] = depth;
+
+   pipe->launch_grid(pipe, &info);
+
+fail:
+   /* cleanup runs on both the success and the failure paths */
+   cso_restore_compute_state(cso);
+
+   /* Unbind all because st/mesa won't do it if the current shader doesn't
+    * use them.
+    * Fixed: unbind_num_trailing_slots and take_ownership were passed in
+    * swapped order relative to the bind call above, so no views were
+    * actually unbound.
+    */
+   pipe->set_sampler_views(pipe, PIPE_SHADER_COMPUTE, 0, 0,
+                           st->state.num_sampler_views[PIPE_SHADER_COMPUTE],
+                           false, NULL);
+   st->state.num_sampler_views[PIPE_SHADER_COMPUTE] = 0;
+   pipe->set_shader_buffers(pipe, PIPE_SHADER_COMPUTE, 0, 1, NULL, 0);
+
+   st->dirty |= ST_NEW_CS_CONSTANTS |
+                ST_NEW_CS_SSBOS |
+                ST_NEW_CS_SAMPLER_VIEWS;
+
+   return dst;
+}
+
+/* Map the staging buffer produced by download_texture_compute() and copy it
+ * to the user's (possibly PBO-backed) 'pixels' destination, applying the
+ * pixel-store packing parameters the compute shader did not handle.
+ */
+static void
+copy_converted_buffer(struct gl_context * ctx,
+                      struct gl_pixelstore_attrib *pack,
+                      enum pipe_texture_target view_target,
+                      struct pipe_resource *dst, enum pipe_format dst_format,
+                      GLint xoffset, GLint yoffset, GLint zoffset,
+                      GLsizei width, GLsizei height, GLint depth,
+                      GLenum format, GLenum type, void *pixels)
+{
+   struct pipe_transfer *xfer;
+   struct st_context *st = st_context(ctx);
+   unsigned dim = get_dim_from_target(view_target);
+   /* NOTE(review): assumes the caller passed a non-NULL 'dst' — verify */
+   uint8_t *map = pipe_buffer_map(st->pipe, dst, PIPE_MAP_READ | PIPE_MAP_ONCE, &xfer);
+   if (!map)
+      return;
+
+   pixels = _mesa_map_pbo_dest(ctx, pack, pixels);
+   /* compute shader doesn't handle these to cut down on uniform size */
+   if (pack->RowLength ||
+       pack->SkipPixels ||
+       pack->SkipRows ||
+       pack->ImageHeight ||
+       pack->SkipImages) {
+
+      /* 1D arrays were sampled as layers, so treat rows as image slices */
+      if (view_target == PIPE_TEXTURE_1D_ARRAY) {
+         depth = height;
+         height = 1;
+         zoffset = yoffset;
+         yoffset = 0;
+      }
+      struct gl_pixelstore_attrib packing = *pack;
+      /* zero every packing field in [RowLength, SwapBytes) so 'packing'
+       * describes the tightly-packed layout the shader wrote;
+       * NOTE(review): relies on these fields being contiguous in
+       * gl_pixelstore_attrib — confirm if the struct layout changes
+       */
+      memset(&packing.RowLength, 0, offsetof(struct gl_pixelstore_attrib, SwapBytes) - offsetof(struct gl_pixelstore_attrib, RowLength));
+      /* copy row by row: dst address honors the caller's packing, src
+       * address the tight layout
+       */
+      for (unsigned z = 0; z < depth; z++) {
+         for (unsigned y = 0; y < height; y++) {
+            GLubyte *dst = _mesa_image_address(dim, pack, pixels,
+                                               width, height, format, type,
+                                               z, y, 0);
+            GLubyte *srcpx = _mesa_image_address(dim, &packing, map,
+                                                 width, height, format, type,
+                                                 z, y, 0);
+            memcpy(dst, srcpx, util_format_get_stride(dst_format, width));
+         }
+      }
+   } else {
+      /* direct copy for all other cases */
+      memcpy(pixels, map, dst->width0);
+   }
+
+   _mesa_unmap_pbo_dest(ctx, pack);
+   pipe_buffer_unmap(st->pipe, xfer);
+}
+
+/* GetTexSubImage fast path: convert and pack the texels with a compute
+ * shader into a staging buffer, then copy that buffer to the user's
+ * destination.
+ *
+ * Returns true if the readback was performed, false if the caller should
+ * fall back to another path (format already matches, unsupported
+ * conversion, driver prefers memcpy, or the compute download failed).
+ */
+bool
+st_GetTexSubImage_shader(struct gl_context * ctx,
+                         GLint xoffset, GLint yoffset, GLint zoffset,
+                         GLsizei width, GLsizei height, GLint depth,
+                         GLenum format, GLenum type, void * pixels,
+                         struct gl_texture_image *texImage)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_screen *screen = st->screen;
+   struct st_texture_object *stObj = st_texture_object(texImage->TexObject);
+   struct pipe_resource *src = stObj->pt;
+   struct pipe_resource *dst = NULL;
+   enum pipe_format dst_format, src_format;
+   unsigned level = texImage->Level + texImage->TexObject->Attrib.MinLevel;
+   unsigned layer = texImage->Face + texImage->TexObject->Attrib.MinLayer;
+   enum pipe_texture_target view_target;
+
+   /* compressed formats are decompressed elsewhere before reaching here */
+   assert(!_mesa_is_format_etc2(texImage->TexFormat) &&
+          !_mesa_is_format_astc_2d(texImage->TexFormat) &&
+          texImage->TexFormat != MESA_FORMAT_ETC1_RGB8);
+
+   /* See if the texture format already matches the format and type,
+    * in which case the memcpy-based fast path will be used. */
+   if (_mesa_format_matches_format_and_type(texImage->TexFormat, format,
+                                            type, ctx->Pack.SwapBytes, NULL)) {
+      return false;
+   }
+   enum swizzle_clamp swizzle_clamp = 0;
+   src_format = st_pbo_get_src_format(screen, stObj->surface_based ? stObj->surface_format : src->format, src);
+   if (src_format == PIPE_FORMAT_NONE)
+      return false;
+
+   if (texImage->_BaseFormat != _mesa_get_format_base_format(texImage->TexFormat)) {
+      /* special handling for drivers that don't support these formats natively */
+      if (texImage->_BaseFormat == GL_LUMINANCE)
+         swizzle_clamp = SWIZZLE_CLAMP_LUMINANCE;
+      else if (texImage->_BaseFormat == GL_LUMINANCE_ALPHA)
+         swizzle_clamp = SWIZZLE_CLAMP_LUMINANCE_ALPHA;
+      else if (texImage->_BaseFormat == GL_ALPHA)
+         swizzle_clamp = SWIZZLE_CLAMP_ALPHA;
+      else if (texImage->_BaseFormat == GL_INTENSITY)
+         swizzle_clamp = SWIZZLE_CLAMP_INTENSITY;
+      else if (texImage->_BaseFormat == GL_RGB)
+         swizzle_clamp = SWIZZLE_CLAMP_RGBX;
+   }
+
+   dst_format = st_pbo_get_dst_format(ctx, PIPE_BUFFER, src_format, false, format, type, 0);
+
+   if (dst_format == PIPE_FORMAT_NONE) {
+      bool need_bgra_swizzle = false;
+      dst_format = get_convert_format(ctx, src_format, format, type, &need_bgra_swizzle);
+      if (dst_format == PIPE_FORMAT_NONE)
+         return false;
+      /* special swizzling for component selection */
+      if (need_bgra_swizzle)
+         swizzle_clamp |= SWIZZLE_CLAMP_BGRA;
+      else if (format == GL_GREEN_INTEGER)
+         swizzle_clamp |= SWIZZLE_CLAMP_GREEN;
+      else if (format == GL_BLUE_INTEGER)
+         swizzle_clamp |= SWIZZLE_CLAMP_BLUE;
+   }
+
+   /* check with the driver to see if memcpy is likely to be faster */
+   if (!screen->is_compute_copy_faster(screen, src_format, dst_format, width, height, depth, true))
+      return false;
+
+   view_target = get_target_from_texture(src);
+   /* I don't know why this works
+    * only for the texture rects
+    * but that's how it is
+    */
+   if ((src->target != PIPE_TEXTURE_RECT &&
+        /* this would need multiple samplerviews */
+        ((util_format_is_depth_and_stencil(src_format) && util_format_is_depth_and_stencil(dst_format)) ||
+         /* these format just doesn't work and science can't explain why */
+         dst_format == PIPE_FORMAT_Z32_FLOAT)) ||
+       /* L8 -> L32_FLOAT is another thinker */
+       (!util_format_is_float(src_format) && dst_format == PIPE_FORMAT_L32_FLOAT))
+      return false;
+
+   dst = download_texture_compute(st, &ctx->Pack, xoffset, yoffset, zoffset, width, height, depth,
+                                  level, layer, format, type, src_format, view_target, src, dst_format,
+                                  swizzle_clamp);
+   /* download_texture_compute() returns NULL on failure; bail out here so
+    * copy_converted_buffer() doesn't dereference a NULL resource, and let
+    * the caller use a fallback path
+    */
+   if (!dst)
+      return false;
+
+   copy_converted_buffer(ctx, &ctx->Pack, view_target, dst, dst_format, xoffset, yoffset, zoffset,
+                         width, height, depth, format, type, pixels);
+
+   pipe_resource_reference(&dst, NULL);
+
+   return true;
+}
+