i965/tiled_memcpy: Rework the RGBA -> BGRA mem_copy functions

This splits the two copy functions into three: One for unaligned copies, one for aligned sources, and one for aligned destinations. Thanks to the previous commit, we are now guaranteed that the aligned ones will *only* operate on aligned memory so they should be safe. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93962 Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org> Reviewed-by: Matt Turner <mattst88@gmail.com> Reviewed-by: Roland Scheidegger <sroland@vmware.com> Reviewed-by: Chad Versace <chad.versace@intel.com> (cherry picked from commit d2b32656e18607f5807b3f4d4dde02568370b9bf)
author: Jason Ekstrand <jason.ekstrand@intel.com> 2016-04-07 11:21:19 -0700
committer: Emil Velikov <emil.l.velikov@gmail.com> 2016-05-05 11:46:43 +0100
commit: 2274ec7dbaded24627907bdfad0bcaf9fe6c679b (patch)
tree: 86a91f8f36e27f6f9b2ae6dffdd4972daddfa6e4
parent: 9009d58eba65931ad569c67d63955322ddf96dc8 (diff)
1 files changed, 63 insertions, 76 deletions
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 89de5941868..8082b9d1e58 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -56,21 +56,32 @@ static const uint32_t ytile_width = 128;
 static const uint32_t ytile_height = 32;
 static const uint32_t ytile_span = 16;
 
+/**
+ * Copy RGBA to BGRA - swap R and B.
+ */
+static inline void *
+rgba8_copy(void *dst, const void *src, size_t bytes)
+{
+   uint8_t *d = dst;
+   uint8_t const *s = src;
+
+   assert(bytes % 4 == 0);
+
+   while (bytes >= 4) {
+      d[0] = s[2];
+      d[1] = s[1];
+      d[2] = s[0];
+      d[3] = s[3];
+      d += 4;
+      s += 4;
+      bytes -= 4;
+   }
+   return dst;
+}
+
 #ifdef __SSSE3__
 static const uint8_t rgba8_permutation[16] =
    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
-
-/* NOTE: dst must be 16-byte aligned. src may be unaligned. */
-#define rgba8_copy_16_aligned_dst(dst, src)                            \
-   _mm_store_si128((__m128i *)(dst),                                   \
-                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
-                                    *(__m128i *) rgba8_permutation))
-
-/* NOTE: src must be 16-byte aligned. dst may be unaligned. */
-#define rgba8_copy_16_aligned_src(dst, src)                            \
-   _mm_storeu_si128((__m128i *)(dst),                                  \
-                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
-                                     *(__m128i *) rgba8_permutation))
 #endif
 
 /**
@@ -82,32 +93,21 @@ rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
-      return dst;
-   }
+   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));
 
-   if (bytes == 64) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_dst(d+16, s+16);
-      rgba8_copy_16_aligned_dst(d+32, s+32);
-      rgba8_copy_16_aligned_dst(d+48, s+48);
-      return dst;
+#ifdef __SSSE3__
+   while (bytes >= 16) {
+      _mm_store_si128((__m128i *)d,
+                      _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)s),
+                                       *(__m128i *) rgba8_permutation));
+      s += 16;
+      d += 16;
+      bytes -= 16;
    }
 #endif
 
-   while (bytes >= 4) {
-      d[0] = s[2];
-      d[1] = s[1];
-      d[2] = s[0];
-      d[3] = s[3];
-      d += 4;
-      s += 4;
-      bytes -= 4;
-   }
+   rgba8_copy(d, s, bytes);
+
    return dst;
 }
 
@@ -120,32 +120,21 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-      return dst;
-   }
+   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));
 
-   if (bytes == 64) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_src(d+16, s+16);
-      rgba8_copy_16_aligned_src(d+32, s+32);
-      rgba8_copy_16_aligned_src(d+48, s+48);
-      return dst;
+#ifdef __SSSE3__
+   while (bytes >= 16) {
+      _mm_storeu_si128((__m128i *)d,
+                       _mm_shuffle_epi8(_mm_load_si128((__m128i *)s),
+                                        *(__m128i *) rgba8_permutation));
+      s += 16;
+      d += 16;
+      bytes -= 16;
    }
 #endif
 
-   while (bytes >= 4) {
-      d[0] = s[2];
-      d[1] = s[1];
-      d[2] = s[0];
-      d[3] = s[3];
-      d += 4;
-      s += 4;
-      bytes -= 4;
-   }
+   rgba8_copy(d, s, bytes);
+
    return dst;
 }
 
@@ -404,10 +393,10 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    } else {
@@ -415,10 +404,10 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
                                  memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    }
@@ -447,20 +436,20 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    }
@@ -489,20 +478,20 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    }
@@ -531,20 +520,20 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    }
@@ -773,8 +762,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
       if (format == GL_BGRA) {
          *mem_copy = memcpy;
       } else if (format == GL_RGBA) {
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
       }
    } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
               (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {
@@ -783,8 +771,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
           * use the same function.
           */
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
       } else if (format == GL_RGBA) {
          *mem_copy = memcpy;
       }
author	Jason Ekstrand <jason.ekstrand@intel.com>	2016-04-07 11:21:19 -0700
committer	Emil Velikov <emil.l.velikov@gmail.com>	2016-05-05 11:46:43 +0100
commit	2274ec7dbaded24627907bdfad0bcaf9fe6c679b (patch)
tree	86a91f8f36e27f6f9b2ae6dffdd4972daddfa6e4
parent	9009d58eba65931ad569c67d63955322ddf96dc8 (diff)