summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicolai Hähnle <nicolai.haehnle@amd.com>2016-06-15 11:06:38 +0200
committerNicolai Hähnle <nicolai.haehnle@amd.com>2016-06-16 13:28:09 +0200
commita96d83464c64048d2ec5ee810b331d01b5838903 (patch)
treebe4076bf17730886908a0af4e9822347bd7d309c
parent9c85441397bfecefa9d91f4540fdbc6784567519 (diff)
WIP st/mesa: memcpy drop-ins for readpixelsreadpixels
-rw-r--r--src/mesa/Makefile.am2
-rw-r--r--src/mesa/Makefile.sources2
-rw-r--r--src/mesa/state_tracker/st_cb_readpixels.c8
-rw-r--r--src/mesa/state_tracker/st_manager.c2
-rw-r--r--src/mesa/state_tracker/st_memcpy_read.c797
-rw-r--r--src/mesa/state_tracker/st_memcpy_read.h34
6 files changed, 843 insertions, 2 deletions
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index 6d7a3cc948..26618ab607 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -154,6 +154,8 @@ libmesagallium_la_LIBADD = \
$(top_builddir)/src/compiler/glsl/libglsl.la \
$(ARCH_LIBS)
+libmesagallium_la_CFLAGS = $(AM_CFLAGS) $(SSE41_CFLAGS) -mavx -mavx2
+
libmesa_sse41_la_SOURCES = \
$(X86_SSE41_FILES)
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index a49ad953c0..8e9c65584b 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -497,6 +497,8 @@ STATETRACKER_FILES = \
state_tracker/st_glsl_types.h \
state_tracker/st_manager.c \
state_tracker/st_manager.h \
+ state_tracker/st_memcpy_read.c \
+ state_tracker/st_memcpy_read.h \
state_tracker/st_mesa_to_tgsi.c \
state_tracker/st_mesa_to_tgsi.h \
state_tracker/st_nir.h \
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 4c2f2ce768..cbecc787ab 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -41,6 +41,7 @@
#include "st_context.h"
#include "st_cb_bitmap.h"
#include "st_cb_readpixels.h"
+#include "st_memcpy_read.h"
#include "state_tracker/st_cb_texture.h"
#include "state_tracker/st_format.h"
#include "state_tracker/st_pbo.h"
@@ -519,6 +520,9 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y,
if (!window)
map += y * tex_xfer->stride + x * util_format_get_blocksize(dst_format);
+// static ubyte dummy[4 * 1024 * 1024];
+// map = dummy; //HACK
+
/* memcpy data into a user buffer */
{
const uint bytesPerRow = width * util_format_get_blocksize(dst_format);
@@ -528,12 +532,12 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y,
type, 0, 0);
if (tex_xfer->stride == bytesPerRow && destStride == bytesPerRow) {
- memcpy(dest, map, bytesPerRow * height);
+ (*st_memcpy_read)(dest, map, bytesPerRow * height);
} else {
GLuint row;
for (row = 0; row < (unsigned) height; row++) {
- memcpy(dest, map, bytesPerRow);
+ (*st_memcpy_read)(dest, map, bytesPerRow);
map += tex_xfer->stride;
dest += destStride;
}
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 997d428449..26b22ccd05 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -46,6 +46,7 @@
#include "st_cb_fbo.h"
#include "st_cb_flush.h"
#include "st_manager.h"
+#include "st_memcpy_read.h"
#include "state_tracker/st_gl_api.h"
@@ -969,5 +970,6 @@ static const struct st_api st_gl_api = {
struct st_api *
st_gl_api_create(void)
{
+ st_init_memcpy_read();
return (struct st_api *) &st_gl_api;
}
diff --git a/src/mesa/state_tracker/st_memcpy_read.c b/src/mesa/state_tracker/st_memcpy_read.c
new file mode 100644
index 0000000000..4f0ac112d9
--- /dev/null
+++ b/src/mesa/state_tracker/st_memcpy_read.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * memcpy implementations used for copying from temporary staging textures to
+ * user memory.
+ */
+
+#include "util/u_debug.h"
+#include "util/u_math.h"
+
+#include "st_memcpy_read.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#define ALIGN_SRC(d, s, len, align) \
+ do { \
+ if ((uintptr_t)s & ((align) - 1)) { \
+ uintptr_t missing = align - ((uintptr_t)s & (align - 1)); \
+ uintptr_t copy = MIN2(missing, len); \
+ memcpy(d, s, copy); \
+ d += copy; \
+ s += copy; \
+ len -= copy; \
+ } \
+ } while (0)
+
+#define ALIGN_DST(d, s, len, align) \
+ do { \
+ if ((uintptr_t)d & ((align) - 1)) { \
+ uintptr_t missing = align - ((uintptr_t)d & (align - 1)); \
+ uintptr_t copy = MIN2(missing, len); \
+ memcpy(d, s, copy); \
+ d += copy; \
+ s += copy; \
+ len -= copy; \
+ } \
+ } while (0)
+
+#define SSE2_DQU_DQA(d, s, len) \
+ do { \
+ __m128i *dst_cacheline = (__m128i *)d; \
+ __m128i *src_cacheline = (__m128i *)s; \
+\
+ __m128i temp1 = _mm_loadu_si128(src_cacheline + 0); \
+ __m128i temp2 = _mm_loadu_si128(src_cacheline + 1); \
+ __m128i temp3 = _mm_loadu_si128(src_cacheline + 2); \
+ __m128i temp4 = _mm_loadu_si128(src_cacheline + 3);\
+\
+ _mm_store_si128(dst_cacheline + 0, temp1); \
+ _mm_store_si128(dst_cacheline + 1, temp2); \
+ _mm_store_si128(dst_cacheline + 2, temp3); \
+ _mm_store_si128(dst_cacheline + 3, temp4); \
+\
+ d += 64; \
+ s += 64; \
+ len -= 64; \
+ } while (len >= 64)
+
+#define SSE2_DQA_DQA(d, s, len) \
+ do { \
+ __m128i *dst_cacheline = (__m128i *)d; \
+ __m128i *src_cacheline = (__m128i *)s; \
+\
+ __m128i temp1 = _mm_load_si128(src_cacheline + 0); \
+ __m128i temp2 = _mm_load_si128(src_cacheline + 1); \
+ __m128i temp3 = _mm_load_si128(src_cacheline + 2); \
+ __m128i temp4 = _mm_load_si128(src_cacheline + 3);\
+\
+ _mm_store_si128(dst_cacheline + 0, temp1); \
+ _mm_store_si128(dst_cacheline + 1, temp2); \
+ _mm_store_si128(dst_cacheline + 2, temp3); \
+ _mm_store_si128(dst_cacheline + 3, temp4); \
+\
+ d += 64; \
+ s += 64; \
+ len -= 64; \
+ } while (len >= 64)
+
+#define SSE2_DQA_DQU(d, s, len) \
+ do { \
+ __m128i *dst_cacheline = (__m128i *)d; \
+ __m128i *src_cacheline = (__m128i *)s; \
+\
+ __m128i temp1 = _mm_load_si128(src_cacheline + 0); \
+ __m128i temp2 = _mm_load_si128(src_cacheline + 1); \
+ __m128i temp3 = _mm_load_si128(src_cacheline + 2); \
+ __m128i temp4 = _mm_load_si128(src_cacheline + 3);\
+\
+ _mm_storeu_si128(dst_cacheline + 0, temp1); \
+ _mm_storeu_si128(dst_cacheline + 1, temp2); \
+ _mm_storeu_si128(dst_cacheline + 2, temp3); \
+ _mm_storeu_si128(dst_cacheline + 3, temp4); \
+\
+ d += 64; \
+ s += 64; \
+ len -= 64; \
+ } while (len >= 64)
+
+#define SSE41_NTDQA_DQA(d, s, len) \
+ do { \
+ __m128i *dst_cacheline = (__m128i *)d; \
+ __m128i *src_cacheline = (__m128i *)s; \
+\
+ __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); \
+ __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); \
+ __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); \
+ __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);\
+\
+ _mm_store_si128(dst_cacheline + 0, temp1); \
+ _mm_store_si128(dst_cacheline + 1, temp2); \
+ _mm_store_si128(dst_cacheline + 2, temp3); \
+ _mm_store_si128(dst_cacheline + 3, temp4); \
+\
+ d += 64; \
+ s += 64; \
+ len -= 64; \
+ } while (len >= 64)
+
+#define SSE41_NTDQA_DQU(d, s, len) \
+ do { \
+ __m128i *dst_cacheline = (__m128i *)d; \
+ __m128i *src_cacheline = (__m128i *)s; \
+\
+ __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); \
+ __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); \
+ __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); \
+ __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);\
+\
+ _mm_storeu_si128(dst_cacheline + 0, temp1); \
+ _mm_storeu_si128(dst_cacheline + 1, temp2); \
+ _mm_storeu_si128(dst_cacheline + 2, temp3); \
+ _mm_storeu_si128(dst_cacheline + 3, temp4); \
+\
+ d += 64; \
+ s += 64; \
+ len -= 64; \
+ } while (len >= 64)
+
+#define SSE41_DQA_NTDQ(d, s, len) \
+ do { \
+ __m128i *dst_cacheline = (__m128i *)d; \
+ __m128i *src_cacheline = (__m128i *)s; \
+\
+ __m128i temp1 = _mm_load_si128(src_cacheline + 0); \
+ __m128i temp2 = _mm_load_si128(src_cacheline + 1); \
+ __m128i temp3 = _mm_load_si128(src_cacheline + 2); \
+ __m128i temp4 = _mm_load_si128(src_cacheline + 3);\
+\
+ _mm_stream_si128(dst_cacheline + 0, temp1); \
+ _mm_stream_si128(dst_cacheline + 1, temp2); \
+ _mm_stream_si128(dst_cacheline + 2, temp3); \
+ _mm_stream_si128(dst_cacheline + 3, temp4); \
+\
+ d += 64; \
+ s += 64; \
+ len -= 64; \
+ } while (len >= 64)
+
+#define SSE41_DQU_NTDQ(d, s, len) \
+ do { \
+ __m128i *dst_cacheline = (__m128i *)d; \
+ __m128i *src_cacheline = (__m128i *)s; \
+\
+ __m128i temp1 = _mm_loadu_si128(src_cacheline + 0); \
+ __m128i temp2 = _mm_loadu_si128(src_cacheline + 1); \
+ __m128i temp3 = _mm_loadu_si128(src_cacheline + 2); \
+ __m128i temp4 = _mm_loadu_si128(src_cacheline + 3);\
+\
+ _mm_stream_si128(dst_cacheline + 0, temp1); \
+ _mm_stream_si128(dst_cacheline + 1, temp2); \
+ _mm_stream_si128(dst_cacheline + 2, temp3); \
+ _mm_stream_si128(dst_cacheline + 3, temp4); \
+\
+ d += 64; \
+ s += 64; \
+ len -= 64; \
+ } while (len >= 64)
+
+#define SSE41_NTDQA_NTDQ(d, s, len) \
+ do { \
+ __m128i *dst_cacheline = (__m128i *)d; \
+ __m128i *src_cacheline = (__m128i *)s; \
+\
+ __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); \
+ __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); \
+ __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); \
+ __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);\
+\
+ _mm_stream_si128(dst_cacheline + 0, temp1); \
+ _mm_stream_si128(dst_cacheline + 1, temp2); \
+ _mm_stream_si128(dst_cacheline + 2, temp3); \
+ _mm_stream_si128(dst_cacheline + 3, temp4); \
+\
+ d += 64; \
+ s += 64; \
+ len -= 64; \
+ } while (len >= 64)
+
+static void
+sse2_dqa_dqx(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_SRC(d, s, len, 16);
+
+ if (len >= 64) {
+ if ((uintptr_t)d & 15)
+ SSE2_DQA_DQU(d, s, len);
+ else
+ SSE2_DQA_DQA(d, s, len);
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+static void
+sse2_dqx_dqa(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_DST(d, s, len, 16);
+
+ if (len >= 64) {
+ if ((uintptr_t)s & 15)
+ SSE2_DQU_DQA(d, s, len);
+ else
+ SSE2_DQA_DQA(d, s, len);
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+static void
+sse41_ntdqa_dqu(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_SRC(d, s, len, 16);
+
+ if (len >= 64) {
+ _mm_mfence();
+
+ SSE41_NTDQA_DQU(d, s, len);
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+static void
+sse41_ntdqa_dqx(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_SRC(d, s, len, 16);
+
+ if (len >= 64) {
+ _mm_mfence();
+
+ if ((uintptr_t)d & 15)
+ SSE41_NTDQA_DQU(d, s, len);
+ else
+ SSE41_NTDQA_DQA(d, s, len);
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+static void
+sse41_dqu_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_DST(d, s, len, 16);
+
+ if (len >= 64) {
+ SSE41_DQU_NTDQ(d, s, len);
+
+ _mm_sfence();
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+static void
+sse41_dqx_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_DST(d, s, len, 16);
+
+ if (len >= 64) {
+ if ((uintptr_t)s & 15)
+ SSE41_DQU_NTDQ(d, s, len);
+ else
+ SSE41_DQA_NTDQ(d, s, len);
+
+ _mm_sfence();
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+static void
+sse41_ntdqx_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_DST(d, s, len, 16);
+
+ if (len >= 64) {
+ if ((uintptr_t)s & 15)
+ SSE41_DQU_NTDQ(d, s, len);
+ else {
+ _mm_mfence();
+
+ SSE41_NTDQA_NTDQ(d, s, len);
+ }
+
+ _mm_sfence();
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+#define AVX_DQA_NTDQ(d, s, len) \
+ do { \
+ __m256i *dst_cacheline = (__m256i *)d; \
+ __m256i *src_cacheline = (__m256i *)s; \
+\
+ __m256i temp0 = _mm256_load_si256(src_cacheline + 0); \
+ __m256i temp1 = _mm256_load_si256(src_cacheline + 1); \
+ __m256i temp2 = _mm256_load_si256(src_cacheline + 2); \
+ __m256i temp3 = _mm256_load_si256(src_cacheline + 3); \
+\
+ _mm256_stream_si256(dst_cacheline + 0, temp0); \
+ _mm256_stream_si256(dst_cacheline + 1, temp1); \
+ _mm256_stream_si256(dst_cacheline + 2, temp2); \
+ _mm256_stream_si256(dst_cacheline + 3, temp3); \
+\
+ d += 128; \
+ s += 128; \
+ len -= 128; \
+ } while (len >= 128);
+
+#define AVX_DQU_NTDQ(d, s, len) \
+ do { \
+ __m256i *dst_cacheline = (__m256i *)d; \
+ __m256i *src_cacheline = (__m256i *)s; \
+\
+ __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0); \
+ __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1); \
+ __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2); \
+ __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3); \
+\
+ _mm256_stream_si256(dst_cacheline + 0, temp0); \
+ _mm256_stream_si256(dst_cacheline + 1, temp1); \
+ _mm256_stream_si256(dst_cacheline + 2, temp2); \
+ _mm256_stream_si256(dst_cacheline + 3, temp3); \
+\
+ d += 128; \
+ s += 128; \
+ len -= 128; \
+ } while (len >= 128)
+
+#define AVX_NTDQA_NTDQ(d, s, len) \
+ do { \
+ __m256i *dst_cacheline = (__m256i *)d; \
+ __m256i *src_cacheline = (__m256i *)s; \
+\
+ __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0); \
+ __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1); \
+ __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2); \
+ __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3); \
+\
+ _mm256_stream_si256(dst_cacheline + 0, temp0); \
+ _mm256_stream_si256(dst_cacheline + 1, temp1); \
+ _mm256_stream_si256(dst_cacheline + 2, temp2); \
+ _mm256_stream_si256(dst_cacheline + 3, temp3); \
+\
+ d += 128; \
+ s += 128; \
+ len -= 128; \
+ } while (len >= 128)
+
+static void
+avx_dqu_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_DST(d, s, len, 32);
+
+ if (len >= 64) {
+ AVX_DQU_NTDQ(d, s, len);
+
+ _mm_sfence();
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+static void
+avx_dqx_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_DST(d, s, len, 32);
+
+ if (len >= 64) {
+ if ((uintptr_t)s & 31)
+ AVX_DQU_NTDQ(d, s, len);
+ else
+ AVX_DQA_NTDQ(d, s, len);
+
+ _mm_sfence();
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+static void
+avx_ntdqx_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ const char *restrict s = src;
+
+ ALIGN_DST(d, s, len, 32);
+
+ if (len >= 64) {
+ if ((uintptr_t)s & 31)
+ AVX_DQU_NTDQ(d, s, len);
+ else {
+ _mm_mfence();
+
+ AVX_NTDQA_NTDQ(d, s, len);
+ }
+
+ _mm_sfence();
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+#if 0
+static void
+memcpy_avx(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ char *restrict s = src;
+
+ /* memcpy() the misaligned header. At the end of this if block, <d> and <s>
+ * are aligned to a 16-byte boundary or <len> == 0.
+ */
+ if ((uintptr_t)s & 31) {
+ uintptr_t bytes_before_alignment_boundary = 32 - ((uintptr_t)s & 31);
+ assert(bytes_before_alignment_boundary < 32);
+
+ uintptr_t copy_bytes = MIN2(bytes_before_alignment_boundary, len);
+ memcpy(d, s, copy_bytes);
+
+ d += copy_bytes;
+ s += copy_bytes;
+ len -= copy_bytes;
+ }
+
+ if (len >= 128) {
+ _mm_mfence();
+
+ while (len >= 128) {
+ __m256i *dst_cacheline = (__m256i *)d;
+ __m256i *src_cacheline = (__m256i *)s;
+
+#if 0
+ __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0);
+ __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1);
+ __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2);
+ __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3);
+
+ *(dst_cacheline + 0) = temp0;
+ *(dst_cacheline + 1) = temp1;
+ *(dst_cacheline + 2) = temp2;
+ *(dst_cacheline + 3) = temp3;
+#elif 0
+ __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0);
+ __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1);
+ __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2);
+ __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3);
+
+ _mm256_storeu_si256(dst_cacheline + 0, temp0);
+ _mm256_storeu_si256(dst_cacheline + 1, temp1);
+ _mm256_storeu_si256(dst_cacheline + 2, temp2);
+ _mm256_storeu_si256(dst_cacheline + 3, temp3);
+#elif 1
+ __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0);
+ __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1);
+ __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2);
+ __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3);
+
+ _mm256_stream_si256(dst_cacheline + 0, temp0);
+ _mm256_stream_si256(dst_cacheline + 1, temp1);
+ _mm256_stream_si256(dst_cacheline + 2, temp2);
+ _mm256_stream_si256(dst_cacheline + 3, temp3);
+#else
+ _mm256_stream_si256(dst_cacheline + 0, *(src_cacheline + 0));
+ _mm256_stream_si256(dst_cacheline + 1, *(src_cacheline + 1));
+ _mm256_stream_si256(dst_cacheline + 2, *(src_cacheline + 2));
+ _mm256_stream_si256(dst_cacheline + 3, *(src_cacheline + 3));
+#endif
+
+ d += 128;
+ s += 128;
+ len -= 128;
+ }
+
+ _mm_sfence();
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+
+static void
+memcpy_avx_source_unaligned(void *restrict dst, const void *restrict src, size_t len)
+{
+ char *restrict d = dst;
+ char *restrict s = src;
+
+ /* memcpy() the misaligned header. At the end of this if block, <d> and <s>
+ * are aligned to a 16-byte boundary or <len> == 0.
+ */
+ if ((uintptr_t)d & 31) {
+ uintptr_t bytes_before_alignment_boundary = 32 - ((uintptr_t)d & 31);
+ assert(bytes_before_alignment_boundary < 32);
+
+ uintptr_t copy_bytes = MIN2(bytes_before_alignment_boundary, len);
+ memcpy(d, s, copy_bytes);
+
+ d += copy_bytes;
+ s += copy_bytes;
+ len -= copy_bytes;
+ }
+
+ if (len >= 256) {
+ _mm_mfence();
+
+#if 0
+ __asm__(
+ ".p2align 4\n"
+ "1:\n"
+ "vmovdqu (%1), %%ymm0\n"
+ "addq $64, %0\n"
+ "vmovdqu 32(%1), %%ymm1\n"
+ "addq $64, %1\n"
+ "subq $64, %2\n"
+ "vmovntdq %%ymm0, -64(%0)\n"
+ "vmovntdq %%ymm1, -32(%0)\n"
+ "cmpq $127, %2\n"
+ "ja 1b\n"
+ : "=r" (d), "=r" (s), "=r" (len)
+ : "0" (d), "1" (s), "2" (len)
+ : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory");
+#elif 1
+ __asm__(
+ ".p2align 4\n"
+ "1:\n"
+ "vmovntdqa (%1), %%ymm0\n"
+ "subq $-128, %0\n"
+ "vmovntdqa 32(%1), %%ymm1\n"
+ "vmovntdqa 64(%1), %%ymm2\n"
+ "vmovntdqa 96(%1), %%ymm3\n"
+ "subq $-128, %1\n"
+ "addq $-128, %2\n"
+ "vmovdqa %%ymm0, -128(%0)\n"
+ "vmovdqa %%ymm1, -96(%0)\n"
+ "vmovdqa %%ymm2, -64(%0)\n"
+ "vmovdqa %%ymm3, -32(%0)\n"
+ "cmpq $127, %2\n"
+ "ja 1b\n"
+ ".p2align 4\n"
+ "vzeroupper\n"
+ : "=r" (d), "=r" (s), "=r" (len)
+ : "0" (d), "1" (s), "2" (len)
+ : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory");
+#elif 0
+ __asm__(
+ ".p2align 4\n"
+ "1:\n"
+ "vmovdqu (%1), %%ymm0\n"
+ "addq $256, %0\n"
+ "vmovdqu 32(%1), %%ymm1\n"
+ "vmovdqu 64(%1), %%ymm2\n"
+ "vmovdqu 96(%1), %%ymm3\n"
+ "vmovdqu 128(%1), %%ymm4\n"
+ "vmovdqu 160(%1), %%ymm5\n"
+ "vmovdqu 192(%1), %%ymm6\n"
+ "vmovdqu 224(%1), %%ymm6\n"
+ "addq $256, %1\n"
+ "subq $256, %2\n"
+ "vmovntdq %%ymm0, -256(%0)\n"
+ "vmovntdq %%ymm1, -224(%0)\n"
+ "vmovntdq %%ymm2, -192(%0)\n"
+ "vmovntdq %%ymm3, -160(%0)\n"
+ "vmovntdq %%ymm4, -128(%0)\n"
+ "vmovntdq %%ymm5, -96(%0)\n"
+ "vmovntdq %%ymm6, -64(%0)\n"
+ "vmovntdq %%ymm7, -32(%0)\n"
+ "cmpq $255, %2\n"
+ "ja 1b\n"
+ : "=r" (d), "=r" (s), "=r" (len)
+ : "0" (d), "1" (s), "2" (len)
+ : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory");
+#else
+ while (len >= 128) {
+ __m256i *dst_cacheline = (__m256i *)d;
+ __m256i *src_cacheline = (__m256i *)s;
+
+// _mm_prefetch(s + 0x1c0, _MM_HINT_NTA);
+// _mm_prefetch(s + 0x280, _MM_HINT_NTA);
+
+#if 0
+ __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0);
+ __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1);
+ __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2);
+ __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3);
+
+ _mm256_stream_si256(dst_cacheline + 0, temp0);
+ _mm256_stream_si256(dst_cacheline + 1, temp1);
+ _mm256_stream_si256(dst_cacheline + 2, temp2);
+ _mm256_stream_si256(dst_cacheline + 3, temp3);
+#elif 0
+ __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0);
+ __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1);
+ __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2);
+ __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3);
+ __m256i temp4 = _mm256_loadu_si256(src_cacheline + 4);
+ __m256i temp5 = _mm256_loadu_si256(src_cacheline + 5);
+ __m256i temp6 = _mm256_loadu_si256(src_cacheline + 6);
+ __m256i temp7 = _mm256_loadu_si256(src_cacheline + 7);
+
+ _mm256_stream_si256(dst_cacheline + 0, temp0);
+ _mm256_stream_si256(dst_cacheline + 1, temp1);
+ _mm256_stream_si256(dst_cacheline + 2, temp2);
+ _mm256_stream_si256(dst_cacheline + 3, temp3);
+ _mm256_stream_si256(dst_cacheline + 4, temp4);
+ _mm256_stream_si256(dst_cacheline + 5, temp5);
+ _mm256_stream_si256(dst_cacheline + 6, temp6);
+ _mm256_stream_si256(dst_cacheline + 7, temp7);
+
+ d += 128;
+ s += 128;
+ len -= 128;
+#else
+ __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0);
+ __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1);
+ __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2);
+ __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3);
+
+ _mm256_store_si256(dst_cacheline + 0, temp0);
+ _mm256_store_si256(dst_cacheline + 1, temp1);
+ _mm256_store_si256(dst_cacheline + 2, temp2);
+ _mm256_store_si256(dst_cacheline + 3, temp3);
+#endif
+
+ d += 128;
+ s += 128;
+ len -= 128;
+ }
+#endif
+
+ _mm_sfence();
+ }
+
+ /* memcpy() the tail. */
+ if (len) {
+ memcpy(d, s, len);
+ }
+}
+#endif
+
+static void
+memcpy_wrapper(void *restrict dst, const void *restrict src, size_t len)
+{
+ memcpy(dst, src, len);
+}
+
+static const struct {
+ const char *name;
+ void (*ptr)(void *restrict dst, const void *restrict src, size_t len);
+} memcpy_table[] = {
+ { "sse2_dqx_dqa", sse2_dqx_dqa },
+ { "sse2_dqa_dqx", sse2_dqa_dqx },
+
+ { "sse41_ntdqa_dqu", sse41_ntdqa_dqu },
+ { "sse41_ntdqa_dqx", sse41_ntdqa_dqx },
+ { "sse41_dqu_ntdq", sse41_dqu_ntdq },
+ { "sse41_dqx_ntdq", sse41_dqx_ntdq },
+ { "sse41_ntdqx_ntdq", sse41_ntdqx_ntdq },
+
+ { "avx_dqu_ntdq", avx_dqu_ntdq },
+ { "avx_dqx_ntdq", avx_dqx_ntdq },
+
+ { "avx_ntdqx_ntdq", avx_ntdqx_ntdq },
+
+ { "memcpy", memcpy_wrapper }
+};
+
+void
+(*st_memcpy_read)(void *restrict dst, const void* restrict src, size_t len) = NULL;
+
+void
+st_init_memcpy_read()
+{
+ static const char envvar[] = "ST_MEMCPY_READ";
+ const char *name = debug_get_option(envvar, NULL);
+
+ if (name) {
+ for (unsigned i = 0; i < ARRAY_SIZE(memcpy_table); ++i) {
+ if (strcmp(name, memcpy_table[i].name) == 0) {
+ st_memcpy_read = memcpy_table[i].ptr;
+ return;
+ }
+ }
+
+ fprintf(stderr, "Unknown value of %s. Available options:\n", envvar);
+ for (unsigned i = 0; i < ARRAY_SIZE(memcpy_table); ++i) {
+ fprintf(stderr, "%s\n", memcpy_table[i].name);
+ }
+ }
+
+ st_memcpy_read = memcpy_wrapper;
+}
diff --git a/src/mesa/state_tracker/st_memcpy_read.h b/src/mesa/state_tracker/st_memcpy_read.h
new file mode 100644
index 0000000000..8014c5d5f8
--- /dev/null
+++ b/src/mesa/state_tracker/st_memcpy_read.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef ST_MEMCPY_READ_H
+#define ST_MEMCPY_READ_H
+
+extern void
+(*st_memcpy_read)(void *restrict dst, const void* restrict src, size_t len);
+
+void
+st_init_memcpy_read(void);
+
+#endif /* ST_MEMCPY_READ_H */