summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Wilson <chris@chris-wilson.co.uk>2013-01-01 19:41:54 +0000
committerChris Wilson <chris@chris-wilson.co.uk>2013-01-27 14:04:15 +0000
commit7ced3beec99e9965717f76cc822d0702383a1fce (patch)
treea7e5082f7f2aa0979db287b98cceb43b2c00544c
parentb7f523e3bcbef1f08bf9b374f2704723d5298c1f (diff)
sse2: Add a fast path for add_n_8888
This path is being exercised by inplace compositing of trapezoids, for instance as used in the firefox-asteroids cairo-trace. IVB i3-3720qm ./tests/lowlevel-blt-bench add_n_888: reference memcpy speed = 14918.3MB/s (3729.6MP/s for 32bpp fills) before: L1:1752.44 L2:2259.48 M:2215.73 ( 58.80%) HT:589.49 VT:404.04 R:424.69 RT:134.68 (1182Kops/s) after: L1:3931.21 L2:6132.78 M:3440.17 ( 92.24%) HT:1337.70 VT:1357.64 R:1270.27 RT:359.78 (2161Kops/s) Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-rw-r--r--pixman/pixman-sse2.c65
1 files changed, 65 insertions, 0 deletions
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 5a0e062..f4a7d51 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -4523,9 +4523,70 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
sse2_combine_add_u (imp, op, dst, src, NULL, width);
}
+}
+
+static void
+sse2_composite_add_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst, src;
+ int dst_stride;
+
+ __m128i xmm_src;
+
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+ if (src == 0)
+ return;
+
+ if (src == ~0)
+ {
+ pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
+ dest_x, dest_y, width, height, ~0);
+
+ return;
+ }
+
+ xmm_src = _mm_set_epi32 (src, src, src, src);
+ while (height--)
+ {
+ int w = width;
+ uint32_t d;
+
+ dst = dst_line;
+ dst_line += dst_stride;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+ *dst++ =
+ _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ save_128_aligned
+ ((__m128i*)dst,
+ _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
+
+ dst += 4;
+ w -= 4;
+ }
+ while (w--)
+ {
+ d = *dst;
+ *dst++ =
+ _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
+ _mm_cvtsi32_si128 (d)));
+ }
+ }
}
+
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
uint32_t * src_bits,
@@ -5848,6 +5909,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
/* PIXMAN_OP_SRC */
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),