author     Chris Wilson <chris@chris-wilson.co.uk>    2013-01-01 19:41:54 +0000
committer  Chris Wilson <chris@chris-wilson.co.uk>    2013-01-27 14:04:15 +0000
commit     d00ce4091215e8a648c6f1912829b35c02b06add (patch)
tree       f29d60de5cb616af4dc8af79c744385f5c997e75
parent     7ced3beec99e9965717f76cc822d0702383a1fce (diff)
sse2: Add a fast path for add_n_8_8888
This path is being exercised by compositing of trapezoids for clipmasks,
for instance as used in the firefox-asteroids cairo-trace.

IVB i7-3720qm ./tests/lowlevel-blt-bench add_n_8_8888:
reference memcpy speed = 14846.7MB/s (3711.7MP/s for 32bpp fills)

before: L1: 681.10  L2: 735.14  M:701.44 ( 28.35%)  HT:283.32  VT:213.23  R:208.93  RT: 77.89 ( 793Kops/s)
after:  L1: 992.91  L2:1017.33  M:982.58 ( 39.88%)  HT:458.93  VT:332.32  R:326.13  RT:136.66 (1287Kops/s)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-rw-r--r--   pixman/pixman-sse2.c   99
1 files changed, 99 insertions, 0 deletions
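As context, and not part of the patch itself, the following is a minimal sketch of a composite call that should be routed through the new fast path: PIXMAN_OP_ADD with a solid source, an a8 mask and an a8r8g8b8 destination. The image size, colour and mask fill pattern are arbitrary choices for illustration.

#include <stdint.h>
#include <pixman.h>

int
main (void)
{
    /* Fully opaque red as a 16-bit-per-channel pixman colour. */
    pixman_color_t red = { 0xffff, 0x0000, 0x0000, 0xffff };
    static uint32_t mask_bits[64 * 64 / 4];    /* a8: one byte per pixel   */
    static uint32_t dest_bits[64 * 64];        /* a8r8g8b8: one word/pixel */
    uint8_t *m8 = (uint8_t *) mask_bits;
    pixman_image_t *src, *mask, *dest;
    int i;

    for (i = 0; i < 64 * 64; i++)
    {
        m8[i] = (uint8_t) (i & 0xff);          /* arbitrary coverage values */
        dest_bits[i] = 0xff000000;             /* opaque black destination  */
    }

    src  = pixman_image_create_solid_fill (&red);
    mask = pixman_image_create_bits (PIXMAN_a8, 64, 64, mask_bits, 64);
    dest = pixman_image_create_bits (PIXMAN_a8r8g8b8, 64, 64, dest_bits, 64 * 4);

    /* dest = dest ADD (src IN mask); with SSE2 available this should be
     * dispatched to sse2_composite_add_n_8_8888 via the fast-path table. */
    pixman_image_composite32 (PIXMAN_OP_ADD, src, mask, dest,
                              0, 0, 0, 0, 0, 0, 64, 64);

    pixman_image_unref (src);
    pixman_image_unref (mask);
    pixman_image_unref (dest);
    return 0;
}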
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index f4a7d51..ff8c946 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -4586,6 +4586,101 @@ sse2_composite_add_n_8888 (pixman_implementation_t *imp,
}
}
+static void
+sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+
+    __m128i xmm_src;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+        return;
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+        w = width;
+
+        while (w && ((unsigned long)dst & 15))
+        {
+            uint8_t m = *mask++;
+            if (m)
+            {
+                *dst = pack_1x128_32
+                    (_mm_adds_epu16
+                     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+                      unpack_32_1x128 (*dst)));
+            }
+            dst++;
+            w--;
+        }
+
+        while (w >= 4)
+        {
+            uint32_t m = *(uint32_t*)mask;
+            if (m)
+            {
+                __m128i xmm_mask_lo, xmm_mask_hi;
+                __m128i xmm_dst_lo, xmm_dst_hi;
+
+                __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
+                __m128i xmm_mask =
+                    _mm_unpacklo_epi8 (unpack_32_1x128 (m),
+                                       _mm_setzero_si128 ());
+
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                        &xmm_mask_lo, &xmm_mask_hi);
+
+                pix_multiply_2x128 (&xmm_src, &xmm_src,
+                                    &xmm_mask_lo, &xmm_mask_hi,
+                                    &xmm_mask_lo, &xmm_mask_hi);
+
+                xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+                xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+                save_128_aligned (
+                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            w -= 4;
+            dst += 4;
+            mask += 4;
+        }
+
+        while (w)
+        {
+            uint8_t m = *mask++;
+            if (m)
+            {
+                *dst = pack_1x128_32
+                    (_mm_adds_epu16
+                     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+                      unpack_32_1x128 (*dst)));
+            }
+            dst++;
+            w--;
+        }
+    }
+}
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
@@ -5913,6 +6008,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
/* PIXMAN_OP_SRC */
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
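
For reference, and not part of the patch, here is a scalar sketch of what the new path computes for each a8r8g8b8 destination pixel: each channel of the solid source is scaled by the 8-bit mask value using pixman's rounded multiply, then added to the destination channel with the result clamped to 255. The helper names below (mul_un8, add_un8, add_n_8_8888_scalar) are invented for this sketch.

#include <stdint.h>

/* Rounded (a * b) / 255, the same result as pixman's MUL_UN8. */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) (a * b) + 0x80;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}

/* Per-channel add, clamped to 255. */
static uint8_t
add_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) a + b;
    return t > 0xff ? 0xff : (uint8_t) t;
}

/* dest[i] = dest[i] ADD (src IN mask[i]), channel by channel. */
static void
add_n_8_8888_scalar (uint32_t src, const uint8_t *mask, uint32_t *dest, int w)
{
    int i, shift;

    for (i = 0; i < w; i++)
    {
        uint8_t m = mask[i];

        if (!m)                 /* the SSE2 path also skips zero coverage */
            continue;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t s = (uint8_t) (src >> shift);
            uint8_t d = (uint8_t) (dest[i] >> shift);
            uint8_t r = add_un8 (d, mul_un8 (s, m));

            dest[i] = (dest[i] & ~(0xffu << shift)) | ((uint32_t) r << shift);
        }
    }
}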