diff options
author | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2012-06-25 22:36:52 -0400 |
---|---|---|
committer | Søren Sandmann Pedersen <ssp@redhat.com> | 2012-09-26 00:03:10 -0400 |
commit | aff796d6cee4cb81f0352c2f7d0c994229bd5ca1 (patch) | |
tree | f82cdf4e2aa7ae7b92d8b2e2a8dbe4610fa07db5 /pixman/pixman-sse2.c | |
parent | 05560828c495ed9226b43b30e1824447e3d8eff3 (diff) |
Add scaled nearest repeat fast paths
Before this patch, it was often faster to scale and repeat
in two passes, because each pass could use a fast path, whereas
the single-pass approach fell back to the slow path. This patch
adds scaled nearest fast paths for repeated (NORMAL) sources, so
that the single-pass approach has competitive performance.
Diffstat (limited to 'pixman/pixman-sse2.c')
-rw-r--r-- | pixman/pixman-sse2.c | 66 |
1 files changed, 52 insertions, 14 deletions
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index cf21ef8..efed310 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -5159,7 +5159,7 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, int32_t w, pixman_fixed_t vx, pixman_fixed_t unit_x, - pixman_fixed_t max_vx, + pixman_fixed_t src_width_fixed, pixman_bool_t fully_transparent_src) { uint32_t s, d; @@ -5176,8 +5176,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, while (w && ((unsigned long)pd & 15)) { d = *pd; - s = combine1 (ps + (vx >> 16), pm); + s = combine1 (ps + pixman_fixed_to_int (vx), pm); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; *pd++ = core_combine_over_u_pixel_sse2 (s, d); if (pm) @@ -5190,14 +5192,22 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, __m128i tmp; uint32_t tmp1, tmp2, tmp3, tmp4; - tmp1 = ps[vx >> 16]; + tmp1 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = ps[vx >> 16]; + while (vx >= 0) + vx -= src_width_fixed; + tmp2 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; - tmp3 = ps[vx >> 16]; + while (vx >= 0) + vx -= src_width_fixed; + tmp3 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; - tmp4 = ps[vx >> 16]; + while (vx >= 0) + vx -= src_width_fixed; + tmp4 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); @@ -5235,8 +5245,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, while (w) { d = *pd; - s = combine1 (ps + (vx >> 16), pm); + s = combine1 (ps + pixman_fixed_to_int (vx), pm); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; *pd++ = core_combine_over_u_pixel_sse2 (s, d); if (pm) @@ -5255,6 +5267,9 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, scaled_nearest_scanline_sse2_8888_8888_OVER, uint32_t, uint32_t, PAD) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, 
uint32_t, NORMAL) static force_inline void scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, @@ -5263,7 +5278,7 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, int32_t w, pixman_fixed_t vx, pixman_fixed_t unit_x, - pixman_fixed_t max_vx, + pixman_fixed_t src_width_fixed, pixman_bool_t zero_src) { __m128i xmm_mask; @@ -5278,8 +5293,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, while (w && (unsigned long)dst & 15) { - uint32_t s = src[pixman_fixed_to_int (vx)]; + uint32_t s = *(src + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; if (s) { @@ -5301,14 +5318,22 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, { uint32_t tmp1, tmp2, tmp3, tmp4; - tmp1 = src[pixman_fixed_to_int (vx)]; + tmp1 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; + while (vx >= 0) + vx -= src_width_fixed; + tmp2 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp3 = src[pixman_fixed_to_int (vx)]; + while (vx >= 0) + vx -= src_width_fixed; + tmp3 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp4 = src[pixman_fixed_to_int (vx)]; + while (vx >= 0) + vx -= src_width_fixed; + tmp4 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); @@ -5336,8 +5361,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, while (w) { - uint32_t s = src[pixman_fixed_to_int (vx)]; + uint32_t s = *(src + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; if (s) { @@ -5367,6 +5394,9 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, scaled_nearest_scanline_sse2_8888_n_8888_OVER, uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, + 
scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1) @@ -5856,11 +5886,19 @@ static const pixman_fast_path_t sse2_fast_paths[] = SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |