path: root/pixman/pixman-sse2.c
author     Siarhei Siamashka <siarhei.siamashka@gmail.com>    2012-06-25 22:36:52 -0400
committer  Søren Sandmann Pedersen <ssp@redhat.com>    2012-09-26 00:03:10 -0400
commit     aff796d6cee4cb81f0352c2f7d0c994229bd5ca1 (patch)
tree       f82cdf4e2aa7ae7b92d8b2e2a8dbe4610fa07db5 /pixman/pixman-sse2.c
parent     05560828c495ed9226b43b30e1824447e3d8eff3 (diff)
Add scaled nearest repeat fast paths
Before this patch it was often faster to scale and repeat in two passes, because each pass could use a fast path while the single-pass approach fell back to the general (slow) path. With these fast paths added, the single-pass approach now has competitive performance.
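The new fast paths handle NORMAL (tiled) repeat by wrapping the fixed-point source coordinate vx back into the source width after every step, which is the pattern visible throughout the diff below (while (vx >= 0) vx -= src_width_fixed;). Here is a minimal sketch of that wrap-around nearest-neighbour sampling, not pixman's actual code: it uses plain 16.16 fixed point, keeps vx in [0, src_width_fixed), and omits the OVER compositing and SSE2 batching. The patch itself biases vx so the wrap test is simply "vx >= 0", with the source pointer set up accordingly by the FAST_NEAREST_MAINLOOP macros (not shown in this diff).

    /*
     * Sketch only: nearest-neighbour scaling of one scanline with
     * NORMAL (tiled) repeat, in 16.16 fixed point.
     */
    #include <stdint.h>

    #define fixed_to_int(f)  ((int32_t) ((f) >> 16))

    static void
    scale_nearest_repeat_scanline (uint32_t       *dst,
                                   const uint32_t *src,
                                   int32_t         width,
                                   int32_t         vx,              /* 16.16, in [0, src_width_fixed) */
                                   int32_t         unit_x,          /* 16.16 step per destination pixel */
                                   int32_t         src_width_fixed) /* source width << 16 */
    {
        while (width--)
        {
            *dst++ = src[fixed_to_int (vx)];

            vx += unit_x;
            /* Wrap back into the source tile instead of reading past it. */
            while (vx >= src_width_fixed)
                vx -= src_width_fixed;
        }
    }

A conditional-subtract loop rather than a per-pixel modulo is presumably used because unit_x is normally much smaller than the source width, so the inner loop body runs at most once per pixel in the common case.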
Diffstat (limited to 'pixman/pixman-sse2.c')
-rw-r--r--  pixman/pixman-sse2.c  66
1 file changed, 52 insertions, 14 deletions
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index cf21ef8..efed310 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5159,7 +5159,7 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
int32_t w,
pixman_fixed_t vx,
pixman_fixed_t unit_x,
- pixman_fixed_t max_vx,
+ pixman_fixed_t src_width_fixed,
pixman_bool_t fully_transparent_src)
{
uint32_t s, d;
@@ -5176,8 +5176,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
while (w && ((unsigned long)pd & 15))
{
d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
*pd++ = core_combine_over_u_pixel_sse2 (s, d);
if (pm)
@@ -5190,14 +5192,22 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
__m128i tmp;
uint32_t tmp1, tmp2, tmp3, tmp4;
- tmp1 = ps[vx >> 16];
+ tmp1 = *(ps + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp2 = ps[vx >> 16];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp2 = *(ps + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp3 = ps[vx >> 16];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp3 = *(ps + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp4 = ps[vx >> 16];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp4 = *(ps + pixman_fixed_to_int (vx));
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
@@ -5235,8 +5245,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
while (w)
{
d = *pd;
- s = combine1 (ps + (vx >> 16), pm);
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
*pd++ = core_combine_over_u_pixel_sse2 (s, d);
if (pm)
@@ -5255,6 +5267,9 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
scaled_nearest_scanline_sse2_8888_8888_OVER,
uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, NORMAL)
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
@@ -5263,7 +5278,7 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
int32_t w,
pixman_fixed_t vx,
pixman_fixed_t unit_x,
- pixman_fixed_t max_vx,
+ pixman_fixed_t src_width_fixed,
pixman_bool_t zero_src)
{
__m128i xmm_mask;
@@ -5278,8 +5293,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
while (w && (unsigned long)dst & 15)
{
- uint32_t s = src[pixman_fixed_to_int (vx)];
+ uint32_t s = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
if (s)
{
@@ -5301,14 +5318,22 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
{
uint32_t tmp1, tmp2, tmp3, tmp4;
- tmp1 = src[pixman_fixed_to_int (vx)];
+ tmp1 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp2 = src[pixman_fixed_to_int (vx)];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp2 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp3 = src[pixman_fixed_to_int (vx)];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp3 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
- tmp4 = src[pixman_fixed_to_int (vx)];
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp4 = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
@@ -5336,8 +5361,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
while (w)
{
- uint32_t s = src[pixman_fixed_to_int (vx)];
+ uint32_t s = *(src + pixman_fixed_to_int (vx));
vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
if (s)
{
@@ -5367,6 +5394,9 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
scaled_nearest_scanline_sse2_8888_n_8888_OVER,
uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
@@ -5856,11 +5886,19 @@ static const pixman_fast_path_t sse2_fast_paths[] =
SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),