summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matt Turner <mattst88@gmail.com>, 2012-06-19 00:30:51 -0400
committer: Matt Turner <mattst88@gmail.com>, 2012-07-01 16:33:08 -0400
commit: 9209cd746b7a81d0536df6dadd6a0b0b983291cb (patch)
tree: 984626d908f10f256cfdae70df550a053c8034cc
parent: 51f27d7364d66e47d882ee531b6655368159231a (diff)
mmx: add scaled bilinear src_8888_8888
Loongson:
    image firefox-planet-gnome 170.025 170.229 0.09% 3/4
    image firefox-planet-gnome 157.012 158.087 0.30% 6/6

ARM/iwMMXt:
    image firefox-planet-gnome 164.192 164.875 0.34% 3/4
    image firefox-planet-gnome 148.086 149.339 0.76% 6/6
-rw-r--r--  pixman/loongson-mmintrin.h  73
-rw-r--r--  pixman/pixman-mmx.c         96
2 files changed, 169 insertions, 0 deletions
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 1a114fe..f0931ac 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -45,6 +45,28 @@ _mm_setzero_si64 (void)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2) /* Loongson counterpart of the MMX intrinsic of the same name: returns the result of `paddh` on __m1 and __m2 */
+{
+ __m64 ret;
+ asm("paddh %0, %1, %2\n\t" /* %0 = ret, %1 = __m1, %2 = __m2; presumably a wrapping add on each 16-bit ("halfword") lane -- confirm in the Loongson ISA manual */
+ : "=f" (ret) /* output, constrained to an "f" register */
+ : "f" (__m1), "f" (__m2) /* inputs, also in "f" registers */
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2) /* Loongson counterpart of the MMX intrinsic of the same name: returns the result of `paddw` on __m1 and __m2 */
+{
+ __m64 ret;
+ asm("paddw %0, %1, %2\n\t" /* note: here the 'w' suffix backs the 32-bit intrinsic, whereas x86 `paddw` is the 16-bit add */
+ : "=f" (ret) /* output, constrained to an "f" register */
+ : "f" (__m1), "f" (__m2) /* inputs, also in "f" registers */
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
__m64 ret;
@@ -150,6 +172,35 @@ _mm_packs_pu16 (__m64 __m1, __m64 __m2)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2) /* Loongson counterpart of the MMX intrinsic of the same name: returns the result of `packsswh` on __m1 and __m2 */
+{
+ __m64 ret;
+ asm("packsswh %0, %1, %2\n\t" /* presumably packs the 32-bit lanes of both operands into four 16-bit lanes with signed saturation -- confirm in the Loongson ISA manual */
+ : "=f" (ret) /* output, constrained to an "f" register */
+ : "f" (__m1), "f" (__m2) /* inputs, also in "f" registers */
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0) /* builds an __m64 from four 16-bit values; arguments wider than 16 bits are truncated by the parameter type */
+{
+ uint64_t val = ((uint64_t)__w3 << 48) /* __w3 occupies bits 63..48 */
+ | ((uint64_t)__w2 << 32) /* __w2 occupies bits 47..32 */
+ | ((uint64_t)__w1 << 16) /* __w1 occupies bits 31..16 */
+ | ((uint64_t)__w0 << 0); /* __w0 occupies bits 15..0 */
+ return *(__m64 *)&val; /* reinterprets the 64 bits as __m64 without converting the value; NOTE(review): pointer-cast type pun, relies on the compiler tolerating the aliasing */
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (unsigned __i1, unsigned __i0) /* builds an __m64 from two unsigned values: __i1 in the upper half, __i0 in the lower half */
+{
+ uint64_t val = ((uint64_t)__i1 << 32) /* widened before shifting so the upper half is not lost */
+ | ((uint64_t)__i0 << 0); /* lower half */
+ return *(__m64 *)&val; /* reinterprets the 64 bits as __m64 without converting the value; NOTE(review): pointer-cast type pun, relies on the compiler tolerating the aliasing */
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
__m64 ret;
@@ -193,6 +244,17 @@ _mm_srli_pi16 (__m64 __m, int64_t __count)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int64_t __count) /* Loongson counterpart of the MMX intrinsic of the same name: returns the result of `psrlw` on __m with shift count __count */
+{
+ __m64 ret;
+ asm("psrlw %0, %1, %2\n\t" /* presumably a logical right shift of each 32-bit lane -- confirm in the Loongson ISA manual */
+ : "=f" (ret) /* output, constrained to an "f" register */
+ : "f" (__m), "f" (*(__m64 *)&__count) /* the integer count is reinterpreted (not converted) as __m64 so it can be passed under the "f" constraint */
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
__m64 ret;
@@ -204,6 +266,17 @@ _mm_srli_si64 (__m64 __m, int64_t __count)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2) /* Loongson counterpart of the MMX intrinsic of the same name: returns the result of `psubh` on __m1 and __m2 */
+{
+ __m64 ret;
+ asm("psubh %0, %1, %2\n\t" /* %0 = ret, %1 = __m1, %2 = __m2; presumably __m1 - __m2 on each 16-bit ("halfword") lane -- confirm in the Loongson ISA manual */
+ : "=f" (ret) /* output, constrained to an "f" register */
+ : "f" (__m1), "f" (__m2) /* inputs, also in "f" registers */
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 0ebe119..018a2ba 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -42,6 +42,7 @@
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
+#include "pixman-inlines.h"
#define no_vERBOSE
@@ -3502,6 +3503,94 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
_mm_empty ();
}
+#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) /* despite the name this is a value, not a shift count: the total weight from which the fractional x position is subtracted to get the left-pixel weight */
+
+#define BILINEAR_DECLARE_VARIABLES /* declares the per-scanline constants; expects wt, wb, unit_x and vx to be in scope at the expansion site */ \
+ const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); /* weight applied to the src_top pixels, replicated in all four 16-bit lanes */ \
+ const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); /* weight applied to the src_bottom pixels */ \
+ const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); /* 1 << BILINEAR_INTERPOLATION_BITS in every lane */ \
+ const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); /* per-pixel x step; only its low 16 bits fit in a lane */ \
+ const __m64 mm_zero = _mm_setzero_si64 (); /* zero operand used when unpacking bytes to 16-bit lanes */ \
+ __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) /* running x position, low 16 bits only; advanced by mm_ux after every pixel */
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) /* computes one bilinearly filtered pixel into `pix`, then advances vx and mm_x by one step; relies on the names declared by BILINEAR_DECLARE_VARIABLES plus src_top, src_bottom, vx and unit_x */ \
+do { \
+ /* fetch 2x2 pixel block into 2 mmx registers: each 64-bit load covers the two adjacent 32-bit pixels starting at the integer part of vx (t from src_top, b from src_bottom) */ \
+ __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \
+ __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \
+ vx += unit_x; \
+ /* vertical interpolation: widen every 8-bit channel to 16 bits by interleaving with zero, scale the top row by wt and the bottom row by wb, and add them */ \
+ __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
+ __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
+ __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \
+ __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
+ __m64 hi = _mm_add_pi16 (t_hi, b_hi); /* upper 4 bytes of the load; on a little-endian target this is the pixel at x + 1 */ \
+ __m64 lo = _mm_add_pi16 (t_lo, b_lo); /* lower 4 bytes of the load; on a little-endian target this is the pixel at x */ \
+ /* calculate horizontal weights: the top BILINEAR_INTERPOLATION_BITS bits of the 16-bit x fraction weight `hi`, and BSHIFT minus that weights `lo` */ \
+ __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS)); \
+ __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \
+ 16 - BILINEAR_INTERPOLATION_BITS); \
+ mm_x = _mm_add_pi16 (mm_x, mm_ux); \
+ /* horizontal interpolation: mullo/mulhi give the low and high 16 bits of each 16x16 product; interleaving them rebuilds 32-bit products, which are then summed across the two pixels */ \
+ __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \
+ __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \
+ __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \
+ __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \
+ lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \
+ _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \
+ hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \
+ _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \
+ /* shift and pack the result: drop the 2 * BILINEAR_INTERPOLATION_BITS bits of weight scaling (one factor per axis), then narrow 32 -> 16 -> 8 bits per channel */ \
+ hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \
+ lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \
+ lo = _mm_packs_pi32 (lo, hi); \
+ lo = _mm_packs_pu16 (lo, lo); \
+ store (&pix, lo); /* NOTE(review): `store` is defined outside this hunk; presumably writes the low 32 bits of lo to pix -- confirm */ \
+} while (0)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst, /* output scanline; receives w pixels */
+ const uint32_t * mask, /* not referenced in this function */
+ const uint32_t * src_top, /* upper source row, read by BILINEAR_INTERPOLATE_ONE_PIXEL */
+ const uint32_t * src_bottom, /* lower source row, read by BILINEAR_INTERPOLATE_ONE_PIXEL */
+ int32_t w, /* number of pixels to produce */
+ int wt, /* weight of the upper row */
+ int wb, /* weight of the lower row */
+ pixman_fixed_t vx, /* starting x position in fixed point */
+ pixman_fixed_t unit_x, /* x increment per output pixel */
+ pixman_fixed_t max_vx, /* not referenced in this function */
+ pixman_bool_t zero_src) /* not referenced in this function */
+{
+ BILINEAR_DECLARE_VARIABLES; /* sets up mm_wt, mm_wb, mm_BSHIFT, mm_ux, mm_zero and mm_x from the parameters above */
+ uint32_t pix;
+
+ while (w--) /* one output pixel per iteration; the macro below also advances vx and mm_x */
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
+ *dst++ = pix;
+ }
+
+ _mm_empty (); /* called once after all MMX work in this function is done */
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, /* four instantiations, differing only in name and in the COVER / PAD / NONE / NORMAL argument; all use the same scanline function */
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t, /* NOTE(review): presumably the source, mask and destination pixel types -- confirm against the macro definition, which is not in this diff */
+ COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
+ scaled_bilinear_scanline_mmx_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_NONE)
+
static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
@@ -3757,6 +3846,13 @@ static const pixman_fast_path_t mmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+
{ PIXMAN_OP_NONE },
};