summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matt Turner <mattst88@gmail.com> 2012-04-18 16:14:08 -0400
committer: Matt Turner <mattst88@gmail.com> 2012-04-27 13:42:26 -0400
commit: c2b1630d9603f80c2636e8a8bfebca87707d4235 (patch)
tree: 82a7cf47f11d2f7bc178ac256aeb365fac173c57
parent: 20bad64d9a7ff5c2662f12a87f66fcf77c1f3f2c (diff)
mmx: add a8 fetcher
oprofile of xfce4-terminal-a1:
210535  9.0407  libpixman-1.so.0.25.3  fetch_scanline_a8
144802  6.0054  libpixman-1.so.0.25.3  mmx_fetch_a8

(Each benchmark below is listed twice; the two lines appear to be the results before and after this change.)

Loongson:
add_8_8_8 = L1: 17.98 L2: 17.28 M: 14.28 ( 19.79%) HT: 11.11 VT: 10.38 R: 9.97 RT: 5.14 ( 55Kops/s)
add_8_8_8 = L1: 20.44 L2: 19.65 M: 15.62 ( 21.53%) HT: 12.86 VT: 11.98 R: 11.32 RT: 6.13 ( 64Kops/s)
src_8888_8_0565 = L1: 19.97 L2: 18.59 M: 13.42 ( 32.55%) HT: 11.46 VT: 10.78 R: 10.33 RT: 5.87 ( 61Kops/s)
src_8888_8_0565 = L1: 21.16 L2: 19.68 M: 13.94 ( 33.64%) HT: 12.31 VT: 11.52 R: 11.02 RT: 6.54 ( 68Kops/s)
src_x888_8_x888 = L1: 20.54 L2: 18.88 M: 13.07 ( 40.74%) HT: 11.05 VT: 10.36 R: 10.02 RT: 5.68 ( 60Kops/s)
src_x888_8_x888 = L1: 21.92 L2: 20.15 M: 13.35 ( 41.42%) HT: 11.70 VT: 10.95 R: 10.53 RT: 6.18 ( 65Kops/s)
over_x888_8_0565 = L1: 10.32 L2: 9.85 M: 7.63 ( 21.13%) HT: 6.56 VT: 6.30 R: 6.12 RT: 3.80 ( 43Kops/s)
over_x888_8_0565 = L1: 10.64 L2: 10.17 M: 7.74 ( 21.35%) HT: 6.83 VT: 6.55 R: 6.34 RT: 4.03 ( 46Kops/s)

ARM/iwMMXt:
add_8_8_8 = L1: 13.10 L2: 11.67 M: 10.74 ( 13.46%) HT: 8.62 VT: 8.15 R: 7.94 RT: 4.39 ( 44Kops/s)
add_8_8_8 = L1: 13.81 L2: 12.79 M: 11.63 ( 13.93%) HT: 9.33 VT: 9.20 R: 9.04 RT: 5.43 ( 52Kops/s)
src_8888_8_0565 = L1: 16.62 L2: 15.07 M: 12.52 ( 27.46%) HT: 10.07 VT: 10.17 R: 9.95 RT: 5.64 ( 54Kops/s)
src_8888_8_0565 = L1: 16.84 L2: 16.11 M: 13.22 ( 27.71%) HT: 11.74 VT: 10.90 R: 10.80 RT: 6.66 ( 62Kops/s)
src_x888_8_x888 = L1: 17.49 L2: 16.22 M: 13.73 ( 38.73%) HT: 10.10 VT: 10.33 R: 9.55 RT: 5.21 ( 52Kops/s)
src_x888_8_x888 = L1: 19.33 L2: 17.66 M: 14.26 ( 38.43%) HT: 11.53 VT: 10.83 R: 10.57 RT: 6.12 ( 58Kops/s)
over_x888_8_0565 = L1: 7.57 L2: 7.29 M: 6.37 ( 15.97%) HT: 5.53 VT: 5.33 R: 5.21 RT: 3.22 ( 35Kops/s)
over_x888_8_0565 = L1: 8.15 L2: 7.56 M: 6.50 ( 15.58%) HT: 5.73 VT: 5.49 R: 5.50 RT: 3.53 ( 38Kops/s)
-rw-r--r-- pixman/loongson-mmintrin.h 22
-rw-r--r-- pixman/pixman-mmx.c 46
2 files changed, 68 insertions, 0 deletions
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 508366c4..76ae8920 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -183,6 +183,17 @@ _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("punpckhhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
__m64 ret;
@@ -207,6 +218,17 @@ _mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("punpcklhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 80aa59d8..98fb84e2 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -3243,6 +3243,51 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
return iter->buffer;
}
+/*
+ * Scanline fetcher registered for PIXMAN_a8: expands iter->width 8-bit
+ * samples read from iter->bits into 32-bit values written to iter->buffer,
+ * each sample ending up in bits 24-31 with the lower 24 bits zero.
+ *
+ * iter->bits is advanced by iter->stride on every call.  The mask argument
+ * is not used.  Returns iter->buffer.
+ */
+static uint32_t *
+mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+
+    iter->bits += iter->stride;
+
+    /* Scalar head: one pixel at a time until dst is 16-byte aligned.
+     * The sample is widened to uint32_t before shifting; without the cast
+     * it is promoted to (signed) int and a value >= 0x80 shifted left by
+     * 24 overflows, which is undefined behaviour.
+     */
+    while (w && (((unsigned long)dst) & 15))
+    {
+        *dst++ = (uint32_t)*(src++) << 24;
+        w--;
+    }
+
+    /* Vector body: 8 source bytes -> 8 destination pixels per iteration.
+     * Unpacking against zero twice (bytes -> 16-bit, then 16-bit -> 32-bit),
+     * with the zero register as the first operand each time, is intended to
+     * leave every source byte in the top byte of its 32-bit lane, i.e. the
+     * same result as the scalar "<< 24" (assumes x86-style unpack operand
+     * ordering for these intrinsics).
+     */
+    while (w >= 8)
+    {
+        __m64 mm0 = ldq_u ((__m64 *)src);
+
+        __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0);
+        __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0);
+        __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
+        __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
+        __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
+        __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
+
+        /* Each 64-bit store covers two 32-bit pixels. */
+        *(__m64 *)(dst + 0) = mm3;
+        *(__m64 *)(dst + 2) = mm4;
+        *(__m64 *)(dst + 4) = mm5;
+        *(__m64 *)(dst + 6) = mm6;
+
+        dst += 8;
+        src += 8;
+        w -= 8;
+    }
+
+    /* Scalar tail: the remaining (fewer than 8) pixels. */
+    while (w)
+    {
+        *dst++ = (uint32_t)*(src++) << 24;
+        w--;
+    }
+
+    return iter->buffer;
+}
+
typedef struct
{
pixman_format_code_t format;
@@ -3252,6 +3297,7 @@ typedef struct
/* Table pairing a pixel format code with the MMX scanline fetcher that
 * handles it.  The last entry carries only PIXMAN_null and no fetcher;
 * presumably it is the end-of-table sentinel for the code that walks this
 * array (that code is not visible here -- confirm before reordering). */
static const fetcher_info_t fetchers[] =
{
{ PIXMAN_r5g6b5, mmx_fetch_r5g6b5 },
+ { PIXMAN_a8, mmx_fetch_a8 },
{ PIXMAN_null }
};