ARM: added 'neon_composite_over_n_0565' fast path

author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2009-12-09 11:02:04 +0200
committer: Siarhei Siamashka <siarhei.siamashka@nokia.com> 2009-12-09 11:27:57 +0200
commit: 96fd17488f0966d2df53623195810dc640bf5ca6 (patch)
tree: b59bab442e01c2d6cb18f938b3125178956f562e
parent: 2d332c7a569803107e11b41c7b2c020b4050e26e (diff)
2 files changed, 112 insertions, 0 deletions
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index bffc676..57680bb 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -344,6 +344,75 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro pixman_composite_over_n_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8      /* d6 = bits 15..8 of each pixel: rrrrrggg */
+    vshrn.u16   d7, q2, #3      /* d7 = bits 10..3 of each pixel: ggggggbb */
+    vsli.u16    q2, q2, #5      /* q2 = (q2 << 5), old low 5 (blue) bits kept */
+    vsri.u8     d6, d6, #5      /* low 3 bits of red   <- top 3 bits of red */
+    vsri.u8     d7, d7, #6      /* low 2 bits of green <- top 2 bits of green */
+    vshrn.u16   d30, q2, #2     /* d30 = bits 9..2: 5 blue bits + top 3 again */
+    /* now do alpha blending: scale dst by d3 into d20 - red, d23 - green,
+       d22 - blue; the tail macro then produces d16, d19, d18 */
+    vmull.u8    q10, d3, d6     /* x = d3 * red, widened to 16 bits */
+    vmull.u8    q11, d3, d7     /* x = d3 * green */
+    vmull.u8    q12, d3, d30    /* x = d3 * blue */
+    vrshr.u16   q13, q10, #8    /* t = (x + 128) >> 8 */
+    vrshr.u16   q3, q11, #8     /* q3 aliases d6/d7, both already consumed */
+    vrshr.u16   q15, q12, #8    /* q15 aliases d30, already consumed */
+    vraddhn.u16 d20, q10, q13   /* (x + t + 128) >> 8: rounded x / 255 */
+    vraddhn.u16 d23, q11, q3    /* same for green */
+    vraddhn.u16 d22, q12, q15   /* same for blue */
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20    /* red = d2 + scaled dst red, saturating */
+    vqadd.u8    q9, q0, q11     /* {d18, d19} = {d0, d1} + {d22, d23}: blue, green */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8    /* red   -> bits 15..8 of each 16-bit lane */
+    vshll.u8    q8, d19, #8     /* green -> bits 15..8 (q8 overwrites d16, already read) */
+    vshll.u8    q9, d18, #8     /* blue  -> bits 15..8 (q9 overwrites d18/d19) */
+    vsri.u16    q14, q8, #5     /* keep top 5 red bits, insert green below them */
+    vsri.u16    q14, q9, #11    /* keep top 11 bits, insert top 5 blue bits */
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+    pixman_composite_over_n_0565_process_pixblock_tail  /* finish the current block into {d28, d29} */
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* load the next 8 dst pixels, advance DST_R */
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* store the 8 finished pixels, advance DST_W */
+    pixman_composite_over_n_0565_process_pixblock_head  /* start blending the block just loaded */
+.endm
+
+.macro pixman_composite_over_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET   /* presumably the address of the stack-passed solid color - confirm */
+    vld1.32     {d3[0]}, [DUMMY]    /* load one 32-bit value into lane 0 of d3 */
+    vdup.8      d0, d3[0]           /* byte 0 in all lanes (added as blue by the tail macro) */
+    vdup.8      d1, d3[1]           /* byte 1 in all lanes (added as green by the tail macro) */
+    vdup.8      d2, d3[2]           /* byte 2 in all lanes (added as red by the tail macro) */
+    vdup.8      d3, d3[3]           /* byte 3 in all lanes; d3 is overwritten last on purpose */
+    vmvn.8      d3, d3      /* invert source alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, /* name; 0, 0, 16 presumably src/mask/dst bpp - confirm */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_0565_init, \
+    default_cleanup, \
+    pixman_composite_over_n_0565_process_pixblock_head, \
+    pixman_composite_over_n_0565_process_pixblock_tail, \
+    pixman_composite_over_n_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg: results are stored from {d28, d29} */ \
+    4,  /* dst_r_basereg: dst pixels are loaded into {d4, d5} */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
 .macro pixman_composite_src_8888_0565_process_pixblock_head
     vshll.u8    q8, d1, #8
     vshll.u8    q14, d2, #8
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 4894285..8ae79ae 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -71,6 +71,46 @@ neon_composite_##name (pixman_implementation_t *imp,                    \
                                         src_line, src_stride);          \
 }
 
+#define BIND_N_NULL_DST(name, dst_type, dst_cnt)                        \
+void                                                                    \
+pixman_composite_##name##_asm_neon (int32_t    w,                       \
+                                    int32_t    h,                       \
+                                    dst_type  *dst,                     \
+                                    int32_t    dst_stride,              \
+                                    uint32_t   src);                    \
+/* C entry point wrapping the assembly routine declared above. */       \
+static void                                                             \
+neon_composite_##name (pixman_implementation_t *imp,                    \
+                       pixman_op_t              op,                     \
+                       pixman_image_t *         src_image,              \
+                       pixman_image_t *         mask_image,             \
+                       pixman_image_t *         dst_image,              \
+                       int32_t                  src_x,                  \
+                       int32_t                  src_y,                  \
+                       int32_t                  mask_x,                 \
+                       int32_t                  mask_y,                 \
+                       int32_t                  dest_x,                 \
+                       int32_t                  dest_y,                 \
+                       int32_t                  width,                  \
+                       int32_t                  height)                 \
+{                                                                       \
+    dst_type  *dst_line;                                                \
+    int32_t    dst_stride;                                              \
+    uint32_t   src;                                                     \
+    /* Solid source value; the dst format is passed to the helper. */   \
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);  \
+    /* src == 0: return without touching the destination. */            \
+    if (src == 0)                                                       \
+	return;                                                         \
+    /* Presumably sets dst_line/dst_stride at (dest_x, dest_y). */      \
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type,         \
+                           dst_stride, dst_line, dst_cnt);              \
+    /* Hand the rectangle and solid color to the NEON routine. */       \
+    pixman_composite_##name##_asm_neon (width, height,                  \
+                                        dst_line, dst_stride,           \
+                                        src);                           \
+}
+
 #define BIND_N_MASK_DST(name, mask_type, mask_cnt, dst_type, dst_cnt)   \
 void                                                                    \
 pixman_composite_##name##_asm_neon (int32_t    w,                       \
@@ -218,6 +258,8 @@ BIND_SRC_NULL_DST(src_0565_8888, uint16_t, 1, uint32_t, 1)
 BIND_SRC_NULL_DST(add_8000_8000, uint8_t, 1, uint8_t, 1)
 BIND_SRC_NULL_DST(add_8888_8888, uint32_t, 1, uint32_t, 1)
 
+BIND_N_NULL_DST(over_n_0565, uint16_t, 1) /* defines neon_composite_over_n_0565 with a uint16_t destination */
+
 BIND_SRC_NULL_DST(over_8888_0565, uint32_t, 1, uint16_t, 1)
 BIND_SRC_NULL_DST(over_8888_8888, uint32_t, 1, uint32_t, 1)
 
@@ -360,6 +402,7 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] =
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888    },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888    },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888    },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_n_0565      },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_solid,    PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_solid,    PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_8888_8_8888 },
author	Siarhei Siamashka <siarhei.siamashka@nokia.com>	2009-12-09 11:02:04 +0200
committer	Siarhei Siamashka <siarhei.siamashka@nokia.com>	2009-12-09 11:27:57 +0200
commit	96fd17488f0966d2df53623195810dc640bf5ca6 (patch)
tree	b59bab442e01c2d6cb18f938b3125178956f562e
parent	2d332c7a569803107e11b41c7b2c020b4050e26e (diff)