-rw-r--r--   configure.ac                                 |   34
-rw-r--r--   pixman/Makefile.am                           |   15
-rw-r--r--   pixman/pixman-arm-neon.c                     |    4
-rwxr-xr-x   pixman/pixman-arm.c  (mode was -rw-r--r--)   |    6
-rw-r--r--   pixman/pixman-arma64-neon-asm-bilinear.S     | 1203
-rw-r--r--   pixman/pixman-arma64-neon-asm.S              | 3341
-rw-r--r--   pixman/pixman-arma64-neon-asm.h              |  694
-rw-r--r--   pixman/pixman-private.h                      |    5
8 files changed, 2778 insertions(+), 2524 deletions(-)
diff --git a/configure.ac b/configure.ac
index 6b2134e..bb0192a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -665,10 +665,44 @@ AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
665AC_MSG_RESULT($have_arm_neon) 665AC_MSG_RESULT($have_arm_neon)
666if test $enable_arm_neon = yes && test $have_arm_neon = no ; then 666if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
667 AC_MSG_ERROR([ARM NEON intrinsics not detected]) 667 AC_MSG_ERROR([ARM NEON intrinsics not detected])
668fi 668fi
669 669
670dnl ==========================================================================
671dnl Check if assembler is gas compatible and supports ARM-a64 NEON instructions
672have_arm_a64_neon=no
673AC_MSG_CHECKING(whether to use ARM A64 NEON assembler)
674xserver_save_CFLAGS=$CFLAGS
675CFLAGS="-x assembler-with-cpp $CFLAGS"
676AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
677.text
678.arch armv8-a
679.altmacro
680prfm pldl2strm, [x0]
681xtn v0.8b, v0.8h]])], have_arm_a64_neon=yes)
682CFLAGS=$xserver_save_CFLAGS
683
684AC_ARG_ENABLE(arm-a64-neon,
685 [AC_HELP_STRING([--disable-arm-a64-neon],
686 [disable ARM A64 NEON fast paths])],
687 [enable_arm_a64_neon=$enableval], [enable_arm_a64_neon=auto])
688
689if test $enable_arm_a64_neon = no ; then
690 have_arm_a64_neon=disabled
691fi
692
693if test $have_arm_a64_neon = yes ; then
694 AC_DEFINE(USE_ARM_A64_NEON, 1, [use ARM A64_NEON assembly optimizations])
695fi
696
697AM_CONDITIONAL(USE_ARM_A64_NEON, test $have_arm_a64_neon = yes)
698
699AC_MSG_RESULT($have_arm_a64_neon)
 700if test $enable_arm_a64_neon = yes && test $have_arm_a64_neon = no ; then
701 AC_MSG_ERROR([ARM A64 NEON intrinsics not detected])
702fi
703
670dnl =========================================================================== 704dnl ===========================================================================
671dnl Check for IWMMXT 705dnl Check for IWMMXT
672 706
673AC_ARG_ENABLE(arm-iwmmxt, 707AC_ARG_ENABLE(arm-iwmmxt,
674 [AC_HELP_STRING([--disable-arm-iwmmxt], 708 [AC_HELP_STRING([--disable-arm-iwmmxt],
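The new configure check above feeds a tiny A64 snippet to the C compiler with "-x assembler-with-cpp" and only defines USE_ARM_A64_NEON when it assembles. A rough stand-alone equivalent of that probe, written as file-scope inline assembly, is sketched below; it is illustrative only and not part of the patch, and it assumes GCC or Clang targeting aarch64 with a GAS-compatible assembler (the a64_neon_probe label is just a placeholder).

#if defined(__aarch64__)
/* Same instructions as the AC_COMPILE_IFELSE probe: if the assembler does not
 * understand ARMv8-A A64 NEON mnemonics, this translation unit fails to build. */
__asm__ (
    ".text\n"
    ".arch armv8-a\n"
    ".altmacro\n"
    "a64_neon_probe:\n"
    "    prfm pldl2strm, [x0]\n"
    "    xtn  v0.8b, v0.8h\n"
);
#endif

int main (void) { return 0; }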
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 581b6f6..9229e78 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -92,10 +92,25 @@ libpixman_arm_neon_la_SOURCES = \
92libpixman_1_la_LIBADD += libpixman-arm-neon.la 92libpixman_1_la_LIBADD += libpixman-arm-neon.la
93 93
94ASM_CFLAGS_arm_neon= 94ASM_CFLAGS_arm_neon=
95endif 95endif
96 96
97# arm a64 neon code
98if USE_ARM_A64_NEON
99noinst_LTLIBRARIES += libpixman-arma64-neon.la
100libpixman_arma64_neon_la_SOURCES = \
101 pixman-arm-neon.c \
102 pixman-arm-common.h \
103 pixman-arma64-neon-asm.S \
104 pixman-arma64-neon-asm-bilinear.S \
105 pixman-arm-asm.h \
106 pixman-arma64-neon-asm.h
107libpixman_1_la_LIBADD += libpixman-arma64-neon.la
108
109ASM_CFLAGS_arm_neon=
110endif
111
97# iwmmxt code 112# iwmmxt code
98if USE_ARM_IWMMXT 113if USE_ARM_IWMMXT
99libpixman_iwmmxt_la_SOURCES = pixman-mmx.c 114libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
100noinst_LTLIBRARIES += libpixman-iwmmxt.la 115noinst_LTLIBRARIES += libpixman-iwmmxt.la
101libpixman_1_la_LIBADD += libpixman-iwmmxt.la 116libpixman_1_la_LIBADD += libpixman-iwmmxt.la
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index be761c9..28e13d1 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -192,12 +192,12 @@ arm_neon_fill (pixman_implementation_t *imp,
192 int width, 192 int width,
193 int height, 193 int height,
194 uint32_t _xor) 194 uint32_t _xor)
195{ 195{
196 /* stride is always multiple of 32bit units in pixman */ 196 /* stride is always multiple of 32bit units in pixman */
197 uint32_t byte_stride = stride * sizeof(uint32_t); 197 int32_t byte_stride = stride * sizeof(uint32_t);
198 198
199 switch (bpp) 199 switch (bpp)
200 { 200 {
201 case 8: 201 case 8:
202 pixman_composite_src_n_8_asm_neon ( 202 pixman_composite_src_n_8_asm_neon (
203 width, 203 width,
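The uint32_t to int32_t change above matters because the stride parameter is signed and may be negative: once byte_stride is unsigned, a negative stride stops sign-extending when it is added to a 64-bit pointer and instead becomes a huge positive offset. A minimal stand-alone illustration (not pixman code) of the two values that would be added to the destination pointer:

#include <stdint.h>
#include <stdio.h>

int main (void)
{
    int stride = -4;   /* rows stored bottom-up: stride in 32-bit units is negative */

    uint32_t u = stride * sizeof (uint32_t);   /* wraps to 0xfffffff0             */
    int32_t  s = stride * sizeof (uint32_t);   /* -16 on two's-complement targets */

    /* Added to a 64-bit pointer, 'u' moves it ~4 GiB forward instead of
     * 16 bytes back, which is why the A64 build needs the signed type.  */
    printf ("unsigned byte_stride = %u\n", u);
    printf ("signed   byte_stride = %d\n", s);
    return 0;
}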
diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
index 23374e4..734cbea 100644..100755
--- a/pixman/pixman-arm.c
+++ b/pixman/pixman-arm.c
@@ -219,7 +219,13 @@ _pixman_arm_get_implementations (pixman_implementation_t *imp)
219#ifdef USE_ARM_NEON 219#ifdef USE_ARM_NEON
220 if (!_pixman_disabled ("arm-neon") && have_feature (ARM_NEON)) 220 if (!_pixman_disabled ("arm-neon") && have_feature (ARM_NEON))
221 imp = _pixman_implementation_create_arm_neon (imp); 221 imp = _pixman_implementation_create_arm_neon (imp);
222#endif 222#endif
223 223
224#ifdef USE_ARM_A64_NEON
225 /* neon is a part of aarch64 */
226 if (!_pixman_disabled ("arm-neon"))
227 imp = _pixman_implementation_create_arm_neon (imp);
228#endif
229
224 return imp; 230 return imp;
225} 231}
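The new USE_ARM_A64_NEON block drops the have_feature() probe because AdvSIMD (NEON) is part of the baseline AArch64 application profile, so only the _pixman_disabled ("arm-neon") override is consulted. A small self-contained illustration of the same point using ACLE intrinsics (not pixman code): on an aarch64 build they can be used unconditionally, with no runtime CPU check.

#include <stdint.h>
#include <stdio.h>
#if defined(__aarch64__)
#include <arm_neon.h>
#endif

int main (void)
{
#if defined(__aarch64__)
    uint8x8_t a = vdup_n_u8 (200);
    uint8x8_t b = vdup_n_u8 (100);
    uint8x8_t r = vqadd_u8 (a, b);     /* saturating add, the UQADD used in the fast paths */
    printf ("lane 0 = %u\n", (unsigned) vget_lane_u8 (r, 0));   /* 255, saturated */
#else
    puts ("32-bit ARM: NEON is optional, so a runtime check is still required");
#endif
    return 0;
}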
diff --git a/pixman/pixman-arma64-neon-asm-bilinear.S b/pixman/pixman-arma64-neon-asm-bilinear.S
index a7d94c3..41ee753 100644
--- a/pixman/pixman-arma64-neon-asm-bilinear.S
+++ b/pixman/pixman-arma64-neon-asm-bilinear.S
@@ -53,22 +53,17 @@
53#if defined(__linux__) && defined (__ELF__) 53#if defined(__linux__) && defined (__ELF__)
54.section .note.GNU-stack,"",%progbits 54.section .note.GNU-stack,"",%progbits
55#endif 55#endif
56 56
57.text 57.text
58.fpu neon 58.arch armv8-a
59.arch armv7a
60.object_arch armv4
61.eabi_attribute 10, 0
62.eabi_attribute 12, 0
63.arm
64.altmacro 59.altmacro
65.p2align 2 60.p2align 2
66 61
67#include "pixman-private.h" 62#include "pixman-private.h"
68#include "pixman-arm-asm.h" 63#include "pixman-arm-asm.h"
69#include "pixman-arm-neon-asm.h" 64#include "pixman-arma64-neon-asm.h"
70 65
71/* 66/*
72 * Bilinear macros from pixman-arm-neon-asm.S 67 * Bilinear macros from pixman-arm-neon-asm.S
73 */ 68 */
74 69
@@ -77,35 +72,39 @@
77 * format conversion, and interpolation as separate macros which can be used 72 * format conversion, and interpolation as separate macros which can be used
78 * as the basic building blocks for constructing bilinear scanline functions. 73 * as the basic building blocks for constructing bilinear scanline functions.
79 */ 74 */
80 75
81.macro bilinear_load_8888 reg1, reg2, tmp 76.macro bilinear_load_8888 reg1, reg2, tmp
82 asr TMP1, X, #16 77 asr WTMP1, X, #16
83 add X, X, UX 78 add X, X, UX
84 add TMP1, TOP, TMP1, lsl #2 79 lsl TMP2, TMP1, #2
85 vld1.32 {reg1}, [TMP1], STRIDE 80 add TMP1, TOP, TMP2
86 vld1.32 {reg2}, [TMP1] 81 ld1 {&reg1&.2s}, [TMP1], STRIDE
82 ld1 {&reg2&.2s}, [TMP1]
87.endm 83.endm
88 84
89.macro bilinear_load_0565 reg1, reg2, tmp 85.macro bilinear_load_0565 reg1, reg2, tmp
90 asr TMP1, X, #16 86 asr WTMP1, X, #16
91 add X, X, UX 87 add X, X, UX
92 add TMP1, TOP, TMP1, lsl #1 88 lsl TMP2, TMP1, #1
93 vld1.32 {reg2[0]}, [TMP1], STRIDE 89 add TMP1, TOP, TMP2
94 vld1.32 {reg2[1]}, [TMP1] 90 ld1 {&reg2&.s}[0], [TMP1], STRIDE
91 ld1 {&reg2&.s}[1], [TMP1]
95 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp 92 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
96.endm 93.endm
97 94
98.macro bilinear_load_and_vertical_interpolate_two_8888 \ 95.macro bilinear_load_and_vertical_interpolate_two_8888 \
99 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 96 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
100 97
101 bilinear_load_8888 reg1, reg2, tmp1 98 bilinear_load_8888 reg1, reg2, tmp1
102 vmull.u8 acc1, reg1, d28 99 umull &tmp1&.8h, &reg1&.8b, v28.8b
103 vmlal.u8 acc1, reg2, d29 100 umlal &tmp1&.8h, &reg2&.8b, v29.8b
101 mov &acc1&.16b, &tmp1&.16b
104 bilinear_load_8888 reg3, reg4, tmp2 102 bilinear_load_8888 reg3, reg4, tmp2
105 vmull.u8 acc2, reg3, d28 103 umull &tmp2&.8h, &reg3&.8b, v28.8b
106 vmlal.u8 acc2, reg4, d29 104 umlal &tmp2&.8h, &reg4&.8b, v29.8b
105 mov &acc2&.16b, &tmp2&.16b
107.endm 106.endm
108 107
109.macro bilinear_load_and_vertical_interpolate_four_8888 \ 108.macro bilinear_load_and_vertical_interpolate_four_8888 \
110 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 109 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
111 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 110 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
@@ -114,126 +113,149 @@
114 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi 113 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
115 bilinear_load_and_vertical_interpolate_two_8888 \ 114 bilinear_load_and_vertical_interpolate_two_8888 \
116 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 115 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
117.endm 116.endm
118 117
118.macro vzip reg1, reg2
119 zip1 v24.8b, reg1, reg2
120 zip2 reg2, reg1, reg2
121 mov reg1, v24.8b
122.endm
123
124.macro vuzp reg1, reg2
125 uzp1 v24.8b, reg1, reg2
126 uzp2 reg2, reg1, reg2
127 mov reg1, v24.8b
128.endm
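The vzip/vuzp helper macros above exist because ARMv7 VZIP.8/VUZP.8 rewrite both of their operands in place, whereas A64 only provides ZIP1/ZIP2 and UZP1/UZP2, each writing a single result; the macros recreate the two-output behaviour through the v24 scratch register. The same relationship expressed with ACLE intrinsics (illustrative only, AArch64 only):

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
    const uint8_t ai[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    const uint8_t bi[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
    uint8x8_t a = vld1_u8 (ai);
    uint8x8_t b = vld1_u8 (bi);

    /* A64 form: two instructions, two independent results ...        */
    uint8x8_t lo = vzip1_u8 (a, b);    /* 0 10 1 11 2 12 3 13 */
    uint8x8_t hi = vzip2_u8 (a, b);    /* 4 14 5 15 6 16 7 17 */

    /* ... copied back over the inputs, which is exactly what the
     * .macro vzip above does with v24 as the temporary.               */
    a = lo;
    b = hi;

    uint8_t out[16];
    vst1_u8 (out, a);
    vst1_u8 (out + 8, b);
    for (int i = 0; i < 16; i++)
        printf ("%d ", out[i]);
    printf ("\n");
    return 0;
}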
129
119.macro bilinear_load_and_vertical_interpolate_two_0565 \ 130.macro bilinear_load_and_vertical_interpolate_two_0565 \
120 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi 131 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
121 132 asr WTMP1, X, #16
122 asr TMP1, X, #16
123 add X, X, UX 133 add X, X, UX
124 add TMP1, TOP, TMP1, lsl #1 134 lsl TMP2, TMP1, #1
125 asr TMP2, X, #16 135 add TMP1, TOP, TMP2
136 asr WTMP2, X, #16
126 add X, X, UX 137 add X, X, UX
127 add TMP2, TOP, TMP2, lsl #1 138 lsl TMP3, TMP2, #1
128 vld1.32 {acc2lo[0]}, [TMP1], STRIDE 139 add TMP2, TOP, TMP3
129 vld1.32 {acc2hi[0]}, [TMP2], STRIDE 140 ld1 {&acc2lo&.s}[0], [TMP1], STRIDE
130 vld1.32 {acc2lo[1]}, [TMP1] 141 ld1 {&acc2hi&.s}[0], [TMP2], STRIDE
131 vld1.32 {acc2hi[1]}, [TMP2] 142 ld1 {&acc2lo&.s}[1], [TMP1]
143 ld1 {&acc2hi&.s}[1], [TMP2]
144 mov &acc2&.d[0], &acc2lo&.d[0]
145 mov &acc2&.d[1], &acc2hi&.d[0]
132 convert_0565_to_x888 acc2, reg3, reg2, reg1 146 convert_0565_to_x888 acc2, reg3, reg2, reg1
133 vzip.u8 reg1, reg3 147 vzip &reg1&.8b, &reg3&.8b
134 vzip.u8 reg2, reg4 148 vzip &reg2&.8b, &reg4&.8b
135 vzip.u8 reg3, reg4 149 vzip &reg3&.8b, &reg4&.8b
136 vzip.u8 reg1, reg2 150 vzip &reg1&.8b, &reg2&.8b
137 vmull.u8 acc1, reg1, d28 151 umull &acc1&.8h, &reg1&.8b, v28.8b
138 vmlal.u8 acc1, reg2, d29 152 umlal &acc1&.8h, &reg2&.8b, v29.8b
139 vmull.u8 acc2, reg3, d28 153 umull &acc2&.8h, &reg3&.8b, v28.8b
140 vmlal.u8 acc2, reg4, d29 154 umlal &acc2&.8h, &reg4&.8b, v29.8b
141.endm 155.endm
142 156
143.macro bilinear_load_and_vertical_interpolate_four_0565 \ 157.macro bilinear_load_and_vertical_interpolate_four_0565 \
144 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 158 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
145 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 159 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
146 160
147 asr TMP1, X, #16 161 asr WTMP1, X, #16
148 add X, X, UX 162 add X, X, UX
149 add TMP1, TOP, TMP1, lsl #1 163 lsl TMP2, TMP1, #1
150 asr TMP2, X, #16 164 add TMP1, TOP, TMP2
165 asr WTMP2, X, #16
151 add X, X, UX 166 add X, X, UX
152 add TMP2, TOP, TMP2, lsl #1 167 lsl TMP3, TMP2, #1
153 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE 168 add TMP2, TOP, TMP3
154 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE 169 ld1 {&xacc2lo&.s}[0], [TMP1], STRIDE
155 vld1.32 {xacc2lo[1]}, [TMP1] 170 ld1 {&xacc2hi&.s}[0], [TMP2], STRIDE
156 vld1.32 {xacc2hi[1]}, [TMP2] 171 ld1 {&xacc2lo&.s}[1], [TMP1]
172 ld1 {&xacc2hi&.s}[1], [TMP2]
173 mov &xacc2&.d[0], &xacc2lo&.d[0]
174 mov &xacc2&.d[1], &xacc2hi&.d[0]
157 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 175 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
158 asr TMP1, X, #16 176 asr WTMP1, X, #16
159 add X, X, UX 177 add X, X, UX
160 add TMP1, TOP, TMP1, lsl #1 178 lsl TMP2, TMP1, #1
161 asr TMP2, X, #16 179 add TMP1, TOP, TMP2
180 asr WTMP2, X, #16
162 add X, X, UX 181 add X, X, UX
163 add TMP2, TOP, TMP2, lsl #1 182 lsl TMP3, TMP2, #1
164 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE 183 add TMP2, TOP, TMP3
165 vzip.u8 xreg1, xreg3 184 ld1 {&yacc2lo&.s}[0], [TMP1], STRIDE
166 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE 185 vzip &xreg1&.8b, &xreg3&.8b
167 vzip.u8 xreg2, xreg4 186 ld1 {&yacc2hi&.s}[0], [TMP2], STRIDE
168 vld1.32 {yacc2lo[1]}, [TMP1] 187 vzip &xreg2&.8b, &xreg4&.8b
169 vzip.u8 xreg3, xreg4 188 ld1 {&yacc2lo&.s}[1], [TMP1]
170 vld1.32 {yacc2hi[1]}, [TMP2] 189 vzip &xreg3&.8b, &xreg4&.8b
171 vzip.u8 xreg1, xreg2 190 ld1 {&yacc2hi&.s}[1], [TMP2]
191 vzip &xreg1&.8b, &xreg2&.8b
192 mov &yacc2&.d[0], &yacc2lo&.d[0]
193 mov &yacc2&.d[1], &yacc2hi&.d[0]
172 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 194 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
173 vmull.u8 xacc1, xreg1, d28 195 umull &xacc1&.8h, &xreg1&.8b, v28.8b
174 vzip.u8 yreg1, yreg3 196 vzip &yreg1&.8b, &yreg3&.8b
175 vmlal.u8 xacc1, xreg2, d29 197 umlal &xacc1&.8h, &xreg2&.8b, v29.8b
176 vzip.u8 yreg2, yreg4 198 vzip &yreg2&.8b, &yreg4&.8b
177 vmull.u8 xacc2, xreg3, d28 199 umull &xacc2&.8h, &xreg3&.8b, v28.8b
178 vzip.u8 yreg3, yreg4 200 vzip &yreg3&.8b, &yreg4&.8b
179 vmlal.u8 xacc2, xreg4, d29 201 umlal &xacc2&.8h, &xreg4&.8b, v29.8b
180 vzip.u8 yreg1, yreg2 202 vzip &yreg1&.8b, &yreg2&.8b
181 vmull.u8 yacc1, yreg1, d28 203 umull &yacc1&.8h, &yreg1&.8b, v28.8b
182 vmlal.u8 yacc1, yreg2, d29 204 umlal &yacc1&.8h, &yreg2&.8b, v29.8b
183 vmull.u8 yacc2, yreg3, d28 205 umull &yacc2&.8h, &yreg3&.8b, v28.8b
184 vmlal.u8 yacc2, yreg4, d29 206 umlal &yacc2&.8h, &yreg4&.8b, v29.8b
185.endm 207.endm
186 208
187.macro bilinear_store_8888 numpix, tmp1, tmp2 209.macro bilinear_store_8888 numpix, tmp1, tmp2
188.if numpix == 4 210.if numpix == 4
189 vst1.32 {d0, d1}, [OUT]! 211 st1 {v0.2s, v1.2s}, [OUT], #16
190.elseif numpix == 2 212.elseif numpix == 2
191 vst1.32 {d0}, [OUT]! 213 st1 {v0.2s}, [OUT], #8
192.elseif numpix == 1 214.elseif numpix == 1
193 vst1.32 {d0[0]}, [OUT, :32]! 215 st1 {v0.s}[0], [OUT], #4
194.else 216.else
195 .error bilinear_store_8888 numpix is unsupported 217 .error bilinear_store_8888 numpix is unsupported
196.endif 218.endif
197.endm 219.endm
198 220
199.macro bilinear_store_0565 numpix, tmp1, tmp2 221.macro bilinear_store_0565 numpix, tmp1, tmp2
200 vuzp.u8 d0, d1 222 vuzp v0.8b, v1.8b
201 vuzp.u8 d2, d3 223 vuzp v2.8b, v3.8b
202 vuzp.u8 d1, d3 224 vuzp v1.8b, v3.8b
203 vuzp.u8 d0, d2 225 vuzp v0.8b, v2.8b
204 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 226 convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
205.if numpix == 4 227.if numpix == 4
206 vst1.16 {d2}, [OUT]! 228 st1 {v1.4h}, [OUT], #8
207.elseif numpix == 2 229.elseif numpix == 2
208 vst1.32 {d2[0]}, [OUT]! 230 st1 {v1.s}[0], [OUT], #4
209.elseif numpix == 1 231.elseif numpix == 1
210 vst1.16 {d2[0]}, [OUT]! 232 st1 {v1.h}[0], [OUT], #2
211.else 233.else
212 .error bilinear_store_0565 numpix is unsupported 234 .error bilinear_store_0565 numpix is unsupported
213.endif 235.endif
214.endm 236.endm
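bilinear_load_0565 and bilinear_store_0565 lean on the convert_0565_to_x888 / convert_8888_to_0565 helpers now pulled in from pixman-arma64-neon-asm.h. Per pixel they amount to ordinary r5g6b5 packing and unpacking, with the widening side replicating the top bits into the freed low bits so that full-intensity values stay at 255 (which mirrors what the pixman conversion helpers do). A scalar sketch, illustrative only and not the pixman helpers themselves:

#include <stdint.h>
#include <stdio.h>

static uint16_t
pack_0565 (uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t) (((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}

static void
unpack_0565 (uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
{
    uint8_t r5 = (p >> 11) & 0x1f, g6 = (p >> 5) & 0x3f, b5 = p & 0x1f;
    /* widen by replicating the top bits into the low bits */
    *r = (uint8_t) ((r5 << 3) | (r5 >> 2));
    *g = (uint8_t) ((g6 << 2) | (g6 >> 4));
    *b = (uint8_t) ((b5 << 3) | (b5 >> 2));
}

int main (void)
{
    uint8_t r, g, b;
    unpack_0565 (pack_0565 (0xff, 0x80, 0x08), &r, &g, &b);
    printf ("%02x %02x %02x\n", r, g, b);   /* ff 82 08 */
    return 0;
}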
215 237
216 238
217/* 239/*
218 * Macros for loading mask pixels into register 'mask'. 240 * Macros for loading mask pixels into register 'mask'.
 219 * vdup must be done somewhere else. 241 * dup must be done somewhere else.
220 */ 242 */
221.macro bilinear_load_mask_x numpix, mask 243.macro bilinear_load_mask_x numpix, mask
222.endm 244.endm
223 245
224.macro bilinear_load_mask_8 numpix, mask 246.macro bilinear_load_mask_8 numpix, mask
225.if numpix == 4 247.if numpix == 4
226 vld1.32 {mask[0]}, [MASK]! 248 ld1 {&mask&.s}[0], [MASK], #4
227.elseif numpix == 2 249.elseif numpix == 2
228 vld1.16 {mask[0]}, [MASK]! 250 ld1 {&mask&.h}[0], [MASK], #2
229.elseif numpix == 1 251.elseif numpix == 1
230 vld1.8 {mask[0]}, [MASK]! 252 ld1 {&mask&.b}[0], [MASK], #1
231.else 253.else
232 .error bilinear_load_mask_8 numpix is unsupported 254 .error bilinear_load_mask_8 numpix is unsupported
233.endif 255.endif
234 pld [MASK, #prefetch_offset] 256 prfm PLDL2STRM, [MASK, #prefetch_offset]
235.endm 257.endm
236 258
237.macro bilinear_load_mask mask_fmt, numpix, mask 259.macro bilinear_load_mask mask_fmt, numpix, mask
238 bilinear_load_mask_&mask_fmt numpix, mask 260 bilinear_load_mask_&mask_fmt numpix, mask
239.endm 261.endm
@@ -249,19 +271,21 @@
249.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 271.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
250.endm 272.endm
251 273
252.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 274.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
253.if numpix == 4 275.if numpix == 4
254 vld1.32 {dst0, dst1}, [OUT] 276 ld1 {&dst0&.2s, &dst1&.2s}, [OUT]
255.elseif numpix == 2 277.elseif numpix == 2
256 vld1.32 {dst0}, [OUT] 278 ld1 {&dst0&.2s}, [OUT]
257.elseif numpix == 1 279.elseif numpix == 1
258 vld1.32 {dst0[0]}, [OUT] 280 ld1 {&dst0&.s}[0], [OUT]
259.else 281.else
260 .error bilinear_load_dst_8888 numpix is unsupported 282 .error bilinear_load_dst_8888 numpix is unsupported
261.endif 283.endif
262 pld [OUT, #(prefetch_offset * 4)] 284 mov &dst01&.d[0], &dst0&.d[0]
285 mov &dst01&.d[1], &dst1&.d[0]
286 prfm PLDL2STRM, [OUT, #(prefetch_offset * 4)]
263.endm 287.endm
264 288
265.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 289.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
266 bilinear_load_dst_8888 numpix, dst0, dst1, dst01 290 bilinear_load_dst_8888 numpix, dst0, dst1, dst01
267.endm 291.endm
@@ -289,15 +313,15 @@
289.macro bilinear_duplicate_mask_x numpix, mask 313.macro bilinear_duplicate_mask_x numpix, mask
290.endm 314.endm
291 315
292.macro bilinear_duplicate_mask_8 numpix, mask 316.macro bilinear_duplicate_mask_8 numpix, mask
293.if numpix == 4 317.if numpix == 4
294 vdup.32 mask, mask[0] 318 dup &mask&.2s, &mask&.s[0]
295.elseif numpix == 2 319.elseif numpix == 2
296 vdup.16 mask, mask[0] 320 dup &mask&.4h, &mask&.h[0]
297.elseif numpix == 1 321.elseif numpix == 1
298 vdup.8 mask, mask[0] 322 dup &mask&.8b, &mask&.b[0]
299.else 323.else
300 .error bilinear_duplicate_mask_8 is unsupported 324 .error bilinear_duplicate_mask_8 is unsupported
301.endif 325.endif
302.endm 326.endm
303 327
@@ -307,47 +331,52 @@
307 331
308/* 332/*
309 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. 333 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
 310 * Interleave should be done when a mask is enabled or the operator is 'over'. 334 * Interleave should be done when a mask is enabled or the operator is 'over'.
311 */ 335 */
312.macro bilinear_interleave src0, src1, dst0, dst1 336.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
313 vuzp.8 src0, src1 337 vuzp &src0&.8b, &src1&.8b
314 vuzp.8 dst0, dst1 338 vuzp &dst0&.8b, &dst1&.8b
315 vuzp.8 src0, src1 339 vuzp &src0&.8b, &src1&.8b
316 vuzp.8 dst0, dst1 340 vuzp &dst0&.8b, &dst1&.8b
341 mov &src01&.d[1], &src1&.d[0]
342 mov &src01&.d[0], &src0&.d[0]
343 mov &dst01&.d[1], &dst1&.d[0]
344 mov &dst01&.d[0], &dst0&.d[0]
317.endm 345.endm
318 346
319.macro bilinear_interleave_src_dst_x_src \ 347.macro bilinear_interleave_src_dst_x_src \
320 numpix, src0, src1, src01, dst0, dst1, dst01 348 numpix, src0, src1, src01, dst0, dst1, dst01
321.endm 349.endm
322 350
323.macro bilinear_interleave_src_dst_x_over \ 351.macro bilinear_interleave_src_dst_x_over \
324 numpix, src0, src1, src01, dst0, dst1, dst01 352 numpix, src0, src1, src01, dst0, dst1, dst01
325 353
326 bilinear_interleave src0, src1, dst0, dst1 354 bilinear_interleave src0, src1, src01, dst0, dst1, dst01
327.endm 355.endm
328 356
329.macro bilinear_interleave_src_dst_x_add \ 357.macro bilinear_interleave_src_dst_x_add \
330 numpix, src0, src1, src01, dst0, dst1, dst01 358 numpix, src0, src1, src01, dst0, dst1, dst01
359 bilinear_interleave src0, src1, src01, dst0, dst1, dst01
331.endm 360.endm
332 361
333.macro bilinear_interleave_src_dst_8_src \ 362.macro bilinear_interleave_src_dst_8_src \
334 numpix, src0, src1, src01, dst0, dst1, dst01 363 numpix, src0, src1, src01, dst0, dst1, dst01
335 364
336 bilinear_interleave src0, src1, dst0, dst1 365 bilinear_interleave src0, src1, src01, dst0, dst1, dst01
337.endm 366.endm
338 367
339.macro bilinear_interleave_src_dst_8_over \ 368.macro bilinear_interleave_src_dst_8_over \
340 numpix, src0, src1, src01, dst0, dst1, dst01 369 numpix, src0, src1, src01, dst0, dst1, dst01
341 370
342 bilinear_interleave src0, src1, dst0, dst1 371 bilinear_interleave src0, src1, src01, dst0, dst1, dst01
343.endm 372.endm
344 373
345.macro bilinear_interleave_src_dst_8_add \ 374.macro bilinear_interleave_src_dst_8_add \
346 numpix, src0, src1, src01, dst0, dst1, dst01 375 numpix, src0, src1, src01, dst0, dst1, dst01
347 376
348 bilinear_interleave src0, src1, dst0, dst1 377 bilinear_interleave src0, src1, src01, dst0, dst1, dst01
349.endm 378.endm
350 379
351.macro bilinear_interleave_src_dst \ 380.macro bilinear_interleave_src_dst \
352 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 381 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
353 382
@@ -368,18 +397,20 @@
368 397
369.macro bilinear_apply_mask_to_src_8 \ 398.macro bilinear_apply_mask_to_src_8 \
370 numpix, src0, src1, src01, mask, \ 399 numpix, src0, src1, src01, mask, \
371 tmp01, tmp23, tmp45, tmp67 400 tmp01, tmp23, tmp45, tmp67
372 401
373 vmull.u8 tmp01, src0, mask 402 umull &tmp01&.8h, &src0&.8b, &mask&.8b
374 vmull.u8 tmp23, src1, mask 403 umull &tmp23&.8h, &src1&.8b, &mask&.8b
375 /* bubbles */ 404 /* bubbles */
376 vrshr.u16 tmp45, tmp01, #8 405 urshr &tmp45&.8h, &tmp01&.8h, #8
377 vrshr.u16 tmp67, tmp23, #8 406 urshr &tmp67&.8h, &tmp23&.8h, #8
378 /* bubbles */ 407 /* bubbles */
379 vraddhn.u16 src0, tmp45, tmp01 408 raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h
380 vraddhn.u16 src1, tmp67, tmp23 409 raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h
410 mov &src01&.d[0], &src0&.d[0]
411 mov &src01&.d[1], &src1&.d[0]
381.endm 412.endm
382 413
383.macro bilinear_apply_mask_to_src \ 414.macro bilinear_apply_mask_to_src \
384 mask_fmt, numpix, src0, src1, src01, mask, \ 415 mask_fmt, numpix, src0, src1, src01, mask, \
385 tmp01, tmp23, tmp45, tmp67 416 tmp01, tmp23, tmp45, tmp67
@@ -401,32 +432,40 @@
401 432
402.macro bilinear_combine_over \ 433.macro bilinear_combine_over \
403 numpix, src0, src1, src01, dst0, dst1, dst01, \ 434 numpix, src0, src1, src01, dst0, dst1, dst01, \
404 tmp01, tmp23, tmp45, tmp67, tmp8 435 tmp01, tmp23, tmp45, tmp67, tmp8
405 436
406 vdup.32 tmp8, src1[1] 437 dup &tmp8&.2s, &src1&.s[1]
407 /* bubbles */ 438 /* bubbles */
408 vmvn.8 tmp8, tmp8 439 mvn &tmp8&.8b, &tmp8&.8b
409 /* bubbles */ 440 /* bubbles */
410 vmull.u8 tmp01, dst0, tmp8 441 umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b
411 /* bubbles */ 442 /* bubbles */
412 vmull.u8 tmp23, dst1, tmp8 443 umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b
413 /* bubbles */ 444 /* bubbles */
414 vrshr.u16 tmp45, tmp01, #8 445 urshr &tmp45&.8h, &tmp01&.8h, #8
415 vrshr.u16 tmp67, tmp23, #8 446 urshr &tmp67&.8h, &tmp23&.8h, #8
416 /* bubbles */ 447 /* bubbles */
417 vraddhn.u16 dst0, tmp45, tmp01 448 raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h
418 vraddhn.u16 dst1, tmp67, tmp23 449 raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h
450 mov &dst01&.d[0], &dst0&.d[0]
451 mov &dst01&.d[1], &dst1&.d[0]
419 /* bubbles */ 452 /* bubbles */
420 vqadd.u8 src01, dst01, src01 453 uqadd &src0&.8b, &dst0&.8b, &src0&.8b
454 uqadd &src1&.8b, &dst1&.8b, &src1&.8b
455 mov &src01&.d[0], &src0&.d[0]
456 mov &src01&.d[1], &src1&.d[0]
421.endm 457.endm
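Both bilinear_apply_mask_to_src_8 and bilinear_combine_over use the umull, urshr #8, raddhn idiom, which is the exact rounded division by 255 used throughout pixman, followed here by uqadd for the saturating OVER addition. A scalar model of one premultiplied channel (illustrative only, not pixman code):

#include <stdint.h>
#include <stdio.h>

/* umull t = a*b; urshr u = (t + 128) >> 8; raddhn r = (t + u + 128) >> 8
 * == the exactly rounded value of a*b / 255.                             */
static uint8_t
mul_div_255 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) (a * b);
    uint16_t u = (uint16_t) ((t + 0x80) >> 8);
    return (uint8_t) ((t + u + 0x80) >> 8);
}

/* bilinear_combine_over on one channel: dup+mvn builds (255 - alpha),
 * then dst * (255 - alpha) / 255 is added to src with saturation (uqadd). */
static uint8_t
over_channel (uint8_t src, uint8_t src_alpha, uint8_t dst)
{
    unsigned r = src + mul_div_255 (dst, (uint8_t) (255 - src_alpha));
    return (uint8_t) (r > 255 ? 255 : r);
}

int main (void)
{
    printf ("%d\n", over_channel (0x40, 0x80, 0xff));   /* 0x40 + 255*127/255 = 191 */
    return 0;
}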
422 458
423.macro bilinear_combine_add \ 459.macro bilinear_combine_add \
424 numpix, src0, src1, src01, dst0, dst1, dst01, \ 460 numpix, src0, src1, src01, dst0, dst1, dst01, \
425 tmp01, tmp23, tmp45, tmp67, tmp8 461 tmp01, tmp23, tmp45, tmp67, tmp8
426 462
427 vqadd.u8 src01, dst01, src01 463 uqadd &src0&.8b, &dst0&.8b, &src0&.8b
464 uqadd &src1&.8b, &dst1&.8b, &src1&.8b
465 mov &src01&.d[0], &src0&.d[0]
466 mov &src01&.d[1], &src1&.d[0]
428.endm 467.endm
429 468
430.macro bilinear_combine \ 469.macro bilinear_combine \
431 op, numpix, src0, src1, src01, dst0, dst1, dst01, \ 470 op, numpix, src0, src1, src01, dst0, dst1, dst01, \
432 tmp01, tmp23, tmp45, tmp67, tmp8 471 tmp01, tmp23, tmp45, tmp67, tmp8
@@ -438,23 +477,26 @@
438 477
439/* 478/*
440 * Macros for final deinterleaving of destination pixels if needed. 479 * Macros for final deinterleaving of destination pixels if needed.
441 */ 480 */
442.macro bilinear_deinterleave numpix, dst0, dst1, dst01 481.macro bilinear_deinterleave numpix, dst0, dst1, dst01
443 vuzp.8 dst0, dst1 482 vuzp &dst0&.8b, &dst1&.8b
444 /* bubbles */ 483 /* bubbles */
445 vuzp.8 dst0, dst1 484 vuzp &dst0&.8b, &dst1&.8b
485 mov &dst01&.d[0], &dst0&.d[0]
486 mov &dst01&.d[1], &dst1&.d[0]
446.endm 487.endm
447 488
448.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 489.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
449.endm 490.endm
450 491
451.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 492.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
452 bilinear_deinterleave numpix, dst0, dst1, dst01 493 bilinear_deinterleave numpix, dst0, dst1, dst01
453.endm 494.endm
454 495
455.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 496.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
497 bilinear_deinterleave numpix, dst0, dst1, dst01
456.endm 498.endm
457 499
458.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 500.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
459 bilinear_deinterleave numpix, dst0, dst1, dst01 501 bilinear_deinterleave numpix, dst0, dst1, dst01
460.endm 502.endm
@@ -471,294 +513,381 @@
471 bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 513 bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
472.endm 514.endm
473 515
474 516
475.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op 517.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
476 bilinear_load_&src_fmt d0, d1, d2 518 bilinear_load_&src_fmt v0, v1, v2
477 bilinear_load_mask mask_fmt, 1, d4 519 bilinear_load_mask mask_fmt, 1, v4
478 bilinear_load_dst dst_fmt, op, 1, d18, d19, q9 520 bilinear_load_dst dst_fmt, op, 1, v18, v19, v9
479 vmull.u8 q1, d0, d28 521 mov v9.d[0], v18.d[0]
480 vmlal.u8 q1, d1, d29 522 mov v9.d[1], v19.d[0]
523 umull v2.8h, v0.8b, v28.8b
524 umlal v2.8h, v1.8b, v29.8b
481 /* 5 cycles bubble */ 525 /* 5 cycles bubble */
482 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 526 ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
483 vmlsl.u16 q0, d2, d30 527 umlsl v0.4s, v2.4h, v30.4h
484 vmlal.u16 q0, d3, d30 528 mov v2.d[0], v2.d[1]
529 umlal v0.4s, v2.4h, v30.4h
485 /* 5 cycles bubble */ 530 /* 5 cycles bubble */
486 bilinear_duplicate_mask mask_fmt, 1, d4 531 bilinear_duplicate_mask mask_fmt, 1, v4
487 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 532 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
488 /* 3 cycles bubble */ 533 /* 3 cycles bubble */
489 vmovn.u16 d0, q0 534 xtn v0.8b, v0.8h
490 /* 1 cycle bubble */ 535 /* 1 cycle bubble */
491 bilinear_interleave_src_dst \ 536 bilinear_interleave_src_dst \
492 mask_fmt, op, 1, d0, d1, q0, d18, d19, q9 537 mask_fmt, op, 1, v0, v1, v0, v18, v19, v9
538 mov v1.d[0], v0.d[1]
539 mov v18.d[0], v9.d[0]
540 mov v19.d[0], v9.d[1]
493 bilinear_apply_mask_to_src \ 541 bilinear_apply_mask_to_src \
494 mask_fmt, 1, d0, d1, q0, d4, \ 542 mask_fmt, 1, v0, v1, v0, v4, \
495 q3, q8, q10, q11 543 v3, v8, v10, v11
544 mov v1.d[0], v0.d[1]
496 bilinear_combine \ 545 bilinear_combine \
497 op, 1, d0, d1, q0, d18, d19, q9, \ 546 op, 1, v0, v1, v0, v18, v19, v9, \
498 q3, q8, q10, q11, d5 547 v3, v8, v10, v11, v5
499 bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0 548 mov v1.d[0], v0.d[1]
500 bilinear_store_&dst_fmt 1, q2, q3 549 bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0
550 mov v1.d[0], v0.d[1]
551 bilinear_store_&dst_fmt 1, v17, v18
501.endm 552.endm
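bilinear_interpolate_last_pixel above is the usual two-pass blend: umull/umlal apply the vertical weights held in v28/v29, ushll/umlsl/umlal apply the horizontal fraction taken from v30, and shrn plus xtn scale the result back to 8 bits. A scalar model of one channel follows, assuming the default BILINEAR_INTERPOLATION_BITS of 7, so that the vertical weights sum to 128 and distx is a 7-bit fraction (illustrative only):

#include <stdint.h>
#include <stdio.h>

#define BITS 7                     /* BILINEAR_INTERPOLATION_BITS (pixman default) */

static uint8_t
bilinear_channel (uint8_t tl, uint8_t tr,      /* top row:    left, right               */
                  uint8_t bl, uint8_t br,      /* bottom row: left, right               */
                  unsigned wt, unsigned wb,    /* vertical weights, wt + wb == 1 << BITS */
                  unsigned distx)              /* horizontal fraction, 0 .. (1<<BITS)-1  */
{
    /* vertical pass: umull by v28 (wt), umlal by v29 (wb) */
    uint32_t left  = tl * wt + bl * wb;
    uint32_t right = tr * wt + br * wb;

    /* horizontal pass: ushll #BITS, umlsl by distx, umlal by distx,
     * i.e. left * ((1 << BITS) - distx) + right * distx              */
    uint32_t acc = (left << BITS) - left * distx + right * distx;

    /* shrn #(2 * BITS), then xtn back down to 8 bits */
    return (uint8_t) (acc >> (2 * BITS));
}

int main (void)
{
    /* halfway in both directions between 0 and 255 -> 127 */
    printf ("%d\n", bilinear_channel (0, 255, 0, 255, 64, 64, 64));
    return 0;
}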
502 553
503.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op 554.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
504 bilinear_load_and_vertical_interpolate_two_&src_fmt \ 555 bilinear_load_and_vertical_interpolate_two_&src_fmt \
505 q1, q11, d0, d1, d20, d21, d22, d23 556 v1, v11, v18, v19, v20, v21, v22, v23
506 bilinear_load_mask mask_fmt, 2, d4 557 mov v2.d[0], v1.d[0]
507 bilinear_load_dst dst_fmt, op, 2, d18, d19, q9 558 mov v3.d[0], v1.d[1]
508 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 559 mov v22.d[0], v11.d[0]
509 vmlsl.u16 q0, d2, d30 560 mov v23.d[0], v11.d[1]
510 vmlal.u16 q0, d3, d30 561 bilinear_load_mask mask_fmt, 2, v4
511 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 562 bilinear_load_dst dst_fmt, op, 2, v18, v19, v9
512 vmlsl.u16 q10, d22, d31 563 mov v9.d[0], v18.d[0]
513 vmlal.u16 q10, d23, d31 564 mov v9.d[1], v19.d[0]
514 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 565 ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
515 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 566 umlsl v0.4s, v2.4h, v30.4h
516 bilinear_duplicate_mask mask_fmt, 2, d4 567 umlal v0.4s, v3.4h, v30.4h
517 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 568 ushll v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
518 vadd.u16 q12, q12, q13 569 umlsl v10.4s, v22.4h, v31.4h
519 vmovn.u16 d0, q0 570 umlal v10.4s, v23.4h, v31.4h
571 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
572 shrn v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
573 mov v0.d[1], v1.d[0]
574 bilinear_duplicate_mask mask_fmt, 2, v4
575 ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
576 mov v31.d[0], v30.d[1]
577 add v12.8h, v12.8h, v13.8h
578 xtn v0.8b, v0.8h
520 bilinear_interleave_src_dst \ 579 bilinear_interleave_src_dst \
521 mask_fmt, op, 2, d0, d1, q0, d18, d19, q9 580 mask_fmt, op, 2, v0, v1, v0, v18, v19, v9
581 mov v1.d[0], v0.d[1]
582 mov v18.d[0], v9.d[0]
583 mov v19.d[0], v9.d[1]
522 bilinear_apply_mask_to_src \ 584 bilinear_apply_mask_to_src \
523 mask_fmt, 2, d0, d1, q0, d4, \ 585 mask_fmt, 2, v0, v1, v0, v4, \
524 q3, q8, q10, q11 586 v3, v8, v10, v11
587 mov v1.d[0], v0.d[1]
525 bilinear_combine \ 588 bilinear_combine \
526 op, 2, d0, d1, q0, d18, d19, q9, \ 589 op, 2, v0, v1, v0, v18, v19, v9, \
527 q3, q8, q10, q11, d5 590 v3, v8, v10, v11, v5
528 bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0 591 mov v1.d[0], v0.d[1]
529 bilinear_store_&dst_fmt 2, q2, q3 592 bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0
593 mov v1.d[0], v0.d[1]
594 bilinear_store_&dst_fmt 2, v16, v17
530.endm 595.endm
531 596
532.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op 597.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
533 bilinear_load_and_vertical_interpolate_four_&src_fmt \ 598 bilinear_load_and_vertical_interpolate_four_&src_fmt \
534 q1, q11, d0, d1, d20, d21, d22, d23 \ 599 v1, v11, v4, v5, v6, v7, v22, v23 \
535 q3, q9, d4, d5, d16, d17, d18, d19 600 v3, v9, v16, v17, v20, v21, v18, v19
536 pld [TMP1, PF_OFFS] 601 mov v6.d[0], v3.d[0]
602 mov v7.d[0], v3.d[1]
603 mov v18.d[0], v9.d[0]
604 mov v19.d[0], v9.d[1]
605 mov v2.d[0], v1.d[0]
606 mov v3.d[0], v1.d[1]
607 mov v22.d[0], v11.d[0]
608 mov v23.d[0], v11.d[1]
609 prfm PLDL2STRM, [TMP1, PF_OFFS]
537 sub TMP1, TMP1, STRIDE 610 sub TMP1, TMP1, STRIDE
538 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 611 prfm PLDL2STRM, [TMP1, PF_OFFS]
539 vmlsl.u16 q0, d2, d30 612 ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
540 vmlal.u16 q0, d3, d30 613 umlsl v0.4s, v2.4h, v30.4h
541 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 614 umlal v0.4s, v3.4h, v30.4h
542 vmlsl.u16 q10, d22, d31 615 ushll v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
543 vmlal.u16 q10, d23, d31 616 umlsl v10.4s, v22.4h, v31.4h
544 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 617 umlal v10.4s, v23.4h, v31.4h
545 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS 618 ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
546 vmlsl.u16 q2, d6, d30 619 mov v31.d[0], v30.d[1]
547 vmlal.u16 q2, d7, d30 620 ushll v2.4s, v6.4h, #BILINEAR_INTERPOLATION_BITS
548 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS 621 umlsl v2.4s, v6.4h, v30.4h
549 bilinear_load_mask mask_fmt, 4, d22 622 umlal v2.4s, v7.4h, v30.4h
550 bilinear_load_dst dst_fmt, op, 4, d2, d3, q1 623 ushll v8.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
551 pld [TMP1, PF_OFFS] 624 umlsl v8.4s, v18.4h, v31.4h
552 vmlsl.u16 q8, d18, d31 625 umlal v8.4s, v19.4h, v31.4h
553 vmlal.u16 q8, d19, d31 626 add v12.8h, v12.8h, v13.8h
554 vadd.u16 q12, q12, q13 627 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
555 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 628 shrn v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
556 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 629 mov v0.d[1], v1.d[0]
557 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 630 shrn v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
558 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) 631 shrn v5.4h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
559 bilinear_duplicate_mask mask_fmt, 4, d22 632 mov v2.d[0], v4.d[0]
560 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 633 mov v2.d[1], v5.d[0]
561 vmovn.u16 d0, q0 634 bilinear_load_mask mask_fmt, 4, v4
562 vmovn.u16 d1, q2 635 bilinear_duplicate_mask mask_fmt, 4, v4
563 vadd.u16 q12, q12, q13 636 ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
637 mov v31.d[0], v30.d[1]
638 xtn v0.8b, v0.8h
639 xtn v1.8b, v2.8h
640 add v12.8h, v12.8h, v13.8h
641 bilinear_load_dst dst_fmt, op, 4, v2, v3, v21
642 mov v21.d[0], v2.d[0]
643 mov v21.d[1], v3.d[0]
564 bilinear_interleave_src_dst \ 644 bilinear_interleave_src_dst \
565 mask_fmt, op, 4, d0, d1, q0, d2, d3, q1 645 mask_fmt, op, 4, v0, v1, v0, v2, v3, v1
646 mov v2.d[0], v1.d[0]
647 mov v3.d[0], v1.d[1]
648 mov v1.d[0], v0.d[1]
566 bilinear_apply_mask_to_src \ 649 bilinear_apply_mask_to_src \
567 mask_fmt, 4, d0, d1, q0, d22, \ 650 mask_fmt, 4, v0, v1, v0, v4, \
568 q3, q8, q9, q10 651 v6, v8, v9, v10
652 mov v1.d[0], v0.d[1]
569 bilinear_combine \ 653 bilinear_combine \
570 op, 4, d0, d1, q0, d2, d3, q1, \ 654 op, 4, v0, v1, v0, v2, v3, v1, \
571 q3, q8, q9, q10, d23 655 v6, v8, v9, v10, v23
572 bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0 656 mov v1.d[0], v0.d[1]
573 bilinear_store_&dst_fmt 4, q2, q3 657 bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0
658 mov v1.d[0], v0.d[1]
659 bilinear_store_&dst_fmt 4, v6, v7
574.endm 660.endm
575 661
576.set BILINEAR_FLAG_USE_MASK, 1 662.set BILINEAR_FLAG_USE_MASK, 1
577.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 663.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
578 664
579/* 665/*
580 * Main template macro for generating NEON optimized bilinear scanline functions. 666 * Main template macro for generating NEON optimized bilinear scanline functions.
581 * 667 *
 582 * The bilinear scanline generator macro takes the following arguments: 668 * The bilinear scanline generator macro takes the following arguments:
583 * fname - name of the function to generate 669 * fname - name of the function to generate
584 * src_fmt - source color format (8888 or 0565) 670 * src_fmt - source color format (8888 or 0565)
585 * dst_fmt - destination color format (8888 or 0565) 671 * dst_fmt - destination color format (8888 or 0565)
586 * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes 672 * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
 587 * process_last_pixel - code block that interpolates one pixel and does not 673 * process_last_pixel - code block that interpolates one pixel and does not
 588 * update the horizontal weight 674 * update the horizontal weight
 589 * process_two_pixels - code block that interpolates two pixels and updates 675 * process_two_pixels - code block that interpolates two pixels and updates
 590 * the horizontal weight 676 * the horizontal weight
 591 * process_four_pixels - code block that interpolates four pixels and updates 677 * process_four_pixels - code block that interpolates four pixels and updates
 592 * the horizontal weight 678 * the horizontal weight
593 * process_pixblock_head - head part of middle loop 679 * process_pixblock_head - head part of middle loop
594 * process_pixblock_tail - tail part of middle loop 680 * process_pixblock_tail - tail part of middle loop
595 * process_pixblock_tail_head - tail_head of middle loop 681 * process_pixblock_tail_head - tail_head of middle loop
596 * pixblock_size - number of pixels processed in a single middle loop 682 * pixblock_size - number of pixels processed in a single middle loop
597 * prefetch_distance - prefetch in the source image by that many pixels ahead 683 * prefetch_distance - prefetch in the source image by that many pixels ahead
598 */ 684 */
599 685
600.macro generate_bilinear_scanline_func \ 686.macro generate_bilinear_scanline_func \
601 fname, \ 687 fname, \
602 src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \ 688 src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
603 bilinear_process_last_pixel, \ 689 bilinear_process_last_pixel, \
604 bilinear_process_two_pixels, \ 690 bilinear_process_two_pixels, \
605 bilinear_process_four_pixels, \ 691 bilinear_process_four_pixels, \
606 bilinear_process_pixblock_head, \ 692 bilinear_process_pixblock_head, \
607 bilinear_process_pixblock_tail, \ 693 bilinear_process_pixblock_tail, \
608 bilinear_process_pixblock_tail_head, \ 694 bilinear_process_pixblock_tail_head, \
609 pixblock_size, \ 695 pixblock_size, \
610 prefetch_distance, \ 696 prefetch_distance, \
611 flags 697 flags
612 698
613pixman_asm_function fname 699pixman_asm_function fname
614.if pixblock_size == 8 700.if pixblock_size == 8
615.elseif pixblock_size == 4 701.elseif pixblock_size == 4
616.else 702.else
617 .error unsupported pixblock size 703 .error unsupported pixblock size
618.endif 704.endif
619 705
620.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 706.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
621 OUT .req r0 707 OUT .req x0
622 TOP .req r1 708 TOP .req x1
623 BOTTOM .req r2 709 BOTTOM .req x2
624 WT .req r3 710 WT .req x3
625 WB .req r4 711 WWT .req w3
626 X .req r5 712 WB .req x4
627 UX .req r6 713 WWB .req w4
628 WIDTH .req ip 714 X .req w5
629 TMP1 .req r3 715 UX .req w6
630 TMP2 .req r4 716 WIDTH .req x7
631 PF_OFFS .req r7 717 TMP1 .req x10
632 TMP3 .req r8 718 WTMP1 .req w10
633 TMP4 .req r9 719 TMP2 .req x11
634 STRIDE .req r2 720 WTMP2 .req w11
635 721 PF_OFFS .req x12
636 mov ip, sp 722 TMP3 .req x13
637 push {r4, r5, r6, r7, r8, r9} 723 WTMP3 .req w13
638 mov PF_OFFS, #prefetch_distance 724 TMP4 .req x14
639 ldmia ip, {WB, X, UX, WIDTH} 725 WTMP4 .req w14
726 STRIDE .req x15
727 DUMMY .req x30
728
729 stp x29, x30, [sp, -16]!
730 mov x29, sp
731 sub sp, sp, 112
732 sub x29, x29, 64
733 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
734 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
735 stp x10, x11, [x29, -80]
736 stp x12, x13, [x29, -96]
737 stp x14, x15, [x29, -112]
640.else 738.else
641 OUT .req r0 739 OUT .req x0
642 MASK .req r1 740 MASK .req x1
643 TOP .req r2 741 TOP .req x2
644 BOTTOM .req r3 742 BOTTOM .req x3
645 WT .req r4 743 WT .req x4
646 WB .req r5 744 WWT .req w4
647 X .req r6 745 WB .req x5
648 UX .req r7 746 WWB .req w5
649 WIDTH .req ip 747 X .req w6
650 TMP1 .req r4 748 UX .req w7
651 TMP2 .req r5 749 WIDTH .req x8
652 PF_OFFS .req r8 750 TMP1 .req x10
653 TMP3 .req r9 751 WTMP1 .req w10
654 TMP4 .req r10 752 TMP2 .req x11
655 STRIDE .req r3 753 WTMP2 .req w11
754 PF_OFFS .req x12
755 TMP3 .req x13
756 WTMP3 .req w13
757 TMP4 .req x14
758 WTMP4 .req w14
759 STRIDE .req x15
760 DUMMY .req x30
656 761
657 .set prefetch_offset, prefetch_distance 762 .set prefetch_offset, prefetch_distance
658 763
659 mov ip, sp 764 stp x29, x30, [sp, -16]!
660 push {r4, r5, r6, r7, r8, r9, r10, ip} 765 mov x29, sp
661 mov PF_OFFS, #prefetch_distance 766 sub x29, x29, 64
662 ldmia ip, {WT, WB, X, UX, WIDTH} 767 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
768 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
769 stp x10, x11, [x29, -80]
770 stp x12, x13, [x29, -96]
771 stp x14, x15, [x29, -112]
772 str x8, [x29, -120]
773 ldr w8, [x29, 16]
774 sub sp, sp, 120
663.endif 775.endif
664 776
665 mul PF_OFFS, PF_OFFS, UX 777 mov WTMP1, #prefetch_distance
666 778 umull PF_OFFS, WTMP1, UX
667.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
668 vpush {d8-d15}
669.endif
670 779
671 sub STRIDE, BOTTOM, TOP 780 sub STRIDE, BOTTOM, TOP
672 .unreq BOTTOM 781 .unreq BOTTOM
673 782
674 cmp WIDTH, #0 783 cmp WIDTH, #0
675 ble 3f 784 ble 300f
676 785
677 vdup.u16 q12, X 786 dup v12.8h, X
678 vdup.u16 q13, UX 787 dup v13.8h, UX
679 vdup.u8 d28, WT 788 dup v28.8b, WWT
680 vdup.u8 d29, WB 789 dup v29.8b, WWB
681 vadd.u16 d25, d25, d26 790 mov v25.d[0], v12.d[1]
791 mov v26.d[0], v13.d[0]
792 add v25.4h, v25.4h, v26.4h
793 mov v12.d[1], v25.d[0]
682 794
683 /* ensure good destination alignment */ 795 /* ensure good destination alignment */
684 cmp WIDTH, #1 796 cmp WIDTH, #1
685 blt 0f 797 blt 100f
686 tst OUT, #(1 << dst_bpp_shift) 798 tst OUT, #(1 << dst_bpp_shift)
687 beq 0f 799 beq 100f
688 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 800 ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
689 vadd.u16 q12, q12, q13 801 mov v31.d[0], v30.d[1]
802 add v12.8h, v12.8h, v13.8h
690 bilinear_process_last_pixel 803 bilinear_process_last_pixel
691 sub WIDTH, WIDTH, #1 804 sub WIDTH, WIDTH, #1
6920: 805100:
693 vadd.u16 q13, q13, q13 806 add v13.8h, v13.8h, v13.8h
694 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 807 ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
695 vadd.u16 q12, q12, q13 808 mov v31.d[0], v30.d[1]
809 add v12.8h, v12.8h, v13.8h
696 810
697 cmp WIDTH, #2 811 cmp WIDTH, #2
698 blt 0f 812 blt 100f
699 tst OUT, #(1 << (dst_bpp_shift + 1)) 813 tst OUT, #(1 << (dst_bpp_shift + 1))
700 beq 0f 814 beq 100f
701 bilinear_process_two_pixels 815 bilinear_process_two_pixels
702 sub WIDTH, WIDTH, #2 816 sub WIDTH, WIDTH, #2
7030: 817100:
704.if pixblock_size == 8 818.if pixblock_size == 8
705 cmp WIDTH, #4 819 cmp WIDTH, #4
706 blt 0f 820 blt 100f
707 tst OUT, #(1 << (dst_bpp_shift + 2)) 821 tst OUT, #(1 << (dst_bpp_shift + 2))
708 beq 0f 822 beq 100f
709 bilinear_process_four_pixels 823 bilinear_process_four_pixels
710 sub WIDTH, WIDTH, #4 824 sub WIDTH, WIDTH, #4
7110: 825100:
712.endif 826.endif
713 subs WIDTH, WIDTH, #pixblock_size 827 subs WIDTH, WIDTH, #pixblock_size
714 blt 1f 828 blt 100f
715 asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) 829 asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
716 bilinear_process_pixblock_head 830 bilinear_process_pixblock_head
717 subs WIDTH, WIDTH, #pixblock_size 831 subs WIDTH, WIDTH, #pixblock_size
718 blt 5f 832 blt 500f
7190: 8330:
720 bilinear_process_pixblock_tail_head 834 bilinear_process_pixblock_tail_head
721 subs WIDTH, WIDTH, #pixblock_size 835 subs WIDTH, WIDTH, #pixblock_size
722 bge 0b 836 bge 0b
7235: 837500:
724 bilinear_process_pixblock_tail 838 bilinear_process_pixblock_tail
7251: 839100:
726.if pixblock_size == 8 840.if pixblock_size == 8
727 tst WIDTH, #4 841 tst WIDTH, #4
728 beq 2f 842 beq 200f
729 bilinear_process_four_pixels 843 bilinear_process_four_pixels
7302: 844200:
731.endif 845.endif
732 /* handle the remaining trailing pixels */ 846 /* handle the remaining trailing pixels */
733 tst WIDTH, #2 847 tst WIDTH, #2
734 beq 2f 848 beq 200f
735 bilinear_process_two_pixels 849 bilinear_process_two_pixels
7362: 850200:
737 tst WIDTH, #1 851 tst WIDTH, #1
738 beq 3f 852 beq 300f
739 bilinear_process_last_pixel 853 bilinear_process_last_pixel
7403: 854300:
741.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
742 vpop {d8-d15}
743.endif
744 855
745.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 856.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
746 pop {r4, r5, r6, r7, r8, r9} 857 sub x29, x29, 64
858 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
859 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
860 ldp x10, x11, [x29, -80]
861 ldp x12, x13, [x29, -96]
862 ldp x14, x15, [x29, -112]
863 mov sp, x29
864 ldp x29, x30, [sp], 16
747.else 865.else
748 pop {r4, r5, r6, r7, r8, r9, r10, ip} 866 sub x29, x29, 64
867 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
868 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
869 ldp x10, x11, [x29, -80]
870 ldp x12, x13, [x29, -96]
871 ldp x14, x15, [x29, -112]
872 ldr x8, [x29, -120]
873 mov sp, x29
874 ldp x29, x30, [sp], 16
749.endif 875.endif
750 bx lr 876 ret
751 877
752 .unreq OUT 878 .unreq OUT
753 .unreq TOP 879 .unreq TOP
754 .unreq WT 880 .unreq WT
881 .unreq WWT
755 .unreq WB 882 .unreq WB
883 .unreq WWB
756 .unreq X 884 .unreq X
757 .unreq UX 885 .unreq UX
758 .unreq WIDTH 886 .unreq WIDTH
759 .unreq TMP1 887 .unreq TMP1
888 .unreq WTMP1
760 .unreq TMP2 889 .unreq TMP2
761 .unreq PF_OFFS 890 .unreq PF_OFFS
762 .unreq TMP3 891 .unreq TMP3
763 .unreq TMP4 892 .unreq TMP4
764 .unreq STRIDE 893 .unreq STRIDE
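The generate_bilinear_scanline_func template wires the pixblock head, tail and tail_head macros into a software-pipelined main loop (the subs/blt sequence around labels 0b and 500f above): the tail of one pixel block is interleaved with the head of the next so that the loads for the upcoming block hide the arithmetic latency of the previous one. Its overall shape, as a stand-alone C sketch with placeholder block functions (not pixman code; in the assembly the two halves of tail_head are interleaved rather than run back to back):

#include <stdio.h>

static void head (int i)      { printf ("start block %d (loads, first math)\n", i); }
static void tail (int i)      { printf ("finish block %d (last math, store)\n", i); }

/* process_pixblock_tail_head: finish block i-1 while starting block i */
static void tail_head (int i) { tail (i - 1); head (i); }

static void
scanline (int nblocks)
{
    if (nblocks <= 0)               /* subs WIDTH, ... ; blt                */
        return;
    head (0);                       /* bilinear_process_pixblock_head      */
    for (int i = 1; i < nblocks; i++)
        tail_head (i);              /* bilinear_process_pixblock_tail_head */
    tail (nblocks - 1);             /* bilinear_process_pixblock_tail      */
}

int main (void) { scanline (3); return 0; }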
@@ -882,162 +1011,188 @@ pixman_asm_function fname
882.macro bilinear_over_8888_8888_process_four_pixels 1011.macro bilinear_over_8888_8888_process_four_pixels
883 bilinear_interpolate_four_pixels 8888, x, 8888, over 1012 bilinear_interpolate_four_pixels 8888, x, 8888, over
884.endm 1013.endm
885 1014
886.macro bilinear_over_8888_8888_process_pixblock_head 1015.macro bilinear_over_8888_8888_process_pixblock_head
887 asr TMP1, X, #16 1016 asr WTMP1, X, #16
888 add X, X, UX 1017 add X, X, UX
889 add TMP1, TOP, TMP1, lsl #2 1018 lsl TMP2, TMP1, #2
890 asr TMP2, X, #16 1019 add TMP1, TOP, TMP2
1020 asr WTMP2, X, #16
891 add X, X, UX 1021 add X, X, UX
892 add TMP2, TOP, TMP2, lsl #2 1022 lsl TMP3, TMP2, #2
1023 add TMP2, TOP, TMP3
893 1024
894 vld1.32 {d22}, [TMP1], STRIDE 1025 ld1 {v22.2s}, [TMP1], STRIDE
895 vld1.32 {d23}, [TMP1] 1026 ld1 {v23.2s}, [TMP1]
896 asr TMP3, X, #16 1027 asr WTMP3, X, #16
897 add X, X, UX 1028 add X, X, UX
898 add TMP3, TOP, TMP3, lsl #2 1029 lsl TMP4, TMP3, #2
899 vmull.u8 q8, d22, d28 1030 add TMP3, TOP, TMP4
900 vmlal.u8 q8, d23, d29 1031 umull v16.8h, v22.8b, v28.8b
901 1032 umlal v16.8h, v23.8b, v29.8b
902 vld1.32 {d22}, [TMP2], STRIDE 1033 mov v17.d[0], v16.d[1]
903 vld1.32 {d23}, [TMP2] 1034
904 asr TMP4, X, #16 1035 ld1 {v22.2s}, [TMP2], STRIDE
1036 ld1 {v23.2s}, [TMP2]
1037 asr WTMP4, X, #16
905 add X, X, UX 1038 add X, X, UX
906 add TMP4, TOP, TMP4, lsl #2 1039 lsl TMP1, TMP4, #2
907 vmull.u8 q9, d22, d28 1040 add TMP4, TOP, TMP1
908 vmlal.u8 q9, d23, d29 1041 umull v18.8h, v22.8b, v28.8b
909 1042 umlal v18.8h, v23.8b, v29.8b
910 vld1.32 {d22}, [TMP3], STRIDE 1043 mov v19.d[0], v18.d[1]
911 vld1.32 {d23}, [TMP3] 1044
912 vmull.u8 q10, d22, d28 1045 ld1 {v22.2s}, [TMP3], STRIDE
913 vmlal.u8 q10, d23, d29 1046 ld1 {v23.2s}, [TMP3]
914 1047 umull v20.8h, v22.8b, v28.8b
915 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 1048 umlal v20.8h, v23.8b, v29.8b
916 vmlsl.u16 q0, d16, d30 1049 mov v21.d[0], v20.d[1]
917 vmlal.u16 q0, d17, d30 1050
918 1051 ushll v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS
919 pld [TMP4, PF_OFFS] 1052 umlsl v0.4s, v16.4h, v30.4h
920 vld1.32 {d16}, [TMP4], STRIDE 1053 umlal v0.4s, v17.4h, v30.4h
921 vld1.32 {d17}, [TMP4] 1054
922 pld [TMP4, PF_OFFS] 1055 prfm PLDL2STRM, [TMP4, PF_OFFS]
923 vmull.u8 q11, d16, d28 1056 ld1 {v16.2s}, [TMP4], STRIDE
924 vmlal.u8 q11, d17, d29 1057 ld1 {v17.2s}, [TMP4]
925 1058 prfm PLDL2STRM, [TMP4, PF_OFFS]
926 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 1059 umull v22.8h, v16.8b, v28.8b
927 vmlsl.u16 q1, d18, d31 1060 umlal v22.8h, v17.8b, v29.8b
928 vmlal.u16 q1, d19, d31 1061 mov v23.d[0], v22.d[1]
929 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 1062
930 vadd.u16 q12, q12, q13 1063 ushll v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
1064 umlsl v1.4s, v18.4h, v31.4h
1065 umlal v1.4s, v19.4h, v31.4h
1066 ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
1067 mov v31.d[0], v30.d[1]
1068 add v12.8h, v12.8h, v13.8h
931.endm 1069.endm
932 1070
933.macro bilinear_over_8888_8888_process_pixblock_tail 1071.macro bilinear_over_8888_8888_process_pixblock_tail
934 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 1072 ushll v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS
935 vmlsl.u16 q2, d20, d30 1073 umlsl v2.4s, v20.4h, v30.4h
936 vmlal.u16 q2, d21, d30 1074 umlal v2.4s, v21.4h, v30.4h
937 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 1075 ushll v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
938 vmlsl.u16 q3, d22, d31 1076 umlsl v3.4s, v22.4h, v31.4h
939 vmlal.u16 q3, d23, d31 1077 umlal v3.4s, v23.4h, v31.4h
940 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 1078 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
941 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 1079 shrn v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
942 vld1.32 {d2, d3}, [OUT, :128] 1080 mov v0.d[1], v1.d[0]
943 pld [OUT, #(prefetch_offset * 4)] 1081 ld1 {v22.2s, v23.2s}, [OUT]
944 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 1082 prfm PLDL2STRM, [OUT, #(prefetch_offset * 4)]
945 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 1083 shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
946 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 1084 ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
947 vmovn.u16 d6, q0 1085 mov v31.d[0], v30.d[1]
948 vmovn.u16 d7, q2 1086 shrn v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
949 vuzp.8 d6, d7 1087 mov v2.d[1], v5.d[0]
950 vuzp.8 d2, d3 1088 xtn v6.8b, v0.8h
951 vuzp.8 d6, d7 1089 xtn v7.8b, v2.8h
952 vuzp.8 d2, d3 1090 vuzp v6.8b, v7.8b
953 vdup.32 d4, d7[1] 1091 vuzp v22.8b, v23.8b
954 vmvn.8 d4, d4 1092 vuzp v6.8b, v7.8b
955 vmull.u8 q11, d2, d4 1093 vuzp v22.8b, v23.8b
956 vmull.u8 q2, d3, d4 1094 dup v4.2s, v7.s[1]
957 vrshr.u16 q1, q11, #8 1095 mvn v4.8b, v4.8b
958 vrshr.u16 q10, q2, #8 1096 umull v11.8h, v22.8b, v4.8b
959 vraddhn.u16 d2, q1, q11 1097 umull v2.8h, v23.8b, v4.8b
960 vraddhn.u16 d3, q10, q2 1098 urshr v1.8h, v11.8h, #8
961 vqadd.u8 q3, q1, q3 1099 urshr v10.8h, v2.8h, #8
962 vuzp.8 d6, d7 1100 raddhn v3.8b, v10.8h, v2.8h
963 vuzp.8 d6, d7 1101 raddhn v2.8b, v1.8h, v11.8h
964 vadd.u16 q12, q12, q13 1102 uqadd v6.8b, v2.8b, v6.8b
965 vst1.32 {d6, d7}, [OUT, :128]! 1103 uqadd v7.8b, v3.8b, v7.8b
1104 vuzp v6.8b, v7.8b
1105 vuzp v6.8b, v7.8b
1106 add v12.8h, v12.8h, v13.8h
1107 st1 {v6.2s, v7.2s}, [OUT], #16
966.endm 1108.endm
967 1109
968.macro bilinear_over_8888_8888_process_pixblock_tail_head 1110.macro bilinear_over_8888_8888_process_pixblock_tail_head
969 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 1111 ushll v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS
970 asr TMP1, X, #16 1112 asr WTMP1, X, #16
971 add X, X, UX 1113 add X, X, UX
972 add TMP1, TOP, TMP1, lsl #2 1114 lsl TMP2, TMP1, #2
973 vmlsl.u16 q2, d20, d30 1115 add TMP1, TOP, TMP2
974 asr TMP2, X, #16 1116 umlsl v2.4s, v20.4h, v30.4h
1117 asr WTMP2, X, #16
975 add X, X, UX 1118 add X, X, UX
976 add TMP2, TOP, TMP2, lsl #2 1119 lsl TMP3, TMP2, #2
977 vmlal.u16 q2, d21, d30 1120 add TMP2, TOP, TMP3
978 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 1121 umlal v2.4s, v21.4h, v30.4h
979 vld1.32 {d20}, [TMP1], STRIDE 1122 ushll v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
980 vmlsl.u16 q3, d22, d31 1123 ld1 {v20.2s}, [TMP1], STRIDE
981 vmlal.u16 q3, d23, d31 1124 umlsl v3.4s, v22.4h, v31.4h
982 vld1.32 {d21}, [TMP1] 1125 umlal v3.4s, v23.4h, v31.4h
983 vmull.u8 q8, d20, d28 1126 ld1 {v21.2s}, [TMP1]
984 vmlal.u8 q8, d21, d29 1127 umull v16.8h, v20.8b, v28.8b
985 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 1128 umlal v16.8h, v21.8b, v29.8b
986 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 1129 mov v17.d[0], v16.d[1]
987 vld1.32 {d2, d3}, [OUT, :128] 1130 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
988 pld [OUT, PF_OFFS] 1131 shrn v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
989 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 1132 mov v0.d[1], v1.d[0]
990 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 1133 shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
991 vld1.32 {d22}, [TMP2], STRIDE 1134 ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
992 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 1135 mov v31.d[0], v30.d[1]
993 vmovn.u16 d6, q0 1136 ld1 {v22.2s}, [TMP2], STRIDE
994 vld1.32 {d23}, [TMP2] 1137 shrn v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
995 vmull.u8 q9, d22, d28 1138 mov v2.d[1], v5.d[0]
996 asr TMP3, X, #16 1139 xtn v6.8b, v0.8h
1140 ld1 {v23.2s}, [TMP2]
1141 umull v18.8h, v22.8b, v28.8b
1142 asr WTMP3, X, #16
997 add X, X, UX 1143 add X, X, UX
998 add TMP3, TOP, TMP3, lsl #2 1144 lsl TMP4, TMP3, #2
999 asr TMP4, X, #16 1145 add TMP3, TOP, TMP4
1146 asr WTMP4, X, #16
1000 add X, X, UX 1147 add X, X, UX
1001 add TMP4, TOP, TMP4, lsl #2 1148 lsl TMP1, TMP4, #2
1002 vmlal.u8 q9, d23, d29 1149 add TMP4, TOP, TMP1
1003 vmovn.u16 d7, q2 1150 umlal v18.8h, v23.8b, v29.8b
1004 vld1.32 {d22}, [TMP3], STRIDE 1151 mov v19.d[0], v18.d[1]
1005 vuzp.8 d6, d7 1152 xtn v7.8b, v2.8h
1006 vuzp.8 d2, d3 1153 ld1 {v2.2s, v3.2s}, [OUT]
1007 vuzp.8 d6, d7 1154 prfm PLDL2STRM, [OUT, PF_OFFS]
1008 vuzp.8 d2, d3 1155 ld1 {v22.2s}, [TMP3], STRIDE
1009 vdup.32 d4, d7[1] 1156 vuzp v6.8b, v7.8b
1010 vld1.32 {d23}, [TMP3] 1157 vuzp v2.8b, v3.8b
1011 vmvn.8 d4, d4 1158 vuzp v6.8b, v7.8b
1012 vmull.u8 q10, d22, d28 1159 vuzp v2.8b, v3.8b
1013 vmlal.u8 q10, d23, d29 1160 dup v4.2s, v7.s[1]
1014 vmull.u8 q11, d2, d4 1161 ld1 {v23.2s}, [TMP3]
1015 vmull.u8 q2, d3, d4 1162 mvn v4.8b, v4.8b
1016 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 1163 umull v20.8h, v22.8b, v28.8b
1017 vmlsl.u16 q0, d16, d30 1164 umlal v20.8h, v23.8b, v29.8b
1018 vrshr.u16 q1, q11, #8 1165 umull v11.8h, v2.8b, v4.8b
1019 vmlal.u16 q0, d17, d30 1166 umull v2.8h, v3.8b, v4.8b
1020 vrshr.u16 q8, q2, #8 1167 mov v21.d[0], v20.d[1]
1021 vraddhn.u16 d2, q1, q11 1168 ushll v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS
1022 vraddhn.u16 d3, q8, q2 1169 umlsl v0.4s, v16.4h, v30.4h
1023 pld [TMP4, PF_OFFS] 1170 urshr v1.8h, v11.8h, #8
1024 vld1.32 {d16}, [TMP4], STRIDE 1171 umlal v0.4s, v17.4h, v30.4h
1025 vqadd.u8 q3, q1, q3 1172 urshr v8.8h, v2.8h, #8
1026 vld1.32 {d17}, [TMP4] 1173 raddhn v3.8b, v8.8h, v2.8h
1027 pld [TMP4, PF_OFFS] 1174 raddhn v2.8b, v1.8h, v11.8h
1028 vmull.u8 q11, d16, d28 1175 prfm PLDL2STRM, [TMP4, PF_OFFS]
1029 vmlal.u8 q11, d17, d29 1176 ld1 {v16.2s}, [TMP4], STRIDE
1030 vuzp.8 d6, d7 1177 uqadd v6.8b, v2.8b, v6.8b
1031 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 1178 uqadd v7.8b, v3.8b, v7.8b
1032 vuzp.8 d6, d7 1179 ld1 {v17.2s}, [TMP4]
1033 vmlsl.u16 q1, d18, d31 1180 prfm PLDL2STRM, [TMP4, PF_OFFS]
1034 vadd.u16 q12, q12, q13 1181 umull v22.8h, v16.8b, v28.8b
1035 vmlal.u16 q1, d19, d31 1182 umlal v22.8h, v17.8b, v29.8b
1036 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 1183 mov v23.d[0], v22.d[1]
1037 vadd.u16 q12, q12, q13 1184 vuzp v6.8b, v7.8b
1038 vst1.32 {d6, d7}, [OUT, :128]! 1185 ushll v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
1186 vuzp v6.8b, v7.8b
1187 umlsl v1.4s, v18.4h, v31.4h
1188 add v12.8h, v12.8h, v13.8h
1189 umlal v1.4s, v19.4h, v31.4h
1190 ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
1191 mov v31.d[0], v30.d[1]
1192 add v12.8h, v12.8h, v13.8h
1193 st1 {v6.2s, v7.2s}, [OUT], #16
1039.endm 1194.endm
1040 1195
1041/* over_8888_8_8888 */ 1196/* over_8888_8_8888 */
1042.macro bilinear_over_8888_8_8888_process_last_pixel 1197.macro bilinear_over_8888_8_8888_process_last_pixel
1043 bilinear_interpolate_last_pixel 8888, 8, 8888, over 1198 bilinear_interpolate_last_pixel 8888, 8, 8888, over
@@ -1046,177 +1201,24 @@ pixman_asm_function fname
1046.macro bilinear_over_8888_8_8888_process_two_pixels 1201.macro bilinear_over_8888_8_8888_process_two_pixels
1047 bilinear_interpolate_two_pixels 8888, 8, 8888, over 1202 bilinear_interpolate_two_pixels 8888, 8, 8888, over
1048.endm 1203.endm
1049 1204
1050.macro bilinear_over_8888_8_8888_process_four_pixels 1205.macro bilinear_over_8888_8_8888_process_four_pixels
1051 bilinear_interpolate_four_pixels 8888, 8, 8888, over 1206 bilinear_interpolate_two_pixels 8888, 8, 8888, over
1207 bilinear_interpolate_two_pixels 8888, 8, 8888, over
1052.endm 1208.endm
1053 1209
1054.macro bilinear_over_8888_8_8888_process_pixblock_head 1210.macro bilinear_over_8888_8_8888_process_pixblock_head
1055 asr TMP1, X, #16 1211 bilinear_over_8888_8_8888_process_four_pixels
1056 add X, X, UX
1057 add TMP1, TOP, TMP1, lsl #2
1058 vld1.32 {d0}, [TMP1], STRIDE
1059 asr TMP2, X, #16
1060 add X, X, UX
1061 add TMP2, TOP, TMP2, lsl #2
1062 vld1.32 {d1}, [TMP1]
1063 asr TMP3, X, #16
1064 add X, X, UX
1065 add TMP3, TOP, TMP3, lsl #2
1066 vld1.32 {d2}, [TMP2], STRIDE
1067 asr TMP4, X, #16
1068 add X, X, UX
1069 add TMP4, TOP, TMP4, lsl #2
1070 vld1.32 {d3}, [TMP2]
1071 vmull.u8 q2, d0, d28
1072 vmull.u8 q3, d2, d28
1073 vmlal.u8 q2, d1, d29
1074 vmlal.u8 q3, d3, d29
1075 vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
1076 vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
1077 vmlsl.u16 q0, d4, d30
1078 vmlsl.u16 q1, d6, d31
1079 vmlal.u16 q0, d5, d30
1080 vmlal.u16 q1, d7, d31
1081 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1082 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
1083 vld1.32 {d2}, [TMP3], STRIDE
1084 vld1.32 {d3}, [TMP3]
1085 pld [TMP4, PF_OFFS]
1086 vld1.32 {d4}, [TMP4], STRIDE
1087 vld1.32 {d5}, [TMP4]
1088 pld [TMP4, PF_OFFS]
1089 vmull.u8 q3, d2, d28
1090 vmlal.u8 q3, d3, d29
1091 vmull.u8 q1, d4, d28
1092 vmlal.u8 q1, d5, d29
1093 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1094 vld1.32 {d22[0]}, [MASK]!
1095 pld [MASK, #prefetch_offset]
1096 vadd.u16 q12, q12, q13
1097 vmovn.u16 d16, q0
1098.endm 1212.endm
1099 1213
1100.macro bilinear_over_8888_8_8888_process_pixblock_tail 1214.macro bilinear_over_8888_8_8888_process_pixblock_tail
1101 vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
1102 vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
1103 vmlsl.u16 q9, d6, d30
1104 vmlsl.u16 q10, d2, d31
1105 vmlal.u16 q9, d7, d30
1106 vmlal.u16 q10, d3, d31
1107 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1108 vadd.u16 q12, q12, q13
1109 vdup.32 d22, d22[0]
1110 vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
1111 vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
1112 vmovn.u16 d17, q9
1113 vld1.32 {d18, d19}, [OUT, :128]
1114 pld [OUT, PF_OFFS]
1115 vuzp.8 d16, d17
1116 vuzp.8 d18, d19
1117 vuzp.8 d16, d17
1118 vuzp.8 d18, d19
1119 vmull.u8 q10, d16, d22
1120 vmull.u8 q11, d17, d22
1121 vrsra.u16 q10, q10, #8
1122 vrsra.u16 q11, q11, #8
1123 vrshrn.u16 d16, q10, #8
1124 vrshrn.u16 d17, q11, #8
1125 vdup.32 d22, d17[1]
1126 vmvn.8 d22, d22
1127 vmull.u8 q10, d18, d22
1128 vmull.u8 q11, d19, d22
1129 vrshr.u16 q9, q10, #8
1130 vrshr.u16 q0, q11, #8
1131 vraddhn.u16 d18, q9, q10
1132 vraddhn.u16 d19, q0, q11
1133 vqadd.u8 q9, q8, q9
1134 vuzp.8 d18, d19
1135 vuzp.8 d18, d19
1136 vst1.32 {d18, d19}, [OUT, :128]!
1137.endm 1215.endm
1138 1216
1139.macro bilinear_over_8888_8_8888_process_pixblock_tail_head 1217.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
1140 vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS 1218 bilinear_over_8888_8_8888_process_pixblock_tail
1141 asr TMP1, X, #16 1219 bilinear_over_8888_8_8888_process_pixblock_head
1142 add X, X, UX
1143 add TMP1, TOP, TMP1, lsl #2
1144 vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
1145 vld1.32 {d0}, [TMP1], STRIDE
1146 asr TMP2, X, #16
1147 add X, X, UX
1148 add TMP2, TOP, TMP2, lsl #2
1149 vmlsl.u16 q9, d6, d30
1150 vmlsl.u16 q10, d2, d31
1151 vld1.32 {d1}, [TMP1]
1152 asr TMP3, X, #16
1153 add X, X, UX
1154 add TMP3, TOP, TMP3, lsl #2
1155 vmlal.u16 q9, d7, d30
1156 vmlal.u16 q10, d3, d31
1157 vld1.32 {d2}, [TMP2], STRIDE
1158 asr TMP4, X, #16
1159 add X, X, UX
1160 add TMP4, TOP, TMP4, lsl #2
1161 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1162 vadd.u16 q12, q12, q13
1163 vld1.32 {d3}, [TMP2]
1164 vdup.32 d22, d22[0]
1165 vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
1166 vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
1167 vmull.u8 q2, d0, d28
1168 vmull.u8 q3, d2, d28
1169 vmovn.u16 d17, q9
1170 vld1.32 {d18, d19}, [OUT, :128]
1171 pld [OUT, #(prefetch_offset * 4)]
1172 vmlal.u8 q2, d1, d29
1173 vmlal.u8 q3, d3, d29
1174 vuzp.8 d16, d17
1175 vuzp.8 d18, d19
1176 vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
1177 vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
1178 vuzp.8 d16, d17
1179 vuzp.8 d18, d19
1180 vmlsl.u16 q0, d4, d30
1181 vmlsl.u16 q1, d6, d31
1182 vmull.u8 q10, d16, d22
1183 vmull.u8 q11, d17, d22
1184 vmlal.u16 q0, d5, d30
1185 vmlal.u16 q1, d7, d31
1186 vrsra.u16 q10, q10, #8
1187 vrsra.u16 q11, q11, #8
1188 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1189 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
1190 vrshrn.u16 d16, q10, #8
1191 vrshrn.u16 d17, q11, #8
1192 vld1.32 {d2}, [TMP3], STRIDE
1193 vdup.32 d22, d17[1]
1194 vld1.32 {d3}, [TMP3]
1195 vmvn.8 d22, d22
1196 pld [TMP4, PF_OFFS]
1197 vld1.32 {d4}, [TMP4], STRIDE
1198 vmull.u8 q10, d18, d22
1199 vmull.u8 q11, d19, d22
1200 vld1.32 {d5}, [TMP4]
1201 pld [TMP4, PF_OFFS]
1202 vmull.u8 q3, d2, d28
1203 vrshr.u16 q9, q10, #8
1204 vrshr.u16 q15, q11, #8
1205 vmlal.u8 q3, d3, d29
1206 vmull.u8 q1, d4, d28
1207 vraddhn.u16 d18, q9, q10
1208 vraddhn.u16 d19, q15, q11
1209 vmlal.u8 q1, d5, d29
1210 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1211 vqadd.u8 q9, q8, q9
1212 vld1.32 {d22[0]}, [MASK]!
1213 vuzp.8 d18, d19
1214 vadd.u16 q12, q12, q13
1215 vuzp.8 d18, d19
1216 vmovn.u16 d16, q0
1217 vst1.32 {d18, d19}, [OUT, :128]!
1218.endm 1220.endm
1219 1221
1220/* add_8888_8888 */ 1222/* add_8888_8888 */
1221.macro bilinear_add_8888_8888_process_last_pixel 1223.macro bilinear_add_8888_8888_process_last_pixel
1222 bilinear_interpolate_last_pixel 8888, x, 8888, add 1224 bilinear_interpolate_last_pixel 8888, x, 8888, add
@@ -1225,11 +1227,12 @@ pixman_asm_function fname
1225.macro bilinear_add_8888_8888_process_two_pixels 1227.macro bilinear_add_8888_8888_process_two_pixels
1226 bilinear_interpolate_two_pixels 8888, x, 8888, add 1228 bilinear_interpolate_two_pixels 8888, x, 8888, add
1227.endm 1229.endm
1228 1230
1229.macro bilinear_add_8888_8888_process_four_pixels 1231.macro bilinear_add_8888_8888_process_four_pixels
1230 bilinear_interpolate_four_pixels 8888, x, 8888, add 1232 bilinear_interpolate_two_pixels 8888, x, 8888, add
1233 bilinear_interpolate_two_pixels 8888, x, 8888, add
1231.endm 1234.endm
1232 1235
1233.macro bilinear_add_8888_8888_process_pixblock_head 1236.macro bilinear_add_8888_8888_process_pixblock_head
1234 bilinear_add_8888_8888_process_four_pixels 1237 bilinear_add_8888_8888_process_four_pixels
1235.endm 1238.endm
diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
index 059b285..2b08766 100644
--- a/pixman/pixman-arma64-neon-asm.S
+++ b/pixman/pixman-arma64-neon-asm.S
@@ -37,23 +37,19 @@
37/* Prevent the stack from becoming executable for no reason... */ 37/* Prevent the stack from becoming executable for no reason... */
38#if defined(__linux__) && defined(__ELF__) 38#if defined(__linux__) && defined(__ELF__)
39.section .note.GNU-stack,"",%progbits 39.section .note.GNU-stack,"",%progbits
40#endif 40#endif
41 41
42 .text 42.text
43 .fpu neon 43.arch armv8-a
44 .arch armv7a 44
45 .object_arch armv4 45.altmacro
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ 46.p2align 2
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
48 .arm
49 .altmacro
50 .p2align 2
51 47
52#include "pixman-private.h" 48#include "pixman-private.h"
53#include "pixman-arm-asm.h" 49#include "pixman-arm-asm.h"
54#include "pixman-arm-neon-asm.h" 50#include "pixman-arma64-neon-asm.h"
55 51
56/* Global configuration options and preferences */ 52/* Global configuration options and preferences */
57 53
58/* 54/*
59 * The code can optionally make use of unaligned memory accesses to improve 55 * The code can optionally make use of unaligned memory accesses to improve
@@ -78,11 +74,11 @@
78 * instructions do not add (many) extra cycles, but improve prefetch efficiency) 74 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
79 * 75 *
80 * Note: some types of function can't support advanced prefetch and fallback 76 * Note: some types of function can't support advanced prefetch and fallback
81 * to simple one (those which handle 24bpp pixels) 77 * to simple one (those which handle 24bpp pixels)
82 */ 78 */
83.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED 79.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_NONE
84 80
85/* Prefetch distance in pixels for simple prefetch */ 81/* Prefetch distance in pixels for simple prefetch */
86.set PREFETCH_DISTANCE_SIMPLE, 64 82.set PREFETCH_DISTANCE_SIMPLE, 64
87 83
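For reference, the "simple" prefetch mode described above amounts to hinting the cache a fixed number of pixels ahead on the current scanline (the A64 port defaults PREFETCH_TYPE_DEFAULT to NONE, so this only applies when prefetching is enabled). A minimal C sketch, illustrative only; the function name and the use of __builtin_prefetch are assumptions, not part of the patch:

    #include <stdint.h>

    #define PREFETCH_DISTANCE_SIMPLE 64

    /* Hint the cache about the pixel PREFETCH_DISTANCE_SIMPLE ahead
     * of the one currently being processed on this scanline. */
    static inline void simple_prefetch(const uint32_t *src, int x, int width)
    {
        if (x + PREFETCH_DISTANCE_SIMPLE < width)
            __builtin_prefetch(src + x + PREFETCH_DISTANCE_SIMPLE, 0, 1);
    }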
88/* 84/*
@@ -99,94 +95,102 @@
99 * code for memory reading and writing (including quite tricky cases of 95 * code for memory reading and writing (including quite tricky cases of
100 * handling unaligned leading/trailing pixels), so we only need to deal with 96 * handling unaligned leading/trailing pixels), so we only need to deal with
101 * the data in NEON registers. 97 * the data in NEON registers.
102 * 98 *
103 * NEON registers allocation in general is recommended to be the following: 99 * NEON registers allocation in general is recommended to be the following:
104 * d0, d1, d2, d3 - contain loaded source pixel data 100 * v0, v1, v2, v3 - contain loaded source pixel data
105 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) 101 * v4, v5, v6, v7 - contain loaded destination pixels (if they are needed)
106 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used) 102 * v24, v25, v26, v27 - contain loading mask pixel data (if mask is used)
107 * d28, d29, d30, d31 - place for storing the result (destination pixels) 103 * v28, v29, v30, v31 - place for storing the result (destination pixels)
108 * 104 *
109 * As can be seen above, four 64-bit NEON registers are used for keeping 105 * As can be seen above, four 64-bit NEON registers are used for keeping
110 * intermediate pixel data and up to 8 pixels can be processed in one step 106 * intermediate pixel data and up to 8 pixels can be processed in one step
111 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). 107 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
112 * 108 *
113 * This particular function uses the following registers allocation: 109 * This particular function uses the following registers allocation:
114 * d0, d1, d2, d3 - contain loaded source pixel data 110 * v0, v1, v2, v3 - contain loaded source pixel data
115 * d4, d5 - contain loaded destination pixels (they are needed) 111 * v4, v5 - contain loaded destination pixels (they are needed)
116 * d28, d29 - place for storing the result (destination pixels) 112 * v28, v29 - place for storing the result (destination pixels)
117 */ 113 */
118 114
119/* 115/*
120 * Step one. We need to have some code to do some arithmetics on pixel data. 116 * Step one. We need to have some code to do some arithmetics on pixel data.
121 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used 117 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
122 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, 118 * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5},
123 * perform all the needed calculations and write the result to {d28, d29}. 119 * perform all the needed calculations and write the result to {v28, v29}.
124 * The rationale for having two macros and not just one will be explained 120 * The rationale for having two macros and not just one will be explained
125 * later. In practice, any single monolithic function which does the work can 121 * later. In practice, any single monolithic function which does the work can
126 * be split into two parts in any arbitrary way without affecting correctness. 122 * be split into two parts in any arbitrary way without affecting correctness.
127 * 123 *
128 * There is one special trick here too. Common template macro can optionally 124 * There is one special trick here too. Common template macro can optionally
129 * make our life a bit easier by doing R, G, B, A color components 125 * make our life a bit easier by doing R, G, B, A color components
130 * deinterleaving for 32bpp pixel formats (and this feature is used in 126 * deinterleaving for 32bpp pixel formats (and this feature is used in
131 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that 127 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
132 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we 128 * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we
133 * actually use d0 register for blue channel (a vector of eight 8-bit 129 * actually use v0 register for blue channel (a vector of eight 8-bit
134 * values), d1 register for green, d2 for red and d3 for alpha. This 130 * values), v1 register for green, v2 for red and v3 for alpha. This
135 * simple conversion can be also done with a few NEON instructions: 131 * simple conversion can be also done with a few NEON instructions:
136 * 132 *
137 * Packed to planar conversion: 133 * Packed to planar conversion: // vuzp8 is a wrapper macro
138 * vuzp.8 d0, d1 134 * vuzp8 v0, v1
139 * vuzp.8 d2, d3 135 * vuzp8 v2, v3
140 * vuzp.8 d1, d3 136 * vuzp8 v1, v3
141 * vuzp.8 d0, d2 137 * vuzp8 v0, v2
142 * 138 *
143 * Planar to packed conversion: 139 * Planar to packed conversion: // vzip8 is a wrapper macro
144 * vzip.8 d0, d2 140 * vzip8 v0, v2
145 * vzip.8 d1, d3 141 * vzip8 v1, v3
146 * vzip.8 d2, d3 142 * vzip8 v2, v3
147 * vzip.8 d0, d1 143 * vzip8 v0, v1
148 * 144 *
149 * But pixel can be loaded directly in planar format using VLD4.8 NEON 145 * But pixel can be loaded directly in planar format using LD4 / b NEON
150 * instruction. It is 1 cycle slower than VLD1.32, so this is not always 146 * instruction. It is 1 cycle slower than LD1 / s, so this is not always
151 * desirable, that's why deinterleaving is optional. 147 * desirable, that's why deinterleaving is optional.
152 * 148 *
153 * But anyway, here is the code: 149 * But anyway, here is the code:
154 */ 150 */
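As a scalar reference for what the deinterleaved (planar) layout looks like, here is a C sketch, not part of the patch; the function and variable names are illustrative, and a little-endian a8r8g8b8 layout (byte 0 = blue) is assumed, matching the v0..v3 channel assignment described above:

    #include <stdint.h>

    /* Packed to planar: split 8 packed a8r8g8b8 pixels into per-channel
     * arrays, mirroring what the vuzp8 sequence (or an LD4-style load)
     * leaves in v0..v3. */
    static void deinterleave_8888(const uint32_t src[8],
                                  uint8_t b[8], uint8_t g[8],
                                  uint8_t r[8], uint8_t a[8])
    {
        for (int i = 0; i < 8; i++) {
            b[i] = (uint8_t)(src[i]);        /* byte 0 -> v0 (blue)  */
            g[i] = (uint8_t)(src[i] >> 8);   /* byte 1 -> v1 (green) */
            r[i] = (uint8_t)(src[i] >> 16);  /* byte 2 -> v2 (red)   */
            a[i] = (uint8_t)(src[i] >> 24);  /* byte 3 -> v3 (alpha) */
        }
    }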
151
155.macro pixman_composite_over_8888_0565_process_pixblock_head 152.macro pixman_composite_over_8888_0565_process_pixblock_head
156 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format 153 /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
157 and put data into d6 - red, d7 - green, d30 - blue */ 154 and put data into v6 - red, v7 - green, v30 - blue */
158 vshrn.u16 d6, q2, #8 155 mov v4.d[1], v5.d[0]
159 vshrn.u16 d7, q2, #3 156 shrn v6.8b, v4.8h, #8
160 vsli.u16 q2, q2, #5 157 shrn v7.8b, v4.8h, #3
161 vsri.u8 d6, d6, #5 158 sli v4.8h, v4.8h, #5
162 vmvn.8 d3, d3 /* invert source alpha */ 159 sri v6.8b, v6.8b, #5
163 vsri.u8 d7, d7, #6 160 mvn v3.8b, v3.8b /* invert source alpha */
164 vshrn.u16 d30, q2, #2 161 sri v7.8b, v7.8b, #6
162 shrn v30.8b, v4.8h, #2
165 /* now do alpha blending, storing results in 8-bit planar format 163 /* now do alpha blending, storing results in 8-bit planar format
166 into d16 - red, d19 - green, d18 - blue */ 164 into v20 - red, v23 - green, v22 - blue */
167 vmull.u8 q10, d3, d6 165 umull v10.8h, v3.8b, v6.8b
168 vmull.u8 q11, d3, d7 166 umull v11.8h, v3.8b, v7.8b
169 vmull.u8 q12, d3, d30 167 umull v12.8h, v3.8b, v30.8b
170 vrshr.u16 q13, q10, #8 168 urshr v17.8h, v10.8h, #8
171 vrshr.u16 q3, q11, #8 169 urshr v18.8h, v11.8h, #8
172 vrshr.u16 q15, q12, #8 170 urshr v19.8h, v12.8h, #8
173 vraddhn.u16 d20, q10, q13 171 raddhn v20.8b, v10.8h, v17.8h
174 vraddhn.u16 d23, q11, q3 172 raddhn v23.8b, v11.8h, v18.8h
175 vraddhn.u16 d22, q12, q15 173 raddhn v22.8b, v12.8h, v19.8h
176.endm 174.endm
177 175
178.macro pixman_composite_over_8888_0565_process_pixblock_tail 176.macro pixman_composite_over_8888_0565_process_pixblock_tail
179 /* ... continue alpha blending */ 177 /* ... continue alpha blending */
180 vqadd.u8 d16, d2, d20 178 uqadd v17.8b, v2.8b, v20.8b
181 vqadd.u8 q9, q0, q11 179 uqadd v18.8b, v0.8b, v22.8b
182 /* convert the result to r5g6b5 and store it into {d28, d29} */ 180 uqadd v19.8b, v1.8b, v23.8b
183 vshll.u8 q14, d16, #8 181 /* convert the result to r5g6b5 and store it into {v14} */
184 vshll.u8 q8, d19, #8 182 ushll v14.8h, v17.8b, #7
185 vshll.u8 q9, d18, #8 183 sli v14.8h, v14.8h, #1
186 vsri.u16 q14, q8, #5 184 ushll v8.8h, v19.8b, #7
187 vsri.u16 q14, q9, #11 185 sli v8.8h, v8.8h, #1
186 ushll v9.8h, v18.8b, #7
187 sli v9.8h, v9.8h, #1
188 sri v14.8h, v8.8h, #5
189 sri v14.8h, v9.8h, #11
190 mov v28.d[0], v14.d[0]
191 mov v29.d[0], v14.d[1]
188.endm 192.endm
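The shrn/sri pairs above expand the 5- and 6-bit r5g6b5 channels to 8 bits by replicating their top bits, and ushll #7 followed by sli #1 stands in for the ARMv7 vshll.u8 #8 widening shift (AArch64 ushll cannot encode a shift equal to the element width). A scalar sketch of the same conversions in C, illustrative only:

    #include <stdint.h>

    /* Unpack r5g6b5 to 8-bit channels, replicating the top bits into
     * the low bits (the sri #5 / sri #6 steps above). */
    static void unpack_0565(uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
    {
        uint8_t r5 = (p >> 11) & 0x1f, g6 = (p >> 5) & 0x3f, b5 = p & 0x1f;
        *r = (uint8_t)((r5 << 3) | (r5 >> 2));
        *g = (uint8_t)((g6 << 2) | (g6 >> 4));
        *b = (uint8_t)((b5 << 3) | (b5 >> 2));
    }

    /* Pack 8-bit channels back to r5g6b5 (the sri #5 / sri #11 steps). */
    static uint16_t pack_0565(uint8_t r, uint8_t g, uint8_t b)
    {
        return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }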
189 193
190/* 194/*
191 * OK, now we got almost everything that we need. Using the above two 195 * OK, now we got almost everything that we need. Using the above two
192 * macros, the work can be done right. But now we want to optimize 196 * macros, the work can be done right. But now we want to optimize
@@ -209,13 +213,13 @@
209 * So what we need now is a '*_tail_head' macro, which will be used 213 * So what we need now is a '*_tail_head' macro, which will be used
210 * in the core main loop. A trivial straightforward implementation 214 * in the core main loop. A trivial straightforward implementation
211 * of this macro would look like this: 215 * of this macro would look like this:
212 * 216 *
213 * pixman_composite_over_8888_0565_process_pixblock_tail 217 * pixman_composite_over_8888_0565_process_pixblock_tail
214 * vst1.16 {d28, d29}, [DST_W, :128]! 218 * st1 {v28.4h, v29.4h}, [DST_W], #32
215 * vld1.16 {d4, d5}, [DST_R, :128]! 219 * ld1 {v4.4h, v5.4h}, [DST_R], #16
216 * vld4.32 {d0, d1, d2, d3}, [SRC]! 220 * ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32
217 * pixman_composite_over_8888_0565_process_pixblock_head 221 * pixman_composite_over_8888_0565_process_pixblock_head
218 * cache_preload 8, 8 222 * cache_preload 8, 8
219 * 223 *
220 * Now it also got some VLD/VST instructions. We simply can't move from 224 * Now it also got some VLD/VST instructions. We simply can't move from
221 * processing one block of pixels to the other one with just arithmetics. 225 * processing one block of pixels to the other one with just arithmetics.
@@ -242,64 +246,89 @@
242 * 246 *
243 * Now after all the explanations, here is the optimized code. 247 * Now after all the explanations, here is the optimized code.
244 * Different instruction streams (originating from '*_head', '*_tail' 248 * Different instruction streams (originating from '*_head', '*_tail'
245 * and 'cache_preload' macro) use different indentation levels for 249 * and 'cache_preload' macro) use different indentation levels for
246 * better readability. Actually taking the code from one of these 250 * better readability. Actually taking the code from one of these
247 * indentation levels and ignoring a few VLD/VST instructions would 251 * indentation levels and ignoring a few LD/ST instructions would
248 * result in exactly the code from '*_head', '*_tail' or 'cache_preload' 252 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
249 * macro! 253 * macro!
250 */ 254 */
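Schematically, the generated main loop therefore looks like the following C-style sketch; the function names are stand-ins for the macro bodies, not symbols from the template:

    /* Hypothetical stand-ins for the head/tail/store/load steps. */
    static void load_block(int i)  { (void)i; }
    static void head(int i)        { (void)i; }
    static void tail(int i)        { (void)i; }
    static void store_block(int i) { (void)i; }

    /* Software-pipelined main loop built from head / tail_head / tail. */
    static void process_scanline(int nblocks)
    {
        if (nblocks <= 0)
            return;
        load_block(0);
        head(0);                       /* start arithmetic for block 0    */
        for (int i = 1; i < nblocks; i++) {
            tail(i - 1);               /* ...the tail_head macro:         */
            store_block(i - 1);        /* finish and store the previous   */
            load_block(i);             /* block while loading and starting*/
            head(i);                   /* the next, overlapping memory ops*/
        }                              /* with arithmetic                 */
        tail(nblocks - 1);
        store_block(nblocks - 1);
    }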
251 255
252#if 1 256#if 1
253 257
254.macro pixman_composite_over_8888_0565_process_pixblock_tail_head 258.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
255 vqadd.u8 d16, d2, d20 259 uqadd v17.8b, v2.8b, v20.8b
256 vld1.16 {d4, d5}, [DST_R, :128]! 260 ld1 {v4.4h, v5.4h}, [DST_R], #16
257 vqadd.u8 q9, q0, q11 261 mov v4.d[1], v5.d[0]
258 vshrn.u16 d6, q2, #8 262 uqadd v18.8b, v0.8b, v22.8b
263 uqadd v19.8b, v1.8b, v23.8b
264 shrn v6.8b, v4.8h, #8
259 fetch_src_pixblock 265 fetch_src_pixblock
260 vshrn.u16 d7, q2, #3 266 shrn v7.8b, v4.8h, #3
261 vsli.u16 q2, q2, #5 267 sli v4.8h, v4.8h, #5
262 vshll.u8 q14, d16, #8 268 ushll v14.8h, v17.8b, #7
269 sli v14.8h, v14.8h, #1
263 PF add PF_X, PF_X, #8 270 PF add PF_X, PF_X, #8
264 vshll.u8 q8, d19, #8 271 ushll v8.8h, v19.8b, #7
272 sli v8.8h, v8.8h, #1
265 PF tst PF_CTL, #0xF 273 PF tst PF_CTL, #0xF
266 vsri.u8 d6, d6, #5 274 sri v6.8b, v6.8b, #5
267 PF addne PF_X, PF_X, #8 275 PF beq 10f
268 vmvn.8 d3, d3 276 PF add PF_X, PF_X, #8
269 PF subne PF_CTL, PF_CTL, #1 27710:
270 vsri.u8 d7, d7, #6 278 mvn v3.8b, v3.8b
271 vshrn.u16 d30, q2, #2 279 PF beq 10f
272 vmull.u8 q10, d3, d6 280 PF sub PF_CTL, PF_CTL, #1
273 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 28110:
274 vmull.u8 q11, d3, d7 282 sri v7.8b, v7.8b, #6
275 vmull.u8 q12, d3, d30 283 shrn v30.8b, v4.8h, #2
276 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 284 umull v10.8h, v3.8b, v6.8b
277 vsri.u16 q14, q8, #5 285 PF lsl DUMMY, PF_X, #src_bpp_shift
286 PF prfm pldl2strm, [PF_SRC, DUMMY]
287 umull v11.8h, v3.8b, v7.8b
288 umull v12.8h, v3.8b, v30.8b
289 PF lsl DUMMY, PF_X, #dst_bpp_shift
290 PF prfm pldl2strm, [PF_DST, DUMMY]
291 sri v14.8h, v8.8h, #5
278 PF cmp PF_X, ORIG_W 292 PF cmp PF_X, ORIG_W
279 vshll.u8 q9, d18, #8 293 ushll v9.8h, v18.8b, #7
280 vrshr.u16 q13, q10, #8 294 sli v9.8h, v9.8h, #1
281 PF subge PF_X, PF_X, ORIG_W 295 urshr v17.8h, v10.8h, #8
282 vrshr.u16 q3, q11, #8 296 PF ble 10f
283 vrshr.u16 q15, q12, #8 297 PF sub PF_X, PF_X, ORIG_W
284 PF subges PF_CTL, PF_CTL, #0x10 29810:
285 vsri.u16 q14, q9, #11 299 urshr v19.8h, v11.8h, #8
286 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 300 urshr v18.8h, v12.8h, #8
287 vraddhn.u16 d20, q10, q13 301 PF ble 10f
288 vraddhn.u16 d23, q11, q3 302 PF subs PF_CTL, PF_CTL, #0x10
289 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 30310:
290 vraddhn.u16 d22, q12, q15 304 sri v14.8h, v9.8h, #11
291 vst1.16 {d28, d29}, [DST_W, :128]! 305 mov v28.d[0], v14.d[0]
306 mov v29.d[0], v14.d[1]
307 PF ble 10f
308 PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
309 PF ldrsb DUMMY, [PF_SRC, DUMMY]
310 PF add PF_SRC, PF_SRC, #1
31110:
312 raddhn v20.8b, v10.8h, v17.8h
313 raddhn v23.8b, v11.8h, v19.8h
314 PF ble 10f
315 PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
316 PF ldrsb DUMMY, [PF_DST, DUMMY]
317 PF add PF_DST, PF_DST, #1
31810:
319 raddhn v22.8b, v12.8h, v18.8h
320 st1 {v14.8h}, [DST_W], #16
292.endm 321.endm
293 322
294#else 323#else
295 324
296/* If we did not care much about the performance, we would just use this... */ 325/* If we did not care much about the performance, we would just use this... */
297.macro pixman_composite_over_8888_0565_process_pixblock_tail_head 326.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
298 pixman_composite_over_8888_0565_process_pixblock_tail 327 pixman_composite_over_8888_0565_process_pixblock_tail
299 vst1.16 {d28, d29}, [DST_W, :128]! 328 st1 {v14.8h}, [DST_W], #16
300 vld1.16 {d4, d5}, [DST_R, :128]! 329 ld1 {v4.4h, v5.4h}, [DST_R], #16
301 fetch_src_pixblock 330 fetch_src_pixblock
302 pixman_composite_over_8888_0565_process_pixblock_head 331 pixman_composite_over_8888_0565_process_pixblock_head
303 cache_preload 8, 8 332 cache_preload 8, 8
304.endm 333.endm
305 334
@@ -350,60 +379,66 @@ generate_composite_function \
350 24 /* mask_basereg */ 379 24 /* mask_basereg */
351 380
352/******************************************************************************/ 381/******************************************************************************/
353 382
354.macro pixman_composite_over_n_0565_process_pixblock_head 383.macro pixman_composite_over_n_0565_process_pixblock_head
355 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format 384 /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
356 and put data into d6 - red, d7 - green, d30 - blue */ 385 and put data into v6 - red, v7 - green, v30 - blue */
357 vshrn.u16 d6, q2, #8 386 mov v4.d[1], v5.d[0]
358 vshrn.u16 d7, q2, #3 387 shrn v6.8b, v4.8h, #8
359 vsli.u16 q2, q2, #5 388 shrn v7.8b, v4.8h, #3
360 vsri.u8 d6, d6, #5 389 sli v4.8h, v4.8h, #5
361 vsri.u8 d7, d7, #6 390 sri v6.8b, v6.8b, #5
362 vshrn.u16 d30, q2, #2 391 sri v7.8b, v7.8b, #6
392 shrn v30.8b, v4.8h, #2
363 /* now do alpha blending, storing results in 8-bit planar format 393 /* now do alpha blending, storing results in 8-bit planar format
364 into d16 - red, d19 - green, d18 - blue */ 394 into v20 - red, v23 - green, v22 - blue */
365 vmull.u8 q10, d3, d6 395 umull v10.8h, v3.8b, v6.8b
366 vmull.u8 q11, d3, d7 396 umull v11.8h, v3.8b, v7.8b
367 vmull.u8 q12, d3, d30 397 umull v12.8h, v3.8b, v30.8b
368 vrshr.u16 q13, q10, #8 398 urshr v13.8h, v10.8h, #8
369 vrshr.u16 q3, q11, #8 399 urshr v14.8h, v11.8h, #8
370 vrshr.u16 q15, q12, #8 400 urshr v15.8h, v12.8h, #8
371 vraddhn.u16 d20, q10, q13 401 raddhn v20.8b, v10.8h, v13.8h
372 vraddhn.u16 d23, q11, q3 402 raddhn v23.8b, v11.8h, v14.8h
373 vraddhn.u16 d22, q12, q15 403 raddhn v22.8b, v12.8h, v15.8h
374.endm 404.endm
375 405
376.macro pixman_composite_over_n_0565_process_pixblock_tail 406.macro pixman_composite_over_n_0565_process_pixblock_tail
377 /* ... continue alpha blending */ 407 /* ... continue alpha blending */
378 vqadd.u8 d16, d2, d20 408 uqadd v17.8b, v2.8b, v20.8b
379 vqadd.u8 q9, q0, q11 409 uqadd v18.8b, v0.8b, v22.8b
380 /* convert the result to r5g6b5 and store it into {d28, d29} */ 410 uqadd v19.8b, v1.8b, v23.8b
381 vshll.u8 q14, d16, #8 411 /* convert the result to r5g6b5 and store it into {v14} */
382 vshll.u8 q8, d19, #8 412 ushll v14.8h, v17.8b, #7
383 vshll.u8 q9, d18, #8 413 sli v14.8h, v14.8h, #1
384 vsri.u16 q14, q8, #5 414 ushll v8.8h, v19.8b, #7
385 vsri.u16 q14, q9, #11 415 sli v8.8h, v8.8h, #1
416 ushll v9.8h, v18.8b, #7
417 sli v9.8h, v9.8h, #1
418 sri v14.8h, v8.8h, #5
419 sri v14.8h, v9.8h, #11
420 mov v28.d[0], v14.d[0]
421 mov v29.d[0], v14.d[1]
386.endm 422.endm
387 423
388/* TODO: expand macros and do better instructions scheduling */ 424/* TODO: expand macros and do better instructions scheduling */
389.macro pixman_composite_over_n_0565_process_pixblock_tail_head 425.macro pixman_composite_over_n_0565_process_pixblock_tail_head
390 pixman_composite_over_n_0565_process_pixblock_tail 426 pixman_composite_over_n_0565_process_pixblock_tail
391 vld1.16 {d4, d5}, [DST_R, :128]! 427 ld1 {v4.4h, v5.4h}, [DST_R], #16
392 vst1.16 {d28, d29}, [DST_W, :128]! 428 st1 {v14.8h}, [DST_W], #16
393 pixman_composite_over_n_0565_process_pixblock_head 429 pixman_composite_over_n_0565_process_pixblock_head
394 cache_preload 8, 8 430 cache_preload 8, 8
395.endm 431.endm
396 432
397.macro pixman_composite_over_n_0565_init 433.macro pixman_composite_over_n_0565_init
398 add DUMMY, sp, #ARGS_STACK_OFFSET 434 mov v3.s[0], w4
399 vld1.32 {d3[0]}, [DUMMY] 435 dup v0.8b, v3.b[0]
400 vdup.8 d0, d3[0] 436 dup v1.8b, v3.b[1]
401 vdup.8 d1, d3[1] 437 dup v2.8b, v3.b[2]
402 vdup.8 d2, d3[2] 438 dup v3.8b, v3.b[3]
403 vdup.8 d3, d3[3] 439 mvn v3.8b, v3.8b /* invert source alpha */
404 vmvn.8 d3, d3 /* invert source alpha */
405.endm 440.endm
406 441
407generate_composite_function \ 442generate_composite_function \
408 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ 443 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
409 FLAG_DST_READWRITE, \ 444 FLAG_DST_READWRITE, \
@@ -420,37 +455,56 @@ generate_composite_function \
420 24 /* mask_basereg */ 455 24 /* mask_basereg */
421 456
422/******************************************************************************/ 457/******************************************************************************/
423 458
424.macro pixman_composite_src_8888_0565_process_pixblock_head 459.macro pixman_composite_src_8888_0565_process_pixblock_head
425 vshll.u8 q8, d1, #8 460 ushll v8.8h, v1.8b, #7
426 vshll.u8 q14, d2, #8 461 sli v8.8h, v8.8h, #1
427 vshll.u8 q9, d0, #8 462 ushll v14.8h, v2.8b, #7
463 sli v14.8h, v14.8h, #1
464 ushll v9.8h, v0.8b, #7
465 sli v9.8h, v9.8h, #1
428.endm 466.endm
429 467
430.macro pixman_composite_src_8888_0565_process_pixblock_tail 468.macro pixman_composite_src_8888_0565_process_pixblock_tail
431 vsri.u16 q14, q8, #5 469 sri v14.8h, v8.8h, #5
432 vsri.u16 q14, q9, #11 470 sri v14.8h, v9.8h, #11
471 mov v28.d[0], v14.d[0]
472 mov v29.d[0], v14.d[1]
433.endm 473.endm
434 474
435.macro pixman_composite_src_8888_0565_process_pixblock_tail_head 475.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
436 vsri.u16 q14, q8, #5 476 sri v14.8h, v8.8h, #5
437 PF add PF_X, PF_X, #8 477 PF add PF_X, PF_X, #8
438 PF tst PF_CTL, #0xF 478 PF tst PF_CTL, #0xF
439 fetch_src_pixblock 479 fetch_src_pixblock
440 PF addne PF_X, PF_X, #8 480 PF beq 10f
441 PF subne PF_CTL, PF_CTL, #1 481 PF add PF_X, PF_X, #8
442 vsri.u16 q14, q9, #11 482 PF sub PF_CTL, PF_CTL, #1
48310:
484 sri v14.8h, v9.8h, #11
485 mov v28.d[0], v14.d[0]
486 mov v29.d[0], v14.d[1]
443 PF cmp PF_X, ORIG_W 487 PF cmp PF_X, ORIG_W
444 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 488 PF lsl DUMMY, PF_X, #src_bpp_shift
445 vshll.u8 q8, d1, #8 489 PF prfm pldl2strm, [PF_SRC, DUMMY]
446 vst1.16 {d28, d29}, [DST_W, :128]! 490 ushll v8.8h, v1.8b, #7
447 PF subge PF_X, PF_X, ORIG_W 491 sli v8.8h, v8.8h, #1
448 PF subges PF_CTL, PF_CTL, #0x10 492 st1 {v14.8h}, [DST_W], #16
449 vshll.u8 q14, d2, #8 493 PF ble 10f
450 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 494 PF sub PF_X, PF_X, ORIG_W
451 vshll.u8 q9, d0, #8 495 PF subs PF_CTL, PF_CTL, #0x10
49610:
497 ushll v14.8h, v2.8b, #7
498 sli v14.8h, v14.8h, #1
499 PF ble 10f
500 PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
501 PF ldrsb DUMMY, [PF_SRC, DUMMY]
502 PF add PF_SRC, PF_SRC, #1
50310:
504 ushll v9.8h, v0.8b, #7
505 sli v9.8h, v9.8h, #1
452.endm 506.endm
453 507
454generate_composite_function \ 508generate_composite_function \
455 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ 509 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
456 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 510 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
@@ -463,26 +517,27 @@ generate_composite_function \
463 pixman_composite_src_8888_0565_process_pixblock_tail_head 517 pixman_composite_src_8888_0565_process_pixblock_tail_head
464 518
465/******************************************************************************/ 519/******************************************************************************/
466 520
467.macro pixman_composite_src_0565_8888_process_pixblock_head 521.macro pixman_composite_src_0565_8888_process_pixblock_head
468 vshrn.u16 d30, q0, #8 522 mov v0.d[1], v1.d[0]
469 vshrn.u16 d29, q0, #3 523 shrn v30.8b, v0.8h, #8
470 vsli.u16 q0, q0, #5 524 shrn v29.8b, v0.8h, #3
471 vmov.u8 d31, #255 525 sli v0.8h, v0.8h, #5
472 vsri.u8 d30, d30, #5 526 movi v31.8b, #255
473 vsri.u8 d29, d29, #6 527 sri v30.8b, v30.8b, #5
474 vshrn.u16 d28, q0, #2 528 sri v29.8b, v29.8b, #6
529 shrn v28.8b, v0.8h, #2
475.endm 530.endm
476 531
477.macro pixman_composite_src_0565_8888_process_pixblock_tail 532.macro pixman_composite_src_0565_8888_process_pixblock_tail
478.endm 533.endm
479 534
480/* TODO: expand macros and do better instructions scheduling */ 535/* TODO: expand macros and do better instructions scheduling */
481.macro pixman_composite_src_0565_8888_process_pixblock_tail_head 536.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
482 pixman_composite_src_0565_8888_process_pixblock_tail 537 pixman_composite_src_0565_8888_process_pixblock_tail
483 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 538 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
484 fetch_src_pixblock 539 fetch_src_pixblock
485 pixman_composite_src_0565_8888_process_pixblock_head 540 pixman_composite_src_0565_8888_process_pixblock_head
486 cache_preload 8, 8 541 cache_preload 8, 8
487.endm 542.endm
488 543
@@ -498,34 +553,50 @@ generate_composite_function \
498 pixman_composite_src_0565_8888_process_pixblock_tail_head 553 pixman_composite_src_0565_8888_process_pixblock_tail_head
499 554
500/******************************************************************************/ 555/******************************************************************************/
501 556
502.macro pixman_composite_add_8_8_process_pixblock_head 557.macro pixman_composite_add_8_8_process_pixblock_head
503 vqadd.u8 q14, q0, q2 558 uqadd v28.8b, v0.8b, v4.8b
504 vqadd.u8 q15, q1, q3 559 uqadd v29.8b, v1.8b, v5.8b
560 uqadd v30.8b, v2.8b, v6.8b
561 uqadd v31.8b, v3.8b, v7.8b
505.endm 562.endm
506 563
507.macro pixman_composite_add_8_8_process_pixblock_tail 564.macro pixman_composite_add_8_8_process_pixblock_tail
508.endm 565.endm
509 566
510.macro pixman_composite_add_8_8_process_pixblock_tail_head 567.macro pixman_composite_add_8_8_process_pixblock_tail_head
511 fetch_src_pixblock 568 fetch_src_pixblock
512 PF add PF_X, PF_X, #32 569 PF add PF_X, PF_X, #32
513 PF tst PF_CTL, #0xF 570 PF tst PF_CTL, #0xF
514 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 571 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
515 PF addne PF_X, PF_X, #32 572 PF beq 10f
516 PF subne PF_CTL, PF_CTL, #1 573 PF add PF_X, PF_X, #32
517 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 574 PF sub PF_CTL, PF_CTL, #1
57510:
576 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
518 PF cmp PF_X, ORIG_W 577 PF cmp PF_X, ORIG_W
519 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 578 PF lsl DUMMY, PF_X, #src_bpp_shift
520 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 579 PF prfm pldl2strm, [PF_SRC, DUMMY]
521 PF subge PF_X, PF_X, ORIG_W 580 PF lsl DUMMY, PF_X, #dst_bpp_shift
522 PF subges PF_CTL, PF_CTL, #0x10 581 PF prfm pldl2strm, [PF_DST, DUMMY]
523 vqadd.u8 q14, q0, q2 582 PF ble 10f
524 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 583 PF sub PF_X, PF_X, ORIG_W
525 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 584 PF subs PF_CTL, PF_CTL, #0x10
526 vqadd.u8 q15, q1, q3 58510:
586 uqadd v28.8b, v0.8b, v4.8b
587 PF ble 10f
588 PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
589 PF ldrsb DUMMY, [PF_SRC, DUMMY]
590 PF add PF_SRC, PF_SRC, #1
591 PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
592 PF ldrsb DUMMY, [PF_DST, DUMMY]
593 PF add PF_DST, PF_DST, #1
59410:
595 uqadd v29.8b, v1.8b, v5.8b
596 uqadd v30.8b, v2.8b, v6.8b
597 uqadd v31.8b, v3.8b, v7.8b
527.endm 598.endm
528 599
529generate_composite_function \ 600generate_composite_function \
530 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ 601 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
531 FLAG_DST_READWRITE, \ 602 FLAG_DST_READWRITE, \
@@ -541,23 +612,37 @@ generate_composite_function \
541 612
542.macro pixman_composite_add_8888_8888_process_pixblock_tail_head 613.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
543 fetch_src_pixblock 614 fetch_src_pixblock
544 PF add PF_X, PF_X, #8 615 PF add PF_X, PF_X, #8
545 PF tst PF_CTL, #0xF 616 PF tst PF_CTL, #0xF
546 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! 617 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
547 PF addne PF_X, PF_X, #8 618 PF beq 10f
548 PF subne PF_CTL, PF_CTL, #1 619 PF add PF_X, PF_X, #8
549 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! 620 PF sub PF_CTL, PF_CTL, #1
62110:
622 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
550 PF cmp PF_X, ORIG_W 623 PF cmp PF_X, ORIG_W
551 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 624 PF lsl DUMMY, PF_X, #src_bpp_shift
552 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 625 PF prfm pldl2strm, [PF_SRC, DUMMY]
553 PF subge PF_X, PF_X, ORIG_W 626 PF lsl DUMMY, PF_X, #dst_bpp_shift
554 PF subges PF_CTL, PF_CTL, #0x10 627 PF prfm pldl2strm, [PF_DST, DUMMY]
555 vqadd.u8 q14, q0, q2 628 PF ble 10f
556 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 629 PF sub PF_X, PF_X, ORIG_W
557 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 630 PF subs PF_CTL, PF_CTL, #0x10
558 vqadd.u8 q15, q1, q3 63110:
632 uqadd v28.8b, v0.8b, v4.8b
633 PF ble 10f
634 PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
635 PF ldrsb DUMMY, [PF_SRC, DUMMY]
636 PF add PF_SRC, PF_SRC, #1
637 PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
638 PF ldrsb DUMMY, [PF_DST, DUMMY]
639 PF add PF_DST, PF_DST, #1
64010:
641 uqadd v29.8b, v1.8b, v5.8b
642 uqadd v30.8b, v2.8b, v6.8b
643 uqadd v31.8b, v3.8b, v7.8b
559.endm 644.endm
560 645
561generate_composite_function \ 646generate_composite_function \
562 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ 647 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
563 FLAG_DST_READWRITE, \ 648 FLAG_DST_READWRITE, \
@@ -580,57 +665,73 @@ generate_composite_function_single_scanline \
580 pixman_composite_add_8888_8888_process_pixblock_tail_head 665 pixman_composite_add_8888_8888_process_pixblock_tail_head
581 666
582/******************************************************************************/ 667/******************************************************************************/
583 668
584.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head 669.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
585 vmvn.8 d24, d3 /* get inverted alpha */ 670 mvn v24.8b, v3.8b /* get inverted alpha */
586 /* do alpha blending */ 671 /* do alpha blending */
587 vmull.u8 q8, d24, d4 672 umull v8.8h, v24.8b, v4.8b
588 vmull.u8 q9, d24, d5 673 umull v9.8h, v24.8b, v5.8b
589 vmull.u8 q10, d24, d6 674 umull v10.8h, v24.8b, v6.8b
590 vmull.u8 q11, d24, d7 675 umull v11.8h, v24.8b, v7.8b
591.endm 676.endm
592 677
593.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail 678.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
594 vrshr.u16 q14, q8, #8 679 urshr v14.8h, v8.8h, #8
595 vrshr.u16 q15, q9, #8 680 urshr v15.8h, v9.8h, #8
596 vrshr.u16 q12, q10, #8 681 urshr v16.8h, v10.8h, #8
597 vrshr.u16 q13, q11, #8 682 urshr v17.8h, v11.8h, #8
598 vraddhn.u16 d28, q14, q8 683 raddhn v28.8b, v14.8h, v8.8h
599 vraddhn.u16 d29, q15, q9 684 raddhn v29.8b, v15.8h, v9.8h
600 vraddhn.u16 d30, q12, q10 685 raddhn v30.8b, v16.8h, v10.8h
601 vraddhn.u16 d31, q13, q11 686 raddhn v31.8b, v17.8h, v11.8h
602.endm 687.endm
603 688
604.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head 689.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
605 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 690 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
606 vrshr.u16 q14, q8, #8 691 urshr v14.8h, v8.8h, #8
607 PF add PF_X, PF_X, #8 692 PF add PF_X, PF_X, #8
608 PF tst PF_CTL, #0xF 693 PF tst PF_CTL, #0xF
609 vrshr.u16 q15, q9, #8 694 urshr v15.8h, v9.8h, #8
610 vrshr.u16 q12, q10, #8 695 urshr v16.8h, v10.8h, #8
611 vrshr.u16 q13, q11, #8 696 urshr v17.8h, v11.8h, #8
612 PF addne PF_X, PF_X, #8 697 PF beq 10f
613 PF subne PF_CTL, PF_CTL, #1 698 PF add PF_X, PF_X, #8
614 vraddhn.u16 d28, q14, q8 699 PF sub PF_CTL, PF_CTL, #1
615 vraddhn.u16 d29, q15, q9 70010:
701 raddhn v28.8b, v14.8h, v8.8h
702 raddhn v29.8b, v15.8h, v9.8h
616 PF cmp PF_X, ORIG_W 703 PF cmp PF_X, ORIG_W
617 vraddhn.u16 d30, q12, q10 704 raddhn v30.8b, v16.8h, v10.8h
618 vraddhn.u16 d31, q13, q11 705 raddhn v31.8b, v17.8h, v11.8h
619 fetch_src_pixblock 706 fetch_src_pixblock
620 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 707 PF lsl DUMMY, PF_X, #src_bpp_shift
621 vmvn.8 d22, d3 708 PF prfm pldl2strm, [PF_SRC, DUMMY]
622 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 709 mvn v22.8b, v3.8b
623 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 710 PF lsl DUMMY, PF_X, #dst_bpp_shift
624 PF subge PF_X, PF_X, ORIG_W 711 PF prfm pldl2strm, [PF_DST, DUMMY]
625 vmull.u8 q8, d22, d4 712 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
626 PF subges PF_CTL, PF_CTL, #0x10 713 PF ble 10f
627 vmull.u8 q9, d22, d5 714 PF sub PF_X, PF_X, ORIG_W
628 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 71510:
629 vmull.u8 q10, d22, d6 716 umull v8.8h, v22.8b, v4.8b
630 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 717 PF ble 10f
631 vmull.u8 q11, d22, d7 718 PF subs PF_CTL, PF_CTL, #0x10
71910:
720 umull v9.8h, v22.8b, v5.8b
721 PF ble 10f
722 PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
723 PF ldrsb DUMMY, [PF_SRC, DUMMY]
724 PF add PF_SRC, PF_SRC, #1
72510:
726 umull v10.8h, v22.8b, v6.8b
727 PF ble 10f
728 PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
729 PF ldrsb DUMMY, [PF_DST, DUMMY]
730 PF add PF_DST, PF_DST, #1
73110:
732 umull v11.8h, v22.8b, v7.8b
632.endm 733.endm
633 734
634generate_composite_function_single_scanline \ 735generate_composite_function_single_scanline \
635 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ 736 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
636 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 737 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -647,44 +748,64 @@ generate_composite_function_single_scanline \
647 pixman_composite_out_reverse_8888_8888_process_pixblock_head 748 pixman_composite_out_reverse_8888_8888_process_pixblock_head
648.endm 749.endm
649 750
650.macro pixman_composite_over_8888_8888_process_pixblock_tail 751.macro pixman_composite_over_8888_8888_process_pixblock_tail
651 pixman_composite_out_reverse_8888_8888_process_pixblock_tail 752 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
652 vqadd.u8 q14, q0, q14 753 uqadd v28.8b, v0.8b, v28.8b
653 vqadd.u8 q15, q1, q15 754 uqadd v29.8b, v1.8b, v29.8b
755 uqadd v30.8b, v2.8b, v30.8b
756 uqadd v31.8b, v3.8b, v31.8b
654.endm 757.endm
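The umull / urshr #8 / raddhn / uqadd sequence used in these head/tail macros is a rounded division by 255 followed by a saturating add. A scalar reference for one channel, a sketch with illustrative helper names:

    #include <stdint.h>

    /* Rounded division by 255, matching umull + urshr #8 + raddhn:
     * (t + 128 + ((t + 128) >> 8)) >> 8 for t in [0, 255*255]. */
    static inline uint8_t div255(uint16_t t)
    {
        return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
    }

    /* OVER for one channel: dst' = src + dst * (255 - src_alpha) / 255,
     * with the final add saturating like uqadd. */
    static inline uint8_t over_channel(uint8_t s, uint8_t sa, uint8_t d)
    {
        unsigned t = (unsigned)(255 - sa) * d;     /* mvn + umull        */
        unsigned r = s + div255((uint16_t)t);      /* urshr + raddhn     */
        return (uint8_t)(r > 255 ? 255 : r);       /* uqadd saturation   */
    }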
655 758
656.macro pixman_composite_over_8888_8888_process_pixblock_tail_head 759.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
657 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 760 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
658 vrshr.u16 q14, q8, #8 761 urshr v14.8h, v8.8h, #8
659 PF add PF_X, PF_X, #8 762 PF add PF_X, PF_X, #8
660 PF tst PF_CTL, #0xF 763 PF tst PF_CTL, #0xF
661 vrshr.u16 q15, q9, #8 764 urshr v15.8h, v9.8h, #8
662 vrshr.u16 q12, q10, #8 765 urshr v16.8h, v10.8h, #8
663 vrshr.u16 q13, q11, #8 766 urshr v17.8h, v11.8h, #8
664 PF addne PF_X, PF_X, #8 767 PF beq 10f
665 PF subne PF_CTL, PF_CTL, #1 768 PF add PF_X, PF_X, #8
666 vraddhn.u16 d28, q14, q8 769 PF sub PF_CTL, PF_CTL, #1
667 vraddhn.u16 d29, q15, q9 77010:
771 raddhn v28.8b, v14.8h, v8.8h
772 raddhn v29.8b, v15.8h, v9.8h
668 PF cmp PF_X, ORIG_W 773 PF cmp PF_X, ORIG_W
669 vraddhn.u16 d30, q12, q10 774 raddhn v30.8b, v16.8h, v10.8h
670 vraddhn.u16 d31, q13, q11 775 raddhn v31.8b, v17.8h, v11.8h
671 vqadd.u8 q14, q0, q14 776 uqadd v28.8b, v0.8b, v28.8b
672 vqadd.u8 q15, q1, q15 777 uqadd v29.8b, v1.8b, v29.8b
778 uqadd v30.8b, v2.8b, v30.8b
779 uqadd v31.8b, v3.8b, v31.8b
673 fetch_src_pixblock 780 fetch_src_pixblock
674 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 781 PF lsl DUMMY, PF_X, #src_bpp_shift
675 vmvn.8 d22, d3 782 PF prfm pldl2strm, [PF_SRC, DUMMY]
676 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 783 mvn v22.8b, v3.8b
677 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 784 PF lsl DUMMY, PF_X, #dst_bpp_shift
678 PF subge PF_X, PF_X, ORIG_W 785 PF prfm pldl2strm, [PF_DST, DUMMY]
679 vmull.u8 q8, d22, d4 786 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
680 PF subges PF_CTL, PF_CTL, #0x10 787 PF ble 10f
681 vmull.u8 q9, d22, d5 788 PF sub PF_X, PF_X, ORIG_W
682 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 78910:
683 vmull.u8 q10, d22, d6 790 umull v8.8h, v22.8b, v4.8b
684 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 791 PF ble 10f
685 vmull.u8 q11, d22, d7 792 PF subs PF_CTL, PF_CTL, #0x10
79310:
794 umull v9.8h, v22.8b, v5.8b
795 PF ble 10f
796 PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
797 PF ldrsb DUMMY, [PF_SRC, DUMMY]
798 PF add PF_SRC, PF_SRC, #1
79910:
800 umull v10.8h, v22.8b, v6.8b
801 PF ble 10f
802 PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
803 PF ldrsb DUMMY, [PF_DST, DUMMY]
804 PF add PF_DST, PF_DST, #1
80510:
806 umull v11.8h, v22.8b, v7.8b
686.endm 807.endm
687 808
688generate_composite_function \ 809generate_composite_function \
689 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ 810 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
690 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 811 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -707,68 +828,80 @@ generate_composite_function_single_scanline \
707 pixman_composite_over_8888_8888_process_pixblock_tail_head 828 pixman_composite_over_8888_8888_process_pixblock_tail_head
708 829
709/******************************************************************************/ 830/******************************************************************************/
710 831
711.macro pixman_composite_over_n_8888_process_pixblock_head 832.macro pixman_composite_over_n_8888_process_pixblock_head
712 /* deinterleaved source pixels in {d0, d1, d2, d3} */ 833 /* deinterleaved source pixels in {v0, v1, v2, v3} */
713 /* inverted alpha in {d24} */ 834 /* inverted alpha in {v24} */
714 /* destination pixels in {d4, d5, d6, d7} */ 835 /* destination pixels in {v4, v5, v6, v7} */
715 vmull.u8 q8, d24, d4 836 umull v8.8h, v24.8b, v4.8b
716 vmull.u8 q9, d24, d5 837 umull v9.8h, v24.8b, v5.8b
717 vmull.u8 q10, d24, d6 838 umull v10.8h, v24.8b, v6.8b
718 vmull.u8 q11, d24, d7 839 umull v11.8h, v24.8b, v7.8b
719.endm 840.endm
720 841
721.macro pixman_composite_over_n_8888_process_pixblock_tail 842.macro pixman_composite_over_n_8888_process_pixblock_tail
722 vrshr.u16 q14, q8, #8 843 urshr v14.8h, v8.8h, #8
723 vrshr.u16 q15, q9, #8 844 urshr v15.8h, v9.8h, #8
724 vrshr.u16 q2, q10, #8 845 urshr v16.8h, v10.8h, #8
725 vrshr.u16 q3, q11, #8 846 urshr v17.8h, v11.8h, #8
726 vraddhn.u16 d28, q14, q8 847 raddhn v28.8b, v14.8h, v8.8h
727 vraddhn.u16 d29, q15, q9 848 raddhn v29.8b, v15.8h, v9.8h
728 vraddhn.u16 d30, q2, q10 849 raddhn v30.8b, v16.8h, v10.8h
729 vraddhn.u16 d31, q3, q11 850 raddhn v31.8b, v17.8h, v11.8h
730 vqadd.u8 q14, q0, q14 851 uqadd v28.8b, v0.8b, v28.8b
731 vqadd.u8 q15, q1, q15 852 uqadd v29.8b, v1.8b, v29.8b
853 uqadd v30.8b, v2.8b, v30.8b
854 uqadd v31.8b, v3.8b, v31.8b
732.endm 855.endm
733 856
734.macro pixman_composite_over_n_8888_process_pixblock_tail_head 857.macro pixman_composite_over_n_8888_process_pixblock_tail_head
735 vrshr.u16 q14, q8, #8 858 urshr v14.8h, v8.8h, #8
736 vrshr.u16 q15, q9, #8 859 urshr v15.8h, v9.8h, #8
737 vrshr.u16 q2, q10, #8 860 urshr v16.8h, v10.8h, #8
738 vrshr.u16 q3, q11, #8 861 urshr v17.8h, v11.8h, #8
739 vraddhn.u16 d28, q14, q8 862 raddhn v28.8b, v14.8h, v8.8h
740 vraddhn.u16 d29, q15, q9 863 raddhn v29.8b, v15.8h, v9.8h
741 vraddhn.u16 d30, q2, q10 864 raddhn v30.8b, v16.8h, v10.8h
742 vraddhn.u16 d31, q3, q11 865 raddhn v31.8b, v17.8h, v11.8h
743 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 866 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
744 vqadd.u8 q14, q0, q14 867 uqadd v28.8b, v0.8b, v28.8b
745 PF add PF_X, PF_X, #8 868 PF add PF_X, PF_X, #8
746 PF tst PF_CTL, #0x0F 869 PF tst PF_CTL, #0x0F
747 PF addne PF_X, PF_X, #8 870 PF beq 10f
748 PF subne PF_CTL, PF_CTL, #1 871 PF add PF_X, PF_X, #8
749 vqadd.u8 q15, q1, q15 872 PF sub PF_CTL, PF_CTL, #1
87310:
874 uqadd v29.8b, v1.8b, v29.8b
875 uqadd v30.8b, v2.8b, v30.8b
876 uqadd v31.8b, v3.8b, v31.8b
750 PF cmp PF_X, ORIG_W 877 PF cmp PF_X, ORIG_W
751 vmull.u8 q8, d24, d4 878 umull v8.8h, v24.8b, v4.8b
752 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 879 PF lsl DUMMY, PF_X, #dst_bpp_shift
753 vmull.u8 q9, d24, d5 880 PF prfm pldl2strm, [PF_DST, DUMMY]
754 PF subge PF_X, PF_X, ORIG_W 881 umull v9.8h, v24.8b, v5.8b
755 vmull.u8 q10, d24, d6 882 PF ble 10f
756 PF subges PF_CTL, PF_CTL, #0x10 883 PF sub PF_X, PF_X, ORIG_W
757 vmull.u8 q11, d24, d7 88410:
758 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 885 umull v10.8h, v24.8b, v6.8b
759 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 886 PF subs PF_CTL, PF_CTL, #0x10
887 umull v11.8h, v24.8b, v7.8b
888 PF ble 10f
889 PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
890 PF ldrsb DUMMY, [PF_DST, DUMMY]
891 PF add PF_DST, PF_DST, #1
89210:
893 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
760.endm 894.endm
761 895
762.macro pixman_composite_over_n_8888_init 896.macro pixman_composite_over_n_8888_init
763 add DUMMY, sp, #ARGS_STACK_OFFSET 897 mov v3.s[0], w4
764 vld1.32 {d3[0]}, [DUMMY] 898 dup v0.8b, v3.b[0]
765 vdup.8 d0, d3[0] 899 dup v1.8b, v3.b[1]
766 vdup.8 d1, d3[1] 900 dup v2.8b, v3.b[2]
767 vdup.8 d2, d3[2] 901 dup v3.8b, v3.b[3]
768 vdup.8 d3, d3[3] 902 mvn v24.8b, v3.8b /* get inverted alpha */
769 vmvn.8 d24, d3 /* get inverted alpha */
770.endm 903.endm
771 904
772generate_composite_function \ 905generate_composite_function \
773 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ 906 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
774 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 907 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -781,45 +914,57 @@ generate_composite_function \
781 pixman_composite_over_n_8888_process_pixblock_tail_head 914 pixman_composite_over_n_8888_process_pixblock_tail_head
782 915
783/******************************************************************************/ 916/******************************************************************************/
784 917
785.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head 918.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
786 vrshr.u16 q14, q8, #8 919 urshr v14.8h, v8.8h, #8
787 PF add PF_X, PF_X, #8 920 PF add PF_X, PF_X, #8
788 PF tst PF_CTL, #0xF 921 PF tst PF_CTL, #0xF
789 vrshr.u16 q15, q9, #8 922 urshr v15.8h, v9.8h, #8
790 vrshr.u16 q12, q10, #8 923 urshr v12.8h, v10.8h, #8
791 vrshr.u16 q13, q11, #8 924 urshr v13.8h, v11.8h, #8
792 PF addne PF_X, PF_X, #8 925 PF beq 10f
793 PF subne PF_CTL, PF_CTL, #1 926 PF add PF_X, PF_X, #8
794 vraddhn.u16 d28, q14, q8 927 PF sub PF_CTL, PF_CTL, #1
795 vraddhn.u16 d29, q15, q9 92810:
929 raddhn v28.8b, v14.8h, v8.8h
930 raddhn v29.8b, v15.8h, v9.8h
796 PF cmp PF_X, ORIG_W 931 PF cmp PF_X, ORIG_W
797 vraddhn.u16 d30, q12, q10 932 raddhn v30.8b, v12.8h, v10.8h
798 vraddhn.u16 d31, q13, q11 933 raddhn v31.8b, v13.8h, v11.8h
799 vqadd.u8 q14, q0, q14 934 uqadd v28.8b, v0.8b, v28.8b
800 vqadd.u8 q15, q1, q15 935 uqadd v29.8b, v1.8b, v29.8b
801 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! 936 uqadd v30.8b, v2.8b, v30.8b
802 vmvn.8 d22, d3 937 uqadd v31.8b, v3.8b, v31.8b
803 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 938 ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
804 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 939 mvn v22.8b, v3.8b
805 PF subge PF_X, PF_X, ORIG_W 940 PF lsl DUMMY, PF_X, #dst_bpp_shift
806 vmull.u8 q8, d22, d4 941 PF prfm pldl2strm, [PF_DST, DUMMY]
807 PF subges PF_CTL, PF_CTL, #0x10 942 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
808 vmull.u8 q9, d22, d5 943 PF blt 10f
809 vmull.u8 q10, d22, d6 944 PF sub PF_X, PF_X, ORIG_W
810 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 94510:
811 vmull.u8 q11, d22, d7 946 umull v8.8h, v22.8b, v4.8b
947 PF blt 10f
948 PF subs PF_CTL, PF_CTL, #0x10
94910:
950 umull v9.8h, v22.8b, v5.8b
951 umull v10.8h, v22.8b, v6.8b
952 PF blt 10f
953 PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
954 PF ldrsb DUMMY, [PF_DST, DUMMY]
955 PF add PF_DST, PF_DST, #1
95610:
957 umull v11.8h, v22.8b, v7.8b
812.endm 958.endm
813 959
814.macro pixman_composite_over_reverse_n_8888_init 960.macro pixman_composite_over_reverse_n_8888_init
815 add DUMMY, sp, #ARGS_STACK_OFFSET 961 mov v7.s[0], w4
816 vld1.32 {d7[0]}, [DUMMY] 962 dup v4.8b, v7.b[0]
817 vdup.8 d4, d7[0] 963 dup v5.8b, v7.b[1]
818 vdup.8 d5, d7[1] 964 dup v6.8b, v7.b[2]
819 vdup.8 d6, d7[2] 965 dup v7.8b, v7.b[3]
820 vdup.8 d7, d7[3]
821.endm 966.endm
822 967
823generate_composite_function \ 968generate_composite_function \
824 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ 969 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
825 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 970 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -836,96 +981,118 @@ generate_composite_function \
836 24 /* mask_basereg */ 981 24 /* mask_basereg */
837 982
838/******************************************************************************/ 983/******************************************************************************/
839 984
840.macro pixman_composite_over_8888_8_0565_process_pixblock_head 985.macro pixman_composite_over_8888_8_0565_process_pixblock_head
841 vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ 986 umull v0.8h, v24.8b, v8.8b /* IN for SRC pixels (part1) */
842 vmull.u8 q1, d24, d9 987 umull v1.8h, v24.8b, v9.8b
843 vmull.u8 q6, d24, d10 988 umull v2.8h, v24.8b, v10.8b
844 vmull.u8 q7, d24, d11 989 umull v3.8h, v24.8b, v11.8b
845 vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ 990 mov v4.d[1], v5.d[0]
846 vshrn.u16 d7, q2, #3 991 shrn v25.8b, v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */
847 vsli.u16 q2, q2, #5 992 shrn v26.8b, v4.8h, #3
848 vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ 993 sli v4.8h, v4.8h, #5
849 vrshr.u16 q9, q1, #8 994 urshr v17.8h, v0.8h, #8 /* IN for SRC pixels (part2) */
850 vrshr.u16 q10, q6, #8 995 urshr v18.8h, v1.8h, #8
851 vrshr.u16 q11, q7, #8 996 urshr v19.8h, v2.8h, #8
852 vraddhn.u16 d0, q0, q8 997 urshr v20.8h, v3.8h, #8
853 vraddhn.u16 d1, q1, q9 998 raddhn v0.8b, v0.8h, v17.8h
854 vraddhn.u16 d2, q6, q10 999 raddhn v1.8b, v1.8h, v18.8h
855 vraddhn.u16 d3, q7, q11 1000 raddhn v2.8b, v2.8h, v19.8h
856 vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ 1001 raddhn v3.8b, v3.8h, v20.8h
857 vsri.u8 d7, d7, #6 1002 sri v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */
858 vmvn.8 d3, d3 1003 sri v26.8b, v26.8b, #6
859 vshrn.u16 d30, q2, #2 1004 mvn v3.8b, v3.8b
860 vmull.u8 q8, d3, d6 /* now do alpha blending */ 1005 shrn v30.8b, v4.8h, #2
861 vmull.u8 q9, d3, d7 1006 umull v18.8h, v3.8b, v25.8b /* now do alpha blending */
862 vmull.u8 q10, d3, d30 1007 umull v19.8h, v3.8b, v26.8b
1008 umull v20.8h, v3.8b, v30.8b
863.endm 1009.endm
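For this operation the a8 mask is first applied to all four source channels ("IN for SRC pixels" above), and the result is then composited over the unpacked r5g6b5 destination. A per-channel scalar sketch in C, illustrative only, reusing the div255 identity from the earlier sketch:

    #include <stdint.h>

    static inline uint8_t div255_u16(uint16_t t)
    {
        return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
    }

    static inline uint8_t sat_add_u8(unsigned a, unsigned b)   /* uqadd */
    {
        unsigned s = a + b;
        return (uint8_t)(s > 255 ? 255 : s);
    }

    /* One channel of over_8888_8_0565: scale the source channel and its
     * alpha by the mask ("IN"), then OVER onto the unpacked destination. */
    static uint8_t over_masked_channel(uint8_t s, uint8_t sa, uint8_t m,
                                       uint8_t d)
    {
        uint8_t s_in  = div255_u16((uint16_t)s  * m);
        uint8_t sa_in = div255_u16((uint16_t)sa * m);
        return sat_add_u8(s_in, div255_u16((uint16_t)(255 - sa_in) * d));
    }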
864 1010
865.macro pixman_composite_over_8888_8_0565_process_pixblock_tail 1011.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
866 /* 3 cycle bubble (after vmull.u8) */ 1012 /* 3 cycle bubble (after vmull.u8) */
867 vrshr.u16 q13, q8, #8 1013 urshr v5.8h, v18.8h, #8
868 vrshr.u16 q11, q9, #8 1014 urshr v6.8h, v19.8h, #8
869 vrshr.u16 q15, q10, #8 1015 urshr v7.8h, v20.8h, #8
870 vraddhn.u16 d16, q8, q13 1016 raddhn v17.8b, v18.8h, v5.8h
871 vraddhn.u16 d27, q9, q11 1017 raddhn v19.8b, v19.8h, v6.8h
872 vraddhn.u16 d26, q10, q15 1018 raddhn v18.8b, v20.8h, v7.8h
873 vqadd.u8 d16, d2, d16 1019 uqadd v5.8b, v2.8b, v17.8b
874 /* 1 cycle bubble */ 1020 /* 1 cycle bubble */
875 vqadd.u8 q9, q0, q13 1021 uqadd v6.8b, v0.8b, v18.8b
876 vshll.u8 q14, d16, #8 /* convert to 16bpp */ 1022 uqadd v7.8b, v1.8b, v19.8b
877 vshll.u8 q8, d19, #8 1023 ushll v14.8h, v5.8b, #7 /* convert to 16bpp */
878 vshll.u8 q9, d18, #8 1024 sli v14.8h, v14.8h, #1
879 vsri.u16 q14, q8, #5 1025 ushll v18.8h, v7.8b, #7
1026 sli v18.8h, v18.8h, #1
1027 ushll v19.8h, v6.8b, #7
1028 sli v19.8h, v19.8h, #1
1029 sri v14.8h, v18.8h, #5
880 /* 1 cycle bubble */ 1030 /* 1 cycle bubble */
881 vsri.u16 q14, q9, #11 1031 sri v14.8h, v19.8h, #11
1032 mov v28.d[0], v14.d[0]
1033 mov v29.d[0], v14.d[1]
882.endm 1034.endm
883 1035
884.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head 1036.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
885 vld1.16 {d4, d5}, [DST_R, :128]! 1037#if 0
886 vshrn.u16 d6, q2, #8 1038 ld1 {v4.8h}, [DST_R], #16
1039 shrn v25.8b, v4.8h, #8
887 fetch_mask_pixblock 1040 fetch_mask_pixblock
888 vshrn.u16 d7, q2, #3 1041 shrn v26.8b, v4.8h, #3
889 fetch_src_pixblock 1042 fetch_src_pixblock
890 vmull.u8 q6, d24, d10 1043 umull v22.8h, v24.8b, v10.8b
891 vrshr.u16 q13, q8, #8 1044 urshr v13.8h, v18.8h, #8
892 vrshr.u16 q11, q9, #8 1045 urshr v11.8h, v19.8h, #8
893 vrshr.u16 q15, q10, #8 1046 urshr v15.8h, v20.8h, #8
894 vraddhn.u16 d16, q8, q13 1047 raddhn v17.8b, v18.8h, v13.8h
895 vraddhn.u16 d27, q9, q11 1048 raddhn v19.8b, v19.8h, v11.8h
896 vraddhn.u16 d26, q10, q15 1049 raddhn v18.8b, v20.8h, v15.8h
897 vqadd.u8 d16, d2, d16 1050 uqadd v17.8b, v2.8b, v17.8b
898 vmull.u8 q1, d24, d9 1051 umull v21.8h, v24.8b, v9.8b
899 vqadd.u8 q9, q0, q13 1052 uqadd v18.8b, v0.8b, v18.8b
900 vshll.u8 q14, d16, #8 1053 uqadd v19.8b, v1.8b, v19.8b
901 vmull.u8 q0, d24, d8 1054 ushll v14.8h, v17.8b, #7
902 vshll.u8 q8, d19, #8 1055 sli v14.8h, v14.8h, #1
903 vshll.u8 q9, d18, #8 1056 umull v20.8h, v24.8b, v8.8b
904 vsri.u16 q14, q8, #5 1057 ushll v18.8h, v18.8b, #7
905 vmull.u8 q7, d24, d11 1058 sli v18.8h, v18.8h, #1
906 vsri.u16 q14, q9, #11 1059 ushll v19.8h, v19.8b, #7
1060 sli v19.8h, v19.8h, #1
1061 sri v14.8h, v18.8h, #5
1062 umull v23.8h, v24.8b, v11.8b
1063 sri v14.8h, v19.8h, #11
1064 mov v28.d[0], v14.d[0]
1065 mov v29.d[0], v14.d[1]
907 1066
908 cache_preload 8, 8 1067 cache_preload 8, 8
909 1068
910 vsli.u16 q2, q2, #5 1069 sli v4.8h, v4.8h, #5
911 vrshr.u16 q8, q0, #8 1070 urshr v16.8h, v20.8h, #8
912 vrshr.u16 q9, q1, #8 1071 urshr v17.8h, v21.8h, #8
913 vrshr.u16 q10, q6, #8 1072 urshr v18.8h, v22.8h, #8
914 vrshr.u16 q11, q7, #8 1073 urshr v19.8h, v23.8h, #8
915 vraddhn.u16 d0, q0, q8 1074 raddhn v0.8b, v20.8h, v16.8h
916 vraddhn.u16 d1, q1, q9 1075 raddhn v1.8b, v21.8h, v17.8h
917 vraddhn.u16 d2, q6, q10 1076 raddhn v2.8b, v22.8h, v18.8h
918 vraddhn.u16 d3, q7, q11 1077 raddhn v3.8b, v23.8h, v19.8h
919 vsri.u8 d6, d6, #5 1078 sri v25.8b, v25.8b, #5
920 vsri.u8 d7, d7, #6 1079 sri v26.8b, v26.8b, #6
921 vmvn.8 d3, d3 1080 mvn v3.8b, v3.8b
922 vshrn.u16 d30, q2, #2 1081 shrn v30.8b, v4.8h, #2
923 vst1.16 {d28, d29}, [DST_W, :128]! 1082 st1 {v14.8h}, [DST_W], #16
924 vmull.u8 q8, d3, d6 1083 umull v18.8h, v3.8b, v25.8b
925 vmull.u8 q9, d3, d7 1084 umull v19.8h, v3.8b, v26.8b
926 vmull.u8 q10, d3, d30 1085 umull v20.8h, v3.8b, v30.8b
1086#else
1087 pixman_composite_over_8888_8_0565_process_pixblock_tail
1088 st1 {v28.4h, v29.4h}, [DST_W], #16
1089 ld1 {v4.4h, v5.4h}, [DST_R], #16
1090 fetch_mask_pixblock
1091 fetch_src_pixblock
1092 pixman_composite_over_8888_8_0565_process_pixblock_head
1093#endif
927.endm 1094.endm
928 1095
929generate_composite_function \ 1096generate_composite_function \
930 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ 1097 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
931 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1098 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -952,21 +1119,18 @@ generate_composite_function \
952 * to ABI. These registers are restored in the 'cleanup' macro. All the 1119 * to ABI. These registers are restored in the 'cleanup' macro. All the
953 * other NEON registers are caller-saved, so they can be clobbered freely 1120 * other NEON registers are caller-saved, so they can be clobbered freely
954 * without introducing any problems. 1121 * without introducing any problems.
955 */ 1122 */
956.macro pixman_composite_over_n_8_0565_init 1123.macro pixman_composite_over_n_8_0565_init
957 add DUMMY, sp, #ARGS_STACK_OFFSET 1124 mov v11.s[0], w4
958 vpush {d8-d15} 1125 dup v8.8b, v11.b[0]
959 vld1.32 {d11[0]}, [DUMMY] 1126 dup v9.8b, v11.b[1]
960 vdup.8 d8, d11[0] 1127 dup v10.8b, v11.b[2]
961 vdup.8 d9, d11[1] 1128 dup v11.8b, v11.b[3]
962 vdup.8 d10, d11[2]
963 vdup.8 d11, d11[3]
964.endm 1129.endm
965 1130
966.macro pixman_composite_over_n_8_0565_cleanup 1131.macro pixman_composite_over_n_8_0565_cleanup
967 vpop {d8-d15}
968.endm 1132.endm
969 1133
970generate_composite_function \ 1134generate_composite_function \
971 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ 1135 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
972 FLAG_DST_READWRITE, \ 1136 FLAG_DST_READWRITE, \
@@ -974,23 +1138,24 @@ generate_composite_function \
974 5, /* prefetch distance */ \ 1138 5, /* prefetch distance */ \
975 pixman_composite_over_n_8_0565_init, \ 1139 pixman_composite_over_n_8_0565_init, \
976 pixman_composite_over_n_8_0565_cleanup, \ 1140 pixman_composite_over_n_8_0565_cleanup, \
977 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 1141 pixman_composite_over_8888_8_0565_process_pixblock_head, \
978 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 1142 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
979 pixman_composite_over_8888_8_0565_process_pixblock_tail_head 1143 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
1144 28, /* dst_w_basereg */ \
1145 4, /* dst_r_basereg */ \
1146 8, /* src_basereg */ \
1147 24 /* mask_basereg */
980 1148
981/******************************************************************************/ 1149/******************************************************************************/
982 1150
983.macro pixman_composite_over_8888_n_0565_init 1151.macro pixman_composite_over_8888_n_0565_init
984 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 1152 mov v24.s[0], w6
985 vpush {d8-d15} 1153 dup v24.8b, v24.b[3]
986 vld1.32 {d24[0]}, [DUMMY]
987 vdup.8 d24, d24[3]
988.endm 1154.endm
989 1155
990.macro pixman_composite_over_8888_n_0565_cleanup 1156.macro pixman_composite_over_8888_n_0565_cleanup
991 vpop {d8-d15}
992.endm 1157.endm
993 1158
994generate_composite_function \ 1159generate_composite_function \
995 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ 1160 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
996 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1161 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -1013,11 +1178,11 @@ generate_composite_function \
1013 1178
1014.macro pixman_composite_src_0565_0565_process_pixblock_tail 1179.macro pixman_composite_src_0565_0565_process_pixblock_tail
1015.endm 1180.endm
1016 1181
1017.macro pixman_composite_src_0565_0565_process_pixblock_tail_head 1182.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
1018 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! 1183 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
1019 fetch_src_pixblock 1184 fetch_src_pixblock
1020 cache_preload 16, 16 1185 cache_preload 16, 16
1021.endm 1186.endm
1022 1187
1023generate_composite_function \ 1188generate_composite_function \
@@ -1042,21 +1207,19 @@ generate_composite_function \
1042 1207
1043.macro pixman_composite_src_n_8_process_pixblock_tail 1208.macro pixman_composite_src_n_8_process_pixblock_tail
1044.endm 1209.endm
1045 1210
1046.macro pixman_composite_src_n_8_process_pixblock_tail_head 1211.macro pixman_composite_src_n_8_process_pixblock_tail_head
1047 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! 1212 st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
1048.endm 1213.endm
1049 1214
1050.macro pixman_composite_src_n_8_init 1215.macro pixman_composite_src_n_8_init
1051 add DUMMY, sp, #ARGS_STACK_OFFSET 1216 mov v0.s[0], w4
1052 vld1.32 {d0[0]}, [DUMMY] 1217 dup v3.8b, v0.b[0]
1053 vsli.u64 d0, d0, #8 1218 dup v2.8b, v0.b[0]
1054 vsli.u64 d0, d0, #16 1219 dup v1.8b, v0.b[0]
1055 vsli.u64 d0, d0, #32 1220 dup v0.8b, v0.b[0]
1056 vorr d1, d0, d0
1057 vorr q1, q0, q0
1058.endm 1221.endm
1059 1222
1060.macro pixman_composite_src_n_8_cleanup 1223.macro pixman_composite_src_n_8_cleanup
1061.endm 1224.endm
1062 1225
@@ -1082,20 +1245,19 @@ generate_composite_function \
1082 1245
1083.macro pixman_composite_src_n_0565_process_pixblock_tail 1246.macro pixman_composite_src_n_0565_process_pixblock_tail
1084.endm 1247.endm
1085 1248
1086.macro pixman_composite_src_n_0565_process_pixblock_tail_head 1249.macro pixman_composite_src_n_0565_process_pixblock_tail_head
1087 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! 1250 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
1088.endm 1251.endm
1089 1252
1090.macro pixman_composite_src_n_0565_init 1253.macro pixman_composite_src_n_0565_init
1091 add DUMMY, sp, #ARGS_STACK_OFFSET 1254 mov v0.s[0], w4
1092 vld1.32 {d0[0]}, [DUMMY] 1255 dup v3.4h, v0.h[0]
1093 vsli.u64 d0, d0, #16 1256 dup v2.4h, v0.h[0]
1094 vsli.u64 d0, d0, #32 1257 dup v1.4h, v0.h[0]
1095 vorr d1, d0, d0 1258 dup v0.4h, v0.h[0]
1096 vorr q1, q0, q0
1097.endm 1259.endm
1098 1260
1099.macro pixman_composite_src_n_0565_cleanup 1261.macro pixman_composite_src_n_0565_cleanup
1100.endm 1262.endm
1101 1263
@@ -1121,19 +1283,19 @@ generate_composite_function \
1121 1283
1122.macro pixman_composite_src_n_8888_process_pixblock_tail 1284.macro pixman_composite_src_n_8888_process_pixblock_tail
1123.endm 1285.endm
1124 1286
1125.macro pixman_composite_src_n_8888_process_pixblock_tail_head 1287.macro pixman_composite_src_n_8888_process_pixblock_tail_head
1126 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 1288 st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
1127.endm 1289.endm
1128 1290
1129.macro pixman_composite_src_n_8888_init 1291.macro pixman_composite_src_n_8888_init
1130 add DUMMY, sp, #ARGS_STACK_OFFSET 1292 mov v0.s[0], w4
1131 vld1.32 {d0[0]}, [DUMMY] 1293 dup v3.2s, v0.s[0]
1132 vsli.u64 d0, d0, #32 1294 dup v2.2s, v0.s[0]
1133 vorr d1, d0, d0 1295 dup v1.2s, v0.s[0]
1134 vorr q1, q0, q0 1296 dup v0.2s, v0.s[0]
1135.endm 1297.endm
1136 1298
1137.macro pixman_composite_src_n_8888_cleanup 1299.macro pixman_composite_src_n_8888_cleanup
1138.endm 1300.endm
1139 1301
@@ -1159,11 +1321,11 @@ generate_composite_function \
1159 1321
1160.macro pixman_composite_src_8888_8888_process_pixblock_tail 1322.macro pixman_composite_src_8888_8888_process_pixblock_tail
1161.endm 1323.endm
1162 1324
1163.macro pixman_composite_src_8888_8888_process_pixblock_tail_head 1325.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1164 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 1326 st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
1165 fetch_src_pixblock 1327 fetch_src_pixblock
1166 cache_preload 8, 8 1328 cache_preload 8, 8
1167.endm 1329.endm
1168 1330
1169generate_composite_function \ 1331generate_composite_function \
@@ -1182,28 +1344,33 @@ generate_composite_function \
1182 0 /* mask_basereg */ 1344 0 /* mask_basereg */
1183 1345
1184/******************************************************************************/ 1346/******************************************************************************/
1185 1347
1186.macro pixman_composite_src_x888_8888_process_pixblock_head 1348.macro pixman_composite_src_x888_8888_process_pixblock_head
1187 vorr q0, q0, q2 1349 orr v0.8b, v0.8b, v4.8b
1188 vorr q1, q1, q2 1350 orr v1.8b, v1.8b, v4.8b
1351 orr v2.8b, v2.8b, v4.8b
1352 orr v3.8b, v3.8b, v4.8b
1189.endm 1353.endm
1190 1354
1191.macro pixman_composite_src_x888_8888_process_pixblock_tail 1355.macro pixman_composite_src_x888_8888_process_pixblock_tail
1192.endm 1356.endm
1193 1357
1194.macro pixman_composite_src_x888_8888_process_pixblock_tail_head 1358.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1195 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 1359 st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
1196 fetch_src_pixblock 1360 fetch_src_pixblock
1197 vorr q0, q0, q2 1361 orr v0.8b, v0.8b, v4.8b
1198 vorr q1, q1, q2 1362 orr v1.8b, v1.8b, v4.8b
1363 orr v2.8b, v2.8b, v4.8b
1364 orr v3.8b, v3.8b, v4.8b
1199 cache_preload 8, 8 1365 cache_preload 8, 8
1200.endm 1366.endm
1201 1367
1202.macro pixman_composite_src_x888_8888_init 1368.macro pixman_composite_src_x888_8888_init
1203 vmov.u8 q2, #0xFF 1369 mov w20, #0xFF
1204 vshl.u32 q2, q2, #24 1370 dup v4.8b, w20
1371 shl v4.2s, v4.2s, #24
1205.endm 1372.endm
1206 1373
1207generate_composite_function \ 1374generate_composite_function \
1208 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ 1375 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1209 FLAG_DST_WRITEONLY, \ 1376 FLAG_DST_WRITEONLY, \
@@ -1220,64 +1387,76 @@ generate_composite_function \
1220 0 /* mask_basereg */ 1387 0 /* mask_basereg */
1221 1388
1222/******************************************************************************/ 1389/******************************************************************************/
1223 1390
1224.macro pixman_composite_src_n_8_8888_process_pixblock_head 1391.macro pixman_composite_src_n_8_8888_process_pixblock_head
1225 /* expecting solid source in {d0, d1, d2, d3} */ 1392 /* expecting solid source in {v0, v1, v2, v3} */
1226 /* mask is in d24 (d25, d26, d27 are unused) */ 1393 /* mask is in v24 (v25, v26, v27 are unused) */
1227 1394
1228 /* in */ 1395 /* in */
1229 vmull.u8 q8, d24, d0 1396 umull v8.8h, v24.8b, v0.8b
1230 vmull.u8 q9, d24, d1 1397 umull v9.8h, v24.8b, v1.8b
1231 vmull.u8 q10, d24, d2 1398 umull v10.8h, v24.8b, v2.8b
1232 vmull.u8 q11, d24, d3 1399 umull v11.8h, v24.8b, v3.8b
1233 vrsra.u16 q8, q8, #8 1400 ursra v8.8h, v8.8h, #8
1234 vrsra.u16 q9, q9, #8 1401 ursra v9.8h, v9.8h, #8
1235 vrsra.u16 q10, q10, #8 1402 ursra v10.8h, v10.8h, #8
1236 vrsra.u16 q11, q11, #8 1403 ursra v11.8h, v11.8h, #8
1237.endm 1404.endm
1238 1405
1239.macro pixman_composite_src_n_8_8888_process_pixblock_tail 1406.macro pixman_composite_src_n_8_8888_process_pixblock_tail
1240 vrshrn.u16 d28, q8, #8 1407 rshrn v28.8b, v8.8h, #8
1241 vrshrn.u16 d29, q9, #8 1408 rshrn v29.8b, v9.8h, #8
1242 vrshrn.u16 d30, q10, #8 1409 rshrn v30.8b, v10.8h, #8
1243 vrshrn.u16 d31, q11, #8 1410 rshrn v31.8b, v11.8h, #8
1244.endm 1411.endm
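
The head/tail pair above is pixman's per-channel "in" operation for this fast path: multiply the solid source by the 8-bit mask and divide by 255 with rounding. umull forms the 16-bit product, ursra #8 adds the product's own rounded high byte back into it, and rshrn #8 narrows with rounding; together that is an exact rounded division by 255. A scalar C sketch of the arithmetic (names are illustrative, not from pixman):

    #include <assert.h>
    #include <stdint.h>

    /* umull + ursra #8 + rshrn #8: exact round(s * m / 255). */
    static uint8_t mul_div_255(uint8_t s, uint8_t m)
    {
        uint16_t t = (uint16_t)s * m;       /* umull          */
        t += (t + 128) >> 8;                /* ursra v, v, #8 */
        return (uint8_t)((t + 128) >> 8);   /* rshrn v, v, #8 */
    }

    int main(void)
    {
        /* Exhaustive check against the straightforward rounded division. */
        for (int s = 0; s < 256; s++)
            for (int m = 0; m < 256; m++)
                assert(mul_div_255((uint8_t)s, (uint8_t)m) == (s * m + 127) / 255);
        return 0;
    }
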
1245 1412
1246.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head 1413.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1247 fetch_mask_pixblock 1414 fetch_mask_pixblock
1248 PF add PF_X, PF_X, #8 1415 PF add PF_X, PF_X, #8
1249 vrshrn.u16 d28, q8, #8 1416 rshrn v28.8b, v8.8h, #8
1250 PF tst PF_CTL, #0x0F 1417 PF tst PF_CTL, #0x0F
1251 vrshrn.u16 d29, q9, #8 1418 rshrn v29.8b, v9.8h, #8
1252 PF addne PF_X, PF_X, #8 1419 PF beq 10f
1253 vrshrn.u16 d30, q10, #8 1420 PF add PF_X, PF_X, #8
1254 PF subne PF_CTL, PF_CTL, #1 142110:
1255 vrshrn.u16 d31, q11, #8 1422 rshrn v30.8b, v10.8h, #8
1423 PF beq 10f
1424 PF sub PF_CTL, PF_CTL, #1
142510:
1426 rshrn v31.8b, v11.8h, #8
1256 PF cmp PF_X, ORIG_W 1427 PF cmp PF_X, ORIG_W
1257 vmull.u8 q8, d24, d0 1428 umull v8.8h, v24.8b, v0.8b
1258 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 1429 PF lsl DUMMY, PF_X, #mask_bpp_shift
1259 vmull.u8 q9, d24, d1 1430 PF prfm pldl2strm, [PF_MASK, DUMMY]
1260 PF subge PF_X, PF_X, ORIG_W 1431 umull v9.8h, v24.8b, v1.8b
1261 vmull.u8 q10, d24, d2 1432 PF ble 10f
1262 PF subges PF_CTL, PF_CTL, #0x10 1433 PF sub PF_X, PF_X, ORIG_W
1263 vmull.u8 q11, d24, d3 143410:
1264 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 1435 umull v10.8h, v24.8b, v2.8b
1265 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1436 PF ble 10f
1266 vrsra.u16 q8, q8, #8 1437 PF subs PF_CTL, PF_CTL, #0x10
1267 vrsra.u16 q9, q9, #8 143810:
1268 vrsra.u16 q10, q10, #8 1439 umull v11.8h, v24.8b, v3.8b
1269 vrsra.u16 q11, q11, #8 1440 PF ble 10f
1441 PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
1442 PF ldrsb DUMMY, [PF_MASK, DUMMY]
1443 PF add PF_MASK, PF_MASK, #1
144410:
1445 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1446 ursra v8.8h, v8.8h, #8
1447 ursra v9.8h, v9.8h, #8
1448 ursra v10.8h, v10.8h, #8
1449 ursra v11.8h, v11.8h, #8
1270.endm 1450.endm
1271 1451
1272.macro pixman_composite_src_n_8_8888_init 1452.macro pixman_composite_src_n_8_8888_init
1273 add DUMMY, sp, #ARGS_STACK_OFFSET 1453 mov v3.s[0], w4
1274 vld1.32 {d3[0]}, [DUMMY] 1454 dup v0.8b, v3.b[0]
1275 vdup.8 d0, d3[0] 1455 dup v1.8b, v3.b[1]
1276 vdup.8 d1, d3[1] 1456 dup v2.8b, v3.b[2]
1277 vdup.8 d2, d3[2] 1457 dup v3.8b, v3.b[3]
1278 vdup.8 d3, d3[3]
1279.endm 1458.endm
1280 1459
1281.macro pixman_composite_src_n_8_8888_cleanup 1460.macro pixman_composite_src_n_8_8888_cleanup
1282.endm 1461.endm
1283 1462
@@ -1293,57 +1472,69 @@ generate_composite_function \
1293 pixman_composite_src_n_8_8888_process_pixblock_tail_head, \ 1472 pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
1294 1473
1295/******************************************************************************/ 1474/******************************************************************************/
1296 1475
1297.macro pixman_composite_src_n_8_8_process_pixblock_head 1476.macro pixman_composite_src_n_8_8_process_pixblock_head
1298 vmull.u8 q0, d24, d16 1477 umull v0.8h, v24.8b, v16.8b
1299 vmull.u8 q1, d25, d16 1478 umull v1.8h, v25.8b, v16.8b
1300 vmull.u8 q2, d26, d16 1479 umull v2.8h, v26.8b, v16.8b
1301 vmull.u8 q3, d27, d16 1480 umull v3.8h, v27.8b, v16.8b
1302 vrsra.u16 q0, q0, #8 1481 ursra v0.8h, v0.8h, #8
1303 vrsra.u16 q1, q1, #8 1482 ursra v1.8h, v1.8h, #8
1304 vrsra.u16 q2, q2, #8 1483 ursra v2.8h, v2.8h, #8
1305 vrsra.u16 q3, q3, #8 1484 ursra v3.8h, v3.8h, #8
1306.endm 1485.endm
1307 1486
1308.macro pixman_composite_src_n_8_8_process_pixblock_tail 1487.macro pixman_composite_src_n_8_8_process_pixblock_tail
1309 vrshrn.u16 d28, q0, #8 1488 rshrn v28.8b, v0.8h, #8
1310 vrshrn.u16 d29, q1, #8 1489 rshrn v29.8b, v1.8h, #8
1311 vrshrn.u16 d30, q2, #8 1490 rshrn v30.8b, v2.8h, #8
1312 vrshrn.u16 d31, q3, #8 1491 rshrn v31.8b, v3.8h, #8
1313.endm 1492.endm
1314 1493
1315.macro pixman_composite_src_n_8_8_process_pixblock_tail_head 1494.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1316 fetch_mask_pixblock 1495 fetch_mask_pixblock
1317 PF add PF_X, PF_X, #8 1496 PF add PF_X, PF_X, #8
1318 vrshrn.u16 d28, q0, #8 1497 rshrn v28.8b, v0.8h, #8
1319 PF tst PF_CTL, #0x0F 1498 PF tst PF_CTL, #0x0F
1320 vrshrn.u16 d29, q1, #8 1499 rshrn v29.8b, v1.8h, #8
1321 PF addne PF_X, PF_X, #8 1500 PF beq 10f
1322 vrshrn.u16 d30, q2, #8 1501 PF add PF_X, PF_X, #8
1323 PF subne PF_CTL, PF_CTL, #1 150210:
1324 vrshrn.u16 d31, q3, #8 1503 rshrn v30.8b, v2.8h, #8
1504 PF beq 10f
1505 PF sub PF_CTL, PF_CTL, #1
150610:
1507 rshrn v31.8b, v3.8h, #8
1325 PF cmp PF_X, ORIG_W 1508 PF cmp PF_X, ORIG_W
1326 vmull.u8 q0, d24, d16 1509 umull v0.8h, v24.8b, v16.8b
1327 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 1510 PF lsl DUMMY, PF_X, #mask_bpp_shift
1328 vmull.u8 q1, d25, d16 1511 PF prfm pldl2strm, [PF_MASK, DUMMY]
1329 PF subge PF_X, PF_X, ORIG_W 1512 umull v1.8h, v25.8b, v16.8b
1330 vmull.u8 q2, d26, d16 1513 PF ble 10f
1331 PF subges PF_CTL, PF_CTL, #0x10 1514 PF sub PF_X, PF_X, ORIG_W
1332 vmull.u8 q3, d27, d16 151510:
1333 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 1516 umull v2.8h, v26.8b, v16.8b
1334 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 1517 PF ble 10f
1335 vrsra.u16 q0, q0, #8 1518 PF subs PF_CTL, PF_CTL, #0x10
1336 vrsra.u16 q1, q1, #8 151910:
1337 vrsra.u16 q2, q2, #8 1520 umull v3.8h, v27.8b, v16.8b
1338 vrsra.u16 q3, q3, #8 1521 PF ble 10f
1522 PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
1523 PF ldrsb DUMMY, [PF_MASK, DUMMY]
1524 PF add PF_MASK, PF_MASK, #1
152510:
1526 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1527 ursra v0.8h, v0.8h, #8
1528 ursra v1.8h, v1.8h, #8
1529 ursra v2.8h, v2.8h, #8
1530 ursra v3.8h, v3.8h, #8
1339.endm 1531.endm
1340 1532
1341.macro pixman_composite_src_n_8_8_init 1533.macro pixman_composite_src_n_8_8_init
1342 add DUMMY, sp, #ARGS_STACK_OFFSET 1534 mov v16.s[0], w4
1343 vld1.32 {d16[0]}, [DUMMY] 1535 dup v16.8b, v16.b[3]
1344 vdup.8 d16, d16[3]
1345.endm 1536.endm
1346 1537
1347.macro pixman_composite_src_n_8_8_cleanup 1538.macro pixman_composite_src_n_8_8_cleanup
1348.endm 1539.endm
1349 1540
@@ -1359,107 +1550,126 @@ generate_composite_function \
1359 pixman_composite_src_n_8_8_process_pixblock_tail_head 1550 pixman_composite_src_n_8_8_process_pixblock_tail_head
1360 1551
1361/******************************************************************************/ 1552/******************************************************************************/
1362 1553
1363.macro pixman_composite_over_n_8_8888_process_pixblock_head 1554.macro pixman_composite_over_n_8_8888_process_pixblock_head
1364 /* expecting deinterleaved source data in {d8, d9, d10, d11} */ 1555 /* expecting deinterleaved source data in {v8, v9, v10, v11} */
1365 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ 1556 /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
1366 /* and destination data in {d4, d5, d6, d7} */ 1557 /* and destination data in {v4, v5, v6, v7} */
1367 /* mask is in d24 (d25, d26, d27 are unused) */ 1558 /* mask is in v24 (v25, v26, v27 are unused) */
1368 1559
1369 /* in */ 1560 /* in */
1370 vmull.u8 q6, d24, d8 1561 umull v12.8h, v24.8b, v8.8b
1371 vmull.u8 q7, d24, d9 1562 umull v13.8h, v24.8b, v9.8b
1372 vmull.u8 q8, d24, d10 1563 umull v14.8h, v24.8b, v10.8b
1373 vmull.u8 q9, d24, d11 1564 umull v15.8h, v24.8b, v11.8b
1374 vrshr.u16 q10, q6, #8 1565 urshr v16.8h, v12.8h, #8
1375 vrshr.u16 q11, q7, #8 1566 urshr v17.8h, v13.8h, #8
1376 vrshr.u16 q12, q8, #8 1567 urshr v18.8h, v14.8h, #8
1377 vrshr.u16 q13, q9, #8 1568 urshr v19.8h, v15.8h, #8
1378 vraddhn.u16 d0, q6, q10 1569 raddhn v0.8b, v12.8h, v16.8h
1379 vraddhn.u16 d1, q7, q11 1570 raddhn v1.8b, v13.8h, v17.8h
1380 vraddhn.u16 d2, q8, q12 1571 raddhn v2.8b, v14.8h, v18.8h
1381 vraddhn.u16 d3, q9, q13 1572 raddhn v3.8b, v15.8h, v19.8h
1382 vmvn.8 d25, d3 /* get inverted alpha */ 1573 mvn v25.8b, v3.8b /* get inverted alpha */
1383 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ 1574 /* source: v0 - blue, v1 - green, v2 - red, v3 - alpha */
1384 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ 1575 /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */
1385 /* now do alpha blending */ 1576 /* now do alpha blending */
1386 vmull.u8 q8, d25, d4 1577 umull v12.8h, v25.8b, v4.8b
1387 vmull.u8 q9, d25, d5 1578 umull v13.8h, v25.8b, v5.8b
1388 vmull.u8 q10, d25, d6 1579 umull v14.8h, v25.8b, v6.8b
1389 vmull.u8 q11, d25, d7 1580 umull v15.8h, v25.8b, v7.8b
1390.endm 1581.endm
1391 1582
1392.macro pixman_composite_over_n_8_8888_process_pixblock_tail 1583.macro pixman_composite_over_n_8_8888_process_pixblock_tail
1393 vrshr.u16 q14, q8, #8 1584 urshr v16.8h, v12.8h, #8
1394 vrshr.u16 q15, q9, #8 1585 urshr v17.8h, v13.8h, #8
1395 vrshr.u16 q6, q10, #8 1586 urshr v18.8h, v14.8h, #8
1396 vrshr.u16 q7, q11, #8 1587 urshr v19.8h, v15.8h, #8
1397 vraddhn.u16 d28, q14, q8 1588 raddhn v28.8b, v16.8h, v12.8h
1398 vraddhn.u16 d29, q15, q9 1589 raddhn v29.8b, v17.8h, v13.8h
1399 vraddhn.u16 d30, q6, q10 1590 raddhn v30.8b, v18.8h, v14.8h
1400 vraddhn.u16 d31, q7, q11 1591 raddhn v31.8b, v19.8h, v15.8h
1401 vqadd.u8 q14, q0, q14 1592 uqadd v28.8b, v0.8b, v28.8b
1402 vqadd.u8 q15, q1, q15 1593 uqadd v29.8b, v1.8b, v29.8b
1594 uqadd v30.8b, v2.8b, v30.8b
1595 uqadd v31.8b, v3.8b, v31.8b
1403.endm 1596.endm
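
The head and tail above are the OVER operator with a solid source and an 8-bit mask: the source channels and the source alpha are first scaled by the mask, the scaled alpha is inverted with mvn, the destination is multiplied by that inverted alpha (urshr #8 followed by raddhn is another spelling of the rounded division by 255 shown earlier), and the saturating uqadd combines the two halves. A per-channel C sketch, assuming premultiplied alpha as pixman uses; names are illustrative:

    #include <stdint.h>

    /* urshr #8 + raddhn: rounded t / 255, same value as the ursra/rshrn form. */
    static uint8_t div_255(uint16_t t)
    {
        return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);
    }

    /* One channel of OVER: solid premultiplied source s with alpha a,
     * 8-bit mask m, destination channel d. */
    static uint8_t over_channel(uint8_t s, uint8_t a, uint8_t m, uint8_t d)
    {
        uint8_t  s_in  = div_255((uint16_t)m * s);   /* "in": mask the source */
        uint8_t  a_in  = div_255((uint16_t)m * a);
        uint8_t  inv_a = (uint8_t)~a_in;             /* mvn                   */
        uint16_t sum   = s_in + div_255((uint16_t)inv_a * d);
        return sum > 255 ? 255 : (uint8_t)sum;       /* uqadd                 */
    }
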
1404 1597
1405.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head 1598.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1406 vrshr.u16 q14, q8, #8 1599 urshr v16.8h, v12.8h, #8
1407 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1600 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1408 vrshr.u16 q15, q9, #8 1601 urshr v17.8h, v13.8h, #8
1409 fetch_mask_pixblock 1602 fetch_mask_pixblock
1410 vrshr.u16 q6, q10, #8 1603 urshr v18.8h, v14.8h, #8
1411 PF add PF_X, PF_X, #8 1604 PF add PF_X, PF_X, #8
1412 vrshr.u16 q7, q11, #8 1605 urshr v19.8h, v15.8h, #8
1413 PF tst PF_CTL, #0x0F 1606 PF tst PF_CTL, #0x0F
1414 vraddhn.u16 d28, q14, q8 1607 raddhn v28.8b, v16.8h, v12.8h
1415 PF addne PF_X, PF_X, #8 1608 PF beq 10f
1416 vraddhn.u16 d29, q15, q9 1609 PF add PF_X, PF_X, #8
1417 PF subne PF_CTL, PF_CTL, #1 161010:
1418 vraddhn.u16 d30, q6, q10 1611 raddhn v29.8b, v17.8h, v13.8h
1612 PF beq 10f
1613 PF sub PF_CTL, PF_CTL, #1
161410:
1615 raddhn v30.8b, v18.8h, v14.8h
1419 PF cmp PF_X, ORIG_W 1616 PF cmp PF_X, ORIG_W
1420 vraddhn.u16 d31, q7, q11 1617 raddhn v31.8b, v19.8h, v15.8h
1421 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 1618 PF lsl DUMMY, PF_X, #dst_bpp_shift
1422 vmull.u8 q6, d24, d8 1619 PF prfm pldl2strm, [PF_DST, DUMMY]
1423 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 1620 umull v16.8h, v24.8b, v8.8b
1424 vmull.u8 q7, d24, d9 1621 PF lsl DUMMY, PF_X, #mask_bpp_shift
1425 PF subge PF_X, PF_X, ORIG_W 1622 PF prfm pldl2strm, [PF_MASK, DUMMY]
1426 vmull.u8 q8, d24, d10 1623 umull v17.8h, v24.8b, v9.8b
1427 PF subges PF_CTL, PF_CTL, #0x10 1624 PF ble 10f
1428 vmull.u8 q9, d24, d11 1625 PF sub PF_X, PF_X, ORIG_W
1429 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 162610:
1430 vqadd.u8 q14, q0, q14 1627 umull v18.8h, v24.8b, v10.8b
1431 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 1628 PF ble 10f
1432 vqadd.u8 q15, q1, q15 1629 PF subs PF_CTL, PF_CTL, #0x10
1433 vrshr.u16 q10, q6, #8 163010:
1434 vrshr.u16 q11, q7, #8 1631 umull v19.8h, v24.8b, v11.8b
1435 vrshr.u16 q12, q8, #8 1632 PF ble 10f
1436 vrshr.u16 q13, q9, #8 1633 PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1437 vraddhn.u16 d0, q6, q10 1634 PF ldrsb DUMMY, [PF_DST, DUMMY]
1438 vraddhn.u16 d1, q7, q11 1635 PF add PF_DST, PF_DST, #1
1439 vraddhn.u16 d2, q8, q12 163610:
1440 vraddhn.u16 d3, q9, q13 1637 uqadd v28.8b, v0.8b, v28.8b
1441 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1638 PF ble 10f
1442 vmvn.8 d25, d3 1639 PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
1443 vmull.u8 q8, d25, d4 1640 PF ldrsb DUMMY, [PF_MASK, DUMMY]
1444 vmull.u8 q9, d25, d5 1641 PF add PF_MASK, PF_MASK, #1
1445 vmull.u8 q10, d25, d6 164210:
1446 vmull.u8 q11, d25, d7 1643 uqadd v29.8b, v1.8b, v29.8b
1644 uqadd v30.8b, v2.8b, v30.8b
1645 uqadd v31.8b, v3.8b, v31.8b
1646 urshr v12.8h, v16.8h, #8
1647 urshr v13.8h, v17.8h, #8
1648 urshr v14.8h, v18.8h, #8
1649 urshr v15.8h, v19.8h, #8
1650 raddhn v0.8b, v16.8h, v12.8h
1651 raddhn v1.8b, v17.8h, v13.8h
1652 raddhn v2.8b, v18.8h, v14.8h
1653 raddhn v3.8b, v19.8h, v15.8h
1654 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1655 mvn v25.8b, v3.8b
1656 umull v12.8h, v25.8b, v4.8b
1657 umull v13.8h, v25.8b, v5.8b
1658 umull v14.8h, v25.8b, v6.8b
1659 umull v15.8h, v25.8b, v7.8b
1447.endm 1660.endm
1448 1661
1449.macro pixman_composite_over_n_8_8888_init 1662.macro pixman_composite_over_n_8_8888_init
1450 add DUMMY, sp, #ARGS_STACK_OFFSET 1663 mov v11.s[0], w4
1451 vpush {d8-d15} 1664 dup v8.8b, v11.b[0]
1452 vld1.32 {d11[0]}, [DUMMY] 1665 dup v9.8b, v11.b[1]
1453 vdup.8 d8, d11[0] 1666 dup v10.8b, v11.b[2]
1454 vdup.8 d9, d11[1] 1667 dup v11.8b, v11.b[3]
1455 vdup.8 d10, d11[2]
1456 vdup.8 d11, d11[3]
1457.endm 1668.endm
1458 1669
1459.macro pixman_composite_over_n_8_8888_cleanup 1670.macro pixman_composite_over_n_8_8888_cleanup
1460 vpop {d8-d15}
1461.endm 1671.endm
1462 1672
1463generate_composite_function \ 1673generate_composite_function \
1464 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ 1674 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1465 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1675 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -1472,62 +1682,63 @@ generate_composite_function \
1472 pixman_composite_over_n_8_8888_process_pixblock_tail_head 1682 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1473 1683
1474/******************************************************************************/ 1684/******************************************************************************/
1475 1685
1476.macro pixman_composite_over_n_8_8_process_pixblock_head 1686.macro pixman_composite_over_n_8_8_process_pixblock_head
1477 vmull.u8 q0, d24, d8 1687 umull v0.8h, v24.8b, v8.8b
1478 vmull.u8 q1, d25, d8 1688 umull v1.8h, v25.8b, v8.8b
1479 vmull.u8 q6, d26, d8 1689 umull v2.8h, v26.8b, v8.8b
1480 vmull.u8 q7, d27, d8 1690 umull v3.8h, v27.8b, v8.8b
1481 vrshr.u16 q10, q0, #8 1691 urshr v10.8h, v0.8h, #8
1482 vrshr.u16 q11, q1, #8 1692 urshr v11.8h, v1.8h, #8
1483 vrshr.u16 q12, q6, #8 1693 urshr v12.8h, v2.8h, #8
1484 vrshr.u16 q13, q7, #8 1694 urshr v13.8h, v3.8h, #8
1485 vraddhn.u16 d0, q0, q10 1695 raddhn v0.8b, v0.8h, v10.8h
1486 vraddhn.u16 d1, q1, q11 1696 raddhn v1.8b, v1.8h, v11.8h
1487 vraddhn.u16 d2, q6, q12 1697 raddhn v2.8b, v2.8h, v12.8h
1488 vraddhn.u16 d3, q7, q13 1698 raddhn v3.8b, v3.8h, v13.8h
1489 vmvn.8 q12, q0 1699 mvn v24.8b, v0.8b
1490 vmvn.8 q13, q1 1700 mvn v25.8b, v1.8b
1491 vmull.u8 q8, d24, d4 1701 mvn v26.8b, v2.8b
1492 vmull.u8 q9, d25, d5 1702 mvn v27.8b, v3.8b
1493 vmull.u8 q10, d26, d6 1703 umull v10.8h, v24.8b, v4.8b
1494 vmull.u8 q11, d27, d7 1704 umull v11.8h, v25.8b, v5.8b
1705 umull v12.8h, v26.8b, v6.8b
1706 umull v13.8h, v27.8b, v7.8b
1495.endm 1707.endm
1496 1708
1497.macro pixman_composite_over_n_8_8_process_pixblock_tail 1709.macro pixman_composite_over_n_8_8_process_pixblock_tail
1498 vrshr.u16 q14, q8, #8 1710 urshr v14.8h, v10.8h, #8
1499 vrshr.u16 q15, q9, #8 1711 urshr v15.8h, v11.8h, #8
1500 vrshr.u16 q12, q10, #8 1712 urshr v16.8h, v12.8h, #8
1501 vrshr.u16 q13, q11, #8 1713 urshr v17.8h, v13.8h, #8
1502 vraddhn.u16 d28, q14, q8 1714 raddhn v28.8b, v14.8h, v10.8h
1503 vraddhn.u16 d29, q15, q9 1715 raddhn v29.8b, v15.8h, v11.8h
1504 vraddhn.u16 d30, q12, q10 1716 raddhn v30.8b, v16.8h, v12.8h
1505 vraddhn.u16 d31, q13, q11 1717 raddhn v31.8b, v17.8h, v13.8h
1506 vqadd.u8 q14, q0, q14 1718 uqadd v28.8b, v0.8b, v28.8b
1507 vqadd.u8 q15, q1, q15 1719 uqadd v29.8b, v1.8b, v29.8b
1720 uqadd v30.8b, v2.8b, v30.8b
1721 uqadd v31.8b, v3.8b, v31.8b
1508.endm 1722.endm
1509 1723
1510/* TODO: expand macros and do better instruction scheduling */ 1724/* TODO: expand macros and do better instruction scheduling */
1511.macro pixman_composite_over_n_8_8_process_pixblock_tail_head 1725.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1512 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 1726 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1513 pixman_composite_over_n_8_8_process_pixblock_tail 1727 pixman_composite_over_n_8_8_process_pixblock_tail
1514 fetch_mask_pixblock 1728 fetch_mask_pixblock
1515 cache_preload 32, 32 1729 cache_preload 32, 32
1516 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 1730 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1517 pixman_composite_over_n_8_8_process_pixblock_head 1731 pixman_composite_over_n_8_8_process_pixblock_head
1518.endm 1732.endm
1519 1733
1520.macro pixman_composite_over_n_8_8_init 1734.macro pixman_composite_over_n_8_8_init
1521 add DUMMY, sp, #ARGS_STACK_OFFSET 1735 mov v8.s[0], w4
1522 vpush {d8-d15} 1736 dup v8.8b, v8.b[3]
1523 vld1.32 {d8[0]}, [DUMMY]
1524 vdup.8 d8, d8[3]
1525.endm 1737.endm
1526 1738
1527.macro pixman_composite_over_n_8_8_cleanup 1739.macro pixman_composite_over_n_8_8_cleanup
1528 vpop {d8-d15}
1529.endm 1740.endm
1530 1741
1531generate_composite_function \ 1742generate_composite_function \
1532 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ 1743 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1533 FLAG_DST_READWRITE, \ 1744 FLAG_DST_READWRITE, \
@@ -1543,95 +1754,97 @@ generate_composite_function \
1543 1754
1544.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head 1755.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1545 /* 1756 /*
1546 * 'combine_mask_ca' replacement 1757 * 'combine_mask_ca' replacement
1547 * 1758 *
1548 * input: solid src (n) in {d8, d9, d10, d11} 1759 * input: solid src (n) in {v8, v9, v10, v11}
1549 * dest in {d4, d5, d6, d7 } 1760 * dest in {v4, v5, v6, v7 }
1550 * mask in {d24, d25, d26, d27} 1761 * mask in {v24, v25, v26, v27}
1551 * output: updated src in {d0, d1, d2, d3 } 1762 * output: updated src in {v0, v1, v2, v3 }
1552 * updated mask in {d24, d25, d26, d3 } 1763 * updated mask in {v24, v25, v26, v3 }
1553 */ 1764 */
1554 vmull.u8 q0, d24, d8 1765 umull v0.8h, v24.8b, v8.8b
1555 vmull.u8 q1, d25, d9 1766 umull v1.8h, v25.8b, v9.8b
1556 vmull.u8 q6, d26, d10 1767 umull v2.8h, v26.8b, v10.8b
1557 vmull.u8 q7, d27, d11 1768 umull v3.8h, v27.8b, v11.8b
1558 vmull.u8 q9, d11, d25 1769 umull v12.8h, v11.8b, v25.8b
1559 vmull.u8 q12, d11, d24 1770 umull v13.8h, v11.8b, v24.8b
1560 vmull.u8 q13, d11, d26 1771 umull v14.8h, v11.8b, v26.8b
1561 vrshr.u16 q8, q0, #8 1772 urshr v15.8h, v0.8h, #8
1562 vrshr.u16 q10, q1, #8 1773 urshr v16.8h, v1.8h, #8
1563 vrshr.u16 q11, q6, #8 1774 urshr v17.8h, v2.8h, #8
1564 vraddhn.u16 d0, q0, q8 1775 raddhn v0.8b, v0.8h, v15.8h
1565 vraddhn.u16 d1, q1, q10 1776 raddhn v1.8b, v1.8h, v16.8h
1566 vraddhn.u16 d2, q6, q11 1777 raddhn v2.8b, v2.8h, v17.8h
1567 vrshr.u16 q11, q12, #8 1778 urshr v15.8h, v13.8h, #8
1568 vrshr.u16 q8, q9, #8 1779 urshr v16.8h, v12.8h, #8
1569 vrshr.u16 q6, q13, #8 1780 urshr v17.8h, v14.8h, #8
1570 vrshr.u16 q10, q7, #8 1781 urshr v18.8h, v3.8h, #8
1571 vraddhn.u16 d24, q12, q11 1782 raddhn v24.8b, v13.8h, v15.8h
1572 vraddhn.u16 d25, q9, q8 1783 raddhn v25.8b, v12.8h, v16.8h
1573 vraddhn.u16 d26, q13, q6 1784 raddhn v26.8b, v14.8h, v17.8h
1574 vraddhn.u16 d3, q7, q10 1785 raddhn v3.8b, v3.8h, v18.8h
1575 /* 1786 /*
1576 * 'combine_over_ca' replacement 1787 * 'combine_over_ca' replacement
1577 * 1788 *
1578 * output: updated dest in {d28, d29, d30, d31} 1789 * output: updated dest in {v28, v29, v30, v31}
1579 */ 1790 */
1580 vmvn.8 q12, q12 1791 mvn v24.8b, v24.8b
1581 vmvn.8 d26, d26 1792 mvn v25.8b, v25.8b
1582 vmull.u8 q8, d24, d4 1793 mvn v26.8b, v26.8b
1583 vmull.u8 q9, d25, d5 1794 mvn v27.8b, v3.8b
1584 vmvn.8 d27, d3 1795 umull v12.8h, v24.8b, v4.8b
1585 vmull.u8 q10, d26, d6 1796 umull v13.8h, v25.8b, v5.8b
1586 vmull.u8 q11, d27, d7 1797 umull v14.8h, v26.8b, v6.8b
1798 umull v15.8h, v27.8b, v7.8b
1587.endm 1799.endm
1588 1800
1589.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail 1801.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1590 /* ... continue 'combine_over_ca' replacement */ 1802 /* ... continue 'combine_over_ca' replacement */
1591 vrshr.u16 q14, q8, #8 1803 urshr v16.8h, v12.8h, #8
1592 vrshr.u16 q15, q9, #8 1804 urshr v17.8h, v13.8h, #8
1593 vrshr.u16 q6, q10, #8 1805 urshr v18.8h, v14.8h, #8
1594 vrshr.u16 q7, q11, #8 1806 urshr v19.8h, v15.8h, #8
1595 vraddhn.u16 d28, q14, q8 1807 raddhn v28.8b, v16.8h, v12.8h
1596 vraddhn.u16 d29, q15, q9 1808 raddhn v29.8b, v17.8h, v13.8h
1597 vraddhn.u16 d30, q6, q10 1809 raddhn v30.8b, v18.8h, v14.8h
1598 vraddhn.u16 d31, q7, q11 1810 raddhn v31.8b, v19.8h, v15.8h
1599 vqadd.u8 q14, q0, q14 1811 uqadd v28.8b, v0.8b, v28.8b
1600 vqadd.u8 q15, q1, q15 1812 uqadd v29.8b, v1.8b, v29.8b
1813 uqadd v30.8b, v2.8b, v30.8b
1814 uqadd v31.8b, v3.8b, v31.8b
1601.endm 1815.endm
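
The comment block above describes the component-alpha variant: 'combine_mask_ca' multiplies each source channel by the matching mask channel and turns the mask into a per-channel alpha (source alpha times mask channel), and 'combine_over_ca' then blends the destination against that per-channel alpha. A per-channel C sketch of the same two steps; the rounded /255 helper from the previous sketch is repeated so the example stands alone:

    #include <stdint.h>

    static uint8_t div_255(uint16_t t)
    {
        return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);
    }

    /* Component-alpha OVER for one color channel: s_c solid source channel,
     * a_src source alpha, m_c mask value for this channel, d_c destination. */
    static uint8_t over_ca_channel(uint8_t s_c, uint8_t a_src,
                                   uint8_t m_c, uint8_t d_c)
    {
        uint8_t  s_ca = div_255((uint16_t)m_c * s_c);        /* combine_mask_ca */
        uint8_t  a_ca = div_255((uint16_t)a_src * m_c);
        uint8_t  inv  = (uint8_t)~a_ca;
        uint16_t sum  = s_ca + div_255((uint16_t)inv * d_c); /* combine_over_ca */
        return sum > 255 ? 255 : (uint8_t)sum;
    }
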
1602 1816
1603.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head 1817.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1604 vrshr.u16 q14, q8, #8 1818 urshr v16.8h, v12.8h, #8
1605 vrshr.u16 q15, q9, #8 1819 urshr v17.8h, v13.8h, #8
1606 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1820 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1607 vrshr.u16 q6, q10, #8 1821 urshr v18.8h, v14.8h, #8
1608 vrshr.u16 q7, q11, #8 1822 urshr v19.8h, v15.8h, #8
1609 vraddhn.u16 d28, q14, q8 1823 raddhn v28.8b, v16.8h, v12.8h
1610 vraddhn.u16 d29, q15, q9 1824 raddhn v29.8b, v17.8h, v13.8h
1611 vraddhn.u16 d30, q6, q10 1825 raddhn v30.8b, v18.8h, v14.8h
1612 vraddhn.u16 d31, q7, q11 1826 raddhn v31.8b, v19.8h, v15.8h
1613 fetch_mask_pixblock 1827 fetch_mask_pixblock
1614 vqadd.u8 q14, q0, q14 1828 uqadd v28.8b, v0.8b, v28.8b
1615 vqadd.u8 q15, q1, q15 1829 uqadd v29.8b, v1.8b, v29.8b
1830 uqadd v30.8b, v2.8b, v30.8b
1831 uqadd v31.8b, v3.8b, v31.8b
1616 cache_preload 8, 8 1832 cache_preload 8, 8
1617 pixman_composite_over_n_8888_8888_ca_process_pixblock_head 1833 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1618 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1834 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1619.endm 1835.endm
1620 1836
1621.macro pixman_composite_over_n_8888_8888_ca_init 1837.macro pixman_composite_over_n_8888_8888_ca_init
1622 add DUMMY, sp, #ARGS_STACK_OFFSET 1838 mov v13.s[0], w4
1623 vpush {d8-d15} 1839 dup v8.8b, v13.b[0]
1624 vld1.32 {d11[0]}, [DUMMY] 1840 dup v9.8b, v13.b[1]
1625 vdup.8 d8, d11[0] 1841 dup v10.8b, v13.b[2]
1626 vdup.8 d9, d11[1] 1842 dup v11.8b, v13.b[3]
1627 vdup.8 d10, d11[2]
1628 vdup.8 d11, d11[3]
1629.endm 1843.endm
1630 1844
1631.macro pixman_composite_over_n_8888_8888_ca_cleanup 1845.macro pixman_composite_over_n_8888_8888_ca_cleanup
1632 vpop {d8-d15}
1633.endm 1846.endm
1634 1847
1635generate_composite_function \ 1848generate_composite_function \
1636 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ 1849 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1637 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1850 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -1647,160 +1860,174 @@ generate_composite_function \
1647 1860
1648.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head 1861.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
1649 /* 1862 /*
1650 * 'combine_mask_ca' replacement 1863 * 'combine_mask_ca' replacement
1651 * 1864 *
1652 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] 1865 * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
1653 * mask in {d24, d25, d26} [B, G, R] 1866 * mask in {v24, v25, v26} [B, G, R]
1654 * output: updated src in {d0, d1, d2 } [B, G, R] 1867 * output: updated src in {v0, v1, v2 } [B, G, R]
1655 * updated mask in {d24, d25, d26} [B, G, R] 1868 * updated mask in {v24, v25, v26} [B, G, R]
1656 */ 1869 */
1657 vmull.u8 q0, d24, d8 1870 umull v0.8h, v24.8b, v8.8b
1658 vmull.u8 q1, d25, d9 1871 umull v1.8h, v25.8b, v9.8b
1659 vmull.u8 q6, d26, d10 1872 umull v2.8h, v26.8b, v10.8b
1660 vmull.u8 q9, d11, d25 1873 umull v12.8h, v11.8b, v24.8b
1661 vmull.u8 q12, d11, d24 1874 umull v13.8h, v11.8b, v25.8b
1662 vmull.u8 q13, d11, d26 1875 umull v14.8h, v11.8b, v26.8b
1663 vrshr.u16 q8, q0, #8 1876 urshr v15.8h, v0.8h, #8
1664 vrshr.u16 q10, q1, #8 1877 urshr v16.8h, v1.8h, #8
1665 vrshr.u16 q11, q6, #8 1878 urshr v17.8h, v2.8h, #8
1666 vraddhn.u16 d0, q0, q8 1879 raddhn v0.8b, v0.8h, v15.8h
1667 vraddhn.u16 d1, q1, q10 1880 raddhn v1.8b, v1.8h, v16.8h
1668 vraddhn.u16 d2, q6, q11 1881 raddhn v2.8b, v2.8h, v17.8h
1669 vrshr.u16 q11, q12, #8 1882 urshr v19.8h, v12.8h, #8
1670 vrshr.u16 q8, q9, #8 1883 urshr v20.8h, v13.8h, #8
1671 vrshr.u16 q6, q13, #8 1884 urshr v21.8h, v14.8h, #8
1672 vraddhn.u16 d24, q12, q11 1885 raddhn v24.8b, v12.8h, v19.8h
1673 vraddhn.u16 d25, q9, q8 1886 raddhn v25.8b, v13.8h, v20.8h
1674 /* 1887 /*
1675 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format 1888 * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
1676 * and put data into d16 - blue, d17 - green, d18 - red 1889 * and put data into v16 - blue, v17 - green, v18 - red
1677 */ 1890 */
1678 vshrn.u16 d17, q2, #3 1891 mov v4.d[1], v5.d[0]
1679 vshrn.u16 d18, q2, #8 1892 shrn v17.8b, v4.8h, #3
1680 vraddhn.u16 d26, q13, q6 1893 shrn v18.8b, v4.8h, #8
1681 vsli.u16 q2, q2, #5 1894 raddhn v26.8b, v14.8h, v21.8h
1682 vsri.u8 d18, d18, #5 1895 sli v4.8h, v4.8h, #5
1683 vsri.u8 d17, d17, #6 1896 sri v18.8b, v18.8b, #5
1897 sri v17.8b, v17.8b, #6
1684 /* 1898 /*
1685 * 'combine_over_ca' replacement 1899 * 'combine_over_ca' replacement
1686 * 1900 *
1687 * output: updated dest in d16 - blue, d17 - green, d18 - red 1901 * output: updated dest in v16 - blue, v17 - green, v18 - red
1688 */ 1902 */
1689 vmvn.8 q12, q12 1903 mvn v24.8b, v24.8b
1690 vshrn.u16 d16, q2, #2 1904 mvn v25.8b, v25.8b
1691 vmvn.8 d26, d26 1905 shrn v16.8b, v4.8h, #2
1692 vmull.u8 q6, d16, d24 1906 mvn v26.8b, v26.8b
1693 vmull.u8 q7, d17, d25 1907 umull v5.8h, v16.8b, v24.8b
1694 vmull.u8 q11, d18, d26 1908 umull v6.8h, v17.8b, v25.8b
1909 umull v7.8h, v18.8b, v26.8b
1695.endm 1910.endm
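
Besides the component-alpha math, the head above has to unpack the r5g6b5 destination into planar 8-bit channels (the "convert 8 r5g6b5 pixel data" comment): shrn #3 and shrn #8 pull out green and red, sli #5 plus shrn #2 pull out blue, and the sri #6 / sri #5 steps replicate each field's top bits into its low bits so that a full-intensity field becomes 0xff. A scalar C sketch of the extraction (helper name is illustrative):

    #include <stdint.h>

    /* Unpack one r5g6b5 pixel (red 15:11, green 10:5, blue 4:0) into 8-bit
     * channels with bit replication, mirroring the shrn/sli/sri sequence. */
    static void unpack_0565(uint16_t p, uint8_t *b, uint8_t *g, uint8_t *r)
    {
        uint8_t  g8 = (uint8_t)(p >> 3);                /* shrn #3: GGGGGGBB */
        uint8_t  r8 = (uint8_t)(p >> 8);                /* shrn #8: RRRRRGGG */
        uint16_t q  = (uint16_t)(p << 5) | (p & 0x1f);  /* sli #5            */

        *b = (uint8_t)(q >> 2);                    /* shrn #2, low bits already
                                                      replicated by sli #5    */
        *g = (uint8_t)((g8 & 0xfc) | (g8 >> 6));   /* sri #6                  */
        *r = (uint8_t)((r8 & 0xf8) | (r8 >> 5));   /* sri #5                  */
    }
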
1696 1911
1697.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail 1912.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
1698 /* ... continue 'combine_over_ca' replacement */ 1913 /* ... continue 'combine_over_ca' replacement */
1699 vrshr.u16 q10, q6, #8 1914 urshr v13.8h, v5.8h, #8
1700 vrshr.u16 q14, q7, #8 1915 urshr v14.8h, v6.8h, #8
1701 vrshr.u16 q15, q11, #8 1916 urshr v15.8h, v7.8h, #8
1702 vraddhn.u16 d16, q10, q6 1917 raddhn v16.8b, v13.8h, v5.8h
1703 vraddhn.u16 d17, q14, q7 1918 raddhn v17.8b, v14.8h, v6.8h
1704 vraddhn.u16 d18, q15, q11 1919 raddhn v18.8b, v15.8h, v7.8h
1705 vqadd.u8 q8, q0, q8 1920 uqadd v16.8b, v0.8b, v16.8b
1706 vqadd.u8 d18, d2, d18 1921 uqadd v17.8b, v1.8b, v17.8b
1922 uqadd v18.8b, v2.8b, v18.8b
1707 /* 1923 /*
1708 * convert the results in d16, d17, d18 to r5g6b5 and store 1924 * convert the results in v16, v17, v18 to r5g6b5 and store
1709 * them into {d28, d29} 1925 * them into {v14}
1710 */ 1926 */
1711 vshll.u8 q14, d18, #8 1927 ushll v14.8h, v18.8b, #7
1712 vshll.u8 q10, d17, #8 1928 sli v14.8h, v14.8h, #1
1713 vshll.u8 q15, d16, #8 1929 ushll v12.8h, v17.8b, #7
1714 vsri.u16 q14, q10, #5 1930 sli v12.8h, v12.8h, #1
1715 vsri.u16 q14, q15, #11 1931 ushll v13.8h, v16.8b, #7
1932 sli v13.8h, v13.8h, #1
1933 sri v14.8h, v12.8h, #5
1934 sri v14.8h, v13.8h, #11
1935 mov v28.d[0], v14.d[0]
1936 mov v29.d[0], v14.d[1]
1716.endm 1937.endm
1717 1938
1718.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head 1939.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1719 fetch_mask_pixblock 1940 fetch_mask_pixblock
1720 vrshr.u16 q10, q6, #8 1941 urshr v13.8h, v5.8h, #8
1721 vrshr.u16 q14, q7, #8 1942 urshr v14.8h, v6.8h, #8
1722 vld1.16 {d4, d5}, [DST_R, :128]! 1943 ld1 {v4.8h}, [DST_R], #16
1723 vrshr.u16 q15, q11, #8 1944 urshr v15.8h, v7.8h, #8
1724 vraddhn.u16 d16, q10, q6 1945 raddhn v16.8b, v13.8h, v5.8h
1725 vraddhn.u16 d17, q14, q7 1946 raddhn v17.8b, v14.8h, v6.8h
1726 vraddhn.u16 d22, q15, q11 1947 raddhn v18.8b, v15.8h, v7.8h
1948 mov v5.d[0], v4.d[1]
1727 /* process_pixblock_head */ 1949 /* process_pixblock_head */
1728 /* 1950 /*
1729 * 'combine_mask_ca' replacement 1951 * 'combine_mask_ca' replacement
1730 * 1952 *
1731 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] 1953 * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
1732 * mask in {d24, d25, d26} [B, G, R] 1954 * mask in {v24, v25, v26} [B, G, R]
1733 * output: updated src in {d0, d1, d2 } [B, G, R] 1955 * output: updated src in {v0, v1, v2 } [B, G, R]
1734 * updated mask in {d24, d25, d26} [B, G, R] 1956 * updated mask in {v24, v25, v26} [B, G, R]
1735 */ 1957 */
1736 vmull.u8 q6, d26, d10 1958 uqadd v16.8b, v0.8b, v16.8b
1737 vqadd.u8 q8, q0, q8 1959 uqadd v17.8b, v1.8b, v17.8b
1738 vmull.u8 q0, d24, d8 1960 uqadd v18.8b, v2.8b, v18.8b
1739 vqadd.u8 d22, d2, d22 1961 umull v0.8h, v24.8b, v8.8b
1740 vmull.u8 q1, d25, d9 1962 umull v1.8h, v25.8b, v9.8b
1963 umull v2.8h, v26.8b, v10.8b
1741 /* 1964 /*
1742 * convert the result in d16, d17, d22 to r5g6b5 and store 1965 * convert the result in v16, v17, v18 to r5g6b5 and store
1743 * it into {d28, d29} 1966 * it into {v14}
1744 */ 1967 */
1745 vshll.u8 q14, d22, #8 1968 ushll v14.8h, v18.8b, #7
1746 vshll.u8 q10, d17, #8 1969 sli v14.8h, v14.8h, #1
1747 vshll.u8 q15, d16, #8 1970 ushll v18.8h, v16.8b, #7
1748 vmull.u8 q9, d11, d25 1971 sli v18.8h, v18.8h, #1
1749 vsri.u16 q14, q10, #5 1972 ushll v19.8h, v17.8b, #7
1750 vmull.u8 q12, d11, d24 1973 sli v19.8h, v19.8h, #1
1751 vmull.u8 q13, d11, d26 1974 umull v12.8h, v11.8b, v24.8b
1752 vsri.u16 q14, q15, #11 1975 sri v14.8h, v19.8h, #5
1976 umull v13.8h, v11.8b, v25.8b
1977 umull v15.8h, v11.8b, v26.8b
1978 sri v14.8h, v18.8h, #11
1979 mov v28.d[0], v14.d[0]
1980 mov v29.d[0], v14.d[1]
1753 cache_preload 8, 8 1981 cache_preload 8, 8
1754 vrshr.u16 q8, q0, #8 1982 urshr v16.8h, v0.8h, #8
1755 vrshr.u16 q10, q1, #8 1983 urshr v17.8h, v1.8h, #8
1756 vrshr.u16 q11, q6, #8 1984 urshr v18.8h, v2.8h, #8
1757 vraddhn.u16 d0, q0, q8 1985 raddhn v0.8b, v0.8h, v16.8h
1758 vraddhn.u16 d1, q1, q10 1986 raddhn v1.8b, v1.8h, v17.8h
1759 vraddhn.u16 d2, q6, q11 1987 raddhn v2.8b, v2.8h, v18.8h
1760 vrshr.u16 q11, q12, #8 1988 urshr v19.8h, v12.8h, #8
1761 vrshr.u16 q8, q9, #8 1989 urshr v20.8h, v13.8h, #8
1762 vrshr.u16 q6, q13, #8 1990 urshr v21.8h, v15.8h, #8
1763 vraddhn.u16 d24, q12, q11 1991 raddhn v24.8b, v12.8h, v19.8h
1764 vraddhn.u16 d25, q9, q8 1992 raddhn v25.8b, v13.8h, v20.8h
1765 /* 1993 /*
1766 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 1994 * convert 8 r5g6b5 pixel data from {v4, v5} to planar
1767 * 8-bit format and put data into d16 - blue, d17 - green, 1995 * 8-bit format and put data into v16 - blue, v17 - green,
1768 * d18 - red 1996 * v18 - red
1769 */ 1997 */
1770 vshrn.u16 d17, q2, #3 1998 mov v4.d[1], v5.d[0]
1771 vshrn.u16 d18, q2, #8 1999 shrn v17.8b, v4.8h, #3
1772 vraddhn.u16 d26, q13, q6 2000 shrn v18.8b, v4.8h, #8
1773 vsli.u16 q2, q2, #5 2001 raddhn v26.8b, v15.8h, v21.8h
1774 vsri.u8 d17, d17, #6 2002 sli v4.8h, v4.8h, #5
1775 vsri.u8 d18, d18, #5 2003 sri v17.8b, v17.8b, #6
2004 sri v18.8b, v18.8b, #5
1776 /* 2005 /*
1777 * 'combine_over_ca' replacement 2006 * 'combine_over_ca' replacement
1778 * 2007 *
1779 * output: updated dest in d16 - blue, d17 - green, d18 - red 2008 * output: updated dest in v16 - blue, v17 - green, v18 - red
1780 */ 2009 */
1781 vmvn.8 q12, q12 2010 mvn v24.8b, v24.8b
1782 vshrn.u16 d16, q2, #2 2011 mvn v25.8b, v25.8b
1783 vmvn.8 d26, d26 2012 shrn v16.8b, v4.8h, #2
1784 vmull.u8 q7, d17, d25 2013 mvn v26.8b, v26.8b
1785 vmull.u8 q6, d16, d24 2014 umull v5.8h, v16.8b, v24.8b
1786 vmull.u8 q11, d18, d26 2015 umull v6.8h, v17.8b, v25.8b
1787 vst1.16 {d28, d29}, [DST_W, :128]! 2016 umull v7.8h, v18.8b, v26.8b
2017 st1 {v14.8h}, [DST_W], #16
1788.endm 2018.endm
1789 2019
1790.macro pixman_composite_over_n_8888_0565_ca_init 2020.macro pixman_composite_over_n_8888_0565_ca_init
1791 add DUMMY, sp, #ARGS_STACK_OFFSET 2021 mov v13.s[0], w4
1792 vpush {d8-d15} 2022 dup v8.8b, v13.b[0]
1793 vld1.32 {d11[0]}, [DUMMY] 2023 dup v9.8b, v13.b[1]
1794 vdup.8 d8, d11[0] 2024 dup v10.8b, v13.b[2]
1795 vdup.8 d9, d11[1] 2025 dup v11.8b, v13.b[3]
1796 vdup.8 d10, d11[2]
1797 vdup.8 d11, d11[3]
1798.endm 2026.endm
1799 2027
1800.macro pixman_composite_over_n_8888_0565_ca_cleanup 2028.macro pixman_composite_over_n_8888_0565_ca_cleanup
1801 vpop {d8-d15}
1802.endm 2029.endm
1803 2030
1804generate_composite_function \ 2031generate_composite_function \
1805 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \ 2032 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
1806 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2033 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -1813,41 +2040,40 @@ generate_composite_function \
1813 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head 2040 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1814 2041
1815/******************************************************************************/ 2042/******************************************************************************/
1816 2043
1817.macro pixman_composite_in_n_8_process_pixblock_head 2044.macro pixman_composite_in_n_8_process_pixblock_head
1818 /* expecting source data in {d0, d1, d2, d3} */ 2045 /* expecting source data in {v0, v1, v2, v3} */
1819 /* and destination data in {d4, d5, d6, d7} */ 2046 /* and destination data in {v4, v5, v6, v7} */
1820 vmull.u8 q8, d4, d3 2047 umull v8.8h, v4.8b, v3.8b
1821 vmull.u8 q9, d5, d3 2048 umull v9.8h, v5.8b, v3.8b
1822 vmull.u8 q10, d6, d3 2049 umull v10.8h, v6.8b, v3.8b
1823 vmull.u8 q11, d7, d3 2050 umull v11.8h, v7.8b, v3.8b
1824.endm 2051.endm
1825 2052
1826.macro pixman_composite_in_n_8_process_pixblock_tail 2053.macro pixman_composite_in_n_8_process_pixblock_tail
1827 vrshr.u16 q14, q8, #8 2054 urshr v14.8h, v8.8h, #8
1828 vrshr.u16 q15, q9, #8 2055 urshr v15.8h, v9.8h, #8
1829 vrshr.u16 q12, q10, #8 2056 urshr v12.8h, v10.8h, #8
1830 vrshr.u16 q13, q11, #8 2057 urshr v13.8h, v11.8h, #8
1831 vraddhn.u16 d28, q8, q14 2058 raddhn v28.8b, v8.8h, v14.8h
1832 vraddhn.u16 d29, q9, q15 2059 raddhn v29.8b, v9.8h, v15.8h
1833 vraddhn.u16 d30, q10, q12 2060 raddhn v30.8b, v10.8h, v12.8h
1834 vraddhn.u16 d31, q11, q13 2061 raddhn v31.8b, v11.8h, v13.8h
1835.endm 2062.endm
1836 2063
1837.macro pixman_composite_in_n_8_process_pixblock_tail_head 2064.macro pixman_composite_in_n_8_process_pixblock_tail_head
1838 pixman_composite_in_n_8_process_pixblock_tail 2065 pixman_composite_in_n_8_process_pixblock_tail
1839 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 2066 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1840 cache_preload 32, 32 2067 cache_preload 32, 32
1841 pixman_composite_in_n_8_process_pixblock_head 2068 pixman_composite_in_n_8_process_pixblock_head
1842 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 2069 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1843.endm 2070.endm
1844 2071
1845.macro pixman_composite_in_n_8_init 2072.macro pixman_composite_in_n_8_init
1846 add DUMMY, sp, #ARGS_STACK_OFFSET 2073 mov v3.s[0], w4
1847 vld1.32 {d3[0]}, [DUMMY] 2074 dup v3.8b, v3.b[3]
1848 vdup.8 d3, d3[3]
1849.endm 2075.endm
1850 2076
1851.macro pixman_composite_in_n_8_cleanup 2077.macro pixman_composite_in_n_8_cleanup
1852.endm 2078.endm
1853 2079
@@ -1865,52 +2091,51 @@ generate_composite_function \
1865 4, /* dst_r_basereg */ \ 2091 4, /* dst_r_basereg */ \
1866 0, /* src_basereg */ \ 2092 0, /* src_basereg */ \
1867 24 /* mask_basereg */ 2093 24 /* mask_basereg */
1868 2094
1869.macro pixman_composite_add_n_8_8_process_pixblock_head 2095.macro pixman_composite_add_n_8_8_process_pixblock_head
1870 /* expecting source data in {d8, d9, d10, d11} */ 2096 /* expecting source data in {v8, v9, v10, v11} */
1871 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ 2097 /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
1872 /* and destination data in {d4, d5, d6, d7} */ 2098 /* and destination data in {v4, v5, v6, v7} */
1873 /* mask is in d24, d25, d26, d27 */ 2099 /* mask is in v24, v25, v26, v27 */
1874 vmull.u8 q0, d24, d11 2100 umull v0.8h, v24.8b, v11.8b
1875 vmull.u8 q1, d25, d11 2101 umull v1.8h, v25.8b, v11.8b
1876 vmull.u8 q6, d26, d11 2102 umull v2.8h, v26.8b, v11.8b
1877 vmull.u8 q7, d27, d11 2103 umull v3.8h, v27.8b, v11.8b
1878 vrshr.u16 q10, q0, #8 2104 urshr v12.8h, v0.8h, #8
1879 vrshr.u16 q11, q1, #8 2105 urshr v13.8h, v1.8h, #8
1880 vrshr.u16 q12, q6, #8 2106 urshr v14.8h, v2.8h, #8
1881 vrshr.u16 q13, q7, #8 2107 urshr v15.8h, v3.8h, #8
1882 vraddhn.u16 d0, q0, q10 2108 raddhn v0.8b, v0.8h, v12.8h
1883 vraddhn.u16 d1, q1, q11 2109 raddhn v1.8b, v1.8h, v13.8h
1884 vraddhn.u16 d2, q6, q12 2110 raddhn v2.8b, v2.8h, v14.8h
1885 vraddhn.u16 d3, q7, q13 2111 raddhn v3.8b, v3.8h, v15.8h
1886 vqadd.u8 q14, q0, q2 2112 uqadd v28.8b, v0.8b, v4.8b
1887 vqadd.u8 q15, q1, q3 2113 uqadd v29.8b, v1.8b, v5.8b
2114 uqadd v30.8b, v2.8b, v6.8b
2115 uqadd v31.8b, v3.8b, v7.8b
1888.endm 2116.endm
1889 2117
1890.macro pixman_composite_add_n_8_8_process_pixblock_tail 2118.macro pixman_composite_add_n_8_8_process_pixblock_tail
1891.endm 2119.endm
1892 2120
1893/* TODO: expand macros and do better instruction scheduling */ 2121/* TODO: expand macros and do better instruction scheduling */
1894.macro pixman_composite_add_n_8_8_process_pixblock_tail_head 2122.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1895 pixman_composite_add_n_8_8_process_pixblock_tail 2123 pixman_composite_add_n_8_8_process_pixblock_tail
1896 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 2124 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1897 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 2125 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1898 fetch_mask_pixblock 2126 fetch_mask_pixblock
1899 cache_preload 32, 32 2127 cache_preload 32, 32
1900 pixman_composite_add_n_8_8_process_pixblock_head 2128 pixman_composite_add_n_8_8_process_pixblock_head
1901.endm 2129.endm
1902 2130
1903.macro pixman_composite_add_n_8_8_init 2131.macro pixman_composite_add_n_8_8_init
1904 add DUMMY, sp, #ARGS_STACK_OFFSET 2132 mov v11.s[0], w4
1905 vpush {d8-d15} 2133 dup v11.8b, v11.b[3]
1906 vld1.32 {d11[0]}, [DUMMY]
1907 vdup.8 d11, d11[3]
1908.endm 2134.endm
1909 2135
1910.macro pixman_composite_add_n_8_8_cleanup 2136.macro pixman_composite_add_n_8_8_cleanup
1911 vpop {d8-d15}
1912.endm 2137.endm
1913 2138
1914generate_composite_function \ 2139generate_composite_function \
1915 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ 2140 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1916 FLAG_DST_READWRITE, \ 2141 FLAG_DST_READWRITE, \
@@ -1923,37 +2148,39 @@ generate_composite_function \
1923 pixman_composite_add_n_8_8_process_pixblock_tail_head 2148 pixman_composite_add_n_8_8_process_pixblock_tail_head
1924 2149
1925/******************************************************************************/ 2150/******************************************************************************/
1926 2151
1927.macro pixman_composite_add_8_8_8_process_pixblock_head 2152.macro pixman_composite_add_8_8_8_process_pixblock_head
1928 /* expecting source data in {d0, d1, d2, d3} */ 2153 /* expecting source data in {v0, v1, v2, v3} */
1929 /* destination data in {d4, d5, d6, d7} */ 2154 /* destination data in {v4, v5, v6, v7} */
1930 /* mask in {d24, d25, d26, d27} */ 2155 /* mask in {v24, v25, v26, v27} */
1931 vmull.u8 q8, d24, d0 2156 umull v8.8h, v24.8b, v0.8b
1932 vmull.u8 q9, d25, d1 2157 umull v9.8h, v25.8b, v1.8b
1933 vmull.u8 q10, d26, d2 2158 umull v10.8h, v26.8b, v2.8b
1934 vmull.u8 q11, d27, d3 2159 umull v11.8h, v27.8b, v3.8b
1935 vrshr.u16 q0, q8, #8 2160 urshr v0.8h, v8.8h, #8
1936 vrshr.u16 q1, q9, #8 2161 urshr v1.8h, v9.8h, #8
1937 vrshr.u16 q12, q10, #8 2162 urshr v12.8h, v10.8h, #8
1938 vrshr.u16 q13, q11, #8 2163 urshr v13.8h, v11.8h, #8
1939 vraddhn.u16 d0, q0, q8 2164 raddhn v0.8b, v0.8h, v8.8h
1940 vraddhn.u16 d1, q1, q9 2165 raddhn v1.8b, v1.8h, v9.8h
1941 vraddhn.u16 d2, q12, q10 2166 raddhn v2.8b, v12.8h, v10.8h
1942 vraddhn.u16 d3, q13, q11 2167 raddhn v3.8b, v13.8h, v11.8h
1943 vqadd.u8 q14, q0, q2 2168 uqadd v28.8b, v0.8b, v4.8b
1944 vqadd.u8 q15, q1, q3 2169 uqadd v29.8b, v1.8b, v5.8b
2170 uqadd v30.8b, v2.8b, v6.8b
2171 uqadd v31.8b, v3.8b, v7.8b
1945.endm 2172.endm
1946 2173
1947.macro pixman_composite_add_8_8_8_process_pixblock_tail 2174.macro pixman_composite_add_8_8_8_process_pixblock_tail
1948.endm 2175.endm
1949 2176
1950/* TODO: expand macros and do better instruction scheduling */ 2177/* TODO: expand macros and do better instruction scheduling */
1951.macro pixman_composite_add_8_8_8_process_pixblock_tail_head 2178.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1952 pixman_composite_add_8_8_8_process_pixblock_tail 2179 pixman_composite_add_8_8_8_process_pixblock_tail
1953 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 2180 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1954 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 2181 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1955 fetch_mask_pixblock 2182 fetch_mask_pixblock
1956 fetch_src_pixblock 2183 fetch_src_pixblock
1957 cache_preload 32, 32 2184 cache_preload 32, 32
1958 pixman_composite_add_8_8_8_process_pixblock_head 2185 pixman_composite_add_8_8_8_process_pixblock_head
1959.endm 2186.endm
@@ -1976,57 +2203,60 @@ generate_composite_function \
1976 pixman_composite_add_8_8_8_process_pixblock_tail_head 2203 pixman_composite_add_8_8_8_process_pixblock_tail_head
1977 2204
1978/******************************************************************************/ 2205/******************************************************************************/
1979 2206
1980.macro pixman_composite_add_8888_8888_8888_process_pixblock_head 2207.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1981 /* expecting source data in {d0, d1, d2, d3} */ 2208 /* expecting source data in {v0, v1, v2, v3} */
1982 /* destination data in {d4, d5, d6, d7} */ 2209 /* destination data in {v4, v5, v6, v7} */
1983 /* mask in {d24, d25, d26, d27} */ 2210 /* mask in {v24, v25, v26, v27} */
1984 vmull.u8 q8, d27, d0 2211 umull v8.8h, v27.8b, v0.8b
1985 vmull.u8 q9, d27, d1 2212 umull v9.8h, v27.8b, v1.8b
1986 vmull.u8 q10, d27, d2 2213 umull v10.8h, v27.8b, v2.8b
1987 vmull.u8 q11, d27, d3 2214 umull v11.8h, v27.8b, v3.8b
1988 /* 1 cycle bubble */ 2215 /* 1 cycle bubble */
1989 vrsra.u16 q8, q8, #8 2216 ursra v8.8h, v8.8h, #8
1990 vrsra.u16 q9, q9, #8 2217 ursra v9.8h, v9.8h, #8
1991 vrsra.u16 q10, q10, #8 2218 ursra v10.8h, v10.8h, #8
1992 vrsra.u16 q11, q11, #8 2219 ursra v11.8h, v11.8h, #8
1993.endm 2220.endm
1994 2221
1995.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail 2222.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1996 /* 2 cycle bubble */ 2223 /* 2 cycle bubble */
1997 vrshrn.u16 d28, q8, #8 2224 rshrn v28.8b, v8.8h, #8
1998 vrshrn.u16 d29, q9, #8 2225 rshrn v29.8b, v9.8h, #8
1999 vrshrn.u16 d30, q10, #8 2226 rshrn v30.8b, v10.8h, #8
2000 vrshrn.u16 d31, q11, #8 2227 rshrn v31.8b, v11.8h, #8
2001 vqadd.u8 q14, q2, q14 2228 uqadd v28.8b, v4.8b, v28.8b
2002 /* 1 cycle bubble */ 2229 uqadd v29.8b, v5.8b, v29.8b
2003 vqadd.u8 q15, q3, q15 2230 uqadd v30.8b, v6.8b, v30.8b
2231 uqadd v31.8b, v7.8b, v31.8b
2004.endm 2232.endm
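
The head and tail above are the masked ADD operator: each source channel is scaled by the mask alpha (umull plus ursra #8 and rshrn #8, the same rounded division by 255 as earlier) and then added to the destination with a saturating uqadd. Per channel, in C (name is illustrative):

    #include <stdint.h>

    /* One channel of ADD with an a8r8g8b8 mask: scale the source by the
     * mask alpha, then saturating-add the destination. */
    static uint8_t add_channel(uint8_t s, uint8_t m_a, uint8_t d)
    {
        uint16_t t = (uint16_t)m_a * s;                   /* umull    */
        t += (t + 128) >> 8;                              /* ursra #8 */
        uint16_t sum = (uint16_t)(d + ((t + 128) >> 8));  /* rshrn #8 */
        return sum > 255 ? 255 : (uint8_t)sum;            /* uqadd    */
    }
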
2005 2233
2006.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2234.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2007 fetch_src_pixblock 2235 fetch_src_pixblock
2008 vrshrn.u16 d28, q8, #8 2236 rshrn v28.8b, v8.8h, #8
2009 fetch_mask_pixblock 2237 fetch_mask_pixblock
2010 vrshrn.u16 d29, q9, #8 2238 rshrn v29.8b, v9.8h, #8
2011 vmull.u8 q8, d27, d0 2239 umull v8.8h, v27.8b, v0.8b
2012 vrshrn.u16 d30, q10, #8 2240 rshrn v30.8b, v10.8h, #8
2013 vmull.u8 q9, d27, d1 2241 umull v9.8h, v27.8b, v1.8b
2014 vrshrn.u16 d31, q11, #8 2242 rshrn v31.8b, v11.8h, #8
2015 vmull.u8 q10, d27, d2 2243 umull v10.8h, v27.8b, v2.8b
2016 vqadd.u8 q14, q2, q14 2244 umull v11.8h, v27.8b, v3.8b
2017 vmull.u8 q11, d27, d3 2245 uqadd v28.8b, v4.8b, v28.8b
2018 vqadd.u8 q15, q3, q15 2246 uqadd v29.8b, v5.8b, v29.8b
2019 vrsra.u16 q8, q8, #8 2247 uqadd v30.8b, v6.8b, v30.8b
2020 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2248 uqadd v31.8b, v7.8b, v31.8b
2021 vrsra.u16 q9, q9, #8 2249 ursra v8.8h, v8.8h, #8
2022 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2250 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
2023 vrsra.u16 q10, q10, #8 2251 ursra v9.8h, v9.8h, #8
2252 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
2253 ursra v10.8h, v10.8h, #8
2024 2254
2025 cache_preload 8, 8 2255 cache_preload 8, 8
2026 2256
2027 vrsra.u16 q11, q11, #8 2257 ursra v11.8h, v11.8h, #8
2028.endm 2258.endm
2029 2259
2030generate_composite_function \ 2260generate_composite_function \
2031 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ 2261 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
2032 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2262 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -2034,21 +2264,29 @@ generate_composite_function \
2034 10, /* prefetch distance */ \ 2264 10, /* prefetch distance */ \
2035 default_init, \ 2265 default_init, \
2036 default_cleanup, \ 2266 default_cleanup, \
2037 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2267 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2038 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2268 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2039 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2269 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2270 28, /* dst_w_basereg */ \
2271 4, /* dst_r_basereg */ \
2272 0, /* src_basereg */ \
2273 24 /* mask_basereg */
2040 2274
2041generate_composite_function_single_scanline \ 2275generate_composite_function_single_scanline \
2042 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ 2276 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
2043 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2277 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2044 8, /* number of pixels, processed in a single block */ \ 2278 8, /* number of pixels, processed in a single block */ \
2045 default_init, \ 2279 default_init, \
2046 default_cleanup, \ 2280 default_cleanup, \
2047 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2281 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2048 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2282 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2049 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2283 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2284 28, /* dst_w_basereg */ \
2285 4, /* dst_r_basereg */ \
2286 0, /* src_basereg */ \
2287 24 /* mask_basereg */
2050 2288
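The masked-add macros above multiply each color channel by the mask with umull and then fold the 16-bit products back to 8 bits with a rounding correction, either ursra #8 followed by rshrn #8 (tail_head) or urshr #8 plus raddhn (head/tail). A minimal C sketch of the scalar operation this sequence approximates; the helper name is illustrative, not a pixman API:

    #include <stdint.h>

    /* Rounded (a * b) / 255.  Matches umull + ursra #8 + rshrn #8:
     * t = a*b; t += (t + 128) >> 8; result = (t + 128) >> 8,
     * which folds into the form below. */
    static inline uint8_t mul_div_255(uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t)a * b + 0x80;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }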
2051/******************************************************************************/ 2289/******************************************************************************/
2052 2290
2053generate_composite_function \ 2291generate_composite_function \
2054 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ 2292 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
@@ -2066,16 +2304,15 @@ generate_composite_function \
2066 27 /* mask_basereg */ 2304 27 /* mask_basereg */
2067 2305
2068/******************************************************************************/ 2306/******************************************************************************/
2069 2307
2070.macro pixman_composite_add_n_8_8888_init 2308.macro pixman_composite_add_n_8_8888_init
2071 add DUMMY, sp, #ARGS_STACK_OFFSET 2309 mov v3.s[0], w4
2072 vld1.32 {d3[0]}, [DUMMY] 2310 dup v0.8b, v3.b[0]
2073 vdup.8 d0, d3[0] 2311 dup v1.8b, v3.b[1]
2074 vdup.8 d1, d3[1] 2312 dup v2.8b, v3.b[2]
2075 vdup.8 d2, d3[2] 2313 dup v3.8b, v3.b[3]
2076 vdup.8 d3, d3[3]
2077.endm 2314.endm
2078 2315
2079.macro pixman_composite_add_n_8_8888_cleanup 2316.macro pixman_composite_add_n_8_8888_cleanup
2080.endm 2317.endm
2081 2318
@@ -2095,13 +2332,12 @@ generate_composite_function \
2095 27 /* mask_basereg */ 2332 27 /* mask_basereg */
2096 2333
2097/******************************************************************************/ 2334/******************************************************************************/
2098 2335
2099.macro pixman_composite_add_8888_n_8888_init 2336.macro pixman_composite_add_8888_n_8888_init
2100 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 2337 mov v27.s[0], w6
2101 vld1.32 {d27[0]}, [DUMMY] 2338 dup v27.8b, v27.b[3]
2102 vdup.8 d27, d27[3]
2103.endm 2339.endm
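Unlike the ARMv7 version, which reloaded the solid color from the stack through DUMMY, the A64 init macros read it straight from the integer argument registers (w4 for the solid source in add_n_8_8888 above, w6 for the solid mask here) and broadcast the alpha byte with mov/dup. A hedged C sketch of that broadcast, assuming little-endian a8r8g8b8 layout; the function name is illustrative:

    #include <stdint.h>
    #include <string.h>

    /* Splat the alpha byte of a packed a8r8g8b8 value into all eight
     * lanes of an 8-byte vector, as mov v27.s[0], w6 followed by
     * dup v27.8b, v27.b[3] does (byte 3 is alpha on little endian). */
    static void splat_alpha(uint32_t argb, uint8_t out[8])
    {
        uint8_t a = (uint8_t)(argb >> 24);
        memset(out, a, 8);
    }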
2104 2340
2105.macro pixman_composite_add_8888_n_8888_cleanup 2341.macro pixman_composite_add_8888_n_8888_cleanup
2106.endm 2342.endm
2107 2343
@@ -2121,55 +2357,55 @@ generate_composite_function \
2121 27 /* mask_basereg */ 2357 27 /* mask_basereg */
2122 2358
2123/******************************************************************************/ 2359/******************************************************************************/
2124 2360
2125.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 2361.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2126 /* expecting source data in {d0, d1, d2, d3} */ 2362 /* expecting source data in {v0, v1, v2, v3} */
2127 /* destination data in {d4, d5, d6, d7} */ 2363 /* destination data in {v4, v5, v6, v7} */
2128 /* solid mask is in d15 */ 2364 /* solid mask is in v15 */
2129 2365
2130 /* 'in' */ 2366 /* 'in' */
2131 vmull.u8 q8, d15, d3 2367 umull v11.8h, v15.8b, v3.8b
2132 vmull.u8 q6, d15, d2 2368 umull v10.8h, v15.8b, v2.8b
2133 vmull.u8 q5, d15, d1 2369 umull v9.8h, v15.8b, v1.8b
2134 vmull.u8 q4, d15, d0 2370 umull v8.8h, v15.8b, v0.8b
2135 vrshr.u16 q13, q8, #8 2371 urshr v16.8h, v11.8h, #8
2136 vrshr.u16 q12, q6, #8 2372 urshr v14.8h, v10.8h, #8
2137 vrshr.u16 q11, q5, #8 2373 urshr v13.8h, v9.8h, #8
2138 vrshr.u16 q10, q4, #8 2374 urshr v12.8h, v8.8h, #8
2139 vraddhn.u16 d3, q8, q13 2375 raddhn v3.8b, v11.8h, v16.8h
2140 vraddhn.u16 d2, q6, q12 2376 raddhn v2.8b, v10.8h, v14.8h
2141 vraddhn.u16 d1, q5, q11 2377 raddhn v1.8b, v9.8h, v13.8h
2142 vraddhn.u16 d0, q4, q10 2378 raddhn v0.8b, v8.8h, v12.8h
2143 vmvn.8 d24, d3 /* get inverted alpha */ 2379 mvn v24.8b, v3.8b /* get inverted alpha */
2144 /* now do alpha blending */ 2380 /* now do alpha blending */
2145 vmull.u8 q8, d24, d4 2381 umull v8.8h, v24.8b, v4.8b
2146 vmull.u8 q9, d24, d5 2382 umull v9.8h, v24.8b, v5.8b
2147 vmull.u8 q10, d24, d6 2383 umull v10.8h, v24.8b, v6.8b
2148 vmull.u8 q11, d24, d7 2384 umull v11.8h, v24.8b, v7.8b
2149.endm 2385.endm
2150 2386
2151.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 2387.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2152 vrshr.u16 q14, q8, #8 2388 urshr v16.8h, v8.8h, #8
2153 vrshr.u16 q15, q9, #8 2389 urshr v17.8h, v9.8h, #8
2154 vrshr.u16 q12, q10, #8 2390 urshr v18.8h, v10.8h, #8
2155 vrshr.u16 q13, q11, #8 2391 urshr v19.8h, v11.8h, #8
2156 vraddhn.u16 d28, q14, q8 2392 raddhn v28.8b, v16.8h, v8.8h
2157 vraddhn.u16 d29, q15, q9 2393 raddhn v29.8b, v17.8h, v9.8h
2158 vraddhn.u16 d30, q12, q10 2394 raddhn v30.8b, v18.8h, v10.8h
2159 vraddhn.u16 d31, q13, q11 2395 raddhn v31.8b, v19.8h, v11.8h
2160.endm 2396.endm
2161 2397
2162/* TODO: expand macros and do better instructions scheduling */ 2398/* TODO: expand macros and do better instructions scheduling */
2163.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head 2399.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
2164 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2400 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
2165 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 2401 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2166 fetch_src_pixblock 2402 fetch_src_pixblock
2167 cache_preload 8, 8 2403 cache_preload 8, 8
2168 fetch_mask_pixblock 2404 fetch_mask_pixblock
2169 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 2405 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2170 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2406 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
2171.endm 2407.endm
2172 2408
2173generate_composite_function_single_scanline \ 2409generate_composite_function_single_scanline \
2174 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ 2410 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
2175 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2411 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -2190,33 +2426,32 @@ generate_composite_function_single_scanline \
2190 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 2426 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2191.endm 2427.endm
2192 2428
2193.macro pixman_composite_over_8888_n_8888_process_pixblock_tail 2429.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
2194 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 2430 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2195 vqadd.u8 q14, q0, q14 2431 uqadd v28.8b, v0.8b, v28.8b
2196 vqadd.u8 q15, q1, q15 2432 uqadd v29.8b, v1.8b, v29.8b
2433 uqadd v30.8b, v2.8b, v30.8b
2434 uqadd v31.8b, v3.8b, v31.8b
2197.endm 2435.endm
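This tail finishes the OVER operator: the head it shares with out_reverse has already scaled the destination by the inverted source alpha, and the four uqadd instructions saturate-add the source back in. A per-channel C sketch under the same rounded-division helper as earlier; names are illustrative and alpha stands for the (possibly mask-modulated) source alpha:

    #include <stdint.h>

    static inline uint8_t mul_div_255(uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t)a * b + 0x80;
        return (uint8_t)((t + (t >> 8)) >> 8);
    }

    /* dst = src + dst * (255 - alpha) / 255, saturated at 255, which is
     * what mvn + umull + urshr/raddhn + uqadd compute lane by lane. */
    static inline uint8_t over_channel(uint8_t src, uint8_t dst, uint8_t alpha)
    {
        unsigned sum = src + mul_div_255(dst, (uint8_t)(255 - alpha));
        return sum > 255 ? 255 : (uint8_t)sum;
    }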
2198 2436
2199/* TODO: expand macros and do better instructions scheduling */ 2437/* TODO: expand macros and do better instructions scheduling */
2200.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head 2438.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2201 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2439 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
2202 pixman_composite_over_8888_n_8888_process_pixblock_tail 2440 pixman_composite_over_8888_n_8888_process_pixblock_tail
2203 fetch_src_pixblock 2441 fetch_src_pixblock
2204 cache_preload 8, 8 2442 cache_preload 8, 8
2205 pixman_composite_over_8888_n_8888_process_pixblock_head 2443 pixman_composite_over_8888_n_8888_process_pixblock_head
2206 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2444 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
2207.endm 2445.endm
2208 2446
2209.macro pixman_composite_over_8888_n_8888_init 2447.macro pixman_composite_over_8888_n_8888_init
2210 add DUMMY, sp, #48 2448 mov v15.s[0], w6
2211 vpush {d8-d15} 2449 dup v15.8b, v15.b[3]
2212 vld1.32 {d15[0]}, [DUMMY]
2213 vdup.8 d15, d15[3]
2214.endm 2450.endm
2215 2451
2216.macro pixman_composite_over_8888_n_8888_cleanup 2452.macro pixman_composite_over_8888_n_8888_cleanup
2217 vpop {d8-d15}
2218.endm 2453.endm
2219 2454
2220generate_composite_function \ 2455generate_composite_function \
2221 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ 2456 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
2222 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2457 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -2224,23 +2459,27 @@ generate_composite_function \
2224 5, /* prefetch distance */ \ 2459 5, /* prefetch distance */ \
2225 pixman_composite_over_8888_n_8888_init, \ 2460 pixman_composite_over_8888_n_8888_init, \
2226 pixman_composite_over_8888_n_8888_cleanup, \ 2461 pixman_composite_over_8888_n_8888_cleanup, \
2227 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 2462 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2228 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 2463 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2229 pixman_composite_over_8888_n_8888_process_pixblock_tail_head 2464 pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
2465 28, /* dst_w_basereg */ \
2466 4, /* dst_r_basereg */ \
2467 0, /* src_basereg */ \
2468 12 /* mask_basereg */
2230 2469
2231/******************************************************************************/ 2470/******************************************************************************/
2232 2471
2233/* TODO: expand macros and do better instructions scheduling */ 2472/* TODO: expand macros and do better instructions scheduling */
2234.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head 2473.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
2235 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2474 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
2236 pixman_composite_over_8888_n_8888_process_pixblock_tail 2475 pixman_composite_over_8888_n_8888_process_pixblock_tail
2237 fetch_src_pixblock 2476 fetch_src_pixblock
2238 cache_preload 8, 8 2477 cache_preload 8, 8
2239 fetch_mask_pixblock 2478 fetch_mask_pixblock
2240 pixman_composite_over_8888_n_8888_process_pixblock_head 2479 pixman_composite_over_8888_n_8888_process_pixblock_head
2241 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2480 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
2242.endm 2481.endm
2243 2482
2244generate_composite_function \ 2483generate_composite_function \
2245 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ 2484 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
2246 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2485 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -2272,17 +2511,17 @@ generate_composite_function_single_scanline \
2272 2511
2273/******************************************************************************/ 2512/******************************************************************************/
2274 2513
2275/* TODO: expand macros and do better instructions scheduling */ 2514/* TODO: expand macros and do better instructions scheduling */
2276.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head 2515.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
2277 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2516 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
2278 pixman_composite_over_8888_n_8888_process_pixblock_tail 2517 pixman_composite_over_8888_n_8888_process_pixblock_tail
2279 fetch_src_pixblock 2518 fetch_src_pixblock
2280 cache_preload 8, 8 2519 cache_preload 8, 8
2281 fetch_mask_pixblock 2520 fetch_mask_pixblock
2282 pixman_composite_over_8888_n_8888_process_pixblock_head 2521 pixman_composite_over_8888_n_8888_process_pixblock_head
2283 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2522 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
2284.endm 2523.endm
2285 2524
2286generate_composite_function \ 2525generate_composite_function \
2287 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ 2526 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
2288 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2527 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -2305,11 +2544,11 @@ generate_composite_function \
2305 2544
2306.macro pixman_composite_src_0888_0888_process_pixblock_tail 2545.macro pixman_composite_src_0888_0888_process_pixblock_tail
2307.endm 2546.endm
2308 2547
2309.macro pixman_composite_src_0888_0888_process_pixblock_tail_head 2548.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
2310 vst3.8 {d0, d1, d2}, [DST_W]! 2549 st3 {v0.8b, v1.8b, v2.8b}, [DST_W], #24
2311 fetch_src_pixblock 2550 fetch_src_pixblock
2312 cache_preload 8, 8 2551 cache_preload 8, 8
2313.endm 2552.endm
2314 2553
2315generate_composite_function \ 2554generate_composite_function \
@@ -2328,25 +2567,29 @@ generate_composite_function \
2328 0 /* mask_basereg */ 2567 0 /* mask_basereg */
2329 2568
2330/******************************************************************************/ 2569/******************************************************************************/
2331 2570
2332.macro pixman_composite_src_0888_8888_rev_process_pixblock_head 2571.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
2333 vswp d0, d2 2572 mov v31.8b, v2.8b
2573 mov v2.8b, v0.8b
2574 mov v0.8b, v31.8b
2334.endm 2575.endm
2335 2576
2336.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail 2577.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
2337.endm 2578.endm
2338 2579
2339.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head 2580.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
2340 vst4.8 {d0, d1, d2, d3}, [DST_W]! 2581 st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
2341 fetch_src_pixblock 2582 fetch_src_pixblock
2342 vswp d0, d2 2583 mov v31.8b, v2.8b
2584 mov v2.8b, v0.8b
2585 mov v0.8b, v31.8b
2343 cache_preload 8, 8 2586 cache_preload 8, 8
2344.endm 2587.endm
2345 2588
2346.macro pixman_composite_src_0888_8888_rev_init 2589.macro pixman_composite_src_0888_8888_rev_init
2347 veor d3, d3, d3 2590 eor v3.8b, v3.8b, v3.8b
2348.endm 2591.endm
2349 2592
2350generate_composite_function \ 2593generate_composite_function \
2351 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ 2594 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2352 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2595 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
@@ -2363,28 +2606,38 @@ generate_composite_function \
2363 0 /* mask_basereg */ 2606 0 /* mask_basereg */
2364 2607
2365/******************************************************************************/ 2608/******************************************************************************/
2366 2609
2367.macro pixman_composite_src_0888_0565_rev_process_pixblock_head 2610.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
2368 vshll.u8 q8, d1, #8 2611 ushll v8.8h, v1.8b, #7
2369 vshll.u8 q9, d2, #8 2612 sli v8.8h, v8.8h, #1
2613 ushll v9.8h, v2.8b, #7
2614 sli v9.8h, v9.8h, #1
2370.endm 2615.endm
2371 2616
2372.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail 2617.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
2373 vshll.u8 q14, d0, #8 2618 ushll v14.8h, v0.8b, #7
2374 vsri.u16 q14, q8, #5 2619 sli v14.8h, v14.8h, #1
2375 vsri.u16 q14, q9, #11 2620 sri v14.8h, v8.8h, #5
2621 sri v14.8h, v9.8h, #11
2622 mov v28.d[0], v14.d[0]
2623 mov v29.d[0], v14.d[1]
2376.endm 2624.endm
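Two A64-specific details show up in this 0888 to 0565 conversion: USHLL on byte elements only takes shift amounts 0-7, so the old vshll.u8 #8 is rebuilt as ushll #7 followed by sli #1 (the inserted shift doubles the value while keeping bit 0, which is zero here, so the pair amounts to x << 8), and the r5g6b5 halfword is then assembled with sri #5 and sri #11. A scalar C sketch of the packing; channel naming is illustrative, since the _rev variants reorder the bytes in memory:

    #include <stdint.h>

    /* Pack three 8-bit channels into r5g6b5.  The vector code reaches the
     * same result via (x << 8) built from ushll #7 + sli #1 and merges the
     * channels with sri #5 / sri #11. */
    static inline uint16_t pack_0565(uint8_t r, uint8_t g, uint8_t b)
    {
        return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }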
2377 2625
2378.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head 2626.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2379 vshll.u8 q14, d0, #8 2627 ushll v14.8h, v0.8b, #7
2628 sli v14.8h, v14.8h, #1
2380 fetch_src_pixblock 2629 fetch_src_pixblock
2381 vsri.u16 q14, q8, #5 2630 sri v14.8h, v8.8h, #5
2382 vsri.u16 q14, q9, #11 2631 sri v14.8h, v9.8h, #11
2383 vshll.u8 q8, d1, #8 2632 mov v28.d[0], v14.d[0]
2384 vst1.16 {d28, d29}, [DST_W, :128]! 2633 mov v29.d[0], v14.d[1]
2385 vshll.u8 q9, d2, #8 2634 ushll v8.8h, v1.8b, #7
2635 sli v8.8h, v8.8h, #1
2636 st1 {v14.8h}, [DST_W], #16
2637 ushll v9.8h, v2.8b, #7
2638 sli v9.8h, v9.8h, #1
2386.endm 2639.endm
2387 2640
2388generate_composite_function \ 2641generate_composite_function \
2389 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ 2642 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2390 FLAG_DST_WRITEONLY, \ 2643 FLAG_DST_WRITEONLY, \
@@ -2401,47 +2654,59 @@ generate_composite_function \
2401 0 /* mask_basereg */ 2654 0 /* mask_basereg */
2402 2655
2403/******************************************************************************/ 2656/******************************************************************************/
2404 2657
2405.macro pixman_composite_src_pixbuf_8888_process_pixblock_head 2658.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2406 vmull.u8 q8, d3, d0 2659 umull v8.8h, v3.8b, v0.8b
2407 vmull.u8 q9, d3, d1 2660 umull v9.8h, v3.8b, v1.8b
2408 vmull.u8 q10, d3, d2 2661 umull v10.8h, v3.8b, v2.8b
2409.endm 2662.endm
2410 2663
2411.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail 2664.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2412 vrshr.u16 q11, q8, #8 2665 urshr v11.8h, v8.8h, #8
2413 vswp d3, d31 2666 mov v30.8b, v31.8b
2414 vrshr.u16 q12, q9, #8 2667 mov v31.8b, v3.8b
2415 vrshr.u16 q13, q10, #8 2668 mov v3.8b, v30.8b
2416 vraddhn.u16 d30, q11, q8 2669 urshr v12.8h, v9.8h, #8
2417 vraddhn.u16 d29, q12, q9 2670 urshr v13.8h, v10.8h, #8
2418 vraddhn.u16 d28, q13, q10 2671 raddhn v30.8b, v11.8h, v8.8h
2672 raddhn v29.8b, v12.8h, v9.8h
2673 raddhn v28.8b, v13.8h, v10.8h
2419.endm 2674.endm
2420 2675
2421.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head 2676.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2422 vrshr.u16 q11, q8, #8 2677 urshr v11.8h, v8.8h, #8
2423 vswp d3, d31 2678 mov v30.8b, v31.8b
2424 vrshr.u16 q12, q9, #8 2679 mov v31.8b, v3.8b
 2425 vrshr.u16 q13, q10, #8 2680 mov v3.8b, v30.8b

2681 urshr v12.8h, v9.8h, #8
2682 urshr v13.8h, v10.8h, #8
2426 fetch_src_pixblock 2683 fetch_src_pixblock
2427 vraddhn.u16 d30, q11, q8 2684 raddhn v30.8b, v11.8h, v8.8h
2428 PF add PF_X, PF_X, #8 2685 PF add PF_X, PF_X, #8
2429 PF tst PF_CTL, #0xF 2686 PF tst PF_CTL, #0xF
2430 PF addne PF_X, PF_X, #8 2687 PF beq 10f
2431 PF subne PF_CTL, PF_CTL, #1 2688 PF add PF_X, PF_X, #8
2432 vraddhn.u16 d29, q12, q9 2689 PF sub PF_CTL, PF_CTL, #1
2433 vraddhn.u16 d28, q13, q10 269010:
2434 vmull.u8 q8, d3, d0 2691 raddhn v29.8b, v12.8h, v9.8h
2435 vmull.u8 q9, d3, d1 2692 raddhn v28.8b, v13.8h, v10.8h
2436 vmull.u8 q10, d3, d2 2693 umull v8.8h, v3.8b, v0.8b
2437 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2694 umull v9.8h, v3.8b, v1.8b
2695 umull v10.8h, v3.8b, v2.8b
2696 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
2438 PF cmp PF_X, ORIG_W 2697 PF cmp PF_X, ORIG_W
2439 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 2698 PF lsl DUMMY, PF_X, src_bpp_shift
2440 PF subge PF_X, PF_X, ORIG_W 2699 PF prfm pldl2strm, [PF_SRC, DUMMY]
2441 PF subges PF_CTL, PF_CTL, #0x10 2700 PF ble 10f
2442 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 2701 PF sub PF_X, PF_X, ORIG_W
2702 PF subs PF_CTL, PF_CTL, #0x10
2703 PF ble 10f
2704 PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
2705 PF ldrsb DUMMY, [PF_SRC, DUMMY]
2706 PF add PF_SRC, PF_SRC, #1
270710:
2443.endm 2708.endm
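A64 has no conditionally executed data-processing instructions, so the addne/subge/ldrgeb prefetch bookkeeping from the ARMv7 source is rewritten above with explicit compares, branches and numeric local labels (10f). A rough C rendering of that control flow; variable names are illustrative and __builtin_prefetch stands in for prfm pldl2strm:

    #include <stdint.h>

    static void advance_prefetch(int64_t *pf_x, int64_t *pf_ctl,
                                 const uint8_t **pf_src, int64_t orig_w,
                                 int64_t src_stride, unsigned src_bpp_shift)
    {
        *pf_x += 8;
        if (*pf_ctl & 0xf) {            /* tst PF_CTL, #0xF ; beq 10f */
            *pf_x += 8;
            *pf_ctl -= 1;
        }
        __builtin_prefetch(*pf_src + (*pf_x << src_bpp_shift));
        if (*pf_x > orig_w) {           /* cmp PF_X, ORIG_W ; ble 10f */
            *pf_x -= orig_w;
            *pf_ctl -= 0x10;
            if (*pf_ctl > 0) {          /* subs PF_CTL, PF_CTL, #0x10 ; ble 10f */
                /* dummy byte load that pulls the next scanline into cache */
                volatile uint8_t touch =
                    (*pf_src)[src_stride << src_bpp_shift];
                (void)touch;
                *pf_src += 1;
            }
        }
    }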
2444 2709
2445generate_composite_function \ 2710generate_composite_function \
2446 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ 2711 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2447 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2712 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
@@ -2458,47 +2723,59 @@ generate_composite_function \
2458 0 /* mask_basereg */ 2723 0 /* mask_basereg */
2459 2724
2460/******************************************************************************/ 2725/******************************************************************************/
2461 2726
2462.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head 2727.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2463 vmull.u8 q8, d3, d0 2728 umull v8.8h, v3.8b, v0.8b
2464 vmull.u8 q9, d3, d1 2729 umull v9.8h, v3.8b, v1.8b
2465 vmull.u8 q10, d3, d2 2730 umull v10.8h, v3.8b, v2.8b
2466.endm 2731.endm
2467 2732
2468.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail 2733.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2469 vrshr.u16 q11, q8, #8 2734 urshr v11.8h, v8.8h, #8
2470 vswp d3, d31 2735 mov v30.8b, v31.8b
2471 vrshr.u16 q12, q9, #8 2736 mov v31.8b, v3.8b
2472 vrshr.u16 q13, q10, #8 2737 mov v3.8b, v30.8b
2473 vraddhn.u16 d28, q11, q8 2738 urshr v12.8h, v9.8h, #8
2474 vraddhn.u16 d29, q12, q9 2739 urshr v13.8h, v10.8h, #8
2475 vraddhn.u16 d30, q13, q10 2740 raddhn v28.8b, v11.8h, v8.8h
2741 raddhn v29.8b, v12.8h, v9.8h
2742 raddhn v30.8b, v13.8h, v10.8h
2476.endm 2743.endm
2477 2744
2478.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head 2745.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2479 vrshr.u16 q11, q8, #8 2746 urshr v11.8h, v8.8h, #8
2480 vswp d3, d31 2747 mov v30.8b, v31.8b
2481 vrshr.u16 q12, q9, #8 2748 mov v31.8b, v3.8b
2482 vrshr.u16 q13, q10, #8 2749 mov v3.8b, v30.8b
2750 urshr v12.8h, v9.8h, #8
2751 urshr v13.8h, v10.8h, #8
2483 fetch_src_pixblock 2752 fetch_src_pixblock
2484 vraddhn.u16 d28, q11, q8 2753 raddhn v28.8b, v11.8h, v8.8h
2485 PF add PF_X, PF_X, #8 2754 PF add PF_X, PF_X, #8
2486 PF tst PF_CTL, #0xF 2755 PF tst PF_CTL, #0xF
2487 PF addne PF_X, PF_X, #8 2756 PF beq 10f
2488 PF subne PF_CTL, PF_CTL, #1 2757 PF add PF_X, PF_X, #8
2489 vraddhn.u16 d29, q12, q9 2758 PF sub PF_CTL, PF_CTL, #1
2490 vraddhn.u16 d30, q13, q10 275910:
2491 vmull.u8 q8, d3, d0 2760 raddhn v29.8b, v12.8h, v9.8h
2492 vmull.u8 q9, d3, d1 2761 raddhn v30.8b, v13.8h, v10.8h
2493 vmull.u8 q10, d3, d2 2762 umull v8.8h, v3.8b, v0.8b
2494 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2763 umull v9.8h, v3.8b, v1.8b
2764 umull v10.8h, v3.8b, v2.8b
2765 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
2495 PF cmp PF_X, ORIG_W 2766 PF cmp PF_X, ORIG_W
2496 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 2767 PF lsl DUMMY, PF_X, src_bpp_shift
2497 PF subge PF_X, PF_X, ORIG_W 2768 PF prfm pldl2strm, [PF_SRC, DUMMY]
2498 PF subges PF_CTL, PF_CTL, #0x10 2769 PF ble 10f
2499 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 2770 PF sub PF_X, PF_X, ORIG_W
2771 PF subs PF_CTL, PF_CTL, #0x10
2772 PF ble 10f
2773 PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
2774 PF ldrsb DUMMY, [PF_SRC, DUMMY]
2775 PF add PF_SRC, PF_SRC, #1
277610:
2500.endm 2777.endm
2501 2778
2502generate_composite_function \ 2779generate_composite_function \
2503 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ 2780 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2504 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2781 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
@@ -2515,52 +2792,59 @@ generate_composite_function \
2515 0 /* mask_basereg */ 2792 0 /* mask_basereg */
2516 2793
2517/******************************************************************************/ 2794/******************************************************************************/
2518 2795
2519.macro pixman_composite_over_0565_8_0565_process_pixblock_head 2796.macro pixman_composite_over_0565_8_0565_process_pixblock_head
2520 /* mask is in d15 */ 2797 /* mask is in v15 */
2521 convert_0565_to_x888 q4, d2, d1, d0 2798 mov v4.d[0], v8.d[0]
2522 convert_0565_to_x888 q5, d6, d5, d4 2799 mov v4.d[1], v9.d[0]
2523 /* source pixel data is in {d0, d1, d2, XX} */ 2800 mov v13.d[0], v10.d[0]
2524 /* destination pixel data is in {d4, d5, d6, XX} */ 2801 mov v13.d[1], v11.d[0]
2525 vmvn.8 d7, d15 2802 convert_0565_to_x888 v4, v2, v1, v0
2526 vmull.u8 q6, d15, d2 2803 convert_0565_to_x888 v13, v6, v5, v4
2527 vmull.u8 q5, d15, d1 2804 /* source pixel data is in {v0, v1, v2, XX} */
2528 vmull.u8 q4, d15, d0 2805 /* destination pixel data is in {v4, v5, v6, XX} */
2529 vmull.u8 q8, d7, d4 2806 mvn v7.8b, v15.8b
2530 vmull.u8 q9, d7, d5 2807 umull v10.8h, v15.8b, v2.8b
2531 vmull.u8 q13, d7, d6 2808 umull v9.8h, v15.8b, v1.8b
2532 vrshr.u16 q12, q6, #8 2809 umull v8.8h, v15.8b, v0.8b
2533 vrshr.u16 q11, q5, #8 2810 umull v11.8h, v7.8b, v4.8b
2534 vrshr.u16 q10, q4, #8 2811 umull v12.8h, v7.8b, v5.8b
2535 vraddhn.u16 d2, q6, q12 2812 umull v13.8h, v7.8b, v6.8b
2536 vraddhn.u16 d1, q5, q11 2813 urshr v19.8h, v10.8h, #8
2537 vraddhn.u16 d0, q4, q10 2814 urshr v18.8h, v9.8h, #8
2815 urshr v17.8h, v8.8h, #8
2816 raddhn v2.8b, v10.8h, v19.8h
2817 raddhn v1.8b, v9.8h, v18.8h
2818 raddhn v0.8b, v8.8h, v17.8h
2538.endm 2819.endm
2539 2820
2540.macro pixman_composite_over_0565_8_0565_process_pixblock_tail 2821.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
2541 vrshr.u16 q14, q8, #8 2822 urshr v17.8h, v11.8h, #8
2542 vrshr.u16 q15, q9, #8 2823 urshr v18.8h, v12.8h, #8
2543 vrshr.u16 q12, q13, #8 2824 urshr v19.8h, v13.8h, #8
2544 vraddhn.u16 d28, q14, q8 2825 raddhn v28.8b, v17.8h, v11.8h
2545 vraddhn.u16 d29, q15, q9 2826 raddhn v29.8b, v18.8h, v12.8h
2546 vraddhn.u16 d30, q12, q13 2827 raddhn v30.8b, v19.8h, v13.8h
2547 vqadd.u8 q0, q0, q14 2828 uqadd v0.8b, v0.8b, v28.8b
2548 vqadd.u8 q1, q1, q15 2829 uqadd v1.8b, v1.8b, v29.8b
2549 /* 32bpp result is in {d0, d1, d2, XX} */ 2830 uqadd v2.8b, v2.8b, v30.8b
2550 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 2831 /* 32bpp result is in {v0, v1, v2, XX} */
2832 convert_8888_to_0565 v2, v1, v0, v14, v30, v13
2833 mov v28.d[0], v14.d[0]
2834 mov v29.d[0], v14.d[1]
2551.endm 2835.endm
2552 2836
2553/* TODO: expand macros and do better instructions scheduling */ 2837/* TODO: expand macros and do better instructions scheduling */
2554.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head 2838.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2555 fetch_mask_pixblock 2839 fetch_mask_pixblock
2556 pixman_composite_over_0565_8_0565_process_pixblock_tail 2840 pixman_composite_over_0565_8_0565_process_pixblock_tail
2557 fetch_src_pixblock 2841 fetch_src_pixblock
2558 vld1.16 {d10, d11}, [DST_R, :128]! 2842 ld1 {v10.4h, v11.4h}, [DST_R], #16
2559 cache_preload 8, 8 2843 cache_preload 8, 8
2560 pixman_composite_over_0565_8_0565_process_pixblock_head 2844 pixman_composite_over_0565_8_0565_process_pixblock_head
2561 vst1.16 {d28, d29}, [DST_W, :128]! 2845 st1 {v14.8h}, [DST_W], #16
2562.endm 2846.endm
2563 2847
2564generate_composite_function \ 2848generate_composite_function \
2565 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ 2849 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2566 FLAG_DST_READWRITE, \ 2850 FLAG_DST_READWRITE, \
@@ -2577,18 +2861,15 @@ generate_composite_function \
2577 15 /* mask_basereg */ 2861 15 /* mask_basereg */
2578 2862
2579/******************************************************************************/ 2863/******************************************************************************/
2580 2864
2581.macro pixman_composite_over_0565_n_0565_init 2865.macro pixman_composite_over_0565_n_0565_init
2582 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 2866 mov v15.s[0], w6
2583 vpush {d8-d15} 2867 dup v15.8b, v15.b[3]
2584 vld1.32 {d15[0]}, [DUMMY]
2585 vdup.8 d15, d15[3]
2586.endm 2868.endm
2587 2869
2588.macro pixman_composite_over_0565_n_0565_cleanup 2870.macro pixman_composite_over_0565_n_0565_cleanup
2589 vpop {d8-d15}
2590.endm 2871.endm
2591 2872
2592generate_composite_function \ 2873generate_composite_function \
2593 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ 2874 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2594 FLAG_DST_READWRITE, \ 2875 FLAG_DST_READWRITE, \
@@ -2598,49 +2879,56 @@ generate_composite_function \
2598 pixman_composite_over_0565_n_0565_cleanup, \ 2879 pixman_composite_over_0565_n_0565_cleanup, \
2599 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 2880 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2600 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 2881 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2601 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 2882 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2602 28, /* dst_w_basereg */ \ 2883 28, /* dst_w_basereg */ \
2603 10, /* dst_r_basereg */ \ 2884 10, /* dst_r_basereg */ \
2604 8, /* src_basereg */ \ 2885 8, /* src_basereg */ \
2605 15 /* mask_basereg */ 2886 15 /* mask_basereg */
2606 2887
2607/******************************************************************************/ 2888/******************************************************************************/
2608 2889
2609.macro pixman_composite_add_0565_8_0565_process_pixblock_head 2890.macro pixman_composite_add_0565_8_0565_process_pixblock_head
2610 /* mask is in d15 */ 2891 /* mask is in v15 */
2611 convert_0565_to_x888 q4, d2, d1, d0 2892 mov v4.d[0], v8.d[0]
2612 convert_0565_to_x888 q5, d6, d5, d4 2893 mov v4.d[1], v9.d[0]
2613 /* source pixel data is in {d0, d1, d2, XX} */ 2894 mov v13.d[0], v10.d[0]
2614 /* destination pixel data is in {d4, d5, d6, XX} */ 2895 mov v13.d[1], v11.d[0]
2615 vmull.u8 q6, d15, d2 2896 convert_0565_to_x888 v4, v2, v1, v0
2616 vmull.u8 q5, d15, d1 2897 convert_0565_to_x888 v13, v6, v5, v4
2617 vmull.u8 q4, d15, d0 2898 /* source pixel data is in {v0, v1, v2, XX} */
2618 vrshr.u16 q12, q6, #8 2899 /* destination pixel data is in {v4, v5, v6, XX} */
2619 vrshr.u16 q11, q5, #8 2900 umull v9.8h, v15.8b, v2.8b
2620 vrshr.u16 q10, q4, #8 2901 umull v8.8h, v15.8b, v1.8b
2621 vraddhn.u16 d2, q6, q12 2902 umull v7.8h, v15.8b, v0.8b
2622 vraddhn.u16 d1, q5, q11 2903 urshr v12.8h, v9.8h, #8
2623 vraddhn.u16 d0, q4, q10 2904 urshr v11.8h, v8.8h, #8
2905 urshr v10.8h, v7.8h, #8
2906 raddhn v2.8b, v9.8h, v12.8h
2907 raddhn v1.8b, v8.8h, v11.8h
2908 raddhn v0.8b, v7.8h, v10.8h
2624.endm 2909.endm
2625 2910
2626.macro pixman_composite_add_0565_8_0565_process_pixblock_tail 2911.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2627 vqadd.u8 q0, q0, q2 2912 uqadd v0.8b, v0.8b, v4.8b
2628 vqadd.u8 q1, q1, q3 2913 uqadd v1.8b, v1.8b, v5.8b
2629 /* 32bpp result is in {d0, d1, d2, XX} */ 2914 uqadd v2.8b, v2.8b, v6.8b
2630 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 2915 /* 32bpp result is in {v0, v1, v2, XX} */
2916 convert_8888_to_0565 v2, v1, v0, v14, v30, v13
2917 mov v28.d[0], v14.d[0]
2918 mov v29.d[0], v14.d[1]
2631.endm 2919.endm
2632 2920
2633/* TODO: expand macros and do better instructions scheduling */ 2921/* TODO: expand macros and do better instructions scheduling */
2634.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head 2922.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2635 fetch_mask_pixblock 2923 fetch_mask_pixblock
2636 pixman_composite_add_0565_8_0565_process_pixblock_tail 2924 pixman_composite_add_0565_8_0565_process_pixblock_tail
2637 fetch_src_pixblock 2925 fetch_src_pixblock
2638 vld1.16 {d10, d11}, [DST_R, :128]! 2926 ld1 {v10.4h, v11.4h}, [DST_R], #16
2639 cache_preload 8, 8 2927 cache_preload 8, 8
2640 pixman_composite_add_0565_8_0565_process_pixblock_head 2928 pixman_composite_add_0565_8_0565_process_pixblock_head
2641 vst1.16 {d28, d29}, [DST_W, :128]! 2929 st1 {v14.8h}, [DST_W], #16
2642.endm 2930.endm
2643 2931
2644generate_composite_function \ 2932generate_composite_function \
2645 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ 2933 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2646 FLAG_DST_READWRITE, \ 2934 FLAG_DST_READWRITE, \
@@ -2657,39 +2945,43 @@ generate_composite_function \
2657 15 /* mask_basereg */ 2945 15 /* mask_basereg */
2658 2946
2659/******************************************************************************/ 2947/******************************************************************************/
2660 2948
2661.macro pixman_composite_out_reverse_8_0565_process_pixblock_head 2949.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2662 /* mask is in d15 */ 2950 /* mask is in v15 */
2663 convert_0565_to_x888 q5, d6, d5, d4 2951 mov v12.d[0], v10.d[0]
2664 /* destination pixel data is in {d4, d5, d6, xx} */ 2952 mov v12.d[1], v11.d[0]
2665 vmvn.8 d24, d15 /* get inverted alpha */ 2953 convert_0565_to_x888 v12, v6, v5, v4
2954 /* destination pixel data is in {v4, v5, v6, xx} */
2955 mvn v24.8b, v15.8b /* get inverted alpha */
2666 /* now do alpha blending */ 2956 /* now do alpha blending */
2667 vmull.u8 q8, d24, d4 2957 umull v8.8h, v24.8b, v4.8b
2668 vmull.u8 q9, d24, d5 2958 umull v9.8h, v24.8b, v5.8b
2669 vmull.u8 q10, d24, d6 2959 umull v10.8h, v24.8b, v6.8b
2670.endm 2960.endm
2671 2961
2672.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail 2962.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
2673 vrshr.u16 q14, q8, #8 2963 urshr v11.8h, v8.8h, #8
2674 vrshr.u16 q15, q9, #8 2964 urshr v12.8h, v9.8h, #8
2675 vrshr.u16 q12, q10, #8 2965 urshr v13.8h, v10.8h, #8
2676 vraddhn.u16 d0, q14, q8 2966 raddhn v0.8b, v11.8h, v8.8h
2677 vraddhn.u16 d1, q15, q9 2967 raddhn v1.8b, v12.8h, v9.8h
2678 vraddhn.u16 d2, q12, q10 2968 raddhn v2.8b, v13.8h, v10.8h
2679 /* 32bpp result is in {d0, d1, d2, XX} */ 2969 /* 32bpp result is in {v0, v1, v2, XX} */
2680 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 2970 convert_8888_to_0565 v2, v1, v0, v14, v12, v3
2971 mov v28.d[0], v14.d[0]
2972 mov v29.d[0], v14.d[1]
2681.endm 2973.endm
2682 2974
2683/* TODO: expand macros and do better instructions scheduling */ 2975/* TODO: expand macros and do better instructions scheduling */
2684.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head 2976.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2685 fetch_src_pixblock 2977 fetch_src_pixblock
2686 pixman_composite_out_reverse_8_0565_process_pixblock_tail 2978 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2687 vld1.16 {d10, d11}, [DST_R, :128]! 2979 ld1 {v10.4h, v11.4h}, [DST_R], #16
2688 cache_preload 8, 8 2980 cache_preload 8, 8
2689 pixman_composite_out_reverse_8_0565_process_pixblock_head 2981 pixman_composite_out_reverse_8_0565_process_pixblock_head
2690 vst1.16 {d28, d29}, [DST_W, :128]! 2982 st1 {v14.8h}, [DST_W], #16
2691.endm 2983.endm
2692 2984
2693generate_composite_function \ 2985generate_composite_function \
2694 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ 2986 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2695 FLAG_DST_READWRITE, \ 2987 FLAG_DST_READWRITE, \
@@ -2699,47 +2991,47 @@ generate_composite_function \
2699 default_cleanup_need_all_regs, \ 2991 default_cleanup_need_all_regs, \
2700 pixman_composite_out_reverse_8_0565_process_pixblock_head, \ 2992 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2701 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ 2993 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2702 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ 2994 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2703 28, /* dst_w_basereg */ \ 2995 28, /* dst_w_basereg */ \
2704 10, /* dst_r_basereg */ \ 2996 10, /* dst_r_basereg */ \
2705 15, /* src_basereg */ \ 2997 15, /* src_basereg */ \
2706 0 /* mask_basereg */ 2998 0 /* mask_basereg */
2707 2999
2708/******************************************************************************/ 3000/******************************************************************************/
2709 3001
2710.macro pixman_composite_out_reverse_8_8888_process_pixblock_head 3002.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
2711 /* src is in d0 */ 3003 /* src is in v0 */
2712 /* destination pixel data is in {d4, d5, d6, d7} */ 3004 /* destination pixel data is in {v4, v5, v6, v7} */
2713 vmvn.8 d1, d0 /* get inverted alpha */ 3005 mvn v1.8b, v0.8b /* get inverted alpha */
2714 /* now do alpha blending */ 3006 /* now do alpha blending */
2715 vmull.u8 q8, d1, d4 3007 umull v8.8h, v1.8b, v4.8b
2716 vmull.u8 q9, d1, d5 3008 umull v9.8h, v1.8b, v5.8b
2717 vmull.u8 q10, d1, d6 3009 umull v10.8h, v1.8b, v6.8b
2718 vmull.u8 q11, d1, d7 3010 umull v11.8h, v1.8b, v7.8b
2719.endm 3011.endm
2720 3012
2721.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail 3013.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
2722 vrshr.u16 q14, q8, #8 3014 urshr v14.8h, v8.8h, #8
2723 vrshr.u16 q15, q9, #8 3015 urshr v15.8h, v9.8h, #8
2724 vrshr.u16 q12, q10, #8 3016 urshr v12.8h, v10.8h, #8
2725 vrshr.u16 q13, q11, #8 3017 urshr v13.8h, v11.8h, #8
2726 vraddhn.u16 d28, q14, q8 3018 raddhn v28.8b, v14.8h, v8.8h
2727 vraddhn.u16 d29, q15, q9 3019 raddhn v29.8b, v15.8h, v9.8h
2728 vraddhn.u16 d30, q12, q10 3020 raddhn v30.8b, v12.8h, v10.8h
2729 vraddhn.u16 d31, q13, q11 3021 raddhn v31.8b, v13.8h, v11.8h
2730 /* 32bpp result is in {d28, d29, d30, d31} */ 3022 /* 32bpp result is in {v28, v29, v30, v31} */
2731.endm 3023.endm
2732 3024
2733/* TODO: expand macros and do better instructions scheduling */ 3025/* TODO: expand macros and do better instructions scheduling */
2734.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head 3026.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
2735 fetch_src_pixblock 3027 fetch_src_pixblock
2736 pixman_composite_out_reverse_8_8888_process_pixblock_tail 3028 pixman_composite_out_reverse_8_8888_process_pixblock_tail
2737 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 3029 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
2738 cache_preload 8, 8 3030 cache_preload 8, 8
2739 pixman_composite_out_reverse_8_8888_process_pixblock_head 3031 pixman_composite_out_reverse_8_8888_process_pixblock_head
2740 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 3032 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
2741.endm 3033.endm
2742 3034
2743generate_composite_function \ 3035generate_composite_function \
2744 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \ 3036 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
2745 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3037 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -2752,11 +3044,11 @@ generate_composite_function \
2752 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \ 3044 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
2753 28, /* dst_w_basereg */ \ 3045 28, /* dst_w_basereg */ \
2754 4, /* dst_r_basereg */ \ 3046 4, /* dst_r_basereg */ \
2755 0, /* src_basereg */ \ 3047 0, /* src_basereg */ \
2756 0 /* mask_basereg */ 3048 0 /* mask_basereg */
2757 3049
2758/******************************************************************************/ 3050/******************************************************************************/
2759 3051
2760generate_composite_function_nearest_scanline \ 3052generate_composite_function_nearest_scanline \
2761 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ 3053 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2762 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3054 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
@@ -2787,12 +3079,12 @@ generate_composite_function_nearest_scanline \
2787 8, /* number of pixels, processed in a single block */ \ 3079 8, /* number of pixels, processed in a single block */ \
2788 default_init, \ 3080 default_init, \
2789 default_cleanup, \ 3081 default_cleanup, \
2790 pixman_composite_src_8888_0565_process_pixblock_head, \ 3082 pixman_composite_src_8888_0565_process_pixblock_head, \
2791 pixman_composite_src_8888_0565_process_pixblock_tail, \ 3083 pixman_composite_src_8888_0565_process_pixblock_tail, \
 2792 pixman_composite_src_8888_0565_process_pixblock_tail_head 3084 pixman_composite_src_8888_0565_process_pixblock_tail_head
2793 3085
2794generate_composite_function_nearest_scanline \ 3086generate_composite_function_nearest_scanline \
2795 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ 3087 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2796 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 3088 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2797 8, /* number of pixels, processed in a single block */ \ 3089 8, /* number of pixels, processed in a single block */ \
2798 default_init, \ 3090 default_init, \
@@ -2838,33 +3130,35 @@ generate_composite_function_nearest_scanline \
2838 */ 3130 */
2839 3131
2840.macro bilinear_load_8888 reg1, reg2, tmp 3132.macro bilinear_load_8888 reg1, reg2, tmp
2841 asr TMP1, X, #16 3133 asr TMP1, X, #16
2842 add X, X, UX 3134 add X, X, UX
2843 add TMP1, TOP, TMP1, lsl #2 3135 lsl TMP2, TMP1, #2
2844 vld1.32 {reg1}, [TMP1], STRIDE 3136 add TMP1, TOP, TMP2
2845 vld1.32 {reg2}, [TMP1] 3137 ld1 {&reg1&.2s}, [TMP1], STRIDE
3138 ld1 {&reg2&.2s}, [TMP1]
2846.endm 3139.endm
2847 3140
2848.macro bilinear_load_0565 reg1, reg2, tmp 3141.macro bilinear_load_0565 reg1, reg2, tmp
2849 asr TMP1, X, #16 3142 asr TMP1, X, #16
2850 add X, X, UX 3143 add X, X, UX
2851 add TMP1, TOP, TMP1, lsl #1 3144 lsl TMP2, TMP1, #1
2852 vld1.32 {reg2[0]}, [TMP1], STRIDE 3145 add TMP1, TOP, TMP2
2853 vld1.32 {reg2[1]}, [TMP1] 3146 ld1 {&reg2&.s}[0], [TMP1], STRIDE
3147 ld1 {&reg2&.s}[1], [TMP1]
2854 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp 3148 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
2855.endm 3149.endm
2856 3150
2857.macro bilinear_load_and_vertical_interpolate_two_8888 \ 3151.macro bilinear_load_and_vertical_interpolate_two_8888 \
2858 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 3152 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2859 3153
2860 bilinear_load_8888 reg1, reg2, tmp1 3154 bilinear_load_8888 reg1, reg2, tmp1
2861 vmull.u8 acc1, reg1, d28 3155 umull &acc1&.8h, &reg1&.8b, v28.8b
2862 vmlal.u8 acc1, reg2, d29 3156 umlal &acc1&.8h, &reg2&.8b, v29.8b
2863 bilinear_load_8888 reg3, reg4, tmp2 3157 bilinear_load_8888 reg3, reg4, tmp2
2864 vmull.u8 acc2, reg3, d28 3158 umull &acc2&.8h, &reg3&.8b, v28.8b
2865 vmlal.u8 acc2, reg4, d29 3159 umlal &acc2&.8h, &reg4&.8b, v29.8b
2866.endm 3160.endm
2867 3161
2868.macro bilinear_load_and_vertical_interpolate_four_8888 \ 3162.macro bilinear_load_and_vertical_interpolate_four_8888 \
2869 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 3163 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2870 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 3164 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
@@ -2873,170 +3167,190 @@ generate_composite_function_nearest_scanline \
2873 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi 3167 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2874 bilinear_load_and_vertical_interpolate_two_8888 \ 3168 bilinear_load_and_vertical_interpolate_two_8888 \
2875 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 3169 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2876.endm 3170.endm
2877 3171
3172.macro vzip reg1, reg2
3173 umov TMP4, v31.d[0]
3174 zip1 v31.8b, reg1, reg2
3175 zip2 reg2, reg1, reg2
3176 mov reg1, v31.8b
3177 mov v31.d[0], TMP4
3178.endm
3179
3180.macro vuzp reg1, reg2
3181 umov TMP4, v31.d[0]
3182 uzp1 v31.8b, reg1, reg2
3183 uzp2 reg2, reg1, reg2
3184 mov reg1, v31.8b
3185 mov v31.d[0], TMP4
3186.endm
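ARMv7's vzip/vuzp interleave two D registers in place, while A64's zip1/zip2 and uzp1/uzp2 write a full result to a separate destination, so the two helper macros above emulate the in-place behavior with v31 as scratch, parking its previous contents in TMP4. A small C sketch of the byte zip being emulated; the array form and function name are illustrative:

    #include <stdint.h>
    #include <string.h>

    /* In-place interleave of two 8-byte vectors: afterwards
     * a = {a0,b0,a1,b1,a2,b2,a3,b3} and b = {a4,b4,...,a7,b7},
     * i.e. the effect of ARMv7 "vzip.8 d_a, d_b". */
    static void vzip8(uint8_t a[8], uint8_t b[8])
    {
        uint8_t lo[8], hi[8];
        for (int i = 0; i < 4; i++) {
            lo[2 * i]     = a[i];
            lo[2 * i + 1] = b[i];
            hi[2 * i]     = a[i + 4];
            hi[2 * i + 1] = b[i + 4];
        }
        memcpy(a, lo, 8);
        memcpy(b, hi, 8);
    }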
3187
2878.macro bilinear_load_and_vertical_interpolate_two_0565 \ 3188.macro bilinear_load_and_vertical_interpolate_two_0565 \
2879 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi 3189 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2880
2881 asr TMP1, X, #16 3190 asr TMP1, X, #16
2882 add X, X, UX 3191 add X, X, UX
2883 add TMP1, TOP, TMP1, lsl #1 3192 lsl TMP2, TMP1, #1
3193 add TMP1, TOP, TMP2
2884 asr TMP2, X, #16 3194 asr TMP2, X, #16
2885 add X, X, UX 3195 add X, X, UX
2886 add TMP2, TOP, TMP2, lsl #1 3196 lsl TMP3, TMP2, #1
2887 vld1.32 {acc2lo[0]}, [TMP1], STRIDE 3197 add TMP2, TOP, TMP3
2888 vld1.32 {acc2hi[0]}, [TMP2], STRIDE 3198 ld1 {&acc2&.s}[0], [TMP1], STRIDE
2889 vld1.32 {acc2lo[1]}, [TMP1] 3199 ld1 {&acc2&.s}[2], [TMP2], STRIDE
2890 vld1.32 {acc2hi[1]}, [TMP2] 3200 ld1 {&acc2&.s}[1], [TMP1]
3201 ld1 {&acc2&.s}[3], [TMP2]
2891 convert_0565_to_x888 acc2, reg3, reg2, reg1 3202 convert_0565_to_x888 acc2, reg3, reg2, reg1
2892 vzip.u8 reg1, reg3 3203 vzip &reg1&.8b, &reg3&.8b
2893 vzip.u8 reg2, reg4 3204 vzip &reg2&.8b, &reg4&.8b
2894 vzip.u8 reg3, reg4 3205 vzip &reg3&.8b, &reg4&.8b
2895 vzip.u8 reg1, reg2 3206 vzip &reg1&.8b, &reg2&.8b
2896 vmull.u8 acc1, reg1, d28 3207 umull &acc1&.8h, &reg1&.8b, v28.8b
2897 vmlal.u8 acc1, reg2, d29 3208 umlal &acc1&.8h, &reg2&.8b, v29.8b
2898 vmull.u8 acc2, reg3, d28 3209 umull &acc2&.8h, &reg3&.8b, v28.8b
2899 vmlal.u8 acc2, reg4, d29 3210 umlal &acc2&.8h, &reg4&.8b, v29.8b
2900.endm 3211.endm
2901 3212
2902.macro bilinear_load_and_vertical_interpolate_four_0565 \ 3213.macro bilinear_load_and_vertical_interpolate_four_0565 \
2903 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 3214 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2904 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 3215 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2905
2906 asr TMP1, X, #16 3216 asr TMP1, X, #16
2907 add X, X, UX 3217 add X, X, UX
2908 add TMP1, TOP, TMP1, lsl #1 3218 lsl TMP2, TMP1, #1
3219 add TMP1, TOP, TMP2
2909 asr TMP2, X, #16 3220 asr TMP2, X, #16
2910 add X, X, UX 3221 add X, X, UX
2911 add TMP2, TOP, TMP2, lsl #1 3222 lsl TMP3, TMP2, #1
2912 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE 3223 add TMP2, TOP, TMP3
2913 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE 3224 ld1 {&xacc2&.s}[0], [TMP1], STRIDE
2914 vld1.32 {xacc2lo[1]}, [TMP1] 3225 ld1 {&xacc2&.s}[2], [TMP2], STRIDE
2915 vld1.32 {xacc2hi[1]}, [TMP2] 3226 ld1 {&xacc2&.s}[1], [TMP1]
3227 ld1 {&xacc2&.s}[3], [TMP2]
2916 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 3228 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2917 asr TMP1, X, #16 3229 asr TMP1, X, #16
2918 add X, X, UX 3230 add X, X, UX
2919 add TMP1, TOP, TMP1, lsl #1 3231 lsl TMP2, TMP1, #1
3232 add TMP1, TOP, TMP2
2920 asr TMP2, X, #16 3233 asr TMP2, X, #16
2921 add X, X, UX 3234 add X, X, UX
2922 add TMP2, TOP, TMP2, lsl #1 3235 lsl TMP3, TMP2, #1
2923 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE 3236 add TMP2, TOP, TMP3
2924 vzip.u8 xreg1, xreg3 3237 ld1 {&yacc2&.s}[0], [TMP1], STRIDE
2925 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE 3238 vzip &xreg1&.8b, &xreg3&.8b
2926 vzip.u8 xreg2, xreg4 3239 ld1 {&yacc2&.s}[2], [TMP2], STRIDE
2927 vld1.32 {yacc2lo[1]}, [TMP1] 3240 vzip &xreg2&.8b, &xreg4&.8b
2928 vzip.u8 xreg3, xreg4 3241 ld1 {&yacc2&.s}[1], [TMP1]
2929 vld1.32 {yacc2hi[1]}, [TMP2] 3242 vzip &xreg3&.8b, &xreg4&.8b
2930 vzip.u8 xreg1, xreg2 3243 ld1 {&yacc2&.s}[3], [TMP2]
3244 vzip &xreg1&.8b, &xreg2&.8b
2931 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 3245 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2932 vmull.u8 xacc1, xreg1, d28 3246 umull &xacc1&.8h, &xreg1&.8b, v28.8b
2933 vzip.u8 yreg1, yreg3 3247 vzip &yreg1&.8b, &yreg3&.8b
2934 vmlal.u8 xacc1, xreg2, d29 3248 umlal &xacc1&.8h, &xreg2&.8b, v29.8b
2935 vzip.u8 yreg2, yreg4 3249 vzip &yreg2&.8b, &yreg4&.8b
2936 vmull.u8 xacc2, xreg3, d28 3250 umull &xacc2&.8h, &xreg3&.8b, v28.8b
2937 vzip.u8 yreg3, yreg4 3251 vzip &yreg3&.8b, &yreg4&.8b
2938 vmlal.u8 xacc2, xreg4, d29 3252 umlal &xacc2&.8h, &xreg4&.8b, v29.8b
2939 vzip.u8 yreg1, yreg2 3253 vzip &yreg1&.8b, &yreg2&.8b
2940 vmull.u8 yacc1, yreg1, d28 3254 umull &yacc1&.8h, &yreg1&.8b, v28.8b
2941 vmlal.u8 yacc1, yreg2, d29 3255 umlal &yacc1&.8h, &yreg2&.8b, v29.8b
2942 vmull.u8 yacc2, yreg3, d28 3256 umull &yacc2&.8h, &yreg3&.8b, v28.8b
2943 vmlal.u8 yacc2, yreg4, d29 3257 umlal &yacc2&.8h, &yreg4&.8b, v29.8b
2944.endm 3258.endm
2945 3259
2946.macro bilinear_store_8888 numpix, tmp1, tmp2 3260.macro bilinear_store_8888 numpix, tmp1, tmp2
2947.if numpix == 4 3261.if numpix == 4
2948 vst1.32 {d0, d1}, [OUT, :128]! 3262 st1 {v0.2s, v1.2s}, [OUT], #16
2949.elseif numpix == 2 3263.elseif numpix == 2
2950 vst1.32 {d0}, [OUT, :64]! 3264 st1 {v0.2s}, [OUT], #8
2951.elseif numpix == 1 3265.elseif numpix == 1
2952 vst1.32 {d0[0]}, [OUT, :32]! 3266 st1 {v0.s}[0], [OUT], #4
2953.else 3267.else
2954 .error bilinear_store_8888 numpix is unsupported 3268 .error bilinear_store_8888 numpix is unsupported
2955.endif 3269.endif
2956.endm 3270.endm
2957 3271
2958.macro bilinear_store_0565 numpix, tmp1, tmp2 3272.macro bilinear_store_0565 numpix, tmp1, tmp2
2959 vuzp.u8 d0, d1 3273 vuzp v0.8b, v1.8b
2960 vuzp.u8 d2, d3 3274 vuzp v2.8b, v3.8b
2961 vuzp.u8 d1, d3 3275 vuzp v1.8b, v3.8b
2962 vuzp.u8 d0, d2 3276 vuzp v0.8b, v2.8b
2963 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 3277 convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
2964.if numpix == 4 3278.if numpix == 4
2965 vst1.16 {d2}, [OUT, :64]! 3279 st1 {v1.4h}, [OUT], #8
2966.elseif numpix == 2 3280.elseif numpix == 2
2967 vst1.32 {d2[0]}, [OUT, :32]! 3281 st1 {v1.s}[0], [OUT], #4
2968.elseif numpix == 1 3282.elseif numpix == 1
2969 vst1.16 {d2[0]}, [OUT, :16]! 3283 st1 {v1.h}[0], [OUT], #2
2970.else 3284.else
2971 .error bilinear_store_0565 numpix is unsupported 3285 .error bilinear_store_0565 numpix is unsupported
2972.endif 3286.endif
2973.endm 3287.endm
2974 3288
2975.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt 3289.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2976 bilinear_load_&src_fmt d0, d1, d2 3290 bilinear_load_&src_fmt v0, v1, v2
2977 vmull.u8 q1, d0, d28 3291 umull v2.8h, v0.8b, v28.8b
2978 vmlal.u8 q1, d1, d29 3292 umlal v2.8h, v1.8b, v29.8b
2979 /* 5 cycles bubble */ 3293 /* 5 cycles bubble */
2980 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 3294 ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
2981 vmlsl.u16 q0, d2, d30 3295 umlsl v0.4s, v2.4h, v15.h[0]
2982 vmlal.u16 q0, d3, d30 3296 umlal2 v0.4s, v2.8h, v15.h[0]
2983 /* 5 cycles bubble */ 3297 /* 5 cycles bubble */
2984 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3298 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2985 /* 3 cycles bubble */ 3299 /* 3 cycles bubble */
2986 vmovn.u16 d0, q0 3300 xtn v0.8b, v0.8h
2987 /* 1 cycle bubble */ 3301 /* 1 cycle bubble */
2988 bilinear_store_&dst_fmt 1, q2, q3 3302 bilinear_store_&dst_fmt 1, v3, v4
2989.endm 3303.endm
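The single-pixel path above shows the bilinear arithmetic most plainly: umull/umlal blend the two top and two bottom texels with the vertical weights in v28/v29, then ushll, umlsl and umlal2 apply the horizontal weight from v15.h[0] before the narrowing shifts drop the fixed-point scale again. A scalar C sketch of one channel, assuming pixman's usual convention that the vertical weights sum to 1 << BILINEAR_INTERPOLATION_BITS and that wx is the horizontal fraction at the same scale (the BITS value and all names here are assumptions, not taken from this patch):

    #include <stdint.h>

    #define BILINEAR_INTERPOLATION_BITS 7   /* assumed default */

    /* tl/tr/bl/br: top-left, top-right, bottom-left, bottom-right texels.
     * wt + wb == 1 << BILINEAR_INTERPOLATION_BITS; wx weights the right column. */
    static inline uint8_t bilinear_channel(uint8_t tl, uint8_t tr,
                                           uint8_t bl, uint8_t br,
                                           uint32_t wt, uint32_t wb,
                                           uint32_t wx)
    {
        uint32_t left  = tl * wt + bl * wb;      /* umull + umlal          */
        uint32_t right = tr * wt + br * wb;
        uint32_t out   = left * ((1u << BILINEAR_INTERPOLATION_BITS) - wx)
                       + right * wx;             /* ushll + umlsl + umlal2 */
        return (uint8_t)(out >> (2 * BILINEAR_INTERPOLATION_BITS));
    }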
2990 3304
2991.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt 3305.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
2992 bilinear_load_and_vertical_interpolate_two_&src_fmt \ 3306 bilinear_load_and_vertical_interpolate_two_&src_fmt \
2993 q1, q11, d0, d1, d20, d21, d22, d23 3307 v1, v11, v2, v3, v20, v21, v22, v23
2994 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 3308 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
2995 vmlsl.u16 q0, d2, d30 3309 umlsl v0.4s, v1.4h, v15.h[0]
2996 vmlal.u16 q0, d3, d30 3310 umlal2 v0.4s, v1.8h, v15.h[0]
2997 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 3311 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
2998 vmlsl.u16 q10, d22, d31 3312 umlsl v10.4s, v11.4h, v15.h[4]
2999 vmlal.u16 q10, d23, d31 3313 umlal2 v10.4s, v11.8h, v15.h[4]
3000 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3314 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3001 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 3315 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3002 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3316 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
3003 vadd.u16 q12, q12, q13 3317 add v12.8h, v12.8h, v13.8h
3004 vmovn.u16 d0, q0 3318 xtn v0.8b, v0.8h
3005 bilinear_store_&dst_fmt 2, q2, q3 3319 bilinear_store_&dst_fmt 2, v3, v4
3006.endm 3320.endm
3007 3321
3008.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt 3322.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
3009 bilinear_load_and_vertical_interpolate_four_&src_fmt \ 3323 bilinear_load_and_vertical_interpolate_four_&src_fmt \
3010 q1, q11, d0, d1, d20, d21, d22, d23 \ 3324 v1, v11, v14, v20, v16, v17, v22, v23 \
3011 q3, q9, d4, d5, d16, d17, d18, d19 3325 v3, v9, v24, v25, v26, v27, v18, v19
3012 pld [TMP1, PF_OFFS] 3326 prfm pldl2strm, [TMP1, PF_OFFS]
3013 sub TMP1, TMP1, STRIDE 3327 sub TMP1, TMP1, STRIDE
3014 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 3328 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
3015 vmlsl.u16 q0, d2, d30 3329 umlsl v0.4s, v1.4h, v15.h[0]
3016 vmlal.u16 q0, d3, d30 3330 umlal2 v0.4s, v1.8h, v15.h[0]
3017 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 3331 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
3018 vmlsl.u16 q10, d22, d31 3332 umlsl v10.4s, v11.4h, v15.h[4]
3019 vmlal.u16 q10, d23, d31 3333 umlal2 v10.4s, v11.8h, v15.h[4]
3020 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3334 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
3021 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS 3335 ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
3022 vmlsl.u16 q2, d6, d30 3336 umlsl v2.4s, v3.4h, v15.h[0]
3023 vmlal.u16 q2, d7, d30 3337 umlal2 v2.4s, v3.8h, v15.h[0]
3024 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS 3338 ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
3025 pld [TMP2, PF_OFFS] 3339 prfm pldl2strm, [TMP2, PF_OFFS]
3026 vmlsl.u16 q8, d18, d31 3340 umlsl v8.4s, v9.4h, v15.h[4]
3027 vmlal.u16 q8, d19, d31 3341 umlal2 v8.4s, v9.8h, v15.h[4]
3028 vadd.u16 q12, q12, q13 3342 add v12.8h, v12.8h, v13.8h
3029 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3343 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3030 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 3344 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3031 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3345 shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3032 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) 3346 shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3033 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3347 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
3034 vmovn.u16 d0, q0 3348 xtn v0.8b, v0.8h
3035 vmovn.u16 d1, q2 3349 xtn v1.8b, v2.8h
3036 vadd.u16 q12, q12, q13 3350 add v12.8h, v12.8h, v13.8h
3037 bilinear_store_&dst_fmt 4, q2, q3 3351 bilinear_store_&dst_fmt 4, v3, v4
3038.endm 3352.endm
3039 3353
3040.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 3354.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3041.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 3355.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3042 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head 3356 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
@@ -3105,125 +3419,141 @@ generate_composite_function_nearest_scanline \
3105.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ 3419.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
3106 src_bpp_shift, dst_bpp_shift, \ 3420 src_bpp_shift, dst_bpp_shift, \
3107 prefetch_distance, flags 3421 prefetch_distance, flags
3108 3422
3109pixman_asm_function fname 3423pixman_asm_function fname
3110 OUT .req r0 3424 OUT .req x0
3111 TOP .req r1 3425 TOP .req x1
3112 BOTTOM .req r2 3426 BOTTOM .req x2
3113 WT .req r3 3427 WT .req x3
3114 WB .req r4 3428 WB .req x4
3115 X .req r5 3429 X .req x5
3116 UX .req r6 3430 UX .req x6
3117 WIDTH .req ip 3431 WIDTH .req x7
3118 TMP1 .req r3 3432 TMP1 .req x8
3119 TMP2 .req r4 3433 TMP2 .req x9
3120 PF_OFFS .req r7 3434 PF_OFFS .req x10
3121 TMP3 .req r8 3435 TMP3 .req x11
3122 TMP4 .req r9 3436 TMP4 .req x12
3123 STRIDE .req r2 3437 STRIDE .req x13
3124 3438
3125 mov ip, sp 3439 sxtw x3, w3
3126 push {r4, r5, r6, r7, r8, r9} 3440 sxtw x4, w4
3441 sxtw x5, w5
3442 sxtw x6, w6
3443 sxtw x7, w7
3444
3445 stp x29, x30, [sp, -16]!
3446 mov x29, sp
3447 sub sp, sp, 112 /* push all registers */
3448 sub x29, x29, 64
3449 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
3450 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
3451 stp x8, x9, [x29, -80]
3452 stp x10, x11, [x29, -96]
3453 stp x12, x13, [x29, -112]
3454
3127 mov PF_OFFS, #prefetch_distance 3455 mov PF_OFFS, #prefetch_distance
3128 ldmia ip, {WB, X, UX, WIDTH}
3129 mul PF_OFFS, PF_OFFS, UX 3456 mul PF_OFFS, PF_OFFS, UX
3130 3457
3131.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 3458 subs STRIDE, BOTTOM, TOP
3132 vpush {d8-d15}
3133.endif
3134
3135 sub STRIDE, BOTTOM, TOP
3136 .unreq BOTTOM 3459 .unreq BOTTOM
3137 3460
3138 cmp WIDTH, #0 3461 cmp WIDTH, #0
3139 ble 3f 3462 ble 300f
3140 3463
3141 vdup.u16 q12, X 3464 dup v12.8h, w5
3142 vdup.u16 q13, UX 3465 dup v13.8h, w6
3143 vdup.u8 d28, WT 3466 dup v28.8b, w3
3144 vdup.u8 d29, WB 3467 dup v29.8b, w4
3145 vadd.u16 d25, d25, d26 3468 mov v25.d[0], v12.d[1]
3469 mov v26.d[0], v13.d[0]
3470 add v25.4h, v25.4h, v26.4h
3471 mov v12.d[1], v25.d[0]
3146 3472
3147 /* ensure good destination alignment */ 3473 /* ensure good destination alignment */
3148 cmp WIDTH, #1 3474 cmp WIDTH, #1
3149 blt 0f 3475 blt 100f
3150 tst OUT, #(1 << dst_bpp_shift) 3476 tst OUT, #(1 << dst_bpp_shift)
3151 beq 0f 3477 beq 100f
3152 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3478 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
3153 vadd.u16 q12, q12, q13 3479 add v12.8h, v12.8h, v13.8h
3154 bilinear_interpolate_last_pixel src_fmt, dst_fmt 3480 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3155 sub WIDTH, WIDTH, #1 3481 sub WIDTH, WIDTH, #1
31560: 3482100:
3157 vadd.u16 q13, q13, q13 3483 add v13.8h, v13.8h, v13.8h
3158 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3484 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
3159 vadd.u16 q12, q12, q13 3485 add v12.8h, v12.8h, v13.8h
3160 3486
3161 cmp WIDTH, #2 3487 cmp WIDTH, #2
3162 blt 0f 3488 blt 100f
3163 tst OUT, #(1 << (dst_bpp_shift + 1)) 3489 tst OUT, #(1 << (dst_bpp_shift + 1))
3164 beq 0f 3490 beq 100f
3165 bilinear_interpolate_two_pixels src_fmt, dst_fmt 3491 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3166 sub WIDTH, WIDTH, #2 3492 sub WIDTH, WIDTH, #2
31670: 3493100:
3168.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 3494.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
3169/*********** 8 pixels per iteration *****************/ 3495/*********** 8 pixels per iteration *****************/
3170 cmp WIDTH, #4 3496 cmp WIDTH, #4
3171 blt 0f 3497 blt 100f
3172 tst OUT, #(1 << (dst_bpp_shift + 2)) 3498 tst OUT, #(1 << (dst_bpp_shift + 2))
3173 beq 0f 3499 beq 100f
3174 bilinear_interpolate_four_pixels src_fmt, dst_fmt 3500 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3175 sub WIDTH, WIDTH, #4 3501 sub WIDTH, WIDTH, #4
31760: 3502100:
3177 subs WIDTH, WIDTH, #8 3503 subs WIDTH, WIDTH, #8
3178 blt 1f 3504 blt 100f
3179 asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) 3505 asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
3180 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt 3506 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3181 subs WIDTH, WIDTH, #8 3507 subs WIDTH, WIDTH, #8
3182 blt 5f 3508 blt 500f
31830: 35091000:
3184 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt 3510 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3185 subs WIDTH, WIDTH, #8 3511 subs WIDTH, WIDTH, #8
3186 bge 0b 3512 bge 1000b
31875: 3513500:
3188 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt 3514 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
31891: 3515100:
3190 tst WIDTH, #4 3516 tst WIDTH, #4
3191 beq 2f 3517 beq 200f
3192 bilinear_interpolate_four_pixels src_fmt, dst_fmt 3518 bilinear_interpolate_four_pixels src_fmt, dst_fmt
31932: 3519200:
3194.else 3520.else
3195/*********** 4 pixels per iteration *****************/ 3521/*********** 4 pixels per iteration *****************/
3196 subs WIDTH, WIDTH, #4 3522 subs WIDTH, WIDTH, #4
3197 blt 1f 3523 blt 100f
3198 asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) 3524 asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
3199 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 3525 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3200 subs WIDTH, WIDTH, #4 3526 subs WIDTH, WIDTH, #4
3201 blt 5f 3527 blt 500f
32020: 35281000:
3203 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3529 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3204 subs WIDTH, WIDTH, #4 3530 subs WIDTH, WIDTH, #4
3205 bge 0b 3531 bge 1000b
32065: 3532500:
3207 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 3533 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
32081: 3534100:
3209/****************************************************/ 3535/****************************************************/
3210.endif 3536.endif
3211 /* handle the remaining trailing pixels */ 3537 /* handle the remaining trailing pixels */
3212 tst WIDTH, #2 3538 tst WIDTH, #2
3213 beq 2f 3539 beq 200f
3214 bilinear_interpolate_two_pixels src_fmt, dst_fmt 3540 bilinear_interpolate_two_pixels src_fmt, dst_fmt
32152: 3541200:
3216 tst WIDTH, #1 3542 tst WIDTH, #1
3217 beq 3f 3543 beq 300f
3218 bilinear_interpolate_last_pixel src_fmt, dst_fmt 3544 bilinear_interpolate_last_pixel src_fmt, dst_fmt
32193: 3545300:
3220.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 3546 sub x29, x29, 64
3221 vpop {d8-d15} 3547 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
3222.endif 3548 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
3223 pop {r4, r5, r6, r7, r8, r9} 3549 ldp x8, x9, [x29, -80]
3224 bx lr 3550 ldp x10, x11, [x29, -96]
 3551 ldp x12, x13, [x29, -112]
3552 mov sp, x29
3553 ldp x29, x30, [sp], 16
3554 ret
3225 3555
3226 .unreq OUT 3556 .unreq OUT
3227 .unreq TOP 3557 .unreq TOP
3228 .unreq WT 3558 .unreq WT
3229 .unreq WB 3559 .unreq WB
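The old entry sequence pulled WB, X, UX and WIDTH from the stack with ldmia and pushed r4-r9; on AArch64 all eight arguments arrive in x0-x7, so the prologue above only needs to sign-extend the 32-bit integer arguments (the sxtw instructions) and spill v8-v15 plus a few scratch x registers. As a reference for the OUT..WIDTH aliases, the expected C-side prototype is sketched below; it is an assumption carried over from the existing 32-bit NEON binding in pixman-arm-common.h, and pixman_fixed_t is redeclared locally only to keep the sketch self-contained.

    #include <stdint.h>

    typedef int32_t pixman_fixed_t;      /* 16.16 fixed point, as in pixman.h */

    /* OUT=x0, TOP=x1, BOTTOM=x2, WT=w3, WB=w4, X=w5, UX=w6, WIDTH=w7 */
    void pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (
        uint32_t       *out,             /* destination scanline             */
        const uint32_t *top,             /* upper source scanline            */
        const uint32_t *bottom,          /* lower source scanline            */
        int             wt,              /* vertical weight of 'top'         */
        int             wb,              /* vertical weight of 'bottom'      */
        pixman_fixed_t  x,               /* initial horizontal position      */
        pixman_fixed_t  ux,              /* horizontal step per output pixel */
        int             width);          /* pixels to produce                */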
@@ -3250,368 +3580,125 @@ pixman_asm_function fname
3250 add TMP1, TOP, TMP1, lsl #2 3580 add TMP1, TOP, TMP1, lsl #2
3251 asr TMP2, X, #16 3581 asr TMP2, X, #16
3252 add X, X, UX 3582 add X, X, UX
3253 add TMP2, TOP, TMP2, lsl #2 3583 add TMP2, TOP, TMP2, lsl #2
3254 3584
3255 vld1.32 {d22}, [TMP1], STRIDE 3585 ld1 {v22.2s}, [TMP1], STRIDE
3256 vld1.32 {d23}, [TMP1] 3586 ld1 {v23.2s}, [TMP1]
3257 asr TMP3, X, #16 3587 asr TMP3, X, #16
3258 add X, X, UX 3588 add X, X, UX
3259 add TMP3, TOP, TMP3, lsl #2 3589 add TMP3, TOP, TMP3, lsl #2
3260 vmull.u8 q8, d22, d28 3590 umull v8.8h, v22.8b, v28.8b
3261 vmlal.u8 q8, d23, d29 3591 umlal v8.8h, v23.8b, v29.8b
3262 3592
3263 vld1.32 {d22}, [TMP2], STRIDE 3593 ld1 {v22.2s}, [TMP2], STRIDE
3264 vld1.32 {d23}, [TMP2] 3594 ld1 {v23.2s}, [TMP2]
3265 asr TMP4, X, #16 3595 asr TMP4, X, #16
3266 add X, X, UX 3596 add X, X, UX
3267 add TMP4, TOP, TMP4, lsl #2 3597 add TMP4, TOP, TMP4, lsl #2
3268 vmull.u8 q9, d22, d28 3598 umull v9.8h, v22.8b, v28.8b
3269 vmlal.u8 q9, d23, d29 3599 umlal v9.8h, v23.8b, v29.8b
3270 3600
3271 vld1.32 {d22}, [TMP3], STRIDE 3601 ld1 {v22.2s}, [TMP3], STRIDE
3272 vld1.32 {d23}, [TMP3] 3602 ld1 {v23.2s}, [TMP3]
3273 vmull.u8 q10, d22, d28 3603 umull v10.8h, v22.8b, v28.8b
3274 vmlal.u8 q10, d23, d29 3604 umlal v10.8h, v23.8b, v29.8b
3275 3605
3276 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3606 ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
3277 vmlsl.u16 q0, d16, d30 3607 umlsl v0.4s, v8.4h, v15.h[0]
3278 vmlal.u16 q0, d17, d30 3608 umlal2 v0.4s, v8.8h, v15.h[0]
3279 3609
3280 pld [TMP4, PF_OFFS] 3610 prfm pldl2strm, [TMP4, PF_OFFS]
3281 vld1.32 {d16}, [TMP4], STRIDE 3611 ld1 {v16.2s}, [TMP4], STRIDE
3282 vld1.32 {d17}, [TMP4] 3612 ld1 {v17.2s}, [TMP4]
3283 pld [TMP4, PF_OFFS] 3613 prfm pldl2strm, [TMP4, PF_OFFS]
3284 vmull.u8 q11, d16, d28 3614 umull v11.8h, v16.8b, v28.8b
3285 vmlal.u8 q11, d17, d29 3615 umlal v11.8h, v17.8b, v29.8b
3286 3616
3287 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3617 ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
3288 vmlsl.u16 q1, d18, d31 3618 umlsl v1.4s, v9.4h, v15.h[4]
3289.endm 3619.endm
3290 3620
3291.macro bilinear_interpolate_four_pixels_8888_8888_tail 3621.macro bilinear_interpolate_four_pixels_8888_8888_tail
3292 vmlal.u16 q1, d19, d31 3622 umlal2 v1.4s, v9.8h, v15.h[4]
3293 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3623 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
3294 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3624 ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
3295 vmlsl.u16 q2, d20, d30 3625 umlsl v2.4s, v10.4h, v15.h[0]
3296 vmlal.u16 q2, d21, d30 3626 umlal2 v2.4s, v10.8h, v15.h[0]
3297 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3627 ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
3298 vmlsl.u16 q3, d22, d31 3628 umlsl v3.4s, v11.4h, v15.h[4]
3299 vmlal.u16 q3, d23, d31 3629 umlal2 v3.4s, v11.8h, v15.h[4]
3300 vadd.u16 q12, q12, q13 3630 add v12.8h, v12.8h, v13.8h
3301 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3631 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3302 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 3632 shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3303 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3633 shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3304 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3634 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
3305 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3635 shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3306 vmovn.u16 d6, q0 3636 xtn v6.8b, v0.8h
3307 vmovn.u16 d7, q2 3637 xtn v7.8b, v2.8h
3308 vadd.u16 q12, q12, q13 3638 add v12.8h, v12.8h, v13.8h
3309 vst1.32 {d6, d7}, [OUT, :128]! 3639 st1 {v6.2s, v7.2s}, [OUT], #16
3310.endm 3640.endm
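Per colour channel the head/tail pair above computes a two-stage weighted sum: the umull/umlal against v28/v29 blend the two scanlines vertically, the ushll/umlsl/umlal2 against v15 blend the two horizontal neighbours, and the final shrn drops the combined scale factor. A minimal scalar sketch of that arithmetic (function name illustrative; it assumes wt + wb == 1 << BILINEAR_INTERPOLATION_BITS, which is what the 2 * BILINEAR_INTERPOLATION_BITS shift implies, and uses pixman's usual value of 7 for the constant):

    #include <stdint.h>

    #define BILINEAR_INTERPOLATION_BITS 7   /* pixman's default */

    static inline uint8_t
    bilinear_channel (uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
                      unsigned wt, unsigned wb, unsigned dist_x)
    {
        unsigned range = 1u << BILINEAR_INTERPOLATION_BITS;

        /* vertical pass: umull/umlal with the 8-bit weights wt/wb */
        uint32_t left  = tl * wt + bl * wb;
        uint32_t right = tr * wt + br * wb;

        /* horizontal pass: ushll/umlsl/umlal2 with dist_x, then shrn */
        uint32_t acc = left * (range - dist_x) + right * dist_x;

        return (uint8_t) (acc >> (2 * BILINEAR_INTERPOLATION_BITS));
    }

dist_x stands for the top BILINEAR_INTERPOLATION_BITS bits of the 16.16 fraction of X, which is exactly what the ushr #(16 - BILINEAR_INTERPOLATION_BITS) extracts into v15.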
3311 3641
3312.macro bilinear_interpolate_four_pixels_8888_8888_tail_head 3642.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
3313 asr TMP1, X, #16 3643 asr TMP1, X, #16
3314 add X, X, UX 3644 add X, X, UX
3315 add TMP1, TOP, TMP1, lsl #2 3645 add TMP1, TOP, TMP1, lsl #2
3316 asr TMP2, X, #16 3646 asr TMP2, X, #16
3317 add X, X, UX 3647 add X, X, UX
3318 add TMP2, TOP, TMP2, lsl #2 3648 add TMP2, TOP, TMP2, lsl #2
3319 vmlal.u16 q1, d19, d31 3649 umlal2 v1.4s, v9.8h, v15.h[4]
3320 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3650 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
3321 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3651 ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
3322 vmlsl.u16 q2, d20, d30 3652 umlsl v2.4s, v10.4h, v15.h[0]
3323 vmlal.u16 q2, d21, d30 3653 umlal2 v2.4s, v10.8h, v15.h[0]
3324 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3654 ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
3325 vld1.32 {d20}, [TMP1], STRIDE 3655 ld1 {v20.2s}, [TMP1], STRIDE
3326 vmlsl.u16 q3, d22, d31 3656 umlsl v3.4s, v11.4h, v15.h[4]
3327 vmlal.u16 q3, d23, d31 3657 umlal2 v3.4s, v11.8h, v15.h[4]
3328 vld1.32 {d21}, [TMP1] 3658 ld1 {v21.2s}, [TMP1]
3329 vmull.u8 q8, d20, d28 3659 umull v8.8h, v20.8b, v28.8b
3330 vmlal.u8 q8, d21, d29 3660 umlal v8.8h, v21.8b, v29.8b
3331 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3661 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3332 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 3662 shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3333 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3663 shrn v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3334 vld1.32 {d22}, [TMP2], STRIDE 3664 ld1 {v22.2s}, [TMP2], STRIDE
3335 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3665 shrn2 v4.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
3336 vadd.u16 q12, q12, q13 3666 add v12.8h, v12.8h, v13.8h
3337 vld1.32 {d23}, [TMP2] 3667 ld1 {v23.2s}, [TMP2]
3338 vmull.u8 q9, d22, d28 3668 umull v9.8h, v22.8b, v28.8b
3339 asr TMP3, X, #16 3669 asr TMP3, X, #16
3340 add X, X, UX 3670 add X, X, UX
3341 add TMP3, TOP, TMP3, lsl #2 3671 add TMP3, TOP, TMP3, lsl #2
3342 asr TMP4, X, #16 3672 asr TMP4, X, #16
3343 add X, X, UX 3673 add X, X, UX
3344 add TMP4, TOP, TMP4, lsl #2 3674 add TMP4, TOP, TMP4, lsl #2
3345 vmlal.u8 q9, d23, d29 3675 umlal v9.8h, v23.8b, v29.8b
3346 vld1.32 {d22}, [TMP3], STRIDE 3676 ld1 {v22.2s}, [TMP3], STRIDE
3347 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3677 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
3348 vld1.32 {d23}, [TMP3] 3678 ld1 {v23.2s}, [TMP3]
3349 vmull.u8 q10, d22, d28 3679 umull v10.8h, v22.8b, v28.8b
3350 vmlal.u8 q10, d23, d29 3680 umlal v10.8h, v23.8b, v29.8b
3351 vmovn.u16 d6, q0 3681 xtn v6.8b, v0.8h
3352 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3682 ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
3353 vmovn.u16 d7, q2 3683 xtn v7.8b, v4.8h
3354 vmlsl.u16 q0, d16, d30 3684 umlsl v0.4s, v8.4h, v15.h[0]
3355 vmlal.u16 q0, d17, d30 3685 umlal2 v0.4s, v8.8h, v15.h[0]
3356 pld [TMP4, PF_OFFS] 3686 prfm pldl2strm, [TMP4, PF_OFFS]
3357 vld1.32 {d16}, [TMP4], STRIDE 3687 ld1 {v16.2s}, [TMP4], STRIDE
3358 vadd.u16 q12, q12, q13 3688 add v12.8h, v12.8h, v13.8h
3359 vld1.32 {d17}, [TMP4] 3689 ld1 {v17.2s}, [TMP4]
3360 pld [TMP4, PF_OFFS] 3690 prfm pldl2strm, [TMP4, PF_OFFS]
3361 vmull.u8 q11, d16, d28 3691 umull v11.8h, v16.8b, v28.8b
3362 vmlal.u8 q11, d17, d29 3692 umlal v11.8h, v17.8b, v29.8b
3363 vst1.32 {d6, d7}, [OUT, :128]! 3693 st1 {v6.2s, v7.2s}, [OUT], #16
3364 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3694 ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
3365 vmlsl.u16 q1, d18, d31 3695 umlsl v1.4s, v9.4h, v15.h[4]
3366.endm 3696.endm
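The head/tail/tail_head triple is the usual software-pipelining split: head starts the first block, each tail_head finishes one block while already loading and multiplying for the next, and tail drains the last block. The driver loop inside generate_bilinear_scanline_func has roughly the shape sketched below in C, with illustrative function-pointer names standing in for the macros:

    typedef void (*block_fn) (void);

    /* head/tail_head/tail stand for the *_head, *_tail_head and *_tail
     * macros; n is the block size in pixels (4, or 8 with UNROLL_8). */
    void
    pipelined_scanline (int width, int n,
                        block_fn head, block_fn tail_head, block_fn tail)
    {
        if (width >= n)
        {
            head ();                 /* start the first block             */
            width -= n;
            while (width >= n)
            {
                tail_head ();        /* finish block i, start block i + 1 */
                width -= n;
            }
            tail ();                 /* finish the final block            */
        }
        /* the remaining (width % n) pixels go through the two- and
         * one-pixel paths, as in the code above */
    }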
3367 3697
3368/*****************************************************************************/ 3698/*****************************************************************************/
3369 3699
3370.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
3371
3372.macro bilinear_interpolate_eight_pixels_8888_0565_head
3373 asr TMP1, X, #16
3374 add X, X, UX
3375 add TMP1, TOP, TMP1, lsl #2
3376 asr TMP2, X, #16
3377 add X, X, UX
3378 add TMP2, TOP, TMP2, lsl #2
3379 vld1.32 {d20}, [TMP1], STRIDE
3380 vld1.32 {d21}, [TMP1]
3381 vmull.u8 q8, d20, d28
3382 vmlal.u8 q8, d21, d29
3383 vld1.32 {d22}, [TMP2], STRIDE
3384 vld1.32 {d23}, [TMP2]
3385 vmull.u8 q9, d22, d28
3386 asr TMP3, X, #16
3387 add X, X, UX
3388 add TMP3, TOP, TMP3, lsl #2
3389 asr TMP4, X, #16
3390 add X, X, UX
3391 add TMP4, TOP, TMP4, lsl #2
3392 vmlal.u8 q9, d23, d29
3393 vld1.32 {d22}, [TMP3], STRIDE
3394 vld1.32 {d23}, [TMP3]
3395 vmull.u8 q10, d22, d28
3396 vmlal.u8 q10, d23, d29
3397 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3398 vmlsl.u16 q0, d16, d30
3399 vmlal.u16 q0, d17, d30
3400 pld [TMP4, PF_OFFS]
3401 vld1.32 {d16}, [TMP4], STRIDE
3402 vld1.32 {d17}, [TMP4]
3403 pld [TMP4, PF_OFFS]
3404 vmull.u8 q11, d16, d28
3405 vmlal.u8 q11, d17, d29
3406 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3407 vmlsl.u16 q1, d18, d31
3408
3409 asr TMP1, X, #16
3410 add X, X, UX
3411 add TMP1, TOP, TMP1, lsl #2
3412 asr TMP2, X, #16
3413 add X, X, UX
3414 add TMP2, TOP, TMP2, lsl #2
3415 vmlal.u16 q1, d19, d31
3416 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3417 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3418 vmlsl.u16 q2, d20, d30
3419 vmlal.u16 q2, d21, d30
3420 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3421 vld1.32 {d20}, [TMP1], STRIDE
3422 vmlsl.u16 q3, d22, d31
3423 vmlal.u16 q3, d23, d31
3424 vld1.32 {d21}, [TMP1]
3425 vmull.u8 q8, d20, d28
3426 vmlal.u8 q8, d21, d29
3427 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3428 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3429 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3430 vld1.32 {d22}, [TMP2], STRIDE
3431 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3432 vadd.u16 q12, q12, q13
3433 vld1.32 {d23}, [TMP2]
3434 vmull.u8 q9, d22, d28
3435 asr TMP3, X, #16
3436 add X, X, UX
3437 add TMP3, TOP, TMP3, lsl #2
3438 asr TMP4, X, #16
3439 add X, X, UX
3440 add TMP4, TOP, TMP4, lsl #2
3441 vmlal.u8 q9, d23, d29
3442 vld1.32 {d22}, [TMP3], STRIDE
3443 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3444 vld1.32 {d23}, [TMP3]
3445 vmull.u8 q10, d22, d28
3446 vmlal.u8 q10, d23, d29
3447 vmovn.u16 d8, q0
3448 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3449 vmovn.u16 d9, q2
3450 vmlsl.u16 q0, d16, d30
3451 vmlal.u16 q0, d17, d30
3452 pld [TMP4, PF_OFFS]
3453 vld1.32 {d16}, [TMP4], STRIDE
3454 vadd.u16 q12, q12, q13
3455 vld1.32 {d17}, [TMP4]
3456 pld [TMP4, PF_OFFS]
3457 vmull.u8 q11, d16, d28
3458 vmlal.u8 q11, d17, d29
3459 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3460 vmlsl.u16 q1, d18, d31
3461.endm
3462
3463.macro bilinear_interpolate_eight_pixels_8888_0565_tail
3464 vmlal.u16 q1, d19, d31
3465 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3466 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3467 vmlsl.u16 q2, d20, d30
3468 vmlal.u16 q2, d21, d30
3469 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3470 vmlsl.u16 q3, d22, d31
3471 vmlal.u16 q3, d23, d31
3472 vadd.u16 q12, q12, q13
3473 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3474 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3475 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3476 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3477 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3478 vmovn.u16 d10, q0
3479 vmovn.u16 d11, q2
3480 vadd.u16 q12, q12, q13
3481
3482 vuzp.u8 d8, d9
3483 vuzp.u8 d10, d11
3484 vuzp.u8 d9, d11
3485 vuzp.u8 d8, d10
3486 vshll.u8 q6, d9, #8
3487 vshll.u8 q5, d10, #8
3488 vshll.u8 q7, d8, #8
3489 vsri.u16 q5, q6, #5
3490 vsri.u16 q5, q7, #11
3491 vst1.32 {d10, d11}, [OUT, :128]!
3492.endm
3493
3494.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
3495 asr TMP1, X, #16
3496 add X, X, UX
3497 add TMP1, TOP, TMP1, lsl #2
3498 asr TMP2, X, #16
3499 add X, X, UX
3500 add TMP2, TOP, TMP2, lsl #2
3501 vmlal.u16 q1, d19, d31
3502 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3503 vuzp.u8 d8, d9
3504 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3505 vmlsl.u16 q2, d20, d30
3506 vmlal.u16 q2, d21, d30
3507 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3508 vld1.32 {d20}, [TMP1], STRIDE
3509 vmlsl.u16 q3, d22, d31
3510 vmlal.u16 q3, d23, d31
3511 vld1.32 {d21}, [TMP1]
3512 vmull.u8 q8, d20, d28
3513 vmlal.u8 q8, d21, d29
3514 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3515 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3516 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3517 vld1.32 {d22}, [TMP2], STRIDE
3518 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3519 vadd.u16 q12, q12, q13
3520 vld1.32 {d23}, [TMP2]
3521 vmull.u8 q9, d22, d28
3522 asr TMP3, X, #16
3523 add X, X, UX
3524 add TMP3, TOP, TMP3, lsl #2
3525 asr TMP4, X, #16
3526 add X, X, UX
3527 add TMP4, TOP, TMP4, lsl #2
3528 vmlal.u8 q9, d23, d29
3529 vld1.32 {d22}, [TMP3], STRIDE
3530 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3531 vld1.32 {d23}, [TMP3]
3532 vmull.u8 q10, d22, d28
3533 vmlal.u8 q10, d23, d29
3534 vmovn.u16 d10, q0
3535 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3536 vmovn.u16 d11, q2
3537 vmlsl.u16 q0, d16, d30
3538 vmlal.u16 q0, d17, d30
3539 pld [TMP4, PF_OFFS]
3540 vld1.32 {d16}, [TMP4], STRIDE
3541 vadd.u16 q12, q12, q13
3542 vld1.32 {d17}, [TMP4]
3543 pld [TMP4, PF_OFFS]
3544 vmull.u8 q11, d16, d28
3545 vmlal.u8 q11, d17, d29
3546 vuzp.u8 d10, d11
3547 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3548 vmlsl.u16 q1, d18, d31
3549
3550 asr TMP1, X, #16
3551 add X, X, UX
3552 add TMP1, TOP, TMP1, lsl #2
3553 asr TMP2, X, #16
3554 add X, X, UX
3555 add TMP2, TOP, TMP2, lsl #2
3556 vmlal.u16 q1, d19, d31
3557 vuzp.u8 d9, d11
3558 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3559 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3560 vuzp.u8 d8, d10
3561 vmlsl.u16 q2, d20, d30
3562 vmlal.u16 q2, d21, d30
3563 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3564 vld1.32 {d20}, [TMP1], STRIDE
3565 vmlsl.u16 q3, d22, d31
3566 vmlal.u16 q3, d23, d31
3567 vld1.32 {d21}, [TMP1]
3568 vmull.u8 q8, d20, d28
3569 vmlal.u8 q8, d21, d29
3570 vshll.u8 q6, d9, #8
3571 vshll.u8 q5, d10, #8
3572 vshll.u8 q7, d8, #8
3573 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3574 vsri.u16 q5, q6, #5
3575 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3576 vsri.u16 q5, q7, #11
3577 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3578 vld1.32 {d22}, [TMP2], STRIDE
3579 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3580 vadd.u16 q12, q12, q13
3581 vld1.32 {d23}, [TMP2]
3582 vmull.u8 q9, d22, d28
3583 asr TMP3, X, #16
3584 add X, X, UX
3585 add TMP3, TOP, TMP3, lsl #2
3586 asr TMP4, X, #16
3587 add X, X, UX
3588 add TMP4, TOP, TMP4, lsl #2
3589 vmlal.u8 q9, d23, d29
3590 vld1.32 {d22}, [TMP3], STRIDE
3591 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3592 vld1.32 {d23}, [TMP3]
3593 vmull.u8 q10, d22, d28
3594 vmlal.u8 q10, d23, d29
3595 vmovn.u16 d8, q0
3596 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3597 vmovn.u16 d9, q2
3598 vmlsl.u16 q0, d16, d30
3599 vmlal.u16 q0, d17, d30
3600 pld [TMP4, PF_OFFS]
3601 vld1.32 {d16}, [TMP4], STRIDE
3602 vadd.u16 q12, q12, q13
3603 vld1.32 {d17}, [TMP4]
3604 pld [TMP4, PF_OFFS]
3605 vmull.u8 q11, d16, d28
3606 vmlal.u8 q11, d17, d29
3607 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3608 vst1.32 {d10, d11}, [OUT, :128]!
3609 vmlsl.u16 q1, d18, d31
3610.endm
3611/*****************************************************************************/
3612
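The dropped 8888_0565 eight-pixel macros (and the surviving 0565 store paths) pack each a8r8g8b8 result into r5g6b5 with a byte deinterleave followed by shift-and-insert operations. A scalar model of that packing, assuming pixman's usual a8r8g8b8 layout (alpha in the top byte); the helper name is illustrative:

    #include <stdint.h>

    static inline uint16_t
    pack_0565 (uint32_t argb)
    {
        uint32_t r = (argb >> 16) & 0xff;
        uint32_t g = (argb >> 8)  & 0xff;
        uint32_t b =  argb        & 0xff;

        return (uint16_t) (((r >> 3) << 11) |   /* insert at #11: 5 red bits   */
                           ((g >> 2) << 5)  |   /* insert at #5:  6 green bits */
                            (b >> 3));          /* top 5 blue bits             */
    }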
3613generate_bilinear_scanline_func \ 3700generate_bilinear_scanline_func \
3614 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ 3701 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
3615 2, 2, 28, BILINEAR_FLAG_UNROLL_4 3702 2, 2, 28, BILINEAR_FLAG_UNROLL_4
3616 3703
3617generate_bilinear_scanline_func \ 3704generate_bilinear_scanline_func \
diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
index 6a5a5fe..97cde5d 100644
--- a/pixman/pixman-arma64-neon-asm.h
+++ b/pixman/pixman-arma64-neon-asm.h
@@ -36,11 +36,11 @@
36 * scheduling 36 * scheduling
37 * 37 *
38 * The user of this macro has to provide some configuration parameters 38 * The user of this macro has to provide some configuration parameters
39 * (bit depths for the images, prefetch distance, etc.) and a set of 39 * (bit depths for the images, prefetch distance, etc.) and a set of
40 * macros, which should implement basic code chunks responsible for 40 * macros, which should implement basic code chunks responsible for
41 * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage 41 * pixels processing. See 'pixman-arma64-neon-asm.S' file for the usage
42 * examples. 42 * examples.
43 * 43 *
44 * TODO: 44 * TODO:
45 * - try overlapped pixel method (from Ian Rickards) when processing 45 * - try overlapped pixel method (from Ian Rickards) when processing
46 * exactly two blocks of pixels 46 * exactly two blocks of pixels
@@ -54,16 +54,10 @@
54.set FLAG_DST_WRITEONLY, 0 54.set FLAG_DST_WRITEONLY, 0
55.set FLAG_DST_READWRITE, 1 55.set FLAG_DST_READWRITE, 1
56.set FLAG_DEINTERLEAVE_32BPP, 2 56.set FLAG_DEINTERLEAVE_32BPP, 2
57 57
58/* 58/*
59 * Offset in stack where mask and source pointer/stride can be accessed
60 * from 'init' macro. This is useful for doing special handling for solid mask.
61 */
62.set ARGS_STACK_OFFSET, 40
63
64/*
65 * Constants for selecting preferable prefetch type. 59 * Constants for selecting preferable prefetch type.
66 */ 60 */
67.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ 61.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */
68.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ 62.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */
69.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ 63.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
@@ -72,121 +66,129 @@
72 * Definitions of supplementary pixld/pixst macros (for partial load/store of 66 * Definitions of supplementary pixld/pixst macros (for partial load/store of
73 * pixel data). 67 * pixel data).
74 */ 68 */
75 69
76.macro pixldst1 op, elem_size, reg1, mem_operand, abits 70.macro pixldst1 op, elem_size, reg1, mem_operand, abits
77.if abits > 0 71 op {v&reg1&.&elem_size}, [&mem_operand&], #8
78 op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
79.else
80 op&.&elem_size {d&reg1}, [&mem_operand&]!
81.endif
82.endm 72.endm
83 73
84.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits 74.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
85.if abits > 0 75 op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
86 op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
87.else
88 op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
89.endif
90.endm 76.endm
91 77
92.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits 78.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
93.if abits > 0 79 op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
94 op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
95.else
96 op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
97.endif
98.endm 80.endm
99 81
100.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits 82.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
101 op&.&elem_size {d&reg1[idx]}, [&mem_operand&]! 83 op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
102.endm 84.endm
103 85
104.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand 86.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
105 op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]! 87 op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
106.endm 88.endm
107 89
108.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand 90.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
109 op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]! 91 op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
110.endm 92.endm
111 93
112.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits 94.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
113.if numbytes == 32 95.if numbytes == 32
114 pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ 96 .if elem_size==32
97 pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
98 %(basereg+6), %(basereg+7), mem_operand, abits
99 .elseif elem_size==16
100 pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
101 %(basereg+6), %(basereg+7), mem_operand, abits
102 .else
103 pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
115 %(basereg+6), %(basereg+7), mem_operand, abits 104 %(basereg+6), %(basereg+7), mem_operand, abits
105 .endif
116.elseif numbytes == 16 106.elseif numbytes == 16
117 pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits 107 .if elem_size==32
108 pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
109 .elseif elem_size==16
110 pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
111 .else
112 pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
113 .endif
118.elseif numbytes == 8 114.elseif numbytes == 8
119 pixldst1 op, elem_size, %(basereg+1), mem_operand, abits 115 .if elem_size==32
116 pixldst1 op, 2s, %(basereg+1), mem_operand, abits
117 .elseif elem_size==16
118 pixldst1 op, 4h, %(basereg+1), mem_operand, abits
119 .else
120 pixldst1 op, 8b, %(basereg+1), mem_operand, abits
121 .endif
120.elseif numbytes == 4 122.elseif numbytes == 4
121 .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) 123 .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
122 pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits 124 pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
123 .elseif elem_size == 16 125 .elseif elem_size == 16
124 pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits 126 pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
125 pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits 127 pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
126 .else 128 .else
127 pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits 129 pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
128 pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits 130 pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
129 pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits 131 pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
130 pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits 132 pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
131 .endif 133 .endif
132.elseif numbytes == 2 134.elseif numbytes == 2
133 .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) 135 .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
134 pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits 136 pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
135 .else 137 .else
136 pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits 138 pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
137 pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits 139 pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
138 .endif 140 .endif
139.elseif numbytes == 1 141.elseif numbytes == 1
140 pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits 142 pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
141.else 143.else
142 .error "unsupported size: numbytes" 144 .error "unsupported size: numbytes"
143.endif 145.endif
144.endm 146.endm
145 147
146.macro pixld numpix, bpp, basereg, mem_operand, abits=0 148.macro pixld numpix, bpp, basereg, mem_operand, abits=0
147.if bpp > 0 149.if bpp > 0
148.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) 150.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
149 pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ 151 pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
150 %(basereg+6), %(basereg+7), mem_operand, abits 152 %(basereg+6), %(basereg+7), mem_operand, abits
151.elseif (bpp == 24) && (numpix == 8) 153.elseif (bpp == 24) && (numpix == 8)
152 pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand 154 pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
153.elseif (bpp == 24) && (numpix == 4) 155.elseif (bpp == 24) && (numpix == 4)
154 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand 156 pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
155 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand 157 pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
156 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand 158 pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
157 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand 159 pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
158.elseif (bpp == 24) && (numpix == 2) 160.elseif (bpp == 24) && (numpix == 2)
159 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand 161 pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
160 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand 162 pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
161.elseif (bpp == 24) && (numpix == 1) 163.elseif (bpp == 24) && (numpix == 1)
162 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand 164 pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
163.else 165.else
164 pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits 166 pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
165.endif 167.endif
166.endif 168.endif
167.endm 169.endm
168 170
169.macro pixst numpix, bpp, basereg, mem_operand, abits=0 171.macro pixst numpix, bpp, basereg, mem_operand, abits=0
170.if bpp > 0 172.if bpp > 0
171.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) 173.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
172 pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ 174 pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
173 %(basereg+6), %(basereg+7), mem_operand, abits 175 %(basereg+6), %(basereg+7), mem_operand, abits
174.elseif (bpp == 24) && (numpix == 8) 176.elseif (bpp == 24) && (numpix == 8)
175 pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand 177 pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
176.elseif (bpp == 24) && (numpix == 4) 178.elseif (bpp == 24) && (numpix == 4)
177 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand 179 pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
178 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand 180 pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
179 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand 181 pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
180 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand 182 pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
181.elseif (bpp == 24) && (numpix == 2) 183.elseif (bpp == 24) && (numpix == 2)
182 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand 184 pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
183 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand 185 pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
184.elseif (bpp == 24) && (numpix == 1) 186.elseif (bpp == 24) && (numpix == 1)
185 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand 187 pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
186.else 188.else
187 pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits 189 pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
188.endif 190.endif
189.endif 191.endif
190.endm 192.endm
191 193
192.macro pixld_a numpix, bpp, basereg, mem_operand 194.macro pixld_a numpix, bpp, basereg, mem_operand
@@ -211,89 +213,114 @@
211 */ 213 */
212.macro pixld1_s elem_size, reg1, mem_operand 214.macro pixld1_s elem_size, reg1, mem_operand
213.if elem_size == 16 215.if elem_size == 16
214 asr TMP1, VX, #16 216 asr TMP1, VX, #16
215 adds VX, VX, UNIT_X 217 adds VX, VX, UNIT_X
2165: subpls VX, VX, SRC_WIDTH_FIXED 218 bmi 55f
2195: subs VX, VX, SRC_WIDTH_FIXED
217 bpl 5b 220 bpl 5b
218 add TMP1, mem_operand, TMP1, lsl #1 22155:
222 lsl DUMMY, TMP1, #1
223 add TMP1, mem_operand, DUMMY
219 asr TMP2, VX, #16 224 asr TMP2, VX, #16
220 adds VX, VX, UNIT_X 225 adds VX, VX, UNIT_X
2215: subpls VX, VX, SRC_WIDTH_FIXED 226 bmi 55f
2275: subs VX, VX, SRC_WIDTH_FIXED
222 bpl 5b 228 bpl 5b
223 add TMP2, mem_operand, TMP2, lsl #1 22955:
224 vld1.16 {d&reg1&[0]}, [TMP1, :16] 230 lsl DUMMY, TMP2, #1
231 add TMP2, mem_operand, DUMMY
232 ld1 {v&reg1&.h}[0], [TMP1]
225 asr TMP1, VX, #16 233 asr TMP1, VX, #16
226 adds VX, VX, UNIT_X 234 adds VX, VX, UNIT_X
2275: subpls VX, VX, SRC_WIDTH_FIXED 235 bmi 55f
2365: subs VX, VX, SRC_WIDTH_FIXED
228 bpl 5b 237 bpl 5b
229 add TMP1, mem_operand, TMP1, lsl #1 23855:
230 vld1.16 {d&reg1&[1]}, [TMP2, :16] 239 lsl DUMMY, TMP1, #1
240 add TMP1, mem_operand, DUMMY
241 ld1 {v&reg1&.h}[1], [TMP2]
231 asr TMP2, VX, #16 242 asr TMP2, VX, #16
232 adds VX, VX, UNIT_X 243 adds VX, VX, UNIT_X
2335: subpls VX, VX, SRC_WIDTH_FIXED 244 bmi 55f
2455: subs VX, VX, SRC_WIDTH_FIXED
234 bpl 5b 246 bpl 5b
235 add TMP2, mem_operand, TMP2, lsl #1 24755:
236 vld1.16 {d&reg1&[2]}, [TMP1, :16] 248 lsl DUMMY, TMP2, #1
237 vld1.16 {d&reg1&[3]}, [TMP2, :16] 249 add TMP2, mem_operand, DUMMY
250 ld1 {v&reg1&.h}[2], [TMP1]
251 ld1 {v&reg1&.h}[3], [TMP2]
238.elseif elem_size == 32 252.elseif elem_size == 32
239 asr TMP1, VX, #16 253 asr TMP1, VX, #16
240 adds VX, VX, UNIT_X 254 adds VX, VX, UNIT_X
2415: subpls VX, VX, SRC_WIDTH_FIXED 255 bmi 55f
2565: subs VX, VX, SRC_WIDTH_FIXED
242 bpl 5b 257 bpl 5b
243 add TMP1, mem_operand, TMP1, lsl #2 25855:
259 lsl DUMMY, TMP1, #2
260 add TMP1, mem_operand, DUMMY
244 asr TMP2, VX, #16 261 asr TMP2, VX, #16
245 adds VX, VX, UNIT_X 262 adds VX, VX, UNIT_X
2465: subpls VX, VX, SRC_WIDTH_FIXED 263 bmi 55f
2645: subs VX, VX, SRC_WIDTH_FIXED
247 bpl 5b 265 bpl 5b
248 add TMP2, mem_operand, TMP2, lsl #2 26655:
249 vld1.32 {d&reg1&[0]}, [TMP1, :32] 267 lsl DUMMY, TMP2, #2
250 vld1.32 {d&reg1&[1]}, [TMP2, :32] 268 add TMP2, mem_operand, DUMMY
269 ld1 {v&reg1&.s}[0], [TMP1]
270 ld1 {v&reg1&.s}[1], [TMP2]
251.else 271.else
252 .error "unsupported" 272 .error "unsupported"
253.endif 273.endif
254.endm 274.endm
255 275
256.macro pixld2_s elem_size, reg1, reg2, mem_operand 276.macro pixld2_s elem_size, reg1, reg2, mem_operand
257.if 0 /* elem_size == 32 */ 277.if 0 /* elem_size == 32 */
258 asr TMP1, VX, #16 278 mov TMP1, VX, asr #16
259 add VX, VX, UNIT_X, lsl #1 279 add VX, VX, UNIT_X, asl #1
260 add TMP1, mem_operand, TMP1, lsl #2 280 add TMP1, mem_operand, TMP1, asl #2
261 asr TMP2, VX, #16 281 mov TMP2, VX, asr #16
262 sub VX, VX, UNIT_X 282 sub VX, VX, UNIT_X
263 add TMP2, mem_operand, TMP2, lsl #2 283 add TMP2, mem_operand, TMP2, asl #2
264 vld1.32 {d&reg1&[0]}, [TMP1, :32] 284 ld1 {v&reg1&.s}[0], [TMP1]
265 asr TMP1, VX, #16 285 mov TMP1, VX, asr #16
266 add VX, VX, UNIT_X, lsl #1 286 add VX, VX, UNIT_X, asl #1
267 add TMP1, mem_operand, TMP1, lsl #2 287 add TMP1, mem_operand, TMP1, asl #2
268 vld1.32 {d&reg2&[0]}, [TMP2, :32] 288 ld1 {v&reg2&.s}[0], [TMP2]
269 asr TMP2, VX, #16 289 mov TMP2, VX, asr #16
270 add VX, VX, UNIT_X 290 add VX, VX, UNIT_X
271 add TMP2, mem_operand, TMP2, lsl #2 291 add TMP2, mem_operand, TMP2, asl #2
272 vld1.32 {d&reg1&[1]}, [TMP1, :32] 292 ld1 {v&reg1&.s}[1], [TMP1]
273 vld1.32 {d&reg2&[1]}, [TMP2, :32] 293 ld1 {v&reg2&.s}[1], [TMP2]
274.else 294.else
275 pixld1_s elem_size, reg1, mem_operand 295 pixld1_s elem_size, reg1, mem_operand
276 pixld1_s elem_size, reg2, mem_operand 296 pixld1_s elem_size, reg2, mem_operand
277.endif 297.endif
278.endm 298.endm
279 299
280.macro pixld0_s elem_size, reg1, idx, mem_operand 300.macro pixld0_s elem_size, reg1, idx, mem_operand
281.if elem_size == 16 301.if elem_size == 16
282 asr TMP1, VX, #16 302 asr TMP1, VX, #16
283 adds VX, VX, UNIT_X 303 adds VX, VX, UNIT_X
2845: subpls VX, VX, SRC_WIDTH_FIXED 304 bmi 55f
3055: subs VX, VX, SRC_WIDTH_FIXED
285 bpl 5b 306 bpl 5b
286 add TMP1, mem_operand, TMP1, lsl #1 30755:
287 vld1.16 {d&reg1&[idx]}, [TMP1, :16] 308 lsl DUMMY, TMP1, #1
309 add TMP1, mem_operand, DUMMY
310 ld1 {v&reg1&.h}[idx], [TMP1]
288.elseif elem_size == 32 311.elseif elem_size == 32
289 asr TMP1, VX, #16 312 asr DUMMY, VX, #16
313 mov TMP1, DUMMY
290 adds VX, VX, UNIT_X 314 adds VX, VX, UNIT_X
2915: subpls VX, VX, SRC_WIDTH_FIXED 315 bmi 55f
3165: subs VX, VX, SRC_WIDTH_FIXED
292 bpl 5b 317 bpl 5b
293 add TMP1, mem_operand, TMP1, lsl #2 31855:
294 vld1.32 {d&reg1&[idx]}, [TMP1, :32] 319 lsl DUMMY, TMP1, #2
320 add TMP1, mem_operand, DUMMY
321 ld1 {v&reg1&.s}[idx], [TMP1]
295.endif 322.endif
296.endm 323.endm
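The *_s load macros fetch one source pixel per step of a 16.16 fixed-point position: the pixel index is VX >> 16, VX then advances by UNIT_X and is folded back by SRC_WIDTH_FIXED so that NORMAL repeat wraps around the source width. The conversion replaces the conditional subpls loop with an explicit bmi/bpl loop around subs, but the arithmetic is unchanged; a scalar sketch for the 16 bpp case (the caller sets up the initial bias of VX, which this sketch does not model):

    #include <stdint.h>

    static const uint16_t *
    fetch_addr_16bpp (const uint16_t *src, int32_t *vx,
                      int32_t unit_x, int32_t src_width_fixed)
    {
        const uint16_t *addr = src + (*vx >> 16);  /* asr TMP1, VX, #16      */

        *vx += unit_x;                             /* adds VX, VX, UNIT_X    */
        while (*vx >= 0)                           /* bmi 55f / subs ... bpl */
            *vx -= src_width_fixed;

        return addr;
    }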
297 324
298.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand 325.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
299.if numbytes == 32 326.if numbytes == 32
@@ -335,15 +362,23 @@
335 pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand 362 pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
336.endif 363.endif
337.endm 364.endm
338 365
339.macro vuzp8 reg1, reg2 366.macro vuzp8 reg1, reg2
340 vuzp.8 d&reg1, d&reg2 367 umov DUMMY, v16.d[0]
368 uzp1 v16.8b, v&reg1&.8b, v&reg2&.8b
369 uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
370 mov v&reg1&.8b, v16.8b
371 mov v16.d[0], DUMMY
341.endm 372.endm
342 373
343.macro vzip8 reg1, reg2 374.macro vzip8 reg1, reg2
344 vzip.8 d&reg1, d&reg2 375 umov DUMMY, v16.d[0]
376 zip1 v16.8b, v&reg1&.8b, v&reg2&.8b
377 zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
378 mov v&reg1&.8b, v16.8b
379 mov v16.d[0], DUMMY
345.endm 380.endm
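ARMv8 has no in-place two-register vuzp/vzip; uzp1/uzp2 (and zip1/zip2) each produce only one output, so the replacements above route one result through v16 and preserve its old contents in DUMMY. What the byte variant computes, as a scalar model:

    #include <stdint.h>
    #include <string.h>

    /* vuzp8 a, b: even-indexed bytes of a:b end up in a (uzp1),
     * odd-indexed bytes in b (uzp2). */
    static void
    vuzp8_model (uint8_t a[8], uint8_t b[8])
    {
        uint8_t cat[16], even[8], odd[8];
        int i;

        memcpy (cat, a, 8);
        memcpy (cat + 8, b, 8);
        for (i = 0; i < 8; i++)
        {
            even[i] = cat[2 * i];        /* uzp1 */
            odd[i]  = cat[2 * i + 1];    /* uzp2 */
        }
        memcpy (a, even, 8);
        memcpy (b, odd, 8);
    }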
346 381
347/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ 382/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
348.macro pixdeinterleave bpp, basereg 383.macro pixdeinterleave bpp, basereg
349.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) 384.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
@@ -398,53 +433,65 @@
398.endif 433.endif
399.endm 434.endm
400 435
401.macro cache_preload std_increment, boost_increment 436.macro cache_preload std_increment, boost_increment
402.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) 437.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
403.if regs_shortage
404 PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
405.endif
406.if std_increment != 0 438.if std_increment != 0
407 PF add PF_X, PF_X, #std_increment 439 PF add PF_X, PF_X, #std_increment
408.endif 440.endif
409 PF tst PF_CTL, #0xF 441 PF tst PF_CTL, #0xF
410 PF addne PF_X, PF_X, #boost_increment 442 PF beq 71f
411 PF subne PF_CTL, PF_CTL, #1 443 PF add PF_X, PF_X, #boost_increment
444 PF sub PF_CTL, PF_CTL, #1
44571:
412 PF cmp PF_X, ORIG_W 446 PF cmp PF_X, ORIG_W
413.if src_bpp_shift >= 0 447.if src_bpp_shift >= 0
414 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 448 PF lsl DUMMY, PF_X, #src_bpp_shift
449 PF prfm pldl2strm, [PF_SRC, DUMMY]
415.endif 450.endif
416.if dst_r_bpp != 0 451.if dst_r_bpp != 0
417 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 452 PF lsl DUMMY, PF_X, #dst_bpp_shift
453 PF prfm pldl2strm, [PF_DST, DUMMY]
418.endif 454.endif
419.if mask_bpp_shift >= 0 455.if mask_bpp_shift >= 0
420 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 456 PF lsl DUMMY, PF_X, #mask_bpp_shift
421.endif 457 PF prfm pldl2strm, [PF_MASK, DUMMY]
422 PF subge PF_X, PF_X, ORIG_W 458.endif
423 PF subges PF_CTL, PF_CTL, #0x10 459 PF ble 71f
460 PF sub PF_X, PF_X, ORIG_W
461 PF subs PF_CTL, PF_CTL, #0x10
46271:
463 PF ble 72f
424.if src_bpp_shift >= 0 464.if src_bpp_shift >= 0
425 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 465 PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
466 PF ldrsb DUMMY, [PF_SRC, DUMMY]
467 PF add PF_SRC, PF_SRC, #1
426.endif 468.endif
427.if dst_r_bpp != 0 469.if dst_r_bpp != 0
428 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 470 PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
471 PF ldrsb DUMMY, [PF_DST, DUMMY]
472 PF add PF_DST, PF_DST, #1
429.endif 473.endif
430.if mask_bpp_shift >= 0 474.if mask_bpp_shift >= 0
431 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 475 PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
476 PF ldrsb DUMMY, [PF_MASK, DUMMY]
477 PF add PF_MASK, PF_MASK, #1
432.endif 478.endif
47972:
433.endif 480.endif
434.endm 481.endm
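The conditionally executed ARM sequence (addne/subne, subge/subges, ldrgeb with writeback) becomes explicit branches to the local labels 71/72 above. The bookkeeping is easier to follow in scalar form; this is a rough model only, with the per-image prefetches and the next-scanline touch reduced to comments:

    /* PF_CTL packs two counters: bits 4 and up count remaining scanlines,
     * bits 0-3 hold a budget of extra "boost" increments derived from the
     * prefetch distance.  PF_X walks ahead of the current output position
     * and wraps at the original scanline width. */
    static void
    cache_preload_model (int *pf_x, int *pf_ctl,
                         int std_increment, int boost_increment, int orig_w)
    {
        *pf_x += std_increment;
        if (*pf_ctl & 0xf)               /* boost budget left?             */
        {
            *pf_x += boost_increment;
            *pf_ctl -= 1;
        }

        /* prfm pldl2strm at PF_X (scaled by bpp) into src/dst/mask here */

        if (*pf_x > orig_w)              /* ran past the end of the line   */
        {
            *pf_x -= orig_w;
            *pf_ctl -= 0x10;             /* one line of lookahead consumed */
            if (*pf_ctl > 0)
            {
                /* touch the next scanline (the ldrsb above) so it starts
                 * being pulled into the cache */
            }
        }
    }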
435 482
436.macro cache_preload_simple 483.macro cache_preload_simple
437.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) 484.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
438.if src_bpp > 0 485.if src_bpp > 0
439 pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] 486 prfm pldl2strm, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
440.endif 487.endif
441.if dst_r_bpp > 0 488.if dst_r_bpp > 0
442 pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] 489 prfm pldl2strm, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
443.endif 490.endif
444.if mask_bpp > 0 491.if mask_bpp > 0
445 pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] 492 prfm pldl2strm, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
446.endif 493.endif
447.endif 494.endif
448.endm 495.endm
449 496
450.macro fetch_mask_pixblock 497.macro fetch_mask_pixblock
@@ -460,29 +507,28 @@
460.macro ensure_destination_ptr_alignment process_pixblock_head, \ 507.macro ensure_destination_ptr_alignment process_pixblock_head, \
461 process_pixblock_tail, \ 508 process_pixblock_tail, \
462 process_pixblock_tail_head 509 process_pixblock_tail_head
463.if dst_w_bpp != 24 510.if dst_w_bpp != 24
464 tst DST_R, #0xF 511 tst DST_R, #0xF
465 beq 2f 512 beq 52f
466
467.irp lowbit, 1, 2, 4, 8, 16 513.irp lowbit, 1, 2, 4, 8, 16
468local skip1 514local skip1
469.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) 515.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
470.if lowbit < 16 /* we don't need more than 16-byte alignment */ 516.if lowbit < 16 /* we don't need more than 16-byte alignment */
471 tst DST_R, #lowbit 517 tst DST_R, #lowbit
472 beq 1f 518 beq 51f
473.endif 519.endif
474 pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC 520 pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
475 pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK 521 pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
476.if dst_r_bpp > 0 522.if dst_r_bpp > 0
477 pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R 523 pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
478.else 524.else
479 add DST_R, DST_R, #lowbit 525 add DST_R, DST_R, #lowbit
480.endif 526.endif
481 PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) 527 PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
482 sub W, W, #(lowbit * 8 / dst_w_bpp) 528 sub W, W, #(lowbit * 8 / dst_w_bpp)
4831: 52951:
484.endif 530.endif
485.endr 531.endr
486 pixdeinterleave src_bpp, src_basereg 532 pixdeinterleave src_bpp, src_basereg
487 pixdeinterleave mask_bpp, mask_basereg 533 pixdeinterleave mask_bpp, mask_basereg
488 pixdeinterleave dst_r_bpp, dst_r_basereg 534 pixdeinterleave dst_r_bpp, dst_r_basereg
@@ -491,22 +537,23 @@ local skip1
491 cache_preload 0, pixblock_size 537 cache_preload 0, pixblock_size
492 cache_preload_simple 538 cache_preload_simple
493 process_pixblock_tail 539 process_pixblock_tail
494 540
495 pixinterleave dst_w_bpp, dst_w_basereg 541 pixinterleave dst_w_bpp, dst_w_basereg
542
496.irp lowbit, 1, 2, 4, 8, 16 543.irp lowbit, 1, 2, 4, 8, 16
497.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) 544.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
498.if lowbit < 16 /* we don't need more than 16-byte alignment */ 545.if lowbit < 16 /* we don't need more than 16-byte alignment */
499 tst DST_W, #lowbit 546 tst DST_W, #lowbit
500 beq 1f 547 beq 51f
501.endif 548.endif
502 pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W 549 pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
5031: 55051:
504.endif 551.endif
505.endr 552.endr
506.endif 553.endif
5072: 55452:
508.endm 555.endm
509 556
510/* 557/*
511 * Special code for processing up to (pixblock_size - 1) remaining 558 * Special code for processing up to (pixblock_size - 1) remaining
512 * trailing pixels. As SIMD processing performs operation on 559 * trailing pixels. As SIMD processing performs operation on
@@ -526,26 +573,26 @@ local skip1
526 dst_aligned_flag, \ 573 dst_aligned_flag, \
527 process_pixblock_head, \ 574 process_pixblock_head, \
528 process_pixblock_tail, \ 575 process_pixblock_tail, \
529 process_pixblock_tail_head 576 process_pixblock_tail_head
530 tst W, #(pixblock_size - 1) 577 tst W, #(pixblock_size - 1)
531 beq 2f 578 beq 52f
532.irp chunk_size, 16, 8, 4, 2, 1 579.irp chunk_size, 16, 8, 4, 2, 1
533.if pixblock_size > chunk_size 580.if pixblock_size > chunk_size
534 tst W, #chunk_size 581 tst W, #chunk_size
535 beq 1f 582 beq 51f
536 pixld_src chunk_size, src_bpp, src_basereg, SRC 583 pixld_src chunk_size, src_bpp, src_basereg, SRC
537 pixld chunk_size, mask_bpp, mask_basereg, MASK 584 pixld chunk_size, mask_bpp, mask_basereg, MASK
538.if dst_aligned_flag != 0 585.if dst_aligned_flag != 0
539 pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R 586 pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
540.else 587.else
541 pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R 588 pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
542.endif 589.endif
543.if cache_preload_flag != 0 590.if cache_preload_flag != 0
544 PF add PF_X, PF_X, #chunk_size 591 PF add PF_X, PF_X, #chunk_size
545.endif 592.endif
5461: 59351:
547.endif 594.endif
548.endr 595.endr
549 pixdeinterleave src_bpp, src_basereg 596 pixdeinterleave src_bpp, src_basereg
550 pixdeinterleave mask_bpp, mask_basereg 597 pixdeinterleave mask_bpp, mask_basereg
551 pixdeinterleave dst_r_bpp, dst_r_basereg 598 pixdeinterleave dst_r_bpp, dst_r_basereg
@@ -558,63 +605,62 @@ local skip1
558 process_pixblock_tail 605 process_pixblock_tail
559 pixinterleave dst_w_bpp, dst_w_basereg 606 pixinterleave dst_w_bpp, dst_w_basereg
560.irp chunk_size, 16, 8, 4, 2, 1 607.irp chunk_size, 16, 8, 4, 2, 1
561.if pixblock_size > chunk_size 608.if pixblock_size > chunk_size
562 tst W, #chunk_size 609 tst W, #chunk_size
563 beq 1f 610 beq 51f
564.if dst_aligned_flag != 0 611.if dst_aligned_flag != 0
565 pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W 612 pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
566.else 613.else
567 pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W 614 pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
568.endif 615.endif
5691: 61651:
570.endif 617.endif
571.endr 618.endr
5722: 61952:
573.endm 620.endm
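The trailing-pixel handler above never loops: it decomposes the remainder (at most pixblock_size - 1 pixels) into power-of-two chunks, largest first, handling one chunk per set bit of W. The same selection logic in C, as a sketch:

    /* w & (pixblock_size - 1) trailing pixels, handled as 16/8/4/2/1
     * pixel chunks, mirroring the .irp block above. */
    static void
    process_trailing_pixels_model (int w, int pixblock_size)
    {
        int chunk;

        for (chunk = 16; chunk >= 1; chunk >>= 1)
        {
            if (pixblock_size > chunk && (w & chunk))
            {
                /* pixld_src/pixld chunk pixels, process, pixst chunk pixels */
            }
        }
    }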
574 621
575/* 622/*
576 * Macro, which performs all the needed operations to switch to the next 623 * Macro, which performs all the needed operations to switch to the next
577 * scanline and start the next loop iteration unless all the scanlines 624 * scanline and start the next loop iteration unless all the scanlines
578 * are already processed. 625 * are already processed.
579 */ 626 */
580.macro advance_to_next_scanline start_of_loop_label 627.macro advance_to_next_scanline start_of_loop_label
581.if regs_shortage
582 ldrd W, [sp] /* load W and H (width and height) from stack */
583.else
584 mov W, ORIG_W 628 mov W, ORIG_W
585.endif 629 lsl DUMMY, DST_STRIDE, #dst_bpp_shift
586 add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift 630 add DST_W, DST_W, DUMMY
587.if src_bpp != 0 631.if src_bpp != 0
588 add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift 632 lsl DUMMY, SRC_STRIDE, #src_bpp_shift
633 add SRC, SRC, DUMMY
589.endif 634.endif
590.if mask_bpp != 0 635.if mask_bpp != 0
591 add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift 636 lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
637 add MASK, MASK, DUMMY
592.endif 638.endif
593.if (dst_w_bpp != 24) 639.if (dst_w_bpp != 24)
594 sub DST_W, DST_W, W, lsl #dst_bpp_shift 640 lsl DUMMY, W, #dst_bpp_shift
641 sub DST_W, DST_W, DUMMY
595.endif 642.endif
596.if (src_bpp != 24) && (src_bpp != 0) 643.if (src_bpp != 24) && (src_bpp != 0)
597 sub SRC, SRC, W, lsl #src_bpp_shift 644 lsl DUMMY, W, #src_bpp_shift
645 sub SRC, SRC, DUMMY
598.endif 646.endif
599.if (mask_bpp != 24) && (mask_bpp != 0) 647.if (mask_bpp != 24) && (mask_bpp != 0)
600 sub MASK, MASK, W, lsl #mask_bpp_shift 648 lsl DUMMY, W, #mask_bpp_shift
649 sub MASK, MASK, DUMMY
601.endif 650.endif
602 subs H, H, #1 651 subs H, H, #1
603 mov DST_R, DST_W 652 mov DST_R, DST_W
604.if regs_shortage
605 str H, [sp, #4] /* save updated height to stack */
606.endif
607 bge start_of_loop_label 653 bge start_of_loop_label
608.endm 654.endm
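With regs_shortage gone, advance_to_next_scanline is pure pointer bookkeeping: each pointer has already advanced by one scanline's worth of pixels during processing, so stepping down a row means adding the stride and subtracting the width, both scaled by the pixel size (hence the lsl into DUMMY). For a 32 bpp image the equivalent C is plain pointer arithmetic:

    #include <stdint.h>

    /* sketch for 32 bpp; strides are in pixels, as in the assembly */
    static void
    advance_to_next_scanline_model (uint32_t **dst, int32_t dst_stride,
                                    const uint32_t **src, int32_t src_stride,
                                    int32_t w)
    {
        *dst += dst_stride;     /* add DST_W, DST_W, DST_STRIDE << shift */
        *dst -= w;              /* sub DST_W, DST_W, W << shift          */
        *src += src_stride;     /* add SRC, SRC, SRC_STRIDE << shift     */
        *src -= w;              /* sub SRC, SRC, W << shift              */
    }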
609 655
610/* 656/*
611 * Registers are allocated in the following way by default: 657 * Registers are allocated in the following way by default:
612 * d0, d1, d2, d3 - reserved for loading source pixel data 658 * v0, v1, v2, v3 - reserved for loading source pixel data
613 * d4, d5, d6, d7 - reserved for loading destination pixel data 659 * v4, v5, v6, v7 - reserved for loading destination pixel data
614 * d24, d25, d26, d27 - reserved for loading mask pixel data 660 * v24, v25, v26, v27 - reserved for loading mask pixel data
615 * d28, d29, d30, d31 - final destination pixel data for writeback to memory 661 * v28, v29, v30, v31 - final destination pixel data for writeback to memory
616 */ 662 */
617.macro generate_composite_function fname, \ 663.macro generate_composite_function fname, \
618 src_bpp_, \ 664 src_bpp_, \
619 mask_bpp_, \ 665 mask_bpp_, \
620 dst_w_bpp_, \ 666 dst_w_bpp_, \
@@ -630,12 +676,27 @@ local skip1
630 dst_r_basereg_ = 4, \ 676 dst_r_basereg_ = 4, \
631 src_basereg_ = 0, \ 677 src_basereg_ = 0, \
632 mask_basereg_ = 24 678 mask_basereg_ = 24
633 679
634 pixman_asm_function fname 680 pixman_asm_function fname
635 681 stp x29, x30, [sp, -16]!
636 push {r4-r12, lr} /* save all registers */ 682 mov x29, sp
683 sub sp, sp, 232 /* push all registers */
684 sub x29, x29, 64
685 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
686 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
687 stp x8, x9, [x29, -80]
688 stp x10, x11, [x29, -96]
689 stp x12, x13, [x29, -112]
690 stp x14, x15, [x29, -128]
691 stp x16, x17, [x29, -144]
692 stp x18, x19, [x29, -160]
693 stp x20, x21, [x29, -176]
694 stp x22, x23, [x29, -192]
695 stp x24, x25, [x29, -208]
696 stp x26, x27, [x29, -224]
697 str x28, [x29, -232]
637 698
638/* 699/*
639 * Select prefetch type for this function. If prefetch distance is 700 * Select prefetch type for this function. If prefetch distance is
640 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch 701 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
641 * has to be used instead of ADVANCED. 702 * has to be used instead of ADVANCED.
@@ -669,56 +730,40 @@ local skip1
669 (src_basereg - pixblock_size * src_bpp / 64), SRC 730 (src_basereg - pixblock_size * src_bpp / 64), SRC
670 .endm 731 .endm
671/* 732/*
672 * Assign symbolic names to registers 733 * Assign symbolic names to registers
673 */ 734 */
674 W .req r0 /* width (is updated during processing) */ 735 W .req x0 /* width (is updated during processing) */
675 H .req r1 /* height (is updated during processing) */ 736 H .req x1 /* height (is updated during processing) */
676 DST_W .req r2 /* destination buffer pointer for writes */ 737 DST_W .req x2 /* destination buffer pointer for writes */
677 DST_STRIDE .req r3 /* destination image stride */ 738 DST_STRIDE .req x3 /* destination image stride */
678 SRC .req r4 /* source buffer pointer */ 739 SRC .req x4 /* source buffer pointer */
679 SRC_STRIDE .req r5 /* source image stride */ 740 SRC_STRIDE .req x5 /* source image stride */
680 DST_R .req r6 /* destination buffer pointer for reads */ 741 MASK .req x6 /* mask pointer */
681 742 MASK_STRIDE .req x7 /* mask stride */
682 MASK .req r7 /* mask pointer */ 743
683 MASK_STRIDE .req r8 /* mask stride */ 744 DST_R .req x8 /* destination buffer pointer for reads */
684 745
685 PF_CTL .req r9 /* combined lines counter and prefetch */ 746 PF_CTL .req x9 /* combined lines counter and prefetch */
686 /* distance increment counter */ 747 /* distance increment counter */
687 PF_X .req r10 /* pixel index in a scanline for current */ 748 PF_X .req x10 /* pixel index in a scanline for current */
688 /* prefetch position */ 749 /* prefetch position */
689 PF_SRC .req r11 /* pointer to source scanline start */ 750 PF_SRC .req x11 /* pointer to source scanline start */
690 /* for prefetch purposes */ 751 /* for prefetch purposes */
691 PF_DST .req r12 /* pointer to destination scanline start */ 752 PF_DST .req x12 /* pointer to destination scanline start */
692 /* for prefetch purposes */ 753 /* for prefetch purposes */
693 PF_MASK .req r14 /* pointer to mask scanline start */ 754 PF_MASK .req x13 /* pointer to mask scanline start */
694 /* for prefetch purposes */ 755 /* for prefetch purposes */
695/* 756
696 * Check whether we have enough registers for all the local variables. 757 ORIG_W .req x14 /* saved original width */
697 * If we don't have enough registers, original width and height are 758 DUMMY .req x15 /* temporary register */
698 * kept on top of stack (and 'regs_shortage' variable is set to indicate 759
699 * this for the rest of code). Even if there are enough registers, the 760 sxtw x0, w0
700 * allocation scheme may be a bit different depending on whether source 761 sxtw x1, w1
701 * or mask is not used. 762 sxtw x3, w3
702 */ 763 sxtw x5, w5
703.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) 764 sxtw x7, w7
704 ORIG_W .req r10 /* saved original width */
705 DUMMY .req r12 /* temporary register */
706 .set regs_shortage, 0
707.elseif mask_bpp == 0
708 ORIG_W .req r7 /* saved original width */
709 DUMMY .req r8 /* temporary register */
710 .set regs_shortage, 0
711.elseif src_bpp == 0
712 ORIG_W .req r4 /* saved original width */
713 DUMMY .req r5 /* temporary register */
714 .set regs_shortage, 0
715.else
716 ORIG_W .req r1 /* saved original width */
717 DUMMY .req r1 /* temporary register */
718 .set regs_shortage, 1
719.endif
720 765
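The argument registers named above follow the pixman-arm-common.h fast-path bindings, which is also why w0, w1, w3, w5 and w7 are sign-extended in the prologue: those are the int32_t parameters. An assumed prototype for one of the generated functions (a source + mask + destination case; strides are in elements of the respective type):

    #include <stdint.h>

    /* W=w0, H=w1, DST_W=x2, DST_STRIDE=w3, SRC=x4, SRC_STRIDE=w5,
     * MASK=x6, MASK_STRIDE=w7 */
    void pixman_composite_over_8888_8_8888_asm_neon (
        int32_t   w,
        int32_t   h,
        uint32_t *dst,
        int32_t   dst_stride,
        uint32_t *src,
        int32_t   src_stride,
        uint8_t  *mask,
        int32_t   mask_stride);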
721 .set mask_bpp_shift, -1 766 .set mask_bpp_shift, -1
722.if src_bpp == 32 767.if src_bpp == 32
723 .set src_bpp_shift, 2 768 .set src_bpp_shift, 2
724.elseif src_bpp == 24 769.elseif src_bpp == 24
@@ -768,23 +813,11 @@ local skip1
768 813
769.if prefetch_distance < 0 || prefetch_distance > 15 814.if prefetch_distance < 0 || prefetch_distance > 15
770 .error "invalid prefetch distance (prefetch_distance)" 815 .error "invalid prefetch distance (prefetch_distance)"
771.endif 816.endif
772 817
773.if src_bpp > 0
774 ldr SRC, [sp, #40]
775.endif
776.if mask_bpp > 0
777 ldr MASK, [sp, #48]
778.endif
779 PF mov PF_X, #0 818 PF mov PF_X, #0
780.if src_bpp > 0
781 ldr SRC_STRIDE, [sp, #44]
782.endif
783.if mask_bpp > 0
784 ldr MASK_STRIDE, [sp, #52]
785.endif
786 mov DST_R, DST_W 819 mov DST_R, DST_W
787 820
788.if src_bpp == 24 821.if src_bpp == 24
789 sub SRC_STRIDE, SRC_STRIDE, W 822 sub SRC_STRIDE, SRC_STRIDE, W
790 sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 823 sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
@@ -803,26 +836,20 @@ local skip1
803 */ 836 */
804 PF mov PF_SRC, SRC 837 PF mov PF_SRC, SRC
805 PF mov PF_DST, DST_R 838 PF mov PF_DST, DST_R
806 PF mov PF_MASK, MASK 839 PF mov PF_MASK, MASK
807 /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ 840 /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
808 PF mov PF_CTL, H, lsl #4 841 PF lsl DUMMY, H, #4
809 PF add PF_CTL, #(prefetch_distance - 0x10) 842 PF mov PF_CTL, DUMMY
843 PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
810 844
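The lsl/mov/add sequence above relies on prefetch_distance being limited to 0..15 (checked earlier in this macro), so adding (prefetch_distance - 0x10) to H << 4 is the same as OR-ing the distance into ((H - 1) << 4). As a one-line check:

    #include <stdint.h>

    /* (h << 4) + pd - 0x10 == ((h - 1) << 4) | pd   for 0 <= pd <= 15 */
    static inline int32_t
    make_pf_ctl (int32_t h, int32_t pd)
    {
        return (h << 4) + (pd - 0x10);
    }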
811 init 845 init
812.if regs_shortage
813 push {r0, r1}
814.endif
815 subs H, H, #1 846 subs H, H, #1
816.if regs_shortage
817 str H, [sp, #4] /* save updated height to stack */
818.else
819 mov ORIG_W, W 847 mov ORIG_W, W
820.endif
821 blt 9f 848 blt 9f
822 cmp W, #(pixblock_size * 2) 849 cmp W, #(pixblock_size * 2)
823 blt 8f 850 blt 800f
824/* 851/*
825 * This is the start of the pipelined loop, which is optimized for 852 * This is the start of the pipelined loop, which is optimized for
826 * long scanlines 853 * long scanlines
827 */ 854 */
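In this pipelined form only the first block runs process_pixblock_head by itself and only the last block runs process_pixblock_tail by itself; every block in between is handled by a single process_pixblock_tail_head, which finishes block N while starting block N+1, and any remainder smaller than a block is left to process_trailing_pixels. A hedged C outline of that control flow (the functions are placeholders for the assembler macros, not pixman code):

    /* Placeholder hooks standing in for process_pixblock_head,
     * process_pixblock_tail and process_pixblock_tail_head. */
    static void head (void)      {}
    static void tail (void)      {}
    static void tail_head (void) {}

    /* Assumes w >= 2 * pixblock_size on entry, as guaranteed by the
     * "cmp W, #(pixblock_size * 2)" check before this loop. */
    static void pipelined_scanline (int w, int pixblock_size)
    {
        head ();                       /* start the first block             */
        w -= 2 * pixblock_size;
        while (w >= 0)                 /* steady state: one macro per block */
        {
            tail_head ();              /* finish block N, start block N+1   */
            w -= pixblock_size;
        }
        tail ();                       /* drain the block still in flight   */
    }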
8280: 8550:
@@ -839,17 +866,19 @@ local skip1
839 PF add PF_X, PF_X, #pixblock_size 866 PF add PF_X, PF_X, #pixblock_size
840 process_pixblock_head 867 process_pixblock_head
841 cache_preload 0, pixblock_size 868 cache_preload 0, pixblock_size
842 cache_preload_simple 869 cache_preload_simple
843 subs W, W, #(pixblock_size * 2) 870 subs W, W, #(pixblock_size * 2)
844 blt 2f 871 blt 200f
8451: 872
873100:
846 process_pixblock_tail_head 874 process_pixblock_tail_head
847 cache_preload_simple 875 cache_preload_simple
848 subs W, W, #pixblock_size 876 subs W, W, #pixblock_size
849 bge 1b 877 bge 100b
8502: 878
879200:
851 process_pixblock_tail 880 process_pixblock_tail
852 pixst_a pixblock_size, dst_w_bpp, \ 881 pixst_a pixblock_size, dst_w_bpp, \
853 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 882 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
854 883
855 /* Process the remaining trailing pixels in the scanline */ 884 /* Process the remaining trailing pixels in the scanline */
@@ -857,46 +886,75 @@ local skip1
857 process_pixblock_head, \ 886 process_pixblock_head, \
858 process_pixblock_tail, \ 887 process_pixblock_tail, \
859 process_pixblock_tail_head 888 process_pixblock_tail_head
860 advance_to_next_scanline 0b 889 advance_to_next_scanline 0b
861 890
862.if regs_shortage
863 pop {r0, r1}
864.endif
865 cleanup 891 cleanup
866 pop {r4-r12, pc} /* exit */ 8921000:
893 /* pop all registers */
894 sub x29, x29, 64
895 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
896 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
897 ldp x8, x9, [x29, -80]
898 ldp x10, x11, [x29, -96]
899 ldp x12, x13, [x29, -112]
900 ldp x14, x15, [x29, -128]
901 ldp x16, x17, [x29, -144]
902 ldp x18, x19, [x29, -160]
903 ldp x20, x21, [x29, -176]
904 ldp x22, x23, [x29, -192]
905 ldp x24, x25, [x29, -208]
906 ldp x26, x27, [x29, -224]
907 ldr x28, [x29, -232]
908 mov sp, x29
909 ldp x29, x30, [sp], 16
910 ret /* exit */
867/* 911/*
868 * This is the start of the loop, designed to process images with small width 912 * This is the start of the loop, designed to process images with small width
869 * (less than pixblock_size * 2 pixels). In this case neither pipelining 913 * (less than pixblock_size * 2 pixels). In this case neither pipelining
870 * nor prefetch are used. 914 * nor prefetch are used.
871 */ 915 */
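For these narrow scanlines the code only tests whether W has the pixblock_size bit set, handles at most one full block without pipelining, and leaves the rest to the trailing-pixel path. A short hedged C outline (placeholder functions, not pixman code; assumes pixblock_size is a power of two, as it is for these fast paths):

    static void one_full_pixblock (void)       {}                 /* head + tail, no overlap */
    static void trailing_pixels (int leftover) { (void) leftover; }

    static void narrow_scanline (int w, int pixblock_size)
    {
        if (w & pixblock_size)          /* mirrors "tst W, #pixblock_size" */
            one_full_pixblock ();
        trailing_pixels (w & (pixblock_size - 1));
    }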
8728: 916800:
873 /* Process exactly pixblock_size pixels if needed */ 917 /* Process exactly pixblock_size pixels if needed */
874 tst W, #pixblock_size 918 tst W, #pixblock_size
875 beq 1f 919 beq 100f
876 pixld pixblock_size, dst_r_bpp, \ 920 pixld pixblock_size, dst_r_bpp, \
877 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 921 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
878 fetch_src_pixblock 922 fetch_src_pixblock
879 pixld pixblock_size, mask_bpp, \ 923 pixld pixblock_size, mask_bpp, \
880 (mask_basereg - pixblock_size * mask_bpp / 64), MASK 924 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
881 process_pixblock_head 925 process_pixblock_head
882 process_pixblock_tail 926 process_pixblock_tail
883 pixst pixblock_size, dst_w_bpp, \ 927 pixst pixblock_size, dst_w_bpp, \
884 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 928 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
8851: 929100:
886 /* Process the remaining trailing pixels in the scanline */ 930 /* Process the remaining trailing pixels in the scanline */
887 process_trailing_pixels 0, 0, \ 931 process_trailing_pixels 0, 0, \
888 process_pixblock_head, \ 932 process_pixblock_head, \
889 process_pixblock_tail, \ 933 process_pixblock_tail, \
890 process_pixblock_tail_head 934 process_pixblock_tail_head
891 advance_to_next_scanline 8b 935 advance_to_next_scanline 800b
8929: 9369:
893.if regs_shortage
894 pop {r0, r1}
895.endif
896 cleanup 937 cleanup
897 pop {r4-r12, pc} /* exit */ 938 /* pop all registers */
939 sub x29, x29, 64
940 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
941 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
942 ldp x8, x9, [x29, -80]
943 ldp x10, x11, [x29, -96]
944 ldp x12, x13, [x29, -112]
945 ldp x14, x15, [x29, -128]
946 ldp x16, x17, [x29, -144]
947 ldp x18, x19, [x29, -160]
948 ldp x20, x21, [x29, -176]
949 ldp x22, x23, [x29, -192]
950 ldp x24, x25, [x29, -208]
951 ldp x26, x27, [x29, -224]
952 ldr x28, [x29, -232]
953 mov sp, x29
954 ldp x29, x30, [sp], 16
955 ret /* exit */
898 956
899 .purgem fetch_src_pixblock 957 .purgem fetch_src_pixblock
900 .purgem pixld_src 958 .purgem pixld_src
901 959
902 .unreq SRC 960 .unreq SRC
@@ -938,12 +996,12 @@ local skip1
938 dst_r_basereg_ = 4, \ 996 dst_r_basereg_ = 4, \
939 src_basereg_ = 0, \ 997 src_basereg_ = 0, \
940 mask_basereg_ = 24 998 mask_basereg_ = 24
941 999
942 pixman_asm_function fname 1000 pixman_asm_function fname
943
944 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 1001 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
1002
945/* 1003/*
946 * Make some macro arguments globally visible and accessible 1004 * Make some macro arguments globally visible and accessible
947 * from other macros 1005 * from other macros
948 */ 1006 */
949 .set src_bpp, src_bpp_ 1007 .set src_bpp, src_bpp_
@@ -952,49 +1010,67 @@ local skip1
952 .set pixblock_size, pixblock_size_ 1010 .set pixblock_size, pixblock_size_
953 .set dst_w_basereg, dst_w_basereg_ 1011 .set dst_w_basereg, dst_w_basereg_
954 .set dst_r_basereg, dst_r_basereg_ 1012 .set dst_r_basereg, dst_r_basereg_
955 .set src_basereg, src_basereg_ 1013 .set src_basereg, src_basereg_
956 .set mask_basereg, mask_basereg_ 1014 .set mask_basereg, mask_basereg_
957 1015
958.if use_nearest_scaling != 0 1016.if use_nearest_scaling != 0
959 /* 1017 /*
960 * Assign symbolic names to registers for nearest scaling 1018 * Assign symbolic names to registers for nearest scaling
961 */ 1019 */
962 W .req r0 1020 W .req x0
963 DST_W .req r1 1021 DST_W .req x1
964 SRC .req r2 1022 SRC .req x2
965 VX .req r3 1023 VX .req x3
966 UNIT_X .req ip 1024 UNIT_X .req x4
967 MASK .req lr 1025 SRC_WIDTH_FIXED .req x5
968 TMP1 .req r4 1026 MASK .req x6
969 TMP2 .req r5 1027 TMP1 .req x8
970 DST_R .req r6 1028 TMP2 .req x9
971 SRC_WIDTH_FIXED .req r7 1029 DST_R .req x10
1030 DUMMY .req x30
972 1031
973 .macro pixld_src x:vararg 1032 .macro pixld_src x:vararg
974 pixld_s x 1033 pixld_s x
975 .endm 1034 .endm
976 1035
977 ldr UNIT_X, [sp] 1036 sxtw x0, w0
978 push {r4-r8, lr} 1037 sxtw x3, w3
979 ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] 1038 sxtw x4, w4
980 .if mask_bpp != 0 1039 sxtw x5, w5
981 ldr MASK, [sp, #(24 + 8)] 1040
982 .endif 1041 stp x29, x30, [sp, -16]!
1042 mov x29, sp
1043 sub sp, sp, 88
1044 sub x29, x29, 64
1045 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
1046 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
1047 stp x8, x9, [x29, -80]
1048 str x10, [x29, -88]
983.else 1049.else
984 /* 1050 /*
985 * Assign symbolic names to registers 1051 * Assign symbolic names to registers
986 */ 1052 */
987 W .req r0 /* width (is updated during processing) */ 1053 W .req x0 /* width (is updated during processing) */
988 DST_W .req r1 /* destination buffer pointer for writes */ 1054 DST_W .req x1 /* destination buffer pointer for writes */
989 SRC .req r2 /* source buffer pointer */ 1055 SRC .req x2 /* source buffer pointer */
990 DST_R .req ip /* destination buffer pointer for reads */ 1056 MASK .req x3 /* mask pointer */
991 MASK .req r3 /* mask pointer */ 1057 DST_R .req x4 /* destination buffer pointer for reads */
1058 DUMMY .req x30
992 1059
993 .macro pixld_src x:vararg 1060 .macro pixld_src x:vararg
994 pixld x 1061 pixld x
995 .endm 1062 .endm
1063
1064 sxtw x0, w0
1065
1066 stp x29, x30, [sp, -16]!
1067 mov x29, sp
1068 sub sp, sp, 64
1069 sub x29, x29, 64
1070 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
1071 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
996.endif 1072.endif
997 1073
998.if (((flags) & FLAG_DST_READWRITE) != 0) 1074.if (((flags) & FLAG_DST_READWRITE) != 0)
999 .set dst_r_bpp, dst_w_bpp 1075 .set dst_r_bpp, dst_w_bpp
1000.else 1076.else
@@ -1013,61 +1089,80 @@ local skip1
1013 1089
1014 init 1090 init
1015 mov DST_R, DST_W 1091 mov DST_R, DST_W
1016 1092
1017 cmp W, #pixblock_size 1093 cmp W, #pixblock_size
1018 blt 8f 1094 blt 800f
1019 1095
1020 ensure_destination_ptr_alignment process_pixblock_head, \ 1096 ensure_destination_ptr_alignment process_pixblock_head, \
1021 process_pixblock_tail, \ 1097 process_pixblock_tail, \
1022 process_pixblock_tail_head 1098 process_pixblock_tail_head
1023 1099
1024 subs W, W, #pixblock_size 1100 subs W, W, #pixblock_size
1025 blt 7f 1101 blt 700f
1026 1102
1027 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ 1103 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
1028 pixld_a pixblock_size, dst_r_bpp, \ 1104 pixld_a pixblock_size, dst_r_bpp, \
1029 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 1105 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
1030 fetch_src_pixblock 1106 fetch_src_pixblock
1031 pixld pixblock_size, mask_bpp, \ 1107 pixld pixblock_size, mask_bpp, \
1032 (mask_basereg - pixblock_size * mask_bpp / 64), MASK 1108 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
1033 process_pixblock_head 1109 process_pixblock_head
1034 subs W, W, #pixblock_size 1110 subs W, W, #pixblock_size
1035 blt 2f 1111 blt 200f
10361: 1112100:
1037 process_pixblock_tail_head 1113 process_pixblock_tail_head
1038 subs W, W, #pixblock_size 1114 subs W, W, #pixblock_size
1039 bge 1b 1115 bge 100b
10402: 1116200:
1041 process_pixblock_tail 1117 process_pixblock_tail
1042 pixst_a pixblock_size, dst_w_bpp, \ 1118 pixst_a pixblock_size, dst_w_bpp, \
1043 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 1119 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
10447: 1120700:
1045 /* Process the remaining trailing pixels in the scanline (dst aligned) */ 1121 /* Process the remaining trailing pixels in the scanline (dst aligned) */
1046 process_trailing_pixels 0, 1, \ 1122 process_trailing_pixels 0, 1, \
1047 process_pixblock_head, \ 1123 process_pixblock_head, \
1048 process_pixblock_tail, \ 1124 process_pixblock_tail, \
1049 process_pixblock_tail_head 1125 process_pixblock_tail_head
1050 1126
1051 cleanup 1127 cleanup
1052.if use_nearest_scaling != 0 1128.if use_nearest_scaling != 0
1053 pop {r4-r8, pc} /* exit */ 1129 sub x29, x29, 64
1130 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
1131 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
1132 ldp x8, x9, [x29, -80]
1133 ldr x10, [x29, -88]
1134 mov sp, x29
1135 ldp x29, x30, [sp], 16
1136 ret /* exit */
1054.else 1137.else
1055 bx lr /* exit */ 1138 sub x29, x29, 64
1056.endif 1139 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
10578: 1140 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
1141 mov sp, x29
1142 ldp x29, x30, [sp], 16
1143 ret /* exit */
1144.endif
1145800:
1058 /* Process the remaining trailing pixels in the scanline (dst unaligned) */ 1146 /* Process the remaining trailing pixels in the scanline (dst unaligned) */
1059 process_trailing_pixels 0, 0, \ 1147 process_trailing_pixels 0, 0, \
1060 process_pixblock_head, \ 1148 process_pixblock_head, \
1061 process_pixblock_tail, \ 1149 process_pixblock_tail, \
1062 process_pixblock_tail_head 1150 process_pixblock_tail_head
1063 1151
1064 cleanup 1152 cleanup
1065
1066.if use_nearest_scaling != 0 1153.if use_nearest_scaling != 0
1067 pop {r4-r8, pc} /* exit */ 1154 sub x29, x29, 64
1155 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
1156 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
1157 ldp x8, x9, [x29, -80]
1158 ldr x10, [x29, -88]
1159 mov sp, x29
1160 ldp x29, x30, [sp], 16
1161 ret /* exit */
1068 1162
1163 .unreq DUMMY
1069 .unreq DST_R 1164 .unreq DST_R
1070 .unreq SRC 1165 .unreq SRC
1071 .unreq W 1166 .unreq W
1072 .unreq VX 1167 .unreq VX
1073 .unreq UNIT_X 1168 .unreq UNIT_X
@@ -1076,12 +1171,18 @@ local skip1
1076 .unreq DST_W 1171 .unreq DST_W
1077 .unreq MASK 1172 .unreq MASK
1078 .unreq SRC_WIDTH_FIXED 1173 .unreq SRC_WIDTH_FIXED
1079 1174
1080.else 1175.else
1081 bx lr /* exit */ 1176 sub x29, x29, 64
1177 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
1178 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
1179 mov sp, x29
1180 ldp x29, x30, [sp], 16
1181 ret /* exit */
1082 1182
1183 .unreq DUMMY
1083 .unreq SRC 1184 .unreq SRC
1084 .unreq MASK 1185 .unreq MASK
1085 .unreq DST_R 1186 .unreq DST_R
1086 .unreq DST_W 1187 .unreq DST_W
1087 .unreq W 1188 .unreq W
@@ -1108,21 +1209,19 @@ local skip1
1108 1209
1109.macro default_cleanup 1210.macro default_cleanup
1110.endm 1211.endm
1111 1212
1112/* 1213/*
1113 * Prologue/epilogue variant which additionally saves/restores d8-d15 1214 * Prologue/epilogue variant which additionally saves/restores v8-v15
1114 * registers (they need to be saved/restored by callee according to ABI). 1215 * registers (they need to be saved/restored by callee according to ABI).
1115 * This is required if the code needs to use all the NEON registers. 1216 * This is required if the code needs to use all the NEON registers.
1116 */ 1217 */
1117 1218
1118.macro default_init_need_all_regs 1219.macro default_init_need_all_regs
1119 vpush {d8-d15}
1120.endm 1220.endm
1121 1221
1122.macro default_cleanup_need_all_regs 1222.macro default_cleanup_need_all_regs
1123 vpop {d8-d15}
1124.endm 1223.endm
1125 1224
1126/******************************************************************************/ 1225/******************************************************************************/
1127 1226
1128/* 1227/*
@@ -1132,53 +1231,58 @@ local skip1
1132 * 1231 *
1133 * Warning: the conversion is destructive and the original 1232 * Warning: the conversion is destructive and the original
1134 * value (in) is lost. 1233 * value (in) is lost.
1135 */ 1234 */
1136.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b 1235.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
1137 vshrn.u16 out_r, in, #8 1236 shrn &out_r&.8b, &in&.8h, #8
1138 vshrn.u16 out_g, in, #3 1237 shrn &out_g&.8b, &in&.8h, #3
1139 vsli.u16 in, in, #5 1238 sli &in&.8h, &in&.8h, #5
1140 vmov.u8 out_a, #255 1239 movi &out_a&.8b, #255
1141 vsri.u8 out_r, out_r, #5 1240 sri &out_r&.8b, &out_r&.8b, #5
1142 vsri.u8 out_g, out_g, #6 1241 sri &out_g&.8b, &out_g&.8b, #6
1143 vshrn.u16 out_b, in, #2 1242 shrn &out_b&.8b, &in&.8h, #2
1144.endm 1243.endm
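The shrn/sli/sri sequence above is the usual 5/6-bit-to-8-bit expansion with bit replication: the top bits of each channel are copied into the freshly created low bits so that 0x1f maps to 0xff rather than 0xf8. A scalar C reference of the same expansion, given here only as an illustration (it is not code taken from pixman):

    #include <stdint.h>

    /* Expand one r5g6b5 pixel to a8r8g8b8, replicating the top bits into
     * the low bits exactly as the sri instructions above do. */
    static inline uint32_t expand_0565_to_8888 (uint16_t p)
    {
        uint32_t r = (p >> 11) & 0x1f;
        uint32_t g = (p >> 5)  & 0x3f;
        uint32_t b =  p        & 0x1f;

        r = (r << 3) | (r >> 2);   /* rrrrr    -> rrrrrrrr */
        g = (g << 2) | (g >> 4);   /* gggggg   -> gggggggg */
        b = (b << 3) | (b >> 2);   /* bbbbb    -> bbbbbbbb */

        return 0xff000000u | (r << 16) | (g << 8) | b;
    }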
1145 1244
1146.macro convert_0565_to_x888 in, out_r, out_g, out_b 1245.macro convert_0565_to_x888 in, out_r, out_g, out_b
1147 vshrn.u16 out_r, in, #8 1246 shrn &out_r&.8b, &in&.8h, #8
1148 vshrn.u16 out_g, in, #3 1247 shrn &out_g&.8b, &in&.8h, #3
1149 vsli.u16 in, in, #5 1248 sli &in&.8h, &in&.8h, #5
1150 vsri.u8 out_r, out_r, #5 1249 sri &out_r&.8b, &out_r&.8b, #5
1151 vsri.u8 out_g, out_g, #6 1250 sri &out_g&.8b, &out_g&.8b, #6
1152 vshrn.u16 out_b, in, #2 1251 shrn &out_b&.8b, &in&.8h, #2
1153.endm 1252.endm
1154 1253
1155/* 1254/*
1156 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components 1255 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
1157 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5 1256 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
1158 * pixels packed in 128-bit register (out). Requires two temporary 128-bit 1257 * pixels packed in 128-bit register (out). Requires two temporary 128-bit
1159 * registers (tmp1, tmp2) 1258 * registers (tmp1, tmp2)
1160 */ 1259 */
1161.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 1260.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
1162 vshll.u8 tmp1, in_g, #8 1261 ushll &tmp1&.8h, &in_g&.8b, #7
1163 vshll.u8 out, in_r, #8 1262 shl &tmp1&.8h, &tmp1&.8h, #1
1164 vshll.u8 tmp2, in_b, #8 1263 ushll &out&.8h, &in_r&.8b, #7
1165 vsri.u16 out, tmp1, #5 1264 shl &out&.8h, &out&.8h, #1
1166 vsri.u16 out, tmp2, #11 1265 ushll &tmp2&.8h, &in_b&.8b, #7
1266 shl &tmp2&.8h, &tmp2&.8h, #1
1267 sri &out&.8h, &tmp1&.8h, #5
1268 sri &out&.8h, &tmp2&.8h, #11
1167.endm 1269.endm
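Going the other way, the widen/insert sequence keeps only the top 5, 6 and 5 bits of r, g and b and packs them into one 16-bit lane; the ushll #7 / shl #1 pairs stand in for a single left shift by 8, which the ushll immediate encoding for byte elements does not accept. A scalar sketch of the packing, for illustration only:

    #include <stdint.h>

    /* Pack 8-bit r, g, b into r5g6b5, keeping the most significant bits,
     * matching the sri #5 / sri #11 insertions above. */
    static inline uint16_t pack_8888_to_0565 (uint8_t r, uint8_t g, uint8_t b)
    {
        return (uint16_t) (((r & 0xf8) << 8) |   /* top 5 bits of r -> bits 15..11 */
                           ((g & 0xfc) << 3) |   /* top 6 bits of g -> bits 10..5  */
                            (b >> 3));           /* top 5 bits of b -> bits 4..0   */
    }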
1168 1270
1169/* 1271/*
1170 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels 1272 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
1171 * returned in (out0, out1) register pair. Requires one temporary 1273 * returned in (out0, out1) register pair. Requires one temporary
1172 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original 1274 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
1173 * value from 'in' is lost 1275 * value from 'in' is lost
1174 */ 1276 */
1175.macro convert_four_0565_to_x888_packed in, out0, out1, tmp 1277.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
1176 vshl.u16 out0, in, #5 /* G top 6 bits */ 1278 shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */
1177 vshl.u16 tmp, in, #11 /* B top 5 bits */ 1279 shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */
1178 vsri.u16 in, in, #5 /* R is ready in top bits */ 1280 sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */
1179 vsri.u16 out0, out0, #6 /* G is ready in top bits */ 1281 sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */
1180 vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ 1282 sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */
1181 vshr.u16 out1, in, #8 /* R is in place */ 1283 ushr &out1&.4h, &in&.4h, #8 /* R is in place */
1182 vsri.u16 out0, tmp, #8 /* G & B is in place */ 1284 sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */
1183 vzip.u16 out0, out1 /* everything is in place */ 1285 zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */
1286 zip2 &out1&.4h, &out0&.4h, &out1&.4h
1287 mov &out0&.d[0], &tmp&.d[0]
1184.endm 1288.endm
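On 32-bit ARM, vzip.u16 interleaved the two D registers in place; AArch64 has no such two-register form, so the macro now builds the interleave with zip1/zip2 through the temporary and copies the low half back with mov. A hedged C model of the zip1/zip2 lane semantics on 4 x 16-bit vectors (illustrative helpers, not pixman code):

    #include <stdint.h>

    /* zip1: interleave the low halves of a and b; zip2: the high halves. */
    static void zip1_4h (uint16_t d[4], const uint16_t a[4], const uint16_t b[4])
    {
        d[0] = a[0]; d[1] = b[0]; d[2] = a[1]; d[3] = b[1];
    }

    static void zip2_4h (uint16_t d[4], const uint16_t a[4], const uint16_t b[4])
    {
        d[0] = a[2]; d[1] = b[2]; d[2] = a[3]; d[3] = b[3];
    }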
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 73a5414..81e0f23 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -605,10 +605,15 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback);
605#ifdef USE_ARM_NEON 605#ifdef USE_ARM_NEON
606pixman_implementation_t * 606pixman_implementation_t *
607_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); 607_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
608#endif 608#endif
609 609
610#ifdef USE_ARM_A64_NEON
611pixman_implementation_t *
612_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
613#endif
614
610#ifdef USE_MIPS_DSPR2 615#ifdef USE_MIPS_DSPR2
611pixman_implementation_t * 616pixman_implementation_t *
612_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback); 617_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
613#endif 618#endif
614 619