 configure.ac                                          |   34
 pixman/Makefile.am                                    |   15
 pixman/pixman-arm-neon.c                              |    4
 pixman/pixman-arm.c (mode changed: 100644 -> 100755)  |    6
 pixman/pixman-arma64-neon-asm-bilinear.S              | 1203
 pixman/pixman-arma64-neon-asm.S                       | 3341
 pixman/pixman-arma64-neon-asm.h                       |  694
 pixman/pixman-private.h                               |    5
 8 files changed, 2778 insertions(+), 2524 deletions(-)
diff --git a/configure.ac b/configure.ac
index 6b2134e..bb0192a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -665,10 +665,44 @@ AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
 AC_MSG_RESULT($have_arm_neon)
 if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
    AC_MSG_ERROR([ARM NEON intrinsics not detected])
 fi
 
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports ARM A64 NEON instructions
+have_arm_a64_neon=no
+AC_MSG_CHECKING(whether to use ARM A64 NEON assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.arch armv8-a
+.altmacro
+prfm pldl2strm, [x0]
+xtn v0.8b, v0.8h]])], have_arm_a64_neon=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-a64-neon,
+   [AC_HELP_STRING([--disable-arm-a64-neon],
+                   [disable ARM A64 NEON fast paths])],
+   [enable_arm_a64_neon=$enableval], [enable_arm_a64_neon=auto])
+
+if test $enable_arm_a64_neon = no ; then
+   have_arm_a64_neon=disabled
+fi
+
+if test $have_arm_a64_neon = yes ; then
+   AC_DEFINE(USE_ARM_A64_NEON, 1, [use ARM A64 NEON assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_A64_NEON, test $have_arm_a64_neon = yes)
+
+AC_MSG_RESULT($have_arm_a64_neon)
+if test $enable_arm_a64_neon = yes && test $have_arm_a64_neon = no ; then
+   AC_MSG_ERROR([ARM A64 NEON intrinsics not detected])
+fi
+
 dnl ===========================================================================
 dnl Check for IWMMXT
 
 AC_ARG_ENABLE(arm-iwmmxt,
    [AC_HELP_STRING([--disable-arm-iwmmxt],
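
The probe above feeds a five-line assembly fragment through the C compiler driver ("-x assembler-with-cpp"), so it exercises exactly the toolchain path the new .S sources will use; the fast paths are only enabled when gas accepts the A64-only prfm and xtn mnemonics. A rough standalone equivalent, written here as a hedged C inline-assembly sketch (the function name is illustrative, not part of the patch):

    /* Compiles only on an AArch64 toolchain whose assembler
     * understands these A64 NEON mnemonics. */
    int probe_a64_neon (void)
    {
        __asm__ volatile ("prfm pldl2strm, [%0]\n\t"
                          "xtn  v0.8b, v0.8h"
                          :: "r" (&probe_a64_neon)
                          : "v0", "memory");
        return 1;
    }
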
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 581b6f6..9229e78 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -92,10 +92,25 @@ libpixman_arm_neon_la_SOURCES = \
 libpixman_1_la_LIBADD += libpixman-arm-neon.la
 
 ASM_CFLAGS_arm_neon=
 endif
 
+# arm a64 neon code
+if USE_ARM_A64_NEON
+noinst_LTLIBRARIES += libpixman-arma64-neon.la
+libpixman_arma64_neon_la_SOURCES = \
+	pixman-arm-neon.c \
+	pixman-arm-common.h \
+	pixman-arma64-neon-asm.S \
+	pixman-arma64-neon-asm-bilinear.S \
+	pixman-arm-asm.h \
+	pixman-arma64-neon-asm.h
+libpixman_1_la_LIBADD += libpixman-arma64-neon.la
+
+ASM_CFLAGS_arm_neon=
+endif
+
 # iwmmxt code
 if USE_ARM_IWMMXT
 libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
 noinst_LTLIBRARIES += libpixman-iwmmxt.la
 libpixman_1_la_LIBADD += libpixman-iwmmxt.la
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index be761c9..28e13d1 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -192,12 +192,12 @@ arm_neon_fill (pixman_implementation_t *imp,
                int width,
                int height,
                uint32_t _xor)
 {
     /* stride is always multiple of 32bit units in pixman */
-    uint32_t byte_stride = stride * sizeof(uint32_t);
+    int32_t byte_stride = stride * sizeof(uint32_t);
 
     switch (bpp)
     {
     case 8:
 	pixman_composite_src_n_8_asm_neon (
 	    width,
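
The one-character type change above is easy to miss but matters on a 64-bit target: pixman strides are signed (bottom-up images use negative strides), and an unsigned 32-bit byte_stride zero-extends when it enters 64-bit pointer arithmetic. A hedged illustration of the failure mode (names are ours, not pixman's):

    #include <stdint.h>

    void stride_demo (char *base)
    {
        int      stride = -256;  /* bottom-up image: negative row stride   */
        uint32_t bad  = stride * sizeof (uint32_t);  /* wraps: 4294966272  */
        int32_t  good = stride * sizeof (uint32_t);  /* truncates: -1024   */
        char *p = base + bad;   /* zero-extends on AArch64: base + ~4 GiB  */
        char *q = base + good;  /* sign-extends: base - 1024, as intended  */
        (void) p; (void) q;
    }
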
diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
old mode 100644
new mode 100755
index 23374e4..734cbea
--- a/pixman/pixman-arm.c
+++ b/pixman/pixman-arm.c
@@ -219,7 +219,13 @@ _pixman_arm_get_implementations (pixman_implementation_t *imp)
 #ifdef USE_ARM_NEON
     if (!_pixman_disabled ("arm-neon") && have_feature (ARM_NEON))
 	imp = _pixman_implementation_create_arm_neon (imp);
 #endif
 
+#ifdef USE_ARM_A64_NEON
+    /* NEON is a mandatory part of AArch64, so no runtime check is needed */
+    if (!_pixman_disabled ("arm-neon"))
+	imp = _pixman_implementation_create_arm_neon (imp);
+#endif
+
     return imp;
 }
diff --git a/pixman/pixman-arma64-neon-asm-bilinear.S b/pixman/pixman-arma64-neon-asm-bilinear.S
index a7d94c3..41ee753 100644
--- a/pixman/pixman-arma64-neon-asm-bilinear.S
+++ b/pixman/pixman-arma64-neon-asm-bilinear.S
@@ -53,22 +53,17 @@
 #if defined(__linux__) && defined (__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
 
 .text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.eabi_attribute 10, 0
-.eabi_attribute 12, 0
-.arm
+.arch armv8-a
 .altmacro
 .p2align 2
 
 #include "pixman-private.h"
 #include "pixman-arm-asm.h"
-#include "pixman-arm-neon-asm.h"
+#include "pixman-arma64-neon-asm.h"
 
 /*
  * Bilinear macros from pixman-arm-neon-asm.S
  */
 
@@ -77,35 +72,39 @@
  * format conversion, and interpolation as separate macros which can be used
  * as the basic building blocks for constructing bilinear scanline functions.
  */
 
 .macro bilinear_load_8888 reg1, reg2, tmp
-    asr       TMP1, X, #16
+    asr       WTMP1, X, #16
     add       X, X, UX
-    add       TMP1, TOP, TMP1, lsl #2
-    vld1.32   {reg1}, [TMP1], STRIDE
-    vld1.32   {reg2}, [TMP1]
+    lsl       TMP2, TMP1, #2
+    add       TMP1, TOP, TMP2
+    ld1       {&reg1&.2s}, [TMP1], STRIDE
+    ld1       {&reg2&.2s}, [TMP1]
 .endm
 
 .macro bilinear_load_0565 reg1, reg2, tmp
-    asr       TMP1, X, #16
+    asr       WTMP1, X, #16
     add       X, X, UX
-    add       TMP1, TOP, TMP1, lsl #1
-    vld1.32   {reg2[0]}, [TMP1], STRIDE
-    vld1.32   {reg2[1]}, [TMP1]
+    lsl       TMP2, TMP1, #1
+    add       TMP1, TOP, TMP2
+    ld1       {&reg2&.s}[0], [TMP1], STRIDE
+    ld1       {&reg2&.s}[1], [TMP1]
     convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_two_8888 \
                 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
 
     bilinear_load_8888 reg1, reg2, tmp1
-    vmull.u8  acc1, reg1, d28
-    vmlal.u8  acc1, reg2, d29
+    umull     &tmp1&.8h, &reg1&.8b, v28.8b
+    umlal     &tmp1&.8h, &reg2&.8b, v29.8b
+    mov       &acc1&.16b, &tmp1&.16b
     bilinear_load_8888 reg3, reg4, tmp2
-    vmull.u8  acc2, reg3, d28
-    vmlal.u8  acc2, reg4, d29
+    umull     &tmp2&.8h, &reg3&.8b, v28.8b
+    umlal     &tmp2&.8h, &reg4&.8b, v29.8b
+    mov       &acc2&.16b, &tmp2&.16b
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_four_8888 \
                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
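
Throughout the port each vmull.u8/vmlal.u8 pair becomes umull/umlal plus a mov, because A64 macro arguments need explicit arrangement suffixes and a 128-bit accumulator no longer aliases two D registers. The vertical step itself is unchanged; in hypothetical intrinsics form (v28/v29 hold the top/bottom row weights):

    #include <arm_neon.h>

    /* acc = top * wt + bottom * wb, widened to 16 bits per channel */
    static inline uint16x8_t
    vertical_interp (uint8x8_t top, uint8x8_t bottom,
                     uint8x8_t wt, uint8x8_t wb)
    {
        uint16x8_t acc = vmull_u8 (top, wt);
        return vmlal_u8 (acc, bottom, wb);
    }
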
@@ -114,126 +113,149 @@
                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
     bilinear_load_and_vertical_interpolate_two_8888 \
                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
 .endm
 
+.macro vzip reg1, reg2
+    zip1      v24.8b, reg1, reg2
+    zip2      reg2, reg1, reg2
+    mov       reg1, v24.8b
+.endm
+
+.macro vuzp reg1, reg2
+    uzp1      v24.8b, reg1, reg2
+    uzp2      reg2, reg1, reg2
+    mov       reg1, v24.8b
+.endm
+
 .macro bilinear_load_and_vertical_interpolate_two_0565 \
                 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
-
-    asr       TMP1, X, #16
+    asr       WTMP1, X, #16
     add       X, X, UX
-    add       TMP1, TOP, TMP1, lsl #1
-    asr       TMP2, X, #16
+    lsl       TMP2, TMP1, #1
+    add       TMP1, TOP, TMP2
+    asr       WTMP2, X, #16
     add       X, X, UX
-    add       TMP2, TOP, TMP2, lsl #1
-    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
-    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
-    vld1.32   {acc2lo[1]}, [TMP1]
-    vld1.32   {acc2hi[1]}, [TMP2]
+    lsl       TMP3, TMP2, #1
+    add       TMP2, TOP, TMP3
+    ld1       {&acc2lo&.s}[0], [TMP1], STRIDE
+    ld1       {&acc2hi&.s}[0], [TMP2], STRIDE
+    ld1       {&acc2lo&.s}[1], [TMP1]
+    ld1       {&acc2hi&.s}[1], [TMP2]
+    mov       &acc2&.d[0], &acc2lo&.d[0]
+    mov       &acc2&.d[1], &acc2hi&.d[0]
     convert_0565_to_x888 acc2, reg3, reg2, reg1
-    vzip.u8   reg1, reg3
-    vzip.u8   reg2, reg4
-    vzip.u8   reg3, reg4
-    vzip.u8   reg1, reg2
-    vmull.u8  acc1, reg1, d28
-    vmlal.u8  acc1, reg2, d29
-    vmull.u8  acc2, reg3, d28
-    vmlal.u8  acc2, reg4, d29
+    vzip      &reg1&.8b, &reg3&.8b
+    vzip      &reg2&.8b, &reg4&.8b
+    vzip      &reg3&.8b, &reg4&.8b
+    vzip      &reg1&.8b, &reg2&.8b
+    umull     &acc1&.8h, &reg1&.8b, v28.8b
+    umlal     &acc1&.8h, &reg2&.8b, v29.8b
+    umull     &acc2&.8h, &reg3&.8b, v28.8b
+    umlal     &acc2&.8h, &reg4&.8b, v29.8b
 .endm
 
 .macro bilinear_load_and_vertical_interpolate_four_0565 \
                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
 
-    asr       TMP1, X, #16
+    asr       WTMP1, X, #16
     add       X, X, UX
-    add       TMP1, TOP, TMP1, lsl #1
-    asr       TMP2, X, #16
+    lsl       TMP2, TMP1, #1
+    add       TMP1, TOP, TMP2
+    asr       WTMP2, X, #16
     add       X, X, UX
-    add       TMP2, TOP, TMP2, lsl #1
-    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
-    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
-    vld1.32   {xacc2lo[1]}, [TMP1]
-    vld1.32   {xacc2hi[1]}, [TMP2]
+    lsl       TMP3, TMP2, #1
+    add       TMP2, TOP, TMP3
+    ld1       {&xacc2lo&.s}[0], [TMP1], STRIDE
+    ld1       {&xacc2hi&.s}[0], [TMP2], STRIDE
+    ld1       {&xacc2lo&.s}[1], [TMP1]
+    ld1       {&xacc2hi&.s}[1], [TMP2]
+    mov       &xacc2&.d[0], &xacc2lo&.d[0]
+    mov       &xacc2&.d[1], &xacc2hi&.d[0]
     convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
-    asr       TMP1, X, #16
+    asr       WTMP1, X, #16
     add       X, X, UX
-    add       TMP1, TOP, TMP1, lsl #1
-    asr       TMP2, X, #16
+    lsl       TMP2, TMP1, #1
+    add       TMP1, TOP, TMP2
+    asr       WTMP2, X, #16
     add       X, X, UX
-    add       TMP2, TOP, TMP2, lsl #1
-    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
-    vzip.u8   xreg1, xreg3
-    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
-    vzip.u8   xreg2, xreg4
-    vld1.32   {yacc2lo[1]}, [TMP1]
-    vzip.u8   xreg3, xreg4
-    vld1.32   {yacc2hi[1]}, [TMP2]
-    vzip.u8   xreg1, xreg2
+    lsl       TMP3, TMP2, #1
+    add       TMP2, TOP, TMP3
+    ld1       {&yacc2lo&.s}[0], [TMP1], STRIDE
+    vzip      &xreg1&.8b, &xreg3&.8b
+    ld1       {&yacc2hi&.s}[0], [TMP2], STRIDE
+    vzip      &xreg2&.8b, &xreg4&.8b
+    ld1       {&yacc2lo&.s}[1], [TMP1]
+    vzip      &xreg3&.8b, &xreg4&.8b
+    ld1       {&yacc2hi&.s}[1], [TMP2]
+    vzip      &xreg1&.8b, &xreg2&.8b
+    mov       &yacc2&.d[0], &yacc2lo&.d[0]
+    mov       &yacc2&.d[1], &yacc2hi&.d[0]
     convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
-    vmull.u8  xacc1, xreg1, d28
-    vzip.u8   yreg1, yreg3
-    vmlal.u8  xacc1, xreg2, d29
-    vzip.u8   yreg2, yreg4
-    vmull.u8  xacc2, xreg3, d28
-    vzip.u8   yreg3, yreg4
-    vmlal.u8  xacc2, xreg4, d29
-    vzip.u8   yreg1, yreg2
-    vmull.u8  yacc1, yreg1, d28
-    vmlal.u8  yacc1, yreg2, d29
-    vmull.u8  yacc2, yreg3, d28
-    vmlal.u8  yacc2, yreg4, d29
+    umull     &xacc1&.8h, &xreg1&.8b, v28.8b
+    vzip      &yreg1&.8b, &yreg3&.8b
+    umlal     &xacc1&.8h, &xreg2&.8b, v29.8b
+    vzip      &yreg2&.8b, &yreg4&.8b
+    umull     &xacc2&.8h, &xreg3&.8b, v28.8b
+    vzip      &yreg3&.8b, &yreg4&.8b
+    umlal     &xacc2&.8h, &xreg4&.8b, v29.8b
+    vzip      &yreg1&.8b, &yreg2&.8b
+    umull     &yacc1&.8h, &yreg1&.8b, v28.8b
+    umlal     &yacc1&.8h, &yreg2&.8b, v29.8b
+    umull     &yacc2&.8h, &yreg3&.8b, v28.8b
+    umlal     &yacc2&.8h, &yreg4&.8b, v29.8b
 .endm
 
 .macro bilinear_store_8888 numpix, tmp1, tmp2
 .if numpix == 4
-    vst1.32   {d0, d1}, [OUT]!
+    st1       {v0.2s, v1.2s}, [OUT], #16
 .elseif numpix == 2
-    vst1.32   {d0}, [OUT]!
+    st1       {v0.2s}, [OUT], #8
 .elseif numpix == 1
-    vst1.32   {d0[0]}, [OUT, :32]!
+    st1       {v0.s}[0], [OUT], #4
 .else
     .error bilinear_store_8888 numpix is unsupported
 .endif
 .endm
 
 .macro bilinear_store_0565 numpix, tmp1, tmp2
-    vuzp.u8   d0, d1
-    vuzp.u8   d2, d3
-    vuzp.u8   d1, d3
-    vuzp.u8   d0, d2
-    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+    vuzp      v0.8b, v1.8b
+    vuzp      v2.8b, v3.8b
+    vuzp      v1.8b, v3.8b
+    vuzp      v0.8b, v2.8b
+    convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
 .if numpix == 4
-    vst1.16   {d2}, [OUT]!
+    st1       {v1.4h}, [OUT], #8
 .elseif numpix == 2
-    vst1.32   {d2[0]}, [OUT]!
+    st1       {v1.s}[0], [OUT], #4
 .elseif numpix == 1
-    vst1.16   {d2[0]}, [OUT]!
+    st1       {v1.h}[0], [OUT], #2
 .else
     .error bilinear_store_0565 numpix is unsupported
 .endif
 .endm
 
 
 /*
  * Macros for loading mask pixels into register 'mask'.
- * vdup must be done in somewhere else.
+ * dup must be done somewhere else.
  */
 .macro bilinear_load_mask_x numpix, mask
 .endm
 
 .macro bilinear_load_mask_8 numpix, mask
 .if numpix == 4
-    vld1.32   {mask[0]}, [MASK]!
+    ld1       {&mask&.s}[0], [MASK], #4
 .elseif numpix == 2
-    vld1.16   {mask[0]}, [MASK]!
+    ld1       {&mask&.h}[0], [MASK], #2
 .elseif numpix == 1
-    vld1.8    {mask[0]}, [MASK]!
+    ld1       {&mask&.b}[0], [MASK], #1
 .else
     .error bilinear_load_mask_8 numpix is unsupported
 .endif
-    pld       [MASK, #prefetch_offset]
+    prfm      PLDL2STRM, [MASK, #prefetch_offset]
 .endm
 
 .macro bilinear_load_mask mask_fmt, numpix, mask
     bilinear_load_mask_&mask_fmt numpix, mask
 .endm
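
AArch64 dropped the ARMv7 VZIP/VUZP forms that permuted both source registers in place, so the new vzip/vuzp macros rebuild that behaviour from zip1/zip2 and uzp1/uzp2 through the v24 scratch register; callers must therefore treat v24 as reserved. The same trick in hedged intrinsics form:

    #include <arm_neon.h>

    /* Emulate ARMv7 "vzip.8 da, db": both inputs updated in place. */
    static inline void
    vzip_emulated (uint8x8_t *a, uint8x8_t *b)
    {
        uint8x8_t lo = vzip1_u8 (*a, *b);  /* interleave low halves  */
        uint8x8_t hi = vzip2_u8 (*a, *b);  /* interleave high halves */
        *a = lo;
        *b = hi;
    }
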
@@ -249,19 +271,21 @@
 .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
 .endm
 
 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
 .if numpix == 4
-    vld1.32   {dst0, dst1}, [OUT]
+    ld1       {&dst0&.2s, &dst1&.2s}, [OUT]
 .elseif numpix == 2
-    vld1.32   {dst0}, [OUT]
+    ld1       {&dst0&.2s}, [OUT]
 .elseif numpix == 1
-    vld1.32   {dst0[0]}, [OUT]
+    ld1       {&dst0&.s}[0], [OUT]
 .else
     .error bilinear_load_dst_8888 numpix is unsupported
 .endif
-    pld       [OUT, #(prefetch_offset * 4)]
+    mov       &dst01&.d[0], &dst0&.d[0]
+    mov       &dst01&.d[1], &dst1&.d[0]
+    prfm      PLDL2STRM, [OUT, #(prefetch_offset * 4)]
 .endm
 
 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
     bilinear_load_dst_8888 numpix, dst0, dst1, dst01
 .endm
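
Each AArch32 pld hint becomes an explicit prfm with a type/level/policy triple; PLDL2STRM requests a load prefetch into L2 with streaming (use-once) policy, a reasonable fit for scanline walks. From C the closest portable spelling is the GCC/Clang builtin, though the mapping is only approximate:

    /* Roughly what "prfm pldl2strm, [p]" requests: a read with
     * low temporal locality. */
    static inline void
    prefetch_scanline (const void *p)
    {
        __builtin_prefetch (p, 0 /* read */, 1 /* low locality */);
    }
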
@@ -289,15 +313,15 @@
 .macro bilinear_duplicate_mask_x numpix, mask
 .endm
 
 .macro bilinear_duplicate_mask_8 numpix, mask
 .if numpix == 4
-    vdup.32   mask, mask[0]
+    dup       &mask&.2s, &mask&.s[0]
 .elseif numpix == 2
-    vdup.16   mask, mask[0]
+    dup       &mask&.4h, &mask&.h[0]
 .elseif numpix == 1
-    vdup.8    mask, mask[0]
+    dup       &mask&.8b, &mask&.b[0]
 .else
     .error bilinear_duplicate_mask_8 is unsupported
 .endif
 .endm
 
@@ -307,47 +331,52 @@
 
 /*
  * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
  * Interleave should be done when mask is enabled or operator is 'over'.
  */
-.macro bilinear_interleave src0, src1, dst0, dst1
-    vuzp.8    src0, src1
-    vuzp.8    dst0, dst1
-    vuzp.8    src0, src1
-    vuzp.8    dst0, dst1
+.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
+    vuzp      &src0&.8b, &src1&.8b
+    vuzp      &dst0&.8b, &dst1&.8b
+    vuzp      &src0&.8b, &src1&.8b
+    vuzp      &dst0&.8b, &dst1&.8b
+    mov       &src01&.d[1], &src1&.d[0]
+    mov       &src01&.d[0], &src0&.d[0]
+    mov       &dst01&.d[1], &dst1&.d[0]
+    mov       &dst01&.d[0], &dst0&.d[0]
 .endm
 
 .macro bilinear_interleave_src_dst_x_src \
                 numpix, src0, src1, src01, dst0, dst1, dst01
 .endm
 
 .macro bilinear_interleave_src_dst_x_over \
                 numpix, src0, src1, src01, dst0, dst1, dst01
 
-    bilinear_interleave src0, src1, dst0, dst1
+    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
 .endm
 
 .macro bilinear_interleave_src_dst_x_add \
                 numpix, src0, src1, src01, dst0, dst1, dst01
+    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
 .endm
 
 .macro bilinear_interleave_src_dst_8_src \
                 numpix, src0, src1, src01, dst0, dst1, dst01
 
-    bilinear_interleave src0, src1, dst0, dst1
+    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
 .endm
 
 .macro bilinear_interleave_src_dst_8_over \
                 numpix, src0, src1, src01, dst0, dst1, dst01
 
-    bilinear_interleave src0, src1, dst0, dst1
+    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
 .endm
 
 .macro bilinear_interleave_src_dst_8_add \
                 numpix, src0, src1, src01, dst0, dst1, dst01
 
-    bilinear_interleave src0, src1, dst0, dst1
+    bilinear_interleave src0, src1, src01, dst0, dst1, dst01
 .endm
 
 .macro bilinear_interleave_src_dst \
                 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
 
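
The two vuzp passes are what produce the planar rrrr gggg bbbb aaaa layout named in the comment: pass one splits even/odd bytes, pass two regroups them into whole channels. A hedged intrinsics rendering of the same shuffle, on four pixels held in two 8-byte vectors:

    #include <arm_neon.h>

    /* In:  p0 = r0 g0 b0 a0 r1 g1 b1 a1, p1 = r2 g2 b2 a2 r3 g3 b3 a3
     * Out: p0 = r0 r1 r2 r3 g0 g1 g2 g3, p1 = b0 b1 b2 b3 a0 a1 a2 a3 */
    static inline void
    rgba_to_planar (uint8x8_t *p0, uint8x8_t *p1)
    {
        uint8x8_t e = vuzp1_u8 (*p0, *p1);   /* r b r b ... */
        uint8x8_t o = vuzp2_u8 (*p0, *p1);   /* g a g a ... */
        *p0 = vuzp1_u8 (e, o);
        *p1 = vuzp2_u8 (e, o);
    }
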
@@ -368,18 +397,20 @@
 
 .macro bilinear_apply_mask_to_src_8 \
                 numpix, src0, src1, src01, mask, \
                 tmp01, tmp23, tmp45, tmp67
 
-    vmull.u8  tmp01, src0, mask
-    vmull.u8  tmp23, src1, mask
+    umull     &tmp01&.8h, &src0&.8b, &mask&.8b
+    umull     &tmp23&.8h, &src1&.8b, &mask&.8b
     /* bubbles */
-    vrshr.u16 tmp45, tmp01, #8
-    vrshr.u16 tmp67, tmp23, #8
+    urshr     &tmp45&.8h, &tmp01&.8h, #8
+    urshr     &tmp67&.8h, &tmp23&.8h, #8
     /* bubbles */
-    vraddhn.u16 src0, tmp45, tmp01
-    vraddhn.u16 src1, tmp67, tmp23
+    raddhn    &src0&.8b, &tmp45&.8h, &tmp01&.8h
+    raddhn    &src1&.8b, &tmp67&.8h, &tmp23&.8h
+    mov       &src01&.d[0], &src0&.d[0]
+    mov       &src01&.d[1], &src1&.d[0]
 .endm
 
 .macro bilinear_apply_mask_to_src \
                 mask_fmt, numpix, src0, src1, src01, mask, \
                 tmp01, tmp23, tmp45, tmp67
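
The umull/urshr/raddhn triple is pixman's usual exact rounding divide-by-255 for the mask multiply. In scalar C the same computation is:

    #include <stdint.h>

    /* (a * b) / 255 with correct rounding; urshr supplies (t + 128) >> 8
     * and raddhn adds it back to t with one more rounding narrow. */
    static inline uint8_t
    mul_div_255 (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) (a * b);
        return (uint8_t) ((t + ((t + 128) >> 8) + 128) >> 8);
    }
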
@@ -401,32 +432,40 @@
 
 .macro bilinear_combine_over \
                 numpix, src0, src1, src01, dst0, dst1, dst01, \
                 tmp01, tmp23, tmp45, tmp67, tmp8
 
-    vdup.32   tmp8, src1[1]
+    dup       &tmp8&.2s, &src1&.s[1]
     /* bubbles */
-    vmvn.8    tmp8, tmp8
+    mvn       &tmp8&.8b, &tmp8&.8b
     /* bubbles */
-    vmull.u8  tmp01, dst0, tmp8
+    umull     &tmp01&.8h, &dst0&.8b, &tmp8&.8b
     /* bubbles */
-    vmull.u8  tmp23, dst1, tmp8
+    umull     &tmp23&.8h, &dst1&.8b, &tmp8&.8b
     /* bubbles */
-    vrshr.u16 tmp45, tmp01, #8
-    vrshr.u16 tmp67, tmp23, #8
+    urshr     &tmp45&.8h, &tmp01&.8h, #8
+    urshr     &tmp67&.8h, &tmp23&.8h, #8
     /* bubbles */
-    vraddhn.u16 dst0, tmp45, tmp01
-    vraddhn.u16 dst1, tmp67, tmp23
+    raddhn    &dst0&.8b, &tmp45&.8h, &tmp01&.8h
+    raddhn    &dst1&.8b, &tmp67&.8h, &tmp23&.8h
+    mov       &dst01&.d[0], &dst0&.d[0]
+    mov       &dst01&.d[1], &dst1&.d[0]
     /* bubbles */
-    vqadd.u8  src01, dst01, src01
+    uqadd     &src0&.8b, &dst0&.8b, &src0&.8b
+    uqadd     &src1&.8b, &dst1&.8b, &src1&.8b
+    mov       &src01&.d[0], &src0&.d[0]
+    mov       &src01&.d[1], &src1&.d[0]
 .endm
 
 .macro bilinear_combine_add \
                 numpix, src0, src1, src01, dst0, dst1, dst01, \
                 tmp01, tmp23, tmp45, tmp67, tmp8
 
-    vqadd.u8  src01, dst01, src01
+    uqadd     &src0&.8b, &dst0&.8b, &src0&.8b
+    uqadd     &src1&.8b, &dst1&.8b, &src1&.8b
+    mov       &src01&.d[0], &src0&.d[0]
+    mov       &src01&.d[1], &src1&.d[0]
 .endm
 
 .macro bilinear_combine \
                 op, numpix, src0, src1, src01, dst0, dst1, dst01, \
                 tmp01, tmp23, tmp45, tmp67, tmp8
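
bilinear_combine_over is the Porter-Duff OVER operator: the dup broadcasts the alpha lane of src1 (planar layout puts the four alphas in lane 1), mvn inverts it, dst is scaled by it with the same rounding trick, and the source is added with saturation. Per channel, as a hedged scalar sketch:

    #include <stdint.h>

    static inline uint8_t
    over_channel (uint8_t s, uint8_t d, uint8_t src_alpha)
    {
        uint16_t t   = (uint16_t) (d * (255 - src_alpha)); /* dst * ~alpha */
        uint16_t r   = (uint16_t) ((t + ((t + 128) >> 8) + 128) >> 8);
        uint16_t sum = (uint16_t) (s + r);
        return sum > 255 ? 255 : (uint8_t) sum;  /* uqadd saturates */
    }
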
@@ -438,23 +477,26 @@
 
 /*
  * Macros for final deinterleaving of destination pixels if needed.
  */
 .macro bilinear_deinterleave numpix, dst0, dst1, dst01
-    vuzp.8    dst0, dst1
+    vuzp      &dst0&.8b, &dst1&.8b
     /* bubbles */
-    vuzp.8    dst0, dst1
+    vuzp      &dst0&.8b, &dst1&.8b
+    mov       &dst01&.d[0], &dst0&.d[0]
+    mov       &dst01&.d[1], &dst1&.d[0]
 .endm
 
 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
 .endm
 
 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
     bilinear_deinterleave numpix, dst0, dst1, dst01
 .endm
 
 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
+    bilinear_deinterleave numpix, dst0, dst1, dst01
 .endm
 
 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
     bilinear_deinterleave numpix, dst0, dst1, dst01
 .endm
@@ -471,294 +513,381 @@
     bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
 .endm
 
 
 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
-    bilinear_load_&src_fmt d0, d1, d2
-    bilinear_load_mask mask_fmt, 1, d4
-    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
-    vmull.u8  q1, d0, d28
-    vmlal.u8  q1, d1, d29
+    bilinear_load_&src_fmt v0, v1, v2
+    bilinear_load_mask mask_fmt, 1, v4
+    bilinear_load_dst dst_fmt, op, 1, v18, v19, v9
+    mov       v9.d[0], v18.d[0]
+    mov       v9.d[1], v19.d[0]
+    umull     v2.8h, v0.8b, v28.8b
+    umlal     v2.8h, v1.8b, v29.8b
     /* 5 cycles bubble */
-    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
-    vmlsl.u16 q0, d2, d30
-    vmlal.u16 q0, d3, d30
+    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v0.4s, v2.4h, v30.4h
+    mov       v2.d[0], v2.d[1]
+    umlal     v0.4s, v2.4h, v30.4h
     /* 5 cycles bubble */
-    bilinear_duplicate_mask mask_fmt, 1, d4
-    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    bilinear_duplicate_mask mask_fmt, 1, v4
+    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
     /* 3 cycles bubble */
-    vmovn.u16 d0, q0
+    xtn       v0.8b, v0.8h
     /* 1 cycle bubble */
     bilinear_interleave_src_dst \
-                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
+                mask_fmt, op, 1, v0, v1, v0, v18, v19, v9
+    mov       v1.d[0], v0.d[1]
+    mov       v18.d[0], v9.d[0]
+    mov       v19.d[0], v9.d[1]
     bilinear_apply_mask_to_src \
-                mask_fmt, 1, d0, d1, q0, d4, \
-                q3, q8, q10, q11
+                mask_fmt, 1, v0, v1, v0, v4, \
+                v3, v8, v10, v11
+    mov       v1.d[0], v0.d[1]
     bilinear_combine \
-                op, 1, d0, d1, q0, d18, d19, q9, \
-                q3, q8, q10, q11, d5
-    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
-    bilinear_store_&dst_fmt 1, q2, q3
+                op, 1, v0, v1, v0, v18, v19, v9, \
+                v3, v8, v10, v11, v5
+    mov       v1.d[0], v0.d[1]
+    bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0
+    mov       v1.d[0], v0.d[1]
+    bilinear_store_&dst_fmt 1, v17, v18
 .endm
 
 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
     bilinear_load_and_vertical_interpolate_two_&src_fmt \
-                q1, q11, d0, d1, d20, d21, d22, d23
-    bilinear_load_mask mask_fmt, 2, d4
-    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
-    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
-    vmlsl.u16 q0, d2, d30
-    vmlal.u16 q0, d3, d30
-    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
-    vmlsl.u16 q10, d22, d31
-    vmlal.u16 q10, d23, d31
-    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
-    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
-    bilinear_duplicate_mask mask_fmt, 2, d4
-    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
-    vadd.u16  q12, q12, q13
-    vmovn.u16 d0, q0
+                v1, v11, v18, v19, v20, v21, v22, v23
+    mov       v2.d[0], v1.d[0]
+    mov       v3.d[0], v1.d[1]
+    mov       v22.d[0], v11.d[0]
+    mov       v23.d[0], v11.d[1]
+    bilinear_load_mask mask_fmt, 2, v4
+    bilinear_load_dst dst_fmt, op, 2, v18, v19, v9
+    mov       v9.d[0], v18.d[0]
+    mov       v9.d[1], v19.d[0]
+    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v0.4s, v2.4h, v30.4h
+    umlal     v0.4s, v3.4h, v30.4h
+    ushll     v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v10.4s, v22.4h, v31.4h
+    umlal     v10.4s, v23.4h, v31.4h
+    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    shrn      v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    mov       v0.d[1], v1.d[0]
+    bilinear_duplicate_mask mask_fmt, 2, v4
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    add       v12.8h, v12.8h, v13.8h
+    xtn       v0.8b, v0.8h
     bilinear_interleave_src_dst \
-                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
+                mask_fmt, op, 2, v0, v1, v0, v18, v19, v9
+    mov       v1.d[0], v0.d[1]
+    mov       v18.d[0], v9.d[0]
+    mov       v19.d[0], v9.d[1]
     bilinear_apply_mask_to_src \
-                mask_fmt, 2, d0, d1, q0, d4, \
-                q3, q8, q10, q11
+                mask_fmt, 2, v0, v1, v0, v4, \
+                v3, v8, v10, v11
+    mov       v1.d[0], v0.d[1]
     bilinear_combine \
-                op, 2, d0, d1, q0, d18, d19, q9, \
-                q3, q8, q10, q11, d5
-    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
-    bilinear_store_&dst_fmt 2, q2, q3
+                op, 2, v0, v1, v0, v18, v19, v9, \
+                v3, v8, v10, v11, v5
+    mov       v1.d[0], v0.d[1]
+    bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0
+    mov       v1.d[0], v0.d[1]
+    bilinear_store_&dst_fmt 2, v16, v17
 .endm
 
 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
     bilinear_load_and_vertical_interpolate_four_&src_fmt \
-                q1, q11, d0, d1, d20, d21, d22, d23 \
-                q3, q9, d4, d5, d16, d17, d18, d19
-    pld       [TMP1, PF_OFFS]
+                v1, v11, v4, v5, v6, v7, v22, v23 \
+                v3, v9, v16, v17, v20, v21, v18, v19
+    mov       v6.d[0], v3.d[0]
+    mov       v7.d[0], v3.d[1]
+    mov       v18.d[0], v9.d[0]
+    mov       v19.d[0], v9.d[1]
+    mov       v2.d[0], v1.d[0]
+    mov       v3.d[0], v1.d[1]
+    mov       v22.d[0], v11.d[0]
+    mov       v23.d[0], v11.d[1]
+    prfm      PLDL2STRM, [TMP1, PF_OFFS]
     sub       TMP1, TMP1, STRIDE
-    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
-    vmlsl.u16 q0, d2, d30
-    vmlal.u16 q0, d3, d30
-    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
-    vmlsl.u16 q10, d22, d31
-    vmlal.u16 q10, d23, d31
-    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
-    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
-    vmlsl.u16 q2, d6, d30
-    vmlal.u16 q2, d7, d30
-    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
-    bilinear_load_mask mask_fmt, 4, d22
-    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
-    pld       [TMP1, PF_OFFS]
-    vmlsl.u16 q8, d18, d31
-    vmlal.u16 q8, d19, d31
-    vadd.u16  q12, q12, q13
-    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
-    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
-    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
-    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
-    bilinear_duplicate_mask mask_fmt, 4, d22
-    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
-    vmovn.u16 d0, q0
-    vmovn.u16 d1, q2
-    vadd.u16  q12, q12, q13
+    prfm      PLDL2STRM, [TMP1, PF_OFFS]
+    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v0.4s, v2.4h, v30.4h
+    umlal     v0.4s, v3.4h, v30.4h
+    ushll     v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v10.4s, v22.4h, v31.4h
+    umlal     v10.4s, v23.4h, v31.4h
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    ushll     v2.4s, v6.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v2.4s, v6.4h, v30.4h
+    umlal     v2.4s, v7.4h, v30.4h
+    ushll     v8.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v8.4s, v18.4h, v31.4h
+    umlal     v8.4s, v19.4h, v31.4h
+    add       v12.8h, v12.8h, v13.8h
+    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    shrn      v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    mov       v0.d[1], v1.d[0]
+    shrn      v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    shrn      v5.4h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    mov       v2.d[0], v4.d[0]
+    mov       v2.d[1], v5.d[0]
+    bilinear_load_mask mask_fmt, 4, v4
+    bilinear_duplicate_mask mask_fmt, 4, v4
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    xtn       v0.8b, v0.8h
+    xtn       v1.8b, v2.8h
+    add       v12.8h, v12.8h, v13.8h
+    bilinear_load_dst dst_fmt, op, 4, v2, v3, v21
+    mov       v21.d[0], v2.d[0]
+    mov       v21.d[1], v3.d[0]
     bilinear_interleave_src_dst \
-                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
+                mask_fmt, op, 4, v0, v1, v0, v2, v3, v1
+    mov       v2.d[0], v1.d[0]
+    mov       v3.d[0], v1.d[1]
+    mov       v1.d[0], v0.d[1]
     bilinear_apply_mask_to_src \
-                mask_fmt, 4, d0, d1, q0, d22, \
-                q3, q8, q9, q10
+                mask_fmt, 4, v0, v1, v0, v4, \
+                v6, v8, v9, v10
+    mov       v1.d[0], v0.d[1]
     bilinear_combine \
-                op, 4, d0, d1, q0, d2, d3, q1, \
-                q3, q8, q9, q10, d23
-    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
-    bilinear_store_&dst_fmt 4, q2, q3
+                op, 4, v0, v1, v0, v2, v3, v1, \
+                v6, v8, v9, v10, v23
+    mov       v1.d[0], v0.d[1]
+    bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0
+    mov       v1.d[0], v0.d[1]
+    bilinear_store_&dst_fmt 4, v6, v7
 .endm
 
 .set BILINEAR_FLAG_USE_MASK,          1
 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
 
 /*
  * Main template macro for generating NEON optimized bilinear scanline functions.
  *
  * Bilinear scanline generator macro takes the following arguments:
  *  fname - name of the function to generate
  *  src_fmt - source color format (8888 or 0565)
  *  dst_fmt - destination color format (8888 or 0565)
  *  src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
  *  process_last_pixel - code block that interpolates one pixel and does not
  *                       update horizontal weight
  *  process_two_pixels - code block that interpolates two pixels and updates
  *                       horizontal weight
  *  process_four_pixels - code block that interpolates four pixels and updates
  *                        horizontal weight
  *  process_pixblock_head - head part of middle loop
  *  process_pixblock_tail - tail part of middle loop
  *  process_pixblock_tail_head - tail_head of middle loop
  *  pixblock_size - number of pixels processed in a single middle loop
  *  prefetch_distance - prefetch in the source image by that many pixels ahead
  */
 
 .macro generate_bilinear_scanline_func \
                 fname, \
                 src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
                 bilinear_process_last_pixel, \
                 bilinear_process_two_pixels, \
                 bilinear_process_four_pixels, \
                 bilinear_process_pixblock_head, \
                 bilinear_process_pixblock_tail, \
                 bilinear_process_pixblock_tail_head, \
                 pixblock_size, \
                 prefetch_distance, \
                 flags
 
 pixman_asm_function fname
 .if pixblock_size == 8
 .elseif pixblock_size == 4
 .else
     .error unsupported pixblock size
 .endif
 
 .if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
-    OUT       .req    r0
-    TOP       .req    r1
-    BOTTOM    .req    r2
-    WT        .req    r3
-    WB        .req    r4
-    X         .req    r5
-    UX        .req    r6
-    WIDTH     .req    ip
-    TMP1      .req    r3
-    TMP2      .req    r4
-    PF_OFFS   .req    r7
-    TMP3      .req    r8
-    TMP4      .req    r9
-    STRIDE    .req    r2
-
-    mov       ip, sp
-    push      {r4, r5, r6, r7, r8, r9}
-    mov       PF_OFFS, #prefetch_distance
-    ldmia     ip, {WB, X, UX, WIDTH}
+    OUT       .req    x0
+    TOP       .req    x1
+    BOTTOM    .req    x2
+    WT        .req    x3
+    WWT       .req    w3
+    WB        .req    x4
+    WWB       .req    w4
+    X         .req    w5
+    UX        .req    w6
+    WIDTH     .req    x7
+    TMP1      .req    x10
+    WTMP1     .req    w10
+    TMP2      .req    x11
+    WTMP2     .req    w11
+    PF_OFFS   .req    x12
+    TMP3      .req    x13
+    WTMP3     .req    w13
+    TMP4      .req    x14
+    WTMP4     .req    w14
+    STRIDE    .req    x15
+    DUMMY     .req    x30
+
+    stp       x29, x30, [sp, -16]!
+    mov       x29, sp
+    sub       sp, sp, 112
+    sub       x29, x29, 64
+    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    stp       x10, x11, [x29, -80]
+    stp       x12, x13, [x29, -96]
+    stp       x14, x15, [x29, -112]
 .else
-    OUT       .req    r0
-    MASK      .req    r1
-    TOP       .req    r2
-    BOTTOM    .req    r3
-    WT        .req    r4
-    WB        .req    r5
-    X         .req    r6
-    UX        .req    r7
-    WIDTH     .req    ip
-    TMP1      .req    r4
-    TMP2      .req    r5
-    PF_OFFS   .req    r8
-    TMP3      .req    r9
-    TMP4      .req    r10
-    STRIDE    .req    r3
+    OUT       .req    x0
+    MASK      .req    x1
+    TOP       .req    x2
+    BOTTOM    .req    x3
+    WT        .req    x4
+    WWT       .req    w4
+    WB        .req    x5
+    WWB       .req    w5
+    X         .req    w6
+    UX        .req    w7
+    WIDTH     .req    x8
+    TMP1      .req    x10
+    WTMP1     .req    w10
+    TMP2      .req    x11
+    WTMP2     .req    w11
+    PF_OFFS   .req    x12
+    TMP3      .req    x13
+    WTMP3     .req    w13
+    TMP4      .req    x14
+    WTMP4     .req    w14
+    STRIDE    .req    x15
+    DUMMY     .req    x30
 
     .set prefetch_offset, prefetch_distance
 
-    mov       ip, sp
-    push      {r4, r5, r6, r7, r8, r9, r10, ip}
-    mov       PF_OFFS, #prefetch_distance
-    ldmia     ip, {WT, WB, X, UX, WIDTH}
+    stp       x29, x30, [sp, -16]!
+    mov       x29, sp
+    sub       x29, x29, 64
+    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    stp       x10, x11, [x29, -80]
+    stp       x12, x13, [x29, -96]
+    stp       x14, x15, [x29, -112]
+    str       x8, [x29, -120]
+    ldr       w8, [x29, 16]
+    sub       sp, sp, 120
 .endif
 
-    mul       PF_OFFS, PF_OFFS, UX
-
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
-    vpush     {d8-d15}
-.endif
+    mov       WTMP1, #prefetch_distance
+    umull     PF_OFFS, WTMP1, UX
 
     sub       STRIDE, BOTTOM, TOP
     .unreq    BOTTOM
 
     cmp       WIDTH, #0
-    ble       3f
+    ble       300f
 
-    vdup.u16  q12, X
-    vdup.u16  q13, UX
-    vdup.u8   d28, WT
-    vdup.u8   d29, WB
-    vadd.u16  d25, d25, d26
+    dup       v12.8h, X
+    dup       v13.8h, UX
+    dup       v28.8b, WWT
+    dup       v29.8b, WWB
+    mov       v25.d[0], v12.d[1]
+    mov       v26.d[0], v13.d[0]
+    add       v25.4h, v25.4h, v26.4h
+    mov       v12.d[1], v25.d[0]
 
     /* ensure good destination alignment */
     cmp       WIDTH, #1
-    blt       0f
+    blt       100f
     tst       OUT, #(1 << dst_bpp_shift)
-    beq       0f
-    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
-    vadd.u16  q12, q12, q13
+    beq       100f
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    add       v12.8h, v12.8h, v13.8h
     bilinear_process_last_pixel
     sub       WIDTH, WIDTH, #1
-0:
-    vadd.u16  q13, q13, q13
-    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
-    vadd.u16  q12, q12, q13
+100:
+    add       v13.8h, v13.8h, v13.8h
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    add       v12.8h, v12.8h, v13.8h
 
     cmp       WIDTH, #2
-    blt       0f
+    blt       100f
     tst       OUT, #(1 << (dst_bpp_shift + 1))
-    beq       0f
+    beq       100f
     bilinear_process_two_pixels
     sub       WIDTH, WIDTH, #2
-0:
+100:
 .if pixblock_size == 8
     cmp       WIDTH, #4
-    blt       0f
+    blt       100f
     tst       OUT, #(1 << (dst_bpp_shift + 2))
-    beq       0f
+    beq       100f
     bilinear_process_four_pixels
     sub       WIDTH, WIDTH, #4
-0:
+100:
 .endif
     subs      WIDTH, WIDTH, #pixblock_size
-    blt       1f
+    blt       100f
     asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
     bilinear_process_pixblock_head
     subs      WIDTH, WIDTH, #pixblock_size
-    blt       5f
+    blt       500f
 0:
     bilinear_process_pixblock_tail_head
     subs      WIDTH, WIDTH, #pixblock_size
     bge       0b
-5:
+500:
     bilinear_process_pixblock_tail
-1:
+100:
 .if pixblock_size == 8
     tst       WIDTH, #4
-    beq       2f
+    beq       200f
     bilinear_process_four_pixels
-2:
+200:
 .endif
     /* handle the remaining trailing pixels */
     tst       WIDTH, #2
-    beq       2f
+    beq       200f
     bilinear_process_two_pixels
-2:
+200:
     tst       WIDTH, #1
-    beq       3f
+    beq       300f
     bilinear_process_last_pixel
-3:
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
-    vpop      {d8-d15}
-.endif
+300:
 
 .if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
-    pop       {r4, r5, r6, r7, r8, r9}
+    sub       x29, x29, 64
+    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    ldp       x10, x11, [x29, -80]
+    ldp       x12, x13, [x29, -96]
+    ldp       x14, x15, [x29, -112]
+    mov       sp, x29
+    ldp       x29, x30, [sp], 16
 .else
-    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
+    sub       x29, x29, 64
+    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    ldp       x10, x11, [x29, -80]
+    ldp       x12, x13, [x29, -96]
+    ldp       x14, x15, [x29, -112]
+    ldr       x8, [x29, -120]
+    mov       sp, x29
+    ldp       x29, x30, [sp], 16
 .endif
-    bx        lr
+    ret
 
     .unreq    OUT
     .unreq    TOP
     .unreq    WT
+    .unreq    WWT
     .unreq    WB
+    .unreq    WWB
     .unreq    X
     .unreq    UX
     .unreq    WIDTH
     .unreq    TMP1
+    .unreq    WTMP1
    .unreq    TMP2
     .unreq    PF_OFFS
     .unreq    TMP3
     .unreq    TMP4
     .unreq    STRIDE
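
Every horizontal step in these templates is the same fixed-point lerp: ushll pre-scales the left sample by 1 << BILINEAR_INTERPOLATION_BITS, umlsl subtracts left*wx, and umlal adds right*wx. Per 16-bit sample, with B = BILINEAR_INTERPOLATION_BITS, that is equivalently:

    #include <stdint.h>

    /* out = left * ((1 << B) - wx) + right * wx,
     * later narrowed back down by 2*B bits (shrn). */
    static inline uint32_t
    horizontal_lerp (uint16_t left, uint16_t right, uint16_t wx, unsigned B)
    {
        return ((uint32_t) left << B) - (uint32_t) left * wx
             + (uint32_t) right * wx;
    }

The process_pixblock_head/tail/tail_head bodies below are this computation software-pipelined across loads, prefetches, and the weight update.
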
@@ -882,162 +1011,188 @@ pixman_asm_function fname | |||
882 | .macro bilinear_over_8888_8888_process_four_pixels | 1011 | .macro bilinear_over_8888_8888_process_four_pixels |
883 | bilinear_interpolate_four_pixels 8888, x, 8888, over | 1012 | bilinear_interpolate_four_pixels 8888, x, 8888, over |
884 | .endm | 1013 | .endm |
885 | 1014 | ||
886 | .macro bilinear_over_8888_8888_process_pixblock_head | 1015 | .macro bilinear_over_8888_8888_process_pixblock_head |
887 | asr TMP1, X, #16 | 1016 | asr WTMP1, X, #16 |
888 | add X, X, UX | 1017 | add X, X, UX |
889 | add TMP1, TOP, TMP1, lsl #2 | 1018 | lsl TMP2, TMP1, #2 |
890 | asr TMP2, X, #16 | 1019 | add TMP1, TOP, TMP2 |
1020 | asr WTMP2, X, #16 | ||
891 | add X, X, UX | 1021 | add X, X, UX |
892 | add TMP2, TOP, TMP2, lsl #2 | 1022 | lsl TMP3, TMP2, #2 |
1023 | add TMP2, TOP, TMP3 | ||
893 | 1024 | ||
894 | vld1.32 {d22}, [TMP1], STRIDE | 1025 | ld1 {v22.2s}, [TMP1], STRIDE |
895 | vld1.32 {d23}, [TMP1] | 1026 | ld1 {v23.2s}, [TMP1] |
896 | asr TMP3, X, #16 | 1027 | asr WTMP3, X, #16 |
897 | add X, X, UX | 1028 | add X, X, UX |
898 | add TMP3, TOP, TMP3, lsl #2 | 1029 | lsl TMP4, TMP3, #2 |
899 | vmull.u8 q8, d22, d28 | 1030 | add TMP3, TOP, TMP4 |
900 | vmlal.u8 q8, d23, d29 | 1031 | umull v16.8h, v22.8b, v28.8b |
901 | 1032 | umlal v16.8h, v23.8b, v29.8b | |
902 | vld1.32 {d22}, [TMP2], STRIDE | 1033 | mov v17.d[0], v16.d[1] |
903 | vld1.32 {d23}, [TMP2] | 1034 | |
904 | asr TMP4, X, #16 | 1035 | ld1 {v22.2s}, [TMP2], STRIDE |
1036 | ld1 {v23.2s}, [TMP2] | ||
1037 | asr WTMP4, X, #16 | ||
905 | add X, X, UX | 1038 | add X, X, UX |
906 | add TMP4, TOP, TMP4, lsl #2 | 1039 | lsl TMP1, TMP4, #2 |
907 | vmull.u8 q9, d22, d28 | 1040 | add TMP4, TOP, TMP1 |
908 | vmlal.u8 q9, d23, d29 | 1041 | umull v18.8h, v22.8b, v28.8b |
909 | 1042 | umlal v18.8h, v23.8b, v29.8b | |
910 | vld1.32 {d22}, [TMP3], STRIDE | 1043 | mov v19.d[0], v18.d[1] |
911 | vld1.32 {d23}, [TMP3] | 1044 | |
912 | vmull.u8 q10, d22, d28 | 1045 | ld1 {v22.2s}, [TMP3], STRIDE |
913 | vmlal.u8 q10, d23, d29 | 1046 | ld1 {v23.2s}, [TMP3] |
914 | 1047 | umull v20.8h, v22.8b, v28.8b | |
915 | vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS | 1048 | umlal v20.8h, v23.8b, v29.8b |
916 | vmlsl.u16 q0, d16, d30 | 1049 | mov v21.d[0], v20.d[1] |
917 | vmlal.u16 q0, d17, d30 | 1050 | |
918 | 1051 | ushll v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS | |
919 | pld [TMP4, PF_OFFS] | 1052 | umlsl v0.4s, v16.4h, v30.4h |
920 | vld1.32 {d16}, [TMP4], STRIDE | 1053 | umlal v0.4s, v17.4h, v30.4h |
921 | vld1.32 {d17}, [TMP4] | 1054 | |
922 | pld [TMP4, PF_OFFS] | 1055 | prfm PLDL2STRM, [TMP4, PF_OFFS] |
923 | vmull.u8 q11, d16, d28 | 1056 | ld1 {v16.2s}, [TMP4], STRIDE |
924 | vmlal.u8 q11, d17, d29 | 1057 | ld1 {v17.2s}, [TMP4] |
925 | 1058 | prfm PLDL2STRM, [TMP4, PF_OFFS] | |
926 | vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS | 1059 | umull v22.8h, v16.8b, v28.8b |
927 | vmlsl.u16 q1, d18, d31 | 1060 | umlal v22.8h, v17.8b, v29.8b |
928 | vmlal.u16 q1, d19, d31 | 1061 | mov v23.d[0], v22.d[1] |
929 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 1062 | |
930 | vadd.u16 q12, q12, q13 | 1063 | ushll v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS |
1064 | umlsl v1.4s, v18.4h, v31.4h | ||
1065 | umlal v1.4s, v19.4h, v31.4h | ||
1066 | ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
1067 | mov v31.d[0], v30.d[1] | ||
1068 | add v12.8h, v12.8h, v13.8h | ||
931 | .endm | 1069 | .endm |
932 | 1070 | ||
933 | .macro bilinear_over_8888_8888_process_pixblock_tail | 1071 | .macro bilinear_over_8888_8888_process_pixblock_tail |
934 | vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS | 1072 | ushll v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS |
935 | vmlsl.u16 q2, d20, d30 | 1073 | umlsl v2.4s, v20.4h, v30.4h |
936 | vmlal.u16 q2, d21, d30 | 1074 | umlal v2.4s, v21.4h, v30.4h |
937 | vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS | 1075 | ushll v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS |
938 | vmlsl.u16 q3, d22, d31 | 1076 | umlsl v3.4s, v22.4h, v31.4h |
939 | vmlal.u16 q3, d23, d31 | 1077 | umlal v3.4s, v23.4h, v31.4h |
940 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | 1078 | shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
941 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | 1079 | shrn v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
942 | vld1.32 {d2, d3}, [OUT, :128] | 1080 | mov v0.d[1], v1.d[0] |
943 | pld [OUT, #(prefetch_offset * 4)] | 1081 | ld1 {v22.2s, v23.2s}, [OUT] |
944 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) | 1082 | prfm PLDL2STRM, [OUT, #(prefetch_offset * 4)] |
945 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 1083 | shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
946 | vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) | 1084 | ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
947 | vmovn.u16 d6, q0 | 1085 | mov v31.d[0], v30.d[1] |
948 | vmovn.u16 d7, q2 | 1086 | shrn v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
949 | vuzp.8 d6, d7 | 1087 | mov v2.d[1], v5.d[0] |
950 | vuzp.8 d2, d3 | 1088 | xtn v6.8b, v0.8h |
951 | vuzp.8 d6, d7 | 1089 | xtn v7.8b, v2.8h |
952 | vuzp.8 d2, d3 | 1090 | vuzp v6.8b, v7.8b |
953 | vdup.32 d4, d7[1] | 1091 | vuzp v22.8b, v23.8b |
954 | vmvn.8 d4, d4 | 1092 | vuzp v6.8b, v7.8b |
955 | vmull.u8 q11, d2, d4 | 1093 | vuzp v22.8b, v23.8b |
956 | vmull.u8 q2, d3, d4 | 1094 | dup v4.2s, v7.s[1] |
957 | vrshr.u16 q1, q11, #8 | 1095 | mvn v4.8b, v4.8b |
958 | vrshr.u16 q10, q2, #8 | 1096 | umull v11.8h, v22.8b, v4.8b |
959 | vraddhn.u16 d2, q1, q11 | 1097 | umull v2.8h, v23.8b, v4.8b |
960 | vraddhn.u16 d3, q10, q2 | 1098 | urshr v1.8h, v11.8h, #8 |
961 | vqadd.u8 q3, q1, q3 | 1099 | urshr v10.8h, v2.8h, #8 |
962 | vuzp.8 d6, d7 | 1100 | raddhn v3.8b, v10.8h, v2.8h |
963 | vuzp.8 d6, d7 | 1101 | raddhn v2.8b, v1.8h, v11.8h |
964 | vadd.u16 q12, q12, q13 | 1102 | uqadd v6.8b, v2.8b, v6.8b |
965 | vst1.32 {d6, d7}, [OUT, :128]! | 1103 | uqadd v7.8b, v3.8b, v7.8b |
1104 | vuzp v6.8b, v7.8b | ||
1105 | vuzp v6.8b, v7.8b | ||
1106 | add v12.8h, v12.8h, v13.8h | ||
1107 | st1 {v6.2s, v7.2s}, [OUT], #16 | ||
966 | .endm | 1108 | .endm |
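[editor's note] For reference, the arithmetic these head/tail macros vectorize is a plain two-pass bilinear lerp with BILINEAR_INTERPOLATION_BITS fractional weight bits. A scalar sketch under assumed names (BITS = 7 and the function below are illustrative, not pixman API):

    #include <stdint.h>

    #define BITS 7  /* stands in for BILINEAR_INTERPOLATION_BITS */

    /* tl/tr/bl/br: one channel of the four neighbouring texels;
     * wx, wy: fractional weights in [0, 1 << BITS). */
    static uint8_t bilinear_channel(uint8_t tl, uint8_t tr,
                                    uint8_t bl, uint8_t br,
                                    unsigned wx, unsigned wy)
    {
        /* vertical pass (the umull/umlal with v28/v29) */
        uint32_t l = tl * ((1u << BITS) - wy) + bl * wy;
        uint32_t r = tr * ((1u << BITS) - wy) + br * wy;
        /* horizontal pass (ushll/umlsl/umlal) and the final shrn */
        return (uint8_t)((l * ((1u << BITS) - wx) + r * wx) >> (2 * BITS));
    }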
967 | 1109 | ||
968 | .macro bilinear_over_8888_8888_process_pixblock_tail_head | 1110 | .macro bilinear_over_8888_8888_process_pixblock_tail_head |
969 | vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS | 1111 | ushll v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS |
970 | asr TMP1, X, #16 | 1112 | asr WTMP1, X, #16 |
971 | add X, X, UX | 1113 | add X, X, UX |
972 | add TMP1, TOP, TMP1, lsl #2 | 1114 | lsl TMP2, TMP1, #2 |
973 | vmlsl.u16 q2, d20, d30 | 1115 | add TMP1, TOP, TMP2 |
974 | asr TMP2, X, #16 | 1116 | umlsl v2.4s, v20.4h, v30.4h |
1117 | asr WTMP2, X, #16 | ||
975 | add X, X, UX | 1118 | add X, X, UX |
976 | add TMP2, TOP, TMP2, lsl #2 | 1119 | lsl TMP3, TMP2, #2 |
977 | vmlal.u16 q2, d21, d30 | 1120 | add TMP2, TOP, TMP3 |
978 | vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS | 1121 | umlal v2.4s, v21.4h, v30.4h |
979 | vld1.32 {d20}, [TMP1], STRIDE | 1122 | ushll v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS |
980 | vmlsl.u16 q3, d22, d31 | 1123 | ld1 {v20.2s}, [TMP1], STRIDE |
981 | vmlal.u16 q3, d23, d31 | 1124 | umlsl v3.4s, v22.4h, v31.4h |
982 | vld1.32 {d21}, [TMP1] | 1125 | umlal v3.4s, v23.4h, v31.4h |
983 | vmull.u8 q8, d20, d28 | 1126 | ld1 {v21.2s}, [TMP1] |
984 | vmlal.u8 q8, d21, d29 | 1127 | umull v16.8h, v20.8b, v28.8b |
985 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | 1128 | umlal v16.8h, v21.8b, v29.8b |
986 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | 1129 | mov v17.d[0], v16.d[1] |
987 | vld1.32 {d2, d3}, [OUT, :128] | 1130 | shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
988 | pld [OUT, PF_OFFS] | 1131 | shrn v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
989 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) | 1132 | mov v0.d[1], v1.d[0] |
990 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 1133 | shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
991 | vld1.32 {d22}, [TMP2], STRIDE | 1134 | ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
992 | vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) | 1135 | mov v31.d[0], v30.d[1] |
993 | vmovn.u16 d6, q0 | 1136 | ld1 {v22.2s}, [TMP2], STRIDE |
994 | vld1.32 {d23}, [TMP2] | 1137 | shrn v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
995 | vmull.u8 q9, d22, d28 | 1138 | mov v2.d[1], v5.d[0] |
996 | asr TMP3, X, #16 | 1139 | xtn v6.8b, v0.8h |
1140 | ld1 {v23.2s}, [TMP2] | ||
1141 | umull v18.8h, v22.8b, v28.8b | ||
1142 | asr WTMP3, X, #16 | ||
997 | add X, X, UX | 1143 | add X, X, UX |
998 | add TMP3, TOP, TMP3, lsl #2 | 1144 | lsl TMP4, TMP3, #2 |
999 | asr TMP4, X, #16 | 1145 | add TMP3, TOP, TMP4 |
1146 | asr WTMP4, X, #16 | ||
1000 | add X, X, UX | 1147 | add X, X, UX |
1001 | add TMP4, TOP, TMP4, lsl #2 | 1148 | lsl TMP1, TMP4, #2 |
1002 | vmlal.u8 q9, d23, d29 | 1149 | add TMP4, TOP, TMP1 |
1003 | vmovn.u16 d7, q2 | 1150 | umlal v18.8h, v23.8b, v29.8b |
1004 | vld1.32 {d22}, [TMP3], STRIDE | 1151 | mov v19.d[0], v18.d[1] |
1005 | vuzp.8 d6, d7 | 1152 | xtn v7.8b, v2.8h |
1006 | vuzp.8 d2, d3 | 1153 | ld1 {v2.2s, v3.2s}, [OUT] |
1007 | vuzp.8 d6, d7 | 1154 | prfm PLDL2STRM, [OUT, PF_OFFS] |
1008 | vuzp.8 d2, d3 | 1155 | ld1 {v22.2s}, [TMP3], STRIDE |
1009 | vdup.32 d4, d7[1] | 1156 | vuzp v6.8b, v7.8b |
1010 | vld1.32 {d23}, [TMP3] | 1157 | vuzp v2.8b, v3.8b |
1011 | vmvn.8 d4, d4 | 1158 | vuzp v6.8b, v7.8b |
1012 | vmull.u8 q10, d22, d28 | 1159 | vuzp v2.8b, v3.8b |
1013 | vmlal.u8 q10, d23, d29 | 1160 | dup v4.2s, v7.s[1] |
1014 | vmull.u8 q11, d2, d4 | 1161 | ld1 {v23.2s}, [TMP3] |
1015 | vmull.u8 q2, d3, d4 | 1162 | mvn v4.8b, v4.8b |
1016 | vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS | 1163 | umull v20.8h, v22.8b, v28.8b |
1017 | vmlsl.u16 q0, d16, d30 | 1164 | umlal v20.8h, v23.8b, v29.8b |
1018 | vrshr.u16 q1, q11, #8 | 1165 | umull v11.8h, v2.8b, v4.8b |
1019 | vmlal.u16 q0, d17, d30 | 1166 | umull v2.8h, v3.8b, v4.8b |
1020 | vrshr.u16 q8, q2, #8 | 1167 | mov v21.d[0], v20.d[1] |
1021 | vraddhn.u16 d2, q1, q11 | 1168 | ushll v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS |
1022 | vraddhn.u16 d3, q8, q2 | 1169 | umlsl v0.4s, v16.4h, v30.4h |
1023 | pld [TMP4, PF_OFFS] | 1170 | urshr v1.8h, v11.8h, #8 |
1024 | vld1.32 {d16}, [TMP4], STRIDE | 1171 | umlal v0.4s, v17.4h, v30.4h |
1025 | vqadd.u8 q3, q1, q3 | 1172 | urshr v8.8h, v2.8h, #8 |
1026 | vld1.32 {d17}, [TMP4] | 1173 | raddhn v3.8b, v8.8h, v2.8h |
1027 | pld [TMP4, PF_OFFS] | 1174 | raddhn v2.8b, v1.8h, v11.8h |
1028 | vmull.u8 q11, d16, d28 | 1175 | prfm PLDL2STRM, [TMP4, PF_OFFS] |
1029 | vmlal.u8 q11, d17, d29 | 1176 | ld1 {v16.2s}, [TMP4], STRIDE |
1030 | vuzp.8 d6, d7 | 1177 | uqadd v6.8b, v2.8b, v6.8b |
1031 | vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS | 1178 | uqadd v7.8b, v3.8b, v7.8b |
1032 | vuzp.8 d6, d7 | 1179 | ld1 {v17.2s}, [TMP4] |
1033 | vmlsl.u16 q1, d18, d31 | 1180 | prfm PLDL2STRM, [TMP4, PF_OFFS] |
1034 | vadd.u16 q12, q12, q13 | 1181 | umull v22.8h, v16.8b, v28.8b |
1035 | vmlal.u16 q1, d19, d31 | 1182 | umlal v22.8h, v17.8b, v29.8b |
1036 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 1183 | mov v23.d[0], v22.d[1] |
1037 | vadd.u16 q12, q12, q13 | 1184 | vuzp v6.8b, v7.8b |
1038 | vst1.32 {d6, d7}, [OUT, :128]! | 1185 | ushll v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS |
1186 | vuzp v6.8b, v7.8b | ||
1187 | umlsl v1.4s, v18.4h, v31.4h | ||
1188 | add v12.8h, v12.8h, v13.8h | ||
1189 | umlal v1.4s, v19.4h, v31.4h | ||
1190 | ushr v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
1191 | mov v31.d[0], v30.d[1] | ||
1192 | add v12.8h, v12.8h, v13.8h | ||
1193 | st1 {v6.2s, v7.2s}, [OUT], #16 | ||
1039 | .endm | 1194 | .endm |
1040 | 1195 | ||
1041 | /* over_8888_8_8888 */ | 1196 | /* over_8888_8_8888 */ |
1042 | .macro bilinear_over_8888_8_8888_process_last_pixel | 1197 | .macro bilinear_over_8888_8_8888_process_last_pixel |
1043 | bilinear_interpolate_last_pixel 8888, 8, 8888, over | 1198 | bilinear_interpolate_last_pixel 8888, 8, 8888, over |
@@ -1046,177 +1201,24 @@ pixman_asm_function fname | |||
1046 | .macro bilinear_over_8888_8_8888_process_two_pixels | 1201 | .macro bilinear_over_8888_8_8888_process_two_pixels |
1047 | bilinear_interpolate_two_pixels 8888, 8, 8888, over | 1202 | bilinear_interpolate_two_pixels 8888, 8, 8888, over |
1048 | .endm | 1203 | .endm |
1049 | 1204 | ||
1050 | .macro bilinear_over_8888_8_8888_process_four_pixels | 1205 | .macro bilinear_over_8888_8_8888_process_four_pixels |
1051 | bilinear_interpolate_four_pixels 8888, 8, 8888, over | 1206 | bilinear_interpolate_two_pixels 8888, 8, 8888, over |
1207 | bilinear_interpolate_two_pixels 8888, 8, 8888, over | ||
1052 | .endm | 1208 | .endm |
1053 | 1209 | ||
1054 | .macro bilinear_over_8888_8_8888_process_pixblock_head | 1210 | .macro bilinear_over_8888_8_8888_process_pixblock_head |
1055 | asr TMP1, X, #16 | 1211 | bilinear_over_8888_8_8888_process_four_pixels |
1056 | add X, X, UX | ||
1057 | add TMP1, TOP, TMP1, lsl #2 | ||
1058 | vld1.32 {d0}, [TMP1], STRIDE | ||
1059 | asr TMP2, X, #16 | ||
1060 | add X, X, UX | ||
1061 | add TMP2, TOP, TMP2, lsl #2 | ||
1062 | vld1.32 {d1}, [TMP1] | ||
1063 | asr TMP3, X, #16 | ||
1064 | add X, X, UX | ||
1065 | add TMP3, TOP, TMP3, lsl #2 | ||
1066 | vld1.32 {d2}, [TMP2], STRIDE | ||
1067 | asr TMP4, X, #16 | ||
1068 | add X, X, UX | ||
1069 | add TMP4, TOP, TMP4, lsl #2 | ||
1070 | vld1.32 {d3}, [TMP2] | ||
1071 | vmull.u8 q2, d0, d28 | ||
1072 | vmull.u8 q3, d2, d28 | ||
1073 | vmlal.u8 q2, d1, d29 | ||
1074 | vmlal.u8 q3, d3, d29 | ||
1075 | vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS | ||
1076 | vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS | ||
1077 | vmlsl.u16 q0, d4, d30 | ||
1078 | vmlsl.u16 q1, d6, d31 | ||
1079 | vmlal.u16 q0, d5, d30 | ||
1080 | vmlal.u16 q1, d7, d31 | ||
1081 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
1082 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
1083 | vld1.32 {d2}, [TMP3], STRIDE | ||
1084 | vld1.32 {d3}, [TMP3] | ||
1085 | pld [TMP4, PF_OFFS] | ||
1086 | vld1.32 {d4}, [TMP4], STRIDE | ||
1087 | vld1.32 {d5}, [TMP4] | ||
1088 | pld [TMP4, PF_OFFS] | ||
1089 | vmull.u8 q3, d2, d28 | ||
1090 | vmlal.u8 q3, d3, d29 | ||
1091 | vmull.u8 q1, d4, d28 | ||
1092 | vmlal.u8 q1, d5, d29 | ||
1093 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
1094 | vld1.32 {d22[0]}, [MASK]! | ||
1095 | pld [MASK, #prefetch_offset] | ||
1096 | vadd.u16 q12, q12, q13 | ||
1097 | vmovn.u16 d16, q0 | ||
1098 | .endm | 1212 | .endm |
1099 | 1213 | ||
1100 | .macro bilinear_over_8888_8_8888_process_pixblock_tail | 1214 | .macro bilinear_over_8888_8_8888_process_pixblock_tail |
1101 | vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS | ||
1102 | vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS | ||
1103 | vmlsl.u16 q9, d6, d30 | ||
1104 | vmlsl.u16 q10, d2, d31 | ||
1105 | vmlal.u16 q9, d7, d30 | ||
1106 | vmlal.u16 q10, d3, d31 | ||
1107 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
1108 | vadd.u16 q12, q12, q13 | ||
1109 | vdup.32 d22, d22[0] | ||
1110 | vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
1111 | vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
1112 | vmovn.u16 d17, q9 | ||
1113 | vld1.32 {d18, d19}, [OUT, :128] | ||
1114 | pld [OUT, PF_OFFS] | ||
1115 | vuzp.8 d16, d17 | ||
1116 | vuzp.8 d18, d19 | ||
1117 | vuzp.8 d16, d17 | ||
1118 | vuzp.8 d18, d19 | ||
1119 | vmull.u8 q10, d16, d22 | ||
1120 | vmull.u8 q11, d17, d22 | ||
1121 | vrsra.u16 q10, q10, #8 | ||
1122 | vrsra.u16 q11, q11, #8 | ||
1123 | vrshrn.u16 d16, q10, #8 | ||
1124 | vrshrn.u16 d17, q11, #8 | ||
1125 | vdup.32 d22, d17[1] | ||
1126 | vmvn.8 d22, d22 | ||
1127 | vmull.u8 q10, d18, d22 | ||
1128 | vmull.u8 q11, d19, d22 | ||
1129 | vrshr.u16 q9, q10, #8 | ||
1130 | vrshr.u16 q0, q11, #8 | ||
1131 | vraddhn.u16 d18, q9, q10 | ||
1132 | vraddhn.u16 d19, q0, q11 | ||
1133 | vqadd.u8 q9, q8, q9 | ||
1134 | vuzp.8 d18, d19 | ||
1135 | vuzp.8 d18, d19 | ||
1136 | vst1.32 {d18, d19}, [OUT, :128]! | ||
1137 | .endm | 1215 | .endm |
1138 | 1216 | ||
1139 | .macro bilinear_over_8888_8_8888_process_pixblock_tail_head | 1217 | .macro bilinear_over_8888_8_8888_process_pixblock_tail_head |
1140 | vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS | 1218 | bilinear_over_8888_8_8888_process_pixblock_tail |
1141 | asr TMP1, X, #16 | 1219 | bilinear_over_8888_8_8888_process_pixblock_head |
1142 | add X, X, UX | ||
1143 | add TMP1, TOP, TMP1, lsl #2 | ||
1144 | vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS | ||
1145 | vld1.32 {d0}, [TMP1], STRIDE | ||
1146 | asr TMP2, X, #16 | ||
1147 | add X, X, UX | ||
1148 | add TMP2, TOP, TMP2, lsl #2 | ||
1149 | vmlsl.u16 q9, d6, d30 | ||
1150 | vmlsl.u16 q10, d2, d31 | ||
1151 | vld1.32 {d1}, [TMP1] | ||
1152 | asr TMP3, X, #16 | ||
1153 | add X, X, UX | ||
1154 | add TMP3, TOP, TMP3, lsl #2 | ||
1155 | vmlal.u16 q9, d7, d30 | ||
1156 | vmlal.u16 q10, d3, d31 | ||
1157 | vld1.32 {d2}, [TMP2], STRIDE | ||
1158 | asr TMP4, X, #16 | ||
1159 | add X, X, UX | ||
1160 | add TMP4, TOP, TMP4, lsl #2 | ||
1161 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
1162 | vadd.u16 q12, q12, q13 | ||
1163 | vld1.32 {d3}, [TMP2] | ||
1164 | vdup.32 d22, d22[0] | ||
1165 | vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
1166 | vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
1167 | vmull.u8 q2, d0, d28 | ||
1168 | vmull.u8 q3, d2, d28 | ||
1169 | vmovn.u16 d17, q9 | ||
1170 | vld1.32 {d18, d19}, [OUT, :128] | ||
1171 | pld [OUT, #(prefetch_offset * 4)] | ||
1172 | vmlal.u8 q2, d1, d29 | ||
1173 | vmlal.u8 q3, d3, d29 | ||
1174 | vuzp.8 d16, d17 | ||
1175 | vuzp.8 d18, d19 | ||
1176 | vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS | ||
1177 | vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS | ||
1178 | vuzp.8 d16, d17 | ||
1179 | vuzp.8 d18, d19 | ||
1180 | vmlsl.u16 q0, d4, d30 | ||
1181 | vmlsl.u16 q1, d6, d31 | ||
1182 | vmull.u8 q10, d16, d22 | ||
1183 | vmull.u8 q11, d17, d22 | ||
1184 | vmlal.u16 q0, d5, d30 | ||
1185 | vmlal.u16 q1, d7, d31 | ||
1186 | vrsra.u16 q10, q10, #8 | ||
1187 | vrsra.u16 q11, q11, #8 | ||
1188 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
1189 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
1190 | vrshrn.u16 d16, q10, #8 | ||
1191 | vrshrn.u16 d17, q11, #8 | ||
1192 | vld1.32 {d2}, [TMP3], STRIDE | ||
1193 | vdup.32 d22, d17[1] | ||
1194 | vld1.32 {d3}, [TMP3] | ||
1195 | vmvn.8 d22, d22 | ||
1196 | pld [TMP4, PF_OFFS] | ||
1197 | vld1.32 {d4}, [TMP4], STRIDE | ||
1198 | vmull.u8 q10, d18, d22 | ||
1199 | vmull.u8 q11, d19, d22 | ||
1200 | vld1.32 {d5}, [TMP4] | ||
1201 | pld [TMP4, PF_OFFS] | ||
1202 | vmull.u8 q3, d2, d28 | ||
1203 | vrshr.u16 q9, q10, #8 | ||
1204 | vrshr.u16 q15, q11, #8 | ||
1205 | vmlal.u8 q3, d3, d29 | ||
1206 | vmull.u8 q1, d4, d28 | ||
1207 | vraddhn.u16 d18, q9, q10 | ||
1208 | vraddhn.u16 d19, q15, q11 | ||
1209 | vmlal.u8 q1, d5, d29 | ||
1210 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
1211 | vqadd.u8 q9, q8, q9 | ||
1212 | vld1.32 {d22[0]}, [MASK]! | ||
1213 | vuzp.8 d18, d19 | ||
1214 | vadd.u16 q12, q12, q13 | ||
1215 | vuzp.8 d18, d19 | ||
1216 | vmovn.u16 d16, q0 | ||
1217 | vst1.32 {d18, d19}, [OUT, :128]! | ||
1218 | .endm | 1220 | .endm |
1219 | 1221 | ||
1220 | /* add_8888_8888 */ | 1222 | /* add_8888_8888 */ |
1221 | .macro bilinear_add_8888_8888_process_last_pixel | 1223 | .macro bilinear_add_8888_8888_process_last_pixel |
1222 | bilinear_interpolate_last_pixel 8888, x, 8888, add | 1224 | bilinear_interpolate_last_pixel 8888, x, 8888, add |
@@ -1225,11 +1227,12 @@ pixman_asm_function fname | |||
1225 | .macro bilinear_add_8888_8888_process_two_pixels | 1227 | .macro bilinear_add_8888_8888_process_two_pixels |
1226 | bilinear_interpolate_two_pixels 8888, x, 8888, add | 1228 | bilinear_interpolate_two_pixels 8888, x, 8888, add |
1227 | .endm | 1229 | .endm |
1228 | 1230 | ||
1229 | .macro bilinear_add_8888_8888_process_four_pixels | 1231 | .macro bilinear_add_8888_8888_process_four_pixels |
1230 | bilinear_interpolate_four_pixels 8888, x, 8888, add | 1232 | bilinear_interpolate_two_pixels 8888, x, 8888, add |
1233 | bilinear_interpolate_two_pixels 8888, x, 8888, add | ||
1231 | .endm | 1234 | .endm |
1232 | 1235 | ||
1233 | .macro bilinear_add_8888_8888_process_pixblock_head | 1236 | .macro bilinear_add_8888_8888_process_pixblock_head |
1234 | bilinear_add_8888_8888_process_four_pixels | 1237 | bilinear_add_8888_8888_process_four_pixels |
1235 | .endm | 1238 | .endm |
diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S index 059b285..2b08766 100644 --- a/pixman/pixman-arma64-neon-asm.S +++ b/pixman/pixman-arma64-neon-asm.S | |||
@@ -37,23 +37,19 @@ | |||
37 | /* Prevent the stack from becoming executable for no reason... */ | 37 | /* Prevent the stack from becoming executable for no reason... */ |
38 | #if defined(__linux__) && defined(__ELF__) | 38 | #if defined(__linux__) && defined(__ELF__) |
39 | .section .note.GNU-stack,"",%progbits | 39 | .section .note.GNU-stack,"",%progbits |
40 | #endif | 40 | #endif |
41 | 41 | ||
42 | .text | 42 | .text |
43 | .fpu neon | 43 | .arch armv8-a |
44 | .arch armv7a | 44 | |
45 | .object_arch armv4 | 45 | .altmacro |
46 | .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ | 46 | .p2align 2 |
47 | .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ | ||
48 | .arm | ||
49 | .altmacro | ||
50 | .p2align 2 | ||
51 | 47 | ||
52 | #include "pixman-private.h" | 48 | #include "pixman-private.h" |
53 | #include "pixman-arm-asm.h" | 49 | #include "pixman-arm-asm.h" |
54 | #include "pixman-arm-neon-asm.h" | 50 | #include "pixman-arma64-neon-asm.h" |
55 | 51 | ||
56 | /* Global configuration options and preferences */ | 52 | /* Global configuration options and preferences */ |
57 | 53 | ||
58 | /* | 54 | /* |
59 | * The code can optionally make use of unaligned memory accesses to improve | 55 | * The code can optionally make use of unaligned memory accesses to improve |
@@ -78,11 +74,11 @@ | |||
78 | * instructions do not add (many) extra cycles, but improve prefetch efficiency) | 74 | * instructions do not add (many) extra cycles, but improve prefetch efficiency) |
79 | * | 75 | * |
80 | * Note: some types of function can't support advanced prefetch and fall back | 76 | * Note: some types of function can't support advanced prefetch and fall back |
81 | * to the simple one (those which handle 24bpp pixels) | 77 | * to the simple one (those which handle 24bpp pixels) |
82 | */ | 78 | */ |
83 | .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED | 79 | .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_NONE |
84 | 80 | ||
85 | /* Prefetch distance in pixels for simple prefetch */ | 81 | /* Prefetch distance in pixels for simple prefetch */ |
86 | .set PREFETCH_DISTANCE_SIMPLE, 64 | 82 | .set PREFETCH_DISTANCE_SIMPLE, 64 |
87 | 83 | ||
88 | /* | 84 | /* |
@@ -99,94 +95,102 @@ | |||
99 | * code for memory reading and writing (including quite tricky cases of | 95 | * code for memory reading and writing (including quite tricky cases of |
100 | * handling unaligned leading/trailing pixels), so we only need to deal with | 96 | * handling unaligned leading/trailing pixels), so we only need to deal with |
101 | * the data in NEON registers. | 97 | * the data in NEON registers. |
102 | * | 98 | * |
103 | * NEON register allocation in general is recommended to be the following: | 99 | * NEON register allocation in general is recommended to be the following: |
104 | * d0, d1, d2, d3 - contain loaded source pixel data | 100 | * v0, v1, v2, v3 - contain loaded source pixel data |
105 | * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) | 101 | * v4, v5, v6, v7 - contain loaded destination pixels (if they are needed) |
106 | * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used) | 102 | * v24, v25, v26, v27 - contain loaded mask pixel data (if mask is used) |
107 | * d28, d29, d30, d31 - place for storing the result (destination pixels) | 103 | * v28, v29, v30, v31 - place for storing the result (destination pixels) |
108 | * | 104 | * |
109 | * As can be seen above, four 64-bit NEON registers are used for keeping | 105 | * As can be seen above, four 64-bit NEON registers are used for keeping |
110 | * intermediate pixel data and up to 8 pixels can be processed in one step | 106 | * intermediate pixel data and up to 8 pixels can be processed in one step |
111 | * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). | 107 | * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). |
112 | * | 108 | * |
113 | * This particular function uses the following register allocation: | 109 | * This particular function uses the following register allocation: |
114 | * d0, d1, d2, d3 - contain loaded source pixel data | 110 | * v0, v1, v2, v3 - contain loaded source pixel data |
115 | * d4, d5 - contain loaded destination pixels (they are needed) | 111 | * v4, v5 - contain loaded destination pixels (they are needed) |
116 | * d28, d29 - place for storing the result (destination pixels) | 112 | * v28, v29 - place for storing the result (destination pixels) |
117 | */ | 113 | */ |
118 | 114 | ||
119 | /* | 115 | /* |
120 | * Step one. We need to have some code to do some arithmetic on pixel data. | 116 | * Step one. We need to have some code to do some arithmetic on pixel data. |
121 | * This is implemented as a pair of macros: '*_head' and '*_tail'. When used | 117 | * This is implemented as a pair of macros: '*_head' and '*_tail'. When used |
122 | * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, | 118 | * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5}, |
123 | * perform all the needed calculations and write the result to {d28, d29}. | 119 | * perform all the needed calculations and write the result to {v28, v29}. |
124 | * The rationale for having two macros and not just one will be explained | 120 | * The rationale for having two macros and not just one will be explained |
125 | * later. In practice, any single monolithic function which does the work can | 121 | * later. In practice, any single monolithic function which does the work can |
126 | * be split into two parts in any arbitrary way without affecting correctness. | 122 | * be split into two parts in any arbitrary way without affecting correctness. |
127 | * | 123 | * |
128 | * There is one special trick here too. The common template macro can optionally | 124 | * There is one special trick here too. The common template macro can optionally |
129 | * make our life a bit easier by doing R, G, B, A color components | 125 | * make our life a bit easier by doing R, G, B, A color components |
130 | * deinterleaving for 32bpp pixel formats (and this feature is used in | 126 | * deinterleaving for 32bpp pixel formats (and this feature is used in |
131 | * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that | 127 | * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that |
132 | * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we | 128 | * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we |
133 | * actually use d0 register for blue channel (a vector of eight 8-bit | 129 | * actually use v0 register for blue channel (a vector of eight 8-bit |
134 | * values), d1 register for green, d2 for red and d3 for alpha. This | 130 | * values), v1 register for green, v2 for red and v3 for alpha. This |
135 | * simple conversion can be also done with a few NEON instructions: | 131 | * simple conversion can be also done with a few NEON instructions: |
136 | * | 132 | * |
137 | * Packed to planar conversion: | 133 | * Packed to planar conversion: // vuzp8 is a wrapper macro |
138 | * vuzp.8 d0, d1 | 134 | * vuzp8 v0, v1 |
139 | * vuzp.8 d2, d3 | 135 | * vuzp8 v2, v3 |
140 | * vuzp.8 d1, d3 | 136 | * vuzp8 v1, v3 |
141 | * vuzp.8 d0, d2 | 137 | * vuzp8 v0, v2 |
142 | * | 138 | * |
143 | * Planar to packed conversion: | 139 | * Planar to packed conversion: // vzip8 is a wrapper macro |
144 | * vzip.8 d0, d2 | 140 | * vzip8 v0, v2 |
145 | * vzip.8 d1, d3 | 141 | * vzip8 v1, v3 |
146 | * vzip.8 d2, d3 | 142 | * vzip8 v2, v3 |
147 | * vzip.8 d0, d1 | 143 | * vzip8 v0, v1 |
148 | * | 144 | * |
149 | * But pixels can be loaded directly in planar format using the VLD4.8 NEON | 145 | * But pixels can be loaded directly in planar format using the LD4 (.8b) NEON |
150 | * instruction. It is 1 cycle slower than VLD1.32, so this is not always | 146 | * instruction. It is 1 cycle slower than LD1 (.2s), so this is not always |
151 | * desirable, that's why deinterleaving is optional. | 147 | * desirable, that's why deinterleaving is optional. |
152 | * | 148 | * |
153 | * But anyway, here is the code: | 149 | * But anyway, here is the code: |
154 | */ | 150 | */ |
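[editor's note] A C intrinsics rendering of the packed-to-planar sequence above may help when cross-checking the a64 port; vuzp8 in the comment is assumed to be a wrapper behaving like A32 VUZP.8, i.e. updating both operands:

    #include <arm_neon.h>

    /* In-place packed BGRA -> planar B, G, R, A, mirroring the four
     * vuzp8 steps from the comment above. */
    static void packed_to_planar(uint8x8_t v[4])
    {
        uint8x8x2_t t;
        t = vuzp_u8(v[0], v[1]); v[0] = t.val[0]; v[1] = t.val[1];
        t = vuzp_u8(v[2], v[3]); v[2] = t.val[0]; v[3] = t.val[1];
        t = vuzp_u8(v[1], v[3]); v[1] = t.val[0]; v[3] = t.val[1];
        t = vuzp_u8(v[0], v[2]); v[0] = t.val[0]; v[2] = t.val[1];
        /* now v[0] = blue, v[1] = green, v[2] = red, v[3] = alpha */
    }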
151 | |||
155 | .macro pixman_composite_over_8888_0565_process_pixblock_head | 152 | .macro pixman_composite_over_8888_0565_process_pixblock_head |
156 | /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format | 153 | /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format |
157 | and put data into d6 - red, d7 - green, d30 - blue */ | 154 | and put data into v6 - red, v7 - green, v30 - blue */ |
158 | vshrn.u16 d6, q2, #8 | 155 | mov v4.d[1], v5.d[0] |
159 | vshrn.u16 d7, q2, #3 | 156 | shrn v6.8b, v4.8h, #8 |
160 | vsli.u16 q2, q2, #5 | 157 | shrn v7.8b, v4.8h, #3 |
161 | vsri.u8 d6, d6, #5 | 158 | sli v4.8h, v4.8h, #5 |
162 | vmvn.8 d3, d3 /* invert source alpha */ | 159 | sri v6.8b, v6.8b, #5 |
163 | vsri.u8 d7, d7, #6 | 160 | mvn v3.8b, v3.8b /* invert source alpha */ |
164 | vshrn.u16 d30, q2, #2 | 161 | sri v7.8b, v7.8b, #6 |
162 | shrn v30.8b, v4.8h, #2 | ||
165 | /* now do alpha blending, storing results in 8-bit planar format | 163 | /* now do alpha blending, storing results in 8-bit planar format |
166 | into d16 - red, d19 - green, d18 - blue */ | 164 | into v20 - red, v23 - green, v22 - blue */ |
167 | vmull.u8 q10, d3, d6 | 165 | umull v10.8h, v3.8b, v6.8b |
168 | vmull.u8 q11, d3, d7 | 166 | umull v11.8h, v3.8b, v7.8b |
169 | vmull.u8 q12, d3, d30 | 167 | umull v12.8h, v3.8b, v30.8b |
170 | vrshr.u16 q13, q10, #8 | 168 | urshr v17.8h, v10.8h, #8 |
171 | vrshr.u16 q3, q11, #8 | 169 | urshr v18.8h, v11.8h, #8 |
172 | vrshr.u16 q15, q12, #8 | 170 | urshr v19.8h, v12.8h, #8 |
173 | vraddhn.u16 d20, q10, q13 | 171 | raddhn v20.8b, v10.8h, v17.8h |
174 | vraddhn.u16 d23, q11, q3 | 172 | raddhn v23.8b, v11.8h, v18.8h |
175 | vraddhn.u16 d22, q12, q15 | 173 | raddhn v22.8b, v12.8h, v19.8h |
176 | .endm | 174 | .endm |
177 | 175 | ||
178 | .macro pixman_composite_over_8888_0565_process_pixblock_tail | 176 | .macro pixman_composite_over_8888_0565_process_pixblock_tail |
179 | /* ... continue alpha blending */ | 177 | /* ... continue alpha blending */ |
180 | vqadd.u8 d16, d2, d20 | 178 | uqadd v17.8b, v2.8b, v20.8b |
181 | vqadd.u8 q9, q0, q11 | 179 | uqadd v18.8b, v0.8b, v22.8b |
182 | /* convert the result to r5g6b5 and store it into {d28, d29} */ | 180 | uqadd v19.8b, v1.8b, v23.8b |
183 | vshll.u8 q14, d16, #8 | 181 | /* convert the result to r5g6b5 and store it into {v14} */ |
184 | vshll.u8 q8, d19, #8 | 182 | ushll v14.8h, v17.8b, #7 |
185 | vshll.u8 q9, d18, #8 | 183 | sli v14.8h, v14.8h, #1 |
186 | vsri.u16 q14, q8, #5 | 184 | ushll v8.8h, v19.8b, #7 |
187 | vsri.u16 q14, q9, #11 | 185 | sli v8.8h, v8.8h, #1 |
186 | ushll v9.8h, v18.8b, #7 | ||
187 | sli v9.8h, v9.8h, #1 | ||
188 | sri v14.8h, v8.8h, #5 | ||
189 | sri v14.8h, v9.8h, #11 | ||
190 | mov v28.d[0], v14.d[0] | ||
191 | mov v29.d[0], v14.d[1] | ||
188 | .endm | 192 | .endm |
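[editor's note] The A32 code used vshll.u8 #8 as a widening move; A64 USHLL only takes immediates below the element width, so the port builds the same shift-by-8 from ushll #7 followed by sli #1 (the dedicated SHLL-by-element-size form would also work). The whole planar-to-r5g6b5 pack, sketched with intrinsics (helper names are illustrative):

    #include <arm_neon.h>

    static uint16x8_t widen_shl8(uint8x8_t x)   /* ushll #7 + sli #1 */
    {
        uint16x8_t t = vshll_n_u8(x, 7);
        return vsliq_n_u16(t, t, 1);            /* == (uint16_t)x << 8 */
    }

    static uint16x8_t pack_0565(uint8x8_t r, uint8x8_t g, uint8x8_t b)
    {
        uint16x8_t out = widen_shl8(r);             /* red, bits 15:11 */
        out = vsriq_n_u16(out, widen_shl8(g), 5);   /* green, bits 10:5 */
        out = vsriq_n_u16(out, widen_shl8(b), 11);  /* blue, bits 4:0 */
        return out;
    }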
189 | 193 | ||
190 | /* | 194 | /* |
191 | * OK, now we got almost everything that we need. Using the above two | 195 | * OK, now we got almost everything that we need. Using the above two |
192 | * macros, the work can be done right. But now we want to optimize | 196 | * macros, the work can be done right. But now we want to optimize |
@@ -209,13 +213,13 @@ | |||
209 | * So what we need now is a '*_tail_head' macro, which will be used | 213 | * So what we need now is a '*_tail_head' macro, which will be used |
210 | * in the core main loop. A trivial straightforward implementation | 214 | * in the core main loop. A trivial straightforward implementation |
211 | * of this macro would look like this: | 215 | * of this macro would look like this: |
212 | * | 216 | * |
213 | * pixman_composite_over_8888_0565_process_pixblock_tail | 217 | * pixman_composite_over_8888_0565_process_pixblock_tail |
214 | * vst1.16 {d28, d29}, [DST_W, :128]! | 218 | * st1 {v28.4h, v29.4h}, [DST_W], #16 |
215 | * vld1.16 {d4, d5}, [DST_R, :128]! | 219 | * ld1 {v4.4h, v5.4h}, [DST_R], #16 |
216 | * vld4.32 {d0, d1, d2, d3}, [SRC]! | 220 | * ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32 |
217 | * pixman_composite_over_8888_0565_process_pixblock_head | 221 | * pixman_composite_over_8888_0565_process_pixblock_head |
218 | * cache_preload 8, 8 | 222 | * cache_preload 8, 8 |
219 | * | 223 | * |
220 | * Now it also got some VLD/VST instructions. We simply can't move from | 224 | * Now it also got some VLD/VST instructions. We simply can't move from |
221 | * processing one block of pixels to the other one with just arithmetics. | 225 | * processing one block of pixels to the other one with just arithmetics. |
@@ -242,64 +246,89 @@ | |||
242 | * | 246 | * |
243 | * Now after all the explanations, here is the optimized code. | 247 | * Now after all the explanations, here is the optimized code. |
244 | * Different instruction streams (originaling from '*_head', '*_tail' | 248 | * Different instruction streams (originaling from '*_head', '*_tail' |
245 | * and 'cache_preload' macro) use different indentation levels for | 249 | * and 'cache_preload' macro) use different indentation levels for |
246 | * better readability. Actually taking the code from one of these | 250 | * better readability. Actually taking the code from one of these |
247 | * indentation levels and ignoring a few VLD/VST instructions would | 251 | * indentation levels and ignoring a few LD/ST instructions would |
248 | * result in exactly the code from '*_head', '*_tail' or 'cache_preload' | 252 | * result in exactly the code from '*_head', '*_tail' or 'cache_preload' |
249 | * macro! | 253 | * macro! |
250 | */ | 254 | */ |
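[editor's note] The comment above describes classic software pipelining. In C terms, the loop the template generates has this shape (the function names are illustrative stand-ins for the emitted assembly, not pixman symbols):

    static void head(void)      { /* start work on a block */ }
    static void tail(void)      { /* retire a block        */ }
    static void tail_head(void) { /* retire one block while starting the next */ }

    static void main_loop(int nblocks)
    {
        head();                          /* block 0 in flight */
        for (int i = 1; i < nblocks; i++)
            tail_head();                 /* finish block i-1, start block i */
        tail();                          /* finish the last block */
    }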
251 | 255 | ||
252 | #if 1 | 256 | #if 1 |
253 | 257 | ||
254 | .macro pixman_composite_over_8888_0565_process_pixblock_tail_head | 258 | .macro pixman_composite_over_8888_0565_process_pixblock_tail_head |
255 | vqadd.u8 d16, d2, d20 | 259 | uqadd v17.8b, v2.8b, v20.8b |
256 | vld1.16 {d4, d5}, [DST_R, :128]! | 260 | ld1 {v4.4h, v5.4h}, [DST_R], #16 |
257 | vqadd.u8 q9, q0, q11 | 261 | mov v4.d[1], v5.d[0] |
258 | vshrn.u16 d6, q2, #8 | 262 | uqadd v18.8b, v0.8b, v22.8b |
263 | uqadd v19.8b, v1.8b, v23.8b | ||
264 | shrn v6.8b, v4.8h, #8 | ||
259 | fetch_src_pixblock | 265 | fetch_src_pixblock |
260 | vshrn.u16 d7, q2, #3 | 266 | shrn v7.8b, v4.8h, #3 |
261 | vsli.u16 q2, q2, #5 | 267 | sli v4.8h, v4.8h, #5 |
262 | vshll.u8 q14, d16, #8 | 268 | ushll v14.8h, v17.8b, #7 |
269 | sli v14.8h, v14.8h, #1 | ||
263 | PF add PF_X, PF_X, #8 | 270 | PF add PF_X, PF_X, #8 |
264 | vshll.u8 q8, d19, #8 | 271 | ushll v8.8h, v19.8b, #7 |
272 | sli v8.8h, v8.8h, #1 | ||
265 | PF tst PF_CTL, #0xF | 273 | PF tst PF_CTL, #0xF |
266 | vsri.u8 d6, d6, #5 | 274 | sri v6.8b, v6.8b, #5 |
267 | PF addne PF_X, PF_X, #8 | 275 | PF beq 10f |
268 | vmvn.8 d3, d3 | 276 | PF add PF_X, PF_X, #8 |
269 | PF subne PF_CTL, PF_CTL, #1 | 277 | 10: |
270 | vsri.u8 d7, d7, #6 | 278 | mvn v3.8b, v3.8b |
271 | vshrn.u16 d30, q2, #2 | 279 | PF beq 10f |
272 | vmull.u8 q10, d3, d6 | 280 | PF sub PF_CTL, PF_CTL, #1 |
273 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] | 281 | 10: |
274 | vmull.u8 q11, d3, d7 | 282 | sri v7.8b, v7.8b, #6 |
275 | vmull.u8 q12, d3, d30 | 283 | shrn v30.8b, v4.8h, #2 |
276 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] | 284 | umull v10.8h, v3.8b, v6.8b |
277 | vsri.u16 q14, q8, #5 | 285 | PF lsl DUMMY, PF_X, #src_bpp_shift |
286 | PF prfm pldl2strm, [PF_SRC, DUMMY] | ||
287 | umull v11.8h, v3.8b, v7.8b | ||
288 | umull v12.8h, v3.8b, v30.8b | ||
289 | PF lsl DUMMY, PF_X, #dst_bpp_shift | ||
290 | PF prfm pldl2strm, [PF_DST, DUMMY] | ||
291 | sri v14.8h, v8.8h, #5 | ||
278 | PF cmp PF_X, ORIG_W | 292 | PF cmp PF_X, ORIG_W |
279 | vshll.u8 q9, d18, #8 | 293 | ushll v9.8h, v18.8b, #7 |
280 | vrshr.u16 q13, q10, #8 | 294 | sli v9.8h, v9.8h, #1 |
281 | PF subge PF_X, PF_X, ORIG_W | 295 | urshr v17.8h, v10.8h, #8 |
282 | vrshr.u16 q3, q11, #8 | 296 | PF ble 10f |
283 | vrshr.u16 q15, q12, #8 | 297 | PF sub PF_X, PF_X, ORIG_W |
284 | PF subges PF_CTL, PF_CTL, #0x10 | 298 | 10: |
285 | vsri.u16 q14, q9, #11 | 299 | urshr v19.8h, v11.8h, #8 |
286 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! | 300 | urshr v18.8h, v12.8h, #8 |
287 | vraddhn.u16 d20, q10, q13 | 301 | PF ble 10f |
288 | vraddhn.u16 d23, q11, q3 | 302 | PF subs PF_CTL, PF_CTL, #0x10 |
289 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! | 303 | 10: |
290 | vraddhn.u16 d22, q12, q15 | 304 | sri v14.8h, v9.8h, #11 |
291 | vst1.16 {d28, d29}, [DST_W, :128]! | 305 | mov v28.d[0], v14.d[0] |
306 | mov v29.d[0], v14.d[1] | ||
307 | PF ble 10f | ||
308 | PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift | ||
309 | PF ldrsb DUMMY, [PF_SRC, DUMMY] | ||
310 | PF add PF_SRC, PF_SRC, #1 | ||
311 | 10: | ||
312 | raddhn v20.8b, v10.8h, v17.8h | ||
313 | raddhn v23.8b, v11.8h, v19.8h | ||
314 | PF ble 10f | ||
315 | PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift | ||
316 | PF ldrsb DUMMY, [PF_DST, DUMMY] | ||
317 | PF add PF_DST, PF_DST, #1 | ||
318 | 10: | ||
319 | raddhn v22.8b, v12.8h, v18.8h | ||
320 | st1 {v14.8h}, [DST_W], #16 | ||
292 | .endm | 321 | .endm |
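[editor's note] Every conditionally executed A32 prefetch instruction (addne, subne, subge, ldrgeb) becomes a branch-around sequence in this port, since AArch64 dropped general conditional execution; the local label 10: is reused for each skip. The first pair, rendered as C with register names as variables (purely illustrative):

    #include <stdint.h>

    static void pf_tick(uint32_t *pf_x, uint32_t *pf_ctl)
    {
        if (*pf_ctl & 0xf) {   /* tst PF_CTL, #0xF ; beq 10f */
            *pf_x   += 8;      /* add PF_X, PF_X, #8         */
            *pf_ctl -= 1;      /* sub PF_CTL, PF_CTL, #1     */
        }                      /* 10:                        */
    }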
293 | 322 | ||
294 | #else | 323 | #else |
295 | 324 | ||
296 | /* If we did not care much about the performance, we would just use this... */ | 325 | /* If we did not care much about the performance, we would just use this... */ |
297 | .macro pixman_composite_over_8888_0565_process_pixblock_tail_head | 326 | .macro pixman_composite_over_8888_0565_process_pixblock_tail_head |
298 | pixman_composite_over_8888_0565_process_pixblock_tail | 327 | pixman_composite_over_8888_0565_process_pixblock_tail |
299 | vst1.16 {d28, d29}, [DST_W, :128]! | 328 | st1 {v14.8h}, [DST_W], #16 |
300 | vld1.16 {d4, d5}, [DST_R, :128]! | 329 | ld1 {v4.4h, v5.4h}, [DST_R], #16 |
301 | fetch_src_pixblock | 330 | fetch_src_pixblock |
302 | pixman_composite_over_8888_0565_process_pixblock_head | 331 | pixman_composite_over_8888_0565_process_pixblock_head |
303 | cache_preload 8, 8 | 332 | cache_preload 8, 8 |
304 | .endm | 333 | .endm |
305 | 334 | ||
@@ -350,60 +379,66 @@ generate_composite_function \ | |||
350 | 24 /* mask_basereg */ | 379 | 24 /* mask_basereg */ |
351 | 380 | ||
352 | /******************************************************************************/ | 381 | /******************************************************************************/ |
353 | 382 | ||
354 | .macro pixman_composite_over_n_0565_process_pixblock_head | 383 | .macro pixman_composite_over_n_0565_process_pixblock_head |
355 | /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format | 384 | /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format |
356 | and put data into d6 - red, d7 - green, d30 - blue */ | 385 | and put data into v6 - red, v7 - green, v30 - blue */ |
357 | vshrn.u16 d6, q2, #8 | 386 | mov v4.d[1], v5.d[0] |
358 | vshrn.u16 d7, q2, #3 | 387 | shrn v6.8b, v4.8h, #8 |
359 | vsli.u16 q2, q2, #5 | 388 | shrn v7.8b, v4.8h, #3 |
360 | vsri.u8 d6, d6, #5 | 389 | sli v4.8h, v4.8h, #5 |
361 | vsri.u8 d7, d7, #6 | 390 | sri v6.8b, v6.8b, #5 |
362 | vshrn.u16 d30, q2, #2 | 391 | sri v7.8b, v7.8b, #6 |
392 | shrn v30.8b, v4.8h, #2 | ||
363 | /* now do alpha blending, storing results in 8-bit planar format | 393 | /* now do alpha blending, storing results in 8-bit planar format |
364 | into d16 - red, d19 - green, d18 - blue */ | 394 | into v20 - red, v23 - green, v22 - blue */ |
365 | vmull.u8 q10, d3, d6 | 395 | umull v10.8h, v3.8b, v6.8b |
366 | vmull.u8 q11, d3, d7 | 396 | umull v11.8h, v3.8b, v7.8b |
367 | vmull.u8 q12, d3, d30 | 397 | umull v12.8h, v3.8b, v30.8b |
368 | vrshr.u16 q13, q10, #8 | 398 | urshr v13.8h, v10.8h, #8 |
369 | vrshr.u16 q3, q11, #8 | 399 | urshr v14.8h, v11.8h, #8 |
370 | vrshr.u16 q15, q12, #8 | 400 | urshr v15.8h, v12.8h, #8 |
371 | vraddhn.u16 d20, q10, q13 | 401 | raddhn v20.8b, v10.8h, v13.8h |
372 | vraddhn.u16 d23, q11, q3 | 402 | raddhn v23.8b, v11.8h, v14.8h |
373 | vraddhn.u16 d22, q12, q15 | 403 | raddhn v22.8b, v12.8h, v15.8h |
374 | .endm | 404 | .endm |
375 | 405 | ||
376 | .macro pixman_composite_over_n_0565_process_pixblock_tail | 406 | .macro pixman_composite_over_n_0565_process_pixblock_tail |
377 | /* ... continue alpha blending */ | 407 | /* ... continue alpha blending */ |
378 | vqadd.u8 d16, d2, d20 | 408 | uqadd v17.8b, v2.8b, v20.8b |
379 | vqadd.u8 q9, q0, q11 | 409 | uqadd v18.8b, v0.8b, v22.8b |
380 | /* convert the result to r5g6b5 and store it into {d28, d29} */ | 410 | uqadd v19.8b, v1.8b, v23.8b |
381 | vshll.u8 q14, d16, #8 | 411 | /* convert the result to r5g6b5 and store it into {v14} */ |
382 | vshll.u8 q8, d19, #8 | 412 | ushll v14.8h, v17.8b, #7 |
383 | vshll.u8 q9, d18, #8 | 413 | sli v14.8h, v14.8h, #1 |
384 | vsri.u16 q14, q8, #5 | 414 | ushll v8.8h, v19.8b, #7 |
385 | vsri.u16 q14, q9, #11 | 415 | sli v8.8h, v8.8h, #1 |
416 | ushll v9.8h, v18.8b, #7 | ||
417 | sli v9.8h, v9.8h, #1 | ||
418 | sri v14.8h, v8.8h, #5 | ||
419 | sri v14.8h, v9.8h, #11 | ||
420 | mov v28.d[0], v14.d[0] | ||
421 | mov v29.d[0], v14.d[1] | ||
386 | .endm | 422 | .endm |
387 | 423 | ||
388 | /* TODO: expand macros and do better instructions scheduling */ | 424 | /* TODO: expand macros and do better instructions scheduling */ |
389 | .macro pixman_composite_over_n_0565_process_pixblock_tail_head | 425 | .macro pixman_composite_over_n_0565_process_pixblock_tail_head |
390 | pixman_composite_over_n_0565_process_pixblock_tail | 426 | pixman_composite_over_n_0565_process_pixblock_tail |
391 | vld1.16 {d4, d5}, [DST_R, :128]! | 427 | ld1 {v4.4h, v5.4h}, [DST_R], #16 |
392 | vst1.16 {d28, d29}, [DST_W, :128]! | 428 | st1 {v14.8h}, [DST_W], #16 |
393 | pixman_composite_over_n_0565_process_pixblock_head | 429 | pixman_composite_over_n_0565_process_pixblock_head |
394 | cache_preload 8, 8 | 430 | cache_preload 8, 8 |
395 | .endm | 431 | .endm |
396 | 432 | ||
397 | .macro pixman_composite_over_n_0565_init | 433 | .macro pixman_composite_over_n_0565_init |
398 | add DUMMY, sp, #ARGS_STACK_OFFSET | 434 | mov v3.s[0], w4 |
399 | vld1.32 {d3[0]}, [DUMMY] | 435 | dup v0.8b, v3.b[0] |
400 | vdup.8 d0, d3[0] | 436 | dup v1.8b, v3.b[1] |
401 | vdup.8 d1, d3[1] | 437 | dup v2.8b, v3.b[2] |
402 | vdup.8 d2, d3[2] | 438 | dup v3.8b, v3.b[3] |
403 | vdup.8 d3, d3[3] | 439 | mvn v3.8b, v3.8b /* invert source alpha */ |
404 | vmvn.8 d3, d3 /* invert source alpha */ | ||
405 | .endm | 440 | .endm |
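[editor's note] Under the AAPCS64 calling convention the first integer arguments arrive in x0-x7, which is why the rewritten init macro reads the solid source color straight from w4 instead of reloading it from the stack. The channel splat it performs, as an intrinsics sketch (names are illustrative):

    #include <arm_neon.h>

    static void splat_solid_src(uint32_t src, uint8x8_t ch[4])
    {
        uint8x8_t s = vreinterpret_u8_u32(vdup_n_u32(src));
        ch[0] = vdup_lane_u8(s, 0);            /* blue           */
        ch[1] = vdup_lane_u8(s, 1);            /* green          */
        ch[2] = vdup_lane_u8(s, 2);            /* red            */
        ch[3] = vmvn_u8(vdup_lane_u8(s, 3));   /* inverted alpha */
    }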
406 | 441 | ||
407 | generate_composite_function \ | 442 | generate_composite_function \ |
408 | pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ | 443 | pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ |
409 | FLAG_DST_READWRITE, \ | 444 | FLAG_DST_READWRITE, \ |
@@ -420,37 +455,56 @@ generate_composite_function \ | |||
420 | 24 /* mask_basereg */ | 455 | 24 /* mask_basereg */ |
421 | 456 | ||
422 | /******************************************************************************/ | 457 | /******************************************************************************/ |
423 | 458 | ||
424 | .macro pixman_composite_src_8888_0565_process_pixblock_head | 459 | .macro pixman_composite_src_8888_0565_process_pixblock_head |
425 | vshll.u8 q8, d1, #8 | 460 | ushll v8.8h, v1.8b, #7 |
426 | vshll.u8 q14, d2, #8 | 461 | sli v8.8h, v8.8h, #1 |
427 | vshll.u8 q9, d0, #8 | 462 | ushll v14.8h, v2.8b, #7 |
463 | sli v14.8h, v14.8h, #1 | ||
464 | ushll v9.8h, v0.8b, #7 | ||
465 | sli v9.8h, v9.8h, #1 | ||
428 | .endm | 466 | .endm |
429 | 467 | ||
430 | .macro pixman_composite_src_8888_0565_process_pixblock_tail | 468 | .macro pixman_composite_src_8888_0565_process_pixblock_tail |
431 | vsri.u16 q14, q8, #5 | 469 | sri v14.8h, v8.8h, #5 |
432 | vsri.u16 q14, q9, #11 | 470 | sri v14.8h, v9.8h, #11 |
471 | mov v28.d[0], v14.d[0] | ||
472 | mov v29.d[0], v14.d[1] | ||
433 | .endm | 473 | .endm |
434 | 474 | ||
435 | .macro pixman_composite_src_8888_0565_process_pixblock_tail_head | 475 | .macro pixman_composite_src_8888_0565_process_pixblock_tail_head |
436 | vsri.u16 q14, q8, #5 | 476 | sri v14.8h, v8.8h, #5 |
437 | PF add PF_X, PF_X, #8 | 477 | PF add PF_X, PF_X, #8 |
438 | PF tst PF_CTL, #0xF | 478 | PF tst PF_CTL, #0xF |
439 | fetch_src_pixblock | 479 | fetch_src_pixblock |
440 | PF addne PF_X, PF_X, #8 | 480 | PF beq 10f |
441 | PF subne PF_CTL, PF_CTL, #1 | 481 | PF add PF_X, PF_X, #8 |
442 | vsri.u16 q14, q9, #11 | 482 | PF sub PF_CTL, PF_CTL, #1 |
483 | 10: | ||
484 | sri v14.8h, v9.8h, #11 | ||
485 | mov v28.d[0], v14.d[0] | ||
486 | mov v29.d[0], v14.d[1] | ||
443 | PF cmp PF_X, ORIG_W | 487 | PF cmp PF_X, ORIG_W |
444 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] | 488 | PF lsl DUMMY, PF_X, #src_bpp_shift |
445 | vshll.u8 q8, d1, #8 | 489 | PF prfm pldl2strm, [PF_SRC, DUMMY] |
446 | vst1.16 {d28, d29}, [DST_W, :128]! | 490 | ushll v8.8h, v1.8b, #7 |
447 | PF subge PF_X, PF_X, ORIG_W | 491 | sli v8.8h, v8.8h, #1 |
448 | PF subges PF_CTL, PF_CTL, #0x10 | 492 | st1 {v14.8h}, [DST_W], #16 |
449 | vshll.u8 q14, d2, #8 | 493 | PF ble 10f |
450 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! | 494 | PF sub PF_X, PF_X, ORIG_W |
451 | vshll.u8 q9, d0, #8 | 495 | PF subs PF_CTL, PF_CTL, #0x10 |
496 | 10: | ||
497 | ushll v14.8h, v2.8b, #7 | ||
498 | sli v14.8h, v14.8h, #1 | ||
499 | PF ble 10f | ||
500 | PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift | ||
501 | PF ldrsb DUMMY, [PF_SRC, DUMMY] | ||
502 | PF add PF_SRC, PF_SRC, #1 | ||
503 | 10: | ||
504 | ushll v9.8h, v0.8b, #7 | ||
505 | sli v9.8h, v9.8h, #1 | ||
452 | .endm | 506 | .endm |
453 | 507 | ||
454 | generate_composite_function \ | 508 | generate_composite_function \ |
455 | pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ | 509 | pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ |
456 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ | 510 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -463,26 +517,27 @@ generate_composite_function \ | |||
463 | pixman_composite_src_8888_0565_process_pixblock_tail_head | 517 | pixman_composite_src_8888_0565_process_pixblock_tail_head |
464 | 518 | ||
465 | /******************************************************************************/ | 519 | /******************************************************************************/ |
466 | 520 | ||
467 | .macro pixman_composite_src_0565_8888_process_pixblock_head | 521 | .macro pixman_composite_src_0565_8888_process_pixblock_head |
468 | vshrn.u16 d30, q0, #8 | 522 | mov v0.d[1], v1.d[0] |
469 | vshrn.u16 d29, q0, #3 | 523 | shrn v30.8b, v0.8h, #8 |
470 | vsli.u16 q0, q0, #5 | 524 | shrn v29.8b, v0.8h, #3 |
471 | vmov.u8 d31, #255 | 525 | sli v0.8h, v0.8h, #5 |
472 | vsri.u8 d30, d30, #5 | 526 | movi v31.8b, #255 |
473 | vsri.u8 d29, d29, #6 | 527 | sri v30.8b, v30.8b, #5 |
474 | vshrn.u16 d28, q0, #2 | 528 | sri v29.8b, v29.8b, #6 |
529 | shrn v28.8b, v0.8h, #2 | ||
475 | .endm | 530 | .endm |
476 | 531 | ||
477 | .macro pixman_composite_src_0565_8888_process_pixblock_tail | 532 | .macro pixman_composite_src_0565_8888_process_pixblock_tail |
478 | .endm | 533 | .endm |
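[editor's note] The head macro above is the inverse of the 0565 pack: shrn narrows each channel out of the 16-bit pixel and sri refills the low bits with a copy of the high ones, so 0x1f scales to 0xff exactly. An intrinsics sketch of the same unpack (helper name assumed):

    #include <arm_neon.h>

    static void unpack_0565(uint16x8_t in, uint8x8_t out[4])
    {
        uint8x8_t r = vshrn_n_u16(in, 8);       /* rrrrrggg -> red   */
        uint8x8_t g = vshrn_n_u16(in, 3);       /* ggggggbb -> green */
        uint16x8_t t = vsliq_n_u16(in, in, 5);  /* blue up, low bits kept */
        r = vsri_n_u8(r, r, 5);                 /* replicate top bits */
        g = vsri_n_u8(g, g, 6);
        out[0] = vshrn_n_u16(t, 2);             /* blue, self-replicated */
        out[1] = g;
        out[2] = r;
        out[3] = vdup_n_u8(255);                /* opaque alpha */
    }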
479 | 534 | ||
480 | /* TODO: expand macros and do better instructions scheduling */ | 535 | /* TODO: expand macros and do better instructions scheduling */ |
481 | .macro pixman_composite_src_0565_8888_process_pixblock_tail_head | 536 | .macro pixman_composite_src_0565_8888_process_pixblock_tail_head |
482 | pixman_composite_src_0565_8888_process_pixblock_tail | 537 | pixman_composite_src_0565_8888_process_pixblock_tail |
483 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 538 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
484 | fetch_src_pixblock | 539 | fetch_src_pixblock |
485 | pixman_composite_src_0565_8888_process_pixblock_head | 540 | pixman_composite_src_0565_8888_process_pixblock_head |
486 | cache_preload 8, 8 | 541 | cache_preload 8, 8 |
487 | .endm | 542 | .endm |
488 | 543 | ||
@@ -498,34 +553,50 @@ generate_composite_function \ | |||
498 | pixman_composite_src_0565_8888_process_pixblock_tail_head | 553 | pixman_composite_src_0565_8888_process_pixblock_tail_head |
499 | 554 | ||
500 | /******************************************************************************/ | 555 | /******************************************************************************/ |
501 | 556 | ||
502 | .macro pixman_composite_add_8_8_process_pixblock_head | 557 | .macro pixman_composite_add_8_8_process_pixblock_head |
503 | vqadd.u8 q14, q0, q2 | 558 | uqadd v28.8b, v0.8b, v4.8b |
504 | vqadd.u8 q15, q1, q3 | 559 | uqadd v29.8b, v1.8b, v5.8b |
560 | uqadd v30.8b, v2.8b, v6.8b | ||
561 | uqadd v31.8b, v3.8b, v7.8b | ||
505 | .endm | 562 | .endm |
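[editor's note] With no q-register aliasing on A64, the two vqadd.u8 on q-registers become four uqadd on 64-bit v-registers, but the operation is unchanged: a saturating byte-wise add. Intrinsics sketch:

    #include <arm_neon.h>

    /* One 8-byte chunk of the add_8_8 fast path: clamps at 255
     * instead of wrapping. */
    static uint8x8_t add_channels(uint8x8_t s, uint8x8_t d)
    {
        return vqadd_u8(s, d);
    }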
506 | 563 | ||
507 | .macro pixman_composite_add_8_8_process_pixblock_tail | 564 | .macro pixman_composite_add_8_8_process_pixblock_tail |
508 | .endm | 565 | .endm |
509 | 566 | ||
510 | .macro pixman_composite_add_8_8_process_pixblock_tail_head | 567 | .macro pixman_composite_add_8_8_process_pixblock_tail_head |
511 | fetch_src_pixblock | 568 | fetch_src_pixblock |
512 | PF add PF_X, PF_X, #32 | 569 | PF add PF_X, PF_X, #32 |
513 | PF tst PF_CTL, #0xF | 570 | PF tst PF_CTL, #0xF |
514 | vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! | 571 | ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
515 | PF addne PF_X, PF_X, #32 | 572 | PF beq 10f |
516 | PF subne PF_CTL, PF_CTL, #1 | 573 | PF add PF_X, PF_X, #32 |
517 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! | 574 | PF sub PF_CTL, PF_CTL, #1 |
575 | 10: | ||
576 | st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 | ||
518 | PF cmp PF_X, ORIG_W | 577 | PF cmp PF_X, ORIG_W |
519 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] | 578 | PF lsl DUMMY, PF_X, #src_bpp_shift |
520 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] | 579 | PF prfm pldl2strm, [PF_SRC, DUMMY] |
521 | PF subge PF_X, PF_X, ORIG_W | 580 | PF lsl DUMMY, PF_X, #dst_bpp_shift |
522 | PF subges PF_CTL, PF_CTL, #0x10 | 581 | PF prfm pldl2strm, [PF_DST, DUMMY] |
523 | vqadd.u8 q14, q0, q2 | 582 | PF ble 10f |
524 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! | 583 | PF sub PF_X, PF_X, ORIG_W |
525 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! | 584 | PF subs PF_CTL, PF_CTL, #0x10 |
526 | vqadd.u8 q15, q1, q3 | 585 | 10: |
586 | uqadd v28.8b, v0.8b, v4.8b | ||
587 | PF ble 10f | ||
588 | PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift | ||
589 | PF ldrsb DUMMY, [PF_SRC, DUMMY] | ||
590 | PF add PF_SRC, PF_SRC, #1 | ||
591 | PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift | ||
592 | PF ldrsb DUMMY, [PF_DST, DUMMY] | ||
593 | PF add PF_DST, PF_DST, #1 | ||
594 | 10: | ||
595 | uqadd v29.8b, v1.8b, v5.8b | ||
596 | uqadd v30.8b, v2.8b, v6.8b | ||
597 | uqadd v31.8b, v3.8b, v7.8b | ||
527 | .endm | 598 | .endm |
528 | 599 | ||
529 | generate_composite_function \ | 600 | generate_composite_function \ |
530 | pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ | 601 | pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ |
531 | FLAG_DST_READWRITE, \ | 602 | FLAG_DST_READWRITE, \ |
@@ -541,23 +612,37 @@ generate_composite_function \ | |||
541 | 612 | ||
542 | .macro pixman_composite_add_8888_8888_process_pixblock_tail_head | 613 | .macro pixman_composite_add_8888_8888_process_pixblock_tail_head |
543 | fetch_src_pixblock | 614 | fetch_src_pixblock |
544 | PF add PF_X, PF_X, #8 | 615 | PF add PF_X, PF_X, #8 |
545 | PF tst PF_CTL, #0xF | 616 | PF tst PF_CTL, #0xF |
546 | vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! | 617 | ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
547 | PF addne PF_X, PF_X, #8 | 618 | PF beq 10f |
548 | PF subne PF_CTL, PF_CTL, #1 | 619 | PF add PF_X, PF_X, #8 |
549 | vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! | 620 | PF sub PF_CTL, PF_CTL, #1 |
621 | 10: | ||
622 | st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 | ||
550 | PF cmp PF_X, ORIG_W | 623 | PF cmp PF_X, ORIG_W |
551 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] | 624 | PF lsl DUMMY, PF_X, #src_bpp_shift |
552 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] | 625 | PF prfm pldl2strm, [PF_SRC, DUMMY] |
553 | PF subge PF_X, PF_X, ORIG_W | 626 | PF lsl DUMMY, PF_X, #dst_bpp_shift |
554 | PF subges PF_CTL, PF_CTL, #0x10 | 627 | PF prfm pldl2strm, [PF_DST, DUMMY] |
555 | vqadd.u8 q14, q0, q2 | 628 | PF ble 10f |
556 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! | 629 | PF sub PF_X, PF_X, ORIG_W |
557 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! | 630 | PF subs PF_CTL, PF_CTL, #0x10 |
558 | vqadd.u8 q15, q1, q3 | 631 | 10: |
632 | uqadd v28.8b, v0.8b, v4.8b | ||
633 | PF ble 10f | ||
634 | PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift | ||
635 | PF ldrsb DUMMY, [PF_SRC, DUMMY] | ||
636 | PF add PF_SRC, PF_SRC, #1 | ||
637 | PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift | ||
638 | PF ldrsb DUMMY, [PF_DST, DUMMY] | ||
639 | PF add PF_DST, PF_DST, #1 | ||
640 | 10: | ||
641 | uqadd v29.8b, v1.8b, v5.8b | ||
642 | uqadd v30.8b, v2.8b, v6.8b | ||
643 | uqadd v31.8b, v3.8b, v7.8b | ||
559 | .endm | 644 | .endm |
560 | 645 | ||
561 | generate_composite_function \ | 646 | generate_composite_function \ |
562 | pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ | 647 | pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ |
563 | FLAG_DST_READWRITE, \ | 648 | FLAG_DST_READWRITE, \ |
@@ -580,57 +665,73 @@ generate_composite_function_single_scanline \ | |||
580 | pixman_composite_add_8888_8888_process_pixblock_tail_head | 665 | pixman_composite_add_8888_8888_process_pixblock_tail_head |
581 | 666 | ||
582 | /******************************************************************************/ | 667 | /******************************************************************************/ |
583 | 668 | ||
584 | .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head | 669 | .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head |
585 | vmvn.8 d24, d3 /* get inverted alpha */ | 670 | mvn v24.8b, v3.8b /* get inverted alpha */ |
586 | /* do alpha blending */ | 671 | /* do alpha blending */ |
587 | vmull.u8 q8, d24, d4 | 672 | umull v8.8h, v24.8b, v4.8b |
588 | vmull.u8 q9, d24, d5 | 673 | umull v9.8h, v24.8b, v5.8b |
589 | vmull.u8 q10, d24, d6 | 674 | umull v10.8h, v24.8b, v6.8b |
590 | vmull.u8 q11, d24, d7 | 675 | umull v11.8h, v24.8b, v7.8b |
591 | .endm | 676 | .endm |
592 | 677 | ||
593 | .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail | 678 | .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail |
594 | vrshr.u16 q14, q8, #8 | 679 | urshr v14.8h, v8.8h, #8 |
595 | vrshr.u16 q15, q9, #8 | 680 | urshr v15.8h, v9.8h, #8 |
596 | vrshr.u16 q12, q10, #8 | 681 | urshr v16.8h, v10.8h, #8 |
597 | vrshr.u16 q13, q11, #8 | 682 | urshr v17.8h, v11.8h, #8 |
598 | vraddhn.u16 d28, q14, q8 | 683 | raddhn v28.8b, v14.8h, v8.8h |
599 | vraddhn.u16 d29, q15, q9 | 684 | raddhn v29.8b, v15.8h, v9.8h |
600 | vraddhn.u16 d30, q12, q10 | 685 | raddhn v30.8b, v16.8h, v10.8h |
601 | vraddhn.u16 d31, q13, q11 | 686 | raddhn v31.8b, v17.8h, v11.8h |
602 | .endm | 687 | .endm |
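[editor's note] The urshr #8 / raddhn pair above is the exact rounded division by 255 used throughout pixman: for t = x * a it computes (t + 128 + ((t + 128) >> 8)) >> 8. A scalar sketch of the identity:

    #include <stdint.h>

    /* Exact round(t / 255) for t in [0, 255 * 255]. */
    static uint8_t div255(uint16_t t)
    {
        uint16_t r = (uint16_t)((t + 128) >> 8);  /* urshr #8 */
        return (uint8_t)((t + r + 128) >> 8);     /* raddhn   */
    }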
603 | 688 | ||
604 | .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head | 689 | .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head |
605 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 690 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
606 | vrshr.u16 q14, q8, #8 | 691 | urshr v14.8h, v8.8h, #8 |
607 | PF add PF_X, PF_X, #8 | 692 | PF add PF_X, PF_X, #8 |
608 | PF tst PF_CTL, #0xF | 693 | PF tst PF_CTL, #0xF |
609 | vrshr.u16 q15, q9, #8 | 694 | urshr v15.8h, v9.8h, #8 |
610 | vrshr.u16 q12, q10, #8 | 695 | urshr v16.8h, v10.8h, #8 |
611 | vrshr.u16 q13, q11, #8 | 696 | urshr v17.8h, v11.8h, #8 |
612 | PF addne PF_X, PF_X, #8 | 697 | PF beq 10f |
613 | PF subne PF_CTL, PF_CTL, #1 | 698 | PF add PF_X, PF_X, #8 |
614 | vraddhn.u16 d28, q14, q8 | 699 | PF sub PF_CTL, PF_CTL, #1 |
615 | vraddhn.u16 d29, q15, q9 | 700 | 10: |
701 | raddhn v28.8b, v14.8h, v8.8h | ||
702 | raddhn v29.8b, v15.8h, v9.8h | ||
616 | PF cmp PF_X, ORIG_W | 703 | PF cmp PF_X, ORIG_W |
617 | vraddhn.u16 d30, q12, q10 | 704 | raddhn v30.8b, v16.8h, v10.8h |
618 | vraddhn.u16 d31, q13, q11 | 705 | raddhn v31.8b, v17.8h, v11.8h |
619 | fetch_src_pixblock | 706 | fetch_src_pixblock |
620 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] | 707 | PF lsl DUMMY, PF_X, #src_bpp_shift |
621 | vmvn.8 d22, d3 | 708 | PF prfm pldl2strm, [PF_SRC, DUMMY] |
622 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] | 709 | mvn v22.8b, v3.8b |
623 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 710 | PF lsl DUMMY, PF_X, #dst_bpp_shift |
624 | PF subge PF_X, PF_X, ORIG_W | 711 | PF prfm pldl2strm, [PF_DST, DUMMY] |
625 | vmull.u8 q8, d22, d4 | 712 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
626 | PF subges PF_CTL, PF_CTL, #0x10 | 713 | PF ble 10f |
627 | vmull.u8 q9, d22, d5 | 714 | PF sub PF_X, PF_X, ORIG_W |
628 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! | 715 | 10: |
629 | vmull.u8 q10, d22, d6 | 716 | umull v8.8h, v22.8b, v4.8b |
630 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! | 717 | PF ble 10f |
631 | vmull.u8 q11, d22, d7 | 718 | PF subs PF_CTL, PF_CTL, #0x10 |
719 | 10: | ||
720 | umull v9.8h, v22.8b, v5.8b | ||
721 | PF ble 10f | ||
722 | PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift | ||
723 | PF ldrsb DUMMY, [PF_SRC, DUMMY] | ||
724 | PF add PF_SRC, PF_SRC, #1 | ||
725 | 10: | ||
726 | umull v10.8h, v22.8b, v6.8b | ||
727 | PF ble 10f | ||
728 | PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift | ||
729 | PF ldrsb DUMMY, [PF_DST, DUMMY] | ||
730 | PF add PF_DST, PF_DST, #1 | ||
731 | 10: | ||
732 | umull v11.8h, v22.8b, v7.8b | ||
632 | .endm | 733 | .endm |
633 | 734 | ||
634 | generate_composite_function_single_scanline \ | 735 | generate_composite_function_single_scanline \ |
635 | pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ | 736 | pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ |
636 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 737 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -647,44 +748,64 @@ generate_composite_function_single_scanline \ | |||
647 | pixman_composite_out_reverse_8888_8888_process_pixblock_head | 748 | pixman_composite_out_reverse_8888_8888_process_pixblock_head |
648 | .endm | 749 | .endm |
649 | 750 | ||
650 | .macro pixman_composite_over_8888_8888_process_pixblock_tail | 751 | .macro pixman_composite_over_8888_8888_process_pixblock_tail |
651 | pixman_composite_out_reverse_8888_8888_process_pixblock_tail | 752 | pixman_composite_out_reverse_8888_8888_process_pixblock_tail |
652 | vqadd.u8 q14, q0, q14 | 753 | uqadd v28.8b, v0.8b, v28.8b |
653 | vqadd.u8 q15, q1, q15 | 754 | uqadd v29.8b, v1.8b, v29.8b |
755 | uqadd v30.8b, v2.8b, v30.8b | ||
756 | uqadd v31.8b, v3.8b, v31.8b | ||
654 | .endm | 757 | .endm |
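[editor's note] Composed with out_reverse, the tail above completes the premultiplied OVER operator: dst = src + dst * (255 - alpha_src) / 255, with uqadd providing the saturation. One channel in scalar form (function name is illustrative):

    #include <stdint.h>

    static uint8_t over_channel(uint8_t s, uint8_t sa, uint8_t d)
    {
        uint16_t t = (uint16_t)((255 - sa) * d);           /* mvn + umull */
        uint16_t r = (uint16_t)((t + 128) >> 8);           /* urshr #8    */
        uint16_t o = (uint16_t)(s + ((t + r + 128) >> 8)); /* raddhn      */
        return (uint8_t)(o > 255 ? 255 : o);               /* uqadd clamp */
    }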
655 | 758 | ||
656 | .macro pixman_composite_over_8888_8888_process_pixblock_tail_head | 759 | .macro pixman_composite_over_8888_8888_process_pixblock_tail_head |
657 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 760 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
658 | vrshr.u16 q14, q8, #8 | 761 | urshr v14.8h, v8.8h, #8 |
659 | PF add PF_X, PF_X, #8 | 762 | PF add PF_X, PF_X, #8 |
660 | PF tst PF_CTL, #0xF | 763 | PF tst PF_CTL, #0xF |
661 | vrshr.u16 q15, q9, #8 | 764 | urshr v15.8h, v9.8h, #8 |
662 | vrshr.u16 q12, q10, #8 | 765 | urshr v16.8h, v10.8h, #8 |
663 | vrshr.u16 q13, q11, #8 | 766 | urshr v17.8h, v11.8h, #8 |
664 | PF addne PF_X, PF_X, #8 | 767 | PF beq 10f |
665 | PF subne PF_CTL, PF_CTL, #1 | 768 | PF add PF_X, PF_X, #8 |
666 | vraddhn.u16 d28, q14, q8 | 769 | PF sub PF_CTL, PF_CTL, #1 |
667 | vraddhn.u16 d29, q15, q9 | 770 | 10: |
771 | raddhn v28.8b, v14.8h, v8.8h | ||
772 | raddhn v29.8b, v15.8h, v9.8h | ||
668 | PF cmp PF_X, ORIG_W | 773 | PF cmp PF_X, ORIG_W |
669 | vraddhn.u16 d30, q12, q10 | 774 | raddhn v30.8b, v16.8h, v10.8h |
670 | vraddhn.u16 d31, q13, q11 | 775 | raddhn v31.8b, v17.8h, v11.8h |
671 | vqadd.u8 q14, q0, q14 | 776 | uqadd v28.8b, v0.8b, v28.8b |
672 | vqadd.u8 q15, q1, q15 | 777 | uqadd v29.8b, v1.8b, v29.8b |
778 | uqadd v30.8b, v2.8b, v30.8b | ||
779 | uqadd v31.8b, v3.8b, v31.8b | ||
673 | fetch_src_pixblock | 780 | fetch_src_pixblock |
674 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] | 781 | PF lsl DUMMY, PF_X, #src_bpp_shift |
675 | vmvn.8 d22, d3 | 782 | PF prfm pldl2strm, [PF_SRC, DUMMY] |
676 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] | 783 | mvn v22.8b, v3.8b |
677 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 784 | PF lsl DUMMY, PF_X, #dst_bpp_shift |
678 | PF subge PF_X, PF_X, ORIG_W | 785 | PF prfm pldl2strm, [PF_DST, DUMMY] |
679 | vmull.u8 q8, d22, d4 | 786 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
680 | PF subges PF_CTL, PF_CTL, #0x10 | 787 | PF ble 10f |
681 | vmull.u8 q9, d22, d5 | 788 | PF sub PF_X, PF_X, ORIG_W |
682 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! | 789 | 10: |
683 | vmull.u8 q10, d22, d6 | 790 | umull v8.8h, v22.8b, v4.8b |
684 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! | 791 | PF ble 10f |
685 | vmull.u8 q11, d22, d7 | 792 | PF subs PF_CTL, PF_CTL, #0x10 |
793 | 10: | ||
794 | umull v9.8h, v22.8b, v5.8b | ||
795 | PF ble 10f | ||
796 | PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift | ||
797 | PF ldrsb DUMMY, [PF_SRC, DUMMY] | ||
798 | PF add PF_SRC, PF_SRC, #1 | ||
799 | 10: | ||
800 | umull v10.8h, v22.8b, v6.8b | ||
801 | PF ble 10f | ||
802 | PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift | ||
803 | PF ldrsb DUMMY, [PF_DST, DUMMY] | ||
804 | PF add PF_DST, PF_DST, #1 | ||
805 | 10: | ||
806 | umull v11.8h, v22.8b, v7.8b | ||
686 | .endm | 807 | .endm |
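
Most of the churn in the PF (prefetch) lines is mechanical: AArch64 drops conditional execution, so each addne/subne/subge(s) group becomes a short forward branch over unconditional instructions to the local label 10:, and the conditional pre-indexed ldrgeb splits into an explicit lsl into DUMMY, a plain ldrsb, and a separate base update, all guarded by the same branch. prfm also cannot apply an arbitrary lsl to a register offset the way pld could, hence the standalone shift before each prefetch. The control-flow shape, as a C sketch with hypothetical names:

    /* AArch32: PF addne PF_X, PF_X, #8 ; PF subne PF_CTL, PF_CTL, #1
     * AArch64: PF beq 10f ; PF add ... ; PF sub ... ; 10:            */
    static void pf_advance(int *pf_x, int *pf_ctl, int cond_ne)
    {
        if (cond_ne) {       /* beq 10f skips the block when Z is set */
            *pf_x   += 8;
            *pf_ctl -= 1;
        }
    }
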
687 | 808 | ||
688 | generate_composite_function \ | 809 | generate_composite_function \ |
689 | pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ | 810 | pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ |
690 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 811 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -707,68 +828,80 @@ generate_composite_function_single_scanline \ | |||
707 | pixman_composite_over_8888_8888_process_pixblock_tail_head | 828 | pixman_composite_over_8888_8888_process_pixblock_tail_head |
708 | 829 | ||
709 | /******************************************************************************/ | 830 | /******************************************************************************/ |
710 | 831 | ||
711 | .macro pixman_composite_over_n_8888_process_pixblock_head | 832 | .macro pixman_composite_over_n_8888_process_pixblock_head |
712 | /* deinterleaved source pixels in {d0, d1, d2, d3} */ | 833 | /* deinterleaved source pixels in {v0, v1, v2, v3} */ |
713 | /* inverted alpha in {d24} */ | 834 | /* inverted alpha in {v24} */ |
714 | /* destination pixels in {d4, d5, d6, d7} */ | 835 | /* destination pixels in {v4, v5, v6, v7} */ |
715 | vmull.u8 q8, d24, d4 | 836 | umull v8.8h, v24.8b, v4.8b |
716 | vmull.u8 q9, d24, d5 | 837 | umull v9.8h, v24.8b, v5.8b |
717 | vmull.u8 q10, d24, d6 | 838 | umull v10.8h, v24.8b, v6.8b |
718 | vmull.u8 q11, d24, d7 | 839 | umull v11.8h, v24.8b, v7.8b |
719 | .endm | 840 | .endm |
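
The head/tail split mirrors machine latencies rather than the algorithm: the head issues only the four long-latency widening multiplies, and the tail performs the cheap rounding, narrowing, and saturating add, so tail_head can overlap the two stages across loop iterations. A scalar sketch of the same staging, reusing div255 from the sketch above:

    /* Hypothetical two-stage model of over_n_8888 for one pixel. */
    static void over_n_head(uint16_t p[4], uint8_t inv_a, const uint8_t d[4])
    {
        for (int c = 0; c < 4; c++)
            p[c] = (uint16_t)inv_a * d[c];        /* umull v8.8h..v11.8h  */
    }

    static void over_n_tail(uint8_t out[4], const uint8_t s[4],
                            const uint16_t p[4])
    {
        for (int c = 0; c < 4; c++) {
            uint16_t r = s[c] + div255(p[c]);     /* urshr/raddhn + uqadd */
            out[c] = r > 255 ? 255 : (uint8_t)r;
        }
    }
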
720 | 841 | ||
721 | .macro pixman_composite_over_n_8888_process_pixblock_tail | 842 | .macro pixman_composite_over_n_8888_process_pixblock_tail |
722 | vrshr.u16 q14, q8, #8 | 843 | urshr v14.8h, v8.8h, #8 |
723 | vrshr.u16 q15, q9, #8 | 844 | urshr v15.8h, v9.8h, #8 |
724 | vrshr.u16 q2, q10, #8 | 845 | urshr v16.8h, v10.8h, #8 |
725 | vrshr.u16 q3, q11, #8 | 846 | urshr v17.8h, v11.8h, #8 |
726 | vraddhn.u16 d28, q14, q8 | 847 | raddhn v28.8b, v14.8h, v8.8h |
727 | vraddhn.u16 d29, q15, q9 | 848 | raddhn v29.8b, v15.8h, v9.8h |
728 | vraddhn.u16 d30, q2, q10 | 849 | raddhn v30.8b, v16.8h, v10.8h |
729 | vraddhn.u16 d31, q3, q11 | 850 | raddhn v31.8b, v17.8h, v11.8h |
730 | vqadd.u8 q14, q0, q14 | 851 | uqadd v28.8b, v0.8b, v28.8b |
731 | vqadd.u8 q15, q1, q15 | 852 | uqadd v29.8b, v1.8b, v29.8b |
853 | uqadd v30.8b, v2.8b, v30.8b | ||
854 | uqadd v31.8b, v3.8b, v31.8b | ||
732 | .endm | 855 | .endm |
733 | 856 | ||
734 | .macro pixman_composite_over_n_8888_process_pixblock_tail_head | 857 | .macro pixman_composite_over_n_8888_process_pixblock_tail_head |
735 | vrshr.u16 q14, q8, #8 | 858 | urshr v14.8h, v8.8h, #8 |
736 | vrshr.u16 q15, q9, #8 | 859 | urshr v15.8h, v9.8h, #8 |
737 | vrshr.u16 q2, q10, #8 | 860 | urshr v16.8h, v10.8h, #8 |
738 | vrshr.u16 q3, q11, #8 | 861 | urshr v17.8h, v11.8h, #8 |
739 | vraddhn.u16 d28, q14, q8 | 862 | raddhn v28.8b, v14.8h, v8.8h |
740 | vraddhn.u16 d29, q15, q9 | 863 | raddhn v29.8b, v15.8h, v9.8h |
741 | vraddhn.u16 d30, q2, q10 | 864 | raddhn v30.8b, v16.8h, v10.8h |
742 | vraddhn.u16 d31, q3, q11 | 865 | raddhn v31.8b, v17.8h, v11.8h |
743 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 866 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
744 | vqadd.u8 q14, q0, q14 | 867 | uqadd v28.8b, v0.8b, v28.8b |
745 | PF add PF_X, PF_X, #8 | 868 | PF add PF_X, PF_X, #8 |
746 | PF tst PF_CTL, #0x0F | 869 | PF tst PF_CTL, #0x0F |
747 | PF addne PF_X, PF_X, #8 | 870 | PF beq 10f |
748 | PF subne PF_CTL, PF_CTL, #1 | 871 | PF add PF_X, PF_X, #8 |
749 | vqadd.u8 q15, q1, q15 | 872 | PF sub PF_CTL, PF_CTL, #1 |
873 | 10: | ||
874 | uqadd v29.8b, v1.8b, v29.8b | ||
875 | uqadd v30.8b, v2.8b, v30.8b | ||
876 | uqadd v31.8b, v3.8b, v31.8b | ||
750 | PF cmp PF_X, ORIG_W | 877 | PF cmp PF_X, ORIG_W |
751 | vmull.u8 q8, d24, d4 | 878 | umull v8.8h, v24.8b, v4.8b |
752 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] | 879 | PF lsl DUMMY, PF_X, #dst_bpp_shift |
753 | vmull.u8 q9, d24, d5 | 880 | PF prfm pldl2strm, [PF_DST, DUMMY] |
754 | PF subge PF_X, PF_X, ORIG_W | 881 | umull v9.8h, v24.8b, v5.8b |
755 | vmull.u8 q10, d24, d6 | 882 | PF ble 10f |
756 | PF subges PF_CTL, PF_CTL, #0x10 | 883 | PF sub PF_X, PF_X, ORIG_W |
757 | vmull.u8 q11, d24, d7 | 884 | 10: |
758 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! | 885 | umull v10.8h, v24.8b, v6.8b |
759 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 886 | PF subs PF_CTL, PF_CTL, #0x10 |
887 | umull v11.8h, v24.8b, v7.8b | ||
888 | PF ble 10f | ||
889 | PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift | ||
890 | PF ldrsb DUMMY, [PF_DST, DUMMY] | ||
891 | PF add PF_DST, PF_DST, #1 | ||
892 | 10: | ||
893 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 | ||
760 | .endm | 894 | .endm |
761 | 895 | ||
762 | .macro pixman_composite_over_n_8888_init | 896 | .macro pixman_composite_over_n_8888_init |
763 | add DUMMY, sp, #ARGS_STACK_OFFSET | 897 | mov v3.s[0], w4 |
764 | vld1.32 {d3[0]}, [DUMMY] | 898 | dup v0.8b, v3.b[0] |
765 | vdup.8 d0, d3[0] | 899 | dup v1.8b, v3.b[1] |
766 | vdup.8 d1, d3[1] | 900 | dup v2.8b, v3.b[2] |
767 | vdup.8 d2, d3[2] | 901 | dup v3.8b, v3.b[3] |
768 | vdup.8 d3, d3[3] | 902 | mvn v24.8b, v3.8b /* get inverted alpha */ |
769 | vmvn.8 d24, d3 /* get inverted alpha */ | ||
770 | .endm | 903 | .endm |
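
The init macros change for an unrelated reason: in this port the solid color arrives in a general-purpose register (w4 here, per the AAPCS64 argument registers) instead of on the stack, so the add/vld1 from ARGS_STACK_OFFSET collapses into a mov into a vector lane plus per-byte dup broadcasts. A scalar equivalent of the splat (a sketch, names hypothetical):

    #include <stdint.h>

    /* Broadcast one a8r8g8b8 color into four 8-lane channel vectors. */
    static void splat_color(uint32_t color,        /* mov v3.s[0], w4    */
                            uint8_t b[8], uint8_t g[8],
                            uint8_t r[8], uint8_t a[8])
    {
        for (int i = 0; i < 8; i++) {
            b[i] = (uint8_t)(color >>  0);         /* dup v0.8b, v3.b[0] */
            g[i] = (uint8_t)(color >>  8);         /* dup v1.8b, v3.b[1] */
            r[i] = (uint8_t)(color >> 16);         /* dup v2.8b, v3.b[2] */
            a[i] = (uint8_t)(color >> 24);         /* dup v3.8b, v3.b[3] */
        }
    }
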
771 | 904 | ||
772 | generate_composite_function \ | 905 | generate_composite_function \ |
773 | pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ | 906 | pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ |
774 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 907 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -781,45 +914,57 @@ generate_composite_function \ | |||
781 | pixman_composite_over_n_8888_process_pixblock_tail_head | 914 | pixman_composite_over_n_8888_process_pixblock_tail_head |
782 | 915 | ||
783 | /******************************************************************************/ | 916 | /******************************************************************************/ |
784 | 917 | ||
785 | .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head | 918 | .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head |
786 | vrshr.u16 q14, q8, #8 | 919 | urshr v14.8h, v8.8h, #8 |
787 | PF add PF_X, PF_X, #8 | 920 | PF add PF_X, PF_X, #8 |
788 | PF tst PF_CTL, #0xF | 921 | PF tst PF_CTL, #0xF |
789 | vrshr.u16 q15, q9, #8 | 922 | urshr v15.8h, v9.8h, #8 |
790 | vrshr.u16 q12, q10, #8 | 923 | urshr v12.8h, v10.8h, #8 |
791 | vrshr.u16 q13, q11, #8 | 924 | urshr v13.8h, v11.8h, #8 |
792 | PF addne PF_X, PF_X, #8 | 925 | PF beq 10f |
793 | PF subne PF_CTL, PF_CTL, #1 | 926 | PF add PF_X, PF_X, #8 |
794 | vraddhn.u16 d28, q14, q8 | 927 | PF sub PF_CTL, PF_CTL, #1 |
795 | vraddhn.u16 d29, q15, q9 | 928 | 10: |
929 | raddhn v28.8b, v14.8h, v8.8h | ||
930 | raddhn v29.8b, v15.8h, v9.8h | ||
796 | PF cmp PF_X, ORIG_W | 931 | PF cmp PF_X, ORIG_W |
797 | vraddhn.u16 d30, q12, q10 | 932 | raddhn v30.8b, v12.8h, v10.8h |
798 | vraddhn.u16 d31, q13, q11 | 933 | raddhn v31.8b, v13.8h, v11.8h |
799 | vqadd.u8 q14, q0, q14 | 934 | uqadd v28.8b, v0.8b, v28.8b |
800 | vqadd.u8 q15, q1, q15 | 935 | uqadd v29.8b, v1.8b, v29.8b |
801 | vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! | 936 | uqadd v30.8b, v2.8b, v30.8b |
802 | vmvn.8 d22, d3 | 937 | uqadd v31.8b, v3.8b, v31.8b |
803 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] | 938 | ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32 |
804 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 939 | mvn v22.8b, v3.8b |
805 | PF subge PF_X, PF_X, ORIG_W | 940 | PF lsl DUMMY, PF_X, #dst_bpp_shift |
806 | vmull.u8 q8, d22, d4 | 941 | PF prfm pldl2strm, [PF_DST, DUMMY] |
807 | PF subges PF_CTL, PF_CTL, #0x10 | 942 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
808 | vmull.u8 q9, d22, d5 | 943 | PF blt 10f |
809 | vmull.u8 q10, d22, d6 | 944 | PF sub PF_X, PF_X, ORIG_W |
810 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! | 945 | 10: |
811 | vmull.u8 q11, d22, d7 | 946 | umull v8.8h, v22.8b, v4.8b |
947 | PF blt 10f | ||
948 | PF subs PF_CTL, PF_CTL, #0x10 | ||
949 | 10: | ||
950 | umull v9.8h, v22.8b, v5.8b | ||
951 | umull v10.8h, v22.8b, v6.8b | ||
952 | PF blt 10f | ||
953 | PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift | ||
954 | PF ldrsb DUMMY, [PF_DST, DUMMY] | ||
955 | PF add PF_DST, PF_DST, #1 | ||
956 | 10: | ||
957 | umull v11.8h, v22.8b, v7.8b | ||
812 | .endm | 958 | .endm |
813 | 959 | ||
814 | .macro pixman_composite_over_reverse_n_8888_init | 960 | .macro pixman_composite_over_reverse_n_8888_init |
815 | add DUMMY, sp, #ARGS_STACK_OFFSET | 961 | mov v7.s[0], w4 |
816 | vld1.32 {d7[0]}, [DUMMY] | 962 | dup v4.8b, v7.b[0] |
817 | vdup.8 d4, d7[0] | 963 | dup v5.8b, v7.b[1] |
818 | vdup.8 d5, d7[1] | 964 | dup v6.8b, v7.b[2] |
819 | vdup.8 d6, d7[2] | 965 | dup v7.8b, v7.b[3] |
820 | vdup.8 d7, d7[3] | ||
821 | .endm | 966 | .endm |
822 | 967 | ||
823 | generate_composite_function \ | 968 | generate_composite_function \ |
824 | pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ | 969 | pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ |
825 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 970 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -836,96 +981,118 @@ generate_composite_function \ | |||
836 | 24 /* mask_basereg */ | 981 | 24 /* mask_basereg */ |
837 | 982 | ||
838 | /******************************************************************************/ | 983 | /******************************************************************************/ |
839 | 984 | ||
840 | .macro pixman_composite_over_8888_8_0565_process_pixblock_head | 985 | .macro pixman_composite_over_8888_8_0565_process_pixblock_head |
841 | vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ | 986 | umull v0.8h, v24.8b, v8.8b /* IN for SRC pixels (part1) */ |
842 | vmull.u8 q1, d24, d9 | 987 | umull v1.8h, v24.8b, v9.8b |
843 | vmull.u8 q6, d24, d10 | 988 | umull v2.8h, v24.8b, v10.8b |
844 | vmull.u8 q7, d24, d11 | 989 | umull v3.8h, v24.8b, v11.8b |
845 | vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ | 990 | mov v4.d[1], v5.d[0] |
846 | vshrn.u16 d7, q2, #3 | 991 | shrn v25.8b, v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */ |
847 | vsli.u16 q2, q2, #5 | 992 | shrn v26.8b, v4.8h, #3 |
848 | vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ | 993 | sli v4.8h, v4.8h, #5 |
849 | vrshr.u16 q9, q1, #8 | 994 | urshr v17.8h, v0.8h, #8 /* IN for SRC pixels (part2) */ |
850 | vrshr.u16 q10, q6, #8 | 995 | urshr v18.8h, v1.8h, #8 |
851 | vrshr.u16 q11, q7, #8 | 996 | urshr v19.8h, v2.8h, #8 |
852 | vraddhn.u16 d0, q0, q8 | 997 | urshr v20.8h, v3.8h, #8 |
853 | vraddhn.u16 d1, q1, q9 | 998 | raddhn v0.8b, v0.8h, v17.8h |
854 | vraddhn.u16 d2, q6, q10 | 999 | raddhn v1.8b, v1.8h, v18.8h |
855 | vraddhn.u16 d3, q7, q11 | 1000 | raddhn v2.8b, v2.8h, v19.8h |
856 | vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ | 1001 | raddhn v3.8b, v3.8h, v20.8h |
857 | vsri.u8 d7, d7, #6 | 1002 | sri v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */ |
858 | vmvn.8 d3, d3 | 1003 | sri v26.8b, v26.8b, #6 |
859 | vshrn.u16 d30, q2, #2 | 1004 | mvn v3.8b, v3.8b |
860 | vmull.u8 q8, d3, d6 /* now do alpha blending */ | 1005 | shrn v30.8b, v4.8h, #2 |
861 | vmull.u8 q9, d3, d7 | 1006 | umull v18.8h, v3.8b, v25.8b /* now do alpha blending */ |
862 | vmull.u8 q10, d3, d30 | 1007 | umull v19.8h, v3.8b, v26.8b |
1008 | umull v20.8h, v3.8b, v30.8b | ||
863 | .endm | 1009 | .endm |
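
Two A64 details in this head: 32-bit q2 spanned {d4, d5}, but on A64 the two destination halves arrive in separate registers, so mov v4.d[1], v5.d[0] first rebuilds one 128-bit vector before the r5g6b5 unpack. The unpack itself replicates each field's top bits into its low bits so that 0x1f scales to 0xff. A scalar model of one pixel (a sketch):

    #include <stdint.h>

    /* r5g6b5 -> 8-bit B, G, R, as the shrn/sli/sri sequence does it. */
    static void unpack_0565(uint16_t p, uint8_t *b, uint8_t *g, uint8_t *r)
    {
        *r = (p >> 8) & 0xf8;                           /* shrn #8 */
        *r |= *r >> 5;                                  /* sri  #5 */
        *g = (p >> 3) & 0xfc;                           /* shrn #3 */
        *g |= *g >> 6;                                  /* sri  #6 */
        uint16_t t = (uint16_t)((p << 5) | (p & 0x1f)); /* sli  #5 */
        *b = (uint8_t)(t >> 2);                         /* shrn #2 */
    }
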
864 | 1010 | ||
865 | .macro pixman_composite_over_8888_8_0565_process_pixblock_tail | 1011 | .macro pixman_composite_over_8888_8_0565_process_pixblock_tail |
866 | /* 3 cycle bubble (after vmull.u8) */ | 1012 | /* 3 cycle bubble (after vmull.u8) */ |
867 | vrshr.u16 q13, q8, #8 | 1013 | urshr v5.8h, v18.8h, #8 |
868 | vrshr.u16 q11, q9, #8 | 1014 | urshr v6.8h, v19.8h, #8 |
869 | vrshr.u16 q15, q10, #8 | 1015 | urshr v7.8h, v20.8h, #8 |
870 | vraddhn.u16 d16, q8, q13 | 1016 | raddhn v17.8b, v18.8h, v5.8h |
871 | vraddhn.u16 d27, q9, q11 | 1017 | raddhn v19.8b, v19.8h, v6.8h |
872 | vraddhn.u16 d26, q10, q15 | 1018 | raddhn v18.8b, v20.8h, v7.8h |
873 | vqadd.u8 d16, d2, d16 | 1019 | uqadd v5.8b, v2.8b, v17.8b |
874 | /* 1 cycle bubble */ | 1020 | /* 1 cycle bubble */ |
875 | vqadd.u8 q9, q0, q13 | 1021 | uqadd v6.8b, v0.8b, v18.8b |
876 | vshll.u8 q14, d16, #8 /* convert to 16bpp */ | 1022 | uqadd v7.8b, v1.8b, v19.8b |
877 | vshll.u8 q8, d19, #8 | 1023 | ushll v14.8h, v5.8b, #7 /* convert to 16bpp */ |
878 | vshll.u8 q9, d18, #8 | 1024 | sli v14.8h, v14.8h, #1 |
879 | vsri.u16 q14, q8, #5 | 1025 | ushll v18.8h, v7.8b, #7 |
1026 | sli v18.8h, v18.8h, #1 | ||
1027 | ushll v19.8h, v6.8b, #7 | ||
1028 | sli v19.8h, v19.8h, #1 | ||
1029 | sri v14.8h, v18.8h, #5 | ||
880 | /* 1 cycle bubble */ | 1030 | /* 1 cycle bubble */ |
881 | vsri.u16 q14, q9, #11 | 1031 | sri v14.8h, v19.8h, #11 |
1032 | mov v28.d[0], v14.d[0] | ||
1033 | mov v29.d[0], v14.d[1] | ||
882 | .endm | 1034 | .endm |
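
Two more A64 quirks appear in the repack. ushll rejects a shift equal to the element width, so each vshll.u8 ..., #8 becomes ushll #7 followed by sli #1 (x << 8 == (x << 7) << 1, and bit 0 is already clear, so sli just completes the doubling). And because the surrounding store macros still expect the packed halves in two 64-bit registers, the tail ends by splitting v14 into v28/v29 with the two mov lines. A scalar model of the packing (a sketch; channel assignment follows the macro's register comments):

    #include <stdint.h>

    /* 8-bit channels -> r5g6b5, as ushll(+sli) and the two sri do it. */
    static uint16_t pack_0565(uint8_t r, uint8_t g, uint8_t b)
    {
        uint16_t p = (uint16_t)r << 8;                  /* ushll #7 + sli #1 */
        p = (p & 0xf800) | (((uint16_t)g << 8) >> 5);   /* sri #5            */
        p = (p & 0xffe0) | (((uint16_t)b << 8) >> 11);  /* sri #11           */
        return p;
    }
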
883 | 1035 | ||
884 | .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head | 1036 | .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head |
885 | vld1.16 {d4, d5}, [DST_R, :128]! | 1037 | #if 0 |
886 | vshrn.u16 d6, q2, #8 | 1038 | ld1 {v4.8h}, [DST_R], #16 |
1039 | shrn v25.8b, v4.8h, #8 | ||
887 | fetch_mask_pixblock | 1040 | fetch_mask_pixblock |
888 | vshrn.u16 d7, q2, #3 | 1041 | shrn v26.8b, v4.8h, #3 |
889 | fetch_src_pixblock | 1042 | fetch_src_pixblock |
890 | vmull.u8 q6, d24, d10 | 1043 | umull v22.8h, v24.8b, v10.8b |
891 | vrshr.u16 q13, q8, #8 | 1044 | urshr v13.8h, v18.8h, #8 |
892 | vrshr.u16 q11, q9, #8 | 1045 | urshr v11.8h, v19.8h, #8 |
893 | vrshr.u16 q15, q10, #8 | 1046 | urshr v15.8h, v20.8h, #8 |
894 | vraddhn.u16 d16, q8, q13 | 1047 | raddhn v17.8b, v18.8h, v13.8h |
895 | vraddhn.u16 d27, q9, q11 | 1048 | raddhn v19.8b, v19.8h, v11.8h |
896 | vraddhn.u16 d26, q10, q15 | 1049 | raddhn v18.8b, v20.8h, v15.8h |
897 | vqadd.u8 d16, d2, d16 | 1050 | uqadd v17.8b, v2.8b, v17.8b |
898 | vmull.u8 q1, d24, d9 | 1051 | umull v21.8h, v24.8b, v9.8b |
899 | vqadd.u8 q9, q0, q13 | 1052 | uqadd v18.8b, v0.8b, v18.8b |
900 | vshll.u8 q14, d16, #8 | 1053 | uqadd v19.8b, v1.8b, v19.8b |
901 | vmull.u8 q0, d24, d8 | 1054 | ushll v14.8h, v17.8b, #7 |
902 | vshll.u8 q8, d19, #8 | 1055 | sli v14.8h, v14.8h, #1 |
903 | vshll.u8 q9, d18, #8 | 1056 | umull v20.8h, v24.8b, v8.8b |
904 | vsri.u16 q14, q8, #5 | 1057 | ushll v18.8h, v18.8b, #7 |
905 | vmull.u8 q7, d24, d11 | 1058 | sli v18.8h, v18.8h, #1 |
906 | vsri.u16 q14, q9, #11 | 1059 | ushll v19.8h, v19.8b, #7 |
1060 | sli v19.8h, v19.8h, #1 | ||
1061 | sri v14.8h, v18.8h, #5 | ||
1062 | umull v23.8h, v24.8b, v11.8b | ||
1063 | sri v14.8h, v19.8h, #11 | ||
1064 | mov v28.d[0], v14.d[0] | ||
1065 | mov v29.d[0], v14.d[1] | ||
907 | 1066 | ||
908 | cache_preload 8, 8 | 1067 | cache_preload 8, 8 |
909 | 1068 | ||
910 | vsli.u16 q2, q2, #5 | 1069 | sli v4.8h, v4.8h, #5 |
911 | vrshr.u16 q8, q0, #8 | 1070 | urshr v16.8h, v20.8h, #8 |
912 | vrshr.u16 q9, q1, #8 | 1071 | urshr v17.8h, v21.8h, #8 |
913 | vrshr.u16 q10, q6, #8 | 1072 | urshr v18.8h, v22.8h, #8 |
914 | vrshr.u16 q11, q7, #8 | 1073 | urshr v19.8h, v23.8h, #8 |
915 | vraddhn.u16 d0, q0, q8 | 1074 | raddhn v0.8b, v20.8h, v16.8h |
916 | vraddhn.u16 d1, q1, q9 | 1075 | raddhn v1.8b, v21.8h, v17.8h |
917 | vraddhn.u16 d2, q6, q10 | 1076 | raddhn v2.8b, v22.8h, v18.8h |
918 | vraddhn.u16 d3, q7, q11 | 1077 | raddhn v3.8b, v23.8h, v19.8h |
919 | vsri.u8 d6, d6, #5 | 1078 | sri v25.8b, v25.8b, #5 |
920 | vsri.u8 d7, d7, #6 | 1079 | sri v26.8b, v26.8b, #6 |
921 | vmvn.8 d3, d3 | 1080 | mvn v3.8b, v3.8b |
922 | vshrn.u16 d30, q2, #2 | 1081 | shrn v30.8b, v4.8h, #2 |
923 | vst1.16 {d28, d29}, [DST_W, :128]! | 1082 | st1 {v14.8h}, [DST_W], #16 |
924 | vmull.u8 q8, d3, d6 | 1083 | umull v18.8h, v3.8b, v25.8b |
925 | vmull.u8 q9, d3, d7 | 1084 | umull v19.8h, v3.8b, v26.8b |
926 | vmull.u8 q10, d3, d30 | 1085 | umull v20.8h, v3.8b, v30.8b |
1086 | #else | ||
1087 | pixman_composite_over_8888_8_0565_process_pixblock_tail | ||
1088 | st1 {v28.4h, v29.4h}, [DST_W], #16 | ||
1089 | ld1 {v4.4h, v5.4h}, [DST_R], #16 | ||
1090 | fetch_mask_pixblock | ||
1091 | fetch_src_pixblock | ||
1092 | pixman_composite_over_8888_8_0565_process_pixblock_head | ||
1093 | #endif | ||
927 | .endm | 1094 | .endm |
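
The hand-interleaved tail_head for this path is parked under #if 0 for now; the #else branch falls back to the generic software-pipelining shape that every tail_head macro implements: finish pixel block N, store it, load block N+1's destination, fetch its mask and source, and run the head. In outline, with hypothetical stubs standing in for the assembler macros:

    static void tail(void)      { /* urshr/raddhn/uqadd: finish block N       */ }
    static void store_dst(void) { /* st1 {v28.4h, v29.4h}, [DST_W], #16       */ }
    static void load_dst(void)  { /* ld1 {v4.4h, v5.4h},  [DST_R], #16        */ }
    static void fetch(void)     { /* fetch_mask_pixblock + fetch_src_pixblock */ }
    static void head(void)      { /* umull ...: start block N+1               */ }

    /* One iteration of the pipelined inner loop, as the #else orders it. */
    static void tail_head_step(void)
    {
        tail();
        store_dst();
        load_dst();
        fetch();
        head();
    }
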
928 | 1095 | ||
929 | generate_composite_function \ | 1096 | generate_composite_function \ |
930 | pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ | 1097 | pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ |
931 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 1098 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -952,21 +1119,18 @@ generate_composite_function \ | |||
952 | * to ABI. These registers are restored from 'cleanup' macro. All the | 1119 | * to ABI. These registers are restored from 'cleanup' macro. All the |
953 | * other NEON registers are caller saved, so can be clobbered freely | 1120 | * other NEON registers are caller saved, so can be clobbered freely |
954 | * without introducing any problems. | 1121 | * without introducing any problems. |
955 | */ | 1122 | */ |
956 | .macro pixman_composite_over_n_8_0565_init | 1123 | .macro pixman_composite_over_n_8_0565_init |
957 | add DUMMY, sp, #ARGS_STACK_OFFSET | 1124 | mov v11.s[0], w4 |
958 | vpush {d8-d15} | 1125 | dup v8.8b, v11.b[0] |
959 | vld1.32 {d11[0]}, [DUMMY] | 1126 | dup v9.8b, v11.b[1] |
960 | vdup.8 d8, d11[0] | 1127 | dup v10.8b, v11.b[2] |
961 | vdup.8 d9, d11[1] | 1128 | dup v11.8b, v11.b[3] |
962 | vdup.8 d10, d11[2] | ||
963 | vdup.8 d11, d11[3] | ||
964 | .endm | 1129 | .endm |
965 | 1130 | ||
966 | .macro pixman_composite_over_n_8_0565_cleanup | 1131 | .macro pixman_composite_over_n_8_0565_cleanup |
967 | vpop {d8-d15} | ||
968 | .endm | 1132 | .endm |
969 | 1133 | ||
970 | generate_composite_function \ | 1134 | generate_composite_function \ |
971 | pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ | 1135 | pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ |
972 | FLAG_DST_READWRITE, \ | 1136 | FLAG_DST_READWRITE, \ |
@@ -974,23 +1138,24 @@ generate_composite_function \ | |||
974 | 5, /* prefetch distance */ \ | 1138 | 5, /* prefetch distance */ \ |
975 | pixman_composite_over_n_8_0565_init, \ | 1139 | pixman_composite_over_n_8_0565_init, \ |
976 | pixman_composite_over_n_8_0565_cleanup, \ | 1140 | pixman_composite_over_n_8_0565_cleanup, \ |
977 | pixman_composite_over_8888_8_0565_process_pixblock_head, \ | 1141 | pixman_composite_over_8888_8_0565_process_pixblock_head, \ |
978 | pixman_composite_over_8888_8_0565_process_pixblock_tail, \ | 1142 | pixman_composite_over_8888_8_0565_process_pixblock_tail, \ |
979 | pixman_composite_over_8888_8_0565_process_pixblock_tail_head | 1143 | pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ |
1144 | 28, /* dst_w_basereg */ \ | ||
1145 | 4, /* dst_r_basereg */ \ | ||
1146 | 8, /* src_basereg */ \ | ||
1147 | 24 /* mask_basereg */ | ||
980 | 1148 | ||
981 | /******************************************************************************/ | 1149 | /******************************************************************************/ |
982 | 1150 | ||
983 | .macro pixman_composite_over_8888_n_0565_init | 1151 | .macro pixman_composite_over_8888_n_0565_init |
984 | add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) | 1152 | mov v24.s[0], w6 |
985 | vpush {d8-d15} | 1153 | dup v24.8b, v24.b[3] |
986 | vld1.32 {d24[0]}, [DUMMY] | ||
987 | vdup.8 d24, d24[3] | ||
988 | .endm | 1154 | .endm |
989 | 1155 | ||
990 | .macro pixman_composite_over_8888_n_0565_cleanup | 1156 | .macro pixman_composite_over_8888_n_0565_cleanup |
991 | vpop {d8-d15} | ||
992 | .endm | 1157 | .endm |
993 | 1158 | ||
994 | generate_composite_function \ | 1159 | generate_composite_function \ |
995 | pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ | 1160 | pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ |
996 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 1161 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -1013,11 +1178,11 @@ generate_composite_function \ | |||
1013 | 1178 | ||
1014 | .macro pixman_composite_src_0565_0565_process_pixblock_tail | 1179 | .macro pixman_composite_src_0565_0565_process_pixblock_tail |
1015 | .endm | 1180 | .endm |
1016 | 1181 | ||
1017 | .macro pixman_composite_src_0565_0565_process_pixblock_tail_head | 1182 | .macro pixman_composite_src_0565_0565_process_pixblock_tail_head |
1018 | vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! | 1183 | st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32 |
1019 | fetch_src_pixblock | 1184 | fetch_src_pixblock |
1020 | cache_preload 16, 16 | 1185 | cache_preload 16, 16 |
1021 | .endm | 1186 | .endm |
1022 | 1187 | ||
1023 | generate_composite_function \ | 1188 | generate_composite_function \ |
@@ -1042,21 +1207,19 @@ generate_composite_function \ | |||
1042 | 1207 | ||
1043 | .macro pixman_composite_src_n_8_process_pixblock_tail | 1208 | .macro pixman_composite_src_n_8_process_pixblock_tail |
1044 | .endm | 1209 | .endm |
1045 | 1210 | ||
1046 | .macro pixman_composite_src_n_8_process_pixblock_tail_head | 1211 | .macro pixman_composite_src_n_8_process_pixblock_tail_head |
1047 | vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! | 1212 | st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32 |
1048 | .endm | 1213 | .endm |
1049 | 1214 | ||
1050 | .macro pixman_composite_src_n_8_init | 1215 | .macro pixman_composite_src_n_8_init |
1051 | add DUMMY, sp, #ARGS_STACK_OFFSET | 1216 | mov v0.s[0], w4 |
1052 | vld1.32 {d0[0]}, [DUMMY] | 1217 | dup v3.8b, v0.b[0] |
1053 | vsli.u64 d0, d0, #8 | 1218 | dup v2.8b, v0.b[0] |
1054 | vsli.u64 d0, d0, #16 | 1219 | dup v1.8b, v0.b[0] |
1055 | vsli.u64 d0, d0, #32 | 1220 | dup v0.8b, v0.b[0] |
1056 | vorr d1, d0, d0 | ||
1057 | vorr q1, q0, q0 | ||
1058 | .endm | 1221 | .endm |
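
The splat idiom changes shape here too: the old code doubled a byte across d0 with a chain of vsli shift-inserts, where A64 dup broadcasts a lane in one instruction. Both compute the same value (a sketch):

    #include <stdint.h>

    /* Replicate one byte across 8 lanes. */
    static uint64_t splat8(uint8_t x)   /* new: dup v0.8b, v0.b[0]   */
    {
        uint64_t v = x;
        v |= v << 8;                    /* old: vsli.u64 d0, d0, #8  */
        v |= v << 16;                   /* old: vsli.u64 d0, d0, #16 */
        v |= v << 32;                   /* old: vsli.u64 d0, d0, #32 */
        return v;
    }
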
1059 | 1222 | ||
1060 | .macro pixman_composite_src_n_8_cleanup | 1223 | .macro pixman_composite_src_n_8_cleanup |
1061 | .endm | 1224 | .endm |
1062 | 1225 | ||
@@ -1082,20 +1245,19 @@ generate_composite_function \ | |||
1082 | 1245 | ||
1083 | .macro pixman_composite_src_n_0565_process_pixblock_tail | 1246 | .macro pixman_composite_src_n_0565_process_pixblock_tail |
1084 | .endm | 1247 | .endm |
1085 | 1248 | ||
1086 | .macro pixman_composite_src_n_0565_process_pixblock_tail_head | 1249 | .macro pixman_composite_src_n_0565_process_pixblock_tail_head |
1087 | vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! | 1250 | st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32 |
1088 | .endm | 1251 | .endm |
1089 | 1252 | ||
1090 | .macro pixman_composite_src_n_0565_init | 1253 | .macro pixman_composite_src_n_0565_init |
1091 | add DUMMY, sp, #ARGS_STACK_OFFSET | 1254 | mov v0.s[0], w4 |
1092 | vld1.32 {d0[0]}, [DUMMY] | 1255 | dup v3.4h, v0.h[0] |
1093 | vsli.u64 d0, d0, #16 | 1256 | dup v2.4h, v0.h[0] |
1094 | vsli.u64 d0, d0, #32 | 1257 | dup v1.4h, v0.h[0] |
1095 | vorr d1, d0, d0 | 1258 | dup v0.4h, v0.h[0] |
1096 | vorr q1, q0, q0 | ||
1097 | .endm | 1259 | .endm |
1098 | 1260 | ||
1099 | .macro pixman_composite_src_n_0565_cleanup | 1261 | .macro pixman_composite_src_n_0565_cleanup |
1100 | .endm | 1262 | .endm |
1101 | 1263 | ||
@@ -1121,19 +1283,19 @@ generate_composite_function \ | |||
1121 | 1283 | ||
1122 | .macro pixman_composite_src_n_8888_process_pixblock_tail | 1284 | .macro pixman_composite_src_n_8888_process_pixblock_tail |
1123 | .endm | 1285 | .endm |
1124 | 1286 | ||
1125 | .macro pixman_composite_src_n_8888_process_pixblock_tail_head | 1287 | .macro pixman_composite_src_n_8888_process_pixblock_tail_head |
1126 | vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! | 1288 | st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 |
1127 | .endm | 1289 | .endm |
1128 | 1290 | ||
1129 | .macro pixman_composite_src_n_8888_init | 1291 | .macro pixman_composite_src_n_8888_init |
1130 | add DUMMY, sp, #ARGS_STACK_OFFSET | 1292 | mov v0.s[0], w4 |
1131 | vld1.32 {d0[0]}, [DUMMY] | 1293 | dup v3.2s, v0.s[0] |
1132 | vsli.u64 d0, d0, #32 | 1294 | dup v2.2s, v0.s[0] |
1133 | vorr d1, d0, d0 | 1295 | dup v1.2s, v0.s[0] |
1134 | vorr q1, q0, q0 | 1296 | dup v0.2s, v0.s[0] |
1135 | .endm | 1297 | .endm |
1136 | 1298 | ||
1137 | .macro pixman_composite_src_n_8888_cleanup | 1299 | .macro pixman_composite_src_n_8888_cleanup |
1138 | .endm | 1300 | .endm |
1139 | 1301 | ||
@@ -1159,11 +1321,11 @@ generate_composite_function \ | |||
1159 | 1321 | ||
1160 | .macro pixman_composite_src_8888_8888_process_pixblock_tail | 1322 | .macro pixman_composite_src_8888_8888_process_pixblock_tail |
1161 | .endm | 1323 | .endm |
1162 | 1324 | ||
1163 | .macro pixman_composite_src_8888_8888_process_pixblock_tail_head | 1325 | .macro pixman_composite_src_8888_8888_process_pixblock_tail_head |
1164 | vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! | 1326 | st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 |
1165 | fetch_src_pixblock | 1327 | fetch_src_pixblock |
1166 | cache_preload 8, 8 | 1328 | cache_preload 8, 8 |
1167 | .endm | 1329 | .endm |
1168 | 1330 | ||
1169 | generate_composite_function \ | 1331 | generate_composite_function \ |
@@ -1182,28 +1344,33 @@ generate_composite_function \ | |||
1182 | 0 /* mask_basereg */ | 1344 | 0 /* mask_basereg */ |
1183 | 1345 | ||
1184 | /******************************************************************************/ | 1346 | /******************************************************************************/ |
1185 | 1347 | ||
1186 | .macro pixman_composite_src_x888_8888_process_pixblock_head | 1348 | .macro pixman_composite_src_x888_8888_process_pixblock_head |
1187 | vorr q0, q0, q2 | 1349 | orr v0.8b, v0.8b, v4.8b |
1188 | vorr q1, q1, q2 | 1350 | orr v1.8b, v1.8b, v4.8b |
1351 | orr v2.8b, v2.8b, v4.8b | ||
1352 | orr v3.8b, v3.8b, v4.8b | ||
1189 | .endm | 1353 | .endm |
1190 | 1354 | ||
1191 | .macro pixman_composite_src_x888_8888_process_pixblock_tail | 1355 | .macro pixman_composite_src_x888_8888_process_pixblock_tail |
1192 | .endm | 1356 | .endm |
1193 | 1357 | ||
1194 | .macro pixman_composite_src_x888_8888_process_pixblock_tail_head | 1358 | .macro pixman_composite_src_x888_8888_process_pixblock_tail_head |
1195 | vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! | 1359 | st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 |
1196 | fetch_src_pixblock | 1360 | fetch_src_pixblock |
1197 | vorr q0, q0, q2 | 1361 | orr v0.8b, v0.8b, v4.8b |
1198 | vorr q1, q1, q2 | 1362 | orr v1.8b, v1.8b, v4.8b |
1363 | orr v2.8b, v2.8b, v4.8b | ||
1364 | orr v3.8b, v3.8b, v4.8b | ||
1199 | cache_preload 8, 8 | 1365 | cache_preload 8, 8 |
1200 | .endm | 1366 | .endm |
1201 | 1367 | ||
1202 | .macro pixman_composite_src_x888_8888_init | 1368 | .macro pixman_composite_src_x888_8888_init |
1203 | vmov.u8 q2, #0xFF | 1369 | mov w20, #0xFF |
1204 | vshl.u32 q2, q2, #24 | 1370 | dup v4.8b, w20 |
1371 | shl v4.2s, v4.2s, #24 | ||
1205 | .endm | 1372 | .endm |
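
src_x888_8888 only needs to force the alpha byte on. The init builds a per-lane 0xff000000 constant (dup 0xFF into every byte, then shift each 32-bit lane left by 24; w20 is simply a scratch GPR the patch picked), after which the per-pixel work is one OR. A scalar sketch:

    #include <stdint.h>

    static inline uint32_t x888_to_8888(uint32_t p)
    {
        return p | 0xff000000u;   /* orr v0.8b..v3.8b with v4 */
    }
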
1206 | 1373 | ||
1207 | generate_composite_function \ | 1374 | generate_composite_function \ |
1208 | pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ | 1375 | pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ |
1209 | FLAG_DST_WRITEONLY, \ | 1376 | FLAG_DST_WRITEONLY, \ |
@@ -1220,64 +1387,76 @@ generate_composite_function \ | |||
1220 | 0 /* mask_basereg */ | 1387 | 0 /* mask_basereg */ |
1221 | 1388 | ||
1222 | /******************************************************************************/ | 1389 | /******************************************************************************/ |
1223 | 1390 | ||
1224 | .macro pixman_composite_src_n_8_8888_process_pixblock_head | 1391 | .macro pixman_composite_src_n_8_8888_process_pixblock_head |
1225 | /* expecting solid source in {d0, d1, d2, d3} */ | 1392 | /* expecting solid source in {v0, v1, v2, v3} */ |
1226 | /* mask is in d24 (d25, d26, d27 are unused) */ | 1393 | /* mask is in v24 (v25, v26, v27 are unused) */ |
1227 | 1394 | ||
1228 | /* in */ | 1395 | /* in */ |
1229 | vmull.u8 q8, d24, d0 | 1396 | umull v8.8h, v24.8b, v0.8b |
1230 | vmull.u8 q9, d24, d1 | 1397 | umull v9.8h, v24.8b, v1.8b |
1231 | vmull.u8 q10, d24, d2 | 1398 | umull v10.8h, v24.8b, v2.8b |
1232 | vmull.u8 q11, d24, d3 | 1399 | umull v11.8h, v24.8b, v3.8b |
1233 | vrsra.u16 q8, q8, #8 | 1400 | ursra v8.8h, v8.8h, #8 |
1234 | vrsra.u16 q9, q9, #8 | 1401 | ursra v9.8h, v9.8h, #8 |
1235 | vrsra.u16 q10, q10, #8 | 1402 | ursra v10.8h, v10.8h, #8 |
1236 | vrsra.u16 q11, q11, #8 | 1403 | ursra v11.8h, v11.8h, #8 |
1237 | .endm | 1404 | .endm |
1238 | 1405 | ||
1239 | .macro pixman_composite_src_n_8_8888_process_pixblock_tail | 1406 | .macro pixman_composite_src_n_8_8888_process_pixblock_tail |
1240 | vrshrn.u16 d28, q8, #8 | 1407 | rshrn v28.8b, v8.8h, #8 |
1241 | vrshrn.u16 d29, q9, #8 | 1408 | rshrn v29.8b, v9.8h, #8 |
1242 | vrshrn.u16 d30, q10, #8 | 1409 | rshrn v30.8b, v10.8h, #8 |
1243 | vrshrn.u16 d31, q11, #8 | 1410 | rshrn v31.8b, v11.8h, #8 |
1244 | .endm | 1411 | .endm |
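
The mask IN here uses the accumulate form of the same division: ursra folds the rounding correction back into the 16-bit product, and rshrn narrows with a final rounding shift, giving the identical round(x/255) as the urshr/raddhn pairs earlier. A scalar model (a sketch):

    #include <stdint.h>

    /* s IN m, i.e. round(s * m / 255). */
    static inline uint8_t in_channel(uint8_t s, uint8_t m)
    {
        uint16_t x = (uint16_t)s * m;        /* umull v8.8h, v24.8b, v0.8b */
        x += (x + 128) >> 8;                 /* ursra v8.8h, v8.8h, #8     */
        return (uint8_t)((x + 128) >> 8);    /* rshrn v28.8b, v8.8h, #8    */
    }
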
1245 | 1412 | ||
1246 | .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head | 1413 | .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head |
1247 | fetch_mask_pixblock | 1414 | fetch_mask_pixblock |
1248 | PF add PF_X, PF_X, #8 | 1415 | PF add PF_X, PF_X, #8 |
1249 | vrshrn.u16 d28, q8, #8 | 1416 | rshrn v28.8b, v8.8h, #8 |
1250 | PF tst PF_CTL, #0x0F | 1417 | PF tst PF_CTL, #0x0F |
1251 | vrshrn.u16 d29, q9, #8 | 1418 | rshrn v29.8b, v9.8h, #8 |
1252 | PF addne PF_X, PF_X, #8 | 1419 | PF beq 10f |
1253 | vrshrn.u16 d30, q10, #8 | 1420 | PF add PF_X, PF_X, #8 |
1254 | PF subne PF_CTL, PF_CTL, #1 | 1421 | 10: |
1255 | vrshrn.u16 d31, q11, #8 | 1422 | rshrn v30.8b, v10.8h, #8 |
1423 | PF beq 10f | ||
1424 | PF sub PF_CTL, PF_CTL, #1 | ||
1425 | 10: | ||
1426 | rshrn v31.8b, v11.8h, #8 | ||
1256 | PF cmp PF_X, ORIG_W | 1427 | PF cmp PF_X, ORIG_W |
1257 | vmull.u8 q8, d24, d0 | 1428 | umull v8.8h, v24.8b, v0.8b |
1258 | PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] | 1429 | PF lsl DUMMY, PF_X, #mask_bpp_shift |
1259 | vmull.u8 q9, d24, d1 | 1430 | PF prfm pldl2strm, [PF_MASK, DUMMY] |
1260 | PF subge PF_X, PF_X, ORIG_W | 1431 | umull v9.8h, v24.8b, v1.8b |
1261 | vmull.u8 q10, d24, d2 | 1432 | PF ble 10f |
1262 | PF subges PF_CTL, PF_CTL, #0x10 | 1433 | PF sub PF_X, PF_X, ORIG_W |
1263 | vmull.u8 q11, d24, d3 | 1434 | 10: |
1264 | PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! | 1435 | umull v10.8h, v24.8b, v2.8b |
1265 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 1436 | PF ble 10f |
1266 | vrsra.u16 q8, q8, #8 | 1437 | PF subs PF_CTL, PF_CTL, #0x10 |
1267 | vrsra.u16 q9, q9, #8 | 1438 | 10: |
1268 | vrsra.u16 q10, q10, #8 | 1439 | umull v11.8h, v24.8b, v3.8b |
1269 | vrsra.u16 q11, q11, #8 | 1440 | PF ble 10f |
1441 | PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift | ||
1442 | PF ldrsb DUMMY, [PF_MASK, DUMMY] | ||
1443 | PF add PF_MASK, PF_MASK, #1 | ||
1444 | 10: | ||
1445 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 | ||
1446 | ursra v8.8h, v8.8h, #8 | ||
1447 | ursra v9.8h, v9.8h, #8 | ||
1448 | ursra v10.8h, v10.8h, #8 | ||
1449 | ursra v11.8h, v11.8h, #8 | ||
1270 | .endm | 1450 | .endm |
1271 | 1451 | ||
1272 | .macro pixman_composite_src_n_8_8888_init | 1452 | .macro pixman_composite_src_n_8_8888_init |
1273 | add DUMMY, sp, #ARGS_STACK_OFFSET | 1453 | mov v3.s[0], w4 |
1274 | vld1.32 {d3[0]}, [DUMMY] | 1454 | dup v0.8b, v3.b[0] |
1275 | vdup.8 d0, d3[0] | 1455 | dup v1.8b, v3.b[1] |
1276 | vdup.8 d1, d3[1] | 1456 | dup v2.8b, v3.b[2] |
1277 | vdup.8 d2, d3[2] | 1457 | dup v3.8b, v3.b[3] |
1278 | vdup.8 d3, d3[3] | ||
1279 | .endm | 1458 | .endm |
1280 | 1459 | ||
1281 | .macro pixman_composite_src_n_8_8888_cleanup | 1460 | .macro pixman_composite_src_n_8_8888_cleanup |
1282 | .endm | 1461 | .endm |
1283 | 1462 | ||
@@ -1293,57 +1472,69 @@ generate_composite_function \ | |||
1293 | pixman_composite_src_n_8_8888_process_pixblock_tail_head, \ | 1472 | pixman_composite_src_n_8_8888_process_pixblock_tail_head, \ |
1294 | 1473 | ||
1295 | /******************************************************************************/ | 1474 | /******************************************************************************/ |
1296 | 1475 | ||
1297 | .macro pixman_composite_src_n_8_8_process_pixblock_head | 1476 | .macro pixman_composite_src_n_8_8_process_pixblock_head |
1298 | vmull.u8 q0, d24, d16 | 1477 | umull v0.8h, v24.8b, v16.8b |
1299 | vmull.u8 q1, d25, d16 | 1478 | umull v1.8h, v25.8b, v16.8b |
1300 | vmull.u8 q2, d26, d16 | 1479 | umull v2.8h, v26.8b, v16.8b |
1301 | vmull.u8 q3, d27, d16 | 1480 | umull v3.8h, v27.8b, v16.8b |
1302 | vrsra.u16 q0, q0, #8 | 1481 | ursra v0.8h, v0.8h, #8 |
1303 | vrsra.u16 q1, q1, #8 | 1482 | ursra v1.8h, v1.8h, #8 |
1304 | vrsra.u16 q2, q2, #8 | 1483 | ursra v2.8h, v2.8h, #8 |
1305 | vrsra.u16 q3, q3, #8 | 1484 | ursra v3.8h, v3.8h, #8 |
1306 | .endm | 1485 | .endm |
1307 | 1486 | ||
1308 | .macro pixman_composite_src_n_8_8_process_pixblock_tail | 1487 | .macro pixman_composite_src_n_8_8_process_pixblock_tail |
1309 | vrshrn.u16 d28, q0, #8 | 1488 | rshrn v28.8b, v0.8h, #8 |
1310 | vrshrn.u16 d29, q1, #8 | 1489 | rshrn v29.8b, v1.8h, #8 |
1311 | vrshrn.u16 d30, q2, #8 | 1490 | rshrn v30.8b, v2.8h, #8 |
1312 | vrshrn.u16 d31, q3, #8 | 1491 | rshrn v31.8b, v3.8h, #8 |
1313 | .endm | 1492 | .endm |
1314 | 1493 | ||
1315 | .macro pixman_composite_src_n_8_8_process_pixblock_tail_head | 1494 | .macro pixman_composite_src_n_8_8_process_pixblock_tail_head |
1316 | fetch_mask_pixblock | 1495 | fetch_mask_pixblock |
1317 | PF add PF_X, PF_X, #8 | 1496 | PF add PF_X, PF_X, #8 |
1318 | vrshrn.u16 d28, q0, #8 | 1497 | rshrn v28.8b, v0.8h, #8 |
1319 | PF tst PF_CTL, #0x0F | 1498 | PF tst PF_CTL, #0x0F |
1320 | vrshrn.u16 d29, q1, #8 | 1499 | rshrn v29.8b, v1.8h, #8 |
1321 | PF addne PF_X, PF_X, #8 | 1500 | PF beq 10f |
1322 | vrshrn.u16 d30, q2, #8 | 1501 | PF add PF_X, PF_X, #8 |
1323 | PF subne PF_CTL, PF_CTL, #1 | 1502 | 10: |
1324 | vrshrn.u16 d31, q3, #8 | 1503 | rshrn v30.8b, v2.8h, #8 |
1504 | PF beq 10f | ||
1505 | PF sub PF_CTL, PF_CTL, #1 | ||
1506 | 10: | ||
1507 | rshrn v31.8b, v3.8h, #8 | ||
1325 | PF cmp PF_X, ORIG_W | 1508 | PF cmp PF_X, ORIG_W |
1326 | vmull.u8 q0, d24, d16 | 1509 | umull v0.8h, v24.8b, v16.8b |
1327 | PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] | 1510 | PF lsl DUMMY, PF_X, #mask_bpp_shift |
1328 | vmull.u8 q1, d25, d16 | 1511 | PF prfm pldl2strm, [PF_MASK, DUMMY] |
1329 | PF subge PF_X, PF_X, ORIG_W | 1512 | umull v1.8h, v25.8b, v16.8b |
1330 | vmull.u8 q2, d26, d16 | 1513 | PF ble 10f |
1331 | PF subges PF_CTL, PF_CTL, #0x10 | 1514 | PF sub PF_X, PF_X, ORIG_W |
1332 | vmull.u8 q3, d27, d16 | 1515 | 10: |
1333 | PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! | 1516 | umull v2.8h, v26.8b, v16.8b |
1334 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! | 1517 | PF ble 10f |
1335 | vrsra.u16 q0, q0, #8 | 1518 | PF subs PF_CTL, PF_CTL, #0x10 |
1336 | vrsra.u16 q1, q1, #8 | 1519 | 10: |
1337 | vrsra.u16 q2, q2, #8 | 1520 | umull v3.8h, v27.8b, v16.8b |
1338 | vrsra.u16 q3, q3, #8 | 1521 | PF ble 10f |
1522 | PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift | ||
1523 | PF ldrsb DUMMY, [PF_MASK, DUMMY] | ||
1524 | PF add PF_MASK, PF_MASK, #1 | ||
1525 | 10: | ||
1526 | st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 | ||
1527 | ursra v0.8h, v0.8h, #8 | ||
1528 | ursra v1.8h, v1.8h, #8 | ||
1529 | ursra v2.8h, v2.8h, #8 | ||
1530 | ursra v3.8h, v3.8h, #8 | ||
1339 | .endm | 1531 | .endm |
1340 | 1532 | ||
1341 | .macro pixman_composite_src_n_8_8_init | 1533 | .macro pixman_composite_src_n_8_8_init |
1342 | add DUMMY, sp, #ARGS_STACK_OFFSET | 1534 | mov v16.s[0], w4 |
1343 | vld1.32 {d16[0]}, [DUMMY] | 1535 | dup v16.8b, v16.b[3] |
1344 | vdup.8 d16, d16[3] | ||
1345 | .endm | 1536 | .endm |
1346 | 1537 | ||
1347 | .macro pixman_composite_src_n_8_8_cleanup | 1538 | .macro pixman_composite_src_n_8_8_cleanup |
1348 | .endm | 1539 | .endm |
1349 | 1540 | ||
@@ -1359,107 +1550,126 @@ generate_composite_function \ | |||
1359 | pixman_composite_src_n_8_8_process_pixblock_tail_head | 1550 | pixman_composite_src_n_8_8_process_pixblock_tail_head |
1360 | 1551 | ||
1361 | /******************************************************************************/ | 1552 | /******************************************************************************/ |
1362 | 1553 | ||
1363 | .macro pixman_composite_over_n_8_8888_process_pixblock_head | 1554 | .macro pixman_composite_over_n_8_8888_process_pixblock_head |
1364 | /* expecting deinterleaved source data in {d8, d9, d10, d11} */ | 1555 | /* expecting deinterleaved source data in {v8, v9, v10, v11} */ |
1365 | /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ | 1556 | /* v8 - blue, v9 - green, v10 - red, v11 - alpha */ |
1366 | /* and destination data in {d4, d5, d6, d7} */ | 1557 | /* and destination data in {v4, v5, v6, v7} */ |
1367 | /* mask is in d24 (d25, d26, d27 are unused) */ | 1558 | /* mask is in v24 (v25, v26, v27 are unused) */ |
1368 | 1559 | ||
1369 | /* in */ | 1560 | /* in */ |
1370 | vmull.u8 q6, d24, d8 | 1561 | umull v12.8h, v24.8b, v8.8b |
1371 | vmull.u8 q7, d24, d9 | 1562 | umull v13.8h, v24.8b, v9.8b |
1372 | vmull.u8 q8, d24, d10 | 1563 | umull v14.8h, v24.8b, v10.8b |
1373 | vmull.u8 q9, d24, d11 | 1564 | umull v15.8h, v24.8b, v11.8b |
1374 | vrshr.u16 q10, q6, #8 | 1565 | urshr v16.8h, v12.8h, #8 |
1375 | vrshr.u16 q11, q7, #8 | 1566 | urshr v17.8h, v13.8h, #8 |
1376 | vrshr.u16 q12, q8, #8 | 1567 | urshr v18.8h, v14.8h, #8 |
1377 | vrshr.u16 q13, q9, #8 | 1568 | urshr v19.8h, v15.8h, #8 |
1378 | vraddhn.u16 d0, q6, q10 | 1569 | raddhn v0.8b, v12.8h, v16.8h |
1379 | vraddhn.u16 d1, q7, q11 | 1570 | raddhn v1.8b, v13.8h, v17.8h |
1380 | vraddhn.u16 d2, q8, q12 | 1571 | raddhn v2.8b, v14.8h, v18.8h |
1381 | vraddhn.u16 d3, q9, q13 | 1572 | raddhn v3.8b, v15.8h, v19.8h |
1382 | vmvn.8 d25, d3 /* get inverted alpha */ | 1573 | mvn v25.8b, v3.8b /* get inverted alpha */ |
1383 | /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ | 1574 | /* source: v0 - blue, v1 - green, v2 - red, v3 - alpha */ |
1384 | /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ | 1575 | /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */ |
1385 | /* now do alpha blending */ | 1576 | /* now do alpha blending */ |
1386 | vmull.u8 q8, d25, d4 | 1577 | umull v12.8h, v25.8b, v4.8b |
1387 | vmull.u8 q9, d25, d5 | 1578 | umull v13.8h, v25.8b, v5.8b |
1388 | vmull.u8 q10, d25, d6 | 1579 | umull v14.8h, v25.8b, v6.8b |
1389 | vmull.u8 q11, d25, d7 | 1580 | umull v15.8h, v25.8b, v7.8b |
1390 | .endm | 1581 | .endm |
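
over_n_8_8888 chains the two primitives already sketched: the 8-bit mask first scales all four solid-source channels (IN), the scaled alpha is inverted, and the destination is blended (OVER). Per channel, with div255 and over_channel from the earlier sketches (a sketch only):

    /* out_c = (s_c IN m) + d_c * (255 - (s_a IN m)) / 255 */
    static uint8_t over_n_8_channel(uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
    {
        uint8_t s_in  = div255((uint16_t)s  * m);   /* umull v12..v15 + raddhn */
        uint8_t sa_in = div255((uint16_t)sa * m);
        return over_channel(s_in, sa_in, d);        /* mvn v25 + umull + ...   */
    }
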
1391 | 1582 | ||
1392 | .macro pixman_composite_over_n_8_8888_process_pixblock_tail | 1583 | .macro pixman_composite_over_n_8_8888_process_pixblock_tail |
1393 | vrshr.u16 q14, q8, #8 | 1584 | urshr v16.8h, v12.8h, #8 |
1394 | vrshr.u16 q15, q9, #8 | 1585 | urshr v17.8h, v13.8h, #8 |
1395 | vrshr.u16 q6, q10, #8 | 1586 | urshr v18.8h, v14.8h, #8 |
1396 | vrshr.u16 q7, q11, #8 | 1587 | urshr v19.8h, v15.8h, #8 |
1397 | vraddhn.u16 d28, q14, q8 | 1588 | raddhn v28.8b, v16.8h, v12.8h |
1398 | vraddhn.u16 d29, q15, q9 | 1589 | raddhn v29.8b, v17.8h, v13.8h |
1399 | vraddhn.u16 d30, q6, q10 | 1590 | raddhn v30.8b, v18.8h, v14.8h |
1400 | vraddhn.u16 d31, q7, q11 | 1591 | raddhn v31.8b, v19.8h, v15.8h |
1401 | vqadd.u8 q14, q0, q14 | 1592 | uqadd v28.8b, v0.8b, v28.8b |
1402 | vqadd.u8 q15, q1, q15 | 1593 | uqadd v29.8b, v1.8b, v29.8b |
1594 | uqadd v30.8b, v2.8b, v30.8b | ||
1595 | uqadd v31.8b, v3.8b, v31.8b | ||
1403 | .endm | 1596 | .endm |
1404 | 1597 | ||
1405 | .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head | 1598 | .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head |
1406 | vrshr.u16 q14, q8, #8 | 1599 | urshr v16.8h, v12.8h, #8 |
1407 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 1600 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
1408 | vrshr.u16 q15, q9, #8 | 1601 | urshr v17.8h, v13.8h, #8 |
1409 | fetch_mask_pixblock | 1602 | fetch_mask_pixblock |
1410 | vrshr.u16 q6, q10, #8 | 1603 | urshr v18.8h, v14.8h, #8 |
1411 | PF add PF_X, PF_X, #8 | 1604 | PF add PF_X, PF_X, #8 |
1412 | vrshr.u16 q7, q11, #8 | 1605 | urshr v19.8h, v15.8h, #8 |
1413 | PF tst PF_CTL, #0x0F | 1606 | PF tst PF_CTL, #0x0F |
1414 | vraddhn.u16 d28, q14, q8 | 1607 | raddhn v28.8b, v16.8h, v12.8h |
1415 | PF addne PF_X, PF_X, #8 | 1608 | PF beq 10f |
1416 | vraddhn.u16 d29, q15, q9 | 1609 | PF add PF_X, PF_X, #8 |
1417 | PF subne PF_CTL, PF_CTL, #1 | 1610 | 10: |
1418 | vraddhn.u16 d30, q6, q10 | 1611 | raddhn v29.8b, v17.8h, v13.8h |
1612 | PF beq 10f | ||
1613 | PF sub PF_CTL, PF_CTL, #1 | ||
1614 | 10: | ||
1615 | raddhn v30.8b, v18.8h, v14.8h | ||
1419 | PF cmp PF_X, ORIG_W | 1616 | PF cmp PF_X, ORIG_W |
1420 | vraddhn.u16 d31, q7, q11 | 1617 | raddhn v31.8b, v19.8h, v15.8h |
1421 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] | 1618 | PF lsl DUMMY, PF_X, #dst_bpp_shift |
1422 | vmull.u8 q6, d24, d8 | 1619 | PF prfm pldl2strm, [PF_DST, DUMMY] |
1423 | PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] | 1620 | umull v16.8h, v24.8b, v8.8b |
1424 | vmull.u8 q7, d24, d9 | 1621 | PF lsl DUMMY, PF_X, #mask_bpp_shift |
1425 | PF subge PF_X, PF_X, ORIG_W | 1622 | PF prfm pldl2strm, [PF_MASK, DUMMY] |
1426 | vmull.u8 q8, d24, d10 | 1623 | umull v17.8h, v24.8b, v9.8b |
1427 | PF subges PF_CTL, PF_CTL, #0x10 | 1624 | PF ble 10f |
1428 | vmull.u8 q9, d24, d11 | 1625 | PF sub PF_X, PF_X, ORIG_W |
1429 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! | 1626 | 10: |
1430 | vqadd.u8 q14, q0, q14 | 1627 | umull v18.8h, v24.8b, v10.8b |
1431 | PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! | 1628 | PF ble 10f |
1432 | vqadd.u8 q15, q1, q15 | 1629 | PF subs PF_CTL, PF_CTL, #0x10 |
1433 | vrshr.u16 q10, q6, #8 | 1630 | 10: |
1434 | vrshr.u16 q11, q7, #8 | 1631 | umull v19.8h, v24.8b, v11.8b |
1435 | vrshr.u16 q12, q8, #8 | 1632 | PF ble 10f |
1436 | vrshr.u16 q13, q9, #8 | 1633 | PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift |
1437 | vraddhn.u16 d0, q6, q10 | 1634 | PF ldrsb DUMMY, [PF_DST, DUMMY] |
1438 | vraddhn.u16 d1, q7, q11 | 1635 | PF add PF_DST, PF_DST, #1 |
1439 | vraddhn.u16 d2, q8, q12 | 1636 | 10: |
1440 | vraddhn.u16 d3, q9, q13 | 1637 | uqadd v28.8b, v0.8b, v28.8b |
1441 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 1638 | PF ble 10f |
1442 | vmvn.8 d25, d3 | 1639 | PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift |
1443 | vmull.u8 q8, d25, d4 | 1640 | PF ldrsb DUMMY, [PF_MASK, DUMMY] |
1444 | vmull.u8 q9, d25, d5 | 1641 | PF add PF_MASK, PF_MASK, #1 |
1445 | vmull.u8 q10, d25, d6 | 1642 | 10: |
1446 | vmull.u8 q11, d25, d7 | 1643 | uqadd v29.8b, v1.8b, v29.8b |
1644 | uqadd v30.8b, v2.8b, v30.8b | ||
1645 | uqadd v31.8b, v3.8b, v31.8b | ||
1646 | urshr v12.8h, v16.8h, #8 | ||
1647 | urshr v13.8h, v17.8h, #8 | ||
1648 | urshr v14.8h, v18.8h, #8 | ||
1649 | urshr v15.8h, v19.8h, #8 | ||
1650 | raddhn v0.8b, v16.8h, v12.8h | ||
1651 | raddhn v1.8b, v17.8h, v13.8h | ||
1652 | raddhn v2.8b, v18.8h, v14.8h | ||
1653 | raddhn v3.8b, v19.8h, v15.8h | ||
1654 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 | ||
1655 | mvn v25.8b, v3.8b | ||
1656 | umull v12.8h, v25.8b, v4.8b | ||
1657 | umull v13.8h, v25.8b, v5.8b | ||
1658 | umull v14.8h, v25.8b, v6.8b | ||
1659 | umull v15.8h, v25.8b, v7.8b | ||
1447 | .endm | 1660 | .endm |
1448 | 1661 | ||
1449 | .macro pixman_composite_over_n_8_8888_init | 1662 | .macro pixman_composite_over_n_8_8888_init |
1450 | add DUMMY, sp, #ARGS_STACK_OFFSET | 1663 | mov v11.s[0], w4 |
1451 | vpush {d8-d15} | 1664 | dup v8.8b, v11.b[0] |
1452 | vld1.32 {d11[0]}, [DUMMY] | 1665 | dup v9.8b, v11.b[1] |
1453 | vdup.8 d8, d11[0] | 1666 | dup v10.8b, v11.b[2] |
1454 | vdup.8 d9, d11[1] | 1667 | dup v11.8b, v11.b[3] |
1455 | vdup.8 d10, d11[2] | ||
1456 | vdup.8 d11, d11[3] | ||
1457 | .endm | 1668 | .endm |
1458 | 1669 | ||
1459 | .macro pixman_composite_over_n_8_8888_cleanup | 1670 | .macro pixman_composite_over_n_8_8888_cleanup |
1460 | vpop {d8-d15} | ||
1461 | .endm | 1671 | .endm |
1462 | 1672 | ||
1463 | generate_composite_function \ | 1673 | generate_composite_function \ |
1464 | pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ | 1674 | pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ |
1465 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 1675 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -1472,62 +1682,63 @@ generate_composite_function \ | |||
1472 | pixman_composite_over_n_8_8888_process_pixblock_tail_head | 1682 | pixman_composite_over_n_8_8888_process_pixblock_tail_head |
1473 | 1683 | ||
1474 | /******************************************************************************/ | 1684 | /******************************************************************************/ |
1475 | 1685 | ||
1476 | .macro pixman_composite_over_n_8_8_process_pixblock_head | 1686 | .macro pixman_composite_over_n_8_8_process_pixblock_head |
1477 | vmull.u8 q0, d24, d8 | 1687 | umull v0.8h, v24.8b, v8.8b |
1478 | vmull.u8 q1, d25, d8 | 1688 | umull v1.8h, v25.8b, v8.8b |
1479 | vmull.u8 q6, d26, d8 | 1689 | umull v2.8h, v26.8b, v8.8b |
1480 | vmull.u8 q7, d27, d8 | 1690 | umull v3.8h, v27.8b, v8.8b |
1481 | vrshr.u16 q10, q0, #8 | 1691 | urshr v10.8h, v0.8h, #8 |
1482 | vrshr.u16 q11, q1, #8 | 1692 | urshr v11.8h, v1.8h, #8 |
1483 | vrshr.u16 q12, q6, #8 | 1693 | urshr v12.8h, v2.8h, #8 |
1484 | vrshr.u16 q13, q7, #8 | 1694 | urshr v13.8h, v3.8h, #8 |
1485 | vraddhn.u16 d0, q0, q10 | 1695 | raddhn v0.8b, v0.8h, v10.8h |
1486 | vraddhn.u16 d1, q1, q11 | 1696 | raddhn v1.8b, v1.8h, v11.8h |
1487 | vraddhn.u16 d2, q6, q12 | 1697 | raddhn v2.8b, v2.8h, v12.8h |
1488 | vraddhn.u16 d3, q7, q13 | 1698 | raddhn v3.8b, v3.8h, v13.8h |
1489 | vmvn.8 q12, q0 | 1699 | mvn v24.8b, v0.8b |
1490 | vmvn.8 q13, q1 | 1700 | mvn v25.8b, v1.8b |
1491 | vmull.u8 q8, d24, d4 | 1701 | mvn v26.8b, v2.8b |
1492 | vmull.u8 q9, d25, d5 | 1702 | mvn v27.8b, v3.8b |
1493 | vmull.u8 q10, d26, d6 | 1703 | umull v10.8h, v24.8b, v4.8b |
1494 | vmull.u8 q11, d27, d7 | 1704 | umull v11.8h, v25.8b, v5.8b |
1705 | umull v12.8h, v26.8b, v6.8b | ||
1706 | umull v13.8h, v27.8b, v7.8b | ||
1495 | .endm | 1707 | .endm |
1496 | 1708 | ||
1497 | .macro pixman_composite_over_n_8_8_process_pixblock_tail | 1709 | .macro pixman_composite_over_n_8_8_process_pixblock_tail |
1498 | vrshr.u16 q14, q8, #8 | 1710 | urshr v14.8h, v10.8h, #8 |
1499 | vrshr.u16 q15, q9, #8 | 1711 | urshr v15.8h, v11.8h, #8 |
1500 | vrshr.u16 q12, q10, #8 | 1712 | urshr v16.8h, v12.8h, #8 |
1501 | vrshr.u16 q13, q11, #8 | 1713 | urshr v17.8h, v13.8h, #8 |
1502 | vraddhn.u16 d28, q14, q8 | 1714 | raddhn v28.8b, v14.8h, v10.8h |
1503 | vraddhn.u16 d29, q15, q9 | 1715 | raddhn v29.8b, v15.8h, v11.8h |
1504 | vraddhn.u16 d30, q12, q10 | 1716 | raddhn v30.8b, v16.8h, v12.8h |
1505 | vraddhn.u16 d31, q13, q11 | 1717 | raddhn v31.8b, v17.8h, v13.8h |
1506 | vqadd.u8 q14, q0, q14 | 1718 | uqadd v28.8b, v0.8b, v28.8b |
1507 | vqadd.u8 q15, q1, q15 | 1719 | uqadd v29.8b, v1.8b, v29.8b |
1720 | uqadd v30.8b, v2.8b, v30.8b | ||
1721 | uqadd v31.8b, v3.8b, v31.8b | ||
1508 | .endm | 1722 | .endm |
1509 | 1723 | ||
1510 | /* TODO: expand macros and do better instructions scheduling */ | 1724 | /* TODO: expand macros and do better instructions scheduling */ |
1511 | .macro pixman_composite_over_n_8_8_process_pixblock_tail_head | 1725 | .macro pixman_composite_over_n_8_8_process_pixblock_tail_head |
1512 | vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! | 1726 | ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
1513 | pixman_composite_over_n_8_8_process_pixblock_tail | 1727 | pixman_composite_over_n_8_8_process_pixblock_tail |
1514 | fetch_mask_pixblock | 1728 | fetch_mask_pixblock |
1515 | cache_preload 32, 32 | 1729 | cache_preload 32, 32 |
1516 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! | 1730 | st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
1517 | pixman_composite_over_n_8_8_process_pixblock_head | 1731 | pixman_composite_over_n_8_8_process_pixblock_head |
1518 | .endm | 1732 | .endm |
1519 | 1733 | ||
1520 | .macro pixman_composite_over_n_8_8_init | 1734 | .macro pixman_composite_over_n_8_8_init |
1521 | add DUMMY, sp, #ARGS_STACK_OFFSET | 1735 | mov v8.s[0], w4 |
1522 | vpush {d8-d15} | 1736 | dup v8.8b, v8.b[3] |
1523 | vld1.32 {d8[0]}, [DUMMY] | ||
1524 | vdup.8 d8, d8[3] | ||
1525 | .endm | 1737 | .endm |
1526 | 1738 | ||
1527 | .macro pixman_composite_over_n_8_8_cleanup | 1739 | .macro pixman_composite_over_n_8_8_cleanup |
1528 | vpop {d8-d15} | ||
1529 | .endm | 1740 | .endm |
1530 | 1741 | ||
1531 | generate_composite_function \ | 1742 | generate_composite_function \ |
1532 | pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ | 1743 | pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ |
1533 | FLAG_DST_READWRITE, \ | 1744 | FLAG_DST_READWRITE, \ |
@@ -1543,95 +1754,97 @@ generate_composite_function \ | |||
1543 | 1754 | ||
1544 | .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head | 1755 | .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head |
1545 | /* | 1756 | /* |
1546 | * 'combine_mask_ca' replacement | 1757 | * 'combine_mask_ca' replacement |
1547 | * | 1758 | * |
1548 | * input: solid src (n) in {d8, d9, d10, d11} | 1759 | * input: solid src (n) in {v8, v9, v10, v11} |
1549 | * dest in {d4, d5, d6, d7 } | 1760 | * dest in {v4, v5, v6, v7 } |
1550 | * mask in {d24, d25, d26, d27} | 1761 | * mask in {v24, v25, v26, v27} |
1551 | * output: updated src in {d0, d1, d2, d3 } | 1762 | * output: updated src in {v0, v1, v2, v3 } |
1552 | * updated mask in {d24, d25, d26, d3 } | 1763 | * updated mask in {v24, v25, v26, v3 } |
1553 | */ | 1764 | */ |
1554 | vmull.u8 q0, d24, d8 | 1765 | umull v0.8h, v24.8b, v8.8b |
1555 | vmull.u8 q1, d25, d9 | 1766 | umull v1.8h, v25.8b, v9.8b |
1556 | vmull.u8 q6, d26, d10 | 1767 | umull v2.8h, v26.8b, v10.8b |
1557 | vmull.u8 q7, d27, d11 | 1768 | umull v3.8h, v27.8b, v11.8b |
1558 | vmull.u8 q9, d11, d25 | 1769 | umull v12.8h, v11.8b, v25.8b |
1559 | vmull.u8 q12, d11, d24 | 1770 | umull v13.8h, v11.8b, v24.8b |
1560 | vmull.u8 q13, d11, d26 | 1771 | umull v14.8h, v11.8b, v26.8b |
1561 | vrshr.u16 q8, q0, #8 | 1772 | urshr v15.8h, v0.8h, #8 |
1562 | vrshr.u16 q10, q1, #8 | 1773 | urshr v16.8h, v1.8h, #8 |
1563 | vrshr.u16 q11, q6, #8 | 1774 | urshr v17.8h, v2.8h, #8 |
1564 | vraddhn.u16 d0, q0, q8 | 1775 | raddhn v0.8b, v0.8h, v15.8h |
1565 | vraddhn.u16 d1, q1, q10 | 1776 | raddhn v1.8b, v1.8h, v16.8h |
1566 | vraddhn.u16 d2, q6, q11 | 1777 | raddhn v2.8b, v2.8h, v17.8h |
1567 | vrshr.u16 q11, q12, #8 | 1778 | urshr v15.8h, v13.8h, #8 |
1568 | vrshr.u16 q8, q9, #8 | 1779 | urshr v16.8h, v12.8h, #8 |
1569 | vrshr.u16 q6, q13, #8 | 1780 | urshr v17.8h, v14.8h, #8 |
1570 | vrshr.u16 q10, q7, #8 | 1781 | urshr v18.8h, v3.8h, #8 |
1571 | vraddhn.u16 d24, q12, q11 | 1782 | raddhn v24.8b, v13.8h, v15.8h |
1572 | vraddhn.u16 d25, q9, q8 | 1783 | raddhn v25.8b, v12.8h, v16.8h |
1573 | vraddhn.u16 d26, q13, q6 | 1784 | raddhn v26.8b, v14.8h, v17.8h |
1574 | vraddhn.u16 d3, q7, q10 | 1785 | raddhn v3.8b, v3.8h, v18.8h |
1575 | /* | 1786 | /* |
1576 | * 'combine_over_ca' replacement | 1787 | * 'combine_over_ca' replacement |
1577 | * | 1788 | * |
1578 | * output: updated dest in {d28, d29, d30, d31} | 1789 | * output: updated dest in {v28, v29, v30, v31} |
1579 | */ | 1790 | */ |
1580 | vmvn.8 q12, q12 | 1791 | mvn v24.8b, v24.8b |
1581 | vmvn.8 d26, d26 | 1792 | mvn v25.8b, v25.8b |
1582 | vmull.u8 q8, d24, d4 | 1793 | mvn v26.8b, v26.8b |
1583 | vmull.u8 q9, d25, d5 | 1794 | mvn v27.8b, v3.8b |
1584 | vmvn.8 d27, d3 | 1795 | umull v12.8h, v24.8b, v4.8b |
1585 | vmull.u8 q10, d26, d6 | 1796 | umull v13.8h, v25.8b, v5.8b |
1586 | vmull.u8 q11, d27, d7 | 1797 | umull v14.8h, v26.8b, v6.8b |
1798 | umull v15.8h, v27.8b, v7.8b | ||
1587 | .endm | 1799 | .endm |
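
The component-alpha head does both halves of the CA pipeline: the mask scales the source per channel while the source alpha scales the mask, then the blended mask drives an ordinary per-channel OVER. Per channel, reusing div255 from the first sketch (a sketch only):

    /* Component-alpha OVER for one channel c:
     * s'  = s_c * m_c / 255             ('combine_mask_ca', src half)
     * m'  = s_a * m_c / 255             ('combine_mask_ca', mask half)
     * out = s' + d_c * (255 - m') / 255 ('combine_over_ca')            */
    static uint8_t over_ca_channel(uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
    {
        uint8_t  s_in = div255((uint16_t)s  * m);               /* umull v0..v3   */
        uint8_t  m_in = div255((uint16_t)sa * m);               /* umull v12..v14 */
        uint16_t r = s_in + div255((uint16_t)(255 - m_in) * d); /* mvn + umull    */
        return r > 255 ? 255 : (uint8_t)r;                      /* uqadd          */
    }
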
1588 | 1800 | ||
1589 | .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail | 1801 | .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail |
1590 | /* ... continue 'combine_over_ca' replacement */ | 1802 | /* ... continue 'combine_over_ca' replacement */ |
1591 | vrshr.u16 q14, q8, #8 | 1803 | urshr v16.8h, v12.8h, #8 |
1592 | vrshr.u16 q15, q9, #8 | 1804 | urshr v17.8h, v13.8h, #8 |
1593 | vrshr.u16 q6, q10, #8 | 1805 | urshr v18.8h, v14.8h, #8 |
1594 | vrshr.u16 q7, q11, #8 | 1806 | urshr v19.8h, v15.8h, #8 |
1595 | vraddhn.u16 d28, q14, q8 | 1807 | raddhn v28.8b, v16.8h, v12.8h |
1596 | vraddhn.u16 d29, q15, q9 | 1808 | raddhn v29.8b, v17.8h, v13.8h |
1597 | vraddhn.u16 d30, q6, q10 | 1809 | raddhn v30.8b, v18.8h, v14.8h |
1598 | vraddhn.u16 d31, q7, q11 | 1810 | raddhn v31.8b, v19.8h, v15.8h |
1599 | vqadd.u8 q14, q0, q14 | 1811 | uqadd v28.8b, v0.8b, v28.8b |
1600 | vqadd.u8 q15, q1, q15 | 1812 | uqadd v29.8b, v1.8b, v29.8b |
1813 | uqadd v30.8b, v2.8b, v30.8b | ||
1814 | uqadd v31.8b, v3.8b, v31.8b | ||
1601 | .endm | 1815 | .endm |
1602 | 1816 | ||
1603 | .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head | 1817 | .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head |
1604 | vrshr.u16 q14, q8, #8 | 1818 | urshr v16.8h, v12.8h, #8 |
1605 | vrshr.u16 q15, q9, #8 | 1819 | urshr v17.8h, v13.8h, #8 |
1606 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 1820 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
1607 | vrshr.u16 q6, q10, #8 | 1821 | urshr v18.8h, v14.8h, #8 |
1608 | vrshr.u16 q7, q11, #8 | 1822 | urshr v19.8h, v15.8h, #8 |
1609 | vraddhn.u16 d28, q14, q8 | 1823 | raddhn v28.8b, v16.8h, v12.8h |
1610 | vraddhn.u16 d29, q15, q9 | 1824 | raddhn v29.8b, v17.8h, v13.8h |
1611 | vraddhn.u16 d30, q6, q10 | 1825 | raddhn v30.8b, v18.8h, v14.8h |
1612 | vraddhn.u16 d31, q7, q11 | 1826 | raddhn v31.8b, v19.8h, v15.8h |
1613 | fetch_mask_pixblock | 1827 | fetch_mask_pixblock |
1614 | vqadd.u8 q14, q0, q14 | 1828 | uqadd v28.8b, v0.8b, v28.8b |
1615 | vqadd.u8 q15, q1, q15 | 1829 | uqadd v29.8b, v1.8b, v29.8b |
1830 | uqadd v30.8b, v2.8b, v30.8b | ||
1831 | uqadd v31.8b, v3.8b, v31.8b | ||
1616 | cache_preload 8, 8 | 1832 | cache_preload 8, 8 |
1617 | pixman_composite_over_n_8888_8888_ca_process_pixblock_head | 1833 | pixman_composite_over_n_8888_8888_ca_process_pixblock_head |
1618 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 1834 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
1619 | .endm | 1835 | .endm |
1620 | 1836 | ||
1621 | .macro pixman_composite_over_n_8888_8888_ca_init | 1837 | .macro pixman_composite_over_n_8888_8888_ca_init |
1622 | add DUMMY, sp, #ARGS_STACK_OFFSET | 1838 | mov v13.s[0], w4 |
1623 | vpush {d8-d15} | 1839 | dup v8.8b, v13.b[0] |
1624 | vld1.32 {d11[0]}, [DUMMY] | 1840 | dup v9.8b, v13.b[1] |
1625 | vdup.8 d8, d11[0] | 1841 | dup v10.8b, v13.b[2] |
1626 | vdup.8 d9, d11[1] | 1842 | dup v11.8b, v13.b[3] |
1627 | vdup.8 d10, d11[2] | ||
1628 | vdup.8 d11, d11[3] | ||
1629 | .endm | 1843 | .endm |
1630 | 1844 | ||
1631 | .macro pixman_composite_over_n_8888_8888_ca_cleanup | 1845 | .macro pixman_composite_over_n_8888_8888_ca_cleanup |
1632 | vpop {d8-d15} | ||
1633 | .endm | 1846 | .endm |
1634 | 1847 | ||
1635 | generate_composite_function \ | 1848 | generate_composite_function \ |
1636 | pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ | 1849 | pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ |
1637 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 1850 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -1647,160 +1860,174 @@ generate_composite_function \ | |||
1647 | 1860 | ||
1648 | .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head | 1861 | .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head |
1649 | /* | 1862 | /* |
1650 | * 'combine_mask_ca' replacement | 1863 | * 'combine_mask_ca' replacement |
1651 | * | 1864 | * |
1652 | * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] | 1865 | * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A] |
1653 | * mask in {d24, d25, d26} [B, G, R] | 1866 | * mask in {v24, v25, v26} [B, G, R] |
1654 | * output: updated src in {d0, d1, d2 } [B, G, R] | 1867 | * output: updated src in {v0, v1, v2 } [B, G, R] |
1655 | * updated mask in {d24, d25, d26} [B, G, R] | 1868 | * updated mask in {v24, v25, v26} [B, G, R] |
1656 | */ | 1869 | */ |
1657 | vmull.u8 q0, d24, d8 | 1870 | umull v0.8h, v24.8b, v8.8b |
1658 | vmull.u8 q1, d25, d9 | 1871 | umull v1.8h, v25.8b, v9.8b |
1659 | vmull.u8 q6, d26, d10 | 1872 | umull v2.8h, v26.8b, v10.8b |
1660 | vmull.u8 q9, d11, d25 | 1873 | umull v12.8h, v11.8b, v24.8b |
1661 | vmull.u8 q12, d11, d24 | 1874 | umull v13.8h, v11.8b, v25.8b |
1662 | vmull.u8 q13, d11, d26 | 1875 | umull v14.8h, v11.8b, v26.8b |
1663 | vrshr.u16 q8, q0, #8 | 1876 | urshr v15.8h, v0.8h, #8 |
1664 | vrshr.u16 q10, q1, #8 | 1877 | urshr v16.8h, v1.8h, #8 |
1665 | vrshr.u16 q11, q6, #8 | 1878 | urshr v17.8h, v2.8h, #8 |
1666 | vraddhn.u16 d0, q0, q8 | 1879 | raddhn v0.8b, v0.8h, v15.8h |
1667 | vraddhn.u16 d1, q1, q10 | 1880 | raddhn v1.8b, v1.8h, v16.8h |
1668 | vraddhn.u16 d2, q6, q11 | 1881 | raddhn v2.8b, v2.8h, v17.8h |
1669 | vrshr.u16 q11, q12, #8 | 1882 | urshr v19.8h, v12.8h, #8 |
1670 | vrshr.u16 q8, q9, #8 | 1883 | urshr v20.8h, v13.8h, #8 |
1671 | vrshr.u16 q6, q13, #8 | 1884 | urshr v21.8h, v14.8h, #8 |
1672 | vraddhn.u16 d24, q12, q11 | 1885 | raddhn v24.8b, v12.8h, v19.8h |
1673 | vraddhn.u16 d25, q9, q8 | 1886 | raddhn v25.8b, v13.8h, v20.8h |
1674 | /* | 1887 | /* |
1675 | * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format | 1888 | * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format |
1676 | * and put data into d16 - blue, d17 - green, d18 - red | 1889 | * and put data into v16 - blue, v17 - green, v18 - red |
1677 | */ | 1890 | */ |
1678 | vshrn.u16 d17, q2, #3 | 1891 | mov v4.d[1], v5.d[0] |
1679 | vshrn.u16 d18, q2, #8 | 1892 | shrn v17.8b, v4.8h, #3 |
1680 | vraddhn.u16 d26, q13, q6 | 1893 | shrn v18.8b, v4.8h, #8 |
1681 | vsli.u16 q2, q2, #5 | 1894 | raddhn v26.8b, v14.8h, v21.8h |
1682 | vsri.u8 d18, d18, #5 | 1895 | sli v4.8h, v4.8h, #5 |
1683 | vsri.u8 d17, d17, #6 | 1896 | sri v18.8b, v18.8b, #5 |
1897 | sri v17.8b, v17.8b, #6 | ||
1684 | /* | 1898 | /* |
1685 | * 'combine_over_ca' replacement | 1899 | * 'combine_over_ca' replacement |
1686 | * | 1900 | * |
1687 | * output: updated dest in d16 - blue, d17 - green, d18 - red | 1901 | * output: updated dest in v16 - blue, v17 - green, v18 - red |
1688 | */ | 1902 | */ |
1689 | vmvn.8 q12, q12 | 1903 | mvn v24.8b, v24.8b |
1690 | vshrn.u16 d16, q2, #2 | 1904 | mvn v25.8b, v25.8b |
1691 | vmvn.8 d26, d26 | 1905 | shrn v16.8b, v4.8h, #2 |
1692 | vmull.u8 q6, d16, d24 | 1906 | mvn v26.8b, v26.8b |
1693 | vmull.u8 q7, d17, d25 | 1907 | umull v5.8h, v16.8b, v24.8b |
1694 | vmull.u8 q11, d18, d26 | 1908 | umull v6.8h, v17.8b, v25.8b |
1909 | umull v7.8h, v18.8b, v26.8b | ||
1695 | .endm | 1910 | .endm |
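
The shrn/sli/sri group in the head is the r5g6b5 to planar 8-bit expansion: each field is widened by replicating its own top bits, so field minimum and maximum map exactly to 0x00 and 0xFF. A scalar C sketch of the bit manipulation, assuming the standard RGB565 layout:

    #include <stdint.h>

    static void unpack_0565(uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
    {
        uint8_t r5 = (uint8_t)((p >> 11) & 0x1F);
        uint8_t g6 = (uint8_t)((p >> 5)  & 0x3F);
        uint8_t b5 = (uint8_t)(p & 0x1F);
        *r = (uint8_t)((r5 << 3) | (r5 >> 2));  /* shrn #8 + sri #5 */
        *g = (uint8_t)((g6 << 2) | (g6 >> 4));  /* shrn #3 + sri #6 */
        *b = (uint8_t)((b5 << 3) | (b5 >> 2));  /* sli #5 + shrn #2 */
    }
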
1696 | 1911 | ||
1697 | .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail | 1912 | .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail |
1698 | /* ... continue 'combine_over_ca' replacement */ | 1913 | /* ... continue 'combine_over_ca' replacement */ |
1699 | vrshr.u16 q10, q6, #8 | 1914 | urshr v13.8h, v5.8h, #8 |
1700 | vrshr.u16 q14, q7, #8 | 1915 | urshr v14.8h, v6.8h, #8 |
1701 | vrshr.u16 q15, q11, #8 | 1916 | urshr v15.8h, v7.8h, #8 |
1702 | vraddhn.u16 d16, q10, q6 | 1917 | raddhn v16.8b, v13.8h, v5.8h |
1703 | vraddhn.u16 d17, q14, q7 | 1918 | raddhn v17.8b, v14.8h, v6.8h |
1704 | vraddhn.u16 d18, q15, q11 | 1919 | raddhn v18.8b, v15.8h, v7.8h |
1705 | vqadd.u8 q8, q0, q8 | 1920 | uqadd v16.8b, v0.8b, v16.8b |
1706 | vqadd.u8 d18, d2, d18 | 1921 | uqadd v17.8b, v1.8b, v17.8b |
1922 | uqadd v18.8b, v2.8b, v18.8b | ||
1707 | /* | 1923 | /* |
1708 | * convert the results in d16, d17, d18 to r5g6b5 and store | 1924 | * convert the results in v16, v17, v18 to r5g6b5 and store |
1709 | * them into {d28, d29} | 1925 | * them into {v14} |
1710 | */ | 1926 | */ |
1711 | vshll.u8 q14, d18, #8 | 1927 | ushll v14.8h, v18.8b, #7 |
1712 | vshll.u8 q10, d17, #8 | 1928 | sli v14.8h, v14.8h, #1 |
1713 | vshll.u8 q15, d16, #8 | 1929 | ushll v12.8h, v17.8b, #7 |
1714 | vsri.u16 q14, q10, #5 | 1930 | sli v12.8h, v12.8h, #1 |
1715 | vsri.u16 q14, q15, #11 | 1931 | ushll v13.8h, v16.8b, #7 |
1932 | sli v13.8h, v13.8h, #1 | ||
1933 | sri v14.8h, v12.8h, #5 | ||
1934 | sri v14.8h, v13.8h, #11 | ||
1935 | mov v28.d[0], v14.d[0] | ||
1936 | mov v29.d[0], v14.d[1] | ||
1716 | .endm | 1937 | .endm |
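
Packing back to r5g6b5 needs a left shift by a full element width, which A64 ushll cannot encode (its immediate tops out at 7); hence the recurring ushll #7 followed by a self sli #1, which together give the << 8 the old vshll.u8 #8 provided. A scalar sketch of the pack under the same RGB565 layout assumption:

    #include <stdint.h>

    static uint16_t pack_0565(uint8_t r, uint8_t g, uint8_t b)
    {
        uint16_t v = (uint16_t)(r << 8);                  /* ushll #7 + sli #1 */
        v = (uint16_t)((v & 0xF800) | ((g << 8) >> 5));   /* sri #5  */
        v = (uint16_t)((v & 0xFFE0) | ((b << 8) >> 11));  /* sri #11 */
        return v;
    }
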
1717 | 1938 | ||
1718 | .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head | 1939 | .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head |
1719 | fetch_mask_pixblock | 1940 | fetch_mask_pixblock |
1720 | vrshr.u16 q10, q6, #8 | 1941 | urshr v13.8h, v5.8h, #8 |
1721 | vrshr.u16 q14, q7, #8 | 1942 | urshr v14.8h, v6.8h, #8 |
1722 | vld1.16 {d4, d5}, [DST_R, :128]! | 1943 | ld1 {v4.8h}, [DST_R], #16 |
1723 | vrshr.u16 q15, q11, #8 | 1944 | urshr v15.8h, v7.8h, #8 |
1724 | vraddhn.u16 d16, q10, q6 | 1945 | raddhn v16.8b, v13.8h, v5.8h |
1725 | vraddhn.u16 d17, q14, q7 | 1946 | raddhn v17.8b, v14.8h, v6.8h |
1726 | vraddhn.u16 d22, q15, q11 | 1947 | raddhn v18.8b, v15.8h, v7.8h |
1948 | mov v5.d[0], v4.d[1] | ||
1727 | /* process_pixblock_head */ | 1949 | /* process_pixblock_head */ |
1728 | /* | 1950 | /* |
1729 | * 'combine_mask_ca' replacement | 1951 | * 'combine_mask_ca' replacement |
1730 | * | 1952 | * |
1731 | * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] | 1953 | * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A] |
1732 | * mask in {d24, d25, d26} [B, G, R] | 1954 | * mask in {v24, v25, v26} [B, G, R] |
1733 | * output: updated src in {d0, d1, d2 } [B, G, R] | 1955 | * output: updated src in {v0, v1, v2 } [B, G, R] |
1734 | * updated mask in {d24, d25, d26} [B, G, R] | 1956 | * updated mask in {v24, v25, v26} [B, G, R] |
1735 | */ | 1957 | */ |
1736 | vmull.u8 q6, d26, d10 | 1958 | uqadd v16.8b, v0.8b, v16.8b |
1737 | vqadd.u8 q8, q0, q8 | 1959 | uqadd v17.8b, v1.8b, v17.8b |
1738 | vmull.u8 q0, d24, d8 | 1960 | uqadd v18.8b, v2.8b, v18.8b |
1739 | vqadd.u8 d22, d2, d22 | 1961 | umull v0.8h, v24.8b, v8.8b |
1740 | vmull.u8 q1, d25, d9 | 1962 | umull v1.8h, v25.8b, v9.8b |
1963 | umull v2.8h, v26.8b, v10.8b | ||
1741 | /* | 1964 | /* |
1742 | * convert the result in d16, d17, d22 to r5g6b5 and store | 1965 | * convert the result in v16, v17, v18 to r5g6b5 and store |
1743 | * it into {d28, d29} | 1966 | * it into {v14} |
1744 | */ | 1967 | */ |
1745 | vshll.u8 q14, d22, #8 | 1968 | ushll v14.8h, v18.8b, #7 |
1746 | vshll.u8 q10, d17, #8 | 1969 | sli v14.8h, v14.8h, #1 |
1747 | vshll.u8 q15, d16, #8 | 1970 | ushll v18.8h, v16.8b, #7 |
1748 | vmull.u8 q9, d11, d25 | 1971 | sli v18.8h, v18.8h, #1 |
1749 | vsri.u16 q14, q10, #5 | 1972 | ushll v19.8h, v17.8b, #7 |
1750 | vmull.u8 q12, d11, d24 | 1973 | sli v19.8h, v19.8h, #1 |
1751 | vmull.u8 q13, d11, d26 | 1974 | umull v12.8h, v11.8b, v24.8b |
1752 | vsri.u16 q14, q15, #11 | 1975 | sri v14.8h, v19.8h, #5 |
1976 | umull v13.8h, v11.8b, v25.8b | ||
1977 | umull v15.8h, v11.8b, v26.8b | ||
1978 | sri v14.8h, v18.8h, #11 | ||
1979 | mov v28.d[0], v14.d[0] | ||
1980 | mov v29.d[0], v14.d[1] | ||
1753 | cache_preload 8, 8 | 1981 | cache_preload 8, 8 |
1754 | vrshr.u16 q8, q0, #8 | 1982 | urshr v16.8h, v0.8h, #8 |
1755 | vrshr.u16 q10, q1, #8 | 1983 | urshr v17.8h, v1.8h, #8 |
1756 | vrshr.u16 q11, q6, #8 | 1984 | urshr v18.8h, v2.8h, #8 |
1757 | vraddhn.u16 d0, q0, q8 | 1985 | raddhn v0.8b, v0.8h, v16.8h |
1758 | vraddhn.u16 d1, q1, q10 | 1986 | raddhn v1.8b, v1.8h, v17.8h |
1759 | vraddhn.u16 d2, q6, q11 | 1987 | raddhn v2.8b, v2.8h, v18.8h |
1760 | vrshr.u16 q11, q12, #8 | 1988 | urshr v19.8h, v12.8h, #8 |
1761 | vrshr.u16 q8, q9, #8 | 1989 | urshr v20.8h, v13.8h, #8 |
1762 | vrshr.u16 q6, q13, #8 | 1990 | urshr v21.8h, v15.8h, #8 |
1763 | vraddhn.u16 d24, q12, q11 | 1991 | raddhn v24.8b, v12.8h, v19.8h |
1764 | vraddhn.u16 d25, q9, q8 | 1992 | raddhn v25.8b, v13.8h, v20.8h |
1765 | /* | 1993 | /* |
1766 | * convert 8 r5g6b5 pixel data from {d4, d5} to planar | 1994 | * convert 8 r5g6b5 pixel data from {v4, v5} to planar |
1767 | * 8-bit format and put data into d16 - blue, d17 - green, | 1995 | * 8-bit format and put data into v16 - blue, v17 - green, |
1768 | * d18 - red | 1996 | * v18 - red |
1769 | */ | 1997 | */ |
1770 | vshrn.u16 d17, q2, #3 | 1998 | mov v4.d[1], v5.d[0] |
1771 | vshrn.u16 d18, q2, #8 | 1999 | shrn v17.8b, v4.8h, #3 |
1772 | vraddhn.u16 d26, q13, q6 | 2000 | shrn v18.8b, v4.8h, #8 |
1773 | vsli.u16 q2, q2, #5 | 2001 | raddhn v26.8b, v15.8h, v21.8h |
1774 | vsri.u8 d17, d17, #6 | 2002 | sli v4.8h, v4.8h, #5 |
1775 | vsri.u8 d18, d18, #5 | 2003 | sri v17.8b, v17.8b, #6 |
2004 | sri v18.8b, v18.8b, #5 | ||
1776 | /* | 2005 | /* |
1777 | * 'combine_over_ca' replacement | 2006 | * 'combine_over_ca' replacement |
1778 | * | 2007 | * |
1779 | * output: updated dest in d16 - blue, d17 - green, d18 - red | 2008 | * output: updated dest in v16 - blue, v17 - green, v18 - red |
1780 | */ | 2009 | */ |
1781 | vmvn.8 q12, q12 | 2010 | mvn v24.8b, v24.8b |
1782 | vshrn.u16 d16, q2, #2 | 2011 | mvn v25.8b, v25.8b |
1783 | vmvn.8 d26, d26 | 2012 | shrn v16.8b, v4.8h, #2 |
1784 | vmull.u8 q7, d17, d25 | 2013 | mvn v26.8b, v26.8b |
1785 | vmull.u8 q6, d16, d24 | 2014 | umull v5.8h, v16.8b, v24.8b |
1786 | vmull.u8 q11, d18, d26 | 2015 | umull v6.8h, v17.8b, v25.8b |
1787 | vst1.16 {d28, d29}, [DST_W, :128]! | 2016 | umull v7.8h, v18.8b, v26.8b |
2017 | st1 {v14.8h}, [DST_W], #16 | ||
1788 | .endm | 2018 | .endm |
1789 | 2019 | ||
1790 | .macro pixman_composite_over_n_8888_0565_ca_init | 2020 | .macro pixman_composite_over_n_8888_0565_ca_init |
1791 | add DUMMY, sp, #ARGS_STACK_OFFSET | 2021 | mov v13.s[0], w4 |
1792 | vpush {d8-d15} | 2022 | dup v8.8b, v13.b[0] |
1793 | vld1.32 {d11[0]}, [DUMMY] | 2023 | dup v9.8b, v13.b[1] |
1794 | vdup.8 d8, d11[0] | 2024 | dup v10.8b, v13.b[2] |
1795 | vdup.8 d9, d11[1] | 2025 | dup v11.8b, v13.b[3] |
1796 | vdup.8 d10, d11[2] | ||
1797 | vdup.8 d11, d11[3] | ||
1798 | .endm | 2026 | .endm |
1799 | 2027 | ||
1800 | .macro pixman_composite_over_n_8888_0565_ca_cleanup | 2028 | .macro pixman_composite_over_n_8888_0565_ca_cleanup |
1801 | vpop {d8-d15} | ||
1802 | .endm | 2029 | .endm |
1803 | 2030 | ||
1804 | generate_composite_function \ | 2031 | generate_composite_function \ |
1805 | pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \ | 2032 | pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \ |
1806 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 2033 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -1813,41 +2040,40 @@ generate_composite_function \ | |||
1813 | pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head | 2040 | pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head |
1814 | 2041 | ||
1815 | /******************************************************************************/ | 2042 | /******************************************************************************/ |
1816 | 2043 | ||
1817 | .macro pixman_composite_in_n_8_process_pixblock_head | 2044 | .macro pixman_composite_in_n_8_process_pixblock_head |
1818 | /* expecting source data in {d0, d1, d2, d3} */ | 2045 | /* expecting source data in {v0, v1, v2, v3} */ |
1819 | /* and destination data in {d4, d5, d6, d7} */ | 2046 | /* and destination data in {v4, v5, v6, v7} */ |
1820 | vmull.u8 q8, d4, d3 | 2047 | umull v8.8h, v4.8b, v3.8b |
1821 | vmull.u8 q9, d5, d3 | 2048 | umull v9.8h, v5.8b, v3.8b |
1822 | vmull.u8 q10, d6, d3 | 2049 | umull v10.8h, v6.8b, v3.8b |
1823 | vmull.u8 q11, d7, d3 | 2050 | umull v11.8h, v7.8b, v3.8b |
1824 | .endm | 2051 | .endm |
1825 | 2052 | ||
1826 | .macro pixman_composite_in_n_8_process_pixblock_tail | 2053 | .macro pixman_composite_in_n_8_process_pixblock_tail |
1827 | vrshr.u16 q14, q8, #8 | 2054 | urshr v14.8h, v8.8h, #8 |
1828 | vrshr.u16 q15, q9, #8 | 2055 | urshr v15.8h, v9.8h, #8 |
1829 | vrshr.u16 q12, q10, #8 | 2056 | urshr v12.8h, v10.8h, #8 |
1830 | vrshr.u16 q13, q11, #8 | 2057 | urshr v13.8h, v11.8h, #8 |
1831 | vraddhn.u16 d28, q8, q14 | 2058 | raddhn v28.8b, v8.8h, v14.8h |
1832 | vraddhn.u16 d29, q9, q15 | 2059 | raddhn v29.8b, v9.8h, v15.8h |
1833 | vraddhn.u16 d30, q10, q12 | 2060 | raddhn v30.8b, v10.8h, v12.8h |
1834 | vraddhn.u16 d31, q11, q13 | 2061 | raddhn v31.8b, v11.8h, v13.8h |
1835 | .endm | 2062 | .endm |
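
This block is the IN operator with a solid source: every destination byte is scaled by the source alpha and renormalized with the rounding trick shown earlier. One channel in scalar C:

    #include <stdint.h>

    static uint8_t in_n_8(uint8_t dst, uint8_t src_alpha)
    {
        uint16_t t = (uint16_t)(dst * src_alpha);   /* umull */
        return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);
    }
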
1836 | 2063 | ||
1837 | .macro pixman_composite_in_n_8_process_pixblock_tail_head | 2064 | .macro pixman_composite_in_n_8_process_pixblock_tail_head |
1838 | pixman_composite_in_n_8_process_pixblock_tail | 2065 | pixman_composite_in_n_8_process_pixblock_tail |
1839 | vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! | 2066 | ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
1840 | cache_preload 32, 32 | 2067 | cache_preload 32, 32 |
1841 | pixman_composite_in_n_8_process_pixblock_head | 2068 | pixman_composite_in_n_8_process_pixblock_head |
1842 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2069 | st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
1843 | .endm | 2070 | .endm |
1844 | 2071 | ||
1845 | .macro pixman_composite_in_n_8_init | 2072 | .macro pixman_composite_in_n_8_init |
1846 | add DUMMY, sp, #ARGS_STACK_OFFSET | 2073 | mov v3.s[0], w4 |
1847 | vld1.32 {d3[0]}, [DUMMY] | 2074 | dup v3.8b, v3.b[3] |
1848 | vdup.8 d3, d3[3] | ||
1849 | .endm | 2075 | .endm |
1850 | 2076 | ||
1851 | .macro pixman_composite_in_n_8_cleanup | 2077 | .macro pixman_composite_in_n_8_cleanup |
1852 | .endm | 2078 | .endm |
1853 | 2079 | ||
@@ -1865,52 +2091,51 @@ generate_composite_function \ | |||
1865 | 4, /* dst_r_basereg */ \ | 2091 | 4, /* dst_r_basereg */ \ |
1866 | 0, /* src_basereg */ \ | 2092 | 0, /* src_basereg */ \ |
1867 | 24 /* mask_basereg */ | 2093 | 24 /* mask_basereg */ |
1868 | 2094 | ||
1869 | .macro pixman_composite_add_n_8_8_process_pixblock_head | 2095 | .macro pixman_composite_add_n_8_8_process_pixblock_head |
1870 | /* expecting source data in {d8, d9, d10, d11} */ | 2096 | /* expecting source data in {v8, v9, v10, v11} */ |
1871 | /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ | 2097 | /* v8 - blue, v9 - green, v10 - red, v11 - alpha */ |
1872 | /* and destination data in {d4, d5, d6, d7} */ | 2098 | /* and destination data in {v4, v5, v6, v7} */ |
1873 | /* mask is in d24, d25, d26, d27 */ | 2099 | /* mask is in v24, v25, v26, v27 */ |
1874 | vmull.u8 q0, d24, d11 | 2100 | umull v0.8h, v24.8b, v11.8b |
1875 | vmull.u8 q1, d25, d11 | 2101 | umull v1.8h, v25.8b, v11.8b |
1876 | vmull.u8 q6, d26, d11 | 2102 | umull v2.8h, v26.8b, v11.8b |
1877 | vmull.u8 q7, d27, d11 | 2103 | umull v3.8h, v27.8b, v11.8b |
1878 | vrshr.u16 q10, q0, #8 | 2104 | urshr v12.8h, v0.8h, #8 |
1879 | vrshr.u16 q11, q1, #8 | 2105 | urshr v13.8h, v1.8h, #8 |
1880 | vrshr.u16 q12, q6, #8 | 2106 | urshr v14.8h, v2.8h, #8 |
1881 | vrshr.u16 q13, q7, #8 | 2107 | urshr v15.8h, v3.8h, #8 |
1882 | vraddhn.u16 d0, q0, q10 | 2108 | raddhn v0.8b, v0.8h, v12.8h |
1883 | vraddhn.u16 d1, q1, q11 | 2109 | raddhn v1.8b, v1.8h, v13.8h |
1884 | vraddhn.u16 d2, q6, q12 | 2110 | raddhn v2.8b, v2.8h, v14.8h |
1885 | vraddhn.u16 d3, q7, q13 | 2111 | raddhn v3.8b, v3.8h, v15.8h |
1886 | vqadd.u8 q14, q0, q2 | 2112 | uqadd v28.8b, v0.8b, v4.8b |
1887 | vqadd.u8 q15, q1, q3 | 2113 | uqadd v29.8b, v1.8b, v5.8b |
2114 | uqadd v30.8b, v2.8b, v6.8b | ||
2115 | uqadd v31.8b, v3.8b, v7.8b | ||
1888 | .endm | 2116 | .endm |
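
Here the mask byte is scaled by the solid source alpha and then added into the destination with unsigned saturation (the four uqadd at the end of the head). A scalar sketch of one byte:

    #include <stdint.h>

    static uint8_t add_n_8_8(uint8_t dst, uint8_t mask, uint8_t src_alpha)
    {
        uint16_t t = (uint16_t)(mask * src_alpha);
        uint16_t s = (uint16_t)((t + ((t + 128) >> 8) + 128) >> 8);
        uint16_t sum = (uint16_t)(s + dst);
        return sum > 255 ? 255 : (uint8_t)sum;      /* uqadd */
    }
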
1889 | 2117 | ||
1890 | .macro pixman_composite_add_n_8_8_process_pixblock_tail | 2118 | .macro pixman_composite_add_n_8_8_process_pixblock_tail |
1891 | .endm | 2119 | .endm |
1892 | 2120 | ||
1893 | /* TODO: expand macros and do better instructions scheduling */ | 2121 | /* TODO: expand macros and do better instructions scheduling */ |
1894 | .macro pixman_composite_add_n_8_8_process_pixblock_tail_head | 2122 | .macro pixman_composite_add_n_8_8_process_pixblock_tail_head |
1895 | pixman_composite_add_n_8_8_process_pixblock_tail | 2123 | pixman_composite_add_n_8_8_process_pixblock_tail |
1896 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2124 | st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
1897 | vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! | 2125 | ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
1898 | fetch_mask_pixblock | 2126 | fetch_mask_pixblock |
1899 | cache_preload 32, 32 | 2127 | cache_preload 32, 32 |
1900 | pixman_composite_add_n_8_8_process_pixblock_head | 2128 | pixman_composite_add_n_8_8_process_pixblock_head |
1901 | .endm | 2129 | .endm |
1902 | 2130 | ||
1903 | .macro pixman_composite_add_n_8_8_init | 2131 | .macro pixman_composite_add_n_8_8_init |
1904 | add DUMMY, sp, #ARGS_STACK_OFFSET | 2132 | mov v11.s[0], w4 |
1905 | vpush {d8-d15} | 2133 | dup v11.8b, v11.b[3] |
1906 | vld1.32 {d11[0]}, [DUMMY] | ||
1907 | vdup.8 d11, d11[3] | ||
1908 | .endm | 2134 | .endm |
1909 | 2135 | ||
1910 | .macro pixman_composite_add_n_8_8_cleanup | 2136 | .macro pixman_composite_add_n_8_8_cleanup |
1911 | vpop {d8-d15} | ||
1912 | .endm | 2137 | .endm |
1913 | 2138 | ||
1914 | generate_composite_function \ | 2139 | generate_composite_function \ |
1915 | pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ | 2140 | pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ |
1916 | FLAG_DST_READWRITE, \ | 2141 | FLAG_DST_READWRITE, \ |
@@ -1923,37 +2148,39 @@ generate_composite_function \ | |||
1923 | pixman_composite_add_n_8_8_process_pixblock_tail_head | 2148 | pixman_composite_add_n_8_8_process_pixblock_tail_head |
1924 | 2149 | ||
1925 | /******************************************************************************/ | 2150 | /******************************************************************************/ |
1926 | 2151 | ||
1927 | .macro pixman_composite_add_8_8_8_process_pixblock_head | 2152 | .macro pixman_composite_add_8_8_8_process_pixblock_head |
1928 | /* expecting source data in {d0, d1, d2, d3} */ | 2153 | /* expecting source data in {v0, v1, v2, v3} */ |
1929 | /* destination data in {d4, d5, d6, d7} */ | 2154 | /* destination data in {v4, v5, v6, v7} */ |
1930 | /* mask in {d24, d25, d26, d27} */ | 2155 | /* mask in {v24, v25, v26, v27} */ |
1931 | vmull.u8 q8, d24, d0 | 2156 | umull v8.8h, v24.8b, v0.8b |
1932 | vmull.u8 q9, d25, d1 | 2157 | umull v9.8h, v25.8b, v1.8b |
1933 | vmull.u8 q10, d26, d2 | 2158 | umull v10.8h, v26.8b, v2.8b |
1934 | vmull.u8 q11, d27, d3 | 2159 | umull v11.8h, v27.8b, v3.8b |
1935 | vrshr.u16 q0, q8, #8 | 2160 | urshr v0.8h, v8.8h, #8 |
1936 | vrshr.u16 q1, q9, #8 | 2161 | urshr v1.8h, v9.8h, #8 |
1937 | vrshr.u16 q12, q10, #8 | 2162 | urshr v12.8h, v10.8h, #8 |
1938 | vrshr.u16 q13, q11, #8 | 2163 | urshr v13.8h, v11.8h, #8 |
1939 | vraddhn.u16 d0, q0, q8 | 2164 | raddhn v0.8b, v0.8h, v8.8h |
1940 | vraddhn.u16 d1, q1, q9 | 2165 | raddhn v1.8b, v1.8h, v9.8h |
1941 | vraddhn.u16 d2, q12, q10 | 2166 | raddhn v2.8b, v12.8h, v10.8h |
1942 | vraddhn.u16 d3, q13, q11 | 2167 | raddhn v3.8b, v13.8h, v11.8h |
1943 | vqadd.u8 q14, q0, q2 | 2168 | uqadd v28.8b, v0.8b, v4.8b |
1944 | vqadd.u8 q15, q1, q3 | 2169 | uqadd v29.8b, v1.8b, v5.8b |
2170 | uqadd v30.8b, v2.8b, v6.8b | ||
2171 | uqadd v31.8b, v3.8b, v7.8b | ||
1945 | .endm | 2172 | .endm |
1946 | 2173 | ||
1947 | .macro pixman_composite_add_8_8_8_process_pixblock_tail | 2174 | .macro pixman_composite_add_8_8_8_process_pixblock_tail |
1948 | .endm | 2175 | .endm |
1949 | 2176 | ||
1950 | /* TODO: expand macros and do better instructions scheduling */ | 2177 | /* TODO: expand macros and do better instructions scheduling */ |
1951 | .macro pixman_composite_add_8_8_8_process_pixblock_tail_head | 2178 | .macro pixman_composite_add_8_8_8_process_pixblock_tail_head |
1952 | pixman_composite_add_8_8_8_process_pixblock_tail | 2179 | pixman_composite_add_8_8_8_process_pixblock_tail |
1953 | vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2180 | st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
1954 | vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! | 2181 | ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
1955 | fetch_mask_pixblock | 2182 | fetch_mask_pixblock |
1956 | fetch_src_pixblock | 2183 | fetch_src_pixblock |
1957 | cache_preload 32, 32 | 2184 | cache_preload 32, 32 |
1958 | pixman_composite_add_8_8_8_process_pixblock_head | 2185 | pixman_composite_add_8_8_8_process_pixblock_head |
1959 | .endm | 2186 | .endm |
@@ -1976,57 +2203,60 @@ generate_composite_function \ | |||
1976 | pixman_composite_add_8_8_8_process_pixblock_tail_head | 2203 | pixman_composite_add_8_8_8_process_pixblock_tail_head |
1977 | 2204 | ||
1978 | /******************************************************************************/ | 2205 | /******************************************************************************/ |
1979 | 2206 | ||
1980 | .macro pixman_composite_add_8888_8888_8888_process_pixblock_head | 2207 | .macro pixman_composite_add_8888_8888_8888_process_pixblock_head |
1981 | /* expecting source data in {d0, d1, d2, d3} */ | 2208 | /* expecting source data in {v0, v1, v2, v3} */ |
1982 | /* destination data in {d4, d5, d6, d7} */ | 2209 | /* destination data in {v4, v5, v6, v7} */ |
1983 | /* mask in {d24, d25, d26, d27} */ | 2210 | /* mask in {v24, v25, v26, v27} */ |
1984 | vmull.u8 q8, d27, d0 | 2211 | umull v8.8h, v27.8b, v0.8b |
1985 | vmull.u8 q9, d27, d1 | 2212 | umull v9.8h, v27.8b, v1.8b |
1986 | vmull.u8 q10, d27, d2 | 2213 | umull v10.8h, v27.8b, v2.8b |
1987 | vmull.u8 q11, d27, d3 | 2214 | umull v11.8h, v27.8b, v3.8b |
1988 | /* 1 cycle bubble */ | 2215 | /* 1 cycle bubble */ |
1989 | vrsra.u16 q8, q8, #8 | 2216 | ursra v8.8h, v8.8h, #8 |
1990 | vrsra.u16 q9, q9, #8 | 2217 | ursra v9.8h, v9.8h, #8 |
1991 | vrsra.u16 q10, q10, #8 | 2218 | ursra v10.8h, v10.8h, #8 |
1992 | vrsra.u16 q11, q11, #8 | 2219 | ursra v11.8h, v11.8h, #8 |
1993 | .endm | 2220 | .endm |
1994 | 2221 | ||
1995 | .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail | 2222 | .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail |
1996 | /* 2 cycle bubble */ | 2223 | /* 2 cycle bubble */ |
1997 | vrshrn.u16 d28, q8, #8 | 2224 | rshrn v28.8b, v8.8h, #8 |
1998 | vrshrn.u16 d29, q9, #8 | 2225 | rshrn v29.8b, v9.8h, #8 |
1999 | vrshrn.u16 d30, q10, #8 | 2226 | rshrn v30.8b, v10.8h, #8 |
2000 | vrshrn.u16 d31, q11, #8 | 2227 | rshrn v31.8b, v11.8h, #8 |
2001 | vqadd.u8 q14, q2, q14 | 2228 | uqadd v28.8b, v4.8b, v28.8b |
2002 | /* 1 cycle bubble */ | 2229 | uqadd v29.8b, v5.8b, v29.8b |
2003 | vqadd.u8 q15, q3, q15 | 2230 | uqadd v30.8b, v6.8b, v30.8b |
2231 | uqadd v31.8b, v7.8b, v31.8b | ||
2004 | .endm | 2232 | .endm |
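
This fast path keeps the original's ursra variant of the renormalization: the rounding correction is folded back into the product and the result narrowed with rshrn, instead of a separate urshr plus raddhn. Both forms are bit-exact equivalents; a scalar C check over the full 8x8-product range:

    #include <assert.h>
    #include <stdint.h>

    static uint8_t div255_ursra(uint16_t t)
    {
        t = (uint16_t)(t + ((t + 128) >> 8));  /* ursra v8.8h, v8.8h, #8 */
        return (uint8_t)((t + 128) >> 8);      /* rshrn v28.8b, v8.8h, #8 */
    }

    int main(void)
    {
        for (uint32_t t = 0; t <= 255u * 255u; t++)
            assert(div255_ursra((uint16_t)t) ==
                   (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8));
        return 0;
    }
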
2005 | 2233 | ||
2006 | .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head | 2234 | .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head |
2007 | fetch_src_pixblock | 2235 | fetch_src_pixblock |
2008 | vrshrn.u16 d28, q8, #8 | 2236 | rshrn v28.8b, v8.8h, #8 |
2009 | fetch_mask_pixblock | 2237 | fetch_mask_pixblock |
2010 | vrshrn.u16 d29, q9, #8 | 2238 | rshrn v29.8b, v9.8h, #8 |
2011 | vmull.u8 q8, d27, d0 | 2239 | umull v8.8h, v27.8b, v0.8b |
2012 | vrshrn.u16 d30, q10, #8 | 2240 | rshrn v30.8b, v10.8h, #8 |
2013 | vmull.u8 q9, d27, d1 | 2241 | umull v9.8h, v27.8b, v1.8b |
2014 | vrshrn.u16 d31, q11, #8 | 2242 | rshrn v31.8b, v11.8h, #8 |
2015 | vmull.u8 q10, d27, d2 | 2243 | umull v10.8h, v27.8b, v2.8b |
2016 | vqadd.u8 q14, q2, q14 | 2244 | umull v11.8h, v27.8b, v3.8b |
2017 | vmull.u8 q11, d27, d3 | 2245 | uqadd v28.8b, v4.8b, v28.8b |
2018 | vqadd.u8 q15, q3, q15 | 2246 | uqadd v29.8b, v5.8b, v29.8b |
2019 | vrsra.u16 q8, q8, #8 | 2247 | uqadd v30.8b, v6.8b, v30.8b |
2020 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 2248 | uqadd v31.8b, v7.8b, v31.8b |
2021 | vrsra.u16 q9, q9, #8 | 2249 | ursra v8.8h, v8.8h, #8 |
2022 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2250 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
2023 | vrsra.u16 q10, q10, #8 | 2251 | ursra v9.8h, v9.8h, #8 |
2252 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 | ||
2253 | ursra v10.8h, v10.8h, #8 | ||
2024 | 2254 | ||
2025 | cache_preload 8, 8 | 2255 | cache_preload 8, 8 |
2026 | 2256 | ||
2027 | vrsra.u16 q11, q11, #8 | 2257 | ursra v11.8h, v11.8h, #8 |
2028 | .endm | 2258 | .endm |
2029 | 2259 | ||
2030 | generate_composite_function \ | 2260 | generate_composite_function \ |
2031 | pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ | 2261 | pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ |
2032 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 2262 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2034,21 +2264,29 @@ generate_composite_function \ | |||
2034 | 10, /* prefetch distance */ \ | 2264 | 10, /* prefetch distance */ \ |
2035 | default_init, \ | 2265 | default_init, \ |
2036 | default_cleanup, \ | 2266 | default_cleanup, \ |
2037 | pixman_composite_add_8888_8888_8888_process_pixblock_head, \ | 2267 | pixman_composite_add_8888_8888_8888_process_pixblock_head, \ |
2038 | pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ | 2268 | pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ |
2039 | pixman_composite_add_8888_8888_8888_process_pixblock_tail_head | 2269 | pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ |
2270 | 28, /* dst_w_basereg */ \ | ||
2271 | 4, /* dst_r_basereg */ \ | ||
2272 | 0, /* src_basereg */ \ | ||
2273 | 24 /* mask_basereg */ | ||
2040 | 2274 | ||
2041 | generate_composite_function_single_scanline \ | 2275 | generate_composite_function_single_scanline \ |
2042 | pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ | 2276 | pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ |
2043 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 2277 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
2044 | 8, /* number of pixels, processed in a single block */ \ | 2278 | 8, /* number of pixels, processed in a single block */ \ |
2045 | default_init, \ | 2279 | default_init, \ |
2046 | default_cleanup, \ | 2280 | default_cleanup, \ |
2047 | pixman_composite_add_8888_8888_8888_process_pixblock_head, \ | 2281 | pixman_composite_add_8888_8888_8888_process_pixblock_head, \ |
2048 | pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ | 2282 | pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ |
2049 | pixman_composite_add_8888_8888_8888_process_pixblock_tail_head | 2283 | pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ |
2284 | 28, /* dst_w_basereg */ \ | ||
2285 | 4, /* dst_r_basereg */ \ | ||
2286 | 0, /* src_basereg */ \ | ||
2287 | 24 /* mask_basereg */ | ||
2050 | 2288 | ||
2051 | /******************************************************************************/ | 2289 | /******************************************************************************/ |
2052 | 2290 | ||
2053 | generate_composite_function \ | 2291 | generate_composite_function \ |
2054 | pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ | 2292 | pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ |
@@ -2066,16 +2304,15 @@ generate_composite_function \ | |||
2066 | 27 /* mask_basereg */ | 2304 | 27 /* mask_basereg */ |
2067 | 2305 | ||
2068 | /******************************************************************************/ | 2306 | /******************************************************************************/ |
2069 | 2307 | ||
2070 | .macro pixman_composite_add_n_8_8888_init | 2308 | .macro pixman_composite_add_n_8_8888_init |
2071 | add DUMMY, sp, #ARGS_STACK_OFFSET | 2309 | mov v3.s[0], w4 |
2072 | vld1.32 {d3[0]}, [DUMMY] | 2310 | dup v0.8b, v3.b[0] |
2073 | vdup.8 d0, d3[0] | 2311 | dup v1.8b, v3.b[1] |
2074 | vdup.8 d1, d3[1] | 2312 | dup v2.8b, v3.b[2] |
2075 | vdup.8 d2, d3[2] | 2313 | dup v3.8b, v3.b[3] |
2076 | vdup.8 d3, d3[3] | ||
2077 | .endm | 2314 | .endm |
2078 | 2315 | ||
2079 | .macro pixman_composite_add_n_8_8888_cleanup | 2316 | .macro pixman_composite_add_n_8_8888_cleanup |
2080 | .endm | 2317 | .endm |
2081 | 2318 | ||
@@ -2095,13 +2332,12 @@ generate_composite_function \ | |||
2095 | 27 /* mask_basereg */ | 2332 | 27 /* mask_basereg */ |
2096 | 2333 | ||
2097 | /******************************************************************************/ | 2334 | /******************************************************************************/ |
2098 | 2335 | ||
2099 | .macro pixman_composite_add_8888_n_8888_init | 2336 | .macro pixman_composite_add_8888_n_8888_init |
2100 | add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) | 2337 | mov v27.s[0], w6 |
2101 | vld1.32 {d27[0]}, [DUMMY] | 2338 | dup v27.8b, v27.b[3] |
2102 | vdup.8 d27, d27[3] | ||
2103 | .endm | 2339 | .endm |
2104 | 2340 | ||
2105 | .macro pixman_composite_add_8888_n_8888_cleanup | 2341 | .macro pixman_composite_add_8888_n_8888_cleanup |
2106 | .endm | 2342 | .endm |
2107 | 2343 | ||
@@ -2121,55 +2357,55 @@ generate_composite_function \ | |||
2121 | 27 /* mask_basereg */ | 2357 | 27 /* mask_basereg */ |
2122 | 2358 | ||
2123 | /******************************************************************************/ | 2359 | /******************************************************************************/ |
2124 | 2360 | ||
2125 | .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head | 2361 | .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head |
2126 | /* expecting source data in {d0, d1, d2, d3} */ | 2362 | /* expecting source data in {v0, v1, v2, v3} */ |
2127 | /* destination data in {d4, d5, d6, d7} */ | 2363 | /* destination data in {v4, v5, v6, v7} */ |
2128 | /* solid mask is in d15 */ | 2364 | /* solid mask is in v15 */ |
2129 | 2365 | ||
2130 | /* 'in' */ | 2366 | /* 'in' */ |
2131 | vmull.u8 q8, d15, d3 | 2367 | umull v11.8h, v15.8b, v3.8b |
2132 | vmull.u8 q6, d15, d2 | 2368 | umull v10.8h, v15.8b, v2.8b |
2133 | vmull.u8 q5, d15, d1 | 2369 | umull v9.8h, v15.8b, v1.8b |
2134 | vmull.u8 q4, d15, d0 | 2370 | umull v8.8h, v15.8b, v0.8b |
2135 | vrshr.u16 q13, q8, #8 | 2371 | urshr v16.8h, v11.8h, #8 |
2136 | vrshr.u16 q12, q6, #8 | 2372 | urshr v14.8h, v10.8h, #8 |
2137 | vrshr.u16 q11, q5, #8 | 2373 | urshr v13.8h, v9.8h, #8 |
2138 | vrshr.u16 q10, q4, #8 | 2374 | urshr v12.8h, v8.8h, #8 |
2139 | vraddhn.u16 d3, q8, q13 | 2375 | raddhn v3.8b, v11.8h, v16.8h |
2140 | vraddhn.u16 d2, q6, q12 | 2376 | raddhn v2.8b, v10.8h, v14.8h |
2141 | vraddhn.u16 d1, q5, q11 | 2377 | raddhn v1.8b, v9.8h, v13.8h |
2142 | vraddhn.u16 d0, q4, q10 | 2378 | raddhn v0.8b, v8.8h, v12.8h |
2143 | vmvn.8 d24, d3 /* get inverted alpha */ | 2379 | mvn v24.8b, v3.8b /* get inverted alpha */ |
2144 | /* now do alpha blending */ | 2380 | /* now do alpha blending */ |
2145 | vmull.u8 q8, d24, d4 | 2381 | umull v8.8h, v24.8b, v4.8b |
2146 | vmull.u8 q9, d24, d5 | 2382 | umull v9.8h, v24.8b, v5.8b |
2147 | vmull.u8 q10, d24, d6 | 2383 | umull v10.8h, v24.8b, v6.8b |
2148 | vmull.u8 q11, d24, d7 | 2384 | umull v11.8h, v24.8b, v7.8b |
2149 | .endm | 2385 | .endm |
2150 | 2386 | ||
2151 | .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail | 2387 | .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail |
2152 | vrshr.u16 q14, q8, #8 | 2388 | urshr v16.8h, v8.8h, #8 |
2153 | vrshr.u16 q15, q9, #8 | 2389 | urshr v17.8h, v9.8h, #8 |
2154 | vrshr.u16 q12, q10, #8 | 2390 | urshr v18.8h, v10.8h, #8 |
2155 | vrshr.u16 q13, q11, #8 | 2391 | urshr v19.8h, v11.8h, #8 |
2156 | vraddhn.u16 d28, q14, q8 | 2392 | raddhn v28.8b, v16.8h, v8.8h |
2157 | vraddhn.u16 d29, q15, q9 | 2393 | raddhn v29.8b, v17.8h, v9.8h |
2158 | vraddhn.u16 d30, q12, q10 | 2394 | raddhn v30.8b, v18.8h, v10.8h |
2159 | vraddhn.u16 d31, q13, q11 | 2395 | raddhn v31.8b, v19.8h, v11.8h |
2160 | .endm | 2396 | .endm |
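
The head/tail pair above implements OUT_REVERSE: the source alpha is first scaled by the solid mask (the 'in' step), inverted with mvn, and the destination attenuated by that inverse. One channel in scalar C:

    #include <stdint.h>

    static uint8_t div255(uint16_t t)
    {
        return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);
    }

    static uint8_t out_reverse(uint8_t dst, uint8_t src_alpha, uint8_t mask)
    {
        uint8_t a  = div255((uint16_t)(src_alpha * mask)); /* 'in'  */
        uint8_t na = (uint8_t)~a;                          /* mvn   */
        return div255((uint16_t)(dst * na));
    }
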
2161 | 2397 | ||
2162 | /* TODO: expand macros and do better instructions scheduling */ | 2398 | /* TODO: expand macros and do better instructions scheduling */ |
2163 | .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head | 2399 | .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head |
2164 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 2400 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
2165 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail | 2401 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail |
2166 | fetch_src_pixblock | 2402 | fetch_src_pixblock |
2167 | cache_preload 8, 8 | 2403 | cache_preload 8, 8 |
2168 | fetch_mask_pixblock | 2404 | fetch_mask_pixblock |
2169 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_head | 2405 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_head |
2170 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2406 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
2171 | .endm | 2407 | .endm |
2172 | 2408 | ||
2173 | generate_composite_function_single_scanline \ | 2409 | generate_composite_function_single_scanline \ |
2174 | pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ | 2410 | pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ |
2175 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 2411 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2190,33 +2426,32 @@ generate_composite_function_single_scanline \ | |||
2190 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_head | 2426 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_head |
2191 | .endm | 2427 | .endm |
2192 | 2428 | ||
2193 | .macro pixman_composite_over_8888_n_8888_process_pixblock_tail | 2429 | .macro pixman_composite_over_8888_n_8888_process_pixblock_tail |
2194 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail | 2430 | pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail |
2195 | vqadd.u8 q14, q0, q14 | 2431 | uqadd v28.8b, v0.8b, v28.8b |
2196 | vqadd.u8 q15, q1, q15 | 2432 | uqadd v29.8b, v1.8b, v29.8b |
2433 | uqadd v30.8b, v2.8b, v30.8b | ||
2434 | uqadd v31.8b, v3.8b, v31.8b | ||
2197 | .endm | 2435 | .endm |
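
Adding the four uqadd turns the shared OUT_REVERSE head into OVER: dst' = src + dst * (255 - alpha(src)) / 255, saturating per channel. A scalar sketch of one channel, assuming the source has already been scaled by the mask as the head does:

    #include <stdint.h>

    static uint8_t div255(uint16_t t)
    {
        return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);
    }

    static uint8_t over(uint8_t src, uint8_t src_alpha, uint8_t dst)
    {
        uint16_t blended = div255((uint16_t)(dst * (uint8_t)~src_alpha));
        uint16_t sum = (uint16_t)(src + blended);
        return sum > 255 ? 255 : (uint8_t)sum;   /* uqadd */
    }
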
2198 | 2436 | ||
2199 | /* TODO: expand macros and do better instructions scheduling */ | 2437 | /* TODO: expand macros and do better instructions scheduling */ |
2200 | .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head | 2438 | .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head |
2201 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 2439 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
2202 | pixman_composite_over_8888_n_8888_process_pixblock_tail | 2440 | pixman_composite_over_8888_n_8888_process_pixblock_tail |
2203 | fetch_src_pixblock | 2441 | fetch_src_pixblock |
2204 | cache_preload 8, 8 | 2442 | cache_preload 8, 8 |
2205 | pixman_composite_over_8888_n_8888_process_pixblock_head | 2443 | pixman_composite_over_8888_n_8888_process_pixblock_head |
2206 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2444 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
2207 | .endm | 2445 | .endm |
2208 | 2446 | ||
2209 | .macro pixman_composite_over_8888_n_8888_init | 2447 | .macro pixman_composite_over_8888_n_8888_init |
2210 | add DUMMY, sp, #48 | 2448 | mov v15.s[0], w6 |
2211 | vpush {d8-d15} | 2449 | dup v15.8b, v15.b[3] |
2212 | vld1.32 {d15[0]}, [DUMMY] | ||
2213 | vdup.8 d15, d15[3] | ||
2214 | .endm | 2450 | .endm |
2215 | 2451 | ||
2216 | .macro pixman_composite_over_8888_n_8888_cleanup | 2452 | .macro pixman_composite_over_8888_n_8888_cleanup |
2217 | vpop {d8-d15} | ||
2218 | .endm | 2453 | .endm |
2219 | 2454 | ||
2220 | generate_composite_function \ | 2455 | generate_composite_function \ |
2221 | pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ | 2456 | pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ |
2222 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 2457 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2224,23 +2459,27 @@ generate_composite_function \ | |||
2224 | 5, /* prefetch distance */ \ | 2459 | 5, /* prefetch distance */ \ |
2225 | pixman_composite_over_8888_n_8888_init, \ | 2460 | pixman_composite_over_8888_n_8888_init, \ |
2226 | pixman_composite_over_8888_n_8888_cleanup, \ | 2461 | pixman_composite_over_8888_n_8888_cleanup, \ |
2227 | pixman_composite_over_8888_n_8888_process_pixblock_head, \ | 2462 | pixman_composite_over_8888_n_8888_process_pixblock_head, \ |
2228 | pixman_composite_over_8888_n_8888_process_pixblock_tail, \ | 2463 | pixman_composite_over_8888_n_8888_process_pixblock_tail, \ |
2229 | pixman_composite_over_8888_n_8888_process_pixblock_tail_head | 2464 | pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \ |
2465 | 28, /* dst_w_basereg */ \ | ||
2466 | 4, /* dst_r_basereg */ \ | ||
2467 | 0, /* src_basereg */ \ | ||
2468 | 12 /* mask_basereg */ | ||
2230 | 2469 | ||
2231 | /******************************************************************************/ | 2470 | /******************************************************************************/ |
2232 | 2471 | ||
2233 | /* TODO: expand macros and do better instructions scheduling */ | 2472 | /* TODO: expand macros and do better instructions scheduling */ |
2234 | .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head | 2473 | .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head |
2235 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 2474 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
2236 | pixman_composite_over_8888_n_8888_process_pixblock_tail | 2475 | pixman_composite_over_8888_n_8888_process_pixblock_tail |
2237 | fetch_src_pixblock | 2476 | fetch_src_pixblock |
2238 | cache_preload 8, 8 | 2477 | cache_preload 8, 8 |
2239 | fetch_mask_pixblock | 2478 | fetch_mask_pixblock |
2240 | pixman_composite_over_8888_n_8888_process_pixblock_head | 2479 | pixman_composite_over_8888_n_8888_process_pixblock_head |
2241 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2480 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
2242 | .endm | 2481 | .endm |
2243 | 2482 | ||
2244 | generate_composite_function \ | 2483 | generate_composite_function \ |
2245 | pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ | 2484 | pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ |
2246 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 2485 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2272,17 +2511,17 @@ generate_composite_function_single_scanline \ | |||
2272 | 2511 | ||
2273 | /******************************************************************************/ | 2512 | /******************************************************************************/ |
2274 | 2513 | ||
2275 | /* TODO: expand macros and do better instructions scheduling */ | 2514 | /* TODO: expand macros and do better instructions scheduling */ |
2276 | .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head | 2515 | .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head |
2277 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 2516 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
2278 | pixman_composite_over_8888_n_8888_process_pixblock_tail | 2517 | pixman_composite_over_8888_n_8888_process_pixblock_tail |
2279 | fetch_src_pixblock | 2518 | fetch_src_pixblock |
2280 | cache_preload 8, 8 | 2519 | cache_preload 8, 8 |
2281 | fetch_mask_pixblock | 2520 | fetch_mask_pixblock |
2282 | pixman_composite_over_8888_n_8888_process_pixblock_head | 2521 | pixman_composite_over_8888_n_8888_process_pixblock_head |
2283 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2522 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
2284 | .endm | 2523 | .endm |
2285 | 2524 | ||
2286 | generate_composite_function \ | 2525 | generate_composite_function \ |
2287 | pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ | 2526 | pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ |
2288 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 2527 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2305,11 +2544,11 @@ generate_composite_function \ | |||
2305 | 2544 | ||
2306 | .macro pixman_composite_src_0888_0888_process_pixblock_tail | 2545 | .macro pixman_composite_src_0888_0888_process_pixblock_tail |
2307 | .endm | 2546 | .endm |
2308 | 2547 | ||
2309 | .macro pixman_composite_src_0888_0888_process_pixblock_tail_head | 2548 | .macro pixman_composite_src_0888_0888_process_pixblock_tail_head |
2310 | vst3.8 {d0, d1, d2}, [DST_W]! | 2549 | st3 {v0.8b, v1.8b, v2.8b}, [DST_W], #24 |
2311 | fetch_src_pixblock | 2550 | fetch_src_pixblock |
2312 | cache_preload 8, 8 | 2551 | cache_preload 8, 8 |
2313 | .endm | 2552 | .endm |
2314 | 2553 | ||
2315 | generate_composite_function \ | 2554 | generate_composite_function \ |
@@ -2328,25 +2567,29 @@ generate_composite_function \ | |||
2328 | 0 /* mask_basereg */ | 2567 | 0 /* mask_basereg */ |
2329 | 2568 | ||
2330 | /******************************************************************************/ | 2569 | /******************************************************************************/ |
2331 | 2570 | ||
2332 | .macro pixman_composite_src_0888_8888_rev_process_pixblock_head | 2571 | .macro pixman_composite_src_0888_8888_rev_process_pixblock_head |
2333 | vswp d0, d2 | 2572 | mov v31.8b, v2.8b |
2573 | mov v2.8b, v0.8b | ||
2574 | mov v0.8b, v31.8b | ||
2334 | .endm | 2575 | .endm |
2335 | 2576 | ||
2336 | .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail | 2577 | .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail |
2337 | .endm | 2578 | .endm |
2338 | 2579 | ||
2339 | .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head | 2580 | .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head |
2340 | vst4.8 {d0, d1, d2, d3}, [DST_W]! | 2581 | st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32 |
2341 | fetch_src_pixblock | 2582 | fetch_src_pixblock |
2342 | vswp d0, d2 | 2583 | mov v31.8b, v2.8b |
2584 | mov v2.8b, v0.8b | ||
2585 | mov v0.8b, v31.8b | ||
2343 | cache_preload 8, 8 | 2586 | cache_preload 8, 8 |
2344 | .endm | 2587 | .endm |
2345 | 2588 | ||
2346 | .macro pixman_composite_src_0888_8888_rev_init | 2589 | .macro pixman_composite_src_0888_8888_rev_init |
2347 | veor d3, d3, d3 | 2590 | eor v3.8b, v3.8b, v3.8b |
2348 | .endm | 2591 | .endm |
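
AArch64 drops vswp, so the blue/red plane exchange above becomes three mov instructions through a scratch register (v31). The same exchange in scalar C:

    #include <stddef.h>
    #include <stdint.h>

    static void swap_planes(uint8_t v0[8], uint8_t v2[8])
    {
        for (size_t i = 0; i < 8; i++) {
            uint8_t tmp = v2[i];   /* mov v31.8b, v2.8b  */
            v2[i] = v0[i];         /* mov v2.8b,  v0.8b  */
            v0[i] = tmp;           /* mov v0.8b,  v31.8b */
        }
    }
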
2349 | 2592 | ||
2350 | generate_composite_function \ | 2593 | generate_composite_function \ |
2351 | pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ | 2594 | pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ |
2352 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ | 2595 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2363,28 +2606,38 @@ generate_composite_function \ | |||
2363 | 0 /* mask_basereg */ | 2606 | 0 /* mask_basereg */ |
2364 | 2607 | ||
2365 | /******************************************************************************/ | 2608 | /******************************************************************************/ |
2366 | 2609 | ||
2367 | .macro pixman_composite_src_0888_0565_rev_process_pixblock_head | 2610 | .macro pixman_composite_src_0888_0565_rev_process_pixblock_head |
2368 | vshll.u8 q8, d1, #8 | 2611 | ushll v8.8h, v1.8b, #7 |
2369 | vshll.u8 q9, d2, #8 | 2612 | sli v8.8h, v8.8h, #1 |
2613 | ushll v9.8h, v2.8b, #7 | ||
2614 | sli v9.8h, v9.8h, #1 | ||
2370 | .endm | 2615 | .endm |
2371 | 2616 | ||
2372 | .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail | 2617 | .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail |
2373 | vshll.u8 q14, d0, #8 | 2618 | ushll v14.8h, v0.8b, #7 |
2374 | vsri.u16 q14, q8, #5 | 2619 | sli v14.8h, v14.8h, #1 |
2375 | vsri.u16 q14, q9, #11 | 2620 | sri v14.8h, v8.8h, #5 |
2621 | sri v14.8h, v9.8h, #11 | ||
2622 | mov v28.d[0], v14.d[0] | ||
2623 | mov v29.d[0], v14.d[1] | ||
2376 | .endm | 2624 | .endm |
2377 | 2625 | ||
2378 | .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head | 2626 | .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head |
2379 | vshll.u8 q14, d0, #8 | 2627 | ushll v14.8h, v0.8b, #7 |
2628 | sli v14.8h, v14.8h, #1 | ||
2380 | fetch_src_pixblock | 2629 | fetch_src_pixblock |
2381 | vsri.u16 q14, q8, #5 | 2630 | sri v14.8h, v8.8h, #5 |
2382 | vsri.u16 q14, q9, #11 | 2631 | sri v14.8h, v9.8h, #11 |
2383 | vshll.u8 q8, d1, #8 | 2632 | mov v28.d[0], v14.d[0] |
2384 | vst1.16 {d28, d29}, [DST_W, :128]! | 2633 | mov v29.d[0], v14.d[1] |
2385 | vshll.u8 q9, d2, #8 | 2634 | ushll v8.8h, v1.8b, #7 |
2635 | sli v8.8h, v8.8h, #1 | ||
2636 | st1 {v14.8h}, [DST_W], #16 | ||
2637 | ushll v9.8h, v2.8b, #7 | ||
2638 | sli v9.8h, v9.8h, #1 | ||
2386 | .endm | 2639 | .endm |
2387 | 2640 | ||
2388 | generate_composite_function \ | 2641 | generate_composite_function \ |
2389 | pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ | 2642 | pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ |
2390 | FLAG_DST_WRITEONLY, \ | 2643 | FLAG_DST_WRITEONLY, \ |
@@ -2401,47 +2654,59 @@ generate_composite_function \ | |||
2401 | 0 /* mask_basereg */ | 2654 | 0 /* mask_basereg */ |
2402 | 2655 | ||
2403 | /******************************************************************************/ | 2656 | /******************************************************************************/ |
2404 | 2657 | ||
2405 | .macro pixman_composite_src_pixbuf_8888_process_pixblock_head | 2658 | .macro pixman_composite_src_pixbuf_8888_process_pixblock_head |
2406 | vmull.u8 q8, d3, d0 | 2659 | umull v8.8h, v3.8b, v0.8b |
2407 | vmull.u8 q9, d3, d1 | 2660 | umull v9.8h, v3.8b, v1.8b |
2408 | vmull.u8 q10, d3, d2 | 2661 | umull v10.8h, v3.8b, v2.8b |
2409 | .endm | 2662 | .endm |
2410 | 2663 | ||
2411 | .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail | 2664 | .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail |
2412 | vrshr.u16 q11, q8, #8 | 2665 | urshr v11.8h, v8.8h, #8 |
2413 | vswp d3, d31 | 2666 | mov v30.8b, v31.8b |
2414 | vrshr.u16 q12, q9, #8 | 2667 | mov v31.8b, v3.8b |
2415 | vrshr.u16 q13, q10, #8 | 2668 | mov v3.8b, v30.8b |
2416 | vraddhn.u16 d30, q11, q8 | 2669 | urshr v12.8h, v9.8h, #8 |
2417 | vraddhn.u16 d29, q12, q9 | 2670 | urshr v13.8h, v10.8h, #8 |
2418 | vraddhn.u16 d28, q13, q10 | 2671 | raddhn v30.8b, v11.8h, v8.8h |
2672 | raddhn v29.8b, v12.8h, v9.8h | ||
2673 | raddhn v28.8b, v13.8h, v10.8h | ||
2419 | .endm | 2674 | .endm |
2420 | 2675 | ||
2421 | .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head | 2676 | .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head |
2422 | vrshr.u16 q11, q8, #8 | 2677 | urshr v11.8h, v8.8h, #8 |
2423 | vswp d3, d31 | 2678 | mov v30.8b, v31.8b |
2424 | vrshr.u16 q12, q9, #8 | 2679 | mov v31.8b, v3.8b |
2425 | vrshr.u16 q13, q10, #8 | 2680 | mov v3.8b, v30.8b
2681 | urshr v12.8h, v9.8h, #8 | ||
2682 | urshr v13.8h, v10.8h, #8 | ||
2426 | fetch_src_pixblock | 2683 | fetch_src_pixblock |
2427 | vraddhn.u16 d30, q11, q8 | 2684 | raddhn v30.8b, v11.8h, v8.8h |
2428 | PF add PF_X, PF_X, #8 | 2685 | PF add PF_X, PF_X, #8 |
2429 | PF tst PF_CTL, #0xF | 2686 | PF tst PF_CTL, #0xF |
2430 | PF addne PF_X, PF_X, #8 | 2687 | PF beq 10f |
2431 | PF subne PF_CTL, PF_CTL, #1 | 2688 | PF add PF_X, PF_X, #8 |
2432 | vraddhn.u16 d29, q12, q9 | 2689 | PF sub PF_CTL, PF_CTL, #1 |
2433 | vraddhn.u16 d28, q13, q10 | 2690 | 10: |
2434 | vmull.u8 q8, d3, d0 | 2691 | raddhn v29.8b, v12.8h, v9.8h |
2435 | vmull.u8 q9, d3, d1 | 2692 | raddhn v28.8b, v13.8h, v10.8h |
2436 | vmull.u8 q10, d3, d2 | 2693 | umull v8.8h, v3.8b, v0.8b |
2437 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2694 | umull v9.8h, v3.8b, v1.8b |
2695 | umull v10.8h, v3.8b, v2.8b | ||
2696 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 | ||
2438 | PF cmp PF_X, ORIG_W | 2697 | PF cmp PF_X, ORIG_W |
2439 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] | 2698 | PF lsl DUMMY, PF_X, #src_bpp_shift
2440 | PF subge PF_X, PF_X, ORIG_W | 2699 | PF prfm pldl2strm, [PF_SRC, DUMMY] |
2441 | PF subges PF_CTL, PF_CTL, #0x10 | 2700 | PF ble 10f |
2442 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! | 2701 | PF sub PF_X, PF_X, ORIG_W |
2702 | PF subs PF_CTL, PF_CTL, #0x10 | ||
2703 | PF ble 10f | ||
2704 | PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift | ||
2705 | PF ldrsb DUMMY, [PF_SRC, DUMMY] | ||
2706 | PF add PF_SRC, PF_SRC, #1 | ||
2707 | 10: | ||
2443 | .endm | 2708 | .endm |
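
The PF blocks are the other systematic rewrite in this file: ARM32 conditional execution (addne/subne, subges, ldrgeb with writeback) has no A64 equivalent, so each predicated group becomes a compare plus a short forward branch to a local 10: label, and the pld addressing mode becomes an explicit lsl into DUMMY followed by prfm. A rough scalar model of the bookkeeping; the names mirror the macro arguments, and the flow is inferred from the branches above rather than taken from the patch:

    #include <stdint.h>

    struct pf_state {
        uint32_t pf_x;    /* prefetch x position in pixels */
        int32_t  pf_ctl;  /* low nibble: extra-advance counter,
                             high bits: remaining scanlines */
    };

    static void advance_prefetch(struct pf_state *s, uint32_t orig_w)
    {
        s->pf_x += 8;               /* one pixel block per call */
        if (s->pf_ctl & 0xF) {      /* tst + beq 10f            */
            s->pf_x += 8;           /* occasionally run ahead   */
            s->pf_ctl -= 1;
        }
        /* prfm pldl2strm issued here against PF_SRC + PF_X      */
        if (s->pf_x > orig_w) {     /* cmp + ble 10f            */
            s->pf_x -= orig_w;      /* wrap to the next line    */
            s->pf_ctl -= 0x10;      /* subs; ble 10f stops when
                                       no scanlines are left    */
        }
    }
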
2444 | 2709 | ||
2445 | generate_composite_function \ | 2710 | generate_composite_function \ |
2446 | pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ | 2711 | pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ |
2447 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ | 2712 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2458,47 +2723,59 @@ generate_composite_function \ | |||
2458 | 0 /* mask_basereg */ | 2723 | 0 /* mask_basereg */ |
2459 | 2724 | ||
2460 | /******************************************************************************/ | 2725 | /******************************************************************************/ |
2461 | 2726 | ||
2462 | .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head | 2727 | .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head |
2463 | vmull.u8 q8, d3, d0 | 2728 | umull v8.8h, v3.8b, v0.8b |
2464 | vmull.u8 q9, d3, d1 | 2729 | umull v9.8h, v3.8b, v1.8b |
2465 | vmull.u8 q10, d3, d2 | 2730 | umull v10.8h, v3.8b, v2.8b |
2466 | .endm | 2731 | .endm |
2467 | 2732 | ||
2468 | .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail | 2733 | .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail |
2469 | vrshr.u16 q11, q8, #8 | 2734 | urshr v11.8h, v8.8h, #8 |
2470 | vswp d3, d31 | 2735 | mov v30.8b, v31.8b |
2471 | vrshr.u16 q12, q9, #8 | 2736 | mov v31.8b, v3.8b |
2472 | vrshr.u16 q13, q10, #8 | 2737 | mov v3.8b, v30.8b |
2473 | vraddhn.u16 d28, q11, q8 | 2738 | urshr v12.8h, v9.8h, #8 |
2474 | vraddhn.u16 d29, q12, q9 | 2739 | urshr v13.8h, v10.8h, #8 |
2475 | vraddhn.u16 d30, q13, q10 | 2740 | raddhn v28.8b, v11.8h, v8.8h |
2741 | raddhn v29.8b, v12.8h, v9.8h | ||
2742 | raddhn v30.8b, v13.8h, v10.8h | ||
2476 | .endm | 2743 | .endm |
2477 | 2744 | ||
2478 | .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head | 2745 | .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head |
2479 | vrshr.u16 q11, q8, #8 | 2746 | urshr v11.8h, v8.8h, #8 |
2480 | vswp d3, d31 | 2747 | mov v30.8b, v31.8b |
2481 | vrshr.u16 q12, q9, #8 | 2748 | mov v31.8b, v3.8b |
2482 | vrshr.u16 q13, q10, #8 | 2749 | mov v3.8b, v30.8b |
2750 | urshr v12.8h, v9.8h, #8 | ||
2751 | urshr v13.8h, v10.8h, #8 | ||
2483 | fetch_src_pixblock | 2752 | fetch_src_pixblock |
2484 | vraddhn.u16 d28, q11, q8 | 2753 | raddhn v28.8b, v11.8h, v8.8h |
2485 | PF add PF_X, PF_X, #8 | 2754 | PF add PF_X, PF_X, #8 |
2486 | PF tst PF_CTL, #0xF | 2755 | PF tst PF_CTL, #0xF |
2487 | PF addne PF_X, PF_X, #8 | 2756 | PF beq 10f |
2488 | PF subne PF_CTL, PF_CTL, #1 | 2757 | PF add PF_X, PF_X, #8 |
2489 | vraddhn.u16 d29, q12, q9 | 2758 | PF sub PF_CTL, PF_CTL, #1 |
2490 | vraddhn.u16 d30, q13, q10 | 2759 | 10: |
2491 | vmull.u8 q8, d3, d0 | 2760 | raddhn v29.8b, v12.8h, v9.8h |
2492 | vmull.u8 q9, d3, d1 | 2761 | raddhn v30.8b, v13.8h, v10.8h |
2493 | vmull.u8 q10, d3, d2 | 2762 | umull v8.8h, v3.8b, v0.8b |
2494 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 2763 | umull v9.8h, v3.8b, v1.8b |
2764 | umull v10.8h, v3.8b, v2.8b | ||
2765 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 | ||
2495 | PF cmp PF_X, ORIG_W | 2766 | PF cmp PF_X, ORIG_W |
2496 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] | 2767 | PF lsl DUMMY, PF_X, #src_bpp_shift
2497 | PF subge PF_X, PF_X, ORIG_W | 2768 | PF prfm pldl2strm, [PF_SRC, DUMMY] |
2498 | PF subges PF_CTL, PF_CTL, #0x10 | 2769 | PF ble 10f |
2499 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! | 2770 | PF sub PF_X, PF_X, ORIG_W |
2771 | PF subs PF_CTL, PF_CTL, #0x10 | ||
2772 | PF ble 10f | ||
2773 | PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift | ||
2774 | PF ldrsb DUMMY, [PF_SRC, DUMMY] | ||
2775 | PF add PF_SRC, PF_SRC, #1 | ||
2776 | 10: | ||
2500 | .endm | 2777 | .endm |
2501 | 2778 | ||
2502 | generate_composite_function \ | 2779 | generate_composite_function \ |
2503 | pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ | 2780 | pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ |
2504 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ | 2781 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2515,52 +2792,59 @@ generate_composite_function \ | |||
2515 | 0 /* mask_basereg */ | 2792 | 0 /* mask_basereg */ |
2516 | 2793 | ||
2517 | /******************************************************************************/ | 2794 | /******************************************************************************/ |
2518 | 2795 | ||
2519 | .macro pixman_composite_over_0565_8_0565_process_pixblock_head | 2796 | .macro pixman_composite_over_0565_8_0565_process_pixblock_head |
2520 | /* mask is in d15 */ | 2797 | /* mask is in v15 */ |
2521 | convert_0565_to_x888 q4, d2, d1, d0 | 2798 | mov v4.d[0], v8.d[0] |
2522 | convert_0565_to_x888 q5, d6, d5, d4 | 2799 | mov v4.d[1], v9.d[0] |
2523 | /* source pixel data is in {d0, d1, d2, XX} */ | 2800 | mov v13.d[0], v10.d[0] |
2524 | /* destination pixel data is in {d4, d5, d6, XX} */ | 2801 | mov v13.d[1], v11.d[0] |
2525 | vmvn.8 d7, d15 | 2802 | convert_0565_to_x888 v4, v2, v1, v0 |
2526 | vmull.u8 q6, d15, d2 | 2803 | convert_0565_to_x888 v13, v6, v5, v4 |
2527 | vmull.u8 q5, d15, d1 | 2804 | /* source pixel data is in {v0, v1, v2, XX} */ |
2528 | vmull.u8 q4, d15, d0 | 2805 | /* destination pixel data is in {v4, v5, v6, XX} */ |
2529 | vmull.u8 q8, d7, d4 | 2806 | mvn v7.8b, v15.8b |
2530 | vmull.u8 q9, d7, d5 | 2807 | umull v10.8h, v15.8b, v2.8b |
2531 | vmull.u8 q13, d7, d6 | 2808 | umull v9.8h, v15.8b, v1.8b |
2532 | vrshr.u16 q12, q6, #8 | 2809 | umull v8.8h, v15.8b, v0.8b |
2533 | vrshr.u16 q11, q5, #8 | 2810 | umull v11.8h, v7.8b, v4.8b |
2534 | vrshr.u16 q10, q4, #8 | 2811 | umull v12.8h, v7.8b, v5.8b |
2535 | vraddhn.u16 d2, q6, q12 | 2812 | umull v13.8h, v7.8b, v6.8b |
2536 | vraddhn.u16 d1, q5, q11 | 2813 | urshr v19.8h, v10.8h, #8 |
2537 | vraddhn.u16 d0, q4, q10 | 2814 | urshr v18.8h, v9.8h, #8 |
2815 | urshr v17.8h, v8.8h, #8 | ||
2816 | raddhn v2.8b, v10.8h, v19.8h | ||
2817 | raddhn v1.8b, v9.8h, v18.8h | ||
2818 | raddhn v0.8b, v8.8h, v17.8h | ||
2538 | .endm | 2819 | .endm |
2539 | 2820 | ||
2540 | .macro pixman_composite_over_0565_8_0565_process_pixblock_tail | 2821 | .macro pixman_composite_over_0565_8_0565_process_pixblock_tail |
2541 | vrshr.u16 q14, q8, #8 | 2822 | urshr v17.8h, v11.8h, #8 |
2542 | vrshr.u16 q15, q9, #8 | 2823 | urshr v18.8h, v12.8h, #8 |
2543 | vrshr.u16 q12, q13, #8 | 2824 | urshr v19.8h, v13.8h, #8 |
2544 | vraddhn.u16 d28, q14, q8 | 2825 | raddhn v28.8b, v17.8h, v11.8h |
2545 | vraddhn.u16 d29, q15, q9 | 2826 | raddhn v29.8b, v18.8h, v12.8h |
2546 | vraddhn.u16 d30, q12, q13 | 2827 | raddhn v30.8b, v19.8h, v13.8h |
2547 | vqadd.u8 q0, q0, q14 | 2828 | uqadd v0.8b, v0.8b, v28.8b |
2548 | vqadd.u8 q1, q1, q15 | 2829 | uqadd v1.8b, v1.8b, v29.8b |
2549 | /* 32bpp result is in {d0, d1, d2, XX} */ | 2830 | uqadd v2.8b, v2.8b, v30.8b |
2550 | convert_8888_to_0565 d2, d1, d0, q14, q15, q3 | 2831 | /* 32bpp result is in {v0, v1, v2, XX} */ |
2832 | convert_8888_to_0565 v2, v1, v0, v14, v30, v13 | ||
2833 | mov v28.d[0], v14.d[0] | ||
2834 | mov v29.d[0], v14.d[1] | ||
2551 | .endm | 2835 | .endm |
2552 | 2836 | ||
2553 | /* TODO: expand macros and do better instructions scheduling */ | 2837 | /* TODO: expand macros and do better instructions scheduling */ |
2554 | .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head | 2838 | .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head |
2555 | fetch_mask_pixblock | 2839 | fetch_mask_pixblock |
2556 | pixman_composite_over_0565_8_0565_process_pixblock_tail | 2840 | pixman_composite_over_0565_8_0565_process_pixblock_tail |
2557 | fetch_src_pixblock | 2841 | fetch_src_pixblock |
2558 | vld1.16 {d10, d11}, [DST_R, :128]! | 2842 | ld1 {v10.4h, v11.4h}, [DST_R], #16 |
2559 | cache_preload 8, 8 | 2843 | cache_preload 8, 8 |
2560 | pixman_composite_over_0565_8_0565_process_pixblock_head | 2844 | pixman_composite_over_0565_8_0565_process_pixblock_head |
2561 | vst1.16 {d28, d29}, [DST_W, :128]! | 2845 | st1 {v14.8h}, [DST_W], #16 |
2562 | .endm | 2846 | .endm |
2563 | 2847 | ||
2564 | generate_composite_function \ | 2848 | generate_composite_function \ |
2565 | pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ | 2849 | pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ |
2566 | FLAG_DST_READWRITE, \ | 2850 | FLAG_DST_READWRITE, \ |
@@ -2577,18 +2861,15 @@ generate_composite_function \ | |||
2577 | 15 /* mask_basereg */ | 2861 | 15 /* mask_basereg */ |
2578 | 2862 | ||
2579 | /******************************************************************************/ | 2863 | /******************************************************************************/ |
2580 | 2864 | ||
2581 | .macro pixman_composite_over_0565_n_0565_init | 2865 | .macro pixman_composite_over_0565_n_0565_init |
2582 | add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) | 2866 | mov v15.s[0], w6 |
2583 | vpush {d8-d15} | 2867 | dup v15.8b, v15.b[3] |
2584 | vld1.32 {d15[0]}, [DUMMY] | ||
2585 | vdup.8 d15, d15[3] | ||
2586 | .endm | 2868 | .endm |
2587 | 2869 | ||
2588 | .macro pixman_composite_over_0565_n_0565_cleanup | 2870 | .macro pixman_composite_over_0565_n_0565_cleanup |
2589 | vpop {d8-d15} | ||
2590 | .endm | 2871 | .endm |
2591 | 2872 | ||
2592 | generate_composite_function \ | 2873 | generate_composite_function \ |
2593 | pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ | 2874 | pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ |
2594 | FLAG_DST_READWRITE, \ | 2875 | FLAG_DST_READWRITE, \ |
@@ -2598,49 +2879,56 @@ generate_composite_function \ | |||
2598 | pixman_composite_over_0565_n_0565_cleanup, \ | 2879 | pixman_composite_over_0565_n_0565_cleanup, \ |
2599 | pixman_composite_over_0565_8_0565_process_pixblock_head, \ | 2880 | pixman_composite_over_0565_8_0565_process_pixblock_head, \ |
2600 | pixman_composite_over_0565_8_0565_process_pixblock_tail, \ | 2881 | pixman_composite_over_0565_8_0565_process_pixblock_tail, \ |
2601 | pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ | 2882 | pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ |
2602 | 28, /* dst_w_basereg */ \ | 2883 | 28, /* dst_w_basereg */ \ |
2603 | 10, /* dst_r_basereg */ \ | 2884 | 10, /* dst_r_basereg */ \ |
2604 | 8, /* src_basereg */ \ | 2885 | 8, /* src_basereg */ \ |
2605 | 15 /* mask_basereg */ | 2886 | 15 /* mask_basereg */ |
2606 | 2887 | ||
2607 | /******************************************************************************/ | 2888 | /******************************************************************************/ |
2608 | 2889 | ||
2609 | .macro pixman_composite_add_0565_8_0565_process_pixblock_head | 2890 | .macro pixman_composite_add_0565_8_0565_process_pixblock_head |
2610 | /* mask is in d15 */ | 2891 | /* mask is in v15 */ |
2611 | convert_0565_to_x888 q4, d2, d1, d0 | 2892 | mov v4.d[0], v8.d[0] |
2612 | convert_0565_to_x888 q5, d6, d5, d4 | 2893 | mov v4.d[1], v9.d[0] |
2613 | /* source pixel data is in {d0, d1, d2, XX} */ | 2894 | mov v13.d[0], v10.d[0] |
2614 | /* destination pixel data is in {d4, d5, d6, XX} */ | 2895 | mov v13.d[1], v11.d[0] |
2615 | vmull.u8 q6, d15, d2 | 2896 | convert_0565_to_x888 v4, v2, v1, v0 |
2616 | vmull.u8 q5, d15, d1 | 2897 | convert_0565_to_x888 v13, v6, v5, v4 |
2617 | vmull.u8 q4, d15, d0 | 2898 | /* source pixel data is in {v0, v1, v2, XX} */ |
2618 | vrshr.u16 q12, q6, #8 | 2899 | /* destination pixel data is in {v4, v5, v6, XX} */ |
2619 | vrshr.u16 q11, q5, #8 | 2900 | umull v9.8h, v15.8b, v2.8b |
2620 | vrshr.u16 q10, q4, #8 | 2901 | umull v8.8h, v15.8b, v1.8b |
2621 | vraddhn.u16 d2, q6, q12 | 2902 | umull v7.8h, v15.8b, v0.8b |
2622 | vraddhn.u16 d1, q5, q11 | 2903 | urshr v12.8h, v9.8h, #8 |
2623 | vraddhn.u16 d0, q4, q10 | 2904 | urshr v11.8h, v8.8h, #8 |
2905 | urshr v10.8h, v7.8h, #8 | ||
2906 | raddhn v2.8b, v9.8h, v12.8h | ||
2907 | raddhn v1.8b, v8.8h, v11.8h | ||
2908 | raddhn v0.8b, v7.8h, v10.8h | ||
2624 | .endm | 2909 | .endm |
2625 | 2910 | ||
2626 | .macro pixman_composite_add_0565_8_0565_process_pixblock_tail | 2911 | .macro pixman_composite_add_0565_8_0565_process_pixblock_tail |
2627 | vqadd.u8 q0, q0, q2 | 2912 | uqadd v0.8b, v0.8b, v4.8b |
2628 | vqadd.u8 q1, q1, q3 | 2913 | uqadd v1.8b, v1.8b, v5.8b |
2629 | /* 32bpp result is in {d0, d1, d2, XX} */ | 2914 | uqadd v2.8b, v2.8b, v6.8b |
2630 | convert_8888_to_0565 d2, d1, d0, q14, q15, q3 | 2915 | /* 32bpp result is in {v0, v1, v2, XX} */ |
2916 | convert_8888_to_0565 v2, v1, v0, v14, v30, v13 | ||
2917 | mov v28.d[0], v14.d[0] | ||
2918 | mov v29.d[0], v14.d[1] | ||
2631 | .endm | 2919 | .endm |
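Once the mask has been multiplied in, the ADD operator is just a per-channel saturating add; uqadd clamps at 255 instead of wrapping. One lane in C (sketch):

    #include <stdint.h>

    /* uqadd v0.8b, v0.8b, v4.8b -- one lane */
    static inline uint8_t uqadd8 (uint8_t a, uint8_t b)
    {
        unsigned s = (unsigned) a + b;
        return s > 255 ? 255 : (uint8_t) s;
    }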
2632 | 2920 | ||
2633 | /* TODO: expand macros and do better instructions scheduling */ | 2921 | /* TODO: expand macros and do better instructions scheduling */ |
2634 | .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head | 2922 | .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head |
2635 | fetch_mask_pixblock | 2923 | fetch_mask_pixblock |
2636 | pixman_composite_add_0565_8_0565_process_pixblock_tail | 2924 | pixman_composite_add_0565_8_0565_process_pixblock_tail |
2637 | fetch_src_pixblock | 2925 | fetch_src_pixblock |
2638 | vld1.16 {d10, d11}, [DST_R, :128]! | 2926 | ld1 {v10.4h, v11.4h}, [DST_R], #16 |
2639 | cache_preload 8, 8 | 2927 | cache_preload 8, 8 |
2640 | pixman_composite_add_0565_8_0565_process_pixblock_head | 2928 | pixman_composite_add_0565_8_0565_process_pixblock_head |
2641 | vst1.16 {d28, d29}, [DST_W, :128]! | 2929 | st1 {v14.8h}, [DST_W], #16 |
2642 | .endm | 2930 | .endm |
2643 | 2931 | ||
2644 | generate_composite_function \ | 2932 | generate_composite_function \ |
2645 | pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ | 2933 | pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ |
2646 | FLAG_DST_READWRITE, \ | 2934 | FLAG_DST_READWRITE, \ |
@@ -2657,39 +2945,43 @@ generate_composite_function \ | |||
2657 | 15 /* mask_basereg */ | 2945 | 15 /* mask_basereg */ |
2658 | 2946 | ||
2659 | /******************************************************************************/ | 2947 | /******************************************************************************/ |
2660 | 2948 | ||
2661 | .macro pixman_composite_out_reverse_8_0565_process_pixblock_head | 2949 | .macro pixman_composite_out_reverse_8_0565_process_pixblock_head |
2662 | /* mask is in d15 */ | 2950 | /* mask is in v15 */ |
2663 | convert_0565_to_x888 q5, d6, d5, d4 | 2951 | mov v12.d[0], v10.d[0] |
2664 | /* destination pixel data is in {d4, d5, d6, xx} */ | 2952 | mov v12.d[1], v11.d[0] |
2665 | vmvn.8 d24, d15 /* get inverted alpha */ | 2953 | convert_0565_to_x888 v12, v6, v5, v4 |
2954 | /* destination pixel data is in {v4, v5, v6, xx} */ | ||
2955 | mvn v24.8b, v15.8b /* get inverted alpha */ | ||
2666 | /* now do alpha blending */ | 2956 | /* now do alpha blending */ |
2667 | vmull.u8 q8, d24, d4 | 2957 | umull v8.8h, v24.8b, v4.8b |
2668 | vmull.u8 q9, d24, d5 | 2958 | umull v9.8h, v24.8b, v5.8b |
2669 | vmull.u8 q10, d24, d6 | 2959 | umull v10.8h, v24.8b, v6.8b |
2670 | .endm | 2960 | .endm |
2671 | 2961 | ||
2672 | .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail | 2962 | .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail |
2673 | vrshr.u16 q14, q8, #8 | 2963 | urshr v11.8h, v8.8h, #8 |
2674 | vrshr.u16 q15, q9, #8 | 2964 | urshr v12.8h, v9.8h, #8 |
2675 | vrshr.u16 q12, q10, #8 | 2965 | urshr v13.8h, v10.8h, #8 |
2676 | vraddhn.u16 d0, q14, q8 | 2966 | raddhn v0.8b, v11.8h, v8.8h |
2677 | vraddhn.u16 d1, q15, q9 | 2967 | raddhn v1.8b, v12.8h, v9.8h |
2678 | vraddhn.u16 d2, q12, q10 | 2968 | raddhn v2.8b, v13.8h, v10.8h |
2679 | /* 32bpp result is in {d0, d1, d2, XX} */ | 2969 | /* 32bpp result is in {v0, v1, v2, XX} */ |
2680 | convert_8888_to_0565 d2, d1, d0, q14, q15, q3 | 2970 | convert_8888_to_0565 v2, v1, v0, v14, v12, v3 |
2971 | mov v28.d[0], v14.d[0] | ||
2972 | mov v29.d[0], v14.d[1] | ||
2681 | .endm | 2973 | .endm |
2682 | 2974 | ||
2683 | /* TODO: expand macros and do better instructions scheduling */ | 2975 | /* TODO: expand macros and do better instructions scheduling */ |
2684 | .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head | 2976 | .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head |
2685 | fetch_src_pixblock | 2977 | fetch_src_pixblock |
2686 | pixman_composite_out_reverse_8_0565_process_pixblock_tail | 2978 | pixman_composite_out_reverse_8_0565_process_pixblock_tail |
2687 | vld1.16 {d10, d11}, [DST_R, :128]! | 2979 | ld1 {v10.4h, v11.4h}, [DST_R], #16 |
2688 | cache_preload 8, 8 | 2980 | cache_preload 8, 8 |
2689 | pixman_composite_out_reverse_8_0565_process_pixblock_head | 2981 | pixman_composite_out_reverse_8_0565_process_pixblock_head |
2690 | vst1.16 {d28, d29}, [DST_W, :128]! | 2982 | st1 {v14.8h}, [DST_W], #16 |
2691 | .endm | 2983 | .endm |
2692 | 2984 | ||
2693 | generate_composite_function \ | 2985 | generate_composite_function \ |
2694 | pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ | 2986 | pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ |
2695 | FLAG_DST_READWRITE, \ | 2987 | FLAG_DST_READWRITE, \ |
@@ -2699,47 +2991,47 @@ generate_composite_function \ | |||
2699 | default_cleanup_need_all_regs, \ | 2991 | default_cleanup_need_all_regs, \ |
2700 | pixman_composite_out_reverse_8_0565_process_pixblock_head, \ | 2992 | pixman_composite_out_reverse_8_0565_process_pixblock_head, \ |
2701 | pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ | 2993 | pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ |
2702 | pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ | 2994 | pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ |
2703 | 28, /* dst_w_basereg */ \ | 2995 | 28, /* dst_w_basereg */ \ |
2704 | 10, /* dst_r_basereg */ \ | 2996 | 10, /* dst_r_basereg */ \ |
2705 | 15, /* src_basereg */ \ | 2997 | 15, /* src_basereg */ \ |
2706 | 0 /* mask_basereg */ | 2998 | 0 /* mask_basereg */ |
2707 | 2999 | ||
2708 | /******************************************************************************/ | 3000 | /******************************************************************************/ |
2709 | 3001 | ||
2710 | .macro pixman_composite_out_reverse_8_8888_process_pixblock_head | 3002 | .macro pixman_composite_out_reverse_8_8888_process_pixblock_head |
2711 | /* src is in d0 */ | 3003 | /* src is in v0 */ |
2712 | /* destination pixel data is in {d4, d5, d6, d7} */ | 3004 | /* destination pixel data is in {v4, v5, v6, v7} */ |
2713 | vmvn.8 d1, d0 /* get inverted alpha */ | 3005 | mvn v1.8b, v0.8b /* get inverted alpha */ |
2714 | /* now do alpha blending */ | 3006 | /* now do alpha blending */ |
2715 | vmull.u8 q8, d1, d4 | 3007 | umull v8.8h, v1.8b, v4.8b |
2716 | vmull.u8 q9, d1, d5 | 3008 | umull v9.8h, v1.8b, v5.8b |
2717 | vmull.u8 q10, d1, d6 | 3009 | umull v10.8h, v1.8b, v6.8b |
2718 | vmull.u8 q11, d1, d7 | 3010 | umull v11.8h, v1.8b, v7.8b |
2719 | .endm | 3011 | .endm |
2720 | 3012 | ||
2721 | .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail | 3013 | .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail |
2722 | vrshr.u16 q14, q8, #8 | 3014 | urshr v14.8h, v8.8h, #8 |
2723 | vrshr.u16 q15, q9, #8 | 3015 | urshr v15.8h, v9.8h, #8 |
2724 | vrshr.u16 q12, q10, #8 | 3016 | urshr v12.8h, v10.8h, #8 |
2725 | vrshr.u16 q13, q11, #8 | 3017 | urshr v13.8h, v11.8h, #8 |
2726 | vraddhn.u16 d28, q14, q8 | 3018 | raddhn v28.8b, v14.8h, v8.8h |
2727 | vraddhn.u16 d29, q15, q9 | 3019 | raddhn v29.8b, v15.8h, v9.8h |
2728 | vraddhn.u16 d30, q12, q10 | 3020 | raddhn v30.8b, v12.8h, v10.8h |
2729 | vraddhn.u16 d31, q13, q11 | 3021 | raddhn v31.8b, v13.8h, v11.8h |
2730 | /* 32bpp result is in {d28, d29, d30, d31} */ | 3022 | /* 32bpp result is in {v28, v29, v30, v31} */ |
2731 | .endm | 3023 | .endm |
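The whole out_reverse operation reduces to dst = dst * (255 - src) / 255 on every destination byte. A scalar reference for the 8_8888 variant (the function name is ours; this is a model of the vector code, not pixman's C fallback):

    #include <stdint.h>

    static void out_reverse_8_8888_ref (uint32_t *dst, const uint8_t *src, int w)
    {
        for (int i = 0; i < w; i++) {
            uint8_t  ia = (uint8_t) (255 - src[i]);        /* mvn v1.8b, v0.8b */
            uint8_t *p  = (uint8_t *) &dst[i];
            for (int c = 0; c < 4; c++) {
                uint16_t x = (uint16_t) (p[c] * ia);       /* umull    */
                uint16_t t = (uint16_t) ((x + 128) >> 8);  /* urshr #8 */
                p[c] = (uint8_t) ((x + t + 128) >> 8);     /* raddhn   */
            }
        }
    }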
2732 | 3024 | ||
2733 | /* TODO: expand macros and do better instructions scheduling */ | 3025 | /* TODO: expand macros and do better instructions scheduling */ |
2734 | .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head | 3026 | .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head |
2735 | fetch_src_pixblock | 3027 | fetch_src_pixblock |
2736 | pixman_composite_out_reverse_8_8888_process_pixblock_tail | 3028 | pixman_composite_out_reverse_8_8888_process_pixblock_tail |
2737 | vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! | 3029 | ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 |
2738 | cache_preload 8, 8 | 3030 | cache_preload 8, 8 |
2739 | pixman_composite_out_reverse_8_8888_process_pixblock_head | 3031 | pixman_composite_out_reverse_8_8888_process_pixblock_head |
2740 | vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! | 3032 | st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 |
2741 | .endm | 3033 | .endm |
2742 | 3034 | ||
2743 | generate_composite_function \ | 3035 | generate_composite_function \ |
2744 | pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \ | 3036 | pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \ |
2745 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 3037 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2752,11 +3044,11 @@ generate_composite_function \ | |||
2752 | pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \ | 3044 | pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \ |
2753 | 28, /* dst_w_basereg */ \ | 3045 | 28, /* dst_w_basereg */ \ |
2754 | 4, /* dst_r_basereg */ \ | 3046 | 4, /* dst_r_basereg */ \ |
2755 | 0, /* src_basereg */ \ | 3047 | 0, /* src_basereg */ \ |
2756 | 0 /* mask_basereg */ | 3048 | 0 /* mask_basereg */ |
2757 | 3049 | ||
2758 | /******************************************************************************/ | 3050 | /******************************************************************************/ |
2759 | 3051 | ||
2760 | generate_composite_function_nearest_scanline \ | 3052 | generate_composite_function_nearest_scanline \ |
2761 | pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ | 3053 | pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ |
2762 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ | 3054 | FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ |
@@ -2787,12 +3079,12 @@ generate_composite_function_nearest_scanline \ | |||
2787 | 8, /* number of pixels, processed in a single block */ \ | 3079 | 8, /* number of pixels, processed in a single block */ \ |
2788 | default_init, \ | 3080 | default_init, \ |
2789 | default_cleanup, \ | 3081 | default_cleanup, \ |
2790 | pixman_composite_src_8888_0565_process_pixblock_head, \ | 3082 | pixman_composite_src_8888_0565_process_pixblock_head, \ |
2791 | pixman_composite_src_8888_0565_process_pixblock_tail, \ | 3083 | pixman_composite_src_8888_0565_process_pixblock_tail, \ |
2792 | pixman_composite_src_8888_0565_process_pixblock_tail_head | 3084 | pixman_composite_src_8888_0565_process_pixblock_tail_head |
2793 | 3085 | ||
2794 | generate_composite_function_nearest_scanline \ | 3086 | generate_composite_function_nearest_scanline \ |
2795 | pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ | 3087 | pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ |
2796 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ | 3088 | FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ |
2797 | 8, /* number of pixels, processed in a single block */ \ | 3089 | 8, /* number of pixels, processed in a single block */ \ |
2798 | default_init, \ | 3090 | default_init, \ |
@@ -2838,33 +3130,35 @@ generate_composite_function_nearest_scanline \ | |||
2838 | */ | 3130 | */ |
2839 | 3131 | ||
2840 | .macro bilinear_load_8888 reg1, reg2, tmp | 3132 | .macro bilinear_load_8888 reg1, reg2, tmp |
2841 | asr TMP1, X, #16 | 3133 | asr TMP1, X, #16 |
2842 | add X, X, UX | 3134 | add X, X, UX |
2843 | add TMP1, TOP, TMP1, lsl #2 | 3135 | lsl TMP2, TMP1, #2 |
2844 | vld1.32 {reg1}, [TMP1], STRIDE | 3136 | add TMP1, TOP, TMP2 |
2845 | vld1.32 {reg2}, [TMP1] | 3137 | ld1 {&reg1&.2s}, [TMP1], STRIDE |
3138 | ld1 {&reg2&.2s}, [TMP1] | ||
2846 | .endm | 3139 | .endm |
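X is a 16.16 fixed-point source coordinate and UX the per-output-pixel step; STRIDE was set to BOTTOM - TOP earlier, so the post-indexed first ld1 reads the top row and the second reads the same position one scanline down. Each load grabs the two horizontal neighbours needed for the blend. Scalar sketch:

    #include <stdint.h>

    static void bilinear_load_8888_ref (const uint32_t *top, const uint32_t *bottom,
                                        uint32_t t[2], uint32_t b[2],
                                        int32_t *x /* 16.16 */, int32_t ux)
    {
        int32_t i = *x >> 16;                     /* asr TMP1, X, #16 */
        *x += ux;                                 /* add X, X, UX     */
        t[0] = top[i];     t[1] = top[i + 1];     /* ld1 {..2s}, [TMP1], STRIDE */
        b[0] = bottom[i];  b[1] = bottom[i + 1];  /* ld1 {..2s}, [TMP1]         */
    }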
2847 | 3140 | ||
2848 | .macro bilinear_load_0565 reg1, reg2, tmp | 3141 | .macro bilinear_load_0565 reg1, reg2, tmp |
2849 | asr TMP1, X, #16 | 3142 | asr TMP1, X, #16 |
2850 | add X, X, UX | 3143 | add X, X, UX |
2851 | add TMP1, TOP, TMP1, lsl #1 | 3144 | lsl TMP2, TMP1, #1 |
2852 | vld1.32 {reg2[0]}, [TMP1], STRIDE | 3145 | add TMP1, TOP, TMP2 |
2853 | vld1.32 {reg2[1]}, [TMP1] | 3146 | ld1 {&reg2&.s}[0], [TMP1], STRIDE |
3147 | ld1 {&reg2&.s}[1], [TMP1] | ||
2854 | convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp | 3148 | convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp |
2855 | .endm | 3149 | .endm |
2856 | 3150 | ||
2857 | .macro bilinear_load_and_vertical_interpolate_two_8888 \ | 3151 | .macro bilinear_load_and_vertical_interpolate_two_8888 \ |
2858 | acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 | 3152 | acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 |
2859 | 3153 | ||
2860 | bilinear_load_8888 reg1, reg2, tmp1 | 3154 | bilinear_load_8888 reg1, reg2, tmp1 |
2861 | vmull.u8 acc1, reg1, d28 | 3155 | umull &acc1&.8h, &reg1&.8b, v28.8b |
2862 | vmlal.u8 acc1, reg2, d29 | 3156 | umlal &acc1&.8h, &reg2&.8b, v29.8b |
2863 | bilinear_load_8888 reg3, reg4, tmp2 | 3157 | bilinear_load_8888 reg3, reg4, tmp2 |
2864 | vmull.u8 acc2, reg3, d28 | 3158 | umull &acc2&.8h, &reg3&.8b, v28.8b |
2865 | vmlal.u8 acc2, reg4, d29 | 3159 | umlal &acc2&.8h, &reg4&.8b, v29.8b |
2866 | .endm | 3160 | .endm |
2867 | 3161 | ||
2868 | .macro bilinear_load_and_vertical_interpolate_four_8888 \ | 3162 | .macro bilinear_load_and_vertical_interpolate_four_8888 \ |
2869 | xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ | 3163 | xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ |
2870 | yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi | 3164 | yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi |
@@ -2873,170 +3167,190 @@ generate_composite_function_nearest_scanline \ | |||
2873 | xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi | 3167 | xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi |
2874 | bilinear_load_and_vertical_interpolate_two_8888 \ | 3168 | bilinear_load_and_vertical_interpolate_two_8888 \ |
2875 | yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi | 3169 | yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi |
2876 | .endm | 3170 | .endm |
2877 | 3171 | ||
3172 | .macro vzip reg1, reg2 | ||
3173 | umov TMP4, v31.d[0] | ||
3174 | zip1 v31.8b, reg1, reg2 | ||
3175 | zip2 reg2, reg1, reg2 | ||
3176 | mov reg1, v31.8b | ||
3177 | mov v31.d[0], TMP4 | ||
3178 | .endm | ||
3179 | |||
3180 | .macro vuzp reg1, reg2 | ||
3181 | umov TMP4, v31.d[0] | ||
3182 | uzp1 v31.8b, reg1, reg2 | ||
3183 | uzp2 reg2, reg1, reg2 | ||
3184 | mov reg1, v31.8b | ||
3185 | mov v31.d[0], TMP4 | ||
3186 | .endm | ||
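A64 has no in-place, two-register VZIP/VUZP forms, so these macros rebuild the old semantics from zip1/zip2 (and uzp1/uzp2), borrowing v31 as scratch and preserving its contents through TMP4. What the 8-byte vzip computes, modelled in C:

    #include <stdint.h>
    #include <string.h>

    /* in-place byte interleave of two 8-byte vectors, as AArch32 vzip.8 did */
    static void vzip8_ref (uint8_t a[8], uint8_t b[8])
    {
        uint8_t lo[8], hi[8];
        for (int i = 0; i < 4; i++) {
            lo[2 * i] = a[i];     lo[2 * i + 1] = b[i];     /* zip1 */
            hi[2 * i] = a[i + 4]; hi[2 * i + 1] = b[i + 4]; /* zip2 */
        }
        memcpy (a, lo, 8);
        memcpy (b, hi, 8);
    }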
3187 | |||
2878 | .macro bilinear_load_and_vertical_interpolate_two_0565 \ | 3188 | .macro bilinear_load_and_vertical_interpolate_two_0565 \ |
2879 | acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi | 3189 | acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi |
2880 | |||
2881 | asr TMP1, X, #16 | 3190 | asr TMP1, X, #16 |
2882 | add X, X, UX | 3191 | add X, X, UX |
2883 | add TMP1, TOP, TMP1, lsl #1 | 3192 | lsl TMP2, TMP1, #1 |
3193 | add TMP1, TOP, TMP2 | ||
2884 | asr TMP2, X, #16 | 3194 | asr TMP2, X, #16 |
2885 | add X, X, UX | 3195 | add X, X, UX |
2886 | add TMP2, TOP, TMP2, lsl #1 | 3196 | lsl TMP3, TMP2, #1 |
2887 | vld1.32 {acc2lo[0]}, [TMP1], STRIDE | 3197 | add TMP2, TOP, TMP3 |
2888 | vld1.32 {acc2hi[0]}, [TMP2], STRIDE | 3198 | ld1 {&acc2&.s}[0], [TMP1], STRIDE |
2889 | vld1.32 {acc2lo[1]}, [TMP1] | 3199 | ld1 {&acc2&.s}[2], [TMP2], STRIDE |
2890 | vld1.32 {acc2hi[1]}, [TMP2] | 3200 | ld1 {&acc2&.s}[1], [TMP1] |
3201 | ld1 {&acc2&.s}[3], [TMP2] | ||
2891 | convert_0565_to_x888 acc2, reg3, reg2, reg1 | 3202 | convert_0565_to_x888 acc2, reg3, reg2, reg1 |
2892 | vzip.u8 reg1, reg3 | 3203 | vzip &reg1&.8b, &reg3&.8b |
2893 | vzip.u8 reg2, reg4 | 3204 | vzip &reg2&.8b, &reg4&.8b |
2894 | vzip.u8 reg3, reg4 | 3205 | vzip &reg3&.8b, &reg4&.8b |
2895 | vzip.u8 reg1, reg2 | 3206 | vzip &reg1&.8b, &reg2&.8b |
2896 | vmull.u8 acc1, reg1, d28 | 3207 | umull &acc1&.8h, &reg1&.8b, v28.8b |
2897 | vmlal.u8 acc1, reg2, d29 | 3208 | umlal &acc1&.8h, &reg2&.8b, v29.8b |
2898 | vmull.u8 acc2, reg3, d28 | 3209 | umull &acc2&.8h, &reg3&.8b, v28.8b |
2899 | vmlal.u8 acc2, reg4, d29 | 3210 | umlal &acc2&.8h, &reg4&.8b, v29.8b |
2900 | .endm | 3211 | .endm |
2901 | 3212 | ||
2902 | .macro bilinear_load_and_vertical_interpolate_four_0565 \ | 3213 | .macro bilinear_load_and_vertical_interpolate_four_0565 \ |
2903 | xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ | 3214 | xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ |
2904 | yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi | 3215 | yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi |
2905 | |||
2906 | asr TMP1, X, #16 | 3216 | asr TMP1, X, #16 |
2907 | add X, X, UX | 3217 | add X, X, UX |
2908 | add TMP1, TOP, TMP1, lsl #1 | 3218 | lsl TMP2, TMP1, #1 |
3219 | add TMP1, TOP, TMP2 | ||
2909 | asr TMP2, X, #16 | 3220 | asr TMP2, X, #16 |
2910 | add X, X, UX | 3221 | add X, X, UX |
2911 | add TMP2, TOP, TMP2, lsl #1 | 3222 | lsl TMP3, TMP2, #1 |
2912 | vld1.32 {xacc2lo[0]}, [TMP1], STRIDE | 3223 | add TMP2, TOP, TMP3 |
2913 | vld1.32 {xacc2hi[0]}, [TMP2], STRIDE | 3224 | ld1 {&xacc2&.s}[0], [TMP1], STRIDE |
2914 | vld1.32 {xacc2lo[1]}, [TMP1] | 3225 | ld1 {&xacc2&.s}[2], [TMP2], STRIDE |
2915 | vld1.32 {xacc2hi[1]}, [TMP2] | 3226 | ld1 {&xacc2&.s}[1], [TMP1] |
3227 | ld1 {&xacc2&.s}[3], [TMP2] | ||
2916 | convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 | 3228 | convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 |
2917 | asr TMP1, X, #16 | 3229 | asr TMP1, X, #16 |
2918 | add X, X, UX | 3230 | add X, X, UX |
2919 | add TMP1, TOP, TMP1, lsl #1 | 3231 | lsl TMP2, TMP1, #1 |
3232 | add TMP1, TOP, TMP2 | ||
2920 | asr TMP2, X, #16 | 3233 | asr TMP2, X, #16 |
2921 | add X, X, UX | 3234 | add X, X, UX |
2922 | add TMP2, TOP, TMP2, lsl #1 | 3235 | lsl TMP3, TMP2, #1 |
2923 | vld1.32 {yacc2lo[0]}, [TMP1], STRIDE | 3236 | add TMP2, TOP, TMP3 |
2924 | vzip.u8 xreg1, xreg3 | 3237 | ld1 {&yacc2&.s}[0], [TMP1], STRIDE |
2925 | vld1.32 {yacc2hi[0]}, [TMP2], STRIDE | 3238 | vzip &xreg1&.8b, &xreg3&.8b |
2926 | vzip.u8 xreg2, xreg4 | 3239 | ld1 {&yacc2&.s}[2], [TMP2], STRIDE |
2927 | vld1.32 {yacc2lo[1]}, [TMP1] | 3240 | vzip &xreg2&.8b, &xreg4&.8b |
2928 | vzip.u8 xreg3, xreg4 | 3241 | ld1 {&yacc2&.s}[1], [TMP1] |
2929 | vld1.32 {yacc2hi[1]}, [TMP2] | 3242 | vzip &xreg3&.8b, &xreg4&.8b |
2930 | vzip.u8 xreg1, xreg2 | 3243 | ld1 {&yacc2&.s}[3], [TMP2] |
3244 | vzip &xreg1&.8b, &xreg2&.8b | ||
2931 | convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 | 3245 | convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 |
2932 | vmull.u8 xacc1, xreg1, d28 | 3246 | umull &xacc1&.8h, &xreg1&.8b, v28.8b |
2933 | vzip.u8 yreg1, yreg3 | 3247 | vzip &yreg1&.8b, &yreg3&.8b |
2934 | vmlal.u8 xacc1, xreg2, d29 | 3248 | umlal &xacc1&.8h, &xreg2&.8b, v29.8b |
2935 | vzip.u8 yreg2, yreg4 | 3249 | vzip &yreg2&.8b, &yreg4&.8b |
2936 | vmull.u8 xacc2, xreg3, d28 | 3250 | umull &xacc2&.8h, &xreg3&.8b, v28.8b |
2937 | vzip.u8 yreg3, yreg4 | 3251 | vzip &yreg3&.8b, &yreg4&.8b |
2938 | vmlal.u8 xacc2, xreg4, d29 | 3252 | umlal &xacc2&.8h, &xreg4&.8b, v29.8b |
2939 | vzip.u8 yreg1, yreg2 | 3253 | vzip &yreg1&.8b, &yreg2&.8b |
2940 | vmull.u8 yacc1, yreg1, d28 | 3254 | umull &yacc1&.8h, &yreg1&.8b, v28.8b |
2941 | vmlal.u8 yacc1, yreg2, d29 | 3255 | umlal &yacc1&.8h, &yreg2&.8b, v29.8b |
2942 | vmull.u8 yacc2, yreg3, d28 | 3256 | umull &yacc2&.8h, &yreg3&.8b, v28.8b |
2943 | vmlal.u8 yacc2, yreg4, d29 | 3257 | umlal &yacc2&.8h, &yreg4&.8b, v29.8b |
2944 | .endm | 3258 | .endm |
2945 | 3259 | ||
2946 | .macro bilinear_store_8888 numpix, tmp1, tmp2 | 3260 | .macro bilinear_store_8888 numpix, tmp1, tmp2 |
2947 | .if numpix == 4 | 3261 | .if numpix == 4 |
2948 | vst1.32 {d0, d1}, [OUT, :128]! | 3262 | st1 {v0.2s, v1.2s}, [OUT], #16 |
2949 | .elseif numpix == 2 | 3263 | .elseif numpix == 2 |
2950 | vst1.32 {d0}, [OUT, :64]! | 3264 | st1 {v0.2s}, [OUT], #8 |
2951 | .elseif numpix == 1 | 3265 | .elseif numpix == 1 |
2952 | vst1.32 {d0[0]}, [OUT, :32]! | 3266 | st1 {v0.s}[0], [OUT], #4 |
2953 | .else | 3267 | .else |
2954 | .error bilinear_store_8888 numpix is unsupported | 3268 | .error bilinear_store_8888 numpix is unsupported |
2955 | .endif | 3269 | .endif |
2956 | .endm | 3270 | .endm |
2957 | 3271 | ||
2958 | .macro bilinear_store_0565 numpix, tmp1, tmp2 | 3272 | .macro bilinear_store_0565 numpix, tmp1, tmp2 |
2959 | vuzp.u8 d0, d1 | 3273 | vuzp v0.8b, v1.8b |
2960 | vuzp.u8 d2, d3 | 3274 | vuzp v2.8b, v3.8b |
2961 | vuzp.u8 d1, d3 | 3275 | vuzp v1.8b, v3.8b |
2962 | vuzp.u8 d0, d2 | 3276 | vuzp v0.8b, v2.8b |
2963 | convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 | 3277 | convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 |
2964 | .if numpix == 4 | 3278 | .if numpix == 4 |
2965 | vst1.16 {d2}, [OUT, :64]! | 3279 | st1 {v1.4h}, [OUT], #8 |
2966 | .elseif numpix == 2 | 3280 | .elseif numpix == 2 |
2967 | vst1.32 {d2[0]}, [OUT, :32]! | 3281 | st1 {v1.s}[0], [OUT], #4 |
2968 | .elseif numpix == 1 | 3282 | .elseif numpix == 1 |
2969 | vst1.16 {d2[0]}, [OUT, :16]! | 3283 | st1 {v1.h}[0], [OUT], #2 |
2970 | .else | 3284 | .else |
2971 | .error bilinear_store_0565 numpix is unsupported | 3285 | .error bilinear_store_0565 numpix is unsupported |
2972 | .endif | 3286 | .endif |
2973 | .endm | 3287 | .endm |
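After the vuzp de-interleave, convert_8888_to_0565 packs the channels into r5g6b5 using shift-insert pairs; per pixel it is equivalent to this C expression (sketch):

    #include <stdint.h>

    static inline uint16_t pack_0565 (uint8_t r, uint8_t g, uint8_t b)
    {
        return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }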
2974 | 3288 | ||
2975 | .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt | 3289 | .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt |
2976 | bilinear_load_&src_fmt d0, d1, d2 | 3290 | bilinear_load_&src_fmt v0, v1, v2 |
2977 | vmull.u8 q1, d0, d28 | 3291 | umull v2.8h, v0.8b, v28.8b |
2978 | vmlal.u8 q1, d1, d29 | 3292 | umlal v2.8h, v1.8b, v29.8b |
2979 | /* 5 cycles bubble */ | 3293 | /* 5 cycles bubble */ |
2980 | vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS | 3294 | ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS |
2981 | vmlsl.u16 q0, d2, d30 | 3295 | umlsl v0.4s, v2.4h, v15.h[0] |
2982 | vmlal.u16 q0, d3, d30 | 3296 | umlal2 v0.4s, v2.8h, v15.h[0] |
2983 | /* 5 cycles bubble */ | 3297 | /* 5 cycles bubble */ |
2984 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | 3298 | shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
2985 | /* 3 cycles bubble */ | 3299 | /* 3 cycles bubble */ |
2986 | vmovn.u16 d0, q0 | 3300 | xtn v0.8b, v0.8h |
2987 | /* 1 cycle bubble */ | 3301 | /* 1 cycle bubble */ |
2988 | bilinear_store_&dst_fmt 1, q2, q3 | 3302 | bilinear_store_&dst_fmt 1, v3, v4 |
2989 | .endm | 3303 | .endm |
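The arithmetic being scheduled here: a vertical blend of the two rows with the 8-bit weights in v28/v29 (umull/umlal), then a horizontal blend with the per-pixel fraction (ushll/umlsl/umlal2), and a final shift by 2 * BILINEAR_INTERPOLATION_BITS. One channel in C, assuming wt + wb == 1 << BILINEAR_INTERPOLATION_BITS (pixman's default for the constant is 7):

    #include <stdint.h>

    #define BIP_BITS 7 /* BILINEAR_INTERPOLATION_BITS */

    /* tl, tr, bl, br: one channel of the four source neighbours;
     * 0 <= wx < (1 << BIP_BITS) is the horizontal fraction */
    static uint8_t bilerp_channel (uint8_t tl, uint8_t tr,
                                   uint8_t bl, uint8_t br,
                                   unsigned wt, unsigned wb, unsigned wx)
    {
        uint16_t l = (uint16_t) (tl * wt + bl * wb); /* umull/umlal (left)  */
        uint16_t r = (uint16_t) (tr * wt + br * wb); /* umull/umlal (right) */
        uint32_t a = ((uint32_t) l << BIP_BITS)      /* ushll               */
                   - (uint32_t) l * wx               /* umlsl               */
                   + (uint32_t) r * wx;              /* umlal2              */
        return (uint8_t) (a >> (2 * BIP_BITS));      /* shrn, then xtn      */
    }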
2990 | 3304 | ||
2991 | .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt | 3305 | .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt |
2992 | bilinear_load_and_vertical_interpolate_two_&src_fmt \ | 3306 | bilinear_load_and_vertical_interpolate_two_&src_fmt \ |
2993 | q1, q11, d0, d1, d20, d21, d22, d23 | 3307 | v1, v11, v2, v3, v20, v21, v22, v23 |
2994 | vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS | 3308 | ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS |
2995 | vmlsl.u16 q0, d2, d30 | 3309 | umlsl v0.4s, v1.4h, v15.h[0] |
2996 | vmlal.u16 q0, d3, d30 | 3310 | umlal2 v0.4s, v1.8h, v15.h[0] |
2997 | vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS | 3311 | ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS |
2998 | vmlsl.u16 q10, d22, d31 | 3312 | umlsl v10.4s, v11.4h, v15.h[4] |
2999 | vmlal.u16 q10, d23, d31 | 3313 | umlal2 v10.4s, v11.8h, v15.h[4] |
3000 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | 3314 | shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3001 | vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) | 3315 | shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3002 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 3316 | ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
3003 | vadd.u16 q12, q12, q13 | 3317 | add v12.8h, v12.8h, v13.8h |
3004 | vmovn.u16 d0, q0 | 3318 | xtn v0.8b, v0.8h |
3005 | bilinear_store_&dst_fmt 2, q2, q3 | 3319 | bilinear_store_&dst_fmt 2, v3, v4 |
3006 | .endm | 3320 | .endm |
3007 | 3321 | ||
3008 | .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt | 3322 | .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt |
3009 | bilinear_load_and_vertical_interpolate_four_&src_fmt \ | 3323 | bilinear_load_and_vertical_interpolate_four_&src_fmt \ |
3010 | q1, q11, d0, d1, d20, d21, d22, d23 \ | 3324 | v1, v11, v14, v20, v16, v17, v22, v23 \ |
3011 | q3, q9, d4, d5, d16, d17, d18, d19 | 3325 | v3, v9, v24, v25, v26, v27, v18, v19 |
3012 | pld [TMP1, PF_OFFS] | 3326 | prfm pldl2strm, [TMP1, PF_OFFS] |
3013 | sub TMP1, TMP1, STRIDE | 3327 | sub TMP1, TMP1, STRIDE |
3014 | vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS | 3328 | ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS |
3015 | vmlsl.u16 q0, d2, d30 | 3329 | umlsl v0.4s, v1.4h, v15.h[0] |
3016 | vmlal.u16 q0, d3, d30 | 3330 | umlal2 v0.4s, v1.8h, v15.h[0] |
3017 | vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS | 3331 | ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS |
3018 | vmlsl.u16 q10, d22, d31 | 3332 | umlsl v10.4s, v11.4h, v15.h[4] |
3019 | vmlal.u16 q10, d23, d31 | 3333 | umlal2 v10.4s, v11.8h, v15.h[4] |
3020 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 3334 | ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
3021 | vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS | 3335 | ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS |
3022 | vmlsl.u16 q2, d6, d30 | 3336 | umlsl v2.4s, v3.4h, v15.h[0] |
3023 | vmlal.u16 q2, d7, d30 | 3337 | umlal2 v2.4s, v3.8h, v15.h[0] |
3024 | vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS | 3338 | ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS |
3025 | pld [TMP2, PF_OFFS] | 3339 | prfm pldl2strm, [TMP2, PF_OFFS] |
3026 | vmlsl.u16 q8, d18, d31 | 3340 | umlsl v8.4s, v9.4h, v15.h[4] |
3027 | vmlal.u16 q8, d19, d31 | 3341 | umlal2 v8.4s, v9.8h, v15.h[4] |
3028 | vadd.u16 q12, q12, q13 | 3342 | add v12.8h, v12.8h, v13.8h |
3029 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | 3343 | shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3030 | vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) | 3344 | shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3031 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) | 3345 | shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3032 | vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) | 3346 | shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3033 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 3347 | ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
3034 | vmovn.u16 d0, q0 | 3348 | xtn v0.8b, v0.8h |
3035 | vmovn.u16 d1, q2 | 3349 | xtn v1.8b, v2.8h |
3036 | vadd.u16 q12, q12, q13 | 3350 | add v12.8h, v12.8h, v13.8h |
3037 | bilinear_store_&dst_fmt 4, q2, q3 | 3351 | bilinear_store_&dst_fmt 4, v3, v4 |
3038 | .endm | 3352 | .endm |
3039 | 3353 | ||
3040 | .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt | 3354 | .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt |
3041 | .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt | 3355 | .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt |
3042 | bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head | 3356 | bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head |
@@ -3105,125 +3419,141 @@ generate_composite_function_nearest_scanline \ | |||
3105 | .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ | 3419 | .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ |
3106 | src_bpp_shift, dst_bpp_shift, \ | 3420 | src_bpp_shift, dst_bpp_shift, \ |
3107 | prefetch_distance, flags | 3421 | prefetch_distance, flags |
3108 | 3422 | ||
3109 | pixman_asm_function fname | 3423 | pixman_asm_function fname |
3110 | OUT .req r0 | 3424 | OUT .req x0 |
3111 | TOP .req r1 | 3425 | TOP .req x1 |
3112 | BOTTOM .req r2 | 3426 | BOTTOM .req x2 |
3113 | WT .req r3 | 3427 | WT .req x3 |
3114 | WB .req r4 | 3428 | WB .req x4 |
3115 | X .req r5 | 3429 | X .req x5 |
3116 | UX .req r6 | 3430 | UX .req x6 |
3117 | WIDTH .req ip | 3431 | WIDTH .req x7 |
3118 | TMP1 .req r3 | 3432 | TMP1 .req x8 |
3119 | TMP2 .req r4 | 3433 | TMP2 .req x9 |
3120 | PF_OFFS .req r7 | 3434 | PF_OFFS .req x10 |
3121 | TMP3 .req r8 | 3435 | TMP3 .req x11 |
3122 | TMP4 .req r9 | 3436 | TMP4 .req x12 |
3123 | STRIDE .req r2 | 3437 | STRIDE .req x13 |
3124 | 3438 | ||
3125 | mov ip, sp | 3439 | sxtw x3, w3 |
3126 | push {r4, r5, r6, r7, r8, r9} | 3440 | sxtw x4, w4 |
3441 | sxtw x5, w5 | ||
3442 | sxtw x6, w6 | ||
3443 | sxtw x7, w7 | ||
3444 | |||
3445 | stp x29, x30, [sp, -16]! | ||
3446 | mov x29, sp | ||
3447 | sub sp, sp, 112 /* make room for callee-saved registers */ | ||
3448 | sub x29, x29, 64 | ||
3449 | st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 | ||
3450 | st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 | ||
3451 | stp x8, x9, [x29, -80] | ||
3452 | stp x10, x11, [x29, -96] | ||
3453 | stp x12, x13, [x29, -112] | ||
3454 | |||
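Under AAPCS64 the first eight integer arguments arrive in registers x0-x7, so the ARM32 ldmia of stack arguments disappears and the 32-bit int parameters are sign-extended with sxtw instead. The .req names above line up with this assumed C-side prototype (pixman declares these entry points through macros in pixman-arm-common.h; the exact spelling here is a sketch):

    #include <stdint.h>

    void
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (
        uint32_t       *out,     /* x0: OUT            */
        const uint32_t *top,     /* x1: TOP            */
        const uint32_t *bottom,  /* x2: BOTTOM         */
        int             wt,      /* w3: top weight     */
        int             wb,      /* w4: bottom weight  */
        int32_t         x,       /* w5: 16.16 start X  */
        int32_t         ux,      /* w6: 16.16 X step   */
        int             width);  /* w7: WIDTH          */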
3127 | mov PF_OFFS, #prefetch_distance | 3455 | mov PF_OFFS, #prefetch_distance |
3128 | ldmia ip, {WB, X, UX, WIDTH} | ||
3129 | mul PF_OFFS, PF_OFFS, UX | 3456 | mul PF_OFFS, PF_OFFS, UX |
3130 | 3457 | ||
3131 | .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 | 3458 | subs STRIDE, BOTTOM, TOP |
3132 | vpush {d8-d15} | ||
3133 | .endif | ||
3134 | |||
3135 | sub STRIDE, BOTTOM, TOP | ||
3136 | .unreq BOTTOM | 3459 | .unreq BOTTOM |
3137 | 3460 | ||
3138 | cmp WIDTH, #0 | 3461 | cmp WIDTH, #0 |
3139 | ble 3f | 3462 | ble 300f |
3140 | 3463 | ||
3141 | vdup.u16 q12, X | 3464 | dup v12.8h, w5 |
3142 | vdup.u16 q13, UX | 3465 | dup v13.8h, w6 |
3143 | vdup.u8 d28, WT | 3466 | dup v28.8b, w3 |
3144 | vdup.u8 d29, WB | 3467 | dup v29.8b, w4 |
3145 | vadd.u16 d25, d25, d26 | 3468 | mov v25.d[0], v12.d[1] |
3469 | mov v26.d[0], v13.d[0] | ||
3470 | add v25.4h, v25.4h, v26.4h | ||
3471 | mov v12.d[1], v25.d[0] | ||
3146 | 3472 | ||
3147 | /* ensure good destination alignment */ | 3473 | /* ensure good destination alignment */ |
3148 | cmp WIDTH, #1 | 3474 | cmp WIDTH, #1 |
3149 | blt 0f | 3475 | blt 100f |
3150 | tst OUT, #(1 << dst_bpp_shift) | 3476 | tst OUT, #(1 << dst_bpp_shift) |
3151 | beq 0f | 3477 | beq 100f |
3152 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 3478 | ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
3153 | vadd.u16 q12, q12, q13 | 3479 | add v12.8h, v12.8h, v13.8h |
3154 | bilinear_interpolate_last_pixel src_fmt, dst_fmt | 3480 | bilinear_interpolate_last_pixel src_fmt, dst_fmt |
3155 | sub WIDTH, WIDTH, #1 | 3481 | sub WIDTH, WIDTH, #1 |
3156 | 0: | 3482 | 100: |
3157 | vadd.u16 q13, q13, q13 | 3483 | add v13.8h, v13.8h, v13.8h |
3158 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 3484 | ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
3159 | vadd.u16 q12, q12, q13 | 3485 | add v12.8h, v12.8h, v13.8h |
3160 | 3486 | ||
3161 | cmp WIDTH, #2 | 3487 | cmp WIDTH, #2 |
3162 | blt 0f | 3488 | blt 100f |
3163 | tst OUT, #(1 << (dst_bpp_shift + 1)) | 3489 | tst OUT, #(1 << (dst_bpp_shift + 1)) |
3164 | beq 0f | 3490 | beq 100f |
3165 | bilinear_interpolate_two_pixels src_fmt, dst_fmt | 3491 | bilinear_interpolate_two_pixels src_fmt, dst_fmt |
3166 | sub WIDTH, WIDTH, #2 | 3492 | sub WIDTH, WIDTH, #2 |
3167 | 0: | 3493 | 100: |
3168 | .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 | 3494 | .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 |
3169 | /*********** 8 pixels per iteration *****************/ | 3495 | /*********** 8 pixels per iteration *****************/ |
3170 | cmp WIDTH, #4 | 3496 | cmp WIDTH, #4 |
3171 | blt 0f | 3497 | blt 100f |
3172 | tst OUT, #(1 << (dst_bpp_shift + 2)) | 3498 | tst OUT, #(1 << (dst_bpp_shift + 2)) |
3173 | beq 0f | 3499 | beq 100f |
3174 | bilinear_interpolate_four_pixels src_fmt, dst_fmt | 3500 | bilinear_interpolate_four_pixels src_fmt, dst_fmt |
3175 | sub WIDTH, WIDTH, #4 | 3501 | sub WIDTH, WIDTH, #4 |
3176 | 0: | 3502 | 100: |
3177 | subs WIDTH, WIDTH, #8 | 3503 | subs WIDTH, WIDTH, #8 |
3178 | blt 1f | 3504 | blt 100f |
3179 | asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) | 3505 | asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) |
3180 | bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt | 3506 | bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt |
3181 | subs WIDTH, WIDTH, #8 | 3507 | subs WIDTH, WIDTH, #8 |
3182 | blt 5f | 3508 | blt 500f |
3183 | 0: | 3509 | 1000: |
3184 | bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt | 3510 | bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt |
3185 | subs WIDTH, WIDTH, #8 | 3511 | subs WIDTH, WIDTH, #8 |
3186 | bge 0b | 3512 | bge 1000b |
3187 | 5: | 3513 | 500: |
3188 | bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt | 3514 | bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt |
3189 | 1: | 3515 | 100: |
3190 | tst WIDTH, #4 | 3516 | tst WIDTH, #4 |
3191 | beq 2f | 3517 | beq 200f |
3192 | bilinear_interpolate_four_pixels src_fmt, dst_fmt | 3518 | bilinear_interpolate_four_pixels src_fmt, dst_fmt |
3193 | 2: | 3519 | 200: |
3194 | .else | 3520 | .else |
3195 | /*********** 4 pixels per iteration *****************/ | 3521 | /*********** 4 pixels per iteration *****************/ |
3196 | subs WIDTH, WIDTH, #4 | 3522 | subs WIDTH, WIDTH, #4 |
3197 | blt 1f | 3523 | blt 100f |
3198 | asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) | 3524 | asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) |
3199 | bilinear_interpolate_four_pixels_head src_fmt, dst_fmt | 3525 | bilinear_interpolate_four_pixels_head src_fmt, dst_fmt |
3200 | subs WIDTH, WIDTH, #4 | 3526 | subs WIDTH, WIDTH, #4 |
3201 | blt 5f | 3527 | blt 500f |
3202 | 0: | 3528 | 1000: |
3203 | bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt | 3529 | bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt |
3204 | subs WIDTH, WIDTH, #4 | 3530 | subs WIDTH, WIDTH, #4 |
3205 | bge 0b | 3531 | bge 1000b |
3206 | 5: | 3532 | 500: |
3207 | bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt | 3533 | bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt |
3208 | 1: | 3534 | 100: |
3209 | /****************************************************/ | 3535 | /****************************************************/ |
3210 | .endif | 3536 | .endif |
3211 | /* handle the remaining trailing pixels */ | 3537 | /* handle the remaining trailing pixels */ |
3212 | tst WIDTH, #2 | 3538 | tst WIDTH, #2 |
3213 | beq 2f | 3539 | beq 200f |
3214 | bilinear_interpolate_two_pixels src_fmt, dst_fmt | 3540 | bilinear_interpolate_two_pixels src_fmt, dst_fmt |
3215 | 2: | 3541 | 200: |
3216 | tst WIDTH, #1 | 3542 | tst WIDTH, #1 |
3217 | beq 3f | 3543 | beq 300f |
3218 | bilinear_interpolate_last_pixel src_fmt, dst_fmt | 3544 | bilinear_interpolate_last_pixel src_fmt, dst_fmt |
3219 | 3: | 3545 | 300: |
3220 | .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 | 3546 | sub x29, x29, 64 |
3221 | vpop {d8-d15} | 3547 | ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 |
3222 | .endif | 3548 | ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 |
3223 | pop {r4, r5, r6, r7, r8, r9} | 3549 | ldp x8, x9, [x29, -80] |
3224 | bx lr | 3550 | ldp x10, x11, [x29, -96] |
3551 | ldp x12, x13, [x29, -112] | ||
3552 | mov sp, x29 | ||
3553 | ldp x29, x30, [sp], 16 | ||
3554 | ret | ||
3225 | 3555 | ||
3226 | .unreq OUT | 3556 | .unreq OUT |
3227 | .unreq TOP | 3557 | .unreq TOP |
3228 | .unreq WT | 3558 | .unreq WT |
3229 | .unreq WB | 3559 | .unreq WB |
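The renumbered local labels (100:, 200:, 300:, 500:, 1000:) mark the same control points the old 0:/1:/2: labels did, while staying clear of single-digit labels used inside the expanded pixblock macros. The overall shape of the generated function, in C (a control-flow sketch only; process_n stands in for the bilinear_interpolate_* macros and is our name):

    #include <stddef.h>
    #include <stdint.h>

    static void process_n (uint8_t **out, int n, int shift)
    {
        *out += (size_t) n << shift; /* interpolate and store n pixels */
    }

    static void scanline_model (uint8_t *out, int width, int dst_bpp_shift)
    {
        if (width <= 0)
            return;                                           /* ble 300f */
        if (width >= 1 && ((uintptr_t) out & (1u << dst_bpp_shift))) {
            process_n (&out, 1, dst_bpp_shift);  width -= 1;  /* align to 2 px */
        }
        if (width >= 2 && ((uintptr_t) out & (1u << (dst_bpp_shift + 1)))) {
            process_n (&out, 2, dst_bpp_shift);  width -= 2;  /* align to 4 px */
        }
        while (width >= 4) {                                  /* 1000: loop    */
            process_n (&out, 4, dst_bpp_shift);  width -= 4;
        }
        if (width & 2) process_n (&out, 2, dst_bpp_shift);    /* trailing px   */
        if (width & 1) process_n (&out, 1, dst_bpp_shift);
    }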
@@ -3250,368 +3580,125 @@ pixman_asm_function fname | |||
3250 | add TMP1, TOP, TMP1, lsl #2 | 3580 | add TMP1, TOP, TMP1, lsl #2 |
3251 | asr TMP2, X, #16 | 3581 | asr TMP2, X, #16 |
3252 | add X, X, UX | 3582 | add X, X, UX |
3253 | add TMP2, TOP, TMP2, lsl #2 | 3583 | add TMP2, TOP, TMP2, lsl #2 |
3254 | 3584 | ||
3255 | vld1.32 {d22}, [TMP1], STRIDE | 3585 | ld1 {v22.2s}, [TMP1], STRIDE |
3256 | vld1.32 {d23}, [TMP1] | 3586 | ld1 {v23.2s}, [TMP1] |
3257 | asr TMP3, X, #16 | 3587 | asr TMP3, X, #16 |
3258 | add X, X, UX | 3588 | add X, X, UX |
3259 | add TMP3, TOP, TMP3, lsl #2 | 3589 | add TMP3, TOP, TMP3, lsl #2 |
3260 | vmull.u8 q8, d22, d28 | 3590 | umull v8.8h, v22.8b, v28.8b |
3261 | vmlal.u8 q8, d23, d29 | 3591 | umlal v8.8h, v23.8b, v29.8b |
3262 | 3592 | ||
3263 | vld1.32 {d22}, [TMP2], STRIDE | 3593 | ld1 {v22.2s}, [TMP2], STRIDE |
3264 | vld1.32 {d23}, [TMP2] | 3594 | ld1 {v23.2s}, [TMP2] |
3265 | asr TMP4, X, #16 | 3595 | asr TMP4, X, #16 |
3266 | add X, X, UX | 3596 | add X, X, UX |
3267 | add TMP4, TOP, TMP4, lsl #2 | 3597 | add TMP4, TOP, TMP4, lsl #2 |
3268 | vmull.u8 q9, d22, d28 | 3598 | umull v9.8h, v22.8b, v28.8b |
3269 | vmlal.u8 q9, d23, d29 | 3599 | umlal v9.8h, v23.8b, v29.8b |
3270 | 3600 | ||
3271 | vld1.32 {d22}, [TMP3], STRIDE | 3601 | ld1 {v22.2s}, [TMP3], STRIDE |
3272 | vld1.32 {d23}, [TMP3] | 3602 | ld1 {v23.2s}, [TMP3] |
3273 | vmull.u8 q10, d22, d28 | 3603 | umull v10.8h, v22.8b, v28.8b |
3274 | vmlal.u8 q10, d23, d29 | 3604 | umlal v10.8h, v23.8b, v29.8b |
3275 | 3605 | ||
3276 | vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS | 3606 | ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS |
3277 | vmlsl.u16 q0, d16, d30 | 3607 | umlsl v0.4s, v8.4h, v15.h[0] |
3278 | vmlal.u16 q0, d17, d30 | 3608 | umlal2 v0.4s, v8.8h, v15.h[0] |
3279 | 3609 | ||
3280 | pld [TMP4, PF_OFFS] | 3610 | prfm pldl2strm, [TMP4, PF_OFFS] |
3281 | vld1.32 {d16}, [TMP4], STRIDE | 3611 | ld1 {v16.2s}, [TMP4], STRIDE |
3282 | vld1.32 {d17}, [TMP4] | 3612 | ld1 {v17.2s}, [TMP4] |
3283 | pld [TMP4, PF_OFFS] | 3613 | prfm pldl2strm, [TMP4, PF_OFFS] |
3284 | vmull.u8 q11, d16, d28 | 3614 | umull v11.8h, v16.8b, v28.8b |
3285 | vmlal.u8 q11, d17, d29 | 3615 | umlal v11.8h, v17.8b, v29.8b |
3286 | 3616 | ||
3287 | vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS | 3617 | ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS |
3288 | vmlsl.u16 q1, d18, d31 | 3618 | umlsl v1.4s, v9.4h, v15.h[4] |
3289 | .endm | 3619 | .endm |
3290 | 3620 | ||
3291 | .macro bilinear_interpolate_four_pixels_8888_8888_tail | 3621 | .macro bilinear_interpolate_four_pixels_8888_8888_tail |
3292 | vmlal.u16 q1, d19, d31 | 3622 | umlal2 v1.4s, v9.8h, v15.h[4] |
3293 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 3623 | ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
3294 | vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS | 3624 | ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS |
3295 | vmlsl.u16 q2, d20, d30 | 3625 | umlsl v2.4s, v10.4h, v15.h[0] |
3296 | vmlal.u16 q2, d21, d30 | 3626 | umlal2 v2.4s, v10.8h, v15.h[0] |
3297 | vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS | 3627 | ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS |
3298 | vmlsl.u16 q3, d22, d31 | 3628 | umlsl v3.4s, v11.4h, v15.h[4] |
3299 | vmlal.u16 q3, d23, d31 | 3629 | umlal2 v3.4s, v11.8h, v15.h[4] |
3300 | vadd.u16 q12, q12, q13 | 3630 | add v12.8h, v12.8h, v13.8h |
3301 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | 3631 | shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3302 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | 3632 | shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3303 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) | 3633 | shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3304 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 3634 | ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
3305 | vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) | 3635 | shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3306 | vmovn.u16 d6, q0 | 3636 | xtn v6.8b, v0.8h |
3307 | vmovn.u16 d7, q2 | 3637 | xtn v7.8b, v2.8h |
3308 | vadd.u16 q12, q12, q13 | 3638 | add v12.8h, v12.8h, v13.8h |
3309 | vst1.32 {d6, d7}, [OUT, :128]! | 3639 | st1 {v6.2s, v7.2s}, [OUT], #16 |
3310 | .endm | 3640 | .endm |
3311 | 3641 | ||
3312 | .macro bilinear_interpolate_four_pixels_8888_8888_tail_head | 3642 | .macro bilinear_interpolate_four_pixels_8888_8888_tail_head |
3313 | asr TMP1, X, #16 | 3643 | asr TMP1, X, #16 |
3314 | add X, X, UX | 3644 | add X, X, UX |
3315 | add TMP1, TOP, TMP1, lsl #2 | 3645 | add TMP1, TOP, TMP1, lsl #2 |
3316 | asr TMP2, X, #16 | 3646 | asr TMP2, X, #16 |
3317 | add X, X, UX | 3647 | add X, X, UX |
3318 | add TMP2, TOP, TMP2, lsl #2 | 3648 | add TMP2, TOP, TMP2, lsl #2 |
3319 | vmlal.u16 q1, d19, d31 | 3649 | umlal2 v1.4s, v9.8h, v15.h[4] |
3320 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 3650 | ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
3321 | vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS | 3651 | ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS |
3322 | vmlsl.u16 q2, d20, d30 | 3652 | umlsl v2.4s, v10.4h, v15.h[0] |
3323 | vmlal.u16 q2, d21, d30 | 3653 | umlal2 v2.4s, v10.8h, v15.h[0] |
3324 | vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS | 3654 | ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS |
3325 | vld1.32 {d20}, [TMP1], STRIDE | 3655 | ld1 {v20.2s}, [TMP1], STRIDE |
3326 | vmlsl.u16 q3, d22, d31 | 3656 | umlsl v3.4s, v11.4h, v15.h[4] |
3327 | vmlal.u16 q3, d23, d31 | 3657 | umlal2 v3.4s, v11.8h, v15.h[4] |
3328 | vld1.32 {d21}, [TMP1] | 3658 | ld1 {v21.2s}, [TMP1] |
3329 | vmull.u8 q8, d20, d28 | 3659 | umull v8.8h, v20.8b, v28.8b |
3330 | vmlal.u8 q8, d21, d29 | 3660 | umlal v8.8h, v21.8b, v29.8b |
3331 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | 3661 | shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3332 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | 3662 | shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3333 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) | 3663 | shrn v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3334 | vld1.32 {d22}, [TMP2], STRIDE | 3664 | ld1 {v22.2s}, [TMP2], STRIDE |
3335 | vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) | 3665 | shrn2 v4.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) |
3336 | vadd.u16 q12, q12, q13 | 3666 | add v12.8h, v12.8h, v13.8h |
3337 | vld1.32 {d23}, [TMP2] | 3667 | ld1 {v23.2s}, [TMP2] |
3338 | vmull.u8 q9, d22, d28 | 3668 | umull v9.8h, v22.8b, v28.8b |
3339 | asr TMP3, X, #16 | 3669 | asr TMP3, X, #16 |
3340 | add X, X, UX | 3670 | add X, X, UX |
3341 | add TMP3, TOP, TMP3, lsl #2 | 3671 | add TMP3, TOP, TMP3, lsl #2 |
3342 | asr TMP4, X, #16 | 3672 | asr TMP4, X, #16 |
3343 | add X, X, UX | 3673 | add X, X, UX |
3344 | add TMP4, TOP, TMP4, lsl #2 | 3674 | add TMP4, TOP, TMP4, lsl #2 |
3345 | vmlal.u8 q9, d23, d29 | 3675 | umlal v9.8h, v23.8b, v29.8b |
3346 | vld1.32 {d22}, [TMP3], STRIDE | 3676 | ld1 {v22.2s}, [TMP3], STRIDE |
3347 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | 3677 | ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |
3348 | vld1.32 {d23}, [TMP3] | 3678 | ld1 {v23.2s}, [TMP3] |
3349 | vmull.u8 q10, d22, d28 | 3679 | umull v10.8h, v22.8b, v28.8b |
3350 | vmlal.u8 q10, d23, d29 | 3680 | umlal v10.8h, v23.8b, v29.8b |
3351 | vmovn.u16 d6, q0 | 3681 | xtn v6.8b, v0.8h |
3352 | vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS | 3682 | ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS |
3353 | vmovn.u16 d7, q2 | 3683 | xtn v7.8b, v4.8h |
3354 | vmlsl.u16 q0, d16, d30 | 3684 | umlsl v0.4s, v8.4h, v15.h[0] |
3355 | vmlal.u16 q0, d17, d30 | 3685 | umlal2 v0.4s, v8.8h, v15.h[0] |
3356 | pld [TMP4, PF_OFFS] | 3686 | prfm pldl2strm, [TMP4, PF_OFFS] |
3357 | vld1.32 {d16}, [TMP4], STRIDE | 3687 | ld1 {v16.2s}, [TMP4], STRIDE |
3358 | vadd.u16 q12, q12, q13 | 3688 | add v12.8h, v12.8h, v13.8h |
3359 | vld1.32 {d17}, [TMP4] | 3689 | ld1 {v17.2s}, [TMP4] |
3360 | pld [TMP4, PF_OFFS] | 3690 | prfm pldl2strm, [TMP4, PF_OFFS] |
3361 | vmull.u8 q11, d16, d28 | 3691 | umull v11.8h, v16.8b, v28.8b |
3362 | vmlal.u8 q11, d17, d29 | 3692 | umlal v11.8h, v17.8b, v29.8b |
3363 | vst1.32 {d6, d7}, [OUT, :128]! | 3693 | st1 {v6.2s, v7.2s}, [OUT], #16 |
3364 | vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS | 3694 | ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS |
3365 | vmlsl.u16 q1, d18, d31 | 3695 | umlsl v1.4s, v9.4h, v15.h[4] |
3366 | .endm | 3696 | .endm |
3367 | 3697 | ||
3368 | /*****************************************************************************/ | 3698 | /*****************************************************************************/ |
3369 | 3699 | ||
3370 | .set have_bilinear_interpolate_eight_pixels_8888_0565, 1 | ||
3371 | |||
3372 | .macro bilinear_interpolate_eight_pixels_8888_0565_head | ||
3373 | asr TMP1, X, #16 | ||
3374 | add X, X, UX | ||
3375 | add TMP1, TOP, TMP1, lsl #2 | ||
3376 | asr TMP2, X, #16 | ||
3377 | add X, X, UX | ||
3378 | add TMP2, TOP, TMP2, lsl #2 | ||
3379 | vld1.32 {d20}, [TMP1], STRIDE | ||
3380 | vld1.32 {d21}, [TMP1] | ||
3381 | vmull.u8 q8, d20, d28 | ||
3382 | vmlal.u8 q8, d21, d29 | ||
3383 | vld1.32 {d22}, [TMP2], STRIDE | ||
3384 | vld1.32 {d23}, [TMP2] | ||
3385 | vmull.u8 q9, d22, d28 | ||
3386 | asr TMP3, X, #16 | ||
3387 | add X, X, UX | ||
3388 | add TMP3, TOP, TMP3, lsl #2 | ||
3389 | asr TMP4, X, #16 | ||
3390 | add X, X, UX | ||
3391 | add TMP4, TOP, TMP4, lsl #2 | ||
3392 | vmlal.u8 q9, d23, d29 | ||
3393 | vld1.32 {d22}, [TMP3], STRIDE | ||
3394 | vld1.32 {d23}, [TMP3] | ||
3395 | vmull.u8 q10, d22, d28 | ||
3396 | vmlal.u8 q10, d23, d29 | ||
3397 | vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS | ||
3398 | vmlsl.u16 q0, d16, d30 | ||
3399 | vmlal.u16 q0, d17, d30 | ||
3400 | pld [TMP4, PF_OFFS] | ||
3401 | vld1.32 {d16}, [TMP4], STRIDE | ||
3402 | vld1.32 {d17}, [TMP4] | ||
3403 | pld [TMP4, PF_OFFS] | ||
3404 | vmull.u8 q11, d16, d28 | ||
3405 | vmlal.u8 q11, d17, d29 | ||
3406 | vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS | ||
3407 | vmlsl.u16 q1, d18, d31 | ||
3408 | |||
3409 | asr TMP1, X, #16 | ||
3410 | add X, X, UX | ||
3411 | add TMP1, TOP, TMP1, lsl #2 | ||
3412 | asr TMP2, X, #16 | ||
3413 | add X, X, UX | ||
3414 | add TMP2, TOP, TMP2, lsl #2 | ||
3415 | vmlal.u16 q1, d19, d31 | ||
3416 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
3417 | vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS | ||
3418 | vmlsl.u16 q2, d20, d30 | ||
3419 | vmlal.u16 q2, d21, d30 | ||
3420 | vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS | ||
3421 | vld1.32 {d20}, [TMP1], STRIDE | ||
3422 | vmlsl.u16 q3, d22, d31 | ||
3423 | vmlal.u16 q3, d23, d31 | ||
3424 | vld1.32 {d21}, [TMP1] | ||
3425 | vmull.u8 q8, d20, d28 | ||
3426 | vmlal.u8 q8, d21, d29 | ||
3427 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3428 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3429 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3430 | vld1.32 {d22}, [TMP2], STRIDE | ||
3431 | vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3432 | vadd.u16 q12, q12, q13 | ||
3433 | vld1.32 {d23}, [TMP2] | ||
3434 | vmull.u8 q9, d22, d28 | ||
3435 | asr TMP3, X, #16 | ||
3436 | add X, X, UX | ||
3437 | add TMP3, TOP, TMP3, lsl #2 | ||
3438 | asr TMP4, X, #16 | ||
3439 | add X, X, UX | ||
3440 | add TMP4, TOP, TMP4, lsl #2 | ||
3441 | vmlal.u8 q9, d23, d29 | ||
3442 | vld1.32 {d22}, [TMP3], STRIDE | ||
3443 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
3444 | vld1.32 {d23}, [TMP3] | ||
3445 | vmull.u8 q10, d22, d28 | ||
3446 | vmlal.u8 q10, d23, d29 | ||
3447 | vmovn.u16 d8, q0 | ||
3448 | vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS | ||
3449 | vmovn.u16 d9, q2 | ||
3450 | vmlsl.u16 q0, d16, d30 | ||
3451 | vmlal.u16 q0, d17, d30 | ||
3452 | pld [TMP4, PF_OFFS] | ||
3453 | vld1.32 {d16}, [TMP4], STRIDE | ||
3454 | vadd.u16 q12, q12, q13 | ||
3455 | vld1.32 {d17}, [TMP4] | ||
3456 | pld [TMP4, PF_OFFS] | ||
3457 | vmull.u8 q11, d16, d28 | ||
3458 | vmlal.u8 q11, d17, d29 | ||
3459 | vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS | ||
3460 | vmlsl.u16 q1, d18, d31 | ||
3461 | .endm | ||
3462 | |||
3463 | .macro bilinear_interpolate_eight_pixels_8888_0565_tail | ||
3464 | vmlal.u16 q1, d19, d31 | ||
3465 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
3466 | vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS | ||
3467 | vmlsl.u16 q2, d20, d30 | ||
3468 | vmlal.u16 q2, d21, d30 | ||
3469 | vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS | ||
3470 | vmlsl.u16 q3, d22, d31 | ||
3471 | vmlal.u16 q3, d23, d31 | ||
3472 | vadd.u16 q12, q12, q13 | ||
3473 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3474 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3475 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3476 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
3477 | vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3478 | vmovn.u16 d10, q0 | ||
3479 | vmovn.u16 d11, q2 | ||
3480 | vadd.u16 q12, q12, q13 | ||
3481 | |||
3482 | vuzp.u8 d8, d9 | ||
3483 | vuzp.u8 d10, d11 | ||
3484 | vuzp.u8 d9, d11 | ||
3485 | vuzp.u8 d8, d10 | ||
3486 | vshll.u8 q6, d9, #8 | ||
3487 | vshll.u8 q5, d10, #8 | ||
3488 | vshll.u8 q7, d8, #8 | ||
3489 | vsri.u16 q5, q6, #5 | ||
3490 | vsri.u16 q5, q7, #11 | ||
3491 | vst1.32 {d10, d11}, [OUT, :128]! | ||
3492 | .endm | ||
3493 | |||
3494 | .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head | ||
3495 | asr TMP1, X, #16 | ||
3496 | add X, X, UX | ||
3497 | add TMP1, TOP, TMP1, lsl #2 | ||
3498 | asr TMP2, X, #16 | ||
3499 | add X, X, UX | ||
3500 | add TMP2, TOP, TMP2, lsl #2 | ||
3501 | vmlal.u16 q1, d19, d31 | ||
3502 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
3503 | vuzp.u8 d8, d9 | ||
3504 | vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS | ||
3505 | vmlsl.u16 q2, d20, d30 | ||
3506 | vmlal.u16 q2, d21, d30 | ||
3507 | vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS | ||
3508 | vld1.32 {d20}, [TMP1], STRIDE | ||
3509 | vmlsl.u16 q3, d22, d31 | ||
3510 | vmlal.u16 q3, d23, d31 | ||
3511 | vld1.32 {d21}, [TMP1] | ||
3512 | vmull.u8 q8, d20, d28 | ||
3513 | vmlal.u8 q8, d21, d29 | ||
3514 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3515 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3516 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3517 | vld1.32 {d22}, [TMP2], STRIDE | ||
3518 | vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3519 | vadd.u16 q12, q12, q13 | ||
3520 | vld1.32 {d23}, [TMP2] | ||
3521 | vmull.u8 q9, d22, d28 | ||
3522 | asr TMP3, X, #16 | ||
3523 | add X, X, UX | ||
3524 | add TMP3, TOP, TMP3, lsl #2 | ||
3525 | asr TMP4, X, #16 | ||
3526 | add X, X, UX | ||
3527 | add TMP4, TOP, TMP4, lsl #2 | ||
3528 | vmlal.u8 q9, d23, d29 | ||
3529 | vld1.32 {d22}, [TMP3], STRIDE | ||
3530 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
3531 | vld1.32 {d23}, [TMP3] | ||
3532 | vmull.u8 q10, d22, d28 | ||
3533 | vmlal.u8 q10, d23, d29 | ||
3534 | vmovn.u16 d10, q0 | ||
3535 | vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS | ||
3536 | vmovn.u16 d11, q2 | ||
3537 | vmlsl.u16 q0, d16, d30 | ||
3538 | vmlal.u16 q0, d17, d30 | ||
3539 | pld [TMP4, PF_OFFS] | ||
3540 | vld1.32 {d16}, [TMP4], STRIDE | ||
3541 | vadd.u16 q12, q12, q13 | ||
3542 | vld1.32 {d17}, [TMP4] | ||
3543 | pld [TMP4, PF_OFFS] | ||
3544 | vmull.u8 q11, d16, d28 | ||
3545 | vmlal.u8 q11, d17, d29 | ||
3546 | vuzp.u8 d10, d11 | ||
3547 | vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS | ||
3548 | vmlsl.u16 q1, d18, d31 | ||
3549 | |||
3550 | asr TMP1, X, #16 | ||
3551 | add X, X, UX | ||
3552 | add TMP1, TOP, TMP1, lsl #2 | ||
3553 | asr TMP2, X, #16 | ||
3554 | add X, X, UX | ||
3555 | add TMP2, TOP, TMP2, lsl #2 | ||
3556 | vmlal.u16 q1, d19, d31 | ||
3557 | vuzp.u8 d9, d11 | ||
3558 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
3559 | vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS | ||
3560 | vuzp.u8 d8, d10 | ||
3561 | vmlsl.u16 q2, d20, d30 | ||
3562 | vmlal.u16 q2, d21, d30 | ||
3563 | vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS | ||
3564 | vld1.32 {d20}, [TMP1], STRIDE | ||
3565 | vmlsl.u16 q3, d22, d31 | ||
3566 | vmlal.u16 q3, d23, d31 | ||
3567 | vld1.32 {d21}, [TMP1] | ||
3568 | vmull.u8 q8, d20, d28 | ||
3569 | vmlal.u8 q8, d21, d29 | ||
3570 | vshll.u8 q6, d9, #8 | ||
3571 | vshll.u8 q5, d10, #8 | ||
3572 | vshll.u8 q7, d8, #8 | ||
3573 | vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3574 | vsri.u16 q5, q6, #5 | ||
3575 | vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3576 | vsri.u16 q5, q7, #11 | ||
3577 | vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3578 | vld1.32 {d22}, [TMP2], STRIDE | ||
3579 | vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) | ||
3580 | vadd.u16 q12, q12, q13 | ||
3581 | vld1.32 {d23}, [TMP2] | ||
3582 | vmull.u8 q9, d22, d28 | ||
3583 | asr TMP3, X, #16 | ||
3584 | add X, X, UX | ||
3585 | add TMP3, TOP, TMP3, lsl #2 | ||
3586 | asr TMP4, X, #16 | ||
3587 | add X, X, UX | ||
3588 | add TMP4, TOP, TMP4, lsl #2 | ||
3589 | vmlal.u8 q9, d23, d29 | ||
3590 | vld1.32 {d22}, [TMP3], STRIDE | ||
3591 | vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) | ||
3592 | vld1.32 {d23}, [TMP3] | ||
3593 | vmull.u8 q10, d22, d28 | ||
3594 | vmlal.u8 q10, d23, d29 | ||
3595 | vmovn.u16 d8, q0 | ||
3596 | vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS | ||
3597 | vmovn.u16 d9, q2 | ||
3598 | vmlsl.u16 q0, d16, d30 | ||
3599 | vmlal.u16 q0, d17, d30 | ||
3600 | pld [TMP4, PF_OFFS] | ||
3601 | vld1.32 {d16}, [TMP4], STRIDE | ||
3602 | vadd.u16 q12, q12, q13 | ||
3603 | vld1.32 {d17}, [TMP4] | ||
3604 | pld [TMP4, PF_OFFS] | ||
3605 | vmull.u8 q11, d16, d28 | ||
3606 | vmlal.u8 q11, d17, d29 | ||
3607 | vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS | ||
3608 | vst1.32 {d10, d11}, [OUT, :128]! | ||
3609 | vmlsl.u16 q1, d18, d31 | ||
3610 | .endm | ||
3611 | /*****************************************************************************/ | ||
3612 | |||
3613 | generate_bilinear_scanline_func \ | 3700 | generate_bilinear_scanline_func \ |
3614 | pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ | 3701 | pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ |
3615 | 2, 2, 28, BILINEAR_FLAG_UNROLL_4 | 3702 | 2, 2, 28, BILINEAR_FLAG_UNROLL_4 |
3616 | 3703 | ||
3617 | generate_bilinear_scanline_func \ | 3704 | generate_bilinear_scanline_func \ |
diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h index 6a5a5fe..97cde5d 100644 --- a/pixman/pixman-arma64-neon-asm.h +++ b/pixman/pixman-arma64-neon-asm.h | |||
@@ -36,11 +36,11 @@ | |||
36 | * scheduling | 36 | * scheduling |
37 | * | 37 | * |
38 | * The user of this macro has to provide some configuration parameters | 38 | * The user of this macro has to provide some configuration parameters |
39 | * (bit depths for the images, prefetch distance, etc.) and a set of | 39 | * (bit depths for the images, prefetch distance, etc.) and a set of |
40 | * macros, which should implement basic code chunks responsible for | 40 | * macros, which should implement basic code chunks responsible for |
41 | * pixel processing. See 'pixman-arm-neon-asm.S' file for the usage | 41 | * pixel processing. See 'pixman-arma64-neon-asm.S' file for the usage |
42 | * examples. | 42 | * examples. |
43 | * | 43 | * |
44 | * TODO: | 44 | * TODO: |
45 | * - try overlapped pixel method (from Ian Rickards) when processing | 45 | * - try overlapped pixel method (from Ian Rickards) when processing |
46 | * exactly two blocks of pixels | 46 | * exactly two blocks of pixels |
@@ -54,16 +54,10 @@ | |||
54 | .set FLAG_DST_WRITEONLY, 0 | 54 | .set FLAG_DST_WRITEONLY, 0 |
55 | .set FLAG_DST_READWRITE, 1 | 55 | .set FLAG_DST_READWRITE, 1 |
56 | .set FLAG_DEINTERLEAVE_32BPP, 2 | 56 | .set FLAG_DEINTERLEAVE_32BPP, 2 |
57 | 57 | ||
58 | /* | 58 | /* |
59 | * Offset in stack where mask and source pointer/stride can be accessed | ||
60 | * from 'init' macro. This is useful for doing special handling for solid mask. | ||
61 | */ | ||
62 | .set ARGS_STACK_OFFSET, 40 | ||
63 | |||
64 | /* | ||
65 | * Constants for selecting preferable prefetch type. | 59 | * Constants for selecting preferable prefetch type. |
66 | */ | 60 | */ |
67 | .set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ | 61 | .set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ |
68 | .set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ | 62 | .set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ |
69 | .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ | 63 | .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ |
@@ -72,121 +66,129 @@ | |||
72 | * Definitions of supplementary pixld/pixst macros (for partial load/store of | 66 | * Definitions of supplementary pixld/pixst macros (for partial load/store of |
73 | * pixel data). | 67 | * pixel data). |
74 | */ | 68 | */ |
75 | 69 | ||
76 | .macro pixldst1 op, elem_size, reg1, mem_operand, abits | 70 | .macro pixldst1 op, elem_size, reg1, mem_operand, abits |
77 | .if abits > 0 | 71 | op {v&reg1&.&elem_size}, [&mem_operand&], #8 |
78 | op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]! | ||
79 | .else | ||
80 | op&.&elem_size {d&reg1}, [&mem_operand&]! | ||
81 | .endif | ||
82 | .endm | 72 | .endm |
83 | 73 | ||
84 | .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits | 74 | .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits |
85 | .if abits > 0 | 75 | op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16 |
86 | op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]! | ||
87 | .else | ||
88 | op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]! | ||
89 | .endif | ||
90 | .endm | 76 | .endm |
91 | 77 | ||
92 | .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits | 78 | .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits |
93 | .if abits > 0 | 79 | op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32 |
94 | op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]! | ||
95 | .else | ||
96 | op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]! | ||
97 | .endif | ||
98 | .endm | 80 | .endm |
99 | 81 | ||
100 | .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits | 82 | .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes |
101 | op&.&elem_size {d&reg1[idx]}, [&mem_operand&]! | 83 | op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes& |
102 | .endm | 84 | .endm |
103 | 85 | ||
104 | .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand | 86 | .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand |
105 | op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]! | 87 | op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24 |
106 | .endm | 88 | .endm |
107 | 89 | ||
108 | .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand | 90 | .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand |
109 | op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]! | 91 | op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3 |
110 | .endm | 92 | .endm |
111 | 93 | ||
112 | .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits | 94 | .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits |
113 | .if numbytes == 32 | 95 | .if numbytes == 32 |
114 | pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ | 96 | .if elem_size==32 |
97 | pixldst4 op, 2s, %(basereg+4), %(basereg+5), \ | ||
98 | %(basereg+6), %(basereg+7), mem_operand, abits | ||
99 | .elseif elem_size==16 | ||
100 | pixldst4 op, 4h, %(basereg+4), %(basereg+5), \ | ||
101 | %(basereg+6), %(basereg+7), mem_operand, abits | ||
102 | .else | ||
103 | pixldst4 op, 8b, %(basereg+4), %(basereg+5), \ | ||
115 | %(basereg+6), %(basereg+7), mem_operand, abits | 104 | %(basereg+6), %(basereg+7), mem_operand, abits |
105 | .endif | ||
116 | .elseif numbytes == 16 | 106 | .elseif numbytes == 16 |
117 | pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits | 107 | .if elem_size==32 |
108 | pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits | ||
109 | .elseif elem_size==16 | ||
110 | pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits | ||
111 | .else | ||
112 | pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits | ||
113 | .endif | ||
118 | .elseif numbytes == 8 | 114 | .elseif numbytes == 8 |
119 | pixldst1 op, elem_size, %(basereg+1), mem_operand, abits | 115 | .if elem_size==32 |
116 | pixldst1 op, 2s, %(basereg+1), mem_operand, abits | ||
117 | .elseif elem_size==16 | ||
118 | pixldst1 op, 4h, %(basereg+1), mem_operand, abits | ||
119 | .else | ||
120 | pixldst1 op, 8b, %(basereg+1), mem_operand, abits | ||
121 | .endif | ||
120 | .elseif numbytes == 4 | 122 | .elseif numbytes == 4 |
121 | .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) | 123 | .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) |
122 | pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits | 124 | pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4 |
123 | .elseif elem_size == 16 | 125 | .elseif elem_size == 16 |
124 | pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits | 126 | pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2 |
125 | pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits | 127 | pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2 |
126 | .else | 128 | .else |
127 | pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits | 129 | pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1 |
128 | pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits | 130 | pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1 |
129 | pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits | 131 | pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1 |
130 | pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits | 132 | pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1 |
131 | .endif | 133 | .endif |
132 | .elseif numbytes == 2 | 134 | .elseif numbytes == 2 |
133 | .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) | 135 | .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) |
134 | pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits | 136 | pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2 |
135 | .else | 137 | .else |
136 | pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits | 138 | pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1 |
137 | pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits | 139 | pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1 |
138 | .endif | 140 | .endif |
139 | .elseif numbytes == 1 | 141 | .elseif numbytes == 1 |
140 | pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits | 142 | pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1 |
141 | .else | 143 | .else |
142 | .error "unsupported size: numbytes" | 144 | .error "unsupported size: numbytes" |
143 | .endif | 145 | .endif |
144 | .endm | 146 | .endm |
145 | 147 | ||
146 | .macro pixld numpix, bpp, basereg, mem_operand, abits=0 | 148 | .macro pixld numpix, bpp, basereg, mem_operand, abits=0 |
147 | .if bpp > 0 | 149 | .if bpp > 0 |
148 | .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) | 150 | .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) |
149 | pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ | 151 | pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \ |
150 | %(basereg+6), %(basereg+7), mem_operand, abits | 152 | %(basereg+6), %(basereg+7), mem_operand, abits |
151 | .elseif (bpp == 24) && (numpix == 8) | 153 | .elseif (bpp == 24) && (numpix == 8) |
152 | pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand | 154 | pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand |
153 | .elseif (bpp == 24) && (numpix == 4) | 155 | .elseif (bpp == 24) && (numpix == 4) |
154 | pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand | 156 | pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand |
155 | pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand | 157 | pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand |
156 | pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand | 158 | pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand |
157 | pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand | 159 | pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand |
158 | .elseif (bpp == 24) && (numpix == 2) | 160 | .elseif (bpp == 24) && (numpix == 2) |
159 | pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand | 161 | pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand |
160 | pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand | 162 | pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand |
161 | .elseif (bpp == 24) && (numpix == 1) | 163 | .elseif (bpp == 24) && (numpix == 1) |
162 | pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand | 164 | pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand |
163 | .else | 165 | .else |
164 | pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits | 166 | pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits |
165 | .endif | 167 | .endif |
166 | .endif | 168 | .endif |
167 | .endm | 169 | .endm |
168 | 170 | ||
169 | .macro pixst numpix, bpp, basereg, mem_operand, abits=0 | 171 | .macro pixst numpix, bpp, basereg, mem_operand, abits=0 |
170 | .if bpp > 0 | 172 | .if bpp > 0 |
171 | .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) | 173 | .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) |
172 | pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ | 174 | pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \ |
173 | %(basereg+6), %(basereg+7), mem_operand, abits | 175 | %(basereg+6), %(basereg+7), mem_operand, abits |
174 | .elseif (bpp == 24) && (numpix == 8) | 176 | .elseif (bpp == 24) && (numpix == 8) |
175 | pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand | 177 | pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand |
176 | .elseif (bpp == 24) && (numpix == 4) | 178 | .elseif (bpp == 24) && (numpix == 4) |
177 | pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand | 179 | pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand |
178 | pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand | 180 | pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand |
179 | pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand | 181 | pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand |
180 | pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand | 182 | pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand |
181 | .elseif (bpp == 24) && (numpix == 2) | 183 | .elseif (bpp == 24) && (numpix == 2) |
182 | pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand | 184 | pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand |
183 | pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand | 185 | pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand |
184 | .elseif (bpp == 24) && (numpix == 1) | 186 | .elseif (bpp == 24) && (numpix == 1) |
185 | pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand | 187 | pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand |
186 | .else | 188 | .else |
187 | pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits | 189 | pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits |
188 | .endif | 190 | .endif |
189 | .endif | 191 | .endif |
190 | .endm | 192 | .endm |
191 | 193 | ||
192 | .macro pixld_a numpix, bpp, basereg, mem_operand | 194 | .macro pixld_a numpix, bpp, basereg, mem_operand |
@@ -211,89 +213,114 @@ | |||
211 | */ | 213 | */ |
212 | .macro pixld1_s elem_size, reg1, mem_operand | 214 | .macro pixld1_s elem_size, reg1, mem_operand |
213 | .if elem_size == 16 | 215 | .if elem_size == 16 |
214 | asr TMP1, VX, #16 | 216 | asr TMP1, VX, #16 |
215 | adds VX, VX, UNIT_X | 217 | adds VX, VX, UNIT_X |
216 | 5: subpls VX, VX, SRC_WIDTH_FIXED | 218 | bmi 55f |
219 | 5: subs VX, VX, SRC_WIDTH_FIXED | ||
217 | bpl 5b | 220 | bpl 5b |
218 | add TMP1, mem_operand, TMP1, lsl #1 | 221 | 55: |
222 | lsl DUMMY, TMP1, #1 | ||
223 | add TMP1, mem_operand, DUMMY | ||
219 | asr TMP2, VX, #16 | 224 | asr TMP2, VX, #16 |
220 | adds VX, VX, UNIT_X | 225 | adds VX, VX, UNIT_X |
221 | 5: subpls VX, VX, SRC_WIDTH_FIXED | 226 | bmi 55f |
227 | 5: subs VX, VX, SRC_WIDTH_FIXED | ||
222 | bpl 5b | 228 | bpl 5b |
223 | add TMP2, mem_operand, TMP2, lsl #1 | 229 | 55: |
224 | vld1.16 {d&reg1&[0]}, [TMP1, :16] | 230 | lsl DUMMY, TMP2, #1 |
231 | add TMP2, mem_operand, DUMMY | ||
232 | ld1 {v&reg1&.h}[0], [TMP1] | ||
225 | asr TMP1, VX, #16 | 233 | asr TMP1, VX, #16 |
226 | adds VX, VX, UNIT_X | 234 | adds VX, VX, UNIT_X |
227 | 5: subpls VX, VX, SRC_WIDTH_FIXED | 235 | bmi 55f |
236 | 5: subs VX, VX, SRC_WIDTH_FIXED | ||
228 | bpl 5b | 237 | bpl 5b |
229 | add TMP1, mem_operand, TMP1, lsl #1 | 238 | 55: |
230 | vld1.16 {d&reg1&[1]}, [TMP2, :16] | 239 | lsl DUMMY, TMP1, #1 |
240 | add TMP1, mem_operand, DUMMY | ||
241 | ld1 {v&reg1&.h}[1], [TMP2] | ||
231 | asr TMP2, VX, #16 | 242 | asr TMP2, VX, #16 |
232 | adds VX, VX, UNIT_X | 243 | adds VX, VX, UNIT_X |
233 | 5: subpls VX, VX, SRC_WIDTH_FIXED | 244 | bmi 55f |
245 | 5: subs VX, VX, SRC_WIDTH_FIXED | ||
234 | bpl 5b | 246 | bpl 5b |
235 | add TMP2, mem_operand, TMP2, lsl #1 | 247 | 55: |
236 | vld1.16 {d&reg1&[2]}, [TMP1, :16] | 248 | lsl DUMMY, TMP2, #1 |
237 | vld1.16 {d&reg1&[3]}, [TMP2, :16] | 249 | add TMP2, mem_operand, DUMMY |
250 | ld1 {v&reg1&.h}[2], [TMP1] | ||
251 | ld1 {v&reg1&.h}[3], [TMP2] | ||
238 | .elseif elem_size == 32 | 252 | .elseif elem_size == 32 |
239 | asr TMP1, VX, #16 | 253 | asr TMP1, VX, #16 |
240 | adds VX, VX, UNIT_X | 254 | adds VX, VX, UNIT_X |
241 | 5: subpls VX, VX, SRC_WIDTH_FIXED | 255 | bmi 55f |
256 | 5: subs VX, VX, SRC_WIDTH_FIXED | ||
242 | bpl 5b | 257 | bpl 5b |
243 | add TMP1, mem_operand, TMP1, lsl #2 | 258 | 55: |
259 | lsl DUMMY, TMP1, #2 | ||
260 | add TMP1, mem_operand, DUMMY | ||
244 | asr TMP2, VX, #16 | 261 | asr TMP2, VX, #16 |
245 | adds VX, VX, UNIT_X | 262 | adds VX, VX, UNIT_X |
246 | 5: subpls VX, VX, SRC_WIDTH_FIXED | 263 | bmi 55f |
264 | 5: subs VX, VX, SRC_WIDTH_FIXED | ||
247 | bpl 5b | 265 | bpl 5b |
248 | add TMP2, mem_operand, TMP2, lsl #2 | 266 | 55: |
249 | vld1.32 {d&reg1&[0]}, [TMP1, :32] | 267 | lsl DUMMY, TMP2, #2 |
250 | vld1.32 {d&reg1&[1]}, [TMP2, :32] | 268 | add TMP2, mem_operand, DUMMY |
269 | ld1 {v&reg1&.s}[0], [TMP1] | ||
270 | ld1 {v&reg1&.s}[1], [TMP2] | ||
251 | .else | 271 | .else |
252 | .error "unsupported" | 272 | .error "unsupported" |
253 | .endif | 273 | .endif |
254 | .endm | 274 | .endm |
255 | 275 | ||
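pixld1_s walks the source in 16.16 fixed point: the pixel index is the integer part of VX, VX is then advanced by UNIT_X, and for NORMAL repeat it is wrapped by repeatedly subtracting SRC_WIDTH_FIXED while it stays non-negative (the bmi/subs/bpl sequence that replaces the A32 conditional subpls). Roughly, per fetched 16bpp pixel (a sketch only; the sign conventions for VX and the base pointer are established by the caller in pixman-arm-common.h and are not shown here):

    #include <stdint.h>

    static const uint16_t *fetch_one_0565(const uint16_t *src,
                                          int32_t *vx,             /* 16.16 */
                                          int32_t unit_x,          /* 16.16 */
                                          int32_t src_width_fixed) /* > 0 */
    {
        int32_t x = *vx >> 16;        /* asr TMP1, VX, #16 */
        *vx += unit_x;                /* adds VX, VX, UNIT_X */
        while (*vx >= 0)              /* bmi 55f; 5: subs ...; bpl 5b */
            *vx -= src_width_fixed;
        return src + x;               /* lsl DUMMY, TMP1, #1; add TMP1, ... */
    }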
256 | .macro pixld2_s elem_size, reg1, reg2, mem_operand | 276 | .macro pixld2_s elem_size, reg1, reg2, mem_operand |
257 | .if 0 /* elem_size == 32 */ | 277 | .if 0 /* elem_size == 32 */ |
258 | asr TMP1, VX, #16 | 278 | mov TMP1, VX, asr #16 |
259 | add VX, VX, UNIT_X, lsl #1 | 279 | add VX, VX, UNIT_X, asl #1 |
260 | add TMP1, mem_operand, TMP1, lsl #2 | 280 | add TMP1, mem_operand, TMP1, asl #2 |
261 | asr TMP2, VX, #16 | 281 | mov TMP2, VX, asr #16 |
262 | sub VX, VX, UNIT_X | 282 | sub VX, VX, UNIT_X |
263 | add TMP2, mem_operand, TMP2, lsl #2 | 283 | add TMP2, mem_operand, TMP2, asl #2 |
264 | vld1.32 {d&reg1&[0]}, [TMP1, :32] | 284 | ld1 {v&reg1&.s}[0], [TMP1] |
265 | asr TMP1, VX, #16 | 285 | mov TMP1, VX, asr #16 |
266 | add VX, VX, UNIT_X, lsl #1 | 286 | add VX, VX, UNIT_X, asl #1 |
267 | add TMP1, mem_operand, TMP1, lsl #2 | 287 | add TMP1, mem_operand, TMP1, asl #2 |
268 | vld1.32 {d&reg2&[0]}, [TMP2, :32] | 288 | ld1 {v&reg2&.s}[0], [TMP2, :32] |
269 | asr TMP2, VX, #16 | 289 | mov TMP2, VX, asr #16 |
270 | add VX, VX, UNIT_X | 290 | add VX, VX, UNIT_X |
271 | add TMP2, mem_operand, TMP2, lsl #2 | 291 | add TMP2, mem_operand, TMP2, asl #2 |
272 | vld1.32 {d&reg1&[1]}, [TMP1, :32] | 292 | ld1 {v&reg1&.s}[1], [TMP1] |
273 | vld1.32 {d&reg2&[1]}, [TMP2, :32] | 293 | ld1 {v&reg2&.s}[1], [TMP2] |
274 | .else | 294 | .else |
275 | pixld1_s elem_size, reg1, mem_operand | 295 | pixld1_s elem_size, reg1, mem_operand |
276 | pixld1_s elem_size, reg2, mem_operand | 296 | pixld1_s elem_size, reg2, mem_operand |
277 | .endif | 297 | .endif |
278 | .endm | 298 | .endm |
279 | 299 | ||
280 | .macro pixld0_s elem_size, reg1, idx, mem_operand | 300 | .macro pixld0_s elem_size, reg1, idx, mem_operand |
281 | .if elem_size == 16 | 301 | .if elem_size == 16 |
282 | asr TMP1, VX, #16 | 302 | asr TMP1, VX, #16 |
283 | adds VX, VX, UNIT_X | 303 | adds VX, VX, UNIT_X |
284 | 5: subpls VX, VX, SRC_WIDTH_FIXED | 304 | bmi 55f |
305 | 5: subs VX, VX, SRC_WIDTH_FIXED | ||
285 | bpl 5b | 306 | bpl 5b |
286 | add TMP1, mem_operand, TMP1, lsl #1 | 307 | 55: |
287 | vld1.16 {d&reg1&[idx]}, [TMP1, :16] | 308 | lsl DUMMY, TMP1, #1 |
309 | add TMP1, mem_operand, DUMMY | ||
310 | ld1 {v&reg1&.h}[idx], [TMP1] | ||
288 | .elseif elem_size == 32 | 311 | .elseif elem_size == 32 |
289 | asr TMP1, VX, #16 | 312 | asr DUMMY, VX, #16 |
313 | mov TMP1, DUMMY | ||
290 | adds VX, VX, UNIT_X | 314 | adds VX, VX, UNIT_X |
291 | 5: subpls VX, VX, SRC_WIDTH_FIXED | 315 | bmi 55f |
316 | 5: subs VX, VX, SRC_WIDTH_FIXED | ||
292 | bpl 5b | 317 | bpl 5b |
293 | add TMP1, mem_operand, TMP1, lsl #2 | 318 | 55: |
294 | vld1.32 {d&reg1&[idx]}, [TMP1, :32] | 319 | lsl DUMMY, TMP1, #2 |
320 | add TMP1, mem_operand, DUMMY | ||
321 | ld1 {v&reg1&.s}[idx], [TMP1] | ||
295 | .endif | 322 | .endif |
296 | .endm | 323 | .endm |
297 | 324 | ||
298 | .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand | 325 | .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand |
299 | .if numbytes == 32 | 326 | .if numbytes == 32 |
@@ -335,15 +362,23 @@ | |||
335 | pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand | 362 | pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand |
336 | .endif | 363 | .endif |
337 | .endm | 364 | .endm |
338 | 365 | ||
339 | .macro vuzp8 reg1, reg2 | 366 | .macro vuzp8 reg1, reg2 |
340 | vuzp.8 d&reg1, d&reg2 | 367 | umov DUMMY, v16.d[0] |
368 | uzp1 v16.8b, v&reg1&.8b, v&reg2&.8b | ||
369 | uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b | ||
370 | mov v&reg1&.8b, v16.8b | ||
371 | mov v16.d[0], DUMMY | ||
341 | .endm | 372 | .endm |
342 | 373 | ||
343 | .macro vzip8 reg1, reg2 | 374 | .macro vzip8 reg1, reg2 |
344 | vzip.8 d&reg1, d&reg2 | 375 | umov DUMMY, v16.d[0] |
376 | zip1 v16.8b, v&reg1&.8b, v&reg2&.8b | ||
377 | zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b | ||
378 | mov v&reg1&.8b, v16.8b | ||
379 | mov v16.d[0], DUMMY | ||
345 | .endm | 380 | .endm |
346 | 381 | ||
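A32 vuzp.8/vzip.8 permute two D registers in place; the A64 uzp1/uzp2 and zip1/zip2 instructions are three-operand, so the macros above emulate the in-place form through v16, first parking v16's low half in DUMMY so the scratch register survives. In C terms, vuzp8 performs (an illustrative model):

    #include <stdint.h>

    /* uzp1 gathers the even-indexed bytes of a:b, uzp2 the odd ones */
    static void vuzp8_model(uint8_t a[8], uint8_t b[8])
    {
        uint8_t even[8], odd[8];
        for (int i = 0; i < 8; i++) {
            even[i] = (i < 4) ? a[2 * i]     : b[2 * (i - 4)];
            odd[i]  = (i < 4) ? a[2 * i + 1] : b[2 * (i - 4) + 1];
        }
        for (int i = 0; i < 8; i++) {
            a[i] = even[i];   /* uzp1 into v16, then mov v&reg1, v16 */
            b[i] = odd[i];    /* uzp2 writes v&reg2 directly */
        }
    }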
347 | /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ | 382 | /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ |
348 | .macro pixdeinterleave bpp, basereg | 383 | .macro pixdeinterleave bpp, basereg |
349 | .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) | 384 | .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) |
@@ -398,53 +433,65 @@ | |||
398 | .endif | 433 | .endif |
399 | .endm | 434 | .endm |
400 | 435 | ||
401 | .macro cache_preload std_increment, boost_increment | 436 | .macro cache_preload std_increment, boost_increment |
402 | .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) | 437 | .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) |
403 | .if regs_shortage | ||
404 | PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ | ||
405 | .endif | ||
406 | .if std_increment != 0 | 438 | .if std_increment != 0 |
407 | PF add PF_X, PF_X, #std_increment | 439 | PF add PF_X, PF_X, #std_increment |
408 | .endif | 440 | .endif |
409 | PF tst PF_CTL, #0xF | 441 | PF tst PF_CTL, #0xF |
410 | PF addne PF_X, PF_X, #boost_increment | 442 | PF beq 71f |
411 | PF subne PF_CTL, PF_CTL, #1 | 443 | PF add PF_X, PF_X, #boost_increment |
444 | PF sub PF_CTL, PF_CTL, #1 | ||
445 | 71: | ||
412 | PF cmp PF_X, ORIG_W | 446 | PF cmp PF_X, ORIG_W |
413 | .if src_bpp_shift >= 0 | 447 | .if src_bpp_shift >= 0 |
414 | PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] | 448 | PF lsl DUMMY, PF_X, #src_bpp_shift |
449 | PF prfm pldl2strm, [PF_SRC, DUMMY] | ||
415 | .endif | 450 | .endif |
416 | .if dst_r_bpp != 0 | 451 | .if dst_r_bpp != 0 |
417 | PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] | 452 | PF lsl DUMMY, PF_X, #dst_bpp_shift |
453 | PF prfm pldl2strm, [PF_DST, DUMMY] | ||
418 | .endif | 454 | .endif |
419 | .if mask_bpp_shift >= 0 | 455 | .if mask_bpp_shift >= 0 |
420 | PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] | 456 | PF lsl DUMMY, PF_X, #mask_bpp_shift |
421 | .endif | 457 | PF prfm pldl2strm, [PF_MASK, DUMMY] |
422 | PF subge PF_X, PF_X, ORIG_W | 458 | .endif |
423 | PF subges PF_CTL, PF_CTL, #0x10 | 459 | PF ble 71f |
460 | PF sub PF_X, PF_X, ORIG_W | ||
461 | PF subs PF_CTL, PF_CTL, #0x10 | ||
462 | 71: | ||
463 | PF ble 72f | ||
424 | .if src_bpp_shift >= 0 | 464 | .if src_bpp_shift >= 0 |
425 | PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! | 465 | PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift |
466 | PF ldrsb DUMMY, [PF_SRC, DUMMY] | ||
467 | PF add PF_SRC, PF_SRC, #1 | ||
426 | .endif | 468 | .endif |
427 | .if dst_r_bpp != 0 | 469 | .if dst_r_bpp != 0 |
428 | PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! | 470 | PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift |
471 | PF ldrsb DUMMY, [PF_DST, DUMMY] | ||
472 | PF add PF_DST, PF_DST, #1 | ||
429 | .endif | 473 | .endif |
430 | .if mask_bpp_shift >= 0 | 474 | .if mask_bpp_shift >= 0 |
431 | PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! | 475 | PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift |
476 | PF ldrsb DUMMY, [PF_MASK, DUMMY] | ||
477 | PF add PF_MASK, PF_MASK, #1 | ||
432 | .endif | 478 | .endif |
479 | 72: | ||
433 | .endif | 480 | .endif |
434 | .endm | 481 | .endm |
435 | 482 | ||
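The ADVANCED prefetcher keeps its state in a few registers: PF_X is the pixel position being prefetched, PF_CTL packs a remaining-scanline count in its upper bits and a prefetch-distance budget in its low nibble, and PF_SRC/PF_DST/PF_MASK track the scanlines being warmed. The A64 port replaces the A32 conditional forms (addne, subge, ldrgeb) with short branches, and uses prfm pldl2strm plus a one-byte ldrsb touch of the next scanline. A rough C model of one invocation for the source image (names mirror the symbolic registers; this is a sketch of the bookkeeping, not of the instruction scheduling):

    #include <stdint.h>

    static void cache_preload_model(int std_inc, int boost_inc,
                                    int32_t *PF_X, int32_t *PF_CTL,
                                    const uint8_t *PF_SRC, int32_t ORIG_W,
                                    int src_bpp_shift)
    {
        *PF_X += std_inc;
        if (*PF_CTL & 0xF) {          /* budget left in the low nibble */
            *PF_X += boost_inc;       /* prefetch a bit further ahead */
            *PF_CTL -= 1;
        }
        __builtin_prefetch(PF_SRC + ((int64_t)*PF_X << src_bpp_shift));
        if (*PF_X > ORIG_W) {         /* ran past the end of the scanline */
            *PF_X -= ORIG_W;
            *PF_CTL -= 0x10;          /* one scanline fewer left to prefetch */
            /* if scanlines remain, the assembly then ldrsb-touches one byte
               of the next scanline to pull its page into the TLB */
        }
    }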
436 | .macro cache_preload_simple | 483 | .macro cache_preload_simple |
437 | .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) | 484 | .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) |
438 | .if src_bpp > 0 | 485 | .if src_bpp > 0 |
439 | pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] | 486 | prfm pldl2strm, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] |
440 | .endif | 487 | .endif |
441 | .if dst_r_bpp > 0 | 488 | .if dst_r_bpp > 0 |
442 | pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] | 489 | prfm pldl2strm, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] |
443 | .endif | 490 | .endif |
444 | .if mask_bpp > 0 | 491 | .if mask_bpp > 0 |
445 | pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] | 492 | prfm pldl2strm, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] |
446 | .endif | 493 | .endif |
447 | .endif | 494 | .endif |
448 | .endm | 495 | .endm |
449 | 496 | ||
450 | .macro fetch_mask_pixblock | 497 | .macro fetch_mask_pixblock |
@@ -460,29 +507,28 @@ | |||
460 | .macro ensure_destination_ptr_alignment process_pixblock_head, \ | 507 | .macro ensure_destination_ptr_alignment process_pixblock_head, \ |
461 | process_pixblock_tail, \ | 508 | process_pixblock_tail, \ |
462 | process_pixblock_tail_head | 509 | process_pixblock_tail_head |
463 | .if dst_w_bpp != 24 | 510 | .if dst_w_bpp != 24 |
464 | tst DST_R, #0xF | 511 | tst DST_R, #0xF |
465 | beq 2f | 512 | beq 52f |
466 | |||
467 | .irp lowbit, 1, 2, 4, 8, 16 | 513 | .irp lowbit, 1, 2, 4, 8, 16 |
468 | local skip1 | 514 | local skip1 |
469 | .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) | 515 | .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) |
470 | .if lowbit < 16 /* we don't need more than 16-byte alignment */ | 516 | .if lowbit < 16 /* we don't need more than 16-byte alignment */ |
471 | tst DST_R, #lowbit | 517 | tst DST_R, #lowbit |
472 | beq 1f | 518 | beq 51f |
473 | .endif | 519 | .endif |
474 | pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC | 520 | pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC |
475 | pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK | 521 | pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK |
476 | .if dst_r_bpp > 0 | 522 | .if dst_r_bpp > 0 |
477 | pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R | 523 | pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R |
478 | .else | 524 | .else |
479 | add DST_R, DST_R, #lowbit | 525 | add DST_R, DST_R, #lowbit |
480 | .endif | 526 | .endif |
481 | PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) | 527 | PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) |
482 | sub W, W, #(lowbit * 8 / dst_w_bpp) | 528 | sub W, W, #(lowbit * 8 / dst_w_bpp) |
483 | 1: | 529 | 51: |
484 | .endif | 530 | .endif |
485 | .endr | 531 | .endr |
486 | pixdeinterleave src_bpp, src_basereg | 532 | pixdeinterleave src_bpp, src_basereg |
487 | pixdeinterleave mask_bpp, mask_basereg | 533 | pixdeinterleave mask_bpp, mask_basereg |
488 | pixdeinterleave dst_r_bpp, dst_r_basereg | 534 | pixdeinterleave dst_r_bpp, dst_r_basereg |
@@ -491,22 +537,23 @@ local skip1 | |||
491 | cache_preload 0, pixblock_size | 537 | cache_preload 0, pixblock_size |
492 | cache_preload_simple | 538 | cache_preload_simple |
493 | process_pixblock_tail | 539 | process_pixblock_tail |
494 | 540 | ||
495 | pixinterleave dst_w_bpp, dst_w_basereg | 541 | pixinterleave dst_w_bpp, dst_w_basereg |
542 | |||
496 | .irp lowbit, 1, 2, 4, 8, 16 | 543 | .irp lowbit, 1, 2, 4, 8, 16 |
497 | .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) | 544 | .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) |
498 | .if lowbit < 16 /* we don't need more than 16-byte alignment */ | 545 | .if lowbit < 16 /* we don't need more than 16-byte alignment */ |
499 | tst DST_W, #lowbit | 546 | tst DST_W, #lowbit |
500 | beq 1f | 547 | beq 51f |
501 | .endif | 548 | .endif |
502 | pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W | 549 | pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W |
503 | 1: | 550 | 51: |
504 | .endif | 551 | .endif |
505 | .endr | 552 | .endr |
506 | .endif | 553 | .endif |
507 | 2: | 554 | 52: |
508 | .endm | 555 | .endm |
509 | 556 | ||
510 | /* | 557 | /* |
511 | * Special code for processing up to (pixblock_size - 1) remaining | 558 | * Special code for processing up to (pixblock_size - 1) remaining |
512 | * trailing pixels. As SIMD processing performs operation on | 559 | * trailing pixels. As SIMD processing performs operation on |
@@ -526,26 +573,26 @@ local skip1 | |||
526 | dst_aligned_flag, \ | 573 | dst_aligned_flag, \ |
527 | process_pixblock_head, \ | 574 | process_pixblock_head, \ |
528 | process_pixblock_tail, \ | 575 | process_pixblock_tail, \ |
529 | process_pixblock_tail_head | 576 | process_pixblock_tail_head |
530 | tst W, #(pixblock_size - 1) | 577 | tst W, #(pixblock_size - 1) |
531 | beq 2f | 578 | beq 52f |
532 | .irp chunk_size, 16, 8, 4, 2, 1 | 579 | .irp chunk_size, 16, 8, 4, 2, 1 |
533 | .if pixblock_size > chunk_size | 580 | .if pixblock_size > chunk_size |
534 | tst W, #chunk_size | 581 | tst W, #chunk_size |
535 | beq 1f | 582 | beq 51f |
536 | pixld_src chunk_size, src_bpp, src_basereg, SRC | 583 | pixld_src chunk_size, src_bpp, src_basereg, SRC |
537 | pixld chunk_size, mask_bpp, mask_basereg, MASK | 584 | pixld chunk_size, mask_bpp, mask_basereg, MASK |
538 | .if dst_aligned_flag != 0 | 585 | .if dst_aligned_flag != 0 |
539 | pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R | 586 | pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R |
540 | .else | 587 | .else |
541 | pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R | 588 | pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R |
542 | .endif | 589 | .endif |
543 | .if cache_preload_flag != 0 | 590 | .if cache_preload_flag != 0 |
544 | PF add PF_X, PF_X, #chunk_size | 591 | PF add PF_X, PF_X, #chunk_size |
545 | .endif | 592 | .endif |
546 | 1: | 593 | 51: |
547 | .endif | 594 | .endif |
548 | .endr | 595 | .endr |
549 | pixdeinterleave src_bpp, src_basereg | 596 | pixdeinterleave src_bpp, src_basereg |
550 | pixdeinterleave mask_bpp, mask_basereg | 597 | pixdeinterleave mask_bpp, mask_basereg |
551 | pixdeinterleave dst_r_bpp, dst_r_basereg | 598 | pixdeinterleave dst_r_bpp, dst_r_basereg |
@@ -558,63 +605,62 @@ local skip1 | |||
558 | process_pixblock_tail | 605 | process_pixblock_tail |
559 | pixinterleave dst_w_bpp, dst_w_basereg | 606 | pixinterleave dst_w_bpp, dst_w_basereg |
560 | .irp chunk_size, 16, 8, 4, 2, 1 | 607 | .irp chunk_size, 16, 8, 4, 2, 1 |
561 | .if pixblock_size > chunk_size | 608 | .if pixblock_size > chunk_size |
562 | tst W, #chunk_size | 609 | tst W, #chunk_size |
563 | beq 1f | 610 | beq 51f |
564 | .if dst_aligned_flag != 0 | 611 | .if dst_aligned_flag != 0 |
565 | pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W | 612 | pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W |
566 | .else | 613 | .else |
567 | pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W | 614 | pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W |
568 | .endif | 615 | .endif |
569 | 1: | 616 | 51: |
570 | .endif | 617 | .endif |
571 | .endr | 618 | .endr |
572 | 2: | 619 | 52: |
573 | .endm | 620 | .endm |
574 | 621 | ||
575 | /* | 622 | /* |
576 | * Macro, which performs all the needed operations to switch to the next | 623 | * Macro, which performs all the needed operations to switch to the next |
577 | * scanline and start the next loop iteration unless all the scanlines | 624 | * scanline and start the next loop iteration unless all the scanlines |
578 | * are already processed. | 625 | * are already processed. |
579 | */ | 626 | */ |
580 | .macro advance_to_next_scanline start_of_loop_label | 627 | .macro advance_to_next_scanline start_of_loop_label |
581 | .if regs_shortage | ||
582 | ldrd W, [sp] /* load W and H (width and height) from stack */ | ||
583 | .else | ||
584 | mov W, ORIG_W | 628 | mov W, ORIG_W |
585 | .endif | 629 | lsl DUMMY, DST_STRIDE, #dst_bpp_shift |
586 | add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift | 630 | add DST_W, DST_W, DUMMY |
587 | .if src_bpp != 0 | 631 | .if src_bpp != 0 |
588 | add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift | 632 | lsl DUMMY, SRC_STRIDE, #src_bpp_shift |
633 | add SRC, SRC, DUMMY | ||
589 | .endif | 634 | .endif |
590 | .if mask_bpp != 0 | 635 | .if mask_bpp != 0 |
591 | add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift | 636 | lsl DUMMY, MASK_STRIDE, #mask_bpp_shift |
637 | add MASK, MASK, DUMMY | ||
592 | .endif | 638 | .endif |
593 | .if (dst_w_bpp != 24) | 639 | .if (dst_w_bpp != 24) |
594 | sub DST_W, DST_W, W, lsl #dst_bpp_shift | 640 | lsl DUMMY, W, #dst_bpp_shift |
641 | sub DST_W, DST_W, DUMMY | ||
595 | .endif | 642 | .endif |
596 | .if (src_bpp != 24) && (src_bpp != 0) | 643 | .if (src_bpp != 24) && (src_bpp != 0) |
597 | sub SRC, SRC, W, lsl #src_bpp_shift | 644 | lsl DUMMY, W, #src_bpp_shift |
645 | sub SRC, SRC, DUMMY | ||
598 | .endif | 646 | .endif |
599 | .if (mask_bpp != 24) && (mask_bpp != 0) | 647 | .if (mask_bpp != 24) && (mask_bpp != 0) |
600 | sub MASK, MASK, W, lsl #mask_bpp_shift | 648 | lsl DUMMY, W, #mask_bpp_shift |
649 | sub MASK, MASK, DUMMY | ||
601 | .endif | 650 | .endif |
602 | subs H, H, #1 | 651 | subs H, H, #1 |
603 | mov DST_R, DST_W | 652 | mov DST_R, DST_W |
604 | .if regs_shortage | ||
605 | str H, [sp, #4] /* save updated height to stack */ | ||
606 | .endif | ||
607 | bge start_of_loop_label | 653 | bge start_of_loop_label |
608 | .endm | 654 | .endm |
609 | 655 | ||
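advance_to_next_scanline only has to correct the pointers: after a scanline, DST_W (and SRC/MASK) have already moved W pixels to the right, so adding one stride and subtracting the width lands each pointer at the start of the next line; the 24bpp case skips the subtraction because those paths step their pointers differently. As C (a sketch; strides are in pixels, bpp_shift converts pixels to bytes, e.g. 2 for 32bpp):

    #include <stdint.h>

    static uint8_t *next_scanline(uint8_t *p, int64_t stride_px,
                                  int64_t w_px, int bpp_shift)
    {
        p += stride_px << bpp_shift;  /* lsl DUMMY, STRIDE, #shift; add */
        p -= w_px << bpp_shift;       /* undo the W pixels already consumed */
        return p;
    }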
610 | /* | 656 | /* |
611 | * Registers are allocated in the following way by default: | 657 | * Registers are allocated in the following way by default: |
612 | * d0, d1, d2, d3 - reserved for loading source pixel data | 658 | * v0, v1, v2, v3 - reserved for loading source pixel data |
613 | * d4, d5, d6, d7 - reserved for loading destination pixel data | 659 | * v4, v5, v6, v7 - reserved for loading destination pixel data |
614 | * d24, d25, d26, d27 - reserved for loading mask pixel data | 660 | * v24, v25, v26, v27 - reserved for loading mask pixel data |
615 | * d28, d29, d30, d31 - final destination pixel data for writeback to memory | 661 | * v28, v29, v30, v31 - final destination pixel data for writeback to memory |
616 | */ | 662 | */ |
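The calling convention explains most of the prologue churn in this hunk: on AArch64 the first eight arguments arrive in x0-x7, so width, height, dst/stride, src/stride and mask/stride are all in registers, the old A32 ldr ..., [sp, #40] argument fetches disappear, and the sxtw instructions below widen the 32-bit int parameters. The C-side prototype these functions implement has this general shape (per the pixman-arm-common.h binding macros; the exact parameter list varies with whether a mask is used, and the name is a placeholder):

    /* sketch of the prototype bound by the PIXMAN_ARM_BIND_FAST_PATH_*
     * macros; mask variants append uint32_t *mask / int32_t mask_stride
     * in x6/x7 */
    void pixman_composite_xxx_asm_neon (int32_t   w,           /* x0 */
                                        int32_t   h,           /* x1 */
                                        uint32_t *dst,         /* x2 */
                                        int32_t   dst_stride,  /* x3 */
                                        uint32_t *src,         /* x4 */
                                        int32_t   src_stride); /* x5 */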
617 | .macro generate_composite_function fname, \ | 663 | .macro generate_composite_function fname, \ |
618 | src_bpp_, \ | 664 | src_bpp_, \ |
619 | mask_bpp_, \ | 665 | mask_bpp_, \ |
620 | dst_w_bpp_, \ | 666 | dst_w_bpp_, \ |
@@ -630,12 +676,27 @@ local skip1 | |||
630 | dst_r_basereg_ = 4, \ | 676 | dst_r_basereg_ = 4, \ |
631 | src_basereg_ = 0, \ | 677 | src_basereg_ = 0, \ |
632 | mask_basereg_ = 24 | 678 | mask_basereg_ = 24 |
633 | 679 | ||
634 | pixman_asm_function fname | 680 | pixman_asm_function fname |
635 | 681 | stp x29, x30, [sp, -16]! | |
636 | push {r4-r12, lr} /* save all registers */ | 682 | mov x29, sp |
683 | sub sp, sp, 232 /* push all registers */ | ||
684 | sub x29, x29, 64 | ||
685 | st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 | ||
686 | st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 | ||
687 | stp x8, x9, [x29, -80] | ||
688 | stp x10, x11, [x29, -96] | ||
689 | stp x12, x13, [x29, -112] | ||
690 | stp x14, x15, [x29, -128] | ||
691 | stp x16, x17, [x29, -144] | ||
692 | stp x18, x19, [x29, -160] | ||
693 | stp x20, x21, [x29, -176] | ||
694 | stp x22, x23, [x29, -192] | ||
695 | stp x24, x25, [x29, -208] | ||
696 | stp x26, x27, [x29, -224] | ||
697 | str x28, [x29, -232] | ||
637 | 698 | ||
638 | /* | 699 | /* |
639 | * Select prefetch type for this function. If prefetch distance is | 700 | * Select prefetch type for this function. If prefetch distance is |
640 | * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch | 701 | * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch |
641 | * has to be used instead of ADVANCED. | 702 | * has to be used instead of ADVANCED. |
@@ -669,56 +730,40 @@ local skip1 | |||
669 | (src_basereg - pixblock_size * src_bpp / 64), SRC | 730 | (src_basereg - pixblock_size * src_bpp / 64), SRC |
670 | .endm | 731 | .endm |
671 | /* | 732 | /* |
672 | * Assign symbolic names to registers | 733 | * Assign symbolic names to registers |
673 | */ | 734 | */ |
674 | W .req r0 /* width (is updated during processing) */ | 735 | W .req x0 /* width (is updated during processing) */ |
675 | H .req r1 /* height (is updated during processing) */ | 736 | H .req x1 /* height (is updated during processing) */ |
676 | DST_W .req r2 /* destination buffer pointer for writes */ | 737 | DST_W .req x2 /* destination buffer pointer for writes */ |
677 | DST_STRIDE .req r3 /* destination image stride */ | 738 | DST_STRIDE .req x3 /* destination image stride */ |
678 | SRC .req r4 /* source buffer pointer */ | 739 | SRC .req x4 /* source buffer pointer */ |
679 | SRC_STRIDE .req r5 /* source image stride */ | 740 | SRC_STRIDE .req x5 /* source image stride */ |
680 | DST_R .req r6 /* destination buffer pointer for reads */ | 741 | MASK .req x6 /* mask pointer */ |
681 | 742 | MASK_STRIDE .req x7 /* mask stride */ | |
682 | MASK .req r7 /* mask pointer */ | 743 | |
683 | MASK_STRIDE .req r8 /* mask stride */ | 744 | DST_R .req x8 /* destination buffer pointer for reads */ |
684 | 745 | ||
685 | PF_CTL .req r9 /* combined lines counter and prefetch */ | 746 | PF_CTL .req x9 /* combined lines counter and prefetch */ |
686 | /* distance increment counter */ | 747 | /* distance increment counter */ |
687 | PF_X .req r10 /* pixel index in a scanline for current */ | 748 | PF_X .req x10 /* pixel index in a scanline for current */ |
688 | /* prefetch position */ | 749 | /* prefetch position */ |
689 | PF_SRC .req r11 /* pointer to source scanline start */ | 750 | PF_SRC .req x11 /* pointer to source scanline start */ |
690 | /* for prefetch purposes */ | 751 | /* for prefetch purposes */ |
691 | PF_DST .req r12 /* pointer to destination scanline start */ | 752 | PF_DST .req x12 /* pointer to destination scanline start */ |
692 | /* for prefetch purposes */ | 753 | /* for prefetch purposes */ |
693 | PF_MASK .req r14 /* pointer to mask scanline start */ | 754 | PF_MASK .req x13 /* pointer to mask scanline start */ |
694 | /* for prefetch purposes */ | 755 | /* for prefetch purposes */ |
695 | /* | 756 | |
696 | * Check whether we have enough registers for all the local variables. | 757 | ORIG_W .req x14 /* saved original width */ |
697 | * If we don't have enough registers, original width and height are | 758 | DUMMY .req x15 /* temporary register */ |
698 | * kept on top of stack (and 'regs_shortage' variable is set to indicate | 759 | |
699 | * this for the rest of code). Even if there are enough registers, the | 760 | sxtw x0, w0 |
700 | * allocation scheme may be a bit different depending on whether source | 761 | sxtw x1, w1 |
701 | * or mask is not used. | 762 | sxtw x3, w3 |
702 | */ | 763 | sxtw x5, w5 |
703 | .if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) | 764 | sxtw x7, w7 |
704 | ORIG_W .req r10 /* saved original width */ | ||
705 | DUMMY .req r12 /* temporary register */ | ||
706 | .set regs_shortage, 0 | ||
707 | .elseif mask_bpp == 0 | ||
708 | ORIG_W .req r7 /* saved original width */ | ||
709 | DUMMY .req r8 /* temporary register */ | ||
710 | .set regs_shortage, 0 | ||
711 | .elseif src_bpp == 0 | ||
712 | ORIG_W .req r4 /* saved original width */ | ||
713 | DUMMY .req r5 /* temporary register */ | ||
714 | .set regs_shortage, 0 | ||
715 | .else | ||
716 | ORIG_W .req r1 /* saved original width */ | ||
717 | DUMMY .req r1 /* temporary register */ | ||
718 | .set regs_shortage, 1 | ||
719 | .endif | ||
720 | 765 | ||
721 | .set mask_bpp_shift, -1 | 766 | .set mask_bpp_shift, -1 |
722 | .if src_bpp == 32 | 767 | .if src_bpp == 32 |
723 | .set src_bpp_shift, 2 | 768 | .set src_bpp_shift, 2 |
724 | .elseif src_bpp == 24 | 769 | .elseif src_bpp == 24 |
@@ -768,23 +813,11 @@ local skip1 | |||
768 | 813 | ||
769 | .if prefetch_distance < 0 || prefetch_distance > 15 | 814 | .if prefetch_distance < 0 || prefetch_distance > 15 |
770 | .error "invalid prefetch distance (prefetch_distance)" | 815 | .error "invalid prefetch distance (prefetch_distance)" |
771 | .endif | 816 | .endif |
772 | 817 | ||
773 | .if src_bpp > 0 | ||
774 | ldr SRC, [sp, #40] | ||
775 | .endif | ||
776 | .if mask_bpp > 0 | ||
777 | ldr MASK, [sp, #48] | ||
778 | .endif | ||
779 | PF mov PF_X, #0 | 818 | PF mov PF_X, #0 |
780 | .if src_bpp > 0 | ||
781 | ldr SRC_STRIDE, [sp, #44] | ||
782 | .endif | ||
783 | .if mask_bpp > 0 | ||
784 | ldr MASK_STRIDE, [sp, #52] | ||
785 | .endif | ||
786 | mov DST_R, DST_W | 819 | mov DST_R, DST_W |
787 | 820 | ||
788 | .if src_bpp == 24 | 821 | .if src_bpp == 24 |
789 | sub SRC_STRIDE, SRC_STRIDE, W | 822 | sub SRC_STRIDE, SRC_STRIDE, W |
790 | sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 | 823 | sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 |
@@ -803,26 +836,20 @@ local skip1 | |||
803 | */ | 836 | */ |
804 | PF mov PF_SRC, SRC | 837 | PF mov PF_SRC, SRC |
805 | PF mov PF_DST, DST_R | 838 | PF mov PF_DST, DST_R |
806 | PF mov PF_MASK, MASK | 839 | PF mov PF_MASK, MASK |
807 | /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ | 840 | /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ |
808 | PF mov PF_CTL, H, lsl #4 | 841 | PF lsl DUMMY, H, #4 |
809 | PF add PF_CTL, #(prefetch_distance - 0x10) | 842 | PF mov PF_CTL, DUMMY |
843 | PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10) | ||
810 | 844 | ||
811 | init | 845 | init |
812 | .if regs_shortage | ||
813 | push {r0, r1} | ||
814 | .endif | ||
815 | subs H, H, #1 | 846 | subs H, H, #1 |
816 | .if regs_shortage | ||
817 | str H, [sp, #4] /* save updated height to stack */ | ||
818 | .else | ||
819 | mov ORIG_W, W | 847 | mov ORIG_W, W |
820 | .endif | ||
821 | blt 9f | 848 | blt 9f |
822 | cmp W, #(pixblock_size * 2) | 849 | cmp W, #(pixblock_size * 2) |
823 | blt 8f | 850 | blt 800f |
824 | /* | 851 | /* |
825 | * This is the start of the pipelined loop, which is optimized for | 852 |
826 | * long scanlines | 853 | * long scanlines |
827 | */ | 854 | */ |
828 | 0: | 855 | 0: |
@@ -839,17 +866,19 @@ local skip1 | |||
839 | PF add PF_X, PF_X, #pixblock_size | 866 | PF add PF_X, PF_X, #pixblock_size |
840 | process_pixblock_head | 867 | process_pixblock_head |
841 | cache_preload 0, pixblock_size | 868 | cache_preload 0, pixblock_size |
842 | cache_preload_simple | 869 | cache_preload_simple |
843 | subs W, W, #(pixblock_size * 2) | 870 | subs W, W, #(pixblock_size * 2) |
844 | blt 2f | 871 | blt 200f |
845 | 1: | 872 | |
873 | 100: | ||
846 | process_pixblock_tail_head | 874 | process_pixblock_tail_head |
847 | cache_preload_simple | 875 | cache_preload_simple |
848 | subs W, W, #pixblock_size | 876 | subs W, W, #pixblock_size |
849 | bge 1b | 877 | bge 100b |
850 | 2: | 878 | |
879 | 200: | ||
851 | process_pixblock_tail | 880 | process_pixblock_tail |
852 | pixst_a pixblock_size, dst_w_bpp, \ | 881 | pixst_a pixblock_size, dst_w_bpp, \ |
853 | (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W | 882 | (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W |
854 | 883 | ||
855 | /* Process the remaining trailing pixels in the scanline */ | 884 | /* Process the remaining trailing pixels in the scanline */ |
@@ -857,46 +886,75 @@ local skip1 | |||
857 | process_pixblock_head, \ | 886 | process_pixblock_head, \ |
858 | process_pixblock_tail, \ | 887 | process_pixblock_tail, \ |
859 | process_pixblock_tail_head | 888 | process_pixblock_tail_head |
860 | advance_to_next_scanline 0b | 889 | advance_to_next_scanline 0b |
861 | 890 | ||
862 | .if regs_shortage | ||
863 | pop {r0, r1} | ||
864 | .endif | ||
865 | cleanup | 891 | cleanup |
866 | pop {r4-r12, pc} /* exit */ | 892 | 1000: |
893 | /* pop all registers */ | ||
894 | sub x29, x29, 64 | ||
895 | ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 | ||
896 | ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 | ||
897 | ldp x8, x9, [x29, -80] | ||
898 | ldp x10, x11, [x29, -96] | ||
899 | ldp x12, x13, [x29, -112] | ||
900 | ldp x14, x15, [x29, -128] | ||
901 | ldp x16, x17, [x29, -144] | ||
902 | ldp x18, x19, [x29, -160] | ||
903 | ldp x20, x21, [x29, -176] | ||
904 | ldp x22, x23, [x29, -192] | ||
905 | ldp x24, x25, [x29, -208] | ||
906 | ldp x26, x27, [x29, -224] | ||
907 | ldr x28, [x29, -232] | ||
908 | mov sp, x29 | ||
909 | ldp x29, x30, [sp], 16 | ||
910 | ret /* exit */ | ||
867 | /* | 911 | /* |
868 | * This is the start of the loop, designed to process images with small width | 912 | * This is the start of the loop, designed to process images with small width |
869 | * (less than pixblock_size * 2 pixels). In this case neither pipelining | 913 | * (less than pixblock_size * 2 pixels). In this case neither pipelining |
870 | * nor prefetch is used. | 914 |
871 | */ | 915 | */ |
872 | 8: | 916 | 800: |
873 | /* Process exactly pixblock_size pixels if needed */ | 917 | /* Process exactly pixblock_size pixels if needed */ |
874 | tst W, #pixblock_size | 918 | tst W, #pixblock_size |
875 | beq 1f | 919 | beq 100f |
876 | pixld pixblock_size, dst_r_bpp, \ | 920 | pixld pixblock_size, dst_r_bpp, \ |
877 | (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R | 921 | (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R |
878 | fetch_src_pixblock | 922 | fetch_src_pixblock |
879 | pixld pixblock_size, mask_bpp, \ | 923 | pixld pixblock_size, mask_bpp, \ |
880 | (mask_basereg - pixblock_size * mask_bpp / 64), MASK | 924 | (mask_basereg - pixblock_size * mask_bpp / 64), MASK |
881 | process_pixblock_head | 925 | process_pixblock_head |
882 | process_pixblock_tail | 926 | process_pixblock_tail |
883 | pixst pixblock_size, dst_w_bpp, \ | 927 | pixst pixblock_size, dst_w_bpp, \ |
884 | (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W | 928 | (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W |
885 | 1: | 929 | 100: |
886 | /* Process the remaining trailing pixels in the scanline */ | 930 | /* Process the remaining trailing pixels in the scanline */ |
887 | process_trailing_pixels 0, 0, \ | 931 | process_trailing_pixels 0, 0, \ |
888 | process_pixblock_head, \ | 932 | process_pixblock_head, \ |
889 | process_pixblock_tail, \ | 933 | process_pixblock_tail, \ |
890 | process_pixblock_tail_head | 934 | process_pixblock_tail_head |
891 | advance_to_next_scanline 8b | 935 | advance_to_next_scanline 800b |
892 | 9: | 936 | 9: |
893 | .if regs_shortage | ||
894 | pop {r0, r1} | ||
895 | .endif | ||
896 | cleanup | 937 | cleanup |
897 | pop {r4-r12, pc} /* exit */ | 938 | /* pop all registers */ |
939 | sub x29, x29, 64 | ||
940 | ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 | ||
941 | ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 | ||
942 | ldp x8, x9, [x29, -80] | ||
943 | ldp x10, x11, [x29, -96] | ||
944 | ldp x12, x13, [x29, -112] | ||
945 | ldp x14, x15, [x29, -128] | ||
946 | ldp x16, x17, [x29, -144] | ||
947 | ldp x18, x19, [x29, -160] | ||
948 | ldp x20, x21, [x29, -176] | ||
949 | ldp x22, x23, [x29, -192] | ||
950 | ldp x24, x25, [x29, -208] | ||
951 | ldp x26, x27, [x29, -224] | ||
952 | ldr x28, [x29, -232] | ||
953 | mov sp, x29 | ||
954 | ldp x29, x30, [sp], 16 | ||
955 | ret /* exit */ | ||
898 | 956 | ||
899 | .purgem fetch_src_pixblock | 957 | .purgem fetch_src_pixblock |
900 | .purgem pixld_src | 958 | .purgem pixld_src |
901 | 959 | ||
902 | .unreq SRC | 960 | .unreq SRC |
@@ -938,12 +996,12 @@ local skip1 | |||
938 | dst_r_basereg_ = 4, \ | 996 | dst_r_basereg_ = 4, \ |
939 | src_basereg_ = 0, \ | 997 | src_basereg_ = 0, \ |
940 | mask_basereg_ = 24 | 998 | mask_basereg_ = 24 |
941 | 999 | ||
942 | pixman_asm_function fname | 1000 | pixman_asm_function fname |
943 | |||
944 | .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE | 1001 | .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE |
1002 | |||
945 | /* | 1003 | /* |
946 | * Make some macro arguments globally visible and accessible | 1004 | * Make some macro arguments globally visible and accessible |
947 | * from other macros | 1005 | * from other macros |
948 | */ | 1006 | */ |
949 | .set src_bpp, src_bpp_ | 1007 | .set src_bpp, src_bpp_ |
@@ -952,49 +1010,67 @@ local skip1 | |||
952 | .set pixblock_size, pixblock_size_ | 1010 | .set pixblock_size, pixblock_size_ |
953 | .set dst_w_basereg, dst_w_basereg_ | 1011 | .set dst_w_basereg, dst_w_basereg_ |
954 | .set dst_r_basereg, dst_r_basereg_ | 1012 | .set dst_r_basereg, dst_r_basereg_ |
955 | .set src_basereg, src_basereg_ | 1013 | .set src_basereg, src_basereg_ |
956 | .set mask_basereg, mask_basereg_ | 1014 | .set mask_basereg, mask_basereg_ |
957 | 1015 | ||
958 | .if use_nearest_scaling != 0 | 1016 | .if use_nearest_scaling != 0 |
959 | /* | 1017 | /* |
960 | * Assign symbolic names to registers for nearest scaling | 1018 | * Assign symbolic names to registers for nearest scaling |
961 | */ | 1019 | */ |
962 | W .req r0 | 1020 | W .req x0 |
963 | DST_W .req r1 | 1021 | DST_W .req x1 |
964 | SRC .req r2 | 1022 | SRC .req x2 |
965 | VX .req r3 | 1023 | VX .req x3 |
966 | UNIT_X .req ip | 1024 | UNIT_X .req x4 |
967 | MASK .req lr | 1025 | SRC_WIDTH_FIXED .req x5 |
968 | TMP1 .req r4 | 1026 | MASK .req x6 |
969 | TMP2 .req r5 | 1027 | TMP1 .req x8 |
970 | DST_R .req r6 | 1028 | TMP2 .req x9 |
971 | SRC_WIDTH_FIXED .req r7 | 1029 | DST_R .req x10 |
1030 | DUMMY .req x30 | ||
972 | 1031 | ||
973 | .macro pixld_src x:vararg | 1032 | .macro pixld_src x:vararg |
974 | pixld_s x | 1033 | pixld_s x |
975 | .endm | 1034 | .endm |
976 | 1035 | ||
977 | ldr UNIT_X, [sp] | 1036 | sxtw x0, w0 |
978 | push {r4-r8, lr} | 1037 | sxtw x3, w3 |
979 | ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] | 1038 | sxtw x4, w4 |
980 | .if mask_bpp != 0 | 1039 | sxtw x5, w5 |
981 | ldr MASK, [sp, #(24 + 8)] | 1040 | |
982 | .endif | 1041 | stp x29, x30, [sp, -16]! |
1042 | mov x29, sp | ||
1043 | sub sp, sp, 88 | ||
1044 | sub x29, x29, 64 | ||
1045 | st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 | ||
1046 | st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 | ||
1047 | stp x8, x9, [x29, -80] | ||
1048 | str x10, [x29, -88] | ||
983 | .else | 1049 | .else |
984 | /* | 1050 | /* |
985 | * Assign symbolic names to registers | 1051 | * Assign symbolic names to registers |
986 | */ | 1052 | */ |
987 | W .req r0 /* width (is updated during processing) */ | 1053 | W .req x0 /* width (is updated during processing) */ |
988 | DST_W .req r1 /* destination buffer pointer for writes */ | 1054 | DST_W .req x1 /* destination buffer pointer for writes */ |
989 | SRC .req r2 /* source buffer pointer */ | 1055 | SRC .req x2 /* source buffer pointer */ |
990 | DST_R .req ip /* destination buffer pointer for reads */ | 1056 | MASK .req x3 /* mask pointer */ |
991 | MASK .req r3 /* mask pointer */ | 1057 | DST_R .req x4 /* destination buffer pointer for reads */ |
1058 | DUMMY .req x30 | ||
992 | 1059 | ||
993 | .macro pixld_src x:vararg | 1060 | .macro pixld_src x:vararg |
994 | pixld x | 1061 | pixld x |
995 | .endm | 1062 | .endm |
1063 | |||
1064 | sxtw x0, w0 | ||
1065 | |||
1066 | stp x29, x30, [sp, -16]! | ||
1067 | mov x29, sp | ||
1068 | sub sp, sp, 64 | ||
1069 | sub x29, x29, 64 | ||
1070 | st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 | ||
1071 | st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 | ||
996 | .endif | 1072 | .endif |
997 | 1073 | ||
998 | .if (((flags) & FLAG_DST_READWRITE) != 0) | 1074 | .if (((flags) & FLAG_DST_READWRITE) != 0) |
999 | .set dst_r_bpp, dst_w_bpp | 1075 | .set dst_r_bpp, dst_w_bpp |
1000 | .else | 1076 | .else |
@@ -1013,61 +1089,80 @@ local skip1 | |||
1013 | 1089 | ||
1014 | init | 1090 | init |
1015 | mov DST_R, DST_W | 1091 | mov DST_R, DST_W |
1016 | 1092 | ||
1017 | cmp W, #pixblock_size | 1093 | cmp W, #pixblock_size |
1018 | blt 8f | 1094 | blt 800f |
1019 | 1095 | ||
1020 | ensure_destination_ptr_alignment process_pixblock_head, \ | 1096 | ensure_destination_ptr_alignment process_pixblock_head, \ |
1021 | process_pixblock_tail, \ | 1097 | process_pixblock_tail, \ |
1022 | process_pixblock_tail_head | 1098 | process_pixblock_tail_head |
1023 | 1099 | ||
1024 | subs W, W, #pixblock_size | 1100 | subs W, W, #pixblock_size |
1025 | blt 7f | 1101 | blt 700f |
1026 | 1102 | ||
1027 | /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ | 1103 | /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ |
1028 | pixld_a pixblock_size, dst_r_bpp, \ | 1104 | pixld_a pixblock_size, dst_r_bpp, \ |
1029 | (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R | 1105 | (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R |
1030 | fetch_src_pixblock | 1106 | fetch_src_pixblock |
1031 | pixld pixblock_size, mask_bpp, \ | 1107 | pixld pixblock_size, mask_bpp, \ |
1032 | (mask_basereg - pixblock_size * mask_bpp / 64), MASK | 1108 | (mask_basereg - pixblock_size * mask_bpp / 64), MASK |
1033 | process_pixblock_head | 1109 | process_pixblock_head |
1034 | subs W, W, #pixblock_size | 1110 | subs W, W, #pixblock_size |
1035 | blt 2f | 1111 | blt 200f |
1036 | 1: | 1112 | 100: |
1037 | process_pixblock_tail_head | 1113 | process_pixblock_tail_head |
1038 | subs W, W, #pixblock_size | 1114 | subs W, W, #pixblock_size |
1039 | bge 1b | 1115 | bge 100b |
1040 | 2: | 1116 | 200: |
1041 | process_pixblock_tail | 1117 | process_pixblock_tail |
1042 | pixst_a pixblock_size, dst_w_bpp, \ | 1118 | pixst_a pixblock_size, dst_w_bpp, \ |
1043 | (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W | 1119 | (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W |
1044 | 7: | 1120 | 700: |
1045 | /* Process the remaining trailing pixels in the scanline (dst aligned) */ | 1121 | /* Process the remaining trailing pixels in the scanline (dst aligned) */ |
1046 | process_trailing_pixels 0, 1, \ | 1122 | process_trailing_pixels 0, 1, \ |
1047 | process_pixblock_head, \ | 1123 | process_pixblock_head, \ |
1048 | process_pixblock_tail, \ | 1124 | process_pixblock_tail, \ |
1049 | process_pixblock_tail_head | 1125 | process_pixblock_tail_head |
1050 | 1126 | ||
1051 | cleanup | 1127 | cleanup |
1052 | .if use_nearest_scaling != 0 | 1128 | .if use_nearest_scaling != 0 |
1053 | pop {r4-r8, pc} /* exit */ | 1129 | sub x29, x29, 64 |
1130 | ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 | ||
1131 | ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 | ||
1132 | ldp x8, x9, [x29, -80] | ||
1133 | ldr x10, [x29, -88] | ||
1134 | mov sp, x29 | ||
1135 | ldp x29, x30, [sp], 16 | ||
1136 | ret /* exit */ | ||
1054 | .else | 1137 | .else |
1055 | bx lr /* exit */ | 1138 | sub x29, x29, 64 |
1056 | .endif | 1139 | ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 |
1057 | 8: | 1140 | ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 |
1141 | mov sp, x29 | ||
1142 | ldp x29, x30, [sp], 16 | ||
1143 | ret /* exit */ | ||
1144 | .endif | ||
1145 | 800: | ||
1058 | /* Process the remaining trailing pixels in the scanline (dst unaligned) */ | 1146 | /* Process the remaining trailing pixels in the scanline (dst unaligned) */ |
1059 | process_trailing_pixels 0, 0, \ | 1147 | process_trailing_pixels 0, 0, \ |
1060 | process_pixblock_head, \ | 1148 | process_pixblock_head, \ |
1061 | process_pixblock_tail, \ | 1149 | process_pixblock_tail, \ |
1062 | process_pixblock_tail_head | 1150 | process_pixblock_tail_head |
1063 | 1151 | ||
1064 | cleanup | 1152 | cleanup |
1065 | |||
1066 | .if use_nearest_scaling != 0 | 1153 | .if use_nearest_scaling != 0 |
1067 | pop {r4-r8, pc} /* exit */ | 1154 | sub x29, x29, 64 |
1155 | ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 | ||
1156 | ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 | ||
1157 | ldp x8, x9, [x29, -80] | ||
1158 | ldr x10, [x29, -88] | ||
1159 | mov sp, x29 | ||
1160 | ldp x29, x30, [sp], 16 | ||
1161 | ret /* exit */ | ||
1068 | 1162 | ||
1163 | .unreq DUMMY | ||
1069 | .unreq DST_R | 1164 | .unreq DST_R |
1070 | .unreq SRC | 1165 | .unreq SRC |
1071 | .unreq W | 1166 | .unreq W |
1072 | .unreq VX | 1167 | .unreq VX |
1073 | .unreq UNIT_X | 1168 | .unreq UNIT_X |
@@ -1076,12 +1171,18 @@ local skip1 | |||
1076 | .unreq DST_W | 1171 | .unreq DST_W |
1077 | .unreq MASK | 1172 | .unreq MASK |
1078 | .unreq SRC_WIDTH_FIXED | 1173 | .unreq SRC_WIDTH_FIXED |
1079 | 1174 | ||
1080 | .else | 1175 | .else |
1081 | bx lr /* exit */ | 1176 | sub x29, x29, 64 |
1177 | ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 | ||
1178 | ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 | ||
1179 | mov sp, x29 | ||
1180 | ldp x29, x30, [sp], 16 | ||
1181 | ret /* exit */ | ||
1082 | 1182 | ||
1183 | .unreq DUMMY | ||
1083 | .unreq SRC | 1184 | .unreq SRC |
1084 | .unreq MASK | 1185 | .unreq MASK |
1085 | .unreq DST_R | 1186 | .unreq DST_R |
1086 | .unreq DST_W | 1187 | .unreq DST_W |
1087 | .unreq W | 1188 | .unreq W |
@@ -1108,21 +1209,19 @@ local skip1 | |||
1108 | 1209 | ||
1109 | .macro default_cleanup | 1210 | .macro default_cleanup |
1110 | .endm | 1211 | .endm |
1111 | 1212 | ||
1112 | /* | 1213 | /* |
1113 | * Prologue/epilogue variant which additionally saves/restores d8-d15 | 1214 | * Prologue/epilogue variant which additionally saves/restores v8-v15 |
1114 | * registers (they need to be saved/restored by callee according to ABI). | 1215 | * registers (they need to be saved/restored by callee according to ABI). |
1115 | * This is required if the code needs to use all the NEON registers. | 1216 | * This is required if the code needs to use all the NEON registers. |
1116 | */ | 1217 | */ |
1117 | 1218 | ||
1118 | .macro default_init_need_all_regs | 1219 | .macro default_init_need_all_regs |
1119 | vpush {d8-d15} | ||
1120 | .endm | 1220 | .endm |
1121 | 1221 | ||
1122 | .macro default_cleanup_need_all_regs | 1222 | .macro default_cleanup_need_all_regs |
1123 | vpop {d8-d15} | ||
1124 | .endm | 1223 | .endm |
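The ABI rule the comment above refers to is that AArch64 makes d8-d15 (the low halves of v8-v15) callee-saved; the a64 port now open-codes that save/restore in the generated function prologue/epilogue (the st1/ld1 pairs visible earlier in this hunk), which is why both macros become empty here. A minimal C sketch of the same rule, assuming GCC/Clang extended asm (the function name is hypothetical, not pixman code): merely declaring v8-v15 as clobbers obliges the compiler to spill d8-d15 around the statement.

    /* Hypothetical demo, not pixman code: the empty asm claims to
     * clobber v8-v15, which are callee-saved, so the compiler must
     * save and restore d8-d15 around it. */
    void clobber_all_neon_regs (void)
    {
        __asm__ volatile ("" ::: "v8", "v9", "v10", "v11",
                                 "v12", "v13", "v14", "v15");
    }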
1125 | 1224 | ||
1126 | /******************************************************************************/ | 1225 | /******************************************************************************/ |
1127 | 1226 | ||
1128 | /* | 1227 | /* |
@@ -1132,53 +1231,58 @@ local skip1 | |||
1132 | * | 1231 | * |
1133 | * Warning: the conversion is destructive and the original | 1232 | * Warning: the conversion is destructive and the original |
1134 | * value (in) is lost. | 1233 | * value (in) is lost. |
1135 | */ | 1234 | */ |
1136 | .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b | 1235 | .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b |
1137 | vshrn.u16 out_r, in, #8 | 1236 | shrn &out_r&.8b, &in&.8h, #8 |
1138 | vshrn.u16 out_g, in, #3 | 1237 | shrn &out_g&.8b, &in&.8h, #3 |
1139 | vsli.u16 in, in, #5 | 1238 | sli &in&.8h, &in&.8h, #5 |
1140 | vmov.u8 out_a, #255 | 1239 | movi &out_a&.8b, #255 |
1141 | vsri.u8 out_r, out_r, #5 | 1240 | sri &out_r&.8b, &out_r&.8b, #5 |
1142 | vsri.u8 out_g, out_g, #6 | 1241 | sri &out_g&.8b, &out_g&.8b, #6 |
1143 | vshrn.u16 out_b, in, #2 | 1242 | shrn &out_b&.8b, &in&.8h, #2 |
1144 | .endm | 1243 | .endm |
1145 | 1244 | ||
1146 | .macro convert_0565_to_x888 in, out_r, out_g, out_b | 1245 | .macro convert_0565_to_x888 in, out_r, out_g, out_b |
1147 | vshrn.u16 out_r, in, #8 | 1246 | shrn &out_r&.8b, &in&.8h, #8 |
1148 | vshrn.u16 out_g, in, #3 | 1247 | shrn &out_g&.8b, &in&.8h, #3 |
1149 | vsli.u16 in, in, #5 | 1248 | sli &in&.8h, &in&.8h, #5 |
1150 | vsri.u8 out_r, out_r, #5 | 1249 | sri &out_r&.8b, &out_r&.8b, #5 |
1151 | vsri.u8 out_g, out_g, #6 | 1250 | sri &out_g&.8b, &out_g&.8b, #6 |
1152 | vshrn.u16 out_b, in, #2 | 1251 | shrn &out_b&.8b, &in&.8h, #2 |
1153 | .endm | 1252 | .endm |
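As a cross-check of the shrn/sli/sri choreography above, here is a scalar C model of what both macros compute per pixel (the helper name is hypothetical, not pixman API): each 5- or 6-bit channel is widened to 8 bits by replicating its top bits into the newly opened low bits, so 0x1f maps to 0xff and 0x00 stays 0x00. The x888 variant simply omits the movi that fills the alpha lane.

    #include <stdint.h>

    /* Hypothetical scalar model of convert_0565_to_8888. */
    static uint32_t expand_0565 (uint16_t p)
    {
        uint32_t r5 = (p >> 11) & 0x1f;
        uint32_t g6 = (p >>  5) & 0x3f;
        uint32_t b5 =  p        & 0x1f;

        uint32_t r8 = (r5 << 3) | (r5 >> 2);  /* sri out_r, out_r, #5 */
        uint32_t g8 = (g6 << 2) | (g6 >> 4);  /* sri out_g, out_g, #6 */
        uint32_t b8 = (b5 << 3) | (b5 >> 2);  /* sli #5 then shrn #2  */

        return 0xff000000u | (r8 << 16) | (g8 << 8) | b8;  /* movi, #255 */
    }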
1154 | 1253 | ||
1155 | /* | 1254 | /* |
1156 | * Conversion from planar a8r8g8b8 format (with a, r, g, b color components | 1255 | * Conversion from planar a8r8g8b8 format (with a, r, g, b color components |
1157 | * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5 | 1256 | * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5 |
1158 | * pixels packed in 128-bit register (out). Requires two temporary 128-bit | 1257 | * pixels packed in 128-bit register (out). Requires two temporary 128-bit |
1159 | * registers (tmp1, tmp2) | 1258 | * registers (tmp1, tmp2) |
1160 | */ | 1259 | */ |
1161 | .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 | 1260 | .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 |
1162 | vshll.u8 tmp1, in_g, #8 | 1261 | ushll &tmp1&.8h, &in_g&.8b, #7 |
1163 | vshll.u8 out, in_r, #8 | 1262 | shl &tmp1&.8h, &tmp1&.8h, #1 |
1164 | vshll.u8 tmp2, in_b, #8 | 1263 | ushll &out&.8h, &in_r&.8b, #7 |
1165 | vsri.u16 out, tmp1, #5 | 1264 | shl &out&.8h, &out&.8h, #1 |
1166 | vsri.u16 out, tmp2, #11 | 1265 | ushll &tmp2&.8h, &in_b&.8b, #7 |
1266 | shl &tmp2&.8h, &tmp2&.8h, #1 | ||
1267 | sri &out&.8h, &tmp1&.8h, #5 | ||
1268 | sri &out&.8h, &tmp2&.8h, #11 | ||
1167 | .endm | 1269 | .endm |
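Two things are worth noting here. The A32 code widened with vshll.u8 ..., #8, but the immediate of the A64 ushll must be smaller than the source element width, so the port splits the widening shift by 8 into ushll #7 followed by shl #1. And the two sri instructions stack the channels: the first keeps R's top 5 bits and inserts G's top 6 below them, the second appends B's top 5. A scalar C model of the result (hypothetical helper name, not pixman API):

    #include <stdint.h>

    /* Hypothetical scalar model of convert_8888_to_0565. */
    static uint16_t pack_0565 (uint32_t p)
    {
        uint32_t r8 = (p >> 16) & 0xff;
        uint32_t g8 = (p >>  8) & 0xff;
        uint32_t b8 =  p        & 0xff;

        return (uint16_t) (((r8 >> 3) << 11) |  /* out keeps its top 5 bits     */
                           ((g8 >> 2) <<  5) |  /* sri out, tmp1, #5 inserts G  */
                           ( b8 >> 3));         /* sri out, tmp2, #11 inserts B */
    }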
1168 | 1270 | ||
1169 | /* | 1271 | /* |
1170 | * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels | 1272 | * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels |
1171 | * returned in (out0, out1) registers pair. Requires one temporary | 1273 | * returned in (out0, out1) registers pair. Requires one temporary |
1172 | * 64-bit register (tmp). 'out1' and 'in' may overlap; the original | 1274 | * 64-bit register (tmp). 'out1' and 'in' may overlap; the original |
1173 | * value from 'in' is lost. | 1275 | * value from 'in' is lost. |
1174 | */ | 1276 | */ |
1175 | .macro convert_four_0565_to_x888_packed in, out0, out1, tmp | 1277 | .macro convert_four_0565_to_x888_packed in, out0, out1, tmp |
1176 | vshl.u16 out0, in, #5 /* G top 6 bits */ | 1278 | shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */ |
1177 | vshl.u16 tmp, in, #11 /* B top 5 bits */ | 1279 | shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */ |
1178 | vsri.u16 in, in, #5 /* R is ready in top bits */ | 1280 | sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */ |
1179 | vsri.u16 out0, out0, #6 /* G is ready in top bits */ | 1281 | sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */ |
1180 | vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ | 1282 | sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */ |
1181 | vshr.u16 out1, in, #8 /* R is in place */ | 1283 | ushr &out1&.4h, &in&.4h, #8 /* R is in place */ |
1182 | vsri.u16 out0, tmp, #8 /* G & B is in place */ | 1284 | sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */ |
1183 | vzip.u16 out0, out1 /* everything is in place */ | 1285 | zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */ |
1286 | zip2 &out1&.4h, &out0&.4h, &out1&.4h | ||
1287 | mov &out0&.d[0], &tmp&.d[0] | ||
1184 | .endm | 1288 | .endm |
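The tail of this macro is where the port diverges most from A32: vzip.u16 interleaved its two operands in place, while the A64 zip1/zip2 each write a single result vector, hence the extra temporary and the final mov back into out0. A toy C model of just that interleave step (hypothetical helper, four 16-bit lanes per vector):

    #include <stdint.h>

    /* Hypothetical model of the zip1/zip2/mov tail above: interleave
     * the lanes of a and b in place, as the A32 vzip did. */
    static void zip_u16x4 (uint16_t a[4], uint16_t b[4])
    {
        uint16_t lo[4], hi[4];

        for (int i = 0; i < 2; i++)
        {
            lo[2 * i]     = a[i];      /* zip1 tmp.4h,  out0.4h, out1.4h */
            lo[2 * i + 1] = b[i];
            hi[2 * i]     = a[i + 2];  /* zip2 out1.4h, out0.4h, out1.4h */
            hi[2 * i + 1] = b[i + 2];
        }
        for (int i = 0; i < 4; i++)
        {
            a[i] = lo[i];              /* mov  out0.d[0], tmp.d[0]       */
            b[i] = hi[i];
        }
    }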
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h index 73a5414..81e0f23 100644 --- a/pixman/pixman-private.h +++ b/pixman/pixman-private.h | |||
@@ -605,10 +605,15 @@ _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback); | |||
605 | #ifdef USE_ARM_NEON | 605 | #ifdef USE_ARM_NEON |
606 | pixman_implementation_t * | 606 | pixman_implementation_t * |
607 | _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); | 607 | _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); |
608 | #endif | 608 | #endif |
609 | 609 | ||
610 | #ifdef USE_ARM_A64_NEON | ||
611 | pixman_implementation_t * | ||
612 | _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback); | ||
613 | #endif | ||
614 | |||
610 | #ifdef USE_MIPS_DSPR2 | 615 | #ifdef USE_MIPS_DSPR2 |
611 | pixman_implementation_t * | 616 | pixman_implementation_t * |
612 | _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback); | 617 | _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback); |
613 | #endif | 618 | #endif |
614 | 619 | ||