summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Ben Avison <bavison@riscosopen.org> 2015-04-14 14:26:00 +0100
committer: Ben Avison <bavison@riscosopen.org> 2015-10-15 12:55:30 +0100
commit: c4f76222e25de2b20dbfaac72883ebbda8ca4484 (patch)
tree: 6114938c75b461b679d97d3dbfff8794cc9b6343
parent: 038b97a7e3669b3adc6c0e3d91141e9749e2b4e7 (diff)
armv7: Use prefetch for small-width images too
After discovering that the ARMv6 optimised fast paths often out-performed
the ARMv7 ones on a Cortex-A7, particularly on the RT benchmark, I found
that the problem was due to the fact that the ARMv7 macros didn't attempt
any sort of prefetch for small images (fewer than pixblock_size * 2 pixels
across).

Since a pixblock is chosen to be no larger than a cacheline, and is in many
cases smaller, it seemed a reasonable compromise to avoid adding a lot of
complexity by simply doing one prefetch for the start of a pixel row when
starting to process the preceding one, and that is what this patch does.

I compared the effect of using LDRB (which is what is currently used at the
end of each long pixel row) against PLD for each of the source and
destination buffers for a selection of common operations: src_8888_8888,
over_8888_8888 and add_8888_8888, and in each case PLD of both buffers was
the most beneficial. PLDW didn't make any measurable difference.

The overall effect of this patch on the three operations is as follows
(L1, L2 and M tests can be ignored because they're known not to involve the
use of short rows):

src_8888_8888

      Before          After
      Mean   StdDev   Mean   StdDev  Confidence  Change
  HT  60.8   0.1      61.1   0.1     100.0%      +0.6%
  VT  61.0   0.3      62.6   0.2     100.0%      +2.6%
  R   45.5   0.2      46.2   0.2     100.0%      +1.5%
  RT  19.8   0.0      21.4   0.0     100.0%      +7.8%

over_8888_8888

      Before          After
      Mean   StdDev   Mean   StdDev  Confidence  Change
  HT  40.2   0.1      40.7   0.4     100.0%      +1.0%
  VT  35.5   0.2      37.9   0.3     100.0%      +6.7%
  R   32.8   0.0      33.8   0.3     100.0%      +3.0%
  RT  12.9   0.0      15.6   0.2     100.0%      +21.4%

add_8888_8888

      Before          After
      Mean   StdDev   Mean   StdDev  Confidence  Change
  HT  51.0   0.6      51.9   0.5     100.0%      +1.7%
  VT  44.0   0.4      46.8   0.5     100.0%      +6.3%
  R   39.6   0.5      41.0   0.4     100.0%      +3.5%
  RT  15.2   0.2      18.0   0.2     100.0%      +18.5%
-rw-r--r-- pixman/pixman-arm-neon-asm.h 9
1 files changed, 9 insertions, 0 deletions
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index 03257cc..a116e47 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -881,6 +881,15 @@ local skip1
* nor prefetch are used.
*/
8:
+.if src_bpp_shift >= 0
+ PF pld, [SRC, SRC_STRIDE, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+ PF pld, [DST_R, DST_STRIDE, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+ PF pld, [MASK, MASK_STRIDE, lsl #mask_bpp_shift]
+.endif
/* Process exactly pixblock_size pixels if needed */
tst W, #pixblock_size
beq 1f