-rw-r--r--  .editorconfig | 11
-rw-r--r--  .gitignore | 47
-rw-r--r--  .gitlab-ci.yml | 10
-rw-r--r--  ChangeLog | 0
-rw-r--r--  INSTALL | 228
-rw-r--r--  Makefile.am | 137
-rw-r--r--  Makefile.win32 | 25
-rw-r--r--  Makefile.win32.common | 56
-rw-r--r--  README | 78
-rw-r--r--  RELEASING | 5
-rw-r--r--  a64-neon-test.S | 5
-rw-r--r--  arm-simd-test.S | 10
-rwxr-xr-x  autogen.sh | 14
-rw-r--r--  configure.ac | 1129
-rw-r--r--  demos/Makefile.am | 52
-rw-r--r--  demos/conical-test.c | 2
-rw-r--r--  demos/dither.c | 277
-rw-r--r--  demos/dither.ui | 147
-rw-r--r--  demos/gtk-utils.c | 16
-rw-r--r--  demos/linear-gradient.c | 2
-rw-r--r--  demos/meson.build | 66
-rw-r--r--  demos/radial-test.c | 2
-rw-r--r--  demos/scale.c | 132
-rw-r--r--  demos/scale.ui | 2
-rw-r--r--  demos/tri-test.c | 2
-rw-r--r--  meson.build | 619
-rw-r--r--  meson_options.txt | 128
-rw-r--r--  neon-test.S | 12
-rw-r--r--  pixman-1-uninstalled.pc.in | 5
-rw-r--r--  pixman-1.pc.in | 11
-rw-r--r--  pixman/Makefile.am | 141
-rw-r--r--  pixman/Makefile.sources | 42
-rw-r--r--  pixman/Makefile.win32 | 93
-rw-r--r--  pixman/dither/blue-noise-64x64.h | 77
-rw-r--r--  pixman/dither/make-blue-noise.c | 679
-rw-r--r--  pixman/loongson-mmintrin.h | 18
-rw-r--r--  pixman/make-srgb.pl | 2
-rw-r--r--  pixman/meson.build | 143
-rw-r--r--  pixman/pixman-access.c | 320
-rw-r--r--  pixman/pixman-arm-asm.h | 38
-rw-r--r--  pixman/pixman-arm-common.h | 11
-rw-r--r--  pixman/pixman-arm-neon-asm-bilinear.S | 362
-rw-r--r--  pixman/pixman-arm-neon-asm.S | 436
-rw-r--r--  pixman/pixman-arm-neon-asm.h | 607
-rw-r--r--  pixman/pixman-arm-neon.c | 49
-rw-r--r--  pixman/pixman-arm-simd-asm-scaled.S | 42
-rw-r--r--  pixman/pixman-arm-simd-asm.S | 499
-rw-r--r--  pixman/pixman-arm-simd-asm.h | 304
-rw-r--r--  pixman/pixman-arm-simd.c | 24
-rw-r--r--  pixman/pixman-arm.c | 33
-rw-r--r--  pixman/pixman-arma64-neon-asm-bilinear.S | 1276
-rw-r--r--  pixman/pixman-arma64-neon-asm.S | 3704
-rw-r--r--  pixman/pixman-arma64-neon-asm.h | 1310
-rw-r--r--  pixman/pixman-bits-image.c | 582
-rw-r--r--  pixman/pixman-combine-float.c | 2
-rw-r--r--  pixman/pixman-combine32.c | 6
-rw-r--r--  pixman/pixman-combine32.h | 2
-rw-r--r--  pixman/pixman-compiler.h | 8
-rw-r--r--  pixman/pixman-conical-gradient.c | 38
-rw-r--r--  pixman/pixman-edge.c | 2
-rw-r--r--  pixman/pixman-fast-path.c | 32
-rw-r--r--  pixman/pixman-filter.c | 265
-rw-r--r--  pixman/pixman-general.c | 27
-rw-r--r--  pixman/pixman-glyph.c | 2
-rw-r--r--  pixman/pixman-gradient-walker.c | 116
-rw-r--r--  pixman/pixman-image.c | 113
-rw-r--r--  pixman/pixman-implementation.c | 20
-rw-r--r--  pixman/pixman-inlines.h | 28
-rw-r--r--  pixman/pixman-linear-gradient.c | 49
-rw-r--r--  pixman/pixman-matrix.c | 4
-rw-r--r--  pixman/pixman-mips-dspr2-asm.S | 2
-rw-r--r--  pixman/pixman-mips-dspr2-asm.h | 3
-rw-r--r--  pixman/pixman-mips-dspr2.c | 12
-rw-r--r--  pixman/pixman-mips-dspr2.h | 8
-rw-r--r--  pixman/pixman-mips.c | 2
-rw-r--r--  pixman/pixman-mmx.c | 146
-rw-r--r--  pixman/pixman-noop.c | 2
-rw-r--r--  pixman/pixman-ppc.c | 20
-rw-r--r--  pixman/pixman-private.h | 75
-rw-r--r--  pixman/pixman-radial-gradient.c | 132
-rw-r--r--  pixman/pixman-region.c | 64
-rw-r--r--  pixman/pixman-region16.c | 2
-rw-r--r--  pixman/pixman-region32.c | 2
-rw-r--r--  pixman/pixman-solid-fill.c | 10
-rw-r--r--  pixman/pixman-sse2.c | 126
-rw-r--r--  pixman/pixman-ssse3.c | 2
-rw-r--r--  pixman/pixman-timer.c | 2
-rw-r--r--  pixman/pixman-trap.c | 4
-rw-r--r--  pixman/pixman-utils.c | 8
-rw-r--r--  pixman/pixman-version.h.in | 4
-rw-r--r--  pixman/pixman-vmx.c | 1214
-rw-r--r--  pixman/pixman-x86.c | 66
-rw-r--r--  pixman/pixman.c | 47
-rw-r--r--  pixman/pixman.h | 509
-rw-r--r--  pixman/rounding.txt | 1
-rw-r--r--  test/Makefile.am | 13
-rw-r--r--  test/Makefile.sources | 51
-rw-r--r--  test/Makefile.win32 | 54
-rw-r--r--  test/affine-bench.c | 448
-rw-r--r--  test/affine-test.c | 2
-rw-r--r--  test/alphamap.c | 73
-rw-r--r--  test/check-formats.c | 192
-rw-r--r--  test/composite.c | 1
-rw-r--r--  test/cover-test.c | 449
-rw-r--r--  test/fence-image-self-test.c | 239
-rw-r--r--  test/filter-reduction-test.c | 112
-rw-r--r--  test/lowlevel-blt-bench.c | 512
-rw-r--r--  test/matrix-test.c | 4
-rw-r--r--  test/meson.build | 90
-rw-r--r--  test/scaling-test.c | 66
-rw-r--r--  test/solid-test.c | 354
-rw-r--r--  test/stress-test.c | 78
-rw-r--r--  test/thread-test.c | 87
-rw-r--r--  test/tolerance-test.c | 23
-rw-r--r--  test/utils/meson.build | 31
-rw-r--r--  test/utils/utils-prng.c (renamed from test/utils-prng.c) | 15
-rw-r--r--  test/utils/utils-prng.h (renamed from test/utils-prng.h) | 2
-rw-r--r--  test/utils/utils.c (renamed from test/utils.c) | 809
-rw-r--r--  test/utils/utils.h (renamed from test/utils.h) | 49
119 files changed, 16297 insertions(+), 4656 deletions(-)
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..5153bb0
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,11 @@
+# To use this config in your editor, follow the instructions at:
+# http://editorconfig.org
+
+root = true
+
+[*]
+tab_width = 8
+
+[{meson.build,meson_options.txt}]
+indent_style = space
+indent_size = 2
diff --git a/.gitignore b/.gitignore
index 0f11496..046b161 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,61 +26,28 @@ stamp-h?
config.h
config.h.in
.*.swp
-demos/alpha-test
+demos/*-test
demos/checkerboard
demos/clip-in
-demos/clip-test
-demos/composite-test
-demos/conical-test
-demos/convolution-test
-demos/gradient-test
demos/linear-gradient
demos/quad2quad
-demos/radial-test
demos/scale
-demos/screen-test
-demos/srgb-test
-demos/srgb-trap-test
-demos/trap-test
-demos/tri-test
+demos/dither
pixman/pixman-srgb.c
pixman/pixman-version.h
-test/a1-trap-test
-test/affine-test
+test/*-test
+test/affine-bench
test/alpha-loop
test/alphamap
-test/alpha-test
-test/blitters-test
+test/check-formats
test/clip-in
-test/clip-test
-test/combiner-test
test/composite
-test/composite-test
-test/composite-traps-test
-test/convolution-test
-test/fetch-test
-test/glyph-test
-test/gradient-crash-test
-test/gradient-test
test/infinite-loop
test/lowlevel-blt-bench
-test/oob-test
-test/pdf-op-test
-test/prng-test
-test/radial-perf-test
-test/region-contains-test
-test/region-test
+test/radial-invalid
test/region-translate
-test/region-translate-test
-test/rotate-test
-test/scaling-crash-test
-test/scaling-helpers-test
-test/scaling-test
-test/screen-test
-test/stress-test
+test/scaling-bench
test/trap-crasher
-test/trap-test
-test/window-test
*.pdb
*.dll
*.lib
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..296b6cd
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,10 @@
+image: fedora:39
+
+meson-build:
+ script:
+ - dnf -y install dnf-plugins-core
+ - dnf -y groupinstall buildsys-build
+ - dnf -y builddep pixman
+ - dnf -y install meson
+ - meson setup build
+ - ninja -C build test
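
The CI job above doubles as the quickest way to smoke-test the meson port locally. A minimal sketch outside the Fedora container, assuming meson and ninja are already installed (the `build` directory name is only a convention):

    meson setup build
    ninja -C build test
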
diff --git a/ChangeLog b/ChangeLog
deleted file mode 100644
index e69de29..0000000
--- a/ChangeLog
+++ /dev/null
diff --git a/INSTALL b/INSTALL
index 5458714..5824704 100644
--- a/INSTALL
+++ b/INSTALL
@@ -10,225 +10,29 @@ unlimited permission to copy, distribute and modify it.
Basic Installation
==================
-Briefly, the shell commands `./configure; make; make install' should
-configure, build, and install this package. The following
-more-detailed instructions are generic; see the `README' file for
-instructions specific to this package.
+Briefly, the shell commands `meson setup build/; ninja -C build; ninja
+-C build install` should configure, build, and install this package. The
+following more-detailed instructions are generic; see the `README` file
+for instructions specific to this package.
- The `configure' shell script attempts to guess correct values for
-various system-dependent variables used during compilation. It uses
-those values to create a `Makefile' in each directory of the package.
-It may also create one or more `.h' files containing system-dependent
-definitions. Finally, it creates a shell script `config.status' that
-you can run in the future to recreate the current configuration, and a
-file `config.log' containing compiler output (useful mainly for
-debugging `configure').
-
- It can also use an optional file (typically called `config.cache'
-and enabled with `--cache-file=config.cache' or simply `-C') that saves
-the results of its tests to speed up reconfiguring. Caching is
-disabled by default to prevent problems with accidental use of stale
-cache files.
-
- If you need to do unusual things to compile the package, please try
-to figure out how `configure' could check whether to do them, and mail
-diffs or instructions to the address given in the `README' so they can
-be considered for the next release. If you are using the cache, and at
-some point `config.cache' contains results you don't want to keep, you
-may remove or edit it.
-
- The file `configure.ac' (or `configure.in') is used to create
-`configure' by a program called `autoconf'. You need `configure.ac' if
-you want to change it or regenerate `configure' using a newer version
-of `autoconf'.
+ Running `meson setup` attempts to guess correct values for various
+system-dependent variables used during compilation.
The simplest way to compile this package is:
- 1. `cd' to the directory containing the package's source code and type
- `./configure' to configure the package for your system.
+ 1. `cd` to the directory containing the package's source code and type
+ `meson setup build/` to configure the package for your system.
- Running `configure' might take a while. While running, it prints
- some messages telling which features it is checking for.
+ While running, it prints some messages telling which features it
+ is checking for.
- 2. Type `make' to compile the package.
+ 2. Type `ninja -C build` to compile the package.
- 3. Optionally, type `make check' to run any self-tests that come with
- the package.
+ 3. Optionally, type `ninja -C build test` to run any self-tests that
+ come with the package.
- 4. Type `make install' to install the programs and any data files and
- documentation.
+ 4. Type `ninja -C build install` to install the programs and any
+ data files and documentation.
5. You can remove the program binaries and object files from the
- source code directory by typing `make clean'. To also remove the
- files that `configure' created (so you can compile the package for
- a different kind of computer), type `make distclean'. There is
- also a `make maintainer-clean' target, but that is intended mainly
- for the package's developers. If you use it, you may have to get
- all sorts of other programs in order to regenerate files that came
- with the distribution.
-
-Compilers and Options
-=====================
-
-Some systems require unusual options for compilation or linking that the
-`configure' script does not know about. Run `./configure --help' for
-details on some of the pertinent environment variables.
-
- You can give `configure' initial values for configuration parameters
-by setting variables in the command line or in the environment. Here
-is an example:
-
- ./configure CC=c99 CFLAGS=-g LIBS=-lposix
-
- *Note Defining Variables::, for more details.
-
-Compiling For Multiple Architectures
-====================================
-
-You can compile the package for more than one kind of computer at the
-same time, by placing the object files for each architecture in their
-own directory. To do this, you can use GNU `make'. `cd' to the
-directory where you want the object files and executables to go and run
-the `configure' script. `configure' automatically checks for the
-source code in the directory that `configure' is in and in `..'.
-
- With a non-GNU `make', it is safer to compile the package for one
-architecture at a time in the source code directory. After you have
-installed the package for one architecture, use `make distclean' before
-reconfiguring for another architecture.
-
-Installation Names
-==================
-
-By default, `make install' installs the package's commands under
-`/usr/local/bin', include files under `/usr/local/include', etc. You
-can specify an installation prefix other than `/usr/local' by giving
-`configure' the option `--prefix=PREFIX'.
-
- You can specify separate installation prefixes for
-architecture-specific files and architecture-independent files. If you
-pass the option `--exec-prefix=PREFIX' to `configure', the package uses
-PREFIX as the prefix for installing programs and libraries.
-Documentation and other data files still use the regular prefix.
-
- In addition, if you use an unusual directory layout you can give
-options like `--bindir=DIR' to specify different values for particular
-kinds of files. Run `configure --help' for a list of the directories
-you can set and what kinds of files go in them.
-
- If the package supports it, you can cause programs to be installed
-with an extra prefix or suffix on their names by giving `configure' the
-option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
-
-Optional Features
-=================
-
-Some packages pay attention to `--enable-FEATURE' options to
-`configure', where FEATURE indicates an optional part of the package.
-They may also pay attention to `--with-PACKAGE' options, where PACKAGE
-is something like `gnu-as' or `x' (for the X Window System). The
-`README' should mention any `--enable-' and `--with-' options that the
-package recognizes.
-
- For packages that use the X Window System, `configure' can usually
-find the X include and library files automatically, but if it doesn't,
-you can use the `configure' options `--x-includes=DIR' and
-`--x-libraries=DIR' to specify their locations.
-
-Specifying the System Type
-==========================
-
-There may be some features `configure' cannot figure out automatically,
-but needs to determine by the type of machine the package will run on.
-Usually, assuming the package is built to be run on the _same_
-architectures, `configure' can figure that out, but if it prints a
-message saying it cannot guess the machine type, give it the
-`--build=TYPE' option. TYPE can either be a short name for the system
-type, such as `sun4', or a canonical name which has the form:
-
- CPU-COMPANY-SYSTEM
-
-where SYSTEM can have one of these forms:
-
- OS KERNEL-OS
-
- See the file `config.sub' for the possible values of each field. If
-`config.sub' isn't included in this package, then this package doesn't
-need to know the machine type.
-
- If you are _building_ compiler tools for cross-compiling, you should
-use the option `--target=TYPE' to select the type of system they will
-produce code for.
-
- If you want to _use_ a cross compiler, that generates code for a
-platform different from the build platform, you should specify the
-"host" platform (i.e., that on which the generated programs will
-eventually be run) with `--host=TYPE'.
-
-Sharing Defaults
-================
-
-If you want to set default values for `configure' scripts to share, you
-can create a site shell script called `config.site' that gives default
-values for variables like `CC', `cache_file', and `prefix'.
-`configure' looks for `PREFIX/share/config.site' if it exists, then
-`PREFIX/etc/config.site' if it exists. Or, you can set the
-`CONFIG_SITE' environment variable to the location of the site script.
-A warning: not all `configure' scripts look for a site script.
-
-Defining Variables
-==================
-
-Variables not defined in a site shell script can be set in the
-environment passed to `configure'. However, some packages may run
-configure again during the build, and the customized values of these
-variables may be lost. In order to avoid this problem, you should set
-them in the `configure' command line, using `VAR=value'. For example:
-
- ./configure CC=/usr/local2/bin/gcc
-
-causes the specified `gcc' to be used as the C compiler (unless it is
-overridden in the site shell script).
-
-Unfortunately, this technique does not work for `CONFIG_SHELL' due to
-an Autoconf bug. Until the bug is fixed you can use this workaround:
-
- CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
-
-`configure' Invocation
-======================
-
-`configure' recognizes the following options to control how it operates.
-
-`--help'
-`-h'
- Print a summary of the options to `configure', and exit.
-
-`--version'
-`-V'
- Print the version of Autoconf used to generate the `configure'
- script, and exit.
-
-`--cache-file=FILE'
- Enable the cache: use and save the results of the tests in FILE,
- traditionally `config.cache'. FILE defaults to `/dev/null' to
- disable caching.
-
-`--config-cache'
-`-C'
- Alias for `--cache-file=config.cache'.
-
-`--quiet'
-`--silent'
-`-q'
- Do not print messages saying which checks are being made. To
- suppress all normal output, redirect it to `/dev/null' (any error
- messages will still be shown).
-
-`--srcdir=DIR'
- Look for the package's source code in directory DIR. Usually
- `configure' can determine that directory automatically.
-
-`configure' also accepts some other, not widely useful, options. Run
-`configure --help' for more details.
-
+ source code directory by typing `ninja -C build clean`.
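
Putting the steps above together, a minimal sketch of a full configure/build/test/install cycle under meson (the `build/` directory name and the `--prefix` value are arbitrary example choices):

    meson setup build/ --prefix=/usr/local
    ninja -C build
    ninja -C build test
    ninja -C build install
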
diff --git a/Makefile.am b/Makefile.am
deleted file mode 100644
index 5137c9e..0000000
--- a/Makefile.am
+++ /dev/null
@@ -1,137 +0,0 @@
-SUBDIRS = pixman demos test
-
-pkgconfigdir=$(libdir)/pkgconfig
-pkgconfig_DATA=pixman-1.pc
-
-$(pkgconfig_DATA): pixman-1.pc.in
-
-snapshot:
- distdir="$(distdir)-`date '+%Y%m%d'`"; \
- test -d "$(srcdir)/.git" && distdir=$$distdir-`cd "$(srcdir)" && git rev-parse HEAD | cut -c 1-6`; \
- $(MAKE) $(AM_MAKEFLAGS) distdir="$$distdir" dist
-
-GPGKEY=3892336E
-USERNAME=$$USER
-RELEASE_OR_SNAPSHOT = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo release; else echo snapshot; fi)
-RELEASE_CAIRO_HOST = $(USERNAME)@cairographics.org
-RELEASE_CAIRO_DIR = /srv/cairo.freedesktop.org/www/$(RELEASE_OR_SNAPSHOT)s
-RELEASE_CAIRO_URL = http://cairographics.org/$(RELEASE_OR_SNAPSHOT)s
-RELEASE_XORG_URL = http://xorg.freedesktop.org/archive/individual/lib
-RELEASE_XORG_HOST = $(USERNAME)@xorg.freedesktop.org
-RELEASE_XORG_DIR = /srv/xorg.freedesktop.org/archive/individual/lib
-RELEASE_ANNOUNCE_LIST = cairo-announce@cairographics.org, xorg-announce@lists.freedesktop.org, pixman@lists.freedesktop.org
-
-EXTRA_DIST = \
- Makefile.win32 \
- Makefile.win32.common
-
-tar_gz = $(PACKAGE)-$(VERSION).tar.gz
-tar_bz2 = $(PACKAGE)-$(VERSION).tar.bz2
-
-sha1_tgz = $(tar_gz).sha1
-md5_tgz = $(tar_gz).md5
-
-sha1_tbz2 = $(tar_bz2).sha1
-md5_tbz2 = $(tar_bz2).md5
-
-gpg_file = $(sha1_tgz).asc
-
-$(sha1_tgz): $(tar_gz)
- sha1sum $^ > $@
-
-$(md5_tgz): $(tar_gz)
- md5sum $^ > $@
-
-$(sha1_tbz2): $(tar_bz2)
- sha1sum $^ > $@
-
-$(md5_tbz2): $(tar_bz2)
- md5sum $^ > $@
-
-$(gpg_file): $(sha1_tgz)
- @echo "Please enter your GPG password to sign the checksum."
- gpg --armor --sign $^
-
-HASHFILES = $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(md5_tbz2)
-
-release-verify-newer:
- @echo -n "Checking that no $(VERSION) release already exists at $(RELEASE_XORG_HOST)..."
- @ssh $(RELEASE_XORG_HOST) test ! -e $(RELEASE_XORG_DIR)/$(tar_gz) \
- || (echo "Ouch." && echo "Found: $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)/$(tar_gz)" \
- && echo "Refusing to try to generate a new release of the same name." \
- && false)
- @ssh $(RELEASE_CAIRO_HOST) test ! -e $(RELEASE_CAIRO_DIR)/$(tar_gz) \
- || (echo "Ouch." && echo "Found: $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)/$(tar_gz)" \
- && echo "Refusing to try to generate a new release of the same name." \
- && false)
- @echo "Good."
-
-release-remove-old:
- $(RM) $(tar_gz) $(tar_bz2) $(HASHFILES) $(gpg_file)
-
-ensure-prev:
- @if [[ "$(PREV)" == "" ]]; then \
- echo "" && \
- echo "You must set the PREV variable on the make command line to" && \
- echo "the last version." && \
- echo "" && \
- echo "For example:" && \
- echo " make PREV=0.7.3" && \
- echo "" && \
- false; \
- fi
-
-release-check: ensure-prev release-verify-newer release-remove-old distcheck
-
-release-tag:
- git tag -u $(GPGKEY) -m "$(PACKAGE) $(VERSION) release" $(PACKAGE)-$(VERSION)
-
-release-upload: release-check $(tar_gz) $(tar_bz2) $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(gpg_file)
- scp $(tar_gz) $(sha1_tgz) $(gpg_file) $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)
- scp $(tar_gz) $(tar_bz2) $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)
- ssh $(RELEASE_CAIRO_HOST) "rm -f $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-[0-9]* && ln -s $(tar_gz) $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-$(VERSION)"
-
-RELEASE_TYPE = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo "stable release in the" ; else echo "development snapshot leading up to a stable"; fi)
-
-release-publish-message: $(HASHFILES) ensure-prev
- @echo "Please follow the instructions in RELEASING to push stuff out and"
- @echo "send out the announcement mails. Here is the excerpt you need:"
- @echo ""
- @echo "Lists: $(RELEASE_ANNOUNCE_LIST)"
- @echo "Subject: [ANNOUNCE] $(PACKAGE) release $(VERSION) now available"
- @echo "============================== CUT HERE =============================="
- @echo "A new $(PACKAGE) release $(VERSION) is now available. This is a $(RELEASE_TYPE)"
- @echo ""
- @echo "tar.gz:"
- @echo " $(RELEASE_CAIRO_URL)/$(tar_gz)"
- @echo " $(RELEASE_XORG_URL)/$(tar_gz)"
- @echo ""
- @echo "tar.bz2:"
- @echo " $(RELEASE_XORG_URL)/$(tar_bz2)"
- @echo ""
- @echo "Hashes:"
- @echo -n " MD5: "
- @cat $(md5_tgz)
- @echo -n " MD5: "
- @cat $(md5_tbz2)
- @echo -n " SHA1: "
- @cat $(sha1_tgz)
- @echo -n " SHA1: "
- @cat $(sha1_tbz2)
- @echo ""
- @echo "GPG signature:"
- @echo " $(RELEASE_CAIRO_URL)/$(gpg_file)"
- @echo " (signed by`gpg --list-keys $(GPGKEY) | grep uid | cut -b4- | tr -s " "`)"
- @echo ""
- @echo "Git:"
- @echo " git://git.freedesktop.org/git/pixman"
- @echo " tag: $(PACKAGE)-$(VERSION)"
- @echo ""
- @echo "Log:"
- @git log --no-merges "$(PACKAGE)-$(PREV)".."$(PACKAGE)-$(VERSION)" | git shortlog | awk '{ printf "\t"; print ; }' | cut -b1-80
- @echo "============================== CUT HERE =============================="
- @echo ""
-
-release-publish: release-upload release-tag release-publish-message
-
-.PHONY: release-upload release-publish release-publish-message release-tag
diff --git a/Makefile.win32 b/Makefile.win32
deleted file mode 100644
index c3ca3bc..0000000
--- a/Makefile.win32
+++ /dev/null
@@ -1,25 +0,0 @@
-default: all
-
-top_srcdir = .
-include $(top_srcdir)/Makefile.win32.common
-
-all: pixman test
-
-pixman:
- @$(MAKE) -C pixman -f Makefile.win32
-
-test:
- @$(MAKE) -C test -f Makefile.win32
-
-clean_r:
- @$(MAKE) -C pixman -f Makefile.win32 clean
- @$(MAKE) -C test -f Makefile.win32 clean
-
-check:
- @$(MAKE) -C test -f Makefile.win32 check
-
-
-clean: clean_r
-
-
-.PHONY: all pixman test clean check
diff --git a/Makefile.win32.common b/Makefile.win32.common
deleted file mode 100644
index 777f94c..0000000
--- a/Makefile.win32.common
+++ /dev/null
@@ -1,56 +0,0 @@
-LIBRARY = pixman-1
-
-CC = cl
-LD = link
-AR = lib
-PERL = perl
-
-ifeq ($(top_builddir),)
-top_builddir = $(top_srcdir)
-endif
-
-CFG_VAR = $(CFG)
-ifeq ($(CFG_VAR),)
-CFG_VAR = release
-endif
-
-ifeq ($(CFG_VAR),debug)
-CFG_CFLAGS = -MDd -Od -Zi
-CFG_LDFLAGS = -DEBUG
-else
-CFG_CFLAGS = -MD -O2
-CFG_LDFLAGS =
-endif
-
-# Package definitions, to be used instead of those provided in config.h
-PKG_CFLAGS = -DPACKAGE=$(LIBRARY) -DPACKAGE_VERSION="" -DPACKAGE_BUGREPORT=""
-
-BASE_CFLAGS = -nologo -I. -I$(top_srcdir) -I$(top_srcdir)/pixman
-
-PIXMAN_CFLAGS = $(BASE_CFLAGS) $(PKG_CFLAGS) $(CFG_CFLAGS) $(CFLAGS)
-PIXMAN_LDFLAGS = -nologo $(CFG_LDFLAGS) $(LDFLAGS)
-PIXMAN_ARFLAGS = -nologo $(LDFLAGS)
-
-
-inform:
-ifneq ($(CFG),release)
-ifneq ($(CFG),debug)
-ifneq ($(CFG),)
- @echo "Invalid specified configuration option: "$(CFG)"."
- @echo
- @echo "Possible choices for configuration are 'release' and 'debug'"
- @exit 1
-endif
- @echo "Using default RELEASE configuration... (use CFG=release or CFG=debug)"
-endif
-endif
-
-
-$(CFG_VAR)/%.obj: %.c $(libpixman_headers)
- @mkdir -p $(CFG_VAR)
- @$(CC) -c $(PIXMAN_CFLAGS) -Fo"$@" $<
-
-clean: inform
- @$(RM) $(CFG_VAR)/*.{exe,ilk,lib,obj,pdb} $(BUILT_SOURCES) || exit 0
-
-.PHONY: inform clean
diff --git a/README b/README
index 6d8cfd8..e0e7dca 100644
--- a/README
+++ b/README
@@ -1,14 +1,20 @@
+Pixman
+======
+
Pixman is a library that provides low-level pixel manipulation
features such as image compositing and trapezoid rasterization.
-Questions, bug reports and patches should be directed to the pixman
-mailing list:
+Questions should be directed to the pixman mailing list:
- http://lists.freedesktop.org/mailman/listinfo/pixman
+ https://lists.freedesktop.org/mailman/listinfo/pixman
You can also file bugs at
- https://bugs.freedesktop.org/enter_bug.cgi?product=pixman
+ https://gitlab.freedesktop.org/pixman/pixman/-/issues/new
+
+or submit improvements in the form of a Merge Request via
+
+ https://gitlab.freedesktop.org/pixman/pixman/-/merge_requests
For real time discussions about pixman, feel free to join the IRC
channels #cairo and #xorg-devel on the FreeNode IRC network.
@@ -21,53 +27,65 @@ In order to contribute to pixman, you will need a working knowledge of
the git version control system. For a quick getting started guide,
there is the "Everyday Git With 20 Commands Or So guide"
- http://www.kernel.org/pub/software/scm/git/docs/everyday.html
+ https://www.kernel.org/pub/software/scm/git/docs/everyday.html
from the Git homepage. For more in depth git documentation, see the
resources on the Git community documentation page:
- http://git-scm.com/documentation
+ https://git-scm.com/documentation
Pixman uses the infrastructure from the freedesktop.org umbrella
project. For instructions about how to use the git service on
freedesktop.org, see:
- http://www.freedesktop.org/wiki/Infrastructure/git/Developers
+ https://www.freedesktop.org/wiki/Infrastructure/git/Developers
The Pixman master repository can be found at:
- git://anongit.freedesktop.org/git/pixman
-
-and browsed on the web here:
-
- http://cgit.freedesktop.org/pixman/
+ https://gitlab.freedesktop.org/pixman/pixman
Sending patches
---------------
-The general workflow for sending patches is to first make sure that
-git can send mail on your system. Then,
+Patches should be submitted in the form of Merge Requests via GitLab.
- - create a branch off of master in your local git repository
+You will first need to create a fork of the main pixman repository at
- - make your changes as one or more commits
+ https://gitlab.freedesktop.org/pixman/pixman
- - use the
+via the Fork button on the top right. Once that is done, you can add your
+personal repository as a remote to your local pixman development git checkout:
- git send-email
+ git remote add my-gitlab git@gitlab.freedesktop.org:YOURUSERNAME/pixman.git
- command to send the patch series to pixman@lists.freedesktop.org.
+ git fetch my-gitlab
-In order for your patches to be accepted, please consider the
-following guidelines:
+Make sure you have added SSH keys to your GitLab profile at
- - This link:
+ https://gitlab.freedesktop.org/profile/keys
- http://www.kernel.org/pub/software/scm/git/docs/user-manual.html#patch-series
+Once that is set up, the general workflow for sending patches is to create a
+new local branch with your improvements and, once it's ready, push it to your
+personal pixman fork:
- describes how what a good patch series is, and to create one with
- git.
+ git checkout -b fix-some-bug
+ ...
+ git push my-gitlab
+
+The output of the `git push` command will include a link that allows you to
+create a Merge Request against the official pixman repository.
+
+Whenever you make changes to your branch (adding new commits or fixing up
+existing ones), push them back to your personal pixman fork:
+
+ git push -f my-gitlab
+
+If there is an open Merge Request, GitLab will automatically pick up the
+changes from your branch so pixman developers can review them again.
+
+In order for your patches to be accepted, please consider the
+following guidelines:
- At each point in the series, pixman should compile and the test
suite should pass.
@@ -79,7 +97,7 @@ following guidelines:
You can run the test suite with
- make check
+ meson test -C builddir
It will take around two minutes to run on a modern PC.
@@ -101,7 +119,7 @@ following guidelines:
- If review comments were incorporated, a brief version
history describing what those changes were.
- - For big patch series, send an introductory email with an overall
+ - For big patch series, write an introductory post with an overall
description of the patch series, including benchmarks and
motivation. Each commit message should still be descriptive and
include enough information to understand why this particular commit
@@ -111,6 +129,6 @@ Pixman has high standards for code quality and so almost everybody
should expect to have the first versions of their patches rejected.
If you think that the reviewers are wrong about something, or that the
-guidelines above are wrong, feel free to discuss the issue on the
-list. The purpose of the guidelines and code review is to ensure high
-code quality; it is not an exercise in compliance.
+guidelines above are wrong, feel free to discuss the issue. The purpose
+of the guidelines and code review is to ensure high code quality; it is
+not an exercise in compliance.
diff --git a/RELEASING b/RELEASING
index 657857d..fc4edc9 100644
--- a/RELEASING
+++ b/RELEASING
@@ -10,12 +10,11 @@ Here are the steps to follow to create a new pixman release:
git log master...origin (no output; note: *3* dots)
-2) Increment pixman_(major|minor|micro) in configure.ac according to
- the directions in that file.
+2) Increment the version in meson.build.
3) Make sure that new version works, including
- - make distcheck passes
+ - meson test passes
- the X server still works with the new pixman version
installed
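
For step 2, the version that configure.ac used to assemble from pixman_major/minor/micro now lives in the project() call at the top of meson.build. A sketch of the line to edit (the version number here is purely illustrative):

    project('pixman', 'c', version : '0.38.0')
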
diff --git a/a64-neon-test.S b/a64-neon-test.S
new file mode 100644
index 0000000..5d4a4ea
--- /dev/null
+++ b/a64-neon-test.S
@@ -0,0 +1,5 @@
+.text
+.arch armv8-a
+.altmacro
+prfm pldl2strm, [x0]
+xtn v0.8b, v0.8h
diff --git a/arm-simd-test.S b/arm-simd-test.S
new file mode 100644
index 0000000..910c814
--- /dev/null
+++ b/arm-simd-test.S
@@ -0,0 +1,10 @@
+.text
+.arch armv6
+.object_arch armv4
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+uqadd8 r0, r0, r0
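
Like a64-neon-test.S above, this file is not compiled into the library; it is a probe that the build system tries to assemble to decide whether the ARM SIMD fast paths can be enabled, just as the deleted configure.ac check further down compiles the identical program with `-x assembler-with-cpp`. A sketch of how such a probe can be wired up in meson (assumed wiring, not necessarily verbatim from the new meson.build):

    cc = meson.get_compiler('c')
    have_arm_simd = cc.compiles(files('arm-simd-test.S')[0],
                                args : ['-x', 'assembler-with-cpp'],
                                name : 'ARM SIMD assembly')
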
diff --git a/autogen.sh b/autogen.sh
deleted file mode 100755
index fc34bd5..0000000
--- a/autogen.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /bin/sh
-
-srcdir=`dirname $0`
-test -z "$srcdir" && srcdir=.
-
-ORIGDIR=`pwd`
-cd $srcdir
-
-autoreconf -v --install || exit 1
-cd $ORIGDIR || exit $?
-
-if test -z "$NOCONFIGURE"; then
- $srcdir/configure "$@"
-fi
diff --git a/configure.ac b/configure.ac
deleted file mode 100644
index 0339494..0000000
--- a/configure.ac
+++ /dev/null
@@ -1,1129 +0,0 @@
-dnl Copyright 2005 Red Hat, Inc.
-dnl
-dnl Permission to use, copy, modify, distribute, and sell this software and its
-dnl documentation for any purpose is hereby granted without fee, provided that
-dnl the above copyright notice appear in all copies and that both that
-dnl copyright notice and this permission notice appear in supporting
-dnl documentation, and that the name of Red Hat not be used in
-dnl advertising or publicity pertaining to distribution of the software without
-dnl specific, written prior permission. Red Hat makes no
-dnl representations about the suitability of this software for any purpose. It
-dnl is provided "as is" without express or implied warranty.
-dnl
-dnl RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
-dnl INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
-dnl EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
-dnl CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
-dnl DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-dnl TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-dnl PERFORMANCE OF THIS SOFTWARE.
-dnl
-dnl Process this file with autoconf to create configure.
-
-AC_PREREQ([2.57])
-
-# Pixman versioning scheme
-#
-# - The version in git has an odd MICRO version number
-#
-# - Released versions, both development and stable, have an
-# even MICRO version number
-#
-# - Released development versions have an odd MINOR number
-#
-# - Released stable versions have an even MINOR number
-#
-# - Versions that break ABI must have a new MAJOR number
-#
-# - If you break the ABI, then at least this must be done:
-#
-# - increment MAJOR
-#
-# - In the first development release where you break ABI, find
-# all instances of "pixman-n" and change them to pixman-(n+1)
-#
-# This needs to be done at least in
-# configure.ac
-# all Makefile.am's
-# pixman-n.pc.in
-#
-# This ensures that binary incompatible versions can be installed
-# in parallel. See http://www106.pair.com/rhp/parallel.html for
-# more information
-#
-
-m4_define([pixman_major], 0)
-m4_define([pixman_minor], 33)
-m4_define([pixman_micro], 1)
-
-m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
-
-AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
-AM_INIT_AUTOMAKE([foreign dist-bzip2])
-
-# Suppress verbose compile lines
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-
-AC_CONFIG_HEADERS(config.h)
-
-AC_CANONICAL_HOST
-
-test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
-
-AC_PROG_CC
-AM_PROG_AS
-AC_PROG_LIBTOOL
-AC_CHECK_FUNCS([getisax])
-AC_C_BIGENDIAN
-AC_C_INLINE
-
-dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
-dnl
-dnl Compiles and links the given program in the environment setup by env-setup
-dnl and executes true-action on success and false-action on failure.
-AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
- save_CFLAGS="$CFLAGS"
- save_LDFLAGS="$LDFLAGS"
- save_LIBS="$LIBS"
- CFLAGS=""
- LDFLAGS=""
- LIBS=""
- $1
- CFLAGS="$save_CFLAGS $CFLAGS"
- LDFLAGS="$save_LDFLAGS $LDFLAGS"
- LIBS="$save_LIBS $LIBS"
- AC_LINK_IFELSE(
- [AC_LANG_SOURCE([$2])],
- [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
- pixman_cc_flag=yes],
- [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
- pixman_cc_flag=no])
-
- if test "x$pixman_cc_stderr" != "x"; then
- pixman_cc_flag=no
- fi
-
- if test "x$pixman_cc_flag" = "xyes"; then
- ifelse([$3], , :, [$3])
- else
- ifelse([$4], , :, [$4])
- fi
- CFLAGS="$save_CFLAGS"
- LDFLAGS="$save_LDFLAGS"
- LIBS="$save_LIBS"
-])
-
-dnl Find a -Werror for catching warnings.
-WERROR=
-for w in -Werror -errwarn; do
- if test "z$WERROR" = "z"; then
- AC_MSG_CHECKING([whether the compiler supports $w])
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS=$w],
- [int main(int c, char **v) { (void)c; (void)v; return 0; }],
- [WERROR=$w; yesno=yes], [yesno=no])
- AC_MSG_RESULT($yesno)
- fi
-done
-
-dnl PIXMAN_CHECK_CFLAG(flag, [program])
-dnl Adds flag to CFLAGS if the given program links without warnings or errors.
-AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
- AC_MSG_CHECKING([whether the compiler supports $1])
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS="$WERROR $1"],
- [$2
- int main(int c, char **v) { (void)c; (void)v; return 0; }
- ],
- [_yesno=yes],
- [_yesno=no])
- if test "x$_yesno" = xyes; then
- CFLAGS="$CFLAGS $1"
- fi
- AC_MSG_RESULT($_yesno)
-])
-
-AC_CHECK_SIZEOF(long)
-
-# Checks for Sun Studio compilers
-AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
-AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
-
-# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
-# if we're using Sun Studio and neither the user nor a config.site
-# has set CFLAGS.
-if test $SUNCC = yes && \
- test "x$test_CFLAGS" = "x" && \
- test "$CFLAGS" = "-g"
-then
- CFLAGS="-O -g"
-fi
-
-#
-# We ignore pixman_major in the version here because the major version should
-# always be encoded in the actual library name. Ie., the soname is:
-#
-# pixman-$(pixman_major).0.minor.micro
-#
-m4_define([lt_current], [pixman_minor])
-m4_define([lt_revision], [pixman_micro])
-m4_define([lt_age], [pixman_minor])
-
-LT_VERSION_INFO="lt_current:lt_revision:lt_age"
-
-PIXMAN_VERSION_MAJOR=pixman_major()
-AC_SUBST(PIXMAN_VERSION_MAJOR)
-PIXMAN_VERSION_MINOR=pixman_minor()
-AC_SUBST(PIXMAN_VERSION_MINOR)
-PIXMAN_VERSION_MICRO=pixman_micro()
-AC_SUBST(PIXMAN_VERSION_MICRO)
-
-AC_SUBST(LT_VERSION_INFO)
-
-# Check for dependencies
-
-PIXMAN_CHECK_CFLAG([-Wall])
-PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
-PIXMAN_CHECK_CFLAG([-Wno-unused-local-typedefs])
-PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
-
-dnl =========================================================================
-dnl OpenMP for the test suite?
-dnl
-
-# Check for OpenMP support only when autoconf support that (require autoconf >=2.62)
-OPENMP_CFLAGS=
-m4_ifdef([AC_OPENMP], [AC_OPENMP])
-
-if test "x$enable_openmp" = "xyes" && test "x$ac_cv_prog_c_openmp" = "xunsupported" ; then
- AC_MSG_WARN([OpenMP support requested but found unsupported])
-fi
-
-dnl May not fail to link without -Wall -Werror added
-dnl So try to link only when openmp is supported
-dnl ac_cv_prog_c_openmp is not defined when --disable-openmp is used
-if test "x$ac_cv_prog_c_openmp" != "xunsupported" && test "x$ac_cv_prog_c_openmp" != "x"; then
- m4_define([openmp_test_program],[dnl
- #include <stdio.h>
-
- extern unsigned int lcg_seed;
- #pragma omp threadprivate(lcg_seed)
- unsigned int lcg_seed;
-
- unsigned function(unsigned a, unsigned b)
- {
- lcg_seed ^= b;
- return ((a + b) ^ a ) + lcg_seed;
- }
-
- int main(int argc, char **argv)
- {
- int i;
- int n1 = 0, n2 = argc;
- unsigned checksum = 0;
- int verbose = argv != NULL;
- unsigned (*test_function)(unsigned, unsigned);
- test_function = function;
- #pragma omp parallel for reduction(+:checksum) default(none) \
- shared(n1, n2, test_function, verbose)
- for (i = n1; i < n2; i++)
- {
- unsigned crc = test_function (i, 0);
- if (verbose)
- printf ("%d: %08X\n", i, crc);
- checksum += crc;
- }
- printf("%u\n", checksum);
- return 0;
- }
- ])
-
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
- [openmp_test_program],
- [have_openmp=yes],
- [have_openmp=no])
- if test "x$have_openmp" = "xyes" ; then
- AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
- fi
-fi
-AC_SUBST(OPENMP_CFLAGS)
-
-dnl =========================================================================
-dnl -fvisibility stuff
-
-PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
-#if defined(__GNUC__) && (__GNUC__ >= 4)
-#ifdef _WIN32
-#error Have -fvisibility but it is ignored and generates a warning
-#endif
-#else
-#error Need GCC 4.0 for visibility
-#endif
-])
-
-PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
-#else
-#error Need Sun Studio 8 for visibility
-#endif
-])
-
-dnl ===========================================================================
-dnl Check for Loongson Multimedia Instructions
-
-if test "x$LS_CFLAGS" = "x" ; then
- LS_CFLAGS="-march=loongson2f"
-fi
-
-have_loongson_mmi=no
-AC_MSG_CHECKING(whether to use Loongson MMI assembler)
-
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS=" $LS_CFLAGS $CFLAGS -I$srcdir"
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-#ifndef __mips_loongson_vector_rev
-#error "Loongson Multimedia Instructions are only available on Loongson"
-#endif
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
-#error "Need GCC >= 4.4 for Loongson MMI compilation"
-#endif
-#include "pixman/loongson-mmintrin.h"
-int main () {
- union {
- __m64 v;
- char c[8];
- } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
- int b = 4;
- __m64 c = _mm_srli_pi16 (a.v, b);
- return 0;
-}]])], have_loongson_mmi=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(loongson-mmi,
- [AC_HELP_STRING([--disable-loongson-mmi],
- [disable Loongson MMI fast paths])],
- [enable_loongson_mmi=$enableval], [enable_loongson_mmi=auto])
-
-if test $enable_loongson_mmi = no ; then
- have_loongson_mmi=disabled
-fi
-
-if test $have_loongson_mmi = yes ; then
- AC_DEFINE(USE_LOONGSON_MMI, 1, [use Loongson Multimedia Instructions])
-else
- LS_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_loongson_mmi)
-if test $enable_loongson_mmi = yes && test $have_loongson_mmi = no ; then
- AC_MSG_ERROR([Loongson MMI not detected])
-fi
-
-AM_CONDITIONAL(USE_LOONGSON_MMI, test $have_loongson_mmi = yes)
-
-dnl ===========================================================================
-dnl Check for MMX
-
-if test "x$MMX_CFLAGS" = "x" ; then
- if test "x$SUNCC" = "xyes"; then
- # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
- # but if we're building 64-bit, mmx & sse support is on by default and
- # -xarch=sse throws an error instead
- if test "$AMD64_ABI" = "no" ; then
- MMX_CFLAGS="-xarch=sse"
- fi
- else
- MMX_CFLAGS="-mmmx -Winline"
- fi
-fi
-
-have_mmx_intrinsics=no
-AC_MSG_CHECKING(whether to use MMX intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$MMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-#error "Need GCC >= 3.4 for MMX intrinsics"
-#endif
-#include <mmintrin.h>
-int main () {
- __m64 v = _mm_cvtsi32_si64 (1);
- __m64 w;
-
- /* Some versions of clang will choke on K */
- asm ("pshufw %2, %1, %0\n\t"
- : "=y" (w)
- : "y" (v), "K" (5)
- );
-
- /* Some versions of clang will choke on this */
- asm ("pmulhuw %1, %0\n\t"
- : "+y" (w)
- : "y" (v)
- );
-
- return _mm_cvtsi64_si32 (v);
-}]])], have_mmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(mmx,
- [AC_HELP_STRING([--disable-mmx],
- [disable x86 MMX fast paths])],
- [enable_mmx=$enableval], [enable_mmx=auto])
-
-if test $enable_mmx = no ; then
- have_mmx_intrinsics=disabled
-fi
-
-if test $have_mmx_intrinsics = yes ; then
- AC_DEFINE(USE_X86_MMX, 1, [use x86 MMX compiler intrinsics])
-else
- MMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_mmx_intrinsics)
-if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
- AC_MSG_ERROR([x86 MMX intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_X86_MMX, test $have_mmx_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Check for SSE2
-
-if test "x$SSE2_CFLAGS" = "x" ; then
- if test "x$SUNCC" = "xyes"; then
- # SSE2 is enabled by default in the Sun Studio 64-bit environment
- if test "$AMD64_ABI" = "no" ; then
- SSE2_CFLAGS="-xarch=sse2"
- fi
- else
- SSE2_CFLAGS="-msse2 -Winline"
- fi
-fi
-
-have_sse2_intrinsics=no
-AC_MSG_CHECKING(whether to use SSE2 intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$SSE2_CFLAGS $CFLAGS"
-
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
-# if !defined(__amd64__) && !defined(__x86_64__)
-# error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
-# endif
-#endif
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-int main () {
- __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
- c = _mm_xor_si128 (a, b);
- return 0;
-}]])], have_sse2_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(sse2,
- [AC_HELP_STRING([--disable-sse2],
- [disable SSE2 fast paths])],
- [enable_sse2=$enableval], [enable_sse2=auto])
-
-if test $enable_sse2 = no ; then
- have_sse2_intrinsics=disabled
-fi
-
-if test $have_sse2_intrinsics = yes ; then
- AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
-fi
-
-AC_MSG_RESULT($have_sse2_intrinsics)
-if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
- AC_MSG_ERROR([SSE2 intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Check for SSSE3
-
-if test "x$SSSE3_CFLAGS" = "x" ; then
- SSSE3_CFLAGS="-mssse3 -Winline"
-fi
-
-have_ssse3_intrinsics=no
-AC_MSG_CHECKING(whether to use SSSE3 intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$SSSE3_CFLAGS $CFLAGS"
-
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-int main () {
- __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
- c = _mm_maddubs_epi16 (a, b);
- return 0;
-}]])], have_ssse3_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(ssse3,
- [AC_HELP_STRING([--disable-ssse3],
- [disable SSSE3 fast paths])],
- [enable_ssse3=$enableval], [enable_ssse3=auto])
-
-if test $enable_ssse3 = no ; then
- have_ssse3_intrinsics=disabled
-fi
-
-if test $have_ssse3_intrinsics = yes ; then
- AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics])
-fi
-
-AC_MSG_RESULT($have_ssse3_intrinsics)
-if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then
- AC_MSG_ERROR([SSSE3 intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Other special flags needed when building code using MMX or SSE instructions
-case $host_os in
- solaris*)
- # When building 32-bit binaries, apply a mapfile to ensure that the
- # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
- # since they check at runtime before using those instructions.
- # Not all linkers grok the mapfile format so we check for that first.
- if test "$AMD64_ABI" = "no" ; then
- use_hwcap_mapfile=no
- AC_MSG_CHECKING(whether to use a hardware capability map file)
- hwcap_save_LDFLAGS="$LDFLAGS"
- HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
- LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
- AC_LINK_IFELSE([AC_LANG_SOURCE([[int main() { return 0; }]])],
- use_hwcap_mapfile=yes,
- HWCAP_LDFLAGS="")
- LDFLAGS="$hwcap_save_LDFLAGS"
- AC_MSG_RESULT($use_hwcap_mapfile)
- fi
- if test "x$MMX_LDFLAGS" = "x" ; then
- MMX_LDFLAGS="$HWCAP_LDFLAGS"
- fi
- if test "x$SSE2_LDFLAGS" = "x" ; then
- SSE2_LDFLAGS="$HWCAP_LDFLAGS"
- fi
- ;;
-esac
-
-AC_SUBST(LS_CFLAGS)
-AC_SUBST(IWMMXT_CFLAGS)
-AC_SUBST(MMX_CFLAGS)
-AC_SUBST(MMX_LDFLAGS)
-AC_SUBST(SSE2_CFLAGS)
-AC_SUBST(SSE2_LDFLAGS)
-AC_SUBST(SSSE3_CFLAGS)
-
-dnl ===========================================================================
-dnl Check for VMX/Altivec
-if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
- VMX_CFLAGS="-faltivec"
-else
- VMX_CFLAGS="-maltivec -mabi=altivec"
-fi
-
-have_vmx_intrinsics=no
-AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$VMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-#error "Need GCC >= 3.4 for sane altivec support"
-#endif
-#include <altivec.h>
-int main () {
- vector unsigned int v = vec_splat_u32 (1);
- v = vec_sub (v, v);
- return 0;
-}]])], have_vmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(vmx,
- [AC_HELP_STRING([--disable-vmx],
- [disable VMX fast paths])],
- [enable_vmx=$enableval], [enable_vmx=auto])
-
-if test $enable_vmx = no ; then
- have_vmx_intrinsics=disabled
-fi
-
-if test $have_vmx_intrinsics = yes ; then
- AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
-else
- VMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_vmx_intrinsics)
-if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
- AC_MSG_ERROR([VMX intrinsics not detected])
-fi
-
-AC_SUBST(VMX_CFLAGS)
-
-AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports ARM SIMD instructions
-have_arm_simd=no
-AC_MSG_CHECKING(whether to use ARM SIMD assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-.text
-.arch armv6
-.object_arch armv4
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-uqadd8 r0, r0, r0]])], have_arm_simd=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-simd,
- [AC_HELP_STRING([--disable-arm-simd],
- [disable ARM SIMD fast paths])],
- [enable_arm_simd=$enableval], [enable_arm_simd=auto])
-
-if test $enable_arm_simd = no ; then
- have_arm_simd=disabled
-fi
-
-if test $have_arm_simd = yes ; then
- AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
-
-AC_MSG_RESULT($have_arm_simd)
-if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
- AC_MSG_ERROR([ARM SIMD intrinsics not detected])
-fi
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports NEON instructions
-have_arm_neon=no
-AC_MSG_CHECKING(whether to use ARM NEON assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.eabi_attribute 10, 0
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-vmovn.u16 d0, q0]])], have_arm_neon=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-neon,
- [AC_HELP_STRING([--disable-arm-neon],
- [disable ARM NEON fast paths])],
- [enable_arm_neon=$enableval], [enable_arm_neon=auto])
-
-if test $enable_arm_neon = no ; then
- have_arm_neon=disabled
-fi
-
-if test $have_arm_neon = yes ; then
- AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
-
-AC_MSG_RESULT($have_arm_neon)
-if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
- AC_MSG_ERROR([ARM NEON intrinsics not detected])
-fi
-
-dnl ===========================================================================
-dnl Check for IWMMXT
-
-AC_ARG_ENABLE(arm-iwmmxt,
- [AC_HELP_STRING([--disable-arm-iwmmxt],
- [disable ARM IWMMXT fast paths])],
- [enable_iwmmxt=$enableval], [enable_iwmmxt=auto])
-
-AC_ARG_ENABLE(arm-iwmmxt2,
- [AC_HELP_STRING([--disable-arm-iwmmxt2],
- [build ARM IWMMXT fast paths with -march=iwmmxt instead of -march=iwmmxt2])],
- [enable_iwmmxt2=$enableval], [enable_iwmmxt2=auto])
-
-if test "x$IWMMXT_CFLAGS" = "x" ; then
- IWMMXT_CFLAGS="-flax-vector-conversions -Winline -march=iwmmxt"
- if test $enable_iwmmxt2 != no ; then
- IWMMXT_CFLAGS="${IWMMXT_CFLAGS}2"
- fi
-fi
-
-have_iwmmxt_intrinsics=no
-AC_MSG_CHECKING(whether to use ARM IWMMXT intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $IWMMXT_CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#ifndef __arm__
-#error "IWMMXT is only available on ARM"
-#endif
-#ifndef __IWMMXT__
-#error "IWMMXT not enabled (with -march=iwmmxt)"
-#endif
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
-#error "Need GCC >= 4.8 for IWMMXT intrinsics"
-#endif
-#include <mmintrin.h>
-int main () {
- union {
- __m64 v;
- char c[8];
- } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
- int b = 4;
- __m64 c = _mm_srli_si64 (a.v, b);
-}]])], have_iwmmxt_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-if test $enable_iwmmxt = no ; then
- have_iwmmxt_intrinsics=disabled
-fi
-
-if test $have_iwmmxt_intrinsics = yes ; then
- AC_DEFINE(USE_ARM_IWMMXT, 1, [use ARM IWMMXT compiler intrinsics])
-else
- IWMMXT_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_iwmmxt_intrinsics)
-if test $enable_iwmmxt = yes && test $have_iwmmxt_intrinsics = no ; then
- AC_MSG_ERROR([IWMMXT intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_ARM_IWMMXT, test $have_iwmmxt_intrinsics = yes)
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports MIPS DSPr2 instructions
-
-have_mips_dspr2=no
-AC_MSG_CHECKING(whether to use MIPS DSPr2 assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-mdspr2 $CFLAGS"
-
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if !(defined(__mips__) && __mips_isa_rev >= 2)
-#error MIPS DSPr2 is currently only available on MIPS32r2 platforms.
-#endif
-int
-main ()
-{
- int c = 0, a = 0, b = 0;
- __asm__ __volatile__ (
- "precr.qb.ph %[c], %[a], %[b] \n\t"
- : [c] "=r" (c)
- : [a] "r" (a), [b] "r" (b)
- );
- return c;
-}]])], have_mips_dspr2=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(mips-dspr2,
- [AC_HELP_STRING([--disable-mips-dspr2],
- [disable MIPS DSPr2 fast paths])],
- [enable_mips_dspr2=$enableval], [enable_mips_dspr2=auto])
-
-if test $enable_mips_dspr2 = no ; then
- have_mips_dspr2=disabled
-fi
-
-if test $have_mips_dspr2 = yes ; then
- AC_DEFINE(USE_MIPS_DSPR2, 1, [use MIPS DSPr2 assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_MIPS_DSPR2, test $have_mips_dspr2 = yes)
-
-AC_MSG_RESULT($have_mips_dspr2)
-if test $enable_mips_dspr2 = yes && test $have_mips_dspr2 = no ; then
- AC_MSG_ERROR([MIPS DSPr2 instructions not detected])
-fi
-
-dnl =========================================================================================
-dnl Check for GNU-style inline assembly support
-
-have_gcc_inline_asm=no
-AC_MSG_CHECKING(whether to use GNU-style inline assembler)
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-int main () {
- /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
- asm volatile ( "\tnop\n" : : : "cc", "memory" );
- return 0;
-}]])], have_gcc_inline_asm=yes)
-
-AC_ARG_ENABLE(gcc-inline-asm,
- [AC_HELP_STRING([--disable-gcc-inline-asm],
- [disable GNU-style inline assembler])],
- [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
-
-if test $enable_gcc_inline_asm = no ; then
- have_gcc_inline_asm=disabled
-fi
-
-if test $have_gcc_inline_asm = yes ; then
- AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
-fi
-
-AC_MSG_RESULT($have_gcc_inline_asm)
-if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
- AC_MSG_ERROR([GNU-style inline assembler not detected])
-fi
-
-AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
-
-dnl ==============================================
-dnl Static test programs
-
-AC_ARG_ENABLE(static-testprogs,
- [AC_HELP_STRING([--enable-static-testprogs],
- [build test programs as static binaries [default=no]])],
- [enable_static_testprogs=$enableval], [enable_static_testprogs=no])
-
-TESTPROGS_EXTRA_LDFLAGS=
-if test "x$enable_static_testprogs" = "xyes" ; then
- TESTPROGS_EXTRA_LDFLAGS="-all-static"
-fi
-AC_SUBST(TESTPROGS_EXTRA_LDFLAGS)
-
-dnl ==============================================
-dnl Timers
-
-AC_ARG_ENABLE(timers,
- [AC_HELP_STRING([--enable-timers],
- [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
- [enable_timers=$enableval], [enable_timers=no])
-
-if test $enable_timers = yes ; then
- AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
-fi
-AC_SUBST(PIXMAN_TIMERS)
-
-dnl ===================================
-dnl GTK+
-
-AC_ARG_ENABLE(gtk,
- [AC_HELP_STRING([--enable-gtk],
- [enable tests using GTK+ [default=auto]])],
- [enable_gtk=$enableval], [enable_gtk=auto])
-
-PKG_PROG_PKG_CONFIG
-
-if test $enable_gtk = yes ; then
- AC_CHECK_LIB([pixman-1], [pixman_version_string])
- PKG_CHECK_MODULES(GTK, [gtk+-2.0 >= 2.16 pixman-1])
-fi
-
-if test $enable_gtk = auto ; then
- AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
-fi
-
-if test $enable_gtk = auto ; then
- PKG_CHECK_MODULES(GTK, [gtk+-2.0 >= 2.16 pixman-1], [enable_gtk=yes], [enable_gtk=no])
-fi
-
-AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
-
-AC_SUBST(GTK_CFLAGS)
-AC_SUBST(GTK_LIBS)
-
-dnl =====================================
-dnl posix_memalign, sigaction, alarm, gettimeofday
-
-AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
-if test x$have_posix_memalign = xyes; then
- AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
-fi
-
-AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
-if test x$have_sigaction = xyes; then
- AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
-fi
-
-AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
-if test x$have_alarm = xyes; then
- AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
-fi
-
-AC_CHECK_HEADER([sys/mman.h],
- [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
-
-AC_CHECK_FUNC(mmap, have_mmap=yes, have_mmap=no)
-if test x$have_mmap = xyes; then
- AC_DEFINE(HAVE_MMAP, 1, [Whether we have mmap()])
-fi
-
-AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
-if test x$have_mprotect = xyes; then
- AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
-fi
-
-AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
-if test x$have_getpagesize = xyes; then
- AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
-fi
-
-AC_CHECK_HEADER([fenv.h],
- [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
-
-AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
-if test x$have_feenableexcept = xyes; then
- AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
-fi
-
-AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
-AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
-if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
- AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
-fi
-
-dnl =====================================
-dnl Check for missing sqrtf() as, e.g., for Solaris 9
-
-AC_SEARCH_LIBS([sqrtf], [m], [],
- [AC_DEFINE([sqrtf], [sqrt],
- [Define to sqrt if you do not have the `sqrtf' function.])])
-
-dnl =====================================
-dnl Thread local storage
-
-AC_MSG_CHECKING(for thread local storage (TLS) support)
-AC_CACHE_VAL(ac_cv_tls, [
- ac_cv_tls=none
- keywords="__thread __declspec(thread)"
- for kw in $keywords ; do
- AC_TRY_COMPILE([
-#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
-#error This MinGW version has broken __thread support
-#endif
-#ifdef __OpenBSD__
-#error OpenBSD has broken __thread support
-#endif
-
-int $kw test;], [], [ac_cv_tls=$kw; break])
- done
-])
-AC_MSG_RESULT($ac_cv_tls)
-
-if test "$ac_cv_tls" != "none"; then
- AC_DEFINE_UNQUOTED([TLS], $ac_cv_tls, [The compiler supported TLS storage class])
-fi
-
-dnl
-dnl posix tls
-dnl
-
-m4_define([pthread_test_program],AC_LANG_SOURCE([[dnl
-#include <stdlib.h>
-#include <pthread.h>
-
-static pthread_once_t once_control = PTHREAD_ONCE_INIT;
-static pthread_key_t key;
-
-static void
-make_key (void)
-{
- pthread_key_create (&key, NULL);
-}
-
-int
-main ()
-{
- void *value = NULL;
-
- if (pthread_once (&once_control, make_key) != 0)
- {
- value = NULL;
- }
- else
- {
- value = pthread_getspecific (key);
- if (!value)
- {
- value = malloc (100);
- pthread_setspecific (key, value);
- }
- }
- return 0;
-}
-]]))
-
-AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
- if test "z$support_for_pthreads" != "zyes"; then
- PIXMAN_LINK_WITH_ENV(
- [$1], [pthread_test_program],
- [PTHREAD_CFLAGS="$CFLAGS"
- PTHREAD_LIBS="$LIBS"
- PTHREAD_LDFLAGS="$LDFLAGS"
- support_for_pthreads=yes])
- fi
-])
-
-support_for_pthreads=no
-
-AC_MSG_CHECKING(for pthreads)
-
-PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
-PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
-PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
-
-if test $support_for_pthreads = yes; then
- AC_DEFINE([HAVE_PTHREADS], [], [Whether pthreads is supported])
- if test $ac_cv_tls = none ; then
- CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
- fi
-fi
-
-AC_MSG_RESULT($support_for_pthreads)
-
-AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
-AC_SUBST(HAVE_PTHREADS)
-AC_SUBST(PTHREAD_LDFLAGS)
-AC_SUBST(PTHREAD_LIBS)
-AC_SUBST(PTHREAD_CFLAGS)
-
-dnl =====================================
-dnl __attribute__((constructor))
-
-support_for_attribute_constructor=no
-
-AC_MSG_CHECKING(for __attribute__((constructor)))
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
-/* attribute 'constructor' is supported since gcc 2.7, but some compilers
- * may only pretend to be gcc, so let's try to actually use it
- */
-static int x = 1;
-static void __attribute__((constructor)) constructor_function () { x = 0; }
-int main (void) { return x; }
-#else
-#error not gcc or gcc version is older than 2.7
-#endif
-]])], support_for_attribute_constructor=yes)
-
-if test x$support_for_attribute_constructor = xyes; then
- AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
- [],[Whether the tool chain supports __attribute__((constructor))])
-fi
-
-AC_MSG_RESULT($support_for_attribute_constructor)
-AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
-
-dnl =====================================
-dnl __float128
-
-support_for_float128=no
-
-AC_MSG_CHECKING(for __float128)
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-__float128 a = 1.0Q, b = 2.0Q; int main (void) { return a + b; }
-]])], support_for_float128=yes)
-
-if test x$support_for_float128 = xyes; then
- AC_DEFINE([HAVE_FLOAT128], [], [Whether the tool chain supports __float128])
-fi
-
-AC_MSG_RESULT($support_for_float128)
-
-dnl =====================================
-dnl __builtin_clz
-
-support_for_builtin_clz=no
-
-AC_MSG_CHECKING(for __builtin_clz)
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-unsigned int x = 11; int main (void) { return __builtin_clz(x); }
-]])], support_for_builtin_clz=yes)
-
-if test x$support_for_builtin_clz = xyes; then
- AC_DEFINE([HAVE_BUILTIN_CLZ], [], [Whether the compiler supports __builtin_clz])
-fi
-
-AC_MSG_RESULT($support_for_builtin_clz)
-
-dnl =====================================
-dnl GCC vector extensions
-
-support_for_gcc_vector_extensions=no
-
-AC_MSG_CHECKING(for GCC vector extensions)
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-unsigned int __attribute__ ((vector_size(16))) e, a, b;
-int main (void) { e = a - ((b << 27) + (b >> (32 - 27))) + 1; return e[0]; }
-]])], support_for_gcc_vector_extensions=yes)
-
-if test x$support_for_gcc_vector_extensions = xyes; then
- AC_DEFINE([HAVE_GCC_VECTOR_EXTENSIONS], [],
- [Whether the compiler supports GCC vector extensions])
-fi
-
-AC_MSG_RESULT($support_for_gcc_vector_extensions)
-
-dnl ==================
-dnl libpng
-
-AC_ARG_ENABLE(libpng, AS_HELP_STRING([--enable-libpng], [Build support for libpng (default: auto)]),
- [have_libpng=$enableval], [have_libpng=auto])
-
-case x$have_libpng in
- xyes) PKG_CHECK_MODULES(PNG, [libpng]) ;;
- xno) ;;
- *) PKG_CHECK_MODULES(PNG, [libpng], have_libpng=yes, have_libpng=no) ;;
-esac
-
-if test x$have_libpng = xyes; then
- AC_DEFINE([HAVE_LIBPNG], [1], [Whether we have libpng])
-fi
-
-AC_SUBST(HAVE_LIBPNG)
-
-AC_OUTPUT([pixman-1.pc
- pixman-1-uninstalled.pc
- Makefile
- pixman/Makefile
- pixman/pixman-version.h
- demos/Makefile
- test/Makefile])
-
-m4_if(m4_eval(pixman_minor % 2), [1], [
- echo
- echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
- echo
- echo " Thanks for testing this development snapshot of pixman. Please"
- echo " report any problems you find, either by sending email to "
- echo
- echo " pixman@lists.freedesktop.org"
- echo
- echo " or by filing a bug at "
- echo
- echo " https://bugs.freedesktop.org/enter_bug.cgi?product=pixman "
- echo
- echo " If you are looking for a stable release of pixman, please note "
- echo " that stable releases have _even_ minor version numbers. Ie., "
- echo " pixman-0.]m4_eval(pixman_minor & ~1)[.x are stable releases, whereas pixman-$PIXMAN_VERSION_MAJOR.$PIXMAN_VERSION_MINOR.$PIXMAN_VERSION_MICRO is a "
- echo " development snapshot that may contain bugs and experimental "
- echo " features. "
- echo
- echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
- echo
-])
diff --git a/demos/Makefile.am b/demos/Makefile.am
deleted file mode 100644
index e04743d..0000000
--- a/demos/Makefile.am
+++ /dev/null
@@ -1,52 +0,0 @@
-EXTRA_DIST = parrot.c parrot.jpg scale.ui
-
-if HAVE_GTK
-
-AM_CFLAGS = $(OPENMP_CFLAGS)
-AM_LDFLAGS = $(OPENMP_CFLAGS)
-
-LDADD = $(top_builddir)/pixman/libpixman-1.la -lm $(GTK_LIBS) $(PNG_LIBS)
-AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(GTK_CFLAGS) $(PNG_CFLAGS)
-
-GTK_UTILS = gtk-utils.c gtk-utils.h ../test/utils.c ../test/utils.h \
- ../test/utils-prng.c ../test/utils-prng.h
-
-DEMOS = \
- clip-test \
- clip-in \
- composite-test \
- gradient-test \
- radial-test \
- linear-gradient \
- conical-test \
- alpha-test \
- screen-test \
- convolution-test \
- trap-test \
- tri-test \
- quad2quad \
- checkerboard \
- srgb-trap-test \
- srgb-test \
- scale
-
-gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
-alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
-composite_test_SOURCES = composite-test.c $(GTK_UTILS)
-clip_test_SOURCES = clip-test.c $(GTK_UTILS)
-clip_in_SOURCES = clip-in.c $(GTK_UTILS)
-trap_test_SOURCES = trap-test.c $(GTK_UTILS)
-screen_test_SOURCES = screen-test.c $(GTK_UTILS)
-convolution_test_SOURCES = convolution-test.c $(GTK_UTILS)
-radial_test_SOURCES = radial-test.c $(GTK_UTILS)
-linear_gradient_SOURCES = linear-gradient.c $(GTK_UTILS)
-conical_test_SOURCES = conical-test.c $(GTK_UTILS)
-tri_test_SOURCES = tri-test.c $(GTK_UTILS)
-checkerboard_SOURCES = checkerboard.c $(GTK_UTILS)
-srgb_test_SOURCES = srgb-test.c $(GTK_UTILS)
-srgb_trap_test_SOURCES = srgb-trap-test.c $(GTK_UTILS)
-scale_SOURCES = scale.c $(GTK_UTILS)
-
-noinst_PROGRAMS = $(DEMOS)
-
-endif
diff --git a/demos/conical-test.c b/demos/conical-test.c
index 6b32430..5ff1be9 100644
--- a/demos/conical-test.c
+++ b/demos/conical-test.c
@@ -1,4 +1,4 @@
-#include "../test/utils.h"
+#include "utils.h"
#include "gtk-utils.h"
#define SIZE 128
diff --git a/demos/dither.c b/demos/dither.c
new file mode 100644
index 0000000..a6a157a
--- /dev/null
+++ b/demos/dither.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright 2012, Red Hat, Inc.
+ * Copyright 2012, Soren Sandmann
+ * Copyright 2018, Basile Clement
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include "pixman-config.h"
+#endif
+#include <math.h>
+#include <gtk/gtk.h>
+#include <stdlib.h>
+#include "utils.h"
+#include "gtk-utils.h"
+
+#define WIDTH 1024
+#define HEIGHT 640
+
+typedef struct
+{
+ GtkBuilder * builder;
+ pixman_image_t * original;
+ pixman_format_code_t format;
+ pixman_dither_t dither;
+ int width;
+ int height;
+} app_t;
+
+static GtkWidget *
+get_widget (app_t *app, const char *name)
+{
+ GtkWidget *widget = GTK_WIDGET (gtk_builder_get_object (app->builder, name));
+
+ if (!widget)
+ g_error ("Widget %s not found\n", name);
+
+ return widget;
+}
+
+typedef struct
+{
+ char name [20];
+ int value;
+} named_int_t;
+
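+/* Formats offered in the "Target format" combo box. "rgb" here is the
+ * packed 96 bpp float format, and "sRGB" the gamma-aware variant of
+ * a8r8g8b8.
+ */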
+static const named_int_t formats[] =
+{
+ { "a8r8g8b8", PIXMAN_a8r8g8b8 },
+ { "rgb", PIXMAN_rgb_float },
+ { "sRGB", PIXMAN_a8r8g8b8_sRGB },
+ { "r5g6b5", PIXMAN_r5g6b5 },
+ { "a4r4g4b4", PIXMAN_a4r4g4b4 },
+ { "a2r2g2b2", PIXMAN_a2r2g2b2 },
+ { "r3g3b2", PIXMAN_r3g3b2 },
+ { "r1g2b1", PIXMAN_r1g2b1 },
+ { "a1r1g1b1", PIXMAN_a1r1g1b1 },
+};
+
+static const named_int_t dithers[] =
+{
+    { "None", PIXMAN_DITHER_NONE },
+ { "Bayer 8x8", PIXMAN_DITHER_ORDERED_BAYER_8 },
+ { "Blue noise 64x64", PIXMAN_DITHER_ORDERED_BLUE_NOISE_64 },
+};
+
+static int
+get_value (app_t *app, const named_int_t table[], const char *box_name)
+{
+ GtkComboBox *box = GTK_COMBO_BOX (get_widget (app, box_name));
+
+ return table[gtk_combo_box_get_active (box)].value;
+}
+
+static void
+rescale (GtkWidget *may_be_null, app_t *app)
+{
+ app->dither = get_value (app, dithers, "dithering_combo_box");
+ app->format = get_value (app, formats, "target_format_combo_box");
+
+ gtk_widget_set_size_request (
+ get_widget (app, "drawing_area"), app->width + 0.5, app->height + 0.5);
+
+ gtk_widget_queue_draw (
+ get_widget (app, "drawing_area"));
+}
+
+static gboolean
+on_draw (GtkWidget *widget, cairo_t *cr, gpointer user_data)
+{
+ app_t *app = user_data;
+ GdkRectangle area;
+ cairo_surface_t *surface;
+ pixman_image_t *tmp, *final;
+ uint32_t *pixels;
+
+ gdk_cairo_get_clip_rectangle(cr, &area);
+
+ tmp = pixman_image_create_bits (
+ app->format, area.width, area.height, NULL, 0);
+ pixman_image_set_dither (tmp, app->dither);
+
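+    /* Two-pass conversion: composite into the requested target format
+     * first (this is where the dither setting takes effect), then
+     * composite the result back into a8r8g8b8 below so cairo can
+     * display it.
+     */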
+ pixman_image_composite (
+ PIXMAN_OP_SRC,
+ app->original, NULL, tmp,
+ area.x, area.y, 0, 0, 0, 0,
+ app->width - area.x,
+ app->height - area.y);
+
+ pixels = calloc (1, area.width * area.height * 4);
+ final = pixman_image_create_bits (
+ PIXMAN_a8r8g8b8, area.width, area.height, pixels, area.width * 4);
+
+ pixman_image_composite (
+ PIXMAN_OP_SRC,
+ tmp, NULL, final,
+ area.x, area.y, 0, 0, 0, 0,
+ app->width - area.x,
+ app->height - area.y);
+
+ surface = cairo_image_surface_create_for_data (
+ (uint8_t *)pixels, CAIRO_FORMAT_ARGB32,
+ area.width, area.height, area.width * 4);
+
+ cairo_set_source_surface (cr, surface, area.x, area.y);
+
+ cairo_paint (cr);
+
+ cairo_surface_destroy (surface);
+ free (pixels);
+ pixman_image_unref (final);
+ pixman_image_unref (tmp);
+
+ return TRUE;
+}
+
+static void
+set_up_combo_box (app_t *app, const char *box_name,
+ int n_entries, const named_int_t table[])
+{
+ GtkWidget *widget = get_widget (app, box_name);
+ GtkListStore *model;
+ GtkCellRenderer *cell;
+ int i;
+
+ model = gtk_list_store_new (1, G_TYPE_STRING);
+
+ cell = gtk_cell_renderer_text_new ();
+ gtk_cell_layout_pack_start (GTK_CELL_LAYOUT (widget), cell, TRUE);
+ gtk_cell_layout_set_attributes (GTK_CELL_LAYOUT (widget), cell,
+ "text", 0,
+ NULL);
+
+ gtk_combo_box_set_model (GTK_COMBO_BOX (widget), GTK_TREE_MODEL (model));
+
+ for (i = 0; i < n_entries; ++i)
+ {
+ const named_int_t *info = &(table[i]);
+ GtkTreeIter iter;
+
+ gtk_list_store_append (model, &iter);
+ gtk_list_store_set (model, &iter, 0, info->name, -1);
+ }
+
+ gtk_combo_box_set_active (GTK_COMBO_BOX (widget), 0);
+
+ g_signal_connect (widget, "changed", G_CALLBACK (rescale), app);
+}
+
+static app_t *
+app_new (pixman_image_t *original)
+{
+ GtkWidget *widget;
+ app_t *app = g_malloc (sizeof *app);
+ GError *err = NULL;
+
+ app->builder = gtk_builder_new ();
+ app->original = original;
+
+ if (original->type == BITS)
+ {
+ app->width = pixman_image_get_width (original);
+ app->height = pixman_image_get_height (original);
+ }
+ else
+ {
+ app->width = WIDTH;
+ app->height = HEIGHT;
+ }
+
+ if (!gtk_builder_add_from_file (app->builder, "dither.ui", &err))
+ g_error ("Could not read file dither.ui: %s", err->message);
+
+ widget = get_widget (app, "drawing_area");
+ g_signal_connect (widget, "draw", G_CALLBACK (on_draw), app);
+
+ set_up_combo_box (app, "target_format_combo_box",
+ G_N_ELEMENTS (formats), formats);
+ set_up_combo_box (app, "dithering_combo_box",
+ G_N_ELEMENTS (dithers), dithers);
+
+ app->dither = get_value (app, dithers, "dithering_combo_box");
+ app->format = get_value (app, formats, "target_format_combo_box");
+
+ rescale (NULL, app);
+
+ return app;
+}
+
+int
+main (int argc, char **argv)
+{
+ GtkWidget *window;
+ pixman_image_t *image;
+ app_t *app;
+
+ gtk_init (&argc, &argv);
+
+ if (argc < 2)
+ {
+ pixman_gradient_stop_t stops[] = {
+ /* These colors make it very obvious that dithering
+ * is useful even for 8-bit gradients
+ */
+ { 0x00000, { 0x1b1b, 0x5d5d, 0x7c7c, 0xffff } },
+ { 0x10000, { 0x3838, 0x3232, 0x1010, 0xffff } },
+ };
+ pixman_point_fixed_t p1, p2;
+
+ p1.x = p1.y = 0x0000;
+ p2.x = WIDTH << 16;
+ p2.y = HEIGHT << 16;
+
+ if (!(image = pixman_image_create_linear_gradient (
+ &p1, &p2, stops, ARRAY_LENGTH (stops))))
+ {
+ printf ("Could not create gradient\n");
+ return -1;
+ }
+ }
+ else if (!(image = pixman_image_from_file (argv[1], PIXMAN_a8r8g8b8)))
+ {
+ printf ("Could not load image \"%s\"\n", argv[1]);
+ return -1;
+ }
+
+ app = app_new (image);
+
+ window = get_widget (app, "main");
+
+ g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
+
+ gtk_window_set_default_size (GTK_WINDOW (window), 1024, 768);
+
+ gtk_widget_show_all (window);
+
+ gtk_main ();
+
+ return 0;
+}
diff --git a/demos/dither.ui b/demos/dither.ui
new file mode 100644
index 0000000..7c3d068
--- /dev/null
+++ b/demos/dither.ui
@@ -0,0 +1,147 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<interface>
+ <requires lib="gtk+" version="2.12"/>
+ <object class="GtkWindow" id="main">
+ <property name="can_focus">False</property>
+ <child>
+ <placeholder/>
+ </child>
+ <child>
+ <object class="GtkHBox" id="u">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="spacing">12</property>
+ <child>
+ <object class="GtkScrolledWindow" id="scrolledwindow1">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="shadow_type">in</property>
+ <child>
+ <object class="GtkViewport" id="viewport1">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <child>
+ <object class="GtkDrawingArea" id="drawing_area">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ </object>
+ </child>
+ </object>
+ </child>
+ </object>
+ <packing>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ <property name="position">0</property>
+ </packing>
+ </child>
+ <child>
+ <object class="GtkVBox" id="box1">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="spacing">12</property>
+ <child>
+ <object class="GtkVBox" id="box6">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <child>
+ <object class="GtkTable" id="grid1">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="n_rows">2</property>
+ <property name="n_columns">2</property>
+ <property name="column_spacing">8</property>
+ <property name="row_spacing">6</property>
+ <child>
+ <object class="GtkLabel" id="label4">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="label" translatable="yes">&lt;b&gt;Target format:&lt;/b&gt;</property>
+ <property name="use_markup">True</property>
+ <property name="xalign">1</property>
+ </object>
+ </child>
+ <child>
+ <object class="GtkLabel" id="label5">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="label" translatable="yes">&lt;b&gt;Dithering:&lt;/b&gt;</property>
+ <property name="use_markup">True</property>
+ <property name="xalign">1</property>
+ </object>
+ <packing>
+ <property name="top_attach">1</property>
+ </packing>
+ </child>
+ <child>
+ <object class="GtkComboBox" id="target_format_combo_box">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ </object>
+ <packing>
+ <property name="left_attach">1</property>
+ </packing>
+ </child>
+ <child>
+ <object class="GtkComboBox" id="dithering_combo_box">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ </object>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="top_attach">1</property>
+ </packing>
+ </child>
+ </object>
+ <packing>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ <property name="padding">6</property>
+ <property name="position">1</property>
+ </packing>
+ </child>
+ </object>
+ <packing>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ <property name="position">0</property>
+ </packing>
+ </child>
+ </object>
+ <packing>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ <property name="position">1</property>
+ </packing>
+ </child>
+ </object>
+ </child>
+ </object>
+ <object class="GtkAdjustment" id="rotate_adjustment">
+ <property name="lower">-180</property>
+ <property name="upper">190</property>
+ <property name="step_increment">1</property>
+ <property name="page_increment">10</property>
+ <property name="page_size">10</property>
+ </object>
+ <object class="GtkAdjustment" id="scale_x_adjustment">
+ <property name="lower">-32</property>
+ <property name="upper">42</property>
+ <property name="step_increment">1</property>
+ <property name="page_increment">10</property>
+ <property name="page_size">10</property>
+ </object>
+ <object class="GtkAdjustment" id="scale_y_adjustment">
+ <property name="lower">-32</property>
+ <property name="upper">42</property>
+ <property name="step_increment">1</property>
+ <property name="page_increment">10</property>
+ <property name="page_size">10</property>
+ </object>
+ <object class="GtkAdjustment" id="subsample_adjustment">
+ <property name="upper">12</property>
+ <property name="value">4</property>
+ <property name="step_increment">1</property>
+ <property name="page_increment">1</property>
+ </object>
+</interface>
diff --git a/demos/gtk-utils.c b/demos/gtk-utils.c
index 32d4aec..dc872a9 100644
--- a/demos/gtk-utils.c
+++ b/demos/gtk-utils.c
@@ -1,6 +1,8 @@
#include <gtk/gtk.h>
-#include <config.h>
-#include "../test/utils.h"
+#ifdef HAVE_CONFIG_H
+#include <pixman-config.h>
+#endif
+#include "utils.h"
#include "gtk-utils.h"
pixman_image_t *
@@ -93,15 +95,14 @@ pixbuf_from_argb32 (uint32_t *bits,
}
static gboolean
-on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data)
+on_draw (GtkWidget *widget, cairo_t *cr, gpointer user_data)
{
- pixman_image_t *pimage = data;
+ pixman_image_t *pimage = user_data;
int width = pixman_image_get_width (pimage);
int height = pixman_image_get_height (pimage);
int stride = pixman_image_get_stride (pimage);
cairo_surface_t *cimage;
cairo_format_t format;
- cairo_t *cr;
if (pixman_image_get_format (pimage) == PIXMAN_x8r8g8b8)
format = CAIRO_FORMAT_RGB24;
@@ -111,14 +112,11 @@ on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data)
cimage = cairo_image_surface_create_for_data (
(uint8_t *)pixman_image_get_data (pimage),
format, width, height, stride);
-
- cr = gdk_cairo_create (widget->window);
cairo_rectangle (cr, 0, 0, width, height);
cairo_set_source_surface (cr, cimage, 0, 0);
cairo_fill (cr);
- cairo_destroy (cr);
cairo_surface_destroy (cimage);
return TRUE;
@@ -170,7 +168,7 @@ show_image (pixman_image_t *image)
break;
}
- g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), copy);
+ g_signal_connect (window, "draw", G_CALLBACK (on_draw), copy);
g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
gtk_widget_show (window);
diff --git a/demos/linear-gradient.c b/demos/linear-gradient.c
index 46433a6..03807a6 100644
--- a/demos/linear-gradient.c
+++ b/demos/linear-gradient.c
@@ -1,4 +1,4 @@
-#include "../test/utils.h"
+#include "utils.h"
#include "gtk-utils.h"
#define WIDTH 1024
diff --git a/demos/meson.build b/demos/meson.build
new file mode 100644
index 0000000..cff3cf2
--- /dev/null
+++ b/demos/meson.build
@@ -0,0 +1,66 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
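+# MSVC's math.h only defines M_PI and friends when _USE_MATH_DEFINES is
+# set, and several of the demos rely on those constants.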
+extra_demo_cflags = []
+if cc.get_argument_syntax() == 'msvc'
+ extra_demo_cflags = ['-D_USE_MATH_DEFINES']
+endif
+
+demos = [
+ 'gradient-test',
+ 'alpha-test',
+ 'composite-test',
+ 'clip-test',
+ 'trap-test',
+ 'screen-test',
+ 'convolution-test',
+ 'radial-test',
+ 'linear-gradient',
+ 'conical-test',
+ 'tri-test',
+ 'checkerboard',
+ 'srgb-test',
+ 'srgb-trap-test',
+ 'scale',
+ 'dither',
+]
+
+if dep_gtk.found()
+
+ libdemo = static_library(
+ 'demo',
+ ['gtk-utils.c', config_h, version_h],
+ dependencies : [libtestutils_dep, dep_gtk, dep_glib, dep_png, dep_m, dep_openmp],
+ include_directories : inc_pixman,
+ )
+
+  foreach d : demos
+    executable(
+      d,
+      [d + '.c', config_h, version_h],
+      c_args : extra_demo_cflags,
+      link_with : [libdemo],
+      dependencies : [idep_pixman, libtestutils_dep, dep_glib, dep_gtk, dep_openmp, dep_png],
+    )
+  endforeach
+
+endif
diff --git a/demos/radial-test.c b/demos/radial-test.c
index 08a367c..52292eb 100644
--- a/demos/radial-test.c
+++ b/demos/radial-test.c
@@ -1,4 +1,4 @@
-#include "../test/utils.h"
+#include "utils.h"
#include "gtk-utils.h"
#define NUM_GRADIENTS 9
diff --git a/demos/scale.c b/demos/scale.c
index d00307e..fa2a28a 100644
--- a/demos/scale.c
+++ b/demos/scale.c
@@ -24,7 +24,7 @@
* Author: Soren Sandmann <soren.sandmann@gmail.com>
*/
#ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "pixman-config.h"
#endif
#include <math.h>
#include <gtk/gtk.h>
@@ -55,50 +55,70 @@ get_widget (app_t *app, const char *name)
return widget;
}
-static double
-min4 (double a, double b, double c, double d)
-{
- double m1, m2;
-
- m1 = MIN (a, b);
- m2 = MIN (c, d);
- return MIN (m1, m2);
-}
-
-static double
-max4 (double a, double b, double c, double d)
-{
- double m1, m2;
-
- m1 = MAX (a, b);
- m2 = MAX (c, d);
- return MAX (m1, m2);
-}
-
+/* Figure out the boundary of a diameter=1 circle transformed into an ellipse
+ * by trans. Proof that this is the correct calculation:
+ *
+ * Transform x,y to u,v by this matrix calculation:
+ *
+ * |u| |a c| |x|
+ * |v| = |b d|*|y|
+ *
+ * Horizontal component:
+ *
+ * u = ax+cy (1)
+ *
+ * For each x,y on a radius-1 circle (p is angle to the point):
+ *
+ * x^2+y^2 = 1
+ * x = cos(p)
+ * y = sin(p)
+ * dx/dp = -sin(p) = -y
+ * dy/dp = cos(p) = x
+ *
+ * Figure out derivative of (1) relative to p:
+ *
+ * du/dp = a(dx/dp) + c(dy/dp)
+ * = -ay + cx
+ *
+ * The min and max u are when du/dp is zero:
+ *
+ * -ay + cx = 0
+ * cx = ay
+ * c = ay/x (2)
+ * y = cx/a (3)
+ *
+ * Substitute (2) into (1) and simplify:
+ *
+ * u = ax + ay^2/x
+ * = a(x^2+y^2)/x
+ * = a/x (because x^2+y^2 = 1)
+ * x = a/u (4)
+ *
+ * Substitute (4) into (3) and simplify:
+ *
+ * y = c(a/u)/a
+ * y = c/u (5)
+ *
+ * Square (4) and (5) and add:
+ *
+ * x^2+y^2 = (a^2+c^2)/u^2
+ *
+ * But x^2+y^2 is 1:
+ *
+ * 1 = (a^2+c^2)/u^2
+ * u^2 = a^2+c^2
+ * u = hypot(a,c)
+ *
+ * Similarly, the max/min of v is at:
+ *
+ * v = hypot(b,d)
+ *
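+ * (compute_extents below also divides each result by m[2][2] to account
+ * for the projective component of the transform; for an affine transform
+ * m[2][2] is 1 and the division is a no-op.)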
+ */
static void
compute_extents (pixman_f_transform_t *trans, double *sx, double *sy)
{
- double min_x, max_x, min_y, max_y;
- pixman_f_vector_t v[4] =
- {
- { { 1, 1, 1 } },
- { { -1, 1, 1 } },
- { { -1, -1, 1 } },
- { { 1, -1, 1 } },
- };
-
- pixman_f_transform_point (trans, &v[0]);
- pixman_f_transform_point (trans, &v[1]);
- pixman_f_transform_point (trans, &v[2]);
- pixman_f_transform_point (trans, &v[3]);
-
- min_x = min4 (v[0].v[0], v[1].v[0], v[2].v[0], v[3].v[0]);
- max_x = max4 (v[0].v[0], v[1].v[0], v[2].v[0], v[3].v[0]);
- min_y = min4 (v[0].v[1], v[1].v[1], v[2].v[1], v[3].v[1]);
- max_y = max4 (v[0].v[1], v[1].v[1], v[2].v[1], v[3].v[1]);
-
- *sx = (max_x - min_x) / 2.0;
- *sy = (max_y - min_y) / 2.0;
+ *sx = hypot (trans->m[0][0], trans->m[0][1]) / trans->m[2][2];
+ *sy = hypot (trans->m[1][0], trans->m[1][1]) / trans->m[2][2];
}
typedef struct
@@ -258,39 +278,37 @@ rescale (GtkWidget *may_be_null, app_t *app)
}
static gboolean
-on_expose (GtkWidget *da, GdkEvent *event, gpointer data)
+on_draw (GtkWidget *widget, cairo_t *cr, gpointer user_data)
{
- app_t *app = data;
- GdkRectangle *area = &event->expose.area;
+ app_t *app = user_data;
+ GdkRectangle area;
cairo_surface_t *surface;
pixman_image_t *tmp;
- cairo_t *cr;
uint32_t *pixels;
- pixels = calloc (1, area->width * area->height * 4);
+ gdk_cairo_get_clip_rectangle(cr, &area);
+
+ pixels = calloc (1, area.width * area.height * 4);
tmp = pixman_image_create_bits (
- PIXMAN_a8r8g8b8, area->width, area->height, pixels, area->width * 4);
+ PIXMAN_a8r8g8b8, area.width, area.height, pixels, area.width * 4);
- if (area->x < app->scaled_width && area->y < app->scaled_height)
+ if (area.x < app->scaled_width && area.y < app->scaled_height)
{
pixman_image_composite (
PIXMAN_OP_SRC,
app->original, NULL, tmp,
- area->x, area->y, 0, 0, 0, 0,
- app->scaled_width - area->x, app->scaled_height - area->y);
+ area.x, area.y, 0, 0, 0, 0,
+ app->scaled_width - area.x, app->scaled_height - area.y);
}
surface = cairo_image_surface_create_for_data (
(uint8_t *)pixels, CAIRO_FORMAT_ARGB32,
- area->width, area->height, area->width * 4);
-
- cr = gdk_cairo_create (da->window);
+ area.width, area.height, area.width * 4);
- cairo_set_source_surface (cr, surface, area->x, area->y);
+ cairo_set_source_surface (cr, surface, area.x, area.y);
cairo_paint (cr);
- cairo_destroy (cr);
cairo_surface_destroy (surface);
free (pixels);
pixman_image_unref (tmp);
@@ -380,7 +398,7 @@ app_new (pixman_image_t *original)
gtk_scale_add_mark (GTK_SCALE (widget), 0.0, GTK_POS_LEFT, NULL);
widget = get_widget (app, "drawing_area");
- g_signal_connect (widget, "expose_event", G_CALLBACK (on_expose), app);
+ g_signal_connect (widget, "draw", G_CALLBACK (on_draw), app);
set_up_filter_box (app, "reconstruct_x_combo_box");
set_up_filter_box (app, "reconstruct_y_combo_box");
diff --git a/demos/scale.ui b/demos/scale.ui
index ee985dd..d498d26 100644
--- a/demos/scale.ui
+++ b/demos/scale.ui
@@ -177,6 +177,7 @@
id="lock_checkbutton">
<property name="label" translatable="yes">Lock X and Y Dimensions</property>
<property name="xalign">0.0</property>
+ <property name="active">True</property>
</object>
<packing>
<property name="expand">False</property>
@@ -301,6 +302,7 @@
<object class="GtkSpinButton" id="subsample_spin_button">
<property name="visible">True</property>
<property name="adjustment">subsample_adjustment</property>
+ <property name="value">4</property>
</object>
<packing>
<property name="left_attach">1</property>
diff --git a/demos/tri-test.c b/demos/tri-test.c
index a71869a..9d213eb 100644
--- a/demos/tri-test.c
+++ b/demos/tri-test.c
@@ -1,7 +1,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "../test/utils.h"
+#include "utils.h"
#include "gtk-utils.h"
int
diff --git a/meson.build b/meson.build
new file mode 100644
index 0000000..3dd3b12
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,619 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+project(
+ 'pixman',
+ ['c'],
+ version : '0.43.5',
+ license : 'MIT',
+ meson_version : '>= 0.52.0',
+ default_options : ['c_std=gnu99', 'buildtype=debugoptimized'],
+)
+
+config = configuration_data()
+cc = meson.get_compiler('c')
+null_dep = dependency('', required : false)
+
+add_project_arguments(
+ cc.get_supported_arguments([
+ '-Wdeclaration-after-statement',
+ '-fno-strict-aliasing',
+ '-fvisibility=hidden',
+ '-Wundef',
+ # -ftrapping-math is the default for gcc, but -fno-trapping-math is the
+ # default for clang. The FLOAT_IS_ZERO macro is used to guard against
+    # floating-point exceptions; however, with -fno-trapping-math, the
+    # compiler can reorder floating-point operations so that they occur
+    # before the guard. Note that this flag is ignored by clang < 10.0.0.
+ '-ftrapping-math'
+ ]),
+ language : ['c']
+)
+
+# GCC and Clang both ignore -Wno options that they don't recognize, so test for
+# -W<opt>, then add -Wno-<opt> if it's ignored
+foreach opt : ['unused-local-typedefs']
+ if cc.has_argument('-W' + opt)
+ add_project_arguments(['-Wno-' + opt], language : ['c'])
+ endif
+endforeach
+
+use_loongson_mmi = get_option('loongson-mmi')
+have_loongson_mmi = false
+loongson_mmi_flags = ['-mloongson-mmi']
+if not use_loongson_mmi.disabled()
+ if host_machine.cpu_family() == 'mips64' and cc.compiles('''
+ #ifndef __mips_loongson_vector_rev
+ #error "Loongson Multimedia Instructions are only available on Loongson"
+ #endif
+ #if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
+ #error "Need GCC >= 4.4 for Loongson MMI compilation"
+ #endif
+ #include "pixman/loongson-mmintrin.h"
+ int main () {
+ union {
+ __m64 v;
+ char c[8];
+ } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+ int b = 4;
+ __m64 c = _mm_srli_pi16 (a.v, b);
+ return 0;
+ }''',
+ args : loongson_mmi_flags,
+ include_directories : include_directories('.'),
+ name : 'Loongson MMI Intrinsic Support')
+ have_loongson_mmi = true
+ endif
+endif
+
+if have_loongson_mmi
+ config.set10('USE_LOONGSON_MMI', true)
+elif use_loongson_mmi.enabled()
+ error('Loongson MMI Support unavailable, but required')
+endif
+
+use_mmx = get_option('mmx')
+have_mmx = false
+mmx_flags = []
+
+if cc.get_id() == 'msvc'
+ mmx_flags = ['/w14710', '/w14714', '/wd4244']
+elif cc.get_id() == 'sun'
+ mmx_flags = ['-xarch=sse']
+else
+ mmx_flags = ['-mmmx', '-Winline']
+endif
+if not use_mmx.disabled()
+ if host_machine.cpu_family() == 'x86_64' or cc.get_id() == 'msvc'
+ have_mmx = true
+ elif host_machine.cpu_family() == 'x86' and cc.compiles('''
+ #include <mmintrin.h>
+ #include <stdint.h>
+
+ /* Check support for block expressions */
+ #define _mm_shuffle_pi16(A, N) \
+ ({ \
+ __m64 ret; \
+ \
+ /* Some versions of clang will choke on K */ \
+ asm ("pshufw %2, %1, %0\n\t" \
+ : "=y" (ret) \
+ : "y" (A), "K" ((const int8_t)N) \
+ ); \
+ \
+ ret; \
+ })
+
+ int main () {
+ __m64 v = _mm_cvtsi32_si64 (1);
+ __m64 w;
+
+ w = _mm_shuffle_pi16(v, 5);
+
+ /* Some versions of clang will choke on this */
+ asm ("pmulhuw %1, %0\n\t"
+ : "+y" (w)
+ : "y" (v)
+ );
+
+ return _mm_cvtsi64_si32 (v);
+ }''',
+ args : mmx_flags,
+ name : 'MMX Intrinsic Support')
+ have_mmx = true
+ endif
+endif
+
+if have_mmx
+  # Inline assembly does not work with x64 MSVC, so we use the
+  # compatibility intrinsics there
+ if cc.get_id() != 'msvc' or host_machine.cpu_family() != 'x86_64'
+ config.set10('USE_X86_MMX', true)
+ endif
+elif use_mmx.enabled()
+ error('MMX Support unavailable, but required')
+endif
+
+use_sse2 = get_option('sse2')
+have_sse2 = false
+sse2_flags = []
+if cc.get_id() == 'sun'
+ sse2_flags = ['-xarch=sse2']
+elif cc.get_id() != 'msvc'
+ sse2_flags = ['-msse2', '-Winline']
+endif
+if not use_sse2.disabled()
+ if host_machine.cpu_family() == 'x86'
+ if cc.compiles('''
+ #if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
+ # if !defined(__amd64__) && !defined(__x86_64__)
+ # error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
+ # endif
+ #endif
+ #include <mmintrin.h>
+ #include <xmmintrin.h>
+ #include <emmintrin.h>
+ int param;
+ int main () {
+ __m128i a = _mm_set1_epi32 (param), b = _mm_set1_epi32 (param + 1), c;
+ c = _mm_xor_si128 (a, b);
+ return _mm_cvtsi128_si32(c);
+ }''',
+ args : sse2_flags,
+ name : 'SSE2 Intrinsic Support')
+ have_sse2 = true
+ endif
+ elif host_machine.cpu_family() == 'x86_64'
+ have_sse2 = true
+ endif
+endif
+
+if have_sse2
+ config.set10('USE_SSE2', true)
+elif use_sse2.enabled()
+ error('sse2 Support unavailable, but required')
+endif
+
+use_ssse3 = get_option('ssse3')
+have_ssse3 = false
+ssse3_flags = []
+if cc.get_id() != 'msvc'
+ ssse3_flags = ['-mssse3', '-Winline']
+endif
+
+# Pre-2010 x64 MSVC compilers crash when building the SSSE3 code
+if not use_ssse3.disabled() and not (cc.get_id() == 'msvc' and cc.version().version_compare('<16') and host_machine.cpu_family() == 'x86_64')
+ if host_machine.cpu_family().startswith('x86')
+ if cc.compiles('''
+ #include <mmintrin.h>
+ #include <xmmintrin.h>
+ #include <emmintrin.h>
+ int param;
+ int main () {
+ __m128i a = _mm_set1_epi32 (param), b = _mm_set1_epi32 (param + 1), c;
+ c = _mm_xor_si128 (a, b);
+ return _mm_cvtsi128_si32(c);
+ }''',
+ args : ssse3_flags,
+ name : 'SSSE3 Intrinsic Support')
+ have_ssse3 = true
+ endif
+ endif
+endif
+
+if have_ssse3
+ config.set10('USE_SSSE3', true)
+elif use_ssse3.enabled()
+ error('ssse3 Support unavailable, but required')
+endif
+
+use_vmx = get_option('vmx')
+have_vmx = false
+vmx_flags = ['-maltivec', '-mabi=altivec']
+if not use_vmx.disabled()
+ if host_machine.cpu_family().startswith('ppc')
+ if cc.compiles('''
+ #include <altivec.h>
+ int main () {
+ vector unsigned int v = vec_splat_u32 (1);
+ v = vec_sub (v, v);
+ return 0;
+ }''',
+ args : vmx_flags,
+ name : 'VMX/Altivec Intrinsic Support')
+ have_vmx = true
+ endif
+ endif
+endif
+
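+# Probe assembler behaviour that the ARM assembly sources need to know
+# about: whether .func/.endfunc and unified syntax are accepted, and
+# whether C symbols get a leading underscore.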
+if cc.compiles('''
+ __asm__ (
+    ".func meson_test\n"
+    ".endfunc\n"
+ );''',
+ name : 'test for ASM .func directive')
+ config.set('ASM_HAVE_FUNC_DIRECTIVE', 1)
+endif
+
+if cc.compiles('''
+ __asm__ (
+ ".syntax unified\n"
+ );''',
+ name : 'test for ASM .syntax unified directive')
+ config.set('ASM_HAVE_SYNTAX_UNIFIED', 1)
+endif
+
+if cc.links('''
+ #include <stdint.h>
+
+ __asm__ (
+ " .global _testlabel\n"
+ "_testlabel:\n"
+ );
+
+ int testlabel();
+ int main(int argc, char* argv[]) {
+ return testlabel();
+ }''',
+ name : 'test for ASM leading underscore')
+ config.set('ASM_LEADING_UNDERSCORE', 1)
+endif
+
+
+if have_vmx
+ config.set10('USE_VMX', true)
+elif use_vmx.enabled()
+ error('vmx Support unavailable, but required')
+endif
+
+use_armv6_simd = get_option('arm-simd')
+have_armv6_simd = false
+if not use_armv6_simd.disabled()
+ if host_machine.cpu_family() == 'arm'
+ if cc.compiles(files('arm-simd-test.S'), name : 'ARMv6 SIMD Intrinsic Support')
+ have_armv6_simd = true
+ endif
+ endif
+endif
+
+if have_armv6_simd
+ config.set10('USE_ARM_SIMD', true)
+elif use_armv6_simd.enabled()
+ error('ARMv6 SIMD Support unavailable, but required')
+endif
+
+use_neon = get_option('neon')
+have_neon = false
+if not use_neon.disabled()
+ if host_machine.cpu_family() == 'arm'
+ if cc.compiles(files('neon-test.S'), name : 'NEON Intrinsic Support')
+ have_neon = true
+ endif
+ endif
+endif
+
+if have_neon
+ config.set10('USE_ARM_NEON', true)
+elif use_neon.enabled()
+ error('NEON Support unavailable, but required')
+endif
+
+use_a64neon = get_option('a64-neon')
+have_a64neon = false
+if not use_a64neon.disabled()
+ if host_machine.cpu_family() == 'aarch64'
+ if cc.compiles(files('a64-neon-test.S'), name : 'NEON A64 Intrinsic Support')
+ have_a64neon = true
+ endif
+ endif
+endif
+
+if have_a64neon
+ config.set10('USE_ARM_A64_NEON', true)
+elif use_a64neon.enabled()
+ error('A64 NEON Support unavailable, but required')
+endif
+
+use_iwmmxt = get_option('iwmmxt')
+have_iwmmxt = false
+iwmmxt_flags = ['-flax-vector-conversions', '-Winline']
+if not use_iwmmxt.disabled()
+ if get_option('iwmmxt2')
+ iwmmxt_flags += '-march=iwmmxt2'
+ else
+ iwmmxt_flags += '-march=iwmmxt'
+ endif
+
+ if host_machine.cpu_family() == 'arm'
+ if cc.compiles('''
+ #ifndef __IWMMXT__
+ #error "IWMMXT not enabled (with -march=iwmmxt)"
+ #endif
+ #if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
+ #error "Need GCC >= 4.8 for IWMMXT intrinsics"
+ #endif
+ #include <mmintrin.h>
+ int main () {
+ union {
+ __m64 v;
+ char c[8];
+ } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+ int b = 4;
+ __m64 c = _mm_srli_si64 (a.v, b);
+ }
+ ''',
+ args : iwmmxt_flags,
+ name : 'IWMMXT Intrinsic Support')
+ have_iwmmxt = true
+ endif
+ endif
+endif
+
+if have_iwmmxt
+ config.set10('USE_ARM_IWMMXT', true)
+elif use_iwmmxt.enabled()
+ error('IWMMXT Support unavailable, but required')
+endif
+
+use_mips_dspr2 = get_option('mips-dspr2')
+have_mips_dspr2 = false
+mips_dspr2_flags = ['-mdspr2']
+if not use_mips_dspr2.disabled()
+ if host_machine.cpu_family() == 'mips32'
+ if cc.compiles('''
+ #if !(defined(__mips__) && __mips_isa_rev >= 2)
+ #error MIPS DSPr2 is currently only available on MIPS32r2 platforms.
+ #endif
+ int
+ main ()
+ {
+ int c = 0, a = 0, b = 0;
+ __asm__ __volatile__ (
+ "precr.qb.ph %[c], %[a], %[b] \n\t"
+ : [c] "=r" (c)
+ : [a] "r" (a), [b] "r" (b)
+ );
+ return c;
+ }''',
+      args : mips_dspr2_flags,
+ name : 'DSPr2 Intrinsic Support')
+ have_mips_dspr2 = true
+ endif
+ endif
+endif
+
+if have_mips_dspr2
+ config.set10('USE_MIPS_DSPR2', true)
+elif use_mips_dspr2.enabled()
+ error('MIPS DSPr2 Support unavailable, but required')
+endif
+
+use_gnu_asm = get_option('gnu-inline-asm')
+if not use_gnu_asm.disabled()
+ if cc.compiles('''
+ int main () {
+ /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
+ asm volatile ( "\tnop\n" : : : "cc", "memory" );
+ return 0;
+ }
+ ''',
+ name : 'GNU Inline ASM support.')
+ config.set10('USE_GCC_INLINE_ASM', true)
+ elif use_gnu_asm.enabled()
+ error('GNU inline assembly support missing but required.')
+ endif
+endif
+
+if get_option('timers')
+ config.set('PIXMAN_TIMERS', 1)
+endif
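+
+# PIXMAN_GNUPLOT makes pixman-filter.c print the filter kernels it
+# generates in a format that can be piped into gnuplot.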
+if get_option('gnuplot')
+ config.set('PIXMAN_GNUPLOT', 1)
+endif
+
+if cc.get_id() != 'msvc'
+ dep_openmp = dependency('openmp', required : get_option('openmp'))
+ if dep_openmp.found()
+ config.set10('USE_OPENMP', true)
+ elif meson.version().version_compare('<0.51.0')
+    # In versions of meson before 0.51, the openmp dependency can still
+    # inject arguments in the auto case when it is not found, even though
+    # the detection itself works correctly, so we just replace dep_openmp
+    # with null_dep to work around this.
+ dep_openmp = null_dep
+ endif
+else
+  # The MSVC implementation of OpenMP is not compliant enough for our
+  # uses, so we disable it here.
+ # Please see: https://stackoverflow.com/questions/12560243/using-threadprivate-directive-in-visual-studio
+ dep_openmp = null_dep
+endif
+
+dep_gtk = dependency('gtk+-3.0', required : get_option('gtk').enabled() and get_option('demos').enabled())
+dep_glib = dependency('glib-2.0', required : get_option('gtk').enabled() and get_option('demos').enabled())
+
+dep_png = null_dep
+if not get_option('libpng').disabled()
+ dep_png = dependency('libpng', required : false)
+
+ # We need to look for the right library to link to for libpng,
+ # when looking for libpng manually
+ foreach png_ver : [ '16', '15', '14', '13', '12', '10' ]
+ if not dep_png.found()
+ dep_png = cc.find_library('libpng@0@'.format(png_ver), has_headers : ['png.h'], required : false)
+ endif
+ endforeach
+
+ if get_option('libpng').enabled() and not dep_png.found()
+ error('libpng support requested but libpng library not found')
+ endif
+endif
+
+if dep_png.found()
+ config.set('HAVE_LIBPNG', 1)
+endif
+dep_m = cc.find_library('m', required : false)
+dep_threads = dependency('threads')
+
+# MSVC-style compilers do not come with pthreads, so we must link to it
+# explicitly; currently pthreads-win32 is supported
+pthreads_found = false
+
+if dep_threads.found() and cc.has_header('pthread.h')
+ if cc.get_argument_syntax() == 'msvc'
+ pthread_lib = null_dep
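+    # pthreads-win32 names its libraries pthread<scheme><version>, where
+    # the scheme encodes the exception-handling model (VC = C cleanup,
+    # VCE = C++ EH, VSE = structured EH).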
+ foreach pthread_type : ['VC3', 'VSE3', 'VCE3', 'VC2', 'VSE2', 'VCE2']
+ if not pthread_lib.found()
+ pthread_lib = cc.find_library('pthread@0@'.format(pthread_type), required : false)
+ endif
+ endforeach
+ if pthread_lib.found()
+ pthreads_found = true
+ dep_threads = pthread_lib
+ endif
+ else
+ pthreads_found = true
+ endif
+else
+ # Avoid linking with -pthread if we don't actually have pthreads
+ dep_threads = null_dep
+endif
+
+if pthreads_found
+ config.set('HAVE_PTHREADS', 1)
+endif
+
+funcs = ['sigaction', 'alarm', 'mprotect', 'getpagesize', 'mmap', 'getisax', 'gettimeofday']
+# MinGW claims to have posix_memalign, but it doesn't
+if host_machine.system() != 'windows'
+ funcs += 'posix_memalign'
+endif
+
+foreach f : funcs
+ if cc.has_function(f)
+ config.set('HAVE_@0@'.format(f.to_upper()), 1)
+ endif
+endforeach
+
+# This is only used in one test, which defines _GNU_SOURCE itself
+if cc.has_function('feenableexcept',
+ prefix : '#define _GNU_SOURCE\n#include <fenv.h>',
+ dependencies : dep_m)
+ config.set('HAVE_FEENABLEEXCEPT', 1)
+endif
+
+if cc.has_header_symbol('fenv.h', 'FE_DIVBYZERO')
+ config.set('HAVE_FEDIVBYZERO', 1)
+endif
+
+foreach h : ['sys/mman.h', 'fenv.h', 'unistd.h']
+ if cc.check_header(h)
+ config.set('HAVE_@0@'.format(h.underscorify().to_upper()), 1)
+ endif
+endforeach
+
+use_tls = get_option('tls')
+have_tls = ''
+if not use_tls.disabled()
+  # gcc on Windows only warns that __declspec(thread) isn't supported;
+  # passing -Werror=attributes makes the check fail instead.
+ if (host_machine.system() == 'windows' and
+ cc.compiles('int __declspec(thread) foo;',
+ args : cc.get_supported_arguments(['-Werror=attributes']),
+ name : 'TLS via __declspec(thread)'))
+ have_tls = '__declspec(thread)'
+ elif cc.compiles('int __thread foo;', name : 'TLS via __thread')
+ have_tls = '__thread'
+ endif
+endif
+
+if have_tls != ''
+ config.set('TLS', have_tls)
+elif use_tls.enabled()
+ error('Compiler TLS Support unavailable, but required')
+endif
+
+if cc.links('''
+ static int x = 1;
+ static void __attribute__((constructor)) constructor_function () { x = 0; }
+ int main (void) { return x; }
+ ''',
+ name : '__attribute__((constructor))')
+ config.set('TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR', 1)
+endif
+
+if cc.links(
+ ' __float128 a = 1.0Q, b = 2.0Q; int main (void) { return a + b; }',
+ name : 'Has float128 support')
+ config.set('HAVE_FLOAT128', 1)
+endif
+
+if cc.has_function('__builtin_clz')
+ config.set('HAVE_BUILTIN_CLZ', 1)
+endif
+
+if cc.links('''
+ unsigned int __attribute__ ((vector_size(16))) e, a, b;
+ int main (void) { e = a - ((b << 27) + (b >> (32 - 27))) + 1; return e[0]; }
+ ''',
+ name : 'Support for GCC vector extensions')
+ config.set('HAVE_GCC_VECTOR_EXTENSIONS', 1)
+endif
+
+if host_machine.endian() == 'big'
+ config.set('WORDS_BIGENDIAN', 1)
+endif
+
+config.set('SIZEOF_LONG', cc.sizeof('long'))
+
+# Required to compile pixman-private.h, which errors out if PACKAGE is
+# not defined; the value itself is never used.
+config.set('PACKAGE', 'foo')
+
+version_conf = configuration_data()
+split = meson.project_version().split('.')
+version_conf.set('PIXMAN_VERSION_MAJOR', split[0])
+version_conf.set('PIXMAN_VERSION_MINOR', split[1])
+version_conf.set('PIXMAN_VERSION_MICRO', split[2])
+
+add_project_arguments('-DHAVE_CONFIG_H', language : ['c'])
+
+subdir('pixman')
+
+if not get_option('tests').disabled() or not get_option('demos').disabled()
+ subdir(join_paths('test', 'utils'))
+endif
+
+if not get_option('demos').disabled()
+ subdir('demos')
+endif
+
+if not get_option('tests').disabled()
+ subdir('test')
+endif
+
+pkg = import('pkgconfig')
+pkg.generate(libpixman,
+ name : 'Pixman',
+ filebase : 'pixman-1',
+ description : 'The pixman library (version 1)',
+ subdirs: 'pixman-1',
+ version : meson.project_version(),
+)
diff --git a/meson_options.txt b/meson_options.txt
new file mode 100644
index 0000000..df10889
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1,128 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+option(
+ 'loongson-mmi',
+ type : 'feature',
+ description : 'Use Loongson MMI intrinsic optimized paths',
+)
+option(
+ 'mmx',
+ type : 'feature',
+ description : 'Use X86 MMX intrinsic optimized paths',
+)
+option(
+ 'sse2',
+ type : 'feature',
+ description : 'Use X86 SSE2 intrinsic optimized paths',
+)
+option(
+ 'ssse3',
+ type : 'feature',
+ description : 'Use X86 SSSE3 intrinsic optimized paths',
+)
+option(
+ 'vmx',
+ type : 'feature',
+ description : 'Use PPC VMX/Altivec intrinsic optimized paths',
+)
+option(
+ 'arm-simd',
+ type : 'feature',
+ description : 'Use ARMv6 SIMD intrinsic optimized paths',
+)
+option(
+ 'neon',
+ type : 'feature',
+ description : 'Use ARM NEON intrinsic optimized paths',
+)
+option(
+ 'a64-neon',
+ type : 'feature',
+ description : 'Use ARM A64 NEON intrinsic optimized paths',
+)
+option(
+ 'iwmmxt',
+ type : 'feature',
+ description : 'Use ARM IWMMXT intrinsic optimized paths',
+)
+option(
+ 'iwmmxt2',
+ type : 'boolean',
+ value : true,
+ description : 'Use ARM IWMMXT2 intrinsic instead of IWMMXT',
+)
+option(
+ 'mips-dspr2',
+ type : 'feature',
+ description : 'Use MIPS32 DSPr2 intrinsic optimized paths',
+)
+option(
+ 'gnu-inline-asm',
+ type : 'feature',
+ description : 'Use GNU style inline assembler',
+)
+option(
+ 'tls',
+ type : 'feature',
+ description : 'Use compiler support for thread-local storage',
+)
+option(
+ 'cpu-features-path',
+ type : 'string',
+ description : 'Path to platform-specific cpu-features.[ch] for systems that do not provide it (e.g. Android)',
+)
+option(
+ 'openmp',
+ type : 'feature',
+ description : 'Enable OpenMP for tests',
+)
+option(
+ 'timers',
+ type : 'boolean',
+ value : false,
+ description : 'Enable TIMER_* macros',
+)
+option(
+ 'gnuplot',
+ type : 'boolean',
+ value : false,
+ description : 'Enable output of filters that can be piped to gnuplot',
+)
+option(
+ 'gtk',
+ type : 'feature',
+ description : 'Enable demos using GTK',
+)
+option(
+ 'libpng',
+ type : 'feature',
+ description : 'Use libpng in tests'
+)
+option(
+ 'tests',
+ type : 'feature',
+ description : 'Build tests'
+)
+option(
+ 'demos',
+ type : 'feature',
+ description : 'Build demos'
+)
diff --git a/neon-test.S b/neon-test.S
new file mode 100644
index 0000000..c30a399
--- /dev/null
+++ b/neon-test.S
@@ -0,0 +1,12 @@
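+/* Build-time probe used by meson.build: if the toolchain accepts these
+ * directives and NEON instructions, the ARM NEON fast paths are enabled.
+ */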
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+vmovn.u16 d0, q0
diff --git a/pixman-1-uninstalled.pc.in b/pixman-1-uninstalled.pc.in
deleted file mode 100644
index e0347d0..0000000
--- a/pixman-1-uninstalled.pc.in
+++ /dev/null
@@ -1,5 +0,0 @@
-Name: Pixman
-Description: The pixman library (version 1)
-Version: @PACKAGE_VERSION@
-Cflags: -I${pc_top_builddir}/${pcfiledir}/pixman
-Libs: ${pc_top_builddir}/${pcfiledir}/pixman/libpixman-1.la
diff --git a/pixman-1.pc.in b/pixman-1.pc.in
deleted file mode 100644
index e3b9711..0000000
--- a/pixman-1.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: Pixman
-Description: The pixman library (version 1)
-Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/pixman-1
-Libs: -L${libdir} -lpixman-1
-
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
deleted file mode 100644
index 581b6f6..0000000
--- a/pixman/Makefile.am
+++ /dev/null
@@ -1,141 +0,0 @@
-include $(top_srcdir)/pixman/Makefile.sources
-
-lib_LTLIBRARIES = libpixman-1.la
-
-libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@
-libpixman_1_la_LIBADD = @PTHREAD_LIBS@ -lm
-libpixman_1_la_SOURCES = $(libpixman_sources) $(libpixman_headers)
-
-libpixmanincludedir = $(includedir)/pixman-1
-libpixmaninclude_HEADERS = pixman.h pixman-version.h
-noinst_LTLIBRARIES =
-
-EXTRA_DIST = \
- Makefile.win32 \
- pixman-region.c \
- solaris-hwcap.mapfile \
- $(NULL)
-
-# mmx code
-if USE_X86_MMX
-noinst_LTLIBRARIES += libpixman-mmx.la
-libpixman_mmx_la_SOURCES = \
- pixman-mmx.c
-libpixman_mmx_la_CFLAGS = $(MMX_CFLAGS)
-libpixman_1_la_LDFLAGS += $(MMX_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-mmx.la
-
-ASM_CFLAGS_mmx=$(MMX_CFLAGS)
-endif
-
-# vmx code
-if USE_VMX
-noinst_LTLIBRARIES += libpixman-vmx.la
-libpixman_vmx_la_SOURCES = \
- pixman-vmx.c \
- pixman-combine32.h
-libpixman_vmx_la_CFLAGS = $(VMX_CFLAGS)
-libpixman_1_la_LIBADD += libpixman-vmx.la
-
-ASM_CFLAGS_vmx=$(VMX_CFLAGS)
-endif
-
-# sse2 code
-if USE_SSE2
-noinst_LTLIBRARIES += libpixman-sse2.la
-libpixman_sse2_la_SOURCES = \
- pixman-sse2.c
-libpixman_sse2_la_CFLAGS = $(SSE2_CFLAGS)
-libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-sse2.la
-
-ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
-endif
-
-# ssse3 code
-if USE_SSSE3
-noinst_LTLIBRARIES += libpixman-ssse3.la
-libpixman_ssse3_la_SOURCES = \
- pixman-ssse3.c
-libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS)
-libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-ssse3.la
-
-ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
-endif
-
-# arm simd code
-if USE_ARM_SIMD
-noinst_LTLIBRARIES += libpixman-arm-simd.la
-libpixman_arm_simd_la_SOURCES = \
- pixman-arm-simd.c \
- pixman-arm-common.h \
- pixman-arm-simd-asm.S \
- pixman-arm-simd-asm-scaled.S \
- pixman-arm-asm.h \
- pixman-arm-simd-asm.h
-libpixman_1_la_LIBADD += libpixman-arm-simd.la
-
-ASM_CFLAGS_arm_simd=
-endif
-
-# arm neon code
-if USE_ARM_NEON
-noinst_LTLIBRARIES += libpixman-arm-neon.la
-libpixman_arm_neon_la_SOURCES = \
- pixman-arm-neon.c \
- pixman-arm-common.h \
- pixman-arm-neon-asm.S \
- pixman-arm-neon-asm-bilinear.S \
- pixman-arm-asm.h \
- pixman-arm-neon-asm.h
-libpixman_1_la_LIBADD += libpixman-arm-neon.la
-
-ASM_CFLAGS_arm_neon=
-endif
-
-# iwmmxt code
-if USE_ARM_IWMMXT
-libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
-noinst_LTLIBRARIES += libpixman-iwmmxt.la
-libpixman_1_la_LIBADD += libpixman-iwmmxt.la
-
-libpixman_iwmmxt_la-pixman-mmx.lo: pixman-mmx.c
- $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(AM_CPPFLAGS) $(AM_CPPFLAGS) $(CPPFLAGS) $(CFLAGS) $(IWMMXT_CFLAGS) -MT libpixman_iwmmxt_la-pixman-mmx.lo -MD -MP -MF $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo -c -o libpixman_iwmmxt_la-pixman-mmx.lo `test -f 'pixman-mmx.c' || echo '$(srcdir)/'`pixman-mmx.c
- $(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Plo
-
-libpixman_iwmmxt_la_DEPENDENCIES = $(am__DEPENDENCIES_1)
-libpixman_iwmmxt_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
- $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
- $(CFLAGS) $(IWMMXT_CFLAGS) $(AM_LDFLAGS) \
- $(LDFLAGS) -o $@
-
-libpixman-iwmmxt.la: libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_DEPENDENCIES)
- $(AM_V_CCLD)$(libpixman_iwmmxt_la_LINK) libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_LIBADD) $(LIBS)
-endif
-
-# mips dspr2 code
-if USE_MIPS_DSPR2
-noinst_LTLIBRARIES += libpixman-mips-dspr2.la
-libpixman_mips_dspr2_la_SOURCES = \
- pixman-mips-dspr2.c \
- pixman-mips-dspr2.h \
- pixman-mips-dspr2-asm.S \
- pixman-mips-dspr2-asm.h \
- pixman-mips-memcpy-asm.S
-libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
-
-ASM_CFLAGS_mips_dspr2=
-endif
-
-# loongson code
-if USE_LOONGSON_MMI
-noinst_LTLIBRARIES += libpixman-loongson-mmi.la
-libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
-libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS)
-libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
-endif
-
-.c.s : $(libpixmaninclude_HEADERS)
- $(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/pixman/Makefile.sources b/pixman/Makefile.sources
deleted file mode 100644
index c624eb9..0000000
--- a/pixman/Makefile.sources
+++ /dev/null
@@ -1,42 +0,0 @@
-libpixman_sources = \
- pixman.c \
- pixman-access.c \
- pixman-access-accessors.c \
- pixman-bits-image.c \
- pixman-combine32.c \
- pixman-combine-float.c \
- pixman-conical-gradient.c \
- pixman-filter.c \
- pixman-x86.c \
- pixman-mips.c \
- pixman-arm.c \
- pixman-ppc.c \
- pixman-edge.c \
- pixman-edge-accessors.c \
- pixman-fast-path.c \
- pixman-glyph.c \
- pixman-general.c \
- pixman-gradient-walker.c \
- pixman-image.c \
- pixman-implementation.c \
- pixman-linear-gradient.c \
- pixman-matrix.c \
- pixman-noop.c \
- pixman-radial-gradient.c \
- pixman-region16.c \
- pixman-region32.c \
- pixman-solid-fill.c \
- pixman-timer.c \
- pixman-trap.c \
- pixman-utils.c \
- $(NULL)
-
-libpixman_headers = \
- pixman.h \
- pixman-accessor.h \
- pixman-combine32.h \
- pixman-compiler.h \
- pixman-edge-imp.h \
- pixman-inlines.h \
- pixman-private.h \
- $(NULL)
diff --git a/pixman/Makefile.win32 b/pixman/Makefile.win32
deleted file mode 100644
index 7b64033..0000000
--- a/pixman/Makefile.win32
+++ /dev/null
@@ -1,93 +0,0 @@
-default: all
-
-top_srcdir = ..
-include $(top_srcdir)/pixman/Makefile.sources
-include $(top_srcdir)/Makefile.win32.common
-
-MMX_VAR = $(MMX)
-ifeq ($(MMX_VAR),)
-MMX_VAR=on
-endif
-
-SSE2_VAR = $(SSE2)
-ifeq ($(SSE2_VAR),)
-SSE2_VAR=on
-endif
-
-SSSE3_VAR = $(SSSE3)
-ifeq ($(SSSE3_VAR),)
-SSSE3_VAR=on
-endif
-
-MMX_CFLAGS = -DUSE_X86_MMX -w14710 -w14714
-SSE2_CFLAGS = -DUSE_SSE2
-SSSE3_CFLAGS = -DUSE_SSSE3
-
-# MMX compilation flags
-ifeq ($(MMX_VAR),on)
-PIXMAN_CFLAGS += $(MMX_CFLAGS)
-libpixman_sources += pixman-mmx.c
-endif
-
-# SSE2 compilation flags
-ifeq ($(SSE2_VAR),on)
-PIXMAN_CFLAGS += $(SSE2_CFLAGS)
-libpixman_sources += pixman-sse2.c
-endif
-
-# SSSE3 compilation flags
-ifeq ($(SSSE3_VAR),on)
-PIXMAN_CFLAGS += $(SSSE3_CFLAGS)
-libpixman_sources += pixman-ssse3.c
-endif
-
-OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libpixman_sources))
-
-# targets
-all: inform informMMX informSSE2 informSSSE3 $(CFG_VAR)/$(LIBRARY).lib
-
-informMMX:
-ifneq ($(MMX),off)
-ifneq ($(MMX),on)
-ifneq ($(MMX),)
- @echo "Invalid specified MMX option : "$(MMX_VAR)"."
- @echo
- @echo "Possible choices for MMX are 'on' or 'off'"
- @exit 1
-endif
- @echo "Setting MMX flag to default value 'on'... (use MMX=on or MMX=off)"
-endif
-endif
-
-informSSE2:
-ifneq ($(SSE2),off)
-ifneq ($(SSE2),on)
-ifneq ($(SSE2),)
- @echo "Invalid specified SSE option : "$(SSE2)"."
- @echo
- @echo "Possible choices for SSE2 are 'on' or 'off'"
- @exit 1
-endif
- @echo "Setting SSE2 flag to default value 'on'... (use SSE2=on or SSE2=off)"
-endif
-endif
-
-informSSSE3:
-ifneq ($(SSSE3),off)
-ifneq ($(SSSE3),on)
-ifneq ($(SSSE3),)
- @echo "Invalid specified SSE option : "$(SSSE3)"."
- @echo
- @echo "Possible choices for SSSE3 are 'on' or 'off'"
- @exit 1
-endif
- @echo "Setting SSSE3 flag to default value 'on'... (use SSSE3=on or SSSE3=off)"
-endif
-endif
-
-
-# pixman linking
-$(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS)
- @$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
-
-.PHONY: all informMMX informSSE2 informSSSE3
diff --git a/pixman/dither/blue-noise-64x64.h b/pixman/dither/blue-noise-64x64.h
new file mode 100644
index 0000000..93c8805
--- /dev/null
+++ b/pixman/dither/blue-noise-64x64.h
@@ -0,0 +1,77 @@
+/* WARNING: This file is generated by make-blue-noise.c
+ * Please edit that file instead of this one.
+ */
+
+#ifndef BLUE_NOISE_64X64_H
+#define BLUE_NOISE_64X64_H
+
+#include <stdint.h>
+
+static const uint16_t dither_blue_noise_64x64[4096] = {
+ 3039, 1368, 3169, 103, 2211, 1248, 2981, 668, 2633, 37, 3963, 2903, 384, 2564, 3115, 1973, 3348, 830, 2505, 1293, 3054, 1060, 1505, 3268, 400, 1341, 593, 3802, 3384, 429, 4082, 1411, 2503, 3863, 126, 1292, 1887, 2855, 205, 2094, 2977, 1899, 3924, 356, 3088, 2500, 3942, 1409, 2293, 1734, 3732, 1291, 3227, 277, 2054, 786, 2871, 411, 2425, 1678, 3986, 455, 2879, 2288,
+ 388, 1972, 3851, 778, 2768, 3697, 944, 2123, 1501, 3533, 937, 1713, 1381, 3888, 156, 1242, 516, 2888, 1607, 3676, 632, 2397, 3804, 2673, 1898, 3534, 2593, 1777, 1170, 2299, 3013, 1838, 523, 3053, 1647, 3601, 3197, 959, 1520, 3633, 893, 2437, 3367, 2187, 1258, 137, 1965, 401, 3546, 643, 3087, 2498, 733, 2786, 3371, 4053, 1266, 1977, 3663, 183, 2570, 2107, 1183, 3708,
+ 907, 2473, 1151, 3363, 1527, 1902, 232, 3903, 3060, 496, 2486, 3206, 2165, 861, 2387, 3653, 2101, 3972, 132, 2162, 3437, 1827, 215, 895, 3114, 271, 969, 2932, 197, 1598, 878, 3696, 1140, 2120, 904, 2431, 302, 3846, 2675, 481, 3187, 66, 1440, 650, 3833, 2826, 3435, 901, 2936, 2111, 250, 1875, 3609, 1174, 1747, 162, 2346, 3420, 913, 3172, 1383, 752, 3298, 1735,
+ 3540, 2938, 249, 2324, 526, 3099, 2561, 1324, 2347, 1861, 1200, 3702, 257, 3442, 1514, 2999, 992, 1766, 2735, 1163, 478, 2943, 1279, 3635, 2177, 1464, 3672, 2386, 3871, 3340, 2690, 64, 3489, 2811, 3999, 633, 1948, 1243, 2269, 1807, 1143, 2750, 3729, 1790, 2363, 1053, 1537, 2636, 4065, 1076, 1476, 3869, 450, 2200, 2676, 658, 2979, 1548, 544, 1913, 2838, 3911, 116, 2698,
+ 517, 1295, 3997, 1739, 3665, 1083, 3509, 599, 3400, 118, 2956, 720, 2689, 1907, 567, 2523, 284, 3397, 711, 3219, 2450, 3985, 1665, 2549, 562, 3011, 1855, 729, 1355, 528, 1908, 2456, 1384, 337, 1540, 2654, 3138, 3513, 703, 4080, 3314, 2047, 855, 3037, 209, 3317, 577, 1828, 17, 2336, 3193, 2748, 962, 3441, 1450, 3246, 1075, 3878, 2615, 3497, 1033, 2310, 1442, 2183,
+ 1654, 3254, 2061, 738, 2832, 148, 2030, 1670, 909, 3850, 2109, 1533, 4046, 1085, 3098, 3897, 1378, 2248, 3829, 1495, 1966, 23, 797, 3427, 1124, 4057, 95, 2787, 2190, 3074, 3950, 742, 3194, 1999, 3386, 1113, 16, 1657, 2804, 201, 1543, 383, 2559, 1325, 3604, 2068, 2493, 3771, 1284, 3460, 710, 1716, 2447, 80, 3811, 2032, 347, 2227, 15, 1689, 397, 3084, 662, 3798,
+ 973, 43, 2608, 3143, 1459, 2423, 4066, 2770, 3191, 1283, 2630, 314, 3235, 2289, 72, 1822, 2840, 924, 350, 2653, 1057, 3715, 2235, 2775, 346, 2083, 1553, 3292, 1081, 274, 1686, 1188, 2327, 3743, 578, 2234, 3916, 2519, 1011, 3056, 2207, 3438, 3890, 537, 1617, 837, 3094, 373, 2795, 1980, 276, 3951, 1353, 3015, 844, 1724, 3651, 2923, 1316, 4092, 2504, 3627, 1936, 2854,
+ 2461, 3929, 1193, 421, 3746, 820, 1180, 286, 2261, 532, 3625, 1812, 802, 1327, 3527, 670, 3730, 2025, 3124, 3565, 529, 2960, 1769, 1390, 3196, 2494, 3756, 796, 3618, 2602, 3463, 2847, 166, 953, 1745, 2900, 438, 2070, 1418, 3741, 639, 1205, 1891, 2882, 2282, 4012, 1182, 1696, 3630, 951, 2904, 2170, 3530, 375, 2320, 2742, 1132, 701, 3216, 2023, 847, 1230, 310, 3431,
+ 770, 1961, 3531, 1702, 2181, 3370, 1877, 3072, 1571, 3389, 1071, 2415, 3782, 2803, 1610, 2454, 1211, 182, 1655, 2322, 1282, 3372, 287, 3935, 704, 1232, 415, 1910, 2286, 1399, 556, 1964, 4068, 2444, 3605, 1272, 3345, 816, 3526, 256, 2402, 2777, 955, 345, 3289, 111, 2727, 635, 2396, 1488, 3331, 600, 1032, 1575, 4026, 515, 3507, 2433, 1605, 460, 3364, 2783, 1810, 1397,
+ 2334, 223, 2945, 688, 2533, 99, 2705, 624, 3944, 2073, 46, 2978, 508, 2132, 269, 3173, 3453, 2631, 4076, 694, 1892, 2586, 972, 2178, 3470, 1695, 2849, 3141, 77, 3884, 994, 3029, 1536, 673, 3083, 124, 2583, 1722, 2821, 1944, 4027, 1661, 3176, 3728, 1337, 1813, 3503, 2035, 3930, 157, 2537, 1865, 3096, 2646, 1941, 3252, 1449, 135, 2836, 3758, 2139, 84, 3678, 3106,
+ 3862, 1545, 3307, 1320, 3955, 1031, 3664, 1306, 2460, 776, 1487, 3294, 1187, 3990, 1903, 1021, 549, 1484, 943, 3027, 97, 3853, 1499, 2880, 198, 2575, 3995, 1089, 1587, 2475, 3282, 339, 2657, 1158, 2105, 1493, 3943, 580, 3232, 1287, 846, 48, 2480, 2112, 771, 2534, 459, 3134, 850, 1298, 3790, 325, 3652, 1249, 193, 940, 2202, 3895, 1829, 911, 1366, 2577, 1069, 534,
+ 2104, 1009, 2667, 392, 1983, 2917, 1645, 324, 3439, 2869, 3705, 1767, 2592, 756, 2916, 3683, 2276, 2850, 2053, 3594, 2403, 3181, 634, 3699, 1933, 906, 519, 2150, 3673, 764, 1770, 2220, 3795, 3336, 502, 3547, 2339, 1110, 301, 2210, 3354, 3643, 569, 1518, 2940, 3973, 1138, 1613, 2773, 2127, 2983, 1671, 769, 2161, 3800, 2730, 3127, 1179, 533, 3259, 2284, 4014, 1651, 2820,
+ 3566, 653, 1839, 3455, 2399, 789, 3149, 2244, 1863, 1099, 474, 2307, 158, 3541, 1312, 1711, 0, 3902, 360, 1629, 1091, 395, 1781, 1191, 2374, 3353, 1419, 3225, 206, 2931, 3553, 1046, 54, 1646, 2470, 910, 1860, 3137, 3770, 2635, 1562, 2809, 1215, 3788, 222, 2199, 3335, 67, 3606, 524, 1001, 3309, 2410, 3473, 591, 1619, 291, 2502, 3629, 2891, 335, 741, 3378, 168,
+ 2384, 3129, 4051, 22, 1444, 3613, 543, 3893, 186, 2665, 4062, 933, 3058, 2142, 449, 2711, 3224, 849, 1330, 3349, 2195, 2670, 3484, 2993, 32, 3774, 2722, 1859, 2548, 1268, 583, 2027, 3165, 2807, 4029, 227, 2897, 1434, 721, 1816, 195, 905, 2066, 3258, 1754, 970, 2674, 1880, 2338, 3915, 1485, 2660, 14, 1313, 2914, 2046, 4074, 791, 1917, 1301, 1725, 2687, 2019, 1443,
+ 418, 1186, 1664, 2859, 1049, 2056, 2741, 1226, 1589, 3186, 2042, 1377, 3449, 1574, 3941, 1063, 1930, 2501, 3751, 2930, 671, 4031, 888, 2081, 1544, 684, 1117, 351, 4052, 1698, 2393, 3881, 1439, 785, 1277, 2013, 3488, 441, 2459, 3980, 3061, 3481, 2543, 419, 3020, 609, 3515, 1350, 799, 2878, 348, 2034, 3966, 1824, 950, 3281, 1394, 2239, 3452, 55, 3922, 3119, 892, 3785,
+ 3023, 2140, 782, 2492, 3817, 241, 3355, 2424, 856, 3639, 612, 2556, 245, 2858, 705, 2316, 3562, 495, 1748, 128, 1912, 1454, 280, 2552, 3905, 3130, 2274, 3472, 834, 3055, 240, 2692, 471, 2272, 3301, 2632, 1080, 3693, 2136, 1029, 1364, 590, 1611, 4067, 1190, 2360, 3827, 261, 3180, 1768, 3471, 1103, 3003, 520, 3674, 151, 2571, 555, 3033, 982, 2353, 504, 1259, 2555,
+ 149, 3889, 3380, 493, 3178, 1681, 663, 1924, 2990, 49, 1792, 3861, 1192, 1987, 3273, 297, 1457, 3043, 1177, 2292, 3249, 2829, 3682, 1154, 1758, 428, 2872, 1993, 1500, 3703, 1129, 3421, 1840, 3754, 163, 659, 1733, 3182, 38, 2875, 1957, 3614, 2237, 78, 1873, 2801, 1513, 2121, 1074, 2516, 667, 3710, 1429, 2430, 2088, 2830, 1072, 3557, 1531, 2733, 1955, 3286, 3590, 1826,
+ 2778, 1068, 1932, 1452, 2279, 1185, 3564, 3952, 1391, 2726, 3313, 2331, 870, 3709, 1674, 2772, 4085, 808, 2596, 3848, 927, 538, 2335, 3334, 773, 3597, 1347, 109, 2663, 608, 2108, 2994, 936, 1524, 2922, 3968, 2422, 1467, 845, 3870, 321, 2704, 1073, 3308, 3680, 823, 430, 3375, 4030, 112, 2171, 2695, 267, 3374, 731, 1627, 3919, 1871, 352, 3839, 1370, 234, 794, 1532,
+ 3245, 647, 3575, 74, 3045, 2766, 285, 2174, 498, 1059, 1551, 385, 3125, 2598, 143, 1128, 2095, 3395, 318, 1590, 3524, 1345, 1969, 242, 2759, 2092, 947, 3926, 3244, 2356, 1658, 6, 3593, 2554, 1172, 1995, 371, 2755, 3417, 2294, 1570, 3164, 748, 2517, 1401, 3111, 2420, 1662, 2910, 1276, 3276, 854, 1804, 4000, 1253, 2987, 229, 2344, 3184, 649, 2196, 2921, 4095, 2389,
+ 1289, 2193, 2579, 4023, 757, 1858, 986, 3199, 2514, 3475, 4021, 2154, 651, 1432, 3468, 2404, 574, 1799, 3105, 2145, 86, 2614, 3218, 1565, 4088, 2481, 3079, 1815, 323, 1212, 3837, 759, 2159, 435, 3223, 784, 3659, 1114, 1888, 550, 1221, 3786, 1803, 499, 2117, 185, 3763, 942, 589, 2001, 3838, 1483, 3154, 2256, 468, 2544, 3403, 898, 1208, 2610, 3622, 967, 1929, 378,
+ 3781, 220, 1656, 1115, 3347, 2428, 3822, 1577, 712, 1959, 110, 2765, 1762, 3854, 979, 2928, 3714, 1371, 746, 3969, 2884, 975, 3779, 641, 1142, 159, 1460, 702, 3485, 2866, 2495, 3330, 1305, 3937, 1635, 2229, 2962, 146, 4055, 3091, 2417, 100, 3508, 2933, 4006, 1167, 1920, 2760, 3552, 2545, 433, 2845, 142, 1056, 1886, 3616, 1435, 2099, 3803, 1749, 27, 1446, 3350, 2843,
+ 884, 3310, 2948, 2103, 447, 1351, 187, 2895, 3655, 1256, 3036, 932, 3325, 2257, 451, 1915, 40, 2780, 2438, 1112, 1814, 423, 2290, 1905, 2898, 3419, 2306, 3760, 1938, 486, 1019, 1791, 3010, 2628, 203, 3408, 1269, 2507, 1606, 862, 2779, 2078, 952, 1529, 2638, 708, 3332, 1413, 2, 1726, 1156, 3500, 2392, 3791, 3076, 812, 107, 2861, 501, 3050, 3487, 2455, 594, 1731,
+ 2685, 1498, 680, 3908, 2621, 3529, 1786, 2236, 342, 2569, 1526, 3722, 230, 1290, 3203, 3947, 1609, 3516, 467, 3267, 3685, 1461, 3140, 3569, 367, 1759, 928, 2754, 1332, 2219, 4034, 260, 655, 1984, 978, 3814, 617, 2086, 3525, 279, 3841, 1373, 3361, 319, 2251, 3066, 407, 2382, 3918, 3133, 2168, 762, 1523, 507, 2641, 1677, 4025, 2413, 1584, 793, 2049, 1109, 3962, 2218,
+ 1194, 3692, 266, 1687, 981, 3103, 740, 3983, 1005, 3434, 570, 2383, 1942, 2718, 676, 2462, 1007, 2089, 1308, 2222, 233, 2568, 829, 1241, 2669, 3987, 514, 3303, 69, 3142, 1603, 3560, 2295, 3288, 1497, 2696, 1764, 2865, 1058, 3271, 1914, 477, 2529, 3927, 1736, 1273, 3752, 2029, 1012, 565, 2798, 4078, 1949, 3305, 1175, 2179, 380, 3366, 1195, 3849, 2637, 416, 2959, 125,
+ 3396, 2467, 2036, 3234, 2340, 68, 2819, 1436, 2011, 3139, 1704, 4073, 860, 3582, 1468, 2969, 211, 3157, 4056, 866, 2935, 2000, 3923, 31, 2157, 1477, 2429, 1147, 3792, 2557, 774, 2802, 1153, 3747, 464, 3192, 42, 3904, 539, 1474, 2283, 803, 2876, 1061, 75, 3477, 747, 2893, 1538, 3626, 251, 1322, 2506, 189, 2791, 3667, 939, 2991, 1971, 175, 3195, 1416, 3648, 1857,
+ 3052, 454, 851, 3789, 1271, 1906, 3694, 2484, 406, 2757, 26, 1189, 2909, 296, 2215, 3784, 1864, 637, 2715, 1673, 3445, 581, 1572, 3059, 3469, 761, 2984, 1737, 2058, 440, 1414, 1921, 121, 2527, 894, 2223, 1302, 2377, 3077, 2666, 3759, 3198, 1811, 3661, 2166, 2731, 1883, 359, 3285, 2458, 1805, 3459, 926, 3834, 675, 1893, 1496, 2612, 657, 3523, 1763, 2354, 564, 961,
+ 1367, 3977, 1588, 2714, 322, 3446, 1088, 625, 3887, 1354, 3535, 2090, 3316, 1760, 1127, 483, 3491, 1421, 2301, 94, 1202, 3740, 2311, 1014, 1878, 3836, 180, 3412, 991, 2868, 3953, 3450, 3081, 1632, 4071, 1882, 3543, 726, 1719, 179, 1171, 364, 1420, 622, 3090, 1490, 946, 4007, 2212, 1102, 619, 2739, 2189, 1669, 2937, 3426, 39, 3940, 2191, 1264, 887, 4091, 2792, 2135,
+ 4, 2883, 2281, 631, 3044, 1641, 2232, 3243, 1773, 2319, 827, 2591, 629, 3938, 2426, 3222, 2629, 1044, 3879, 3293, 1952, 2749, 275, 2590, 472, 1372, 2496, 660, 3669, 2264, 208, 915, 2167, 561, 2828, 307, 3265, 1104, 3964, 2155, 3425, 1951, 4077, 2391, 283, 3387, 2581, 115, 1415, 3069, 3896, 141, 3158, 1214, 442, 2405, 1349, 3085, 425, 2528, 3002, 312, 1602, 3588,
+ 1137, 3323, 1963, 1002, 3578, 2521, 127, 925, 2970, 273, 3737, 1573, 167, 2863, 1509, 800, 147, 2059, 2942, 409, 921, 3151, 1451, 3909, 3333, 2844, 2096, 1512, 3136, 1210, 1798, 2709, 1331, 3586, 1034, 1521, 2441, 2926, 488, 2585, 775, 3031, 2693, 879, 3602, 1173, 2028, 3654, 2781, 841, 1975, 1507, 3646, 768, 3991, 2012, 996, 3544, 1666, 3810, 1990, 3360, 753, 2597,
+ 3736, 304, 1473, 3828, 485, 1334, 4008, 2072, 3495, 1136, 2806, 2004, 3236, 1010, 2130, 3819, 1750, 3567, 644, 2515, 1794, 3636, 698, 2137, 1162, 832, 3761, 326, 2613, 513, 3302, 3820, 357, 3163, 2259, 3733, 101, 1922, 1386, 3587, 1640, 28, 1286, 2141, 1761, 2918, 693, 1639, 457, 3250, 2434, 365, 2599, 1729, 3284, 2643, 306, 2793, 689, 1090, 104, 1309, 2305, 1831,
+ 2776, 859, 2446, 2915, 1778, 3337, 2677, 614, 1508, 2409, 469, 4033, 1321, 3563, 402, 3131, 2720, 1093, 1569, 4042, 1229, 2277, 216, 3046, 1817, 57, 3006, 1684, 4059, 2016, 795, 2440, 1652, 1960, 610, 2763, 920, 3864, 3110, 1026, 2326, 3762, 3233, 521, 3856, 173, 2457, 3939, 2138, 1262, 3572, 989, 3021, 2238, 119, 1445, 3832, 1809, 2297, 3467, 2700, 3684, 3102, 394,
+ 4036, 2050, 3256, 89, 2198, 1079, 248, 1845, 3805, 3104, 880, 1779, 2688, 717, 2373, 1375, 262, 2249, 3071, 13, 2813, 3429, 1600, 3984, 2416, 3603, 1299, 2298, 998, 3492, 1393, 2951, 10, 4009, 1247, 3462, 1679, 2204, 414, 2736, 316, 1894, 2816, 1050, 3373, 1462, 3107, 817, 3464, 21, 1835, 4070, 568, 1178, 3718, 875, 3168, 466, 2974, 1458, 2084, 616, 1564, 1018,
+ 1693, 546, 1244, 3899, 716, 3160, 3608, 2877, 1220, 334, 3443, 2270, 44, 3000, 1843, 3928, 3405, 766, 3686, 2040, 587, 993, 2647, 387, 930, 2753, 630, 3274, 150, 2808, 453, 3638, 1092, 2352, 3030, 239, 2562, 700, 3240, 1257, 4016, 730, 1515, 2203, 2551, 417, 1866, 1123, 2348, 2902, 1550, 2678, 2075, 3238, 1630, 2531, 2115, 1255, 4054, 840, 290, 3874, 2477, 3399,
+ 2250, 3577, 2817, 1626, 2576, 1356, 2315, 792, 2087, 2618, 1612, 3855, 1263, 3637, 1036, 494, 1535, 2553, 1198, 1715, 3867, 3170, 1359, 1954, 3483, 1539, 2069, 3886, 1772, 2487, 1534, 2045, 3242, 806, 1578, 2018, 3948, 1423, 3596, 2076, 2466, 3424, 139, 3688, 871, 4049, 2852, 3342, 547, 3719, 327, 852, 3505, 207, 2794, 542, 3600, 45, 2411, 3324, 1788, 3012, 1235, 61,
+ 2655, 917, 253, 1986, 3738, 313, 1706, 4072, 120, 3229, 957, 597, 2024, 3262, 2453, 2857, 2002, 3190, 210, 2784, 2206, 300, 2400, 3766, 553, 3152, 218, 1150, 2988, 883, 3753, 627, 2664, 3831, 437, 3385, 1008, 2957, 60, 1636, 891, 2899, 1776, 3062, 1315, 2026, 194, 1643, 2079, 1296, 3201, 2465, 1379, 1927, 3898, 1125, 1847, 2846, 1552, 1028, 2725, 2169, 787, 3202,
+ 1441, 3982, 3032, 1052, 3251, 605, 2639, 3073, 1431, 3642, 2329, 2949, 341, 1634, 833, 129, 4020, 916, 3571, 669, 1506, 3411, 821, 2856, 1207, 2337, 2683, 3448, 340, 2214, 3128, 235, 1738, 1288, 2833, 2419, 606, 1884, 2668, 552, 3765, 1176, 399, 2302, 596, 3591, 2634, 767, 3845, 2767, 995, 3967, 491, 3057, 814, 2300, 3422, 691, 3797, 254, 3645, 509, 3478, 1836,
+ 2119, 475, 2445, 1525, 2175, 3539, 914, 1926, 473, 1157, 1800, 3971, 2701, 3739, 2129, 3486, 1333, 1784, 2366, 2982, 1070, 4089, 1802, 73, 1642, 3958, 835, 1837, 1480, 4043, 1217, 2469, 3416, 2113, 88, 3668, 1240, 3255, 3920, 2355, 3167, 2003, 2645, 3936, 3228, 1592, 1144, 3474, 2394, 79, 1820, 2241, 1594, 3656, 2584, 153, 1448, 3034, 2005, 2511, 1692, 1335, 3913, 217,
+ 2822, 3391, 745, 3813, 192, 1274, 2941, 3847, 2489, 3440, 744, 161, 1422, 1086, 572, 3004, 2617, 338, 3807, 2031, 236, 2472, 3065, 2098, 3358, 362, 2163, 3574, 497, 2788, 1970, 948, 3885, 685, 3100, 1712, 2228, 292, 1408, 1016, 164, 3537, 1417, 941, 34, 2172, 3001, 358, 1491, 3147, 699, 3356, 258, 1149, 2946, 1787, 3931, 382, 1146, 3291, 818, 2890, 2379, 1096,
+ 3679, 1328, 1901, 3162, 2747, 1730, 2253, 5, 1556, 2818, 2093, 3166, 2522, 3410, 2287, 1701, 956, 3237, 620, 1596, 3300, 1307, 511, 3701, 1020, 2939, 1362, 2532, 3208, 749, 3641, 160, 1522, 2624, 1095, 4086, 826, 2841, 3583, 2173, 1727, 723, 2925, 1911, 2482, 3726, 863, 1962, 4028, 1111, 2835, 3773, 2449, 2022, 582, 3278, 923, 2619, 2152, 4039, 92, 1934, 3145, 677,
+ 2530, 53, 2303, 1003, 458, 3989, 739, 3321, 1064, 369, 3556, 877, 1900, 426, 3876, 1, 3617, 2106, 1197, 2805, 3634, 857, 2706, 1504, 2418, 682, 3868, 20, 1139, 1688, 2333, 3311, 2907, 1945, 265, 2385, 3433, 1601, 636, 2620, 3095, 4044, 386, 3382, 1184, 527, 2814, 3414, 2342, 465, 1889, 1343, 874, 3479, 1502, 2233, 3689, 1385, 559, 2745, 1463, 3465, 376, 1718,
+ 3217, 4045, 1580, 3612, 2525, 1228, 3018, 1958, 3725, 2358, 1361, 3996, 1581, 3063, 1224, 2737, 1475, 2442, 3946, 191, 1796, 2128, 3975, 134, 1916, 3318, 1597, 2071, 3749, 2672, 403, 1278, 602, 3745, 3220, 1374, 445, 2064, 3830, 243, 1252, 2390, 1563, 2724, 3875, 1818, 1346, 165, 1650, 3264, 2680, 117, 2998, 4081, 343, 2799, 9, 3122, 1743, 3724, 1040, 2231, 3842, 1209,
+ 900, 398, 2851, 697, 1797, 3482, 293, 2679, 1649, 566, 2954, 91, 2697, 714, 2060, 3211, 781, 480, 3040, 1038, 2611, 666, 2989, 3458, 1201, 2796, 548, 2975, 839, 3121, 1850, 4001, 2208, 1631, 790, 2558, 2972, 1148, 3213, 1849, 3624, 971, 2102, 108, 772, 3101, 2589, 3777, 1042, 656, 3907, 2097, 1615, 2540, 805, 1935, 1231, 3494, 2451, 268, 2995, 750, 2682, 2020,
+ 3024, 1392, 2124, 3279, 106, 2217, 1387, 822, 3214, 3825, 2160, 1000, 2395, 3691, 228, 4038, 1872, 3413, 1608, 2225, 3536, 303, 1653, 886, 2541, 224, 4037, 2252, 1428, 172, 3504, 958, 2848, 113, 3628, 1834, 3979, 19, 2317, 779, 2797, 518, 3174, 3549, 1482, 2266, 444, 2014, 3555, 2439, 1213, 3113, 535, 1135, 3204, 3858, 2309, 931, 623, 2009, 3359, 1566, 140, 3550,
+ 1808, 3872, 2488, 1152, 3764, 2892, 3960, 2412, 353, 1223, 1825, 3444, 3116, 1717, 1082, 2313, 1280, 2661, 82, 3852, 1389, 3200, 2330, 3812, 2038, 3581, 1728, 1039, 3339, 2427, 586, 2580, 1238, 3328, 2280, 1047, 595, 2662, 1363, 3338, 1620, 3934, 2497, 1881, 1054, 3954, 3215, 864, 2887, 1801, 320, 3519, 2378, 3704, 1753, 424, 2958, 1660, 4005, 2601, 1116, 3912, 2381, 573,
+ 2740, 200, 828, 1667, 432, 1931, 1035, 1616, 3598, 2640, 728, 264, 1437, 557, 3501, 2966, 372, 3734, 974, 1978, 758, 2719, 1145, 452, 1433, 725, 2681, 408, 3843, 1918, 1547, 3906, 1996, 503, 1456, 3019, 3493, 1700, 3742, 355, 2134, 176, 1311, 615, 2867, 315, 1680, 1314, 8, 3297, 1494, 783, 1950, 83, 2656, 1382, 3561, 138, 2834, 1404, 330, 1904, 3156, 1027,
+ 1357, 3381, 3041, 3666, 2729, 734, 3415, 177, 3051, 2021, 4079, 2823, 3775, 2186, 2616, 869, 1668, 3148, 2367, 3315, 393, 4075, 1870, 2920, 3343, 2362, 3188, 1303, 2782, 825, 3171, 259, 2905, 3717, 2538, 184, 2074, 838, 2860, 2407, 1024, 3496, 3008, 3706, 1985, 2349, 3623, 2582, 4058, 2184, 2694, 3873, 2964, 990, 3346, 690, 2033, 1066, 2201, 3490, 2971, 718, 3700, 2188,
+ 4061, 391, 1989, 2325, 1430, 3150, 2125, 2526, 592, 1403, 976, 2351, 1165, 1851, 114, 3921, 2063, 613, 1358, 2785, 1623, 2254, 25, 3542, 1045, 246, 1852, 3554, 87, 2243, 3615, 1169, 727, 1705, 968, 3957, 3185, 1251, 500, 4063, 1751, 2622, 842, 1519, 90, 3393, 819, 490, 1874, 999, 571, 1275, 2271, 1586, 4040, 2448, 3126, 3731, 436, 885, 1708, 2421, 24, 1599,
+ 889, 2563, 1199, 645, 70, 4013, 1237, 3723, 1694, 3499, 3, 3266, 484, 2997, 3390, 1233, 2842, 3687, 152, 3480, 1084, 3698, 881, 2490, 1542, 3992, 2209, 692, 1690, 3022, 1470, 2625, 2114, 3512, 2359, 381, 2684, 1897, 3368, 1395, 3080, 289, 2065, 3981, 2758, 1141, 3097, 1472, 2870, 3352, 3707, 225, 3159, 505, 1895, 214, 1222, 1774, 2686, 3978, 3275, 1196, 3518, 2825,
+ 3270, 1720, 3796, 3466, 2650, 1841, 298, 899, 2862, 2091, 2671, 1744, 3735, 801, 1560, 349, 2262, 903, 1833, 2524, 512, 3117, 1793, 2827, 476, 3038, 1216, 2550, 3826, 980, 431, 4048, 35, 2992, 1265, 1595, 765, 3675, 76, 2247, 696, 3456, 1254, 2452, 664, 1757, 2133, 3750, 145, 2332, 1554, 1981, 3580, 2712, 868, 3640, 2919, 638, 2275, 1427, 309, 2595, 2006, 492,
+ 2226, 178, 2911, 836, 1528, 3028, 2240, 3327, 404, 3970, 707, 1294, 2464, 2131, 4032, 2600, 3319, 1406, 2913, 3974, 2156, 1425, 221, 3877, 2017, 811, 3662, 272, 3287, 1988, 2408, 3357, 1746, 598, 3239, 3823, 2182, 2934, 1078, 2604, 3840, 1697, 2906, 413, 3210, 3880, 331, 2644, 1260, 848, 3042, 2535, 1077, 1438, 3261, 2365, 1561, 3799, 85, 3082, 1876, 674, 3932, 1101,
+ 3644, 1344, 1943, 2401, 390, 3835, 1048, 2572, 1541, 1133, 3075, 3584, 308, 2889, 1065, 1869, 601, 3783, 282, 1181, 736, 3312, 2368, 1126, 3383, 1675, 2734, 1426, 628, 2873, 1317, 843, 2717, 2048, 1004, 2536, 333, 1782, 3295, 1517, 219, 2153, 815, 3502, 1579, 2268, 987, 3409, 1780, 4018, 354, 665, 3914, 47, 1956, 456, 1006, 2010, 3406, 1130, 3621, 2894, 1549, 3092,
+ 2485, 640, 3993, 3179, 1270, 3436, 585, 1925, 3757, 2304, 136, 1976, 1486, 646, 3520, 50, 3155, 1637, 2435, 3522, 1937, 2756, 3748, 661, 2224, 58, 3230, 2357, 1830, 3892, 170, 3607, 1447, 3949, 190, 3392, 1336, 584, 4010, 918, 3016, 3670, 1155, 2406, 52, 1304, 3009, 607, 2085, 2699, 3205, 1848, 2291, 3402, 2764, 3865, 3048, 2508, 735, 2710, 443, 2341, 897, 263,
+ 1785, 2769, 983, 56, 2197, 1685, 2703, 202, 2944, 810, 3377, 2626, 3787, 3047, 2055, 1236, 2752, 2122, 945, 3093, 96, 1624, 439, 3014, 1388, 4015, 977, 448, 3506, 1098, 2242, 3026, 506, 2361, 2952, 1862, 3619, 2790, 1992, 2483, 525, 1868, 2652, 4093, 1998, 3595, 2478, 3816, 122, 1412, 929, 3716, 1166, 1648, 813, 1300, 199, 1489, 3998, 1771, 1310, 3808, 2052, 3423,
+ 434, 3712, 1625, 3558, 2955, 853, 4019, 1348, 3511, 1732, 1246, 487, 934, 1672, 2510, 3965, 788, 3711, 396, 1369, 4090, 1055, 2603, 1879, 3528, 2518, 2067, 3005, 1516, 2588, 751, 1740, 3418, 1131, 1576, 686, 2296, 1118, 18, 3263, 1365, 3401, 294, 737, 3177, 410, 867, 1633, 2963, 3579, 2375, 252, 2881, 479, 2471, 3576, 2180, 3306, 332, 2255, 3035, 41, 2648, 1396,
+ 2929, 2230, 1219, 2512, 446, 2008, 3189, 2388, 626, 2164, 2831, 4047, 2376, 174, 3272, 368, 1469, 3226, 2578, 1991, 2874, 2263, 3681, 876, 188, 1239, 683, 3776, 226, 3183, 4083, 2148, 63, 2649, 3859, 299, 3086, 3933, 1585, 2185, 3767, 988, 1707, 2908, 1407, 1844, 2771, 2245, 1161, 560, 1755, 3376, 2051, 4064, 3135, 1832, 652, 2853, 1051, 3649, 760, 3290, 1105, 3945,
+ 872, 154, 3207, 713, 3780, 1453, 281, 1087, 3695, 30, 3299, 1919, 1400, 3551, 1119, 1890, 2314, 618, 1703, 3428, 724, 295, 3146, 1557, 3341, 2896, 1683, 2723, 1974, 1017, 541, 1380, 3720, 804, 3280, 2082, 997, 2567, 777, 2961, 213, 2707, 2328, 3632, 1025, 3891, 3304, 255, 4003, 3108, 2587, 1323, 743, 1479, 105, 1013, 3901, 1618, 2044, 2627, 1465, 1846, 576, 1994,
+ 2560, 3521, 1742, 2118, 2800, 3404, 1783, 2609, 2968, 1582, 1022, 412, 2713, 687, 2976, 3857, 2761, 3620, 62, 1108, 3844, 1340, 2100, 540, 2345, 3925, 405, 3457, 1319, 2468, 3362, 2815, 1867, 2372, 1281, 1714, 3690, 482, 3498, 1842, 1285, 3994, 558, 2039, 81, 2499, 678, 1481, 1923, 964, 12, 3824, 2980, 2205, 2762, 3432, 2398, 181, 3247, 462, 4094, 2350, 3589, 3089,
+ 1555, 1094, 4041, 247, 1267, 908, 3959, 2041, 732, 3860, 2343, 3132, 3769, 2144, 1621, 237, 912, 1329, 3025, 2146, 2642, 1775, 3721, 2746, 1121, 1953, 902, 2285, 130, 3671, 1659, 278, 3153, 522, 2721, 123, 2996, 1466, 2380, 377, 3231, 873, 1510, 3476, 3123, 1250, 2147, 3650, 2839, 3451, 2323, 1122, 3545, 379, 1765, 1218, 603, 3768, 1360, 938, 2885, 133, 1245, 363,
+ 2364, 554, 2743, 3344, 2474, 530, 3112, 169, 1297, 3430, 536, 1741, 98, 1043, 2574, 3253, 2246, 1854, 4022, 510, 3283, 204, 858, 3398, 36, 3118, 1478, 3794, 2986, 706, 2176, 922, 3559, 1097, 3976, 3322, 2149, 1160, 2810, 3883, 2007, 2513, 2953, 328, 1721, 3793, 422, 2566, 807, 329, 1638, 1967, 648, 2520, 3727, 3109, 2116, 2927, 2491, 1939, 3365, 1709, 2728, 3815,
+ 2037, 3120, 831, 1405, 1896, 3592, 1622, 2369, 2864, 2151, 1107, 2542, 3532, 1410, 3917, 427, 3568, 709, 2509, 1503, 1037, 2973, 2436, 1604, 4035, 2594, 563, 1819, 2659, 1234, 4004, 2565, 1511, 2273, 1823, 336, 882, 3772, 575, 1628, 171, 3570, 1120, 2260, 2716, 935, 3064, 1806, 1342, 3144, 3900, 2744, 3296, 985, 1546, 238, 896, 1663, 305, 3660, 695, 2213, 960, 3407,
+ 144, 1795, 3894, 2267, 51, 2708, 1023, 3818, 366, 1821, 4087, 2985, 755, 2057, 2912, 949, 1583, 2774, 231, 3447, 2258, 3866, 1982, 672, 1225, 2077, 3320, 1062, 370, 3241, 1968, 7, 3068, 681, 3631, 2573, 1567, 3175, 2321, 1067, 3070, 722, 1856, 3744, 642, 1471, 4084, 131, 3514, 2443, 531, 1227, 155, 2265, 4024, 2658, 3326, 3910, 1168, 3078, 1530, 3956, 489, 1424,
+ 3647, 1203, 420, 2924, 3755, 719, 3248, 1376, 3067, 890, 196, 1559, 3269, 270, 2432, 1885, 3212, 1164, 3778, 1752, 579, 1338, 344, 3585, 3017, 288, 3658, 2371, 3882, 1691, 611, 2789, 3809, 1339, 389, 2950, 2015, 59, 3548, 2751, 2158, 4011, 1352, 29, 3388, 2370, 2812, 1946, 954, 2110, 1558, 2947, 3573, 1909, 1326, 679, 1853, 2312, 551, 2702, 33, 2414, 3209, 2824,
+ 2547, 2143, 3379, 966, 1492, 1979, 2479, 463, 2194, 3657, 2738, 2318, 1261, 3713, 604, 4002, 11, 2192, 2967, 919, 2607, 3369, 2837, 1676, 2539, 984, 1568, 93, 2901, 1318, 3538, 1041, 2216, 1756, 3454, 1030, 4050, 1402, 798, 1723, 311, 3277, 2546, 2886, 2043, 461, 1206, 3677, 361, 3260, 3988, 809, 2605, 470, 3007, 3517, 102, 3221, 1398, 2062, 3611, 1134, 1928, 865,
+ 4060, 621, 1710, 2606, 3510, 317, 4017, 1682, 3329, 1159, 1940, 654, 3461, 1789, 1015, 2691, 1455, 3599, 374, 1947, 4069, 71, 2126, 763, 3961, 2278, 3161, 1997, 824, 2623, 2080, 244, 3257, 780, 2732, 2308, 545, 3351, 2476, 3806, 1204, 588, 1591, 963, 3610, 1699, 754, 3049, 2651, 1106, 65, 2221, 1644, 3821, 1100, 2463, 1614, 3801, 965, 2965, 715, 3394, 1593, 212,
+};
+
+#endif /* BLUE_NOISE_64X64_H */
diff --git a/pixman/dither/make-blue-noise.c b/pixman/dither/make-blue-noise.c
new file mode 100644
index 0000000..f9974b4
--- /dev/null
+++ b/pixman/dither/make-blue-noise.c
@@ -0,0 +1,679 @@
+/* Blue noise generation using the void-and-cluster method as described in
+ *
+ * The void-and-cluster method for dither array generation
+ * Ulichney, Robert A (1993)
+ *
+ * http://cv.ulichney.com/papers/1993-void-cluster.pdf
+ *
+ * Note that running with OpenMP (-DUSE_OPENMP) makes the output
+ * non-deterministic from run to run: reductions computed in parallel break
+ * ties between equally good candidates in an unspecified order.  It is not
+ * recommended unless generating very large dither arrays.
+ */
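+
+/* In outline, the code below:
+ *
+ *   1. Fills a binary pattern with sparse white noise, then relaxes it by
+ *      repeatedly swapping the tightest cluster of set pixels with the
+ *      largest void (generate_initial_binary_pattern).
+ *   2. Phase 1: removes pixels one by one from the tightest remaining
+ *      cluster, assigning decreasing ranks.
+ *   3. Phases 2 & 3: re-inserts pixels into the largest remaining void,
+ *      assigning increasing ranks until the array is full.
+ *
+ * The final rank of each cell is its dither threshold.
+ */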
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <math.h>
+#include <stdio.h>
+
+/* Booleans and utility functions */
+
+#ifndef TRUE
+# define TRUE 1
+#endif
+
+#ifndef FALSE
+# define FALSE 0
+#endif
+
+typedef int bool_t;
+
+int
+imin (int x, int y)
+{
+ return x < y ? x : y;
+}
+
+/* Memory allocation */
+void *
+malloc_abc (unsigned int a, unsigned int b, unsigned int c)
+{
+ if (a >= INT32_MAX / b)
+ return NULL;
+ else if (a * b >= INT32_MAX / c)
+ return NULL;
+ else
+ return malloc (a * b * c);
+}
+
+/* Random number generation */
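+/*
+ * The generator is Marsaglia's xorwow: four words of xorshift state plus a
+ * Weyl-style counter that is bumped by 362437 on every draw.  xorwow_float
+ * keeps the top 23 bits of a draw and scales them to a float in [0, 1].
+ */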
+typedef uint32_t xorwow_state_t[5];
+
+uint32_t
+xorwow_next (xorwow_state_t *state)
+{
+ uint32_t s = (*state)[0],
+ t = (*state)[3];
+ (*state)[3] = (*state)[2];
+ (*state)[2] = (*state)[1];
+ (*state)[1] = s;
+
+ t ^= t >> 2;
+ t ^= t << 1;
+ t ^= s ^ (s << 4);
+
+ (*state)[0] = t;
+ (*state)[4] += 362437;
+
+ return t + (*state)[4];
+}
+
+float
+xorwow_float (xorwow_state_t *s)
+{
+ return (xorwow_next (s) >> 9) / (float)((1 << 23) - 1);
+}
+
+/* Floating point matrices
+ *
+ * Used to cache the cluster sizes.
+ */
+typedef struct matrix_t {
+ int width;
+ int height;
+ float *buffer;
+} matrix_t;
+
+bool_t
+matrix_init (matrix_t *matrix, int width, int height)
+{
+ float *buffer;
+
+ if (!matrix)
+ return FALSE;
+
+ buffer = malloc_abc (width, height, sizeof (float));
+
+ if (!buffer)
+ return FALSE;
+
+ matrix->buffer = buffer;
+ matrix->width = width;
+ matrix->height = height;
+
+ return TRUE;
+}
+
+bool_t
+matrix_copy (matrix_t *dst, matrix_t const *src)
+{
+ float *srcbuf = src->buffer,
+ *srcend = src->buffer + src->width * src->height,
+ *dstbuf = dst->buffer;
+
+ if (dst->width != src->width || dst->height != src->height)
+ return FALSE;
+
+ while (srcbuf < srcend)
+ *dstbuf++ = *srcbuf++;
+
+ return TRUE;
+}
+
+float *
+matrix_get (matrix_t *matrix, int x, int y)
+{
+ return &matrix->buffer[y * matrix->width + x];
+}
+
+void
+matrix_destroy (matrix_t *matrix)
+{
+ free (matrix->buffer);
+}
+
+/* Binary patterns */
+typedef struct pattern_t {
+ int width;
+ int height;
+ bool_t *buffer;
+} pattern_t;
+
+bool_t
+pattern_init (pattern_t *pattern, int width, int height)
+{
+ bool_t *buffer;
+
+ if (!pattern)
+ return FALSE;
+
+ buffer = malloc_abc (width, height, sizeof (bool_t));
+
+ if (!buffer)
+ return FALSE;
+
+ pattern->buffer = buffer;
+ pattern->width = width;
+ pattern->height = height;
+
+ return TRUE;
+}
+
+bool_t
+pattern_copy (pattern_t *dst, pattern_t const *src)
+{
+ bool_t *srcbuf = src->buffer,
+ *srcend = src->buffer + src->width * src->height,
+ *dstbuf = dst->buffer;
+
+ if (dst->width != src->width || dst->height != src->height)
+ return FALSE;
+
+ while (srcbuf < srcend)
+ *dstbuf++ = *srcbuf++;
+
+ return TRUE;
+}
+
+bool_t *
+pattern_get (pattern_t *pattern, int x, int y)
+{
+ return &pattern->buffer[y * pattern->width + x];
+}
+
+void
+pattern_fill_white_noise (pattern_t *pattern, float fraction,
+ xorwow_state_t *s)
+{
+ bool_t *buffer = pattern->buffer;
+ bool_t *end = buffer + (pattern->width * pattern->height);
+
+ while (buffer < end)
+ *buffer++ = xorwow_float (s) < fraction;
+}
+
+void
+pattern_destroy (pattern_t *pattern)
+{
+ free (pattern->buffer);
+}
+
+/* Dither arrays */
+typedef struct array_t {
+ int width;
+ int height;
+ uint32_t *buffer;
+} array_t;
+
+bool_t
+array_init (array_t *array, int width, int height)
+{
+ uint32_t *buffer;
+
+ if (!array)
+ return FALSE;
+
+ buffer = malloc_abc (width, height, sizeof (uint32_t));
+
+ if (!buffer)
+ return FALSE;
+
+ array->buffer = buffer;
+ array->width = width;
+ array->height = height;
+
+ return TRUE;
+}
+
+uint32_t *
+array_get (array_t *array, int x, int y)
+{
+ return &array->buffer[y * array->width + x];
+}
+
+bool_t
+array_save_ppm (array_t *array, const char *filename)
+{
+ FILE *f = fopen(filename, "wb");
+
+ int i = 0;
+ int bpp = 2;
+ uint8_t buffer[1024];
+
+ if (!f)
+ return FALSE;
+
+ if (array->width * array->height - 1 < 256)
+ bpp = 1;
+
+ fprintf(f, "P5 %d %d %d\n", array->width, array->height,
+ array->width * array->height - 1);
+ while (i < array->width * array->height)
+ {
+ int j = 0;
+ for (; j < 1024 / bpp && i + j < array->width * array->height; ++j)
+ {
+ uint32_t v = array->buffer[i + j];
+ if (bpp == 2)
+ {
+ /* P5 with maxval > 255 stores each sample most significant byte first */
+ buffer[2 * j] = (v & 0xff00) >> 8;
+ buffer[2 * j + 1] = v & 0xff;
+ } else {
+ buffer[j] = v;
+ }
+ }
+
+ fwrite((void *)buffer, bpp, j, f);
+ i += j;
+ }
+
+ if (fclose(f) != 0)
+ return FALSE;
+
+ return TRUE;
+}
+
+bool_t
+array_save (array_t *array, const char *filename)
+{
+ int x, y;
+ FILE *f = fopen(filename, "wb");
+
+ if (!f)
+ return FALSE;
+
+ fprintf (f,
+"/* WARNING: This file is generated by make-blue-noise.c\n"
+" * Please edit that file instead of this one.\n"
+" */\n"
+"\n"
+"#ifndef BLUE_NOISE_%dX%d_H\n"
+"#define BLUE_NOISE_%dX%d_H\n"
+"\n"
+"#include <stdint.h>\n"
+"\n", array->width, array->height, array->width, array->height);
+
+ fprintf (f, "static const uint16_t dither_blue_noise_%dx%d[%d] = {\n",
+ array->width, array->height, array->width * array->height);
+
+ for (y = 0; y < array->height; ++y)
+ {
+ fprintf (f, " ");
+ for (x = 0; x < array->width; ++x)
+ {
+ if (x != 0)
+ fprintf (f, ", ");
+
+ fprintf (f, "%d", *array_get (array, x, y));
+ }
+
+ fprintf (f, ",\n");
+ }
+ fprintf (f, "};\n");
+
+ fprintf (f, "\n#endif /* BLUE_NOISE_%dX%d_H */\n",
+ array->width, array->height);
+
+ if (fclose(f) != 0)
+ return FALSE;
+
+ return TRUE;
+}
+
+void
+array_destroy (array_t *array)
+{
+ free (array->buffer);
+}
+
+/* Dither array generation */
+bool_t
+compute_cluster_sizes (pattern_t *pattern, matrix_t *matrix)
+{
+ int width = pattern->width,
+ height = pattern->height;
+
+ if (matrix->width != width || matrix->height != height)
+ return FALSE;
+
+ int px, py, qx, qy, dx, dy;
+ float tsqsi = 2.f * 1.5f * 1.5f;
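+
+ /* Each matrix cell caches its pixel's "cluster energy":
+  *
+  *     E(p) = sum over q with pattern(q) == pattern(p) of
+  *            exp (-d(p,q)^2 / (2 * sigma^2)),    sigma = 1.5
+  *
+  * where distances wrap around the pattern edges (the imin () terms
+  * below); tsqsi is 2 * sigma^2.
+  */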
+
+#ifdef USE_OPENMP
+#pragma omp parallel for default (none) \
+ private (py, px, qy, qx, dx, dy) \
+ shared (height, width, pattern, matrix, tsqsi)
+#endif
+ for (py = 0; py < height; ++py)
+ {
+ for (px = 0; px < width; ++px)
+ {
+ bool_t pixel = *pattern_get (pattern, px, py);
+ float dist = 0.f;
+
+ for (qx = 0; qx < width; ++qx)
+ {
+ dx = imin (abs (qx - px), width - abs (qx - px));
+ dx = dx * dx;
+
+ for (qy = 0; qy < height; ++qy)
+ {
+ dy = imin (abs (qy - py), height - abs (qy - py));
+ dy = dy * dy;
+
+ dist += (pixel == *pattern_get (pattern, qx, qy))
+ * expf (- (dx + dy) / tsqsi);
+ }
+ }
+
+ *matrix_get (matrix, px, py) = dist;
+ }
+ }
+
+ return TRUE;
+}
+
+bool_t
+swap_pixel (pattern_t *pattern, matrix_t *matrix, int x, int y)
+{
+ int width = pattern->width,
+ height = pattern->height;
+
+ bool_t new;
+
+ float f,
+ dist = 0.f,
+ tsqsi = 2.f * 1.5f * 1.5f;
+
+ int px, py, dx, dy;
+ bool_t b;
+
+ new = !*pattern_get (pattern, x, y);
+ *pattern_get (pattern, x, y) = new;
+
+ if (matrix->width != width || matrix->height != height)
+ return FALSE;
+
+
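+ /* Update the cached energies incrementally rather than recomputing them:
+  * flipping (x, y) to "new" adds f = exp (-d^2 / (2 * sigma^2)) to every
+  * cell that now matches the flipped pixel and subtracts f from every cell
+  * that no longer does -- the (2 * b - 1) * f term.  The flipped cell's
+  * own energy is rebuilt from scratch in "dist".
+  */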
+#ifdef USE_OPENMP
+#pragma omp parallel for reduction (+:dist) default (none) \
+ private (px, py, dx, dy, b, f) \
+ shared (x, y, width, height, pattern, matrix, new, tsqsi)
+#endif
+ for (py = 0; py < height; ++py)
+ {
+ dy = imin (abs (py - y), height - abs (py - y));
+ dy = dy * dy;
+
+ for (px = 0; px < width; ++px)
+ {
+ dx = imin (abs (px - x), width - abs (px - x));
+ dx = dx * dx;
+
+ b = (*pattern_get (pattern, px, py) == new);
+ f = expf (- (dx + dy) / tsqsi);
+ *matrix_get (matrix, px, py) += (2 * b - 1) * f;
+
+ dist += b * f;
+ }
+ }
+
+ *matrix_get (matrix, x, y) = dist;
+ return TRUE;
+}
+
+void
+largest_cluster (pattern_t *pattern, matrix_t *matrix,
+ bool_t pixel, int *xmax, int *ymax)
+{
+ int width = pattern->width,
+ height = pattern->height;
+
+ int x, y;
+
+ float vmax = -INFINITY;
+
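+ /* Under OpenMP each thread keeps a local best candidate; after the max
+  * reduction, any thread whose local best equals the global vmax publishes
+  * its coordinates.  When several cells tie, the winner depends on thread
+  * scheduling -- the nondeterminism noted at the top of this file.
+  */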
+#ifdef USE_OPENMP
+#pragma omp parallel default (none) \
+ private (x, y) \
+ shared (height, width, pattern, matrix, pixel, xmax, ymax, vmax)
+#endif
+ {
+ int xbest = -1,
+ ybest = -1;
+
+#ifdef USE_OPENMP
+ float vbest = -INFINITY;
+
+#pragma omp for reduction (max: vmax) collapse (2)
+#endif
+ for (y = 0; y < height; ++y)
+ {
+ for (x = 0; x < width; ++x)
+ {
+ if (*pattern_get (pattern, x, y) != pixel)
+ continue;
+
+ if (*matrix_get (matrix, x, y) > vmax)
+ {
+ vmax = *matrix_get (matrix, x, y);
+#ifdef USE_OPENMP
+ vbest = vmax;
+#endif
+ xbest = x;
+ ybest = y;
+ }
+ }
+ }
+
+#ifdef USE_OPENMP
+#pragma omp barrier
+#pragma omp critical
+ {
+ if (vmax == vbest)
+ {
+ *xmax = xbest;
+ *ymax = ybest;
+ }
+ }
+#else
+ *xmax = xbest;
+ *ymax = ybest;
+#endif
+ }
+
+ assert (vmax > -INFINITY);
+}
+
+void
+generate_initial_binary_pattern (pattern_t *pattern, matrix_t *matrix)
+{
+ int xcluster = 0,
+ ycluster = 0,
+ xvoid = 0,
+ yvoid = 0;
+
+ for (;;)
+ {
+ largest_cluster (pattern, matrix, TRUE, &xcluster, &ycluster);
+ assert (*pattern_get (pattern, xcluster, ycluster) == TRUE);
+ swap_pixel (pattern, matrix, xcluster, ycluster);
+
+ largest_cluster (pattern, matrix, FALSE, &xvoid, &yvoid);
+ assert (*pattern_get (pattern, xvoid, yvoid) == FALSE);
+ swap_pixel (pattern, matrix, xvoid, yvoid);
+
+ if (xcluster == xvoid && ycluster == yvoid)
+ return;
+ }
+}
+
+bool_t
+generate_dither_array (array_t *array,
+ pattern_t const *prototype, matrix_t const *matrix,
+ pattern_t *temp_pattern, matrix_t *temp_matrix)
+{
+ int width = prototype->width,
+ height = prototype->height;
+
+ int x, y, rank;
+
+ int initial_rank = 0;
+
+ if (array->width != width || array->height != height)
+ return FALSE;
+
+ // Make copies of the prototype and associated sizes matrix since we will
+ // trash them
+ if (!pattern_copy (temp_pattern, prototype))
+ return FALSE;
+
+ if (!matrix_copy (temp_matrix, matrix))
+ return FALSE;
+
+ // Compute initial rank
+ for (y = 0; y < height; ++y)
+ {
+ for (x = 0; x < width; ++x)
+ {
+ if (*pattern_get (temp_pattern, x, y))
+ initial_rank += 1;
+
+ *array_get (array, x, y) = 0;
+ }
+ }
+
+ // Phase 1
+ for (rank = initial_rank; rank > 0; --rank)
+ {
+ largest_cluster (temp_pattern, temp_matrix, TRUE, &x, &y);
+ swap_pixel (temp_pattern, temp_matrix, x, y);
+ *array_get (array, x, y) = rank - 1;
+ }
+
+ // Make copies again for phases 2 & 3
+ if (!pattern_copy (temp_pattern, prototype))
+ return FALSE;
+
+ if (!matrix_copy (temp_matrix, matrix))
+ return FALSE;
+
+ // Phase 2 & 3
+ for (rank = initial_rank; rank < width * height; ++rank)
+ {
+ largest_cluster (temp_pattern, temp_matrix, FALSE, &x, &y);
+ swap_pixel (temp_pattern, temp_matrix, x, y);
+ *array_get (array, x, y) = rank;
+ }
+
+ return TRUE;
+}
+
+bool_t
+generate (int size, xorwow_state_t *s,
+ char const *c_filename, char const *ppm_filename)
+{
+ bool_t ok = TRUE;
+
+ pattern_t prototype, temp_pattern;
+ array_t array;
+ matrix_t matrix, temp_matrix;
+
+ printf ("Generating %dx%d blue noise...\n", size, size);
+
+ if (!pattern_init (&prototype, size, size))
+ return FALSE;
+
+ if (!pattern_init (&temp_pattern, size, size))
+ {
+ pattern_destroy (&prototype);
+ return FALSE;
+ }
+
+ if (!matrix_init (&matrix, size, size))
+ {
+ pattern_destroy (&temp_pattern);
+ pattern_destroy (&prototype);
+ return FALSE;
+ }
+
+ if (!matrix_init (&temp_matrix, size, size))
+ {
+ matrix_destroy (&matrix);
+ pattern_destroy (&temp_pattern);
+ pattern_destroy (&prototype);
+ return FALSE;
+ }
+
+ if (!array_init (&array, size, size))
+ {
+ matrix_destroy (&temp_matrix);
+ matrix_destroy (&matrix);
+ pattern_destroy (&temp_pattern);
+ pattern_destroy (&prototype);
+ return FALSE;
+ }
+
+ printf("Filling initial binary pattern with white noise...\n");
+ pattern_fill_white_noise (&prototype, .1, s);
+
+ printf("Initializing cluster sizes...\n");
+ if (!compute_cluster_sizes (&prototype, &matrix))
+ {
+ fprintf (stderr, "Error while computing cluster sizes\n");
+ ok = FALSE;
+ goto out;
+ }
+
+ printf("Generating initial binary pattern...\n");
+ generate_initial_binary_pattern (&prototype, &matrix);
+
+ printf("Generating dither array...\n");
+ if (!generate_dither_array (&array, &prototype, &matrix,
+ &temp_pattern, &temp_matrix))
+ {
+ fprintf (stderr, "Error while generating dither array\n");
+ ok = FALSE;
+ goto out;
+ }
+
+ printf("Saving dither array...\n");
+ if (!array_save (&array, c_filename))
+ {
+ fprintf (stderr, "Error saving dither array\n");
+ ok = FALSE;
+ goto out;
+ }
+
+#if SAVE_PPM
+ if (!array_save_ppm (&array, ppm_filename))
+ {
+ fprintf (stderr, "Error saving dither array PPM\n");
+ ok = FALSE;
+ goto out;
+ }
+#else
+ (void)ppm_filename;
+#endif
+
+ printf("All done!\n");
+
+out:
+ array_destroy (&array);
+ matrix_destroy (&temp_matrix);
+ matrix_destroy (&matrix);
+ pattern_destroy (&temp_pattern);
+ pattern_destroy (&prototype);
+ return ok;
+}
+
+int
+main (void)
+{
+ xorwow_state_t s = {1185956906, 12385940, 983948, 349208051, 901842};
+
+ if (!generate (64, &s, "blue-noise-64x64.h", "blue-noise-64x64.ppm"))
+ return -1;
+
+ return 0;
+}
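+
+/* A minimal sketch of how a table like this is typically consumed for
+ * ordered dithering (illustrative only; pixman's own dither code is not
+ * part of this file):
+ *
+ *     threshold = (dither_blue_noise_64x64[(y & 63) * 64 + (x & 63)] + 0.5)
+ *                 / 4096.0;
+ *     quantized = (int) (value * (levels - 1) + threshold);
+ */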
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 086c6e0..0e79e86 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -209,12 +209,13 @@ _mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
: "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
);
return ret;
+ } else {
+ uint64_t val = ((uint64_t)__w3 << 48)
+ | ((uint64_t)__w2 << 32)
+ | ((uint64_t)__w1 << 16)
+ | ((uint64_t)__w0 << 0);
+ return *(__m64 *)&val;
}
- uint64_t val = ((uint64_t)__w3 << 48)
- | ((uint64_t)__w2 << 32)
- | ((uint64_t)__w1 << 16)
- | ((uint64_t)__w0 << 0);
- return *(__m64 *)&val;
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -236,10 +237,11 @@ _mm_set_pi32 (unsigned __i1, unsigned __i0)
: "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
);
return ret;
+ } else {
+ uint64_t val = ((uint64_t)__i1 << 32)
+ | ((uint64_t)__i0 << 0);
+ return *(__m64 *)&val;
}
- uint64_t val = ((uint64_t)__i1 << 32)
- | ((uint64_t)__i0 << 0);
- return *(__m64 *)&val;
}
#undef _MM_SHUFFLE
diff --git a/pixman/make-srgb.pl b/pixman/make-srgb.pl
index cdaa80b..8bba160 100644
--- a/pixman/make-srgb.pl
+++ b/pixman/make-srgb.pl
@@ -73,7 +73,7 @@ print <<"PROLOG";
#include <stdint.h>
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/pixman/meson.build b/pixman/meson.build
new file mode 100644
index 0000000..62ec66b
--- /dev/null
+++ b/pixman/meson.build
@@ -0,0 +1,143 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+config_h = configure_file(
+ configuration : config,
+ output : 'pixman-config.h'
+)
+
+version_h = configure_file(
+ configuration : version_conf,
+ input : 'pixman-version.h.in',
+ output : 'pixman-version.h',
+ install_dir : join_paths(get_option('prefix'), get_option('includedir'), 'pixman-1')
+)
+
+libpixman_extra_cargs = []
+default_library = get_option('default_library')
+if default_library != 'static' and cc.has_function_attribute('dllexport')
+ libpixman_extra_cargs = ['-DPIXMAN_API=__declspec(dllexport)']
+endif
+
+pixman_simd_libs = []
+simds = [
+ # The mmx library can be compiled with MMX on x86/x86_64, with iwmmxt on
+ # some ARM cores, or with Loongson MMI on Loongson MIPS systems.  All
+ # three variants are built under the same name, "pixman-mmx", which keeps
+ # the build logic simple; no system supports more than one of MMX, iwmmxt,
+ # and MMI, so at most one variant can be built in any given build.
+ ['mmx', have_mmx, mmx_flags, []],
+ ['mmx', have_loongson_mmi, loongson_mmi_flags, []],
+ ['mmx', have_iwmmxt, iwmmxt_flags, []],
+
+ ['sse2', have_sse2, sse2_flags, []],
+ ['ssse3', have_ssse3, ssse3_flags, []],
+ ['vmx', have_vmx, vmx_flags, []],
+ ['arm-simd', have_armv6_simd, [],
+ ['pixman-arm-simd-asm.S', 'pixman-arm-simd-asm-scaled.S']],
+ ['arm-neon', have_neon, [],
+ ['pixman-arm-neon-asm.S', 'pixman-arm-neon-asm-bilinear.S']],
+ ['arm-neon', have_a64neon, [],
+ ['pixman-arma64-neon-asm.S', 'pixman-arma64-neon-asm-bilinear.S']],
+ ['mips-dspr2', have_mips_dspr2, mips_dspr2_flags,
+ ['pixman-mips-dspr2-asm.S', 'pixman-mips-memcpy-asm.S']],
+]
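+ # Each entry is [name, enabled, extra c_args, extra assembly sources]; the
+ # loop below builds one static helper library per enabled entry, and all of
+ # them are linked into libpixman-1 further down.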
+
+foreach simd : simds
+ if simd[1]
+ name = 'pixman-' + simd[0]
+ pixman_simd_libs += static_library(
+ name,
+ [name + '.c', config_h, version_h, simd[3]],
+ c_args : simd[2]
+ )
+ endif
+endforeach
+
+pixman_files = files(
+ 'pixman.c',
+ 'pixman-access.c',
+ 'pixman-access-accessors.c',
+ 'pixman-bits-image.c',
+ 'pixman-combine32.c',
+ 'pixman-combine-float.c',
+ 'pixman-conical-gradient.c',
+ 'pixman-filter.c',
+ 'pixman-x86.c',
+ 'pixman-mips.c',
+ 'pixman-arm.c',
+ 'pixman-ppc.c',
+ 'pixman-edge.c',
+ 'pixman-edge-accessors.c',
+ 'pixman-fast-path.c',
+ 'pixman-glyph.c',
+ 'pixman-general.c',
+ 'pixman-gradient-walker.c',
+ 'pixman-image.c',
+ 'pixman-implementation.c',
+ 'pixman-linear-gradient.c',
+ 'pixman-matrix.c',
+ 'pixman-noop.c',
+ 'pixman-radial-gradient.c',
+ 'pixman-region16.c',
+ 'pixman-region32.c',
+ 'pixman-solid-fill.c',
+ 'pixman-timer.c',
+ 'pixman-trap.c',
+ 'pixman-utils.c',
+)
+
+# Android cpu-features
+cpu_features_path = get_option('cpu-features-path')
+cpu_features_sources = []
+cpu_features_inc = []
+if cpu_features_path != ''
+ message('Using cpu-features.[ch] from ' + cpu_features_path)
+ cpu_features_sources = files(
+ cpu_features_path / 'cpu-features.h',
+ cpu_features_path / 'cpu-features.c',
+ )
+ cpu_features_inc = include_directories(cpu_features_path)
+endif
+
+libpixman = library(
+ 'pixman-1',
+ [pixman_files, config_h, version_h, cpu_features_sources],
+ link_with: pixman_simd_libs,
+ c_args : libpixman_extra_cargs,
+ dependencies : [dep_m, dep_threads],
+ include_directories : cpu_features_inc,
+ version : meson.project_version(),
+ install : true,
+)
+
+inc_pixman = include_directories('.')
+
+idep_pixman = declare_dependency(
+ link_with: libpixman,
+ include_directories : inc_pixman,
+)
+
+if meson.version().version_compare('>= 0.54.0')
+ meson.override_dependency('pixman-1', idep_pixman)
+endif
+
+install_headers('pixman.h', subdir : 'pixman-1')
diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c
index 4f0642d..7bd7a5a 100644
--- a/pixman/pixman-access.c
+++ b/pixman/pixman-access.c
@@ -25,7 +25,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
@@ -68,14 +68,14 @@
#ifdef WORDS_BIGENDIAN
#define FETCH_24(img,l,o) \
- ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16) | \
- (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \
- (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0))
+ ((uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16) | \
+ (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \
+ (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0))
#else
#define FETCH_24(img,l,o) \
- ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0) | \
- (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \
- (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16))
+ ((uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0) | \
+ (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \
+ (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16))
#endif
/* Store macros */
@@ -87,7 +87,7 @@
uint32_t *__d = ((uint32_t *)(l)) + ((o) >> 5); \
uint32_t __m, __v; \
\
- __m = 1 << (0x1f - ((o) & 0x1f)); \
+ __m = 1U << (0x1f - ((o) & 0x1f)); \
__v = (v)? __m : 0; \
\
WRITE((img), __d, (READ((img), __d) & ~__m) | __v); \
@@ -100,7 +100,7 @@
uint32_t *__d = ((uint32_t *)(l)) + ((o) >> 5); \
uint32_t __m, __v; \
\
- __m = 1 << ((o) & 0x1f); \
+ __m = 1U << ((o) & 0x1f); \
__v = (v)? __m : 0; \
\
WRITE((img), __d, (READ((img), __d) & ~__m) | __v); \
@@ -465,7 +465,7 @@ convert_and_store_pixel (bits_image_t * image,
image, bits, offset, PIXMAN_ ## format); \
} \
\
- static const void *const __dummy__ ## format
+ static const void *const __dummy__ ## format MAYBE_UNUSED
MAKE_ACCESSORS(a8r8g8b8);
MAKE_ACCESSORS(x8r8g8b8);
@@ -610,6 +610,32 @@ fetch_scanline_a8r8g8b8_sRGB_float (bits_image_t * image,
}
}
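+/* Fetch a row of 24bpp sRGB pixels into a linear-light float buffer.
+ * to_linear[] is pixman's 8-bit sRGB -> linear lookup table (built by
+ * make-srgb.pl); alpha is implicitly opaque. */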
+static void
+fetch_scanline_r8g8b8_sRGB_float (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * b,
+ const uint32_t *mask)
+{
+ const uint8_t *bits = (uint8_t *)(image->bits + y * image->rowstride);
+ argb_t *buffer = (argb_t *)b;
+ int i;
+ for (i = x; i < x + width; ++i) /* fetch width pixels starting at x */
+ {
+ uint32_t p = FETCH_24 (image, bits, i);
+ argb_t *argb = buffer;
+
+ argb->a = 1.0f;
+
+ argb->r = to_linear[(p >> 16) & 0xff];
+ argb->g = to_linear[(p >> 8) & 0xff];
+ argb->b = to_linear[(p >> 0) & 0xff];
+
+ buffer++;
+ }
+}
+
/* Expects a float buffer */
static void
fetch_scanline_a2r10g10b10_float (bits_image_t * image,
@@ -642,6 +668,48 @@ fetch_scanline_a2r10g10b10_float (bits_image_t * image,
}
/* Expects a float buffer */
+#ifndef PIXMAN_FB_ACCESSORS
+static void
+fetch_scanline_rgbf_float (bits_image_t *image,
+ int x,
+ int y,
+ int width,
+ uint32_t * b,
+ const uint32_t *mask)
+{
+ const float *bits = (float *)image->bits + y * image->rowstride;
+ const float *pixel = bits + x * 3;
+ argb_t *buffer = (argb_t *)b;
+
+ for (; width--; buffer++) {
+ buffer->r = *pixel++;
+ buffer->g = *pixel++;
+ buffer->b = *pixel++;
+ buffer->a = 1.f;
+ }
+}
+
+static void
+fetch_scanline_rgbaf_float (bits_image_t *image,
+ int x,
+ int y,
+ int width,
+ uint32_t * b,
+ const uint32_t *mask)
+{
+ const float *bits = (float *)image->bits + y * image->rowstride;
+ const float *pixel = bits + x * 4;
+ argb_t *buffer = (argb_t *)b;
+
+ for (; width--; buffer++) {
+ buffer->r = *pixel++;
+ buffer->g = *pixel++;
+ buffer->b = *pixel++;
+ buffer->a = *pixel++;
+ }
+}
+#endif
+
static void
fetch_scanline_x2r10g10b10_float (bits_image_t *image,
int x,
@@ -805,6 +873,40 @@ fetch_scanline_yv12 (bits_image_t *image,
/**************************** Pixel wise fetching *****************************/
+#ifndef PIXMAN_FB_ACCESSORS
+static argb_t
+fetch_pixel_rgbf_float (bits_image_t *image,
+ int offset,
+ int line)
+{
+ float *bits = (float *)image->bits + line * image->rowstride;
+ argb_t argb;
+
+ argb.r = bits[offset * 3];
+ argb.g = bits[offset * 3 + 1];
+ argb.b = bits[offset * 3 + 2];
+ argb.a = 1.f;
+
+ return argb;
+}
+
+static argb_t
+fetch_pixel_rgbaf_float (bits_image_t *image,
+ int offset,
+ int line)
+{
+ float *bits = (float *)image->bits + line * image->rowstride;
+ argb_t argb;
+
+ argb.r = bits[offset * 4];
+ argb.g = bits[offset * 4 + 1];
+ argb.b = bits[offset * 4 + 2];
+ argb.a = bits[offset * 4 + 3];
+
+ return argb;
+}
+#endif
+
static argb_t
fetch_pixel_x2r10g10b10_float (bits_image_t *image,
int offset,
@@ -905,6 +1007,24 @@ fetch_pixel_a8r8g8b8_sRGB_float (bits_image_t *image,
return argb;
}
+static argb_t
+fetch_pixel_r8g8b8_sRGB_float (bits_image_t *image,
+ int offset,
+ int line)
+{
+ uint8_t *bits = (uint8_t *)(image->bits + line * image->rowstride);
+ uint32_t p = FETCH_24 (image, bits, offset);
+ argb_t argb;
+
+ argb.a = 1.0f;
+
+ argb.r = to_linear[(p >> 16) & 0xff];
+ argb.g = to_linear[(p >> 8) & 0xff];
+ argb.b = to_linear[(p >> 0) & 0xff];
+
+ return argb;
+}
+
static uint32_t
fetch_pixel_yuy2 (bits_image_t *image,
int offset,
@@ -962,6 +1082,45 @@ fetch_pixel_yv12 (bits_image_t *image,
/*********************************** Store ************************************/
+#ifndef PIXMAN_FB_ACCESSORS
+static void
+store_scanline_rgbaf_float (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *v)
+{
+ float *bits = (float *)image->bits + image->rowstride * y + 4 * x;
+ const argb_t *values = (argb_t *)v;
+
+ for (; width; width--, values++)
+ {
+ *bits++ = values->r;
+ *bits++ = values->g;
+ *bits++ = values->b;
+ *bits++ = values->a;
+ }
+}
+
+static void
+store_scanline_rgbf_float (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *v)
+{
+ float *bits = (float *)image->bits + image->rowstride * y + 3 * x;
+ const argb_t *values = (argb_t *)v;
+
+ for (; width; width--, values++)
+ {
+ *bits++ = values->r;
+ *bits++ = values->g;
+ *bits++ = values->b;
+ }
+}
+#endif
+
static void
store_scanline_a2r10g10b10_float (bits_image_t * image,
int x,
@@ -976,7 +1135,7 @@ store_scanline_a2r10g10b10_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint16_t a, r, g, b;
+ uint32_t a, r, g, b;
a = pixman_float_to_unorm (values[i].a, 2);
r = pixman_float_to_unorm (values[i].r, 10);
@@ -1002,7 +1161,7 @@ store_scanline_x2r10g10b10_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint16_t r, g, b;
+ uint32_t r, g, b;
r = pixman_float_to_unorm (values[i].r, 10);
g = pixman_float_to_unorm (values[i].g, 10);
@@ -1027,7 +1186,7 @@ store_scanline_a2b10g10r10_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint16_t a, r, g, b;
+ uint32_t a, r, g, b;
a = pixman_float_to_unorm (values[i].a, 2);
r = pixman_float_to_unorm (values[i].r, 10);
@@ -1053,7 +1212,7 @@ store_scanline_x2b10g10r10_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint16_t r, g, b;
+ uint32_t r, g, b;
r = pixman_float_to_unorm (values[i].r, 10);
g = pixman_float_to_unorm (values[i].g, 10);
@@ -1078,7 +1237,7 @@ store_scanline_a8r8g8b8_sRGB_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
a = pixman_float_to_unorm (values[i].a, 8);
r = to_srgb (values[i].r);
@@ -1090,6 +1249,31 @@ store_scanline_a8r8g8b8_sRGB_float (bits_image_t * image,
}
}
+static void
+store_scanline_r8g8b8_sRGB_float (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *v)
+{
+ uint8_t *bits = (uint8_t *)(image->bits + image->rowstride * y) + 3 * x;
+ argb_t *values = (argb_t *)v;
+ int i;
+
+ for (i = 0; i < width; ++i)
+ {
+ uint32_t r, g, b, rgb;
+
+ r = to_srgb (values[i].r);
+ g = to_srgb (values[i].g);
+ b = to_srgb (values[i].b);
+
+ rgb = (r << 16) | (g << 8) | b;
+
+ STORE_24 (image, bits, i, rgb);
+ }
+}
+
/*
* Contracts a floating point image to 32bpp and then stores it using a
* regular 32-bit store proc. Despite the type, this function expects an
@@ -1151,7 +1335,7 @@ fetch_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
while (pixel < end)
{
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
tmp = READ (image, pixel++);
@@ -1168,6 +1352,37 @@ fetch_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
}
}
+static void
+fetch_scanline_r8g8b8_32_sRGB (bits_image_t *image,
+ int x,
+ int y,
+ int width,
+ uint32_t *buffer,
+ const uint32_t *mask)
+{
+ const uint8_t *bits = (uint8_t *)(image->bits + y * image->rowstride) + 3 * x;
+ uint32_t tmp;
+ int i;
+
+ for (i = 0; i < width; ++i)
+ {
+ uint32_t a, r, g, b;
+
+ tmp = FETCH_24 (image, bits, i);
+
+ a = 0xff;
+ r = (tmp >> 16) & 0xff;
+ g = (tmp >> 8) & 0xff;
+ b = (tmp >> 0) & 0xff;
+
+ r = to_linear[r] * 255.0f + 0.5f;
+ g = to_linear[g] * 255.0f + 0.5f;
+ b = to_linear[b] * 255.0f + 0.5f;
+
+ *buffer++ = (a << 24) | (r << 16) | (g << 8) | (b << 0);
+ }
+}
+
static uint32_t
fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image,
int offset,
@@ -1175,7 +1390,7 @@ fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image,
{
uint32_t *bits = image->bits + line * image->rowstride;
uint32_t tmp = READ (image, bits + offset);
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
a = (tmp >> 24) & 0xff;
r = (tmp >> 16) & 0xff;
@@ -1189,6 +1404,27 @@ fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image,
return (a << 24) | (r << 16) | (g << 8) | (b << 0);
}
+static uint32_t
+fetch_pixel_r8g8b8_32_sRGB (bits_image_t *image,
+ int offset,
+ int line)
+{
+ uint8_t *bits = (uint8_t *)(image->bits + line * image->rowstride);
+ uint32_t tmp = FETCH_24 (image, bits, offset);
+ uint32_t a, r, g, b;
+
+ a = 0xff;
+ r = (tmp >> 16) & 0xff;
+ g = (tmp >> 8) & 0xff;
+ b = (tmp >> 0) & 0xff;
+
+ r = to_linear[r] * 255.0f + 0.5f;
+ g = to_linear[g] * 255.0f + 0.5f;
+ b = to_linear[b] * 255.0f + 0.5f;
+
+ return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
static void
store_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
int x,
@@ -1204,7 +1440,7 @@ store_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
for (i = 0; i < width; ++i)
{
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
tmp = values[i];
@@ -1221,6 +1457,36 @@ store_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
}
}
+static void
+store_scanline_r8g8b8_32_sRGB (bits_image_t *image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *v)
+{
+ uint8_t *bits = (uint8_t *)(image->bits + image->rowstride * y) + 3 * x;
+ uint64_t *values = (uint64_t *)v;
+ uint64_t tmp;
+ int i;
+
+ for (i = 0; i < width; ++i)
+ {
+ uint32_t r, g, b;
+
+ tmp = values[i];
+
+ r = (tmp >> 16) & 0xff;
+ g = (tmp >> 8) & 0xff;
+ b = (tmp >> 0) & 0xff;
+
+ r = to_srgb (r * (1/255.0f));
+ g = to_srgb (g * (1/255.0f));
+ b = to_srgb (b * (1/255.0f));
+
+ STORE_24 (image, bits, i, (r << 16) | (g << 8) | (b << 0));
+ }
+}
+
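This hunk also suggests the likely motivation for the uint8_t-to-uint32_t widenings repeated throughout the file: an 8-bit local is promoted to signed int before a shift, so an expression like a << 24 with a == 0xff shifts a set bit into the sign position, which is undefined behaviour in C. Widening the locals makes the shifts plain unsigned arithmetic, as in this sketch:

    #include <stdint.h>

    /* With uint8_t operands, (a << 24) would be evaluated on a
     * promoted signed int and overflow for a >= 0x80; with uint32_t
     * the shift is well defined for all byte values. */
    static uint32_t
    pack_argb (uint32_t a, uint32_t r, uint32_t g, uint32_t b)
    {
        return (a << 24) | (r << 16) | (g << 8) | b;
    }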
static argb_t
fetch_pixel_generic_float (bits_image_t *image,
int offset,
@@ -1294,6 +1560,11 @@ static const format_info_t accessors[] =
fetch_pixel_a8r8g8b8_32_sRGB, fetch_pixel_a8r8g8b8_sRGB_float,
store_scanline_a8r8g8b8_32_sRGB, store_scanline_a8r8g8b8_sRGB_float,
},
+ { PIXMAN_r8g8b8_sRGB,
+ fetch_scanline_r8g8b8_32_sRGB, fetch_scanline_r8g8b8_sRGB_float,
+ fetch_pixel_r8g8b8_32_sRGB, fetch_pixel_r8g8b8_sRGB_float,
+ store_scanline_r8g8b8_32_sRGB, store_scanline_r8g8b8_sRGB_float,
+ },
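Each accessors[] row binds a format code to its six fetch/store procedures. Inferring from the entries visible in this hunk, the row layout is roughly the following (field names are illustrative; pixman-access.c holds the authoritative definition):

    typedef struct
    {
        pixman_format_code_t format;
        fetch_scanline_t     fetch_scanline_32;
        fetch_scanline_t     fetch_scanline_float;
        fetch_pixel_32_t     fetch_pixel_32;
        fetch_pixel_float_t  fetch_pixel_float;
        store_scanline_t     store_scanline_32;
        store_scanline_t     store_scanline_float;
    } format_info_t;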
/* 24bpp formats */
FORMAT_INFO (r8g8b8),
@@ -1351,7 +1622,18 @@ static const format_info_t accessors[] =
FORMAT_INFO (g1),
/* Wide formats */
-
+#ifndef PIXMAN_FB_ACCESSORS
+ { PIXMAN_rgba_float,
+ NULL, fetch_scanline_rgbaf_float,
+ fetch_pixel_generic_lossy_32, fetch_pixel_rgbaf_float,
+ NULL, store_scanline_rgbaf_float },
+
+ { PIXMAN_rgb_float,
+ NULL, fetch_scanline_rgbf_float,
+ fetch_pixel_generic_lossy_32, fetch_pixel_rgbf_float,
+ NULL, store_scanline_rgbf_float },
+#endif
+
{ PIXMAN_a2r10g10b10,
NULL, fetch_scanline_a2r10g10b10_float,
fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10_float,
diff --git a/pixman/pixman-arm-asm.h b/pixman/pixman-arm-asm.h
index ee78541..edf8e82 100644
--- a/pixman/pixman-arm-asm.h
+++ b/pixman/pixman-arm-asm.h
@@ -25,13 +25,39 @@
*
*/
+
+#include "pixman-config.h"
+
+
/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
- .func fname
- .global fname
+.macro pixman_asm_function_impl fname
+#ifdef ASM_HAVE_FUNC_DIRECTIVE
+ .func \fname
+#endif
+ .global \fname
#ifdef __ELF__
- .hidden fname
- .type fname, %function
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+.endm
+
+.macro pixman_asm_function fname
+#ifdef ASM_LEADING_UNDERSCORE
+ pixman_asm_function_impl _\fname
+#else
+ pixman_asm_function_impl \fname
+#endif
+.endm
+
+.macro pixman_syntax_unified
+#ifdef ASM_HAVE_SYNTAX_UNIFIED
+ .syntax unified
+#endif
+.endm
+
+.macro pixman_end_asm_function
+#ifdef ASM_HAVE_FUNC_DIRECTIVE
+ .endfunc
#endif
-fname:
.endm
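With these macros in place, an .S file no longer hard-codes .func/.endfunc or a symbol's underscore prefix; the configure-time ASM_* defines select whatever the target assembler supports. A hypothetical call site (the function name is made up for illustration):

        pixman_syntax_unified

    pixman_asm_function composite_dummy
        bx      lr
    pixman_end_asm_function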
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 3a7cb2b..9537688 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -266,13 +266,6 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op, \
scaled_nearest_scanline_##cputype##_##name##_##op, \
src_type, dst_type, NORMAL)
-/* Provide entries for the fast path table */
-#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
-
#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \
src_type, dst_type) \
void \
@@ -318,9 +311,7 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \
/* Provide entries for the fast path table */
#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (op,s,d,func), \
SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
/*****************************************************************************/
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 0fd92d6..6bd2736 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -68,6 +68,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"
+pixman_syntax_unified
+
/*
* Bilinear macros from pixman-arm-neon-asm.S
*/
@@ -82,28 +84,28 @@
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
- vld1.32 {reg1}, [TMP1], STRIDE
- vld1.32 {reg2}, [TMP1]
+ vld1.32 {\reg1}, [TMP1], STRIDE
+ vld1.32 {\reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
- vld1.32 {reg2[0]}, [TMP1], STRIDE
- vld1.32 {reg2[1]}, [TMP1]
- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+ vld1.32 {\reg2[0]}, [TMP1], STRIDE
+ vld1.32 {\reg2[1]}, [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm
.macro bilinear_load_and_vertical_interpolate_two_8888 \
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
- bilinear_load_8888 reg1, reg2, tmp1
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- bilinear_load_8888 reg3, reg4, tmp2
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_8888 \
@@ -111,9 +113,9 @@
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm
.macro bilinear_load_and_vertical_interpolate_two_0565 \
@@ -125,19 +127,19 @@
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {acc2lo[0]}, [TMP1], STRIDE
- vld1.32 {acc2hi[0]}, [TMP2], STRIDE
- vld1.32 {acc2lo[1]}, [TMP1]
- vld1.32 {acc2hi[1]}, [TMP2]
- convert_0565_to_x888 acc2, reg3, reg2, reg1
- vzip.u8 reg1, reg3
- vzip.u8 reg2, reg4
- vzip.u8 reg3, reg4
- vzip.u8 reg1, reg2
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\acc2lo[1]}, [TMP1]
+ vld1.32 {\acc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip.u8 \reg1, \reg3
+ vzip.u8 \reg2, \reg4
+ vzip.u8 \reg3, \reg4
+ vzip.u8 \reg1, \reg2
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_0565 \
@@ -150,46 +152,46 @@
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
- vld1.32 {xacc2lo[1]}, [TMP1]
- vld1.32 {xacc2hi[1]}, [TMP2]
- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\xacc2lo[1]}, [TMP1]
+ vld1.32 {\xacc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
- vzip.u8 xreg1, xreg3
- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
- vzip.u8 xreg2, xreg4
- vld1.32 {yacc2lo[1]}, [TMP1]
- vzip.u8 xreg3, xreg4
- vld1.32 {yacc2hi[1]}, [TMP2]
- vzip.u8 xreg1, xreg2
- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
- vmull.u8 xacc1, xreg1, d28
- vzip.u8 yreg1, yreg3
- vmlal.u8 xacc1, xreg2, d29
- vzip.u8 yreg2, yreg4
- vmull.u8 xacc2, xreg3, d28
- vzip.u8 yreg3, yreg4
- vmlal.u8 xacc2, xreg4, d29
- vzip.u8 yreg1, yreg2
- vmull.u8 yacc1, yreg1, d28
- vmlal.u8 yacc1, yreg2, d29
- vmull.u8 yacc2, yreg3, d28
- vmlal.u8 yacc2, yreg4, d29
+ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
+ vzip.u8 \xreg1, \xreg3
+ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
+ vzip.u8 \xreg2, \xreg4
+ vld1.32 {\yacc2lo[1]}, [TMP1]
+ vzip.u8 \xreg3, \xreg4
+ vld1.32 {\yacc2hi[1]}, [TMP2]
+ vzip.u8 \xreg1, \xreg2
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ vmull.u8 \xacc1, \xreg1, d28
+ vzip.u8 \yreg1, \yreg3
+ vmlal.u8 \xacc1, \xreg2, d29
+ vzip.u8 \yreg2, \yreg4
+ vmull.u8 \xacc2, \xreg3, d28
+ vzip.u8 \yreg3, \yreg4
+ vmlal.u8 \xacc2, \xreg4, d29
+ vzip.u8 \yreg1, \yreg2
+ vmull.u8 \yacc1, \yreg1, d28
+ vmlal.u8 \yacc1, \yreg2, d29
+ vmull.u8 \yacc2, \yreg3, d28
+ vmlal.u8 \yacc2, \yreg4, d29
.endm
.macro bilinear_store_8888 numpix, tmp1, tmp2
-.if numpix == 4
+.if \numpix == 4
vst1.32 {d0, d1}, [OUT]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d0}, [OUT]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.32 {d0[0]}, [OUT, :32]!
.else
.error bilinear_store_8888 numpix is unsupported
@@ -201,12 +203,12 @@
vuzp.u8 d2, d3
vuzp.u8 d1, d3
vuzp.u8 d0, d2
- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
-.if numpix == 4
+ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
+.if \numpix == 4
vst1.16 {d2}, [OUT]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d2[0]}, [OUT]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.16 {d2[0]}, [OUT]!
.else
.error bilinear_store_0565 numpix is unsupported
@@ -222,20 +224,20 @@
.endm
.macro bilinear_load_mask_8 numpix, mask
-.if numpix == 4
- vld1.32 {mask[0]}, [MASK]!
-.elseif numpix == 2
- vld1.16 {mask[0]}, [MASK]!
-.elseif numpix == 1
- vld1.8 {mask[0]}, [MASK]!
+.if \numpix == 4
+ vld1.32 {\mask[0]}, [MASK]!
+.elseif \numpix == 2
+ vld1.16 {\mask[0]}, [MASK]!
+.elseif \numpix == 1
+ vld1.8 {\mask[0]}, [MASK]!
.else
- .error bilinear_load_mask_8 numpix is unsupported
+ .error bilinear_load_mask_8 \numpix is unsupported
.endif
pld [MASK, #prefetch_offset]
.endm
.macro bilinear_load_mask mask_fmt, numpix, mask
- bilinear_load_mask_&mask_fmt numpix, mask
+ bilinear_load_mask_\()\mask_fmt \numpix, \mask
.endm
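This is the pattern behind most of the assembly churn in this patch: the & concatenation operator is a non-standard extension, so call sites like bilinear_load_mask_&mask_fmt become bilinear_load_mask_\()\mask_fmt, where \mask_fmt substitutes the macro argument and \() marks where the argument name ends so it can be glued to adjacent text. A minimal illustration with a made-up wrapper macro:

    .macro call_loader fmt
        @ with fmt=8 this expands to: bilinear_load_mask_8 1, d4
        bilinear_load_mask_\()\fmt 1, d4
    .endm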
@@ -250,28 +252,28 @@
.endm
.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
-.if numpix == 4
- vld1.32 {dst0, dst1}, [OUT]
-.elseif numpix == 2
- vld1.32 {dst0}, [OUT]
-.elseif numpix == 1
- vld1.32 {dst0[0]}, [OUT]
+.if \numpix == 4
+ vld1.32 {\dst0, \dst1}, [OUT]
+.elseif \numpix == 2
+ vld1.32 {\dst0}, [OUT]
+.elseif \numpix == 1
+ vld1.32 {\dst0[0]}, [OUT]
.else
- .error bilinear_load_dst_8888 numpix is unsupported
+ .error bilinear_load_dst_8888 \numpix is unsupported
.endif
pld [OUT, #(prefetch_offset * 4)]
.endm
.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
+ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm
/*
@@ -290,19 +292,19 @@
.endm
.macro bilinear_duplicate_mask_8 numpix, mask
-.if numpix == 4
- vdup.32 mask, mask[0]
-.elseif numpix == 2
- vdup.16 mask, mask[0]
-.elseif numpix == 1
- vdup.8 mask, mask[0]
+.if \numpix == 4
+ vdup.32 \mask, \mask[0]
+.elseif \numpix == 2
+ vdup.16 \mask, \mask[0]
+.elseif \numpix == 1
+ vdup.8 \mask, \mask[0]
.else
.error bilinear_duplicate_mask_8 is unsupported
.endif
.endm
.macro bilinear_duplicate_mask mask_fmt, numpix, mask
- bilinear_duplicate_mask_&mask_fmt numpix, mask
+ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
.endm
/*
@@ -310,10 +312,10 @@
* Interleave should be done when a mask is enabled or the operator is 'over'.
*/
.macro bilinear_interleave src0, src1, dst0, dst1
- vuzp.8 src0, src1
- vuzp.8 dst0, dst1
- vuzp.8 src0, src1
- vuzp.8 dst0, dst1
+ vuzp.8 \src0, \src1
+ vuzp.8 \dst0, \dst1
+ vuzp.8 \src0, \src1
+ vuzp.8 \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_x_src \
@@ -323,7 +325,7 @@
.macro bilinear_interleave_src_dst_x_over \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_x_add \
@@ -333,26 +335,26 @@
.macro bilinear_interleave_src_dst_8_src \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_8_over \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_8_add \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst \
mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave_src_dst_&mask_fmt&_&op \
- numpix, src0, src1, src01, dst0, dst1, dst01
+ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm
@@ -370,23 +372,23 @@
numpix, src0, src1, src01, mask, \
tmp01, tmp23, tmp45, tmp67
- vmull.u8 tmp01, src0, mask
- vmull.u8 tmp23, src1, mask
+ vmull.u8 \tmp01, \src0, \mask
+ vmull.u8 \tmp23, \src1, \mask
/* bubbles */
- vrshr.u16 tmp45, tmp01, #8
- vrshr.u16 tmp67, tmp23, #8
+ vrshr.u16 \tmp45, \tmp01, #8
+ vrshr.u16 \tmp67, \tmp23, #8
/* bubbles */
- vraddhn.u16 src0, tmp45, tmp01
- vraddhn.u16 src1, tmp67, tmp23
+ vraddhn.u16 \src0, \tmp45, \tmp01
+ vraddhn.u16 \src1, \tmp67, \tmp23
.endm
.macro bilinear_apply_mask_to_src \
mask_fmt, numpix, src0, src1, src01, mask, \
tmp01, tmp23, tmp45, tmp67
- bilinear_apply_mask_to_src_&mask_fmt \
- numpix, src0, src1, src01, mask, \
- tmp01, tmp23, tmp45, tmp67
+ bilinear_apply_mask_to_src_\()\mask_fmt \
+ \numpix, \src0, \src1, \src01, \mask, \
+ \tmp01, \tmp23, \tmp45, \tmp67
.endm
@@ -403,79 +405,79 @@
numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
- vdup.32 tmp8, src1[1]
+ vdup.32 \tmp8, \src1[1]
/* bubbles */
- vmvn.8 tmp8, tmp8
+ vmvn.8 \tmp8, \tmp8
/* bubbles */
- vmull.u8 tmp01, dst0, tmp8
+ vmull.u8 \tmp01, \dst0, \tmp8
/* bubbles */
- vmull.u8 tmp23, dst1, tmp8
+ vmull.u8 \tmp23, \dst1, \tmp8
/* bubbles */
- vrshr.u16 tmp45, tmp01, #8
- vrshr.u16 tmp67, tmp23, #8
+ vrshr.u16 \tmp45, \tmp01, #8
+ vrshr.u16 \tmp67, \tmp23, #8
/* bubbles */
- vraddhn.u16 dst0, tmp45, tmp01
- vraddhn.u16 dst1, tmp67, tmp23
+ vraddhn.u16 \dst0, \tmp45, \tmp01
+ vraddhn.u16 \dst1, \tmp67, \tmp23
/* bubbles */
- vqadd.u8 src01, dst01, src01
+ vqadd.u8 \src01, \dst01, \src01
.endm
.macro bilinear_combine_add \
numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
- vqadd.u8 src01, dst01, src01
+ vqadd.u8 \src01, \dst01, \src01
.endm
.macro bilinear_combine \
op, numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
- bilinear_combine_&op \
- numpix, src0, src1, src01, dst0, dst1, dst01, \
- tmp01, tmp23, tmp45, tmp67, tmp8
+ bilinear_combine_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
+ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
.endm
/*
* Macros for final deinterleaving of destination pixels if needed.
*/
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
- vuzp.8 dst0, dst1
+ vuzp.8 \dst0, \dst1
/* bubbles */
- vuzp.8 dst0, dst1
+ vuzp.8 \dst0, \dst1
.endm
.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
.endm
.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
.endm
.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
+ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
- bilinear_load_&src_fmt d0, d1, d2
- bilinear_load_mask mask_fmt, 1, d4
- bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
+ bilinear_load_\()\src_fmt d0, d1, d2
+ bilinear_load_mask \mask_fmt, 1, d4
+ bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
@@ -483,28 +485,28 @@
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
/* 5 cycles bubble */
- bilinear_duplicate_mask mask_fmt, 1, d4
+ bilinear_duplicate_mask \mask_fmt, 1, d4
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
bilinear_interleave_src_dst \
- mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
+ \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
- mask_fmt, 1, d0, d1, q0, d4, \
+ \mask_fmt, 1, d0, d1, q0, d4, \
q3, q8, q10, q11
bilinear_combine \
- op, 1, d0, d1, q0, d18, d19, q9, \
+ \op, 1, d0, d1, q0, d18, d19, q9, \
q3, q8, q10, q11, d5
- bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
- bilinear_store_&dst_fmt 1, q2, q3
+ bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
+ bilinear_store_\()\dst_fmt 1, q2, q3
.endm
.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
- bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23
- bilinear_load_mask mask_fmt, 2, d4
- bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
+ bilinear_load_mask \mask_fmt, 2, d4
+ bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
@@ -513,24 +515,24 @@
vmlal.u16 q10, d23, d31
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
- bilinear_duplicate_mask mask_fmt, 2, d4
+ bilinear_duplicate_mask \mask_fmt, 2, d4
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
bilinear_interleave_src_dst \
- mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
+ \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
- mask_fmt, 2, d0, d1, q0, d4, \
+ \mask_fmt, 2, d0, d1, q0, d4, \
q3, q8, q10, q11
bilinear_combine \
- op, 2, d0, d1, q0, d18, d19, q9, \
+ \op, 2, d0, d1, q0, d18, d19, q9, \
q3, q8, q10, q11, d5
- bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
- bilinear_store_&dst_fmt 2, q2, q3
+ bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
+ bilinear_store_\()\dst_fmt 2, q2, q3
.endm
.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
- bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
@@ -546,8 +548,8 @@
vmlsl.u16 q2, d6, d30
vmlal.u16 q2, d7, d30
vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
- bilinear_load_mask mask_fmt, 4, d22
- bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
+ bilinear_load_mask \mask_fmt, 4, d22
+ bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
pld [TMP1, PF_OFFS]
vmlsl.u16 q8, d18, d31
vmlal.u16 q8, d19, d31
@@ -556,21 +558,21 @@
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
- bilinear_duplicate_mask mask_fmt, 4, d22
+ bilinear_duplicate_mask \mask_fmt, 4, d22
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
bilinear_interleave_src_dst \
- mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
+ \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
bilinear_apply_mask_to_src \
- mask_fmt, 4, d0, d1, q0, d22, \
+ \mask_fmt, 4, d0, d1, q0, d22, \
q3, q8, q9, q10
bilinear_combine \
- op, 4, d0, d1, q0, d2, d3, q1, \
+ \op, 4, d0, d1, q0, d2, d3, q1, \
q3, q8, q9, q10, d23
- bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
- bilinear_store_&dst_fmt 4, q2, q3
+ bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
+ bilinear_store_\()\dst_fmt 4, q2, q3
.endm
.set BILINEAR_FLAG_USE_MASK, 1
@@ -610,14 +612,14 @@
prefetch_distance, \
flags
-pixman_asm_function fname
-.if pixblock_size == 8
-.elseif pixblock_size == 4
+pixman_asm_function \fname
+.if \pixblock_size == 8
+.elseif \pixblock_size == 4
.else
.error unsupported pixblock size
.endif
-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
OUT .req r0
TOP .req r1
BOTTOM .req r2
@@ -635,7 +637,7 @@ pixman_asm_function fname
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
ldmia ip, {WB, X, UX, WIDTH}
.else
OUT .req r0
@@ -654,17 +656,17 @@ pixman_asm_function fname
TMP4 .req r10
STRIDE .req r3
- .set prefetch_offset, prefetch_distance
+ .set prefetch_offset, \prefetch_distance
mov ip, sp
push {r4, r5, r6, r7, r8, r9, r10, ip}
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
ldmia ip, {WT, WB, X, UX, WIDTH}
.endif
mul PF_OFFS, PF_OFFS, UX
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpush {d8-d15}
.endif
@@ -683,11 +685,11 @@ pixman_asm_function fname
/* ensure good destination alignment */
cmp WIDTH, #1
blt 0f
- tst OUT, #(1 << dst_bpp_shift)
+ tst OUT, #(1 << \dst_bpp_shift)
beq 0f
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
- bilinear_process_last_pixel
+ \bilinear_process_last_pixel
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
@@ -696,53 +698,53 @@ pixman_asm_function fname
cmp WIDTH, #2
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 1))
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
beq 0f
- bilinear_process_two_pixels
+ \bilinear_process_two_pixels
sub WIDTH, WIDTH, #2
0:
-.if pixblock_size == 8
+.if \pixblock_size == 8
cmp WIDTH, #4
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 2))
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
beq 0f
- bilinear_process_four_pixels
+ \bilinear_process_four_pixels
sub WIDTH, WIDTH, #4
0:
.endif
- subs WIDTH, WIDTH, #pixblock_size
+ subs WIDTH, WIDTH, #\pixblock_size
blt 1f
- mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
- bilinear_process_pixblock_head
- subs WIDTH, WIDTH, #pixblock_size
+ mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
+ \bilinear_process_pixblock_head
+ subs WIDTH, WIDTH, #\pixblock_size
blt 5f
0:
- bilinear_process_pixblock_tail_head
- subs WIDTH, WIDTH, #pixblock_size
+ \bilinear_process_pixblock_tail_head
+ subs WIDTH, WIDTH, #\pixblock_size
bge 0b
5:
- bilinear_process_pixblock_tail
+ \bilinear_process_pixblock_tail
1:
-.if pixblock_size == 8
+.if \pixblock_size == 8
tst WIDTH, #4
beq 2f
- bilinear_process_four_pixels
+ \bilinear_process_four_pixels
2:
.endif
/* handle the remaining trailing pixels */
tst WIDTH, #2
beq 2f
- bilinear_process_two_pixels
+ \bilinear_process_two_pixels
2:
tst WIDTH, #1
beq 3f
- bilinear_process_last_pixel
+ \bilinear_process_last_pixel
3:
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpop {d8-d15}
.endif
-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
pop {r4, r5, r6, r7, r8, r9}
.else
pop {r4, r5, r6, r7, r8, r9, r10, ip}
@@ -762,11 +764,11 @@ pixman_asm_function fname
.unreq TMP3
.unreq TMP4
.unreq STRIDE
-.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
.unreq MASK
.endif
-.endfunc
+pixman_end_asm_function
.endm
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 7e949a3..0e09257 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -53,6 +53,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"
+ pixman_syntax_unified
+
/* Global configuration options and preferences */
/*
@@ -260,13 +262,13 @@
vshrn.u16 d7, q2, #3
vsli.u16 q2, q2, #5
vshll.u8 q14, d16, #8
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vshll.u8 q8, d19, #8
- PF tst PF_CTL, #0xF
+ PF tst, PF_CTL, #0xF
vsri.u8 d6, d6, #5
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vmvn.8 d3, d3
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vsri.u8 d7, d7, #6
vshrn.u16 d30, q2, #2
vmull.u8 q10, d3, d6
@@ -275,18 +277,18 @@
vmull.u8 q12, d3, d30
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vsri.u16 q14, q8, #5
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vshll.u8 q9, d18, #8
vrshr.u16 q13, q10, #8
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vrshr.u16 q3, q11, #8
vrshr.u16 q15, q12, #8
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vsri.u16 q14, q9, #11
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vraddhn.u16 d20, q10, q13
vraddhn.u16 d23, q11, q3
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vraddhn.u16 d22, q12, q15
vst1.16 {d28, d29}, [DST_W, :128]!
.endm
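Two independent fixes repeat through the pixblock macros in this file. First, since PF is itself a macro, the mnemonic passed to it must now be comma-separated from its operands (PF add, PF_X, ...). Second, conditional mnemonics are reordered for unified syntax, where the flag-setting s and the size suffix precede the condition code, as this side-by-side sketch shows:

        @ divided (pre-UAL) syntax    @ unified (UAL) syntax
        subges  r0, r0, #0x10         subsge  r0, r0, #0x10
        ldrgeb  r1, [r2], #1          ldrbge  r1, [r2], #1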
@@ -434,20 +436,20 @@ generate_composite_function \
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
vsri.u16 q14, q8, #5
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
fetch_src_pixblock
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vsri.u16 q14, q9, #11
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vshll.u8 q8, d1, #8
vst1.16 {d28, d29}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
vshll.u8 q14, d2, #8
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vshll.u8 q9, d0, #8
.endm
@@ -509,20 +511,20 @@ generate_composite_function \
.macro pixman_composite_add_8_8_process_pixblock_tail_head
fetch_src_pixblock
- PF add PF_X, PF_X, #32
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #32
+ PF tst, PF_CTL, #0xF
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #32
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #32
+ PF subne, PF_CTL, PF_CTL, #1
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -541,20 +543,20 @@ generate_composite_function \
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
fetch_src_pixblock
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -604,16 +606,16 @@ generate_composite_function_single_scanline \
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
fetch_src_pixblock
@@ -621,13 +623,13 @@ generate_composite_function_single_scanline \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -656,16 +658,16 @@ generate_composite_function_single_scanline \
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
@@ -675,13 +677,13 @@ generate_composite_function_single_scanline \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -742,20 +744,20 @@ generate_composite_function_single_scanline \
vraddhn.u16 d31, q3, q11
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vqadd.u8 q14, q0, q14
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0x0F
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0x0F
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vqadd.u8 q15, q1, q15
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vmull.u8 q8, d24, d4
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vmull.u8 q9, d24, d5
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d6
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d7
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
.endm
@@ -784,16 +786,16 @@ generate_composite_function \
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
@@ -802,12 +804,12 @@ generate_composite_function \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -1245,23 +1247,23 @@ generate_composite_function \
.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
fetch_mask_pixblock
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vrshrn.u16 d28, q8, #8
- PF tst PF_CTL, #0x0F
+ PF tst, PF_CTL, #0x0F
vrshrn.u16 d29, q9, #8
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vrshrn.u16 d30, q10, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vrshrn.u16 d31, q11, #8
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vmull.u8 q8, d24, d0
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q9, d24, d1
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d2
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d3
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q8, q8, #8
vrsra.u16 q9, q9, #8
@@ -1314,23 +1316,23 @@ generate_composite_function \
.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
fetch_mask_pixblock
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vrshrn.u16 d28, q0, #8
- PF tst PF_CTL, #0x0F
+ PF tst, PF_CTL, #0x0F
vrshrn.u16 d29, q1, #8
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vrshrn.u16 d30, q2, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vrshrn.u16 d31, q3, #8
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vmull.u8 q0, d24, d16
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q1, d25, d16
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q2, d26, d16
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q3, d27, d16
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q0, q0, #8
vrsra.u16 q1, q1, #8
@@ -1408,27 +1410,27 @@ generate_composite_function \
vrshr.u16 q15, q9, #8
fetch_mask_pixblock
vrshr.u16 q6, q10, #8
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vrshr.u16 q7, q11, #8
- PF tst PF_CTL, #0x0F
+ PF tst, PF_CTL, #0x0F
vraddhn.u16 d28, q14, q8
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vraddhn.u16 d29, q15, q9
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d30, q6, q10
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d31, q7, q11
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vmull.u8 q6, d24, d8
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q7, d24, d9
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d24, d10
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d24, d11
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q14, q0, q14
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vqadd.u8 q15, q1, q15
vrshr.u16 q10, q6, #8
vrshr.u16 q11, q7, #8
@@ -2425,21 +2427,21 @@ generate_composite_function \
vrshr.u16 q13, q10, #8
fetch_src_pixblock
vraddhn.u16 d30, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d29, q12, q9
vraddhn.u16 d28, q13, q10
vmull.u8 q8, d3, d0
vmull.u8 q9, d3, d1
vmull.u8 q10, d3, d2
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
@@ -2482,21 +2484,21 @@ generate_composite_function \
vrshr.u16 q13, q10, #8
fetch_src_pixblock
vraddhn.u16 d28, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d29, q12, q9
vraddhn.u16 d30, q13, q10
vmull.u8 q8, d3, d0
vmull.u8 q9, d3, d1
vmull.u8 q10, d3, d2
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
@@ -2841,28 +2843,28 @@ generate_composite_function_nearest_scanline \
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
- vld1.32 {reg1}, [TMP1], STRIDE
- vld1.32 {reg2}, [TMP1]
+ vld1.32 {\reg1}, [TMP1], STRIDE
+ vld1.32 {\reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
- vld1.32 {reg2[0]}, [TMP1], STRIDE
- vld1.32 {reg2[1]}, [TMP1]
- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+ vld1.32 {\reg2[0]}, [TMP1], STRIDE
+ vld1.32 {\reg2[1]}, [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm
.macro bilinear_load_and_vertical_interpolate_two_8888 \
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
- bilinear_load_8888 reg1, reg2, tmp1
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- bilinear_load_8888 reg3, reg4, tmp2
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_8888 \
@@ -2870,9 +2872,9 @@ generate_composite_function_nearest_scanline \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm
.macro bilinear_load_and_vertical_interpolate_two_0565 \
@@ -2884,19 +2886,19 @@ generate_composite_function_nearest_scanline \
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {acc2lo[0]}, [TMP1], STRIDE
- vld1.32 {acc2hi[0]}, [TMP2], STRIDE
- vld1.32 {acc2lo[1]}, [TMP1]
- vld1.32 {acc2hi[1]}, [TMP2]
- convert_0565_to_x888 acc2, reg3, reg2, reg1
- vzip.u8 reg1, reg3
- vzip.u8 reg2, reg4
- vzip.u8 reg3, reg4
- vzip.u8 reg1, reg2
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\acc2lo[1]}, [TMP1]
+ vld1.32 {\acc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip.u8 \reg1, \reg3
+ vzip.u8 \reg2, \reg4
+ vzip.u8 \reg3, \reg4
+ vzip.u8 \reg1, \reg2
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_0565 \
@@ -2909,49 +2911,49 @@ generate_composite_function_nearest_scanline \
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
- vld1.32 {xacc2lo[1]}, [TMP1]
- vld1.32 {xacc2hi[1]}, [TMP2]
- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\xacc2lo[1]}, [TMP1]
+ vld1.32 {\xacc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
- vzip.u8 xreg1, xreg3
- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
- vzip.u8 xreg2, xreg4
- vld1.32 {yacc2lo[1]}, [TMP1]
- vzip.u8 xreg3, xreg4
- vld1.32 {yacc2hi[1]}, [TMP2]
- vzip.u8 xreg1, xreg2
- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
- vmull.u8 xacc1, xreg1, d28
- vzip.u8 yreg1, yreg3
- vmlal.u8 xacc1, xreg2, d29
- vzip.u8 yreg2, yreg4
- vmull.u8 xacc2, xreg3, d28
- vzip.u8 yreg3, yreg4
- vmlal.u8 xacc2, xreg4, d29
- vzip.u8 yreg1, yreg2
- vmull.u8 yacc1, yreg1, d28
- vmlal.u8 yacc1, yreg2, d29
- vmull.u8 yacc2, yreg3, d28
- vmlal.u8 yacc2, yreg4, d29
+ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
+ vzip.u8 \xreg1, \xreg3
+ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
+ vzip.u8 \xreg2, \xreg4
+ vld1.32 {\yacc2lo[1]}, [TMP1]
+ vzip.u8 \xreg3, \xreg4
+ vld1.32 {\yacc2hi[1]}, [TMP2]
+ vzip.u8 \xreg1, \xreg2
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ vmull.u8 \xacc1, \xreg1, d28
+ vzip.u8 \yreg1, \yreg3
+ vmlal.u8 \xacc1, \xreg2, d29
+ vzip.u8 \yreg2, \yreg4
+ vmull.u8 \xacc2, \xreg3, d28
+ vzip.u8 \yreg3, \yreg4
+ vmlal.u8 \xacc2, \xreg4, d29
+ vzip.u8 \yreg1, \yreg2
+ vmull.u8 \yacc1, \yreg1, d28
+ vmlal.u8 \yacc1, \yreg2, d29
+ vmull.u8 \yacc2, \yreg3, d28
+ vmlal.u8 \yacc2, \yreg4, d29
.endm
.macro bilinear_store_8888 numpix, tmp1, tmp2
-.if numpix == 4
+.if \numpix == 4
vst1.32 {d0, d1}, [OUT, :128]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d0}, [OUT, :64]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.32 {d0[0]}, [OUT, :32]!
.else
- .error bilinear_store_8888 numpix is unsupported
+ .error bilinear_store_8888 \numpix is unsupported
.endif
.endm
@@ -2960,20 +2962,20 @@ generate_composite_function_nearest_scanline \
vuzp.u8 d2, d3
vuzp.u8 d1, d3
vuzp.u8 d0, d2
- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
-.if numpix == 4
+ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
+.if \numpix == 4
vst1.16 {d2}, [OUT, :64]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d2[0]}, [OUT, :32]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.16 {d2[0]}, [OUT, :16]!
.else
- .error bilinear_store_0565 numpix is unsupported
+ .error bilinear_store_0565 \numpix is unsupported
.endif
.endm
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
- bilinear_load_&src_fmt d0, d1, d2
+ bilinear_load_\()\src_fmt d0, d1, d2
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
@@ -2985,11 +2987,11 @@ generate_composite_function_nearest_scanline \
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
- bilinear_store_&dst_fmt 1, q2, q3
+ bilinear_store_\()\dst_fmt 1, q2, q3
.endm
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
- bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
@@ -3002,11 +3004,11 @@ generate_composite_function_nearest_scanline \
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
- bilinear_store_&dst_fmt 2, q2, q3
+ bilinear_store_\()\dst_fmt 2, q2, q3
.endm
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
- bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
@@ -3034,54 +3036,54 @@ generate_composite_function_nearest_scanline \
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
- bilinear_store_&dst_fmt 4, q2, q3
+ bilinear_store_\()\dst_fmt 4, q2, q3
.endm
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.endif
.endm
.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.else
- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm
@@ -3106,7 +3108,7 @@ generate_composite_function_nearest_scanline \
src_bpp_shift, dst_bpp_shift, \
prefetch_distance, flags
-pixman_asm_function fname
+pixman_asm_function \fname
OUT .req r0
TOP .req r1
BOTTOM .req r2
@@ -3124,11 +3126,11 @@ pixman_asm_function fname
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
ldmia ip, {WB, X, UX, WIDTH}
mul PF_OFFS, PF_OFFS, UX
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpush {d8-d15}
.endif
@@ -3147,11 +3149,11 @@ pixman_asm_function fname
/* ensure good destination alignment */
cmp WIDTH, #1
blt 0f
- tst OUT, #(1 << dst_bpp_shift)
+ tst OUT, #(1 << \dst_bpp_shift)
beq 0f
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
- bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
@@ -3160,64 +3162,64 @@ pixman_asm_function fname
cmp WIDTH, #2
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 1))
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
beq 0f
- bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #2
0:
-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
cmp WIDTH, #4
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 2))
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
beq 0f
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #4
0:
subs WIDTH, WIDTH, #8
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #8
blt 5f
0:
- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #8
bge 0b
5:
- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
1:
tst WIDTH, #4
beq 2f
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
subs WIDTH, WIDTH, #4
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #4
blt 5f
0:
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #4
bge 0b
5:
- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
1:
/****************************************************/
.endif
/* handle the remaining trailing pixels */
tst WIDTH, #2
beq 2f
- bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
2:
tst WIDTH, #1
beq 3f
- bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
3:
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpop {d8-d15}
.endif
pop {r4, r5, r6, r7, r8, r9}
@@ -3236,7 +3238,7 @@ pixman_asm_function fname
.unreq TMP3
.unreq TMP4
.unreq STRIDE
-.endfunc
+ pixman_end_asm_function
.endm
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index bdcf6a9..06318d9 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -74,134 +74,134 @@
*/
.macro pixldst1 op, elem_size, reg1, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]!
.else
- op&.&elem_size {d&reg1}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]!
.endif
.endm
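The same \() rewrite also covers register names built from numeric macro arguments: d&reg1 becomes d\()\reg1, so a call with reg1=4 still assembles as d4. A made-up standalone example:

    .macro load_one reg
        @ with reg=4 this expands to: vld1.8 {d4}, [r0]!
        vld1.8  {d\()\reg}, [r0]!
    .endm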
.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!
.else
- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!
.endif
.endm
.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!
.else
- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!
.endif
.endm
.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
- op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!
.endm
.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
- op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!
.endm
.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
- op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!
.endm
.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
-.if numbytes == 32
- pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
- %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif numbytes == 16
- pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
-.elseif numbytes == 8
- pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
-.elseif numbytes == 4
- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
- pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
- .elseif elem_size == 16
- pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
- pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+.if \numbytes == 32
+ pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif \numbytes == 16
+ pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+.elseif \numbytes == 8
+ pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits
+.elseif \numbytes == 4
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
+ pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits
+ .elseif \elem_size == 16
+ pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits
+ pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits
.else
- pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+ pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits
.endif
-.elseif numbytes == 2
- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
- pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+.elseif \numbytes == 2
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
+ pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits
.else
- pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+ pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits
.endif
-.elseif numbytes == 1
- pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.elseif \numbytes == 1
+ pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
.macro pixld numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
- %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
- pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.else
- pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+ pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm
.macro pixst numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
- %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
- pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.else
- pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+ pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm
.macro pixld_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.if (\bpp * \numpix) <= 128
+ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
- pixld numpix, bpp, basereg, mem_operand, 128
+ pixld \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm
.macro pixst_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.if (\bpp * \numpix) <= 128
+ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
- pixst numpix, bpp, basereg, mem_operand, 128
+ pixst \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm
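/* The abits argument ends up as the ":<abits>" alignment qualifier on the
 * vld1/vst1 address operand, so pixld_a/pixst_a simply request the strongest
 * hint a given access can honour: %(bpp * numpix) bits, capped at 128. */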
@@ -210,44 +210,44 @@
* aliases to be defined)
*/
.macro pixld1_s elem_size, reg1, mem_operand
-.if elem_size == 16
+.if \elem_size == 16
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #1
+ add TMP1, \mem_operand, TMP1, asl #1
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP2, mem_operand, TMP2, asl #1
- vld1.16 {d&reg1&[0]}, [TMP1, :16]
+ add TMP2, \mem_operand, TMP2, asl #1
+ vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #1
- vld1.16 {d&reg1&[1]}, [TMP2, :16]
+ add TMP1, \mem_operand, TMP1, asl #1
+ vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP2, mem_operand, TMP2, asl #1
- vld1.16 {d&reg1&[2]}, [TMP1, :16]
- vld1.16 {d&reg1&[3]}, [TMP2, :16]
-.elseif elem_size == 32
+ add TMP2, \mem_operand, TMP2, asl #1
+ vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
+ vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]
+.elseif \elem_size == 32
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #2
+ add TMP1, \mem_operand, TMP1, asl #2
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP2, mem_operand, TMP2, asl #2
- vld1.32 {d&reg1&[0]}, [TMP1, :32]
- vld1.32 {d&reg1&[1]}, [TMP2, :32]
+ add TMP2, \mem_operand, TMP2, asl #2
+ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
+ vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]
.else
.error "unsupported"
.endif
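The VX/UNIT_X bookkeeping above repeats one small fixed-point step per loaded
pixel; the repeated subtraction wraps the coordinate at the source width, which
is how the repeat-capable nearest fast paths tile the source. A literal C
transcription of one step (the names mirror the register aliases; the helper
itself is illustrative, not pixman API) is shown below. Note also that unified
syntax puts the S-for-flags suffix before the condition, which is why every
"subpls" in this hunk becomes "subspl":

    #include <stdint.h>

    /* mov TMP, VX, asr #16 / adds VX, VX, UNIT_X /
     * 5: subspl VX, VX, SRC_WIDTH_FIXED / bpl 5b */
    static inline int32_t
    step_source_x (int32_t *vx, int32_t unit_x, int32_t src_width_fixed)
    {
        int32_t x = *vx >> 16;       /* 16.16 fixed point -> pixel index */
        *vx += unit_x;               /* advance one destination pixel    */
        while (*vx >= 0)             /* wrap when the biased coordinate  */
            *vx -= src_width_fixed;  /* crosses zero                     */
        return x;
    }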
@@ -257,110 +257,110 @@
.if 0 /* elem_size == 32 */
mov TMP1, VX, asr #16
add VX, VX, UNIT_X, asl #1
- add TMP1, mem_operand, TMP1, asl #2
+ add TMP1, \mem_operand, TMP1, asl #2
mov TMP2, VX, asr #16
sub VX, VX, UNIT_X
- add TMP2, mem_operand, TMP2, asl #2
- vld1.32 {d&reg1&[0]}, [TMP1, :32]
+ add TMP2, \mem_operand, TMP2, asl #2
+ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
mov TMP1, VX, asr #16
add VX, VX, UNIT_X, asl #1
- add TMP1, mem_operand, TMP1, asl #2
- vld1.32 {d&reg2&[0]}, [TMP2, :32]
+ add TMP1, \mem_operand, TMP1, asl #2
+ vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]
mov TMP2, VX, asr #16
add VX, VX, UNIT_X
- add TMP2, mem_operand, TMP2, asl #2
- vld1.32 {d&reg1&[1]}, [TMP1, :32]
- vld1.32 {d&reg2&[1]}, [TMP2, :32]
+ add TMP2, \mem_operand, TMP2, asl #2
+ vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]
+ vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]
.else
- pixld1_s elem_size, reg1, mem_operand
- pixld1_s elem_size, reg2, mem_operand
+ pixld1_s \elem_size, \reg1, \mem_operand
+ pixld1_s \elem_size, \reg2, \mem_operand
.endif
.endm
.macro pixld0_s elem_size, reg1, idx, mem_operand
-.if elem_size == 16
+.if \elem_size == 16
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #1
- vld1.16 {d&reg1&[idx]}, [TMP1, :16]
-.elseif elem_size == 32
+ add TMP1, \mem_operand, TMP1, asl #1
+ vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
+.elseif \elem_size == 32
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #2
- vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+ add TMP1, \mem_operand, TMP1, asl #2
+ vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
.endif
.endm
.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
-.if numbytes == 32
- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
- pixdeinterleave elem_size, %(basereg+4)
-.elseif numbytes == 16
- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
-.elseif numbytes == 8
- pixld1_s elem_size, %(basereg+1), mem_operand
-.elseif numbytes == 4
- .if elem_size == 32
- pixld0_s elem_size, %(basereg+0), 1, mem_operand
- .elseif elem_size == 16
- pixld0_s elem_size, %(basereg+0), 2, mem_operand
- pixld0_s elem_size, %(basereg+0), 3, mem_operand
+.if \numbytes == 32
+ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
+ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
+ pixdeinterleave \elem_size, %(\basereg+4)
+.elseif \numbytes == 16
+ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
+.elseif \numbytes == 8
+ pixld1_s \elem_size, %(\basereg+1), \mem_operand
+.elseif \numbytes == 4
+ .if \elem_size == 32
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+ .elseif \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
.else
- pixld0_s elem_size, %(basereg+0), 4, mem_operand
- pixld0_s elem_size, %(basereg+0), 5, mem_operand
- pixld0_s elem_size, %(basereg+0), 6, mem_operand
- pixld0_s elem_size, %(basereg+0), 7, mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
.endif
-.elseif numbytes == 2
- .if elem_size == 16
- pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.elseif \numbytes == 2
+ .if \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
.else
- pixld0_s elem_size, %(basereg+0), 2, mem_operand
- pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
.endif
-.elseif numbytes == 1
- pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.elseif \numbytes == 1
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
.macro pixld_s numpix, bpp, basereg, mem_operand
-.if bpp > 0
- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.if \bpp > 0
+ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
.endif
.endm
.macro vuzp8 reg1, reg2
- vuzp.8 d&reg1, d&reg2
+ vuzp.8 d\()\reg1, d\()\reg2
.endm
.macro vzip8 reg1, reg2
- vzip.8 d&reg1, d&reg2
+ vzip.8 d\()\reg1, d\()\reg2
.endm
/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- vuzp8 %(basereg+0), %(basereg+1)
- vuzp8 %(basereg+2), %(basereg+3)
- vuzp8 %(basereg+1), %(basereg+3)
- vuzp8 %(basereg+0), %(basereg+2)
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vuzp8 %(\basereg+0), %(\basereg+1)
+ vuzp8 %(\basereg+2), %(\basereg+3)
+ vuzp8 %(\basereg+1), %(\basereg+3)
+ vuzp8 %(\basereg+0), %(\basereg+2)
.endif
.endm
/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- vzip8 %(basereg+0), %(basereg+2)
- vzip8 %(basereg+1), %(basereg+3)
- vzip8 %(basereg+2), %(basereg+3)
- vzip8 %(basereg+0), %(basereg+1)
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vzip8 %(\basereg+0), %(\basereg+2)
+ vzip8 %(\basereg+1), %(\basereg+3)
+ vzip8 %(\basereg+2), %(\basereg+3)
+ vzip8 %(\basereg+0), %(\basereg+1)
.endif
.endm
@@ -394,22 +394,22 @@
*/
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
- a x
+ \a \x
.endif
.endm
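/* Note that the instruction mnemonic is now a macro argument of its own,
 * separated from the operand list by a comma ("PF add, PF_X, ..." below);
 * with the \a \x expansion this presumably keeps the macro working on
 * assemblers, such as clang's integrated assembler, that only accept
 * comma-separated macro arguments. */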
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
- PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+ PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
-.if std_increment != 0
- PF add PF_X, PF_X, #std_increment
+.if \std_increment != 0
+ PF add, PF_X, PF_X, #\std_increment
.endif
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #boost_increment
- PF subne PF_CTL, PF_CTL, #1
- PF cmp PF_X, ORIG_W
+ PF tst, PF_CTL, #0xF
+ PF addne, PF_X, PF_X, #\boost_increment
+ PF subne, PF_CTL, PF_CTL, #1
+ PF cmp, PF_X, ORIG_W
.if src_bpp_shift >= 0
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
@@ -419,16 +419,16 @@
.if mask_bpp_shift >= 0
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
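In rough C, the per-block bookkeeping that cache_preload performs looks like
the sketch below. This is a simplification for a single channel; the struct,
field and helper names are illustrative, and __builtin_prefetch stands in for
the PLD instruction:

    #include <stdint.h>

    struct pf_state {
        int32_t  pf_x;    /* prefetch position within the scanline (pixels) */
        int32_t  pf_ctl;  /* (height - 1) << 4, low nibble = boost credits  */
        uint8_t *pf_src;  /* scanline currently being prefetched            */
    };

    static void
    cache_preload_c (struct pf_state *p, int std_inc, int boost_inc,
                     int orig_w, int bpp_shift, intptr_t stride)
    {
        p->pf_x += std_inc;
        if (p->pf_ctl & 0xf)            /* tst PF_CTL, #0xF               */
        {
            p->pf_x += boost_inc;       /* addne                          */
            p->pf_ctl--;                /* subne                          */
        }
        __builtin_prefetch (p->pf_src + ((intptr_t) p->pf_x << bpp_shift));
        if (p->pf_x >= orig_w)          /* ran off the current scanline   */
        {
            p->pf_x -= orig_w;          /* subge                          */
            p->pf_ctl -= 0x10;          /* subsge: one scanline fewer     */
            if (p->pf_ctl >= 0)         /* ldrbge ...!: step to the next  */
            {                           /* scanline and touch it so its   */
                p->pf_src += stride << bpp_shift;  /* TLB entry is live   */
                (void) *(volatile uint8_t *) p->pf_src;
            }
        }
    }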
@@ -465,21 +465,20 @@
beq 2f
.irp lowbit, 1, 2, 4, 8, 16
-local skip1
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
- tst DST_R, #lowbit
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_R, #\lowbit
beq 1f
.endif
- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
- add DST_R, DST_R, #lowbit
+ add DST_R, DST_R, #\lowbit
.endif
- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
- sub W, W, #(lowbit * 8 / dst_w_bpp)
+ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
+ sub W, W, #(\lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
@@ -487,19 +486,19 @@ local skip1
pixdeinterleave mask_bpp, mask_basereg
pixdeinterleave dst_r_bpp, dst_r_basereg
- process_pixblock_head
+ \process_pixblock_head
cache_preload 0, pixblock_size
cache_preload_simple
- process_pixblock_tail
+ \process_pixblock_tail
pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
- tst DST_W, #lowbit
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_W, #\lowbit
beq 1f
.endif
- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
@@ -530,18 +529,18 @@ local skip1
tst W, #(pixblock_size - 1)
beq 2f
.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
- tst W, #chunk_size
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
beq 1f
- pixld_src chunk_size, src_bpp, src_basereg, SRC
- pixld chunk_size, mask_bpp, mask_basereg, MASK
-.if dst_aligned_flag != 0
- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+ pixld_src \chunk_size, src_bpp, src_basereg, SRC
+ pixld \chunk_size, mask_bpp, mask_basereg, MASK
+.if \dst_aligned_flag != 0
+ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
-.if cache_preload_flag != 0
- PF add PF_X, PF_X, #chunk_size
+.if \cache_preload_flag != 0
+ PF add, PF_X, PF_X, #\chunk_size
.endif
1:
.endif
@@ -550,21 +549,21 @@ local skip1
pixdeinterleave mask_bpp, mask_basereg
pixdeinterleave dst_r_bpp, dst_r_basereg
- process_pixblock_head
-.if cache_preload_flag != 0
+ \process_pixblock_head
+.if \cache_preload_flag != 0
cache_preload 0, pixblock_size
cache_preload_simple
.endif
- process_pixblock_tail
+ \process_pixblock_tail
pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
- tst W, #chunk_size
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
beq 1f
-.if dst_aligned_flag != 0
- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.if \dst_aligned_flag != 0
+ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
@@ -604,7 +603,7 @@ local skip1
.if regs_shortage
str H, [sp, #4] /* save updated height to stack */
.endif
- bge start_of_loop_label
+ bge \start_of_loop_label
.endm
/*
@@ -631,7 +630,7 @@ local skip1
src_basereg_ = 0, \
mask_basereg_ = 24
- pixman_asm_function fname
+ pixman_asm_function \fname
push {r4-r12, lr} /* save all registers */
@@ -641,10 +640,10 @@ local skip1
* has to be used instead of ADVANCED.
*/
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
-.if prefetch_distance == 0
+.if \prefetch_distance == 0
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif
@@ -652,17 +651,17 @@ local skip1
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set pixblock_size, pixblock_size_
- .set dst_w_basereg, dst_w_basereg_
- .set dst_r_basereg, dst_r_basereg_
- .set src_basereg, src_basereg_
- .set mask_basereg, mask_basereg_
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
.macro pixld_src x:vararg
- pixld x
+ pixld \x
.endm
.macro fetch_src_pixblock
pixld_src pixblock_size, src_bpp, \
@@ -755,19 +754,19 @@ local skip1
.error "requested dst bpp (dst_w_bpp) is not supported"
.endif
-.if (((flags) & FLAG_DST_READWRITE) != 0)
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.else
.set dst_r_bpp, 0
.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
.set DEINTERLEAVE_32BPP_ENABLED, 1
.else
.set DEINTERLEAVE_32BPP_ENABLED, 0
.endif
-.if prefetch_distance < 0 || prefetch_distance > 15
- .error "invalid prefetch distance (prefetch_distance)"
+.if \prefetch_distance < 0 || \prefetch_distance > 15
+ .error "invalid prefetch distance (\prefetch_distance)"
.endif
.if src_bpp > 0
@@ -776,7 +775,7 @@ local skip1
.if mask_bpp > 0
ldr MASK, [sp, #48]
.endif
- PF mov PF_X, #0
+ PF mov, PF_X, #0
.if src_bpp > 0
ldr SRC_STRIDE, [sp, #44]
.endif
@@ -801,14 +800,14 @@ local skip1
/*
 * Set up the advanced prefetcher's initial state
*/
- PF mov PF_SRC, SRC
- PF mov PF_DST, DST_R
- PF mov PF_MASK, MASK
+ PF mov, PF_SRC, SRC
+ PF mov, PF_DST, DST_R
+ PF mov, PF_MASK, MASK
/* PF_CTL = prefetch_distance | ((h - 1) << 4) */
- PF mov PF_CTL, H, lsl #4
- PF add PF_CTL, #(prefetch_distance - 0x10)
+ PF mov, PF_CTL, H, lsl #4
+ PF add, PF_CTL, #(\prefetch_distance - 0x10)
- init
+ \init
.if regs_shortage
push {r0, r1}
.endif
@@ -826,9 +825,9 @@ local skip1
* long scanlines
*/
0:
- ensure_destination_ptr_alignment process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
/* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
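/* i.e. roughly:
 *     head (block 0);
 *     for (n = 1; n < nblocks; n++)
 *         tail_head ();   (tail of block n-1, store, load, head of block n)
 *     tail (last block); store;
 * so the memory accesses for one block overlap the arithmetic of its
 * neighbours. */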
pixld_a pixblock_size, dst_r_bpp, \
@@ -836,33 +835,33 @@ local skip1
fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- PF add PF_X, PF_X, #pixblock_size
- process_pixblock_head
+ PF add, PF_X, PF_X, #pixblock_size
+ \process_pixblock_head
cache_preload 0, pixblock_size
cache_preload_simple
subs W, W, #(pixblock_size * 2)
blt 2f
1:
- process_pixblock_tail_head
+ \process_pixblock_tail_head
cache_preload_simple
subs W, W, #pixblock_size
bge 1b
2:
- process_pixblock_tail
+ \process_pixblock_tail
pixst_a pixblock_size, dst_w_bpp, \
(dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
/* Process the remaining trailing pixels in the scanline */
process_trailing_pixels 1, 1, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
advance_to_next_scanline 0b
.if regs_shortage
pop {r0, r1}
.endif
- cleanup
+ \cleanup
pop {r4-r12, pc} /* exit */
/*
* This is the start of the loop, designed to process images with small width
@@ -878,22 +877,22 @@ local skip1
fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- process_pixblock_head
- process_pixblock_tail
+ \process_pixblock_head
+ \process_pixblock_tail
pixst pixblock_size, dst_w_bpp, \
(dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
/* Process the remaining trailing pixels in the scanline */
process_trailing_pixels 0, 0, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
advance_to_next_scanline 8b
9:
.if regs_shortage
pop {r0, r1}
.endif
- cleanup
+ \cleanup
pop {r4-r12, pc} /* exit */
.purgem fetch_src_pixblock
@@ -915,7 +914,7 @@ local skip1
.unreq PF_DST
.unreq PF_MASK
.unreq DUMMY
- .endfunc
+ pixman_end_asm_function
.endm
/*
@@ -939,23 +938,23 @@ local skip1
src_basereg_ = 0, \
mask_basereg_ = 24
- pixman_asm_function fname
+ pixman_asm_function \fname
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set pixblock_size, pixblock_size_
- .set dst_w_basereg, dst_w_basereg_
- .set dst_r_basereg, dst_r_basereg_
- .set src_basereg, src_basereg_
- .set mask_basereg, mask_basereg_
-
-.if use_nearest_scaling != 0
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
+
+.if \use_nearest_scaling != 0
/*
* Assign symbolic names to registers for nearest scaling
*/
@@ -971,7 +970,7 @@ local skip1
SRC_WIDTH_FIXED .req r7
.macro pixld_src x:vararg
- pixld_s x
+ pixld_s \x
.endm
ldr UNIT_X, [sp]
@@ -991,16 +990,16 @@ local skip1
MASK .req r3 /* mask pointer */
.macro pixld_src x:vararg
- pixld x
+ pixld \x
.endm
.endif
-.if (((flags) & FLAG_DST_READWRITE) != 0)
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.else
.set dst_r_bpp, 0
.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
.set DEINTERLEAVE_32BPP_ENABLED, 1
.else
.set DEINTERLEAVE_32BPP_ENABLED, 0
@@ -1011,15 +1010,15 @@ local skip1
(src_basereg - pixblock_size * src_bpp / 64), SRC
.endm
- init
+ \init
mov DST_R, DST_W
cmp W, #pixblock_size
blt 8f
- ensure_destination_ptr_alignment process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
subs W, W, #pixblock_size
blt 7f
@@ -1030,26 +1029,26 @@ local skip1
fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- process_pixblock_head
+ \process_pixblock_head
subs W, W, #pixblock_size
blt 2f
1:
- process_pixblock_tail_head
+ \process_pixblock_tail_head
subs W, W, #pixblock_size
bge 1b
2:
- process_pixblock_tail
+ \process_pixblock_tail
pixst_a pixblock_size, dst_w_bpp, \
(dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
/* Process the remaining trailing pixels in the scanline (dst aligned) */
process_trailing_pixels 0, 1, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
- cleanup
-.if use_nearest_scaling != 0
+ \cleanup
+.if \use_nearest_scaling != 0
pop {r4-r8, pc} /* exit */
.else
bx lr /* exit */
@@ -1057,13 +1056,13 @@ local skip1
8:
/* Process the remaining trailing pixels in the scanline (dst unaligned) */
process_trailing_pixels 0, 0, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
- cleanup
+ \cleanup
-.if use_nearest_scaling != 0
+.if \use_nearest_scaling != 0
pop {r4-r8, pc} /* exit */
.unreq DST_R
@@ -1090,15 +1089,15 @@ local skip1
.purgem fetch_src_pixblock
.purgem pixld_src
- .endfunc
+ pixman_end_asm_function
.endm
.macro generate_composite_function_single_scanline x:vararg
- generate_composite_function_scanline 0, x
+ generate_composite_function_scanline 0, \x
.endm
.macro generate_composite_function_nearest_scanline x:vararg
- generate_composite_function_scanline 1, x
+ generate_composite_function_scanline 1, \x
.endm
/* Default prologue/epilogue, nothing special needs to be done */
@@ -1134,22 +1133,22 @@ local skip1
* value (in) is lost.
*/
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
- vshrn.u16 out_r, in, #8
- vshrn.u16 out_g, in, #3
- vsli.u16 in, in, #5
- vmov.u8 out_a, #255
- vsri.u8 out_r, out_r, #5
- vsri.u8 out_g, out_g, #6
- vshrn.u16 out_b, in, #2
+ vshrn.u16 \out_r, \in, #8
+ vshrn.u16 \out_g, \in, #3
+ vsli.u16 \in, \in, #5
+ vmov.u8 \out_a, #255
+ vsri.u8 \out_r, \out_r, #5
+ vsri.u8 \out_g, \out_g, #6
+ vshrn.u16 \out_b, \in, #2
.endm
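For reference, the per-pixel arithmetic of the macro above written as plain C
(the NEON version performs it for eight pixels at once with narrowing shifts
and shift-with-insert, which is why the input register is clobbered):

    #include <stdint.h>

    static inline uint32_t
    convert_0565_to_8888_c (uint16_t s)
    {
        uint32_t r = (s >> 11) & 0x1f;
        uint32_t g = (s >> 5)  & 0x3f;
        uint32_t b =  s        & 0x1f;

        /* Replicate the top bits into the low bits so that 0x1f -> 0xff
         * and 0x00 -> 0x00 exactly (this is what vsri achieves). */
        r = (r << 3) | (r >> 2);
        g = (g << 2) | (g >> 4);
        b = (b << 3) | (b >> 2);

        return 0xff000000u | (r << 16) | (g << 8) | b;
    }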
.macro convert_0565_to_x888 in, out_r, out_g, out_b
- vshrn.u16 out_r, in, #8
- vshrn.u16 out_g, in, #3
- vsli.u16 in, in, #5
- vsri.u8 out_r, out_r, #5
- vsri.u8 out_g, out_g, #6
- vshrn.u16 out_b, in, #2
+ vshrn.u16 \out_r, \in, #8
+ vshrn.u16 \out_g, \in, #3
+ vsli.u16 \in, \in, #5
+ vsri.u8 \out_r, \out_r, #5
+ vsri.u8 \out_g, \out_g, #6
+ vshrn.u16 \out_b, \in, #2
.endm
/*
@@ -1159,11 +1158,11 @@ local skip1
* registers (tmp1, tmp2)
*/
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
- vshll.u8 tmp1, in_g, #8
- vshll.u8 out, in_r, #8
- vshll.u8 tmp2, in_b, #8
- vsri.u16 out, tmp1, #5
- vsri.u16 out, tmp2, #11
+ vshll.u8 \tmp1, \in_g, #8
+ vshll.u8 \out, \in_r, #8
+ vshll.u8 \tmp2, \in_b, #8
+ vsri.u16 \out, \tmp1, #5
+ vsri.u16 \out, \tmp2, #11
.endm
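The reverse direction is a plain truncation; a one-pixel C model of the
vshll/vsri sequence above:

    #include <stdint.h>

    static inline uint16_t
    convert_8888_to_0565_c (uint32_t s)
    {
        return (uint16_t) ((((s >> 16) & 0xf8) << 8) |  /* R: top 5 bits */
                           (((s >> 8)  & 0xfc) << 3) |  /* G: top 6 bits */
                           (( s        & 0xf8) >> 3));  /* B: top 5 bits */
    }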
/*
@@ -1173,12 +1172,12 @@ local skip1
* value from 'in' is lost
*/
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
- vshl.u16 out0, in, #5 /* G top 6 bits */
- vshl.u16 tmp, in, #11 /* B top 5 bits */
- vsri.u16 in, in, #5 /* R is ready in top bits */
- vsri.u16 out0, out0, #6 /* G is ready in top bits */
- vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
- vshr.u16 out1, in, #8 /* R is in place */
- vsri.u16 out0, tmp, #8 /* G & B is in place */
- vzip.u16 out0, out1 /* everything is in place */
+ vshl.u16 \out0, \in, #5 /* G top 6 bits */
+ vshl.u16 \tmp, \in, #11 /* B top 5 bits */
+ vsri.u16 \in, \in, #5 /* R is ready in top bits */
+ vsri.u16 \out0, \out0, #6 /* G is ready in top bits */
+ vsri.u16 \tmp, \tmp, #5 /* B is ready in top bits */
+ vshr.u16 \out1, \in, #8 /* R is in place */
+ vsri.u16 \out0, \tmp, #8 /* G & B is in place */
+ vzip.u16 \out0, \out1 /* everything is in place */
.endm
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 60e9c78..103f1c2 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -27,7 +27,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <string.h>
@@ -194,7 +194,7 @@ arm_neon_fill (pixman_implementation_t *imp,
uint32_t _xor)
{
 /* stride is always a multiple of 32-bit units in pixman */
- uint32_t byte_stride = stride * sizeof(uint32_t);
+ int32_t byte_stride = stride * sizeof(uint32_t);
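      /* Pixman strides are signed and may be negative (e.g. for bottom-up
       * images), so the byte stride must stay signed as well for the row
       * stepping below to move in the right direction. */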
switch (bpp)
{
@@ -331,6 +331,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, b5g6r5, neon_composite_over_8888_8_0565),
PIXMAN_STD_FAST_PATH (OVER, r5g6b5, a8, r5g6b5, neon_composite_over_0565_8_0565),
PIXMAN_STD_FAST_PATH (OVER, b5g6r5, a8, b5g6r5, neon_composite_over_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_over_8888_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, neon_composite_over_8888_0565),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, neon_composite_over_8888_0565),
@@ -341,17 +342,33 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, a8r8g8b8, neon_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, a8b8g8r8, neon_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, neon_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, neon_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, neon_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, neon_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, neon_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8),
PIXMAN_STD_FAST_PATH (ADD, r5g6b5, a8, r5g6b5, neon_composite_add_0565_8_0565),
PIXMAN_STD_FAST_PATH (ADD, b5g6r5, a8, b5g6r5, neon_composite_add_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, a8, x8r8g8b8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, x8r8g8b8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8b8g8r8, a8, x8b8g8r8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, x8b8g8r8, neon_composite_add_8888_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, neon_composite_add_8888_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, solid, x8r8g8b8, neon_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, x8r8g8b8, neon_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8b8g8r8, solid, x8b8g8r8, neon_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, x8b8g8r8, neon_composite_add_8888_n_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, a8r8g8b8, neon_composite_add_8888_n_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, a8b8g8r8, neon_composite_add_8888_n_8888),
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, null, x8r8g8b8, neon_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, x8r8g8b8, neon_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8b8g8r8, null, x8b8g8r8, neon_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, x8b8g8r8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (IN, solid, null, a8, neon_composite_in_n_8),
@@ -359,24 +376,26 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, neon_composite_out_reverse_8_0565),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, b5g6r5, neon_composite_out_reverse_8_0565),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, x8r8g8b8, neon_composite_out_reverse_8_8888),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8r8g8b8, neon_composite_out_reverse_8_8888),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, x8b8g8r8, neon_composite_out_reverse_8_8888),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8b8g8r8, neon_composite_out_reverse_8_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
/* Note: NONE repeat is not supported yet */
SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
@@ -404,6 +423,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8888),
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
@@ -420,6 +440,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
{ PIXMAN_OP_NONE },
};
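/* The x8r8g8b8/x8b8g8r8 destination entries added above reuse the existing
 * a8r8g8b8 helpers: the alpha byte of an x8 destination is undefined, so a
 * routine that also computes and stores alpha yields the same visible
 * result and can safely be shared. */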
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index e050292..cc62c81 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -39,6 +39,8 @@
#include "pixman-arm-asm.h"
+ pixman_syntax_unified
+
/*
* Note: This code is only using armv5te instructions (not even armv6),
* but is scheduled for ARM Cortex-A8 pipeline. So it might need to
@@ -62,7 +64,7 @@
prefetch_distance, \
prefetch_braking_distance
-pixman_asm_function fname
+pixman_asm_function \fname
W .req r0
DST .req r1
SRC .req r2
@@ -76,39 +78,39 @@ pixman_asm_function fname
ldr UNIT_X, [sp]
push {r4, r5, r6, r7, r8, r10}
- mvn VXMASK, #((1 << bpp_shift) - 1)
+ mvn VXMASK, #((1 << \bpp_shift) - 1)
ldr SRC_WIDTH_FIXED, [sp, #28]
/* define helper macro */
.macro scale_2_pixels
- ldr&t TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+ ldr\()\t TMP1, [SRC, TMP1]
+ and TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
- str&t TMP1, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
+ str\()\t TMP1, [DST], #(1 << \bpp_shift)
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
- ldr&t TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ ldr\()\t TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
- str&t TMP2, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
+ str\()\t TMP2, [DST], #(1 << \bpp_shift)
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
.endm
/* now do the scaling */
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
-9: subpls VX, VX, SRC_WIDTH_FIXED
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
- subs W, W, #(8 + prefetch_braking_distance)
+ subs W, W, #(8 + \prefetch_braking_distance)
blt 2f
/* calculate prefetch offset */
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
mla PF_OFFS, UNIT_X, PF_OFFS, VX
1: /* main loop, process 8 pixels per iteration with prefetch */
- pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
- add PF_OFFS, UNIT_X, lsl #3
+ pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)]
+ add PF_OFFS, PF_OFFS, UNIT_X, lsl #3
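 /* (the old two-operand "add PF_OFFS, UNIT_X, lsl #3" relied on the
  * deprecated implicit-destination form; unified syntax requires the
  * destination register to be spelled out, as with the orr fixes below) */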
scale_2_pixels
scale_2_pixels
scale_2_pixels
@@ -116,7 +118,7 @@ pixman_asm_function fname
subs W, W, #8
bge 1b
2:
- subs W, W, #(4 - 8 - prefetch_braking_distance)
+ subs W, W, #(4 - 8 - \prefetch_braking_distance)
blt 2f
1: /* process the remaining pixels */
scale_2_pixels
@@ -129,8 +131,8 @@ pixman_asm_function fname
scale_2_pixels
2:
tst W, #1
- ldrne&t TMP1, [SRC, TMP1]
- strne&t TMP1, [DST]
+ ldr\()\t\()ne TMP1, [SRC, TMP1]
+ str\()\t\()ne TMP1, [DST]
/* cleanup helper macro */
.purgem scale_2_pixels
.unreq DST
@@ -146,7 +148,7 @@ pixman_asm_function fname
/* return */
pop {r4, r5, r6, r7, r8, r10}
bx lr
-.endfunc
+ pixman_end_asm_function
.endm
generate_nearest_scanline_func \
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 7b0727b..34d38f1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -40,6 +40,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"
+ pixman_syntax_unified
+
/* A head macro should do all processing which results in an output of up to
* 16 bytes, as far as the final load instruction. The corresponding tail macro
* should complete the processing of the up-to-16 bytes. The calling macro will
@@ -57,7 +59,7 @@
.endm
.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld cond, numbytes, firstreg, SRC, unaligned_src
+ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm
.macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
@@ -65,8 +67,8 @@
WK5 .req STRIDE_S
WK6 .req MASK
WK7 .req STRIDE_M
-110: pixld , 16, 0, SRC, unaligned_src
- pixld , 16, 4, SRC, unaligned_src
+110: pixld , 16, 0, SRC, \unaligned_src
+ pixld , 16, 4, SRC, \unaligned_src
pld [SRC, SCRATCH]
pixst , 16, 0, DST
pixst , 16, 4, DST
@@ -122,7 +124,7 @@ generate_composite_function \
.macro src_n_0565_init
ldrh SRC, [sp, #ARGS_STACK_OFFSET]
- orr SRC, SRC, lsl #16
+ orr SRC, SRC, SRC, lsl #16
mov STRIDE_S, SRC
mov MASK, SRC
mov STRIDE_M, SRC
@@ -130,8 +132,8 @@ generate_composite_function \
.macro src_n_8_init
ldrb SRC, [sp, #ARGS_STACK_OFFSET]
- orr SRC, SRC, lsl #8
- orr SRC, SRC, lsl #16
+ orr SRC, SRC, SRC, lsl #8
+ orr SRC, SRC, SRC, lsl #16
mov STRIDE_S, SRC
mov MASK, SRC
mov STRIDE_M, SRC
@@ -142,7 +144,7 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req MASK
WK7 .req STRIDE_M
- pixst cond, numbytes, 4, DST
+ pixst \cond, \numbytes, 4, DST
.unreq WK4
.unreq WK5
.unreq WK6
@@ -182,20 +184,20 @@ generate_composite_function \
/******************************************************************************/
.macro src_x888_8888_pixel, cond, reg
- orr&cond WK&reg, WK&reg, #0xFF000000
+ orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
.endm
.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld cond, numbytes, firstreg, SRC, unaligned_src
+ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm
.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
- src_x888_8888_pixel cond, %(firstreg+0)
- .if numbytes >= 8
- src_x888_8888_pixel cond, %(firstreg+1)
- .if numbytes == 16
- src_x888_8888_pixel cond, %(firstreg+2)
- src_x888_8888_pixel cond, %(firstreg+3)
+ src_x888_8888_pixel \cond, %(\firstreg+0)
+ .if \numbytes >= 8
+ src_x888_8888_pixel \cond, %(\firstreg+1)
+ .if \numbytes == 16
+ src_x888_8888_pixel \cond, %(\firstreg+2)
+ src_x888_8888_pixel \cond, %(\firstreg+3)
.endif
.endif
.endm
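In scalar terms the whole x888-to-8888 conversion above is just:

    dst = src | 0xff000000;   /* force the alpha byte; RGB passes through */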
@@ -222,73 +224,73 @@ generate_composite_function \
.endm
.macro src_0565_8888_2pixels, reg1, reg2
- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
- bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
- mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
- mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
- bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
- pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
- sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
- mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
- pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
- sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
- orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
- orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
+ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
+ bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+ pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
+ sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
+ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
+ pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
+ sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+ orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm
/* This version doesn't need STRIDE_M, but is one instruction longer.
 It would, however, be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
- bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
- mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
- mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
- bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
- mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
- mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
- pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
- pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
- sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
- sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
- orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
- orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
+ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
+ bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+ mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
+ mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
+ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
+ pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+ sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
*/
.macro src_0565_8888_1pixel, reg
- bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
- and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
- mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
- mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
- orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
- orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
- pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
- sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
- orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb
+ and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000
+ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
+ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK\()\reg, WK\()\reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm
.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 16
- pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
- .elseif numbytes == 8
- pixld , 4, firstreg, SRC, unaligned_src
- .elseif numbytes == 4
- pixld , 2, firstreg, SRC, unaligned_src
+ .if \numbytes == 16
+ pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
+ .elseif \numbytes == 8
+ pixld , 4, \firstreg, SRC, \unaligned_src
+ .elseif \numbytes == 4
+ pixld , 2, \firstreg, SRC, \unaligned_src
.endif
.endm
.macro src_0565_8888_process_tail cond, numbytes, firstreg
- .if numbytes == 16
- src_0565_8888_2pixels firstreg, %(firstreg+1)
- src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
- src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .if \numbytes == 16
+ src_0565_8888_2pixels \firstreg, %(\firstreg+1)
+ src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+ src_0565_8888_2pixels \firstreg, %(\firstreg+1)
.else
- src_0565_8888_1pixel firstreg
+ src_0565_8888_1pixel \firstreg
.endif
.endm
@@ -311,23 +313,23 @@ generate_composite_function \
.endm
.macro src_x888_0565_1pixel s, d
- and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
- and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
- orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
- orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+ and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+ and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000
+ orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+ orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
/* Top 16 bits are discarded during the following STRH */
.endm
.macro src_x888_0565_2pixels slo, shi, d, tmp
- and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
- and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
- and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
- orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
- orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
- and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
- orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
- orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
- pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+ and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
+ and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
+ and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+ orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
+ orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
+ and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000
+ orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+ orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+ pkhbt WK\()\d, WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm
.macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
@@ -335,33 +337,33 @@ generate_composite_function \
WK5 .req STRIDE_M
WK6 .req WK3
WK7 .req ORIG_W
- .if numbytes == 16
+ .if \numbytes == 16
pixld , 16, 4, SRC, 0
src_x888_0565_2pixels 4, 5, 0, 0
pixld , 8, 4, SRC, 0
src_x888_0565_2pixels 6, 7, 1, 1
pixld , 8, 6, SRC, 0
.else
- pixld , numbytes*2, 4, SRC, 0
+ pixld , \numbytes*2, 4, SRC, 0
.endif
.endm
.macro src_x888_0565_process_tail cond, numbytes, firstreg
- .if numbytes == 16
+ .if \numbytes == 16
src_x888_0565_2pixels 4, 5, 2, 2
src_x888_0565_2pixels 6, 7, 3, 4
- .elseif numbytes == 8
+ .elseif \numbytes == 8
src_x888_0565_2pixels 4, 5, 1, 1
src_x888_0565_2pixels 6, 7, 2, 2
- .elseif numbytes == 4
+ .elseif \numbytes == 4
src_x888_0565_2pixels 4, 5, 1, 1
.else
src_x888_0565_1pixel 4, 1
.endif
- .if numbytes == 16
- pixst , numbytes, 0, DST
+ .if \numbytes == 16
+ pixst , \numbytes, 0, DST
.else
- pixst , numbytes, 1, DST
+ pixst , \numbytes, 1, DST
.endif
.unreq WK4
.unreq WK5
@@ -382,37 +384,37 @@ generate_composite_function \
/******************************************************************************/
.macro add_8_8_8pixels cond, dst1, dst2
- uqadd8&cond WK&dst1, WK&dst1, MASK
- uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
+ uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
+ uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
.endm
.macro add_8_8_4pixels cond, dst
- uqadd8&cond WK&dst, WK&dst, MASK
+ uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
.endm
.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
WK4 .req MASK
WK5 .req STRIDE_M
- .if numbytes == 16
- pixld cond, 8, 4, SRC, unaligned_src
- pixld cond, 16, firstreg, DST, 0
- add_8_8_8pixels cond, firstreg, %(firstreg+1)
- pixld cond, 8, 4, SRC, unaligned_src
+ .if \numbytes == 16
+ pixld \cond, 8, 4, SRC, \unaligned_src
+ pixld \cond, 16, \firstreg, DST, 0
+ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
+ pixld \cond, 8, 4, SRC, \unaligned_src
.else
- pixld cond, numbytes, 4, SRC, unaligned_src
- pixld cond, numbytes, firstreg, DST, 0
+ pixld \cond, \numbytes, 4, SRC, \unaligned_src
+ pixld \cond, \numbytes, \firstreg, DST, 0
.endif
.unreq WK4
.unreq WK5
.endm
.macro add_8_8_process_tail cond, numbytes, firstreg
- .if numbytes == 16
- add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
- add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .if \numbytes == 16
+ add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
.else
- add_8_8_4pixels cond, firstreg
+ add_8_8_4pixels \cond, \firstreg
.endif
.endm
@@ -441,8 +443,8 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req STRIDE_M
WK7 .req ORIG_W
- pixld , numbytes, %(4+firstreg), SRC, unaligned_src
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src
+ pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.unreq WK5
.unreq WK6
@@ -451,44 +453,44 @@ generate_composite_function \
.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
 /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
- teq WK&reg0, #0
- .if numbytes > 4
- teqeq WK&reg1, #0
- .if numbytes > 8
- teqeq WK&reg2, #0
- teqeq WK&reg3, #0
+ teq WK\()\reg0, #0
+ .if \numbytes > 4
+ teqeq WK\()\reg1, #0
+ .if \numbytes > 8
+ teqeq WK\()\reg2, #0
+ teqeq WK\()\reg3, #0
.endif
.endif
.endm
.macro over_8888_8888_prepare next
- mov WK&next, WK&next, lsr #24
+ mov WK\()\next, WK\()\next, lsr #24
.endm
.macro over_8888_8888_1pixel src, dst, offset, next
/* src = destination component multiplier */
- rsb WK&src, WK&src, #255
+ rsb WK\()\src, WK\()\src, #255
/* Split even/odd bytes of dst into SCRATCH/dst */
- uxtb16 SCRATCH, WK&dst
- uxtb16 WK&dst, WK&dst, ror #8
+ uxtb16 SCRATCH, WK\()\dst
+ uxtb16 WK\()\dst, WK\()\dst, ror #8
/* Multiply through, adding 0.5 to the upper byte of result for rounding */
- mla SCRATCH, SCRATCH, WK&src, MASK
- mla WK&dst, WK&dst, WK&src, MASK
+ mla SCRATCH, SCRATCH, WK\()\src, MASK
+ mla WK\()\dst, WK\()\dst, WK\()\src, MASK
/* Where we would have had a stall between the result of the first MLA and the shifter input,
* reload the complete source pixel */
- ldr WK&src, [SRC, #offset]
+ ldr WK\()\src, [SRC, #\offset]
/* Multiply by 257/256 to approximate 256/255 */
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
/* In this stall, start processing the next pixel */
- .if offset < -4
- mov WK&next, WK&next, lsr #24
+ .if \offset < -4
+ mov WK\()\next, WK\()\next, lsr #24
.endif
- uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+ uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
/* Recombine even/odd bytes of multiplied destination */
mov SCRATCH, SCRATCH, ror #8
- sel WK&dst, SCRATCH, WK&dst
+ sel WK\()\dst, SCRATCH, WK\()\dst
/* Saturated add of source to multiplied destination */
- uqadd8 WK&dst, WK&dst, WK&src
+ uqadd8 WK\()\dst, WK\()\dst, WK\()\src
.endm
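The macro above is an unrolled, stall-scheduled rendering of the usual
one-pixel OVER computation. In plain C, the same arithmetic, including the
+0x80 rounding constant kept in MASK and the 257/256 correction done by the
uxtab16 instructions, reads roughly as follows (names are illustrative):

    #include <stdint.h>

    static inline uint32_t
    saturated_add_8888 (uint32_t a, uint32_t b)    /* what uqadd8 does */
    {
        uint32_t out = 0;
        int      i;
        for (i = 0; i < 32; i += 8)
        {
            uint32_t s = ((a >> i) & 0xff) + ((b >> i) & 0xff);
            out |= (s > 0xff ? 0xff : s) << i;
        }
        return out;
    }

    static inline uint32_t
    over_8888_8888_1pixel_c (uint32_t src, uint32_t dst)
    {
        uint32_t ia = 255 - (src >> 24);               /* rsb ..., #255 */
        uint32_t rb = (dst & 0x00ff00ffu) * ia         /* even bytes    */
                      + 0x00800080u;                   /* mla ..., MASK */
        uint32_t ag = ((dst >> 8) & 0x00ff00ffu) * ia  /* odd bytes     */
                      + 0x00800080u;

        rb += (rb >> 8) & 0x00ff00ffu;   /* uxtab16 ..., ror #8:        */
        ag += (ag >> 8) & 0x00ff00ffu;   /* multiply by 257/256 ~256/255 */

        rb = (rb >> 8) & 0x00ff00ffu;    /* recombine even/odd bytes    */
        ag =  ag       & 0xff00ff00u;    /* (mov ror #8 + sel)          */

        return saturated_add_8888 (rb | ag, src);      /* uqadd8        */
    }

The transparent-source early-out performed by over_8888_8888_check_transparent
is a separate optimisation layered on top of this per-pixel arithmetic.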
.macro over_8888_8888_process_tail cond, numbytes, firstreg
@@ -496,17 +498,17 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req STRIDE_M
WK7 .req ORIG_W
- over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+ over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
beq 10f
- over_8888_8888_prepare %(4+firstreg)
- .set PROCESS_REG, firstreg
- .set PROCESS_OFF, -numbytes
- .rept numbytes / 4
+ over_8888_8888_prepare %(4+\firstreg)
+ .set PROCESS_REG, \firstreg
+ .set PROCESS_OFF, -\numbytes
+ .rept \numbytes / 4
over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.set PROCESS_OFF, PROCESS_OFF+4
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.unreq WK5
@@ -536,16 +538,16 @@ generate_composite_function \
*/
.macro mul_8888_8 word, byte, tmp, half
/* Split even/odd bytes of word apart */
- uxtb16 tmp, word
- uxtb16 word, word, ror #8
+ uxtb16 \tmp, \word
+ uxtb16 \word, \word, ror #8
/* Multiply bytes together with rounding, then by 257/256 */
- mla tmp, tmp, byte, half
- mla word, word, byte, half /* 1 stall follows */
- uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
- uxtab16 word, word, word, ror #8
+ mla \tmp, \tmp, \byte, \half
+ mla \word, \word, \byte, \half /* 1 stall follows */
+ uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */
+ uxtab16 \word, \word, \word, ror #8
/* Recombine bytes */
- mov tmp, tmp, ror #8
- sel word, tmp, word
+ mov \tmp, \tmp, ror #8
+ sel \word, \tmp, \word
.endm
/******************************************************************************/
@@ -567,8 +569,8 @@ generate_composite_function \
WK5 .req STRIDE_D
WK6 .req STRIDE_S
WK7 .req ORIG_W
- pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
+ pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.unreq WK5
.unreq WK6
@@ -576,10 +578,10 @@ generate_composite_function \
.endm
.macro over_8888_n_8888_1pixel src, dst
- mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
- sub WK7, WK6, WK&src, lsr #24
- mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
- uqadd8 WK&dst, WK&dst, WK&src
+ mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M
+ sub WK7, WK6, WK\()\src, lsr #24
+ mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M
+ uqadd8 WK\()\dst, WK\()\dst, WK\()\src
.endm
.macro over_8888_n_8888_process_tail cond, numbytes, firstreg
@@ -587,12 +589,12 @@ generate_composite_function \
WK5 .req STRIDE_D
WK6 .req STRIDE_S
WK7 .req ORIG_W
- over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+ over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
beq 10f
mov WK6, #255
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
- .if numbytes == 16 && PROCESS_REG == 2
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+ .if \numbytes == 16 && PROCESS_REG == 2
/* We're using WK6 and WK7 as temporaries, so half way through
* 4 pixels, reload the second two source pixels but this time
* into WK4 and WK5 */
@@ -601,7 +603,7 @@ generate_composite_function \
over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.unreq WK5
@@ -642,13 +644,13 @@ generate_composite_function \
.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
WK4 .req STRIDE_M
- pixld , numbytes/4, 4, MASK, unaligned_mask
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes/4, 4, MASK, \unaligned_mask
+ pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.endm
.macro over_n_8_8888_1pixel src, dst
- uxtb Y, WK4, ror #src*8
+ uxtb Y, WK4, ror #\src*8
/* Trailing part of multiplication of source */
mla SCRATCH, STRIDE_S, Y, STRIDE_D
mla Y, SRC, Y, STRIDE_D
@@ -659,20 +661,20 @@ generate_composite_function \
sub ORIG_W, ORIG_W, Y, lsr #24
sel Y, SCRATCH, Y
/* Then multiply the destination */
- mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
- uqadd8 WK&dst, WK&dst, Y
+ mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
+ uqadd8 WK\()\dst, WK\()\dst, Y
.endm
.macro over_n_8_8888_process_tail cond, numbytes, firstreg
WK4 .req STRIDE_M
teq WK4, #0
beq 10f
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
- over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+ over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.endm
@@ -705,14 +707,14 @@ generate_composite_function \
.endm
.macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, \firstreg, DST, 0
.endm
.macro over_reverse_n_8888_1pixel d, is_only
- teq WK&d, #0
+ teq WK\()\d, #0
beq 8f /* replace with source */
- bics ORIG_W, STRIDE_D, WK&d, lsr #24
- .if is_only == 1
+ bics ORIG_W, STRIDE_D, WK\()\d, lsr #24
+ .if \is_only == 1
beq 49f /* skip store */
.else
beq 9f /* write same value back */
@@ -723,36 +725,36 @@ generate_composite_function \
uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
mov SCRATCH, SCRATCH, ror #8
sel ORIG_W, SCRATCH, ORIG_W
- uqadd8 WK&d, WK&d, ORIG_W
+ uqadd8 WK\()\d, WK\()\d, ORIG_W
b 9f
-8: mov WK&d, SRC
+8: mov WK\()\d, SRC
9:
.endm
.macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
- over_reverse_n_8888_1pixel reg1, 1
+ .if \numbytes == 4
+ over_reverse_n_8888_1pixel \reg1, 1
.else
- and SCRATCH, WK&reg1, WK&reg2
- .if numbytes == 16
- and SCRATCH, SCRATCH, WK&reg3
- and SCRATCH, SCRATCH, WK&reg4
+ and SCRATCH, WK\()\reg1, WK\()\reg2
+ .if \numbytes == 16
+ and SCRATCH, SCRATCH, WK\()\reg3
+ and SCRATCH, SCRATCH, WK\()\reg4
.endif
mvns SCRATCH, SCRATCH, asr #24
beq 49f /* skip store if all opaque */
- over_reverse_n_8888_1pixel reg1, 0
- over_reverse_n_8888_1pixel reg2, 0
- .if numbytes == 16
- over_reverse_n_8888_1pixel reg3, 0
- over_reverse_n_8888_1pixel reg4, 0
+ over_reverse_n_8888_1pixel \reg1, 0
+ over_reverse_n_8888_1pixel \reg2, 0
+ .if \numbytes == 16
+ over_reverse_n_8888_1pixel \reg3, 0
+ over_reverse_n_8888_1pixel \reg4, 0
.endif
.endif
- pixst , numbytes, reg1, DST
+ pixst , \numbytes, \reg1, DST
49:
.endm
.macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
- over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+ over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm
generate_composite_function \
@@ -794,20 +796,20 @@ generate_composite_function \
.macro over_white_8888_8888_ca_combine m, d
uxtb16 TMP1, TMP0 /* rb_notmask */
- uxtb16 TMP2, d /* rb_dest; 1 stall follows */
+ uxtb16 TMP2, \d /* rb_dest; 1 stall follows */
smlatt TMP3, TMP2, TMP1, HALF /* red */
smlabb TMP2, TMP2, TMP1, HALF /* blue */
uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
- uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
- smlatt d, TMP1, TMP0, HALF /* alpha */
+ uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */
+ smlatt \d, TMP1, TMP0, HALF /* alpha */
smlabb TMP1, TMP1, TMP0, HALF /* green */
pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
- pkhbt TMP1, TMP1, d, lsl #16 /* ag */
+ pkhbt TMP1, TMP1, \d, lsl #16 /* ag */
uxtab16 TMP0, TMP0, TMP0, ror #8
uxtab16 TMP1, TMP1, TMP1, ror #8
mov TMP0, TMP0, ror #8
- sel d, TMP0, TMP1
- uqadd8 d, d, m /* d is a late result */
+ sel \d, TMP0, TMP1
+ uqadd8 \d, \d, \m /* d is a late result */
.endm
.macro over_white_8888_8888_ca_1pixel_head
@@ -853,10 +855,10 @@ generate_composite_function \
.endm
.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 4
+ .if \numbytes == 4
over_white_8888_8888_ca_1pixel_head
.else
- .if numbytes == 16
+ .if \numbytes == 16
over_white_8888_8888_ca_2pixels_head
over_white_8888_8888_ca_2pixels_tail
.endif
@@ -865,7 +867,7 @@ generate_composite_function \
.endm
.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
- .if numbytes == 4
+ .if \numbytes == 4
over_white_8888_8888_ca_1pixel_tail
.else
over_white_8888_8888_ca_2pixels_tail
@@ -1004,7 +1006,7 @@ generate_composite_function \
.endm
.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .rept (numbytes / 4) - 1
+ .rept (\numbytes / 4) - 1
over_n_8888_8888_ca_1pixel_head
over_n_8888_8888_ca_1pixel_tail
.endr
@@ -1020,7 +1022,7 @@ pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
cmp ip, #-1
beq pixman_composite_over_white_8888_8888_ca_asm_armv6
/* else drop through... */
- .endfunc
+pixman_end_asm_function
generate_composite_function \
pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
@@ -1045,84 +1047,84 @@ generate_composite_function \
.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
ldrb ORIG_W, [SRC], #4
- .if numbytes >= 8
- ldrb WK&reg1, [SRC], #4
- .if numbytes == 16
- ldrb WK&reg2, [SRC], #4
- ldrb WK&reg3, [SRC], #4
+ .if \numbytes >= 8
+ ldrb WK\()\reg1, [SRC], #4
+ .if \numbytes == 16
+ ldrb WK\()\reg2, [SRC], #4
+ ldrb WK\()\reg3, [SRC], #4
.endif
.endif
- add DST, DST, #numbytes
+ add DST, DST, #\numbytes
.endm
.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+ in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
.endm
.macro in_reverse_8888_8888_1pixel s, d, offset, is_only
- .if is_only != 1
- movs s, ORIG_W
- .if offset != 0
- ldrb ORIG_W, [SRC, #offset]
+ .if \is_only != 1
+ movs \s, ORIG_W
+ .if \offset != 0
+ ldrb ORIG_W, [SRC, #\offset]
.endif
beq 01f
teq STRIDE_M, #0xFF
beq 02f
.endif
- uxtb16 SCRATCH, d /* rb_dest */
- uxtb16 d, d, ror #8 /* ag_dest */
- mla SCRATCH, SCRATCH, s, MASK
- mla d, d, s, MASK
+ uxtb16 SCRATCH, \d /* rb_dest */
+ uxtb16 \d, \d, ror #8 /* ag_dest */
+ mla SCRATCH, SCRATCH, \s, MASK
+ mla \d, \d, \s, MASK
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
- uxtab16 d, d, d, ror #8
+ uxtab16 \d, \d, \d, ror #8
mov SCRATCH, SCRATCH, ror #8
- sel d, SCRATCH, d
+ sel \d, SCRATCH, \d
b 02f
- .if offset == 0
+ .if \offset == 0
48: /* Last mov d,#0 of the set - used as part of shortcut for
* source values all 0 */
.endif
-01: mov d, #0
+01: mov \d, #0
02:
.endm
.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
+ .if \numbytes == 4
teq ORIG_W, ORIG_W, asr #32
- ldrne WK&reg1, [DST, #-4]
- .elseif numbytes == 8
- teq ORIG_W, WK&reg1
+ ldrne WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+ teq ORIG_W, WK\()\reg1
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
- ldmnedb DST, {WK&reg1-WK&reg2}
+ ldmdbne DST, {WK\()\reg1-WK\()\reg2}
.else
- teq ORIG_W, WK&reg1
- teqeq ORIG_W, WK&reg2
- teqeq ORIG_W, WK&reg3
+ teq ORIG_W, WK\()\reg1
+ teqeq ORIG_W, WK\()\reg2
+ teqeq ORIG_W, WK\()\reg3
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
- ldmnedb DST, {WK&reg1-WK&reg4}
+ ldmdbne DST, {WK\()\reg1-WK\()\reg4}
.endif
cmnne DST, #0 /* clear C if NE */
bcs 49f /* no writes to dest if source all -1 */
beq 48f /* set dest to all 0 if source all 0 */
- .if numbytes == 4
- in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
- str WK&reg1, [DST, #-4]
- .elseif numbytes == 8
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
- stmdb DST, {WK&reg1-WK&reg2}
+ .if \numbytes == 4
+ in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
+ str WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
+ stmdb DST, {WK\()\reg1-WK\()\reg2}
.else
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
- stmdb DST, {WK&reg1-WK&reg4}
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
+ stmdb DST, {WK\()\reg1-WK\()\reg4}
.endif
49:
.endm
.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
- in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+ in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm
generate_composite_function \
@@ -1136,3 +1138,44 @@ generate_composite_function \
in_reverse_8888_8888_process_tail
/******************************************************************************/
+
+.macro over_n_8888_init
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ /* Hold loop invariant in MASK */
+ ldr MASK, =0x00800080
+ /* Hold multiplier for destination in STRIDE_M */
+ mov STRIDE_M, #255
+ sub STRIDE_M, STRIDE_M, SRC, lsr #24
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, MASK, MASK
+.endm
+
+.macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld , \numbytes, \firstreg, DST, 0
+.endm
+
+.macro over_n_8888_1pixel dst
+ mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK
+ uqadd8 WK\()\dst, WK\()\dst, SRC
+.endm
+
+.macro over_n_8888_process_tail cond, numbytes, firstreg
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+ over_n_8888_1pixel %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+ pixst , \numbytes, \firstreg, DST
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_asm_armv6, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE, \
+ 2, /* prefetch distance */ \
+ over_n_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ over_n_8888_process_head, \
+ over_n_8888_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 8de060a..5ec19e0 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -112,64 +112,64 @@
*/
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
- .if numbytes == 16
- .if unaligned == 1
- op&r&cond WK&reg0, [base], #4
- op&r&cond WK&reg1, [base], #4
- op&r&cond WK&reg2, [base], #4
- op&r&cond WK&reg3, [base], #4
+ .if \numbytes == 16
+ .if \unaligned == 1
+ \op\()r\()\cond WK\()\reg0, [\base], #4
+ \op\()r\()\cond WK\()\reg1, [\base], #4
+ \op\()r\()\cond WK\()\reg2, [\base], #4
+ \op\()r\()\cond WK\()\reg3, [\base], #4
.else
- op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
.endif
- .elseif numbytes == 8
- .if unaligned == 1
- op&r&cond WK&reg0, [base], #4
- op&r&cond WK&reg1, [base], #4
+ .elseif \numbytes == 8
+ .if \unaligned == 1
+ \op\()r\()\cond WK\()\reg0, [\base], #4
+ \op\()r\()\cond WK\()\reg1, [\base], #4
.else
- op&m&cond&ia base!, {WK&reg0,WK&reg1}
- .endif
- .elseif numbytes == 4
- op&r&cond WK&reg0, [base], #4
- .elseif numbytes == 2
- op&r&cond&h WK&reg0, [base], #2
- .elseif numbytes == 1
- op&r&cond&b WK&reg0, [base], #1
+ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
+ .endif
+ .elseif \numbytes == 4
+ \op\()r\()\cond WK\()\reg0, [\base], #4
+ .elseif \numbytes == 2
+ \op\()rh\()\cond WK\()\reg0, [\base], #2
+ .elseif \numbytes == 1
+ \op\()rb\()\cond WK\()\reg0, [\base], #1
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
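/* Illustrative sketch, not part of the patch: how the \() separators expand.
 * With the hypothetical invocation
 *     pixldst ld, ne, 4, 0, 1, 2, 3, SRC
 * the line "\op\()r\()\cond WK\()\reg0, [\base], #4" becomes
 *     ldrne WK0, [SRC], #4
 * i.e. \() glues substituted arguments to the surrounding text without
 * inserting a space, which clang's integrated assembler requires in place
 * of the GNU-as-only "op&r&cond" concatenation used before.
 */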
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
- .if numbytes == 16
- stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
- .elseif numbytes == 8
- stm&cond&db base, {WK&reg0,WK&reg1}
- .elseif numbytes == 4
- str&cond WK&reg0, [base, #-4]
- .elseif numbytes == 2
- str&cond&h WK&reg0, [base, #-2]
- .elseif numbytes == 1
- str&cond&b WK&reg0, [base, #-1]
+ .if \numbytes == 16
+ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
+ .elseif \numbytes == 8
+ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
+ .elseif \numbytes == 4
+ str\()\cond WK\()\reg0, [\base, #-4]
+ .elseif \numbytes == 2
+ strh\()\cond WK\()\reg0, [\base, #-2]
+ .elseif \numbytes == 1
+ strb\()\cond WK\()\reg0, [\base, #-1]
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
.macro pixld cond, numbytes, firstreg, base, unaligned
- pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+ pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
.endm
.macro pixst cond, numbytes, firstreg, base
.if (flags) & FLAG_DST_READWRITE
- pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
.else
- pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
.endif
.endm
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
- a x
+ \a \x
.endif
.endm
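/* Illustration, not part of the patch: PF passes its whole tail through as
 * one instruction when standard prefetches are selected, so
 *     PF pld, [SRC, #32]
 * assembles as "pld [SRC, #32]" for PREFETCH_TYPE_STANDARD and to nothing
 * otherwise; the a/x:vararg split merely keeps the mnemonic separate from
 * its operands.
 */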
@@ -179,11 +179,11 @@
* between 0 and prefetch_distance (inclusive) cache lines ahead so there
* are no gaps when the inner loop starts.
*/
- .if bpp > 0
- PF bic, ptr, base, #31
+ .if \bpp > 0
+ PF bic, \ptr, \base, #31
.set OFFSET, 0
.rept prefetch_distance+1
- PF pld, [ptr, #OFFSET]
+ PF pld, [\ptr, #OFFSET]
.set OFFSET, OFFSET+32
.endr
.endif
@@ -201,42 +201,42 @@
* and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
* possible when there are 4 src bytes for every 1 dst byte).
*/
- .if bpp > 0
- .ifc base,DST
+ .if \bpp > 0
+ .ifc \base,DST
/* The test can be simplified further when preloading the destination */
- PF tst, base, #16
+ PF tst, \base, #16
PF beq, 61f
.else
- .if bpp/dst_w_bpp == 4
- PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
+ .if \bpp/dst_w_bpp == 4
+ PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
PF and, SCRATCH, SCRATCH, #31
- PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
- PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
- PF movs, SCRATCH, SCRATCH, #32-6 /* so this sets NC / nc / Nc */
+ PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
+ PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
+ PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
PF bcs, 61f
PF bpl, 60f
- PF pld, [ptr, #32*(prefetch_distance+2)]
+ PF pld, [\ptr, #32*(prefetch_distance+2)]
.else
- PF mov, SCRATCH, base, lsl #32-5
- PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
- PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+ PF mov, SCRATCH, \base, lsl #32-5
+ PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
+ PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
PF bls, 61f
.endif
.endif
-60: PF pld, [ptr, #32*(prefetch_distance+1)]
+60: PF pld, [\ptr, #32*(prefetch_distance+1)]
61:
.endif
.endm
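/* Illustration, not part of the patch: the "lsl #32-5" trick above. Shifting
 * an address left by 27 bits keeps only its offset within a 32-byte cache
 * line, scaled to the top of the register. Adding two such values makes the
 * carry flag report a line-boundary crossing and the high bits count whole
 * lines, so the preload code can classify how many extra cache lines a
 * partially aligned run touches without an explicit divide.
 */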
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
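/* Illustration, not part of the patch: INDEX & ~(INDEX+1) isolates the run
 * of trailing 1 bits in INDEX, so ANDing with SIZE/2 tests whether that run
 * covers bit log2(SIZE)-1. The macro is therefore true exactly when
 * INDEX % SIZE == SIZE-1; e.g. for SIZE=4 it fires at INDEX = 3, 7, 11, ...,
 * marking the last STM of each prefetch group.
 */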
.macro preload_middle bpp, base, scratch_holds_offset
- .if bpp > 0
+ .if \bpp > 0
/* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
- .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
- .if scratch_holds_offset
- PF pld, [base, SCRATCH]
+ .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
+ .if \scratch_holds_offset
+ PF pld, [\base, SCRATCH]
.else
- PF bic, SCRATCH, base, #31
+ PF bic, SCRATCH, \base, #31
PF pld, [SCRATCH, #32*prefetch_distance]
.endif
.endif
@@ -244,28 +244,28 @@
.endm
.macro preload_trailing bpp, bpp_shift, base
- .if bpp > 0
- .if bpp*pix_per_block > 256
+ .if \bpp > 0
+ .if \bpp*pix_per_block > 256
/* Calculations are more complex if more than one fetch per block */
- PF and, WK1, base, #31
- PF add, WK1, WK1, WK0, lsl #bpp_shift
- PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
- PF bic, SCRATCH, base, #31
+ PF and, WK1, \base, #31
+ PF add, WK1, WK1, WK0, lsl #\bpp_shift
+ PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
+ PF bic, SCRATCH, \base, #31
80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
PF add, SCRATCH, SCRATCH, #32
PF subs, WK1, WK1, #32
PF bhi, 80b
.else
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
- PF mov, SCRATCH, base, lsl #32-5
- PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
- PF adceqs, SCRATCH, SCRATCH, #0
+ PF mov, SCRATCH, \base, lsl #32-5
+ PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
+ PF adcseq, SCRATCH, SCRATCH, #0
/* The instruction above has two effects: ensures Z is only
* set if C was clear (so Z indicates that both shifted quantities
* were 0), and clears C if Z was set (so C indicates that the sum
* of the shifted quantities was strictly greater than 32) */
PF beq, 82f
- PF bic, SCRATCH, base, #31
+ PF bic, SCRATCH, \base, #31
PF bcc, 81f
PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
@@ -288,12 +288,12 @@
* "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
* "base" - base address register of channel to preload (SRC, MASK or DST)
*/
- .if bpp > 0
- .if narrow_case && (bpp <= dst_w_bpp)
+ .if \bpp > 0
+ .if \narrow_case && (\bpp <= dst_w_bpp)
/* In these cases, each line for each channel is in either 1 or 2 cache lines */
- PF bic, WK0, base, #31
+ PF bic, WK0, \base, #31
PF pld, [WK0]
- PF add, WK1, base, X, LSL #bpp_shift
+ PF add, WK1, \base, X, LSL #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
@@ -301,9 +301,9 @@
PF pld, [WK1]
90:
.else
- PF bic, WK0, base, #31
+ PF bic, WK0, \base, #31
PF pld, [WK0]
- PF add, WK1, base, X, lsl #bpp_shift
+ PF add, WK1, \base, X, lsl #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
@@ -319,56 +319,56 @@
.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
- process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
- .if decrementx
- sub&cond X, X, #8*numbytes/dst_w_bpp
+ \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
+ .if \decrementx
+ sub\()\cond X, X, #8*\numbytes/dst_w_bpp
.endif
- process_tail cond, numbytes, firstreg
+ \process_tail \cond, \numbytes, \firstreg
.if !((flags) & FLAG_PROCESS_DOES_STORE)
- pixst cond, numbytes, firstreg, DST
+ pixst \cond, \numbytes, \firstreg, DST
.endif
.endm
.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_BRANCH_OVER
- .ifc cond,mi
+ .ifc \cond,mi
bpl 100f
.endif
- .ifc cond,cs
+ .ifc \cond,cs
bcc 100f
.endif
- .ifc cond,ne
+ .ifc \cond,ne
beq 100f
.endif
- conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
100:
.else
- conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
.endif
.endm
.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
.if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
/* Can't interleave reads and writes */
- test
- conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
+ \test
+ conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
.if (flags) & FLAG_PROCESS_CORRUPTS_PSR
- test
+ \test
.endif
- conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
+ conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
.else
/* Can interleave reads and writes for better scheduling */
- test
- process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
- process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
- .if decrementx
- sub&cond1 X, X, #8*numbytes1/dst_w_bpp
- sub&cond2 X, X, #8*numbytes2/dst_w_bpp
- .endif
- process_tail cond1, numbytes1, firstreg1
- process_tail cond2, numbytes2, firstreg2
- pixst cond1, numbytes1, firstreg1, DST
- pixst cond2, numbytes2, firstreg2, DST
+ \test
+ \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
+ \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
+ .if \decrementx
+ sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
+ sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
+ .endif
+ \process_tail \cond1, \numbytes1, \firstreg1
+ \process_tail \cond2, \numbytes2, \firstreg2
+ pixst \cond1, \numbytes1, \firstreg1, DST
+ pixst \cond2, \numbytes2, \firstreg2, DST
.endif
.endm
@@ -400,12 +400,12 @@
.endif
/* Use unaligned loads in all cases for simplicity */
.if dst_w_bpp == 8
- conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
+ conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
.elseif dst_w_bpp == 16
test_bits_1_0_ptr
- conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+ conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
.endif
- conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
.endif
@@ -424,12 +424,12 @@
.endm
.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
- conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
+ conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
.if dst_w_bpp == 16
test_bits_1_0_pix
- conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
+ conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
.elseif dst_w_bpp == 8
- conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
+ conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
.endif
.endm
@@ -438,7 +438,7 @@
110:
.set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
.rept pix_per_block*dst_w_bpp/128
- process_head , 16, 0, unaligned_src, unaligned_mask, 1
+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle src_bpp, SRC, 1
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -453,9 +453,9 @@
* preloads for, to achieve staggered prefetches for multiple channels, because there are
* always two STMs per prefetch, so there is always an opposite STM on which to put the
* preload. Note, no need to BIC the base register here */
- PF pld, [DST, #32*prefetch_distance - dst_alignment]
+ PF pld, [DST, #32*prefetch_distance - \dst_alignment]
.endif
- process_tail , 16, 0
+ \process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
@@ -470,11 +470,11 @@
.if dst_r_bpp > 0
tst DST, #16
bne 111f
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
@@ -487,13 +487,13 @@
.endif
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
- medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+ medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
.endm
.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
- process_head , 16, 0, unaligned_src, unaligned_mask, 0
- process_tail , 16, 0
+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0
+ \process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
@@ -501,16 +501,16 @@
bhs 120b
/* Trailing pixels */
tst X, #128/dst_w_bpp - 1
- beq exit_label
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ beq \exit_label
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm
.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
tst X, #16*8/dst_w_bpp
- conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+ conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
/* Trailing pixels */
/* In the narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm
.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
@@ -523,37 +523,37 @@
tst SRC, #3
bne 140f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
140:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
.endif
.if mask_bpp == 8 || mask_bpp == 16
- b exit_label
+ b \exit_label
141:
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 142f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
142:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
.endif
.endif
.endm
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
- .if vars_spilled
+ .if \vars_spilled
/* Sadly, GAS doesn't seem to have an equivalent of the DCI directive */
/* This is ldmia sp,{} */
.word 0xE89D0000 | LINE_SAVED_REGS
.endif
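/* Illustration, not part of the patch: decoding the hand-assembled word.
 * 0xE89D0000 is cond=AL (0xE), LDM increment-after without writeback,
 * Rn=sp (0xD), with the low 16 bits as the register list, so ORing in
 * LINE_SAVED_REGS yields "ldmia sp, {saved regs}"; the 0xE92D0000 used for
 * spilling elsewhere in this file is the matching "stmdb sp!, {...}".
 */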
subs Y, Y, #1
- .if vars_spilled
+ .if \vars_spilled
.if (LINE_SAVED_REGS) & (1<<1)
str Y, [sp]
.endif
@@ -565,18 +565,18 @@
.if mask_bpp > 0
add MASK, MASK, STRIDE_M
.endif
- .if restore_x
+ .if \restore_x
mov X, ORIG_W
.endif
- bhs loop_label
- .ifc "last_one",""
- .if vars_spilled
+ bhs \loop_label
+ .ifc "\last_one",""
+ .if \vars_spilled
b 197f
.else
b 198f
.endif
.else
- .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+ .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
b 198f
.endif
.endif
@@ -596,17 +596,17 @@
process_tail, \
process_inner_loop
- pixman_asm_function fname
+ pixman_asm_function \fname
/*
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set flags, flags_
- .set prefetch_distance, prefetch_distance_
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set flags, \flags_
+ .set prefetch_distance, \prefetch_distance_
/*
* Select prefetch type for this function.
@@ -732,7 +732,7 @@
sub Y, Y, #1
#endif
- init
+ \init
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
/* Reserve a word in which to store X during leading pixels */
@@ -773,7 +773,7 @@
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
151: /* New line */
- newline
+ \newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -790,7 +790,7 @@
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
.endif
- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail
154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -800,10 +800,10 @@
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.endif
- .ifc "process_inner_loop",""
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+ .ifc "\process_inner_loop",""
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
.else
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
.endif
157: /* Check for another line */
@@ -825,7 +825,7 @@
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
- newline
+ \newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -837,10 +837,10 @@
beq 164f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail
164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
- switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f
167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
@@ -856,7 +856,7 @@
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
171: /* New line */
- newline
+ \newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -868,8 +868,8 @@
beq 174f
172: subs X, X, #1
blo 177f
- process_head , 1, 0, 1, 1, 0
- process_tail , 1, 0
+ \process_head , 1, 0, 1, 1, 0
+ \process_tail , 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 1, 0, DST
.endif
@@ -880,15 +880,15 @@
beq 174f
subs X, X, #1
blo 177f
- process_head , 2, 0, 1, 1, 0
- process_tail , 2, 0
+ \process_head , 2, 0, 1, 1, 0
+ \process_tail , 2, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 2, 0, DST
.endif
.endif
174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
- switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
@@ -908,7 +908,7 @@
add sp, sp, #4
.endif
- cleanup
+ \cleanup
#ifdef DEBUG_PARAMS
add sp, sp, #9*4 /* junk the debug copy of arguments */
@@ -932,13 +932,13 @@
.unreq WK3
.unreq SCRATCH
.unreq ORIG_W
- .endfunc
+ pixman_end_asm_function
.endm
.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
- .irp SAVED_REG,x
+ .irp SAVED_REG,\x
.ifc "SAVED_REG","Y"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index fa1ab5c..40f3a97 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -24,7 +24,7 @@
*
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -51,6 +51,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, armv6, over_n_8888,
+ uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
uint32_t, 1)
@@ -240,6 +242,10 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, armv6_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, armv6_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8b8g8r8, armv6_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8b8g8r8, armv6_composite_over_n_8888),
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, armv6_composite_over_reverse_n_8888),
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, armv6_composite_over_reverse_n_8888),
@@ -260,15 +266,15 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
{ PIXMAN_OP_NONE },
};
diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
index 23374e4..288172b 100644
--- a/pixman/pixman-arm.c
+++ b/pixman/pixman-arm.c
@@ -20,7 +20,7 @@
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -176,6 +176,31 @@ detect_cpu_features (void)
return features;
}
+#elif defined (_3DS) /* 3DS homebrew (devkitARM) */
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+ arm_cpu_features_t features = 0;
+
+ features |= ARM_V6;
+
+ return features;
+}
+
+#elif defined (PSP2) || defined (__SWITCH__)
+/* Vita (VitaSDK) or Switch (devkitA64) homebrew */
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+ arm_cpu_features_t features = 0;
+
+ features |= ARM_NEON;
+
+ return features;
+}
+
#else /* Unknown */
static arm_cpu_features_t
@@ -221,5 +246,11 @@ _pixman_arm_get_implementations (pixman_implementation_t *imp)
imp = _pixman_implementation_create_arm_neon (imp);
#endif
+#ifdef USE_ARM_A64_NEON
+ /* NEON is part of AArch64 */
+ if (!_pixman_disabled ("arm-neon"))
+ imp = _pixman_implementation_create_arm_neon (imp);
+#endif
+
return imp;
}
diff --git a/pixman/pixman-arma64-neon-asm-bilinear.S b/pixman/pixman-arma64-neon-asm-bilinear.S
new file mode 100644
index 0000000..7303bdc
--- /dev/null
+++ b/pixman/pixman-arma64-neon-asm-bilinear.S
@@ -0,0 +1,1276 @@
+/*
+ * Copyright © 2011 SCore Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ * Author: Taekyun Kim (tkq.kim@samsung.com)
+ */
+
+/*
+ * This file contains scaled bilinear scanline functions implemented
+ * using Siarhei's older bilinear macro template.
+ *
+ * << General scanline function procedures >>
+ * 1. bilinearly interpolate source pixels
+ * 2. load mask pixels
+ * 3. load destination pixels
+ * 4. duplicate mask to fill whole register
+ * 5. interleave source & destination pixels
+ * 6. apply mask to source pixels
+ * 7. combine source & destination pixels
+ * 8. deinterleave final result
+ * 9. store destination pixels
+ *
+ * All registers with a single number (e.g. src0, tmp0) are 64-bit registers.
+ * Registers with double numbers (src01, dst01) are 128-bit registers.
+ * All temp registers can be used freely outside the code block.
+ * Assume that the symbols (register aliases created with .req) OUT and MASK
+ * are defined by the callers of these macro blocks.
+ *
+ * Remarks
+ * There can be lots of pipeline stalls inside a code block and between code
+ * blocks. Further optimizations will be done by new macro templates using
+ * the head/tail_head/tail scheme.
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined (__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.arch armv8-a
+.altmacro
+.p2align 2
+
+#include "pixman-private.h"
+#include "pixman-arm-asm.h"
+#include "pixman-arma64-neon-asm.h"
+
+/*
+ * Bilinear macros from pixman-arm-neon-asm.S
+ */
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ ld1 {\()\reg1\().2s}, [TMP1], STRIDE
+ ld1 {\()\reg2\().2s}, [TMP1]
+.endm
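+
+/*
+ * Illustration, not part of the patch: X/UX form a 16.16 fixed-point
+ * horizontal coordinate. "asr WTMP1, X, #16" extracts the integer pixel
+ * index, "add X, X, UX" steps by the scale increment (e.g. UX = 0x8000
+ * for 2x upscaling), and the two ld1 loads fetch the same pixel pair from
+ * adjacent scanlines at TMP1 and TMP1+STRIDE for vertical interpolation.
+ */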
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\reg2\().s}[1], [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
+.endm
+
+.macro vzip reg1, reg2
+ zip1 v24.8b, \reg1, \reg2
+ zip2 \reg2, \reg1, \reg2
+ mov \reg1, v24.8b
+.endm
+
+.macro vuzp reg1, reg2
+ uzp1 v24.8b, \reg1, \reg2
+ uzp2 \reg2, \reg1, \reg2
+ mov \reg1, v24.8b
+.endm
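+
+/*
+ * Illustration, not part of the patch: AArch64 dropped the two-register
+ * in-place VZIP/VUZP of 32-bit NEON, so these helpers rebuild it from
+ * zip1/zip2 (uzp1/uzp2) through v24. Note that v24 is clobbered, so
+ * callers must treat it as a scratch register.
+ */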
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
+ ld1 {\()\acc2\().s}[1], [TMP1]
+ ld1 {\()\acc2\().s}[3], [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip \()\reg1\().8b, \()\reg3\().8b
+ vzip \()\reg2\().8b, \()\reg4\().8b
+ vzip \()\reg3\().8b, \()\reg4\().8b
+ vzip \()\reg1\().8b, \()\reg2\().8b
+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
+ ld1 {\()\xacc2\().s}[1], [TMP1]
+ ld1 {\()\xacc2\().s}[3], [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
+ vzip \()\xreg1\().8b, \()\xreg3\().8b
+ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
+ vzip \()\xreg2\().8b, \()\xreg4\().8b
+ ld1 {\()\yacc2\().s}[1], [TMP1]
+ vzip \()\xreg3\().8b, \()\xreg4\().8b
+ ld1 {\()\yacc2\().s}[3], [TMP2]
+ vzip \()\xreg1\().8b, \()\xreg2\().8b
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
+ vzip \()\yreg1\().8b, \()\yreg3\().8b
+ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
+ vzip \()\yreg2\().8b, \()\yreg4\().8b
+ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
+ vzip \()\yreg3\().8b, \()\yreg4\().8b
+ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
+ vzip \()\yreg1\().8b, \()\yreg2\().8b
+ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
+ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
+ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
+ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if \numpix == 4
+ st1 {v0.2s, v1.2s}, [OUT], #16
+.elseif \numpix == 2
+ st1 {v0.2s}, [OUT], #8
+.elseif \numpix == 1
+ st1 {v0.s}[0], [OUT], #4
+.else
+ .error "bilinear_store_8888 \numpix is unsupported"
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp v0.8b, v1.8b
+ vuzp v2.8b, v3.8b
+ vuzp v1.8b, v3.8b
+ vuzp v0.8b, v2.8b
+ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
+.if \numpix == 4
+ st1 {v1.4h}, [OUT], #8
+.elseif \numpix == 2
+ st1 {v1.s}[0], [OUT], #4
+.elseif \numpix == 1
+ st1 {v1.h}[0], [OUT], #2
+.else
+ .error "bilinear_store_0565 \numpix is unsupported"
+.endif
+.endm
+
+
+/*
+ * Macros for loading mask pixels into register 'mask'.
+ * The dup must be done somewhere else.
+ */
+.macro bilinear_load_mask_x numpix, mask
+.endm
+
+.macro bilinear_load_mask_8 numpix, mask
+.if \numpix == 4
+ ld1 {\()\mask\().s}[0], [MASK], #4
+.elseif \numpix == 2
+ ld1 {\()\mask\().h}[0], [MASK], #2
+.elseif \numpix == 1
+ ld1 {\()\mask\().b}[0], [MASK], #1
+.else
+ .error "bilinear_load_mask_8 \numpix is unsupported"
+.endif
+ prfum PREFETCH_MODE, [MASK, #(prefetch_offset)]
+.endm
+
+.macro bilinear_load_mask mask_fmt, numpix, mask
+ bilinear_load_mask_\mask_fmt \numpix, \mask
+.endm
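+
+/*
+ * Illustration, not part of the patch: this dispatch-by-name pattern picks
+ * the implementation at assembly time, e.g.
+ *     bilinear_load_mask 8, 2, v4
+ * expands to "bilinear_load_mask_8 2, v4", while mask_fmt=x selects the
+ * empty bilinear_load_mask_x and costs nothing in the generated code.
+ */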
+
+
+/*
+ * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
+ * Interleaving should be done somewhere else.
+ */
+.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.if \numpix == 4
+ ld1 {\()\dst0\().2s, \()\dst1\().2s}, [OUT]
+.elseif \numpix == 2
+ ld1 {\()\dst0\().2s}, [OUT]
+.elseif \numpix == 1
+ ld1 {\()\dst0\().s}[0], [OUT]
+.else
+ .error "bilinear_load_dst_8888 \numpix is unsupported"
+.endif
+ mov \()\dst01\().d[0], \()\dst0\().d[0]
+ mov \()\dst01\().d[1], \()\dst1\().d[0]
+ prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
+.endm
+
+.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
+ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
+.endm
+
+/*
+ * Macros for duplicating a partially loaded mask to fill an entire register.
+ * We will apply the mask to interleaved source pixels, that is
+ * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * so we need to duplicate the loaded mask across the whole register.
+ *
+ * For the two-pixel case:
+ * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * We can do some optimizations for this, including the last-pixel cases.
+ */
+.macro bilinear_duplicate_mask_x numpix, mask
+.endm
+
+.macro bilinear_duplicate_mask_8 numpix, mask
+.if \numpix == 4
+ dup \()\mask\().2s, \()\mask\().s[0]
+.elseif \numpix == 2
+ dup \()\mask\().4h, \()\mask\().h[0]
+.elseif \numpix == 1
+ dup \()\mask\().8b, \()\mask\().b[0]
+.else
+ .error "bilinear_duplicate_mask_8 \numpix is unsupported"
+.endif
+.endm
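+
+/*
+ * Illustration, not part of the patch: for four pixels the mask bytes
+ * (m0, m1, m2, m3) sit in lane s[0], and "dup v4.2s, v4.s[0]" repeats that
+ * 32-bit lane so the register holds m0..m3 twice, lining up with the
+ * interleaved (r0..r3, g0..g3) / (b0..b3, a0..a3) source layout described
+ * above.
+ */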
+
+.macro bilinear_duplicate_mask mask_fmt, numpix, mask
+ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
+.endm
+
+/*
+ * Macros for interleaving src and dst pixels into rrrr gggg bbbb aaaa form.
+ * Interleaving should be done when a mask is enabled or the operator is 'over'.
+ */
+.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
+ vuzp \()\src0\().8b, \()\src1\().8b
+ vuzp \()\dst0\().8b, \()\dst1\().8b
+ vuzp \()\src0\().8b, \()\src1\().8b
+ vuzp \()\dst0\().8b, \()\dst1\().8b
+ mov \()\src01\().d[1], \()\src1\().d[0]
+ mov \()\src01\().d[0], \()\src0\().d[0]
+ mov \()\dst01\().d[1], \()\dst1\().d[0]
+ mov \()\dst01\().d[0], \()\dst0\().d[0]
+.endm
+
+.macro bilinear_interleave_src_dst_x_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+.macro bilinear_interleave_src_dst_x_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst_x_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst_8_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst_8_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst_8_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst \
+ mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+
+/*
+ * Macros for applying masks to src pixels (see the combine_mask_u() function).
+ * src and dst should be in interleaved form.
+ * The mask register should be in the form (m0, m1, m2, m3).
+ */
+.macro bilinear_apply_mask_to_src_x \
+ numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+.endm
+
+.macro bilinear_apply_mask_to_src_8 \
+ numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+
+ umull \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b
+ umull \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b
+ /* bubbles */
+ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
+ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
+ /* bubbles */
+ raddhn \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
+ raddhn \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
+ mov \()\src01\().d[0], \()\src0\().d[0]
+ mov \()\src01\().d[1], \()\src1\().d[0]
+.endm
+
+.macro bilinear_apply_mask_to_src \
+ mask_fmt, numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+
+ bilinear_apply_mask_to_src_\()\mask_fmt \
+ \numpix, \src0, \src1, \src01, \mask, \
+ \tmp01, \tmp23, \tmp45, \tmp67
+.endm
+
+
+/*
+ * Macros for combining src and destination pixels.
+ * Whether to interleave depends on the operator 'op'.
+ */
+.macro bilinear_combine_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+.endm
+
+.macro bilinear_combine_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+ dup \()\tmp8\().2s, \()\src1\().s[1]
+ /* bubbles */
+ mvn \()\tmp8\().8b, \()\tmp8\().8b
+ /* bubbles */
+ umull \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b
+ /* bubbles */
+ umull \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b
+ /* bubbles */
+ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
+ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
+ /* bubbles */
+ raddhn \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
+ raddhn \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
+ mov \()\dst01\().d[0], \()\dst0\().d[0]
+ mov \()\dst01\().d[1], \()\dst1\().d[0]
+ /* bubbles */
+ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
+ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
+ mov \()\src01\().d[0], \()\src0\().d[0]
+ mov \()\src01\().d[1], \()\src1\().d[0]
+.endm
+
+.macro bilinear_combine_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
+ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
+ mov \()\src01\().d[0], \()\src0\().d[0]
+ mov \()\src01\().d[1], \()\src1\().d[0]
+.endm
+
+.macro bilinear_combine \
+ op, numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+ bilinear_combine_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
+ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
+.endm
+
+/*
+ * Macros for final deinterleaving of destination pixels if needed.
+ */
+.macro bilinear_deinterleave numpix, dst0, dst1, dst01
+ vuzp \()\dst0\().8b, \()\dst1\().8b
+ /* bubbles */
+ vuzp \()\dst0\().8b, \()\dst1\().8b
+ mov \()\dst01\().d[0], \()\dst0\().d[0]
+ mov \()\dst01\().d[1], \()\dst1\().d[0]
+.endm
+
+.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
+ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
+.endm
+
+
+.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
+ bilinear_load_\()\src_fmt v0, v1, v2
+ bilinear_load_mask \mask_fmt, 1, v4
+ bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9
+ umull v2.8h, v0.8b, v28.8b
+ umlal v2.8h, v1.8b, v29.8b
+ /* 5 cycles bubble */
+ ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v2.4h, v15.h[0]
+ umlal2 v0.4s, v2.8h, v15.h[0]
+ /* 5 cycles bubble */
+ bilinear_duplicate_mask \mask_fmt, 1, v4
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ /* 3 cycles bubble */
+ xtn v0.8b, v0.8h
+ /* 1 cycle bubble */
+ bilinear_interleave_src_dst \
+ \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9
+ bilinear_apply_mask_to_src \
+ \mask_fmt, 1, v0, v1, v0, v4, \
+ v3, v8, v10, v11
+ bilinear_combine \
+ \op, 1, v0, v1, v0, v18, v19, v9, \
+ v3, v8, v10, v11, v5
+ bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0
+ bilinear_store_\()\dst_fmt 1, v17, v18
+.endm
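+
+/*
+ * Illustration, not part of the patch: the arithmetic above is the usual
+ * two-stage bilinear blend. With vertical weights wt/wb in v28/v29 and the
+ * horizontal weight wx taken from v15, the umull/umlal pair forms
+ *     t = top*wt + bottom*wb
+ * per channel, and the ushll/umlsl/umlal2 stage forms
+ *     result = (t_left*(N - wx) + t_right*wx) >> (2*BILINEAR_INTERPOLATION_BITS)
+ * where N = 1 << BILINEAR_INTERPOLATION_BITS, before the narrowing shrn/xtn.
+ */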
+
+.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
+ v1, v11, v18, v19, v20, v21, v22, v23
+ bilinear_load_mask \mask_fmt, 2, v4
+ bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ bilinear_duplicate_mask \mask_fmt, 2, v4
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ xtn v0.8b, v0.8h
+ bilinear_interleave_src_dst \
+ \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9
+ bilinear_apply_mask_to_src \
+ \mask_fmt, 2, v0, v1, v0, v4, \
+ v3, v8, v10, v11
+ bilinear_combine \
+ \op, 2, v0, v1, v0, v18, v19, v9, \
+ v3, v8, v10, v11, v5
+ bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0
+ bilinear_store_\()\dst_fmt 2, v16, v17
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
+ v1, v11, v4, v5, v6, v7, v22, v23, \
+ v3, v9, v16, v17, v20, v21, v18, v19
+ prfm PREFETCH_MODE, [TMP1, PF_OFFS]
+ sub TMP1, TMP1, STRIDE
+ prfm PREFETCH_MODE, [TMP1, PF_OFFS]
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v3.4h, v15.h[0]
+ umlal2 v2.4s, v3.8h, v15.h[0]
+ ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v8.4s, v9.4h, v15.h[4]
+ umlal2 v8.4s, v9.8h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ bilinear_load_mask \mask_fmt, 4, v4
+ bilinear_duplicate_mask \mask_fmt, 4, v4
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ xtn v0.8b, v0.8h
+ xtn v1.8b, v2.8h
+ add v12.8h, v12.8h, v13.8h
+ bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21
+ bilinear_interleave_src_dst \
+ \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11
+ bilinear_apply_mask_to_src \
+ \mask_fmt, 4, v0, v1, v0, v4, \
+ v6, v8, v9, v10
+ bilinear_combine \
+ \op, 4, v0, v1, v0, v2, v3, v1, \
+ v6, v8, v9, v10, v23
+ bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0
+ bilinear_store_\()\dst_fmt 4, v6, v7
+.endm
+
+.set BILINEAR_FLAG_USE_MASK, 1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+ * The bilinear scanline generator macro takes the following arguments:
+ *  fname - name of the function to generate
+ *  src_fmt - source color format (8888 or 0565)
+ *  dst_fmt - destination color format (8888 or 0565)
+ *  src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
+ *  process_last_pixel - code block that interpolates one pixel and does not
+ *                       update the horizontal weight
+ *  process_two_pixels - code block that interpolates two pixels and updates
+ *                       the horizontal weight
+ *  process_four_pixels - code block that interpolates four pixels and updates
+ *                        the horizontal weight
+ * process_pixblock_head - head part of middle loop
+ * process_pixblock_tail - tail part of middle loop
+ * process_pixblock_tail_head - tail_head of middle loop
+ * pixblock_size - number of pixels processed in a single middle loop
+ *  prefetch_distance - prefetch ahead in the source image by this many pixels
+ */
+
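+/*
+ * As a rough C sketch (illustrative only, not part of the generated
+ * code), each destination pixel is computed per color channel as:
+ *
+ *   t   = tl * wt + bl * wb;    // vertical pass, wt + wb == 1 << B
+ *   r   = tr * wt + br * wb;    // B = BILINEAR_INTERPOLATION_BITS
+ *   out = (t * ((1 << B) - distx) + r * distx) >> (2 * B);
+ *
+ * where tl/tr/bl/br are the four source pixels around the sampling
+ * point and distx is the top B bits of the 16-bit fractional part of X
+ * (extracted by the 'ushr v15.8h, v12.8h, #(16 - B)' steps below).
+ */
+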
+.macro generate_bilinear_scanline_func \
+ fname, \
+ src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+ bilinear_process_last_pixel, \
+ bilinear_process_two_pixels, \
+ bilinear_process_four_pixels, \
+ bilinear_process_pixblock_head, \
+ bilinear_process_pixblock_tail, \
+ bilinear_process_pixblock_tail_head, \
+ pixblock_size, \
+ prefetch_distance, \
+ flags
+
+pixman_asm_function \fname
+.if \pixblock_size == 8
+.elseif \pixblock_size == 4
+.else
+ .error unsupported pixblock size
+.endif
+
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
+ OUT .req x0
+ TOP .req x1
+ BOTTOM .req x2
+ WT .req x3
+ WWT .req w3
+ WB .req x4
+ WWB .req w4
+ X .req w5
+ UX .req w6
+ WIDTH .req x7
+ TMP1 .req x10
+ WTMP1 .req w10
+ TMP2 .req x11
+ WTMP2 .req w11
+ PF_OFFS .req x12
+ TMP3 .req x13
+ WTMP3 .req w13
+ TMP4 .req x14
+ WTMP4 .req w14
+ STRIDE .req x15
+ DUMMY .req x30
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 112
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ stp x10, x11, [x29, -80]
+ stp x12, x13, [x29, -96]
+ stp x14, x15, [x29, -112]
+.else
+ OUT .req x0
+ MASK .req x1
+ TOP .req x2
+ BOTTOM .req x3
+ WT .req x4
+ WWT .req w4
+ WB .req x5
+ WWB .req w5
+ X .req w6
+ UX .req w7
+ WIDTH .req x8
+ TMP1 .req x10
+ WTMP1 .req w10
+ TMP2 .req x11
+ WTMP2 .req w11
+ PF_OFFS .req x12
+ TMP3 .req x13
+ WTMP3 .req w13
+ TMP4 .req x14
+ WTMP4 .req w14
+ STRIDE .req x15
+ DUMMY .req x30
+
+ .set prefetch_offset, \prefetch_distance
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ stp x10, x11, [x29, -80]
+ stp x12, x13, [x29, -96]
+ stp x14, x15, [x29, -112]
+ str x8, [x29, -120]
+ ldr w8, [x29, 16]
+ sub sp, sp, 120
+.endif
+
+ mov WTMP1, #\prefetch_distance
+ umull PF_OFFS, WTMP1, UX
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 300f
+
+ dup v12.8h, X
+ dup v13.8h, UX
+ dup v28.8b, WWT
+ dup v29.8b, WWB
+ mov v25.d[0], v12.d[1]
+ mov v26.d[0], v13.d[0]
+ add v25.4h, v25.4h, v26.4h
+ mov v12.d[1], v25.d[0]
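+ /*
+  * v12 now holds the 16-bit fraction of X in lanes 0-3 and of X + UX in
+  * lanes 4-7 (the horizontal positions of two adjacent output pixels);
+  * v13 holds UX and is doubled once the loop starts stepping two
+  * pixels at a time.
+  */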
+
+ /*
+  * Ensure good destination alignment: peel off 1, 2 and (for 8-pixel
+  * blocks) 4 leading pixels until OUT is aligned to a whole pixel
+  * block boundary; each TST below checks the corresponding address bit.
+  */
+ cmp WIDTH, #1
+ blt 100f
+ tst OUT, #(1 << \dst_bpp_shift)
+ beq 100f
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ \bilinear_process_last_pixel
+ sub WIDTH, WIDTH, #1
+100:
+ add v13.8h, v13.8h, v13.8h
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+
+ cmp WIDTH, #2
+ blt 100f
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
+ beq 100f
+ \bilinear_process_two_pixels
+ sub WIDTH, WIDTH, #2
+100:
+.if \pixblock_size == 8
+ cmp WIDTH, #4
+ blt 100f
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
+ beq 100f
+ \bilinear_process_four_pixels
+ sub WIDTH, WIDTH, #4
+100:
+.endif
+ subs WIDTH, WIDTH, #\pixblock_size
+ blt 100f
+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
+ \bilinear_process_pixblock_head
+ subs WIDTH, WIDTH, #\pixblock_size
+ blt 500f
+0:
+ \bilinear_process_pixblock_tail_head
+ subs WIDTH, WIDTH, #\pixblock_size
+ bge 0b
+500:
+ \bilinear_process_pixblock_tail
+100:
+.if \pixblock_size == 8
+ tst WIDTH, #4
+ beq 200f
+ \bilinear_process_four_pixels
+200:
+.endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 200f
+ \bilinear_process_two_pixels
+200:
+ tst WIDTH, #1
+ beq 300f
+ \bilinear_process_last_pixel
+300:
+
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x10, x11, [x29, -80]
+ ldp x12, x13, [x29, -96]
+ ldp x14, x15, [x29, -112]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+.else
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x10, x11, [x29, -80]
+ ldp x12, x13, [x29, -96]
+ ldp x14, x15, [x29, -112]
+ ldr x8, [x29, -120]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+.endif
+ ret
+
+ .unreq OUT
+ .unreq TOP
+ .unreq WT
+ .unreq WWT
+ .unreq WB
+ .unreq WWB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq WTMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
+ .unreq MASK
+.endif
+
+pixman_end_asm_function
+
+.endm
+
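+/*
+ * Judging from the register aliases above, each generated function
+ * implements (approximately) the following C prototype; the names here
+ * are purely illustrative:
+ *
+ *   void fname (dst_t *out, [mask_t *mask,] src_t *top, src_t *bottom,
+ *               int wt, int wb, int32_t x, int32_t ux, int width);
+ *
+ * 'top' and 'bottom' point at the two source scanlines being
+ * interpolated, 'wt'/'wb' are the vertical weights and 'x'/'ux' are the
+ * 16.16 fixed point horizontal position and per-pixel increment. In the
+ * masked variant the ninth argument ('width') arrives on the stack,
+ * which is why the prologue performs 'ldr w8, [x29, 16]'.
+ */
+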
+/* src_8888_8_8888 */
+.macro bilinear_src_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_head
+ bilinear_src_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
+ bilinear_src_8888_8_8888_process_pixblock_tail
+ bilinear_src_8888_8_8888_process_pixblock_head
+.endm
+
+/* src_8888_8_0565 */
+.macro bilinear_src_8888_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_head
+ bilinear_src_8888_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
+ bilinear_src_8888_8_0565_process_pixblock_tail
+ bilinear_src_8888_8_0565_process_pixblock_head
+.endm
+
+/* src_0565_8_x888 */
+.macro bilinear_src_0565_8_x888_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_head
+ bilinear_src_0565_8_x888_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
+ bilinear_src_0565_8_x888_process_pixblock_tail
+ bilinear_src_0565_8_x888_process_pixblock_head
+.endm
+
+/* src_0565_8_0565 */
+.macro bilinear_src_0565_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_head
+ bilinear_src_0565_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
+ bilinear_src_0565_8_0565_process_pixblock_tail
+ bilinear_src_0565_8_0565_process_pixblock_head
+.endm
+
+/* over_8888_8888 */
+.macro bilinear_over_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_head
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+
+ ld1 {v22.2s}, [TMP1], STRIDE
+ ld1 {v23.2s}, [TMP1]
+ asr WTMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ umull v8.8h, v22.8b, v28.8b
+ umlal v8.8h, v23.8b, v29.8b
+
+ ld1 {v22.2s}, [TMP2], STRIDE
+ ld1 {v23.2s}, [TMP2]
+ asr WTMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umull v9.8h, v22.8b, v28.8b
+ umlal v9.8h, v23.8b, v29.8b
+
+ ld1 {v22.2s}, [TMP3], STRIDE
+ ld1 {v23.2s}, [TMP3]
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v8.4h, v15.h[0]
+ umlal2 v0.4s, v8.8h, v15.h[0]
+
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v1.4s, v9.4h, v15.h[4]
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v10.4h, v15.h[0]
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ xtn v6.8b, v0.8h
+ xtn v7.8b, v2.8h
+ ld1 {v2.2s, v3.2s}, [OUT]
+ prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ dup v4.2s, v7.s[1]
+ mvn v4.8b, v4.8b
+ umull v11.8h, v2.8b, v4.8b
+ umull v2.8h, v3.8b, v4.8b
+ urshr v1.8h, v11.8h, #8
+ urshr v10.8h, v2.8h, #8
+ raddhn v3.8b, v10.8h, v2.8h
+ raddhn v2.8b, v1.8h, v11.8h
+ uqadd v6.8b, v2.8b, v6.8b
+ uqadd v7.8b, v3.8b, v7.8b
+ vuzp v6.8b, v7.8b
+ vuzp v6.8b, v7.8b
+ add v12.8h, v12.8h, v13.8h
+ st1 {v6.2s, v7.2s}, [OUT], #16
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail_head
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ umlsl v2.4s, v10.4h, v15.h[0]
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ ld1 {v20.2s}, [TMP1], STRIDE
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ ld1 {v21.2s}, [TMP1]
+ umull v8.8h, v20.8b, v28.8b
+ umlal v8.8h, v21.8b, v29.8b
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ld1 {v22.2s}, [TMP2], STRIDE
+ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ xtn v6.8b, v0.8h
+ ld1 {v23.2s}, [TMP2]
+ umull v9.8h, v22.8b, v28.8b
+ asr WTMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ asr WTMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umlal v9.8h, v23.8b, v29.8b
+ xtn v7.8b, v2.8h
+ ld1 {v2.2s, v3.2s}, [OUT]
+ prfm PREFETCH_MODE, [OUT, PF_OFFS]
+ ld1 {v22.2s}, [TMP3], STRIDE
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ dup v4.2s, v7.s[1]
+ ld1 {v23.2s}, [TMP3]
+ mvn v4.8b, v4.8b
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+ umull v11.8h, v2.8b, v4.8b
+ umull v2.8h, v3.8b, v4.8b
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v8.4h, v15.h[0]
+ urshr v1.8h, v11.8h, #8
+ umlal2 v0.4s, v8.8h, v15.h[0]
+ urshr v8.8h, v2.8h, #8
+ raddhn v3.8b, v8.8h, v2.8h
+ raddhn v2.8b, v1.8h, v11.8h
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ uqadd v6.8b, v2.8b, v6.8b
+ uqadd v7.8b, v3.8b, v7.8b
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+ vuzp v6.8b, v7.8b
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ vuzp v6.8b, v7.8b
+ umlsl v1.4s, v9.4h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ st1 {v6.2s, v7.2s}, [OUT], #16
+.endm
+
+/* over_8888_8_8888 */
+.macro bilinear_over_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_four_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_head
+ bilinear_over_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
+ bilinear_over_8888_8_8888_process_pixblock_tail
+ bilinear_over_8888_8_8888_process_pixblock_head
+.endm
+
+/* add_8888_8888 */
+.macro bilinear_add_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_four_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_head
+ bilinear_add_8888_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail_head
+ bilinear_add_8888_8888_process_pixblock_tail
+ bilinear_add_8888_8888_process_pixblock_head
+.endm
+
+/* add_8888_8_8888 */
+.macro bilinear_add_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_head
+ bilinear_add_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
+ bilinear_add_8888_8_8888_process_pixblock_tail
+ bilinear_add_8888_8_8888_process_pixblock_head
+.endm
+
+
+/* Bilinear scanline functions */
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_src_8888_8_8888_process_last_pixel, \
+ bilinear_src_8888_8_8888_process_two_pixels, \
+ bilinear_src_8888_8_8888_process_four_pixels, \
+ bilinear_src_8888_8_8888_process_pixblock_head, \
+ bilinear_src_8888_8_8888_process_pixblock_tail, \
+ bilinear_src_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
+ 8888, 0565, 2, 1, \
+ bilinear_src_8888_8_0565_process_last_pixel, \
+ bilinear_src_8888_8_0565_process_two_pixels, \
+ bilinear_src_8888_8_0565_process_four_pixels, \
+ bilinear_src_8888_8_0565_process_pixblock_head, \
+ bilinear_src_8888_8_0565_process_pixblock_tail, \
+ bilinear_src_8888_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
+ 0565, 8888, 1, 2, \
+ bilinear_src_0565_8_x888_process_last_pixel, \
+ bilinear_src_0565_8_x888_process_two_pixels, \
+ bilinear_src_0565_8_x888_process_four_pixels, \
+ bilinear_src_0565_8_x888_process_pixblock_head, \
+ bilinear_src_0565_8_x888_process_pixblock_tail, \
+ bilinear_src_0565_8_x888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
+ 0565, 0565, 1, 1, \
+ bilinear_src_0565_8_0565_process_last_pixel, \
+ bilinear_src_0565_8_0565_process_two_pixels, \
+ bilinear_src_0565_8_0565_process_four_pixels, \
+ bilinear_src_0565_8_0565_process_pixblock_head, \
+ bilinear_src_0565_8_0565_process_pixblock_tail, \
+ bilinear_src_0565_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8888_process_last_pixel, \
+ bilinear_over_8888_8888_process_two_pixels, \
+ bilinear_over_8888_8888_process_four_pixels, \
+ bilinear_over_8888_8888_process_pixblock_head, \
+ bilinear_over_8888_8888_process_pixblock_tail, \
+ bilinear_over_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8_8888_process_last_pixel, \
+ bilinear_over_8888_8_8888_process_two_pixels, \
+ bilinear_over_8888_8_8888_process_four_pixels, \
+ bilinear_over_8888_8_8888_process_pixblock_head, \
+ bilinear_over_8888_8_8888_process_pixblock_tail, \
+ bilinear_over_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8888_process_last_pixel, \
+ bilinear_add_8888_8888_process_two_pixels, \
+ bilinear_add_8888_8888_process_four_pixels, \
+ bilinear_add_8888_8888_process_pixblock_head, \
+ bilinear_add_8888_8888_process_pixblock_tail, \
+ bilinear_add_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8_8888_process_last_pixel, \
+ bilinear_add_8888_8_8888_process_two_pixels, \
+ bilinear_add_8888_8_8888_process_four_pixels, \
+ bilinear_add_8888_8_8888_process_pixblock_head, \
+ bilinear_add_8888_8_8888_process_pixblock_tail, \
+ bilinear_add_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
new file mode 100644
index 0000000..107c133
--- /dev/null
+++ b/pixman/pixman-arma64-neon-asm.S
@@ -0,0 +1,3704 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for the following functions:
+ * - pixman_composite_over_8888_0565_asm_neon
+ * - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.arch armv8-a
+
+.altmacro
+.p2align 2
+
+#include "pixman-private.h"
+#include "pixman-arm-asm.h"
+#include "pixman-arma64-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance when handling leading/trailing pixels for each scanline.
+ * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0,
+ * for example on Linux, if unaligned memory accesses are not configured
+ * to generate exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to work around some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch introduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of functions can't support advanced prefetch and fall
+ * back to the simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
+ * buffer and performs the OVER compositing operation. The function
+ * fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * The template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * The recommended general NEON register allocation is the following:
+ * v0, v1, v2, v3 - contain loaded source pixel data
+ * v4, v5, v6, v7 - contain loaded destination pixels (if they are needed)
+ * v24, v25, v26, v27 - contain loaded mask pixel data (if mask is used)
+ * v28, v29, v30, v31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following register allocation:
+ * v0, v1, v2, v3 - contain loaded source pixel data
+ * v4, v5 - contain loaded destination pixels (they are needed)
+ * v28, v29 - place for storing the result (destination pixels)
+ */
+
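+/*
+ * A small C sketch (illustrative) of the division by 255 with rounding
+ * that the umull + urshr + raddhn instruction sequences in this file
+ * implement per color channel:
+ *
+ *   uint16_t p = a * b;                             // umull (a, b 8-bit)
+ *   uint8_t  q = (p + ((p + 128) >> 8) + 128) >> 8; // urshr + raddhn
+ *
+ * This is the standard pixman trick for computing (a * b) / 255,
+ * rounded to nearest, without an actual division.
+ */
+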
+/*
+ * Step one. We need to have some code to do arithmetic on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5},
+ * perform all the needed calculations and write the result to {v28, v29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. The common template macro can
+ * optionally make our life a bit easier by deinterleaving the R, G, B, A
+ * color components for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we
+ * actually use v0 register for blue channel (a vector of eight 8-bit
+ * values), v1 register for green, v2 for red and v3 for alpha. This
+ * simple conversion can also be done with a few NEON instructions:
+ *
+ * Packed to planar conversion: // vuzp8 is a wrapper macro
+ * vuzp8 v0, v1
+ * vuzp8 v2, v3
+ * vuzp8 v1, v3
+ * vuzp8 v0, v2
+ *
+ * Planar to packed conversion: // vzip8 is a wrapper macro
+ * vzip8 v0, v2
+ * vzip8 v1, v3
+ * vzip8 v2, v3
+ * vzip8 v0, v1
+ *
+ * But pixels can also be loaded directly in planar format using the LD4
+ * NEON instruction. It is 1 cycle slower than LD1, so this is not always
+ * desirable; that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
+ and put data into v6 - red, v7 - green, v30 - blue */
+ mov v4.d[1], v5.d[0]
+ shrn v6.8b, v4.8h, #8
+ shrn v7.8b, v4.8h, #3
+ sli v4.8h, v4.8h, #5
+ sri v6.8b, v6.8b, #5
+ mvn v3.8b, v3.8b /* invert source alpha */
+ sri v7.8b, v7.8b, #6
+ shrn v30.8b, v4.8h, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into v20 - red, v23 - green, v22 - blue */
+ umull v10.8h, v3.8b, v6.8b
+ umull v11.8h, v3.8b, v7.8b
+ umull v12.8h, v3.8b, v30.8b
+ urshr v17.8h, v10.8h, #8
+ urshr v18.8h, v11.8h, #8
+ urshr v19.8h, v12.8h, #8
+ raddhn v20.8b, v10.8h, v17.8h
+ raddhn v23.8b, v11.8h, v18.8h
+ raddhn v22.8b, v12.8h, v19.8h
+.endm
+
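+/*
+ * A note on the 0565 unpack above (bit-level sketch): 'shrn #8' leaves
+ * the 5 red bits at the top of each byte and 'shrn #3' the 6 green
+ * bits; the 'sri' instructions then replicate the top bits into the low
+ * ones, so a 5-bit red value r becomes the 8-bit value (r << 3) | (r >> 2).
+ * For blue, 'sli #5' moves the 5 blue bits up first, so that 'shrn #2'
+ * directly produces the replicated 8-bit value.
+ */
+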
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ uqadd v17.8b, v2.8b, v20.8b
+ uqadd v18.8b, v0.8b, v22.8b
+ uqadd v19.8b, v1.8b, v23.8b
+ /* convert the result to r5g6b5 and store it into {v14} */
+ ushll v14.8h, v17.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v8.8h, v19.8b, #7
+ sli v8.8h, v8.8h, #1
+ ushll v9.8h, v18.8b, #7
+ sli v9.8h, v9.8h, #1
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
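+/*
+ * A note on the 16bpp pack above (bit-level sketch): 'ushll #7'
+ * followed by 'sli #1' shifts each 8-bit channel left by 8 in two steps
+ * (USHLL cannot encode a shift of 8 for 8-bit elements), and the two
+ * 'sri' instructions then insert green at bit 5 and blue at bit 0:
+ *
+ *   out = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
+ */
+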
+/*
+ * OK, now we have almost everything that we need. Using the above two
+ * macros, the work can be done. But now we want to optimize
+ * it a bit. The ARM Cortex-A8 is an in-order core, and benefits
+ * greatly from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ * head
+ * while (...) {
+ * tail
+ * head
+ * }
+ * tail
+ *
+ * It may look a bit weird, but this setup makes it possible to hide
+ * instruction latencies better and also to utilize the dual-issue
+ * capability more efficiently (pairing load/store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ * pixman_composite_over_8888_0565_process_pixblock_tail
+ * st1 {v28.4h, v29.4h}, [DST_W], #32
+ * ld1 {v4.4h, v5.4h}, [DST_R], #16
+ * ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32
+ * pixman_composite_over_8888_0565_process_pixblock_head
+ * cache_preload 8, 8
+ *
+ * Now it also has some LD/ST instructions. We simply can't move from
+ * processing one block of pixels to the next with just arithmetic.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in bulk. Additionally, the destination buffer is already
+ * 16-byte aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * the destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is the 'cache_preload' macro. It is used for
+ * prefetching data into the CPU L2 cache to improve performance when
+ * dealing with images which are far larger than the cache size. It
+ * takes one argument (actually two, but they need to be the same here) -
+ * the number of pixels in a block. Looking into 'pixman-arm-neon-asm.h'
+ * can provide some details about this macro. Moreover, if good
+ * performance is needed, the code from this macro needs to be copied
+ * into the '*_tail_head' macro and mixed with the rest of the code for
+ * optimal instruction scheduling. We are actually doing that below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few LD/ST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ uqadd v17.8b, v2.8b, v20.8b
+ ld1 {v4.4h, v5.4h}, [DST_R], #16
+ mov v4.d[1], v5.d[0]
+ uqadd v18.8b, v0.8b, v22.8b
+ uqadd v19.8b, v1.8b, v23.8b
+ shrn v6.8b, v4.8h, #8
+ fetch_src_pixblock
+ shrn v7.8b, v4.8h, #3
+ sli v4.8h, v4.8h, #5
+ ushll v14.8h, v17.8b, #7
+ sli v14.8h, v14.8h, #1
+ PF add, PF_X, PF_X, #8
+ ushll v8.8h, v19.8b, #7
+ sli v8.8h, v8.8h, #1
+ PF tst, PF_CTL, #0xF
+ sri v6.8b, v6.8b, #5
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+10:
+ mvn v3.8b, v3.8b
+ PF beq, 10f
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ sri v7.8b, v7.8b, #6
+ shrn v30.8b, v4.8h, #2
+ umull v10.8h, v3.8b, v6.8b
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ umull v11.8h, v3.8b, v7.8b
+ umull v12.8h, v3.8b, v30.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ sri v14.8h, v8.8h, #5
+ PF cmp, PF_X, ORIG_W
+ ushll v9.8h, v18.8b, #7
+ sli v9.8h, v9.8h, #1
+ urshr v17.8h, v10.8h, #8
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ urshr v19.8h, v11.8h, #8
+ urshr v18.8h, v12.8h, #8
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+ raddhn v20.8b, v10.8h, v17.8h
+ raddhn v23.8b, v11.8h, v19.8h
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ raddhn v22.8b, v12.8h, v18.8h
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
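+/*
+ * The 'PF'-prefixed instructions interleaved above implement the
+ * advanced prefetch scheme from 'pixman-arm-neon-asm.h'. A rough,
+ * illustrative C model of the per-block bookkeeping:
+ *
+ *   pf_x += pixblock_size;                     // advance prefetch position
+ *   prefetch (pf_src + (pf_x << src_bpp_shift));
+ *   if (pf_x >= orig_w)                        // ran past the scanline end:
+ *       pf_x -= orig_w;                        // wrap, and touch the next
+ *                                              // scanline with a dummy load
+ *
+ * PF_CTL counts the scanlines still to be prefetched, and the dummy
+ * 'ldrsb' loads touch the next scanline early so that the page/TLB walk
+ * happens before the real accesses get there.
+ */
+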
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ pixman_composite_over_8888_0565_process_pixblock_tail
+ st1 {v14.8h}, [DST_W], #16
+ ld1 {v4.4h, v5.4h}, [DST_R], #16
+ fetch_src_pixblock
+ pixman_composite_over_8888_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using the 'generate_composite_function'
+ * macro to put all the stuff together. We specify the name of the function
+ * which we want to get, the number of bits per pixel for the source, mask
+ * and destination (0 if unused, like the mask in this case). Next come some bit
+ * flags:
+ * FLAG_DST_READWRITE - tells that the destination buffer is both read
+ * and written, for write-only buffer we would use
+ * FLAG_DST_WRITEONLY flag instead
+ * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ * and separate color channels for 32bpp format.
+ * The next things are:
+ * - the number of pixels processed per iteration (8 in this case, because
+ *   that's the maximum that can fit into four 64-bit NEON registers).
+ * - prefetch distance, measured in pixel blocks. In this case it is 5 blocks
+ *   of 8 pixels, i.e. 40 pixels, or up to 160 bytes. Optimal
+ * prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros; these are 'default_init' and
+ * 'default_cleanup' here, which are empty (but it is possible to have
+ * custom init/cleanup macros that save/restore some extra NEON registers
+ * like d8-d15, or do anything else), followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+ pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
+ and put data into v6 - red, v7 - green, v30 - blue */
+ mov v4.d[1], v5.d[0]
+ shrn v6.8b, v4.8h, #8
+ shrn v7.8b, v4.8h, #3
+ sli v4.8h, v4.8h, #5
+ sri v6.8b, v6.8b, #5
+ sri v7.8b, v7.8b, #6
+ shrn v30.8b, v4.8h, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into v20 - red, v23 - green, v22 - blue */
+ umull v10.8h, v3.8b, v6.8b
+ umull v11.8h, v3.8b, v7.8b
+ umull v12.8h, v3.8b, v30.8b
+ urshr v13.8h, v10.8h, #8
+ urshr v14.8h, v11.8h, #8
+ urshr v15.8h, v12.8h, #8
+ raddhn v20.8b, v10.8h, v13.8h
+ raddhn v23.8b, v11.8h, v14.8h
+ raddhn v22.8b, v12.8h, v15.8h
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ uqadd v17.8b, v2.8b, v20.8b
+ uqadd v18.8b, v0.8b, v22.8b
+ uqadd v19.8b, v1.8b, v23.8b
+ /* convert the result to r5g6b5 and store it into {v14} */
+ ushll v14.8h, v17.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v8.8h, v19.8b, #7
+ sli v8.8h, v8.8h, #1
+ ushll v9.8h, v18.8b, #7
+ sli v9.8h, v9.8h, #1
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+ pixman_composite_over_n_0565_process_pixblock_tail
+ ld1 {v4.4h, v5.4h}, [DST_R], #16
+ st1 {v14.8h}, [DST_W], #16
+ pixman_composite_over_n_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
+ mov v3.s[0], w4
+ dup v0.8b, v3.b[0]
+ dup v1.8b, v3.b[1]
+ dup v2.8b, v3.b[2]
+ dup v3.8b, v3.b[3]
+ mvn v3.8b, v3.8b /* invert source alpha */
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_0565_init, \
+ default_cleanup, \
+ pixman_composite_over_n_0565_process_pixblock_head, \
+ pixman_composite_over_n_0565_process_pixblock_tail, \
+ pixman_composite_over_n_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+ ushll v8.8h, v1.8b, #7
+ sli v8.8h, v8.8h, #1
+ ushll v14.8h, v2.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v9.8h, v0.8b, #7
+ sli v9.8h, v9.8h, #1
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+ sri v14.8h, v8.8h, #5
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ fetch_src_pixblock
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ ushll v8.8h, v1.8b, #7
+ sli v8.8h, v8.8h, #1
+ st1 {v14.8h}, [DST_W], #16
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ ushll v14.8h, v2.8b, #7
+ sli v14.8h, v14.8h, #1
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+ ushll v9.8h, v0.8b, #7
+ sli v9.8h, v9.8h, #1
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+ mov v0.d[1], v1.d[0]
+ shrn v30.8b, v0.8h, #8
+ shrn v29.8b, v0.8h, #3
+ sli v0.8h, v0.8h, #5
+ movi v31.8b, #255
+ sri v30.8b, v30.8b, #5
+ sri v29.8b, v29.8b, #6
+ shrn v28.8b, v0.8h, #2
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+ pixman_composite_src_0565_8888_process_pixblock_tail
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ fetch_src_pixblock
+ pixman_composite_src_0565_8888_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_process_pixblock_head
+ uqadd v28.8b, v0.8b, v4.8b
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
+
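+/*
+ * The ADD operator is simply a byte-wise saturating addition,
+ * dst = MIN (src + dst, 255), i.e. one UQADD per eight channels.
+ */
+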
+.macro pixman_composite_add_8_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add, PF_X, PF_X, #32
+ PF tst, PF_CTL, #0xF
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ PF beq, 10f
+ PF add, PF_X, PF_X, #32
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ uqadd v28.8b, v0.8b, v4.8b
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ uqadd v28.8b, v0.8b, v4.8b
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+ mvn v24.8b, v3.8b /* get inverted alpha */
+ /* do alpha blending */
+ umull v8.8h, v24.8b, v4.8b
+ umull v9.8h, v24.8b, v5.8b
+ umull v10.8h, v24.8b, v6.8b
+ umull v11.8h, v24.8b, v7.8b
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ urshr v14.8h, v8.8h, #8
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ PF cmp, PF_X, ORIG_W
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+ fetch_src_pixblock
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ mvn v22.8b, v3.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v8.8h, v22.8b, v4.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v9.8h, v22.8b, v5.8b
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+ umull v10.8h, v22.8b, v6.8b
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ umull v11.8h, v22.8b, v7.8b
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
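+/*
+ * With premultiplied alpha, OVER is OUT_REVERSE plus the source,
+ * per channel:
+ *
+ *   dst = src + dst * (255 - src_alpha) / 255
+ *
+ * hence the shared head macro and the four extra UQADD instructions in
+ * the tail.
+ */
+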
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ urshr v14.8h, v8.8h, #8
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ PF cmp, PF_X, ORIG_W
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ fetch_src_pixblock
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ mvn v22.8b, v3.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v8.8h, v22.8b, v4.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v9.8h, v22.8b, v5.8b
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+ umull v10.8h, v22.8b, v6.8b
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ umull v11.8h, v22.8b, v7.8b
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_process_pixblock_head
+ /* deinterleaved source pixels in {v0, v1, v2, v3} */
+ /* inverted alpha in {v24} */
+ /* destination pixels in {v4, v5, v6, v7} */
+ umull v8.8h, v24.8b, v4.8b
+ umull v9.8h, v24.8b, v5.8b
+ umull v10.8h, v24.8b, v6.8b
+ umull v11.8h, v24.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ uqadd v28.8b, v0.8b, v28.8b
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0x0F
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ PF cmp, PF_X, ORIG_W
+ umull v8.8h, v24.8b, v4.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ umull v9.8h, v24.8b, v5.8b
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v10.8h, v24.8b, v6.8b
+ PF subs, PF_CTL, PF_CTL, #0x10
+ umull v11.8h, v24.8b, v7.8b
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_over_n_8888_init
+ mov v3.s[0], w4
+ dup v0.8b, v3.b[0]
+ dup v1.8b, v3.b[1]
+ dup v2.8b, v3.b[2]
+ dup v3.8b, v3.b[3]
+ mvn v24.8b, v3.8b /* get inverted alpha */
+.endm
+
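+/*
+ * The solid source color arrives in w4 as a packed a8r8g8b8 value; the
+ * init macro above splits it into the per-channel vectors v0-v3 and
+ * precomputes the inverted alpha in v24.
+ */
+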
+generate_composite_function \
+ pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+ urshr v14.8h, v8.8h, #8
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ urshr v15.8h, v9.8h, #8
+ urshr v12.8h, v10.8h, #8
+ urshr v13.8h, v11.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ PF cmp, PF_X, ORIG_W
+ raddhn v30.8b, v12.8h, v10.8h
+ raddhn v31.8b, v13.8h, v11.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
+ mvn v22.8b, v3.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF blt, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v8.8h, v22.8b, v4.8b
+ PF blt, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v9.8h, v22.8b, v5.8b
+ umull v10.8h, v22.8b, v6.8b
+ PF blt, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ umull v11.8h, v22.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
+ mov v7.s[0], w4
+ dup v4.8b, v7.b[0]
+ dup v5.8b, v7.b[1]
+ dup v6.8b, v7.b[2]
+ dup v7.8b, v7.b[3]
+.endm
+
+generate_composite_function \
+ pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_reverse_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 4, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
+ umull v0.8h, v24.8b, v8.8b /* IN for SRC pixels (part1) */
+ umull v1.8h, v24.8b, v9.8b
+ umull v2.8h, v24.8b, v10.8b
+ umull v3.8h, v24.8b, v11.8b
+ mov v4.d[1], v5.d[0]
+ shrn v25.8b, v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */
+ shrn v26.8b, v4.8h, #3
+ sli v4.8h, v4.8h, #5
+ urshr v17.8h, v0.8h, #8 /* IN for SRC pixels (part2) */
+ urshr v18.8h, v1.8h, #8
+ urshr v19.8h, v2.8h, #8
+ urshr v20.8h, v3.8h, #8
+ raddhn v0.8b, v0.8h, v17.8h
+ raddhn v1.8b, v1.8h, v18.8h
+ raddhn v2.8b, v2.8h, v19.8h
+ raddhn v3.8b, v3.8h, v20.8h
+ sri v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */
+ sri v26.8b, v26.8b, #6
+ mvn v3.8b, v3.8b
+ shrn v30.8b, v4.8h, #2
+ umull v18.8h, v3.8b, v25.8b /* now do alpha blending */
+ umull v19.8h, v3.8b, v26.8b
+ umull v20.8h, v3.8b, v30.8b
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
+ /* 3 cycle bubble (after umull) */
+ urshr v5.8h, v18.8h, #8
+ urshr v6.8h, v19.8h, #8
+ urshr v7.8h, v20.8h, #8
+ raddhn v17.8b, v18.8h, v5.8h
+ raddhn v19.8b, v19.8h, v6.8h
+ raddhn v18.8b, v20.8h, v7.8h
+ uqadd v5.8b, v2.8b, v17.8b
+ /* 1 cycle bubble */
+ uqadd v6.8b, v0.8b, v18.8b
+ uqadd v7.8b, v1.8b, v19.8b
+ ushll v14.8h, v5.8b, #7 /* convert to 16bpp */
+ sli v14.8h, v14.8h, #1
+ ushll v18.8h, v7.8b, #7
+ sli v18.8h, v18.8h, #1
+ ushll v19.8h, v6.8b, #7
+ sli v19.8h, v19.8h, #1
+ sri v14.8h, v18.8h, #5
+ /* 1 cycle bubble */
+ sri v14.8h, v19.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+#if 0
+ ld1 {v4.8h}, [DST_R], #16
+ shrn v25.8b, v4.8h, #8
+ fetch_mask_pixblock
+ shrn v26.8b, v4.8h, #3
+ fetch_src_pixblock
+ umull v22.8h, v24.8b, v10.8b
+ urshr v13.8h, v18.8h, #8
+ urshr v11.8h, v19.8h, #8
+ urshr v15.8h, v20.8h, #8
+ raddhn v17.8b, v18.8h, v13.8h
+ raddhn v19.8b, v19.8h, v11.8h
+ raddhn v18.8b, v20.8h, v15.8h
+ uqadd v17.8b, v2.8b, v17.8b
+ umull v21.8h, v24.8b, v9.8b
+ uqadd v18.8b, v0.8b, v18.8b
+ uqadd v19.8b, v1.8b, v19.8b
+ ushll v14.8h, v17.8b, #7
+ sli v14.8h, v14.8h, #1
+ umull v20.8h, v24.8b, v8.8b
+ ushll v18.8h, v18.8b, #7
+ sli v18.8h, v18.8h, #1
+ ushll v19.8h, v19.8b, #7
+ sli v19.8h, v19.8h, #1
+ sri v14.8h, v18.8h, #5
+ umull v23.8h, v24.8b, v11.8b
+ sri v14.8h, v19.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+
+ cache_preload 8, 8
+
+ sli v4.8h, v4.8h, #5
+ urshr v16.8h, v20.8h, #8
+ urshr v17.8h, v21.8h, #8
+ urshr v18.8h, v22.8h, #8
+ urshr v19.8h, v23.8h, #8
+ raddhn v0.8b, v20.8h, v16.8h
+ raddhn v1.8b, v21.8h, v17.8h
+ raddhn v2.8b, v22.8h, v18.8h
+ raddhn v3.8b, v23.8h, v19.8h
+ sri v25.8b, v25.8b, #5
+ sri v26.8b, v26.8b, #6
+ mvn v3.8b, v3.8b
+ shrn v30.8b, v4.8h, #2
+ st1 {v14.8h}, [DST_W], #16
+ umull v18.8h, v3.8b, v25.8b
+ umull v19.8h, v3.8b, v26.8b
+ umull v20.8h, v3.8b, v30.8b
+#else
+ pixman_composite_over_8888_8_0565_process_pixblock_tail
+ st1 {v28.4h, v29.4h}, [DST_W], #16
+ ld1 {v4.4h, v5.4h}, [DST_R], #16
+ fetch_mask_pixblock
+ fetch_src_pixblock
+ pixman_composite_over_8888_8_0565_process_pixblock_head
+#endif
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+/*
+ * This function needs a special initialization of solid mask.
+ * Solid source pixel data is passed in the third argument (w4),
+ * split into color components and replicated in the v8-v11
+ * registers. Additionally, this function needs all the NEON
+ * registers, so it has to save the d8-d15 registers, which are
+ * callee saved according to the ABI. These registers are restored
+ * from the 'cleanup' macro. All the other NEON registers are
+ * caller saved, so they can be clobbered freely without
+ * introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+ mov v11.s[0], w4
+ dup v8.8b, v11.b[0]
+ dup v9.8b, v11.b[1]
+ dup v10.8b, v11.b[2]
+ dup v11.8b, v11.b[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_0565_init, \
+ pixman_composite_over_n_8_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+ mov v24.s[0], w6
+ dup v24.8b, v24.b[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_0565_init, \
+ pixman_composite_over_8888_n_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
+ fetch_src_pixblock
+ cache_preload 16, 16
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_0565_process_pixblock_head, \
+ pixman_composite_src_0565_0565_process_pixblock_tail, \
+ pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+ st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_src_n_8_init
+ mov v0.s[0], w4
+ dup v3.8b, v0.b[0]
+ dup v2.8b, v0.b[0]
+ dup v1.8b, v0.b[0]
+ dup v0.8b, v0.b[0]
+.endm
+
+.macro pixman_composite_src_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8_init, \
+ pixman_composite_src_n_8_cleanup, \
+ pixman_composite_src_n_8_process_pixblock_head, \
+ pixman_composite_src_n_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
+.endm
+
+.macro pixman_composite_src_n_0565_init
+ mov v0.s[0], w4
+ dup v3.4h, v0.h[0]
+ dup v2.4h, v0.h[0]
+ dup v1.4h, v0.h[0]
+ dup v0.4h, v0.h[0]
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_0565_init, \
+ pixman_composite_src_n_0565_cleanup, \
+ pixman_composite_src_n_0565_process_pixblock_head, \
+ pixman_composite_src_n_0565_process_pixblock_tail, \
+ pixman_composite_src_n_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
+.endm
+
+.macro pixman_composite_src_n_8888_init
+ mov v0.s[0], w4
+ dup v3.2s, v0.s[0]
+ dup v2.2s, v0.s[0]
+ dup v1.2s, v0.s[0]
+ dup v0.2s, v0.s[0]
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8888_init, \
+ pixman_composite_src_n_8888_cleanup, \
+ pixman_composite_src_n_8888_process_pixblock_head, \
+ pixman_composite_src_n_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
+ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_8888_process_pixblock_head, \
+ pixman_composite_src_8888_8888_process_pixblock_tail, \
+ pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head
+ orr v0.8b, v0.8b, v4.8b
+ orr v1.8b, v1.8b, v4.8b
+ orr v2.8b, v2.8b, v4.8b
+ orr v3.8b, v3.8b, v4.8b
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
+ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
+ fetch_src_pixblock
+ orr v0.8b, v0.8b, v4.8b
+ orr v1.8b, v1.8b, v4.8b
+ orr v2.8b, v2.8b, v4.8b
+ orr v3.8b, v3.8b, v4.8b
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_x888_8888_init
+ movi v4.2s, #0xff, lsl #24
+.endm
+
+generate_composite_function \
+ pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_x888_8888_init, \
+ default_cleanup, \
+ pixman_composite_src_x888_8888_process_pixblock_head, \
+ pixman_composite_src_x888_8888_process_pixblock_tail, \
+ pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+ /* expecting solid source in {v0, v1, v2, v3} */
+ /* mask is in v24 (v25, v26, v27 are unused) */
+
+ /* in: src = src * mask / 255 */
+ umull v8.8h, v24.8b, v0.8b
+ umull v9.8h, v24.8b, v1.8b
+ umull v10.8h, v24.8b, v2.8b
+ umull v11.8h, v24.8b, v3.8b
+ ursra v8.8h, v8.8h, #8
+ ursra v9.8h, v9.8h, #8
+ ursra v10.8h, v10.8h, #8
+ ursra v11.8h, v11.8h, #8
+.endm
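+
+/*
+ * umull + 'ursra #8' leaves t + ((t + 128) >> 8) in each 16-bit
+ * accumulator, so the 'rshrn #8' in the tail produces
+ * (t + ((t + 128) >> 8) + 128) >> 8, the same rounded x*a/255
+ * arithmetic as MUL_UN8 in pixman-combine32.h.
+ */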
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+ rshrn v28.8b, v8.8h, #8
+ rshrn v29.8b, v9.8h, #8
+ rshrn v30.8b, v10.8h, #8
+ rshrn v31.8b, v11.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add, PF_X, PF_X, #8
+ rshrn v28.8b, v8.8h, #8
+ PF tst, PF_CTL, #0x0F
+ rshrn v29.8b, v9.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+10:
+ rshrn v30.8b, v10.8h, #8
+ PF beq, 10f
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ rshrn v31.8b, v11.8h, #8
+ PF cmp, PF_X, ORIG_W
+ umull v8.8h, v24.8b, v0.8b
+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
+ umull v9.8h, v24.8b, v1.8b
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v10.8h, v24.8b, v2.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v11.8h, v24.8b, v3.8b
+ PF ble, 10f
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
+ PF add, PF_MASK, PF_MASK, #1
+10:
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ursra v8.8h, v8.8h, #8
+ ursra v9.8h, v9.8h, #8
+ ursra v10.8h, v10.8h, #8
+ ursra v11.8h, v11.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+ mov v3.s[0], w4
+ dup v0.8b, v3.b[0]
+ dup v1.8b, v3.b[1]
+ dup v2.8b, v3.b[2]
+ dup v3.8b, v3.b[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8888_init, \
+ pixman_composite_src_n_8_8888_cleanup, \
+ pixman_composite_src_n_8_8888_process_pixblock_head, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+ umull v0.8h, v24.8b, v16.8b
+ umull v1.8h, v25.8b, v16.8b
+ umull v2.8h, v26.8b, v16.8b
+ umull v3.8h, v27.8b, v16.8b
+ ursra v0.8h, v0.8h, #8
+ ursra v1.8h, v1.8h, #8
+ ursra v2.8h, v2.8h, #8
+ ursra v3.8h, v3.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+ rshrn v28.8b, v0.8h, #8
+ rshrn v29.8b, v1.8h, #8
+ rshrn v30.8b, v2.8h, #8
+ rshrn v31.8b, v3.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add, PF_X, PF_X, #8
+ rshrn v28.8b, v0.8h, #8
+ PF tst, PF_CTL, #0x0F
+ rshrn v29.8b, v1.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+10:
+ rshrn v30.8b, v2.8h, #8
+ PF beq, 10f
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ rshrn v31.8b, v3.8h, #8
+ PF cmp, PF_X, ORIG_W
+ umull v0.8h, v24.8b, v16.8b
+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
+ umull v1.8h, v25.8b, v16.8b
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v2.8h, v26.8b, v16.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v3.8h, v27.8b, v16.8b
+ PF ble, 10f
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
+ PF add, PF_MASK, PF_MASK, #1
+10:
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ursra v0.8h, v0.8h, #8
+ ursra v1.8h, v1.8h, #8
+ ursra v2.8h, v2.8h, #8
+ ursra v3.8h, v3.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+ mov v16.s[0], w4
+ dup v16.8b, v16.b[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8_init, \
+ pixman_composite_src_n_8_8_cleanup, \
+ pixman_composite_src_n_8_8_process_pixblock_head, \
+ pixman_composite_src_n_8_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+ /* expecting deinterleaved source data in {v8, v9, v10, v11} */
+ /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
+ /* and destination data in {v4, v5, v6, v7} */
+ /* mask is in v24 (v25, v26, v27 are unused) */
+
+ /* in: src = src * mask / 255 */
+ umull v12.8h, v24.8b, v8.8b
+ umull v13.8h, v24.8b, v9.8b
+ umull v14.8h, v24.8b, v10.8b
+ umull v15.8h, v24.8b, v11.8b
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ urshr v18.8h, v14.8h, #8
+ urshr v19.8h, v15.8h, #8
+ raddhn v0.8b, v12.8h, v16.8h
+ raddhn v1.8b, v13.8h, v17.8h
+ raddhn v2.8b, v14.8h, v18.8h
+ raddhn v3.8b, v15.8h, v19.8h
+ mvn v25.8b, v3.8b /* get inverted alpha */
+ /* source: v0 - blue, v1 - green, v2 - red, v3 - alpha */
+ /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */
+ /* now do alpha blending */
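+ /* OVER: dest = src + (1 - src_alpha) * dest; v25 holds the inverted alpha */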
+ umull v12.8h, v25.8b, v4.8b
+ umull v13.8h, v25.8b, v5.8b
+ umull v14.8h, v25.8b, v6.8b
+ umull v15.8h, v25.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ urshr v18.8h, v14.8h, #8
+ urshr v19.8h, v15.8h, #8
+ raddhn v28.8b, v16.8h, v12.8h
+ raddhn v29.8b, v17.8h, v13.8h
+ raddhn v30.8b, v18.8h, v14.8h
+ raddhn v31.8b, v19.8h, v15.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+ urshr v16.8h, v12.8h, #8
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ urshr v17.8h, v13.8h, #8
+ fetch_mask_pixblock
+ urshr v18.8h, v14.8h, #8
+ PF add, PF_X, PF_X, #8
+ urshr v19.8h, v15.8h, #8
+ PF tst, PF_CTL, #0x0F
+ raddhn v28.8b, v16.8h, v12.8h
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+10:
+ raddhn v29.8b, v17.8h, v13.8h
+ PF beq, 10f
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v30.8b, v18.8h, v14.8h
+ PF cmp, PF_X, ORIG_W
+ raddhn v31.8b, v19.8h, v15.8h
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ umull v16.8h, v24.8b, v8.8b
+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
+ umull v17.8h, v24.8b, v9.8b
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v18.8h, v24.8b, v10.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v19.8h, v24.8b, v11.8b
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ uqadd v28.8b, v0.8b, v28.8b
+ PF ble, 10f
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
+ PF add, PF_MASK, PF_MASK, #1
+10:
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ urshr v12.8h, v16.8h, #8
+ urshr v13.8h, v17.8h, #8
+ urshr v14.8h, v18.8h, #8
+ urshr v15.8h, v19.8h, #8
+ raddhn v0.8b, v16.8h, v12.8h
+ raddhn v1.8b, v17.8h, v13.8h
+ raddhn v2.8b, v18.8h, v14.8h
+ raddhn v3.8b, v19.8h, v15.8h
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ mvn v25.8b, v3.8b
+ umull v12.8h, v25.8b, v4.8b
+ umull v13.8h, v25.8b, v5.8b
+ umull v14.8h, v25.8b, v6.8b
+ umull v15.8h, v25.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8_8888_init
+ mov v11.s[0], w4
+ dup v8.8b, v11.b[0]
+ dup v9.8b, v11.b[1]
+ dup v10.8b, v11.b[2]
+ dup v11.8b, v11.b[3]
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8888_init, \
+ pixman_composite_over_n_8_8888_cleanup, \
+ pixman_composite_over_n_8_8888_process_pixblock_head, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+ umull v0.8h, v24.8b, v8.8b
+ umull v1.8h, v25.8b, v8.8b
+ umull v2.8h, v26.8b, v8.8b
+ umull v3.8h, v27.8b, v8.8b
+ urshr v10.8h, v0.8h, #8
+ urshr v11.8h, v1.8h, #8
+ urshr v12.8h, v2.8h, #8
+ urshr v13.8h, v3.8h, #8
+ raddhn v0.8b, v0.8h, v10.8h
+ raddhn v1.8b, v1.8h, v11.8h
+ raddhn v2.8b, v2.8h, v12.8h
+ raddhn v3.8b, v3.8h, v13.8h
+ mvn v24.8b, v0.8b
+ mvn v25.8b, v1.8b
+ mvn v26.8b, v2.8b
+ mvn v27.8b, v3.8b
+ umull v10.8h, v24.8b, v4.8b
+ umull v11.8h, v25.8b, v5.8b
+ umull v12.8h, v26.8b, v6.8b
+ umull v13.8h, v27.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+ urshr v14.8h, v10.8h, #8
+ urshr v15.8h, v11.8h, #8
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ raddhn v28.8b, v14.8h, v10.8h
+ raddhn v29.8b, v15.8h, v11.8h
+ raddhn v30.8b, v16.8h, v12.8h
+ raddhn v31.8b, v17.8h, v13.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_over_n_8_8_process_pixblock_tail
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+ mov v8.s[0], w4
+ dup v8.8b, v8.b[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8_init, \
+ pixman_composite_over_n_8_8_cleanup, \
+ pixman_composite_over_n_8_8_process_pixblock_head, \
+ pixman_composite_over_n_8_8_process_pixblock_tail, \
+ pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {v8, v9, v10, v11}
+ * dest in {v4, v5, v6, v7 }
+ * mask in {v24, v25, v26, v27}
+ * output: updated src in {v0, v1, v2, v3 }
+ * updated mask in {v24, v25, v26, v3 }
+ */
+ umull v0.8h, v24.8b, v8.8b
+ umull v1.8h, v25.8b, v9.8b
+ umull v2.8h, v26.8b, v10.8b
+ umull v3.8h, v27.8b, v11.8b
+ umull v12.8h, v11.8b, v25.8b
+ umull v13.8h, v11.8b, v24.8b
+ umull v14.8h, v11.8b, v26.8b
+ urshr v15.8h, v0.8h, #8
+ urshr v16.8h, v1.8h, #8
+ urshr v17.8h, v2.8h, #8
+ raddhn v0.8b, v0.8h, v15.8h
+ raddhn v1.8b, v1.8h, v16.8h
+ raddhn v2.8b, v2.8h, v17.8h
+ urshr v15.8h, v13.8h, #8
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v14.8h, #8
+ urshr v18.8h, v3.8h, #8
+ raddhn v24.8b, v13.8h, v15.8h
+ raddhn v25.8b, v12.8h, v16.8h
+ raddhn v26.8b, v14.8h, v17.8h
+ raddhn v3.8b, v3.8h, v18.8h
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in {v28, v29, v30, v31}
+ */
+ mvn v24.8b, v24.8b
+ mvn v25.8b, v25.8b
+ mvn v26.8b, v26.8b
+ mvn v27.8b, v3.8b
+ umull v12.8h, v24.8b, v4.8b
+ umull v13.8h, v25.8b, v5.8b
+ umull v14.8h, v26.8b, v6.8b
+ umull v15.8h, v27.8b, v7.8b
+.endm
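+
+/*
+ * Component alpha: each mask channel scales the matching source
+ * channel (s' = m * s), while the source alpha scales the mask
+ * (m' = sa * m); the OVER step then uses the per-channel complement
+ * of m' instead of a single inverted alpha.
+ */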
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+ /* ... continue 'combine_over_ca' replacement */
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ urshr v18.8h, v14.8h, #8
+ urshr v19.8h, v15.8h, #8
+ raddhn v28.8b, v16.8h, v12.8h
+ raddhn v29.8b, v17.8h, v13.8h
+ raddhn v30.8b, v18.8h, v14.8h
+ raddhn v31.8b, v19.8h, v15.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ urshr v18.8h, v14.8h, #8
+ urshr v19.8h, v15.8h, #8
+ raddhn v28.8b, v16.8h, v12.8h
+ raddhn v29.8b, v17.8h, v13.8h
+ raddhn v30.8b, v18.8h, v14.8h
+ raddhn v31.8b, v19.8h, v15.8h
+ fetch_mask_pixblock
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ cache_preload 8, 8
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init
+ mov v13.s[0], w4
+ dup v8.8b, v13.b[0]
+ dup v9.8b, v13.b[1]
+ dup v10.8b, v13.b[2]
+ dup v11.8b, v13.b[3]
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_8888_ca_init, \
+ pixman_composite_over_n_8888_8888_ca_cleanup, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
+ * mask in {v24, v25, v26} [B, G, R]
+ * output: updated src in {v0, v1, v2 } [B, G, R]
+ * updated mask in {v24, v25, v26} [B, G, R]
+ */
+ umull v0.8h, v24.8b, v8.8b
+ umull v1.8h, v25.8b, v9.8b
+ umull v2.8h, v26.8b, v10.8b
+ umull v12.8h, v11.8b, v24.8b
+ umull v13.8h, v11.8b, v25.8b
+ umull v14.8h, v11.8b, v26.8b
+ urshr v15.8h, v0.8h, #8
+ urshr v16.8h, v1.8h, #8
+ urshr v17.8h, v2.8h, #8
+ raddhn v0.8b, v0.8h, v15.8h
+ raddhn v1.8b, v1.8h, v16.8h
+ raddhn v2.8b, v2.8h, v17.8h
+ urshr v19.8h, v12.8h, #8
+ urshr v20.8h, v13.8h, #8
+ urshr v21.8h, v14.8h, #8
+ raddhn v24.8b, v12.8h, v19.8h
+ raddhn v25.8b, v13.8h, v20.8h
+ /*
+ * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
+ * and put data into v16 - blue, v17 - green, v18 - red
+ */
+ mov v4.d[1], v5.d[0]
+ shrn v17.8b, v4.8h, #3
+ shrn v18.8b, v4.8h, #8
+ raddhn v26.8b, v14.8h, v21.8h
+ sli v4.8h, v4.8h, #5
+ sri v18.8b, v18.8b, #5
+ sri v17.8b, v17.8b, #6
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in v16 - blue, v17 - green, v18 - red
+ */
+ mvn v24.8b, v24.8b
+ mvn v25.8b, v25.8b
+ shrn v16.8b, v4.8h, #2
+ mvn v26.8b, v26.8b
+ umull v5.8h, v16.8b, v24.8b
+ umull v6.8h, v17.8b, v25.8b
+ umull v7.8h, v18.8b, v26.8b
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
+ /* ... continue 'combine_over_ca' replacement */
+ urshr v13.8h, v5.8h, #8
+ urshr v14.8h, v6.8h, #8
+ urshr v15.8h, v7.8h, #8
+ raddhn v16.8b, v13.8h, v5.8h
+ raddhn v17.8b, v14.8h, v6.8h
+ raddhn v18.8b, v15.8h, v7.8h
+ uqadd v16.8b, v0.8b, v16.8b
+ uqadd v17.8b, v1.8b, v17.8b
+ uqadd v18.8b, v2.8b, v18.8b
+ /*
+ * convert the results in v16, v17, v18 to r5g6b5 and store
+ * them into {v14}
+ */
+ ushll v14.8h, v18.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v12.8h, v17.8b, #7
+ sli v12.8h, v12.8h, #1
+ ushll v13.8h, v16.8b, #7
+ sli v13.8h, v13.8h, #1
+ sri v14.8h, v12.8h, #5
+ sri v14.8h, v13.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+ fetch_mask_pixblock
+ urshr v13.8h, v5.8h, #8
+ urshr v14.8h, v6.8h, #8
+ ld1 {v4.8h}, [DST_R], #16
+ urshr v15.8h, v7.8h, #8
+ raddhn v16.8b, v13.8h, v5.8h
+ raddhn v17.8b, v14.8h, v6.8h
+ raddhn v18.8b, v15.8h, v7.8h
+ mov v5.d[0], v4.d[1]
+ /* process_pixblock_head */
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
+ * mask in {v24, v25, v26} [B, G, R]
+ * output: updated src in {v0, v1, v2 } [B, G, R]
+ * updated mask in {v24, v25, v26} [B, G, R]
+ */
+ uqadd v16.8b, v0.8b, v16.8b
+ uqadd v17.8b, v1.8b, v17.8b
+ uqadd v18.8b, v2.8b, v18.8b
+ umull v0.8h, v24.8b, v8.8b
+ umull v1.8h, v25.8b, v9.8b
+ umull v2.8h, v26.8b, v10.8b
+ /*
+ * convert the result in v16, v17, v18 to r5g6b5 and store
+ * it into {v14}
+ */
+ ushll v14.8h, v18.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v18.8h, v16.8b, #7
+ sli v18.8h, v18.8h, #1
+ ushll v19.8h, v17.8b, #7
+ sli v19.8h, v19.8h, #1
+ umull v12.8h, v11.8b, v24.8b
+ sri v14.8h, v19.8h, #5
+ umull v13.8h, v11.8b, v25.8b
+ umull v15.8h, v11.8b, v26.8b
+ sri v14.8h, v18.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+ cache_preload 8, 8
+ urshr v16.8h, v0.8h, #8
+ urshr v17.8h, v1.8h, #8
+ urshr v18.8h, v2.8h, #8
+ raddhn v0.8b, v0.8h, v16.8h
+ raddhn v1.8b, v1.8h, v17.8h
+ raddhn v2.8b, v2.8h, v18.8h
+ urshr v19.8h, v12.8h, #8
+ urshr v20.8h, v13.8h, #8
+ urshr v21.8h, v15.8h, #8
+ raddhn v24.8b, v12.8h, v19.8h
+ raddhn v25.8b, v13.8h, v20.8h
+ /*
+ * convert 8 r5g6b5 pixel data from {v4, v5} to planar
+ * 8-bit format and put data into v16 - blue, v17 - green,
+ * v18 - red
+ */
+ mov v4.d[1], v5.d[0]
+ shrn v17.8b, v4.8h, #3
+ shrn v18.8b, v4.8h, #8
+ raddhn v26.8b, v15.8h, v21.8h
+ sli v4.8h, v4.8h, #5
+ sri v17.8b, v17.8b, #6
+ sri v18.8b, v18.8b, #5
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in v16 - blue, v17 - green, v18 - red
+ */
+ mvn v24.8b, v24.8b
+ mvn v25.8b, v25.8b
+ shrn v16.8b, v4.8h, #2
+ mvn v26.8b, v26.8b
+ umull v5.8h, v16.8b, v24.8b
+ umull v6.8h, v17.8b, v25.8b
+ umull v7.8h, v18.8b, v26.8b
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init
+ mov v13.s[0], w4
+ dup v8.8b, v13.b[0]
+ dup v9.8b, v13.b[1]
+ dup v10.8b, v13.b[2]
+ dup v11.8b, v13.b[3]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_0565_ca_init, \
+ pixman_composite_over_n_8888_0565_ca_cleanup, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_in_n_8_process_pixblock_head
+ /* expecting source data in {v0, v1, v2, v3} */
+ /* and destination data in {v4, v5, v6, v7} */
+ umull v8.8h, v4.8b, v3.8b
+ umull v9.8h, v5.8b, v3.8b
+ umull v10.8h, v6.8b, v3.8b
+ umull v11.8h, v7.8b, v3.8b
+.endm
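+
+/*
+ * IN: every destination byte is scaled by the solid source alpha
+ * (v3); the urshr/raddhn pair in the tail rounds the 16-bit products
+ * back to 8 bits.
+ */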
+
+.macro pixman_composite_in_n_8_process_pixblock_tail
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v12.8h, v10.8h, #8
+ urshr v13.8h, v11.8h, #8
+ raddhn v28.8b, v8.8h, v14.8h
+ raddhn v29.8b, v9.8h, v15.8h
+ raddhn v30.8b, v10.8h, v12.8h
+ raddhn v31.8b, v11.8h, v13.8h
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+ pixman_composite_in_n_8_process_pixblock_tail
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ cache_preload 32, 32
+ pixman_composite_in_n_8_process_pixblock_head
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_in_n_8_init
+ mov v3.s[0], w4
+ dup v3.8b, v3.b[3]
+.endm
+
+.macro pixman_composite_in_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_in_n_8_init, \
+ pixman_composite_in_n_8_cleanup, \
+ pixman_composite_in_n_8_process_pixblock_head, \
+ pixman_composite_in_n_8_process_pixblock_tail, \
+ pixman_composite_in_n_8_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+ /* expecting source data in {v8, v9, v10, v11} */
+ /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
+ /* and destination data in {v4, v5, v6, v7} */
+ /* mask is in v24, v25, v26, v27 */
+ umull v0.8h, v24.8b, v11.8b
+ umull v1.8h, v25.8b, v11.8b
+ umull v2.8h, v26.8b, v11.8b
+ umull v3.8h, v27.8b, v11.8b
+ urshr v12.8h, v0.8h, #8
+ urshr v13.8h, v1.8h, #8
+ urshr v14.8h, v2.8h, #8
+ urshr v15.8h, v3.8h, #8
+ raddhn v0.8b, v0.8h, v12.8h
+ raddhn v1.8b, v1.8h, v13.8h
+ raddhn v2.8b, v2.8h, v14.8h
+ raddhn v3.8b, v3.8h, v15.8h
+ uqadd v28.8b, v0.8b, v4.8b
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
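+
+/*
+ * ADD: dest = clamp(dest + src_alpha * mask / 255); the uqadd
+ * instructions provide the saturating clamp at 255.
+ */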
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+ pixman_composite_add_n_8_8_process_pixblock_tail
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init
+ mov v11.s[0], w4
+ dup v11.8b, v11.b[3]
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8_init, \
+ pixman_composite_add_n_8_8_cleanup, \
+ pixman_composite_add_n_8_8_process_pixblock_head, \
+ pixman_composite_add_n_8_8_process_pixblock_tail, \
+ pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+ /* expecting source data in {v0, v1, v2, v3} */
+ /* destination data in {v4, v5, v6, v7} */
+ /* mask in {v24, v25, v26, v27} */
+ umull v8.8h, v24.8b, v0.8b
+ umull v9.8h, v25.8b, v1.8b
+ umull v10.8h, v26.8b, v2.8b
+ umull v11.8h, v27.8b, v3.8b
+ urshr v0.8h, v8.8h, #8
+ urshr v1.8h, v9.8h, #8
+ urshr v12.8h, v10.8h, #8
+ urshr v13.8h, v11.8h, #8
+ raddhn v0.8b, v0.8h, v8.8h
+ raddhn v1.8b, v1.8h, v9.8h
+ raddhn v2.8b, v12.8h, v10.8h
+ raddhn v3.8b, v13.8h, v11.8h
+ uqadd v28.8b, v0.8b, v4.8b
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+ pixman_composite_add_8_8_8_process_pixblock_tail
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ fetch_mask_pixblock
+ fetch_src_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8_8_8_init, \
+ pixman_composite_add_8_8_8_cleanup, \
+ pixman_composite_add_8_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+ /* expecting source data in {v0, v1, v2, v3} */
+ /* destination data in {v4, v5, v6, v7} */
+ /* mask in {v24, v25, v26, v27} */
+ umull v8.8h, v27.8b, v0.8b
+ umull v9.8h, v27.8b, v1.8b
+ umull v10.8h, v27.8b, v2.8b
+ umull v11.8h, v27.8b, v3.8b
+ /* 1 cycle bubble */
+ ursra v8.8h, v8.8h, #8
+ ursra v9.8h, v9.8h, #8
+ ursra v10.8h, v10.8h, #8
+ ursra v11.8h, v11.8h, #8
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+ /* 2 cycle bubble */
+ rshrn v28.8b, v8.8h, #8
+ rshrn v29.8b, v9.8h, #8
+ rshrn v30.8b, v10.8h, #8
+ rshrn v31.8b, v11.8h, #8
+ uqadd v28.8b, v4.8b, v28.8b
+ uqadd v29.8b, v5.8b, v29.8b
+ uqadd v30.8b, v6.8b, v30.8b
+ uqadd v31.8b, v7.8b, v31.8b
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ rshrn v28.8b, v8.8h, #8
+ fetch_mask_pixblock
+ rshrn v29.8b, v9.8h, #8
+ umull v8.8h, v27.8b, v0.8b
+ rshrn v30.8b, v10.8h, #8
+ umull v9.8h, v27.8b, v1.8b
+ rshrn v31.8b, v11.8h, #8
+ umull v10.8h, v27.8b, v2.8b
+ umull v11.8h, v27.8b, v3.8b
+ uqadd v28.8b, v4.8b, v28.8b
+ uqadd v29.8b, v5.8b, v29.8b
+ uqadd v30.8b, v6.8b, v30.8b
+ uqadd v31.8b, v7.8b, v31.8b
+ ursra v8.8h, v8.8h, #8
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ ursra v9.8h, v9.8h, #8
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ursra v10.8h, v10.8h, #8
+
+ cache_preload 8, 8
+
+ ursra v11.8h, v11.8h, #8
+.endm
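+
+/*
+ * As in the other tail_head macros, the loads, the store and the
+ * prefetch control are interleaved with the arithmetic so that memory
+ * latency overlaps the processing of neighbouring pixel blocks.
+ */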
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+generate_composite_function \
+ pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init
+ mov v3.s[0], w4
+ dup v0.8b, v3.b[0]
+ dup v1.8b, v3.b[1]
+ dup v2.8b, v3.b[2]
+ dup v3.8b, v3.b[3]
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8888_init, \
+ pixman_composite_add_n_8_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init
+ mov v27.s[0], w6
+ dup v27.8b, v27.b[3]
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8888_n_8888_init, \
+ pixman_composite_add_8888_n_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ /* expecting source data in {v0, v1, v2, v3} */
+ /* destination data in {v4, v5, v6, v7} */
+ /* solid mask is in v15 */
+
+ /* 'in': src = src * mask_alpha / 255 */
+ umull v11.8h, v15.8b, v3.8b
+ umull v10.8h, v15.8b, v2.8b
+ umull v9.8h, v15.8b, v1.8b
+ umull v8.8h, v15.8b, v0.8b
+ urshr v16.8h, v11.8h, #8
+ urshr v14.8h, v10.8h, #8
+ urshr v13.8h, v9.8h, #8
+ urshr v12.8h, v8.8h, #8
+ raddhn v3.8b, v11.8h, v16.8h
+ raddhn v2.8b, v10.8h, v14.8h
+ raddhn v1.8b, v9.8h, v13.8h
+ raddhn v0.8b, v8.8h, v12.8h
+ mvn v24.8b, v3.8b /* get inverted alpha */
+ /* now do alpha blending */
+ umull v8.8h, v24.8b, v4.8b
+ umull v9.8h, v24.8b, v5.8b
+ umull v10.8h, v24.8b, v6.8b
+ umull v11.8h, v24.8b, v7.8b
+.endm
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ urshr v16.8h, v8.8h, #8
+ urshr v17.8h, v9.8h, #8
+ urshr v18.8h, v10.8h, #8
+ urshr v19.8h, v11.8h, #8
+ raddhn v28.8b, v16.8h, v8.8h
+ raddhn v29.8b, v17.8h, v9.8h
+ raddhn v30.8b, v18.8h, v10.8h
+ raddhn v31.8b, v19.8h, v11.8h
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init
+ mov v15.s[0], w6
+ dup v15.8b, v15.b[3]
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_8888_init, \
+ pixman_composite_over_8888_n_8888_cleanup, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+ st3 {v0.8b, v1.8b, v2.8b}, [DST_W], #24
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0888_process_pixblock_head, \
+ pixman_composite_src_0888_0888_process_pixblock_tail, \
+ pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
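+ /* swap the R and B channels (v0 <-> v2), using v31 as a temporary */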
+ mov v31.8b, v2.8b
+ mov v2.8b, v0.8b
+ mov v0.8b, v31.8b
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+ st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
+ fetch_src_pixblock
+ mov v31.8b, v2.8b
+ mov v2.8b, v0.8b
+ mov v0.8b, v31.8b
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init
+ eor v3.8b, v3.8b, v3.8b
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_0888_8888_rev_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+ ushll v8.8h, v1.8b, #7
+ sli v8.8h, v8.8h, #1
+ ushll v9.8h, v2.8b, #7
+ sli v9.8h, v9.8h, #1
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+ ushll v14.8h, v0.8b, #7
+ sli v14.8h, v14.8h, #1
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+ ushll v14.8h, v0.8b, #7
+ sli v14.8h, v14.8h, #1
+ fetch_src_pixblock
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+ ushll v8.8h, v1.8b, #7
+ sli v8.8h, v8.8h, #1
+ st1 {v14.8h}, [DST_W], #16
+ ushll v9.8h, v2.8b, #7
+ sli v9.8h, v9.8h, #1
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
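+ /* premultiply: scale each colour channel (v0-v2) by the alpha in v3 */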
+ umull v8.8h, v3.8b, v0.8b
+ umull v9.8h, v3.8b, v1.8b
+ umull v10.8h, v3.8b, v2.8b
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+ urshr v11.8h, v8.8h, #8
+ mov v30.8b, v31.8b
+ mov v31.8b, v3.8b
+ mov v3.8b, v30.8b
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ raddhn v30.8b, v11.8h, v8.8h
+ raddhn v29.8b, v12.8h, v9.8h
+ raddhn v28.8b, v13.8h, v10.8h
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+ urshr v11.8h, v8.8h, #8
+ mov v30.8b, v31.8b
+ mov v31.8b, v3.8b
+ mov v3.8b, v30.8b /* complete the v3 <-> v31 swap via v30 */
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ fetch_src_pixblock
+ raddhn v30.8b, v11.8h, v8.8h
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v29.8b, v12.8h, v9.8h
+ raddhn v28.8b, v13.8h, v10.8h
+ umull v8.8h, v3.8b, v0.8b
+ umull v9.8h, v3.8b, v1.8b
+ umull v10.8h, v3.8b, v2.8b
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+.endm
+
+generate_composite_function \
+ pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+ umull v8.8h, v3.8b, v0.8b
+ umull v9.8h, v3.8b, v1.8b
+ umull v10.8h, v3.8b, v2.8b
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+ urshr v11.8h, v8.8h, #8
+ mov v30.8b, v31.8b
+ mov v31.8b, v3.8b
+ mov v3.8b, v30.8b
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ raddhn v28.8b, v11.8h, v8.8h
+ raddhn v29.8b, v12.8h, v9.8h
+ raddhn v30.8b, v13.8h, v10.8h
+.endm
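+
+/*
+ * Same arithmetic as pixman_composite_src_pixbuf_8888 above, but the
+ * blended channels land in v28..v30 in order, so R and B are not
+ * swapped on store.
+ */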
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+ urshr v11.8h, v8.8h, #8
+ mov v30.8b, v31.8b
+ mov v31.8b, v3.8b
+ mov v3.8b, v30.8b
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ fetch_src_pixblock
+ raddhn v28.8b, v11.8h, v8.8h
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v29.8b, v12.8h, v9.8h
+ raddhn v30.8b, v13.8h, v10.8h
+ umull v8.8h, v3.8b, v0.8b
+ umull v9.8h, v3.8b, v1.8b
+ umull v10.8h, v3.8b, v2.8b
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+.endm
+
+generate_composite_function \
+ pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+ /* mask is in v15 */
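+ /* gather the src (v8, v9) and dst (v10, v11) halves into v4 and v13 */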
+ mov v4.d[0], v8.d[0]
+ mov v4.d[1], v9.d[0]
+ mov v13.d[0], v10.d[0]
+ mov v13.d[1], v11.d[0]
+ convert_0565_to_x888 v4, v2, v1, v0
+ convert_0565_to_x888 v13, v6, v5, v4
+ /* source pixel data is in {v0, v1, v2, XX} */
+ /* destination pixel data is in {v4, v5, v6, XX} */
+ mvn v7.8b, v15.8b
+ umull v10.8h, v15.8b, v2.8b
+ umull v9.8h, v15.8b, v1.8b
+ umull v8.8h, v15.8b, v0.8b
+ umull v11.8h, v7.8b, v4.8b
+ umull v12.8h, v7.8b, v5.8b
+ umull v13.8h, v7.8b, v6.8b
+ urshr v19.8h, v10.8h, #8
+ urshr v18.8h, v9.8h, #8
+ urshr v17.8h, v8.8h, #8
+ raddhn v2.8b, v10.8h, v19.8h
+ raddhn v1.8b, v9.8h, v18.8h
+ raddhn v0.8b, v8.8h, v17.8h
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+ urshr v17.8h, v11.8h, #8
+ urshr v18.8h, v12.8h, #8
+ urshr v19.8h, v13.8h, #8
+ raddhn v28.8b, v17.8h, v11.8h
+ raddhn v29.8b, v18.8h, v12.8h
+ raddhn v30.8b, v19.8h, v13.8h
+ uqadd v0.8b, v0.8b, v28.8b
+ uqadd v1.8b, v1.8b, v29.8b
+ uqadd v2.8b, v2.8b, v30.8b
+ /* 32bpp result is in {v0, v1, v2, XX} */
+ convert_8888_to_0565 v2, v1, v0, v14, v30, v13
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_over_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ ld1 {v10.4h, v11.4h}, [DST_R], #16
+ cache_preload 8, 8
+ pixman_composite_over_0565_8_0565_process_pixblock_head
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+ mov v15.s[0], w6
+ dup v15.8b, v15.b[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_0565_n_0565_init, \
+ pixman_composite_over_0565_n_0565_cleanup, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+ /* mask is in v15 */
+ mov v4.d[0], v8.d[0]
+ mov v4.d[1], v9.d[0]
+ mov v13.d[0], v10.d[0]
+ mov v13.d[1], v11.d[0]
+ convert_0565_to_x888 v4, v2, v1, v0
+ convert_0565_to_x888 v13, v6, v5, v4
+ /* source pixel data is in {v0, v1, v2, XX} */
+ /* destination pixel data is in {v4, v5, v6, XX} */
+ umull v9.8h, v15.8b, v2.8b
+ umull v8.8h, v15.8b, v1.8b
+ umull v7.8h, v15.8b, v0.8b
+ urshr v12.8h, v9.8h, #8
+ urshr v11.8h, v8.8h, #8
+ urshr v10.8h, v7.8h, #8
+ raddhn v2.8b, v9.8h, v12.8h
+ raddhn v1.8b, v8.8h, v11.8h
+ raddhn v0.8b, v7.8h, v10.8h
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+ uqadd v0.8b, v0.8b, v4.8b
+ uqadd v1.8b, v1.8b, v5.8b
+ uqadd v2.8b, v2.8b, v6.8b
+ /* 32bpp result is in {v0, v1, v2, XX} */
+ convert_8888_to_0565 v2, v1, v0, v14, v30, v13
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_add_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ ld1 {v10.4h, v11.4h}, [DST_R], #16
+ cache_preload 8, 8
+ pixman_composite_add_0565_8_0565_process_pixblock_head
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
+generate_composite_function \
+ pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_add_0565_8_0565_process_pixblock_head, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+ /* mask is in v15 */
+ mov v12.d[0], v10.d[0]
+ mov v12.d[1], v11.d[0]
+ convert_0565_to_x888 v12, v6, v5, v4
+ /* destination pixel data is in {v4, v5, v6, xx} */
+ mvn v24.8b, v15.8b /* get inverted alpha */
+ /* now do alpha blending */
+ umull v8.8h, v24.8b, v4.8b
+ umull v9.8h, v24.8b, v5.8b
+ umull v10.8h, v24.8b, v6.8b
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ urshr v11.8h, v8.8h, #8
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ raddhn v0.8b, v11.8h, v8.8h
+ raddhn v1.8b, v12.8h, v9.8h
+ raddhn v2.8b, v13.8h, v10.8h
+ /* 32bpp result is in {v0, v1, v2, XX} */
+ convert_8888_to_0565 v2, v1, v0, v14, v12, v3
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+ fetch_src_pixblock
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ ld1 {v10.4h, v11.4h}, [DST_R], #16
+ cache_preload 8, 8
+ pixman_composite_out_reverse_8_0565_process_pixblock_head
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
+generate_composite_function \
+ pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 15, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
+ /* src is in v0 */
+ /* destination pixel data is in {v4, v5, v6, v7} */
+ mvn v1.8b, v0.8b /* get inverted alpha */
+ /* now do alpha blending */
+ umull v8.8h, v1.8b, v4.8b
+ umull v9.8h, v1.8b, v5.8b
+ umull v10.8h, v1.8b, v6.8b
+ umull v11.8h, v1.8b, v7.8b
+.endm
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v12.8h, v10.8h, #8
+ urshr v13.8h, v11.8h, #8
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ raddhn v30.8b, v12.8h, v10.8h
+ raddhn v31.8b, v13.8h, v11.8h
+ /* 32bpp result is in {v28, v29, v30, v31} */
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ pixman_composite_out_reverse_8_8888_process_pixblock_tail
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ cache_preload 8, 8
+ pixman_composite_out_reverse_8_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function \
+ pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_out_reverse_8_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+/*
+ * Bilinear scaling support code which provides pixel fetching, color
+ * format conversion, and interpolation as separate macros that can be
+ * used as the basic building blocks for constructing bilinear scanline
+ * functions.
+ */
+
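+/*
+ * In C-like terms, one interpolated output pixel is computed per channel
+ * roughly as follows (a sketch; BASE = 1 << BILINEAR_INTERPOLATION_BITS,
+ * the vertical weights satisfy wt + wb == BASE, and wx is the fractional
+ * x position in [0, BASE)):
+ *
+ *     left  = tl * wt + bl * wb;                // vertical: umull + umlal
+ *     right = tr * wt + br * wb;
+ *     out   = (left * (BASE - wx) + right * wx)
+ *             >> (2 * BILINEAR_INTERPOLATION_BITS);
+ *                                // horizontal: ushll/umlsl/umlal2 + shrn
+ */
+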
+.macro bilinear_load_8888 reg1, reg2, tmp
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ ld1 {\()\reg1\().2s}, [TMP1], STRIDE
+ ld1 {\()\reg2\().2s}, [TMP1]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\reg2\().s}[1], [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
+.endm
+
+.macro vzip reg1, reg2
+ umov TMP4, v31.d[0]
+ zip1 v31.8b, \reg1, \reg2
+ zip2 \reg2, \reg1, \reg2
+ mov \reg1, v31.8b
+ mov v31.d[0], TMP4
+.endm
+
+.macro vuzp reg1, reg2
+ umov TMP4, v31.d[0]
+ uzp1 v31.8b, \reg1, \reg2
+ uzp2 \reg2, \reg1, \reg2
+ mov \reg1, v31.8b
+ mov v31.d[0], TMP4
+.endm
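+
+/*
+ * The AArch64 zip/uzp instructions produce their two result halves in
+ * separate destination registers, so emulating the ARM32 in-place
+ * vzip/vuzp behavior needs a scratch register: v31 is borrowed for that,
+ * and its previous contents are preserved in TMP4 around the shuffle.
+ */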
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
+ ld1 {\()\acc2\().s}[1], [TMP1]
+ ld1 {\()\acc2\().s}[3], [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip \()\reg1\().8b, \()\reg3\().8b
+ vzip \()\reg2\().8b, \()\reg4\().8b
+ vzip \()\reg3\().8b, \()\reg4\().8b
+ vzip \()\reg1\().8b, \()\reg2\().8b
+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
+ ld1 {\()\xacc2\().s}[1], [TMP1]
+ ld1 {\()\xacc2\().s}[3], [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
+ vzip \()\xreg1\().8b, \()\xreg3\().8b
+ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
+ vzip \()\xreg2\().8b, \()\xreg4\().8b
+ ld1 {\()\yacc2\().s}[1], [TMP1]
+ vzip \()\xreg3\().8b, \()\xreg4\().8b
+ ld1 {\()\yacc2\().s}[3], [TMP2]
+ vzip \()\xreg1\().8b, \()\xreg2\().8b
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
+ vzip \()\yreg1\().8b, \()\yreg3\().8b
+ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
+ vzip \()\yreg2\().8b, \()\yreg4\().8b
+ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
+ vzip \()\yreg3\().8b, \()\yreg4\().8b
+ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
+ vzip \()\yreg1\().8b, \()\yreg2\().8b
+ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
+ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
+ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
+ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if \numpix == 4
+ st1 {v0.2s, v1.2s}, [OUT], #16
+.elseif \numpix == 2
+ st1 {v0.2s}, [OUT], #8
+.elseif \numpix == 1
+ st1 {v0.s}[0], [OUT], #4
+.else
+ .error "bilinear_store_8888 \numpix is unsupported"
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp v0.8b, v1.8b
+ vuzp v2.8b, v3.8b
+ vuzp v1.8b, v3.8b
+ vuzp v0.8b, v2.8b
+ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
+.if \numpix == 4
+ st1 {v1.4h}, [OUT], #8
+.elseif \numpix == 2
+ st1 {v1.s}[0], [OUT], #4
+.elseif \numpix == 1
+ st1 {v1.h}[0], [OUT], #2
+.else
+ .error "bilinear_store_0565 \numpix is unsupported"
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_load_\()\src_fmt v0, v1, v2
+ umull v2.8h, v0.8b, v28.8b
+ umlal v2.8h, v1.8b, v29.8b
+ /* 5 cycles bubble */
+ ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v2.4h, v15.h[0]
+ umlal2 v0.4s, v2.8h, v15.h[0]
+ /* 5 cycles bubble */
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ /* 3 cycles bubble */
+ xtn v0.8b, v0.8h
+ /* 1 cycle bubble */
+ bilinear_store_\()\dst_fmt 1, v3, v4
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
+ v1, v11, v2, v3, v20, v21, v22, v23
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ xtn v0.8b, v0.8h
+ bilinear_store_\()\dst_fmt 2, v3, v4
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
+ v1, v11, v14, v20, v16, v17, v22, v23, \
+ v3, v9, v24, v25, v26, v27, v18, v19
+ prfm PREFETCH_MODE, [TMP1, PF_OFFS]
+ sub TMP1, TMP1, STRIDE
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v3.4h, v15.h[0]
+ umlal2 v2.4s, v3.8h, v15.h[0]
+ ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ prfm PREFETCH_MODE, [TMP2, PF_OFFS]
+ umlsl v8.4s, v9.4h, v15.h[4]
+ umlal2 v8.4s, v9.8h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ xtn v0.8b, v0.8h
+ xtn v1.8b, v2.8h
+ add v12.8h, v12.8h, v13.8h
+ bilinear_store_\()\dst_fmt 4, v3, v4
+.endm
+
+.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
+.else
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
+.else
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
+.else
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
+.else
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
+.else
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4, 0
+.set BILINEAR_FLAG_UNROLL_8, 1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * The bilinear scanline scaler macro template uses the following arguments:
+ * fname             - name of the function to generate
+ * src_fmt           - source color format (8888 or 0565)
+ * dst_fmt           - destination color format (8888 or 0565)
+ * src_bpp_shift     - (1 << src_bpp_shift) is the size of a source pixel
+ *                     in bytes
+ * dst_bpp_shift     - (1 << dst_bpp_shift) is the size of a destination
+ *                     pixel in bytes
+ * prefetch_distance - prefetch in the source image by that many pixels
+ *                     ahead
+ * flags             - any combination of the BILINEAR_FLAG_* constants
+ *                     defined above
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+ src_bpp_shift, dst_bpp_shift, \
+ prefetch_distance, flags
+
+pixman_asm_function \fname
+ OUT .req x0
+ TOP .req x1
+ BOTTOM .req x2
+ WT .req x3
+ WB .req x4
+ X .req x5
+ UX .req x6
+ WIDTH .req x7
+ TMP1 .req x8
+ TMP2 .req x9
+ PF_OFFS .req x10
+ TMP3 .req x11
+ TMP4 .req x12
+ STRIDE .req x13
+
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
+ sxtw x6, w6
+ sxtw x7, w7
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 112 /* push all registers */
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
+ stp x8, x9, [x29, -80]
+ stp x10, x11, [x29, -96]
+ stp x12, x13, [x29, -112]
+
+ mov PF_OFFS, #\prefetch_distance
+ mul PF_OFFS, PF_OFFS, UX
+
+ subs STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 300f
+
+ dup v12.8h, w5
+ dup v13.8h, w6
+ dup v28.8b, w3
+ dup v29.8b, w4
+ mov v25.d[0], v12.d[1]
+ mov v26.d[0], v13.d[0]
+ add v25.4h, v25.4h, v26.4h
+ mov v12.d[1], v25.d[0]
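+
+ /*
+  * After the lane setup above, v12 holds the fractional part of the
+  * 16.16 x coordinate for the current pixel in its low four lanes and
+  * for the next pixel (x + ux) in its high four lanes, while v13 holds
+  * the per-step increment (doubled below once two pixels are processed
+  * per step). Shifting v12 right by (16 - BILINEAR_INTERPOLATION_BITS)
+  * then yields the horizontal weights in v15.
+  */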
+
+ /* ensure good destination alignment */
+ cmp WIDTH, #1
+ blt 100f
+ tst OUT, #(1 << \dst_bpp_shift)
+ beq 100f
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
+ sub WIDTH, WIDTH, #1
+100:
+ add v13.8h, v13.8h, v13.8h
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+
+ cmp WIDTH, #2
+ blt 100f
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
+ beq 100f
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
+ sub WIDTH, WIDTH, #2
+100:
+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+ cmp WIDTH, #4
+ blt 100f
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
+ beq 100f
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+ sub WIDTH, WIDTH, #4
+100:
+ subs WIDTH, WIDTH, #8
+ blt 100f
+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
+ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #8
+ blt 500f
+1000:
+ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #8
+ bge 1000b
+500:
+ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
+100:
+ tst WIDTH, #4
+ beq 200f
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+200:
+.else
+/*********** 4 pixels per iteration *****************/
+ subs WIDTH, WIDTH, #4
+ blt 100f
+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #4
+ blt 500f
+1000:
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #4
+ bge 1000b
+500:
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
+100:
+/****************************************************/
+.endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 200f
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
+200:
+ tst WIDTH, #1
+ beq 300f
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
+300:
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
+ ldp x8, x9, [x29, -80]
+ ldp x10, x11, [x29, -96]
+ ldp x12, x13, [x29, -112]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret
+
+ .unreq OUT
+ .unreq TOP
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+pixman_end_asm_function
+
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_four_pixels_8888_8888, 1
+
+.macro bilinear_interpolate_four_pixels_8888_8888_head
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+
+ ld1 {v22.2s}, [TMP1], STRIDE
+ ld1 {v23.2s}, [TMP1]
+ asr TMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ umull v8.8h, v22.8b, v28.8b
+ umlal v8.8h, v23.8b, v29.8b
+
+ ld1 {v22.2s}, [TMP2], STRIDE
+ ld1 {v23.2s}, [TMP2]
+ asr TMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umull v9.8h, v22.8b, v28.8b
+ umlal v9.8h, v23.8b, v29.8b
+
+ ld1 {v22.2s}, [TMP3], STRIDE
+ ld1 {v23.2s}, [TMP3]
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v8.4h, v15.h[0]
+ umlal2 v0.4s, v8.8h, v15.h[0]
+
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v1.4s, v9.4h, v15.h[4]
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v10.4h, v15.h[0]
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ xtn v6.8b, v0.8h
+ xtn v7.8b, v2.8h
+ add v12.8h, v12.8h, v13.8h
+ st1 {v6.2s, v7.2s}, [OUT], #16
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v10.4h, v15.h[0]
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ ld1 {v20.2s}, [TMP1], STRIDE
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ ld1 {v21.2s}, [TMP1]
+ umull v8.8h, v20.8b, v28.8b
+ umlal v8.8h, v21.8b, v29.8b
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ld1 {v22.2s}, [TMP2], STRIDE
+ shrn2 v4.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ ld1 {v23.2s}, [TMP2]
+ umull v9.8h, v22.8b, v28.8b
+ asr TMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ asr TMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umlal v9.8h, v23.8b, v29.8b
+ ld1 {v22.2s}, [TMP3], STRIDE
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ld1 {v23.2s}, [TMP3]
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+ xtn v6.8b, v0.8h
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ xtn v7.8b, v4.8h
+ umlsl v0.4s, v8.4h, v15.h[0]
+ umlal2 v0.4s, v8.8h, v15.h[0]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ add v12.8h, v12.8h, v13.8h
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+ st1 {v6.2s, v7.2s}, [OUT], #16
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v1.4s, v9.4h, v15.h[4]
+.endm
+
+/*****************************************************************************/
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+ 2, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+ 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+ 1, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+ 1, 1, 28, BILINEAR_FLAG_UNROLL_4
diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
new file mode 100644
index 0000000..6aa6838
--- /dev/null
+++ b/pixman/pixman-arma64-neon-asm.h
@@ -0,0 +1,1310 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions based on a common template.
+ * Any combination of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp or 32bpp color formats is supported.
+ *
+ * This macro takes care of:
+ * - handling of leading and trailing unaligned pixels
+ * - doing most of the work related to L2 cache preload
+ * - encouraging the use of software pipelining for better instruction
+ *   scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros which implement the basic code chunks responsible for pixel
+ * processing. See the 'pixman-arma64-neon-asm.S' file for usage
+ * examples.
+ *
+ * TODO:
+ * - try overlapped pixel method (from Ian Rickards) when processing
+ * exactly two blocks of pixels
+ * - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY, 0
+.set FLAG_DST_READWRITE, 1
+.set FLAG_DEINTERLEAVE_32BPP, 2
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
+
+/*
+ * prefetch mode
+ * available modes are:
+ * pldl1keep
+ * pldl1strm
+ * pldl2keep
+ * pldl2strm
+ * pldl3keep
+ * pldl3strm
+ */
+#define PREFETCH_MODE pldl1keep
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+ \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
+ \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\()
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+.if \numbytes == 32
+ .if \elem_size==32
+ pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+ .elseif \elem_size==16
+ pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+ .else
+ pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+ .endif
+.elseif \numbytes == 16
+ .if \elem_size==32
+ pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+ .elseif \elem_size==16
+ pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+ .else
+ pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+ .endif
+.elseif \numbytes == 8
+ .if \elem_size==32
+ pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits
+ .elseif \elem_size==16
+ pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits
+ .else
+ pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits
+ .endif
+.elseif \numbytes == 4
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
+ pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4
+ .elseif \elem_size == 16
+ pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2
+ pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2
+ .else
+ pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1
+ pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1
+ pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1
+ pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1
+ .endif
+.elseif \numbytes == 2
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
+ pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2
+ .else
+ pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1
+ pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1
+ .endif
+.elseif \numbytes == 1
+ pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1
+.else
+ .error "unsupported size: \numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 ld3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
+.else
+ pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
+.elseif \numpix * \bpp == 32 && \abits == 32
+ pixldst 4, st1, 32, \basereg, \mem_operand, \abits
+.elseif \numpix * \bpp == 16 && \abits == 16
+ pixldst 2, st1, 16, \basereg, \mem_operand, \abits
+.else
+ pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (\bpp * \numpix) <= 128
+ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
+.else
+ pixld \numpix, \bpp, \basereg, \mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (\bpp * \numpix) <= 128
+ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
+.else
+ pixst \numpix, \bpp, \basereg, \mem_operand, 128
+.endif
+.endm
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X and
+ * SRC_WIDTH_FIXED register aliases to be defined)
+ */
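+/*
+ * Each individual fetch below follows, in C-like terms, roughly this
+ * pattern (a sketch; the callers bias VX so that the negative range
+ * works out):
+ *
+ *     x = vx >> 16;                 // integer source pixel index
+ *     vx += unit_x;                 // advance in 16.16 fixed point
+ *     while (vx >= 0)
+ *         vx -= src_width_fixed;    // keep vx negative, wrapping the
+ *                                   // scanline for NORMAL repeat
+ *     pixel = src[x];               // scaled by the element size
+ */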
+.macro pixld1_s elem_size, reg1, mem_operand
+.if \elem_size == 16
+ asr TMP1, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #1
+ asr TMP2, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP2, \mem_operand, TMP2, lsl #1
+ ld1 {v\()\reg1\().h}[0], [TMP1]
+ asr TMP1, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #1
+ ld1 {v\()\reg1\().h}[1], [TMP2]
+ asr TMP2, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP2, \mem_operand, TMP2, lsl #1
+ ld1 {v\()\reg1\().h}[2], [TMP1]
+ ld1 {v\()\reg1\().h}[3], [TMP2]
+.elseif \elem_size == 32
+ asr TMP1, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #2
+ asr TMP2, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP2, \mem_operand, TMP2, lsl #2
+ ld1 {v\()\reg1\().s}[0], [TMP1]
+ ld1 {v\()\reg1\().s}[1], [TMP2]
+.else
+ .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if 0 /* \elem_size == 32 */
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+ add TMP1, \mem_operand, TMP1, asl #2
+ mov TMP2, VX, asr #16
+ sub VX, VX, UNIT_X
+ add TMP2, \mem_operand, TMP2, asl #2
+ ld1 {v\()\reg1\().s}[0], [TMP1]
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+ add TMP1, \mem_operand, TMP1, asl #2
+ ld1 {v\()\reg2\().s}[0], [TMP2, :32]
+ mov TMP2, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP2, \mem_operand, TMP2, asl #2
+ ld1 {v\()\reg1\().s}[1], [TMP1]
+ ld1 {v\()\reg2\().s}[1], [TMP2]
+.else
+ pixld1_s \elem_size, \reg1, \mem_operand
+ pixld1_s \elem_size, \reg2, \mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if \elem_size == 16
+ asr TMP1, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #1
+ ld1 {v\()\reg1\().h}[\idx], [TMP1]
+.elseif \elem_size == 32
+ asr DUMMY, VX, #16
+ mov TMP1, DUMMY
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #2
+ ld1 {v\()\reg1\().s}[\idx], [TMP1]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if \numbytes == 32
+ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
+ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
+ pixdeinterleave \elem_size, %(\basereg+4)
+.elseif \numbytes == 16
+ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
+.elseif \numbytes == 8
+ pixld1_s \elem_size, %(\basereg+1), \mem_operand
+.elseif \numbytes == 4
+ .if \elem_size == 32
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+ .elseif \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
+ .else
+ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
+ .endif
+.elseif \numbytes == 2
+ .if \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+ .else
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
+ .endif
+.elseif \numbytes == 1
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+.else
+ .error "unsupported size: \numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if \bpp > 0
+ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2
+ umov DUMMY, v16.d[0]
+ uzp1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
+ uzp2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
+ mov v\()\reg1\().8b, v16.8b
+ mov v16.d[0], DUMMY
+.endm
+
+.macro vzip8 reg1, reg2
+ umov DUMMY, v16.d[0]
+ zip1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
+ zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
+ mov v\()\reg1\().8b, v16.8b
+ mov v16.d[0], DUMMY
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vuzp8 %(\basereg+0), %(\basereg+1)
+ vuzp8 %(\basereg+2), %(\basereg+3)
+ vuzp8 %(\basereg+1), %(\basereg+3)
+ vuzp8 %(\basereg+0), %(\basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vzip8 %(\basereg+0), %(\basereg+2)
+ vzip8 %(\basereg+1), %(\basereg+3)
+ vzip8 %(\basereg+2), %(\basereg+3)
+ vzip8 %(\basereg+0), %(\basereg+1)
+.endif
+.endm
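+
+/*
+ * With DEINTERLEAVE_32BPP_ENABLED, eight 32bpp pixels are kept in planar
+ * form: after pixdeinterleave the four registers hold the B, G, R and A
+ * bytes of all eight pixels respectively. A single 8-lane instruction can
+ * then process one whole channel at once, e.g. (illustrative registers):
+ *
+ *     umull v8.8h, v24.8b, v4.8b   // mask alpha * destination B, 8 pixels
+ *
+ * pixinterleave converts back to the interleaved layout before writeback.
+ */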
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * the cache preload logic is mostly independent of the rest of the pixel
+ * processing code. It starts at the top left pixel, moves forward across
+ * pixels, and can jump across scanlines. The prefetch distance is handled
+ * in an 'incremental' way: it starts from 0 and advances to the optimal
+ * distance over time. After reaching the optimal prefetch distance, it is
+ * kept constant. There are some checks which prevent prefetching unneeded
+ * pixel lines below the image (it can still prefetch a bit more data on
+ * the right side of the image - not a big issue, and it may actually be
+ * helpful when rendering text glyphs). An additional trick is the use of
+ * an LDR instruction for prefetch instead of PLD when moving to the next
+ * line: there is a high chance of a TLB miss in this case, which would
+ * make PLD useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working
+ * with fully cached data). But in reality, because the NEON unit in the
+ * ARM Cortex-A8 has its own pipeline and instruction queue, normal ARM
+ * code can execute simultaneously with NEON code and be completely
+ * shadowed by it. Thus we get no performance overhead at all (*). This is
+ * a very nice feature of the Cortex-A8 when used wisely. We don't have a
+ * hardware prefetcher, but we can still implement rather advanced prefetch
+ * logic in software for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixel processing such as a simple copy. Still, prefetch is a must when
+ * working with graphics data.
+ */
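+/*
+ * Roughly, in C-like terms, one cache_preload invocation does the
+ * following (a sketch only, using the register aliases defined by
+ * 'generate_composite_function' below):
+ *
+ *     pf_x += std_increment;             // nominal advance
+ *     if (pf_ctl & 0xf) {                // distance still ramping up
+ *         pf_x += boost_increment;
+ *         pf_ctl--;
+ *     }
+ *     prefetch(pf_src + (pf_x << src_bpp_shift));  // likewise dst/mask
+ *     if (pf_x > orig_w) {               // ran past the end of the line
+ *         pf_x -= orig_w;
+ *         pf_ctl -= 0x10;                // one scanline consumed
+ *         if (pf_ctl > 0)                // touch the next line with a
+ *                                        // real load (LDRSB), see above
+ *             dummy = *(int8_t *)(pf_src + (src_stride << src_bpp_shift));
+ *     }
+ */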
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+ \a \x
+.endif
+.endm
+
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if \std_increment != 0
+ PF add, PF_X, PF_X, #\std_increment
+.endif
+ PF tst, PF_CTL, #0xF
+ PF beq, 71f
+ PF add, PF_X, PF_X, #\boost_increment
+ PF sub, PF_CTL, PF_CTL, #1
+71:
+ PF cmp, PF_X, ORIG_W
+.if src_bpp_shift >= 0
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+.endif
+.if dst_r_bpp != 0
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+.endif
+.if mask_bpp_shift >= 0
+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
+.endif
+ PF ble, 71f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+71:
+ PF ble, 72f
+.if src_bpp_shift >= 0
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+.endif
+.if dst_r_bpp != 0
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+.endif
+.if mask_bpp_shift >= 0
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
+ PF add, PF_MASK, PF_MASK, #1
+.endif
+72:
+.endif
+.endm
+
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+.if src_bpp > 0
+ prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+.endif
+.if dst_r_bpp > 0
+ prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+ prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+.macro fetch_mask_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
+/*
+ * Macro which is used to process leading pixels until the destination
+ * pointer is properly aligned (at a 16 byte boundary). When the
+ * destination buffer uses a 24bpp format, the alignment pass is skipped
+ * (see the dst_w_bpp != 24 check below), since the chunked power-of-two
+ * loads used here cannot handle 3-byte pixels.
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+.if dst_w_bpp != 24
+ tst DST_R, #0xF
+ beq 52f
+
+.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
+.irp lowbit, 1, 2, 4, 8, 16
+
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_R, #\lowbit
+ beq 51f
+.endif
+ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+ add DST_R, DST_R, #\lowbit
+.endif
+ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
+ sub W, W, #(\lowbit * 8 / dst_w_bpp)
+51:
+.endif
+.endr
+.endif
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+ \process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ \process_pixblock_tail
+
+ pixinterleave dst_w_bpp, dst_w_basereg
+
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_W, #\lowbit
+ beq 51f
+.endif
+.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
+ sub W, W, #(\lowbit * 8 / dst_w_bpp)
+.endif
+ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+51:
+.endif
+.endr
+.endif
+52:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing operates on pixblock_size pixels
+ * at a time, anything smaller than this has to be loaded and stored in
+ * a special way. Loading and storing of pixel data is performed in such
+ * a way that we fill some 'slots' in the NEON registers (some slots
+ * naturally stay unused), then perform the compositing operation as
+ * usual. In the end, the data is taken from these 'slots' and saved to
+ * memory.
+ *
+ * cache_preload_flag - prefetch is suppressed when this is set to 0
+ * dst_aligned_flag   - selects whether the destination buffer is
+ *                      assumed to be aligned
+ */
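+/*
+ * In outline (a C-like sketch), the leftover count W & (pixblock_size - 1)
+ * is decomposed into power-of-two chunks; e.g. 7 trailing pixels with
+ * pixblock_size == 8 become:
+ *
+ *     if (W & 4) load 4 pixels into their slots;
+ *     if (W & 2) load 2 pixels;
+ *     if (W & 1) load 1 pixel;
+ *     run one head + tail pass over the (partially filled) block;
+ *     if (W & 4) store 4 pixels; ... and so on for 2 and 1;
+ */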
+.macro process_trailing_pixels cache_preload_flag, \
+ dst_aligned_flag, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ tst W, #(pixblock_size - 1)
+ beq 52f
+.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
+ beq 51f
+ pixld_src \chunk_size, src_bpp, src_basereg, SRC
+ pixld \chunk_size, mask_bpp, mask_basereg, MASK
+.if \dst_aligned_flag != 0
+ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if \cache_preload_flag != 0
+ PF add, PF_X, PF_X, #\chunk_size
+.endif
+51:
+.endif
+.endr
+.endif
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+ \process_pixblock_head
+.if \cache_preload_flag != 0
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+.endif
+ \process_pixblock_tail
+ pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
+ beq 51f
+.if \dst_aligned_flag != 0
+ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+51:
+.endif
+.endr
+52:
+.endm
+
+/*
+ * Macro which performs all the operations needed to switch to the next
+ * scanline and start the next loop iteration, unless all the scanlines
+ * have already been processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+ mov W, ORIG_W
+ add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
+.if src_bpp != 0
+ add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+ add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24)
+ sub DST_W, DST_W, W, lsl #dst_bpp_shift
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+ sub SRC, SRC, W, lsl #src_bpp_shift
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+ sub MASK, MASK, W, lsl #mask_bpp_shift
+.endif
+ subs H, H, #1
+ mov DST_R, DST_W
+ bge \start_of_loop_label
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * v0, v1, v2, v3 - reserved for loading source pixel data
+ * v4, v5, v6, v7 - reserved for loading destination pixel data
+ * v24, v25, v26, v27 - reserved for loading mask pixel data
+ * v28, v29, v30, v31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags, \
+ pixblock_size_, \
+ prefetch_distance, \
+ init, \
+ cleanup, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+ pixman_asm_function \fname
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 232 /* push all registers */
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
+ stp x8, x9, [x29, -80]
+ stp x10, x11, [x29, -96]
+ stp x12, x13, [x29, -112]
+ stp x14, x15, [x29, -128]
+ stp x16, x17, [x29, -144]
+ stp x18, x19, [x29, -160]
+ stp x20, x21, [x29, -176]
+ stp x22, x23, [x29, -192]
+ stp x24, x25, [x29, -208]
+ stp x26, x27, [x29, -224]
+ str x28, [x29, -232]
+
+/*
+ * Select the prefetch type for this function. If the prefetch distance
+ * is set to 0, prefetch is disabled entirely; if one of the color
+ * formats is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
+ */
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if \prefetch_distance == 0
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
+
+ .macro pixld_src x:vararg
+ pixld \x
+ .endm
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+/*
+ * Assign symbolic names to registers
+ */
+ W .req x0 /* width (is updated during processing) */
+ H .req x1 /* height (is updated during processing) */
+ DST_W .req x2 /* destination buffer pointer for writes */
+ DST_STRIDE .req x3 /* destination image stride */
+ SRC .req x4 /* source buffer pointer */
+ SRC_STRIDE .req x5 /* source image stride */
+ MASK .req x6 /* mask pointer */
+ MASK_STRIDE .req x7 /* mask stride */
+
+ DST_R .req x8 /* destination buffer pointer for reads */
+
+ PF_CTL .req x9 /* combined lines counter and prefetch */
+ /* distance increment counter */
+ PF_X .req x10 /* pixel index in a scanline for current */
+ /* prefetch position */
+ PF_SRC .req x11 /* pointer to source scanline start */
+ /* for prefetch purposes */
+ PF_DST .req x12 /* pointer to destination scanline start */
+ /* for prefetch purposes */
+ PF_MASK .req x13 /* pointer to mask scanline start */
+ /* for prefetch purposes */
+
+ ORIG_W .req x14 /* saved original width */
+ DUMMY .req x15 /* temporary register */
+
+ sxtw x0, w0
+ sxtw x1, w1
+ sxtw x3, w3
+ sxtw x5, w5
+ sxtw x7, w7
+
+ .set mask_bpp_shift, -1
+.if src_bpp == 32
+ .set src_bpp_shift, 2
+.elseif src_bpp == 24
+ .set src_bpp_shift, 0
+.elseif src_bpp == 16
+ .set src_bpp_shift, 1
+.elseif src_bpp == 8
+ .set src_bpp_shift, 0
+.elseif src_bpp == 0
+ .set src_bpp_shift, -1
+.else
+ .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+ .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+ .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+ .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+ .set mask_bpp_shift, -1
+.else
+ .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+ .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+ .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+ .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+ .set dst_bpp_shift, 0
+.else
+ .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+.else
+ .set dst_r_bpp, 0
+.endif
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if \prefetch_distance < 0 || \prefetch_distance > 15
+ .error "invalid prefetch distance (\prefetch_distance)"
+.endif
+
+ PF mov, PF_X, #0
+ mov DST_R, DST_W
+
+.if src_bpp == 24
+ sub SRC_STRIDE, SRC_STRIDE, W
+ sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+ sub MASK_STRIDE, MASK_STRIDE, W
+ sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+ sub DST_STRIDE, DST_STRIDE, W
+ sub DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+ PF mov, PF_SRC, SRC
+ PF mov, PF_DST, DST_R
+ PF mov, PF_MASK, MASK
+ /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */
+ PF lsl, DUMMY, H, #4
+ PF mov, PF_CTL, DUMMY
+ PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10)
+
+ \init
+ subs H, H, #1
+ mov ORIG_W, W
+ blt 9f
+ cmp W, #(pixblock_size * 2)
+ blt 800f
+/*
+ * This is the start of the pipelined loop, which is optimized for
+ * long scanlines.
+ */
+0:
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
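+ /*
+  * In C-like terms, the pipelined loop below is (a sketch):
+  *
+  *     load(block 0); head(block 0);
+  *     while (more blocks)
+  *         tail_head();   // finish previous block, load and start next
+  *     tail(); store(last block);
+  *
+  * so the memory accesses of one block overlap with the arithmetic of
+  * its neighbours, helping hide both latencies.
+  */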
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ PF add, PF_X, PF_X, #pixblock_size
+ \process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ subs W, W, #(pixblock_size * 2)
+ blt 200f
+
+100:
+ \process_pixblock_tail_head
+ cache_preload_simple
+ subs W, W, #pixblock_size
+ bge 100b
+
+200:
+ \process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 1, 1, \
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+ advance_to_next_scanline 0b
+
+ \cleanup
+1000:
+ /* pop all registers */
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x8, x9, [x29, -80]
+ ldp x10, x11, [x29, -96]
+ ldp x12, x13, [x29, -112]
+ ldp x14, x15, [x29, -128]
+ ldp x16, x17, [x29, -144]
+ ldp x18, x19, [x29, -160]
+ ldp x20, x21, [x29, -176]
+ ldp x22, x23, [x29, -192]
+ ldp x24, x25, [x29, -208]
+ ldp x26, x27, [x29, -224]
+ ldr x28, [x29, -232]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+/*
+ * This is the start of the loop which processes images with small widths
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch is used.
+ */
+800:
+.if src_bpp_shift >= 0
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [SRC, DUMMY]
+.endif
+.if dst_r_bpp != 0
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [DST_R, DUMMY]
+.endif
+.if mask_bpp_shift >= 0
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [MASK, DUMMY]
+.endif
+ /* Process exactly pixblock_size pixels if needed */
+ tst W, #pixblock_size
+ beq 100f
+ pixld pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ \process_pixblock_head
+ \process_pixblock_tail
+ pixst pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+100:
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 0, 0, \
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+ advance_to_next_scanline 800b
+9:
+ \cleanup
+ /* pop all registers */
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x8, x9, [x29, -80]
+ ldp x10, x11, [x29, -96]
+ ldp x12, x13, [x29, -112]
+ ldp x14, x15, [x29, -128]
+ ldp x16, x17, [x29, -144]
+ ldp x18, x19, [x29, -160]
+ ldp x20, x21, [x29, -176]
+ ldp x22, x23, [x29, -192]
+ ldp x24, x25, [x29, -208]
+ ldp x26, x27, [x29, -224]
+ ldr x28, [x29, -232]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+ .unreq SRC
+ .unreq MASK
+ .unreq DST_R
+ .unreq DST_W
+ .unreq ORIG_W
+ .unreq W
+ .unreq H
+ .unreq SRC_STRIDE
+ .unreq DST_STRIDE
+ .unreq MASK_STRIDE
+ .unreq PF_CTL
+ .unreq PF_X
+ .unreq PF_SRC
+ .unreq PF_DST
+ .unreq PF_MASK
+ .unreq DUMMY
+ pixman_end_asm_function
+.endm
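
The "head (tail_head) ... (tail_head) tail" structure used throughout this
template is classic software pipelining: head starts work on the first pixel
block, each tail_head finishes block n while already starting block n+1 (so
loads overlap computation), and the final tail drains the last in-flight
block. A minimal scalar C sketch of the same control flow, with toy per-block
work (all names are illustrative, not pixman API):

#include <stdio.h>

#define BLOCK 4

static int acc[BLOCK];            /* the block currently "in flight" */

static void head (const int *src) /* load + first half of the work   */
{
    int j;
    for (j = 0; j < BLOCK; j++)
        acc[j] = src[j] + 1;
}

static void tail (int *dst)       /* second half of the work + store */
{
    int j;
    for (j = 0; j < BLOCK; j++)
        dst[j] = acc[j] * 2;
}

/* w must be a multiple of BLOCK and at least 2 * BLOCK */
static void
process_scanline (int *dst, const int *src, int w)
{
    head (src);                   /* start block 0 */
    src += BLOCK;
    w -= 2 * BLOCK;

    while (w >= 0)
    {
        tail (dst);               /* tail_head: finish block n ...  */
        head (src);               /* ... while starting block n + 1 */
        dst += BLOCK;
        src += BLOCK;
        w -= BLOCK;
    }

    tail (dst);                   /* drain the final block */
}

int main (void)
{
    int src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }, dst[8], i;

    process_scanline (dst, src, 8);
    for (i = 0; i < 8; i++)
        printf ("%d ", dst[i]);   /* prints: 2 4 6 8 10 12 14 16 */
    printf ("\n");
    return 0;
}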
+
+/*
+ * A simplified variant of the function generation template for single
+ * scanline processing (for implementing pixman combine functions)
+ */
+.macro generate_composite_function_scanline use_nearest_scaling, \
+ fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags, \
+ pixblock_size_, \
+ init, \
+ cleanup, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+ pixman_asm_function \fname
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
+
+.if \use_nearest_scaling != 0
+ /*
+ * Assign symbolic names to registers for nearest scaling
+ */
+ W .req x0
+ DST_W .req x1
+ SRC .req x2
+ VX .req x3
+ UNIT_X .req x4
+ SRC_WIDTH_FIXED .req x5
+ MASK .req x6
+ TMP1 .req x8
+ TMP2 .req x9
+ DST_R .req x10
+ DUMMY .req x30
+
+ .macro pixld_src x:vararg
+ pixld_s \x
+ .endm
+
+ sxtw x0, w0
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 88
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ stp x8, x9, [x29, -80]
+ str x10, [x29, -88]
+.else
+ /*
+ * Assign symbolic names to registers
+ */
+ W .req x0 /* width (is updated during processing) */
+ DST_W .req x1 /* destination buffer pointer for writes */
+ SRC .req x2 /* source buffer pointer */
+ MASK .req x3 /* mask pointer */
+ DST_R .req x4 /* destination buffer pointer for reads */
+ DUMMY .req x30
+
+ .macro pixld_src x:vararg
+ pixld \x
+ .endm
+
+ sxtw x0, w0
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 64
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+.endif
+
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+.else
+ .set dst_r_bpp, 0
+.endif
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+
+ \init
+ mov DST_R, DST_W
+
+ cmp W, #pixblock_size
+ blt 800f
+
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+
+ subs W, W, #pixblock_size
+ blt 700f
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ \process_pixblock_head
+ subs W, W, #pixblock_size
+ blt 200f
+100:
+ \process_pixblock_tail_head
+ subs W, W, #pixblock_size
+ bge 100b
+200:
+ \process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+700:
+ /* Process the remaining trailing pixels in the scanline (dst aligned) */
+ process_trailing_pixels 0, 1, \
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+
+ \cleanup
+.if \use_nearest_scaling != 0
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x8, x9, [x29, -80]
+ ldr x10, [x29, -88]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+.else
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+.endif
+800:
+ /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+ process_trailing_pixels 0, 0, \
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+
+ \cleanup
+.if \use_nearest_scaling != 0
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x8, x9, [x29, -80]
+ ldr x10, [x29, -88]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+
+ .unreq DUMMY
+ .unreq DST_R
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq DST_W
+ .unreq MASK
+ .unreq SRC_WIDTH_FIXED
+
+.else
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+
+ .unreq DUMMY
+ .unreq SRC
+ .unreq MASK
+ .unreq DST_R
+ .unreq DST_W
+ .unreq W
+.endif
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+ pixman_end_asm_function
+.endm
+
+.macro generate_composite_function_single_scanline x:vararg
+ generate_composite_function_scanline 0, \x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+ generate_composite_function_scanline 1, \x
+.endm
+
+/* Default prologue/epilogue, nothing special needs to be done */
+
+.macro default_init
+.endm
+
+.macro default_cleanup
+.endm
+
+/*
+ * Prologue/epilogue variant which additionally saves/restores v8-v15
+ * registers (they need to be saved/restored by callee according to ABI).
+ * This is required if the code needs to use all the NEON registers.
+ */
+
+.macro default_init_need_all_regs
+.endm
+
+.macro default_cleanup_need_all_regs
+.endm
+
+/******************************************************************************/
+
+/*
+ * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
+ * into a planar a8r8g8b8 format (with a, r, g, b color components
+ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ * value (in) is lost.
+ */
+.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+ shrn \()\out_r\().8b, \()\in\().8h, #8
+ shrn \()\out_g\().8b, \()\in\().8h, #3
+ sli \()\in\().8h, \()\in\().8h, #5
+ movi \()\out_a\().8b, #255
+ sri \()\out_r\().8b, \()\out_r\().8b, #5
+ sri \()\out_g\().8b, \()\out_g\().8b, #6
+ shrn \()\out_b\().8b, \()\in\().8h, #2
+.endm
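
For reference, the shrn/sli/sri sequence above performs, eight pixels at a
time, the usual 565-to-8888 expansion in which each field's top bits are
replicated into its low bits so that full-scale 5- and 6-bit values map to
0xff. A scalar sketch assuming nothing beyond standard C (the macro keeps the
channels planar in separate registers; the sketch packs them for brevity):

#include <stdint.h>
#include <stdio.h>

static uint32_t
convert_0565_to_8888_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    /* Replicate the top bits into the low bits, as the sri
     * instructions above do: 0x1f -> 0xff, 0x3f -> 0xff. */
    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000u | (r << 16) | (g << 8) | b;
}

int main (void)
{
    printf ("%08x\n", convert_0565_to_8888_scalar (0xffff)); /* ffffffff */
    printf ("%08x\n", convert_0565_to_8888_scalar (0xf800)); /* ffff0000 */
    return 0;
}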
+
+.macro convert_0565_to_x888 in, out_r, out_g, out_b
+ shrn \()\out_r\().8b, \()\in\().8h, #8
+ shrn \()\out_g\().8b, \()\in\().8h, #3
+ sli \()\in\().8h, \()\in\().8h, #5
+ sri \()\out_r\().8b, \()\out_r\().8b, #5
+ sri \()\out_g\().8b, \()\out_g\().8b, #6
+ shrn \()\out_b\().8b, \()\in\().8h, #2
+.endm
+
+/*
+ * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
+ * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
+ * pixels packed in a 128-bit register (out). Requires two temporary 128-bit
+ * registers (tmp1, tmp2).
+ */
+.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+ ushll \()\tmp1\().8h, \()\in_g\().8b, #7
+ shl \()\tmp1\().8h, \()\tmp1\().8h, #1
+ ushll \()\out\().8h, \()\in_r\().8b, #7
+ shl \()\out\().8h, \()\out\().8h, #1
+ ushll \()\tmp2\().8h, \()\in_b\().8b, #7
+ shl \()\tmp2\().8h, \()\tmp2\().8h, #1
+ sri \()\out\().8h, \()\tmp1\().8h, #5
+ sri \()\out\().8h, \()\tmp2\().8h, #11
+.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
+ * value of 'in' is lost.
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+ shl \()\out0\().4h, \()\in\().4h, #5 /* G top 6 bits */
+ shl \()\tmp\().4h, \()\in\().4h, #11 /* B top 5 bits */
+ sri \()\in\().4h, \()\in\().4h, #5 /* R is ready in top bits */
+ sri \()\out0\().4h, \()\out0\().4h, #6 /* G is ready in top bits */
+ sri \()\tmp\().4h, \()\tmp\().4h, #5 /* B is ready in top bits */
+ ushr \()\out1\().4h, \()\in\().4h, #8 /* R is in place */
+ sri \()\out0\().4h, \()\tmp\().4h, #8 /* G & B are in place */
+ zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h /* everything is in place */
+ zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h
+ mov \()\out0\().d[0], \()\tmp\().d[0]
+.endm
diff --git a/pixman/pixman-bits-image.c b/pixman/pixman-bits-image.c
index dcdcc69..20353cf 100644
--- a/pixman/pixman-bits-image.c
+++ b/pixman/pixman-bits-image.c
@@ -27,7 +27,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdio.h>
#include <stdlib.h>
@@ -35,44 +35,47 @@
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
+#include "dither/blue-noise-64x64.h"
-static uint32_t *
-_pixman_image_get_scanline_generic_float (pixman_iter_t * iter,
- const uint32_t *mask)
-{
- pixman_iter_get_scanline_t fetch_32 = iter->data;
- uint32_t *buffer = iter->buffer;
-
- fetch_32 (iter, NULL);
+/* Fetch functions */
- pixman_expand_to_float ((argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+static force_inline void
+fetch_pixel_no_alpha_32 (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds,
+ void *out)
+{
+ uint32_t *ret = out;
- return iter->buffer;
+ if (check_bounds &&
+ (x < 0 || x >= image->width || y < 0 || y >= image->height))
+ *ret = 0;
+ else
+ *ret = image->fetch_pixel_32 (image, x, y);
}
-/* Fetch functions */
-
-static force_inline uint32_t
-fetch_pixel_no_alpha (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds)
+static force_inline void
+fetch_pixel_no_alpha_float (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds,
+ void *out)
{
+ argb_t *ret = out;
+
if (check_bounds &&
(x < 0 || x >= image->width || y < 0 || y >= image->height))
- {
- return 0;
- }
-
- return image->fetch_pixel_32 (image, x, y);
+ ret->a = ret->r = ret->g = ret->b = 0.f;
+ else
+ *ret = image->fetch_pixel_float (image, x, y);
}
-typedef uint32_t (* get_pixel_t) (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds);
+typedef void (* get_pixel_t) (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds, void *out);
-static force_inline uint32_t
+static force_inline void
bits_image_fetch_pixel_nearest (bits_image_t *image,
pixman_fixed_t x,
pixman_fixed_t y,
- get_pixel_t get_pixel)
+ get_pixel_t get_pixel,
+ void *out)
{
int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
@@ -82,19 +85,20 @@ bits_image_fetch_pixel_nearest (bits_image_t *image,
repeat (image->common.repeat, &x0, image->width);
repeat (image->common.repeat, &y0, image->height);
- return get_pixel (image, x0, y0, FALSE);
+ get_pixel (image, x0, y0, FALSE, out);
}
else
{
- return get_pixel (image, x0, y0, TRUE);
+ get_pixel (image, x0, y0, TRUE, out);
}
}
-static force_inline uint32_t
-bits_image_fetch_pixel_bilinear (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
+static force_inline void
+bits_image_fetch_pixel_bilinear_32 (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel,
+ void *out)
{
pixman_repeat_t repeat_mode = image->common.repeat;
int width = image->width;
@@ -102,6 +106,7 @@ bits_image_fetch_pixel_bilinear (bits_image_t *image,
int x1, y1, x2, y2;
uint32_t tl, tr, bl, br;
int32_t distx, disty;
+ uint32_t *ret = out;
x1 = x - pixman_fixed_1 / 2;
y1 = y - pixman_fixed_1 / 2;
@@ -121,27 +126,142 @@ bits_image_fetch_pixel_bilinear (bits_image_t *image,
repeat (repeat_mode, &x2, width);
repeat (repeat_mode, &y2, height);
- tl = get_pixel (image, x1, y1, FALSE);
- bl = get_pixel (image, x1, y2, FALSE);
- tr = get_pixel (image, x2, y1, FALSE);
- br = get_pixel (image, x2, y2, FALSE);
+ get_pixel (image, x1, y1, FALSE, &tl);
+ get_pixel (image, x2, y1, FALSE, &tr);
+ get_pixel (image, x1, y2, FALSE, &bl);
+ get_pixel (image, x2, y2, FALSE, &br);
}
else
{
- tl = get_pixel (image, x1, y1, TRUE);
- tr = get_pixel (image, x2, y1, TRUE);
- bl = get_pixel (image, x1, y2, TRUE);
- br = get_pixel (image, x2, y2, TRUE);
+ get_pixel (image, x1, y1, TRUE, &tl);
+ get_pixel (image, x2, y1, TRUE, &tr);
+ get_pixel (image, x1, y2, TRUE, &bl);
+ get_pixel (image, x2, y2, TRUE, &br);
}
- return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+ *ret = bilinear_interpolation (tl, tr, bl, br, distx, disty);
}
-static force_inline uint32_t
+static force_inline void
+bits_image_fetch_pixel_bilinear_float (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel,
+ void *out)
+{
+ pixman_repeat_t repeat_mode = image->common.repeat;
+ int width = image->width;
+ int height = image->height;
+ int x1, y1, x2, y2;
+ argb_t tl, tr, bl, br;
+ float distx, disty;
+ argb_t *ret = out;
+
+ x1 = x - pixman_fixed_1 / 2;
+ y1 = y - pixman_fixed_1 / 2;
+
+ distx = ((float)pixman_fixed_fraction(x1)) / 65536.f;
+ disty = ((float)pixman_fixed_fraction(y1)) / 65536.f;
+
+ x1 = pixman_fixed_to_int (x1);
+ y1 = pixman_fixed_to_int (y1);
+ x2 = x1 + 1;
+ y2 = y1 + 1;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ repeat (repeat_mode, &x1, width);
+ repeat (repeat_mode, &y1, height);
+ repeat (repeat_mode, &x2, width);
+ repeat (repeat_mode, &y2, height);
+
+ get_pixel (image, x1, y1, FALSE, &tl);
+ get_pixel (image, x2, y1, FALSE, &tr);
+ get_pixel (image, x1, y2, FALSE, &bl);
+ get_pixel (image, x2, y2, FALSE, &br);
+ }
+ else
+ {
+ get_pixel (image, x1, y1, TRUE, &tl);
+ get_pixel (image, x2, y1, TRUE, &tr);
+ get_pixel (image, x1, y2, TRUE, &bl);
+ get_pixel (image, x2, y2, TRUE, &br);
+ }
+
+ *ret = bilinear_interpolation_float (tl, tr, bl, br, distx, disty);
+}
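
The bilinear_interpolation_float helper used here computes, per channel, the
standard weighted average of the four neighbours with weights derived from
distx/disty. A self-contained single-channel sketch of that formula
(illustrative only):

#include <stdio.h>

/* Weighted average of four neighbours; dx/dy are the fractional
 * offsets in [0, 1) that the caller derives from the 16.16
 * fixed-point coordinates. */
static float
bilerp (float tl, float tr, float bl, float br, float dx, float dy)
{
    float top = tl + (tr - tl) * dx;
    float bottom = bl + (br - bl) * dx;

    return top + (bottom - top) * dy;
}

int main (void)
{
    /* Sampling exactly between four pixels averages them: */
    printf ("%g\n", bilerp (0.f, 1.f, 0.f, 1.f, 0.5f, 0.5f)); /* 0.5 */
    /* At dx = dy = 0 the top-left pixel is returned unchanged: */
    printf ("%g\n", bilerp (0.25f, 1.f, 1.f, 1.f, 0.f, 0.f)); /* 0.25 */
    return 0;
}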
+
+static force_inline void accum_32(unsigned int *satot, unsigned int *srtot,
+ unsigned int *sgtot, unsigned int *sbtot,
+ const void *p, pixman_fixed_t f)
+{
+ uint32_t pixel = *(uint32_t *)p;
+
+ *srtot += (int)RED_8 (pixel) * f;
+ *sgtot += (int)GREEN_8 (pixel) * f;
+ *sbtot += (int)BLUE_8 (pixel) * f;
+ *satot += (int)ALPHA_8 (pixel) * f;
+}
+
+static force_inline void reduce_32(unsigned int satot, unsigned int srtot,
+ unsigned int sgtot, unsigned int sbtot,
+ void *p)
+{
+ uint32_t *ret = p;
+
+ satot = (int32_t)(satot + 0x8000) / 65536;
+ srtot = (int32_t)(srtot + 0x8000) / 65536;
+ sgtot = (int32_t)(sgtot + 0x8000) / 65536;
+ sbtot = (int32_t)(sbtot + 0x8000) / 65536;
+
+ satot = CLIP ((int32_t)satot, 0, 0xff);
+ srtot = CLIP ((int32_t)srtot, 0, 0xff);
+ sgtot = CLIP ((int32_t)sgtot, 0, 0xff);
+ sbtot = CLIP ((int32_t)sbtot, 0, 0xff);
+
+ *ret = ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
+}
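
A note on the arithmetic (sketch, not from the patch): the accumulators hold
sums of 8-bit channel values multiplied by 16.16 fixed-point filter weights,
so reduce_32 converts back with the usual round-to-nearest trick:
(x + 0x8000) / 65536 equals round(x / 65536.0) for non-negative x. For
example, x = 0x2A8000 (42.5 in 16.16) gives (0x2A8000 + 0x8000) / 0x10000 =
0x2B = 43. The casts through int32_t and the CLIP calls afterwards handle
sums that ring slightly out of range.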
+
+static force_inline void accum_float(unsigned int *satot, unsigned int *srtot,
+ unsigned int *sgtot, unsigned int *sbtot,
+ const void *p, pixman_fixed_t f)
+{
+ const argb_t *pixel = p;
+
+ *satot += pixel->a * f;
+ *srtot += pixel->r * f;
+ *sgtot += pixel->g * f;
+ *sbtot += pixel->b * f;
+}
+
+static force_inline void reduce_float(unsigned int satot, unsigned int srtot,
+ unsigned int sgtot, unsigned int sbtot,
+ void *p)
+{
+ argb_t *ret = p;
+
+ ret->a = CLIP ((int32_t)satot / 65536.f, 0.f, 1.f);
+ ret->r = CLIP ((int32_t)srtot / 65536.f, 0.f, 1.f);
+ ret->g = CLIP ((int32_t)sgtot / 65536.f, 0.f, 1.f);
+ ret->b = CLIP ((int32_t)sbtot / 65536.f, 0.f, 1.f);
+}
+
+typedef void (* accumulate_pixel_t) (unsigned int *satot, unsigned int *srtot,
+ unsigned int *sgtot, unsigned int *sbtot,
+ const void *pixel, pixman_fixed_t f);
+
+typedef void (* reduce_pixel_t) (unsigned int satot, unsigned int srtot,
+ unsigned int sgtot, unsigned int sbtot,
+ void *out);
+
+static force_inline void
bits_image_fetch_pixel_convolution (bits_image_t *image,
pixman_fixed_t x,
pixman_fixed_t y,
- get_pixel_t get_pixel)
+ get_pixel_t get_pixel,
+ void *out,
+ accumulate_pixel_t accum,
+ reduce_pixel_t reduce)
{
pixman_fixed_t *params = image->common.filter_params;
int x_off = (params[0] - pixman_fixed_1) >> 1;
@@ -152,7 +272,7 @@ bits_image_fetch_pixel_convolution (bits_image_t *image,
pixman_repeat_t repeat_mode = image->common.repeat;
int width = image->width;
int height = image->height;
- int srtot, sgtot, sbtot, satot;
+ unsigned int srtot, sgtot, sbtot, satot;
params += 2;
@@ -174,48 +294,39 @@ bits_image_fetch_pixel_convolution (bits_image_t *image,
if (f)
{
- uint32_t pixel;
+ /* Must be big enough to hold an argb_t */
+ argb_t pixel;
if (repeat_mode != PIXMAN_REPEAT_NONE)
{
repeat (repeat_mode, &rx, width);
repeat (repeat_mode, &ry, height);
- pixel = get_pixel (image, rx, ry, FALSE);
+ get_pixel (image, rx, ry, FALSE, &pixel);
}
else
{
- pixel = get_pixel (image, rx, ry, TRUE);
+ get_pixel (image, rx, ry, TRUE, &pixel);
}
- srtot += (int)RED_8 (pixel) * f;
- sgtot += (int)GREEN_8 (pixel) * f;
- sbtot += (int)BLUE_8 (pixel) * f;
- satot += (int)ALPHA_8 (pixel) * f;
+ accum (&satot, &srtot, &sgtot, &sbtot, &pixel, f);
}
params++;
}
}
- satot = (satot + 0x8000) >> 16;
- srtot = (srtot + 0x8000) >> 16;
- sgtot = (sgtot + 0x8000) >> 16;
- sbtot = (sbtot + 0x8000) >> 16;
-
- satot = CLIP (satot, 0, 0xff);
- srtot = CLIP (srtot, 0, 0xff);
- sgtot = CLIP (sgtot, 0, 0xff);
- sbtot = CLIP (sbtot, 0, 0xff);
-
- return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
+ reduce (satot, srtot, sgtot, sbtot, out);
}
-static uint32_t
-bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
+static void
+bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel,
+ void *out,
+ accumulate_pixel_t accum,
+ reduce_pixel_t reduce)
{
pixman_fixed_t *params = image->common.filter_params;
pixman_repeat_t repeat_mode = image->common.repeat;
@@ -230,7 +341,7 @@ bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
pixman_fixed_t *y_params;
- int srtot, sgtot, sbtot, satot;
+ unsigned int srtot, sgtot, sbtot, satot;
int32_t x1, x2, y1, y2;
int32_t px, py;
int i, j;
@@ -270,82 +381,100 @@ bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
if (fx)
{
+ /* Must be big enough to hold an argb_t */
+ argb_t pixel;
pixman_fixed_t f;
- uint32_t pixel;
if (repeat_mode != PIXMAN_REPEAT_NONE)
{
repeat (repeat_mode, &rx, width);
repeat (repeat_mode, &ry, height);
- pixel = get_pixel (image, rx, ry, FALSE);
+ get_pixel (image, rx, ry, FALSE, &pixel);
}
else
{
- pixel = get_pixel (image, rx, ry, TRUE);
+ get_pixel (image, rx, ry, TRUE, &pixel);
}
f = (fy * fx + 0x8000) >> 16;
- srtot += (int)RED_8 (pixel) * f;
- sgtot += (int)GREEN_8 (pixel) * f;
- sbtot += (int)BLUE_8 (pixel) * f;
- satot += (int)ALPHA_8 (pixel) * f;
+ accum(&satot, &srtot, &sgtot, &sbtot, &pixel, f);
}
}
}
}
- satot = (satot + 0x8000) >> 16;
- srtot = (srtot + 0x8000) >> 16;
- sgtot = (sgtot + 0x8000) >> 16;
- sbtot = (sbtot + 0x8000) >> 16;
-
- satot = CLIP (satot, 0, 0xff);
- srtot = CLIP (srtot, 0, 0xff);
- sgtot = CLIP (sgtot, 0, 0xff);
- sbtot = CLIP (sbtot, 0, 0xff);
- return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
+ reduce(satot, srtot, sgtot, sbtot, out);
}
-static force_inline uint32_t
-bits_image_fetch_pixel_filtered (bits_image_t *image,
+static force_inline void
+bits_image_fetch_pixel_filtered (bits_image_t *image,
+ pixman_bool_t wide,
pixman_fixed_t x,
pixman_fixed_t y,
- get_pixel_t get_pixel)
+ get_pixel_t get_pixel,
+ void *out)
{
switch (image->common.filter)
{
case PIXMAN_FILTER_NEAREST:
case PIXMAN_FILTER_FAST:
- return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+ bits_image_fetch_pixel_nearest (image, x, y, get_pixel, out);
break;
case PIXMAN_FILTER_BILINEAR:
case PIXMAN_FILTER_GOOD:
case PIXMAN_FILTER_BEST:
- return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+ if (wide)
+ bits_image_fetch_pixel_bilinear_float (image, x, y, get_pixel, out);
+ else
+ bits_image_fetch_pixel_bilinear_32 (image, x, y, get_pixel, out);
break;
case PIXMAN_FILTER_CONVOLUTION:
- return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+ if (wide)
+ {
+ bits_image_fetch_pixel_convolution (image, x, y,
+ get_pixel, out,
+ accum_float,
+ reduce_float);
+ }
+ else
+ {
+ bits_image_fetch_pixel_convolution (image, x, y,
+ get_pixel, out,
+ accum_32, reduce_32);
+ }
break;
case PIXMAN_FILTER_SEPARABLE_CONVOLUTION:
- return bits_image_fetch_pixel_separable_convolution (image, x, y, get_pixel);
+ if (wide)
+ {
+ bits_image_fetch_pixel_separable_convolution (image, x, y,
+ get_pixel, out,
+ accum_float,
+ reduce_float);
+ }
+ else
+ {
+ bits_image_fetch_pixel_separable_convolution (image, x, y,
+ get_pixel, out,
+ accum_32, reduce_32);
+ }
break;
default:
+ assert (0);
break;
}
-
- return 0;
}
static uint32_t *
-bits_image_fetch_affine_no_alpha (pixman_iter_t * iter,
- const uint32_t * mask)
+__bits_image_fetch_affine_no_alpha (pixman_iter_t * iter,
+ pixman_bool_t wide,
+ const uint32_t * mask)
{
pixman_image_t *image = iter->image;
int offset = iter->x;
@@ -353,10 +482,13 @@ bits_image_fetch_affine_no_alpha (pixman_iter_t * iter,
int width = iter->width;
uint32_t * buffer = iter->buffer;
+ const uint32_t wide_zero[4] = {0};
pixman_fixed_t x, y;
pixman_fixed_t ux, uy;
pixman_vector_t v;
int i;
+ get_pixel_t get_pixel =
+ wide ? fetch_pixel_no_alpha_float : fetch_pixel_no_alpha_32;
/* reference point is the center of the pixel */
v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
@@ -382,29 +514,48 @@ bits_image_fetch_affine_no_alpha (pixman_iter_t * iter,
for (i = 0; i < width; ++i)
{
- if (!mask || mask[i])
+ if (!mask || (!wide && mask[i]) ||
+ (wide && memcmp(&mask[4 * i], wide_zero, 16) != 0))
{
- buffer[i] = bits_image_fetch_pixel_filtered (
- &image->bits, x, y, fetch_pixel_no_alpha);
+ bits_image_fetch_pixel_filtered (
+ &image->bits, wide, x, y, get_pixel, buffer);
}
x += ux;
y += uy;
+ buffer += wide ? 4 : 1;
}
- return buffer;
+ return iter->buffer;
+}
+
+static uint32_t *
+bits_image_fetch_affine_no_alpha_32 (pixman_iter_t *iter,
+ const uint32_t *mask)
+{
+ return __bits_image_fetch_affine_no_alpha(iter, FALSE, mask);
+}
+
+static uint32_t *
+bits_image_fetch_affine_no_alpha_float (pixman_iter_t *iter,
+ const uint32_t *mask)
+{
+ return __bits_image_fetch_affine_no_alpha(iter, TRUE, mask);
}
/* General fetcher */
-static force_inline uint32_t
-fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+static force_inline void
+fetch_pixel_general_32 (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds,
+ void *out)
{
- uint32_t pixel;
+ uint32_t pixel, *ret = out;
if (check_bounds &&
(x < 0 || x >= image->width || y < 0 || y >= image->height))
{
- return 0;
+ *ret = 0;
+ return;
}
pixel = image->fetch_pixel_32 (image, x, y);
@@ -433,19 +584,61 @@ fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_boun
pixel |= (pixel_a << 24);
}
- return pixel;
+ *ret = pixel;
+}
+
+static force_inline void
+fetch_pixel_general_float (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds,
+ void *out)
+{
+ argb_t *ret = out;
+
+ if (check_bounds &&
+ (x < 0 || x >= image->width || y < 0 || y >= image->height))
+ {
+ ret->a = ret->r = ret->g = ret->b = 0;
+ return;
+ }
+
+ *ret = image->fetch_pixel_float (image, x, y);
+
+ if (image->common.alpha_map)
+ {
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ if (x < 0 || x >= image->common.alpha_map->width ||
+ y < 0 || y >= image->common.alpha_map->height)
+ {
+ ret->a = 0.f;
+ }
+ else
+ {
+ argb_t alpha;
+
+ alpha = image->common.alpha_map->fetch_pixel_float (
+ image->common.alpha_map, x, y);
+
+ ret->a = alpha.a;
+ }
+ }
}
static uint32_t *
-bits_image_fetch_general (pixman_iter_t *iter,
- const uint32_t *mask)
+__bits_image_fetch_general (pixman_iter_t *iter,
+ pixman_bool_t wide,
+ const uint32_t *mask)
{
pixman_image_t *image = iter->image;
int offset = iter->x;
int line = iter->y++;
int width = iter->width;
uint32_t * buffer = iter->buffer;
+ get_pixel_t get_pixel =
+ wide ? fetch_pixel_general_float : fetch_pixel_general_32;
+ const uint32_t wide_zero[4] = {0};
pixman_fixed_t x, y, w;
pixman_fixed_t ux, uy, uw;
pixman_vector_t v;
@@ -480,12 +673,13 @@ bits_image_fetch_general (pixman_iter_t *iter,
{
pixman_fixed_t x0, y0;
- if (!mask || mask[i])
+ if (!mask || (!wide && mask[i]) ||
+ (wide && memcmp(&mask[4 * i], wide_zero, 16) != 0))
{
if (w != 0)
{
- x0 = ((pixman_fixed_48_16_t)x << 16) / w;
- y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+ x0 = ((uint64_t)x << 16) / w;
+ y0 = ((uint64_t)y << 16) / w;
}
else
{
@@ -493,16 +687,31 @@ bits_image_fetch_general (pixman_iter_t *iter,
y0 = 0;
}
- buffer[i] = bits_image_fetch_pixel_filtered (
- &image->bits, x0, y0, fetch_pixel_general);
+ bits_image_fetch_pixel_filtered (
+ &image->bits, wide, x0, y0, get_pixel, buffer);
}
x += ux;
y += uy;
w += uw;
+ buffer += wide ? 4 : 1;
}
- return buffer;
+ return iter->buffer;
+}
+
+static uint32_t *
+bits_image_fetch_general_32 (pixman_iter_t *iter,
+ const uint32_t *mask)
+{
+ return __bits_image_fetch_general(iter, FALSE, mask);
+}
+
+static uint32_t *
+bits_image_fetch_general_float (pixman_iter_t *iter,
+ const uint32_t *mask)
+{
+ return __bits_image_fetch_general(iter, TRUE, mask);
}
static void
@@ -703,15 +912,15 @@ static const fetcher_info_t fetcher_info[] =
/* Affine, no alpha */
{ PIXMAN_any,
(FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
- bits_image_fetch_affine_no_alpha,
- _pixman_image_get_scanline_generic_float
+ bits_image_fetch_affine_no_alpha_32,
+ bits_image_fetch_affine_no_alpha_float,
},
/* General */
{ PIXMAN_any,
0,
- bits_image_fetch_general,
- _pixman_image_get_scanline_generic_float
+ bits_image_fetch_general_32,
+ bits_image_fetch_general_float,
},
{ PIXMAN_null },
@@ -741,7 +950,6 @@ _pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter)
}
else
{
- iter->data = info->get_scanline_32;
iter->get_scanline = info->get_scanline_float;
}
return;
@@ -847,6 +1055,119 @@ dest_write_back_narrow (pixman_iter_t *iter)
iter->y++;
}
+static float
+dither_factor_blue_noise_64 (int x, int y)
+{
+ float m = dither_blue_noise_64x64[((y & 0x3f) << 6) | (x & 0x3f)];
+ return m * (1. / 4096.f) + (1. / 8192.f);
+}
+
+static float
+dither_factor_bayer_8 (int x, int y)
+{
+ uint32_t m;
+
+ y ^= x;
+
+ /* Compute reverse(interleave(xor(x mod n, y mod n), x mod n))
+ * Here n = 8 and `mod n` is the bottom 3 bits.
+ */
+ m = ((y & 0x1) << 5) | ((x & 0x1) << 4) |
+ ((y & 0x2) << 2) | ((x & 0x2) << 1) |
+ ((y & 0x4) >> 1) | ((x & 0x4) >> 2);
+
+ /* m is in range [0, 63]. We scale it to [0, 63.0f/64.0f], then
+ * shift it to [1.0f/128.0f, 127.0f/128.0f] so that 0 < d < 1.
+ * This ensures exact values are not changed by dithering.
+ */
+ return (float)(m) * (1 / 64.0f) + (1.0f / 128.0f);
+}
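
The interleaving above is the standard recursive Bayer construction; since
(x, y) -> (x, y ^ x) is a bijection on the 8x8 tile and the six bits land in
distinct positions, every value 0..63 occurs exactly once. A small standalone
check (illustrative only, not pixman API):

#include <stdio.h>

static unsigned
bayer8 (int x, int y)
{
    unsigned m;

    y ^= x;
    m = ((y & 1) << 5) | ((x & 1) << 4) |
        ((y & 2) << 2) | ((x & 2) << 1) |
        ((y & 4) >> 1) | ((x & 4) >> 2);
    return m;
}

int main (void)
{
    int x, y;

    /* Prints the 8x8 ordered-dither matrix, a permutation of 0..63 */
    for (y = 0; y < 8; y++)
    {
        for (x = 0; x < 8; x++)
            printf ("%2u ", bayer8 (x, y));
        printf ("\n");
    }
    return 0;
}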
+
+typedef float (* dither_factor_t)(int x, int y);
+
+static force_inline float
+dither_apply_channel (float f, float d, float s)
+{
+ /* float_to_unorm splits the [0, 1] segment in (1 << n_bits)
+ * subsections of equal length; however unorm_to_float does not
+ * map to the center of those sections. In fact, pixel value u is
+ * mapped to:
+ *
+ * u u u 1
+ * -------------- = ---------- + -------------- * ----------
+ * 2^n_bits - 1 2^n_bits 2^n_bits - 1 2^n_bits
+ *
+ * Hence if f = u / (2^n_bits - 1) is exactly representable on a
+ * n_bits palette, all the numbers between
+ *
+ * u
+ * ---------- = f - f * 2^-n_bits = f + (0 - f) * 2^-n_bits
+ * 2^n_bits
+ *
+ * and
+ *
+ * u + 1
+ * ---------- = f - (f - 1) * 2^-n_bits = f + (1 - f) * 2^-n_bits
+ * 2^n_bits
+ *
+ * are also mapped back to u.
+ *
+ * Hence the following calculation ensures that we add as much
+ * noise as possible without perturbing values which are exactly
+ * representable in the target colorspace. Note that this corresponds to
+ * mixing the original color with noise with a ratio of `1 / 2^n_bits`.
+ */
+ return f + (d - f) * s;
+}
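
A quick check of the claim above (editorial sketch): with
f = u / (2^n_bits - 1) exactly representable and s = 1 / 2^n_bits, the
dithered value f + (d - f) * s for any d in (0, 1) lies strictly between
f + (0 - f) * s = u / 2^n_bits and f + (1 - f) * s = (u + 1) / 2^n_bits,
which is exactly the interval shown above to quantize back to u. So exactly
representable values pass through dithering unchanged, while everything else
is perturbed by less than one quantization step.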
+
+static force_inline float
+dither_compute_scale (int n_bits)
+{
+ // No dithering for wide formats
+ if (n_bits == 0 || n_bits >= 32)
+ return 0.f;
+
+ return 1.f / (float)(1 << n_bits);
+}
+
+static const uint32_t *
+dither_apply_ordered (pixman_iter_t *iter, dither_factor_t factor)
+{
+ bits_image_t *image = &iter->image->bits;
+ int x = iter->x + image->dither_offset_x;
+ int y = iter->y + image->dither_offset_y;
+ int width = iter->width;
+ argb_t *buffer = (argb_t *)iter->buffer;
+
+ pixman_format_code_t format = image->format;
+ int a_size = PIXMAN_FORMAT_A (format);
+ int r_size = PIXMAN_FORMAT_R (format);
+ int g_size = PIXMAN_FORMAT_G (format);
+ int b_size = PIXMAN_FORMAT_B (format);
+
+ float a_scale = dither_compute_scale (a_size);
+ float r_scale = dither_compute_scale (r_size);
+ float g_scale = dither_compute_scale (g_size);
+ float b_scale = dither_compute_scale (b_size);
+
+ int i;
+ float d;
+
+ for (i = 0; i < width; ++i)
+ {
+ d = factor (x + i, y);
+
+ buffer->a = dither_apply_channel (buffer->a, d, a_scale);
+ buffer->r = dither_apply_channel (buffer->r, d, r_scale);
+ buffer->g = dither_apply_channel (buffer->g, d, g_scale);
+ buffer->b = dither_apply_channel (buffer->b, d, b_scale);
+
+ buffer++;
+ }
+
+ return iter->buffer;
+}
+
static void
dest_write_back_wide (pixman_iter_t *iter)
{
@@ -856,6 +1177,23 @@ dest_write_back_wide (pixman_iter_t *iter)
int width = iter->width;
const uint32_t *buffer = iter->buffer;
+ switch (image->dither)
+ {
+ case PIXMAN_DITHER_NONE:
+ break;
+
+ case PIXMAN_DITHER_GOOD:
+ case PIXMAN_DITHER_BEST:
+ case PIXMAN_DITHER_ORDERED_BLUE_NOISE_64:
+ buffer = dither_apply_ordered (iter, dither_factor_blue_noise_64);
+ break;
+
+ case PIXMAN_DITHER_FAST:
+ case PIXMAN_DITHER_ORDERED_BAYER_8:
+ buffer = dither_apply_ordered (iter, dither_factor_bayer_8);
+ break;
+ }
+
image->store_scanline_float (image, x, y, width, buffer);
if (image->common.alpha_map)
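
For context, turning the new dithering on from client code might look like
the sketch below. This assumes the pixman_image_set_dither entry point added
elsewhere in this series; dithering only takes effect on this wide,
floating-point write-back path:

#include <pixman.h>

static pixman_image_t *
make_dithered_dest (int width, int height)
{
    /* A 16-bit destination benefits most from dithering */
    pixman_image_t *dest =
        pixman_image_create_bits (PIXMAN_r5g6b5, width, height, NULL, 0);

    pixman_image_set_dither (dest, PIXMAN_DITHER_ORDERED_BLUE_NOISE_64);
    return dest;
}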
@@ -932,7 +1270,7 @@ create_bits (pixman_format_code_t format,
*rowstride_bytes = stride;
if (clear)
- return calloc (buf_size, 1);
+ return calloc (1, buf_size);
else
return malloc (buf_size);
}
@@ -948,6 +1286,9 @@ _pixman_bits_image_init (pixman_image_t * image,
{
uint32_t *free_me = NULL;
+ if (PIXMAN_FORMAT_BPP (format) == 128)
+ return_val_if_fail(!(rowstride % 4), FALSE);
+
if (!bits && width && height)
{
int rowstride_bytes;
@@ -968,6 +1309,9 @@ _pixman_bits_image_init (pixman_image_t * image,
image->bits.height = height;
image->bits.bits = bits;
image->bits.free_me = free_me;
+ image->bits.dither = PIXMAN_DITHER_NONE;
+ image->bits.dither_offset_x = 0;
+ image->bits.dither_offset_y = 0;
image->bits.read_func = NULL;
image->bits.write_func = NULL;
image->bits.rowstride = rowstride;
diff --git a/pixman/pixman-combine-float.c b/pixman/pixman-combine-float.c
index f5145bc..27392d6 100644
--- a/pixman/pixman-combine-float.c
+++ b/pixman/pixman-combine-float.c
@@ -26,7 +26,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <math.h>
diff --git a/pixman/pixman-combine32.c b/pixman/pixman-combine32.c
index 4c484d3..de51f64 100644
--- a/pixman/pixman-combine32.c
+++ b/pixman/pixman-combine32.c
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <math.h>
@@ -568,7 +568,7 @@ combine_multiply_ca (pixman_implementation_t *imp,
uint8_t isa = ~sa; \
uint8_t da = ALPHA_8 (d); \
uint8_t ida = ~da; \
- int32_t ra, rr, rg, rb; \
+ uint32_t ra, rr, rg, rb; \
\
ra = da * 0xff + sa * 0xff - sa * da; \
rr = isa * RED_8 (d) + ida * RED_8 (s); \
@@ -609,7 +609,7 @@ combine_multiply_ca (pixman_implementation_t *imp,
uint32_t d = *(dest + i); \
uint8_t da = ALPHA_8 (d); \
uint8_t ida = ~da; \
- int32_t ra, rr, rg, rb; \
+ uint32_t ra, rr, rg, rb; \
uint8_t ira, iga, iba; \
\
combine_mask_ca (&s, &m); \
diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
index cdd56a6..59bb247 100644
--- a/pixman/pixman-combine32.h
+++ b/pixman/pixman-combine32.h
@@ -12,7 +12,7 @@
#define RB_MASK 0xff00ff
#define AG_MASK 0xff00ff00
#define RB_ONE_HALF 0x800080
-#define RB_MASK_PLUS_ONE 0x10000100
+#define RB_MASK_PLUS_ONE 0x1000100
#define ALPHA_8(x) ((x) >> A_SHIFT)
#define RED_8(x) (((x) >> R_SHIFT) & MASK)
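
The corrected constant is simply RB_MASK with 1 added in each packed 8-bit
channel: 0x00ff00ff + 0x00010001 = 0x01000100. The previous value,
0x10000100, carried the high channel's increment four bits too far (it equals
0x00ff00ff + 0x0f010001), corrupting arithmetic on the packed red/blue pair.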
diff --git a/pixman/pixman-compiler.h b/pixman/pixman-compiler.h
index 2489adc..6394156 100644
--- a/pixman/pixman-compiler.h
+++ b/pixman/pixman-compiler.h
@@ -95,6 +95,8 @@
/* Sun Studio 8 visibility */
#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
# define PIXMAN_EXPORT __global
+#elif defined (_MSC_VER) || defined(__MINGW32__)
+# define PIXMAN_EXPORT PIXMAN_API
#else
# define PIXMAN_EXPORT
#endif
@@ -107,14 +109,14 @@
#if defined(PIXMAN_NO_TLS)
# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static type name
+ static type name;
# define PIXMAN_GET_THREAD_LOCAL(name) \
(&name)
#elif defined(TLS)
# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static TLS type name
+ static TLS type name;
# define PIXMAN_GET_THREAD_LOCAL(name) \
(&name)
@@ -174,7 +176,7 @@
#elif defined(_MSC_VER)
# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static __declspec(thread) type name
+ static __declspec(thread) type name;
# define PIXMAN_GET_THREAD_LOCAL(name) \
(&name)
diff --git a/pixman/pixman-conical-gradient.c b/pixman/pixman-conical-gradient.c
index 8bb46ae..37dfffd 100644
--- a/pixman/pixman-conical-gradient.c
+++ b/pixman/pixman-conical-gradient.c
@@ -25,7 +25,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
@@ -51,7 +51,10 @@ coordinates_to_parameter (double x, double y, double angle)
}
static uint32_t *
-conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+conical_get_scanline (pixman_iter_t *iter,
+ const uint32_t *mask,
+ int Bpp,
+ pixman_gradient_walker_write_t write_pixel)
{
pixman_image_t *image = iter->image;
int x = iter->x;
@@ -61,7 +64,7 @@ conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
gradient_t *gradient = (gradient_t *)image;
conical_gradient_t *conical = (conical_gradient_t *)image;
- uint32_t *end = buffer + width;
+ uint32_t *end = buffer + width * (Bpp / 4);
pixman_gradient_walker_t walker;
pixman_bool_t affine = TRUE;
double cx = 1.;
@@ -109,11 +112,12 @@ conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
{
double t = coordinates_to_parameter (rx, ry, conical->angle);
- *buffer = _pixman_gradient_walker_pixel (
- &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+ write_pixel (&walker,
+ (pixman_fixed_48_16_t)pixman_double_to_fixed (t),
+ buffer);
}
- ++buffer;
+ buffer += (Bpp / 4);
rx += cx;
ry += cy;
@@ -144,11 +148,12 @@ conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
t = coordinates_to_parameter (x, y, conical->angle);
- *buffer = _pixman_gradient_walker_pixel (
- &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+ write_pixel (&walker,
+ (pixman_fixed_48_16_t)pixman_double_to_fixed (t),
+ buffer);
}
- ++buffer;
+ buffer += (Bpp / 4);
rx += cx;
ry += cy;
@@ -161,14 +166,17 @@ conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
}
static uint32_t *
-conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
{
- uint32_t *buffer = conical_get_scanline_narrow (iter, NULL);
-
- pixman_expand_to_float (
- (argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+ return conical_get_scanline (iter, mask, 4,
+ _pixman_gradient_walker_write_narrow);
+}
- return buffer;
+static uint32_t *
+conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ return conical_get_scanline (iter, NULL, 16,
+ _pixman_gradient_walker_write_wide);
}
void
diff --git a/pixman/pixman-edge.c b/pixman/pixman-edge.c
index ad6dfc4..c324cd3 100644
--- a/pixman/pixman-edge.c
+++ b/pixman/pixman-edge.c
@@ -21,7 +21,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <string.h>
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index c6e43de..d510cac 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -24,7 +24,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <string.h>
#include <stdlib.h>
@@ -908,7 +908,7 @@ fast_composite_add_n_8_8 (pixman_implementation_t *imp,
#define CREATE_BITMASK(n) (0x80000000 >> (n))
#define UPDATE_BITMASK(n) ((n) >> 1)
#else
-#define CREATE_BITMASK(n) (1 << (n))
+#define CREATE_BITMASK(n) (1U << (n))
#define UPDATE_BITMASK(n) ((n) << 1)
#endif
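
The added U suffix matters because the little-endian branch can reach n = 31:
with 32-bit int, shifting a 1 into the sign bit overflows a signed type. A
two-line illustration (sketch):

unsigned ok = 1U << 31;   /* well defined: 0x80000000 */
/* int bad = 1 << 31;        undefined behaviour in C: the result
 *                           2^31 is not representable in int */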
@@ -2343,6 +2343,8 @@ fast_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
int32_t dist_y;
int i;
+ COMPILE_TIME_ASSERT (BILINEAR_INTERPOLATION_BITS < 8);
+
fx = info->x;
ux = iter->image->common.transform->matrix[0][0];
@@ -2798,7 +2800,7 @@ bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
repeat (repeat_mode, &rx, bits->width);
repeat (repeat_mode, &ry, bits->height);
- row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+ row = (uint8_t *)(bits->bits + bits->rowstride * ry);
pixel = convert_pixel (row, rx) | mask;
}
else
@@ -2809,7 +2811,7 @@ bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
}
else
{
- row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+ row = (uint8_t *)(bits->bits + bits->rowstride * ry);
pixel = convert_pixel (row, rx) | mask;
}
}
@@ -2842,7 +2844,7 @@ bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
}
}
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+static const uint32_t zero[2] = { 0, 0 };
static force_inline void
bits_image_fetch_bilinear_affine (pixman_image_t * image,
@@ -2911,8 +2913,8 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image,
repeat (repeat_mode, &x2, width);
repeat (repeat_mode, &y2, height);
- row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
- row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+ row1 = (uint8_t *)(bits->bits + bits->rowstride * y1);
+ row2 = (uint8_t *)(bits->bits + bits->rowstride * y2);
tl = convert_pixel (row1, x1) | mask;
tr = convert_pixel (row1, x2) | mask;
@@ -2942,12 +2944,12 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image,
if (y2 == 0)
{
- row1 = zero;
+ row1 = (const uint8_t *)zero;
mask1 = 0;
}
else
{
- row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+ row1 = (uint8_t *)(bits->bits + bits->rowstride * y1);
row1 += bpp / 8 * x1;
mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
@@ -2955,12 +2957,12 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image,
if (y1 == height - 1)
{
- row2 = zero;
+ row2 = (const uint8_t *)zero;
mask2 = 0;
}
else
{
- row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+ row2 = (uint8_t *)(bits->bits + bits->rowstride * y2);
row2 += bpp / 8 * x1;
mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
@@ -3058,7 +3060,7 @@ bits_image_fetch_nearest_affine (pixman_image_t * image,
repeat (repeat_mode, &y0, height);
}
- row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
+ row = (uint8_t *)(bits->bits + bits->rowstride * y0);
buffer[i] = convert_pixel (row, x0) | mask;
}
@@ -3084,7 +3086,7 @@ convert_x8r8g8b8 (const uint8_t *row, int x)
static force_inline uint32_t
convert_a8 (const uint8_t *row, int x)
{
- return *(row + x) << 24;
+ return (uint32_t) *(row + x) << 24;
}
static force_inline uint32_t
@@ -3256,9 +3258,9 @@ static const pixman_iter_info_t fast_iters[] =
},
#define AFFINE_FAST_PATHS(name, format, repeat) \
- SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat) \
+ NEAREST_AFFINE_FAST_PATH(name, format, repeat) \
BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
- NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+ SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)
AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
diff --git a/pixman/pixman-filter.c b/pixman/pixman-filter.c
index b2bf53f..33327df 100644
--- a/pixman/pixman-filter.c
+++ b/pixman/pixman-filter.c
@@ -29,7 +29,7 @@
#include <math.h>
#include <assert.h>
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -109,14 +109,16 @@ general_cubic (double x, double B, double C)
if (ax < 1)
{
- return ((12 - 9 * B - 6 * C) * ax * ax * ax +
- (-18 + 12 * B + 6 * C) * ax * ax + (6 - 2 * B)) / 6;
+ return (((12 - 9 * B - 6 * C) * ax +
+ (-18 + 12 * B + 6 * C)) * ax * ax +
+ (6 - 2 * B)) / 6;
}
- else if (ax >= 1 && ax < 2)
+ else if (ax < 2)
{
- return ((-B - 6 * C) * ax * ax * ax +
- (6 * B + 30 * C) * ax * ax + (-12 * B - 48 * C) *
- ax + (8 * B + 24 * C)) / 6;
+ return ((((-B - 6 * C) * ax +
+ (6 * B + 30 * C)) * ax +
+ (-12 * B - 48 * C)) * ax +
+ (8 * B + 24 * C)) / 6;
}
else
{
@@ -141,7 +143,7 @@ static const filter_info_t filters[] =
{ PIXMAN_KERNEL_BOX, box_kernel, 1.0 },
{ PIXMAN_KERNEL_LINEAR, linear_kernel, 2.0 },
{ PIXMAN_KERNEL_CUBIC, cubic_kernel, 4.0 },
- { PIXMAN_KERNEL_GAUSSIAN, gaussian_kernel, 6 * SIGMA },
+ { PIXMAN_KERNEL_GAUSSIAN, gaussian_kernel, 5.0 },
{ PIXMAN_KERNEL_LANCZOS2, lanczos2_kernel, 4.0 },
{ PIXMAN_KERNEL_LANCZOS3, lanczos3_kernel, 6.0 },
{ PIXMAN_KERNEL_LANCZOS3_STRETCHED, nice_kernel, 8.0 },
@@ -160,18 +162,21 @@ integral (pixman_kernel_t kernel1, double x1,
pixman_kernel_t kernel2, double scale, double x2,
double width)
{
- /* If the integration interval crosses zero, break it into
- * two separate integrals. This ensures that filters such
- * as LINEAR that are not differentiable at 0 will still
- * integrate properly.
+ if (kernel1 == PIXMAN_KERNEL_BOX && kernel2 == PIXMAN_KERNEL_BOX)
+ {
+ return width;
+ }
+ /* The LINEAR filter is not differentiable at 0, so if the
+ * integration interval crosses zero, break it into two
+ * separate integrals.
*/
- if (x1 < 0 && x1 + width > 0)
+ else if (kernel1 == PIXMAN_KERNEL_LINEAR && x1 < 0 && x1 + width > 0)
{
return
integral (kernel1, x1, kernel2, scale, x2, - x1) +
integral (kernel1, 0, kernel2, scale, x2 - x1, width + x1);
}
- else if (x2 < 0 && x2 + width > 0)
+ else if (kernel2 == PIXMAN_KERNEL_LINEAR && x2 < 0 && x2 + width > 0)
{
return
integral (kernel1, x1, kernel2, scale, x2, - x2) +
@@ -189,13 +194,19 @@ integral (pixman_kernel_t kernel1, double x1,
}
else
{
- /* Integration via Simpson's rule */
-#define N_SEGMENTS 128
+ /* Integration via Simpson's rule
+ * See http://www.intmath.com/integration/6-simpsons-rule.php
+ * 12 segments (6 cubic approximations) seems to produce the best
+ * result for lanczos3.linear, which was the combination that
+ * showed the most errors. This makes sense as the lanczos3
+ * filter is 6 wide.
+ */
+#define N_SEGMENTS 12
#define SAMPLE(a1, a2) \
(filters[kernel1].func ((a1)) * filters[kernel2].func ((a2) * scale))
double s = 0.0;
- double h = width / (double)N_SEGMENTS;
+ double h = width / N_SEGMENTS;
int i;
s = SAMPLE (x1, x2);
@@ -204,11 +215,14 @@ integral (pixman_kernel_t kernel1, double x1,
{
double a1 = x1 + h * i;
double a2 = x2 + h * i;
+ s += 4 * SAMPLE (a1, a2);
+ }
+ for (i = 2; i < N_SEGMENTS; i += 2)
+ {
+ double a1 = x1 + h * i;
+ double a2 = x2 + h * i;
s += 2 * SAMPLE (a1, a2);
-
- if (i >= 2 && i < N_SEGMENTS - 1)
- s += 4 * SAMPLE (a1, a2);
}
s += SAMPLE (x1 + width, x2 + width);
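
These loops implement the classic composite Simpson weighting: endpoint
samples get weight 1, odd interior samples weight 4, even interior samples
weight 2, and composite Simpson then scales the sum by h/3. A standalone
sketch of the same pattern, assuming nothing from pixman:

#include <stdio.h>

#define N_SEGMENTS 12

/* Composite Simpson's rule over [a, b]: weights 1, 4, 2, 4, ..., 4, 1,
 * scaled by h / 3. Exact for polynomials up to degree 3. */
static double
simpson (double (*f) (double), double a, double b)
{
    double h = (b - a) / N_SEGMENTS;
    double s = f (a) + f (b);
    int i;

    for (i = 1; i < N_SEGMENTS; i += 2)
        s += 4 * f (a + h * i);
    for (i = 2; i < N_SEGMENTS; i += 2)
        s += 2 * f (a + h * i);

    return s * h / 3.0;
}

static double square (double x) { return x * x; }

int main (void)
{
    /* The integral of x^2 over [0, 1] is 1/3; Simpson is exact here. */
    printf ("%.12f\n", simpson (square, 0.0, 1.0));
    return 0;
}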
@@ -217,25 +231,20 @@ integral (pixman_kernel_t kernel1, double x1,
}
}
-static pixman_fixed_t *
-create_1d_filter (int *width,
+static void
+create_1d_filter (int width,
pixman_kernel_t reconstruct,
pixman_kernel_t sample,
double scale,
- int n_phases)
+ int n_phases,
+ pixman_fixed_t *pstart,
+ pixman_fixed_t *pend
+ )
{
- pixman_fixed_t *params, *p;
+ pixman_fixed_t *p = pstart;
double step;
- double size;
int i;
-
- size = scale * filters[sample].width + filters[reconstruct].width;
- *width = ceil (size);
-
- p = params = malloc (*width * n_phases * sizeof (pixman_fixed_t));
- if (!params)
- return NULL;
-
+ if (width <= 0) return;
step = 1.0 / n_phases;
for (i = 0; i < n_phases; ++i)
@@ -243,16 +252,16 @@ create_1d_filter (int *width,
double frac = step / 2.0 + i * step;
pixman_fixed_t new_total;
int x, x1, x2;
- double total;
+ double total, e;
/* Sample convolution of reconstruction and sampling
* filter. See rounding.txt regarding the rounding
* and sample positions.
*/
- x1 = ceil (frac - *width / 2.0 - 0.5);
- x2 = x1 + *width;
-
+ x1 = ceil (frac - width / 2.0 - 0.5);
+ x2 = x1 + width;
+ assert (p >= pstart && p + (x2 - x1) <= pend); /* assert validity of the following loop */
total = 0;
for (x = x1; x < x2; ++x)
{
@@ -274,29 +283,158 @@ create_1d_filter (int *width,
ihigh - ilow);
}
- total += c;
- *p++ = (pixman_fixed_t)(c * 65536.0 + 0.5);
+ *p = (pixman_fixed_t)floor (c * 65536.0 + 0.5);
+ total += *p;
+ p++;
}
- /* Normalize */
- p -= *width;
- total = 1 / total;
- new_total = 0;
+ /* Normalize, with error diffusion */
+ p -= width;
+ assert (p >= pstart && p + (x2 - x1) <= pend); /* assert validity of the following loop */
+
+ total = 65536.0 / total;
+ new_total = 0;
+ e = 0.0;
for (x = x1; x < x2; ++x)
{
- pixman_fixed_t t = (*p) * total + 0.5;
+ double v = (*p) * total + e;
+ pixman_fixed_t t = floor (v + 0.5);
+ e = v - t;
new_total += t;
*p++ = t;
}
- if (new_total != pixman_fixed_1)
- *(p - *width / 2) += (pixman_fixed_1 - new_total);
+ /* pixman_fixed_e's worth of error may remain; put it
+ * at the first sample, since that is the only one that
+ * hasn't had any error diffused into it.
+ */
+
+ assert (p - width >= pstart && p - width < pend); /* assert... */
+ *(p - width) += pixman_fixed_1 - new_total;
}
+}
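
The normalization pass above rescales the taps so they sum to pixman_fixed_1
and carries each rounding error into the next tap, leaving at most one
least-significant unit to patch onto the first sample. A standalone sketch of
that error-diffusion scheme (names illustrative, not from pixman):

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

typedef int32_t fixed_16_16;
#define FIXED_ONE 65536

/* Normalize n taps so they sum to exactly FIXED_ONE, diffusing the
 * rounding error forward as create_1d_filter does above. */
static void
normalize (fixed_16_16 *p, int n, double total)
{
    double scale = 65536.0 / total, e = 0.0;
    fixed_16_16 new_total = 0;
    int i;

    for (i = 0; i < n; i++)
    {
        double v = p[i] * scale + e;
        fixed_16_16 t = (fixed_16_16) floor (v + 0.5);

        e = v - t;
        new_total += t;
        p[i] = t;
    }
    /* at most one ulp of error remains; put it on the first tap */
    p[0] += FIXED_ONE - new_total;
}

int main (void)
{
    fixed_16_16 taps[3] = { 100, 100, 100 }; /* unnormalized weights */
    double total = taps[0] + taps[1] + taps[2];
    fixed_16_16 sum = 0;
    int i;

    normalize (taps, 3, total);
    for (i = 0; i < 3; i++)
        sum += taps[i];
    assert (sum == FIXED_ONE);
    printf ("%d %d %d (sum %d)\n", taps[0], taps[1], taps[2], sum);
    return 0;
}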
- return params;
+
+static int
+filter_width (pixman_kernel_t reconstruct, pixman_kernel_t sample, double size)
+{
+ return ceil (filters[reconstruct].width + size * filters[sample].width);
+}
+
+#ifdef PIXMAN_GNUPLOT
+
+/* If enable-gnuplot is configured, then you can pipe the output of a
+ * pixman-using program to gnuplot and get a continuously-updated plot
+ * of the horizontal filter. This works well with demos/scale to test
+ * the filter generation.
+ *
+ * The plot is all the different subposition filters shuffled
+ * together. This is misleading in a few cases:
+ *
+ * IMPULSE.BOX - goes up and down as the subfilters have different
+ * numbers of non-zero samples
+ * IMPULSE.TRIANGLE - somewhat crooked for the same reason
+ * 1-wide filters - look triangular, but a 1-wide box would be more
+ * accurate
+ */
+static void
+gnuplot_filter (int width, int n_phases, const pixman_fixed_t* p)
+{
+ double step;
+ int i, j;
+ int first;
+
+ step = 1.0 / n_phases;
+
+ printf ("set style line 1 lc rgb '#0060ad' lt 1 lw 0.5 pt 7 pi 1 ps 0.5\n");
+ printf ("plot [x=%g:%g] '-' with linespoints ls 1\n", -width*0.5, width*0.5);
+ /* Print a point at the origin so that y==0 line is included: */
+ printf ("0 0\n\n");
+
+ /* The position of the first sample of the phase corresponding to
+ * frac is given by:
+ *
+ * ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
+ *
+ * We have to find the frac that minimizes this expression.
+ *
+ * For odd widths, we have
+ *
+ * ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
+ * = ceil (frac) + K - frac
+ * = 1 + K - frac
+ *
+ * for some K, so this is minimized when frac is maximized and
+ * strictly growing with frac. So for odd widths, we can simply
+ * start at the last phase and go backwards.
+ *
+ * For even widths, we have
+ *
+ * ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
+ * = ceil (frac - 0.5) + K - frac
+ *
+ * The graph for this function (ignoring K) looks like this:
+ *
+ * 0.5
+ * | |\
+ * | | \
+ * | | \
+ * 0 | | \
+ * |\ |
+ * | \ |
+ * | \ |
+ * -0.5 | \|
+ * ---------------------------------
+ * 0 0.5 1
+ *
+ * So in this case we need to start with the phase whose frac is
+ * less than, but as close as possible to 0.5, then go backwards
+ * until we hit the first phase, then wrap around to the last
+ * phase and continue backwards.
+ *
+ * Which phase is as close as possible to 0.5? The location of the
+ * sampling point corresponding to the kth phase is given by
+ * 1/(2 * n_phases) + k / n_phases:
+ *
+ * 1/(2 * n_phases) + k / n_phases = 0.5
+ *
+ * from which it follows that
+ *
+ * k = (n_phases - 1) / 2
+ *
+ * rounded down is the phase in question.
+ */
+ if (width & 1)
+ first = n_phases - 1;
+ else
+ first = (n_phases - 1) / 2;
+
+ for (j = 0; j < width; ++j)
+ {
+ for (i = 0; i < n_phases; ++i)
+ {
+ int phase = first - i;
+ double frac, pos;
+
+ if (phase < 0)
+ phase = n_phases + phase;
+
+ frac = step / 2.0 + phase * step;
+ pos = ceil (frac - width / 2.0 - 0.5) + 0.5 - frac + j;
+
+ printf ("%g %g\n",
+ pos,
+ pixman_fixed_to_double (*(p + phase * width + j)));
+ }
+ }
+
+ printf ("e\n");
+ fflush (stdout);
}
+#endif
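
Usage sketch, assuming a build configured with the gnuplot option so that
PIXMAN_GNUPLOT is defined:

    $ demos/scale | gnuplot -persist

Each time the demo regenerates its filter parameters, gnuplot redraws the
plot of the horizontal filter.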
+
/* Create the parameter list for a SEPARABLE_CONVOLUTION filter
* with the given kernels and scale parameters
*/
@@ -313,38 +451,41 @@ pixman_filter_create_separable_convolution (int *n_values,
{
double sx = fabs (pixman_fixed_to_double (scale_x));
double sy = fabs (pixman_fixed_to_double (scale_y));
- pixman_fixed_t *horz = NULL, *vert = NULL, *params = NULL;
+ pixman_fixed_t *params;
int subsample_x, subsample_y;
int width, height;
+ width = filter_width (reconstruct_x, sample_x, sx);
subsample_x = (1 << subsample_bits_x);
- subsample_y = (1 << subsample_bits_y);
- horz = create_1d_filter (&width, reconstruct_x, sample_x, sx, subsample_x);
- vert = create_1d_filter (&height, reconstruct_y, sample_y, sy, subsample_y);
+ height = filter_width (reconstruct_y, sample_y, sy);
+ subsample_y = (1 << subsample_bits_y);
- if (!horz || !vert)
- goto out;
-
*n_values = 4 + width * subsample_x + height * subsample_y;
params = malloc (*n_values * sizeof (pixman_fixed_t));
if (!params)
- goto out;
+ return NULL;
params[0] = pixman_int_to_fixed (width);
params[1] = pixman_int_to_fixed (height);
params[2] = pixman_int_to_fixed (subsample_bits_x);
params[3] = pixman_int_to_fixed (subsample_bits_y);
- memcpy (params + 4, horz,
- width * subsample_x * sizeof (pixman_fixed_t));
- memcpy (params + 4 + width * subsample_x, vert,
- height * subsample_y * sizeof (pixman_fixed_t));
+ {
+ pixman_fixed_t
+ *xparams = params+4,
+ *yparams = xparams + width*subsample_x,
+ *endparams = params + *n_values;
+ create_1d_filter(width, reconstruct_x, sample_x, sx, subsample_x,
+ xparams, yparams);
+ create_1d_filter(height, reconstruct_y, sample_y, sy, subsample_y,
+ yparams, endparams);
+ }
-out:
- free (horz);
- free (vert);
+#ifdef PIXMAN_GNUPLOT
+ gnuplot_filter(width, subsample_x, params + 4);
+#endif
return params;
}
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 7cdea29..b4450cb 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -26,7 +26,7 @@
* SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
#include <string.h>
@@ -141,7 +141,8 @@ general_composite_rect (pixman_implementation_t *imp,
if ((src_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
(!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
(dest_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
- !(operator_needs_division (op)))
+ !(operator_needs_division (op)) &&
+ (dest_image->bits.dither == PIXMAN_DITHER_NONE))
{
width_flag = ITER_NARROW;
Bpp = 4;
@@ -155,23 +156,27 @@ general_composite_rect (pixman_implementation_t *imp,
#define ALIGN(addr) \
((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
- src_buffer = ALIGN (scanline_buffer);
- mask_buffer = ALIGN (src_buffer + width * Bpp);
- dest_buffer = ALIGN (mask_buffer + width * Bpp);
+ if (width <= 0 || _pixman_multiply_overflows_int (width, Bpp * 3))
+ return;
- if (ALIGN (dest_buffer + width * Bpp) >
- scanline_buffer + sizeof (stack_scanline_buffer))
+ if (width * Bpp * 3 > sizeof (stack_scanline_buffer) - 15 * 3)
{
- scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3);
+ scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 15 * 3);
if (!scanline_buffer)
return;
- src_buffer = ALIGN (scanline_buffer);
- mask_buffer = ALIGN (src_buffer + width * Bpp);
- dest_buffer = ALIGN (mask_buffer + width * Bpp);
+ memset (scanline_buffer, 0, width * Bpp * 3 + 15 * 3);
+ }
+ else
+ {
+ memset (stack_scanline_buffer, 0, sizeof (stack_scanline_buffer));
}
+ src_buffer = ALIGN (scanline_buffer);
+ mask_buffer = ALIGN (src_buffer + width * Bpp);
+ dest_buffer = ALIGN (mask_buffer + width * Bpp);
+
if (width_flag == ITER_WIDE)
{
/* To make sure there aren't any NANs in the buffers */
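
A note on the 15 * 3 slack above: ALIGN rounds an address up to the next
16-byte boundary via (addr + 15) & ~15 (for example, ((17 + 15) & ~15) = 32),
so each of the three scanline buffers can lose at most 15 bytes to alignment;
the 15 * 3 term in both the stack-size test and the heap allocation accounts
for exactly that worst case.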
diff --git a/pixman/pixman-glyph.c b/pixman/pixman-glyph.c
index 96a349a..dc90411 100644
--- a/pixman/pixman-glyph.c
+++ b/pixman/pixman-glyph.c
@@ -25,7 +25,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/pixman/pixman-gradient-walker.c b/pixman/pixman-gradient-walker.c
index 822f8e6..b31d5ad 100644
--- a/pixman/pixman-gradient-walker.c
+++ b/pixman/pixman-gradient-walker.c
@@ -24,7 +24,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -122,10 +122,9 @@ gradient_walker_reset (pixman_gradient_walker_t *walker,
left_c = right_c;
}
- /* The alpha channel is scaled to be in the [0, 255] interval,
- * and the red/green/blue channels are scaled to be in [0, 1].
+ /* The alpha/red/green/blue channels are scaled to be in [0, 1].
* This ensures that after premultiplication all channels will
- * be in the [0, 255] interval.
+ * be in the [0, 1] interval.
*/
la = (left_c->alpha * (1.0f/257.0f));
lr = (left_c->red * (1.0f/257.0f));
@@ -143,7 +142,7 @@ gradient_walker_reset (pixman_gradient_walker_t *walker,
if (FLOAT_IS_ZERO (rx - lx) || left_x == INT32_MIN || right_x == INT32_MAX)
{
walker->a_s = walker->r_s = walker->g_s = walker->b_s = 0.0f;
- walker->a_b = (la + ra) / 2.0f;
+ walker->a_b = (la + ra) / 510.0f;
walker->r_b = (lr + rr) / 510.0f;
walker->g_b = (lg + rg) / 510.0f;
walker->b_b = (lb + rb) / 510.0f;
@@ -152,12 +151,12 @@ gradient_walker_reset (pixman_gradient_walker_t *walker,
{
float w_rec = 1.0f / (rx - lx);
- walker->a_b = (la * rx - ra * lx) * w_rec;
+ walker->a_b = (la * rx - ra * lx) * w_rec * (1.0f/255.0f);
walker->r_b = (lr * rx - rr * lx) * w_rec * (1.0f/255.0f);
walker->g_b = (lg * rx - rg * lx) * w_rec * (1.0f/255.0f);
walker->b_b = (lb * rx - rb * lx) * w_rec * (1.0f/255.0f);
- walker->a_s = (ra - la) * w_rec;
+ walker->a_s = (ra - la) * w_rec * (1.0f/255.0f);
walker->r_s = (rr - lr) * w_rec * (1.0f/255.0f);
walker->g_s = (rg - lg) * w_rec * (1.0f/255.0f);
walker->b_s = (rb - lb) * w_rec * (1.0f/255.0f);
@@ -169,34 +168,97 @@ gradient_walker_reset (pixman_gradient_walker_t *walker,
walker->need_reset = FALSE;
}
-uint32_t
-_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
- pixman_fixed_48_16_t x)
+static argb_t
+pixman_gradient_walker_pixel_float (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x)
{
- float a, r, g, b;
- uint8_t a8, r8, g8, b8;
- uint32_t v;
+ argb_t f;
float y;
if (walker->need_reset || x < walker->left_x || x >= walker->right_x)
- gradient_walker_reset (walker, x);
+ gradient_walker_reset (walker, x);
y = x * (1.0f / 65536.0f);
- a = walker->a_s * y + walker->a_b;
- r = a * (walker->r_s * y + walker->r_b);
- g = a * (walker->g_s * y + walker->g_b);
- b = a * (walker->b_s * y + walker->b_b);
+ f.a = walker->a_s * y + walker->a_b;
+ f.r = f.a * (walker->r_s * y + walker->r_b);
+ f.g = f.a * (walker->g_s * y + walker->g_b);
+ f.b = f.a * (walker->b_s * y + walker->b_b);
- a8 = a + 0.5f;
- r8 = r + 0.5f;
- g8 = g + 0.5f;
- b8 = b + 0.5f;
+ return f;
+}
+
+static uint32_t
+pixman_gradient_walker_pixel_32 (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x)
+{
+ argb_t f;
+ float y;
+
+ if (walker->need_reset || x < walker->left_x || x >= walker->right_x)
+ gradient_walker_reset (walker, x);
+
+ y = x * (1.0f / 65536.0f);
+
+ /* Instead of [0...1] for ARGB we want [0...255], so multiply
+ * alpha by 255; the color channels then pick up the same factor
+ * through premultiplication by the scaled alpha.
+ *
+ * We don't use pixman_contract_from_float because doing so causes
+ * a 2x slowdown, and the values are already normalized, so we
+ * don't have to worry about values < 0.f or > 1.f
+ */
+ f.a = 255.f * (walker->a_s * y + walker->a_b);
+ f.r = f.a * (walker->r_s * y + walker->r_b);
+ f.g = f.a * (walker->g_s * y + walker->g_b);
+ f.b = f.a * (walker->b_s * y + walker->b_b);
- v = ((a8 << 24) & 0xff000000) |
- ((r8 << 16) & 0x00ff0000) |
- ((g8 << 8) & 0x0000ff00) |
- ((b8 >> 0) & 0x000000ff);
+ return (((uint32_t)(f.a + .5f) << 24) & 0xff000000) |
+ (((uint32_t)(f.r + .5f) << 16) & 0x00ff0000) |
+ (((uint32_t)(f.g + .5f) << 8) & 0x0000ff00) |
+ (((uint32_t)(f.b + .5f) >> 0) & 0x000000ff);
+}
+
+void
+_pixman_gradient_walker_write_narrow (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer)
+{
+ *buffer = pixman_gradient_walker_pixel_32 (walker, x);
+}
+
+void
+_pixman_gradient_walker_write_wide (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer)
+{
+ *(argb_t *)buffer = pixman_gradient_walker_pixel_float (walker, x);
+}
+
+void
+_pixman_gradient_walker_fill_narrow (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end)
+{
+ register uint32_t color;
+
+ color = pixman_gradient_walker_pixel_32 (walker, x);
+ while (buffer < end)
+ *buffer++ = color;
+}
+
+void
+_pixman_gradient_walker_fill_wide (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end)
+{
+ register argb_t color;
+ argb_t *buffer_wide = (argb_t *)buffer;
+ argb_t *end_wide = (argb_t *)end;
- return v;
+ color = pixman_gradient_walker_pixel_float (walker, x);
+ while (buffer_wide < end_wide)
+ *buffer_wide++ = color;
}
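
The packing in pixman_gradient_walker_pixel_32 relies on the channels already being premultiplied and scaled to [0, 255]; adding 0.5f before the integer conversion rounds to nearest. A standalone sketch of the same conversion (argb_t reduced to a plain struct here for illustration):

    #include <stdint.h>

    typedef struct { float a, r, g, b; } argb_t;  /* stand-in for pixman's */

    /* Pack premultiplied float channels, already in [0, 255],
     * into an a8r8g8b8 word; "+ 0.5f" rounds to nearest. */
    static uint32_t
    pack_argb (argb_t f)
    {
        return (((uint32_t)(f.a + 0.5f) << 24) & 0xff000000) |
               (((uint32_t)(f.r + 0.5f) << 16) & 0x00ff0000) |
               (((uint32_t)(f.g + 0.5f) <<  8) & 0x0000ff00) |
               (((uint32_t)(f.b + 0.5f) <<  0) & 0x000000ff);
    }
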
diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
index 1ff1a49..72796fc 100644
--- a/pixman/pixman-image.c
+++ b/pixman/pixman-image.c
@@ -21,7 +21,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
@@ -335,37 +335,47 @@ compute_image_info (pixman_image_t *image)
{
flags |= FAST_PATH_NEAREST_FILTER;
}
- else if (
- /* affine and integer translation components in matrix ... */
- ((flags & FAST_PATH_AFFINE_TRANSFORM) &&
- !pixman_fixed_frac (image->common.transform->matrix[0][2] |
- image->common.transform->matrix[1][2])) &&
- (
- /* ... combined with a simple rotation */
- (flags & (FAST_PATH_ROTATE_90_TRANSFORM |
- FAST_PATH_ROTATE_180_TRANSFORM |
- FAST_PATH_ROTATE_270_TRANSFORM)) ||
- /* ... or combined with a simple non-rotated translation */
- (image->common.transform->matrix[0][0] == pixman_fixed_1 &&
- image->common.transform->matrix[1][1] == pixman_fixed_1 &&
- image->common.transform->matrix[0][1] == 0 &&
- image->common.transform->matrix[1][0] == 0)
- )
- )
+ else if (flags & FAST_PATH_AFFINE_TRANSFORM)
{
- /* FIXME: there are some affine-test failures, showing that
- * handling of BILINEAR and NEAREST filter is not quite
- * equivalent when getting close to 32K for the translation
- * components of the matrix. That's likely some bug, but for
- * now just skip BILINEAR->NEAREST optimization in this case.
+ /* Suppose the transform is
+ *
+ * [ t00, t01, t02 ]
+ * [ t10, t11, t12 ]
+ * [ 0, 0, 1 ]
+ *
+ * and the destination coordinates are (n + 0.5, m + 0.5). Then
+ * the transformed x coordinate is:
+ *
+ * tx = t00 * (n + 0.5) + t01 * (m + 0.5) + t02
+ * = t00 * n + t01 * m + t02 + (t00 + t01) * 0.5
+ *
+ * which implies that if t00, t01 and t02 are all integers
+ * and (t00 + t01) is odd, then tx will be an integer plus 0.5,
+ * which means a BILINEAR filter will reduce to NEAREST. The same
+ * applies in the y direction.
*/
- pixman_fixed_t magic_limit = pixman_int_to_fixed (30000);
- if (image->common.transform->matrix[0][2] <= magic_limit &&
- image->common.transform->matrix[1][2] <= magic_limit &&
- image->common.transform->matrix[0][2] >= -magic_limit &&
- image->common.transform->matrix[1][2] >= -magic_limit)
+ pixman_fixed_t (*t)[3] = image->common.transform->matrix;
+
+ if ((pixman_fixed_frac (
+ t[0][0] | t[0][1] | t[0][2] |
+ t[1][0] | t[1][1] | t[1][2]) == 0) &&
+ (pixman_fixed_to_int (
+ (t[0][0] + t[0][1]) & (t[1][0] + t[1][1])) % 2) == 1)
{
- flags |= FAST_PATH_NEAREST_FILTER;
+ /* FIXME: there are some affine-test failures, showing that
+ * handling of BILINEAR and NEAREST filter is not quite
+ * equivalent when getting close to 32K for the translation
+ * components of the matrix. That's likely some bug, but for
+ * now just skip BILINEAR->NEAREST optimization in this case.
+ */
+ pixman_fixed_t magic_limit = pixman_int_to_fixed (30000);
+ if (image->common.transform->matrix[0][2] <= magic_limit &&
+ image->common.transform->matrix[1][2] <= magic_limit &&
+ image->common.transform->matrix[0][2] >= -magic_limit &&
+ image->common.transform->matrix[1][2] >= -magic_limit)
+ {
+ flags |= FAST_PATH_NEAREST_FILTER;
+ }
}
}
break;
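
A quick numeric check of the parity argument in the comment above, using illustrative values t00 = 2, t01 = 1, t02 = 5 (all integral, t00 + t01 odd):

    #include <stdio.h>

    int
    main (void)
    {
        double t00 = 2.0, t01 = 1.0, t02 = 5.0;
        int n, m;

        for (n = 0; n < 2; n++)
            for (m = 0; m < 2; m++)
            {
                /* tx = t00*n + t01*m + t02 + (t00 + t01) * 0.5
                 *    = integer + 1.5, so it always lands on a .5 */
                double tx = t00 * (n + 0.5) + t01 * (m + 0.5) + t02;
                printf ("n=%d m=%d tx=%.1f\n", n, m, tx);
            }
        return 0;
    }
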
@@ -557,7 +567,7 @@ _pixman_image_validate (pixman_image_t *image)
PIXMAN_EXPORT pixman_bool_t
pixman_image_set_clip_region32 (pixman_image_t * image,
- pixman_region32_t *region)
+ const pixman_region32_t *region)
{
image_common_t *common = (image_common_t *)image;
pixman_bool_t result;
@@ -581,7 +591,7 @@ pixman_image_set_clip_region32 (pixman_image_t * image,
PIXMAN_EXPORT pixman_bool_t
pixman_image_set_clip_region (pixman_image_t * image,
- pixman_region16_t *region)
+ const pixman_region16_t *region)
{
image_common_t *common = (image_common_t *)image;
pixman_bool_t result;
@@ -674,6 +684,41 @@ pixman_image_set_repeat (pixman_image_t *image,
image_property_changed (image);
}
+PIXMAN_EXPORT void
+pixman_image_set_dither (pixman_image_t *image,
+ pixman_dither_t dither)
+{
+ if (image->type == BITS)
+ {
+ if (image->bits.dither == dither)
+ return;
+
+ image->bits.dither = dither;
+
+ image_property_changed (image);
+ }
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_dither_offset (pixman_image_t *image,
+ int offset_x,
+ int offset_y)
+{
+ if (image->type == BITS)
+ {
+ if (image->bits.dither_offset_x == offset_x &&
+ image->bits.dither_offset_y == offset_y)
+ {
+ return;
+ }
+
+ image->bits.dither_offset_x = offset_x;
+ image->bits.dither_offset_y = offset_y;
+
+ image_property_changed (image);
+ }
+}
+
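
Typical use of the two new setters on a 16-bpp destination. The dither mode names follow the enum this series introduces (PIXMAN_DITHER_ORDERED_BLUE_NOISE_64 is assumed here), and the helper name is illustrative:

    #include <stdint.h>
    #include <pixman.h>

    static pixman_image_t *
    make_dithered_target (int width, int height, uint32_t *bits, int stride)
    {
        pixman_image_t *dst =
            pixman_image_create_bits (PIXMAN_r5g6b5, width, height,
                                      bits, stride);

        pixman_image_set_dither (dst, PIXMAN_DITHER_ORDERED_BLUE_NOISE_64);

        /* Offsetting the matrix keeps the pattern stable when the
         * surface is rendered in tiles. */
        pixman_image_set_dither_offset (dst, 0, 0);

        return dst;
    }
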
PIXMAN_EXPORT pixman_bool_t
pixman_image_set_filter (pixman_image_t * image,
pixman_filter_t filter,
@@ -832,6 +877,10 @@ pixman_image_set_accessors (pixman_image_t * image,
if (image->type == BITS)
{
+ /* Accessors only work for <= 32 bpp. */
+ if (PIXMAN_FORMAT_BPP(image->bits.format) > 32)
+ return_if_fail (!read_func && !write_func);
+
image->bits.read_func = read_func;
image->bits.write_func = write_func;
@@ -911,7 +960,7 @@ _pixman_image_get_solid (pixman_implementation_t *imp,
else if (image->bits.format == PIXMAN_x8r8g8b8)
result = image->bits.bits[0] | 0xff000000;
else if (image->bits.format == PIXMAN_a8)
- result = (*(uint8_t *)image->bits.bits) << 24;
+ result = (uint32_t)(*(uint8_t *)image->bits.bits) << 24;
else
goto otherwise;
}
diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c
index 5884054..69fa70b 100644
--- a/pixman/pixman-implementation.c
+++ b/pixman/pixman-implementation.c
@@ -22,7 +22,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
#include "pixman-private.h"
@@ -63,7 +63,7 @@ typedef struct
} cache [N_CACHED_FAST_PATHS];
} cache_t;
-PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
+PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache)
static void
dummy_composite_rect (pixman_implementation_t *imp,
@@ -380,6 +380,11 @@ _pixman_disabled (const char *name)
return FALSE;
}
+static const pixman_fast_path_t empty_fast_path[] =
+{
+ { PIXMAN_OP_NONE }
+};
+
pixman_implementation_t *
_pixman_choose_implementation (void)
{
@@ -397,5 +402,16 @@ _pixman_choose_implementation (void)
imp = _pixman_implementation_create_noop (imp);
+ if (_pixman_disabled ("wholeops"))
+ {
+ pixman_implementation_t *cur;
+
+ /* Disable all whole-operation paths except the general one,
+ * so that optimized iterators are used as much as possible.
+ */
+ for (cur = imp; cur->fallback; cur = cur->fallback)
+ cur->fast_paths = empty_fast_path;
+ }
+
return imp;
}
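
The "wholeops" knob, like the other names accepted by _pixman_disabled, is read from the PIXMAN_DISABLE environment variable. A simplified sketch of how such a knob can be consumed (an assumption modelled on _pixman_disabled elsewhere in pixman-implementation.c, which also reports the disable to stderr):

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    static int
    is_disabled (const char *name)
    {
        const char *env = getenv ("PIXMAN_DISABLE");

        if (env && strstr (env, name))
        {
            fprintf (stderr, "pixman: disabled %s implementation\n", name);
            return 1;
        }
        return 0;
    }
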
diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h
index dd1c2f1..f785910 100644
--- a/pixman/pixman-inlines.h
+++ b/pixman/pixman-inlines.h
@@ -222,6 +222,31 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,
#endif
#endif // BILINEAR_INTERPOLATION_BITS <= 4
+static force_inline argb_t
+bilinear_interpolation_float (argb_t tl, argb_t tr,
+ argb_t bl, argb_t br,
+ float distx, float disty)
+{
+ float distxy, distxiy, distixy, distixiy;
+ argb_t r;
+
+ distxy = distx * disty;
+ distxiy = distx * (1.f - disty);
+ distixy = (1.f - distx) * disty;
+ distixiy = (1.f - distx) * (1.f - disty);
+
+ r.a = tl.a * distixiy + tr.a * distxiy +
+ bl.a * distixy + br.a * distxy;
+ r.r = tl.r * distixiy + tr.r * distxiy +
+ bl.r * distixy + br.r * distxy;
+ r.g = tl.g * distixiy + tr.g * distxiy +
+ bl.g * distixy + br.g * distxy;
+ r.b = tl.b * distixiy + tr.b * distxiy +
+ bl.b * distixy + br.b * distxy;
+
+ return r;
+}
+
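
The four weights in bilinear_interpolation_float always sum to 1, since (1-dx)(1-dy) + dx(1-dy) + (1-dx)dy + dx*dy = 1, so interpolating four equal pixels returns that value (up to rounding). A single-channel check:

    #include <assert.h>

    static float
    lerp2d (float tl, float tr, float bl, float br, float dx, float dy)
    {
        return tl * (1.f - dx) * (1.f - dy) + tr * dx * (1.f - dy) +
               bl * (1.f - dx) * dy         + br * dx * dy;
    }

    int
    main (void)
    {
        float v = lerp2d (0.25f, 0.25f, 0.25f, 0.25f, 0.3f, 0.7f);
        assert (v > 0.2499f && v < 0.2501f);  /* 0.25 up to rounding */
        return 0;
    }
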
/*
* For each scanline fetched from source image with PAD repeat:
* - calculate how many pixels need to be padded on the left side
@@ -747,7 +772,8 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp,
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func) \
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func), \
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
/*****************************************************************************/
diff --git a/pixman/pixman-linear-gradient.c b/pixman/pixman-linear-gradient.c
index 40c8c9f..014b69c 100644
--- a/pixman/pixman-linear-gradient.c
+++ b/pixman/pixman-linear-gradient.c
@@ -26,7 +26,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
#include "pixman-private.h"
@@ -89,8 +89,11 @@ linear_gradient_is_horizontal (pixman_image_t *image,
}
static uint32_t *
-linear_get_scanline_narrow (pixman_iter_t *iter,
- const uint32_t *mask)
+linear_get_scanline (pixman_iter_t *iter,
+ const uint32_t *mask,
+ int Bpp,
+ pixman_gradient_walker_write_t write_pixel,
+ pixman_gradient_walker_fill_t fill_pixel)
{
pixman_image_t *image = iter->image;
int x = iter->x;
@@ -103,7 +106,7 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
pixman_fixed_48_16_t dx, dy;
gradient_t *gradient = (gradient_t *)image;
linear_gradient_t *linear = (linear_gradient_t *)image;
- uint32_t *end = buffer + width;
+ uint32_t *end = buffer + width * (Bpp / 4);
pixman_gradient_walker_t walker;
_pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
@@ -137,7 +140,7 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
if (l == 0 || unit.vector[2] == 0)
{
/* affine transformation only */
- pixman_fixed_32_32_t t, next_inc;
+ pixman_fixed_32_32_t t, next_inc;
double inc;
if (l == 0 || v.vector[2] == 0)
@@ -152,7 +155,7 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
invden = pixman_fixed_1 * (double) pixman_fixed_1 /
(l * (double) v.vector[2]);
v2 = v.vector[2] * (1. / pixman_fixed_1);
- t = ((dx * v.vector[0] + dy * v.vector[1]) -
+ t = ((dx * v.vector[0] + dy * v.vector[1]) -
(dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
}
@@ -160,11 +163,7 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
if (((pixman_fixed_32_32_t )(inc * width)) == 0)
{
- register uint32_t color;
-
- color = _pixman_gradient_walker_pixel (&walker, t);
- while (buffer < end)
- *buffer++ = color;
+ fill_pixel (&walker, t, buffer, end);
}
else
{
@@ -175,12 +174,11 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
{
if (!mask || *mask++)
{
- *buffer = _pixman_gradient_walker_pixel (&walker,
- t + next_inc);
+ write_pixel (&walker, t + next_inc, buffer);
}
i++;
next_inc = inc * i;
- buffer++;
+ buffer += (Bpp / 4);
}
}
}
@@ -202,14 +200,14 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
invden = pixman_fixed_1 * (double) pixman_fixed_1 /
(l * (double) v.vector[2]);
v2 = v.vector[2] * (1. / pixman_fixed_1);
- t = ((dx * v.vector[0] + dy * v.vector[1]) -
+ t = ((dx * v.vector[0] + dy * v.vector[1]) -
(dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
}
- *buffer = _pixman_gradient_walker_pixel (&walker, t);
+ write_pixel (&walker, t, buffer);
}
- ++buffer;
+ buffer += (Bpp / 4);
v.vector[0] += unit.vector[0];
v.vector[1] += unit.vector[1];
@@ -223,14 +221,21 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
}
static uint32_t *
-linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+linear_get_scanline_narrow (pixman_iter_t *iter,
+ const uint32_t *mask)
{
- uint32_t *buffer = linear_get_scanline_narrow (iter, NULL);
+ return linear_get_scanline (iter, mask, 4,
+ _pixman_gradient_walker_write_narrow,
+ _pixman_gradient_walker_fill_narrow);
+}
- pixman_expand_to_float (
- (argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
- return buffer;
+static uint32_t *
+linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ return linear_get_scanline (iter, NULL, 16,
+ _pixman_gradient_walker_write_wide,
+ _pixman_gradient_walker_fill_wide);
}
void
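
The refactoring above folds the narrow and wide scanline walkers into one routine parameterized by bytes-per-pixel and writer/filler callbacks; the buffer pointer stays a uint32_t *, so it advances by Bpp / 4 per pixel. The pattern in miniature (names illustrative, not pixman's):

    #include <stdint.h>

    typedef void (*write_fn) (uint32_t *dst, float t);

    static void
    walk_scanline (uint32_t *buffer, int width, int Bpp, write_fn write)
    {
        uint32_t *end = buffer + width * (Bpp / 4);  /* 4 narrow, 16 wide */
        float t = 0.f;

        while (buffer < end)
        {
            write (buffer, t);
            t += 1.f;
            buffer += Bpp / 4;
        }
    }
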
diff --git a/pixman/pixman-matrix.c b/pixman/pixman-matrix.c
index 4032c13..da5209c 100644
--- a/pixman/pixman-matrix.c
+++ b/pixman/pixman-matrix.c
@@ -25,7 +25,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <math.h>
@@ -273,7 +273,7 @@ pixman_transform_point_31_16 (const pixman_transform_t *t,
{
/* the divisor is small, we can actually keep all the bits */
int64_t hi, rhi, lo, rlo;
- int64_t div = (divint << 16) + divfrac;
+ int64_t div = ((uint64_t)divint << 16) + divfrac;
fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32);
rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
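
The cast matters because left-shifting a negative int64_t is undefined behaviour in C; doing the shift on the unsigned bit pattern and converting back yields the intended fixed-point value. In isolation (a sketch, assuming two's-complement conversion semantics):

    #include <stdint.h>

    /* Build a 48.16 fixed-point value from integer and fractional
     * parts without shifting a possibly negative signed value. */
    static int64_t
    make_fixed_64_16 (int64_t divint, int64_t divfrac)
    {
        return (int64_t) (((uint64_t) divint << 16) + divfrac);
    }
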
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 866e93e..9dad163 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * Author: Nemanja Lukic (nlukic@mips.com)
+ * Author: Nemanja Lukic (nemanja.lukic@rt-rk.com)
*/
#include "pixman-private.h"
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index cab122d..e238566 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * Author: Nemanja Lukic (nlukic@mips.com)
+ * Author: Nemanja Lukic (nemanja.lukic@rt-rk.com)
*/
#ifndef PIXMAN_MIPS_DSPR2_ASM_H
@@ -72,6 +72,7 @@
#define LEAF_MIPS32R2(symbol) \
.globl symbol; \
.align 2; \
+ .hidden symbol; \
.type symbol, @function; \
.ent symbol, 0; \
symbol: .frame sp, 0, ra; \
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index e10c9df..c43eb1e 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -26,11 +26,11 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * Author: Nemanja Lukic (nlukic@mips.com)
+ * Author: Nemanja Lukic (nemanja.lukic@rt-rk.com)
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -388,11 +388,11 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
- PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
- PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
- PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, mips_0565_8_0565),
- PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, mips_0565_8_0565),
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, mips_0565_8_0565),
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, mips_0565_8_0565),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),
diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h
index 955ed70..57b3835 100644
--- a/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman-mips-dspr2.h
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * Author: Nemanja Lukic (nlukic@mips.com)
+ * Author: Nemanja Lukic (nemanja.lukic@rt-rk.com)
*/
#ifndef PIXMAN_MIPS_DSPR2_H
@@ -328,12 +328,6 @@ FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_pad_##op, \
scaled_nearest_scanline_mips_##name##_##op, \
src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
-/* Provide entries for the fast path table */
-#define PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
-
/****************************************************************************/
#define PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST(flags, name, op, \
diff --git a/pixman/pixman-mips.c b/pixman/pixman-mips.c
index 3048813..7479a08 100644
--- a/pixman/pixman-mips.c
+++ b/pixman/pixman-mips.c
@@ -20,7 +20,7 @@
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index f9a92ce..52c70e9 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -30,7 +30,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
@@ -60,7 +60,7 @@ _mm_empty (void)
#endif
#ifdef USE_X86_MMX
-# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
+# if (defined(__SSE2__) || defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
# include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
@@ -89,21 +89,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
return __A;
}
-# ifdef __OPTIMIZE__
-extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
-{
- __m64 ret;
-
- asm ("pshufw %2, %1, %0\n\t"
- : "=y" (ret)
- : "y" (__A), "K" (__N)
- );
-
- return ret;
-}
-# else
-# define _mm_shuffle_pi16(A, N) \
+# define _mm_shuffle_pi16(A, N) \
({ \
__m64 ret; \
\
@@ -114,11 +100,10 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
\
ret; \
})
-# endif
# endif
#endif
-#ifndef _MSC_VER
+#ifndef _MM_SHUFFLE
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif
@@ -402,8 +387,10 @@ in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
- /* x86's alignment restrictions are very relaxed. */
- return *(__m64 *)p;
+ /* x86's alignment restrictions are very relaxed, but that's no excuse */
+ __m64 r;
+ memcpy(&r, p, sizeof(__m64));
+ return r;
#elif defined USE_ARM_IWMMXT
int align = (uintptr_t)p & 7;
__m64 *aligned_p;
@@ -422,7 +409,9 @@ static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
/* x86's alignment restrictions are very relaxed. */
- return *p;
+ uint32_t r;
+ memcpy(&r, p, sizeof(uint32_t));
+ return r;
#else
struct __una_u32 { uint32_t x __attribute__((packed)); };
const struct __una_u32 *ptr = (const struct __una_u32 *) p;
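
memcpy is the portable idiom for an unaligned load: it carries no alignment assumption, so there is no undefined behaviour, and x86 compilers lower it to a single mov. The same idiom in isolation:

    #include <stdint.h>
    #include <string.h>

    static inline uint32_t
    load_u32_unaligned (const void *p)
    {
        uint32_t v;
        memcpy (&v, p, sizeof v);  /* folded to one mov on x86 */
        return v;
    }
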
@@ -3555,6 +3544,105 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
_mm_empty ();
}
+static force_inline void
+scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t* pd,
+ const uint32_t* ps,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t src_width_fixed,
+ pixman_bool_t fully_transparent_src)
+{
+ if (fully_transparent_src)
+ return;
+
+ while (w)
+ {
+ __m64 d = load (pd);
+ __m64 s = load (ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
+ pd++;
+
+ w--;
+ }
+
+ _mm_empty ();
+}
+
+FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
+ scaled_nearest_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
+ scaled_nearest_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
+ scaled_nearest_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
+ scaled_nearest_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, NORMAL)
+
+static force_inline void
+scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask,
+ uint32_t * dst,
+ const uint32_t * src,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t src_width_fixed,
+ pixman_bool_t zero_src)
+{
+ __m64 mm_mask;
+
+ if (zero_src || (*mask >> 24) == 0)
+ {
+ /* A workaround for https://gcc.gnu.org/PR47759 */
+ _mm_empty ();
+ return;
+ }
+
+ mm_mask = expand_alpha (load8888 (mask));
+
+ while (w)
+ {
+ uint32_t s = *(src + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ if (s)
+ {
+ __m64 ms = load8888 (&s);
+ __m64 alpha = expand_alpha (ms);
+ __m64 dest = load8888 (dst);
+
+ store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
+ }
+
+ dst++;
+ w--;
+ }
+
+ _mm_empty ();
+}
+
+FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
+ scaled_nearest_scanline_mmx_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
+ scaled_nearest_scanline_mmx_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
+ scaled_nearest_scanline_mmx_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
+ scaled_nearest_scanline_mmx_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
+
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
#define BMSK (BSHIFT - 1)
@@ -3866,7 +3954,7 @@ mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
while (w && (((uintptr_t)dst) & 15))
{
- *dst++ = *(src++) << 24;
+ *dst++ = (uint32_t)*(src++) << 24;
w--;
}
@@ -3893,7 +3981,7 @@ mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
while (w)
{
- *dst++ = *(src++) << 24;
+ *dst++ = (uint32_t)*(src++) << 24;
w--;
}
@@ -3995,6 +4083,16 @@ static const pixman_fast_path_t mmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
+
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888 ),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888 ),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888 ),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888 ),
+
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
diff --git a/pixman/pixman-noop.c b/pixman/pixman-noop.c
index e598904..e43199b 100644
--- a/pixman/pixman-noop.c
+++ b/pixman/pixman-noop.c
@@ -22,7 +22,7 @@
* DEALINGS IN THE SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <string.h>
#include <stdlib.h>
diff --git a/pixman/pixman-ppc.c b/pixman/pixman-ppc.c
index a6e7bb0..926eb44 100644
--- a/pixman/pixman-ppc.c
+++ b/pixman/pixman-ppc.c
@@ -20,7 +20,7 @@
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -68,6 +68,24 @@ pixman_have_vmx (void)
return have_vmx;
}
+#elif defined (__FreeBSD__)
+#include <machine/cpu.h>
+#include <sys/auxv.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+ unsigned long cpufeatures;
+ int have_vmx;
+
+ if (elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures)))
+ return FALSE;
+
+ have_vmx = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
+ return have_vmx;
+}
+
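
elf_aux_info() is FreeBSD's counterpart to glibc's getauxval(). On Linux with glibc >= 2.16 the equivalent check could be written as below; this is an alternative sketch, not the code this series uses, since the Linux path that follows reads the auxv data itself:

    #include <sys/auxv.h>      /* getauxval, AT_HWCAP */
    #include <asm/cputable.h>  /* PPC_FEATURE_HAS_ALTIVEC */

    static int
    have_altivec (void)
    {
        return (getauxval (AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC) != 0;
    }
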
#elif defined (__linux__)
#include <sys/types.h>
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index fdc966a..34fb69b 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -1,5 +1,3 @@
-#include <float.h>
-
#ifndef PIXMAN_PRIVATE_H
#define PIXMAN_PRIVATE_H
@@ -7,7 +5,7 @@
* The defines which are shared between C and assembly code
*/
-/* bilinear interpolation precision (must be <= 8) */
+/* bilinear interpolation precision (must be < 8) */
#define BILINEAR_INTERPOLATION_BITS 7
#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
@@ -30,6 +28,7 @@
#include <stdio.h>
#include <string.h>
#include <stddef.h>
+#include <float.h>
#include "pixman-compiler.h"
@@ -181,6 +180,10 @@ struct bits_image
uint32_t * free_me;
int rowstride; /* in number of uint32_t's */
+ pixman_dither_t dither;
+ uint32_t dither_offset_y;
+ uint32_t dither_offset_x;
+
fetch_scanline_t fetch_scanline_32;
fetch_pixel_32_t fetch_pixel_32;
store_scanline_t store_scanline_32;
@@ -364,9 +367,38 @@ void
_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker,
pixman_fixed_48_16_t pos);
-uint32_t
-_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
- pixman_fixed_48_16_t x);
+typedef void (*pixman_gradient_walker_write_t) (
+ pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer);
+
+void
+_pixman_gradient_walker_write_narrow(pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer);
+
+void
+_pixman_gradient_walker_write_wide(pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer);
+
+typedef void (*pixman_gradient_walker_fill_t) (
+ pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end);
+
+void
+_pixman_gradient_walker_fill_narrow(pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end);
+
+void
+_pixman_gradient_walker_fill_wide(pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end);
/*
* Edges
@@ -608,6 +640,11 @@ pixman_implementation_t *
_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
#endif
+#ifdef USE_ARM_A64_NEON
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+#endif
+
#ifdef USE_MIPS_DSPR2
pixman_implementation_t *
_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
@@ -819,11 +856,11 @@ pixman_contract_from_float (uint32_t *dst,
/* Region Helpers */
pixman_bool_t
pixman_region32_copy_from_region16 (pixman_region32_t *dst,
- pixman_region16_t *src);
+ const pixman_region16_t *src);
pixman_bool_t
pixman_region16_copy_from_region32 (pixman_region16_t *dst,
- pixman_region32_t *src);
+ const pixman_region32_t *src);
/* Doubly linked lists */
typedef struct pixman_link_t pixman_link_t;
@@ -1013,28 +1050,9 @@ float pixman_unorm_to_float (uint16_t u, int n_bits);
* Various debugging code
*/
-#undef DEBUG
-
#define COMPILE_TIME_ASSERT(x) \
do { typedef int compile_time_assertion [(x)?1:-1]; } while (0)
-/* Turn on debugging depending on what type of release this is
- */
-#if (((PIXMAN_VERSION_MICRO % 2) == 0) && ((PIXMAN_VERSION_MINOR % 2) == 1))
-
-/* Debugging gets turned on for development releases because these
- * are the things that end up in bleeding edge distributions such
- * as Rawhide etc.
- *
- * For performance reasons we don't turn it on for stable releases or
- * random git checkouts. (Random git checkouts are often used for
- * performance work).
- */
-
-# define DEBUG
-
-#endif
-
void
_pixman_log_error (const char *function, const char *message);
@@ -1074,16 +1092,19 @@ _pixman_log_error (const char *function, const char *message);
typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;
+PIXMAN_EXPORT
pixman_bool_t
pixman_transform_point_31_16 (const pixman_transform_t *t,
const pixman_vector_48_16_t *v,
pixman_vector_48_16_t *result);
+PIXMAN_EXPORT
void
pixman_transform_point_31_16_3d (const pixman_transform_t *t,
const pixman_vector_48_16_t *v,
pixman_vector_48_16_t *result);
+PIXMAN_EXPORT
void
pixman_transform_point_31_16_affine (const pixman_transform_t *t,
const pixman_vector_48_16_t *v,
diff --git a/pixman/pixman-radial-gradient.c b/pixman/pixman-radial-gradient.c
index 6a21796..38e1052 100644
--- a/pixman/pixman-radial-gradient.c
+++ b/pixman/pixman-radial-gradient.c
@@ -28,7 +28,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
#include <math.h>
@@ -66,15 +66,18 @@ fdot (double x1,
return x1 * x2 + y1 * y2 + z1 * z2;
}
-static uint32_t
-radial_compute_color (double a,
- double b,
- double c,
- double inva,
- double dr,
- double mindr,
- pixman_gradient_walker_t *walker,
- pixman_repeat_t repeat)
+static void
+radial_write_color (double a,
+ double b,
+ double c,
+ double inva,
+ double dr,
+ double mindr,
+ pixman_gradient_walker_t *walker,
+ pixman_repeat_t repeat,
+ int Bpp,
+ pixman_gradient_walker_write_t write_pixel,
+ uint32_t *buffer)
{
/*
* In this function error propagation can lead to bad results:
@@ -99,21 +102,31 @@ radial_compute_color (double a,
double t;
if (b == 0)
- return 0;
+ {
+ memset (buffer, 0, Bpp);
+ return;
+ }
t = pixman_fixed_1 / 2 * c / b;
if (repeat == PIXMAN_REPEAT_NONE)
{
if (0 <= t && t <= pixman_fixed_1)
- return _pixman_gradient_walker_pixel (walker, t);
+ {
+ write_pixel (walker, t, buffer);
+ return;
+ }
}
else
{
if (t * dr >= mindr)
- return _pixman_gradient_walker_pixel (walker, t);
+ {
+ write_pixel (walker, t, buffer);
+ return;
+ }
}
- return 0;
+ memset (buffer, 0, Bpp);
+ return;
}
discr = fdot (b, a, 0, b, -c, 0);
@@ -139,24 +152,40 @@ radial_compute_color (double a,
if (repeat == PIXMAN_REPEAT_NONE)
{
if (0 <= t0 && t0 <= pixman_fixed_1)
- return _pixman_gradient_walker_pixel (walker, t0);
+ {
+ write_pixel (walker, t0, buffer);
+ return;
+ }
else if (0 <= t1 && t1 <= pixman_fixed_1)
- return _pixman_gradient_walker_pixel (walker, t1);
+ {
+ write_pixel (walker, t1, buffer);
+ return;
+ }
}
else
{
if (t0 * dr >= mindr)
- return _pixman_gradient_walker_pixel (walker, t0);
+ {
+ write_pixel (walker, t0, buffer);
+ return;
+ }
else if (t1 * dr >= mindr)
- return _pixman_gradient_walker_pixel (walker, t1);
+ {
+ write_pixel (walker, t1, buffer);
+ return;
+ }
}
}
- return 0;
+ memset (buffer, 0, Bpp);
+ return;
}
static uint32_t *
-radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+radial_get_scanline (pixman_iter_t *iter,
+ const uint32_t *mask,
+ int Bpp,
+ pixman_gradient_walker_write_t write_pixel)
{
/*
* Implementation of radial gradients following the PDF specification.
@@ -247,7 +276,7 @@ radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
gradient_t *gradient = (gradient_t *)image;
radial_gradient_t *radial = (radial_gradient_t *)image;
- uint32_t *end = buffer + width;
+ uint32_t *end = buffer + width * (Bpp / 4);
pixman_gradient_walker_t walker;
pixman_vector_t v, unit;
@@ -330,18 +359,21 @@ radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
{
if (!mask || *mask++)
{
- *buffer = radial_compute_color (radial->a, b, c,
- radial->inva,
- radial->delta.radius,
- radial->mindr,
- &walker,
- image->common.repeat);
+ radial_write_color (radial->a, b, c,
+ radial->inva,
+ radial->delta.radius,
+ radial->mindr,
+ &walker,
+ image->common.repeat,
+ Bpp,
+ write_pixel,
+ buffer);
}
b += db;
c += dc;
dc += ddc;
- ++buffer;
+ buffer += (Bpp / 4);
}
}
else
@@ -375,20 +407,23 @@ radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
pdx, pdy, radial->c1.radius);
/* / pixman_fixed_1 / pixman_fixed_1 */
- *buffer = radial_compute_color (radial->a, b, c,
- radial->inva,
- radial->delta.radius,
- radial->mindr,
- &walker,
- image->common.repeat);
+ radial_write_color (radial->a, b, c,
+ radial->inva,
+ radial->delta.radius,
+ radial->mindr,
+ &walker,
+ image->common.repeat,
+ Bpp,
+ write_pixel,
+ buffer);
}
else
{
- *buffer = 0;
+ memset (buffer, 0, Bpp);
}
}
- ++buffer;
+ buffer += (Bpp / 4);
v.vector[0] += unit.vector[0];
v.vector[1] += unit.vector[1];
@@ -401,14 +436,17 @@ radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
}
static uint32_t *
-radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
{
- uint32_t *buffer = radial_get_scanline_narrow (iter, NULL);
-
- pixman_expand_to_float (
- (argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+ return radial_get_scanline (iter, mask, 4,
+ _pixman_gradient_walker_write_narrow);
+}
- return buffer;
+static uint32_t *
+radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ return radial_get_scanline (iter, NULL, 16,
+ _pixman_gradient_walker_write_wide);
}
void
@@ -422,11 +460,11 @@ _pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
PIXMAN_EXPORT pixman_image_t *
pixman_image_create_radial_gradient (const pixman_point_fixed_t * inner,
- const pixman_point_fixed_t * outer,
- pixman_fixed_t inner_radius,
- pixman_fixed_t outer_radius,
- const pixman_gradient_stop_t *stops,
- int n_stops)
+ const pixman_point_fixed_t * outer,
+ pixman_fixed_t inner_radius,
+ pixman_fixed_t outer_radius,
+ const pixman_gradient_stop_t *stops,
+ int n_stops)
{
pixman_image_t *image;
radial_gradient_t *radial;
diff --git a/pixman/pixman-region.c b/pixman/pixman-region.c
index 59bc9c7..537d5fb 100644
--- a/pixman/pixman-region.c
+++ b/pixman/pixman-region.c
@@ -76,7 +76,7 @@
#define PIXREGION_SIZE(reg) ((reg)->data ? (reg)->data->size : 0)
#define PIXREGION_RECTS(reg) \
((reg)->data ? (box_type_t *)((reg)->data + 1) \
- : &(reg)->extents)
+ : (box_type_t *)&(reg)->extents)
#define PIXREGION_BOXPTR(reg) ((box_type_t *)((reg)->data + 1))
#define PIXREGION_BOX(reg, i) (&PIXREGION_BOXPTR (reg)[i])
#define PIXREGION_TOP(reg) PIXREGION_BOX (reg, (reg)->data->numRects)
@@ -292,7 +292,7 @@ alloc_data (size_t n)
} while (0)
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_equal) (region_type_t *reg1, region_type_t *reg2)
+PREFIX (_equal) (const region_type_t *reg1, const region_type_t *reg2)
{
int i;
box_type_t *rects1;
@@ -395,7 +395,7 @@ PREFIX (_init_rect) (region_type_t * region,
}
PIXMAN_EXPORT void
-PREFIX (_init_with_extents) (region_type_t *region, box_type_t *extents)
+PREFIX (_init_with_extents) (region_type_t *region, const box_type_t *extents)
{
if (!GOOD_RECT (extents))
{
@@ -417,13 +417,13 @@ PREFIX (_fini) (region_type_t *region)
}
PIXMAN_EXPORT int
-PREFIX (_n_rects) (region_type_t *region)
+PREFIX (_n_rects) (const region_type_t *region)
{
return PIXREGION_NUMRECTS (region);
}
PIXMAN_EXPORT box_type_t *
-PREFIX (_rectangles) (region_type_t *region,
+PREFIX (_rectangles) (const region_type_t *region,
int *n_rects)
{
if (n_rects)
@@ -505,7 +505,7 @@ pixman_rect_alloc (region_type_t * region,
}
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_copy) (region_type_t *dst, region_type_t *src)
+PREFIX (_copy) (region_type_t *dst, const region_type_t *src)
{
GOOD (dst);
GOOD (src);
@@ -746,8 +746,8 @@ typedef pixman_bool_t (*overlap_proc_ptr) (region_type_t *region,
static pixman_bool_t
pixman_op (region_type_t * new_reg, /* Place to store result */
- region_type_t * reg1, /* First region in operation */
- region_type_t * reg2, /* 2d region in operation */
+ const region_type_t * reg1, /* First region in operation */
+ const region_type_t * reg2, /* 2d region in operation */
overlap_proc_ptr overlap_func, /* Function to call for over-
* lapping bands */
int append_non1, /* Append non-overlapping bands
@@ -1155,8 +1155,8 @@ pixman_region_intersect_o (region_type_t *region,
PIXMAN_EXPORT pixman_bool_t
PREFIX (_intersect) (region_type_t * new_reg,
- region_type_t * reg1,
- region_type_t * reg2)
+ const region_type_t * reg1,
+ const region_type_t * reg2)
{
GOOD (reg1);
GOOD (reg2);
@@ -1321,7 +1321,7 @@ pixman_region_union_o (region_type_t *region,
PIXMAN_EXPORT pixman_bool_t
PREFIX(_intersect_rect) (region_type_t *dest,
- region_type_t *source,
+ const region_type_t *source,
int x, int y,
unsigned int width,
unsigned int height)
@@ -1342,7 +1342,7 @@ PREFIX(_intersect_rect) (region_type_t *dest,
*/
PIXMAN_EXPORT pixman_bool_t
PREFIX (_union_rect) (region_type_t *dest,
- region_type_t *source,
+ const region_type_t *source,
int x,
int y,
unsigned int width,
@@ -1368,9 +1368,9 @@ PREFIX (_union_rect) (region_type_t *dest,
}
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_union) (region_type_t *new_reg,
- region_type_t *reg1,
- region_type_t *reg2)
+PREFIX (_union) (region_type_t * new_reg,
+ const region_type_t *reg1,
+ const region_type_t *reg2)
{
/* Return TRUE if some overlap
* between reg1, reg2
@@ -1954,9 +1954,9 @@ pixman_region_subtract_o (region_type_t * region,
*-----------------------------------------------------------------------
*/
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_subtract) (region_type_t *reg_d,
- region_type_t *reg_m,
- region_type_t *reg_s)
+PREFIX (_subtract) (region_type_t * reg_d,
+ const region_type_t *reg_m,
+ const region_type_t *reg_s)
{
GOOD (reg_m);
GOOD (reg_s);
@@ -2019,9 +2019,9 @@ PREFIX (_subtract) (region_type_t *reg_d,
*-----------------------------------------------------------------------
*/
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_inverse) (region_type_t *new_reg, /* Destination region */
- region_type_t *reg1, /* Region to invert */
- box_type_t * inv_rect) /* Bounding box for inversion */
+PREFIX (_inverse) (region_type_t * new_reg, /* Destination region */
+ const region_type_t *reg1, /* Region to invert */
+ const box_type_t * inv_rect) /* Bounding box for inversion */
{
region_type_t inv_reg; /* Quick and dirty region made from the
* bounding box */
@@ -2113,8 +2113,8 @@ find_box_for_y (box_type_t *begin, box_type_t *end, int y)
* that doesn't overlap the box at all and part_in is false)
*/
PIXMAN_EXPORT pixman_region_overlap_t
-PREFIX (_contains_rectangle) (region_type_t * region,
- box_type_t * prect)
+PREFIX (_contains_rectangle) (const region_type_t * region,
+ const box_type_t * prect)
{
box_type_t * pbox;
box_type_t * pbox_end;
@@ -2318,7 +2318,7 @@ PREFIX (_translate) (region_type_t *region, int x, int y)
}
PIXMAN_EXPORT void
-PREFIX (_reset) (region_type_t *region, box_type_t *box)
+PREFIX (_reset) (region_type_t *region, const box_type_t *box)
{
GOOD (region);
@@ -2343,7 +2343,7 @@ PREFIX (_clear) (region_type_t *region)
/* box is "return" value */
PIXMAN_EXPORT int
-PREFIX (_contains_point) (region_type_t * region,
+PREFIX (_contains_point) (const region_type_t * region,
int x, int y,
box_type_t * box)
{
@@ -2387,7 +2387,15 @@ PREFIX (_contains_point) (region_type_t * region,
}
PIXMAN_EXPORT int
-PREFIX (_not_empty) (region_type_t * region)
+PREFIX (_empty) (const region_type_t * region)
+{
+ GOOD (region);
+
+ return(PIXREGION_NIL (region));
+}
+
+PIXMAN_EXPORT int
+PREFIX (_not_empty) (const region_type_t * region)
{
GOOD (region);
@@ -2395,11 +2403,11 @@ PREFIX (_not_empty) (region_type_t * region)
}
PIXMAN_EXPORT box_type_t *
-PREFIX (_extents) (region_type_t * region)
+PREFIX (_extents) (const region_type_t * region)
{
GOOD (region);
- return(&region->extents);
+ return(box_type_t *)(&region->extents);
}
/*
diff --git a/pixman/pixman-region16.c b/pixman/pixman-region16.c
index d88d338..da4719e 100644
--- a/pixman/pixman-region16.c
+++ b/pixman/pixman-region16.c
@@ -23,7 +23,7 @@
* Author: Soren Sandmann <sandmann@redhat.com>
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#undef PIXMAN_DISABLE_DEPRECATED
diff --git a/pixman/pixman-region32.c b/pixman/pixman-region32.c
index abd6b1a..68b456b 100644
--- a/pixman/pixman-region32.c
+++ b/pixman/pixman-region32.c
@@ -23,7 +23,7 @@
* Author: Soren Sandmann <sandmann@redhat.com>
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/pixman/pixman-solid-fill.c b/pixman/pixman-solid-fill.c
index 5f9fef6..44f4de0 100644
--- a/pixman/pixman-solid-fill.c
+++ b/pixman/pixman-solid-fill.c
@@ -22,7 +22,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -30,10 +30,10 @@ static uint32_t
color_to_uint32 (const pixman_color_t *color)
{
return
- (color->alpha >> 8 << 24) |
- (color->red >> 8 << 16) |
- (color->green & 0xff00) |
- (color->blue >> 8);
+ ((unsigned int) color->alpha >> 8 << 24) |
+ ((unsigned int) color->red >> 8 << 16) |
+ ((unsigned int) color->green & 0xff00) |
+ ((unsigned int) color->blue >> 8);
}
static argb_t
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index a6e7808..9c9cff2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -27,7 +27,7 @@
* Based on work by Owen Taylor and Søren Sandmann
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
@@ -373,16 +373,6 @@ load_128_unaligned (const __m128i* src)
return _mm_loadu_si128 (src);
}
-/* save 4 pixels using Write Combining memory on a 16-byte
- * boundary aligned address
- */
-static force_inline void
-save_128_write_combining (__m128i* dst,
- __m128i data)
-{
- _mm_stream_si128 (dst, data);
-}
-
/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
@@ -391,14 +381,6 @@ save_128_aligned (__m128i* dst,
_mm_store_si128 (dst, data);
}
-/* save 4 pixels on a unaligned address */
-static force_inline void
-save_128_unaligned (__m128i* dst,
- __m128i data)
-{
- _mm_storeu_si128 (dst, data);
-}
-
static force_inline __m128i
load_32_1x128 (uint32_t data)
{
@@ -518,7 +500,8 @@ core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
- uint32_t s = *ps;
+ uint32_t s;
+ memcpy(&s, ps, sizeof(uint32_t));
if (pm)
{
@@ -3201,7 +3184,7 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
int32_t w;
- uint32_t m, d;
+ uint32_t d;
__m128i xmm_src, xmm_alpha, xmm_def;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
@@ -3256,7 +3239,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- m = *((uint32_t*)mask);
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
if (srca == 0xff && m == 0xffffffff)
{
@@ -3333,8 +3317,8 @@ sse2_fill (pixman_implementation_t *imp,
if (bpp == 8)
{
- uint8_t b;
- uint16_t w;
+ uint32_t b;
+ uint32_t w;
stride = stride * (int) sizeof (uint32_t) / 1;
byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
@@ -3476,7 +3460,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
int32_t w;
- uint32_t m;
__m128i xmm_src, xmm_def;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
@@ -3528,7 +3511,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- m = *((uint32_t*)mask);
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
if (srca == 0xff && m == 0xffffffff)
{
@@ -3594,7 +3578,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
int32_t w;
- uint32_t m;
__m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
__m128i xmm_src, xmm_alpha;
@@ -3626,7 +3609,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
while (w && (uintptr_t)dst & 15)
{
- m = *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -3646,11 +3629,13 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
while (w >= 8)
{
+ uint32_t m;
+
xmm_dst = load_128_aligned ((__m128i*) dst);
unpack_565_128_4x128 (xmm_dst,
&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- m = *((uint32_t*)mask);
+ memcpy(&m, mask, sizeof(uint32_t));
mask += 4;
if (m)
@@ -3670,7 +3655,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
&xmm_dst0, &xmm_dst1);
}
- m = *((uint32_t*)mask);
+ memcpy(&m, mask, sizeof(uint32_t));
mask += 4;
if (m)
@@ -3699,7 +3684,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
while (w)
{
- m = *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -4061,7 +4046,7 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
uint8_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
- uint32_t d, m;
+ uint32_t d;
uint32_t src;
int32_t w;
@@ -4088,7 +4073,7 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
while (w && ((uintptr_t)dst & 15))
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
d = (uint32_t) *dst;
*dst++ = (uint8_t) pack_1x128_32 (
@@ -4125,7 +4110,7 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
while (w)
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
d = (uint32_t) *dst;
*dst++ = (uint8_t) pack_1x128_32 (
@@ -4302,7 +4287,7 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
int dst_stride, mask_stride;
int32_t w;
uint32_t src;
- uint32_t m, d;
+ uint32_t d;
__m128i xmm_alpha;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
@@ -4327,7 +4312,7 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
while (w && ((uintptr_t)dst & 15))
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
d = (uint32_t) *dst;
*dst++ = (uint8_t) pack_1x128_32 (
@@ -4363,7 +4348,7 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
while (w)
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
d = (uint32_t) *dst;
*dst++ = (uint8_t) pack_1x128_32 (
@@ -4636,7 +4621,9 @@ sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- uint32_t m = *(uint32_t*)mask;
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
+
if (m)
{
__m128i xmm_mask_lo, xmm_mask_hi;
@@ -4743,7 +4730,7 @@ sse2_blt (pixman_implementation_t *imp,
while (w >= 2 && ((uintptr_t)d & 3))
{
- *(uint16_t *)d = *(uint16_t *)s;
+ memmove(d, s, 2);
w -= 2;
s += 2;
d += 2;
@@ -4751,7 +4738,7 @@ sse2_blt (pixman_implementation_t *imp,
while (w >= 4 && ((uintptr_t)d & 15))
{
- *(uint32_t *)d = *(uint32_t *)s;
+ memmove(d, s, 4);
w -= 4;
s += 4;
@@ -4788,7 +4775,7 @@ sse2_blt (pixman_implementation_t *imp,
while (w >= 4)
{
- *(uint32_t *)d = *(uint32_t *)s;
+ memmove(d, s, 4);
w -= 4;
s += 4;
@@ -4797,7 +4784,7 @@ sse2_blt (pixman_implementation_t *imp,
if (w >= 2)
{
- *(uint16_t *)d = *(uint16_t *)s;
+ memmove(d, s, 2);
w -= 2;
s += 2;
d += 2;
@@ -4829,7 +4816,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
uint32_t *src, *src_line, s;
uint32_t *dst, *dst_line, d;
uint8_t *mask, *mask_line;
- uint32_t m;
int src_stride, mask_stride, dst_stride;
int32_t w;
__m128i ms;
@@ -4858,8 +4844,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
while (w && (uintptr_t)dst & 15)
{
+ uint8_t m = *mask++;
s = 0xff000000 | *src++;
- m = (uint32_t) *mask++;
d = *dst;
ms = unpack_32_1x128 (s);
@@ -4877,7 +4863,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- m = *(uint32_t*) mask;
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
xmm_src = _mm_or_si128 (
load_128_unaligned ((__m128i*)src), mask_ff000000);
@@ -4913,7 +4900,7 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
while (w)
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -4954,7 +4941,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
uint32_t *src, *src_line, s;
uint32_t *dst, *dst_line, d;
uint8_t *mask, *mask_line;
- uint32_t m;
int src_stride, mask_stride, dst_stride;
int32_t w;
@@ -4983,9 +4969,9 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
while (w && (uintptr_t)dst & 15)
{
uint32_t sa;
+ uint8_t m = *mask++;
s = *src++;
- m = (uint32_t) *mask++;
d = *dst;
sa = s >> 24;
@@ -5016,7 +5002,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- m = *(uint32_t *) mask;
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
if (m)
{
@@ -5055,9 +5042,9 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
while (w)
{
uint32_t sa;
+ uint8_t m = *mask++;
s = *src++;
- m = (uint32_t) *mask++;
d = *dst;
sa = s >> 24;
@@ -5924,13 +5911,11 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
uint32_t pix1, pix2;
- uint32_t m;
while (w && ((uintptr_t)dst & 15))
{
uint32_t sa;
-
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -5966,11 +5951,13 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
while (w >= 4)
{
+ uint32_t m;
+
__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- m = *(uint32_t*)mask;
+ memcpy(&m, mask, sizeof(uint32_t));
if (m)
{
@@ -6012,8 +5999,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
while (w)
{
uint32_t sa;
-
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -6274,31 +6260,15 @@ static const pixman_fast_path_t sse2_fast_paths[] =
PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
@@ -6426,7 +6396,7 @@ sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
while (w && (((uintptr_t)dst) & 15))
{
- *dst++ = *(src++) << 24;
+ *dst++ = (uint32_t)(*(src++)) << 24;
w--;
}
@@ -6453,7 +6423,7 @@ sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
while (w)
{
- *dst++ = *(src++) << 24;
+ *dst++ = (uint32_t)(*(src++)) << 24;
w--;
}
diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c
index 680d6b9..0359895 100644
--- a/pixman/pixman-ssse3.c
+++ b/pixman/pixman-ssse3.c
@@ -24,7 +24,7 @@
* Author: Soren Sandmann (soren.sandmann@gmail.com)
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
diff --git a/pixman/pixman-timer.c b/pixman/pixman-timer.c
index f5ae18e..656d900 100644
--- a/pixman/pixman-timer.c
+++ b/pixman/pixman-timer.c
@@ -20,7 +20,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
diff --git a/pixman/pixman-trap.c b/pixman/pixman-trap.c
index 91766fd..0ec73dc 100644
--- a/pixman/pixman-trap.c
+++ b/pixman/pixman-trap.c
@@ -22,7 +22,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdio.h>
@@ -74,7 +74,7 @@ pixman_sample_floor_y (pixman_fixed_t y,
if (f < Y_FRAC_FIRST (n))
{
- if (pixman_fixed_to_int (i) == 0x8000)
+ if (pixman_fixed_to_int (i) == 0xffff8000)
{
f = 0; /* saturate */
}
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index 4a3a835..302cd0c 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -23,7 +23,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdio.h>
#include <stdlib.h>
@@ -206,7 +206,7 @@ pixman_contract_from_float (uint32_t *dst,
for (i = 0; i < width; ++i)
{
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
a = float_to_unorm (src[i].a, 8);
r = float_to_unorm (src[i].r, 8);
@@ -238,7 +238,7 @@ _pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *in
pixman_bool_t
pixman_region16_copy_from_region32 (pixman_region16_t *dst,
- pixman_region32_t *src)
+ const pixman_region32_t *src)
{
int n_boxes, i;
pixman_box32_t *boxes32;
@@ -268,7 +268,7 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst,
pixman_bool_t
pixman_region32_copy_from_region16 (pixman_region32_t *dst,
- pixman_region16_t *src)
+ const pixman_region16_t *src)
{
int n_boxes, i;
pixman_box16_t *boxes16;
diff --git a/pixman/pixman-version.h.in b/pixman/pixman-version.h.in
index 256b2e6..64778a5 100644
--- a/pixman/pixman-version.h.in
+++ b/pixman/pixman-version.h.in
@@ -47,4 +47,8 @@
PIXMAN_VERSION_MINOR, \
PIXMAN_VERSION_MICRO)
+#ifndef PIXMAN_API
+# define PIXMAN_API
+#endif
+
#endif /* PIXMAN_VERSION_H__ */
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index c33631c..d4b5dc8 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -26,21 +26,45 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
+#include "pixman-inlines.h"
#include <altivec.h>
#define AVV(x...) {x}
+static vector unsigned int mask_ff000000;
+static vector unsigned int mask_red;
+static vector unsigned int mask_green;
+static vector unsigned int mask_blue;
+static vector unsigned int mask_565_fix_rb;
+static vector unsigned int mask_565_fix_g;
+
static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
+#ifdef WORDS_BIGENDIAN
return vec_perm (pix, pix,
(vector unsigned char)AVV (
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
+#else
+ return vec_perm (pix, pix,
+ (vector unsigned char)AVV (
+ 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07,
+ 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F));
+#endif
+}
+
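
The two permutation patterns differ because in an a8r8g8b8 pixel the alpha byte sits at offset 0 of each 32-bit lane on big-endian but at offset 3 on little-endian. A scalar equivalent of splat_alpha, endian-independent because C shifts operate on values rather than bytes:

    #include <stdint.h>

    static uint32_t
    splat_alpha_scalar (uint32_t pix)
    {
        uint32_t a = pix >> 24;               /* alpha of a8r8g8b8 */
        return (a << 24) | (a << 16) | (a << 8) | a;
    }
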
+static force_inline vector unsigned int
+splat_pixel (vector unsigned int pix)
+{
+ return vec_perm (pix, pix,
+ (vector unsigned char)AVV (
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03));
}
static force_inline vector unsigned int
@@ -50,12 +74,22 @@ pix_multiply (vector unsigned int p, vector unsigned int a)
/* unpack to short */
hi = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergeh ((vector unsigned char)AVV (0),
(vector unsigned char)p);
+#else
+ vec_mergeh ((vector unsigned char) p,
+ (vector unsigned char) AVV (0));
+#endif
mod = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergeh ((vector unsigned char)AVV (0),
(vector unsigned char)a);
+#else
+ vec_mergeh ((vector unsigned char) a,
+ (vector unsigned char) AVV (0));
+#endif
hi = vec_mladd (hi, mod, (vector unsigned short)
AVV (0x0080, 0x0080, 0x0080, 0x0080,
@@ -67,11 +101,22 @@ pix_multiply (vector unsigned int p, vector unsigned int a)
/* unpack to short */
lo = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergel ((vector unsigned char)AVV (0),
(vector unsigned char)p);
+#else
+ vec_mergel ((vector unsigned char) p,
+ (vector unsigned char) AVV (0));
+#endif
+
mod = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergel ((vector unsigned char)AVV (0),
(vector unsigned char)a);
+#else
+ vec_mergel ((vector unsigned char) a,
+ (vector unsigned char) AVV (0));
+#endif
lo = vec_mladd (lo, mod, (vector unsigned short)
AVV (0x0080, 0x0080, 0x0080, 0x0080,
@@ -129,6 +174,7 @@ over (vector unsigned int src,
over (pix_multiply (src, mask), \
pix_multiply (srca, mask), dest)
+#ifdef WORDS_BIGENDIAN
#define COMPUTE_SHIFT_MASK(source) \
source ## _mask = vec_lvsl (0, source);
@@ -140,36 +186,294 @@ over (vector unsigned int src,
mask ## _mask = vec_lvsl (0, mask); \
source ## _mask = vec_lvsl (0, source);
-/* notice you have to declare temp vars...
- * Note: tmp3 and tmp4 must remain untouched!
- */
-
-#define LOAD_VECTORS(dest, source) \
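+/* LOAD_VECTOR now scopes its tmp1/tmp2 temporaries inside a
+ * do { } while (0) block, so callers no longer have to declare them. */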
+#define LOAD_VECTOR(source) \
+do \
+{ \
+ vector unsigned char tmp1, tmp2; \
tmp1 = (typeof(tmp1))vec_ld (0, source); \
tmp2 = (typeof(tmp2))vec_ld (15, source); \
- v ## source = (typeof(v ## source)) \
+ v ## source = (typeof(v ## source)) \
vec_perm (tmp1, tmp2, source ## _mask); \
- v ## dest = (typeof(v ## dest))vec_ld (0, dest);
+} while (0)
-#define LOAD_VECTORSC(dest, source, mask) \
- tmp1 = (typeof(tmp1))vec_ld (0, source); \
- tmp2 = (typeof(tmp2))vec_ld (15, source); \
- v ## source = (typeof(v ## source)) \
- vec_perm (tmp1, tmp2, source ## _mask); \
- tmp1 = (typeof(tmp1))vec_ld (0, mask); \
+#define LOAD_VECTORS(dest, source) \
+do \
+{ \
+ LOAD_VECTOR(source); \
v ## dest = (typeof(v ## dest))vec_ld (0, dest); \
- tmp2 = (typeof(tmp2))vec_ld (15, mask); \
- v ## mask = (typeof(v ## mask)) \
- vec_perm (tmp1, tmp2, mask ## _mask);
+} while (0)
+
+#define LOAD_VECTORSC(dest, source, mask) \
+do \
+{ \
+ LOAD_VECTORS(dest, source); \
+ LOAD_VECTOR(mask); \
+} while (0)
+
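+/* Callers declare the permute-mask temporaries through these macros;
+ * the little-endian variants below expand to nothing. */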
+#define DECLARE_SRC_MASK_VAR vector unsigned char src_mask
+#define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask
+
+#else
+
+/* The COMPUTE_SHIFT_{MASK, MASKS, MASKC} macros below are no-ops.
+ * Little-endian AltiVec can do unaligned reads natively, so there is
+ * no need to construct the permutation pattern variables.
+ */
+#define COMPUTE_SHIFT_MASK(source)
+
+#define COMPUTE_SHIFT_MASKS(dest, source)
+
+#define COMPUTE_SHIFT_MASKC(dest, source, mask)
+
+# define LOAD_VECTOR(source) \
+ v ## source = (typeof(v ## source))vec_xl(0, source);
+
+# define LOAD_VECTORS(dest, source) \
+ LOAD_VECTOR(source); \
+ LOAD_VECTOR(dest);
+
+# define LOAD_VECTORSC(dest, source, mask) \
+ LOAD_VECTORS(dest, source); \
+ LOAD_VECTOR(mask);
+
+#define DECLARE_SRC_MASK_VAR
+#define DECLARE_MASK_MASK_VAR
+
+#endif /* WORDS_BIGENDIAN */
#define LOAD_VECTORSM(dest, source, mask) \
- LOAD_VECTORSC (dest, source, mask) \
+ LOAD_VECTORSC (dest, source, mask); \
v ## source = pix_multiply (v ## source, \
splat_alpha (v ## mask));
#define STORE_VECTOR(dest) \
vec_st ((vector unsigned int) v ## dest, 0, dest);
+/* load 4 pixels from a 16-byte aligned address */
+static force_inline vector unsigned int
+load_128_aligned (const uint32_t* src)
+{
+ return *((const vector unsigned int *) src);
+}
+
+/* load 4 pixels from an unaligned address */
+static force_inline vector unsigned int
+load_128_unaligned (const uint32_t* src)
+{
+ vector unsigned int vsrc;
+ DECLARE_SRC_MASK_VAR;
+
+ COMPUTE_SHIFT_MASK (src);
+ LOAD_VECTOR (src);
+
+ return vsrc;
+}
+
+/* store 4 pixels to a 16-byte aligned address */
+static force_inline void
+save_128_aligned (uint32_t* data,
+ vector unsigned int vdata)
+{
+ STORE_VECTOR(data)
+}
+
+static force_inline vector unsigned int
+create_mask_32_128 (uint32_t mask)
+{
+ return (vector unsigned int) {mask, mask, mask, mask};
+}
+
+static force_inline vector unsigned int
+unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned char lo;
+
+ /* unpack to short */
+ lo = (vector unsigned char)
+#ifdef WORDS_BIGENDIAN
+ vec_mergel ((vector unsigned char) data2,
+ (vector unsigned char) data1);
+#else
+ vec_mergel ((vector unsigned char) data1,
+ (vector unsigned char) data2);
+#endif
+
+ return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned char hi;
+
+ /* unpack to short */
+ hi = (vector unsigned char)
+#ifdef WORDS_BIGENDIAN
+ vec_mergeh ((vector unsigned char) data2,
+ (vector unsigned char) data1);
+#else
+ vec_mergeh ((vector unsigned char) data1,
+ (vector unsigned char) data2);
+#endif
+
+ return (vector unsigned int) hi;
+}
+
+static force_inline vector unsigned int
+unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned short lo;
+
+ /* unpack to int */
+ lo = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+ vec_mergel ((vector unsigned short) data2,
+ (vector unsigned short) data1);
+#else
+ vec_mergel ((vector unsigned short) data1,
+ (vector unsigned short) data2);
+#endif
+
+ return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned short hi;
+
+ /* unpack to int */
+ hi = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+ vec_mergeh ((vector unsigned short) data2,
+ (vector unsigned short) data1);
+#else
+ vec_mergeh ((vector unsigned short) data1,
+ (vector unsigned short) data2);
+#endif
+
+ return (vector unsigned int) hi;
+}
+
+static force_inline void
+unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2,
+ vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+ *data_lo = unpacklo_128_16x8(data1, data2);
+ *data_hi = unpackhi_128_16x8(data1, data2);
+}
+
+static force_inline void
+unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2,
+ vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+ *data_lo = unpacklo_128_8x16(data1, data2);
+ *data_hi = unpackhi_128_8x16(data1, data2);
+}
+
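+/* Expand r5g6b5 to x8r8g8b8: shift each field to the top of its 8-bit
+ * slot, then replicate the field's high bits into the freed low bits so
+ * that 0x1f/0x3f widen to 0xff (5-bit red r4..r0 becomes r4..r0 r4 r3 r2). */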
+static force_inline vector unsigned int
+unpack_565_to_8888 (vector unsigned int lo)
+{
+ vector unsigned int r, g, b, rb, t;
+
+ r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red);
+ g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green);
+ b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue);
+
+ rb = vec_or (r, b);
+ t = vec_and (rb, mask_565_fix_rb);
+ t = vec_sr (t, create_mask_32_128(5));
+ rb = vec_or (rb, t);
+
+ t = vec_and (g, mask_565_fix_g);
+ t = vec_sr (t, create_mask_32_128(6));
+ g = vec_or (g, t);
+
+ return vec_or (rb, g);
+}
+
+static force_inline int
+is_opaque (vector unsigned int x)
+{
+ /* vec_all_eq () yields a single 0/1 predicate rather than the
+ * per-byte bitmask of the SSE2 movemask idiom, so test the alpha
+ * byte of every pixel directly. */
+ return vec_all_eq (vec_and (x, mask_ff000000), mask_ff000000);
+}
+
+static force_inline int
+is_zero (vector unsigned int x)
+{
+ return vec_all_eq (x, (vector unsigned int) AVV (0));
+}
+
+static force_inline int
+is_transparent (vector unsigned int x)
+{
+ /* Transparent: every alpha byte is zero; the color channels may
+ * hold anything. */
+ return vec_all_eq (vec_and (x, mask_ff000000),
+ (vector unsigned int) AVV (0));
+}
+
+static force_inline uint32_t
+core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
+{
+ uint32_t a;
+
+ a = ALPHA_8(src);
+
+ if (a == 0xff)
+ {
+ return src;
+ }
+ else if (src)
+ {
+ UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src);
+ }
+
+ return dst;
+}
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+ uint32_t s = *ps;
+
+ if (pm)
+ UN8x4_MUL_UN8(s, ALPHA_8(*pm));
+
+ return s;
+}
+
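+/* Vector counterpart of combine1 (): load four source pixels and, when
+ * a mask is present, scale them by the mask's alpha; a fully
+ * transparent mask short-circuits to zero. */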
+static force_inline vector unsigned int
+combine4 (const uint32_t* ps, const uint32_t* pm)
+{
+ vector unsigned int src, msk;
+
+ if (pm)
+ {
+ msk = load_128_unaligned(pm);
+
+ if (is_transparent(msk))
+ return (vector unsigned int) AVV(0);
+ }
+
+ src = load_128_unaligned(ps);
+
+ if (pm)
+ {
+ /* Match combine1 (): only the mask's alpha scales the source. */
+ src = pix_multiply (src, splat_alpha (msk));
+ }
+
+ return src;
+}
+
static void
vmx_combine_over_u_no_mask (uint32_t * dest,
const uint32_t *src,
@@ -177,7 +481,7 @@ vmx_combine_over_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -227,7 +531,8 @@ vmx_combine_over_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -298,7 +603,7 @@ vmx_combine_over_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -346,7 +651,8 @@ vmx_combine_over_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -414,7 +720,7 @@ vmx_combine_in_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -459,7 +765,8 @@ vmx_combine_in_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -524,7 +831,7 @@ vmx_combine_in_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -571,7 +878,8 @@ vmx_combine_in_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -638,7 +946,7 @@ vmx_combine_out_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -685,7 +993,8 @@ vmx_combine_out_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -750,7 +1059,7 @@ vmx_combine_out_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -798,7 +1107,8 @@ vmx_combine_out_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -865,7 +1175,7 @@ vmx_combine_atop_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -917,7 +1227,8 @@ vmx_combine_atop_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -993,7 +1304,7 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1045,7 +1356,8 @@ vmx_combine_atop_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1121,7 +1433,7 @@ vmx_combine_xor_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1173,7 +1485,8 @@ vmx_combine_xor_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1249,7 +1562,7 @@ vmx_combine_add_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1295,7 +1608,8 @@ vmx_combine_add_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1363,7 +1677,8 @@ vmx_combine_src_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1413,7 +1728,8 @@ vmx_combine_over_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1471,7 +1787,8 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1527,7 +1844,8 @@ vmx_combine_in_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1581,7 +1899,8 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1636,7 +1955,8 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1693,7 +2013,8 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1750,7 +2071,8 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask, vsrca;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1816,7 +2138,8 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1879,7 +2202,8 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1942,7 +2266,8 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1986,16 +2311,803 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
}
}
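+/* Solid source OVER an a8 mask: scalar pixels until dst reaches 16-byte
+ * alignment, four pixels per vector iteration, then a scalar tail. */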
+static void
+vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m, d, s, ia;
+
+ vector unsigned int vsrc, valpha, vmask, vdst;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ srca = ALPHA_8(src);
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ vsrc = (vector unsigned int) {src, src, src, src};
+ valpha = splat_alpha(vsrc);
+
+ while (height--)
+ {
+ const uint8_t *pm = mask_line;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (uintptr_t)dst & 15)
+ {
+ s = src;
+ m = *pm++;
+
+ if (m)
+ {
+ d = *dst;
+ UN8x4_MUL_UN8 (s, m);
+ ia = ALPHA_8 (~s);
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+ *dst = d;
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ m = *((uint32_t*)pm);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned(dst, vsrc);
+ }
+ else if (m)
+ {
+ vmask = splat_pixel((vector unsigned int) {m, m, m, m});
+
+ /* dst is 16-byte aligned */
+ vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst));
+
+ save_128_aligned(dst, vdst);
+ }
+
+ w -= 4;
+ dst += 4;
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = src;
+ m = *pm++;
+
+ if (m)
+ {
+ d = *dst;
+ UN8x4_MUL_UN8 (s, m);
+ ia = ALPHA_8 (~s);
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+ *dst = d;
+ }
+
+ w--;
+ dst++;
+ }
+ }
+
+}
+
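+/* Memset-style fill: 8- and 16-bit fillers are replicated up to 32 bits,
+ * and once dst is 16-byte aligned the main loop stores up to 128 bytes
+ * per iteration. */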
+static pixman_bool_t
+vmx_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t filler)
+{
+ uint32_t byte_width;
+ uint8_t *byte_line;
+
+ vector unsigned int vfiller;
+
+ if (bpp == 8)
+ {
+ uint8_t b;
+ uint16_t w;
+
+ stride = stride * (int) sizeof (uint32_t) / 1;
+ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+ byte_width = width;
+ stride *= 1;
+
+ b = filler & 0xff;
+ w = (b << 8) | b;
+ filler = (w << 16) | w;
+ }
+ else if (bpp == 16)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 2;
+ byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+ byte_width = 2 * width;
+ stride *= 2;
+
+ filler = (filler & 0xffff) * 0x00010001;
+ }
+ else if (bpp == 32)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 4;
+ byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+ byte_width = 4 * width;
+ stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
+
+ vfiller = create_mask_32_128(filler);
+
+ while (height--)
+ {
+ int w;
+ uint8_t *d = byte_line;
+ byte_line += stride;
+ w = byte_width;
+
+ if (w >= 1 && ((uintptr_t)d & 1))
+ {
+ *(uint8_t *)d = filler;
+ w -= 1;
+ d += 1;
+ }
+
+ while (w >= 2 && ((uintptr_t)d & 3))
+ {
+ *(uint16_t *)d = filler;
+ w -= 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((uintptr_t)d & 15))
+ {
+ *(uint32_t *)d = filler;
+
+ w -= 4;
+ d += 4;
+ }
+
+ while (w >= 128)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+ vec_st(vfiller, 0, (uint32_t *) d + 8);
+ vec_st(vfiller, 0, (uint32_t *) d + 12);
+ vec_st(vfiller, 0, (uint32_t *) d + 16);
+ vec_st(vfiller, 0, (uint32_t *) d + 20);
+ vec_st(vfiller, 0, (uint32_t *) d + 24);
+ vec_st(vfiller, 0, (uint32_t *) d + 28);
+
+ d += 128;
+ w -= 128;
+ }
+
+ if (w >= 64)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+ vec_st(vfiller, 0, (uint32_t *) d + 8);
+ vec_st(vfiller, 0, (uint32_t *) d + 12);
+
+ d += 64;
+ w -= 64;
+ }
+
+ if (w >= 32)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+
+ d += 32;
+ w -= 32;
+ }
+
+ if (w >= 16)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+
+ d += 16;
+ w -= 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = filler;
+
+ w -= 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = filler;
+ w -= 2;
+ d += 2;
+ }
+
+ if (w >= 1)
+ {
+ *(uint8_t *)d = filler;
+ w -= 1;
+ d += 1;
+ }
+ }
+
+ return TRUE;
+}
+
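+/* x888 -> 8888 copy: OR 0xff000000 into each pixel to force an opaque
+ * alpha byte, 16 pixels per vector iteration. */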
+static void
+vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int32_t w;
+ int dst_stride, src_stride;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (uintptr_t)dst & 15)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4;
+
+ vmx_src1 = load_128_unaligned (src);
+ vmx_src2 = load_128_unaligned (src + 4);
+ vmx_src3 = load_128_unaligned (src + 8);
+ vmx_src4 = load_128_unaligned (src + 12);
+
+ save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
+ save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
+ save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
+ save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+ }
+}
+
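+/* Solid OVER: dest = src + dest * (1 - src.alpha); the negated alpha
+ * vector is computed once outside the loops. */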
+static void
+vmx_composite_over_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t src, ia;
+ int i, w, dst_stride;
+ vector unsigned int vdst, vsrc, via;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ vsrc = (vector unsigned int){src, src, src, src};
+ via = negate (splat_alpha (vsrc));
+ ia = ALPHA_8 (~src);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((uintptr_t)dst & 15))
+ {
+ uint32_t d = *dst;
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
+ *dst++ = d;
+ w--;
+ }
+
+ for (i = w / 4; i > 0; i--)
+ {
+ vdst = pix_multiply (load_128_aligned (dst), via);
+ save_128_aligned (dst, pix_add (vsrc, vdst));
+ dst += 4;
+ }
+
+ for (i = w % 4; --i >= 0;)
+ {
+ uint32_t d = dst[i];
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
+ dst[i] = d;
+ }
+ }
+}
+
+static void
+vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ int dst_stride, src_stride;
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ dst = dst_line;
+ src = src_line;
+
+ while (height--)
+ {
+ vmx_combine_over_u (imp, op, dst, src, NULL, width);
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
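+/* Component-alpha OVER with a solid source: each mask channel scales
+ * the matching source channel (see in_over). */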
+static void
+vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src, ia;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ vector unsigned int vsrc, valpha, vmask, vdest;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ vsrc = (vector unsigned int) {src, src, src, src};
+ valpha = splat_alpha(vsrc);
+ ia = ALPHA_8 (src);
+
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+ uint32_t s;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (uintptr_t)pd & 15)
+ {
+ s = src;
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ UN8x4_MUL_UN8x4 (s, m);
+ UN8x4_MUL_UN8 (m, ia);
+ m = ~m;
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
+ *pd = d;
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ /* pm is NOT necessarily 16-byte aligned */
+ vmask = load_128_unaligned (pm);
+
+ pack_cmp = vec_all_eq(vmask, (vector unsigned int) AVV(0));
+
+ /* vec_all_eq () is nonzero only when the whole mask vector is
+ * zero, so pack_cmp == 0 means at least one mask component
+ * is set. */
+ if (pack_cmp == 0)
+ {
+ /* pd is 16-byte aligned */
+ vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd));
+
+ save_128_aligned(pd, vdest);
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = src;
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ UN8x4_MUL_UN8x4 (s, m);
+ UN8x4_MUL_UN8 (m, ia);
+ m = ~m;
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
+ *pd = d;
+ }
+
+ pd++;
+ w--;
+ }
+ }
+}
+
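+/* a8 + a8 saturating add: the scalar head/tail computes
+ * t | (0 - (t >> 8)), which ORs in 0xff whenever the 16-bit sum
+ * overflows a byte; the aligned middle goes through vmx_combine_add_u
+ * four bytes at a time. */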
+static void
+vmx_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Small head */
+ while (w && (uintptr_t)dst & 3)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+
+ vmx_combine_add_u (imp, op,
+ (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+ /* Small tail */
+ dst += w & 0xfffc;
+ src += w & 0xfffc;
+
+ w &= 3;
+
+ while (w)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+ }
+}
+
+static void
+vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+
+ vmx_combine_add_u (imp, op, dst, src, NULL, width);
+ }
+}
+
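+/* Nearest-neighbour scaled OVER scanline: vx steps through the source
+ * in fixed-point units and is wrapped by src_width_fixed so it keeps
+ * addressing the repeated source; aligned runs of four destination
+ * pixels take the vector over () path. */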
+static force_inline void
+scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
+ const uint32_t* ps,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t src_width_fixed,
+ pixman_bool_t fully_transparent_src)
+{
+ uint32_t s, d;
+ const uint32_t* pm = NULL;
+
+ vector unsigned int vsrc, vdst;
+
+ if (fully_transparent_src)
+ return;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((uintptr_t)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ *pd++ = core_combine_over_u_pixel_vmx (s, d);
+ if (pm)
+ pm++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ uint32_t tmp[4];
+
+ tmp[0] = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp[1] = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp[2] = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp[3] = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ vsrc = combine4 (tmp, pm);
+
+ if (is_opaque (vsrc))
+ {
+ save_128_aligned (pd, vsrc);
+ }
+ else if (!is_zero (vsrc))
+ {
+ vdst = over(vsrc, splat_alpha(vsrc), load_128_aligned (pd));
+
+ save_128_aligned (pd, vdst);
+ }
+
+ w -= 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ *pd++ = core_combine_over_u_pixel_vmx (s, d);
+ if (pm)
+ pm++;
+
+ w--;
+ }
+}
+
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, NORMAL)
+
static const pixman_fast_path_t vmx_fast_paths[] =
{
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, vmx_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, vmx_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+
+ /* PIXMAN_OP_ADD */
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
+
+ /* PIXMAN_OP_SRC */
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
+
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),
+
{ PIXMAN_OP_NONE },
};
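+/* Iterator fetchers: each call converts one scanline into the narrow
+ * a8r8g8b8 format. */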
+static uint32_t *
+vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+ int w = iter->width;
+ vector unsigned int ff000000 = mask_ff000000;
+ uint32_t *dst = iter->buffer;
+ uint32_t *src = (uint32_t *)iter->bits;
+
+ iter->bits += iter->stride;
+
+ while (w && ((uintptr_t)dst) & 0x0f)
+ {
+ *dst++ = (*src++) | 0xff000000;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000));
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ *dst++ = (*src++) | 0xff000000;
+ w--;
+ }
+
+ return iter->buffer;
+}
+
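+/* a8 fetch: widen 16 mask bytes to 16 a8r8g8b8 pixels with the alpha in
+ * the top byte, using two rounds of zero-interleaving unpacks. */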
+static uint32_t *
+vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+ int w = iter->width;
+ uint32_t *dst = iter->buffer;
+ uint8_t *src = iter->bits;
+ vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
+
+ iter->bits += iter->stride;
+
+ while (w && (((uintptr_t)dst) & 15))
+ {
+ *dst++ = *(src++) << 24;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ vmx0 = load_128_unaligned((uint32_t *) src);
+
+ unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
+ unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
+ unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);
+
+ save_128_aligned(dst, vmx6);
+ save_128_aligned((dst + 4), vmx5);
+ save_128_aligned((dst + 8), vmx4);
+ save_128_aligned((dst + 12), vmx3);
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *(src++) << 24;
+ w--;
+ }
+
+ return iter->buffer;
+}
+
+#define IMAGE_FLAGS \
+ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
+ FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t vmx_iters[] =
+{
+ { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+ _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
+ },
+ { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+ _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
+ },
+ { PIXMAN_null },
+};
+
pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
+ /* VMX constants */
+ mask_ff000000 = create_mask_32_128 (0xff000000);
+ mask_red = create_mask_32_128 (0x00f80000);
+ mask_green = create_mask_32_128 (0x0000fc00);
+ mask_blue = create_mask_32_128 (0x000000f8);
+ mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
+ mask_565_fix_g = create_mask_32_128 (0x0000c000);
+
/* Set up function pointers */
imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
@@ -2022,5 +3134,9 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
+ imp->fill = vmx_fill;
+
+ imp->iter_info = vmx_iters;
+
return imp;
}
diff --git a/pixman/pixman-x86.c b/pixman/pixman-x86.c
index 05297c4..2f688eb 100644
--- a/pixman/pixman-x86.c
+++ b/pixman/pixman-x86.c
@@ -20,7 +20,7 @@
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -74,69 +74,17 @@ detect_cpu_features (void)
#else
-#define _PIXMAN_X86_64 \
- (defined(__amd64__) || defined(__x86_64__) || defined(_M_AMD64))
-
-static pixman_bool_t
-have_cpuid (void)
-{
-#if _PIXMAN_X86_64 || defined (_MSC_VER)
-
- return TRUE;
-
-#elif defined (__GNUC__)
- uint32_t result;
-
- __asm__ volatile (
- "pushf" "\n\t"
- "pop %%eax" "\n\t"
- "mov %%eax, %%ecx" "\n\t"
- "xor $0x00200000, %%eax" "\n\t"
- "push %%eax" "\n\t"
- "popf" "\n\t"
- "pushf" "\n\t"
- "pop %%eax" "\n\t"
- "xor %%ecx, %%eax" "\n\t"
- "mov %%eax, %0" "\n\t"
- : "=r" (result)
- :
- : "%eax", "%ecx");
-
- return !!result;
-
-#else
-#error "Unknown compiler"
+#if defined (__GNUC__)
+#include <cpuid.h>
#endif
-}
static void
pixman_cpuid (uint32_t feature,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
#if defined (__GNUC__)
-
-#if _PIXMAN_X86_64
- __asm__ volatile (
- "cpuid" "\n\t"
- : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
- : "a" (feature));
-#else
- /* On x86-32 we need to be careful about the handling of %ebx
- * and %esp. We can't declare either one as clobbered
- * since they are special registers (%ebx is the "PIC
- * register" holding an offset to global data, %esp the
- * stack pointer), so we need to make sure that %ebx is
- * preserved, and that %esp has its original value when
- * accessing the output operands.
- */
- __asm__ volatile (
- "xchg %%ebx, %1" "\n\t"
- "cpuid" "\n\t"
- "xchg %%ebx, %1" "\n\t"
- : "=a" (*a), "=r" (*b), "=c" (*c), "=d" (*d)
- : "a" (feature));
-#endif
-
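+ /* __get_cpuid () returns 0 and leaves the outputs untouched when
+ * the leaf is unsupported, so clear them first. */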
+ *a = *b = *c = *d = 0;
+ __get_cpuid(feature, a, b, c, d);
#elif defined (_MSC_VER)
int info[4];
@@ -157,9 +105,6 @@ detect_cpu_features (void)
uint32_t a, b, c, d;
cpu_features_t features = 0;
- if (!have_cpuid())
- return features;
-
/* Get feature bits */
pixman_cpuid (0x01, &a, &b, &c, &d);
if (d & (1 << 15))
@@ -187,6 +132,7 @@ detect_cpu_features (void)
memcpy (vendor + 8, &c, 4);
if (strcmp (vendor, "AuthenticAMD") == 0 ||
+ strcmp (vendor, "HygonGenuine") == 0 ||
strcmp (vendor, "Geode by NSC") == 0)
{
pixman_cpuid (0x80000000, &a, &b, &c, &d);
diff --git a/pixman/pixman.c b/pixman/pixman.c
index 9555cea..82ec236 100644
--- a/pixman/pixman.c
+++ b/pixman/pixman.c
@@ -24,7 +24,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -182,7 +182,7 @@ clip_general_image (pixman_region32_t * region,
return FALSE;
}
}
- else if (!pixman_region32_not_empty (clip))
+ else if (pixman_region32_empty (clip))
{
return FALSE;
}
@@ -277,7 +277,7 @@ _pixman_compute_composite_region32 (pixman_region32_t * region,
{
return FALSE;
}
- if (!pixman_region32_not_empty (region))
+ if (pixman_region32_empty (region))
return FALSE;
if (dest_image->common.alpha_map->common.have_clip_region)
{
@@ -325,18 +325,20 @@ _pixman_compute_composite_region32 (pixman_region32_t * region,
return TRUE;
}
-typedef struct
+typedef struct box_48_16 box_48_16_t;
+
+struct box_48_16
{
- pixman_fixed_48_16_t x1;
- pixman_fixed_48_16_t y1;
- pixman_fixed_48_16_t x2;
- pixman_fixed_48_16_t y2;
-} box_48_16_t;
+ pixman_fixed_48_16_t x1;
+ pixman_fixed_48_16_t y1;
+ pixman_fixed_48_16_t x2;
+ pixman_fixed_48_16_t y2;
+};
static pixman_bool_t
-compute_transformed_extents (pixman_transform_t *transform,
+compute_transformed_extents (pixman_transform_t *transform,
const pixman_box32_t *extents,
- box_48_16_t *transformed)
+ box_48_16_t *transformed)
{
pixman_fixed_48_16_t tx1, ty1, tx2, ty2;
pixman_fixed_t x1, y1, x2, y2;
@@ -495,21 +497,12 @@ analyze_extent (pixman_image_t *image,
if (!compute_transformed_extents (transform, extents, &transformed))
return FALSE;
- /* Expand the source area by a tiny bit so account of different rounding that
- * may happen during sampling. Note that (8 * pixman_fixed_e) is very far from
- * 0.5 so this won't cause the area computed to be overly pessimistic.
- */
- transformed.x1 -= 8 * pixman_fixed_e;
- transformed.y1 -= 8 * pixman_fixed_e;
- transformed.x2 += 8 * pixman_fixed_e;
- transformed.y2 += 8 * pixman_fixed_e;
-
if (image->common.type == BITS)
{
- if (pixman_fixed_to_int (transformed.x1) >= 0 &&
- pixman_fixed_to_int (transformed.y1) >= 0 &&
- pixman_fixed_to_int (transformed.x2) < image->bits.width &&
- pixman_fixed_to_int (transformed.y2) < image->bits.height)
+ if (pixman_fixed_to_int (transformed.x1 - pixman_fixed_e) >= 0 &&
+ pixman_fixed_to_int (transformed.y1 - pixman_fixed_e) >= 0 &&
+ pixman_fixed_to_int (transformed.x2 - pixman_fixed_e) < image->bits.width &&
+ pixman_fixed_to_int (transformed.y2 - pixman_fixed_e) < image->bits.height)
{
*flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
}
@@ -784,6 +777,11 @@ color_to_pixel (const pixman_color_t *color,
{
uint32_t c = color_to_uint32 (color);
+ if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_RGBA_FLOAT)
+ {
+ return FALSE;
+ }
+
if (!(format == PIXMAN_a8r8g8b8 ||
format == PIXMAN_x8r8g8b8 ||
format == PIXMAN_a8b8g8r8 ||
@@ -1022,6 +1020,7 @@ pixman_format_supported_source (pixman_format_code_t format)
case PIXMAN_x2r10g10b10:
case PIXMAN_a8r8g8b8:
case PIXMAN_a8r8g8b8_sRGB:
+ case PIXMAN_r8g8b8_sRGB:
case PIXMAN_x8r8g8b8:
case PIXMAN_a8b8g8r8:
case PIXMAN_x8b8g8r8:
diff --git a/pixman/pixman.h b/pixman/pixman.h
index 509ba5e..d697b53 100644
--- a/pixman/pixman.h
+++ b/pixman/pixman.h
@@ -127,7 +127,7 @@ typedef pixman_fixed_16_16_t pixman_fixed_t;
#define pixman_fixed_1_minus_e (pixman_fixed_1 - pixman_fixed_e)
#define pixman_fixed_minus_1 (pixman_int_to_fixed(-1))
#define pixman_fixed_to_int(f) ((int) ((f) >> 16))
-#define pixman_int_to_fixed(i) ((pixman_fixed_t) ((i) << 16))
+#define pixman_int_to_fixed(i) ((pixman_fixed_t) ((uint32_t) (i) << 16))
#define pixman_fixed_to_double(f) (double) ((f) / (double) pixman_fixed_1)
#define pixman_double_to_fixed(d) ((pixman_fixed_t) ((d) * 65536.0))
#define pixman_fixed_frac(f) ((f) & pixman_fixed_1_minus_e)
@@ -184,42 +184,73 @@ struct pixman_transform
struct pixman_box16;
typedef union pixman_image pixman_image_t;
+PIXMAN_API
void pixman_transform_init_identity (struct pixman_transform *matrix);
+
+PIXMAN_API
pixman_bool_t pixman_transform_point_3d (const struct pixman_transform *transform,
struct pixman_vector *vector);
+
+PIXMAN_API
pixman_bool_t pixman_transform_point (const struct pixman_transform *transform,
struct pixman_vector *vector);
+
+PIXMAN_API
pixman_bool_t pixman_transform_multiply (struct pixman_transform *dst,
const struct pixman_transform *l,
const struct pixman_transform *r);
+
+PIXMAN_API
void pixman_transform_init_scale (struct pixman_transform *t,
pixman_fixed_t sx,
pixman_fixed_t sy);
+
+PIXMAN_API
pixman_bool_t pixman_transform_scale (struct pixman_transform *forward,
struct pixman_transform *reverse,
pixman_fixed_t sx,
pixman_fixed_t sy);
+
+PIXMAN_API
void pixman_transform_init_rotate (struct pixman_transform *t,
pixman_fixed_t cos,
pixman_fixed_t sin);
+
+PIXMAN_API
pixman_bool_t pixman_transform_rotate (struct pixman_transform *forward,
struct pixman_transform *reverse,
pixman_fixed_t c,
pixman_fixed_t s);
+
+PIXMAN_API
void pixman_transform_init_translate (struct pixman_transform *t,
pixman_fixed_t tx,
pixman_fixed_t ty);
+
+PIXMAN_API
pixman_bool_t pixman_transform_translate (struct pixman_transform *forward,
struct pixman_transform *reverse,
pixman_fixed_t tx,
pixman_fixed_t ty);
+
+PIXMAN_API
pixman_bool_t pixman_transform_bounds (const struct pixman_transform *matrix,
struct pixman_box16 *b);
+
+PIXMAN_API
pixman_bool_t pixman_transform_invert (struct pixman_transform *dst,
const struct pixman_transform *src);
+
+PIXMAN_API
pixman_bool_t pixman_transform_is_identity (const struct pixman_transform *t);
+
+PIXMAN_API
pixman_bool_t pixman_transform_is_scale (const struct pixman_transform *t);
+
+PIXMAN_API
pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t);
+
+PIXMAN_API
pixman_bool_t pixman_transform_is_inverse (const struct pixman_transform *a,
const struct pixman_transform *b);
@@ -239,42 +270,70 @@ struct pixman_f_transform
double m[3][3];
};
+
+PIXMAN_API
pixman_bool_t pixman_transform_from_pixman_f_transform (struct pixman_transform *t,
const struct pixman_f_transform *ft);
+
+PIXMAN_API
void pixman_f_transform_from_pixman_transform (struct pixman_f_transform *ft,
const struct pixman_transform *t);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_invert (struct pixman_f_transform *dst,
const struct pixman_f_transform *src);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_point (const struct pixman_f_transform *t,
struct pixman_f_vector *v);
+
+PIXMAN_API
void pixman_f_transform_point_3d (const struct pixman_f_transform *t,
struct pixman_f_vector *v);
+
+PIXMAN_API
void pixman_f_transform_multiply (struct pixman_f_transform *dst,
const struct pixman_f_transform *l,
const struct pixman_f_transform *r);
+
+PIXMAN_API
void pixman_f_transform_init_scale (struct pixman_f_transform *t,
double sx,
double sy);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_scale (struct pixman_f_transform *forward,
struct pixman_f_transform *reverse,
double sx,
double sy);
+
+PIXMAN_API
void pixman_f_transform_init_rotate (struct pixman_f_transform *t,
double cos,
double sin);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_rotate (struct pixman_f_transform *forward,
struct pixman_f_transform *reverse,
double c,
double s);
+
+PIXMAN_API
void pixman_f_transform_init_translate (struct pixman_f_transform *t,
double tx,
double ty);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_translate (struct pixman_f_transform *forward,
struct pixman_f_transform *reverse,
double tx,
double ty);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_bounds (const struct pixman_f_transform *t,
struct pixman_box16 *b);
+
+PIXMAN_API
void pixman_f_transform_init_identity (struct pixman_f_transform *t);
typedef enum
@@ -287,6 +346,16 @@ typedef enum
typedef enum
{
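+ /* NONE disables dithering; FAST, GOOD and BEST select increasing
+ * quality, and the ORDERED_* values request a specific matrix. */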
+ PIXMAN_DITHER_NONE,
+ PIXMAN_DITHER_FAST,
+ PIXMAN_DITHER_GOOD,
+ PIXMAN_DITHER_BEST,
+ PIXMAN_DITHER_ORDERED_BAYER_8,
+ PIXMAN_DITHER_ORDERED_BLUE_NOISE_64,
+} pixman_dither_t;
+
+typedef enum
+{
PIXMAN_FILTER_FAST,
PIXMAN_FILTER_GOOD,
PIXMAN_FILTER_BEST,
@@ -423,73 +492,123 @@ typedef enum
/* This function exists only to make it possible to preserve
* the X ABI - it should go away at first opportunity.
*/
+PIXMAN_API
void pixman_region_set_static_pointers (pixman_box16_t *empty_box,
pixman_region16_data_t *empty_data,
pixman_region16_data_t *broken_data);
/* creation/destruction */
+PIXMAN_API
void pixman_region_init (pixman_region16_t *region);
+
+PIXMAN_API
void pixman_region_init_rect (pixman_region16_t *region,
int x,
int y,
unsigned int width,
unsigned int height);
+
+PIXMAN_API
pixman_bool_t pixman_region_init_rects (pixman_region16_t *region,
const pixman_box16_t *boxes,
int count);
-void pixman_region_init_with_extents (pixman_region16_t *region,
- pixman_box16_t *extents);
+
+PIXMAN_API
+void pixman_region_init_with_extents (pixman_region16_t *region,
+ const pixman_box16_t *extents);
+
+PIXMAN_API
void pixman_region_init_from_image (pixman_region16_t *region,
pixman_image_t *image);
+
+PIXMAN_API
void pixman_region_fini (pixman_region16_t *region);
/* manipulation */
+PIXMAN_API
void pixman_region_translate (pixman_region16_t *region,
int x,
int y);
-pixman_bool_t pixman_region_copy (pixman_region16_t *dest,
- pixman_region16_t *source);
-pixman_bool_t pixman_region_intersect (pixman_region16_t *new_reg,
- pixman_region16_t *reg1,
- pixman_region16_t *reg2);
-pixman_bool_t pixman_region_union (pixman_region16_t *new_reg,
- pixman_region16_t *reg1,
- pixman_region16_t *reg2);
-pixman_bool_t pixman_region_union_rect (pixman_region16_t *dest,
- pixman_region16_t *source,
- int x,
- int y,
- unsigned int width,
- unsigned int height);
-pixman_bool_t pixman_region_intersect_rect (pixman_region16_t *dest,
- pixman_region16_t *source,
- int x,
- int y,
- unsigned int width,
- unsigned int height);
-pixman_bool_t pixman_region_subtract (pixman_region16_t *reg_d,
- pixman_region16_t *reg_m,
- pixman_region16_t *reg_s);
-pixman_bool_t pixman_region_inverse (pixman_region16_t *new_reg,
- pixman_region16_t *reg1,
- pixman_box16_t *inv_rect);
-pixman_bool_t pixman_region_contains_point (pixman_region16_t *region,
- int x,
- int y,
- pixman_box16_t *box);
-pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region,
- pixman_box16_t *prect);
-pixman_bool_t pixman_region_not_empty (pixman_region16_t *region);
-pixman_box16_t * pixman_region_extents (pixman_region16_t *region);
-int pixman_region_n_rects (pixman_region16_t *region);
-pixman_box16_t * pixman_region_rectangles (pixman_region16_t *region,
- int *n_rects);
-pixman_bool_t pixman_region_equal (pixman_region16_t *region1,
- pixman_region16_t *region2);
+
+PIXMAN_API
+pixman_bool_t pixman_region_copy (pixman_region16_t *dest,
+ const pixman_region16_t *source);
+
+PIXMAN_API
+pixman_bool_t pixman_region_intersect (pixman_region16_t *new_reg,
+ const pixman_region16_t *reg1,
+ const pixman_region16_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region_union (pixman_region16_t *new_reg,
+ const pixman_region16_t *reg1,
+ const pixman_region16_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region_union_rect (pixman_region16_t *dest,
+ const pixman_region16_t *source,
+ int x,
+ int y,
+ unsigned int width,
+ unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region_intersect_rect (pixman_region16_t *dest,
+ const pixman_region16_t *source,
+ int x,
+ int y,
+ unsigned int width,
+ unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region_subtract (pixman_region16_t *reg_d,
+ const pixman_region16_t *reg_m,
+ const pixman_region16_t *reg_s);
+
+PIXMAN_API
+pixman_bool_t pixman_region_inverse (pixman_region16_t *new_reg,
+ const pixman_region16_t *reg1,
+ const pixman_box16_t *inv_rect);
+
+PIXMAN_API
+pixman_bool_t pixman_region_contains_point (const pixman_region16_t *region,
+ int x,
+ int y,
+ pixman_box16_t *box);
+
+PIXMAN_API
+pixman_region_overlap_t pixman_region_contains_rectangle (const pixman_region16_t *region,
+ const pixman_box16_t *prect);
+
+PIXMAN_API
+pixman_bool_t pixman_region_empty (const pixman_region16_t *region);
+
+PIXMAN_API
+pixman_bool_t pixman_region_not_empty (const pixman_region16_t *region);
+
+PIXMAN_API
+pixman_box16_t * pixman_region_extents (const pixman_region16_t *region);
+
+PIXMAN_API
+int pixman_region_n_rects (const pixman_region16_t *region);
+
+PIXMAN_API
+pixman_box16_t * pixman_region_rectangles (const pixman_region16_t *region,
+ int *n_rects);
+
+PIXMAN_API
+pixman_bool_t pixman_region_equal (const pixman_region16_t *region1,
+ const pixman_region16_t *region2);
+
+PIXMAN_API
pixman_bool_t pixman_region_selfcheck (pixman_region16_t *region);
-void pixman_region_reset (pixman_region16_t *region,
- pixman_box16_t *box);
+
+PIXMAN_API
+void pixman_region_reset (pixman_region16_t *region,
+ const pixman_box16_t *box);
+
+PIXMAN_API
void pixman_region_clear (pixman_region16_t *region);
/*
* 32 bit regions
@@ -523,72 +642,122 @@ struct pixman_region32
};
/* creation/destruction */
+PIXMAN_API
void pixman_region32_init (pixman_region32_t *region);
+
+PIXMAN_API
void pixman_region32_init_rect (pixman_region32_t *region,
int x,
int y,
unsigned int width,
unsigned int height);
+
+PIXMAN_API
pixman_bool_t pixman_region32_init_rects (pixman_region32_t *region,
const pixman_box32_t *boxes,
int count);
-void pixman_region32_init_with_extents (pixman_region32_t *region,
- pixman_box32_t *extents);
+
+PIXMAN_API
+void pixman_region32_init_with_extents (pixman_region32_t *region,
+ const pixman_box32_t *extents);
+
+PIXMAN_API
void pixman_region32_init_from_image (pixman_region32_t *region,
pixman_image_t *image);
+
+PIXMAN_API
void pixman_region32_fini (pixman_region32_t *region);
/* manipulation */
+PIXMAN_API
void pixman_region32_translate (pixman_region32_t *region,
int x,
int y);
-pixman_bool_t pixman_region32_copy (pixman_region32_t *dest,
- pixman_region32_t *source);
-pixman_bool_t pixman_region32_intersect (pixman_region32_t *new_reg,
- pixman_region32_t *reg1,
- pixman_region32_t *reg2);
-pixman_bool_t pixman_region32_union (pixman_region32_t *new_reg,
- pixman_region32_t *reg1,
- pixman_region32_t *reg2);
-pixman_bool_t pixman_region32_intersect_rect (pixman_region32_t *dest,
- pixman_region32_t *source,
- int x,
- int y,
- unsigned int width,
- unsigned int height);
-pixman_bool_t pixman_region32_union_rect (pixman_region32_t *dest,
- pixman_region32_t *source,
- int x,
- int y,
- unsigned int width,
- unsigned int height);
-pixman_bool_t pixman_region32_subtract (pixman_region32_t *reg_d,
- pixman_region32_t *reg_m,
- pixman_region32_t *reg_s);
-pixman_bool_t pixman_region32_inverse (pixman_region32_t *new_reg,
- pixman_region32_t *reg1,
- pixman_box32_t *inv_rect);
-pixman_bool_t pixman_region32_contains_point (pixman_region32_t *region,
- int x,
- int y,
- pixman_box32_t *box);
-pixman_region_overlap_t pixman_region32_contains_rectangle (pixman_region32_t *region,
- pixman_box32_t *prect);
-pixman_bool_t pixman_region32_not_empty (pixman_region32_t *region);
-pixman_box32_t * pixman_region32_extents (pixman_region32_t *region);
-int pixman_region32_n_rects (pixman_region32_t *region);
-pixman_box32_t * pixman_region32_rectangles (pixman_region32_t *region,
- int *n_rects);
-pixman_bool_t pixman_region32_equal (pixman_region32_t *region1,
- pixman_region32_t *region2);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_copy (pixman_region32_t *dest,
+ const pixman_region32_t *source);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_intersect (pixman_region32_t *new_reg,
+ const pixman_region32_t *reg1,
+ const pixman_region32_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_union (pixman_region32_t *new_reg,
+ const pixman_region32_t *reg1,
+ const pixman_region32_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_intersect_rect (pixman_region32_t *dest,
+ const pixman_region32_t *source,
+ int x,
+ int y,
+ unsigned int width,
+ unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_union_rect (pixman_region32_t *dest,
+ const pixman_region32_t *source,
+ int x,
+ int y,
+ unsigned int width,
+ unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_subtract (pixman_region32_t *reg_d,
+ const pixman_region32_t *reg_m,
+ const pixman_region32_t *reg_s);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_inverse (pixman_region32_t *new_reg,
+ const pixman_region32_t *reg1,
+ const pixman_box32_t *inv_rect);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_contains_point (const pixman_region32_t *region,
+ int x,
+ int y,
+ pixman_box32_t *box);
+
+PIXMAN_API
+pixman_region_overlap_t pixman_region32_contains_rectangle (const pixman_region32_t *region,
+ const pixman_box32_t *prect);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_empty (const pixman_region32_t *region);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_not_empty (const pixman_region32_t *region);
+
+PIXMAN_API
+pixman_box32_t * pixman_region32_extents (const pixman_region32_t *region);
+
+PIXMAN_API
+int pixman_region32_n_rects (const pixman_region32_t *region);
+
+PIXMAN_API
+pixman_box32_t * pixman_region32_rectangles (const pixman_region32_t *region,
+ int *n_rects);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_equal (const pixman_region32_t *region1,
+ const pixman_region32_t *region2);
+
+PIXMAN_API
pixman_bool_t pixman_region32_selfcheck (pixman_region32_t *region);
-void pixman_region32_reset (pixman_region32_t *region,
- pixman_box32_t *box);
+
+PIXMAN_API
+void pixman_region32_reset (pixman_region32_t *region,
+ const pixman_box32_t *box);
+
+PIXMAN_API
void pixman_region32_clear (pixman_region32_t *region);
/* Copy / Fill / Misc */
+PIXMAN_API
pixman_bool_t pixman_blt (uint32_t *src_bits,
uint32_t *dst_bits,
int src_stride,
@@ -601,6 +770,8 @@ pixman_bool_t pixman_blt (uint32_t *src_bits,
int dest_y,
int width,
int height);
+
+PIXMAN_API
pixman_bool_t pixman_fill (uint32_t *bits,
int stride,
int bpp,
@@ -610,7 +781,11 @@ pixman_bool_t pixman_fill (uint32_t *bits,
int height,
uint32_t _xor);
+
+PIXMAN_API
int pixman_version (void);
+
+PIXMAN_API
const char* pixman_version_string (void);
/*
@@ -654,12 +829,24 @@ struct pixman_indexed
((g) << 4) | \
((b)))
-#define PIXMAN_FORMAT_BPP(f) (((f) >> 24) )
-#define PIXMAN_FORMAT_TYPE(f) (((f) >> 16) & 0xff)
-#define PIXMAN_FORMAT_A(f) (((f) >> 12) & 0x0f)
-#define PIXMAN_FORMAT_R(f) (((f) >> 8) & 0x0f)
-#define PIXMAN_FORMAT_G(f) (((f) >> 4) & 0x0f)
-#define PIXMAN_FORMAT_B(f) (((f) ) & 0x0f)
+#define PIXMAN_FORMAT_BYTE(bpp,type,a,r,g,b) \
+ (((bpp >> 3) << 24) | \
+ (3 << 22) | ((type) << 16) | \
+ ((a >> 3) << 12) | \
+ ((r >> 3) << 8) | \
+ ((g >> 3) << 4) | \
+ ((b >> 3)))
+
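+/* PIXMAN_FORMAT_BYTE stores bpp and the channel widths divided by 8 and
+ * sets the 2-bit shift field (bits 22-23) to 3; PIXMAN_FORMAT_RESHIFT
+ * scales the stored value back up by that shift, leaving the classic
+ * PIXMAN_FORMAT encodings (shift 0) unchanged. */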
+#define PIXMAN_FORMAT_RESHIFT(val, ofs, num) \
+ (((val >> (ofs)) & ((1 << (num)) - 1)) << ((val >> 22) & 3))
+
+#define PIXMAN_FORMAT_BPP(f) PIXMAN_FORMAT_RESHIFT(f, 24, 8)
+#define PIXMAN_FORMAT_SHIFT(f) ((uint32_t)((f >> 22) & 3))
+#define PIXMAN_FORMAT_TYPE(f) (((f) >> 16) & 0x3f)
+#define PIXMAN_FORMAT_A(f) PIXMAN_FORMAT_RESHIFT(f, 12, 4)
+#define PIXMAN_FORMAT_R(f) PIXMAN_FORMAT_RESHIFT(f, 8, 4)
+#define PIXMAN_FORMAT_G(f) PIXMAN_FORMAT_RESHIFT(f, 4, 4)
+#define PIXMAN_FORMAT_B(f) PIXMAN_FORMAT_RESHIFT(f, 0, 4)
#define PIXMAN_FORMAT_RGB(f) (((f) ) & 0xfff)
#define PIXMAN_FORMAT_VIS(f) (((f) ) & 0xffff)
#define PIXMAN_FORMAT_DEPTH(f) (PIXMAN_FORMAT_A(f) + \
@@ -678,15 +865,22 @@ struct pixman_indexed
#define PIXMAN_TYPE_BGRA 8
#define PIXMAN_TYPE_RGBA 9
#define PIXMAN_TYPE_ARGB_SRGB 10
+#define PIXMAN_TYPE_RGBA_FLOAT 11
#define PIXMAN_FORMAT_COLOR(f) \
(PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB || \
PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR || \
PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA || \
- PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA)
+ PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA || \
+ PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA_FLOAT)
-/* 32bpp formats */
typedef enum {
+/* 128bpp formats */
+ PIXMAN_rgba_float = PIXMAN_FORMAT_BYTE(128,PIXMAN_TYPE_RGBA_FLOAT,32,32,32,32),
+/* 96bpp formats */
+ PIXMAN_rgb_float = PIXMAN_FORMAT_BYTE(96,PIXMAN_TYPE_RGBA_FLOAT,0,32,32,32),
+
+/* 32bpp formats */
PIXMAN_a8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
PIXMAN_x8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
PIXMAN_a8b8g8r8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8),
@@ -703,6 +897,7 @@ typedef enum {
/* sRGB formats */
PIXMAN_a8r8g8b8_sRGB = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB_SRGB,8,8,8,8),
+ PIXMAN_r8g8b8_sRGB = PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB_SRGB,0,8,8,8),
/* 24bpp formats */
PIXMAN_r8g8b8 = PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
@@ -757,30 +952,44 @@ typedef enum {
} pixman_format_code_t;
/* Querying supported format values. */
+PIXMAN_API
pixman_bool_t pixman_format_supported_destination (pixman_format_code_t format);
+
+PIXMAN_API
pixman_bool_t pixman_format_supported_source (pixman_format_code_t format);
/* Constructors */
+PIXMAN_API
pixman_image_t *pixman_image_create_solid_fill (const pixman_color_t *color);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_linear_gradient (const pixman_point_fixed_t *p1,
const pixman_point_fixed_t *p2,
const pixman_gradient_stop_t *stops,
int n_stops);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_radial_gradient (const pixman_point_fixed_t *inner,
const pixman_point_fixed_t *outer,
pixman_fixed_t inner_radius,
pixman_fixed_t outer_radius,
const pixman_gradient_stop_t *stops,
int n_stops);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_conical_gradient (const pixman_point_fixed_t *center,
pixman_fixed_t angle,
const pixman_gradient_stop_t *stops,
int n_stops);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_bits (pixman_format_code_t format,
int width,
int height,
uint32_t *bits,
int rowstride_bytes);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_bits_no_clear (pixman_format_code_t format,
int width,
int height,
@@ -788,48 +997,99 @@ pixman_image_t *pixman_image_create_bits_no_clear (pixman_format_code_t forma
int rowstride_bytes);
/* Destructor */
+PIXMAN_API
pixman_image_t *pixman_image_ref (pixman_image_t *image);
+
+PIXMAN_API
pixman_bool_t pixman_image_unref (pixman_image_t *image);
+
+PIXMAN_API
void pixman_image_set_destroy_function (pixman_image_t *image,
pixman_image_destroy_func_t function,
void *data);
+
+PIXMAN_API
void * pixman_image_get_destroy_data (pixman_image_t *image);
/* Set properties */
+PIXMAN_API
pixman_bool_t pixman_image_set_clip_region (pixman_image_t *image,
- pixman_region16_t *region);
+ const pixman_region16_t *region);
+
+PIXMAN_API
pixman_bool_t pixman_image_set_clip_region32 (pixman_image_t *image,
- pixman_region32_t *region);
+ const pixman_region32_t *region);
+
+PIXMAN_API
void pixman_image_set_has_client_clip (pixman_image_t *image,
pixman_bool_t clien_clip);
+
+PIXMAN_API
pixman_bool_t pixman_image_set_transform (pixman_image_t *image,
const pixman_transform_t *transform);
+
+PIXMAN_API
void pixman_image_set_repeat (pixman_image_t *image,
pixman_repeat_t repeat);
+
+PIXMAN_API
+void pixman_image_set_dither (pixman_image_t *image,
+ pixman_dither_t dither);
+
+PIXMAN_API
+void pixman_image_set_dither_offset (pixman_image_t *image,
+ int offset_x,
+ int offset_y);
+
+PIXMAN_API
pixman_bool_t pixman_image_set_filter (pixman_image_t *image,
pixman_filter_t filter,
const pixman_fixed_t *filter_params,
int n_filter_params);
+
+PIXMAN_API
void pixman_image_set_source_clipping (pixman_image_t *image,
pixman_bool_t source_clipping);
+
+PIXMAN_API
void pixman_image_set_alpha_map (pixman_image_t *image,
pixman_image_t *alpha_map,
int16_t x,
int16_t y);
+
+PIXMAN_API
void pixman_image_set_component_alpha (pixman_image_t *image,
pixman_bool_t component_alpha);
+
+PIXMAN_API
pixman_bool_t pixman_image_get_component_alpha (pixman_image_t *image);
+
+PIXMAN_API
void pixman_image_set_accessors (pixman_image_t *image,
pixman_read_memory_func_t read_func,
pixman_write_memory_func_t write_func);
+
+PIXMAN_API
void pixman_image_set_indexed (pixman_image_t *image,
const pixman_indexed_t *indexed);
+
+PIXMAN_API
uint32_t *pixman_image_get_data (pixman_image_t *image);
+
+PIXMAN_API
int pixman_image_get_width (pixman_image_t *image);
+
+PIXMAN_API
int pixman_image_get_height (pixman_image_t *image);
+
+PIXMAN_API
int pixman_image_get_stride (pixman_image_t *image); /* in bytes */
+
+PIXMAN_API
int pixman_image_get_depth (pixman_image_t *image);
+
+PIXMAN_API
pixman_format_code_t pixman_image_get_format (pixman_image_t *image);
typedef enum
@@ -847,6 +1107,7 @@ typedef enum
/* Create the parameter list for a SEPARABLE_CONVOLUTION filter
* with the given kernels and scale parameters.
*/
+PIXMAN_API
pixman_fixed_t *
pixman_filter_create_separable_convolution (int *n_values,
pixman_fixed_t scale_x,
@@ -858,11 +1119,15 @@ pixman_filter_create_separable_convolution (int *n_values,
int subsample_bits_x,
int subsample_bits_y);
+
+PIXMAN_API
pixman_bool_t pixman_image_fill_rectangles (pixman_op_t op,
pixman_image_t *image,
const pixman_color_t *color,
int n_rects,
const pixman_rectangle16_t *rects);
+
+PIXMAN_API
pixman_bool_t pixman_image_fill_boxes (pixman_op_t op,
pixman_image_t *dest,
const pixman_color_t *color,
@@ -870,6 +1135,7 @@ pixman_bool_t pixman_image_fill_boxes (pixman_op_t
const pixman_box32_t *boxes);
/* Composite */
+PIXMAN_API
pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
pixman_image_t *src_image,
pixman_image_t *mask_image,
@@ -882,6 +1148,8 @@ pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
int16_t dest_y,
uint16_t width,
uint16_t height);
+
+PIXMAN_API
void pixman_image_composite (pixman_op_t op,
pixman_image_t *src,
pixman_image_t *mask,
@@ -894,6 +1162,8 @@ void pixman_image_composite (pixman_op_t op,
int16_t dest_y,
uint16_t width,
uint16_t height);
+
+PIXMAN_API
void pixman_image_composite32 (pixman_op_t op,
pixman_image_t *src,
pixman_image_t *mask,
@@ -925,6 +1195,7 @@ void pixman_image_composite32 (pixman_op_t op,
* Since 0.21.2, pixman doesn't do these workarounds anymore, so now this
* function is a no-op.
*/
+PIXMAN_API
void pixman_disable_out_of_bounds_workaround (void);
/*
@@ -937,29 +1208,48 @@ typedef struct
const void *glyph;
} pixman_glyph_t;
+PIXMAN_API
pixman_glyph_cache_t *pixman_glyph_cache_create (void);
+
+PIXMAN_API
void pixman_glyph_cache_destroy (pixman_glyph_cache_t *cache);
+
+PIXMAN_API
void pixman_glyph_cache_freeze (pixman_glyph_cache_t *cache);
+
+PIXMAN_API
void pixman_glyph_cache_thaw (pixman_glyph_cache_t *cache);
+
+PIXMAN_API
const void * pixman_glyph_cache_lookup (pixman_glyph_cache_t *cache,
void *font_key,
void *glyph_key);
+
+PIXMAN_API
const void * pixman_glyph_cache_insert (pixman_glyph_cache_t *cache,
void *font_key,
void *glyph_key,
int origin_x,
int origin_y,
pixman_image_t *glyph_image);
+
+PIXMAN_API
void pixman_glyph_cache_remove (pixman_glyph_cache_t *cache,
void *font_key,
void *glyph_key);
+
+PIXMAN_API
void pixman_glyph_get_extents (pixman_glyph_cache_t *cache,
int n_glyphs,
pixman_glyph_t *glyphs,
pixman_box32_t *extents);
+
+PIXMAN_API
pixman_format_code_t pixman_glyph_get_mask_format (pixman_glyph_cache_t *cache,
int n_glyphs,
const pixman_glyph_t *glyphs);
+
+PIXMAN_API
void pixman_composite_glyphs (pixman_op_t op,
pixman_image_t *src,
pixman_image_t *dest,
@@ -975,6 +1265,8 @@ void pixman_composite_glyphs (pixman_op_t op,
pixman_glyph_cache_t *cache,
int n_glyphs,
const pixman_glyph_t *glyphs);
+
+PIXMAN_API
void pixman_composite_glyphs_no_mask (pixman_op_t op,
pixman_image_t *src,
pixman_image_t *dest,
@@ -1042,12 +1334,19 @@ struct pixman_trap
pixman_span_fix_t top, bot;
};
+PIXMAN_API
pixman_fixed_t pixman_sample_ceil_y (pixman_fixed_t y,
int bpp);
+
+PIXMAN_API
pixman_fixed_t pixman_sample_floor_y (pixman_fixed_t y,
int bpp);
+
+PIXMAN_API
void pixman_edge_step (pixman_edge_t *e,
int n);
+
+PIXMAN_API
void pixman_edge_init (pixman_edge_t *e,
int bpp,
pixman_fixed_t y_start,
@@ -1055,31 +1354,43 @@ void pixman_edge_init (pixman_edge_t *e,
pixman_fixed_t y_top,
pixman_fixed_t x_bot,
pixman_fixed_t y_bot);
+
+PIXMAN_API
void pixman_line_fixed_edge_init (pixman_edge_t *e,
int bpp,
pixman_fixed_t y,
const pixman_line_fixed_t *line,
int x_off,
int y_off);
+
+PIXMAN_API
void pixman_rasterize_edges (pixman_image_t *image,
pixman_edge_t *l,
pixman_edge_t *r,
pixman_fixed_t t,
pixman_fixed_t b);
+
+PIXMAN_API
void pixman_add_traps (pixman_image_t *image,
int16_t x_off,
int16_t y_off,
int ntrap,
const pixman_trap_t *traps);
+
+PIXMAN_API
void pixman_add_trapezoids (pixman_image_t *image,
int16_t x_off,
int y_off,
int ntraps,
const pixman_trapezoid_t *traps);
+
+PIXMAN_API
void pixman_rasterize_trapezoid (pixman_image_t *image,
const pixman_trapezoid_t *trap,
int x_off,
int y_off);
+
+PIXMAN_API
void pixman_composite_trapezoids (pixman_op_t op,
pixman_image_t * src,
pixman_image_t * dst,
@@ -1090,6 +1401,8 @@ void pixman_composite_trapezoids (pixman_op_t op,
int y_dst,
int n_traps,
const pixman_trapezoid_t * traps);
+
+PIXMAN_API
void pixman_composite_triangles (pixman_op_t op,
pixman_image_t * src,
pixman_image_t * dst,
@@ -1100,6 +1413,8 @@ void pixman_composite_triangles (pixman_op_t op,
int y_dst,
int n_tris,
const pixman_triangle_t * tris);
+
+PIXMAN_API
void pixman_add_triangles (pixman_image_t *image,
int32_t x_off,
int32_t y_off,
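
A worked example helps with the widened format encoding above: PIXMAN_FORMAT_BYTE
stores each channel width divided by 8 and sets bits 22-23 to 3, and
PIXMAN_FORMAT_RESHIFT shifts every decoded field left by that amount, so 128bpp
formats fit the same 32-bit code as the classic formats (whose shift field is
zero and which therefore decode unchanged). A minimal standalone sketch -- the
macro bodies are copied from the hunk above, the driver and asserts are
illustrative only:

    #include <assert.h>
    #include <stdint.h>

    #define PIXMAN_TYPE_ARGB       2
    #define PIXMAN_TYPE_RGBA_FLOAT 11

    /* Classic encoding: widths stored verbatim, shift field left at 0. */
    #define PIXMAN_FORMAT(bpp,type,a,r,g,b)                 \
        (((bpp) << 24) | ((type) << 16) | ((a) << 12) |     \
         ((r) << 8) | ((g) << 4) | (b))

    /* New encoding: widths stored in bytes, decode shift set to 3. */
    #define PIXMAN_FORMAT_BYTE(bpp,type,a,r,g,b)    \
        (((bpp >> 3) << 24) |                       \
         (3 << 22) | ((type) << 16) |               \
         ((a >> 3) << 12) |                         \
         ((r >> 3) << 8) |                          \
         ((g >> 3) << 4) |                          \
         ((b >> 3)))

    #define PIXMAN_FORMAT_RESHIFT(val, ofs, num) \
        (((val >> (ofs)) & ((1 << (num)) - 1)) << ((val >> 22) & 3))

    #define PIXMAN_FORMAT_BPP(f) PIXMAN_FORMAT_RESHIFT(f, 24, 8)
    #define PIXMAN_FORMAT_A(f)   PIXMAN_FORMAT_RESHIFT(f, 12, 4)

    int
    main (void)
    {
        uint32_t argb32     = PIXMAN_FORMAT (32, PIXMAN_TYPE_ARGB, 8, 8, 8, 8);
        uint32_t rgba_float = PIXMAN_FORMAT_BYTE (128, PIXMAN_TYPE_RGBA_FLOAT,
                                                  32, 32, 32, 32);

        /* Classic format: shift field is 0, fields decode unchanged. */
        assert (PIXMAN_FORMAT_BPP (argb32) == 32);
        assert (PIXMAN_FORMAT_A (argb32) == 8);

        /* 128 >> 3 = 16 fits the 8-bit bpp field; decoding re-multiplies:
         * 16 << 3 = 128, and the 32-bit alpha round-trips as 4 << 3. */
        assert (PIXMAN_FORMAT_BPP (rgba_float) == 128);
        assert (PIXMAN_FORMAT_A (rgba_float) == 32);

        return 0;
    }
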
diff --git a/pixman/rounding.txt b/pixman/rounding.txt
index b52b084..1c00019 100644
--- a/pixman/rounding.txt
+++ b/pixman/rounding.txt
@@ -160,6 +160,7 @@ which means the contents of the matrix corresponding to (frac) should
contain width samplings of the function, with the first sample at:
floor (frac - (width - 1) / 2.0 - e) + 0.5 - frac
+ = ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
This filter is called separable because each of the k x k convolution
matrices is specified with two k-wide vectors, one for each dimension,
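
The equality added above can be verified directly. Writing e for the smallest
representable fixed-point amount, floor (x - e) = ceil (x - 1) for any
representable x: both sides equal x - 1 when x is an integer, and floor (x)
otherwise. Applying this and collecting terms:

    floor (frac - (width - 1) / 2.0 - e)
      = ceil (frac - (width - 1) / 2.0 - 1)
      = ceil (frac - width / 2.0 - 0.5)

since - (width - 1) / 2.0 - 1 = - width / 2.0 - 0.5.
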
diff --git a/test/Makefile.am b/test/Makefile.am
deleted file mode 100644
index 88dc36d..0000000
--- a/test/Makefile.am
+++ /dev/null
@@ -1,13 +0,0 @@
-include $(top_srcdir)/test/Makefile.sources
-
-AM_CFLAGS = $(OPENMP_CFLAGS) $(PTHREAD_CFLAGS)
-AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS) $(PTHREAD_LDFLAGS)
-LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm $(PNG_LIBS) $(PTHREAD_LIBS)
-AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
-
-libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
-
-noinst_LTLIBRARIES = libutils.la
-noinst_PROGRAMS = $(TESTPROGRAMS) $(OTHERPROGRAMS)
-
-TESTS = $(TESTPROGRAMS)
diff --git a/test/Makefile.sources b/test/Makefile.sources
deleted file mode 100644
index c20c34b..0000000
--- a/test/Makefile.sources
+++ /dev/null
@@ -1,51 +0,0 @@
-# Tests (sorted by expected completion time)
-TESTPROGRAMS = \
- oob-test \
- infinite-loop \
- trap-crasher \
- region-translate-test \
- fetch-test \
- a1-trap-test \
- prng-test \
- radial-invalid \
- pdf-op-test \
- region-test \
- combiner-test \
- scaling-crash-test \
- alpha-loop \
- scaling-helpers-test \
- thread-test \
- rotate-test \
- alphamap \
- gradient-crash-test \
- pixel-test \
- matrix-test \
- composite-traps-test \
- region-contains-test \
- glyph-test \
- stress-test \
- blitters-test \
- affine-test \
- scaling-test \
- composite \
- tolerance-test \
- $(NULL)
-
-# Other programs
-OTHERPROGRAMS = \
- lowlevel-blt-bench \
- radial-perf-test \
- check-formats \
- scaling-bench \
- $(NULL)
-
-# Utility functions
-libutils_sources = \
- utils.c \
- utils-prng.c \
- $(NULL)
-
-libutils_headers = \
- utils.h \
- utils-prng.h \
- $(NULL)
diff --git a/test/Makefile.win32 b/test/Makefile.win32
deleted file mode 100644
index 6cfb4a7..0000000
--- a/test/Makefile.win32
+++ /dev/null
@@ -1,54 +0,0 @@
-default: all
-
-top_srcdir = ..
-include $(top_srcdir)/test/Makefile.sources
-include $(top_srcdir)/Makefile.win32.common
-
-TEST_LDADD = \
- $(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib \
- $(CFG_VAR)/libutils.lib \
- $(NULL)
-
-libutils_OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libutils_sources))
-
-SOURCES = $(patsubst %, %.c, $(TESTPROGRAMS) $(OTHERPROGRAMS))
-OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES))
-TESTS = $(patsubst %, $(CFG_VAR)/%.exe, $(TESTPROGRAMS))
-OTHERS = $(patsubst %, $(CFG_VAR)/%.exe, $(OTHERPROGRAMS))
-
-all: pixman inform $(TESTS) $(OTHERS)
-
-check: pixman inform $(TESTS)
- @failures=0 ; \
- total=0 ; \
- for test in $(TESTS) ; \
- do \
- total=`expr $$total + 1` ; \
- if ./$$test ; \
- then echo "PASS: $$test" ; \
- else echo "FAIL: $$test" ; \
- failures=`expr $$failures + 1` ; \
- fi ; \
- done ; \
- if test $$failures -eq 0 ; \
- then banner="All $$total tests passed" ; \
- else banner="$$failures of $$total tests failed" ; \
- fi ; \
- dashes=`echo "$$banner" | sed s/./=/g`; \
- echo "$$dashes" ; \
- echo "$$banner" ; \
- echo "$$dashes" ; \
- test $$failures -eq 0
-
-$(CFG_VAR)/libutils.lib: $(libutils_OBJECTS)
- @$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
-
-$(CFG_VAR)/%.exe: $(CFG_VAR)/%.obj $(TEST_LDADD)
- @$(LD) $(PIXMAN_LDFLAGS) -OUT:$@ $^
-
-$(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib: pixman
-
-pixman:
- @$(MAKE) -C $(top_builddir)/pixman -f Makefile.win32
-
-.PHONY: all check pixman
diff --git a/test/affine-bench.c b/test/affine-bench.c
new file mode 100644
index 0000000..86bf46e
--- /dev/null
+++ b/test/affine-bench.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright © 2014 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdint.h>
+#include "utils.h"
+
+#ifdef HAVE_GETTIMEOFDAY
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+
+#define WIDTH 1920
+#define HEIGHT 1080
+
+/* How much data to read to flush all cached data to RAM */
+#define MAX_L2CACHE_SIZE (8 * 1024 * 1024)
+
+#define PAGE_SIZE (4 * 1024)
+
+struct bench_info
+{
+ pixman_op_t op;
+ pixman_transform_t transform;
+ pixman_image_t *src_image;
+ pixman_image_t *mask_image;
+ pixman_image_t *dest_image;
+ int32_t src_x;
+ int32_t src_y;
+};
+
+typedef struct bench_info bench_info_t;
+
+struct box_48_16
+{
+ pixman_fixed_48_16_t x1;
+ pixman_fixed_48_16_t y1;
+ pixman_fixed_48_16_t x2;
+ pixman_fixed_48_16_t y2;
+};
+
+typedef struct box_48_16 box_48_16_t;
+
+/* This function is copied verbatim from pixman.c. */
+static pixman_bool_t
+compute_transformed_extents (pixman_transform_t *transform,
+ const pixman_box32_t *extents,
+ box_48_16_t *transformed)
+{
+ pixman_fixed_48_16_t tx1, ty1, tx2, ty2;
+ pixman_fixed_t x1, y1, x2, y2;
+ int i;
+
+ x1 = pixman_int_to_fixed (extents->x1) + pixman_fixed_1 / 2;
+ y1 = pixman_int_to_fixed (extents->y1) + pixman_fixed_1 / 2;
+ x2 = pixman_int_to_fixed (extents->x2) - pixman_fixed_1 / 2;
+ y2 = pixman_int_to_fixed (extents->y2) - pixman_fixed_1 / 2;
+
+ if (!transform)
+ {
+ transformed->x1 = x1;
+ transformed->y1 = y1;
+ transformed->x2 = x2;
+ transformed->y2 = y2;
+
+ return TRUE;
+ }
+
+ tx1 = ty1 = INT64_MAX;
+ tx2 = ty2 = INT64_MIN;
+
+ for (i = 0; i < 4; ++i)
+ {
+ pixman_fixed_48_16_t tx, ty;
+ pixman_vector_t v;
+
+ v.vector[0] = (i & 0x01)? x1 : x2;
+ v.vector[1] = (i & 0x02)? y1 : y2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point (transform, &v))
+ return FALSE;
+
+ tx = (pixman_fixed_48_16_t)v.vector[0];
+ ty = (pixman_fixed_48_16_t)v.vector[1];
+
+ if (tx < tx1)
+ tx1 = tx;
+ if (ty < ty1)
+ ty1 = ty;
+ if (tx > tx2)
+ tx2 = tx;
+ if (ty > ty2)
+ ty2 = ty;
+ }
+
+ transformed->x1 = tx1;
+ transformed->y1 = ty1;
+ transformed->x2 = tx2;
+ transformed->y2 = ty2;
+
+ return TRUE;
+}
+
+static void
+create_image (uint32_t width,
+ uint32_t height,
+ pixman_format_code_t format,
+ pixman_filter_t filter,
+ uint32_t **bits,
+ pixman_image_t **image)
+{
+ uint32_t stride = (width * PIXMAN_FORMAT_BPP (format) + 31) / 32 * 4;
+
+ *bits = aligned_malloc (PAGE_SIZE, stride * height);
+ memset (*bits, 0xCC, stride * height);
+ *image = pixman_image_create_bits (format, width, height, *bits, stride);
+ pixman_image_set_repeat (*image, PIXMAN_REPEAT_NORMAL);
+ pixman_image_set_filter (*image, filter, NULL, 0);
+}
+
+/* This needs to match the shortest cacheline length we expect to encounter */
+#define CACHE_CLEAN_INCREMENT 32
+
+static void
+flush_cache (void)
+{
+ static const char clean_space[MAX_L2CACHE_SIZE];
+ volatile const char *x = clean_space;
+ const char *clean_end = clean_space + sizeof clean_space;
+
+ while (x < clean_end)
+ {
+ (void) *x;
+ x += CACHE_CLEAN_INCREMENT;
+ }
+}
+
+/* Obtain current time in microseconds modulo 2^32 */
+uint32_t
+gettimei (void)
+{
+#ifdef HAVE_GETTIMEOFDAY
+ struct timeval tv;
+
+ gettimeofday (&tv, NULL);
+ return tv.tv_sec * 1000000 + tv.tv_usec;
+#else
+ return (uint64_t) clock () * 1000000 / CLOCKS_PER_SEC;
+#endif
+}
+
+static void
+pixman_image_composite_wrapper (const pixman_composite_info_t *info)
+{
+ pixman_image_composite (info->op,
+ info->src_image, info->mask_image, info->dest_image,
+ info->src_x, info->src_y,
+ info->mask_x, info->mask_y,
+ info->dest_x, info->dest_y,
+ info->width, info->height);
+}
+
+static void
+pixman_image_composite_empty (const pixman_composite_info_t *info)
+{
+ pixman_image_composite (info->op,
+ info->src_image, info->mask_image, info->dest_image,
+ info->src_x, info->src_y,
+ info->mask_x, info->mask_y,
+ info->dest_x, info->dest_y,
+ 1, 1);
+}
+
+static void
+bench (const bench_info_t *bi,
+ uint32_t max_n,
+ uint32_t max_time,
+ uint32_t *ret_n,
+ uint32_t *ret_time,
+ void (*func) (const pixman_composite_info_t *info))
+{
+ uint32_t n = 0;
+ uint32_t t0;
+ uint32_t t1;
+ uint32_t x = 0;
+ pixman_transform_t t;
+ pixman_composite_info_t info;
+
+ t = bi->transform;
+ info.op = bi->op;
+ info.src_image = bi->src_image;
+ info.mask_image = bi->mask_image;
+ info.dest_image = bi->dest_image;
+ info.src_x = 0;
+ info.src_y = 0;
+ info.mask_x = 0;
+ info.mask_y = 0;
+ /* info.dest_x set below */
+ info.dest_y = 0;
+ info.width = WIDTH;
+ info.height = HEIGHT;
+
+ t0 = gettimei ();
+
+ do
+ {
+
+ if (++x >= 64)
+ x = 0;
+
+ info.dest_x = 63 - x;
+
+ t.matrix[0][2] = pixman_int_to_fixed (bi->src_x + x);
+ t.matrix[1][2] = pixman_int_to_fixed (bi->src_y);
+ pixman_image_set_transform (bi->src_image, &t);
+
+ if (bi->mask_image)
+ pixman_image_set_transform (bi->mask_image, &t);
+
+ func (&info);
+ t1 = gettimei ();
+ }
+ while (++n < max_n && (t1 - t0) < max_time);
+
+ if (ret_n)
+ *ret_n = n;
+
+ *ret_time = t1 - t0;
+}
+
+int
+parse_fixed_argument (char *arg, pixman_fixed_t *value)
+{
+ char *tailptr;
+
+ *value = pixman_double_to_fixed (strtod (arg, &tailptr));
+
+ return *tailptr == '\0';
+}
+
+int
+parse_arguments (int argc,
+ char *argv[],
+ pixman_transform_t *t,
+ pixman_op_t *op,
+ pixman_format_code_t *src_format,
+ pixman_format_code_t *mask_format,
+ pixman_format_code_t *dest_format)
+{
+ if (!parse_fixed_argument (*argv, &t->matrix[0][0]))
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ if (!parse_fixed_argument (*argv, &t->matrix[0][1]))
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ if (!parse_fixed_argument (*argv, &t->matrix[1][0]))
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ if (!parse_fixed_argument (*argv, &t->matrix[1][1]))
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ *op = operator_from_string (*argv);
+ if (*op == PIXMAN_OP_NONE)
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ *src_format = format_from_string (*argv);
+ if (*src_format == PIXMAN_null)
+ return 0;
+
+ ++argv;
+ if (argv[0] && argv[1])
+ {
+ *mask_format = format_from_string (*argv);
+ if (*mask_format == PIXMAN_null)
+ return 0;
+ ++argv;
+ }
+ if (*argv)
+ {
+ *dest_format = format_from_string (*argv);
+ if (*dest_format == PIXMAN_null)
+ return 0;
+ }
+ return 1;
+}
+
+static void
+run_benchmark (const bench_info_t *bi)
+{
+ uint32_t n; /* number of iterations needed to run for at least 5 seconds */
+ uint32_t t1; /* time taken to do n iterations, microseconds */
+ uint32_t t2; /* calling overhead for n iterations, microseconds */
+
+ flush_cache ();
+ bench (bi, UINT32_MAX, 5000000, &n, &t1, pixman_image_composite_wrapper);
+ bench (bi, n, UINT32_MAX, NULL, &t2, pixman_image_composite_empty);
+
+ /* The result indicates the output rate in megapixels/second */
+ printf ("%6.2f\n", (double) n * WIDTH * HEIGHT / (t1 - t2));
+}
+
+
+int
+main (int argc, char *argv[])
+{
+ bench_info_t binfo;
+ pixman_filter_t filter = PIXMAN_FILTER_NEAREST;
+ pixman_format_code_t src_format = PIXMAN_a8r8g8b8;
+ pixman_format_code_t mask_format = 0;
+ pixman_format_code_t dest_format = PIXMAN_a8r8g8b8;
+ pixman_box32_t dest_box = { 0, 0, WIDTH, HEIGHT };
+ box_48_16_t transformed = { 0 };
+ int32_t xmin, ymin, xmax, ymax;
+ uint32_t *src, *mask, *dest;
+
+ binfo.op = PIXMAN_OP_SRC;
+ binfo.mask_image = NULL;
+ pixman_transform_init_identity (&binfo.transform);
+
+ ++argv;
+ if (*argv && (*argv)[0] == '-' && (*argv)[1] == 'n')
+ {
+ filter = PIXMAN_FILTER_NEAREST;
+ ++argv;
+ --argc;
+ }
+
+ if (*argv && (*argv)[0] == '-' && (*argv)[1] == 'b')
+ {
+ filter = PIXMAN_FILTER_BILINEAR;
+ ++argv;
+ --argc;
+ }
+
+ if (argc == 1 ||
+ !parse_arguments (argc, argv, &binfo.transform, &binfo.op,
+ &src_format, &mask_format, &dest_format))
+ {
+ printf ("Usage: affine-bench [-n] [-b] axx [axy] [ayx] [ayy] [combine type]\n");
+ printf (" [src format] [mask format] [dest format]\n");
+ printf (" -n : nearest scaling (default)\n");
+ printf (" -b : bilinear scaling\n");
+ printf (" axx : x_out:x_in factor\n");
+ printf (" axy : x_out:y_in factor (default 0)\n");
+ printf (" ayx : y_out:x_in factor (default 0)\n");
+ printf (" ayy : y_out:y_in factor (default 1)\n");
+ printf (" combine type : src, over, in etc (default src)\n");
+ printf (" src format : a8r8g8b8, r5g6b5 etc (default a8r8g8b8)\n");
+ printf (" mask format : as for src format, but no mask used if omitted\n");
+ printf (" dest format : as for src format (default a8r8g8b8)\n");
+ printf ("The output is a single number in megapixels/second.\n");
+
+ return EXIT_FAILURE;
+ }
+
+ /* Compute required extents for source and mask image so they qualify
+ * for COVER fast paths and get the flags in pixman.c:analyze_extent().
+ * These computations are for FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,
+ * but at the same time they also allow COVER_CLIP_NEAREST.
+ */
+ compute_transformed_extents (&binfo.transform, &dest_box, &transformed);
+ xmin = pixman_fixed_to_int (transformed.x1 - pixman_fixed_1 / 2);
+ ymin = pixman_fixed_to_int (transformed.y1 - pixman_fixed_1 / 2);
+ xmax = pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2);
+ ymax = pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2);
+ /* Note:
+ * The upper limits can be reduced to the following when fetchers
+ * are guaranteed to not access pixels with zero weight. This concerns
+ * particularly all bilinear samplers.
+ *
+ * xmax = pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2 - pixman_fixed_e);
+ * ymax = pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2 - pixman_fixed_e);
+ * This is equivalent to subtracting 0.5 and rounding up, rather than
+ * subtracting 0.5, rounding down and adding 1.
+ */
+ binfo.src_x = -xmin;
+ binfo.src_y = -ymin;
+
+ /* Always over-allocate width by 64 pixels for all src, mask and dst,
+ * so that we can iterate over an x-offset 0..63 in bench ().
+ * This is similar to lowlevel-blt-bench, which uses the same method
+ * to hit different cacheline misalignments.
+ */
+ create_image (xmax - xmin + 64, ymax - ymin + 1, src_format, filter,
+ &src, &binfo.src_image);
+
+ if (mask_format)
+ {
+ create_image (xmax - xmin + 64, ymax - ymin + 1, mask_format, filter,
+ &mask, &binfo.mask_image);
+
+ if ((PIXMAN_FORMAT_R(mask_format) ||
+ PIXMAN_FORMAT_G(mask_format) ||
+ PIXMAN_FORMAT_B(mask_format)))
+ {
+ pixman_image_set_component_alpha (binfo.mask_image, 1);
+ }
+ }
+
+ create_image (WIDTH + 64, HEIGHT, dest_format, filter,
+ &dest, &binfo.dest_image);
+
+ run_benchmark (&binfo);
+
+ return EXIT_SUCCESS;
+}
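
Two details of affine-bench are worth spelling out. First, run_benchmark ()
subtracts calling overhead: it times n full-size composites over at least five
seconds, then times the same n calls with a 1x1 area, and divides the pixel
count by the difference. Second, gettimei () returns microseconds modulo 2^32,
and because the counter is unsigned, t1 - t0 stays correct across wraparound
for any interval shorter than about 71 minutes. A minimal sketch of the same
pattern -- do_work and do_noop are illustrative stand-ins, not pixman API, and
the interval is shortened to 100 ms:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    static uint32_t
    now_usec (void)
    {
        /* Microseconds modulo 2^32; unsigned subtraction of two samples
         * is wraparound-safe for intervals below about 71 minutes. */
        return (uint64_t) clock () * 1000000 / CLOCKS_PER_SEC;
    }

    static volatile double sink;

    static void do_work (void) { int i; for (i = 0; i < 100000; i++) sink += i * 0.5; }
    static void do_noop (void) { sink += 1.0; }

    int
    main (void)
    {
        uint32_t n = 0, t0, work_time, overhead;
        uint32_t i;

        /* Pass 1: run the real operation for at least 100 ms
         * (the benchmark above uses five seconds). */
        t0 = now_usec ();
        do
            do_work ();
        while (++n < UINT32_MAX && now_usec () - t0 < 100000);
        work_time = now_usec () - t0;

        /* Pass 2: same call count, negligible work => calling overhead. */
        t0 = now_usec ();
        for (i = 0; i < n; i++)
            do_noop ();
        overhead = now_usec () - t0;

        printf ("%u calls, %u us total, %u us overhead => %.2f calls/us\n",
                n, work_time, overhead, (double) n / (work_time - overhead));
        return 0;
    }
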
diff --git a/test/affine-test.c b/test/affine-test.c
index 8e19023..f516856 100644
--- a/test/affine-test.c
+++ b/test/affine-test.c
@@ -171,7 +171,7 @@ test_composite (int testnum,
int i = prng_rand_n (2);
int j = prng_rand_n (3);
int bitnum = prng_rand_n (32);
- transform.matrix[i][j] ^= 1 << bitnum;
+ transform.matrix[i][j] ^= 1U << bitnum;
if (prng_rand_n (2))
break;
}
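
The 1U suffix is the whole fix here: transform.matrix[i][j] is flipped at a
random bit position, and when bitnum is 31, the plain int expression
1 << bitnum shifts into the sign bit, which is undefined behaviour in C. A
minimal illustration of the well-defined unsigned form:

    #include <stdint.h>

    /* Flip one bit of a 32-bit word; bitnum must be in [0, 31]. */
    static uint32_t
    flip_bit (uint32_t word, int bitnum)
    {
        return word ^ (1U << bitnum);   /* 1 << 31 as int would be UB */
    }
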
diff --git a/test/alphamap.c b/test/alphamap.c
index 4d09076..150d33e 100644
--- a/test/alphamap.c
+++ b/test/alphamap.c
@@ -10,7 +10,8 @@ static const pixman_format_code_t formats[] =
PIXMAN_a8r8g8b8,
PIXMAN_a2r10g10b10,
PIXMAN_a4r4g4b4,
- PIXMAN_a8
+ PIXMAN_a8,
+ PIXMAN_rgba_float,
};
static const pixman_format_code_t alpha_formats[] =
@@ -18,7 +19,8 @@ static const pixman_format_code_t alpha_formats[] =
PIXMAN_null,
PIXMAN_a8,
PIXMAN_a2r10g10b10,
- PIXMAN_a4r4g4b4
+ PIXMAN_a4r4g4b4,
+ PIXMAN_rgba_float,
};
static const int origins[] =
@@ -41,7 +43,10 @@ make_image (pixman_format_code_t format)
uint8_t bpp = PIXMAN_FORMAT_BPP (format) / 8;
pixman_image_t *image;
- bits = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * bpp);
+ if (format != PIXMAN_rgba_float)
+ bits = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * bpp);
+ else
+ bits = (uint32_t *)make_random_floats (WIDTH * HEIGHT * bpp);
image = pixman_image_create_bits (format, WIDTH, HEIGHT, bits, WIDTH * bpp);
@@ -51,11 +56,11 @@ make_image (pixman_format_code_t format)
return image;
}
-static uint8_t
+static float
get_alpha (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
{
uint8_t *bits;
- uint8_t r;
+ uint32_t r;
if (image->common.alpha_map)
{
@@ -69,7 +74,7 @@ get_alpha (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
}
else
{
- return 0;
+ return 0.f;
}
}
@@ -78,28 +83,32 @@ get_alpha (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
if (image->bits.format == PIXMAN_a8)
{
r = bits[y * WIDTH + x];
+ return r / 255.f;
}
else if (image->bits.format == PIXMAN_a2r10g10b10)
{
r = ((uint32_t *)bits)[y * WIDTH + x] >> 30;
- r |= r << 2;
- r |= r << 4;
+ return r / 3.f;
}
else if (image->bits.format == PIXMAN_a8r8g8b8)
{
r = ((uint32_t *)bits)[y * WIDTH + x] >> 24;
+ return r / 255.f;
}
else if (image->bits.format == PIXMAN_a4r4g4b4)
{
r = ((uint16_t *)bits)[y * WIDTH + x] >> 12;
- r |= r << 4;
+ return r / 15.f;
+ }
+ else if (image->bits.format == PIXMAN_rgba_float)
+ {
+ return ((float *)bits)[y * WIDTH * 4 + x * 4 + 3];
}
else
{
assert (0);
+ return 0.f;
}
-
- return r;
}
static uint16_t
@@ -133,6 +142,11 @@ get_red (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
r |= r << 4;
r |= r << 8;
}
+ else if (image->bits.format == PIXMAN_rgba_float)
+ {
+ double tmp = ((float *)bits)[y * WIDTH * 4 + x * 4];
+ return tmp * 65535.;
+ }
else
{
assert (0);
@@ -141,6 +155,23 @@ get_red (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
return r;
}
+static float get_alpha_err(pixman_format_code_t sf, pixman_format_code_t saf,
+ pixman_format_code_t df, pixman_format_code_t daf)
+{
+ pixman_format_code_t s = saf != PIXMAN_null ? saf : sf;
+ pixman_format_code_t d = daf != PIXMAN_null ? daf : df;
+
+ /* There are cases where we go through the 8 bit compositing
+ * path even with 10bpc and higher formats.
+ */
+ if (PIXMAN_FORMAT_A(s) == PIXMAN_FORMAT_A(d))
+ return 1.f / 255.f;
+ else if (PIXMAN_FORMAT_A(s) > PIXMAN_FORMAT_A(d))
+ return 1.f / ((1 << PIXMAN_FORMAT_A(d)) - 1);
+ else
+ return 1.f / ((1 << PIXMAN_FORMAT_A(s)) - 1);
+}
+
static int
run_test (int s, int d, int sa, int da, int soff, int doff)
{
@@ -151,15 +182,11 @@ run_test (int s, int d, int sa, int da, int soff, int doff)
pixman_image_t *src, *dst, *orig_dst, *alpha, *orig_alpha;
pixman_transform_t t1;
int j, k;
- int n_alpha_bits, n_red_bits;
+ int n_red_bits;
soff = origins[soff];
doff = origins[doff];
- n_alpha_bits = PIXMAN_FORMAT_A (df);
- if (daf != PIXMAN_null)
- n_alpha_bits = PIXMAN_FORMAT_A (daf);
-
n_red_bits = PIXMAN_FORMAT_R (df);
/* Source */
@@ -211,21 +238,25 @@ run_test (int s, int d, int sa, int da, int soff, int doff)
{
for (k = MAX (doff, 0); k < MIN (WIDTH, WIDTH + doff); ++k)
{
- uint8_t sa, da, oda, refa;
+ float sa, da, oda, refa;
uint16_t sr, dr, odr, refr;
+ float err;
+
+ err = get_alpha_err(sf, saf, df, daf);
sa = get_alpha (src, k, j, soff, soff);
da = get_alpha (dst, k, j, doff, doff);
oda = get_alpha (orig_dst, k, j, doff, doff);
- if (sa + oda > 255)
- refa = 255;
+ if (sa + oda > 1.f)
+ refa = 1.f;
else
refa = sa + oda;
- if (da >> (8 - n_alpha_bits) != refa >> (8 - n_alpha_bits))
+ if (da - err > refa ||
+ da + err < refa)
{
- printf ("\nWrong alpha value at (%d, %d). Should be 0x%x; got 0x%x. Source was 0x%x, original dest was 0x%x\n",
+ printf ("\nWrong alpha value at (%d, %d). Should be %g; got %g. Source was %g, original dest was %g\n",
k, j, refa, da, sa, oda);
printf ("src: %s, alpha: %s, origin %d %d\ndst: %s, alpha: %s, origin: %d %d\n\n",
diff --git a/test/check-formats.c b/test/check-formats.c
index 8eb263b..4e2633c 100644
--- a/test/check-formats.c
+++ b/test/check-formats.c
@@ -104,198 +104,6 @@ check_op (pixman_op_t op,
return retval;
}
-static const pixman_op_t op_list[] =
-{
- PIXMAN_OP_CLEAR,
- PIXMAN_OP_SRC,
- PIXMAN_OP_DST,
- PIXMAN_OP_OVER,
- PIXMAN_OP_OVER_REVERSE,
- PIXMAN_OP_IN,
- PIXMAN_OP_IN_REVERSE,
- PIXMAN_OP_OUT,
- PIXMAN_OP_OUT_REVERSE,
- PIXMAN_OP_ATOP,
- PIXMAN_OP_ATOP_REVERSE,
- PIXMAN_OP_XOR,
- PIXMAN_OP_ADD,
- PIXMAN_OP_SATURATE,
-
- PIXMAN_OP_DISJOINT_CLEAR,
- PIXMAN_OP_DISJOINT_SRC,
- PIXMAN_OP_DISJOINT_DST,
- PIXMAN_OP_DISJOINT_OVER,
- PIXMAN_OP_DISJOINT_OVER_REVERSE,
- PIXMAN_OP_DISJOINT_IN,
- PIXMAN_OP_DISJOINT_IN_REVERSE,
- PIXMAN_OP_DISJOINT_OUT,
- PIXMAN_OP_DISJOINT_OUT_REVERSE,
- PIXMAN_OP_DISJOINT_ATOP,
- PIXMAN_OP_DISJOINT_ATOP_REVERSE,
- PIXMAN_OP_DISJOINT_XOR,
-
- PIXMAN_OP_CONJOINT_CLEAR,
- PIXMAN_OP_CONJOINT_SRC,
- PIXMAN_OP_CONJOINT_DST,
- PIXMAN_OP_CONJOINT_OVER,
- PIXMAN_OP_CONJOINT_OVER_REVERSE,
- PIXMAN_OP_CONJOINT_IN,
- PIXMAN_OP_CONJOINT_IN_REVERSE,
- PIXMAN_OP_CONJOINT_OUT,
- PIXMAN_OP_CONJOINT_OUT_REVERSE,
- PIXMAN_OP_CONJOINT_ATOP,
- PIXMAN_OP_CONJOINT_ATOP_REVERSE,
- PIXMAN_OP_CONJOINT_XOR,
-
- PIXMAN_OP_MULTIPLY,
- PIXMAN_OP_SCREEN,
- PIXMAN_OP_OVERLAY,
- PIXMAN_OP_DARKEN,
- PIXMAN_OP_LIGHTEN,
- PIXMAN_OP_COLOR_DODGE,
- PIXMAN_OP_COLOR_BURN,
- PIXMAN_OP_HARD_LIGHT,
- PIXMAN_OP_SOFT_LIGHT,
- PIXMAN_OP_DIFFERENCE,
- PIXMAN_OP_EXCLUSION,
- PIXMAN_OP_HSL_HUE,
- PIXMAN_OP_HSL_SATURATION,
- PIXMAN_OP_HSL_COLOR,
- PIXMAN_OP_HSL_LUMINOSITY
-};
-
-static const pixman_format_code_t format_list[] =
-{
- PIXMAN_a8r8g8b8,
- PIXMAN_x8r8g8b8,
- PIXMAN_a8b8g8r8,
- PIXMAN_x8b8g8r8,
- PIXMAN_b8g8r8a8,
- PIXMAN_b8g8r8x8,
- PIXMAN_r8g8b8a8,
- PIXMAN_r8g8b8x8,
- PIXMAN_x14r6g6b6,
- PIXMAN_x2r10g10b10,
- PIXMAN_a2r10g10b10,
- PIXMAN_x2b10g10r10,
- PIXMAN_a2b10g10r10,
- PIXMAN_a8r8g8b8_sRGB,
- PIXMAN_r8g8b8,
- PIXMAN_b8g8r8,
- PIXMAN_r5g6b5,
- PIXMAN_b5g6r5,
- PIXMAN_a1r5g5b5,
- PIXMAN_x1r5g5b5,
- PIXMAN_a1b5g5r5,
- PIXMAN_x1b5g5r5,
- PIXMAN_a4r4g4b4,
- PIXMAN_x4r4g4b4,
- PIXMAN_a4b4g4r4,
- PIXMAN_x4b4g4r4,
- PIXMAN_a8,
- PIXMAN_r3g3b2,
- PIXMAN_b2g3r3,
- PIXMAN_a2r2g2b2,
- PIXMAN_a2b2g2r2,
- PIXMAN_x4a4,
- PIXMAN_a4,
- PIXMAN_r1g2b1,
- PIXMAN_b1g2r1,
- PIXMAN_a1r1g1b1,
- PIXMAN_a1b1g1r1,
- PIXMAN_a1,
-};
-
-static pixman_format_code_t
-format_from_string (const char *s)
-{
- int i;
-
- for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
- {
- if (strcasecmp (format_name (format_list[i]), s) == 0)
- return format_list[i];
- }
-
- return PIXMAN_null;
-}
-
-static void
-emit (const char *s, int *n_chars)
-{
- *n_chars += printf ("%s,", s);
- if (*n_chars > 60)
- {
- printf ("\n ");
- *n_chars = 0;
- }
- else
- {
- printf (" ");
- (*n_chars)++;
- }
-}
-
-static void
-list_formats (void)
-{
- int n_chars;
- int i;
-
- printf ("Formats:\n ");
-
- n_chars = 0;
- for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
- emit (format_name (format_list[i]), &n_chars);
-
- printf ("\n\n");
-}
-
-static void
-list_operators (void)
-{
- char short_name [128] = { 0 };
- int i, n_chars;
-
- printf ("Operators:\n ");
-
- n_chars = 0;
- for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
- {
- pixman_op_t op = op_list[i];
- int j;
-
- snprintf (short_name, sizeof (short_name) - 1, "%s",
- operator_name (op) + strlen ("PIXMAN_OP_"));
-
- for (j = 0; short_name[j] != '\0'; ++j)
- short_name[j] = tolower (short_name[j]);
-
- emit (short_name, &n_chars);
- }
-
- printf ("\n\n");
-}
-
-static pixman_op_t
-operator_from_string (const char *s)
-{
- char full_name[128] = { 0 };
- int i;
-
- snprintf (full_name, (sizeof full_name) - 1, "PIXMAN_OP_%s", s);
-
- for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
- {
- pixman_op_t op = op_list[i];
-
- if (strcasecmp (operator_name (op), full_name) == 0)
- return op;
- }
-
- return PIXMAN_OP_NONE;
-}
-
int
main (int argc, char **argv)
{
diff --git a/test/composite.c b/test/composite.c
index 594c697..8d95046 100644
--- a/test/composite.c
+++ b/test/composite.c
@@ -92,6 +92,7 @@ static const pixman_format_code_t formats[] =
/* sRGB formats */
PIXMAN_a8r8g8b8_sRGB,
+ PIXMAN_r8g8b8_sRGB,
/* 24 bpp formats */
PIXMAN_r8g8b8,
diff --git a/test/cover-test.c b/test/cover-test.c
new file mode 100644
index 0000000..83e2972
--- /dev/null
+++ b/test/cover-test.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright © 2015 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
+/*
+ * This test aims to verify both numerical correctness and the honouring of
+ * array bounds for scaled plots (both nearest-neighbour and bilinear) at or
+ * close to the boundary conditions for applicability of "cover" type fast paths
+ * and iter fetch routines.
+ *
+ * It has a secondary purpose: by setting the env var EXACT (to any value) it
+ * will only test plots that are exactly on the boundary condition. This makes
+ * it possible to ensure that "cover" routines are being used to the maximum,
+ * although this requires the use of a debugger or code instrumentation to
+ * verify.
+ */
+
+#include "utils.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+/* Approximate limits for random scale factor generation - these ensure we can
+ * get at least 8x reduction and 8x enlargement.
+ */
+#define LOG2_MAX_FACTOR (3)
+
+/* 1/sqrt(2) (or sqrt(0.5), or 2^-0.5) as a 0.32 fixed-point number */
+#define INV_SQRT_2_0POINT32_FIXED (0xB504F334u)
+
+/* The largest increment that can be generated by random_scale_factor().
+ * This occurs when the "mantissa" part is 0xFFFFFFFF and the "exponent"
+ * part is -LOG2_MAX_FACTOR.
+ */
+#define MAX_INC ((pixman_fixed_t) \
+ (INV_SQRT_2_0POINT32_FIXED >> (31 - 16 - LOG2_MAX_FACTOR)))
+
+/* Minimum source width (in pixels) based on a typical page size of 4K and
+ * maximum colour depth of 32bpp.
+ */
+#define MIN_SRC_WIDTH (4096 / 4)
+
+/* Derive the destination width so that at max increment we fit within source */
+#define DST_WIDTH (MIN_SRC_WIDTH * pixman_fixed_1 / MAX_INC)
+
+/* Calculate heights the other way round.
+ * No limits due to page alignment here.
+ */
+#define DST_HEIGHT 3
+#define SRC_HEIGHT ((DST_HEIGHT * MAX_INC + pixman_fixed_1 - 1) / pixman_fixed_1)
+
+/* At the time of writing, all the scaled fast paths use SRC, OVER or ADD
+ * Porter-Duff operators. XOR is included in the list to ensure good
+ * representation of iter scanline fetch routines.
+ */
+static const pixman_op_t op_list[] = {
+ PIXMAN_OP_SRC,
+ PIXMAN_OP_OVER,
+ PIXMAN_OP_ADD,
+ PIXMAN_OP_XOR,
+};
+
+/* At the time of writing, all the scaled fast paths use a8r8g8b8, x8r8g8b8
+ * or r5g6b5, or red-blue swapped versions of the same. When a mask channel is
+ * used, it is always a8 (and so implicitly not component alpha). a1r5g5b5 is
+ * included because it is the only other format to feature in any iters. */
+static const pixman_format_code_t img_fmt_list[] = {
+ PIXMAN_a8r8g8b8,
+ PIXMAN_x8r8g8b8,
+ PIXMAN_r5g6b5,
+ PIXMAN_a1r5g5b5
+};
+
+/* This is a flag reflecting the environment variable EXACT. It can be used
+ * to ensure that source coordinates corresponding exactly to the "cover" limits
+ * are used, rather than any "near misses". This can, for example, be used in
+ * conjunction with a debugger to ensure that only COVER fast paths are used.
+ */
+static int exact;
+
+static pixman_image_t *
+create_src_image (pixman_format_code_t fmt)
+{
+ pixman_image_t *tmp_img, *img;
+
+ /* We need the left-most and right-most MIN_SRC_WIDTH pixels to have
+ * predictable values, even though fence_image_create_bits() may allocate
+ * an image somewhat larger than that, by an amount that varies depending
+ * upon the page size on the current platform. The solution is to create a
+ * temporary non-fenced image that is exactly MIN_SRC_WIDTH wide and blit it
+ * into the fenced image.
+ */
+ tmp_img = pixman_image_create_bits (fmt, MIN_SRC_WIDTH, SRC_HEIGHT,
+ NULL, 0);
+ if (tmp_img == NULL)
+ return NULL;
+
+ img = fence_image_create_bits (fmt, MIN_SRC_WIDTH, SRC_HEIGHT, TRUE);
+ if (img == NULL)
+ {
+ pixman_image_unref (tmp_img);
+ return NULL;
+ }
+
+ prng_randmemset (tmp_img->bits.bits,
+ tmp_img->bits.rowstride * SRC_HEIGHT * sizeof (uint32_t),
+ 0);
+ image_endian_swap (tmp_img);
+
+ pixman_image_composite (PIXMAN_OP_SRC, tmp_img, NULL, img,
+ 0, 0, 0, 0, 0, 0,
+ MIN_SRC_WIDTH, SRC_HEIGHT);
+ pixman_image_composite (PIXMAN_OP_SRC, tmp_img, NULL, img,
+ 0, 0, 0, 0, img->bits.width - MIN_SRC_WIDTH, 0,
+ MIN_SRC_WIDTH, SRC_HEIGHT);
+
+ pixman_image_unref (tmp_img);
+
+ return img;
+}
+
+static pixman_fixed_t
+random_scale_factor(void)
+{
+ /* Get a random number with top bit set. */
+ uint32_t f = prng_rand () | 0x80000000u;
+
+ /* In log(2) space, this is still approximately evenly spread between 31
+ * and 32. Divide by sqrt(2) to centre the distribution on 2^31.
+ */
+ f = ((uint64_t) f * INV_SQRT_2_0POINT32_FIXED) >> 32;
+
+ /* Now shift right (ie divide by an integer power of 2) to spread the
+ * distribution between centres at 2^(16 +/- LOG2_MAX_FACTOR).
+ */
+ f >>= 31 - 16 + prng_rand_n (2 * LOG2_MAX_FACTOR + 1) - LOG2_MAX_FACTOR;
+
+ return f;
+}
+
+static pixman_fixed_t
+calc_translate (int dst_size,
+ int src_size,
+ pixman_fixed_t scale,
+ pixman_bool_t low_align,
+ pixman_bool_t bilinear)
+{
+ pixman_fixed_t ref_src, ref_dst, scaled_dst;
+
+ if (low_align)
+ {
+ ref_src = bilinear ? pixman_fixed_1 / 2 : pixman_fixed_e;
+ ref_dst = pixman_fixed_1 / 2;
+ }
+ else
+ {
+ ref_src = pixman_int_to_fixed (src_size) -
+ bilinear * pixman_fixed_1 / 2;
+ ref_dst = pixman_int_to_fixed (dst_size) - pixman_fixed_1 / 2;
+ }
+
+ scaled_dst = ((uint64_t) ref_dst * scale + pixman_fixed_1 / 2) /
+ pixman_fixed_1;
+
+ /* We need the translation to be set such that when ref_dst is fed through
+ * the transformation matrix, we get ref_src as the result.
+ */
+ return ref_src - scaled_dst;
+}
+
+static pixman_fixed_t
+random_offset (void)
+{
+ pixman_fixed_t offset = 0;
+
+ /* Ensure we test the exact case quite a lot */
+ if (prng_rand_n (2))
+ return offset;
+
+ /* What happens when we are close to the edge of the first
+ * interpolation step?
+ */
+ if (prng_rand_n (2))
+ offset += (pixman_fixed_1 >> BILINEAR_INTERPOLATION_BITS) - 16;
+
+ /* Try fine-grained variations */
+ offset += prng_rand_n (32);
+
+ /* Test in both directions */
+ if (prng_rand_n (2))
+ offset = -offset;
+
+ return offset;
+}
+
+static void
+check_transform (pixman_image_t *dst_img,
+ pixman_image_t *src_img,
+ pixman_transform_t *transform,
+ pixman_bool_t bilinear)
+{
+ pixman_vector_t v1, v2;
+
+ v1.vector[0] = pixman_fixed_1 / 2;
+ v1.vector[1] = pixman_fixed_1 / 2;
+ v1.vector[2] = pixman_fixed_1;
+ assert (pixman_transform_point (transform, &v1));
+
+ v2.vector[0] = pixman_int_to_fixed (dst_img->bits.width) -
+ pixman_fixed_1 / 2;
+ v2.vector[1] = pixman_int_to_fixed (dst_img->bits.height) -
+ pixman_fixed_1 / 2;
+ v2.vector[2] = pixman_fixed_1;
+ assert (pixman_transform_point (transform, &v2));
+
+ if (bilinear)
+ {
+ assert (v1.vector[0] >= pixman_fixed_1 / 2);
+ assert (v1.vector[1] >= pixman_fixed_1 / 2);
+ assert (v2.vector[0] <= pixman_int_to_fixed (src_img->bits.width) -
+ pixman_fixed_1 / 2);
+ assert (v2.vector[1] <= pixman_int_to_fixed (src_img->bits.height) -
+ pixman_fixed_1 / 2);
+ }
+ else
+ {
+ assert (v1.vector[0] >= pixman_fixed_e);
+ assert (v1.vector[1] >= pixman_fixed_e);
+ assert (v2.vector[0] <= pixman_int_to_fixed (src_img->bits.width));
+ assert (v2.vector[1] <= pixman_int_to_fixed (src_img->bits.height));
+ }
+}
+
+static uint32_t
+test_cover (int testnum, int verbose)
+{
+ pixman_fixed_t x_scale, y_scale;
+ pixman_bool_t left_align, top_align;
+ pixman_bool_t bilinear;
+ pixman_filter_t filter;
+ pixman_op_t op;
+ size_t src_fmt_index;
+ pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
+ pixman_image_t *src_img, *dst_img, *mask_img;
+ pixman_transform_t src_transform, mask_transform;
+ pixman_fixed_t fuzz[4];
+ uint32_t crc32;
+
+ /* We allocate one fenced image for each pixel format up-front. This is to
+ * avoid spending a lot of time on memory management rather than on testing
+ * Pixman optimisations. We need one per thread because the transformation
+ * matrices and filtering are properties of the source and mask images.
+ */
+ static pixman_image_t *src_imgs[ARRAY_LENGTH (img_fmt_list)];
+ static pixman_image_t *mask_bits_img;
+ static pixman_bool_t fence_images_created;
+#ifdef USE_OPENMP
+#pragma omp threadprivate (src_imgs)
+#pragma omp threadprivate (mask_bits_img)
+#pragma omp threadprivate (fence_images_created)
+#endif
+
+ if (!fence_images_created)
+ {
+ int i;
+
+ prng_srand (0);
+
+ for (i = 0; i < ARRAY_LENGTH (img_fmt_list); i++)
+ src_imgs[i] = create_src_image (img_fmt_list[i]);
+
+ mask_bits_img = create_src_image (PIXMAN_a8);
+
+ fence_images_created = TRUE;
+ }
+
+ prng_srand (testnum);
+
+ x_scale = random_scale_factor ();
+ y_scale = random_scale_factor ();
+ left_align = prng_rand_n (2);
+ top_align = prng_rand_n (2);
+ bilinear = prng_rand_n (2);
+ filter = bilinear ? PIXMAN_FILTER_BILINEAR : PIXMAN_FILTER_NEAREST;
+
+ op = op_list[prng_rand_n (ARRAY_LENGTH (op_list))];
+
+ dst_fmt = img_fmt_list[prng_rand_n (ARRAY_LENGTH (img_fmt_list))];
+ dst_img = pixman_image_create_bits (dst_fmt, DST_WIDTH, DST_HEIGHT,
+ NULL, 0);
+ prng_randmemset (dst_img->bits.bits,
+ dst_img->bits.rowstride * DST_HEIGHT * sizeof (uint32_t),
+ 0);
+ image_endian_swap (dst_img);
+
+ src_fmt_index = prng_rand_n (ARRAY_LENGTH (img_fmt_list));
+ src_fmt = img_fmt_list[src_fmt_index];
+ src_img = src_imgs[src_fmt_index];
+ pixman_image_set_filter (src_img, filter, NULL, 0);
+ pixman_transform_init_scale (&src_transform, x_scale, y_scale);
+ src_transform.matrix[0][2] = calc_translate (dst_img->bits.width,
+ src_img->bits.width,
+ x_scale, left_align, bilinear);
+ src_transform.matrix[1][2] = calc_translate (dst_img->bits.height,
+ src_img->bits.height,
+ y_scale, top_align, bilinear);
+
+ if (prng_rand_n (2))
+ {
+ /* No mask */
+ mask_fmt = PIXMAN_null;
+ mask_img = NULL;
+ }
+ else if (prng_rand_n (2))
+ {
+ /* a8 bitmap mask */
+ mask_fmt = PIXMAN_a8;
+ mask_img = mask_bits_img;
+ pixman_image_set_filter (mask_img, filter, NULL, 0);
+ pixman_transform_init_scale (&mask_transform, x_scale, y_scale);
+ mask_transform.matrix[0][2] = calc_translate (dst_img->bits.width,
+ mask_img->bits.width,
+ x_scale, left_align,
+ bilinear);
+ mask_transform.matrix[1][2] = calc_translate (dst_img->bits.height,
+ mask_img->bits.height,
+ y_scale, top_align,
+ bilinear);
+ }
+ else
+ {
+ /* Solid mask */
+ pixman_color_t color;
+ memset (&color, 0xAA, sizeof color);
+ mask_fmt = PIXMAN_solid;
+ mask_img = pixman_image_create_solid_fill (&color);
+ }
+
+ if (!exact)
+ {
+ int i = 0;
+
+ while (i < 4)
+ fuzz[i++] = random_offset ();
+
+ src_transform.matrix[0][2] += fuzz[0];
+ src_transform.matrix[1][2] += fuzz[1];
+ mask_transform.matrix[0][2] += fuzz[2];
+ mask_transform.matrix[1][2] += fuzz[3];
+ }
+
+ pixman_image_set_transform (src_img, &src_transform);
+ if (mask_fmt == PIXMAN_a8)
+ pixman_image_set_transform (mask_img, &mask_transform);
+
+ if (verbose)
+ {
+ printf ("op=%s\n", operator_name (op));
+ printf ("src_fmt=%s, dst_fmt=%s, mask_fmt=%s\n",
+ format_name (src_fmt), format_name (dst_fmt),
+ format_name (mask_fmt));
+ printf ("x_scale=0x%08X, y_scale=0x%08X, align %s/%s, %s\n",
+ x_scale, y_scale,
+ left_align ? "left" : "right", top_align ? "top" : "bottom",
+ bilinear ? "bilinear" : "nearest");
+
+ if (!exact)
+ {
+ int i = 0;
+
+ printf ("fuzz factors");
+ while (i < 4)
+ printf (" %d", fuzz[i++]);
+ printf ("\n");
+ }
+ }
+
+ if (exact)
+ {
+ check_transform (dst_img, src_img, &src_transform, bilinear);
+ if (mask_fmt == PIXMAN_a8)
+ check_transform (dst_img, mask_img, &mask_transform, bilinear);
+ }
+
+ pixman_image_composite (op, src_img, mask_img, dst_img,
+ 0, 0, 0, 0, 0, 0,
+ dst_img->bits.width, dst_img->bits.height);
+
+ if (verbose)
+ print_image (dst_img);
+
+ crc32 = compute_crc32_for_image (0, dst_img);
+
+ pixman_image_unref (dst_img);
+ if (mask_fmt == PIXMAN_solid)
+ pixman_image_unref (mask_img);
+
+ return crc32;
+}
+
+#if BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM_FUZZ 0x6B56F607
+#define CHECKSUM_EXACT 0xA669F4A3
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM_FUZZ 0x83119ED0
+#define CHECKSUM_EXACT 0x0D3382CD
+#else
+#define CHECKSUM_FUZZ 0x00000000
+#define CHECKSUM_EXACT 0x00000000
+#endif
+
+int
+main (int argc, const char *argv[])
+{
+ unsigned long page_size;
+
+ page_size = fence_get_page_size ();
+ if (page_size == 0 || page_size > 16 * 1024)
+ return 77; /* automake SKIP */
+
+ exact = getenv ("EXACT") != NULL;
+ if (exact)
+ printf ("Doing plots that are exactly aligned to boundaries\n");
+
+ return fuzzer_test_main ("cover", 2000000,
+ exact ? CHECKSUM_EXACT : CHECKSUM_FUZZ,
+ test_cover, argc, argv);
+}
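
The alignment arithmetic in calc_translate () encodes the bilinear COVER
condition: every destination sample centre, pushed through the transform, must
land within [0.5, src_size - 0.5] source pixels (for NEAREST the code above
uses pixman_fixed_e and src_size as the limits instead). A small sketch of the
low-aligned bilinear case in plain doubles rather than 16.16 fixed point --
the scale and sizes are illustrative:

    #include <assert.h>
    #include <stdio.h>

    int
    main (void)
    {
        double scale    = 0.75;   /* source pixels per destination pixel */
        int    dst_size = 100;
        int    src_size = 80;

        /* Low alignment: make the first destination sample centre (0.5)
         * map to exactly 0.5 in source space. */
        double translate = 0.5 - 0.5 * scale;

        double first = 0.5 * scale + translate;
        double last  = (dst_size - 0.5) * scale + translate;

        printf ("first sample -> %.4f, last sample -> %.4f\n", first, last);

        assert (first >= 0.5);             /* exactly on the low limit */
        assert (last <= src_size - 0.5);   /* inside on the high side */
        return 0;
    }
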
diff --git a/test/fence-image-self-test.c b/test/fence-image-self-test.c
new file mode 100644
index 0000000..2eb82ce
--- /dev/null
+++ b/test/fence-image-self-test.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright © 2015 Raspberry Pi Foundation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <pixman-config.h>
+#endif
+
+#include "utils.h"
+
+
+#if FENCE_MALLOC_ACTIVE && defined (HAVE_SIGACTION)
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+pixman_bool_t verbose;
+
+static void
+segv_handler (int sig, siginfo_t *si, void *unused)
+{
+ _exit (EXIT_SUCCESS);
+}
+
+static void
+die (const char *msg, int err)
+{
+ if (err)
+ perror (msg);
+ else
+ fprintf (stderr, "%s\n", msg);
+
+ abort ();
+}
+
+static void
+prinfo (const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!verbose)
+ return;
+
+ va_start (ap, fmt);
+ vfprintf (stderr, fmt, ap);
+ va_end (ap);
+}
+
+static void
+do_expect_signal (void (*fn)(void *), void *data)
+{
+ struct sigaction sa;
+
+ sa.sa_flags = SA_SIGINFO;
+ sigemptyset (&sa.sa_mask);
+ sa.sa_sigaction = segv_handler;
+ if (sigaction (SIGSEGV, &sa, NULL) == -1)
+ die ("sigaction failed", errno);
+ if (sigaction (SIGBUS, &sa, NULL) == -1)
+ die ("sigaction failed", errno);
+
+ (*fn)(data);
+
+ _exit (EXIT_FAILURE);
+}
+
+/* Check that calling fn(data) causes a segmentation fault.
+ *
+ * You cannot portably return from a SIGSEGV handler in any way,
+ * so we fork, and do the test in the child process. Child's
+ * exit status will reflect the result. Its SEGV handler causes it
+ * to exit with success; if no signal arrives, it exits with failure.
+ */
+static pixman_bool_t
+expect_signal (void (*fn)(void *), void *data)
+{
+ pid_t pid, wp;
+ int status;
+
+ pid = fork ();
+ if (pid == -1)
+ die ("fork failed", errno);
+
+ if (pid == 0)
+ do_expect_signal (fn, data); /* never returns */
+
+ wp = waitpid (pid, &status, 0);
+ if (wp != pid)
+ die ("waitpid did not work", wp == -1 ? errno : 0);
+
+ if (WIFEXITED (status) && WEXITSTATUS (status) == EXIT_SUCCESS)
+ return TRUE;
+
+ return FALSE;
+}
+
+static void
+read_u8 (void *data)
+{
+ volatile uint8_t *p = data;
+
+ *p;
+}
+
+static pixman_bool_t
+test_read_fault (uint8_t *p, int offset)
+{
+ prinfo ("*(uint8_t *)(%p + %d)", p, offset);
+
+ if (expect_signal (read_u8, p + offset))
+ {
+ prinfo ("\tsignal OK\n");
+
+ return TRUE;
+ }
+
+ prinfo ("\tFAILED\n");
+
+ return FALSE;
+}
+
+static void
+test_read_ok (uint8_t *p, int offset)
+{
+ prinfo ("*(uint8_t *)(%p + %d)", p, offset);
+
+ /* If fails, SEGV. */
+ read_u8 (p + offset);
+
+ prinfo ("\tOK\n");
+}
+
+static pixman_bool_t
+test_read_faults (pixman_image_t *image)
+{
+ pixman_bool_t ok = TRUE;
+ pixman_format_code_t format = pixman_image_get_format (image);
+ int width = pixman_image_get_width (image);
+ int height = pixman_image_get_height (image);
+ int stride = pixman_image_get_stride (image);
+ uint8_t *p = (void *)pixman_image_get_data (image);
+ int row_bytes = width * PIXMAN_FORMAT_BPP (format) / 8;
+
+ prinfo ("%s %dx%d, row %d B, stride %d B:\n",
+ format_name (format), width, height, row_bytes, stride);
+
+ assert (height > 3);
+
+ test_read_ok (p, 0);
+ test_read_ok (p, row_bytes - 1);
+ test_read_ok (p, stride);
+ test_read_ok (p, stride + row_bytes - 1);
+ test_read_ok (p, 2 * stride);
+ test_read_ok (p, 2 * stride + row_bytes - 1);
+ test_read_ok (p, 3 * stride);
+ test_read_ok (p, (height - 1) * stride + row_bytes - 1);
+
+ ok &= test_read_fault (p, -1);
+ ok &= test_read_fault (p, row_bytes);
+ ok &= test_read_fault (p, stride - 1);
+ ok &= test_read_fault (p, stride + row_bytes);
+ ok &= test_read_fault (p, 2 * stride - 1);
+ ok &= test_read_fault (p, 2 * stride + row_bytes);
+ ok &= test_read_fault (p, 3 * stride - 1);
+ ok &= test_read_fault (p, height * stride);
+
+ return ok;
+}
+
+static pixman_bool_t
+test_image_faults (pixman_format_code_t format, int min_width, int height)
+{
+ pixman_bool_t ok;
+ pixman_image_t *image;
+
+ image = fence_image_create_bits (format, min_width, height, TRUE);
+ ok = test_read_faults (image);
+ pixman_image_unref (image);
+
+ return ok;
+}
+
+int
+main (int argc, char **argv)
+{
+ pixman_bool_t ok = TRUE;
+
+ if (getenv ("VERBOSE") != NULL)
+ verbose = TRUE;
+
+ ok &= test_image_faults (PIXMAN_a8r8g8b8, 7, 5);
+ ok &= test_image_faults (PIXMAN_r8g8b8, 7, 5);
+ ok &= test_image_faults (PIXMAN_r5g6b5, 7, 5);
+ ok &= test_image_faults (PIXMAN_a8, 7, 5);
+ ok &= test_image_faults (PIXMAN_a4, 7, 5);
+ ok &= test_image_faults (PIXMAN_a1, 7, 5);
+
+ if (ok)
+ return EXIT_SUCCESS;
+
+ return EXIT_FAILURE;
+}
+
+#else /* FENCE_MALLOC_ACTIVE */
+
+int
+main (int argc, char **argv)
+{
+ /* Automake return code for test SKIP. */
+ return 77;
+}
+
+#endif /* FENCE_MALLOC_ACTIVE */
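
fence_image_create_bits () itself is not part of this diff (it lives in
test/utils.c), but the self-test above only makes sense given the usual
guard-page technique: inaccessible pages on both sides of the pixel buffer
turn any out-of-bounds access into the SIGSEGV/SIGBUS that expect_signal ()
looks for. A standalone sketch of that technique, assuming a POSIX
mmap/mprotect environment; this is an illustration of the idea, not the actual
utils.c implementation:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int
    main (void)
    {
        long page = sysconf (_SC_PAGESIZE);
        size_t len = 3 * page;              /* guard + payload + guard */
        unsigned char *base, *payload;

        /* MAP_ANONYMOUS may need a feature-test macro on some systems. */
        base = mmap (NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED)
            return 1;

        /* Revoke all access to the first and the last page. */
        mprotect (base, page, PROT_NONE);
        mprotect (base + 2 * page, page, PROT_NONE);

        payload = base + page;
        memset (payload, 0xCC, page);       /* in bounds: fine */
        printf ("payload at %p is usable\n", (void *) payload);

        /* payload[-1] or payload[page] would fault right here. */

        munmap (base, len);
        return 0;
    }
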
diff --git a/test/filter-reduction-test.c b/test/filter-reduction-test.c
new file mode 100644
index 0000000..705fa4b
--- /dev/null
+++ b/test/filter-reduction-test.c
@@ -0,0 +1,112 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+static const pixman_fixed_t entries[] =
+{
+ pixman_double_to_fixed (-1.0),
+ pixman_double_to_fixed (-0.5),
+ pixman_double_to_fixed (-1/3.0),
+ pixman_double_to_fixed (0.0),
+ pixman_double_to_fixed (0.5),
+ pixman_double_to_fixed (1.0),
+ pixman_double_to_fixed (1.5),
+ pixman_double_to_fixed (2.0),
+ pixman_double_to_fixed (3.0),
+};
+
+#define SIZE 12
+
+static uint32_t
+test_scale (const pixman_transform_t *xform, uint32_t crc)
+{
+ uint32_t *srcbuf, *dstbuf;
+ pixman_image_t *src, *dest;
+
+ srcbuf = malloc (SIZE * SIZE * 4);
+ prng_randmemset (srcbuf, SIZE * SIZE * 4, 0);
+ src = pixman_image_create_bits (
+ PIXMAN_a8r8g8b8, SIZE, SIZE, srcbuf, SIZE * 4);
+
+ dstbuf = malloc (SIZE * SIZE * 4);
+ prng_randmemset (dstbuf, SIZE * SIZE * 4, 0);
+ dest = pixman_image_create_bits (
+ PIXMAN_a8r8g8b8, SIZE, SIZE, dstbuf, SIZE * 4);
+
+ pixman_image_set_transform (src, xform);
+ pixman_image_set_repeat (src, PIXMAN_REPEAT_NORMAL);
+ pixman_image_set_filter (src, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+ image_endian_swap (src);
+ image_endian_swap (dest);
+
+ pixman_image_composite (PIXMAN_OP_SRC,
+ src, NULL, dest,
+ 0, 0, 0, 0, 0, 0,
+ SIZE, SIZE);
+
+ crc = compute_crc32_for_image (crc, dest);
+
+ pixman_image_unref (src);
+ pixman_image_unref (dest);
+
+ free (srcbuf);
+ free (dstbuf);
+
+ return crc;
+}
+
+#if BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0x02169677
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM 0xE44B29AC
+#else
+#define CHECKSUM 0x00000000
+#endif
+
+int
+main (int argc, const char *argv[])
+{
+ const pixman_fixed_t *end = entries + ARRAY_LENGTH (entries);
+ const pixman_fixed_t *t0, *t1, *t2, *t3, *t4, *t5;
+ uint32_t crc = 0;
+
+ prng_srand (0x56EA1DBD);
+
+ for (t0 = entries; t0 < end; ++t0)
+ {
+ for (t1 = entries; t1 < end; ++t1)
+ {
+ for (t2 = entries; t2 < end; ++t2)
+ {
+ for (t3 = entries; t3 < end; ++t3)
+ {
+ for (t4 = entries; t4 < end; ++t4)
+ {
+ for (t5 = entries; t5 < end; ++t5)
+ {
+ pixman_transform_t xform = {
+ { { *t0, *t1, *t2 },
+ { *t3, *t4, *t5 },
+ { 0, 0, pixman_fixed_1 } }
+ };
+
+ crc = test_scale (&xform, crc);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (crc != CHECKSUM)
+ {
+ printf ("filter-reduction-test failed! (checksum=0x%08X, expected 0x%08X)\n", crc, CHECKSUM);
+ return 1;
+ }
+ else
+ {
+ printf ("filter-reduction-test passed (checksum=0x%08X)\n", crc);
+ return 0;
+ }
+}
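
The two expected checksums exist because pixman quantises bilinear weights to
BILINEAR_INTERPOLATION_BITS bits, so a 7-bit build and a 4-bit build compute
different (but each internally consistent) pixels from identical inputs. A
one-dimensional sketch of that quantisation -- lerp_channel is illustrative,
not pixman's actual fetcher:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t
    lerp_channel (uint8_t a, uint8_t b, uint32_t frac16, int interp_bits)
    {
        /* Reduce the 16-bit fractional position to interp_bits of weight. */
        uint32_t w   = frac16 >> (16 - interp_bits);
        uint32_t max = 1u << interp_bits;

        return (a * (max - w) + b * w) >> interp_bits;
    }

    int
    main (void)
    {
        /* Same sample position, different weight precision => different
         * result, hence one reference CRC per precision. */
        printf ("7-bit: %u\n", lerp_channel (10, 200, 0x3333, 7));  /* 47 */
        printf ("4-bit: %u\n", lerp_channel (10, 200, 0x3333, 4));  /* 45 */
        return 0;
    }
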
diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
index 3da094a..7ba2986 100644
--- a/test/lowlevel-blt-bench.c
+++ b/test/lowlevel-blt-bench.c
@@ -55,7 +55,7 @@ uint32_t *dst;
uint32_t *src;
uint32_t *mask;
-double bandwidth = 0;
+double bandwidth = 0.0;
double
bench_memcpy ()
@@ -90,6 +90,7 @@ bench_memcpy ()
static pixman_bool_t use_scaling = FALSE;
static pixman_filter_t filter = PIXMAN_FILTER_NEAREST;
+static pixman_bool_t use_csv_output = FALSE;
/* nearly 1x scale factor */
static pixman_transform_t m =
@@ -165,7 +166,7 @@ call_func (pixman_composite_func_t func,
func (0, &info);
}
-void
+double
noinline
bench_L (pixman_op_t op,
pixman_image_t * src_img,
@@ -179,7 +180,6 @@ bench_L (pixman_op_t op,
int64_t i, j, k;
int x = 0;
int q = 0;
- volatile int qx;
for (i = 0; i < n; i++)
{
@@ -203,10 +203,11 @@ bench_L (pixman_op_t op,
x = 0;
call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 63 - x, 0, width, lines_count);
}
- qx = q;
+
+ return (double)n * lines_count * width;
}
-void
+double
noinline
bench_M (pixman_op_t op,
pixman_image_t * src_img,
@@ -224,6 +225,8 @@ bench_M (pixman_op_t op,
x = 0;
call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 1, 0, WIDTH - 64, HEIGHT);
}
+
+ return (double)n * (WIDTH - 64) * HEIGHT;
}
double
@@ -366,15 +369,24 @@ bench_RT (pixman_op_t op,
return pix_cnt;
}
+static double
+Mpx_per_sec (double pix_cnt, double t1, double t2, double t3)
+{
+ double overhead = t2 - t1;
+ double testtime = t3 - t2;
+
+ return pix_cnt / (testtime - overhead) / 1e6;
+}
+
void
-bench_composite (char * testname,
- int src_fmt,
- int src_flags,
- int op,
- int mask_fmt,
- int mask_flags,
- int dst_fmt,
- double npix)
+bench_composite (const char *testname,
+ int src_fmt,
+ int src_flags,
+ int op,
+ int mask_fmt,
+ int mask_flags,
+ int dst_fmt,
+ double npix)
{
pixman_image_t * src_img;
pixman_image_t * dst_img;
@@ -461,9 +473,9 @@ bench_composite (char * testname,
dst,
XWIDTH * 4);
-
- printf ("%24s %c", testname, func != pixman_image_composite_wrapper ?
- '-' : '=');
+ if (!use_csv_output)
+ printf ("%24s %c", testname, func != pixman_image_composite_wrapper ?
+ '-' : '=');
memcpy (dst, src, BUFSIZE);
memcpy (src, dst, BUFSIZE);
@@ -476,13 +488,15 @@ bench_composite (char * testname,
n = 1 + npix / (l1test_width * 8);
t1 = gettime ();
#if EXCLUDE_OVERHEAD
- bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, 1);
+ pix_cnt = bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, 1);
#endif
t2 = gettime ();
- bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, 1);
+ pix_cnt = bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, 1);
t3 = gettime ();
- printf (" L1:%7.2f", (double)n * l1test_width * 1 /
- ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" L1:%7.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -495,13 +509,15 @@ bench_composite (char * testname,
n = 1 + npix / (l1test_width * nlines);
t1 = gettime ();
#if EXCLUDE_OVERHEAD
- bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, nlines);
+ pix_cnt = bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, nlines);
#endif
t2 = gettime ();
- bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, nlines);
+ pix_cnt = bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, nlines);
t3 = gettime ();
- printf (" L2:%7.2f", (double)n * l1test_width * nlines /
- ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" L2:%7.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -510,14 +526,16 @@ bench_composite (char * testname,
n = 1 + npix / (WIDTH * HEIGHT);
t1 = gettime ();
#if EXCLUDE_OVERHEAD
- bench_M (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+ pix_cnt = bench_M (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
#endif
t2 = gettime ();
- bench_M (op, src_img, mask_img, dst_img, n, func);
+ pix_cnt = bench_M (op, src_img, mask_img, dst_img, n, func);
t3 = gettime ();
- printf (" M:%6.2f (%6.2f%%)",
- ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1))) / 1000000.,
- ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1)) * bytes_per_pix) * (100.0 / bandwidth) );
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" M:%6.2f (%6.2f%%)", Mpx_per_sec (pix_cnt, t1, t2, t3),
+ (pix_cnt / ((t3 - t2) - (t2 - t1)) * bytes_per_pix) * (100.0 / bandwidth) );
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -531,7 +549,10 @@ bench_composite (char * testname,
t2 = gettime ();
pix_cnt = bench_HT (op, src_img, mask_img, dst_img, n, func);
t3 = gettime ();
- printf (" HT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" HT:%6.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -545,7 +566,10 @@ bench_composite (char * testname,
t2 = gettime ();
pix_cnt = bench_VT (op, src_img, mask_img, dst_img, n, func);
t3 = gettime ();
- printf (" VT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" VT:%6.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -559,7 +583,10 @@ bench_composite (char * testname,
t2 = gettime ();
pix_cnt = bench_R (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
t3 = gettime ();
- printf (" R:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" R:%6.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -573,7 +600,10 @@ bench_composite (char * testname,
t2 = gettime ();
pix_cnt = bench_RT (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
t3 = gettime ();
- printf (" RT:%6.2f (%4.0fKops/s)\n", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000., (double) n / ((t3 - t2) * 1000));
+ if (use_csv_output)
+ printf ("%g\n", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" RT:%6.2f (%4.0fKops/s)\n", Mpx_per_sec (pix_cnt, t1, t2, t3), (double) n / ((t3 - t2) * 1000));
if (mask_img) {
pixman_image_unref (mask_img);
@@ -587,17 +617,20 @@ bench_composite (char * testname,
#define PIXMAN_OP_OUT_REV (PIXMAN_OP_OUT_REVERSE)
-struct
+struct test_entry
{
- char *testname;
- int src_fmt;
- int src_flags;
- int op;
- int mask_fmt;
- int mask_flags;
- int dst_fmt;
-}
-tests_tbl[] =
+ const char *testname;
+ int src_fmt;
+ int src_flags;
+ int op;
+ int mask_fmt;
+ int mask_flags;
+ int dst_fmt;
+};
+
+typedef struct test_entry test_entry_t;
+
+static const test_entry_t tests_tbl[] =
{
{ "add_8_8_8", PIXMAN_a8, 0, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 },
{ "add_n_8_8", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 },
@@ -719,46 +752,286 @@ tests_tbl[] =
{ "rpixbuf", PIXMAN_x8b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, 0, PIXMAN_a8b8g8r8 },
};
-int
-main (int argc, char *argv[])
+static const test_entry_t special_patterns[] =
+{
+ { "add_n_2x10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_x2r10g10b10 },
+ { "add_n_2a10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a2r10g10b10 },
+ { "src_n_2x10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_x2r10g10b10 },
+ { "src_n_2a10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_a2r10g10b10 },
+ { "src_0888_8888_rev", PIXMAN_b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_x8r8g8b8 },
+ { "src_0888_0565_rev", PIXMAN_b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_r5g6b5 },
+ { "src_n_8", PIXMAN_a8, 1, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_a8 },
+ { "pixbuf", PIXMAN_x8b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, 0, PIXMAN_a8r8g8b8 },
+ { "rpixbuf", PIXMAN_x8b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, 0, PIXMAN_a8b8g8r8 },
+};
+
+/* Returns the sub-string's end pointer in string. */
+static const char *
+copy_sub_string (char *buf,
+ const char *string,
+ const char *scan_from,
+ const char *end)
{
- double x;
+ const char *delim;
+ size_t n;
+
+ delim = strchr (scan_from, '_');
+ if (!delim)
+ delim = end;
+
+ n = delim - string;
+ strncpy (buf, string, n);
+ buf[n] = '\0';
+
+ return delim;
+}
+
+static pixman_op_t
+parse_longest_operator (char *buf, const char **strp, const char *end)
+{
+ const char *p = *strp;
+ const char *sub_end;
+ const char *best_end = p;
+ pixman_op_t best_op = PIXMAN_OP_NONE;
+ pixman_op_t op;
+
+ while (p < end)
+ {
+ sub_end = copy_sub_string (buf, *strp, p, end);
+ op = operator_from_string (buf);
+ p = sub_end + 1;
+
+ if (op != PIXMAN_OP_NONE)
+ {
+ best_end = p;
+ best_op = op;
+ }
+ }
+
+ *strp = best_end;
+ return best_op;
+}
+
+static pixman_format_code_t
+parse_format (char *buf, const char **p, const char *end)
+{
+ pixman_format_code_t format;
+ const char *delim;
+
+ if (*p >= end)
+ return PIXMAN_null;
+
+ delim = copy_sub_string (buf, *p, *p, end);
+ format = format_from_string (buf);
+
+ if (format != PIXMAN_null)
+ *p = delim + 1;
+
+ return format;
+}
+
+static int
+parse_test_pattern (test_entry_t *test, const char *pattern)
+{
+ const char *p = pattern;
+ const char *end = pattern + strlen (pattern);
+ char buf[1024];
+ pixman_format_code_t format[3];
int i;
- const char *pattern = NULL;
- for (i = 1; i < argc; i++)
+
+ if (strlen (pattern) > sizeof (buf) - 1)
+ return -1;
+
+ /* Special-case names that the generic parser cannot handle. */
+ for (i = 0; i < ARRAY_LENGTH (special_patterns); i++)
{
- if (argv[i][0] == '-')
- {
- if (strchr (argv[i] + 1, 'b'))
- {
- use_scaling = TRUE;
- filter = PIXMAN_FILTER_BILINEAR;
- }
- else if (strchr (argv[i] + 1, 'n'))
- {
- use_scaling = TRUE;
- filter = PIXMAN_FILTER_NEAREST;
- }
- }
- else
- {
- pattern = argv[i];
- }
+ if (strcmp (pattern, special_patterns[i].testname) == 0)
+ {
+ *test = special_patterns[i];
+ return 0;
+ }
}
- if (!pattern)
+ test->testname = pattern;
+
+ /* Extract the operator. Operator names may themselves contain
+ * the '_' delimiter, so take the longest matching sub-string.
+ */
+ test->op = parse_longest_operator (buf, &p, end);
+ if (test->op == PIXMAN_OP_NONE)
+ return -1;
+
+ /* extract up to three pixel formats */
+ format[0] = parse_format (buf, &p, end);
+ format[1] = parse_format (buf, &p, end);
+ format[2] = parse_format (buf, &p, end);
+
+ if (format[0] == PIXMAN_null || format[1] == PIXMAN_null)
+ return -1;
+
+ /* recognize CA flag */
+ test->mask_flags = 0;
+ if (p < end)
{
- printf ("Usage: lowlevel-blt-bench [-b] [-n] pattern\n");
- printf (" -n : benchmark nearest scaling\n");
- printf (" -b : benchmark bilinear scaling\n");
- return 1;
+ if (strcmp (p, "ca") == 0)
+ test->mask_flags |= CA_FLAG;
+ else
+ return -1; /* trailing garbage */
}
- src = aligned_malloc (4096, BUFSIZE * 3);
- memset (src, 0xCC, BUFSIZE * 3);
- dst = src + (BUFSIZE / 4);
- mask = dst + (BUFSIZE / 4);
+ test->src_fmt = format[0];
+ if (format[2] == PIXMAN_null)
+ {
+ test->mask_fmt = PIXMAN_null;
+ test->dst_fmt = format[1];
+ }
+ else
+ {
+ test->mask_fmt = format[1];
+ test->dst_fmt = format[2];
+ }
+
+ test->src_flags = 0;
+ if (test->src_fmt == PIXMAN_solid)
+ {
+ test->src_fmt = PIXMAN_a8r8g8b8;
+ test->src_flags |= SOLID_FLAG;
+ }
+
+ if (test->mask_fmt == PIXMAN_solid)
+ {
+ if (test->mask_flags & CA_FLAG)
+ test->mask_fmt = PIXMAN_a8r8g8b8;
+ else
+ test->mask_fmt = PIXMAN_a8;
+
+ test->mask_flags |= SOLID_FLAG;
+ }
+
+ return 0;
+}
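
A hypothetical driver for the parser, using the same names that tests_tbl declares (parser_self_test () below verifies this mapping for every table entry):

/* Illustrative only: "add_8_8_8" resolves to PIXMAN_OP_ADD with
 * a8 source, mask and destination, matching its tests_tbl row.
 */
static void
demo_parse (void)
{
    test_entry_t t;

    if (parse_test_pattern (&t, "add_8_8_8") == 0)
        print_test_details (&t);
}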
+
+static int
+check_int (int got, int expected, const char *name, const char *field)
+{
+ if (got == expected)
+ return 0;
+
+ printf ("%s: %s failure: expected %d, got %d.\n",
+ name, field, expected, got);
+
+ return 1;
+}
+
+static int
+check_format (int got, int expected, const char *name, const char *field)
+{
+ if (got == expected)
+ return 0;
+
+ printf ("%s: %s failure: expected %s (%#x), got %s (%#x).\n",
+ name, field,
+ format_name (expected), expected,
+ format_name (got), got);
+
+ return 1;
+}
+
+static void
+parser_self_test (void)
+{
+ const test_entry_t *ent;
+ test_entry_t test;
+ int fails = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (tests_tbl); i++)
+ {
+ ent = &tests_tbl[i];
+
+ if (parse_test_pattern (&test, ent->testname) < 0)
+ {
+ printf ("parsing failed for '%s'\n", ent->testname);
+ fails++;
+ continue;
+ }
+
+ fails += check_format (test.src_fmt, ent->src_fmt,
+ ent->testname, "src_fmt");
+ fails += check_format (test.mask_fmt, ent->mask_fmt,
+ ent->testname, "mask_fmt");
+ fails += check_format (test.dst_fmt, ent->dst_fmt,
+ ent->testname, "dst_fmt");
+ fails += check_int (test.src_flags, ent->src_flags,
+ ent->testname, "src_flags");
+ fails += check_int (test.mask_flags, ent->mask_flags,
+ ent->testname, "mask_flags");
+ fails += check_int (test.op, ent->op, ent->testname, "op");
+ }
+
+ if (fails)
+ {
+ printf ("Parser self-test failed.\n");
+ exit (EXIT_FAILURE);
+ }
+
+ if (!use_csv_output)
+ printf ("Parser self-test complete.\n");
+}
+
+static void
+print_test_details (const test_entry_t *test)
+{
+ printf ("%s: %s, src %s%s, mask %s%s%s, dst %s\n",
+ test->testname,
+ operator_name (test->op),
+ format_name (test->src_fmt),
+ test->src_flags & SOLID_FLAG ? " solid" : "",
+ format_name (test->mask_fmt),
+ test->mask_flags & SOLID_FLAG ? " solid" : "",
+ test->mask_flags & CA_FLAG ? " CA" : "",
+ format_name (test->dst_fmt));
+}
+
+static void
+run_one_test (const char *pattern, double bandwidth_, pixman_bool_t prdetails)
+{
+ test_entry_t test;
+
+ if (parse_test_pattern (&test, pattern) < 0)
+ {
+ printf ("Error: Could not parse the test pattern '%s'.\n", pattern);
+ return;
+ }
+
+ if (prdetails)
+ {
+ print_test_details (&test);
+ printf ("---\n");
+ }
+
+ bench_composite (pattern,
+ test.src_fmt,
+ test.src_flags,
+ test.op,
+ test.mask_fmt,
+ test.mask_flags,
+ test.dst_fmt,
+ bandwidth_ / 8);
+}
+
+static void
+run_default_tests (double bandwidth_)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (tests_tbl); i++)
+ run_one_test (tests_tbl[i].testname, bandwidth_, FALSE);
+}
+
+static void
+print_explanation (void)
+{
printf ("Benchmark for a set of most commonly used functions\n");
printf ("---\n");
printf ("All results are presented in millions of pixels per second\n");
@@ -786,9 +1059,14 @@ main (int argc, char *argv[])
printf ("RT - as R, but %dx%d average sized rectangles are copied\n",
TINYWIDTH, TINYWIDTH);
printf ("---\n");
- bandwidth = x = bench_memcpy ();
+}
+
+static void
+print_speed_scaling (double bw)
+{
printf ("reference memcpy speed = %.1fMB/s (%.1fMP/s for 32bpp fills)\n",
- x / 1000000., x / 4000000);
+ bw / 1000000., bw / 4000000);
+
if (use_scaling)
{
printf ("---\n");
@@ -799,23 +1077,85 @@ main (int argc, char *argv[])
else
printf ("UNKNOWN scaling\n");
}
+
printf ("---\n");
+}
- for (i = 0; i < ARRAY_LENGTH (tests_tbl); i++)
+static void
+usage (const char *progname)
+{
+ printf ("Usage: %s [-b] [-n] [-c] [-m M] pattern\n", progname);
+ printf (" -n : benchmark nearest scaling\n");
+ printf (" -b : benchmark bilinear scaling\n");
+ printf (" -c : print output as CSV data\n");
+ printf (" -m M : set reference memcpy speed to M MB/s instead of measuring it\n");
+}
+
+int
+main (int argc, char *argv[])
+{
+ int i;
+ const char *pattern = NULL;
+
+ for (i = 1; i < argc; i++)
{
- if (strcmp (pattern, "all") == 0 || strcmp (tests_tbl[i].testname, pattern) == 0)
+ if (argv[i][0] == '-')
{
- bench_composite (tests_tbl[i].testname,
- tests_tbl[i].src_fmt,
- tests_tbl[i].src_flags,
- tests_tbl[i].op,
- tests_tbl[i].mask_fmt,
- tests_tbl[i].mask_flags,
- tests_tbl[i].dst_fmt,
- bandwidth/8);
+ if (strchr (argv[i] + 1, 'b'))
+ {
+ use_scaling = TRUE;
+ filter = PIXMAN_FILTER_BILINEAR;
+ }
+ else if (strchr (argv[i] + 1, 'n'))
+ {
+ use_scaling = TRUE;
+ filter = PIXMAN_FILTER_NEAREST;
+ }
+
+ if (strchr (argv[i] + 1, 'c'))
+ use_csv_output = TRUE;
+
+ if (strcmp (argv[i], "-m") == 0 && i + 1 < argc)
+ bandwidth = atof (argv[++i]) * 1e6;
}
+ else
+ {
+ if (pattern)
+ {
+ pattern = NULL;
+ printf ("Error: extra arguments given.\n");
+ break;
+ }
+ pattern = argv[i];
+ }
+ }
+
+ if (!pattern)
+ {
+ usage (argv[0]);
+ return 1;
}
+ parser_self_test ();
+
+ src = aligned_malloc (4096, BUFSIZE * 3);
+ memset (src, 0xCC, BUFSIZE * 3);
+ dst = src + (BUFSIZE / 4);
+ mask = dst + (BUFSIZE / 4);
+
+ if (!use_csv_output)
+ print_explanation ();
+
+ if (bandwidth < 1.0)
+ bandwidth = bench_memcpy ();
+ if (!use_csv_output)
+ print_speed_scaling (bandwidth);
+
+ if (strcmp (pattern, "all") == 0)
+ run_default_tests (bandwidth);
+ else
+ run_one_test (pattern, bandwidth, !use_csv_output);
+
free (src);
return 0;
}
diff --git a/test/matrix-test.c b/test/matrix-test.c
index 0a5f203..cd8820c 100644
--- a/test/matrix-test.c
+++ b/test/matrix-test.c
@@ -201,8 +201,8 @@ test_matrix (int testnum, int verbose)
{
for (j = 0; j < 3; j++)
{
- double diff = fabs (result_f.v[j] -
- pixman_fixed_to_float128 (result_i.v[j]));
+ double diff = fabsl (result_f.v[j] -
+ pixman_fixed_to_float128 (result_i.v[j]));
if (is_affine && diff > (0.51 / 65536.0))
{
diff --git a/test/meson.build b/test/meson.build
new file mode 100644
index 0000000..47dd33c
--- /dev/null
+++ b/test/meson.build
@@ -0,0 +1,90 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+tests = [
+ 'oob-test',
+ 'infinite-loop',
+ 'trap-crasher',
+ 'fence-image-self-test',
+ 'region-translate-test',
+ 'fetch-test',
+ 'a1-trap-test',
+ 'prng-test',
+ 'radial-invalid',
+ 'pdf-op-test',
+ 'region-test',
+ 'combiner-test',
+ 'scaling-crash-test',
+ 'alpha-loop',
+ 'scaling-helpers-test',
+ 'rotate-test',
+ 'alphamap',
+ 'gradient-crash-test',
+ 'pixel-test',
+ 'matrix-test',
+ 'filter-reduction-test',
+ 'composite-traps-test',
+ 'region-contains-test',
+ 'glyph-test',
+ 'solid-test',
+ 'stress-test',
+ 'cover-test',
+ 'blitters-test',
+ 'affine-test',
+ 'scaling-test',
+ 'composite',
+ 'tolerance-test',
+]
+
+# Remove/update this once thread-test.c supports threading methods
+# other than PThreads and Windows threads
+if pthreads_found or host_machine.system() == 'windows'
+ tests += 'thread-test'
+endif
+
+progs = [
+ 'lowlevel-blt-bench',
+ 'radial-perf-test',
+ 'check-formats',
+ 'scaling-bench',
+ 'affine-bench',
+]
+
+foreach t : tests
+ test(
+ t,
+ executable(
+ t,
+ [t + '.c', config_h],
+ dependencies : [idep_pixman, libtestutils_dep, dep_threads, dep_openmp, dep_png],
+ ),
+ timeout : 120,
+ is_parallel : true,
+ )
+endforeach
+
+foreach p : progs
+ executable(
+ p,
+ p + '.c',
+ dependencies : [idep_pixman, libtestutils_dep, dep_openmp],
+ )
+endforeach
+
diff --git a/test/scaling-test.c b/test/scaling-test.c
index e2f7fa9..0ece611 100644
--- a/test/scaling-test.c
+++ b/test/scaling-test.c
@@ -73,7 +73,7 @@ test_composite (int testnum,
pixman_op_t op;
pixman_repeat_t repeat = PIXMAN_REPEAT_NONE;
pixman_repeat_t mask_repeat = PIXMAN_REPEAT_NONE;
- pixman_format_code_t src_fmt, dst_fmt;
+ pixman_format_code_t src_fmt, mask_fmt, dst_fmt;
uint32_t * srcbuf;
uint32_t * dstbuf;
uint32_t * maskbuf;
@@ -145,6 +145,7 @@ test_composite (int testnum,
prng_randmemset (dstbuf, dst_stride * dst_height, 0);
src_fmt = get_format (src_bpp);
+ mask_fmt = PIXMAN_a8;
dst_fmt = get_format (dst_bpp);
if (prng_rand_n (2))
@@ -169,7 +170,7 @@ test_composite (int testnum,
src_fmt, src_width, src_height, srcbuf, src_stride);
mask_img = pixman_image_create_bits (
- PIXMAN_a8, mask_width, mask_height, maskbuf, mask_stride);
+ mask_fmt, mask_width, mask_height, maskbuf, mask_stride);
dst_img = pixman_image_create_bits (
dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
@@ -255,21 +256,6 @@ test_composite (int testnum,
else
pixman_image_set_filter (mask_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
- if (verbose)
- {
- printf ("src_fmt=%s, dst_fmt=%s\n",
- format_name (src_fmt), format_name (dst_fmt));
- printf ("op=%s, scale_x=%d, scale_y=%d, repeat=%d\n",
- operator_name (op), scale_x, scale_y, repeat);
- printf ("translate_x=%d, translate_y=%d\n",
- translate_x, translate_y);
- printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
- src_width, src_height, dst_width, dst_height);
- printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
- src_x, src_y, dst_x, dst_y);
- printf ("w=%d, h=%d\n", w, h);
- }
-
if (prng_rand_n (8) == 0)
{
pixman_box16_t clip_boxes[2];
@@ -352,10 +338,45 @@ test_composite (int testnum,
}
if (prng_rand_n (2) == 0)
- pixman_image_composite (op, src_img, NULL, dst_img,
- src_x, src_y, 0, 0, dst_x, dst_y, w, h);
- else
- pixman_image_composite (op, src_img, mask_img, dst_img,
+ {
+ mask_fmt = PIXMAN_null;
+ pixman_image_unref (mask_img);
+ mask_img = NULL;
+ mask_x = 0;
+ mask_y = 0;
+ }
+
+ if (verbose)
+ {
+ printf ("op=%s, src_fmt=%s, mask_fmt=%s, dst_fmt=%s\n",
+ operator_name (op), format_name (src_fmt),
+ format_name (mask_fmt), format_name (dst_fmt));
+ printf ("scale_x=%d, scale_y=%d, repeat=%d, filter=%d\n",
+ scale_x, scale_y, repeat, src_img->common.filter);
+ printf ("translate_x=%d, translate_y=%d\n",
+ translate_x, translate_y);
+ if (mask_fmt != PIXMAN_null)
+ {
+ printf ("mask_scale_x=%d, mask_scale_y=%d, "
+ "mask_repeat=%d, mask_filter=%d\n",
+ mask_scale_x, mask_scale_y, mask_repeat,
+ mask_img->common.filter);
+ printf ("mask_translate_x=%d, mask_translate_y=%d\n",
+ mask_translate_x, mask_translate_y);
+ }
+ printf ("src_width=%d, src_height=%d, src_x=%d, src_y=%d\n",
+ src_width, src_height, src_x, src_y);
+ if (mask_fmt != PIXMAN_null)
+ {
+ printf ("mask_width=%d, mask_height=%d, mask_x=%d, mask_y=%d\n",
+ mask_width, mask_height, mask_x, mask_y);
+ }
+ printf ("dst_width=%d, dst_height=%d, dst_x=%d, dst_y=%d\n",
+ dst_width, dst_height, dst_x, dst_y);
+ printf ("w=%d, h=%d\n", w, h);
+ }
+
+ pixman_image_composite (op, src_img, mask_img, dst_img,
src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
crc32 = compute_crc32_for_image (0, dst_img);
@@ -364,7 +385,8 @@ test_composite (int testnum,
print_image (dst_img);
pixman_image_unref (src_img);
- pixman_image_unref (mask_img);
+ if (mask_img != NULL)
+ pixman_image_unref (mask_img);
pixman_image_unref (dst_img);
if (src_stride < 0)
diff --git a/test/solid-test.c b/test/solid-test.c
new file mode 100644
index 0000000..b118d37
--- /dev/null
+++ b/test/solid-test.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright © 2015 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
+#include "utils.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#define WIDTH 32
+#define HEIGHT 32
+
+static const pixman_op_t op_list[] = {
+ PIXMAN_OP_SRC,
+ PIXMAN_OP_OVER,
+ PIXMAN_OP_ADD,
+ PIXMAN_OP_CLEAR,
+ PIXMAN_OP_SRC,
+ PIXMAN_OP_DST,
+ PIXMAN_OP_OVER,
+ PIXMAN_OP_OVER_REVERSE,
+ PIXMAN_OP_IN,
+ PIXMAN_OP_IN_REVERSE,
+ PIXMAN_OP_OUT,
+ PIXMAN_OP_OUT_REVERSE,
+ PIXMAN_OP_ATOP,
+ PIXMAN_OP_ATOP_REVERSE,
+ PIXMAN_OP_XOR,
+ PIXMAN_OP_ADD,
+ PIXMAN_OP_MULTIPLY,
+ PIXMAN_OP_SCREEN,
+ PIXMAN_OP_OVERLAY,
+ PIXMAN_OP_DARKEN,
+ PIXMAN_OP_LIGHTEN,
+ PIXMAN_OP_HARD_LIGHT,
+ PIXMAN_OP_DIFFERENCE,
+ PIXMAN_OP_EXCLUSION,
+#if 0 /* these use floating-point math and are not always bit-exact across platforms */
+ PIXMAN_OP_SATURATE,
+ PIXMAN_OP_DISJOINT_CLEAR,
+ PIXMAN_OP_DISJOINT_SRC,
+ PIXMAN_OP_DISJOINT_DST,
+ PIXMAN_OP_DISJOINT_OVER,
+ PIXMAN_OP_DISJOINT_OVER_REVERSE,
+ PIXMAN_OP_DISJOINT_IN,
+ PIXMAN_OP_DISJOINT_IN_REVERSE,
+ PIXMAN_OP_DISJOINT_OUT,
+ PIXMAN_OP_DISJOINT_OUT_REVERSE,
+ PIXMAN_OP_DISJOINT_ATOP,
+ PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+ PIXMAN_OP_DISJOINT_XOR,
+ PIXMAN_OP_CONJOINT_CLEAR,
+ PIXMAN_OP_CONJOINT_SRC,
+ PIXMAN_OP_CONJOINT_DST,
+ PIXMAN_OP_CONJOINT_OVER,
+ PIXMAN_OP_CONJOINT_OVER_REVERSE,
+ PIXMAN_OP_CONJOINT_IN,
+ PIXMAN_OP_CONJOINT_IN_REVERSE,
+ PIXMAN_OP_CONJOINT_OUT,
+ PIXMAN_OP_CONJOINT_OUT_REVERSE,
+ PIXMAN_OP_CONJOINT_ATOP,
+ PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+ PIXMAN_OP_CONJOINT_XOR,
+ PIXMAN_OP_COLOR_DODGE,
+ PIXMAN_OP_COLOR_BURN,
+ PIXMAN_OP_SOFT_LIGHT,
+ PIXMAN_OP_HSL_HUE,
+ PIXMAN_OP_HSL_SATURATION,
+ PIXMAN_OP_HSL_COLOR,
+ PIXMAN_OP_HSL_LUMINOSITY,
+#endif
+};
+
+/* The first eight formats in the list are by far the most widely
+ * used, so we test those more often than the others
+ */
+#define N_MOST_LIKELY_FORMATS 8
+
+static const pixman_format_code_t img_fmt_list[] = {
+ PIXMAN_a8r8g8b8,
+ PIXMAN_a8b8g8r8,
+ PIXMAN_x8r8g8b8,
+ PIXMAN_x8b8g8r8,
+ PIXMAN_r5g6b5,
+ PIXMAN_b5g6r5,
+ PIXMAN_a8,
+ PIXMAN_a1,
+ PIXMAN_r3g3b2,
+ PIXMAN_b8g8r8a8,
+ PIXMAN_b8g8r8x8,
+ PIXMAN_r8g8b8a8,
+ PIXMAN_r8g8b8x8,
+ PIXMAN_x14r6g6b6,
+ PIXMAN_r8g8b8,
+ PIXMAN_b8g8r8,
+#if 0 /* These are going to use floating point in the near future */
+ PIXMAN_x2r10g10b10,
+ PIXMAN_a2r10g10b10,
+ PIXMAN_x2b10g10r10,
+ PIXMAN_a2b10g10r10,
+#endif
+ PIXMAN_a1r5g5b5,
+ PIXMAN_x1r5g5b5,
+ PIXMAN_a1b5g5r5,
+ PIXMAN_x1b5g5r5,
+ PIXMAN_a4r4g4b4,
+ PIXMAN_x4r4g4b4,
+ PIXMAN_a4b4g4r4,
+ PIXMAN_x4b4g4r4,
+ PIXMAN_r3g3b2,
+ PIXMAN_b2g3r3,
+ PIXMAN_a2r2g2b2,
+ PIXMAN_a2b2g2r2,
+ PIXMAN_c8,
+ PIXMAN_g8,
+ PIXMAN_x4c4,
+ PIXMAN_x4g4,
+ PIXMAN_c4,
+ PIXMAN_g4,
+ PIXMAN_g1,
+ PIXMAN_x4a4,
+ PIXMAN_a4,
+ PIXMAN_r1g2b1,
+ PIXMAN_b1g2r1,
+ PIXMAN_a1r1g1b1,
+ PIXMAN_a1b1g1r1,
+ PIXMAN_null
+};
+
+static const pixman_format_code_t mask_fmt_list[] = {
+ PIXMAN_a8r8g8b8,
+ PIXMAN_a8,
+ PIXMAN_a4,
+ PIXMAN_a1,
+ PIXMAN_null
+};
+
+static pixman_indexed_t rgb_palette[9];
+static pixman_indexed_t y_palette[9];
+
+static pixman_format_code_t
+random_format (const pixman_format_code_t *allowed_formats)
+{
+ int n = 0;
+
+ while (allowed_formats[n] != PIXMAN_null)
+ n++;
+
+ if (n > N_MOST_LIKELY_FORMATS && prng_rand_n (4) != 0)
+ n = N_MOST_LIKELY_FORMATS;
+
+ return allowed_formats[prng_rand_n (n)];
+}
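
The branch above biases selection towards the common formats: three draws in four (prng_rand_n (4) != 0) restrict the choice to the first eight entries. As a back-of-envelope check, not part of the test:

/* With n > 8 allowed formats, the first eight together are hit
 * with probability 3/4 + (1/4) * (8/n); each remaining format
 * only 1/(4n) of the time.
 */
static double
first_eight_probability (int n)
{
    return 3.0 / 4.0 + (1.0 / 4.0) * (8.0 / (double) n);
}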
+
+static pixman_image_t *
+create_multi_pixel_image (const pixman_format_code_t *allowed_formats,
+ uint32_t *buffer,
+ pixman_format_code_t *used_fmt)
+{
+ pixman_format_code_t fmt;
+ pixman_image_t *img;
+ int stride;
+
+ fmt = random_format (allowed_formats);
+ stride = (WIDTH * PIXMAN_FORMAT_BPP (fmt) + 31) / 32 * 4;
+ img = pixman_image_create_bits (fmt, WIDTH, HEIGHT, buffer, stride);
+
+ if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
+ pixman_image_set_indexed (img, &(rgb_palette[PIXMAN_FORMAT_BPP (fmt)]));
+ else if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
+ pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)]));
+
+ prng_randmemset (buffer, WIDTH * HEIGHT * 4, 0);
+ image_endian_swap (img);
+
+ if (used_fmt)
+ *used_fmt = fmt;
+
+ return img;
+}
+
+static pixman_image_t *
+create_solid_image (const pixman_format_code_t *allowed_formats,
+ uint32_t *buffer,
+ pixman_format_code_t *used_fmt)
+{
+ if (prng_rand_n (2))
+ {
+ /* Use a repeating 1x1 bitmap image for solid */
+ pixman_format_code_t fmt;
+ pixman_image_t *img, *dummy_img;
+ uint32_t bpp, dummy_buf;
+
+ fmt = random_format (allowed_formats);
+ bpp = PIXMAN_FORMAT_BPP (fmt);
+ img = pixman_image_create_bits (fmt, 1, 1, buffer, 4);
+ pixman_image_set_repeat (img, PIXMAN_REPEAT_NORMAL);
+
+ if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
+ pixman_image_set_indexed (img, &(rgb_palette[bpp]));
+ else if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
+ pixman_image_set_indexed (img, &(y_palette[bpp]));
+
+ /* Force the image flags to be computed while the bitmap's initial
+ * contents are 0 or 2^bpp-1, by compositing from it into a separate
+ * throwaway image. Since the stride must be a whole number of
+ * words, it is simplest to write all 0s or all 1s to the entire
+ * first word irrespective of colour depth, even though only the
+ * first pixel actually matters.
+ */
+ *buffer = prng_rand_n (2) ? 0xFFFFFFFFu : 0;
+ dummy_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, 1, 1,
+ &dummy_buf, 4);
+ pixman_image_composite (PIXMAN_OP_SRC, img, NULL, dummy_img,
+ 0, 0, 0, 0, 0, 0, 1, 1);
+ pixman_image_unref (dummy_img);
+
+ /* Now set the bitmap contents to a random value */
+ prng_randmemset (buffer, 4, 0);
+ image_endian_swap (img);
+
+ if (used_fmt)
+ *used_fmt = fmt;
+
+ return img;
+ }
+ else
+ {
+ /* Use a native solid image */
+ pixman_color_t color;
+ pixman_image_t *img;
+
+ color.alpha = prng_rand_n (UINT16_MAX + 1);
+ color.red = prng_rand_n (UINT16_MAX + 1);
+ color.green = prng_rand_n (UINT16_MAX + 1);
+ color.blue = prng_rand_n (UINT16_MAX + 1);
+ img = pixman_image_create_solid_fill (&color);
+
+ if (used_fmt)
+ *used_fmt = PIXMAN_solid;
+
+ return img;
+ }
+}
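
The 1x1 repeating bitmap branch above is the standard trick for exercising solid-colour code paths through the bits-image machinery rather than through pixman_image_create_solid_fill (). Stripped to its core (illustrative helper, a8r8g8b8 only):

static pixman_image_t *
solid_from_pixel (uint32_t *pixel)
{
    pixman_image_t *img;

    /* a 1x1 image with NORMAL repeat acts as a solid source */
    img = pixman_image_create_bits (PIXMAN_a8r8g8b8, 1, 1, pixel, 4);
    pixman_image_set_repeat (img, PIXMAN_REPEAT_NORMAL);

    return img;
}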
+
+static uint32_t
+test_solid (int testnum, int verbose)
+{
+ pixman_op_t op;
+ uint32_t src_buf[WIDTH * HEIGHT];
+ uint32_t dst_buf[WIDTH * HEIGHT];
+ uint32_t mask_buf[WIDTH * HEIGHT];
+ pixman_image_t *src_img;
+ pixman_image_t *dst_img;
+ pixman_image_t *mask_img = NULL;
+ pixman_format_code_t src_fmt, dst_fmt, mask_fmt = PIXMAN_null;
+ pixman_bool_t ca = 0;
+ uint32_t crc32;
+
+ prng_srand (testnum);
+
+ op = op_list[prng_rand_n (ARRAY_LENGTH (op_list))];
+
+ dst_img = create_multi_pixel_image (img_fmt_list, dst_buf, &dst_fmt);
+ switch (prng_rand_n (3))
+ {
+ case 0: /* Solid source, no mask */
+ src_img = create_solid_image (img_fmt_list, src_buf, &src_fmt);
+ break;
+ case 1: /* Solid source, bitmap mask */
+ src_img = create_solid_image (img_fmt_list, src_buf, &src_fmt);
+ mask_img = create_multi_pixel_image (mask_fmt_list, mask_buf, &mask_fmt);
+ break;
+ case 2: /* Bitmap image, solid mask */
+ src_img = create_multi_pixel_image (img_fmt_list, src_buf, &src_fmt);
+ mask_img = create_solid_image (mask_fmt_list, mask_buf, &mask_fmt);
+ break;
+ default:
+ abort ();
+ }
+
+ if (mask_img)
+ {
+ ca = prng_rand_n (2);
+ pixman_image_set_component_alpha (mask_img, ca);
+ }
+
+ if (verbose)
+ {
+ printf ("op=%s\n", operator_name (op));
+ printf ("src_fmt=%s, dst_fmt=%s, mask_fmt=%s\n",
+ format_name (src_fmt), format_name (dst_fmt),
+ format_name (mask_fmt));
+ printf ("src_size=%u, mask_size=%u, component_alpha=%u\n",
+ src_fmt == PIXMAN_solid ? 1 : src_img->bits.width,
+ !mask_img || mask_fmt == PIXMAN_solid ? 1 : mask_img->bits.width,
+ ca);
+ }
+
+ pixman_image_composite (op, src_img, mask_img, dst_img,
+ 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+ if (verbose)
+ print_image (dst_img);
+
+ crc32 = compute_crc32_for_image (0, dst_img);
+
+ pixman_image_unref (src_img);
+ pixman_image_unref (dst_img);
+ if (mask_img)
+ pixman_image_unref (mask_img);
+
+ return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+ int i;
+
+ prng_srand (0);
+
+ for (i = 1; i <= 8; i++)
+ {
+ initialize_palette (&(rgb_palette[i]), i, TRUE);
+ initialize_palette (&(y_palette[i]), i, FALSE);
+ }
+
+ return fuzzer_test_main ("solid", 500000,
+ 0xC30FD380,
+ test_solid, argc, argv);
+}
diff --git a/test/stress-test.c b/test/stress-test.c
index 1f03c75..8ee1896 100644
--- a/test/stress-test.c
+++ b/test/stress-test.c
@@ -11,6 +11,8 @@
static const pixman_format_code_t image_formats[] =
{
+ PIXMAN_rgba_float,
+ PIXMAN_rgb_float,
PIXMAN_a8r8g8b8,
PIXMAN_x8r8g8b8,
PIXMAN_r5g6b5,
@@ -26,6 +28,7 @@ static const pixman_format_code_t image_formats[] =
PIXMAN_r8g8b8,
PIXMAN_b8g8r8,
PIXMAN_a8r8g8b8_sRGB,
+ PIXMAN_r8g8b8_sRGB,
PIXMAN_r5g6b5,
PIXMAN_b5g6r5,
PIXMAN_x2r10g10b10,
@@ -100,6 +103,14 @@ get_size (void)
}
}
+static uint32_t
+real_reader (const void *src, int size);
+
+static void *xor_ptr(const void *ptr)
+{
+ return (void *)(((intptr_t)ptr) ^ (intptr_t)0x8000000080000000);
+}
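
xor_ptr () is an involution: applying it twice restores the original pointer. The scrambled value is stored in the image so that any code path dereferencing bits directly, instead of going through the read/write accessors, faults immediately; the accessors undo the XOR first. A standalone round-trip check (illustrative):

#include <assert.h>
#include <stdint.h>

static void *
scramble (const void *ptr)
{
    /* same constant as xor_ptr () above */
    return (void *)(((intptr_t)ptr) ^ (intptr_t)0x8000000080000000);
}

int
main (void)
{
    int x;

    assert (scramble (scramble (&x)) == (void *)&x);
    return 0;
}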
+
static void
destroy (pixman_image_t *image, void *data)
{
@@ -114,6 +125,9 @@ destroy (pixman_image_t *image, void *data)
if (image->bits.rowstride < 0)
bits -= (- image->bits.rowstride * (image->bits.height - 1));
+ if (image->bits.read_func == real_reader)
+ bits = xor_ptr(bits);
+
fence_free (bits);
}
}
@@ -124,6 +138,7 @@ destroy (pixman_image_t *image, void *data)
static uint32_t
real_reader (const void *src, int size)
{
+ src = xor_ptr(src);
switch (size)
{
case 1:
@@ -141,6 +156,7 @@ real_reader (const void *src, int size)
static void
real_writer (void *src, uint32_t value, int size)
{
+ src = xor_ptr(src);
switch (size)
{
case 1:
@@ -247,9 +263,20 @@ create_random_bits_image (alpha_preference_t alpha_preference)
pixman_filter_t filter;
pixman_fixed_t *coefficients = NULL;
int n_coefficients = 0;
+ int align_add, align_mask;
/* format */
format = random_format (alpha_preference);
+ switch (PIXMAN_FORMAT_BPP (format)) {
+ case 128:
+ align_mask = 15;
+ align_add = align_mask + prng_rand_n (65);
+ break;
+ default:
+ align_mask = 3;
+ align_add = align_mask + prng_rand_n (17);
+ break;
+ }
indexed = NULL;
if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
@@ -291,9 +318,12 @@ create_random_bits_image (alpha_preference_t alpha_preference)
{
default:
case 0:
- stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
- stride = (stride + 3) & (~3);
- bits = (uint32_t *)make_random_bytes (height * stride);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
+ if (format == PIXMAN_rgb_float || format == PIXMAN_rgba_float)
+ bits = (uint32_t *)make_random_floats (height * stride);
+ else
+ bits = (uint32_t *)make_random_bytes (height * stride);
break;
case 1:
@@ -302,8 +332,8 @@ create_random_bits_image (alpha_preference_t alpha_preference)
break;
case 2: /* Zero-filled */
- stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
- stride = (stride + 3) & (~3);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
bits = fence_malloc (height * stride);
if (!bits)
return NULL;
@@ -311,8 +341,8 @@ create_random_bits_image (alpha_preference_t alpha_preference)
break;
case 3: /* Filled with 0xFF */
- stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
- stride = (stride + 3) & (~3);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
bits = fence_malloc (height * stride);
if (!bits)
return NULL;
@@ -320,27 +350,35 @@ create_random_bits_image (alpha_preference_t alpha_preference)
break;
case 4: /* bits is a bad pointer, has read/write functions */
- stride = 232;
- bits = (void *)0x01;
- read_func = fake_reader;
- write_func = fake_writer;
- break;
+ if (PIXMAN_FORMAT_BPP (format) <= 32) {
+ stride = 232;
+ bits = (void *)0x01;
+ read_func = fake_reader;
+ write_func = fake_writer;
+ break;
+ }
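+ /* fall through: formats wider than 32 bpp use real memory below */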
case 5: /* bits is a real pointer, has read/write functions */
- stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
- stride = (stride + 3) & (~3);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
bits = fence_malloc (height * stride);
if (!bits)
return NULL;
memset (bits, 0xff, height * stride);
- read_func = real_reader;
- write_func = real_writer;
+ if (PIXMAN_FORMAT_BPP (format) <= 32) {
+ bits = xor_ptr(bits);
+ read_func = real_reader;
+ write_func = real_writer;
+ }
break;
case 6: /* bits is a real pointer, stride is negative */
- stride = (width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17));
- stride = (stride + 3) & (~3);
- bits = (uint32_t *)make_random_bytes (height * stride);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
+ if (format == PIXMAN_rgb_float || format == PIXMAN_rgba_float)
+ bits = (uint32_t *)make_random_floats (height * stride);
+ else
+ bits = (uint32_t *)make_random_bytes (height * stride);
if (!bits)
return NULL;
bits += ((height - 1) * stride) / 4;
@@ -484,7 +522,7 @@ set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map)
if (image->type == BITS && prng_rand_n (8) != 0)
{
uint32_t width, height;
- int x, y;
+ uint32_t x, y;
int i;
/* Also add a couple of clip rectangles inside the image
diff --git a/test/thread-test.c b/test/thread-test.c
index 1c2f040..12c51e3 100644
--- a/test/thread-test.c
+++ b/test/thread-test.c
@@ -1,23 +1,34 @@
#include "utils.h"
-#ifndef HAVE_PTHREADS
+#if !defined (HAVE_PTHREADS) && !defined (_WIN32)
int main ()
{
- printf ("Skipped thread-test - pthreads not supported\n");
+ printf ("Skipped thread-test - pthreads or Windows Threads not supported\n");
return 0;
}
#else
#include <stdlib.h>
-#include <pthread.h>
+
+#ifdef HAVE_PTHREADS
+# include <pthread.h>
+#elif defined (_WIN32)
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+#endif
+
+#define THREADS 16
typedef struct
{
int thread_no;
uint32_t *dst_buf;
prng_t prng_state;
+#if defined (_WIN32) && !defined (HAVE_PTHREADS)
+ uint32_t crc32;
+#endif
} info_t;
static const pixman_op_t operators[] =
@@ -67,8 +78,13 @@ static const pixman_format_code_t formats[] =
#define DEST_WIDTH (7)
+#ifdef HAVE_PTHREADS
static void *
thread (void *data)
+#elif defined (_WIN32)
+DWORD WINAPI
+thread (LPVOID data)
+#endif
{
info_t *info = data;
uint32_t crc32 = 0x0;
@@ -112,7 +128,12 @@ thread (void *data)
pixman_image_unref (dst_img);
}
+#ifdef HAVE_PTHREADS
return (void *)(uintptr_t)crc32;
+#elif defined (_WIN32)
+ info->crc32 = crc32;
+ return 0;
+#endif
}
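
The two thread entry conventions differ in how the CRC travels back: pthreads can smuggle it through the void * exit value, while a Win32 thread returns a DWORD and therefore stores it in its info_t. The shape, reduced (do_work () is hypothetical):

#ifdef HAVE_PTHREADS
static void *
worker (void *data)
{
    info_t *info = data;
    return (void *)(uintptr_t) do_work (info);  /* via exit value */
}
#elif defined (_WIN32)
DWORD WINAPI
worker (LPVOID data)
{
    info_t *info = data;
    info->crc32 = do_work (info);               /* via struct field */
    return 0;
}
#endif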
static inline uint32_t
@@ -127,26 +148,34 @@ byteswap32 (uint32_t x)
int
main (void)
{
- uint32_t dest[16 * DEST_WIDTH];
- info_t info[16] = { { 0 } };
- pthread_t threads[16];
- void *retvals[16];
- uint32_t crc32s[16], crc32;
+ uint32_t dest[THREADS * DEST_WIDTH];
+ info_t info[THREADS] = { { 0 } };
+
+#ifdef HAVE_PTHREADS
+ pthread_t threads[THREADS];
+ void *retvals[THREADS];
+#elif defined (_WIN32)
+ HANDLE hThreadArray[THREADS];
+ DWORD dwThreadIdArray[THREADS];
+#endif
+
+ uint32_t crc32s[THREADS], crc32;
int i;
- for (i = 0; i < 16; ++i)
+ for (i = 0; i < THREADS; ++i)
{
info[i].thread_no = i;
info[i].dst_buf = &dest[i * DEST_WIDTH];
}
- for (i = 0; i < 16; ++i)
- pthread_create (&threads[i], NULL, thread, &info[i]);
+#ifdef HAVE_PTHREADS
+ for (i = 0; i < THREADS; ++i)
+ pthread_create (&threads[i], NULL, thread, &info[i]);
- for (i = 0; i < 16; ++i)
- pthread_join (threads[i], &retvals[i]);
+ for (i = 0; i < THREADS; ++i)
+ pthread_join (threads[i], &retvals[i]);
- for (i = 0; i < 16; ++i)
+ for (i = 0; i < THREADS; ++i)
{
crc32s[i] = (uintptr_t)retvals[i];
@@ -154,6 +183,36 @@ main (void)
crc32s[i] = byteswap32 (crc32s[i]);
}
+#elif defined (_WIN32)
+ for (i = 0; i < THREADS; ++i)
+ {
+ hThreadArray[i] = CreateThread(NULL,
+ 0,
+ thread,
+ &info[i],
+ 0,
+ &dwThreadIdArray[i]);
+ if (hThreadArray[i] == NULL)
+ {
+ printf ("Windows thread creation failed!\n");
+ return 1;
+ }
+ }
+ for (i = 0; i < THREADS; ++i)
+ {
+ WaitForSingleObject (hThreadArray[i], INFINITE);
+ CloseHandle(hThreadArray[i]);
+ }
+
+ for (i = 0; i < THREADS; ++i)
+ {
+ crc32s[i] = info[i].crc32;
+
+ if (is_little_endian())
+ crc32s[i] = byteswap32 (crc32s[i]);
+ }
+#endif
+
crc32 = compute_crc32 (0, crc32s, sizeof crc32s);
#define EXPECTED 0x82C4D9FB
diff --git a/test/tolerance-test.c b/test/tolerance-test.c
index 320bb7f..3c6e818 100644
--- a/test/tolerance-test.c
+++ b/test/tolerance-test.c
@@ -76,6 +76,12 @@ static const pixman_op_t operators[] =
PIXMAN_OP_EXCLUSION,
};
+static const pixman_dither_t dithers[] =
+{
+ PIXMAN_DITHER_ORDERED_BAYER_8,
+ PIXMAN_DITHER_ORDERED_BLUE_NOISE_64,
+};
+
#define RANDOM_ELT(array) \
(array[prng_rand_n (ARRAY_LENGTH (array))])
@@ -176,7 +182,8 @@ verify (int test_no,
pixman_image_t *orig_dest,
int x, int y,
int width, int height,
- pixman_bool_t component_alpha)
+ pixman_bool_t component_alpha,
+ pixman_dither_t dither)
{
pixel_checker_t dest_checker, src_checker, mask_checker;
int i, j;
@@ -185,6 +192,9 @@ verify (int test_no,
pixel_checker_init (&dest_checker, dest->bits.format);
pixel_checker_init (&mask_checker, mask->bits.format);
+ if (dest->bits.dither != PIXMAN_DITHER_NONE)
+ pixel_checker_allow_dither (&dest_checker);
+
assert (dest->bits.format == orig_dest->bits.format);
for (j = y; j < y + height; ++j)
@@ -220,6 +230,7 @@ verify (int test_no,
printf (" operator: %s (%s alpha)\n", operator_name (op),
component_alpha? "component" : "unified");
+ printf (" dither: %s\n", dither_name (dither));
printf (" dest_x, dest_y: %d %d\n", x, y);
printf (" width, height: %d %d\n", width, height);
printf (" source: format: %-14s size: %2d x %2d\n",
@@ -275,6 +286,7 @@ do_check (int i)
pixman_image_t *dest_copy;
pixman_bool_t result = TRUE;
pixman_bool_t component_alpha;
+ pixman_dither_t dither = PIXMAN_DITHER_NONE;
prng_srand (i);
op = RANDOM_ELT (operators);
@@ -296,6 +308,12 @@ do_check (int i)
if (y + height > dest->bits.height)
height = dest->bits.height - y;
+ if (prng_rand_n (2))
+ {
+ dither = RANDOM_ELT (dithers);
+ pixman_image_set_dither (dest, dither);
+ }
+
component_alpha = prng_rand_n (2);
pixman_image_set_component_alpha (mask, component_alpha);
@@ -305,7 +323,8 @@ do_check (int i)
x, y, width, height);
if (!verify (i, op, source, mask, dest, dest_copy,
- x, y, width, height, component_alpha))
+ x, y, width, height, component_alpha,
+ dither))
{
result = FALSE;
}
diff --git a/test/utils/meson.build b/test/utils/meson.build
new file mode 100644
index 0000000..9a6e820
--- /dev/null
+++ b/test/utils/meson.build
@@ -0,0 +1,31 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libtestutils = static_library(
+ 'testutils',
+ ['utils.c', 'utils-prng.c', config_h],
+ dependencies : [idep_pixman, dep_openmp, dep_m, dep_png],
+)
+
+libtestutils_dep = declare_dependency(
+ link_with: libtestutils,
+ include_directories: include_directories('.'),
+)
+
diff --git a/test/utils-prng.c b/test/utils/utils-prng.c
index c27b5be..0cf53dd 100644
--- a/test/utils-prng.c
+++ b/test/utils/utils-prng.c
@@ -199,12 +199,25 @@ randmemset_internal (prng_t *prng,
}
else
{
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
#ifdef HAVE_GCC_VECTOR_EXTENSIONS
- const uint8x16 bswap_shufflemask =
+# if __has_builtin(__builtin_shufflevector)
+ randdata.vb =
+ __builtin_shufflevector (randdata.vb, randdata.vb,
+ 3, 2, 1, 0, 7, 6 , 5, 4,
+ 11, 10, 9, 8, 15, 14, 13, 12);
+# else
+ static const uint8x16 bswap_shufflemask =
{
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
};
randdata.vb = __builtin_shuffle (randdata.vb, bswap_shufflemask);
+# endif
+
store_rand_128_data (buf, &randdata, aligned);
buf += 16;
#else
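
The __has_builtin probe used above is the portable way to feature-test clang-style builtins: define it away to 0 on compilers that lack it, then branch per builtin with a plain-C fallback. In isolation (illustrative, using the widely available __builtin_bswap32):

#ifndef __has_builtin
#define __has_builtin(x) 0   /* older compilers: claim nothing */
#endif

#if __has_builtin(__builtin_bswap32)
#define BSWAP32(x) __builtin_bswap32 (x)
#else
#define BSWAP32(x)                                          \
    ((((x) & 0xffu) << 24) | (((x) & 0xff00u) << 8) |       \
     (((x) >> 8) & 0xff00u) | (((x) >> 24) & 0xffu))
#endif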
diff --git a/test/utils-prng.h b/test/utils/utils-prng.h
index f9ae8dd..3cc3fbe 100644
--- a/test/utils-prng.h
+++ b/test/utils/utils-prng.h
@@ -72,7 +72,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/test/utils.c b/test/utils/utils.c
index ab3424f..23bf019 100644
--- a/test/utils.c
+++ b/test/utils/utils.c
@@ -5,6 +5,8 @@
#include <signal.h>
#include <stdlib.h>
#include <float.h>
+#include <ctype.h>
+#include <limits.h>
#ifdef HAVE_GETTIMEOFDAY
#include <sys/time.h>
@@ -28,11 +30,13 @@
#include <png.h>
#endif
+#define ROUND_UP(x, mult) (((x) + (mult) - 1) / (mult) * (mult))
+
/* Random number generator state
*/
-prng_t prng_state_data;
-prng_t *prng_state;
+prng_t prng_state_data = {0};
+prng_t *prng_state = NULL;
/*----------------------------------------------------------------------------*\
* CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
@@ -376,7 +380,16 @@ typedef struct
int n_bytes;
} info_t;
-#if defined(HAVE_MPROTECT) && defined(HAVE_GETPAGESIZE) && defined(HAVE_SYS_MMAN_H) && defined(HAVE_MMAP)
+#if FENCE_MALLOC_ACTIVE
+
+unsigned long
+fence_get_page_size ()
+{
+ /* You can fake a page size here if you want to test e.g. 64 kB
+ * pages on a 4 kB page system: just apply a multiplier below.
+ */
+ return getpagesize ();
+}
/* This is apparently necessary on at least OS X */
#ifndef MAP_ANONYMOUS
@@ -386,7 +399,7 @@ typedef struct
void *
fence_malloc (int64_t len)
{
- unsigned long page_size = getpagesize();
+ unsigned long page_size = fence_get_page_size ();
unsigned long page_mask = page_size - 1;
uint32_t n_payload_bytes = (len + page_mask) & ~page_mask;
uint32_t n_bytes =
@@ -435,7 +448,7 @@ fence_malloc (int64_t len)
void
fence_free (void *data)
{
- uint32_t page_size = getpagesize();
+ uint32_t page_size = fence_get_page_size ();
uint8_t *payload = data;
uint8_t *leading_protected = payload - N_LEADING_PROTECTED * page_size;
uint8_t *initial_page = leading_protected - page_size;
@@ -444,7 +457,98 @@ fence_free (void *data)
munmap (info->addr, info->n_bytes);
}
-#else
+static void
+fence_image_destroy (pixman_image_t *image, void *data)
+{
+ fence_free (data);
+}
+
+/* Create an image with fence pages.
+ *
+ * Creates an image, where the data area is allocated with fence_malloc ().
+ * Each row has an additional page in the stride.
+ *
+ * min_width is only a minimum width for the image. The width is rounded
+ * up so that the row size is divisible by both the page size and the
+ * pixel size.
+ *
+ * If stride_fence is true, the additional page on each row will be
+ * armed to cause SIGSEGV or SIGBUS on all accesses. This should catch
+ * all accesses outside the valid row pixels.
+ */
+pixman_image_t *
+fence_image_create_bits (pixman_format_code_t format,
+ int min_width,
+ int height,
+ pixman_bool_t stride_fence)
+{
+ unsigned page_size = fence_get_page_size ();
+ unsigned page_mask = page_size - 1;
+ unsigned bitspp = PIXMAN_FORMAT_BPP (format);
+ unsigned bits_boundary;
+ unsigned row_bits;
+ int width; /* pixels */
+ unsigned stride; /* bytes */
+ void *pixels;
+ pixman_image_t *image;
+ int i;
+
+ /* must be power of two */
+ assert (page_size && (page_size & page_mask) == 0);
+
+ if (bitspp < 1 || min_width < 1 || height < 1)
+ abort ();
+
+ /* least common multiple of the page size in bits and bitspp */
+ bits_boundary = bitspp;
+ while (! (bits_boundary & 1))
+ bits_boundary >>= 1;
+ bits_boundary *= page_size * 8;
+
+ /* round up to bits_boundary */
+ row_bits = ROUND_UP ( (unsigned)min_width * bitspp, bits_boundary);
+ width = row_bits / bitspp;
+
+ stride = row_bits / 8;
+ if (stride_fence)
+ stride += page_size; /* add fence page */
+
+ if (UINT_MAX / stride < (unsigned)height)
+ abort ();
+
+ pixels = fence_malloc (stride * (unsigned)height);
+ if (!pixels)
+ return NULL;
+
+ if (stride_fence)
+ {
+ uint8_t *guard = (uint8_t *)pixels + stride - page_size;
+
+ /* arm row end fence pages */
+ for (i = 0; i < height; i++)
+ {
+ if (mprotect (guard + i * stride, page_size, PROT_NONE) == -1)
+ goto out_fail;
+ }
+ }
+
+ assert (width >= min_width);
+
+ image = pixman_image_create_bits_no_clear (format, width, height,
+ pixels, stride);
+ if (!image)
+ goto out_fail;
+
+ pixman_image_set_destroy_function (image, fence_image_destroy, pixels);
+
+ return image;
+
+out_fail:
+ fence_free (pixels);
+
+ return NULL;
+}
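
The row fences above rest on the classic guard-page idiom: allocate page-aligned memory, then strip all permissions from any page that must never be touched, so a stray access raises SIGSEGV on the spot. The core operation, isolated (POSIX; illustrative):

#include <sys/mman.h>
#include <unistd.h>

/* Make one page at 'addr' (page-aligned) trap on any access. */
static int
arm_guard_page (void *addr)
{
    return mprotect (addr, getpagesize (), PROT_NONE);
}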
+
+#else /* FENCE_MALLOC_ACTIVE */
void *
fence_malloc (int64_t len)
@@ -458,7 +562,25 @@ fence_free (void *data)
free (data);
}
-#endif
+pixman_image_t *
+fence_image_create_bits (pixman_format_code_t format,
+ int min_width,
+ int height,
+ pixman_bool_t stride_fence)
+{
+ return pixman_image_create_bits (format, min_width, height, NULL, 0);
+ /* Implicitly allocated storage does not need a destroy function
+ * to get freed on refcount hitting zero.
+ */
+}
+
+unsigned long
+fence_get_page_size ()
+{
+ return 0;
+}
+
+#endif /* FENCE_MALLOC_ACTIVE */
uint8_t *
make_random_bytes (int n_bytes)
@@ -473,6 +595,21 @@ make_random_bytes (int n_bytes)
return bytes;
}
+float *
+make_random_floats (int n_bytes)
+{
+ uint8_t *bytes = fence_malloc (n_bytes);
+ float *vals = (float *)bytes;
+
+ if (!bytes)
+ return 0;
+
+ for (n_bytes /= 4; n_bytes; vals++, n_bytes--)
+ *vals = (float)rand() / (float)RAND_MAX;
+
+ return (float *)bytes;
+}
+
void
a8r8g8b8_to_rgba_np (uint32_t *dst, uint32_t *src, int n_pixels)
{
@@ -844,9 +981,11 @@ enable_divbyzero_exceptions (void)
{
#ifdef HAVE_FENV_H
#ifdef HAVE_FEENABLEEXCEPT
+#ifdef HAVE_FEDIVBYZERO
feenableexcept (FE_DIVBYZERO);
#endif
#endif
+#endif
}
void
@@ -854,9 +993,11 @@ enable_invalid_exceptions (void)
{
#ifdef HAVE_FENV_H
#ifdef HAVE_FEENABLEEXCEPT
+#ifdef FE_INVALID
feenableexcept (FE_INVALID);
#endif
#endif
+#endif
}
void *
@@ -948,168 +1089,438 @@ initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb)
}
}
-const char *
-operator_name (pixman_op_t op)
+struct operator_entry {
+ pixman_op_t op;
+ const char *name;
+ pixman_bool_t is_alias;
+};
+
+typedef struct operator_entry operator_entry_t;
+
+static const operator_entry_t op_list[] =
{
- switch (op)
- {
- case PIXMAN_OP_CLEAR: return "PIXMAN_OP_CLEAR";
- case PIXMAN_OP_SRC: return "PIXMAN_OP_SRC";
- case PIXMAN_OP_DST: return "PIXMAN_OP_DST";
- case PIXMAN_OP_OVER: return "PIXMAN_OP_OVER";
- case PIXMAN_OP_OVER_REVERSE: return "PIXMAN_OP_OVER_REVERSE";
- case PIXMAN_OP_IN: return "PIXMAN_OP_IN";
- case PIXMAN_OP_IN_REVERSE: return "PIXMAN_OP_IN_REVERSE";
- case PIXMAN_OP_OUT: return "PIXMAN_OP_OUT";
- case PIXMAN_OP_OUT_REVERSE: return "PIXMAN_OP_OUT_REVERSE";
- case PIXMAN_OP_ATOP: return "PIXMAN_OP_ATOP";
- case PIXMAN_OP_ATOP_REVERSE: return "PIXMAN_OP_ATOP_REVERSE";
- case PIXMAN_OP_XOR: return "PIXMAN_OP_XOR";
- case PIXMAN_OP_ADD: return "PIXMAN_OP_ADD";
- case PIXMAN_OP_SATURATE: return "PIXMAN_OP_SATURATE";
-
- case PIXMAN_OP_DISJOINT_CLEAR: return "PIXMAN_OP_DISJOINT_CLEAR";
- case PIXMAN_OP_DISJOINT_SRC: return "PIXMAN_OP_DISJOINT_SRC";
- case PIXMAN_OP_DISJOINT_DST: return "PIXMAN_OP_DISJOINT_DST";
- case PIXMAN_OP_DISJOINT_OVER: return "PIXMAN_OP_DISJOINT_OVER";
- case PIXMAN_OP_DISJOINT_OVER_REVERSE: return "PIXMAN_OP_DISJOINT_OVER_REVERSE";
- case PIXMAN_OP_DISJOINT_IN: return "PIXMAN_OP_DISJOINT_IN";
- case PIXMAN_OP_DISJOINT_IN_REVERSE: return "PIXMAN_OP_DISJOINT_IN_REVERSE";
- case PIXMAN_OP_DISJOINT_OUT: return "PIXMAN_OP_DISJOINT_OUT";
- case PIXMAN_OP_DISJOINT_OUT_REVERSE: return "PIXMAN_OP_DISJOINT_OUT_REVERSE";
- case PIXMAN_OP_DISJOINT_ATOP: return "PIXMAN_OP_DISJOINT_ATOP";
- case PIXMAN_OP_DISJOINT_ATOP_REVERSE: return "PIXMAN_OP_DISJOINT_ATOP_REVERSE";
- case PIXMAN_OP_DISJOINT_XOR: return "PIXMAN_OP_DISJOINT_XOR";
-
- case PIXMAN_OP_CONJOINT_CLEAR: return "PIXMAN_OP_CONJOINT_CLEAR";
- case PIXMAN_OP_CONJOINT_SRC: return "PIXMAN_OP_CONJOINT_SRC";
- case PIXMAN_OP_CONJOINT_DST: return "PIXMAN_OP_CONJOINT_DST";
- case PIXMAN_OP_CONJOINT_OVER: return "PIXMAN_OP_CONJOINT_OVER";
- case PIXMAN_OP_CONJOINT_OVER_REVERSE: return "PIXMAN_OP_CONJOINT_OVER_REVERSE";
- case PIXMAN_OP_CONJOINT_IN: return "PIXMAN_OP_CONJOINT_IN";
- case PIXMAN_OP_CONJOINT_IN_REVERSE: return "PIXMAN_OP_CONJOINT_IN_REVERSE";
- case PIXMAN_OP_CONJOINT_OUT: return "PIXMAN_OP_CONJOINT_OUT";
- case PIXMAN_OP_CONJOINT_OUT_REVERSE: return "PIXMAN_OP_CONJOINT_OUT_REVERSE";
- case PIXMAN_OP_CONJOINT_ATOP: return "PIXMAN_OP_CONJOINT_ATOP";
- case PIXMAN_OP_CONJOINT_ATOP_REVERSE: return "PIXMAN_OP_CONJOINT_ATOP_REVERSE";
- case PIXMAN_OP_CONJOINT_XOR: return "PIXMAN_OP_CONJOINT_XOR";
-
- case PIXMAN_OP_MULTIPLY: return "PIXMAN_OP_MULTIPLY";
- case PIXMAN_OP_SCREEN: return "PIXMAN_OP_SCREEN";
- case PIXMAN_OP_OVERLAY: return "PIXMAN_OP_OVERLAY";
- case PIXMAN_OP_DARKEN: return "PIXMAN_OP_DARKEN";
- case PIXMAN_OP_LIGHTEN: return "PIXMAN_OP_LIGHTEN";
- case PIXMAN_OP_COLOR_DODGE: return "PIXMAN_OP_COLOR_DODGE";
- case PIXMAN_OP_COLOR_BURN: return "PIXMAN_OP_COLOR_BURN";
- case PIXMAN_OP_HARD_LIGHT: return "PIXMAN_OP_HARD_LIGHT";
- case PIXMAN_OP_SOFT_LIGHT: return "PIXMAN_OP_SOFT_LIGHT";
- case PIXMAN_OP_DIFFERENCE: return "PIXMAN_OP_DIFFERENCE";
- case PIXMAN_OP_EXCLUSION: return "PIXMAN_OP_EXCLUSION";
- case PIXMAN_OP_HSL_HUE: return "PIXMAN_OP_HSL_HUE";
- case PIXMAN_OP_HSL_SATURATION: return "PIXMAN_OP_HSL_SATURATION";
- case PIXMAN_OP_HSL_COLOR: return "PIXMAN_OP_HSL_COLOR";
- case PIXMAN_OP_HSL_LUMINOSITY: return "PIXMAN_OP_HSL_LUMINOSITY";
-
- case PIXMAN_OP_NONE:
- return "<invalid operator 'none'>";
- };
+#define ENTRY(op) \
+ { PIXMAN_OP_##op, "PIXMAN_OP_" #op, FALSE }
+#define ALIAS(op, nam) \
+ { PIXMAN_OP_##op, nam, TRUE }
+
+ /* operator_name () will return the first hit in this table,
+ * so keep the list properly ordered between entries and aliases.
+ * Aliases are not listed by list_operators ().
+ */
- return "<unknown operator>";
-}
+ ENTRY (CLEAR),
+ ENTRY (SRC),
+ ENTRY (DST),
+ ENTRY (OVER),
+ ENTRY (OVER_REVERSE),
+ ALIAS (OVER_REVERSE, "overrev"),
+ ENTRY (IN),
+ ENTRY (IN_REVERSE),
+ ALIAS (IN_REVERSE, "inrev"),
+ ENTRY (OUT),
+ ENTRY (OUT_REVERSE),
+ ALIAS (OUT_REVERSE, "outrev"),
+ ENTRY (ATOP),
+ ENTRY (ATOP_REVERSE),
+ ALIAS (ATOP_REVERSE, "atoprev"),
+ ENTRY (XOR),
+ ENTRY (ADD),
+ ENTRY (SATURATE),
+
+ ENTRY (DISJOINT_CLEAR),
+ ENTRY (DISJOINT_SRC),
+ ENTRY (DISJOINT_DST),
+ ENTRY (DISJOINT_OVER),
+ ENTRY (DISJOINT_OVER_REVERSE),
+ ENTRY (DISJOINT_IN),
+ ENTRY (DISJOINT_IN_REVERSE),
+ ENTRY (DISJOINT_OUT),
+ ENTRY (DISJOINT_OUT_REVERSE),
+ ENTRY (DISJOINT_ATOP),
+ ENTRY (DISJOINT_ATOP_REVERSE),
+ ENTRY (DISJOINT_XOR),
+
+ ENTRY (CONJOINT_CLEAR),
+ ENTRY (CONJOINT_SRC),
+ ENTRY (CONJOINT_DST),
+ ENTRY (CONJOINT_OVER),
+ ENTRY (CONJOINT_OVER_REVERSE),
+ ENTRY (CONJOINT_IN),
+ ENTRY (CONJOINT_IN_REVERSE),
+ ENTRY (CONJOINT_OUT),
+ ENTRY (CONJOINT_OUT_REVERSE),
+ ENTRY (CONJOINT_ATOP),
+ ENTRY (CONJOINT_ATOP_REVERSE),
+ ENTRY (CONJOINT_XOR),
+
+ ENTRY (MULTIPLY),
+ ENTRY (SCREEN),
+ ENTRY (OVERLAY),
+ ENTRY (DARKEN),
+ ENTRY (LIGHTEN),
+ ENTRY (COLOR_DODGE),
+ ENTRY (COLOR_BURN),
+ ENTRY (HARD_LIGHT),
+ ENTRY (SOFT_LIGHT),
+ ENTRY (DIFFERENCE),
+ ENTRY (EXCLUSION),
+ ENTRY (HSL_HUE),
+ ENTRY (HSL_SATURATION),
+ ENTRY (HSL_COLOR),
+ ENTRY (HSL_LUMINOSITY),
+
+ ALIAS (NONE, "<invalid operator 'none'>")
+
+#undef ENTRY
+#undef ALIAS
+};
-const char *
-format_name (pixman_format_code_t format)
+typedef struct {
+ pixman_dither_t dither;
+ const char *name;
+ pixman_bool_t is_alias;
+} dither_entry_t;
+
+static const dither_entry_t dither_list[] =
{
- switch (format)
- {
+#define ENTRY(dither) \
+ { PIXMAN_DITHER_##dither, "PIXMAN_DITHER_" #dither, FALSE }
+#define ALIAS(dither, nam) \
+ { PIXMAN_DITHER_##dither, nam, TRUE }
+
+ /* dither_name () will return the first hit in this table,
+ * so keep the list properly ordered between entries and aliases.
+ * Aliases are not listed by list_dithers ().
+ */
+
+ ENTRY (ORDERED_BAYER_8),
+ ENTRY (ORDERED_BLUE_NOISE_64),
+ ENTRY (NONE),
+
+#undef ENTRY
+#undef ALIAS
+};
+
+struct format_entry
+{
+ pixman_format_code_t format;
+ const char *name;
+ pixman_bool_t is_alias;
+};
+
+typedef struct format_entry format_entry_t;
+
+static const format_entry_t format_list[] =
+{
+#define ENTRY(f) \
+ { PIXMAN_##f, #f, FALSE }
+#define ALIAS(f, nam) \
+ { PIXMAN_##f, nam, TRUE }
+
+ /* format_name () will return the first hit in this table,
+ * so keep the list properly ordered between entries and aliases.
+ * Aliases are not listed by list_formats ().
+ */
+
+/* 128bpp formats */
+ ENTRY (rgba_float),
+/* 96bpp formats */
+ ENTRY (rgb_float),
+
/* 32bpp formats */
- case PIXMAN_a8r8g8b8: return "a8r8g8b8";
- case PIXMAN_x8r8g8b8: return "x8r8g8b8";
- case PIXMAN_a8b8g8r8: return "a8b8g8r8";
- case PIXMAN_x8b8g8r8: return "x8b8g8r8";
- case PIXMAN_b8g8r8a8: return "b8g8r8a8";
- case PIXMAN_b8g8r8x8: return "b8g8r8x8";
- case PIXMAN_r8g8b8a8: return "r8g8b8a8";
- case PIXMAN_r8g8b8x8: return "r8g8b8x8";
- case PIXMAN_x14r6g6b6: return "x14r6g6b6";
- case PIXMAN_x2r10g10b10: return "x2r10g10b10";
- case PIXMAN_a2r10g10b10: return "a2r10g10b10";
- case PIXMAN_x2b10g10r10: return "x2b10g10r10";
- case PIXMAN_a2b10g10r10: return "a2b10g10r10";
+ ENTRY (a8r8g8b8),
+ ALIAS (a8r8g8b8, "8888"),
+ ENTRY (x8r8g8b8),
+ ALIAS (x8r8g8b8, "x888"),
+ ENTRY (a8b8g8r8),
+ ENTRY (x8b8g8r8),
+ ENTRY (b8g8r8a8),
+ ENTRY (b8g8r8x8),
+ ENTRY (r8g8b8a8),
+ ENTRY (r8g8b8x8),
+ ENTRY (x14r6g6b6),
+ ENTRY (x2r10g10b10),
+ ALIAS (x2r10g10b10, "2x10"),
+ ENTRY (a2r10g10b10),
+ ALIAS (a2r10g10b10, "2a10"),
+ ENTRY (x2b10g10r10),
+ ENTRY (a2b10g10r10),
/* sRGB formats */
- case PIXMAN_a8r8g8b8_sRGB: return "a8r8g8b8_sRGB";
+ ENTRY (a8r8g8b8_sRGB),
+ ENTRY (r8g8b8_sRGB),
/* 24bpp formats */
- case PIXMAN_r8g8b8: return "r8g8b8";
- case PIXMAN_b8g8r8: return "b8g8r8";
-
-/* 16bpp formats */
- case PIXMAN_r5g6b5: return "r5g6b5";
- case PIXMAN_b5g6r5: return "b5g6r5";
-
- case PIXMAN_a1r5g5b5: return "a1r5g5b5";
- case PIXMAN_x1r5g5b5: return "x1r5g5b5";
- case PIXMAN_a1b5g5r5: return "a1b5g5r5";
- case PIXMAN_x1b5g5r5: return "x1b5g5r5";
- case PIXMAN_a4r4g4b4: return "a4r4g4b4";
- case PIXMAN_x4r4g4b4: return "x4r4g4b4";
- case PIXMAN_a4b4g4r4: return "a4b4g4r4";
- case PIXMAN_x4b4g4r4: return "x4b4g4r4";
+ ENTRY (r8g8b8),
+ ALIAS (r8g8b8, "0888"),
+ ENTRY (b8g8r8),
+
+/* 16bpp formats */
+ ENTRY (r5g6b5),
+ ALIAS (r5g6b5, "0565"),
+ ENTRY (b5g6r5),
+
+ ENTRY (a1r5g5b5),
+ ALIAS (a1r5g5b5, "1555"),
+ ENTRY (x1r5g5b5),
+ ENTRY (a1b5g5r5),
+ ENTRY (x1b5g5r5),
+ ENTRY (a4r4g4b4),
+ ALIAS (a4r4g4b4, "4444"),
+ ENTRY (x4r4g4b4),
+ ENTRY (a4b4g4r4),
+ ENTRY (x4b4g4r4),
/* 8bpp formats */
- case PIXMAN_a8: return "a8";
- case PIXMAN_r3g3b2: return "r3g3b2";
- case PIXMAN_b2g3r3: return "b2g3r3";
- case PIXMAN_a2r2g2b2: return "a2r2g2b2";
- case PIXMAN_a2b2g2r2: return "a2b2g2r2";
-
-#if 0
- case PIXMAN_x4c4: return "x4c4";
- case PIXMAN_g8: return "g8";
-#endif
- case PIXMAN_c8: return "x4c4 / c8";
- case PIXMAN_x4g4: return "x4g4 / g8";
+ ENTRY (a8),
+ ALIAS (a8, "8"),
+ ENTRY (r3g3b2),
+ ENTRY (b2g3r3),
+ ENTRY (a2r2g2b2),
+ ALIAS (a2r2g2b2, "2222"),
+ ENTRY (a2b2g2r2),
+
+ ALIAS (c8, "x4c4 / c8"),
+ /* ENTRY (c8), */
+ ALIAS (g8, "x4g4 / g8"),
+ /* ENTRY (g8), */
+
+ ENTRY (x4a4),
+
+ /* These format codes are identical to c8 and g8, respectively. */
+ /* ENTRY (x4c4), */
+ /* ENTRY (x4g4), */
+
+/* 4bpp formats */
+ ENTRY (a4),
+ ENTRY (r1g2b1),
+ ENTRY (b1g2r1),
+ ENTRY (a1r1g1b1),
+ ENTRY (a1b1g1r1),
+
+ ALIAS (c4, "c4"),
+ /* ENTRY (c4), */
+ ALIAS (g4, "g4"),
+ /* ENTRY (g4), */
+
+/* 1bpp formats */
+ ENTRY (a1),
+
+ ALIAS (g1, "g1"),
+ /* ENTRY (g1), */
- case PIXMAN_x4a4: return "x4a4";
+/* YUV formats */
+ ALIAS (yuy2, "yuy2"),
+ /* ENTRY (yuy2), */
+ ALIAS (yv12, "yv12"),
+ /* ENTRY (yv12), */
+
+/* Fake formats, not in pixman_format_code_t enum */
+ ALIAS (null, "null"),
+ ALIAS (solid, "solid"),
+ ALIAS (solid, "n"),
+ ALIAS (pixbuf, "pixbuf"),
+ ALIAS (rpixbuf, "rpixbuf"),
+ ALIAS (unknown, "unknown"),
+
+#undef ENTRY
+#undef ALIAS
+};
-/* 4bpp formats */
- case PIXMAN_a4: return "a4";
- case PIXMAN_r1g2b1: return "r1g2b1";
- case PIXMAN_b1g2r1: return "b1g2r1";
- case PIXMAN_a1r1g1b1: return "a1r1g1b1";
- case PIXMAN_a1b1g1r1: return "a1b1g1r1";
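+/* Map a format name or alias, case-insensitively, to its format
+ * code: both "r5g6b5" and "0565" yield PIXMAN_r5g6b5, while an
+ * unknown name yields PIXMAN_null.
+ */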
+pixman_format_code_t
+format_from_string (const char *s)
+{
+ int i;
- case PIXMAN_c4: return "c4";
- case PIXMAN_g4: return "g4";
+ for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
+ {
+ const format_entry_t *ent = &format_list[i];
-/* 1bpp formats */
- case PIXMAN_a1: return "a1";
+ if (strcasecmp (ent->name, s) == 0)
+ return ent->format;
+ }
- case PIXMAN_g1: return "g1";
+ return PIXMAN_null;
+}
-/* YUV formats */
- case PIXMAN_yuy2: return "yuy2";
- case PIXMAN_yv12: return "yv12";
- };
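+/* Print @s followed by a comma, wrapping to a fresh indented line
+ * once more than 60 characters have accumulated, so the listings
+ * below stay readable.
+ */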
+static void
+emit (const char *s, int *n_chars)
+{
+ *n_chars += printf ("%s,", s);
+ if (*n_chars > 60)
+ {
+ printf ("\n ");
+ *n_chars = 0;
+ }
+ else
+ {
+ printf (" ");
+ (*n_chars)++;
+ }
+}
- /* Fake formats.
- *
- * This is separate switch to prevent GCC from complaining
- * that the values are not in the pixman_format_code_t enum.
- */
- switch ((uint32_t)format)
+void
+list_formats (void)
+{
+ int n_chars;
+ int i;
+
+ printf ("Formats:\n ");
+
+ n_chars = 0;
+ for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
{
- case PIXMAN_null: return "null";
- case PIXMAN_solid: return "solid";
- case PIXMAN_pixbuf: return "pixbuf";
- case PIXMAN_rpixbuf: return "rpixbuf";
- case PIXMAN_unknown: return "unknown";
- };
+ const format_entry_t *ent = &format_list[i];
+
+ if (ent->is_alias)
+ continue;
+
+ emit (ent->name, &n_chars);
+ }
+
+ printf ("\n\n");
+}
+
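+/* List the canonical operator names, lower-cased and without the
+ * PIXMAN_OP_ prefix: PIXMAN_OP_OVER_REVERSE, for example, is listed
+ * as "over_reverse".
+ */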
+void
+list_operators (void)
+{
+ char short_name [128] = { 0 };
+ int i, n_chars;
+
+ printf ("Operators:\n ");
+
+ n_chars = 0;
+ for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
+ {
+ const operator_entry_t *ent = &op_list[i];
+ int j;
+
+ if (ent->is_alias)
+ continue;
+
+ snprintf (short_name, sizeof (short_name) - 1, "%s",
+ ent->name + strlen ("PIXMAN_OP_"));
+
+ for (j = 0; short_name[j] != '\0'; ++j)
+ short_name[j] = tolower (short_name[j]);
+
+ emit (short_name, &n_chars);
+ }
+
+ printf ("\n\n");
+}
+
+void
+list_dithers (void)
+{
+ int n_chars;
+ int i;
+
+ printf ("Dithers:\n ");
+
+ n_chars = 0;
+ for (i = 0; i < ARRAY_LENGTH (dither_list); ++i)
+ {
+ const dither_entry_t *ent = &dither_list[i];
+
+ if (ent->is_alias)
+ continue;
+
+ emit (ent->name, &n_chars);
+ }
+
+ printf ("\n\n");
+}
+
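+/* Accept either the suffix of a canonical name or an alias,
+ * case-insensitively: "over", "OVER" and "outrev" map to
+ * PIXMAN_OP_OVER, PIXMAN_OP_OVER and PIXMAN_OP_OUT_REVERSE; an
+ * unrecognized string maps to PIXMAN_OP_NONE.
+ */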
+pixman_op_t
+operator_from_string (const char *s)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
+ {
+ const operator_entry_t *ent = &op_list[i];
+
+ if (ent->is_alias)
+ {
+ if (strcasecmp (ent->name, s) == 0)
+ return ent->op;
+ }
+ else
+ {
+ if (strcasecmp (ent->name + strlen ("PIXMAN_OP_"), s) == 0)
+ return ent->op;
+ }
+ }
+
+ return PIXMAN_OP_NONE;
+}
+
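+/* Note that, unlike operator_from_string (), this expects the full
+ * table name, e.g. "PIXMAN_DITHER_ORDERED_BAYER_8"; a bare suffix is
+ * not recognized.
+ */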
+pixman_dither_t
+dither_from_string (const char *s)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (dither_list); ++i)
+ {
+ const dither_entry_t *ent = &dither_list[i];
+
+ if (strcasecmp (ent->name, s) == 0)
+ return ent->dither;
+ }
+
+ return PIXMAN_DITHER_NONE;
+}
+
+const char *
+operator_name (pixman_op_t op)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
+ {
+ const operator_entry_t *ent = &op_list[i];
+
+ if (ent->op == op)
+ return ent->name;
+ }
+
+ return "<unknown operator>";
+}
+
+const char *
+format_name (pixman_format_code_t format)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
+ {
+ const format_entry_t *ent = &format_list[i];
+
+ if (ent->format == format)
+ return ent->name;
+ }
return "<unknown format>";
};
+
+const char *
+dither_name (pixman_dither_t dither)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (dither_list); ++i)
+ {
+ const dither_entry_t *ent = &dither_list[i];
+
+ if (ent->dither == dither)
+ return ent->name;
+ }
+
+ return "<unknown dither>";
+}
+
#define IS_ZERO(f) (-DBL_MIN < (f) && (f) < DBL_MIN)
typedef double (* blend_func_t) (double as, double s, double ad, double d);
@@ -1596,6 +2007,10 @@ round_color (pixman_format_code_t format, color_t *color)
color->a = round_channel (color->a, PIXMAN_FORMAT_A (format));
}
+/* The acceptable per-channel deviation, on the normalized
+ * [0.0, 1.0] scale.
+ */
+#define DEVIATION (0.0128)
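+/* On an 8-bit channel this is 0.0128 * 255 ~ 3.3 quantization levels
+ * of slack in either direction; on a 5-bit channel it is
+ * 0.0128 * 31 ~ 0.4 of a level.
+ */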
+
/* Check whether @pixel is a valid quantization of the a, r, g, b
* parameters. Some slack is permitted.
*/
@@ -1606,6 +2021,10 @@ pixel_checker_init (pixel_checker_t *checker, pixman_format_code_t format)
checker->format = format;
+ if (format == PIXMAN_rgba_float ||
+ format == PIXMAN_rgb_float)
+ return;
+
switch (PIXMAN_FORMAT_TYPE (format))
{
case PIXMAN_TYPE_A:
@@ -1651,21 +2070,46 @@ pixel_checker_init (pixel_checker_t *checker, pixman_format_code_t format)
break;
}
- checker->am = ((1 << PIXMAN_FORMAT_A (format)) - 1) << checker->as;
- checker->rm = ((1 << PIXMAN_FORMAT_R (format)) - 1) << checker->rs;
- checker->gm = ((1 << PIXMAN_FORMAT_G (format)) - 1) << checker->gs;
- checker->bm = ((1 << PIXMAN_FORMAT_B (format)) - 1) << checker->bs;
+ checker->am = ((1U << PIXMAN_FORMAT_A (format)) - 1) << checker->as;
+ checker->rm = ((1U << PIXMAN_FORMAT_R (format)) - 1) << checker->rs;
+ checker->gm = ((1U << PIXMAN_FORMAT_G (format)) - 1) << checker->gs;
+ checker->bm = ((1U << PIXMAN_FORMAT_B (format)) - 1) << checker->bs;
checker->aw = PIXMAN_FORMAT_A (format);
checker->rw = PIXMAN_FORMAT_R (format);
checker->gw = PIXMAN_FORMAT_G (format);
checker->bw = PIXMAN_FORMAT_B (format);
+
+ checker->ad = DEVIATION;
+ checker->rd = DEVIATION;
+ checker->gd = DEVIATION;
+ checker->bd = DEVIATION;
+}
+
+/* When dithering is enabled, we allow one extra quantization step
+ * of tolerance per channel. Channels of width zero are skipped to
+ * avoid dividing by zero.
+ */
+void
+pixel_checker_allow_dither (pixel_checker_t *checker)
+{
+ if (checker->aw)
+ checker->ad += 1 / (double)((1 << checker->aw) - 1);
+ if (checker->rw)
+ checker->rd += 1 / (double)((1 << checker->rw) - 1);
+ if (checker->gw)
+ checker->gd += 1 / (double)((1 << checker->gw) - 1);
+ if (checker->bw)
+ checker->bd += 1 / (double)((1 << checker->bw) - 1);
+}
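+
+/* For r5g6b5, for example, pixel_checker_allow_dither () grows the
+ * red and blue tolerances by 1/31 ~ 0.032 (to ~0.045 in total) and
+ * the green tolerance by 1/63 ~ 0.016: exactly one representable
+ * step per channel.
+ */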
+
+static void
+pixel_checker_require_uint32_format (const pixel_checker_t *checker)
+{
+ assert (checker->format != PIXMAN_rgba_float &&
+ checker->format != PIXMAN_rgb_float);
}
void
pixel_checker_split_pixel (const pixel_checker_t *checker, uint32_t pixel,
int *a, int *r, int *g, int *b)
{
+ pixel_checker_require_uint32_format (checker);
+
*a = (pixel & checker->am) >> checker->as;
*r = (pixel & checker->rm) >> checker->rs;
*g = (pixel & checker->gm) >> checker->gs;
@@ -1679,6 +2123,8 @@ pixel_checker_get_masks (const pixel_checker_t *checker,
uint32_t *gm,
uint32_t *bm)
{
+ pixel_checker_require_uint32_format (checker);
+
if (am)
*am = checker->am;
if (rm)
@@ -1695,6 +2141,8 @@ pixel_checker_convert_pixel_to_color (const pixel_checker_t *checker,
{
int a, r, g, b;
+ pixel_checker_require_uint32_format (checker);
+
pixel_checker_split_pixel (checker, pixel, &a, &r, &g, &b);
if (checker->am == 0)
@@ -1740,7 +2188,7 @@ convert (double v, uint32_t width, uint32_t mask, uint32_t shift, double def)
}
static void
-get_limits (const pixel_checker_t *checker, double limit,
+get_limits (const pixel_checker_t *checker, double sign,
color_t *color,
int *ao, int *ro, int *go, int *bo)
{
@@ -1756,28 +2204,32 @@ get_limits (const pixel_checker_t *checker, double limit,
color = &tmp;
}
- *ao = convert (color->a + limit, checker->aw, checker->am, checker->as, 1.0);
- *ro = convert (color->r + limit, checker->rw, checker->rm, checker->rs, 0.0);
- *go = convert (color->g + limit, checker->gw, checker->gm, checker->gs, 0.0);
- *bo = convert (color->b + limit, checker->bw, checker->bm, checker->bs, 0.0);
+ *ao = convert (color->a + sign * checker->ad,
+ checker->aw, checker->am, checker->as, 1.0);
+ *ro = convert (color->r + sign * checker->rd,
+ checker->rw, checker->rm, checker->rs, 0.0);
+ *go = convert (color->g + sign * checker->gd,
+ checker->gw, checker->gm, checker->gs, 0.0);
+ *bo = convert (color->b + sign * checker->bd,
+ checker->bw, checker->bm, checker->bs, 0.0);
}
-/* The acceptable deviation in units of [0.0, 1.0]
- */
-#define DEVIATION (0.0128)
-
void
pixel_checker_get_max (const pixel_checker_t *checker, color_t *color,
int *am, int *rm, int *gm, int *bm)
{
- get_limits (checker, DEVIATION, color, am, rm, gm, bm);
+ pixel_checker_require_uint32_format (checker);
+
+ get_limits (checker, 1, color, am, rm, gm, bm);
}
void
pixel_checker_get_min (const pixel_checker_t *checker, color_t *color,
int *am, int *rm, int *gm, int *bm)
{
- get_limits (checker, - DEVIATION, color, am, rm, gm, bm);
+ pixel_checker_require_uint32_format (checker);
+
+ get_limits (checker, - 1, color, am, rm, gm, bm);
}
pixman_bool_t
@@ -1788,6 +2240,8 @@ pixel_checker_check (const pixel_checker_t *checker, uint32_t pixel,
int32_t ai, ri, gi, bi;
pixman_bool_t result;
+ pixel_checker_require_uint32_format (checker);
+
pixel_checker_get_min (checker, color, &a_lo, &r_lo, &g_lo, &b_lo);
pixel_checker_get_max (checker, color, &a_hi, &r_hi, &g_hi, &b_hi);
pixel_checker_split_pixel (checker, pixel, &ai, &ri, &gi, &bi);
@@ -1800,3 +2254,36 @@ pixel_checker_check (const pixel_checker_t *checker, uint32_t pixel,
return result;
}
+
+static void
+color_limits (const pixel_checker_t *checker,
+ double limit, const color_t *color, color_t *out)
+{
+ if (PIXMAN_FORMAT_A (checker->format))
+ out->a = color->a + limit;
+ else
+ out->a = 1.0;
+
+ out->r = color->r + limit;
+ out->g = color->g + limit;
+ out->b = color->b + limit;
+}
+
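+/* Compare an actual color directly against a reference, allowing
+ * DEVIATION in each channel; this is the intended check for the
+ * float formats, which the uint32-based helpers above reject.
+ * A sketch of intended use:
+ *
+ * pixel_checker_t checker;
+ * pixel_checker_init (&checker, PIXMAN_rgba_float);
+ * if (!pixel_checker_check_color (&checker, &actual, &reference))
+ * ... report the failure ...
+ */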
+pixman_bool_t
+pixel_checker_check_color (const pixel_checker_t *checker,
+ const color_t *actual, const color_t *reference)
+{
+ color_t min, max;
+ pixman_bool_t result;
+
+ color_limits (checker, -DEVIATION, reference, &min);
+ color_limits (checker, DEVIATION, reference, &max);
+
+ result =
+ actual->a >= min.a && actual->a <= max.a &&
+ actual->r >= min.r && actual->r <= max.r &&
+ actual->g >= min.g && actual->g <= max.g &&
+ actual->b >= min.b && actual->b <= max.b;
+
+ return result;
+}
diff --git a/test/utils.h b/test/utils/utils.h
index 6804334..d3e1ba4 100644
--- a/test/utils.h
+++ b/test/utils/utils.h
@@ -1,5 +1,5 @@
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <assert.h>
@@ -86,6 +86,17 @@ is_little_endian (void)
void
image_endian_swap (pixman_image_t *img);
+#if defined (HAVE_MPROTECT) && defined (HAVE_GETPAGESIZE) && \
+ defined (HAVE_SYS_MMAN_H) && defined (HAVE_MMAP)
+/* When this is set, fence_malloc and friends have a working fence
+ * implementation. Without it, fence_malloc still allocates, but
+ * out-of-bounds accesses are not caught.
+ */
+#define FENCE_MALLOC_ACTIVE 1
+#else
+#define FENCE_MALLOC_ACTIVE 0
+#endif
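+
+/* Usage sketch: when the fence is active, accesses that stray onto
+ * the protected guard pages fault immediately, e.g.
+ *
+ * uint8_t *buf = fence_malloc (len);
+ * buf[-1] = 0; -- underrun, lands on a leading guard page
+ * fence_free (buf);
+ */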
+
/* Allocate memory that is bounded by protected pages,
* so that out-of-bounds access will cause segfaults
*/
@@ -95,9 +106,21 @@ fence_malloc (int64_t len);
void
fence_free (void *data);
+pixman_image_t *
+fence_image_create_bits (pixman_format_code_t format,
+ int min_width,
+ int height,
+ pixman_bool_t stride_fence);
+
+/* Return the page size if FENCE_MALLOC_ACTIVE, or zero otherwise */
+unsigned long
+fence_get_page_size (void);
+
/* Generate n_bytes random bytes in fence_malloced memory */
uint8_t *
make_random_bytes (int n_bytes);
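+
+/* Generate n_bytes worth of random floats in fence_malloced memory */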
+float *
+make_random_floats (int n_bytes);
/* Return current time in seconds */
double
@@ -187,12 +210,32 @@ convert_linear_to_srgb (double component);
void
initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb);
+pixman_format_code_t
+format_from_string (const char *s);
+
+void
+list_formats (void);
+
+void
+list_operators (void);
+
+void
+list_dithers (void);
+
+pixman_op_t
+operator_from_string (const char *s);
+
+pixman_dither_t
+dither_from_string (const char *s);
+
const char *
operator_name (pixman_op_t op);
const char *
format_name (pixman_format_code_t format);
+const char *
+dither_name (pixman_dither_t dither);
+
typedef struct
{
double r, g, b, a;
@@ -215,12 +258,16 @@ typedef struct
uint32_t am, rm, gm, bm;
uint32_t as, rs, gs, bs;
uint32_t aw, rw, gw, bw;
+ double ad, rd, gd, bd;
} pixel_checker_t;
void
pixel_checker_init (pixel_checker_t *checker, pixman_format_code_t format);
void
+pixel_checker_allow_dither (pixel_checker_t *checker);
+
+void
pixel_checker_split_pixel (const pixel_checker_t *checker, uint32_t pixel,
int *a, int *r, int *g, int *b);