-rw-r--r--  .editorconfig | 11
-rw-r--r--  .gitignore | 47
-rw-r--r--  .gitlab-ci.yml | 10
-rw-r--r--  ChangeLog | 0
-rw-r--r--  INSTALL | 228
-rw-r--r--  Makefile.am | 137
-rw-r--r--  Makefile.win32 | 25
-rw-r--r--  Makefile.win32.common | 56
-rw-r--r--  README | 78
-rw-r--r--  RELEASING | 5
-rw-r--r--  a64-neon-test.S | 5
-rw-r--r--  arm-simd-test.S | 10
-rwxr-xr-x  autogen.sh | 14
-rw-r--r--  configure.ac | 1129
-rw-r--r--  demos/Makefile.am | 52
-rw-r--r--  demos/conical-test.c | 2
-rw-r--r--  demos/dither.c | 277
-rw-r--r--  demos/dither.ui | 147
-rw-r--r--  demos/gtk-utils.c | 16
-rw-r--r--  demos/linear-gradient.c | 2
-rw-r--r--  demos/meson.build | 66
-rw-r--r--  demos/radial-test.c | 2
-rw-r--r--  demos/scale.c | 132
-rw-r--r--  demos/scale.ui | 2
-rw-r--r--  demos/tri-test.c | 2
-rw-r--r--  meson.build | 619
-rw-r--r--  meson_options.txt | 128
-rw-r--r--  neon-test.S | 12
-rw-r--r--  pixman-1-uninstalled.pc.in | 5
-rw-r--r--  pixman-1.pc.in | 11
-rw-r--r--  pixman/Makefile.am | 141
-rw-r--r--  pixman/Makefile.sources | 42
-rw-r--r--  pixman/Makefile.win32 | 93
-rw-r--r--  pixman/dither/blue-noise-64x64.h | 77
-rw-r--r--  pixman/dither/make-blue-noise.c | 679
-rw-r--r--  pixman/loongson-mmintrin.h | 18
-rw-r--r--  pixman/make-srgb.pl | 2
-rw-r--r--  pixman/meson.build | 143
-rw-r--r--  pixman/pixman-access.c | 320
-rw-r--r--  pixman/pixman-arm-asm.h | 38
-rw-r--r--  pixman/pixman-arm-common.h | 11
-rw-r--r--  pixman/pixman-arm-neon-asm-bilinear.S | 362
-rw-r--r--  pixman/pixman-arm-neon-asm.S | 436
-rw-r--r--  pixman/pixman-arm-neon-asm.h | 607
-rw-r--r--  pixman/pixman-arm-neon.c | 49
-rw-r--r--  pixman/pixman-arm-simd-asm-scaled.S | 42
-rw-r--r--  pixman/pixman-arm-simd-asm.S | 499
-rw-r--r--  pixman/pixman-arm-simd-asm.h | 304
-rw-r--r--  pixman/pixman-arm-simd.c | 24
-rw-r--r--  pixman/pixman-arm.c | 33
-rw-r--r--  pixman/pixman-arma64-neon-asm-bilinear.S | 1276
-rw-r--r--  pixman/pixman-arma64-neon-asm.S | 3704
-rw-r--r--  pixman/pixman-arma64-neon-asm.h | 1310
-rw-r--r--  pixman/pixman-bits-image.c | 582
-rw-r--r--  pixman/pixman-combine-float.c | 2
-rw-r--r--  pixman/pixman-combine32.c | 6
-rw-r--r--  pixman/pixman-combine32.h | 2
-rw-r--r--  pixman/pixman-compiler.h | 8
-rw-r--r--  pixman/pixman-conical-gradient.c | 38
-rw-r--r--  pixman/pixman-edge.c | 2
-rw-r--r--  pixman/pixman-fast-path.c | 32
-rw-r--r--  pixman/pixman-filter.c | 265
-rw-r--r--  pixman/pixman-general.c | 27
-rw-r--r--  pixman/pixman-glyph.c | 2
-rw-r--r--  pixman/pixman-gradient-walker.c | 116
-rw-r--r--  pixman/pixman-image.c | 113
-rw-r--r--  pixman/pixman-implementation.c | 20
-rw-r--r--  pixman/pixman-inlines.h | 28
-rw-r--r--  pixman/pixman-linear-gradient.c | 49
-rw-r--r--  pixman/pixman-matrix.c | 4
-rw-r--r--  pixman/pixman-mips-dspr2-asm.S | 2
-rw-r--r--  pixman/pixman-mips-dspr2-asm.h | 3
-rw-r--r--  pixman/pixman-mips-dspr2.c | 12
-rw-r--r--  pixman/pixman-mips-dspr2.h | 8
-rw-r--r--  pixman/pixman-mips.c | 2
-rw-r--r--  pixman/pixman-mmx.c | 146
-rw-r--r--  pixman/pixman-noop.c | 2
-rw-r--r--  pixman/pixman-ppc.c | 20
-rw-r--r--  pixman/pixman-private.h | 75
-rw-r--r--  pixman/pixman-radial-gradient.c | 132
-rw-r--r--  pixman/pixman-region.c | 64
-rw-r--r--  pixman/pixman-region16.c | 2
-rw-r--r--  pixman/pixman-region32.c | 2
-rw-r--r--  pixman/pixman-solid-fill.c | 10
-rw-r--r--  pixman/pixman-sse2.c | 126
-rw-r--r--  pixman/pixman-ssse3.c | 2
-rw-r--r--  pixman/pixman-timer.c | 2
-rw-r--r--  pixman/pixman-trap.c | 4
-rw-r--r--  pixman/pixman-utils.c | 8
-rw-r--r--  pixman/pixman-version.h.in | 4
-rw-r--r--  pixman/pixman-vmx.c | 1214
-rw-r--r--  pixman/pixman-x86.c | 66
-rw-r--r--  pixman/pixman.c | 47
-rw-r--r--  pixman/pixman.h | 509
-rw-r--r--  pixman/rounding.txt | 1
-rw-r--r--  test/Makefile.am | 13
-rw-r--r--  test/Makefile.sources | 51
-rw-r--r--  test/Makefile.win32 | 54
-rw-r--r--  test/affine-bench.c | 448
-rw-r--r--  test/affine-test.c | 2
-rw-r--r--  test/alphamap.c | 73
-rw-r--r--  test/check-formats.c | 192
-rw-r--r--  test/composite.c | 1
-rw-r--r--  test/cover-test.c | 449
-rw-r--r--  test/fence-image-self-test.c | 239
-rw-r--r--  test/filter-reduction-test.c | 112
-rw-r--r--  test/lowlevel-blt-bench.c | 512
-rw-r--r--  test/matrix-test.c | 4
-rw-r--r--  test/meson.build | 90
-rw-r--r--  test/scaling-test.c | 66
-rw-r--r--  test/solid-test.c | 354
-rw-r--r--  test/stress-test.c | 78
-rw-r--r--  test/thread-test.c | 87
-rw-r--r--  test/tolerance-test.c | 23
-rw-r--r--  test/utils/meson.build | 31
-rw-r--r--  test/utils/utils-prng.c (renamed from test/utils-prng.c) | 15
-rw-r--r--  test/utils/utils-prng.h (renamed from test/utils-prng.h) | 2
-rw-r--r--  test/utils/utils.c (renamed from test/utils.c) | 809
-rw-r--r--  test/utils/utils.h (renamed from test/utils.h) | 49
119 files changed, 16297 insertions(+), 4656 deletions(-)
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..5153bb0
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,11 @@
+# To use this config in your editor, follow the instructions at:
+# http://editorconfig.org
+
+root = true
+
+[*]
+tab_width = 8
+
+[{meson.build,meson_options.txt}]
+indent_style = space
+indent_size = 2
diff --git a/.gitignore b/.gitignore
index 0f11496..046b161 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,61 +26,28 @@ stamp-h?
config.h
config.h.in
.*.swp
-demos/alpha-test
+demos/*-test
demos/checkerboard
demos/clip-in
-demos/clip-test
-demos/composite-test
-demos/conical-test
-demos/convolution-test
-demos/gradient-test
demos/linear-gradient
demos/quad2quad
-demos/radial-test
demos/scale
-demos/screen-test
-demos/srgb-test
-demos/srgb-trap-test
-demos/trap-test
-demos/tri-test
+demos/dither
pixman/pixman-srgb.c
pixman/pixman-version.h
-test/a1-trap-test
-test/affine-test
+test/*-test
+test/affine-bench
test/alpha-loop
test/alphamap
-test/alpha-test
-test/blitters-test
+test/check-formats
test/clip-in
-test/clip-test
-test/combiner-test
test/composite
-test/composite-test
-test/composite-traps-test
-test/convolution-test
-test/fetch-test
-test/glyph-test
-test/gradient-crash-test
-test/gradient-test
test/infinite-loop
test/lowlevel-blt-bench
-test/oob-test
-test/pdf-op-test
-test/prng-test
-test/radial-perf-test
-test/region-contains-test
-test/region-test
+test/radial-invalid
test/region-translate
-test/region-translate-test
-test/rotate-test
-test/scaling-crash-test
-test/scaling-helpers-test
-test/scaling-test
-test/screen-test
-test/stress-test
+test/scaling-bench
test/trap-crasher
-test/trap-test
-test/window-test
*.pdb
*.dll
*.lib
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..296b6cd
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,10 @@
+image: fedora:39
+
+meson-build:
+ script:
+ - dnf -y install dnf-plugins-core
+ - dnf -y groupinstall buildsys-build
+ - dnf -y builddep pixman
+ - dnf -y install meson
+ - meson setup build
+ - ninja -C build test
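
The CI job above doubles as the quickest way to smoke-test the meson port locally. A minimal sketch outside the Fedora container, assuming meson and ninja are already installed (the `build` directory name is only a convention):

    meson setup build
    ninja -C build test
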
diff --git a/ChangeLog b/ChangeLog
deleted file mode 100644
index e69de29..0000000
--- a/ChangeLog
+++ /dev/null
diff --git a/INSTALL b/INSTALL
index 5458714..5824704 100644
--- a/INSTALL
+++ b/INSTALL
@@ -10,225 +10,29 @@ unlimited permission to copy, distribute and modify it.
Basic Installation
==================
-Briefly, the shell commands `./configure; make; make install' should
-configure, build, and install this package. The following
-more-detailed instructions are generic; see the `README' file for
-instructions specific to this package.
+Briefly, the shell commands `meson setup build/; ninja -C build; ninja
+-C build install` should configure, build, and install this package. The
+following more-detailed instructions are generic; see the `README` file
+for instructions specific to this package.
- The `configure' shell script attempts to guess correct values for
-various system-dependent variables used during compilation. It uses
-those values to create a `Makefile' in each directory of the package.
-It may also create one or more `.h' files containing system-dependent
-definitions. Finally, it creates a shell script `config.status' that
-you can run in the future to recreate the current configuration, and a
-file `config.log' containing compiler output (useful mainly for
-debugging `configure').
-
- It can also use an optional file (typically called `config.cache'
-and enabled with `--cache-file=config.cache' or simply `-C') that saves
-the results of its tests to speed up reconfiguring. Caching is
-disabled by default to prevent problems with accidental use of stale
-cache files.
-
- If you need to do unusual things to compile the package, please try
-to figure out how `configure' could check whether to do them, and mail
-diffs or instructions to the address given in the `README' so they can
-be considered for the next release. If you are using the cache, and at
-some point `config.cache' contains results you don't want to keep, you
-may remove or edit it.
-
- The file `configure.ac' (or `configure.in') is used to create
-`configure' by a program called `autoconf'. You need `configure.ac' if
-you want to change it or regenerate `configure' using a newer version
-of `autoconf'.
+ Running `meson setup` attempts to guess correct values for various
+system-dependent variables used during compilation.
The simplest way to compile this package is:
- 1. `cd' to the directory containing the package's source code and type
- `./configure' to configure the package for your system.
+ 1. `cd` to the directory containing the package's source code and type
+ `meson setup build/` to configure the package for your system.
- Running `configure' might take a while. While running, it prints
- some messages telling which features it is checking for.
+ While running, it prints some messages telling which features it
+ is checking for.
- 2. Type `make' to compile the package.
+ 2. Type `ninja -C build` to compile the package.
- 3. Optionally, type `make check' to run any self-tests that come with
- the package.
+ 3. Optionally, type `ninja -C build test` to run any self-tests that
+ come with the package.
- 4. Type `make install' to install the programs and any data files and
- documentation.
+ 4. Type `ninja -C build install` to install the programs and any
+ data files and documentation.
5. You can remove the program binaries and object files from the
- source code directory by typing `make clean'. To also remove the
- files that `configure' created (so you can compile the package for
- a different kind of computer), type `make distclean'. There is
- also a `make maintainer-clean' target, but that is intended mainly
- for the package's developers. If you use it, you may have to get
- all sorts of other programs in order to regenerate files that came
- with the distribution.
-
-Compilers and Options
-=====================
-
-Some systems require unusual options for compilation or linking that the
-`configure' script does not know about. Run `./configure --help' for
-details on some of the pertinent environment variables.
-
- You can give `configure' initial values for configuration parameters
-by setting variables in the command line or in the environment. Here
-is an example:
-
- ./configure CC=c99 CFLAGS=-g LIBS=-lposix
-
- *Note Defining Variables::, for more details.
-
-Compiling For Multiple Architectures
-====================================
-
-You can compile the package for more than one kind of computer at the
-same time, by placing the object files for each architecture in their
-own directory. To do this, you can use GNU `make'. `cd' to the
-directory where you want the object files and executables to go and run
-the `configure' script. `configure' automatically checks for the
-source code in the directory that `configure' is in and in `..'.
-
- With a non-GNU `make', it is safer to compile the package for one
-architecture at a time in the source code directory. After you have
-installed the package for one architecture, use `make distclean' before
-reconfiguring for another architecture.
-
-Installation Names
-==================
-
-By default, `make install' installs the package's commands under
-`/usr/local/bin', include files under `/usr/local/include', etc. You
-can specify an installation prefix other than `/usr/local' by giving
-`configure' the option `--prefix=PREFIX'.
-
- You can specify separate installation prefixes for
-architecture-specific files and architecture-independent files. If you
-pass the option `--exec-prefix=PREFIX' to `configure', the package uses
-PREFIX as the prefix for installing programs and libraries.
-Documentation and other data files still use the regular prefix.
-
- In addition, if you use an unusual directory layout you can give
-options like `--bindir=DIR' to specify different values for particular
-kinds of files. Run `configure --help' for a list of the directories
-you can set and what kinds of files go in them.
-
- If the package supports it, you can cause programs to be installed
-with an extra prefix or suffix on their names by giving `configure' the
-option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
-
-Optional Features
-=================
-
-Some packages pay attention to `--enable-FEATURE' options to
-`configure', where FEATURE indicates an optional part of the package.
-They may also pay attention to `--with-PACKAGE' options, where PACKAGE
-is something like `gnu-as' or `x' (for the X Window System). The
-`README' should mention any `--enable-' and `--with-' options that the
-package recognizes.
-
- For packages that use the X Window System, `configure' can usually
-find the X include and library files automatically, but if it doesn't,
-you can use the `configure' options `--x-includes=DIR' and
-`--x-libraries=DIR' to specify their locations.
-
-Specifying the System Type
-==========================
-
-There may be some features `configure' cannot figure out automatically,
-but needs to determine by the type of machine the package will run on.
-Usually, assuming the package is built to be run on the _same_
-architectures, `configure' can figure that out, but if it prints a
-message saying it cannot guess the machine type, give it the
-`--build=TYPE' option. TYPE can either be a short name for the system
-type, such as `sun4', or a canonical name which has the form:
-
- CPU-COMPANY-SYSTEM
-
-where SYSTEM can have one of these forms:
-
- OS KERNEL-OS
-
- See the file `config.sub' for the possible values of each field. If
-`config.sub' isn't included in this package, then this package doesn't
-need to know the machine type.
-
- If you are _building_ compiler tools for cross-compiling, you should
-use the option `--target=TYPE' to select the type of system they will
-produce code for.
-
- If you want to _use_ a cross compiler, that generates code for a
-platform different from the build platform, you should specify the
-"host" platform (i.e., that on which the generated programs will
-eventually be run) with `--host=TYPE'.
-
-Sharing Defaults
-================
-
-If you want to set default values for `configure' scripts to share, you
-can create a site shell script called `config.site' that gives default
-values for variables like `CC', `cache_file', and `prefix'.
-`configure' looks for `PREFIX/share/config.site' if it exists, then
-`PREFIX/etc/config.site' if it exists. Or, you can set the
-`CONFIG_SITE' environment variable to the location of the site script.
-A warning: not all `configure' scripts look for a site script.
-
-Defining Variables
-==================
-
-Variables not defined in a site shell script can be set in the
-environment passed to `configure'. However, some packages may run
-configure again during the build, and the customized values of these
-variables may be lost. In order to avoid this problem, you should set
-them in the `configure' command line, using `VAR=value'. For example:
-
- ./configure CC=/usr/local2/bin/gcc
-
-causes the specified `gcc' to be used as the C compiler (unless it is
-overridden in the site shell script).
-
-Unfortunately, this technique does not work for `CONFIG_SHELL' due to
-an Autoconf bug. Until the bug is fixed you can use this workaround:
-
- CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
-
-`configure' Invocation
-======================
-
-`configure' recognizes the following options to control how it operates.
-
-`--help'
-`-h'
- Print a summary of the options to `configure', and exit.
-
-`--version'
-`-V'
- Print the version of Autoconf used to generate the `configure'
- script, and exit.
-
-`--cache-file=FILE'
- Enable the cache: use and save the results of the tests in FILE,
- traditionally `config.cache'. FILE defaults to `/dev/null' to
- disable caching.
-
-`--config-cache'
-`-C'
- Alias for `--cache-file=config.cache'.
-
-`--quiet'
-`--silent'
-`-q'
- Do not print messages saying which checks are being made. To
- suppress all normal output, redirect it to `/dev/null' (any error
- messages will still be shown).
-
-`--srcdir=DIR'
- Look for the package's source code in directory DIR. Usually
- `configure' can determine that directory automatically.
-
-`configure' also accepts some other, not widely useful, options. Run
-`configure --help' for more details.
-
+ source code directory by typing `ninja -C build clean`.
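
Putting the steps above together, a minimal sketch of a full configure/build/test/install cycle under meson (the `build/` directory name and the `--prefix` value are arbitrary example choices):

    meson setup build/ --prefix=/usr/local
    ninja -C build
    ninja -C build test
    ninja -C build install
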
diff --git a/Makefile.am b/Makefile.am
deleted file mode 100644
index 5137c9e..0000000
--- a/Makefile.am
+++ /dev/null
@@ -1,137 +0,0 @@
-SUBDIRS = pixman demos test
-
-pkgconfigdir=$(libdir)/pkgconfig
-pkgconfig_DATA=pixman-1.pc
-
-$(pkgconfig_DATA): pixman-1.pc.in
-
-snapshot:
- distdir="$(distdir)-`date '+%Y%m%d'`"; \
- test -d "$(srcdir)/.git" && distdir=$$distdir-`cd "$(srcdir)" && git rev-parse HEAD | cut -c 1-6`; \
- $(MAKE) $(AM_MAKEFLAGS) distdir="$$distdir" dist
-
-GPGKEY=3892336E
-USERNAME=$$USER
-RELEASE_OR_SNAPSHOT = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo release; else echo snapshot; fi)
-RELEASE_CAIRO_HOST = $(USERNAME)@cairographics.org
-RELEASE_CAIRO_DIR = /srv/cairo.freedesktop.org/www/$(RELEASE_OR_SNAPSHOT)s
-RELEASE_CAIRO_URL = http://cairographics.org/$(RELEASE_OR_SNAPSHOT)s
-RELEASE_XORG_URL = http://xorg.freedesktop.org/archive/individual/lib
-RELEASE_XORG_HOST = $(USERNAME)@xorg.freedesktop.org
-RELEASE_XORG_DIR = /srv/xorg.freedesktop.org/archive/individual/lib
-RELEASE_ANNOUNCE_LIST = cairo-announce@cairographics.org, xorg-announce@lists.freedesktop.org, pixman@lists.freedesktop.org
-
-EXTRA_DIST = \
- Makefile.win32 \
- Makefile.win32.common
-
-tar_gz = $(PACKAGE)-$(VERSION).tar.gz
-tar_bz2 = $(PACKAGE)-$(VERSION).tar.bz2
-
-sha1_tgz = $(tar_gz).sha1
-md5_tgz = $(tar_gz).md5
-
-sha1_tbz2 = $(tar_bz2).sha1
-md5_tbz2 = $(tar_bz2).md5
-
-gpg_file = $(sha1_tgz).asc
-
-$(sha1_tgz): $(tar_gz)
- sha1sum $^ > $@
-
-$(md5_tgz): $(tar_gz)
- md5sum $^ > $@
-
-$(sha1_tbz2): $(tar_bz2)
- sha1sum $^ > $@
-
-$(md5_tbz2): $(tar_bz2)
- md5sum $^ > $@
-
-$(gpg_file): $(sha1_tgz)
- @echo "Please enter your GPG password to sign the checksum."
- gpg --armor --sign $^
-
-HASHFILES = $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(md5_tbz2)
-
-release-verify-newer:
- @echo -n "Checking that no $(VERSION) release already exists at $(RELEASE_XORG_HOST)..."
- @ssh $(RELEASE_XORG_HOST) test ! -e $(RELEASE_XORG_DIR)/$(tar_gz) \
- || (echo "Ouch." && echo "Found: $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)/$(tar_gz)" \
- && echo "Refusing to try to generate a new release of the same name." \
- && false)
- @ssh $(RELEASE_CAIRO_HOST) test ! -e $(RELEASE_CAIRO_DIR)/$(tar_gz) \
- || (echo "Ouch." && echo "Found: $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)/$(tar_gz)" \
- && echo "Refusing to try to generate a new release of the same name." \
- && false)
- @echo "Good."
-
-release-remove-old:
- $(RM) $(tar_gz) $(tar_bz2) $(HASHFILES) $(gpg_file)
-
-ensure-prev:
- @if [[ "$(PREV)" == "" ]]; then \
- echo "" && \
- echo "You must set the PREV variable on the make command line to" && \
- echo "the last version." && \
- echo "" && \
- echo "For example:" && \
- echo " make PREV=0.7.3" && \
- echo "" && \
- false; \
- fi
-
-release-check: ensure-prev release-verify-newer release-remove-old distcheck
-
-release-tag:
- git tag -u $(GPGKEY) -m "$(PACKAGE) $(VERSION) release" $(PACKAGE)-$(VERSION)
-
-release-upload: release-check $(tar_gz) $(tar_bz2) $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(gpg_file)
- scp $(tar_gz) $(sha1_tgz) $(gpg_file) $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)
- scp $(tar_gz) $(tar_bz2) $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)
- ssh $(RELEASE_CAIRO_HOST) "rm -f $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-[0-9]* && ln -s $(tar_gz) $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-$(VERSION)"
-
-RELEASE_TYPE = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo "stable release in the" ; else echo "development snapshot leading up to a stable"; fi)
-
-release-publish-message: $(HASHFILES) ensure-prev
- @echo "Please follow the instructions in RELEASING to push stuff out and"
- @echo "send out the announcement mails. Here is the excerpt you need:"
- @echo ""
- @echo "Lists: $(RELEASE_ANNOUNCE_LIST)"
- @echo "Subject: [ANNOUNCE] $(PACKAGE) release $(VERSION) now available"
- @echo "============================== CUT HERE =============================="
- @echo "A new $(PACKAGE) release $(VERSION) is now available. This is a $(RELEASE_TYPE)"
- @echo ""
- @echo "tar.gz:"
- @echo " $(RELEASE_CAIRO_URL)/$(tar_gz)"
- @echo " $(RELEASE_XORG_URL)/$(tar_gz)"
- @echo ""
- @echo "tar.bz2:"
- @echo " $(RELEASE_XORG_URL)/$(tar_bz2)"
- @echo ""
- @echo "Hashes:"
- @echo -n " MD5: "
- @cat $(md5_tgz)
- @echo -n " MD5: "
- @cat $(md5_tbz2)
- @echo -n " SHA1: "
- @cat $(sha1_tgz)
- @echo -n " SHA1: "
- @cat $(sha1_tbz2)
- @echo ""
- @echo "GPG signature:"
- @echo " $(RELEASE_CAIRO_URL)/$(gpg_file)"
- @echo " (signed by`gpg --list-keys $(GPGKEY) | grep uid | cut -b4- | tr -s " "`)"
- @echo ""
- @echo "Git:"
- @echo " git://git.freedesktop.org/git/pixman"
- @echo " tag: $(PACKAGE)-$(VERSION)"
- @echo ""
- @echo "Log:"
- @git log --no-merges "$(PACKAGE)-$(PREV)".."$(PACKAGE)-$(VERSION)" | git shortlog | awk '{ printf "\t"; print ; }' | cut -b1-80
- @echo "============================== CUT HERE =============================="
- @echo ""
-
-release-publish: release-upload release-tag release-publish-message
-
-.PHONY: release-upload release-publish release-publish-message release-tag
diff --git a/Makefile.win32 b/Makefile.win32
deleted file mode 100644
index c3ca3bc..0000000
--- a/Makefile.win32
+++ /dev/null
@@ -1,25 +0,0 @@
-default: all
-
-top_srcdir = .
-include $(top_srcdir)/Makefile.win32.common
-
-all: pixman test
-
-pixman:
- @$(MAKE) -C pixman -f Makefile.win32
-
-test:
- @$(MAKE) -C test -f Makefile.win32
-
-clean_r:
- @$(MAKE) -C pixman -f Makefile.win32 clean
- @$(MAKE) -C test -f Makefile.win32 clean
-
-check:
- @$(MAKE) -C test -f Makefile.win32 check
-
-
-clean: clean_r
-
-
-.PHONY: all pixman test clean check
diff --git a/Makefile.win32.common b/Makefile.win32.common
deleted file mode 100644
index 777f94c..0000000
--- a/Makefile.win32.common
+++ /dev/null
@@ -1,56 +0,0 @@
-LIBRARY = pixman-1
-
-CC = cl
-LD = link
-AR = lib
-PERL = perl
-
-ifeq ($(top_builddir),)
-top_builddir = $(top_srcdir)
-endif
-
-CFG_VAR = $(CFG)
-ifeq ($(CFG_VAR),)
-CFG_VAR = release
-endif
-
-ifeq ($(CFG_VAR),debug)
-CFG_CFLAGS = -MDd -Od -Zi
-CFG_LDFLAGS = -DEBUG
-else
-CFG_CFLAGS = -MD -O2
-CFG_LDFLAGS =
-endif
-
-# Package definitions, to be used instead of those provided in config.h
-PKG_CFLAGS = -DPACKAGE=$(LIBRARY) -DPACKAGE_VERSION="" -DPACKAGE_BUGREPORT=""
-
-BASE_CFLAGS = -nologo -I. -I$(top_srcdir) -I$(top_srcdir)/pixman
-
-PIXMAN_CFLAGS = $(BASE_CFLAGS) $(PKG_CFLAGS) $(CFG_CFLAGS) $(CFLAGS)
-PIXMAN_LDFLAGS = -nologo $(CFG_LDFLAGS) $(LDFLAGS)
-PIXMAN_ARFLAGS = -nologo $(LDFLAGS)
-
-
-inform:
-ifneq ($(CFG),release)
-ifneq ($(CFG),debug)
-ifneq ($(CFG),)
- @echo "Invalid specified configuration option: "$(CFG)"."
- @echo
- @echo "Possible choices for configuration are 'release' and 'debug'"
- @exit 1
-endif
- @echo "Using default RELEASE configuration... (use CFG=release or CFG=debug)"
-endif
-endif
-
-
-$(CFG_VAR)/%.obj: %.c $(libpixman_headers)
- @mkdir -p $(CFG_VAR)
- @$(CC) -c $(PIXMAN_CFLAGS) -Fo"$@" $<
-
-clean: inform
- @$(RM) $(CFG_VAR)/*.{exe,ilk,lib,obj,pdb} $(BUILT_SOURCES) || exit 0
-
-.PHONY: inform clean
diff --git a/README b/README
index 6d8cfd8..e0e7dca 100644
--- a/README
+++ b/README
@@ -1,14 +1,20 @@
+Pixman
+======
+
Pixman is a library that provides low-level pixel manipulation
features such as image compositing and trapezoid rasterization.
-Questions, bug reports and patches should be directed to the pixman
-mailing list:
+Questions should be directed to the pixman mailing list:
- http://lists.freedesktop.org/mailman/listinfo/pixman
+ https://lists.freedesktop.org/mailman/listinfo/pixman
You can also file bugs at
- https://bugs.freedesktop.org/enter_bug.cgi?product=pixman
+ https://gitlab.freedesktop.org/pixman/pixman/-/issues/new
+
+or submit improvements in the form of a Merge Request via
+
+ https://gitlab.freedesktop.org/pixman/pixman/-/merge_requests
For real time discussions about pixman, feel free to join the IRC
channels #cairo and #xorg-devel on the FreeNode IRC network.
@@ -21,53 +27,65 @@ In order to contribute to pixman, you will need a working knowledge of
the git version control system. For a quick getting started guide,
there is the "Everyday Git With 20 Commands Or So guide"
- http://www.kernel.org/pub/software/scm/git/docs/everyday.html
+ https://www.kernel.org/pub/software/scm/git/docs/everyday.html
from the Git homepage. For more in depth git documentation, see the
resources on the Git community documentation page:
- http://git-scm.com/documentation
+ https://git-scm.com/documentation
Pixman uses the infrastructure from the freedesktop.org umbrella
project. For instructions about how to use the git service on
freedesktop.org, see:
- http://www.freedesktop.org/wiki/Infrastructure/git/Developers
+ https://www.freedesktop.org/wiki/Infrastructure/git/Developers
The Pixman master repository can be found at:
- git://anongit.freedesktop.org/git/pixman
-
-and browsed on the web here:
-
- http://cgit.freedesktop.org/pixman/
+ https://gitlab.freedesktop.org/pixman/pixman
Sending patches
---------------
-The general workflow for sending patches is to first make sure that
-git can send mail on your system. Then,
+Patches should be submitted in the form of Merge Requests via GitLab.
- - create a branch off of master in your local git repository
+You will first need to create a fork of the main pixman repository at
- - make your changes as one or more commits
+ https://gitlab.freedesktop.org/pixman/pixman
- - use the
+via the Fork button on the top right. Once that is done, you can add your
+personal repository as a remote to your local pixman development git checkout:
- git send-email
+ git remote add my-gitlab git@gitlab.freedesktop.org:YOURUSERNAME/pixman.git
- command to send the patch series to pixman@lists.freedesktop.org.
+ git fetch my-gitlab
-In order for your patches to be accepted, please consider the
-following guidelines:
+Make sure you have added SSH keys to your GitLab profile at
- - This link:
+ https://gitlab.freedesktop.org/profile/keys
- http://www.kernel.org/pub/software/scm/git/docs/user-manual.html#patch-series
+Once that is set up, the general workflow for sending patches is to create a
+new local branch with your improvements and, once it's ready, push it to your
+personal pixman fork:
- describes how what a good patch series is, and to create one with
- git.
+ git checkout -b fix-some-bug
+ ...
+ git push my-gitlab
+
+The output of the `git push` command will include a link that allows you to
+create a Merge Request against the official pixman repository.
+
+Whenever you make changes to your branch (adding new commits or fixing up
+existing ones), push them back to your personal pixman fork:
+
+ git push -f my-gitlab
+
+If there is an open Merge Request, GitLab will automatically pick up the
+changes from your branch so pixman developers can review them again.
+
+In order for your patches to be accepted, please consider the
+following guidelines:
- At each point in the series, pixman should compile and the test
suite should pass.
@@ -79,7 +97,7 @@ following guidelines:
You can run the test suite with
- make check
+ meson test -C builddir
It will take around two minutes to run on a modern PC.
@@ -101,7 +119,7 @@ following guidelines:
- If review comments were incorporated, a brief version
history describing what those changes were.
- - For big patch series, send an introductory email with an overall
+ - For big patch series, write an introductory post with an overall
description of the patch series, including benchmarks and
motivation. Each commit message should still be descriptive and
include enough information to understand why this particular commit
@@ -111,6 +129,6 @@ Pixman has high standards for code quality and so almost everybody
should expect to have the first versions of their patches rejected.
If you think that the reviewers are wrong about something, or that the
-guidelines above are wrong, feel free to discuss the issue on the
-list. The purpose of the guidelines and code review is to ensure high
-code quality; it is not an exercise in compliance.
+guidelines above are wrong, feel free to discuss the issue. The purpose
+of the guidelines and code review is to ensure high code quality; it is
+not an exercise in compliance.
diff --git a/RELEASING b/RELEASING
index 657857d..fc4edc9 100644
--- a/RELEASING
+++ b/RELEASING
@@ -10,12 +10,11 @@ Here are the steps to follow to create a new pixman release:
git log master...origin (no output; note: *3* dots)
-2) Increment pixman_(major|minor|micro) in configure.ac according to
- the directions in that file.
+2) Increment the version in meson.build.
3) Make sure that new version works, including
- - make distcheck passes
+ - meson test passes
- the X server still works with the new pixman version
installed
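
For step 2, the version that configure.ac used to assemble from pixman_major/minor/micro now lives in the project() call at the top of meson.build. A sketch of the line to edit (the version number here is purely illustrative):

    project('pixman', 'c', version : '0.38.0')
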
diff --git a/a64-neon-test.S b/a64-neon-test.S
new file mode 100644
index 0000000..5d4a4ea
--- /dev/null
+++ b/a64-neon-test.S
@@ -0,0 +1,5 @@
+.text
+.arch armv8-a
+.altmacro
+prfm pldl2strm, [x0]
+xtn v0.8b, v0.8h
diff --git a/arm-simd-test.S b/arm-simd-test.S
new file mode 100644
index 0000000..910c814
--- /dev/null
+++ b/arm-simd-test.S
@@ -0,0 +1,10 @@
+.text
+.arch armv6
+.object_arch armv4
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+uqadd8 r0, r0, r0
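
Like a64-neon-test.S above, this file is not compiled into the library; it is a probe that the build system tries to assemble to decide whether the ARM SIMD fast paths can be enabled, just as the deleted configure.ac check further down compiles the identical program with `-x assembler-with-cpp`. A sketch of how such a probe can be wired up in meson (assumed wiring, not necessarily verbatim from the new meson.build):

    cc = meson.get_compiler('c')
    have_arm_simd = cc.compiles(files('arm-simd-test.S')[0],
                                args : ['-x', 'assembler-with-cpp'],
                                name : 'ARM SIMD assembly')
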
diff --git a/autogen.sh b/autogen.sh
deleted file mode 100755
index fc34bd5..0000000
--- a/autogen.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /bin/sh
-
-srcdir=`dirname $0`
-test -z "$srcdir" && srcdir=.
-
-ORIGDIR=`pwd`
-cd $srcdir
-
-autoreconf -v --install || exit 1
-cd $ORIGDIR || exit $?
-
-if test -z "$NOCONFIGURE"; then
- $srcdir/configure "$@"
-fi
diff --git a/configure.ac b/configure.ac
deleted file mode 100644
index 0339494..0000000
--- a/configure.ac
+++ /dev/null
@@ -1,1129 +0,0 @@
-dnl Copyright 2005 Red Hat, Inc.
-dnl
-dnl Permission to use, copy, modify, distribute, and sell this software and its
-dnl documentation for any purpose is hereby granted without fee, provided that
-dnl the above copyright notice appear in all copies and that both that
-dnl copyright notice and this permission notice appear in supporting
-dnl documentation, and that the name of Red Hat not be used in
-dnl advertising or publicity pertaining to distribution of the software without
-dnl specific, written prior permission. Red Hat makes no
-dnl representations about the suitability of this software for any purpose. It
-dnl is provided "as is" without express or implied warranty.
-dnl
-dnl RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
-dnl INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
-dnl EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
-dnl CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
-dnl DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-dnl TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-dnl PERFORMANCE OF THIS SOFTWARE.
-dnl
-dnl Process this file with autoconf to create configure.
-
-AC_PREREQ([2.57])
-
-# Pixman versioning scheme
-#
-# - The version in git has an odd MICRO version number
-#
-# - Released versions, both development and stable, have an
-# even MICRO version number
-#
-# - Released development versions have an odd MINOR number
-#
-# - Released stable versions have an even MINOR number
-#
-# - Versions that break ABI must have a new MAJOR number
-#
-# - If you break the ABI, then at least this must be done:
-#
-# - increment MAJOR
-#
-# - In the first development release where you break ABI, find
-# all instances of "pixman-n" and change them to pixman-(n+1)
-#
-# This needs to be done at least in
-# configure.ac
-# all Makefile.am's
-# pixman-n.pc.in
-#
-# This ensures that binary incompatible versions can be installed
-# in parallel. See http://www106.pair.com/rhp/parallel.html for
-# more information
-#
-
-m4_define([pixman_major], 0)
-m4_define([pixman_minor], 33)
-m4_define([pixman_micro], 1)
-
-m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
-
-AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
-AM_INIT_AUTOMAKE([foreign dist-bzip2])
-
-# Suppress verbose compile lines
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-
-AC_CONFIG_HEADERS(config.h)
-
-AC_CANONICAL_HOST
-
-test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
-
-AC_PROG_CC
-AM_PROG_AS
-AC_PROG_LIBTOOL
-AC_CHECK_FUNCS([getisax])
-AC_C_BIGENDIAN
-AC_C_INLINE
-
-dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
-dnl
-dnl Compiles and links the given program in the environment setup by env-setup
-dnl and executes true-action on success and false-action on failure.
-AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
- save_CFLAGS="$CFLAGS"
- save_LDFLAGS="$LDFLAGS"
- save_LIBS="$LIBS"
- CFLAGS=""
- LDFLAGS=""
- LIBS=""
- $1
- CFLAGS="$save_CFLAGS $CFLAGS"
- LDFLAGS="$save_LDFLAGS $LDFLAGS"
- LIBS="$save_LIBS $LIBS"
- AC_LINK_IFELSE(
- [AC_LANG_SOURCE([$2])],
- [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
- pixman_cc_flag=yes],
- [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
- pixman_cc_flag=no])
-
- if test "x$pixman_cc_stderr" != "x"; then
- pixman_cc_flag=no
- fi
-
- if test "x$pixman_cc_flag" = "xyes"; then
- ifelse([$3], , :, [$3])
- else
- ifelse([$4], , :, [$4])
- fi
- CFLAGS="$save_CFLAGS"
- LDFLAGS="$save_LDFLAGS"
- LIBS="$save_LIBS"
-])
-
-dnl Find a -Werror for catching warnings.
-WERROR=
-for w in -Werror -errwarn; do
- if test "z$WERROR" = "z"; then
- AC_MSG_CHECKING([whether the compiler supports $w])
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS=$w],
- [int main(int c, char **v) { (void)c; (void)v; return 0; }],
- [WERROR=$w; yesno=yes], [yesno=no])
- AC_MSG_RESULT($yesno)
- fi
-done
-
-dnl PIXMAN_CHECK_CFLAG(flag, [program])
-dnl Adds flag to CFLAGS if the given program links without warnings or errors.
-AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
- AC_MSG_CHECKING([whether the compiler supports $1])
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS="$WERROR $1"],
- [$2
- int main(int c, char **v) { (void)c; (void)v; return 0; }
- ],
- [_yesno=yes],
- [_yesno=no])
- if test "x$_yesno" = xyes; then
- CFLAGS="$CFLAGS $1"
- fi
- AC_MSG_RESULT($_yesno)
-])
-
-AC_CHECK_SIZEOF(long)
-
-# Checks for Sun Studio compilers
-AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
-AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
-
-# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
-# if we're using Sun Studio and neither the user nor a config.site
-# has set CFLAGS.
-if test $SUNCC = yes && \
- test "x$test_CFLAGS" = "x" && \
- test "$CFLAGS" = "-g"
-then
- CFLAGS="-O -g"
-fi
-
-#
-# We ignore pixman_major in the version here because the major version should
-# always be encoded in the actual library name. Ie., the soname is:
-#
-# pixman-$(pixman_major).0.minor.micro
-#
-m4_define([lt_current], [pixman_minor])
-m4_define([lt_revision], [pixman_micro])
-m4_define([lt_age], [pixman_minor])
-
-LT_VERSION_INFO="lt_current:lt_revision:lt_age"
-
-PIXMAN_VERSION_MAJOR=pixman_major()
-AC_SUBST(PIXMAN_VERSION_MAJOR)
-PIXMAN_VERSION_MINOR=pixman_minor()
-AC_SUBST(PIXMAN_VERSION_MINOR)
-PIXMAN_VERSION_MICRO=pixman_micro()
-AC_SUBST(PIXMAN_VERSION_MICRO)
-
-AC_SUBST(LT_VERSION_INFO)
-
-# Check for dependencies
-
-PIXMAN_CHECK_CFLAG([-Wall])
-PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
-PIXMAN_CHECK_CFLAG([-Wno-unused-local-typedefs])
-PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
-
-dnl =========================================================================
-dnl OpenMP for the test suite?
-dnl
-
-# Check for OpenMP support only when autoconf support that (require autoconf >=2.62)
-OPENMP_CFLAGS=
-m4_ifdef([AC_OPENMP], [AC_OPENMP])
-
-if test "x$enable_openmp" = "xyes" && test "x$ac_cv_prog_c_openmp" = "xunsupported" ; then
- AC_MSG_WARN([OpenMP support requested but found unsupported])
-fi
-
-dnl May not fail to link without -Wall -Werror added
-dnl So try to link only when openmp is supported
-dnl ac_cv_prog_c_openmp is not defined when --disable-openmp is used
-if test "x$ac_cv_prog_c_openmp" != "xunsupported" && test "x$ac_cv_prog_c_openmp" != "x"; then
- m4_define([openmp_test_program],[dnl
- #include <stdio.h>
-
- extern unsigned int lcg_seed;
- #pragma omp threadprivate(lcg_seed)
- unsigned int lcg_seed;
-
- unsigned function(unsigned a, unsigned b)
- {
- lcg_seed ^= b;
- return ((a + b) ^ a ) + lcg_seed;
- }
-
- int main(int argc, char **argv)
- {
- int i;
- int n1 = 0, n2 = argc;
- unsigned checksum = 0;
- int verbose = argv != NULL;
- unsigned (*test_function)(unsigned, unsigned);
- test_function = function;
- #pragma omp parallel for reduction(+:checksum) default(none) \
- shared(n1, n2, test_function, verbose)
- for (i = n1; i < n2; i++)
- {
- unsigned crc = test_function (i, 0);
- if (verbose)
- printf ("%d: %08X\n", i, crc);
- checksum += crc;
- }
- printf("%u\n", checksum);
- return 0;
- }
- ])
-
- PIXMAN_LINK_WITH_ENV(
- [CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
- [openmp_test_program],
- [have_openmp=yes],
- [have_openmp=no])
- if test "x$have_openmp" = "xyes" ; then
- AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
- fi
-fi
-AC_SUBST(OPENMP_CFLAGS)
-
-dnl =========================================================================
-dnl -fvisibility stuff
-
-PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
-#if defined(__GNUC__) && (__GNUC__ >= 4)
-#ifdef _WIN32
-#error Have -fvisibility but it is ignored and generates a warning
-#endif
-#else
-#error Need GCC 4.0 for visibility
-#endif
-])
-
-PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
-#else
-#error Need Sun Studio 8 for visibility
-#endif
-])
-
-dnl ===========================================================================
-dnl Check for Loongson Multimedia Instructions
-
-if test "x$LS_CFLAGS" = "x" ; then
- LS_CFLAGS="-march=loongson2f"
-fi
-
-have_loongson_mmi=no
-AC_MSG_CHECKING(whether to use Loongson MMI assembler)
-
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS=" $LS_CFLAGS $CFLAGS -I$srcdir"
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-#ifndef __mips_loongson_vector_rev
-#error "Loongson Multimedia Instructions are only available on Loongson"
-#endif
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
-#error "Need GCC >= 4.4 for Loongson MMI compilation"
-#endif
-#include "pixman/loongson-mmintrin.h"
-int main () {
- union {
- __m64 v;
- char c[8];
- } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
- int b = 4;
- __m64 c = _mm_srli_pi16 (a.v, b);
- return 0;
-}]])], have_loongson_mmi=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(loongson-mmi,
- [AC_HELP_STRING([--disable-loongson-mmi],
- [disable Loongson MMI fast paths])],
- [enable_loongson_mmi=$enableval], [enable_loongson_mmi=auto])
-
-if test $enable_loongson_mmi = no ; then
- have_loongson_mmi=disabled
-fi
-
-if test $have_loongson_mmi = yes ; then
- AC_DEFINE(USE_LOONGSON_MMI, 1, [use Loongson Multimedia Instructions])
-else
- LS_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_loongson_mmi)
-if test $enable_loongson_mmi = yes && test $have_loongson_mmi = no ; then
- AC_MSG_ERROR([Loongson MMI not detected])
-fi
-
-AM_CONDITIONAL(USE_LOONGSON_MMI, test $have_loongson_mmi = yes)
-
-dnl ===========================================================================
-dnl Check for MMX
-
-if test "x$MMX_CFLAGS" = "x" ; then
- if test "x$SUNCC" = "xyes"; then
- # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
- # but if we're building 64-bit, mmx & sse support is on by default and
- # -xarch=sse throws an error instead
- if test "$AMD64_ABI" = "no" ; then
- MMX_CFLAGS="-xarch=sse"
- fi
- else
- MMX_CFLAGS="-mmmx -Winline"
- fi
-fi
-
-have_mmx_intrinsics=no
-AC_MSG_CHECKING(whether to use MMX intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$MMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-#error "Need GCC >= 3.4 for MMX intrinsics"
-#endif
-#include <mmintrin.h>
-int main () {
- __m64 v = _mm_cvtsi32_si64 (1);
- __m64 w;
-
- /* Some versions of clang will choke on K */
- asm ("pshufw %2, %1, %0\n\t"
- : "=y" (w)
- : "y" (v), "K" (5)
- );
-
- /* Some versions of clang will choke on this */
- asm ("pmulhuw %1, %0\n\t"
- : "+y" (w)
- : "y" (v)
- );
-
- return _mm_cvtsi64_si32 (v);
-}]])], have_mmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(mmx,
- [AC_HELP_STRING([--disable-mmx],
- [disable x86 MMX fast paths])],
- [enable_mmx=$enableval], [enable_mmx=auto])
-
-if test $enable_mmx = no ; then
- have_mmx_intrinsics=disabled
-fi
-
-if test $have_mmx_intrinsics = yes ; then
- AC_DEFINE(USE_X86_MMX, 1, [use x86 MMX compiler intrinsics])
-else
- MMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_mmx_intrinsics)
-if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
- AC_MSG_ERROR([x86 MMX intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_X86_MMX, test $have_mmx_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Check for SSE2
-
-if test "x$SSE2_CFLAGS" = "x" ; then
- if test "x$SUNCC" = "xyes"; then
- # SSE2 is enabled by default in the Sun Studio 64-bit environment
- if test "$AMD64_ABI" = "no" ; then
- SSE2_CFLAGS="-xarch=sse2"
- fi
- else
- SSE2_CFLAGS="-msse2 -Winline"
- fi
-fi
-
-have_sse2_intrinsics=no
-AC_MSG_CHECKING(whether to use SSE2 intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$SSE2_CFLAGS $CFLAGS"
-
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
-# if !defined(__amd64__) && !defined(__x86_64__)
-# error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
-# endif
-#endif
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-int main () {
- __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
- c = _mm_xor_si128 (a, b);
- return 0;
-}]])], have_sse2_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(sse2,
- [AC_HELP_STRING([--disable-sse2],
- [disable SSE2 fast paths])],
- [enable_sse2=$enableval], [enable_sse2=auto])
-
-if test $enable_sse2 = no ; then
- have_sse2_intrinsics=disabled
-fi
-
-if test $have_sse2_intrinsics = yes ; then
- AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
-fi
-
-AC_MSG_RESULT($have_sse2_intrinsics)
-if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
- AC_MSG_ERROR([SSE2 intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Check for SSSE3
-
-if test "x$SSSE3_CFLAGS" = "x" ; then
- SSSE3_CFLAGS="-mssse3 -Winline"
-fi
-
-have_ssse3_intrinsics=no
-AC_MSG_CHECKING(whether to use SSSE3 intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$SSSE3_CFLAGS $CFLAGS"
-
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-int main () {
- __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
- c = _mm_maddubs_epi16 (a, b);
- return 0;
-}]])], have_ssse3_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(ssse3,
- [AC_HELP_STRING([--disable-ssse3],
- [disable SSSE3 fast paths])],
- [enable_ssse3=$enableval], [enable_ssse3=auto])
-
-if test $enable_ssse3 = no ; then
- have_ssse3_intrinsics=disabled
-fi
-
-if test $have_ssse3_intrinsics = yes ; then
- AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics])
-fi
-
-AC_MSG_RESULT($have_ssse3_intrinsics)
-if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then
- AC_MSG_ERROR([SSSE3 intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes)
-
-dnl ===========================================================================
-dnl Other special flags needed when building code using MMX or SSE instructions
-case $host_os in
- solaris*)
- # When building 32-bit binaries, apply a mapfile to ensure that the
- # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
- # since they check at runtime before using those instructions.
- # Not all linkers grok the mapfile format so we check for that first.
- if test "$AMD64_ABI" = "no" ; then
- use_hwcap_mapfile=no
- AC_MSG_CHECKING(whether to use a hardware capability map file)
- hwcap_save_LDFLAGS="$LDFLAGS"
- HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
- LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
- AC_LINK_IFELSE([AC_LANG_SOURCE([[int main() { return 0; }]])],
- use_hwcap_mapfile=yes,
- HWCAP_LDFLAGS="")
- LDFLAGS="$hwcap_save_LDFLAGS"
- AC_MSG_RESULT($use_hwcap_mapfile)
- fi
- if test "x$MMX_LDFLAGS" = "x" ; then
- MMX_LDFLAGS="$HWCAP_LDFLAGS"
- fi
- if test "x$SSE2_LDFLAGS" = "x" ; then
- SSE2_LDFLAGS="$HWCAP_LDFLAGS"
- fi
- ;;
-esac
-
-AC_SUBST(LS_CFLAGS)
-AC_SUBST(IWMMXT_CFLAGS)
-AC_SUBST(MMX_CFLAGS)
-AC_SUBST(MMX_LDFLAGS)
-AC_SUBST(SSE2_CFLAGS)
-AC_SUBST(SSE2_LDFLAGS)
-AC_SUBST(SSSE3_CFLAGS)
-
-dnl ===========================================================================
-dnl Check for VMX/Altivec
-if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
- VMX_CFLAGS="-faltivec"
-else
- VMX_CFLAGS="-maltivec -mabi=altivec"
-fi
-
-have_vmx_intrinsics=no
-AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$VMX_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-#error "Need GCC >= 3.4 for sane altivec support"
-#endif
-#include <altivec.h>
-int main () {
- vector unsigned int v = vec_splat_u32 (1);
- v = vec_sub (v, v);
- return 0;
-}]])], have_vmx_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(vmx,
- [AC_HELP_STRING([--disable-vmx],
- [disable VMX fast paths])],
- [enable_vmx=$enableval], [enable_vmx=auto])
-
-if test $enable_vmx = no ; then
- have_vmx_intrinsics=disabled
-fi
-
-if test $have_vmx_intrinsics = yes ; then
- AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
-else
- VMX_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_vmx_intrinsics)
-if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
- AC_MSG_ERROR([VMX intrinsics not detected])
-fi
-
-AC_SUBST(VMX_CFLAGS)
-
-AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports ARM SIMD instructions
-have_arm_simd=no
-AC_MSG_CHECKING(whether to use ARM SIMD assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-.text
-.arch armv6
-.object_arch armv4
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-uqadd8 r0, r0, r0]])], have_arm_simd=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-simd,
- [AC_HELP_STRING([--disable-arm-simd],
- [disable ARM SIMD fast paths])],
- [enable_arm_simd=$enableval], [enable_arm_simd=auto])
-
-if test $enable_arm_simd = no ; then
- have_arm_simd=disabled
-fi
-
-if test $have_arm_simd = yes ; then
- AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
-
-AC_MSG_RESULT($have_arm_simd)
-if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
- AC_MSG_ERROR([ARM SIMD intrinsics not detected])
-fi
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports NEON instructions
-have_arm_neon=no
-AC_MSG_CHECKING(whether to use ARM NEON assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-x assembler-with-cpp $CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.eabi_attribute 10, 0
-.arm
-.altmacro
-#ifndef __ARM_EABI__
-#error EABI is required (to be sure that calling conventions are compatible)
-#endif
-pld [r0]
-vmovn.u16 d0, q0]])], have_arm_neon=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(arm-neon,
- [AC_HELP_STRING([--disable-arm-neon],
- [disable ARM NEON fast paths])],
- [enable_arm_neon=$enableval], [enable_arm_neon=auto])
-
-if test $enable_arm_neon = no ; then
- have_arm_neon=disabled
-fi
-
-if test $have_arm_neon = yes ; then
- AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
-
-AC_MSG_RESULT($have_arm_neon)
-if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
- AC_MSG_ERROR([ARM NEON intrinsics not detected])
-fi
-
-dnl ===========================================================================
-dnl Check for IWMMXT
-
-AC_ARG_ENABLE(arm-iwmmxt,
- [AC_HELP_STRING([--disable-arm-iwmmxt],
- [disable ARM IWMMXT fast paths])],
- [enable_iwmmxt=$enableval], [enable_iwmmxt=auto])
-
-AC_ARG_ENABLE(arm-iwmmxt2,
- [AC_HELP_STRING([--disable-arm-iwmmxt2],
- [build ARM IWMMXT fast paths with -march=iwmmxt instead of -march=iwmmxt2])],
- [enable_iwmmxt2=$enableval], [enable_iwmmxt2=auto])
-
-if test "x$IWMMXT_CFLAGS" = "x" ; then
- IWMMXT_CFLAGS="-flax-vector-conversions -Winline -march=iwmmxt"
- if test $enable_iwmmxt2 != no ; then
- IWMMXT_CFLAGS="${IWMMXT_CFLAGS}2"
- fi
-fi
-
-have_iwmmxt_intrinsics=no
-AC_MSG_CHECKING(whether to use ARM IWMMXT intrinsics)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $IWMMXT_CFLAGS"
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#ifndef __arm__
-#error "IWMMXT is only available on ARM"
-#endif
-#ifndef __IWMMXT__
-#error "IWMMXT not enabled (with -march=iwmmxt)"
-#endif
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
-#error "Need GCC >= 4.8 for IWMMXT intrinsics"
-#endif
-#include <mmintrin.h>
-int main () {
- union {
- __m64 v;
- char c[8];
- } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
- int b = 4;
- __m64 c = _mm_srli_si64 (a.v, b);
-}]])], have_iwmmxt_intrinsics=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-if test $enable_iwmmxt = no ; then
- have_iwmmxt_intrinsics=disabled
-fi
-
-if test $have_iwmmxt_intrinsics = yes ; then
- AC_DEFINE(USE_ARM_IWMMXT, 1, [use ARM IWMMXT compiler intrinsics])
-else
- IWMMXT_CFLAGS=
-fi
-
-AC_MSG_RESULT($have_iwmmxt_intrinsics)
-if test $enable_iwmmxt = yes && test $have_iwmmxt_intrinsics = no ; then
- AC_MSG_ERROR([IWMMXT intrinsics not detected])
-fi
-
-AM_CONDITIONAL(USE_ARM_IWMMXT, test $have_iwmmxt_intrinsics = yes)
-
-dnl ==========================================================================
-dnl Check if assembler is gas compatible and supports MIPS DSPr2 instructions
-
-have_mips_dspr2=no
-AC_MSG_CHECKING(whether to use MIPS DSPr2 assembler)
-xserver_save_CFLAGS=$CFLAGS
-CFLAGS="-mdspr2 $CFLAGS"
-
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-#if !(defined(__mips__) && __mips_isa_rev >= 2)
-#error MIPS DSPr2 is currently only available on MIPS32r2 platforms.
-#endif
-int
-main ()
-{
- int c = 0, a = 0, b = 0;
- __asm__ __volatile__ (
- "precr.qb.ph %[c], %[a], %[b] \n\t"
- : [c] "=r" (c)
- : [a] "r" (a), [b] "r" (b)
- );
- return c;
-}]])], have_mips_dspr2=yes)
-CFLAGS=$xserver_save_CFLAGS
-
-AC_ARG_ENABLE(mips-dspr2,
- [AC_HELP_STRING([--disable-mips-dspr2],
- [disable MIPS DSPr2 fast paths])],
- [enable_mips_dspr2=$enableval], [enable_mips_dspr2=auto])
-
-if test $enable_mips_dspr2 = no ; then
- have_mips_dspr2=disabled
-fi
-
-if test $have_mips_dspr2 = yes ; then
- AC_DEFINE(USE_MIPS_DSPR2, 1, [use MIPS DSPr2 assembly optimizations])
-fi
-
-AM_CONDITIONAL(USE_MIPS_DSPR2, test $have_mips_dspr2 = yes)
-
-AC_MSG_RESULT($have_mips_dspr2)
-if test $enable_mips_dspr2 = yes && test $have_mips_dspr2 = no ; then
- AC_MSG_ERROR([MIPS DSPr2 instructions not detected])
-fi
-
-dnl =========================================================================================
-dnl Check for GNU-style inline assembly support
-
-have_gcc_inline_asm=no
-AC_MSG_CHECKING(whether to use GNU-style inline assembler)
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-int main () {
- /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
- asm volatile ( "\tnop\n" : : : "cc", "memory" );
- return 0;
-}]])], have_gcc_inline_asm=yes)
-
-AC_ARG_ENABLE(gcc-inline-asm,
- [AC_HELP_STRING([--disable-gcc-inline-asm],
- [disable GNU-style inline assembler])],
- [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
-
-if test $enable_gcc_inline_asm = no ; then
- have_gcc_inline_asm=disabled
-fi
-
-if test $have_gcc_inline_asm = yes ; then
- AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
-fi
-
-AC_MSG_RESULT($have_gcc_inline_asm)
-if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
- AC_MSG_ERROR([GNU-style inline assembler not detected])
-fi
-
-AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
-
-dnl ==============================================
-dnl Static test programs
-
-AC_ARG_ENABLE(static-testprogs,
- [AC_HELP_STRING([--enable-static-testprogs],
- [build test programs as static binaries [default=no]])],
- [enable_static_testprogs=$enableval], [enable_static_testprogs=no])
-
-TESTPROGS_EXTRA_LDFLAGS=
-if test "x$enable_static_testprogs" = "xyes" ; then
- TESTPROGS_EXTRA_LDFLAGS="-all-static"
-fi
-AC_SUBST(TESTPROGS_EXTRA_LDFLAGS)
-
-dnl ==============================================
-dnl Timers
-
-AC_ARG_ENABLE(timers,
- [AC_HELP_STRING([--enable-timers],
- [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
- [enable_timers=$enableval], [enable_timers=no])
-
-if test $enable_timers = yes ; then
- AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
-fi
-AC_SUBST(PIXMAN_TIMERS)
-
-dnl ===================================
-dnl GTK+
-
-AC_ARG_ENABLE(gtk,
- [AC_HELP_STRING([--enable-gtk],
- [enable tests using GTK+ [default=auto]])],
- [enable_gtk=$enableval], [enable_gtk=auto])
-
-PKG_PROG_PKG_CONFIG
-
-if test $enable_gtk = yes ; then
- AC_CHECK_LIB([pixman-1], [pixman_version_string])
- PKG_CHECK_MODULES(GTK, [gtk+-2.0 >= 2.16 pixman-1])
-fi
-
-if test $enable_gtk = auto ; then
- AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
-fi
-
-if test $enable_gtk = auto ; then
- PKG_CHECK_MODULES(GTK, [gtk+-2.0 >= 2.16 pixman-1], [enable_gtk=yes], [enable_gtk=no])
-fi
-
-AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
-
-AC_SUBST(GTK_CFLAGS)
-AC_SUBST(GTK_LIBS)
-
-dnl =====================================
-dnl posix_memalign, sigaction, alarm, gettimeofday
-
-AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
-if test x$have_posix_memalign = xyes; then
- AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
-fi
-
-AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
-if test x$have_sigaction = xyes; then
- AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
-fi
-
-AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
-if test x$have_alarm = xyes; then
- AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
-fi
-
-AC_CHECK_HEADER([sys/mman.h],
- [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
-
-AC_CHECK_FUNC(mmap, have_mmap=yes, have_mmap=no)
-if test x$have_mmap = xyes; then
- AC_DEFINE(HAVE_MMAP, 1, [Whether we have mmap()])
-fi
-
-AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
-if test x$have_mprotect = xyes; then
- AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
-fi
-
-AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
-if test x$have_getpagesize = xyes; then
- AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
-fi
-
-AC_CHECK_HEADER([fenv.h],
- [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
-
-AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
-if test x$have_feenableexcept = xyes; then
- AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
-fi
-
-AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
-AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
-if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
- AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
-fi
-
-dnl =====================================
-dnl Check for missing sqrtf() as, e.g., for Solaris 9
-
-AC_SEARCH_LIBS([sqrtf], [m], [],
- [AC_DEFINE([sqrtf], [sqrt],
- [Define to sqrt if you do not have the `sqrtf' function.])])
-
-dnl =====================================
-dnl Thread local storage
-
-AC_MSG_CHECKING(for thread local storage (TLS) support)
-AC_CACHE_VAL(ac_cv_tls, [
- ac_cv_tls=none
- keywords="__thread __declspec(thread)"
- for kw in $keywords ; do
- AC_TRY_COMPILE([
-#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
-#error This MinGW version has broken __thread support
-#endif
-#ifdef __OpenBSD__
-#error OpenBSD has broken __thread support
-#endif
-
-int $kw test;], [], [ac_cv_tls=$kw; break])
- done
-])
-AC_MSG_RESULT($ac_cv_tls)
-
-if test "$ac_cv_tls" != "none"; then
- AC_DEFINE_UNQUOTED([TLS], $ac_cv_tls, [The compiler supported TLS storage class])
-fi
-
-dnl
-dnl posix tls
-dnl
-
-m4_define([pthread_test_program],AC_LANG_SOURCE([[dnl
-#include <stdlib.h>
-#include <pthread.h>
-
-static pthread_once_t once_control = PTHREAD_ONCE_INIT;
-static pthread_key_t key;
-
-static void
-make_key (void)
-{
- pthread_key_create (&key, NULL);
-}
-
-int
-main ()
-{
- void *value = NULL;
-
- if (pthread_once (&once_control, make_key) != 0)
- {
- value = NULL;
- }
- else
- {
- value = pthread_getspecific (key);
- if (!value)
- {
- value = malloc (100);
- pthread_setspecific (key, value);
- }
- }
- return 0;
-}
-]]))
-
-AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
- if test "z$support_for_pthreads" != "zyes"; then
- PIXMAN_LINK_WITH_ENV(
- [$1], [pthread_test_program],
- [PTHREAD_CFLAGS="$CFLAGS"
- PTHREAD_LIBS="$LIBS"
- PTHREAD_LDFLAGS="$LDFLAGS"
- support_for_pthreads=yes])
- fi
-])
-
-support_for_pthreads=no
-
-AC_MSG_CHECKING(for pthreads)
-
-PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
-PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
-PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
-
-if test $support_for_pthreads = yes; then
- AC_DEFINE([HAVE_PTHREADS], [], [Whether pthreads is supported])
- if test $ac_cv_tls = none ; then
- CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
- fi
-fi
-
-AC_MSG_RESULT($support_for_pthreads)
-
-AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
-AC_SUBST(HAVE_PTHREADS)
-AC_SUBST(PTHREAD_LDFLAGS)
-AC_SUBST(PTHREAD_LIBS)
-AC_SUBST(PTHREAD_CFLAGS)
-
-dnl =====================================
-dnl __attribute__((constructor))
-
-support_for_attribute_constructor=no
-
-AC_MSG_CHECKING(for __attribute__((constructor)))
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
-/* attribute 'constructor' is supported since gcc 2.7, but some compilers
- * may only pretend to be gcc, so let's try to actually use it
- */
-static int x = 1;
-static void __attribute__((constructor)) constructor_function () { x = 0; }
-int main (void) { return x; }
-#else
-#error not gcc or gcc version is older than 2.7
-#endif
-]])], support_for_attribute_constructor=yes)
-
-if test x$support_for_attribute_constructor = xyes; then
- AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
- [],[Whether the tool chain supports __attribute__((constructor))])
-fi
-
-AC_MSG_RESULT($support_for_attribute_constructor)
-AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
-
-dnl =====================================
-dnl __float128
-
-support_for_float128=no
-
-AC_MSG_CHECKING(for __float128)
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-__float128 a = 1.0Q, b = 2.0Q; int main (void) { return a + b; }
-]])], support_for_float128=yes)
-
-if test x$support_for_float128 = xyes; then
- AC_DEFINE([HAVE_FLOAT128], [], [Whether the tool chain supports __float128])
-fi
-
-AC_MSG_RESULT($support_for_float128)
-
-dnl =====================================
-dnl __builtin_clz
-
-support_for_builtin_clz=no
-
-AC_MSG_CHECKING(for __builtin_clz)
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-unsigned int x = 11; int main (void) { return __builtin_clz(x); }
-]])], support_for_builtin_clz=yes)
-
-if test x$support_for_builtin_clz = xyes; then
- AC_DEFINE([HAVE_BUILTIN_CLZ], [], [Whether the compiler supports __builtin_clz])
-fi
-
-AC_MSG_RESULT($support_for_builtin_clz)
-
-dnl =====================================
-dnl GCC vector extensions
-
-support_for_gcc_vector_extensions=no
-
-AC_MSG_CHECKING(for GCC vector extensions)
-AC_LINK_IFELSE([AC_LANG_SOURCE([[
-unsigned int __attribute__ ((vector_size(16))) e, a, b;
-int main (void) { e = a - ((b << 27) + (b >> (32 - 27))) + 1; return e[0]; }
-]])], support_for_gcc_vector_extensions=yes)
-
-if test x$support_for_gcc_vector_extensions = xyes; then
- AC_DEFINE([HAVE_GCC_VECTOR_EXTENSIONS], [],
- [Whether the compiler supports GCC vector extensions])
-fi
-
-AC_MSG_RESULT($support_for_gcc_vector_extensions)
-
-dnl ==================
-dnl libpng
-
-AC_ARG_ENABLE(libpng, AS_HELP_STRING([--enable-libpng], [Build support for libpng (default: auto)]),
- [have_libpng=$enableval], [have_libpng=auto])
-
-case x$have_libpng in
- xyes) PKG_CHECK_MODULES(PNG, [libpng]) ;;
- xno) ;;
- *) PKG_CHECK_MODULES(PNG, [libpng], have_libpng=yes, have_libpng=no) ;;
-esac
-
-if test x$have_libpng = xyes; then
- AC_DEFINE([HAVE_LIBPNG], [1], [Whether we have libpng])
-fi
-
-AC_SUBST(HAVE_LIBPNG)
-
-AC_OUTPUT([pixman-1.pc
- pixman-1-uninstalled.pc
- Makefile
- pixman/Makefile
- pixman/pixman-version.h
- demos/Makefile
- test/Makefile])
-
-m4_if(m4_eval(pixman_minor % 2), [1], [
- echo
- echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
- echo
- echo " Thanks for testing this development snapshot of pixman. Please"
- echo " report any problems you find, either by sending email to "
- echo
- echo " pixman@lists.freedesktop.org"
- echo
- echo " or by filing a bug at "
- echo
- echo " https://bugs.freedesktop.org/enter_bug.cgi?product=pixman "
- echo
- echo " If you are looking for a stable release of pixman, please note "
- echo " that stable releases have _even_ minor version numbers. Ie., "
- echo " pixman-0.]m4_eval(pixman_minor & ~1)[.x are stable releases, whereas pixman-$PIXMAN_VERSION_MAJOR.$PIXMAN_VERSION_MINOR.$PIXMAN_VERSION_MICRO is a "
- echo " development snapshot that may contain bugs and experimental "
- echo " features. "
- echo
- echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
- echo
-])
diff --git a/demos/Makefile.am b/demos/Makefile.am
deleted file mode 100644
index e04743d..0000000
--- a/demos/Makefile.am
+++ /dev/null
@@ -1,52 +0,0 @@
-EXTRA_DIST = parrot.c parrot.jpg scale.ui
-
-if HAVE_GTK
-
-AM_CFLAGS = $(OPENMP_CFLAGS)
-AM_LDFLAGS = $(OPENMP_CFLAGS)
-
-LDADD = $(top_builddir)/pixman/libpixman-1.la -lm $(GTK_LIBS) $(PNG_LIBS)
-AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(GTK_CFLAGS) $(PNG_CFLAGS)
-
-GTK_UTILS = gtk-utils.c gtk-utils.h ../test/utils.c ../test/utils.h \
- ../test/utils-prng.c ../test/utils-prng.h
-
-DEMOS = \
- clip-test \
- clip-in \
- composite-test \
- gradient-test \
- radial-test \
- linear-gradient \
- conical-test \
- alpha-test \
- screen-test \
- convolution-test \
- trap-test \
- tri-test \
- quad2quad \
- checkerboard \
- srgb-trap-test \
- srgb-test \
- scale
-
-gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
-alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
-composite_test_SOURCES = composite-test.c $(GTK_UTILS)
-clip_test_SOURCES = clip-test.c $(GTK_UTILS)
-clip_in_SOURCES = clip-in.c $(GTK_UTILS)
-trap_test_SOURCES = trap-test.c $(GTK_UTILS)
-screen_test_SOURCES = screen-test.c $(GTK_UTILS)
-convolution_test_SOURCES = convolution-test.c $(GTK_UTILS)
-radial_test_SOURCES = radial-test.c $(GTK_UTILS)
-linear_gradient_SOURCES = linear-gradient.c $(GTK_UTILS)
-conical_test_SOURCES = conical-test.c $(GTK_UTILS)
-tri_test_SOURCES = tri-test.c $(GTK_UTILS)
-checkerboard_SOURCES = checkerboard.c $(GTK_UTILS)
-srgb_test_SOURCES = srgb-test.c $(GTK_UTILS)
-srgb_trap_test_SOURCES = srgb-trap-test.c $(GTK_UTILS)
-scale_SOURCES = scale.c $(GTK_UTILS)
-
-noinst_PROGRAMS = $(DEMOS)
-
-endif
diff --git a/demos/conical-test.c b/demos/conical-test.c
index 6b32430..5ff1be9 100644
--- a/demos/conical-test.c
+++ b/demos/conical-test.c
@@ -1,4 +1,4 @@
-#include "../test/utils.h"
+#include "utils.h"
#include "gtk-utils.h"
#define SIZE 128
diff --git a/demos/dither.c b/demos/dither.c
new file mode 100644
index 0000000..a6a157a
--- /dev/null
+++ b/demos/dither.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright 2012, Red Hat, Inc.
+ * Copyright 2012, Soren Sandmann
+ * Copyright 2018, Basile Clement
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include "pixman-config.h"
+#endif
+#include <math.h>
+#include <gtk/gtk.h>
+#include <stdlib.h>
+#include "utils.h"
+#include "gtk-utils.h"
+
+#define WIDTH 1024
+#define HEIGHT 640
+
+typedef struct
+{
+ GtkBuilder * builder;
+ pixman_image_t * original;
+ pixman_format_code_t format;
+ pixman_dither_t dither;
+ int width;
+ int height;
+} app_t;
+
+static GtkWidget *
+get_widget (app_t *app, const char *name)
+{
+ GtkWidget *widget = GTK_WIDGET (gtk_builder_get_object (app->builder, name));
+
+ if (!widget)
+ g_error ("Widget %s not found\n", name);
+
+ return widget;
+}
+
+typedef struct
+{
+ char name [20];
+ int value;
+} named_int_t;
+
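+/* Formats offered in the "Target format" combo box. "rgb" here is the
+ * packed 96 bpp float format, and "sRGB" the gamma-aware variant of
+ * a8r8g8b8.
+ */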
+static const named_int_t formats[] =
+{
+ { "a8r8g8b8", PIXMAN_a8r8g8b8 },
+ { "rgb", PIXMAN_rgb_float },
+ { "sRGB", PIXMAN_a8r8g8b8_sRGB },
+ { "r5g6b5", PIXMAN_r5g6b5 },
+ { "a4r4g4b4", PIXMAN_a4r4g4b4 },
+ { "a2r2g2b2", PIXMAN_a2r2g2b2 },
+ { "r3g3b2", PIXMAN_r3g3b2 },
+ { "r1g2b1", PIXMAN_r1g2b1 },
+ { "a1r1g1b1", PIXMAN_a1r1g1b1 },
+};
+
+static const named_int_t dithers[] =
+{
+    { "None", PIXMAN_DITHER_NONE },
+ { "Bayer 8x8", PIXMAN_DITHER_ORDERED_BAYER_8 },
+ { "Blue noise 64x64", PIXMAN_DITHER_ORDERED_BLUE_NOISE_64 },
+};
+
+static int
+get_value (app_t *app, const named_int_t table[], const char *box_name)
+{
+ GtkComboBox *box = GTK_COMBO_BOX (get_widget (app, box_name));
+
+ return table[gtk_combo_box_get_active (box)].value;
+}
+
+static void
+rescale (GtkWidget *may_be_null, app_t *app)
+{
+ app->dither = get_value (app, dithers, "dithering_combo_box");
+ app->format = get_value (app, formats, "target_format_combo_box");
+
+ gtk_widget_set_size_request (
+ get_widget (app, "drawing_area"), app->width + 0.5, app->height + 0.5);
+
+ gtk_widget_queue_draw (
+ get_widget (app, "drawing_area"));
+}
+
+static gboolean
+on_draw (GtkWidget *widget, cairo_t *cr, gpointer user_data)
+{
+ app_t *app = user_data;
+ GdkRectangle area;
+ cairo_surface_t *surface;
+ pixman_image_t *tmp, *final;
+ uint32_t *pixels;
+
+ gdk_cairo_get_clip_rectangle(cr, &area);
+
+ tmp = pixman_image_create_bits (
+ app->format, area.width, area.height, NULL, 0);
+ pixman_image_set_dither (tmp, app->dither);
+
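+    /* Two-pass conversion: composite into the requested target format
+     * first (this is where the dither setting takes effect), then
+     * composite the result back into a8r8g8b8 below so cairo can
+     * display it.
+     */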
+ pixman_image_composite (
+ PIXMAN_OP_SRC,
+ app->original, NULL, tmp,
+ area.x, area.y, 0, 0, 0, 0,
+ app->width - area.x,
+ app->height - area.y);
+
+ pixels = calloc (1, area.width * area.height * 4);
+ final = pixman_image_create_bits (
+ PIXMAN_a8r8g8b8, area.width, area.height, pixels, area.width * 4);
+
+ pixman_image_composite (
+ PIXMAN_OP_SRC,
+ tmp, NULL, final,
+ area.x, area.y, 0, 0, 0, 0,
+ app->width - area.x,
+ app->height - area.y);
+
+ surface = cairo_image_surface_create_for_data (
+ (uint8_t *)pixels, CAIRO_FORMAT_ARGB32,
+ area.width, area.height, area.width * 4);
+
+ cairo_set_source_surface (cr, surface, area.x, area.y);
+
+ cairo_paint (cr);
+
+ cairo_surface_destroy (surface);
+ free (pixels);
+ pixman_image_unref (final);
+ pixman_image_unref (tmp);
+
+ return TRUE;
+}
+
+static void
+set_up_combo_box (app_t *app, const char *box_name,
+ int n_entries, const named_int_t table[])
+{
+ GtkWidget *widget = get_widget (app, box_name);
+ GtkListStore *model;
+ GtkCellRenderer *cell;
+ int i;
+
+ model = gtk_list_store_new (1, G_TYPE_STRING);
+
+ cell = gtk_cell_renderer_text_new ();
+ gtk_cell_layout_pack_start (GTK_CELL_LAYOUT (widget), cell, TRUE);
+ gtk_cell_layout_set_attributes (GTK_CELL_LAYOUT (widget), cell,
+ "text", 0,
+ NULL);
+
+ gtk_combo_box_set_model (GTK_COMBO_BOX (widget), GTK_TREE_MODEL (model));
+
+ for (i = 0; i < n_entries; ++i)
+ {
+ const named_int_t *info = &(table[i]);
+ GtkTreeIter iter;
+
+ gtk_list_store_append (model, &iter);
+ gtk_list_store_set (model, &iter, 0, info->name, -1);
+ }
+
+ gtk_combo_box_set_active (GTK_COMBO_BOX (widget), 0);
+
+ g_signal_connect (widget, "changed", G_CALLBACK (rescale), app);
+}
+
+static app_t *
+app_new (pixman_image_t *original)
+{
+ GtkWidget *widget;
+ app_t *app = g_malloc (sizeof *app);
+ GError *err = NULL;
+
+ app->builder = gtk_builder_new ();
+ app->original = original;
+
+ if (original->type == BITS)
+ {
+ app->width = pixman_image_get_width (original);
+ app->height = pixman_image_get_height (original);
+ }
+ else
+ {
+ app->width = WIDTH;
+ app->height = HEIGHT;
+ }
+
+ if (!gtk_builder_add_from_file (app->builder, "dither.ui", &err))
+ g_error ("Could not read file dither.ui: %s", err->message);
+
+ widget = get_widget (app, "drawing_area");
+ g_signal_connect (widget, "draw", G_CALLBACK (on_draw), app);
+
+ set_up_combo_box (app, "target_format_combo_box",
+ G_N_ELEMENTS (formats), formats);
+ set_up_combo_box (app, "dithering_combo_box",
+ G_N_ELEMENTS (dithers), dithers);
+
+ app->dither = get_value (app, dithers, "dithering_combo_box");
+ app->format = get_value (app, formats, "target_format_combo_box");
+
+ rescale (NULL, app);
+
+ return app;
+}
+
+int
+main (int argc, char **argv)
+{
+ GtkWidget *window;
+ pixman_image_t *image;
+ app_t *app;
+
+ gtk_init (&argc, &argv);
+
+ if (argc < 2)
+ {
+ pixman_gradient_stop_t stops[] = {
+ /* These colors make it very obvious that dithering
+ * is useful even for 8-bit gradients
+ */
+ { 0x00000, { 0x1b1b, 0x5d5d, 0x7c7c, 0xffff } },
+ { 0x10000, { 0x3838, 0x3232, 0x1010, 0xffff } },
+ };
+ pixman_point_fixed_t p1, p2;
+
+ p1.x = p1.y = 0x0000;
+ p2.x = WIDTH << 16;
+ p2.y = HEIGHT << 16;
+
+ if (!(image = pixman_image_create_linear_gradient (
+ &p1, &p2, stops, ARRAY_LENGTH (stops))))
+ {
+ printf ("Could not create gradient\n");
+ return -1;
+ }
+ }
+ else if (!(image = pixman_image_from_file (argv[1], PIXMAN_a8r8g8b8)))
+ {
+ printf ("Could not load image \"%s\"\n", argv[1]);
+ return -1;
+ }
+
+ app = app_new (image);
+
+ window = get_widget (app, "main");
+
+ g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
+
+ gtk_window_set_default_size (GTK_WINDOW (window), 1024, 768);
+
+ gtk_widget_show_all (window);
+
+ gtk_main ();
+
+ return 0;
+}
diff --git a/demos/dither.ui b/demos/dither.ui
new file mode 100644
index 0000000..7c3d068
--- /dev/null
+++ b/demos/dither.ui
@@ -0,0 +1,147 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<interface>
+ <requires lib="gtk+" version="2.12"/>
+ <object class="GtkWindow" id="main">
+ <property name="can_focus">False</property>
+ <child>
+ <placeholder/>
+ </child>
+ <child>
+ <object class="GtkHBox" id="u">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="spacing">12</property>
+ <child>
+ <object class="GtkScrolledWindow" id="scrolledwindow1">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="shadow_type">in</property>
+ <child>
+ <object class="GtkViewport" id="viewport1">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <child>
+ <object class="GtkDrawingArea" id="drawing_area">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ </object>
+ </child>
+ </object>
+ </child>
+ </object>
+ <packing>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ <property name="position">0</property>
+ </packing>
+ </child>
+ <child>
+ <object class="GtkVBox" id="box1">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="spacing">12</property>
+ <child>
+ <object class="GtkVBox" id="box6">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <child>
+ <object class="GtkTable" id="grid1">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="n_rows">2</property>
+ <property name="n_columns">2</property>
+ <property name="column_spacing">8</property>
+ <property name="row_spacing">6</property>
+ <child>
+ <object class="GtkLabel" id="label4">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="label" translatable="yes">&lt;b&gt;Target format:&lt;/b&gt;</property>
+ <property name="use_markup">True</property>
+ <property name="xalign">1</property>
+ </object>
+ </child>
+ <child>
+ <object class="GtkLabel" id="label5">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ <property name="label" translatable="yes">&lt;b&gt;Dithering:&lt;/b&gt;</property>
+ <property name="use_markup">True</property>
+ <property name="xalign">1</property>
+ </object>
+ <packing>
+ <property name="top_attach">1</property>
+ </packing>
+ </child>
+ <child>
+ <object class="GtkComboBox" id="target_format_combo_box">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ </object>
+ <packing>
+ <property name="left_attach">1</property>
+ </packing>
+ </child>
+ <child>
+ <object class="GtkComboBox" id="dithering_combo_box">
+ <property name="visible">True</property>
+ <property name="can_focus">False</property>
+ </object>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="top_attach">1</property>
+ </packing>
+ </child>
+ </object>
+ <packing>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ <property name="padding">6</property>
+ <property name="position">1</property>
+ </packing>
+ </child>
+ </object>
+ <packing>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ <property name="position">0</property>
+ </packing>
+ </child>
+ </object>
+ <packing>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ <property name="position">1</property>
+ </packing>
+ </child>
+ </object>
+ </child>
+ </object>
+ <object class="GtkAdjustment" id="rotate_adjustment">
+ <property name="lower">-180</property>
+ <property name="upper">190</property>
+ <property name="step_increment">1</property>
+ <property name="page_increment">10</property>
+ <property name="page_size">10</property>
+ </object>
+ <object class="GtkAdjustment" id="scale_x_adjustment">
+ <property name="lower">-32</property>
+ <property name="upper">42</property>
+ <property name="step_increment">1</property>
+ <property name="page_increment">10</property>
+ <property name="page_size">10</property>
+ </object>
+ <object class="GtkAdjustment" id="scale_y_adjustment">
+ <property name="lower">-32</property>
+ <property name="upper">42</property>
+ <property name="step_increment">1</property>
+ <property name="page_increment">10</property>
+ <property name="page_size">10</property>
+ </object>
+ <object class="GtkAdjustment" id="subsample_adjustment">
+ <property name="upper">12</property>
+ <property name="value">4</property>
+ <property name="step_increment">1</property>
+ <property name="page_increment">1</property>
+ </object>
+</interface>
diff --git a/demos/gtk-utils.c b/demos/gtk-utils.c
index 32d4aec..dc872a9 100644
--- a/demos/gtk-utils.c
+++ b/demos/gtk-utils.c
@@ -1,6 +1,8 @@
#include <gtk/gtk.h>
-#include <config.h>
-#include "../test/utils.h"
+#ifdef HAVE_CONFIG_H
+#include <pixman-config.h>
+#endif
+#include "utils.h"
#include "gtk-utils.h"
pixman_image_t *
@@ -93,15 +95,14 @@ pixbuf_from_argb32 (uint32_t *bits,
}
static gboolean
-on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data)
+on_draw (GtkWidget *widget, cairo_t *cr, gpointer user_data)
{
- pixman_image_t *pimage = data;
+ pixman_image_t *pimage = user_data;
int width = pixman_image_get_width (pimage);
int height = pixman_image_get_height (pimage);
int stride = pixman_image_get_stride (pimage);
cairo_surface_t *cimage;
cairo_format_t format;
- cairo_t *cr;
if (pixman_image_get_format (pimage) == PIXMAN_x8r8g8b8)
format = CAIRO_FORMAT_RGB24;
@@ -111,14 +112,11 @@ on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data)
cimage = cairo_image_surface_create_for_data (
(uint8_t *)pixman_image_get_data (pimage),
format, width, height, stride);
-
- cr = gdk_cairo_create (widget->window);
cairo_rectangle (cr, 0, 0, width, height);
cairo_set_source_surface (cr, cimage, 0, 0);
cairo_fill (cr);
- cairo_destroy (cr);
cairo_surface_destroy (cimage);
return TRUE;
@@ -170,7 +168,7 @@ show_image (pixman_image_t *image)
break;
}
- g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), copy);
+ g_signal_connect (window, "draw", G_CALLBACK (on_draw), copy);
g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
gtk_widget_show (window);
diff --git a/demos/linear-gradient.c b/demos/linear-gradient.c
index 46433a6..03807a6 100644
--- a/demos/linear-gradient.c
+++ b/demos/linear-gradient.c
@@ -1,4 +1,4 @@
-#include "../test/utils.h"
+#include "utils.h"
#include "gtk-utils.h"
#define WIDTH 1024
diff --git a/demos/meson.build b/demos/meson.build
new file mode 100644
index 0000000..cff3cf2
--- /dev/null
+++ b/demos/meson.build
@@ -0,0 +1,66 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
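+# MSVC's math.h only defines M_PI and friends when _USE_MATH_DEFINES is
+# set, and several of the demos rely on those constants.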
+extra_demo_cflags = []
+if cc.get_argument_syntax() == 'msvc'
+ extra_demo_cflags = ['-D_USE_MATH_DEFINES']
+endif
+
+demos = [
+ 'gradient-test',
+ 'alpha-test',
+ 'composite-test',
+ 'clip-test',
+ 'trap-test',
+ 'screen-test',
+ 'convolution-test',
+ 'radial-test',
+ 'linear-gradient',
+ 'conical-test',
+ 'tri-test',
+ 'checkerboard',
+ 'srgb-test',
+ 'srgb-trap-test',
+ 'scale',
+ 'dither',
+]
+
+if dep_gtk.found()
+
+ libdemo = static_library(
+ 'demo',
+ ['gtk-utils.c', config_h, version_h],
+ dependencies : [libtestutils_dep, dep_gtk, dep_glib, dep_png, dep_m, dep_openmp],
+ include_directories : inc_pixman,
+ )
+
+  foreach d : demos
+    executable(
+      d,
+      [d + '.c', config_h, version_h],
+      c_args : extra_demo_cflags,
+      link_with : [libdemo],
+      dependencies : [idep_pixman, libtestutils_dep, dep_glib, dep_gtk, dep_openmp, dep_png],
+    )
+  endforeach
+
+endif
diff --git a/demos/radial-test.c b/demos/radial-test.c
index 08a367c..52292eb 100644
--- a/demos/radial-test.c
+++ b/demos/radial-test.c
@@ -1,4 +1,4 @@
-#include "../test/utils.h"
+#include "utils.h"
#include "gtk-utils.h"
#define NUM_GRADIENTS 9
diff --git a/demos/scale.c b/demos/scale.c
index d00307e..fa2a28a 100644
--- a/demos/scale.c
+++ b/demos/scale.c
@@ -24,7 +24,7 @@
* Author: Soren Sandmann <soren.sandmann@gmail.com>
*/
#ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "pixman-config.h"
#endif
#include <math.h>
#include <gtk/gtk.h>
@@ -55,50 +55,70 @@ get_widget (app_t *app, const char *name)
return widget;
}
-static double
-min4 (double a, double b, double c, double d)
-{
- double m1, m2;
-
- m1 = MIN (a, b);
- m2 = MIN (c, d);
- return MIN (m1, m2);
-}
-
-static double
-max4 (double a, double b, double c, double d)
-{
- double m1, m2;
-
- m1 = MAX (a, b);
- m2 = MAX (c, d);
- return MAX (m1, m2);
-}
-
+/* Figure out the boundary of a diameter=1 circle transformed into an ellipse
+ * by trans. Proof that this is the correct calculation:
+ *
+ * Transform x,y to u,v by this matrix calculation:
+ *
+ * |u| |a c| |x|
+ * |v| = |b d|*|y|
+ *
+ * Horizontal component:
+ *
+ * u = ax+cy (1)
+ *
+ * For each x,y on a radius-1 circle (p is angle to the point):
+ *
+ * x^2+y^2 = 1
+ * x = cos(p)
+ * y = sin(p)
+ * dx/dp = -sin(p) = -y
+ * dy/dp = cos(p) = x
+ *
+ * Figure out derivative of (1) relative to p:
+ *
+ * du/dp = a(dx/dp) + c(dy/dp)
+ * = -ay + cx
+ *
+ * The min and max u are when du/dp is zero:
+ *
+ * -ay + cx = 0
+ * cx = ay
+ * c = ay/x (2)
+ * y = cx/a (3)
+ *
+ * Substitute (2) into (1) and simplify:
+ *
+ * u = ax + ay^2/x
+ * = a(x^2+y^2)/x
+ * = a/x (because x^2+y^2 = 1)
+ * x = a/u (4)
+ *
+ * Substitute (4) into (3) and simplify:
+ *
+ * y = c(a/u)/a
+ * y = c/u (5)
+ *
+ * Square (4) and (5) and add:
+ *
+ * x^2+y^2 = (a^2+c^2)/u^2
+ *
+ * But x^2+y^2 is 1:
+ *
+ * 1 = (a^2+c^2)/u^2
+ * u^2 = a^2+c^2
+ * u = hypot(a,c)
+ *
+ * Similarly, the max/min of v is at:
+ *
+ * v = hypot(b,d)
+ *
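+ * (compute_extents below also divides each result by m[2][2] to account
+ * for the projective component of the transform; for an affine transform
+ * m[2][2] is 1 and the division is a no-op.)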
+ */
static void
compute_extents (pixman_f_transform_t *trans, double *sx, double *sy)
{
- double min_x, max_x, min_y, max_y;
- pixman_f_vector_t v[4] =
- {
- { { 1, 1, 1 } },
- { { -1, 1, 1 } },
- { { -1, -1, 1 } },
- { { 1, -1, 1 } },
- };
-
- pixman_f_transform_point (trans, &v[0]);
- pixman_f_transform_point (trans, &v[1]);
- pixman_f_transform_point (trans, &v[2]);
- pixman_f_transform_point (trans, &v[3]);
-
- min_x = min4 (v[0].v[0], v[1].v[0], v[2].v[0], v[3].v[0]);
- max_x = max4 (v[0].v[0], v[1].v[0], v[2].v[0], v[3].v[0]);
- min_y = min4 (v[0].v[1], v[1].v[1], v[2].v[1], v[3].v[1]);
- max_y = max4 (v[0].v[1], v[1].v[1], v[2].v[1], v[3].v[1]);
-
- *sx = (max_x - min_x) / 2.0;
- *sy = (max_y - min_y) / 2.0;
+ *sx = hypot (trans->m[0][0], trans->m[0][1]) / trans->m[2][2];
+ *sy = hypot (trans->m[1][0], trans->m[1][1]) / trans->m[2][2];
}
typedef struct
@@ -258,39 +278,37 @@ rescale (GtkWidget *may_be_null, app_t *app)
}
static gboolean
-on_expose (GtkWidget *da, GdkEvent *event, gpointer data)
+on_draw (GtkWidget *widget, cairo_t *cr, gpointer user_data)
{
- app_t *app = data;
- GdkRectangle *area = &event->expose.area;
+ app_t *app = user_data;
+ GdkRectangle area;
cairo_surface_t *surface;
pixman_image_t *tmp;
- cairo_t *cr;
uint32_t *pixels;
- pixels = calloc (1, area->width * area->height * 4);
+ gdk_cairo_get_clip_rectangle(cr, &area);
+
+ pixels = calloc (1, area.width * area.height * 4);
tmp = pixman_image_create_bits (
- PIXMAN_a8r8g8b8, area->width, area->height, pixels, area->width * 4);
+ PIXMAN_a8r8g8b8, area.width, area.height, pixels, area.width * 4);
- if (area->x < app->scaled_width && area->y < app->scaled_height)
+ if (area.x < app->scaled_width && area.y < app->scaled_height)
{
pixman_image_composite (
PIXMAN_OP_SRC,
app->original, NULL, tmp,
- area->x, area->y, 0, 0, 0, 0,
- app->scaled_width - area->x, app->scaled_height - area->y);
+ area.x, area.y, 0, 0, 0, 0,
+ app->scaled_width - area.x, app->scaled_height - area.y);
}
surface = cairo_image_surface_create_for_data (
(uint8_t *)pixels, CAIRO_FORMAT_ARGB32,
- area->width, area->height, area->width * 4);
-
- cr = gdk_cairo_create (da->window);
+ area.width, area.height, area.width * 4);
- cairo_set_source_surface (cr, surface, area->x, area->y);
+ cairo_set_source_surface (cr, surface, area.x, area.y);
cairo_paint (cr);
- cairo_destroy (cr);
cairo_surface_destroy (surface);
free (pixels);
pixman_image_unref (tmp);
@@ -380,7 +398,7 @@ app_new (pixman_image_t *original)
gtk_scale_add_mark (GTK_SCALE (widget), 0.0, GTK_POS_LEFT, NULL);
widget = get_widget (app, "drawing_area");
- g_signal_connect (widget, "expose_event", G_CALLBACK (on_expose), app);
+ g_signal_connect (widget, "draw", G_CALLBACK (on_draw), app);
set_up_filter_box (app, "reconstruct_x_combo_box");
set_up_filter_box (app, "reconstruct_y_combo_box");
diff --git a/demos/scale.ui b/demos/scale.ui
index ee985dd..d498d26 100644
--- a/demos/scale.ui
+++ b/demos/scale.ui
@@ -177,6 +177,7 @@
id="lock_checkbutton">
<property name="label" translatable="yes">Lock X and Y Dimensions</property>
<property name="xalign">0.0</property>
+ <property name="active">True</property>
</object>
<packing>
<property name="expand">False</property>
@@ -301,6 +302,7 @@
<object class="GtkSpinButton" id="subsample_spin_button">
<property name="visible">True</property>
<property name="adjustment">subsample_adjustment</property>
+ <property name="value">4</property>
</object>
<packing>
<property name="left_attach">1</property>
diff --git a/demos/tri-test.c b/demos/tri-test.c
index a71869a..9d213eb 100644
--- a/demos/tri-test.c
+++ b/demos/tri-test.c
@@ -1,7 +1,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "../test/utils.h"
+#include "utils.h"
#include "gtk-utils.h"
int
diff --git a/meson.build b/meson.build
new file mode 100644
index 0000000..3dd3b12
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,619 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+project(
+ 'pixman',
+ ['c'],
+ version : '0.43.5',
+ license : 'MIT',
+ meson_version : '>= 0.52.0',
+ default_options : ['c_std=gnu99', 'buildtype=debugoptimized'],
+)
+
+config = configuration_data()
+cc = meson.get_compiler('c')
+null_dep = dependency('', required : false)
+
+add_project_arguments(
+ cc.get_supported_arguments([
+ '-Wdeclaration-after-statement',
+ '-fno-strict-aliasing',
+ '-fvisibility=hidden',
+ '-Wundef',
+ # -ftrapping-math is the default for gcc, but -fno-trapping-math is the
+ # default for clang. The FLOAT_IS_ZERO macro is used to guard against
+    # floating-point exceptions; however, with -fno-trapping-math, the
+    # compiler can reorder floating-point operations so that they occur
+    # before the guard. Note that this flag is ignored by clang < 10.0.0.
+ '-ftrapping-math'
+ ]),
+ language : ['c']
+)
+
+# GCC and Clang both ignore -Wno options that they don't recognize, so test for
+# -W<opt>, then add -Wno-<opt> if it's ignored
+foreach opt : ['unused-local-typedefs']
+ if cc.has_argument('-W' + opt)
+ add_project_arguments(['-Wno-' + opt], language : ['c'])
+ endif
+endforeach
+
+use_loongson_mmi = get_option('loongson-mmi')
+have_loongson_mmi = false
+loongson_mmi_flags = ['-mloongson-mmi']
+if not use_loongson_mmi.disabled()
+ if host_machine.cpu_family() == 'mips64' and cc.compiles('''
+ #ifndef __mips_loongson_vector_rev
+ #error "Loongson Multimedia Instructions are only available on Loongson"
+ #endif
+ #if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
+ #error "Need GCC >= 4.4 for Loongson MMI compilation"
+ #endif
+ #include "pixman/loongson-mmintrin.h"
+ int main () {
+ union {
+ __m64 v;
+ char c[8];
+ } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+ int b = 4;
+ __m64 c = _mm_srli_pi16 (a.v, b);
+ return 0;
+ }''',
+ args : loongson_mmi_flags,
+ include_directories : include_directories('.'),
+ name : 'Loongson MMI Intrinsic Support')
+ have_loongson_mmi = true
+ endif
+endif
+
+if have_loongson_mmi
+ config.set10('USE_LOONGSON_MMI', true)
+elif use_loongson_mmi.enabled()
+ error('Loongson MMI Support unavailable, but required')
+endif
+
+use_mmx = get_option('mmx')
+have_mmx = false
+mmx_flags = []
+
+if cc.get_id() == 'msvc'
+ mmx_flags = ['/w14710', '/w14714', '/wd4244']
+elif cc.get_id() == 'sun'
+ mmx_flags = ['-xarch=sse']
+else
+ mmx_flags = ['-mmmx', '-Winline']
+endif
+if not use_mmx.disabled()
+ if host_machine.cpu_family() == 'x86_64' or cc.get_id() == 'msvc'
+ have_mmx = true
+ elif host_machine.cpu_family() == 'x86' and cc.compiles('''
+ #include <mmintrin.h>
+ #include <stdint.h>
+
+ /* Check support for block expressions */
+ #define _mm_shuffle_pi16(A, N) \
+ ({ \
+ __m64 ret; \
+ \
+ /* Some versions of clang will choke on K */ \
+ asm ("pshufw %2, %1, %0\n\t" \
+ : "=y" (ret) \
+ : "y" (A), "K" ((const int8_t)N) \
+ ); \
+ \
+ ret; \
+ })
+
+ int main () {
+ __m64 v = _mm_cvtsi32_si64 (1);
+ __m64 w;
+
+ w = _mm_shuffle_pi16(v, 5);
+
+ /* Some versions of clang will choke on this */
+ asm ("pmulhuw %1, %0\n\t"
+ : "+y" (w)
+ : "y" (v)
+ );
+
+ return _mm_cvtsi64_si32 (v);
+ }''',
+ args : mmx_flags,
+ name : 'MMX Intrinsic Support')
+ have_mmx = true
+ endif
+endif
+
+if have_mmx
+  # Inline assembly does not work with x64 MSVC, so we use the
+  # compatibility intrinsics there
+ if cc.get_id() != 'msvc' or host_machine.cpu_family() != 'x86_64'
+ config.set10('USE_X86_MMX', true)
+ endif
+elif use_mmx.enabled()
+ error('MMX Support unavailable, but required')
+endif
+
+use_sse2 = get_option('sse2')
+have_sse2 = false
+sse2_flags = []
+if cc.get_id() == 'sun'
+ sse2_flags = ['-xarch=sse2']
+elif cc.get_id() != 'msvc'
+ sse2_flags = ['-msse2', '-Winline']
+endif
+if not use_sse2.disabled()
+ if host_machine.cpu_family() == 'x86'
+ if cc.compiles('''
+ #if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
+ # if !defined(__amd64__) && !defined(__x86_64__)
+ # error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
+ # endif
+ #endif
+ #include <mmintrin.h>
+ #include <xmmintrin.h>
+ #include <emmintrin.h>
+ int param;
+ int main () {
+ __m128i a = _mm_set1_epi32 (param), b = _mm_set1_epi32 (param + 1), c;
+ c = _mm_xor_si128 (a, b);
+ return _mm_cvtsi128_si32(c);
+ }''',
+ args : sse2_flags,
+ name : 'SSE2 Intrinsic Support')
+ have_sse2 = true
+ endif
+ elif host_machine.cpu_family() == 'x86_64'
+ have_sse2 = true
+ endif
+endif
+
+if have_sse2
+ config.set10('USE_SSE2', true)
+elif use_sse2.enabled()
+ error('sse2 Support unavailable, but required')
+endif
+
+use_ssse3 = get_option('ssse3')
+have_ssse3 = false
+ssse3_flags = []
+if cc.get_id() != 'msvc'
+ ssse3_flags = ['-mssse3', '-Winline']
+endif
+
+# Pre-2010 x64 MSVC compilers crash when building the SSSE3 code
+if not use_ssse3.disabled() and not (cc.get_id() == 'msvc' and cc.version().version_compare('<16') and host_machine.cpu_family() == 'x86_64')
+ if host_machine.cpu_family().startswith('x86')
+ if cc.compiles('''
+ #include <mmintrin.h>
+ #include <xmmintrin.h>
+ #include <emmintrin.h>
+ int param;
+ int main () {
+ __m128i a = _mm_set1_epi32 (param), b = _mm_set1_epi32 (param + 1), c;
+ c = _mm_xor_si128 (a, b);
+ return _mm_cvtsi128_si32(c);
+ }''',
+ args : ssse3_flags,
+ name : 'SSSE3 Intrinsic Support')
+ have_ssse3 = true
+ endif
+ endif
+endif
+
+if have_ssse3
+ config.set10('USE_SSSE3', true)
+elif use_ssse3.enabled()
+ error('ssse3 Support unavailable, but required')
+endif
+
+use_vmx = get_option('vmx')
+have_vmx = false
+vmx_flags = ['-maltivec', '-mabi=altivec']
+if not use_vmx.disabled()
+ if host_machine.cpu_family().startswith('ppc')
+ if cc.compiles('''
+ #include <altivec.h>
+ int main () {
+ vector unsigned int v = vec_splat_u32 (1);
+ v = vec_sub (v, v);
+ return 0;
+ }''',
+ args : vmx_flags,
+ name : 'VMX/Altivec Intrinsic Support')
+ have_vmx = true
+ endif
+ endif
+endif
+
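+# Probe assembler behaviour that the ARM assembly sources need to know
+# about: whether .func/.endfunc and unified syntax are accepted, and
+# whether C symbols get a leading underscore.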
+if cc.compiles('''
+ __asm__ (
+    ".func meson_test\n"
+    ".endfunc\n"
+ );''',
+ name : 'test for ASM .func directive')
+ config.set('ASM_HAVE_FUNC_DIRECTIVE', 1)
+endif
+
+if cc.compiles('''
+ __asm__ (
+ ".syntax unified\n"
+ );''',
+ name : 'test for ASM .syntax unified directive')
+ config.set('ASM_HAVE_SYNTAX_UNIFIED', 1)
+endif
+
+if cc.links('''
+ #include <stdint.h>
+
+ __asm__ (
+ " .global _testlabel\n"
+ "_testlabel:\n"
+ );
+
+ int testlabel();
+ int main(int argc, char* argv[]) {
+ return testlabel();
+ }''',
+ name : 'test for ASM leading underscore')
+ config.set('ASM_LEADING_UNDERSCORE', 1)
+endif
+
+
+if have_vmx
+ config.set10('USE_VMX', true)
+elif use_vmx.enabled()
+ error('vmx Support unavailable, but required')
+endif
+
+use_armv6_simd = get_option('arm-simd')
+have_armv6_simd = false
+if not use_armv6_simd.disabled()
+ if host_machine.cpu_family() == 'arm'
+ if cc.compiles(files('arm-simd-test.S'), name : 'ARMv6 SIMD Intrinsic Support')
+ have_armv6_simd = true
+ endif
+ endif
+endif
+
+if have_armv6_simd
+ config.set10('USE_ARM_SIMD', true)
+elif use_armv6_simd.enabled()
+ error('ARMv6 SIMD Support unavailable, but required')
+endif
+
+use_neon = get_option('neon')
+have_neon = false
+if not use_neon.disabled()
+ if host_machine.cpu_family() == 'arm'
+ if cc.compiles(files('neon-test.S'), name : 'NEON Intrinsic Support')
+ have_neon = true
+ endif
+ endif
+endif
+
+if have_neon
+ config.set10('USE_ARM_NEON', true)
+elif use_neon.enabled()
+ error('NEON Support unavailable, but required')
+endif
+
+use_a64neon = get_option('a64-neon')
+have_a64neon = false
+if not use_a64neon.disabled()
+ if host_machine.cpu_family() == 'aarch64'
+ if cc.compiles(files('a64-neon-test.S'), name : 'NEON A64 Intrinsic Support')
+ have_a64neon = true
+ endif
+ endif
+endif
+
+if have_a64neon
+ config.set10('USE_ARM_A64_NEON', true)
+elif use_a64neon.enabled()
+ error('A64 NEON Support unavailable, but required')
+endif
+
+use_iwmmxt = get_option('iwmmxt')
+have_iwmmxt = false
+iwmmxt_flags = ['-flax-vector-conversions', '-Winline']
+if not use_iwmmxt.disabled()
+ if get_option('iwmmxt2')
+ iwmmxt_flags += '-march=iwmmxt2'
+ else
+ iwmmxt_flags += '-march=iwmmxt'
+ endif
+
+ if host_machine.cpu_family() == 'arm'
+ if cc.compiles('''
+ #ifndef __IWMMXT__
+ #error "IWMMXT not enabled (with -march=iwmmxt)"
+ #endif
+ #if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
+ #error "Need GCC >= 4.8 for IWMMXT intrinsics"
+ #endif
+ #include <mmintrin.h>
+ int main () {
+ union {
+ __m64 v;
+ char c[8];
+ } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+ int b = 4;
+ __m64 c = _mm_srli_si64 (a.v, b);
+ }
+ ''',
+ args : iwmmxt_flags,
+ name : 'IWMMXT Intrinsic Support')
+ have_iwmmxt = true
+ endif
+ endif
+endif
+
+if have_iwmmxt
+ config.set10('USE_ARM_IWMMXT', true)
+elif use_iwmmxt.enabled()
+ error('IWMMXT Support unavailable, but required')
+endif
+
+use_mips_dspr2 = get_option('mips-dspr2')
+have_mips_dspr2 = false
+mips_dspr2_flags = ['-mdspr2']
+if not use_mips_dspr2.disabled()
+ if host_machine.cpu_family() == 'mips32'
+ if cc.compiles('''
+ #if !(defined(__mips__) && __mips_isa_rev >= 2)
+ #error MIPS DSPr2 is currently only available on MIPS32r2 platforms.
+ #endif
+ int
+ main ()
+ {
+ int c = 0, a = 0, b = 0;
+ __asm__ __volatile__ (
+ "precr.qb.ph %[c], %[a], %[b] \n\t"
+ : [c] "=r" (c)
+ : [a] "r" (a), [b] "r" (b)
+ );
+ return c;
+ }''',
+      args : mips_dspr2_flags,
+ name : 'DSPr2 Intrinsic Support')
+ have_mips_dspr2 = true
+ endif
+ endif
+endif
+
+if have_mips_dspr2
+ config.set10('USE_MIPS_DSPR2', true)
+elif use_mips_dspr2.enabled()
+ error('MIPS DSPr2 Support unavailable, but required')
+endif
+
+use_gnu_asm = get_option('gnu-inline-asm')
+if not use_gnu_asm.disabled()
+ if cc.compiles('''
+ int main () {
+ /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
+ asm volatile ( "\tnop\n" : : : "cc", "memory" );
+ return 0;
+ }
+ ''',
+ name : 'GNU Inline ASM support.')
+ config.set10('USE_GCC_INLINE_ASM', true)
+ elif use_gnu_asm.enabled()
+ error('GNU inline assembly support missing but required.')
+ endif
+endif
+
+if get_option('timers')
+ config.set('PIXMAN_TIMERS', 1)
+endif
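+
+# PIXMAN_GNUPLOT makes pixman-filter.c print the filter kernels it
+# generates in a format that can be piped into gnuplot.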
+if get_option('gnuplot')
+ config.set('PIXMAN_GNUPLOT', 1)
+endif
+
+if cc.get_id() != 'msvc'
+ dep_openmp = dependency('openmp', required : get_option('openmp'))
+ if dep_openmp.found()
+ config.set10('USE_OPENMP', true)
+ elif meson.version().version_compare('<0.51.0')
+    # In versions of meson before 0.51, the openmp dependency can still
+    # inject arguments in the auto case when it is not found, even though
+    # the detection itself works correctly, so we just replace dep_openmp
+    # with null_dep to work around this.
+ dep_openmp = null_dep
+ endif
+else
+  # The MSVC implementation of OpenMP is not compliant enough for our
+  # uses, so we disable it here.
+ # Please see: https://stackoverflow.com/questions/12560243/using-threadprivate-directive-in-visual-studio
+ dep_openmp = null_dep
+endif
+
+dep_gtk = dependency('gtk+-3.0', required : get_option('gtk').enabled() and get_option('demos').enabled())
+dep_glib = dependency('glib-2.0', required : get_option('gtk').enabled() and get_option('demos').enabled())
+
+dep_png = null_dep
+if not get_option('libpng').disabled()
+ dep_png = dependency('libpng', required : false)
+
+ # We need to look for the right library to link to for libpng,
+ # when looking for libpng manually
+ foreach png_ver : [ '16', '15', '14', '13', '12', '10' ]
+ if not dep_png.found()
+ dep_png = cc.find_library('libpng@0@'.format(png_ver), has_headers : ['png.h'], required : false)
+ endif
+ endforeach
+
+ if get_option('libpng').enabled() and not dep_png.found()
+ error('libpng support requested but libpng library not found')
+ endif
+endif
+
+if dep_png.found()
+ config.set('HAVE_LIBPNG', 1)
+endif
+dep_m = cc.find_library('m', required : false)
+dep_threads = dependency('threads')
+
+# MSVC-style compilers do not come with pthreads, so we must link to it
+# explicitly; currently pthreads-win32 is supported
+pthreads_found = false
+
+if dep_threads.found() and cc.has_header('pthread.h')
+ if cc.get_argument_syntax() == 'msvc'
+ pthread_lib = null_dep
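+    # pthreads-win32 names its libraries pthread<scheme><version>, where
+    # the scheme encodes the exception-handling model (VC = C cleanup,
+    # VCE = C++ EH, VSE = structured EH).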
+ foreach pthread_type : ['VC3', 'VSE3', 'VCE3', 'VC2', 'VSE2', 'VCE2']
+ if not pthread_lib.found()
+ pthread_lib = cc.find_library('pthread@0@'.format(pthread_type), required : false)
+ endif
+ endforeach
+ if pthread_lib.found()
+ pthreads_found = true
+ dep_threads = pthread_lib
+ endif
+ else
+ pthreads_found = true
+ endif
+else
+ # Avoid linking with -pthread if we don't actually have pthreads
+ dep_threads = null_dep
+endif
+
+if pthreads_found
+ config.set('HAVE_PTHREADS', 1)
+endif
+
+funcs = ['sigaction', 'alarm', 'mprotect', 'getpagesize', 'mmap', 'getisax', 'gettimeofday']
+# MinGW claims to have posix_memalign, but it doesn't
+if host_machine.system() != 'windows'
+ funcs += 'posix_memalign'
+endif
+
+foreach f : funcs
+ if cc.has_function(f)
+ config.set('HAVE_@0@'.format(f.to_upper()), 1)
+ endif
+endforeach
+
+# This is only used in one test, which defines _GNU_SOURCE itself
+if cc.has_function('feenableexcept',
+ prefix : '#define _GNU_SOURCE\n#include <fenv.h>',
+ dependencies : dep_m)
+ config.set('HAVE_FEENABLEEXCEPT', 1)
+endif
+
+if cc.has_header_symbol('fenv.h', 'FE_DIVBYZERO')
+ config.set('HAVE_FEDIVBYZERO', 1)
+endif
+
+foreach h : ['sys/mman.h', 'fenv.h', 'unistd.h']
+ if cc.check_header(h)
+ config.set('HAVE_@0@'.format(h.underscorify().to_upper()), 1)
+ endif
+endforeach
+
+use_tls = get_option('tls')
+have_tls = ''
+if not use_tls.disabled()
+  # gcc on Windows only warns that __declspec(thread) isn't supported;
+  # passing -Werror=attributes makes the check fail instead.
+ if (host_machine.system() == 'windows' and
+ cc.compiles('int __declspec(thread) foo;',
+ args : cc.get_supported_arguments(['-Werror=attributes']),
+ name : 'TLS via __declspec(thread)'))
+ have_tls = '__declspec(thread)'
+ elif cc.compiles('int __thread foo;', name : 'TLS via __thread')
+ have_tls = '__thread'
+ endif
+endif
+
+if have_tls != ''
+ config.set('TLS', have_tls)
+elif use_tls.enabled()
+ error('Compiler TLS Support unavailable, but required')
+endif
+
+if cc.links('''
+ static int x = 1;
+ static void __attribute__((constructor)) constructor_function () { x = 0; }
+ int main (void) { return x; }
+ ''',
+ name : '__attribute__((constructor))')
+ config.set('TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR', 1)
+endif
+
+if cc.links(
+ ' __float128 a = 1.0Q, b = 2.0Q; int main (void) { return a + b; }',
+ name : 'Has float128 support')
+ config.set('HAVE_FLOAT128', 1)
+endif
+
+if cc.has_function('__builtin_clz')
+ config.set('HAVE_BUILTIN_CLZ', 1)
+endif
+
+if cc.links('''
+ unsigned int __attribute__ ((vector_size(16))) e, a, b;
+ int main (void) { e = a - ((b << 27) + (b >> (32 - 27))) + 1; return e[0]; }
+ ''',
+ name : 'Support for GCC vector extensions')
+ config.set('HAVE_GCC_VECTOR_EXTENSIONS', 1)
+endif
+
+if host_machine.endian() == 'big'
+ config.set('WORDS_BIGENDIAN', 1)
+endif
+
+config.set('SIZEOF_LONG', cc.sizeof('long'))
+
+# Required to compile pixman-private.h, which errors out if PACKAGE is
+# not defined; the value itself is never used.
+config.set('PACKAGE', 'foo')
+
+version_conf = configuration_data()
+split = meson.project_version().split('.')
+version_conf.set('PIXMAN_VERSION_MAJOR', split[0])
+version_conf.set('PIXMAN_VERSION_MINOR', split[1])
+version_conf.set('PIXMAN_VERSION_MICRO', split[2])
+
+add_project_arguments('-DHAVE_CONFIG_H', language : ['c'])
+
+subdir('pixman')
+
+if not get_option('tests').disabled() or not get_option('demos').disabled()
+ subdir(join_paths('test', 'utils'))
+endif
+
+if not get_option('demos').disabled()
+ subdir('demos')
+endif
+
+if not get_option('tests').disabled()
+ subdir('test')
+endif
+
+pkg = import('pkgconfig')
+pkg.generate(libpixman,
+ name : 'Pixman',
+ filebase : 'pixman-1',
+ description : 'The pixman library (version 1)',
+ subdirs: 'pixman-1',
+ version : meson.project_version(),
+)
diff --git a/meson_options.txt b/meson_options.txt
new file mode 100644
index 0000000..df10889
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1,128 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+option(
+ 'loongson-mmi',
+ type : 'feature',
+ description : 'Use Loongson MMI intrinsic optimized paths',
+)
+option(
+ 'mmx',
+ type : 'feature',
+ description : 'Use X86 MMX intrinsic optimized paths',
+)
+option(
+ 'sse2',
+ type : 'feature',
+ description : 'Use X86 SSE2 intrinsic optimized paths',
+)
+option(
+ 'ssse3',
+ type : 'feature',
+ description : 'Use X86 SSSE3 intrinsic optimized paths',
+)
+option(
+ 'vmx',
+ type : 'feature',
+ description : 'Use PPC VMX/Altivec intrinsic optimized paths',
+)
+option(
+ 'arm-simd',
+ type : 'feature',
+ description : 'Use ARMv6 SIMD intrinsic optimized paths',
+)
+option(
+ 'neon',
+ type : 'feature',
+ description : 'Use ARM NEON intrinsic optimized paths',
+)
+option(
+ 'a64-neon',
+ type : 'feature',
+ description : 'Use ARM A64 NEON intrinsic optimized paths',
+)
+option(
+ 'iwmmxt',
+ type : 'feature',
+ description : 'Use ARM IWMMXT intrinsic optimized paths',
+)
+option(
+ 'iwmmxt2',
+ type : 'boolean',
+ value : true,
+ description : 'Use ARM IWMMXT2 intrinsic instead of IWMMXT',
+)
+option(
+ 'mips-dspr2',
+ type : 'feature',
+ description : 'Use MIPS32 DSPr2 intrinsic optimized paths',
+)
+option(
+ 'gnu-inline-asm',
+ type : 'feature',
+ description : 'Use GNU style inline assembler',
+)
+option(
+ 'tls',
+ type : 'feature',
+ description : 'Use compiler support for thread-local storage',
+)
+option(
+ 'cpu-features-path',
+ type : 'string',
+ description : 'Path to platform-specific cpu-features.[ch] for systems that do not provide it (e.g. Android)',
+)
+option(
+ 'openmp',
+ type : 'feature',
+ description : 'Enable OpenMP for tests',
+)
+option(
+ 'timers',
+ type : 'boolean',
+ value : false,
+ description : 'Enable TIMER_* macros',
+)
+option(
+ 'gnuplot',
+ type : 'boolean',
+ value : false,
+ description : 'Enable output of filters that can be piped to gnuplot',
+)
+option(
+ 'gtk',
+ type : 'feature',
+ description : 'Enable demos using GTK',
+)
+option(
+ 'libpng',
+ type : 'feature',
+ description : 'Use libpng in tests'
+)
+option(
+ 'tests',
+ type : 'feature',
+ description : 'Build tests'
+)
+option(
+ 'demos',
+ type : 'feature',
+ description : 'Build demos'
+)
diff --git a/neon-test.S b/neon-test.S
new file mode 100644
index 0000000..c30a399
--- /dev/null
+++ b/neon-test.S
@@ -0,0 +1,12 @@
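+/* Build-time probe used by meson.build: if the toolchain accepts these
+ * directives and NEON instructions, the ARM NEON fast paths are enabled.
+ */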
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+vmovn.u16 d0, q0
diff --git a/pixman-1-uninstalled.pc.in b/pixman-1-uninstalled.pc.in
deleted file mode 100644
index e0347d0..0000000
--- a/pixman-1-uninstalled.pc.in
+++ /dev/null
@@ -1,5 +0,0 @@
-Name: Pixman
-Description: The pixman library (version 1)
-Version: @PACKAGE_VERSION@
-Cflags: -I${pc_top_builddir}/${pcfiledir}/pixman
-Libs: ${pc_top_builddir}/${pcfiledir}/pixman/libpixman-1.la
diff --git a/pixman-1.pc.in b/pixman-1.pc.in
deleted file mode 100644
index e3b9711..0000000
--- a/pixman-1.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: Pixman
-Description: The pixman library (version 1)
-Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/pixman-1
-Libs: -L${libdir} -lpixman-1
-
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
deleted file mode 100644
index 581b6f6..0000000
--- a/pixman/Makefile.am
+++ /dev/null
@@ -1,141 +0,0 @@
-include $(top_srcdir)/pixman/Makefile.sources
-
-lib_LTLIBRARIES = libpixman-1.la
-
-libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@
-libpixman_1_la_LIBADD = @PTHREAD_LIBS@ -lm
-libpixman_1_la_SOURCES = $(libpixman_sources) $(libpixman_headers)
-
-libpixmanincludedir = $(includedir)/pixman-1
-libpixmaninclude_HEADERS = pixman.h pixman-version.h
-noinst_LTLIBRARIES =
-
-EXTRA_DIST = \
- Makefile.win32 \
- pixman-region.c \
- solaris-hwcap.mapfile \
- $(NULL)
-
-# mmx code
-if USE_X86_MMX
-noinst_LTLIBRARIES += libpixman-mmx.la
-libpixman_mmx_la_SOURCES = \
- pixman-mmx.c
-libpixman_mmx_la_CFLAGS = $(MMX_CFLAGS)
-libpixman_1_la_LDFLAGS += $(MMX_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-mmx.la
-
-ASM_CFLAGS_mmx=$(MMX_CFLAGS)
-endif
-
-# vmx code
-if USE_VMX
-noinst_LTLIBRARIES += libpixman-vmx.la
-libpixman_vmx_la_SOURCES = \
- pixman-vmx.c \
- pixman-combine32.h
-libpixman_vmx_la_CFLAGS = $(VMX_CFLAGS)
-libpixman_1_la_LIBADD += libpixman-vmx.la
-
-ASM_CFLAGS_vmx=$(VMX_CFLAGS)
-endif
-
-# sse2 code
-if USE_SSE2
-noinst_LTLIBRARIES += libpixman-sse2.la
-libpixman_sse2_la_SOURCES = \
- pixman-sse2.c
-libpixman_sse2_la_CFLAGS = $(SSE2_CFLAGS)
-libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-sse2.la
-
-ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
-endif
-
-# ssse3 code
-if USE_SSSE3
-noinst_LTLIBRARIES += libpixman-ssse3.la
-libpixman_ssse3_la_SOURCES = \
- pixman-ssse3.c
-libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS)
-libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-ssse3.la
-
-ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
-endif
-
-# arm simd code
-if USE_ARM_SIMD
-noinst_LTLIBRARIES += libpixman-arm-simd.la
-libpixman_arm_simd_la_SOURCES = \
- pixman-arm-simd.c \
- pixman-arm-common.h \
- pixman-arm-simd-asm.S \
- pixman-arm-simd-asm-scaled.S \
- pixman-arm-asm.h \
- pixman-arm-simd-asm.h
-libpixman_1_la_LIBADD += libpixman-arm-simd.la
-
-ASM_CFLAGS_arm_simd=
-endif
-
-# arm neon code
-if USE_ARM_NEON
-noinst_LTLIBRARIES += libpixman-arm-neon.la
-libpixman_arm_neon_la_SOURCES = \
- pixman-arm-neon.c \
- pixman-arm-common.h \
- pixman-arm-neon-asm.S \
- pixman-arm-neon-asm-bilinear.S \
- pixman-arm-asm.h \
- pixman-arm-neon-asm.h
-libpixman_1_la_LIBADD += libpixman-arm-neon.la
-
-ASM_CFLAGS_arm_neon=
-endif
-
-# iwmmxt code
-if USE_ARM_IWMMXT
-libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
-noinst_LTLIBRARIES += libpixman-iwmmxt.la
-libpixman_1_la_LIBADD += libpixman-iwmmxt.la
-
-libpixman_iwmmxt_la-pixman-mmx.lo: pixman-mmx.c
- $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(AM_CPPFLAGS) $(AM_CPPFLAGS) $(CPPFLAGS) $(CFLAGS) $(IWMMXT_CFLAGS) -MT libpixman_iwmmxt_la-pixman-mmx.lo -MD -MP -MF $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo -c -o libpixman_iwmmxt_la-pixman-mmx.lo `test -f 'pixman-mmx.c' || echo '$(srcdir)/'`pixman-mmx.c
- $(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Plo
-
-libpixman_iwmmxt_la_DEPENDENCIES = $(am__DEPENDENCIES_1)
-libpixman_iwmmxt_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
- $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
- $(CFLAGS) $(IWMMXT_CFLAGS) $(AM_LDFLAGS) \
- $(LDFLAGS) -o $@
-
-libpixman-iwmmxt.la: libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_DEPENDENCIES)
- $(AM_V_CCLD)$(libpixman_iwmmxt_la_LINK) libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_LIBADD) $(LIBS)
-endif
-
-# mips dspr2 code
-if USE_MIPS_DSPR2
-noinst_LTLIBRARIES += libpixman-mips-dspr2.la
-libpixman_mips_dspr2_la_SOURCES = \
- pixman-mips-dspr2.c \
- pixman-mips-dspr2.h \
- pixman-mips-dspr2-asm.S \
- pixman-mips-dspr2-asm.h \
- pixman-mips-memcpy-asm.S
-libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
-
-ASM_CFLAGS_mips_dspr2=
-endif
-
-# loongson code
-if USE_LOONGSON_MMI
-noinst_LTLIBRARIES += libpixman-loongson-mmi.la
-libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
-libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS)
-libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
-libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
-endif
-
-.c.s : $(libpixmaninclude_HEADERS)
- $(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/pixman/Makefile.sources b/pixman/Makefile.sources
deleted file mode 100644
index c624eb9..0000000
--- a/pixman/Makefile.sources
+++ /dev/null
@@ -1,42 +0,0 @@
-libpixman_sources = \
- pixman.c \
- pixman-access.c \
- pixman-access-accessors.c \
- pixman-bits-image.c \
- pixman-combine32.c \
- pixman-combine-float.c \
- pixman-conical-gradient.c \
- pixman-filter.c \
- pixman-x86.c \
- pixman-mips.c \
- pixman-arm.c \
- pixman-ppc.c \
- pixman-edge.c \
- pixman-edge-accessors.c \
- pixman-fast-path.c \
- pixman-glyph.c \
- pixman-general.c \
- pixman-gradient-walker.c \
- pixman-image.c \
- pixman-implementation.c \
- pixman-linear-gradient.c \
- pixman-matrix.c \
- pixman-noop.c \
- pixman-radial-gradient.c \
- pixman-region16.c \
- pixman-region32.c \
- pixman-solid-fill.c \
- pixman-timer.c \
- pixman-trap.c \
- pixman-utils.c \
- $(NULL)
-
-libpixman_headers = \
- pixman.h \
- pixman-accessor.h \
- pixman-combine32.h \
- pixman-compiler.h \
- pixman-edge-imp.h \
- pixman-inlines.h \
- pixman-private.h \
- $(NULL)
diff --git a/pixman/Makefile.win32 b/pixman/Makefile.win32
deleted file mode 100644
index 7b64033..0000000
--- a/pixman/Makefile.win32
+++ /dev/null
@@ -1,93 +0,0 @@
-default: all
-
-top_srcdir = ..
-include $(top_srcdir)/pixman/Makefile.sources
-include $(top_srcdir)/Makefile.win32.common
-
-MMX_VAR = $(MMX)
-ifeq ($(MMX_VAR),)
-MMX_VAR=on
-endif
-
-SSE2_VAR = $(SSE2)
-ifeq ($(SSE2_VAR),)
-SSE2_VAR=on
-endif
-
-SSSE3_VAR = $(SSSE3)
-ifeq ($(SSSE3_VAR),)
-SSSE3_VAR=on
-endif
-
-MMX_CFLAGS = -DUSE_X86_MMX -w14710 -w14714
-SSE2_CFLAGS = -DUSE_SSE2
-SSSE3_CFLAGS = -DUSE_SSSE3
-
-# MMX compilation flags
-ifeq ($(MMX_VAR),on)
-PIXMAN_CFLAGS += $(MMX_CFLAGS)
-libpixman_sources += pixman-mmx.c
-endif
-
-# SSE2 compilation flags
-ifeq ($(SSE2_VAR),on)
-PIXMAN_CFLAGS += $(SSE2_CFLAGS)
-libpixman_sources += pixman-sse2.c
-endif
-
-# SSSE3 compilation flags
-ifeq ($(SSSE3_VAR),on)
-PIXMAN_CFLAGS += $(SSSE3_CFLAGS)
-libpixman_sources += pixman-ssse3.c
-endif
-
-OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libpixman_sources))
-
-# targets
-all: inform informMMX informSSE2 informSSSE3 $(CFG_VAR)/$(LIBRARY).lib
-
-informMMX:
-ifneq ($(MMX),off)
-ifneq ($(MMX),on)
-ifneq ($(MMX),)
- @echo "Invalid specified MMX option : "$(MMX_VAR)"."
- @echo
- @echo "Possible choices for MMX are 'on' or 'off'"
- @exit 1
-endif
- @echo "Setting MMX flag to default value 'on'... (use MMX=on or MMX=off)"
-endif
-endif
-
-informSSE2:
-ifneq ($(SSE2),off)
-ifneq ($(SSE2),on)
-ifneq ($(SSE2),)
- @echo "Invalid specified SSE option : "$(SSE2)"."
- @echo
- @echo "Possible choices for SSE2 are 'on' or 'off'"
- @exit 1
-endif
- @echo "Setting SSE2 flag to default value 'on'... (use SSE2=on or SSE2=off)"
-endif
-endif
-
-informSSSE3:
-ifneq ($(SSSE3),off)
-ifneq ($(SSSE3),on)
-ifneq ($(SSSE3),)
- @echo "Invalid specified SSE option : "$(SSSE3)"."
- @echo
- @echo "Possible choices for SSSE3 are 'on' or 'off'"
- @exit 1
-endif
- @echo "Setting SSSE3 flag to default value 'on'... (use SSSE3=on or SSSE3=off)"
-endif
-endif
-
-
-# pixman linking
-$(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS)
- @$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
-
-.PHONY: all informMMX informSSE2 informSSSE3
diff --git a/pixman/dither/blue-noise-64x64.h b/pixman/dither/blue-noise-64x64.h
new file mode 100644
index 0000000..93c8805
--- /dev/null
+++ b/pixman/dither/blue-noise-64x64.h
@@ -0,0 +1,77 @@
+/* WARNING: This file is generated by make-blue-noise.c
+ * Please edit that file instead of this one.
+ */
+
+#ifndef BLUE_NOISE_64X64_H
+#define BLUE_NOISE_64X64_H
+
+#include <stdint.h>
+
+static const uint16_t dither_blue_noise_64x64[4096] = {
+ 3039, 1368, 3169, 103, 2211, 1248, 2981, 668, 2633, 37, 3963, 2903, 384, 2564, 3115, 1973, 3348, 830, 2505, 1293, 3054, 1060, 1505, 3268, 400, 1341, 593, 3802, 3384, 429, 4082, 1411, 2503, 3863, 126, 1292, 1887, 2855, 205, 2094, 2977, 1899, 3924, 356, 3088, 2500, 3942, 1409, 2293, 1734, 3732, 1291, 3227, 277, 2054, 786, 2871, 411, 2425, 1678, 3986, 455, 2879, 2288,
+ 388, 1972, 3851, 778, 2768, 3697, 944, 2123, 1501, 3533, 937, 1713, 1381, 3888, 156, 1242, 516, 2888, 1607, 3676, 632, 2397, 3804, 2673, 1898, 3534, 2593, 1777, 1170, 2299, 3013, 1838, 523, 3053, 1647, 3601, 3197, 959, 1520, 3633, 893, 2437, 3367, 2187, 1258, 137, 1965, 401, 3546, 643, 3087, 2498, 733, 2786, 3371, 4053, 1266, 1977, 3663, 183, 2570, 2107, 1183, 3708,
+ 907, 2473, 1151, 3363, 1527, 1902, 232, 3903, 3060, 496, 2486, 3206, 2165, 861, 2387, 3653, 2101, 3972, 132, 2162, 3437, 1827, 215, 895, 3114, 271, 969, 2932, 197, 1598, 878, 3696, 1140, 2120, 904, 2431, 302, 3846, 2675, 481, 3187, 66, 1440, 650, 3833, 2826, 3435, 901, 2936, 2111, 250, 1875, 3609, 1174, 1747, 162, 2346, 3420, 913, 3172, 1383, 752, 3298, 1735,
+ 3540, 2938, 249, 2324, 526, 3099, 2561, 1324, 2347, 1861, 1200, 3702, 257, 3442, 1514, 2999, 992, 1766, 2735, 1163, 478, 2943, 1279, 3635, 2177, 1464, 3672, 2386, 3871, 3340, 2690, 64, 3489, 2811, 3999, 633, 1948, 1243, 2269, 1807, 1143, 2750, 3729, 1790, 2363, 1053, 1537, 2636, 4065, 1076, 1476, 3869, 450, 2200, 2676, 658, 2979, 1548, 544, 1913, 2838, 3911, 116, 2698,
+ 517, 1295, 3997, 1739, 3665, 1083, 3509, 599, 3400, 118, 2956, 720, 2689, 1907, 567, 2523, 284, 3397, 711, 3219, 2450, 3985, 1665, 2549, 562, 3011, 1855, 729, 1355, 528, 1908, 2456, 1384, 337, 1540, 2654, 3138, 3513, 703, 4080, 3314, 2047, 855, 3037, 209, 3317, 577, 1828, 17, 2336, 3193, 2748, 962, 3441, 1450, 3246, 1075, 3878, 2615, 3497, 1033, 2310, 1442, 2183,
+ 1654, 3254, 2061, 738, 2832, 148, 2030, 1670, 909, 3850, 2109, 1533, 4046, 1085, 3098, 3897, 1378, 2248, 3829, 1495, 1966, 23, 797, 3427, 1124, 4057, 95, 2787, 2190, 3074, 3950, 742, 3194, 1999, 3386, 1113, 16, 1657, 2804, 201, 1543, 383, 2559, 1325, 3604, 2068, 2493, 3771, 1284, 3460, 710, 1716, 2447, 80, 3811, 2032, 347, 2227, 15, 1689, 397, 3084, 662, 3798,
+ 973, 43, 2608, 3143, 1459, 2423, 4066, 2770, 3191, 1283, 2630, 314, 3235, 2289, 72, 1822, 2840, 924, 350, 2653, 1057, 3715, 2235, 2775, 346, 2083, 1553, 3292, 1081, 274, 1686, 1188, 2327, 3743, 578, 2234, 3916, 2519, 1011, 3056, 2207, 3438, 3890, 537, 1617, 837, 3094, 373, 2795, 1980, 276, 3951, 1353, 3015, 844, 1724, 3651, 2923, 1316, 4092, 2504, 3627, 1936, 2854,
+ 2461, 3929, 1193, 421, 3746, 820, 1180, 286, 2261, 532, 3625, 1812, 802, 1327, 3527, 670, 3730, 2025, 3124, 3565, 529, 2960, 1769, 1390, 3196, 2494, 3756, 796, 3618, 2602, 3463, 2847, 166, 953, 1745, 2900, 438, 2070, 1418, 3741, 639, 1205, 1891, 2882, 2282, 4012, 1182, 1696, 3630, 951, 2904, 2170, 3530, 375, 2320, 2742, 1132, 701, 3216, 2023, 847, 1230, 310, 3431,
+ 770, 1961, 3531, 1702, 2181, 3370, 1877, 3072, 1571, 3389, 1071, 2415, 3782, 2803, 1610, 2454, 1211, 182, 1655, 2322, 1282, 3372, 287, 3935, 704, 1232, 415, 1910, 2286, 1399, 556, 1964, 4068, 2444, 3605, 1272, 3345, 816, 3526, 256, 2402, 2777, 955, 345, 3289, 111, 2727, 635, 2396, 1488, 3331, 600, 1032, 1575, 4026, 515, 3507, 2433, 1605, 460, 3364, 2783, 1810, 1397,
+ 2334, 223, 2945, 688, 2533, 99, 2705, 624, 3944, 2073, 46, 2978, 508, 2132, 269, 3173, 3453, 2631, 4076, 694, 1892, 2586, 972, 2178, 3470, 1695, 2849, 3141, 77, 3884, 994, 3029, 1536, 673, 3083, 124, 2583, 1722, 2821, 1944, 4027, 1661, 3176, 3728, 1337, 1813, 3503, 2035, 3930, 157, 2537, 1865, 3096, 2646, 1941, 3252, 1449, 135, 2836, 3758, 2139, 84, 3678, 3106,
+ 3862, 1545, 3307, 1320, 3955, 1031, 3664, 1306, 2460, 776, 1487, 3294, 1187, 3990, 1903, 1021, 549, 1484, 943, 3027, 97, 3853, 1499, 2880, 198, 2575, 3995, 1089, 1587, 2475, 3282, 339, 2657, 1158, 2105, 1493, 3943, 580, 3232, 1287, 846, 48, 2480, 2112, 771, 2534, 459, 3134, 850, 1298, 3790, 325, 3652, 1249, 193, 940, 2202, 3895, 1829, 911, 1366, 2577, 1069, 534,
+ 2104, 1009, 2667, 392, 1983, 2917, 1645, 324, 3439, 2869, 3705, 1767, 2592, 756, 2916, 3683, 2276, 2850, 2053, 3594, 2403, 3181, 634, 3699, 1933, 906, 519, 2150, 3673, 764, 1770, 2220, 3795, 3336, 502, 3547, 2339, 1110, 301, 2210, 3354, 3643, 569, 1518, 2940, 3973, 1138, 1613, 2773, 2127, 2983, 1671, 769, 2161, 3800, 2730, 3127, 1179, 533, 3259, 2284, 4014, 1651, 2820,
+ 3566, 653, 1839, 3455, 2399, 789, 3149, 2244, 1863, 1099, 474, 2307, 158, 3541, 1312, 1711, 0, 3902, 360, 1629, 1091, 395, 1781, 1191, 2374, 3353, 1419, 3225, 206, 2931, 3553, 1046, 54, 1646, 2470, 910, 1860, 3137, 3770, 2635, 1562, 2809, 1215, 3788, 222, 2199, 3335, 67, 3606, 524, 1001, 3309, 2410, 3473, 591, 1619, 291, 2502, 3629, 2891, 335, 741, 3378, 168,
+ 2384, 3129, 4051, 22, 1444, 3613, 543, 3893, 186, 2665, 4062, 933, 3058, 2142, 449, 2711, 3224, 849, 1330, 3349, 2195, 2670, 3484, 2993, 32, 3774, 2722, 1859, 2548, 1268, 583, 2027, 3165, 2807, 4029, 227, 2897, 1434, 721, 1816, 195, 905, 2066, 3258, 1754, 970, 2674, 1880, 2338, 3915, 1485, 2660, 14, 1313, 2914, 2046, 4074, 791, 1917, 1301, 1725, 2687, 2019, 1443,
+ 418, 1186, 1664, 2859, 1049, 2056, 2741, 1226, 1589, 3186, 2042, 1377, 3449, 1574, 3941, 1063, 1930, 2501, 3751, 2930, 671, 4031, 888, 2081, 1544, 684, 1117, 351, 4052, 1698, 2393, 3881, 1439, 785, 1277, 2013, 3488, 441, 2459, 3980, 3061, 3481, 2543, 419, 3020, 609, 3515, 1350, 799, 2878, 348, 2034, 3966, 1824, 950, 3281, 1394, 2239, 3452, 55, 3922, 3119, 892, 3785,
+ 3023, 2140, 782, 2492, 3817, 241, 3355, 2424, 856, 3639, 612, 2556, 245, 2858, 705, 2316, 3562, 495, 1748, 128, 1912, 1454, 280, 2552, 3905, 3130, 2274, 3472, 834, 3055, 240, 2692, 471, 2272, 3301, 2632, 1080, 3693, 2136, 1029, 1364, 590, 1611, 4067, 1190, 2360, 3827, 261, 3180, 1768, 3471, 1103, 3003, 520, 3674, 151, 2571, 555, 3033, 982, 2353, 504, 1259, 2555,
+ 149, 3889, 3380, 493, 3178, 1681, 663, 1924, 2990, 49, 1792, 3861, 1192, 1987, 3273, 297, 1457, 3043, 1177, 2292, 3249, 2829, 3682, 1154, 1758, 428, 2872, 1993, 1500, 3703, 1129, 3421, 1840, 3754, 163, 659, 1733, 3182, 38, 2875, 1957, 3614, 2237, 78, 1873, 2801, 1513, 2121, 1074, 2516, 667, 3710, 1429, 2430, 2088, 2830, 1072, 3557, 1531, 2733, 1955, 3286, 3590, 1826,
+ 2778, 1068, 1932, 1452, 2279, 1185, 3564, 3952, 1391, 2726, 3313, 2331, 870, 3709, 1674, 2772, 4085, 808, 2596, 3848, 927, 538, 2335, 3334, 773, 3597, 1347, 109, 2663, 608, 2108, 2994, 936, 1524, 2922, 3968, 2422, 1467, 845, 3870, 321, 2704, 1073, 3308, 3680, 823, 430, 3375, 4030, 112, 2171, 2695, 267, 3374, 731, 1627, 3919, 1871, 352, 3839, 1370, 234, 794, 1532,
+ 3245, 647, 3575, 74, 3045, 2766, 285, 2174, 498, 1059, 1551, 385, 3125, 2598, 143, 1128, 2095, 3395, 318, 1590, 3524, 1345, 1969, 242, 2759, 2092, 947, 3926, 3244, 2356, 1658, 6, 3593, 2554, 1172, 1995, 371, 2755, 3417, 2294, 1570, 3164, 748, 2517, 1401, 3111, 2420, 1662, 2910, 1276, 3276, 854, 1804, 4000, 1253, 2987, 229, 2344, 3184, 649, 2196, 2921, 4095, 2389,
+ 1289, 2193, 2579, 4023, 757, 1858, 986, 3199, 2514, 3475, 4021, 2154, 651, 1432, 3468, 2404, 574, 1799, 3105, 2145, 86, 2614, 3218, 1565, 4088, 2481, 3079, 1815, 323, 1212, 3837, 759, 2159, 435, 3223, 784, 3659, 1114, 1888, 550, 1221, 3786, 1803, 499, 2117, 185, 3763, 942, 589, 2001, 3838, 1483, 3154, 2256, 468, 2544, 3403, 898, 1208, 2610, 3622, 967, 1929, 378,
+ 3781, 220, 1656, 1115, 3347, 2428, 3822, 1577, 712, 1959, 110, 2765, 1762, 3854, 979, 2928, 3714, 1371, 746, 3969, 2884, 975, 3779, 641, 1142, 159, 1460, 702, 3485, 2866, 2495, 3330, 1305, 3937, 1635, 2229, 2962, 146, 4055, 3091, 2417, 100, 3508, 2933, 4006, 1167, 1920, 2760, 3552, 2545, 433, 2845, 142, 1056, 1886, 3616, 1435, 2099, 3803, 1749, 27, 1446, 3350, 2843,
+ 884, 3310, 2948, 2103, 447, 1351, 187, 2895, 3655, 1256, 3036, 932, 3325, 2257, 451, 1915, 40, 2780, 2438, 1112, 1814, 423, 2290, 1905, 2898, 3419, 2306, 3760, 1938, 486, 1019, 1791, 3010, 2628, 203, 3408, 1269, 2507, 1606, 862, 2779, 2078, 952, 1529, 2638, 708, 3332, 1413, 2, 1726, 1156, 3500, 2392, 3791, 3076, 812, 107, 2861, 501, 3050, 3487, 2455, 594, 1731,
+ 2685, 1498, 680, 3908, 2621, 3529, 1786, 2236, 342, 2569, 1526, 3722, 230, 1290, 3203, 3947, 1609, 3516, 467, 3267, 3685, 1461, 3140, 3569, 367, 1759, 928, 2754, 1332, 2219, 4034, 260, 655, 1984, 978, 3814, 617, 2086, 3525, 279, 3841, 1373, 3361, 319, 2251, 3066, 407, 2382, 3918, 3133, 2168, 762, 1523, 507, 2641, 1677, 4025, 2413, 1584, 793, 2049, 1109, 3962, 2218,
+ 1194, 3692, 266, 1687, 981, 3103, 740, 3983, 1005, 3434, 570, 2383, 1942, 2718, 676, 2462, 1007, 2089, 1308, 2222, 233, 2568, 829, 1241, 2669, 3987, 514, 3303, 69, 3142, 1603, 3560, 2295, 3288, 1497, 2696, 1764, 2865, 1058, 3271, 1914, 477, 2529, 3927, 1736, 1273, 3752, 2029, 1012, 565, 2798, 4078, 1949, 3305, 1175, 2179, 380, 3366, 1195, 3849, 2637, 416, 2959, 125,
+ 3396, 2467, 2036, 3234, 2340, 68, 2819, 1436, 2011, 3139, 1704, 4073, 860, 3582, 1468, 2969, 211, 3157, 4056, 866, 2935, 2000, 3923, 31, 2157, 1477, 2429, 1147, 3792, 2557, 774, 2802, 1153, 3747, 464, 3192, 42, 3904, 539, 1474, 2283, 803, 2876, 1061, 75, 3477, 747, 2893, 1538, 3626, 251, 1322, 2506, 189, 2791, 3667, 939, 2991, 1971, 175, 3195, 1416, 3648, 1857,
+ 3052, 454, 851, 3789, 1271, 1906, 3694, 2484, 406, 2757, 26, 1189, 2909, 296, 2215, 3784, 1864, 637, 2715, 1673, 3445, 581, 1572, 3059, 3469, 761, 2984, 1737, 2058, 440, 1414, 1921, 121, 2527, 894, 2223, 1302, 2377, 3077, 2666, 3759, 3198, 1811, 3661, 2166, 2731, 1883, 359, 3285, 2458, 1805, 3459, 926, 3834, 675, 1893, 1496, 2612, 657, 3523, 1763, 2354, 564, 961,
+ 1367, 3977, 1588, 2714, 322, 3446, 1088, 625, 3887, 1354, 3535, 2090, 3316, 1760, 1127, 483, 3491, 1421, 2301, 94, 1202, 3740, 2311, 1014, 1878, 3836, 180, 3412, 991, 2868, 3953, 3450, 3081, 1632, 4071, 1882, 3543, 726, 1719, 179, 1171, 364, 1420, 622, 3090, 1490, 946, 4007, 2212, 1102, 619, 2739, 2189, 1669, 2937, 3426, 39, 3940, 2191, 1264, 887, 4091, 2792, 2135,
+ 4, 2883, 2281, 631, 3044, 1641, 2232, 3243, 1773, 2319, 827, 2591, 629, 3938, 2426, 3222, 2629, 1044, 3879, 3293, 1952, 2749, 275, 2590, 472, 1372, 2496, 660, 3669, 2264, 208, 915, 2167, 561, 2828, 307, 3265, 1104, 3964, 2155, 3425, 1951, 4077, 2391, 283, 3387, 2581, 115, 1415, 3069, 3896, 141, 3158, 1214, 442, 2405, 1349, 3085, 425, 2528, 3002, 312, 1602, 3588,
+ 1137, 3323, 1963, 1002, 3578, 2521, 127, 925, 2970, 273, 3737, 1573, 167, 2863, 1509, 800, 147, 2059, 2942, 409, 921, 3151, 1451, 3909, 3333, 2844, 2096, 1512, 3136, 1210, 1798, 2709, 1331, 3586, 1034, 1521, 2441, 2926, 488, 2585, 775, 3031, 2693, 879, 3602, 1173, 2028, 3654, 2781, 841, 1975, 1507, 3646, 768, 3991, 2012, 996, 3544, 1666, 3810, 1990, 3360, 753, 2597,
+ 3736, 304, 1473, 3828, 485, 1334, 4008, 2072, 3495, 1136, 2806, 2004, 3236, 1010, 2130, 3819, 1750, 3567, 644, 2515, 1794, 3636, 698, 2137, 1162, 832, 3761, 326, 2613, 513, 3302, 3820, 357, 3163, 2259, 3733, 101, 1922, 1386, 3587, 1640, 28, 1286, 2141, 1761, 2918, 693, 1639, 457, 3250, 2434, 365, 2599, 1729, 3284, 2643, 306, 2793, 689, 1090, 104, 1309, 2305, 1831,
+ 2776, 859, 2446, 2915, 1778, 3337, 2677, 614, 1508, 2409, 469, 4033, 1321, 3563, 402, 3131, 2720, 1093, 1569, 4042, 1229, 2277, 216, 3046, 1817, 57, 3006, 1684, 4059, 2016, 795, 2440, 1652, 1960, 610, 2763, 920, 3864, 3110, 1026, 2326, 3762, 3233, 521, 3856, 173, 2457, 3939, 2138, 1262, 3572, 989, 3021, 2238, 119, 1445, 3832, 1809, 2297, 3467, 2700, 3684, 3102, 394,
+ 4036, 2050, 3256, 89, 2198, 1079, 248, 1845, 3805, 3104, 880, 1779, 2688, 717, 2373, 1375, 262, 2249, 3071, 13, 2813, 3429, 1600, 3984, 2416, 3603, 1299, 2298, 998, 3492, 1393, 2951, 10, 4009, 1247, 3462, 1679, 2204, 414, 2736, 316, 1894, 2816, 1050, 3373, 1462, 3107, 817, 3464, 21, 1835, 4070, 568, 1178, 3718, 875, 3168, 466, 2974, 1458, 2084, 616, 1564, 1018,
+ 1693, 546, 1244, 3899, 716, 3160, 3608, 2877, 1220, 334, 3443, 2270, 44, 3000, 1843, 3928, 3405, 766, 3686, 2040, 587, 993, 2647, 387, 930, 2753, 630, 3274, 150, 2808, 453, 3638, 1092, 2352, 3030, 239, 2562, 700, 3240, 1257, 4016, 730, 1515, 2203, 2551, 417, 1866, 1123, 2348, 2902, 1550, 2678, 2075, 3238, 1630, 2531, 2115, 1255, 4054, 840, 290, 3874, 2477, 3399,
+ 2250, 3577, 2817, 1626, 2576, 1356, 2315, 792, 2087, 2618, 1612, 3855, 1263, 3637, 1036, 494, 1535, 2553, 1198, 1715, 3867, 3170, 1359, 1954, 3483, 1539, 2069, 3886, 1772, 2487, 1534, 2045, 3242, 806, 1578, 2018, 3948, 1423, 3596, 2076, 2466, 3424, 139, 3688, 871, 4049, 2852, 3342, 547, 3719, 327, 852, 3505, 207, 2794, 542, 3600, 45, 2411, 3324, 1788, 3012, 1235, 61,
+ 2655, 917, 253, 1986, 3738, 313, 1706, 4072, 120, 3229, 957, 597, 2024, 3262, 2453, 2857, 2002, 3190, 210, 2784, 2206, 300, 2400, 3766, 553, 3152, 218, 1150, 2988, 883, 3753, 627, 2664, 3831, 437, 3385, 1008, 2957, 60, 1636, 891, 2899, 1776, 3062, 1315, 2026, 194, 1643, 2079, 1296, 3201, 2465, 1379, 1927, 3898, 1125, 1847, 2846, 1552, 1028, 2725, 2169, 787, 3202,
+ 1441, 3982, 3032, 1052, 3251, 605, 2639, 3073, 1431, 3642, 2329, 2949, 341, 1634, 833, 129, 4020, 916, 3571, 669, 1506, 3411, 821, 2856, 1207, 2337, 2683, 3448, 340, 2214, 3128, 235, 1738, 1288, 2833, 2419, 606, 1884, 2668, 552, 3765, 1176, 399, 2302, 596, 3591, 2634, 767, 3845, 2767, 995, 3967, 491, 3057, 814, 2300, 3422, 691, 3797, 254, 3645, 509, 3478, 1836,
+ 2119, 475, 2445, 1525, 2175, 3539, 914, 1926, 473, 1157, 1800, 3971, 2701, 3739, 2129, 3486, 1333, 1784, 2366, 2982, 1070, 4089, 1802, 73, 1642, 3958, 835, 1837, 1480, 4043, 1217, 2469, 3416, 2113, 88, 3668, 1240, 3255, 3920, 2355, 3167, 2003, 2645, 3936, 3228, 1592, 1144, 3474, 2394, 79, 1820, 2241, 1594, 3656, 2584, 153, 1448, 3034, 2005, 2511, 1692, 1335, 3913, 217,
+ 2822, 3391, 745, 3813, 192, 1274, 2941, 3847, 2489, 3440, 744, 161, 1422, 1086, 572, 3004, 2617, 338, 3807, 2031, 236, 2472, 3065, 2098, 3358, 362, 2163, 3574, 497, 2788, 1970, 948, 3885, 685, 3100, 1712, 2228, 292, 1408, 1016, 164, 3537, 1417, 941, 34, 2172, 3001, 358, 1491, 3147, 699, 3356, 258, 1149, 2946, 1787, 3931, 382, 1146, 3291, 818, 2890, 2379, 1096,
+ 3679, 1328, 1901, 3162, 2747, 1730, 2253, 5, 1556, 2818, 2093, 3166, 2522, 3410, 2287, 1701, 956, 3237, 620, 1596, 3300, 1307, 511, 3701, 1020, 2939, 1362, 2532, 3208, 749, 3641, 160, 1522, 2624, 1095, 4086, 826, 2841, 3583, 2173, 1727, 723, 2925, 1911, 2482, 3726, 863, 1962, 4028, 1111, 2835, 3773, 2449, 2022, 582, 3278, 923, 2619, 2152, 4039, 92, 1934, 3145, 677,
+ 2530, 53, 2303, 1003, 458, 3989, 739, 3321, 1064, 369, 3556, 877, 1900, 426, 3876, 1, 3617, 2106, 1197, 2805, 3634, 857, 2706, 1504, 2418, 682, 3868, 20, 1139, 1688, 2333, 3311, 2907, 1945, 265, 2385, 3433, 1601, 636, 2620, 3095, 4044, 386, 3382, 1184, 527, 2814, 3414, 2342, 465, 1889, 1343, 874, 3479, 1502, 2233, 3689, 1385, 559, 2745, 1463, 3465, 376, 1718,
+ 3217, 4045, 1580, 3612, 2525, 1228, 3018, 1958, 3725, 2358, 1361, 3996, 1581, 3063, 1224, 2737, 1475, 2442, 3946, 191, 1796, 2128, 3975, 134, 1916, 3318, 1597, 2071, 3749, 2672, 403, 1278, 602, 3745, 3220, 1374, 445, 2064, 3830, 243, 1252, 2390, 1563, 2724, 3875, 1818, 1346, 165, 1650, 3264, 2680, 117, 2998, 4081, 343, 2799, 9, 3122, 1743, 3724, 1040, 2231, 3842, 1209,
+ 900, 398, 2851, 697, 1797, 3482, 293, 2679, 1649, 566, 2954, 91, 2697, 714, 2060, 3211, 781, 480, 3040, 1038, 2611, 666, 2989, 3458, 1201, 2796, 548, 2975, 839, 3121, 1850, 4001, 2208, 1631, 790, 2558, 2972, 1148, 3213, 1849, 3624, 971, 2102, 108, 772, 3101, 2589, 3777, 1042, 656, 3907, 2097, 1615, 2540, 805, 1935, 1231, 3494, 2451, 268, 2995, 750, 2682, 2020,
+ 3024, 1392, 2124, 3279, 106, 2217, 1387, 822, 3214, 3825, 2160, 1000, 2395, 3691, 228, 4038, 1872, 3413, 1608, 2225, 3536, 303, 1653, 886, 2541, 224, 4037, 2252, 1428, 172, 3504, 958, 2848, 113, 3628, 1834, 3979, 19, 2317, 779, 2797, 518, 3174, 3549, 1482, 2266, 444, 2014, 3555, 2439, 1213, 3113, 535, 1135, 3204, 3858, 2309, 931, 623, 2009, 3359, 1566, 140, 3550,
+ 1808, 3872, 2488, 1152, 3764, 2892, 3960, 2412, 353, 1223, 1825, 3444, 3116, 1717, 1082, 2313, 1280, 2661, 82, 3852, 1389, 3200, 2330, 3812, 2038, 3581, 1728, 1039, 3339, 2427, 586, 2580, 1238, 3328, 2280, 1047, 595, 2662, 1363, 3338, 1620, 3934, 2497, 1881, 1054, 3954, 3215, 864, 2887, 1801, 320, 3519, 2378, 3704, 1753, 424, 2958, 1660, 4005, 2601, 1116, 3912, 2381, 573,
+ 2740, 200, 828, 1667, 432, 1931, 1035, 1616, 3598, 2640, 728, 264, 1437, 557, 3501, 2966, 372, 3734, 974, 1978, 758, 2719, 1145, 452, 1433, 725, 2681, 408, 3843, 1918, 1547, 3906, 1996, 503, 1456, 3019, 3493, 1700, 3742, 355, 2134, 176, 1311, 615, 2867, 315, 1680, 1314, 8, 3297, 1494, 783, 1950, 83, 2656, 1382, 3561, 138, 2834, 1404, 330, 1904, 3156, 1027,
+ 1357, 3381, 3041, 3666, 2729, 734, 3415, 177, 3051, 2021, 4079, 2823, 3775, 2186, 2616, 869, 1668, 3148, 2367, 3315, 393, 4075, 1870, 2920, 3343, 2362, 3188, 1303, 2782, 825, 3171, 259, 2905, 3717, 2538, 184, 2074, 838, 2860, 2407, 1024, 3496, 3008, 3706, 1985, 2349, 3623, 2582, 4058, 2184, 2694, 3873, 2964, 990, 3346, 690, 2033, 1066, 2201, 3490, 2971, 718, 3700, 2188,
+ 4061, 391, 1989, 2325, 1430, 3150, 2125, 2526, 592, 1403, 976, 2351, 1165, 1851, 114, 3921, 2063, 613, 1358, 2785, 1623, 2254, 25, 3542, 1045, 246, 1852, 3554, 87, 2243, 3615, 1169, 727, 1705, 968, 3957, 3185, 1251, 500, 4063, 1751, 2622, 842, 1519, 90, 3393, 819, 490, 1874, 999, 571, 1275, 2271, 1586, 4040, 2448, 3126, 3731, 436, 885, 1708, 2421, 24, 1599,
+ 889, 2563, 1199, 645, 70, 4013, 1237, 3723, 1694, 3499, 3, 3266, 484, 2997, 3390, 1233, 2842, 3687, 152, 3480, 1084, 3698, 881, 2490, 1542, 3992, 2209, 692, 1690, 3022, 1470, 2625, 2114, 3512, 2359, 381, 2684, 1897, 3368, 1395, 3080, 289, 2065, 3981, 2758, 1141, 3097, 1472, 2870, 3352, 3707, 225, 3159, 505, 1895, 214, 1222, 1774, 2686, 3978, 3275, 1196, 3518, 2825,
+ 3270, 1720, 3796, 3466, 2650, 1841, 298, 899, 2862, 2091, 2671, 1744, 3735, 801, 1560, 349, 2262, 903, 1833, 2524, 512, 3117, 1793, 2827, 476, 3038, 1216, 2550, 3826, 980, 431, 4048, 35, 2992, 1265, 1595, 765, 3675, 76, 2247, 696, 3456, 1254, 2452, 664, 1757, 2133, 3750, 145, 2332, 1554, 1981, 3580, 2712, 868, 3640, 2919, 638, 2275, 1427, 309, 2595, 2006, 492,
+ 2226, 178, 2911, 836, 1528, 3028, 2240, 3327, 404, 3970, 707, 1294, 2464, 2131, 4032, 2600, 3319, 1406, 2913, 3974, 2156, 1425, 221, 3877, 2017, 811, 3662, 272, 3287, 1988, 2408, 3357, 1746, 598, 3239, 3823, 2182, 2934, 1078, 2604, 3840, 1697, 2906, 413, 3210, 3880, 331, 2644, 1260, 848, 3042, 2535, 1077, 1438, 3261, 2365, 1561, 3799, 85, 3082, 1876, 674, 3932, 1101,
+ 3644, 1344, 1943, 2401, 390, 3835, 1048, 2572, 1541, 1133, 3075, 3584, 308, 2889, 1065, 1869, 601, 3783, 282, 1181, 736, 3312, 2368, 1126, 3383, 1675, 2734, 1426, 628, 2873, 1317, 843, 2717, 2048, 1004, 2536, 333, 1782, 3295, 1517, 219, 2153, 815, 3502, 1579, 2268, 987, 3409, 1780, 4018, 354, 665, 3914, 47, 1956, 456, 1006, 2010, 3406, 1130, 3621, 2894, 1549, 3092,
+ 2485, 640, 3993, 3179, 1270, 3436, 585, 1925, 3757, 2304, 136, 1976, 1486, 646, 3520, 50, 3155, 1637, 2435, 3522, 1937, 2756, 3748, 661, 2224, 58, 3230, 2357, 1830, 3892, 170, 3607, 1447, 3949, 190, 3392, 1336, 584, 4010, 918, 3016, 3670, 1155, 2406, 52, 1304, 3009, 607, 2085, 2699, 3205, 1848, 2291, 3402, 2764, 3865, 3048, 2508, 735, 2710, 443, 2341, 897, 263,
+ 1785, 2769, 983, 56, 2197, 1685, 2703, 202, 2944, 810, 3377, 2626, 3787, 3047, 2055, 1236, 2752, 2122, 945, 3093, 96, 1624, 439, 3014, 1388, 4015, 977, 448, 3506, 1098, 2242, 3026, 506, 2361, 2952, 1862, 3619, 2790, 1992, 2483, 525, 1868, 2652, 4093, 1998, 3595, 2478, 3816, 122, 1412, 929, 3716, 1166, 1648, 813, 1300, 199, 1489, 3998, 1771, 1310, 3808, 2052, 3423,
+ 434, 3712, 1625, 3558, 2955, 853, 4019, 1348, 3511, 1732, 1246, 487, 934, 1672, 2510, 3965, 788, 3711, 396, 1369, 4090, 1055, 2603, 1879, 3528, 2518, 2067, 3005, 1516, 2588, 751, 1740, 3418, 1131, 1576, 686, 2296, 1118, 18, 3263, 1365, 3401, 294, 737, 3177, 410, 867, 1633, 2963, 3579, 2375, 252, 2881, 479, 2471, 3576, 2180, 3306, 332, 2255, 3035, 41, 2648, 1396,
+ 2929, 2230, 1219, 2512, 446, 2008, 3189, 2388, 626, 2164, 2831, 4047, 2376, 174, 3272, 368, 1469, 3226, 2578, 1991, 2874, 2263, 3681, 876, 188, 1239, 683, 3776, 226, 3183, 4083, 2148, 63, 2649, 3859, 299, 3086, 3933, 1585, 2185, 3767, 988, 1707, 2908, 1407, 1844, 2771, 2245, 1161, 560, 1755, 3376, 2051, 4064, 3135, 1832, 652, 2853, 1051, 3649, 760, 3290, 1105, 3945,
+ 872, 154, 3207, 713, 3780, 1453, 281, 1087, 3695, 30, 3299, 1919, 1400, 3551, 1119, 1890, 2314, 618, 1703, 3428, 724, 295, 3146, 1557, 3341, 2896, 1683, 2723, 1974, 1017, 541, 1380, 3720, 804, 3280, 2082, 997, 2567, 777, 2961, 213, 2707, 2328, 3632, 1025, 3891, 3304, 255, 4003, 3108, 2587, 1323, 743, 1479, 105, 1013, 3901, 1618, 2044, 2627, 1465, 1846, 576, 1994,
+ 2560, 3521, 1742, 2118, 2800, 3404, 1783, 2609, 2968, 1582, 1022, 412, 2713, 687, 2976, 3857, 2761, 3620, 62, 1108, 3844, 1340, 2100, 540, 2345, 3925, 405, 3457, 1319, 2468, 3362, 2815, 1867, 2372, 1281, 1714, 3690, 482, 3498, 1842, 1285, 3994, 558, 2039, 81, 2499, 678, 1481, 1923, 964, 12, 3824, 2980, 2205, 2762, 3432, 2398, 181, 3247, 462, 4094, 2350, 3589, 3089,
+ 1555, 1094, 4041, 247, 1267, 908, 3959, 2041, 732, 3860, 2343, 3132, 3769, 2144, 1621, 237, 912, 1329, 3025, 2146, 2642, 1775, 3721, 2746, 1121, 1953, 902, 2285, 130, 3671, 1659, 278, 3153, 522, 2721, 123, 2996, 1466, 2380, 377, 3231, 873, 1510, 3476, 3123, 1250, 2147, 3650, 2839, 3451, 2323, 1122, 3545, 379, 1765, 1218, 603, 3768, 1360, 938, 2885, 133, 1245, 363,
+ 2364, 554, 2743, 3344, 2474, 530, 3112, 169, 1297, 3430, 536, 1741, 98, 1043, 2574, 3253, 2246, 1854, 4022, 510, 3283, 204, 858, 3398, 36, 3118, 1478, 3794, 2986, 706, 2176, 922, 3559, 1097, 3976, 3322, 2149, 1160, 2810, 3883, 2007, 2513, 2953, 328, 1721, 3793, 422, 2566, 807, 329, 1638, 1967, 648, 2520, 3727, 3109, 2116, 2927, 2491, 1939, 3365, 1709, 2728, 3815,
+ 2037, 3120, 831, 1405, 1896, 3592, 1622, 2369, 2864, 2151, 1107, 2542, 3532, 1410, 3917, 427, 3568, 709, 2509, 1503, 1037, 2973, 2436, 1604, 4035, 2594, 563, 1819, 2659, 1234, 4004, 2565, 1511, 2273, 1823, 336, 882, 3772, 575, 1628, 171, 3570, 1120, 2260, 2716, 935, 3064, 1806, 1342, 3144, 3900, 2744, 3296, 985, 1546, 238, 896, 1663, 305, 3660, 695, 2213, 960, 3407,
+ 144, 1795, 3894, 2267, 51, 2708, 1023, 3818, 366, 1821, 4087, 2985, 755, 2057, 2912, 949, 1583, 2774, 231, 3447, 2258, 3866, 1982, 672, 1225, 2077, 3320, 1062, 370, 3241, 1968, 7, 3068, 681, 3631, 2573, 1567, 3175, 2321, 1067, 3070, 722, 1856, 3744, 642, 1471, 4084, 131, 3514, 2443, 531, 1227, 155, 2265, 4024, 2658, 3326, 3910, 1168, 3078, 1530, 3956, 489, 1424,
+ 3647, 1203, 420, 2924, 3755, 719, 3248, 1376, 3067, 890, 196, 1559, 3269, 270, 2432, 1885, 3212, 1164, 3778, 1752, 579, 1338, 344, 3585, 3017, 288, 3658, 2371, 3882, 1691, 611, 2789, 3809, 1339, 389, 2950, 2015, 59, 3548, 2751, 2158, 4011, 1352, 29, 3388, 2370, 2812, 1946, 954, 2110, 1558, 2947, 3573, 1909, 1326, 679, 1853, 2312, 551, 2702, 33, 2414, 3209, 2824,
+ 2547, 2143, 3379, 966, 1492, 1979, 2479, 463, 2194, 3657, 2738, 2318, 1261, 3713, 604, 4002, 11, 2192, 2967, 919, 2607, 3369, 2837, 1676, 2539, 984, 1568, 93, 2901, 1318, 3538, 1041, 2216, 1756, 3454, 1030, 4050, 1402, 798, 1723, 311, 3277, 2546, 2886, 2043, 461, 1206, 3677, 361, 3260, 3988, 809, 2605, 470, 3007, 3517, 102, 3221, 1398, 2062, 3611, 1134, 1928, 865,
+ 4060, 621, 1710, 2606, 3510, 317, 4017, 1682, 3329, 1159, 1940, 654, 3461, 1789, 1015, 2691, 1455, 3599, 374, 1947, 4069, 71, 2126, 763, 3961, 2278, 3161, 1997, 824, 2623, 2080, 244, 3257, 780, 2732, 2308, 545, 3351, 2476, 3806, 1204, 588, 1591, 963, 3610, 1699, 754, 3049, 2651, 1106, 65, 2221, 1644, 3821, 1100, 2463, 1614, 3801, 965, 2965, 715, 3394, 1593, 212,
+};
+
+#endif /* BLUE_NOISE_64X64_H */
diff --git a/pixman/dither/make-blue-noise.c b/pixman/dither/make-blue-noise.c
new file mode 100644
index 0000000..f9974b4
--- /dev/null
+++ b/pixman/dither/make-blue-noise.c
@@ -0,0 +1,679 @@
+/* Blue noise generation using the void-and-cluster method as described in
+ *
+ * The void-and-cluster method for dither array generation
+ * Ulichney, Robert A (1993)
+ *
+ * http://cv.ulichney.com/papers/1993-void-cluster.pdf
+ *
+ * Note that running with OpenMP (-DUSE_OPENMP) makes the output
+ * non-deterministic from run to run: reductions computed in parallel break
+ * ties between equally good candidates in an unspecified order.  It is not
+ * recommended unless generating very large dither arrays.
+ */
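+
+/* In outline, the code below:
+ *
+ *   1. Fills a binary pattern with sparse white noise, then relaxes it by
+ *      repeatedly swapping the tightest cluster of set pixels with the
+ *      largest void (generate_initial_binary_pattern).
+ *   2. Phase 1: removes pixels one by one from the tightest remaining
+ *      cluster, assigning decreasing ranks.
+ *   3. Phases 2 & 3: re-inserts pixels into the largest remaining void,
+ *      assigning increasing ranks until the array is full.
+ *
+ * The final rank of each cell is its dither threshold.
+ */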
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <math.h>
+#include <stdio.h>
+
+/* Booleans and utility functions */
+
+#ifndef TRUE
+# define TRUE 1
+#endif
+
+#ifndef FALSE
+# define FALSE 0
+#endif
+
+typedef int bool_t;
+
+int
+imin (int x, int y)
+{
+ return x < y ? x : y;
+}
+
+/* Memory allocation */
+void *
+malloc_abc (unsigned int a, unsigned int b, unsigned int c)
+{
+ if (a >= INT32_MAX / b)
+ return NULL;
+ else if (a * b >= INT32_MAX / c)
+ return NULL;
+ else
+ return malloc (a * b * c);
+}
+
+/* Random number generation */
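+/*
+ * The generator is Marsaglia's xorwow: four words of xorshift state plus a
+ * Weyl-style counter that is bumped by 362437 on every draw.  xorwow_float
+ * keeps the top 23 bits of a draw and scales them to a float in [0, 1].
+ */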
+typedef uint32_t xorwow_state_t[5];
+
+uint32_t
+xorwow_next (xorwow_state_t *state)
+{
+ uint32_t s = (*state)[0],
+ t = (*state)[3];
+ (*state)[3] = (*state)[2];
+ (*state)[2] = (*state)[1];
+ (*state)[1] = s;
+
+ t ^= t >> 2;
+ t ^= t << 1;
+ t ^= s ^ (s << 4);
+
+ (*state)[0] = t;
+ (*state)[4] += 362437;
+
+ return t + (*state)[4];
+}
+
+float
+xorwow_float (xorwow_state_t *s)
+{
+ return (xorwow_next (s) >> 9) / (float)((1 << 23) - 1);
+}
+
+/* Floating point matrices
+ *
+ * Used to cache the cluster sizes.
+ */
+typedef struct matrix_t {
+ int width;
+ int height;
+ float *buffer;
+} matrix_t;
+
+bool_t
+matrix_init (matrix_t *matrix, int width, int height)
+{
+ float *buffer;
+
+ if (!matrix)
+ return FALSE;
+
+ buffer = malloc_abc (width, height, sizeof (float));
+
+ if (!buffer)
+ return FALSE;
+
+ matrix->buffer = buffer;
+ matrix->width = width;
+ matrix->height = height;
+
+ return TRUE;
+}
+
+bool_t
+matrix_copy (matrix_t *dst, matrix_t const *src)
+{
+ float *srcbuf = src->buffer,
+ *srcend = src->buffer + src->width * src->height,
+ *dstbuf = dst->buffer;
+
+ if (dst->width != src->width || dst->height != src->height)
+ return FALSE;
+
+ while (srcbuf < srcend)
+ *dstbuf++ = *srcbuf++;
+
+ return TRUE;
+}
+
+float *
+matrix_get (matrix_t *matrix, int x, int y)
+{
+ return &matrix->buffer[y * matrix->width + x];
+}
+
+void
+matrix_destroy (matrix_t *matrix)
+{
+ free (matrix->buffer);
+}
+
+/* Binary patterns */
+typedef struct pattern_t {
+ int width;
+ int height;
+ bool_t *buffer;
+} pattern_t;
+
+bool_t
+pattern_init (pattern_t *pattern, int width, int height)
+{
+ bool_t *buffer;
+
+ if (!pattern)
+ return FALSE;
+
+ buffer = malloc_abc (width, height, sizeof (bool_t));
+
+ if (!buffer)
+ return FALSE;
+
+ pattern->buffer = buffer;
+ pattern->width = width;
+ pattern->height = height;
+
+ return TRUE;
+}
+
+bool_t
+pattern_copy (pattern_t *dst, pattern_t const *src)
+{
+ bool_t *srcbuf = src->buffer,
+ *srcend = src->buffer + src->width * src->height,
+ *dstbuf = dst->buffer;
+
+ if (dst->width != src->width || dst->height != src->height)
+ return FALSE;
+
+ while (srcbuf < srcend)
+ *dstbuf++ = *srcbuf++;
+
+ return TRUE;
+}
+
+bool_t *
+pattern_get (pattern_t *pattern, int x, int y)
+{
+ return &pattern->buffer[y * pattern->width + x];
+}
+
+void
+pattern_fill_white_noise (pattern_t *pattern, float fraction,
+ xorwow_state_t *s)
+{
+ bool_t *buffer = pattern->buffer;
+ bool_t *end = buffer + (pattern->width * pattern->height);
+
+ while (buffer < end)
+ *buffer++ = xorwow_float (s) < fraction;
+}
+
+void
+pattern_destroy (pattern_t *pattern)
+{
+ free (pattern->buffer);
+}
+
+/* Dither arrays */
+typedef struct array_t {
+ int width;
+ int height;
+ uint32_t *buffer;
+} array_t;
+
+bool_t
+array_init (array_t *array, int width, int height)
+{
+ uint32_t *buffer;
+
+ if (!array)
+ return FALSE;
+
+ buffer = malloc_abc (width, height, sizeof (uint32_t));
+
+ if (!buffer)
+ return FALSE;
+
+ array->buffer = buffer;
+ array->width = width;
+ array->height = height;
+
+ return TRUE;
+}
+
+uint32_t *
+array_get (array_t *array, int x, int y)
+{
+ return &array->buffer[y * array->width + x];
+}
+
+bool_t
+array_save_ppm (array_t *array, const char *filename)
+{
+ FILE *f = fopen(filename, "wb");
+
+ int i = 0;
+ int bpp = 2;
+ uint8_t buffer[1024];
+
+ if (!f)
+ return FALSE;
+
+ if (array->width * array->height - 1 < 256)
+ bpp = 1;
+
+ fprintf(f, "P5 %d %d %d\n", array->width, array->height,
+ array->width * array->height - 1);
+ while (i < array->width * array->height)
+ {
+ int j = 0;
+ for (; j < 1024 / bpp && i + j < array->width * array->height; ++j)
+ {
+ uint32_t v = array->buffer[i + j];
+ if (bpp == 2)
+ {
+ /* P5 with maxval > 255 stores each sample most significant byte first */
+ buffer[2 * j] = (v & 0xff00) >> 8;
+ buffer[2 * j + 1] = v & 0xff;
+ } else {
+ buffer[j] = v;
+ }
+ }
+
+ fwrite((void *)buffer, bpp, j, f);
+ i += j;
+ }
+
+ if (fclose(f) != 0)
+ return FALSE;
+
+ return TRUE;
+}
+
+bool_t
+array_save (array_t *array, const char *filename)
+{
+ int x, y;
+ FILE *f = fopen(filename, "wb");
+
+ if (!f)
+ return FALSE;
+
+ fprintf (f,
+"/* WARNING: This file is generated by make-blue-noise.c\n"
+" * Please edit that file instead of this one.\n"
+" */\n"
+"\n"
+"#ifndef BLUE_NOISE_%dX%d_H\n"
+"#define BLUE_NOISE_%dX%d_H\n"
+"\n"
+"#include <stdint.h>\n"
+"\n", array->width, array->height, array->width, array->height);
+
+ fprintf (f, "static const uint16_t dither_blue_noise_%dx%d[%d] = {\n",
+ array->width, array->height, array->width * array->height);
+
+ for (y = 0; y < array->height; ++y)
+ {
+ fprintf (f, " ");
+ for (x = 0; x < array->width; ++x)
+ {
+ if (x != 0)
+ fprintf (f, ", ");
+
+ fprintf (f, "%d", *array_get (array, x, y));
+ }
+
+ fprintf (f, ",\n");
+ }
+ fprintf (f, "};\n");
+
+ fprintf (f, "\n#endif /* BLUE_NOISE_%dX%d_H */\n",
+ array->width, array->height);
+
+ if (fclose(f) != 0)
+ return FALSE;
+
+ return TRUE;
+}
+
+void
+array_destroy (array_t *array)
+{
+ free (array->buffer);
+}
+
+/* Dither array generation */
+bool_t
+compute_cluster_sizes (pattern_t *pattern, matrix_t *matrix)
+{
+ int width = pattern->width,
+ height = pattern->height;
+
+ if (matrix->width != width || matrix->height != height)
+ return FALSE;
+
+ int px, py, qx, qy, dx, dy;
+ float tsqsi = 2.f * 1.5f * 1.5f;
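+
+ /* Each matrix cell caches its pixel's "cluster energy":
+  *
+  *     E(p) = sum over q with pattern(q) == pattern(p) of
+  *            exp (-d(p,q)^2 / (2 * sigma^2)),    sigma = 1.5
+  *
+  * where distances wrap around the pattern edges (the imin () terms
+  * below); tsqsi is 2 * sigma^2.
+  */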
+
+#ifdef USE_OPENMP
+#pragma omp parallel for default (none) \
+ private (py, px, qy, qx, dx, dy) \
+ shared (height, width, pattern, matrix, tsqsi)
+#endif
+ for (py = 0; py < height; ++py)
+ {
+ for (px = 0; px < width; ++px)
+ {
+ bool_t pixel = *pattern_get (pattern, px, py);
+ float dist = 0.f;
+
+ for (qx = 0; qx < width; ++qx)
+ {
+ dx = imin (abs (qx - px), width - abs (qx - px));
+ dx = dx * dx;
+
+ for (qy = 0; qy < height; ++qy)
+ {
+ dy = imin (abs (qy - py), height - abs (qy - py));
+ dy = dy * dy;
+
+ dist += (pixel == *pattern_get (pattern, qx, qy))
+ * expf (- (dx + dy) / tsqsi);
+ }
+ }
+
+ *matrix_get (matrix, px, py) = dist;
+ }
+ }
+
+ return TRUE;
+}
+
+bool_t
+swap_pixel (pattern_t *pattern, matrix_t *matrix, int x, int y)
+{
+ int width = pattern->width,
+ height = pattern->height;
+
+ bool_t new;
+
+ float f,
+ dist = 0.f,
+ tsqsi = 2.f * 1.5f * 1.5f;
+
+ int px, py, dx, dy;
+ bool_t b;
+
+ new = !*pattern_get (pattern, x, y);
+ *pattern_get (pattern, x, y) = new;
+
+ if (matrix->width != width || matrix->height != height)
+ return FALSE;
+
+
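+ /* Update the cached energies incrementally rather than recomputing them:
+  * flipping (x, y) to "new" adds f = exp (-d^2 / (2 * sigma^2)) to every
+  * cell that now matches the flipped pixel and subtracts f from every cell
+  * that no longer does -- the (2 * b - 1) * f term.  The flipped cell's
+  * own energy is rebuilt from scratch in "dist".
+  */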
+#ifdef USE_OPENMP
+#pragma omp parallel for reduction (+:dist) default (none) \
+ private (px, py, dx, dy, b, f) \
+ shared (x, y, width, height, pattern, matrix, new, tsqsi)
+#endif
+ for (py = 0; py < height; ++py)
+ {
+ dy = imin (abs (py - y), height - abs (py - y));
+ dy = dy * dy;
+
+ for (px = 0; px < width; ++px)
+ {
+ dx = imin (abs (px - x), width - abs (px - x));
+ dx = dx * dx;
+
+ b = (*pattern_get (pattern, px, py) == new);
+ f = expf (- (dx + dy) / tsqsi);
+ *matrix_get (matrix, px, py) += (2 * b - 1) * f;
+
+ dist += b * f;
+ }
+ }
+
+ *matrix_get (matrix, x, y) = dist;
+ return TRUE;
+}
+
+void
+largest_cluster (pattern_t *pattern, matrix_t *matrix,
+ bool_t pixel, int *xmax, int *ymax)
+{
+ int width = pattern->width,
+ height = pattern->height;
+
+ int x, y;
+
+ float vmax = -INFINITY;
+
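+ /* Under OpenMP each thread keeps a local best candidate; after the max
+  * reduction, any thread whose local best equals the global vmax publishes
+  * its coordinates.  When several cells tie, the winner depends on thread
+  * scheduling -- the nondeterminism noted at the top of this file.
+  */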
+#ifdef USE_OPENMP
+#pragma omp parallel default (none) \
+ private (x, y) \
+ shared (height, width, pattern, matrix, pixel, xmax, ymax, vmax)
+#endif
+ {
+ int xbest = -1,
+ ybest = -1;
+
+#ifdef USE_OPENMP
+ float vbest = -INFINITY;
+
+#pragma omp for reduction (max: vmax) collapse (2)
+#endif
+ for (y = 0; y < height; ++y)
+ {
+ for (x = 0; x < width; ++x)
+ {
+ if (*pattern_get (pattern, x, y) != pixel)
+ continue;
+
+ if (*matrix_get (matrix, x, y) > vmax)
+ {
+ vmax = *matrix_get (matrix, x, y);
+#ifdef USE_OPENMP
+ vbest = vmax;
+#endif
+ xbest = x;
+ ybest = y;
+ }
+ }
+ }
+
+#ifdef USE_OPENMP
+#pragma omp barrier
+#pragma omp critical
+ {
+ if (vmax == vbest)
+ {
+ *xmax = xbest;
+ *ymax = ybest;
+ }
+ }
+#else
+ *xmax = xbest;
+ *ymax = ybest;
+#endif
+ }
+
+ assert (vmax > -INFINITY);
+}
+
+void
+generate_initial_binary_pattern (pattern_t *pattern, matrix_t *matrix)
+{
+ int xcluster = 0,
+ ycluster = 0,
+ xvoid = 0,
+ yvoid = 0;
+
+ for (;;)
+ {
+ largest_cluster (pattern, matrix, TRUE, &xcluster, &ycluster);
+ assert (*pattern_get (pattern, xcluster, ycluster) == TRUE);
+ swap_pixel (pattern, matrix, xcluster, ycluster);
+
+ largest_cluster (pattern, matrix, FALSE, &xvoid, &yvoid);
+ assert (*pattern_get (pattern, xvoid, yvoid) == FALSE);
+ swap_pixel (pattern, matrix, xvoid, yvoid);
+
+ if (xcluster == xvoid && ycluster == yvoid)
+ return;
+ }
+}
+
+bool_t
+generate_dither_array (array_t *array,
+ pattern_t const *prototype, matrix_t const *matrix,
+ pattern_t *temp_pattern, matrix_t *temp_matrix)
+{
+ int width = prototype->width,
+ height = prototype->height;
+
+ int x, y, rank;
+
+ int initial_rank = 0;
+
+ if (array->width != width || array->height != height)
+ return FALSE;
+
+ // Make copies of the prototype and associated sizes matrix since we will
+ // trash them
+ if (!pattern_copy (temp_pattern, prototype))
+ return FALSE;
+
+ if (!matrix_copy (temp_matrix, matrix))
+ return FALSE;
+
+ // Compute initial rank
+ for (y = 0; y < height; ++y)
+ {
+ for (x = 0; x < width; ++x)
+ {
+ if (*pattern_get (temp_pattern, x, y))
+ initial_rank += 1;
+
+ *array_get (array, x, y) = 0;
+ }
+ }
+
+ // Phase 1
+ for (rank = initial_rank; rank > 0; --rank)
+ {
+ largest_cluster (temp_pattern, temp_matrix, TRUE, &x, &y);
+ swap_pixel (temp_pattern, temp_matrix, x, y);
+ *array_get (array, x, y) = rank - 1;
+ }
+
+ // Make copies again for phases 2 & 3
+ if (!pattern_copy (temp_pattern, prototype))
+ return FALSE;
+
+ if (!matrix_copy (temp_matrix, matrix))
+ return FALSE;
+
+ // Phase 2 & 3
+ for (rank = initial_rank; rank < width * height; ++rank)
+ {
+ largest_cluster (temp_pattern, temp_matrix, FALSE, &x, &y);
+ swap_pixel (temp_pattern, temp_matrix, x, y);
+ *array_get (array, x, y) = rank;
+ }
+
+ return TRUE;
+}
+
+bool_t
+generate (int size, xorwow_state_t *s,
+ char const *c_filename, char const *ppm_filename)
+{
+ bool_t ok = TRUE;
+
+ pattern_t prototype, temp_pattern;
+ array_t array;
+ matrix_t matrix, temp_matrix;
+
+ printf ("Generating %dx%d blue noise...\n", size, size);
+
+ if (!pattern_init (&prototype, size, size))
+ return FALSE;
+
+ if (!pattern_init (&temp_pattern, size, size))
+ {
+ pattern_destroy (&prototype);
+ return FALSE;
+ }
+
+ if (!matrix_init (&matrix, size, size))
+ {
+ pattern_destroy (&temp_pattern);
+ pattern_destroy (&prototype);
+ return FALSE;
+ }
+
+ if (!matrix_init (&temp_matrix, size, size))
+ {
+ matrix_destroy (&matrix);
+ pattern_destroy (&temp_pattern);
+ pattern_destroy (&prototype);
+ return FALSE;
+ }
+
+ if (!array_init (&array, size, size))
+ {
+ matrix_destroy (&temp_matrix);
+ matrix_destroy (&matrix);
+ pattern_destroy (&temp_pattern);
+ pattern_destroy (&prototype);
+ return FALSE;
+ }
+
+ printf("Filling initial binary pattern with white noise...\n");
+ pattern_fill_white_noise (&prototype, .1, s);
+
+ printf("Initializing cluster sizes...\n");
+ if (!compute_cluster_sizes (&prototype, &matrix))
+ {
+ fprintf (stderr, "Error while computing cluster sizes\n");
+ ok = FALSE;
+ goto out;
+ }
+
+ printf("Generating initial binary pattern...\n");
+ generate_initial_binary_pattern (&prototype, &matrix);
+
+ printf("Generating dither array...\n");
+ if (!generate_dither_array (&array, &prototype, &matrix,
+ &temp_pattern, &temp_matrix))
+ {
+ fprintf (stderr, "Error while generating dither array\n");
+ ok = FALSE;
+ goto out;
+ }
+
+ printf("Saving dither array...\n");
+ if (!array_save (&array, c_filename))
+ {
+ fprintf (stderr, "Error saving dither array\n");
+ ok = FALSE;
+ goto out;
+ }
+
+#if SAVE_PPM
+ if (!array_save_ppm (&array, ppm_filename))
+ {
+ fprintf (stderr, "Error saving dither array PPM\n");
+ ok = FALSE;
+ goto out;
+ }
+#else
+ (void)ppm_filename;
+#endif
+
+ printf("All done!\n");
+
+out:
+ array_destroy (&array);
+ matrix_destroy (&temp_matrix);
+ matrix_destroy (&matrix);
+ pattern_destroy (&temp_pattern);
+ pattern_destroy (&prototype);
+ return ok;
+}
+
+int
+main (void)
+{
+ xorwow_state_t s = {1185956906, 12385940, 983948, 349208051, 901842};
+
+ if (!generate (64, &s, "blue-noise-64x64.h", "blue-noise-64x64.ppm"))
+ return -1;
+
+ return 0;
+}
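+
+/* A minimal sketch of how a table like this is typically consumed for
+ * ordered dithering (illustrative only; pixman's own dither code is not
+ * part of this file):
+ *
+ *     threshold = (dither_blue_noise_64x64[(y & 63) * 64 + (x & 63)] + 0.5)
+ *                 / 4096.0;
+ *     quantized = (int) (value * (levels - 1) + threshold);
+ */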
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 086c6e0..0e79e86 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -209,12 +209,13 @@ _mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
: "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
);
return ret;
+ } else {
+ uint64_t val = ((uint64_t)__w3 << 48)
+ | ((uint64_t)__w2 << 32)
+ | ((uint64_t)__w1 << 16)
+ | ((uint64_t)__w0 << 0);
+ return *(__m64 *)&val;
}
- uint64_t val = ((uint64_t)__w3 << 48)
- | ((uint64_t)__w2 << 32)
- | ((uint64_t)__w1 << 16)
- | ((uint64_t)__w0 << 0);
- return *(__m64 *)&val;
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -236,10 +237,11 @@ _mm_set_pi32 (unsigned __i1, unsigned __i0)
: "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
);
return ret;
+ } else {
+ uint64_t val = ((uint64_t)__i1 << 32)
+ | ((uint64_t)__i0 << 0);
+ return *(__m64 *)&val;
}
- uint64_t val = ((uint64_t)__i1 << 32)
- | ((uint64_t)__i0 << 0);
- return *(__m64 *)&val;
}
#undef _MM_SHUFFLE
diff --git a/pixman/make-srgb.pl b/pixman/make-srgb.pl
index cdaa80b..8bba160 100644
--- a/pixman/make-srgb.pl
+++ b/pixman/make-srgb.pl
@@ -73,7 +73,7 @@ print <<"PROLOG";
#include <stdint.h>
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/pixman/meson.build b/pixman/meson.build
new file mode 100644
index 0000000..62ec66b
--- /dev/null
+++ b/pixman/meson.build
@@ -0,0 +1,143 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+config_h = configure_file(
+ configuration : config,
+ output : 'pixman-config.h'
+)
+
+version_h = configure_file(
+ configuration : version_conf,
+ input : 'pixman-version.h.in',
+ output : 'pixman-version.h',
+ install_dir : join_paths(get_option('prefix'), get_option('includedir'), 'pixman-1')
+)
+
+libpixman_extra_cargs = []
+default_library = get_option('default_library')
+if default_library != 'static' and cc.has_function_attribute('dllexport')
+ libpixman_extra_cargs = ['-DPIXMAN_API=__declspec(dllexport)']
+endif
+
+pixman_simd_libs = []
+simds = [
+ # The mmx library can be compiled with MMX on x86/x86_64, with iwmmxt on
+ # some ARM cores, or with Loongson MMI on Loongson MIPS systems.  All
+ # three variants are built under the same name, "pixman-mmx", which keeps
+ # the build logic simple; no system supports more than one of MMX, iwmmxt,
+ # and MMI, so at most one variant can be built in any given build.
+ ['mmx', have_mmx, mmx_flags, []],
+ ['mmx', have_loongson_mmi, loongson_mmi_flags, []],
+ ['mmx', have_iwmmxt, iwmmxt_flags, []],
+
+ ['sse2', have_sse2, sse2_flags, []],
+ ['ssse3', have_ssse3, ssse3_flags, []],
+ ['vmx', have_vmx, vmx_flags, []],
+ ['arm-simd', have_armv6_simd, [],
+ ['pixman-arm-simd-asm.S', 'pixman-arm-simd-asm-scaled.S']],
+ ['arm-neon', have_neon, [],
+ ['pixman-arm-neon-asm.S', 'pixman-arm-neon-asm-bilinear.S']],
+ ['arm-neon', have_a64neon, [],
+ ['pixman-arma64-neon-asm.S', 'pixman-arma64-neon-asm-bilinear.S']],
+ ['mips-dspr2', have_mips_dspr2, mips_dspr2_flags,
+ ['pixman-mips-dspr2-asm.S', 'pixman-mips-memcpy-asm.S']],
+]
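+ # Each entry is [name, enabled, extra c_args, extra assembly sources]; the
+ # loop below builds one static helper library per enabled entry, and all of
+ # them are linked into libpixman-1 further down.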
+
+foreach simd : simds
+ if simd[1]
+ name = 'pixman-' + simd[0]
+ pixman_simd_libs += static_library(
+ name,
+ [name + '.c', config_h, version_h, simd[3]],
+ c_args : simd[2]
+ )
+ endif
+endforeach
+
+pixman_files = files(
+ 'pixman.c',
+ 'pixman-access.c',
+ 'pixman-access-accessors.c',
+ 'pixman-bits-image.c',
+ 'pixman-combine32.c',
+ 'pixman-combine-float.c',
+ 'pixman-conical-gradient.c',
+ 'pixman-filter.c',
+ 'pixman-x86.c',
+ 'pixman-mips.c',
+ 'pixman-arm.c',
+ 'pixman-ppc.c',
+ 'pixman-edge.c',
+ 'pixman-edge-accessors.c',
+ 'pixman-fast-path.c',
+ 'pixman-glyph.c',
+ 'pixman-general.c',
+ 'pixman-gradient-walker.c',
+ 'pixman-image.c',
+ 'pixman-implementation.c',
+ 'pixman-linear-gradient.c',
+ 'pixman-matrix.c',
+ 'pixman-noop.c',
+ 'pixman-radial-gradient.c',
+ 'pixman-region16.c',
+ 'pixman-region32.c',
+ 'pixman-solid-fill.c',
+ 'pixman-timer.c',
+ 'pixman-trap.c',
+ 'pixman-utils.c',
+)
+
+# Android cpu-features
+cpu_features_path = get_option('cpu-features-path')
+cpu_features_sources = []
+cpu_features_inc = []
+if cpu_features_path != ''
+ message('Using cpu-features.[ch] from ' + cpu_features_path)
+ cpu_features_sources = files(
+ cpu_features_path / 'cpu-features.h',
+ cpu_features_path / 'cpu-features.c',
+ )
+ cpu_features_inc = include_directories(cpu_features_path)
+endif
+
+libpixman = library(
+ 'pixman-1',
+ [pixman_files, config_h, version_h, cpu_features_sources],
+ link_with: pixman_simd_libs,
+ c_args : libpixman_extra_cargs,
+ dependencies : [dep_m, dep_threads],
+ include_directories : cpu_features_inc,
+ version : meson.project_version(),
+ install : true,
+)
+
+inc_pixman = include_directories('.')
+
+idep_pixman = declare_dependency(
+ link_with: libpixman,
+ include_directories : inc_pixman,
+)
+
+if meson.version().version_compare('>= 0.54.0')
+ meson.override_dependency('pixman-1', idep_pixman)
+endif
+
+install_headers('pixman.h', subdir : 'pixman-1')
diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c
index 4f0642d..7bd7a5a 100644
--- a/pixman/pixman-access.c
+++ b/pixman/pixman-access.c
@@ -25,7 +25,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
@@ -68,14 +68,14 @@
#ifdef WORDS_BIGENDIAN
#define FETCH_24(img,l,o) \
- ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16) | \
- (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \
- (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0))
+ ((uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16) | \
+ (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \
+ (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0))
#else
#define FETCH_24(img,l,o) \
- ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0) | \
- (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \
- (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16))
+ ((uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0) | \
+ (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8) | \
+ (uint32_t)(READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16))
#endif
/* Store macros */
@@ -87,7 +87,7 @@
uint32_t *__d = ((uint32_t *)(l)) + ((o) >> 5); \
uint32_t __m, __v; \
\
- __m = 1 << (0x1f - ((o) & 0x1f)); \
+ __m = 1U << (0x1f - ((o) & 0x1f)); \
__v = (v)? __m : 0; \
\
WRITE((img), __d, (READ((img), __d) & ~__m) | __v); \
@@ -100,7 +100,7 @@
uint32_t *__d = ((uint32_t *)(l)) + ((o) >> 5); \
uint32_t __m, __v; \
\
- __m = 1 << ((o) & 0x1f); \
+ __m = 1U << ((o) & 0x1f); \
__v = (v)? __m : 0; \
\
WRITE((img), __d, (READ((img), __d) & ~__m) | __v); \
@@ -465,7 +465,7 @@ convert_and_store_pixel (bits_image_t * image,
image, bits, offset, PIXMAN_ ## format); \
} \
\
- static const void *const __dummy__ ## format
+ static const void *const __dummy__ ## format MAYBE_UNUSED
MAKE_ACCESSORS(a8r8g8b8);
MAKE_ACCESSORS(x8r8g8b8);
@@ -610,6 +610,32 @@ fetch_scanline_a8r8g8b8_sRGB_float (bits_image_t * image,
}
}
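+/* Fetch a row of 24bpp sRGB pixels into a linear-light float buffer.
+ * to_linear[] is pixman's 8-bit sRGB -> linear lookup table (built by
+ * make-srgb.pl); alpha is implicitly opaque. */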
+static void
+fetch_scanline_r8g8b8_sRGB_float (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ uint32_t * b,
+ const uint32_t *mask)
+{
+ const uint8_t *bits = (uint8_t *)(image->bits + y * image->rowstride);
+ argb_t *buffer = (argb_t *)b;
+ int i;
+ for (i = x; i < x + width; ++i) /* fetch width pixels starting at x */
+ {
+ uint32_t p = FETCH_24 (image, bits, i);
+ argb_t *argb = buffer;
+
+ argb->a = 1.0f;
+
+ argb->r = to_linear[(p >> 16) & 0xff];
+ argb->g = to_linear[(p >> 8) & 0xff];
+ argb->b = to_linear[(p >> 0) & 0xff];
+
+ buffer++;
+ }
+}
+
/* Expects a float buffer */
static void
fetch_scanline_a2r10g10b10_float (bits_image_t * image,
@@ -642,6 +668,48 @@ fetch_scanline_a2r10g10b10_float (bits_image_t * image,
}
/* Expects a float buffer */
+#ifndef PIXMAN_FB_ACCESSORS
+static void
+fetch_scanline_rgbf_float (bits_image_t *image,
+ int x,
+ int y,
+ int width,
+ uint32_t * b,
+ const uint32_t *mask)
+{
+ const float *bits = (float *)image->bits + y * image->rowstride;
+ const float *pixel = bits + x * 3;
+ argb_t *buffer = (argb_t *)b;
+
+ for (; width--; buffer++) {
+ buffer->r = *pixel++;
+ buffer->g = *pixel++;
+ buffer->b = *pixel++;
+ buffer->a = 1.f;
+ }
+}
+
+static void
+fetch_scanline_rgbaf_float (bits_image_t *image,
+ int x,
+ int y,
+ int width,
+ uint32_t * b,
+ const uint32_t *mask)
+{
+ const float *bits = (float *)image->bits + y * image->rowstride;
+ const float *pixel = bits + x * 4;
+ argb_t *buffer = (argb_t *)b;
+
+ for (; width--; buffer++) {
+ buffer->r = *pixel++;
+ buffer->g = *pixel++;
+ buffer->b = *pixel++;
+ buffer->a = *pixel++;
+ }
+}
+#endif
+
static void
fetch_scanline_x2r10g10b10_float (bits_image_t *image,
int x,
@@ -805,6 +873,40 @@ fetch_scanline_yv12 (bits_image_t *image,
/**************************** Pixel wise fetching *****************************/
+#ifndef PIXMAN_FB_ACCESSORS
+static argb_t
+fetch_pixel_rgbf_float (bits_image_t *image,
+ int offset,
+ int line)
+{
+ float *bits = (float *)image->bits + line * image->rowstride;
+ argb_t argb;
+
+ argb.r = bits[offset * 3];
+ argb.g = bits[offset * 3 + 1];
+ argb.b = bits[offset * 3 + 2];
+ argb.a = 1.f;
+
+ return argb;
+}
+
+static argb_t
+fetch_pixel_rgbaf_float (bits_image_t *image,
+ int offset,
+ int line)
+{
+ float *bits = (float *)image->bits + line * image->rowstride;
+ argb_t argb;
+
+ argb.r = bits[offset * 4];
+ argb.g = bits[offset * 4 + 1];
+ argb.b = bits[offset * 4 + 2];
+ argb.a = bits[offset * 4 + 3];
+
+ return argb;
+}
+#endif
+
static argb_t
fetch_pixel_x2r10g10b10_float (bits_image_t *image,
int offset,
@@ -905,6 +1007,24 @@ fetch_pixel_a8r8g8b8_sRGB_float (bits_image_t *image,
return argb;
}
+static argb_t
+fetch_pixel_r8g8b8_sRGB_float (bits_image_t *image,
+ int offset,
+ int line)
+{
+ uint8_t *bits = (uint8_t *)(image->bits + line * image->rowstride);
+ uint32_t p = FETCH_24 (image, bits, offset);
+ argb_t argb;
+
+ argb.a = 1.0f;
+
+ argb.r = to_linear[(p >> 16) & 0xff];
+ argb.g = to_linear[(p >> 8) & 0xff];
+ argb.b = to_linear[(p >> 0) & 0xff];
+
+ return argb;
+}
+
static uint32_t
fetch_pixel_yuy2 (bits_image_t *image,
int offset,
@@ -962,6 +1082,45 @@ fetch_pixel_yv12 (bits_image_t *image,
/*********************************** Store ************************************/
+#ifndef PIXMAN_FB_ACCESSORS
+static void
+store_scanline_rgbaf_float (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *v)
+{
+ float *bits = (float *)image->bits + image->rowstride * y + 4 * x;
+ const argb_t *values = (argb_t *)v;
+
+ for (; width; width--, values++)
+ {
+ *bits++ = values->r;
+ *bits++ = values->g;
+ *bits++ = values->b;
+ *bits++ = values->a;
+ }
+}
+
+static void
+store_scanline_rgbf_float (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *v)
+{
+ float *bits = (float *)image->bits + image->rowstride * y + 3 * x;
+ const argb_t *values = (argb_t *)v;
+
+ for (; width; width--, values++)
+ {
+ *bits++ = values->r;
+ *bits++ = values->g;
+ *bits++ = values->b;
+ }
+}
+#endif
+
static void
store_scanline_a2r10g10b10_float (bits_image_t * image,
int x,
@@ -976,7 +1135,7 @@ store_scanline_a2r10g10b10_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint16_t a, r, g, b;
+ uint32_t a, r, g, b;
a = pixman_float_to_unorm (values[i].a, 2);
r = pixman_float_to_unorm (values[i].r, 10);
@@ -1002,7 +1161,7 @@ store_scanline_x2r10g10b10_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint16_t r, g, b;
+ uint32_t r, g, b;
r = pixman_float_to_unorm (values[i].r, 10);
g = pixman_float_to_unorm (values[i].g, 10);
@@ -1027,7 +1186,7 @@ store_scanline_a2b10g10r10_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint16_t a, r, g, b;
+ uint32_t a, r, g, b;
a = pixman_float_to_unorm (values[i].a, 2);
r = pixman_float_to_unorm (values[i].r, 10);
@@ -1053,7 +1212,7 @@ store_scanline_x2b10g10r10_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint16_t r, g, b;
+ uint32_t r, g, b;
r = pixman_float_to_unorm (values[i].r, 10);
g = pixman_float_to_unorm (values[i].g, 10);
@@ -1078,7 +1237,7 @@ store_scanline_a8r8g8b8_sRGB_float (bits_image_t * image,
for (i = 0; i < width; ++i)
{
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
a = pixman_float_to_unorm (values[i].a, 8);
r = to_srgb (values[i].r);
@@ -1090,6 +1249,31 @@ store_scanline_a8r8g8b8_sRGB_float (bits_image_t * image,
}
}
+static void
+store_scanline_r8g8b8_sRGB_float (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *v)
+{
+ uint8_t *bits = (uint8_t *)(image->bits + image->rowstride * y) + 3 * x;
+ argb_t *values = (argb_t *)v;
+ int i;
+
+ for (i = 0; i < width; ++i)
+ {
+ uint32_t r, g, b, rgb;
+
+ r = to_srgb (values[i].r);
+ g = to_srgb (values[i].g);
+ b = to_srgb (values[i].b);
+
+ rgb = (r << 16) | (g << 8) | b;
+
+ STORE_24 (image, bits, i, rgb);
+ }
+}
+
/*
* Contracts a floating point image to 32bpp and then stores it using a
* regular 32-bit store proc. Despite the type, this function expects an
@@ -1151,7 +1335,7 @@ fetch_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
while (pixel < end)
{
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
tmp = READ (image, pixel++);
@@ -1168,6 +1352,37 @@ fetch_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
}
}
+static void
+fetch_scanline_r8g8b8_32_sRGB (bits_image_t *image,
+ int x,
+ int y,
+ int width,
+ uint32_t *buffer,
+ const uint32_t *mask)
+{
+ const uint8_t *bits = (uint8_t *)(image->bits + y * image->rowstride) + 3 * x;
+ uint32_t tmp;
+ int i;
+
+ for (i = 0; i < width; ++i)
+ {
+ uint32_t a, r, g, b;
+
+ tmp = FETCH_24 (image, bits, i);
+
+ a = 0xff;
+ r = (tmp >> 16) & 0xff;
+ g = (tmp >> 8) & 0xff;
+ b = (tmp >> 0) & 0xff;
+
+ r = to_linear[r] * 255.0f + 0.5f;
+ g = to_linear[g] * 255.0f + 0.5f;
+ b = to_linear[b] * 255.0f + 0.5f;
+
+ *buffer++ = (a << 24) | (r << 16) | (g << 8) | (b << 0);
+ }
+}
+
static uint32_t
fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image,
int offset,
@@ -1175,7 +1390,7 @@ fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image,
{
uint32_t *bits = image->bits + line * image->rowstride;
uint32_t tmp = READ (image, bits + offset);
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
a = (tmp >> 24) & 0xff;
r = (tmp >> 16) & 0xff;
@@ -1189,6 +1404,27 @@ fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image,
return (a << 24) | (r << 16) | (g << 8) | (b << 0);
}
+static uint32_t
+fetch_pixel_r8g8b8_32_sRGB (bits_image_t *image,
+ int offset,
+ int line)
+{
+ uint8_t *bits = (uint8_t *)(image->bits + line * image->rowstride);
+ uint32_t tmp = FETCH_24 (image, bits, offset);
+ uint32_t a, r, g, b;
+
+ a = 0xff;
+ r = (tmp >> 16) & 0xff;
+ g = (tmp >> 8) & 0xff;
+ b = (tmp >> 0) & 0xff;
+
+ r = to_linear[r] * 255.0f + 0.5f;
+ g = to_linear[g] * 255.0f + 0.5f;
+ b = to_linear[b] * 255.0f + 0.5f;
+
+ return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
static void
store_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
int x,
@@ -1204,7 +1440,7 @@ store_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
for (i = 0; i < width; ++i)
{
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
tmp = values[i];
@@ -1221,6 +1457,36 @@ store_scanline_a8r8g8b8_32_sRGB (bits_image_t *image,
}
}
+static void
+store_scanline_r8g8b8_32_sRGB (bits_image_t *image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *v)
+{
+ uint8_t *bits = (uint8_t *)(image->bits + image->rowstride * y) + 3 * x;
+ uint64_t *values = (uint64_t *)v;
+ uint64_t tmp;
+ int i;
+
+ for (i = 0; i < width; ++i)
+ {
+ uint32_t r, g, b;
+
+ tmp = values[i];
+
+ r = (tmp >> 16) & 0xff;
+ g = (tmp >> 8) & 0xff;
+ b = (tmp >> 0) & 0xff;
+
+ r = to_srgb (r * (1/255.0f));
+ g = to_srgb (g * (1/255.0f));
+ b = to_srgb (b * (1/255.0f));
+
+ STORE_24 (image, bits, i, (r << 16) | (g << 8) | (b << 0));
+ }
+}
+
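This hunk also suggests the likely motivation for the uint8_t-to-uint32_t widenings repeated throughout the file: an 8-bit local is promoted to signed int before a shift, so an expression like a << 24 with a == 0xff shifts a set bit into the sign position, which is undefined behaviour in C. Widening the locals makes the shifts plain unsigned arithmetic, as in this sketch:

    #include <stdint.h>

    /* With uint8_t operands, (a << 24) would be evaluated on a
     * promoted signed int and overflow for a >= 0x80; with uint32_t
     * the shift is well defined for all byte values. */
    static uint32_t
    pack_argb (uint32_t a, uint32_t r, uint32_t g, uint32_t b)
    {
        return (a << 24) | (r << 16) | (g << 8) | b;
    }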
static argb_t
fetch_pixel_generic_float (bits_image_t *image,
int offset,
@@ -1294,6 +1560,11 @@ static const format_info_t accessors[] =
fetch_pixel_a8r8g8b8_32_sRGB, fetch_pixel_a8r8g8b8_sRGB_float,
store_scanline_a8r8g8b8_32_sRGB, store_scanline_a8r8g8b8_sRGB_float,
},
+ { PIXMAN_r8g8b8_sRGB,
+ fetch_scanline_r8g8b8_32_sRGB, fetch_scanline_r8g8b8_sRGB_float,
+ fetch_pixel_r8g8b8_32_sRGB, fetch_pixel_r8g8b8_sRGB_float,
+ store_scanline_r8g8b8_32_sRGB, store_scanline_r8g8b8_sRGB_float,
+ },
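Each accessors[] row binds a format code to its six fetch/store procedures. Inferring from the entries visible in this hunk, the row layout is roughly the following (field names are illustrative; pixman-access.c holds the authoritative definition):

    typedef struct
    {
        pixman_format_code_t format;
        fetch_scanline_t     fetch_scanline_32;
        fetch_scanline_t     fetch_scanline_float;
        fetch_pixel_32_t     fetch_pixel_32;
        fetch_pixel_float_t  fetch_pixel_float;
        store_scanline_t     store_scanline_32;
        store_scanline_t     store_scanline_float;
    } format_info_t;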
/* 24bpp formats */
FORMAT_INFO (r8g8b8),
@@ -1351,7 +1622,18 @@ static const format_info_t accessors[] =
FORMAT_INFO (g1),
/* Wide formats */
-
+#ifndef PIXMAN_FB_ACCESSORS
+ { PIXMAN_rgba_float,
+ NULL, fetch_scanline_rgbaf_float,
+ fetch_pixel_generic_lossy_32, fetch_pixel_rgbaf_float,
+ NULL, store_scanline_rgbaf_float },
+
+ { PIXMAN_rgb_float,
+ NULL, fetch_scanline_rgbf_float,
+ fetch_pixel_generic_lossy_32, fetch_pixel_rgbf_float,
+ NULL, store_scanline_rgbf_float },
+#endif
+
{ PIXMAN_a2r10g10b10,
NULL, fetch_scanline_a2r10g10b10_float,
fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10_float,
diff --git a/pixman/pixman-arm-asm.h b/pixman/pixman-arm-asm.h
index ee78541..edf8e82 100644
--- a/pixman/pixman-arm-asm.h
+++ b/pixman/pixman-arm-asm.h
@@ -25,13 +25,39 @@
*
*/
+
+#include "pixman-config.h"
+
+
/* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
- .func fname
- .global fname
+.macro pixman_asm_function_impl fname
+#ifdef ASM_HAVE_FUNC_DIRECTIVE
+ .func \fname
+#endif
+ .global \fname
#ifdef __ELF__
- .hidden fname
- .type fname, %function
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+.endm
+
+.macro pixman_asm_function fname
+#ifdef ASM_LEADING_UNDERSCORE
+ pixman_asm_function_impl _\fname
+#else
+ pixman_asm_function_impl \fname
+#endif
+.endm
+
+.macro pixman_syntax_unified
+#ifdef ASM_HAVE_SYNTAX_UNIFIED
+ .syntax unified
+#endif
+.endm
+
+.macro pixman_end_asm_function
+#ifdef ASM_HAVE_FUNC_DIRECTIVE
+ .endfunc
#endif
-fname:
.endm
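With these macros in place, an .S file no longer hard-codes .func/.endfunc or a symbol's underscore prefix; the configure-time ASM_* defines select whatever the target assembler supports. A hypothetical call site (the function name is made up for illustration):

        pixman_syntax_unified

    pixman_asm_function composite_dummy
        bx      lr
    pixman_end_asm_function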
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 3a7cb2b..9537688 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -266,13 +266,6 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op, \
scaled_nearest_scanline_##cputype##_##name##_##op, \
src_type, dst_type, NORMAL)
-/* Provide entries for the fast path table */
-#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \
- SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
-
#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \
src_type, dst_type) \
void \
@@ -318,9 +311,7 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \
/* Provide entries for the fast path table */
#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func), \
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (op,s,d,func), \
SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
/*****************************************************************************/
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 0fd92d6..6bd2736 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -68,6 +68,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"
+pixman_syntax_unified
+
/*
* Bilinear macros from pixman-arm-neon-asm.S
*/
@@ -82,28 +84,28 @@
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
- vld1.32 {reg1}, [TMP1], STRIDE
- vld1.32 {reg2}, [TMP1]
+ vld1.32 {\reg1}, [TMP1], STRIDE
+ vld1.32 {\reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
- vld1.32 {reg2[0]}, [TMP1], STRIDE
- vld1.32 {reg2[1]}, [TMP1]
- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+ vld1.32 {\reg2[0]}, [TMP1], STRIDE
+ vld1.32 {\reg2[1]}, [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm
.macro bilinear_load_and_vertical_interpolate_two_8888 \
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
- bilinear_load_8888 reg1, reg2, tmp1
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- bilinear_load_8888 reg3, reg4, tmp2
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_8888 \
@@ -111,9 +113,9 @@
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm
.macro bilinear_load_and_vertical_interpolate_two_0565 \
@@ -125,19 +127,19 @@
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {acc2lo[0]}, [TMP1], STRIDE
- vld1.32 {acc2hi[0]}, [TMP2], STRIDE
- vld1.32 {acc2lo[1]}, [TMP1]
- vld1.32 {acc2hi[1]}, [TMP2]
- convert_0565_to_x888 acc2, reg3, reg2, reg1
- vzip.u8 reg1, reg3
- vzip.u8 reg2, reg4
- vzip.u8 reg3, reg4
- vzip.u8 reg1, reg2
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\acc2lo[1]}, [TMP1]
+ vld1.32 {\acc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip.u8 \reg1, \reg3
+ vzip.u8 \reg2, \reg4
+ vzip.u8 \reg3, \reg4
+ vzip.u8 \reg1, \reg2
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_0565 \
@@ -150,46 +152,46 @@
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
- vld1.32 {xacc2lo[1]}, [TMP1]
- vld1.32 {xacc2hi[1]}, [TMP2]
- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\xacc2lo[1]}, [TMP1]
+ vld1.32 {\xacc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
- vzip.u8 xreg1, xreg3
- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
- vzip.u8 xreg2, xreg4
- vld1.32 {yacc2lo[1]}, [TMP1]
- vzip.u8 xreg3, xreg4
- vld1.32 {yacc2hi[1]}, [TMP2]
- vzip.u8 xreg1, xreg2
- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
- vmull.u8 xacc1, xreg1, d28
- vzip.u8 yreg1, yreg3
- vmlal.u8 xacc1, xreg2, d29
- vzip.u8 yreg2, yreg4
- vmull.u8 xacc2, xreg3, d28
- vzip.u8 yreg3, yreg4
- vmlal.u8 xacc2, xreg4, d29
- vzip.u8 yreg1, yreg2
- vmull.u8 yacc1, yreg1, d28
- vmlal.u8 yacc1, yreg2, d29
- vmull.u8 yacc2, yreg3, d28
- vmlal.u8 yacc2, yreg4, d29
+ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
+ vzip.u8 \xreg1, \xreg3
+ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
+ vzip.u8 \xreg2, \xreg4
+ vld1.32 {\yacc2lo[1]}, [TMP1]
+ vzip.u8 \xreg3, \xreg4
+ vld1.32 {\yacc2hi[1]}, [TMP2]
+ vzip.u8 \xreg1, \xreg2
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ vmull.u8 \xacc1, \xreg1, d28
+ vzip.u8 \yreg1, \yreg3
+ vmlal.u8 \xacc1, \xreg2, d29
+ vzip.u8 \yreg2, \yreg4
+ vmull.u8 \xacc2, \xreg3, d28
+ vzip.u8 \yreg3, \yreg4
+ vmlal.u8 \xacc2, \xreg4, d29
+ vzip.u8 \yreg1, \yreg2
+ vmull.u8 \yacc1, \yreg1, d28
+ vmlal.u8 \yacc1, \yreg2, d29
+ vmull.u8 \yacc2, \yreg3, d28
+ vmlal.u8 \yacc2, \yreg4, d29
.endm
.macro bilinear_store_8888 numpix, tmp1, tmp2
-.if numpix == 4
+.if \numpix == 4
vst1.32 {d0, d1}, [OUT]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d0}, [OUT]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.32 {d0[0]}, [OUT, :32]!
.else
.error bilinear_store_8888 numpix is unsupported
@@ -201,12 +203,12 @@
vuzp.u8 d2, d3
vuzp.u8 d1, d3
vuzp.u8 d0, d2
- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
-.if numpix == 4
+ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
+.if \numpix == 4
vst1.16 {d2}, [OUT]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d2[0]}, [OUT]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.16 {d2[0]}, [OUT]!
.else
.error bilinear_store_0565 numpix is unsupported
@@ -222,20 +224,20 @@
.endm
.macro bilinear_load_mask_8 numpix, mask
-.if numpix == 4
- vld1.32 {mask[0]}, [MASK]!
-.elseif numpix == 2
- vld1.16 {mask[0]}, [MASK]!
-.elseif numpix == 1
- vld1.8 {mask[0]}, [MASK]!
+.if \numpix == 4
+ vld1.32 {\mask[0]}, [MASK]!
+.elseif \numpix == 2
+ vld1.16 {\mask[0]}, [MASK]!
+.elseif \numpix == 1
+ vld1.8 {\mask[0]}, [MASK]!
.else
- .error bilinear_load_mask_8 numpix is unsupported
+ .error bilinear_load_mask_8 \numpix is unsupported
.endif
pld [MASK, #prefetch_offset]
.endm
.macro bilinear_load_mask mask_fmt, numpix, mask
- bilinear_load_mask_&mask_fmt numpix, mask
+ bilinear_load_mask_\()\mask_fmt \numpix, \mask
.endm
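This is the pattern behind most of the assembly churn in this patch: the & concatenation operator is a non-standard extension, so call sites like bilinear_load_mask_&mask_fmt become bilinear_load_mask_\()\mask_fmt, where \mask_fmt substitutes the macro argument and \() marks where the argument name ends so it can be glued to adjacent text. A minimal illustration with a made-up wrapper macro:

    .macro call_loader fmt
        @ with fmt=8 this expands to: bilinear_load_mask_8 1, d4
        bilinear_load_mask_\()\fmt 1, d4
    .endm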
@@ -250,28 +252,28 @@
.endm
.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
-.if numpix == 4
- vld1.32 {dst0, dst1}, [OUT]
-.elseif numpix == 2
- vld1.32 {dst0}, [OUT]
-.elseif numpix == 1
- vld1.32 {dst0[0]}, [OUT]
+.if \numpix == 4
+ vld1.32 {\dst0, \dst1}, [OUT]
+.elseif \numpix == 2
+ vld1.32 {\dst0}, [OUT]
+.elseif \numpix == 1
+ vld1.32 {\dst0[0]}, [OUT]
.else
- .error bilinear_load_dst_8888 numpix is unsupported
+ .error bilinear_load_dst_8888 \numpix is unsupported
.endif
pld [OUT, #(prefetch_offset * 4)]
.endm
.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
+ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm
/*
@@ -290,19 +292,19 @@
.endm
.macro bilinear_duplicate_mask_8 numpix, mask
-.if numpix == 4
- vdup.32 mask, mask[0]
-.elseif numpix == 2
- vdup.16 mask, mask[0]
-.elseif numpix == 1
- vdup.8 mask, mask[0]
+.if \numpix == 4
+ vdup.32 \mask, \mask[0]
+.elseif \numpix == 2
+ vdup.16 \mask, \mask[0]
+.elseif \numpix == 1
+ vdup.8 \mask, \mask[0]
.else
.error bilinear_duplicate_mask_8 is unsupported
.endif
.endm
.macro bilinear_duplicate_mask mask_fmt, numpix, mask
- bilinear_duplicate_mask_&mask_fmt numpix, mask
+ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
.endm
/*
@@ -310,10 +312,10 @@
* Interleave should be done when a mask is enabled or the operator is 'over'.
*/
.macro bilinear_interleave src0, src1, dst0, dst1
- vuzp.8 src0, src1
- vuzp.8 dst0, dst1
- vuzp.8 src0, src1
- vuzp.8 dst0, dst1
+ vuzp.8 \src0, \src1
+ vuzp.8 \dst0, \dst1
+ vuzp.8 \src0, \src1
+ vuzp.8 \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_x_src \
@@ -323,7 +325,7 @@
.macro bilinear_interleave_src_dst_x_over \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_x_add \
@@ -333,26 +335,26 @@
.macro bilinear_interleave_src_dst_8_src \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_8_over \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst_8_add \
numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave src0, src1, dst0, dst1
+ bilinear_interleave \src0, \src1, \dst0, \dst1
.endm
.macro bilinear_interleave_src_dst \
mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
- bilinear_interleave_src_dst_&mask_fmt&_&op \
- numpix, src0, src1, src01, dst0, dst1, dst01
+ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm
@@ -370,23 +372,23 @@
numpix, src0, src1, src01, mask, \
tmp01, tmp23, tmp45, tmp67
- vmull.u8 tmp01, src0, mask
- vmull.u8 tmp23, src1, mask
+ vmull.u8 \tmp01, \src0, \mask
+ vmull.u8 \tmp23, \src1, \mask
/* bubbles */
- vrshr.u16 tmp45, tmp01, #8
- vrshr.u16 tmp67, tmp23, #8
+ vrshr.u16 \tmp45, \tmp01, #8
+ vrshr.u16 \tmp67, \tmp23, #8
/* bubbles */
- vraddhn.u16 src0, tmp45, tmp01
- vraddhn.u16 src1, tmp67, tmp23
+ vraddhn.u16 \src0, \tmp45, \tmp01
+ vraddhn.u16 \src1, \tmp67, \tmp23
.endm
.macro bilinear_apply_mask_to_src \
mask_fmt, numpix, src0, src1, src01, mask, \
tmp01, tmp23, tmp45, tmp67
- bilinear_apply_mask_to_src_&mask_fmt \
- numpix, src0, src1, src01, mask, \
- tmp01, tmp23, tmp45, tmp67
+ bilinear_apply_mask_to_src_\()\mask_fmt \
+ \numpix, \src0, \src1, \src01, \mask, \
+ \tmp01, \tmp23, \tmp45, \tmp67
.endm
@@ -403,79 +405,79 @@
numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
- vdup.32 tmp8, src1[1]
+ vdup.32 \tmp8, \src1[1]
/* bubbles */
- vmvn.8 tmp8, tmp8
+ vmvn.8 \tmp8, \tmp8
/* bubbles */
- vmull.u8 tmp01, dst0, tmp8
+ vmull.u8 \tmp01, \dst0, \tmp8
/* bubbles */
- vmull.u8 tmp23, dst1, tmp8
+ vmull.u8 \tmp23, \dst1, \tmp8
/* bubbles */
- vrshr.u16 tmp45, tmp01, #8
- vrshr.u16 tmp67, tmp23, #8
+ vrshr.u16 \tmp45, \tmp01, #8
+ vrshr.u16 \tmp67, \tmp23, #8
/* bubbles */
- vraddhn.u16 dst0, tmp45, tmp01
- vraddhn.u16 dst1, tmp67, tmp23
+ vraddhn.u16 \dst0, \tmp45, \tmp01
+ vraddhn.u16 \dst1, \tmp67, \tmp23
/* bubbles */
- vqadd.u8 src01, dst01, src01
+ vqadd.u8 \src01, \dst01, \src01
.endm
.macro bilinear_combine_add \
numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
- vqadd.u8 src01, dst01, src01
+ vqadd.u8 \src01, \dst01, \src01
.endm
.macro bilinear_combine \
op, numpix, src0, src1, src01, dst0, dst1, dst01, \
tmp01, tmp23, tmp45, tmp67, tmp8
- bilinear_combine_&op \
- numpix, src0, src1, src01, dst0, dst1, dst01, \
- tmp01, tmp23, tmp45, tmp67, tmp8
+ bilinear_combine_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
+ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
.endm
/*
* Macros for final deinterleaving of destination pixels if needed.
*/
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
- vuzp.8 dst0, dst1
+ vuzp.8 \dst0, \dst1
/* bubbles */
- vuzp.8 dst0, dst1
+ vuzp.8 \dst0, \dst1
.endm
.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
.endm
.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
.endm
.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
- bilinear_deinterleave numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
+ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm
.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
- bilinear_load_&src_fmt d0, d1, d2
- bilinear_load_mask mask_fmt, 1, d4
- bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
+ bilinear_load_\()\src_fmt d0, d1, d2
+ bilinear_load_mask \mask_fmt, 1, d4
+ bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
@@ -483,28 +485,28 @@
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
/* 5 cycles bubble */
- bilinear_duplicate_mask mask_fmt, 1, d4
+ bilinear_duplicate_mask \mask_fmt, 1, d4
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
bilinear_interleave_src_dst \
- mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
+ \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
- mask_fmt, 1, d0, d1, q0, d4, \
+ \mask_fmt, 1, d0, d1, q0, d4, \
q3, q8, q10, q11
bilinear_combine \
- op, 1, d0, d1, q0, d18, d19, q9, \
+ \op, 1, d0, d1, q0, d18, d19, q9, \
q3, q8, q10, q11, d5
- bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
- bilinear_store_&dst_fmt 1, q2, q3
+ bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
+ bilinear_store_\()\dst_fmt 1, q2, q3
.endm
.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
- bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23
- bilinear_load_mask mask_fmt, 2, d4
- bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
+ bilinear_load_mask \mask_fmt, 2, d4
+ bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
vmlal.u16 q0, d3, d30
@@ -513,24 +515,24 @@
vmlal.u16 q10, d23, d31
vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
- bilinear_duplicate_mask mask_fmt, 2, d4
+ bilinear_duplicate_mask \mask_fmt, 2, d4
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
bilinear_interleave_src_dst \
- mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
+ \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
bilinear_apply_mask_to_src \
- mask_fmt, 2, d0, d1, q0, d4, \
+ \mask_fmt, 2, d0, d1, q0, d4, \
q3, q8, q10, q11
bilinear_combine \
- op, 2, d0, d1, q0, d18, d19, q9, \
+ \op, 2, d0, d1, q0, d18, d19, q9, \
q3, q8, q10, q11, d5
- bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
- bilinear_store_&dst_fmt 2, q2, q3
+ bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
+ bilinear_store_\()\dst_fmt 2, q2, q3
.endm
.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
- bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
@@ -546,8 +548,8 @@
vmlsl.u16 q2, d6, d30
vmlal.u16 q2, d7, d30
vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
- bilinear_load_mask mask_fmt, 4, d22
- bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
+ bilinear_load_mask \mask_fmt, 4, d22
+ bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
pld [TMP1, PF_OFFS]
vmlsl.u16 q8, d18, d31
vmlal.u16 q8, d19, d31
@@ -556,21 +558,21 @@
vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
- bilinear_duplicate_mask mask_fmt, 4, d22
+ bilinear_duplicate_mask \mask_fmt, 4, d22
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
bilinear_interleave_src_dst \
- mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
+ \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
bilinear_apply_mask_to_src \
- mask_fmt, 4, d0, d1, q0, d22, \
+ \mask_fmt, 4, d0, d1, q0, d22, \
q3, q8, q9, q10
bilinear_combine \
- op, 4, d0, d1, q0, d2, d3, q1, \
+ \op, 4, d0, d1, q0, d2, d3, q1, \
q3, q8, q9, q10, d23
- bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
- bilinear_store_&dst_fmt 4, q2, q3
+ bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
+ bilinear_store_\()\dst_fmt 4, q2, q3
.endm
.set BILINEAR_FLAG_USE_MASK, 1
@@ -610,14 +612,14 @@
prefetch_distance, \
flags
-pixman_asm_function fname
-.if pixblock_size == 8
-.elseif pixblock_size == 4
+pixman_asm_function \fname
+.if \pixblock_size == 8
+.elseif \pixblock_size == 4
.else
.error unsupported pixblock size
.endif
-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
OUT .req r0
TOP .req r1
BOTTOM .req r2
@@ -635,7 +637,7 @@ pixman_asm_function fname
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
ldmia ip, {WB, X, UX, WIDTH}
.else
OUT .req r0
@@ -654,17 +656,17 @@ pixman_asm_function fname
TMP4 .req r10
STRIDE .req r3
- .set prefetch_offset, prefetch_distance
+ .set prefetch_offset, \prefetch_distance
mov ip, sp
push {r4, r5, r6, r7, r8, r9, r10, ip}
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
ldmia ip, {WT, WB, X, UX, WIDTH}
.endif
mul PF_OFFS, PF_OFFS, UX
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpush {d8-d15}
.endif
@@ -683,11 +685,11 @@ pixman_asm_function fname
/* ensure good destination alignment */
cmp WIDTH, #1
blt 0f
- tst OUT, #(1 << dst_bpp_shift)
+ tst OUT, #(1 << \dst_bpp_shift)
beq 0f
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
- bilinear_process_last_pixel
+ \bilinear_process_last_pixel
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
@@ -696,53 +698,53 @@ pixman_asm_function fname
cmp WIDTH, #2
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 1))
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
beq 0f
- bilinear_process_two_pixels
+ \bilinear_process_two_pixels
sub WIDTH, WIDTH, #2
0:
-.if pixblock_size == 8
+.if \pixblock_size == 8
cmp WIDTH, #4
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 2))
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
beq 0f
- bilinear_process_four_pixels
+ \bilinear_process_four_pixels
sub WIDTH, WIDTH, #4
0:
.endif
- subs WIDTH, WIDTH, #pixblock_size
+ subs WIDTH, WIDTH, #\pixblock_size
blt 1f
- mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
- bilinear_process_pixblock_head
- subs WIDTH, WIDTH, #pixblock_size
+ mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
+ \bilinear_process_pixblock_head
+ subs WIDTH, WIDTH, #\pixblock_size
blt 5f
0:
- bilinear_process_pixblock_tail_head
- subs WIDTH, WIDTH, #pixblock_size
+ \bilinear_process_pixblock_tail_head
+ subs WIDTH, WIDTH, #\pixblock_size
bge 0b
5:
- bilinear_process_pixblock_tail
+ \bilinear_process_pixblock_tail
1:
-.if pixblock_size == 8
+.if \pixblock_size == 8
tst WIDTH, #4
beq 2f
- bilinear_process_four_pixels
+ \bilinear_process_four_pixels
2:
.endif
/* handle the remaining trailing pixels */
tst WIDTH, #2
beq 2f
- bilinear_process_two_pixels
+ \bilinear_process_two_pixels
2:
tst WIDTH, #1
beq 3f
- bilinear_process_last_pixel
+ \bilinear_process_last_pixel
3:
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpop {d8-d15}
.endif
-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
pop {r4, r5, r6, r7, r8, r9}
.else
pop {r4, r5, r6, r7, r8, r9, r10, ip}
@@ -762,11 +764,11 @@ pixman_asm_function fname
.unreq TMP3
.unreq TMP4
.unreq STRIDE
-.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
.unreq MASK
.endif
-.endfunc
+pixman_end_asm_function
.endm
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 7e949a3..0e09257 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -53,6 +53,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"
+ pixman_syntax_unified
+
/* Global configuration options and preferences */
/*
@@ -260,13 +262,13 @@
vshrn.u16 d7, q2, #3
vsli.u16 q2, q2, #5
vshll.u8 q14, d16, #8
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vshll.u8 q8, d19, #8
- PF tst PF_CTL, #0xF
+ PF tst, PF_CTL, #0xF
vsri.u8 d6, d6, #5
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vmvn.8 d3, d3
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vsri.u8 d7, d7, #6
vshrn.u16 d30, q2, #2
vmull.u8 q10, d3, d6
@@ -275,18 +277,18 @@
vmull.u8 q12, d3, d30
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vsri.u16 q14, q8, #5
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vshll.u8 q9, d18, #8
vrshr.u16 q13, q10, #8
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vrshr.u16 q3, q11, #8
vrshr.u16 q15, q12, #8
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vsri.u16 q14, q9, #11
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vraddhn.u16 d20, q10, q13
vraddhn.u16 d23, q11, q3
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vraddhn.u16 d22, q12, q15
vst1.16 {d28, d29}, [DST_W, :128]!
.endm
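Two independent fixes repeat through the pixblock macros in this file. First, since PF is itself a macro, the mnemonic passed to it must now be comma-separated from its operands (PF add, PF_X, ...). Second, conditional mnemonics are reordered for unified syntax, where the flag-setting s and the size suffix precede the condition code, as this side-by-side sketch shows:

        @ divided (pre-UAL) syntax    @ unified (UAL) syntax
        subges  r0, r0, #0x10         subsge  r0, r0, #0x10
        ldrgeb  r1, [r2], #1          ldrbge  r1, [r2], #1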
@@ -434,20 +436,20 @@ generate_composite_function \
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
vsri.u16 q14, q8, #5
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
fetch_src_pixblock
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vsri.u16 q14, q9, #11
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vshll.u8 q8, d1, #8
vst1.16 {d28, d29}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
vshll.u8 q14, d2, #8
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vshll.u8 q9, d0, #8
.endm
@@ -509,20 +511,20 @@ generate_composite_function \
.macro pixman_composite_add_8_8_process_pixblock_tail_head
fetch_src_pixblock
- PF add PF_X, PF_X, #32
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #32
+ PF tst, PF_CTL, #0xF
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #32
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #32
+ PF subne, PF_CTL, PF_CTL, #1
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -541,20 +543,20 @@ generate_composite_function \
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
fetch_src_pixblock
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
vqadd.u8 q14, q0, q2
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q15, q1, q3
.endm
@@ -604,16 +606,16 @@ generate_composite_function_single_scanline \
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
fetch_src_pixblock
@@ -621,13 +623,13 @@ generate_composite_function_single_scanline \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -656,16 +658,16 @@ generate_composite_function_single_scanline \
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
@@ -675,13 +677,13 @@ generate_composite_function_single_scanline \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -742,20 +744,20 @@ generate_composite_function_single_scanline \
vraddhn.u16 d31, q3, q11
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
vqadd.u8 q14, q0, q14
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0x0F
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0x0F
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vqadd.u8 q15, q1, q15
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vmull.u8 q8, d24, d4
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vmull.u8 q9, d24, d5
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d6
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d7
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
.endm
@@ -784,16 +786,16 @@ generate_composite_function \
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
vrshr.u16 q14, q8, #8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
vrshr.u16 q15, q9, #8
vrshr.u16 q12, q10, #8
vrshr.u16 q13, q11, #8
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d28, q14, q8
vraddhn.u16 d29, q15, q9
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d30, q12, q10
vraddhn.u16 d31, q13, q11
vqadd.u8 q14, q0, q14
@@ -802,12 +804,12 @@ generate_composite_function \
vmvn.8 d22, d3
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d22, d4
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d22, d5
vmull.u8 q10, d22, d6
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vmull.u8 q11, d22, d7
.endm
@@ -1245,23 +1247,23 @@ generate_composite_function \
.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
fetch_mask_pixblock
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vrshrn.u16 d28, q8, #8
- PF tst PF_CTL, #0x0F
+ PF tst, PF_CTL, #0x0F
vrshrn.u16 d29, q9, #8
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vrshrn.u16 d30, q10, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vrshrn.u16 d31, q11, #8
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vmull.u8 q8, d24, d0
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q9, d24, d1
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q10, d24, d2
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d3
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q8, q8, #8
vrsra.u16 q9, q9, #8
@@ -1314,23 +1316,23 @@ generate_composite_function \
.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
fetch_mask_pixblock
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vrshrn.u16 d28, q0, #8
- PF tst PF_CTL, #0x0F
+ PF tst, PF_CTL, #0x0F
vrshrn.u16 d29, q1, #8
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vrshrn.u16 d30, q2, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vrshrn.u16 d31, q3, #8
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vmull.u8 q0, d24, d16
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q1, d25, d16
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q2, d26, d16
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q3, d27, d16
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q0, q0, #8
vrsra.u16 q1, q1, #8
@@ -1408,27 +1410,27 @@ generate_composite_function \
vrshr.u16 q15, q9, #8
fetch_mask_pixblock
vrshr.u16 q6, q10, #8
- PF add PF_X, PF_X, #8
+ PF add, PF_X, PF_X, #8
vrshr.u16 q7, q11, #8
- PF tst PF_CTL, #0x0F
+ PF tst, PF_CTL, #0x0F
vraddhn.u16 d28, q14, q8
- PF addne PF_X, PF_X, #8
+ PF addne, PF_X, PF_X, #8
vraddhn.u16 d29, q15, q9
- PF subne PF_CTL, PF_CTL, #1
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d30, q6, q10
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
vraddhn.u16 d31, q7, q11
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
vmull.u8 q6, d24, d8
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
vmull.u8 q7, d24, d9
- PF subge PF_X, PF_X, ORIG_W
+ PF subge, PF_X, PF_X, ORIG_W
vmull.u8 q8, d24, d10
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subsge, PF_CTL, PF_CTL, #0x10
vmull.u8 q9, d24, d11
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vqadd.u8 q14, q0, q14
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
vqadd.u8 q15, q1, q15
vrshr.u16 q10, q6, #8
vrshr.u16 q11, q7, #8
@@ -2425,21 +2427,21 @@ generate_composite_function \
vrshr.u16 q13, q10, #8
fetch_src_pixblock
vraddhn.u16 d30, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d29, q12, q9
vraddhn.u16 d28, q13, q10
vmull.u8 q8, d3, d0
vmull.u8 q9, d3, d1
vmull.u8 q10, d3, d2
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
@@ -2482,21 +2484,21 @@ generate_composite_function \
vrshr.u16 q13, q10, #8
fetch_src_pixblock
vraddhn.u16 d28, q11, q8
- PF add PF_X, PF_X, #8
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #8
- PF subne PF_CTL, PF_CTL, #1
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF addne, PF_X, PF_X, #8
+ PF subne, PF_CTL, PF_CTL, #1
vraddhn.u16 d29, q12, q9
vraddhn.u16 d30, q13, q10
vmull.u8 q8, d3, d0
vmull.u8 q9, d3, d1
vmull.u8 q10, d3, d2
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- PF cmp PF_X, ORIG_W
+ PF cmp, PF_X, ORIG_W
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
generate_composite_function \
@@ -2841,28 +2843,28 @@ generate_composite_function_nearest_scanline \
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #2
- vld1.32 {reg1}, [TMP1], STRIDE
- vld1.32 {reg2}, [TMP1]
+ vld1.32 {\reg1}, [TMP1], STRIDE
+ vld1.32 {\reg2}, [TMP1]
.endm
.macro bilinear_load_0565 reg1, reg2, tmp
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
- vld1.32 {reg2[0]}, [TMP1], STRIDE
- vld1.32 {reg2[1]}, [TMP1]
- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+ vld1.32 {\reg2[0]}, [TMP1], STRIDE
+ vld1.32 {\reg2[1]}, [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm
.macro bilinear_load_and_vertical_interpolate_two_8888 \
acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
- bilinear_load_8888 reg1, reg2, tmp1
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- bilinear_load_8888 reg3, reg4, tmp2
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_8888 \
@@ -2870,9 +2872,9 @@ generate_composite_function_nearest_scanline \
yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
bilinear_load_and_vertical_interpolate_two_8888 \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm
.macro bilinear_load_and_vertical_interpolate_two_0565 \
@@ -2884,19 +2886,19 @@ generate_composite_function_nearest_scanline \
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {acc2lo[0]}, [TMP1], STRIDE
- vld1.32 {acc2hi[0]}, [TMP2], STRIDE
- vld1.32 {acc2lo[1]}, [TMP1]
- vld1.32 {acc2hi[1]}, [TMP2]
- convert_0565_to_x888 acc2, reg3, reg2, reg1
- vzip.u8 reg1, reg3
- vzip.u8 reg2, reg4
- vzip.u8 reg3, reg4
- vzip.u8 reg1, reg2
- vmull.u8 acc1, reg1, d28
- vmlal.u8 acc1, reg2, d29
- vmull.u8 acc2, reg3, d28
- vmlal.u8 acc2, reg4, d29
+ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\acc2lo[1]}, [TMP1]
+ vld1.32 {\acc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip.u8 \reg1, \reg3
+ vzip.u8 \reg2, \reg4
+ vzip.u8 \reg3, \reg4
+ vzip.u8 \reg1, \reg2
+ vmull.u8 \acc1, \reg1, d28
+ vmlal.u8 \acc1, \reg2, d29
+ vmull.u8 \acc2, \reg3, d28
+ vmlal.u8 \acc2, \reg4, d29
.endm
.macro bilinear_load_and_vertical_interpolate_four_0565 \
@@ -2909,49 +2911,49 @@ generate_composite_function_nearest_scanline \
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
- vld1.32 {xacc2lo[1]}, [TMP1]
- vld1.32 {xacc2hi[1]}, [TMP2]
- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {\xacc2lo[1]}, [TMP1]
+ vld1.32 {\xacc2hi[1]}, [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
mov TMP1, X, asr #16
add X, X, UX
add TMP1, TOP, TMP1, asl #1
mov TMP2, X, asr #16
add X, X, UX
add TMP2, TOP, TMP2, asl #1
- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
- vzip.u8 xreg1, xreg3
- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
- vzip.u8 xreg2, xreg4
- vld1.32 {yacc2lo[1]}, [TMP1]
- vzip.u8 xreg3, xreg4
- vld1.32 {yacc2hi[1]}, [TMP2]
- vzip.u8 xreg1, xreg2
- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
- vmull.u8 xacc1, xreg1, d28
- vzip.u8 yreg1, yreg3
- vmlal.u8 xacc1, xreg2, d29
- vzip.u8 yreg2, yreg4
- vmull.u8 xacc2, xreg3, d28
- vzip.u8 yreg3, yreg4
- vmlal.u8 xacc2, xreg4, d29
- vzip.u8 yreg1, yreg2
- vmull.u8 yacc1, yreg1, d28
- vmlal.u8 yacc1, yreg2, d29
- vmull.u8 yacc2, yreg3, d28
- vmlal.u8 yacc2, yreg4, d29
+ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
+ vzip.u8 \xreg1, \xreg3
+ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
+ vzip.u8 \xreg2, \xreg4
+ vld1.32 {\yacc2lo[1]}, [TMP1]
+ vzip.u8 \xreg3, \xreg4
+ vld1.32 {\yacc2hi[1]}, [TMP2]
+ vzip.u8 \xreg1, \xreg2
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ vmull.u8 \xacc1, \xreg1, d28
+ vzip.u8 \yreg1, \yreg3
+ vmlal.u8 \xacc1, \xreg2, d29
+ vzip.u8 \yreg2, \yreg4
+ vmull.u8 \xacc2, \xreg3, d28
+ vzip.u8 \yreg3, \yreg4
+ vmlal.u8 \xacc2, \xreg4, d29
+ vzip.u8 \yreg1, \yreg2
+ vmull.u8 \yacc1, \yreg1, d28
+ vmlal.u8 \yacc1, \yreg2, d29
+ vmull.u8 \yacc2, \yreg3, d28
+ vmlal.u8 \yacc2, \yreg4, d29
.endm
.macro bilinear_store_8888 numpix, tmp1, tmp2
-.if numpix == 4
+.if \numpix == 4
vst1.32 {d0, d1}, [OUT, :128]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d0}, [OUT, :64]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.32 {d0[0]}, [OUT, :32]!
.else
- .error bilinear_store_8888 numpix is unsupported
+ .error bilinear_store_8888 \numpix is unsupported
.endif
.endm
@@ -2960,20 +2962,20 @@ generate_composite_function_nearest_scanline \
vuzp.u8 d2, d3
vuzp.u8 d1, d3
vuzp.u8 d0, d2
- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
-.if numpix == 4
+ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
+.if \numpix == 4
vst1.16 {d2}, [OUT, :64]!
-.elseif numpix == 2
+.elseif \numpix == 2
vst1.32 {d2[0]}, [OUT, :32]!
-.elseif numpix == 1
+.elseif \numpix == 1
vst1.16 {d2[0]}, [OUT, :16]!
.else
- .error bilinear_store_0565 numpix is unsupported
+ .error bilinear_store_0565 \numpix is unsupported
.endif
.endm
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
- bilinear_load_&src_fmt d0, d1, d2
+ bilinear_load_\()\src_fmt d0, d1, d2
vmull.u8 q1, d0, d28
vmlal.u8 q1, d1, d29
/* 5 cycles bubble */
@@ -2985,11 +2987,11 @@ generate_composite_function_nearest_scanline \
/* 3 cycles bubble */
vmovn.u16 d0, q0
/* 1 cycle bubble */
- bilinear_store_&dst_fmt 1, q2, q3
+ bilinear_store_\()\dst_fmt 1, q2, q3
.endm
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
- bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23
vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
vmlsl.u16 q0, d2, d30
@@ -3002,11 +3004,11 @@ generate_composite_function_nearest_scanline \
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
vmovn.u16 d0, q0
- bilinear_store_&dst_fmt 2, q2, q3
+ bilinear_store_\()\dst_fmt 2, q2, q3
.endm
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
- bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
q1, q11, d0, d1, d20, d21, d22, d23 \
q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
@@ -3034,54 +3036,54 @@ generate_composite_function_nearest_scanline \
vmovn.u16 d0, q0
vmovn.u16 d1, q2
vadd.u16 q12, q12, q13
- bilinear_store_&dst_fmt 4, q2, q3
+ bilinear_store_\()\dst_fmt 4, q2, q3
.endm
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.endif
.endm
.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.else
- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
.endif
.endm
.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm
@@ -3106,7 +3108,7 @@ generate_composite_function_nearest_scanline \
src_bpp_shift, dst_bpp_shift, \
prefetch_distance, flags
-pixman_asm_function fname
+pixman_asm_function \fname
OUT .req r0
TOP .req r1
BOTTOM .req r2
@@ -3124,11 +3126,11 @@ pixman_asm_function fname
mov ip, sp
push {r4, r5, r6, r7, r8, r9}
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
ldmia ip, {WB, X, UX, WIDTH}
mul PF_OFFS, PF_OFFS, UX
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpush {d8-d15}
.endif
@@ -3147,11 +3149,11 @@ pixman_asm_function fname
/* ensure good destination alignment */
cmp WIDTH, #1
blt 0f
- tst OUT, #(1 << dst_bpp_shift)
+ tst OUT, #(1 << \dst_bpp_shift)
beq 0f
vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
vadd.u16 q12, q12, q13
- bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #1
0:
vadd.u16 q13, q13, q13
@@ -3160,64 +3162,64 @@ pixman_asm_function fname
cmp WIDTH, #2
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 1))
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
beq 0f
- bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #2
0:
-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
cmp WIDTH, #4
blt 0f
- tst OUT, #(1 << (dst_bpp_shift + 2))
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
beq 0f
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
sub WIDTH, WIDTH, #4
0:
subs WIDTH, WIDTH, #8
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #8
blt 5f
0:
- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #8
bge 0b
5:
- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
1:
tst WIDTH, #4
beq 2f
- bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
subs WIDTH, WIDTH, #4
blt 1f
mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #4
blt 5f
0:
- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
subs WIDTH, WIDTH, #4
bge 0b
5:
- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
1:
/****************************************************/
.endif
/* handle the remaining trailing pixels */
tst WIDTH, #2
beq 2f
- bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
2:
tst WIDTH, #1
beq 3f
- bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
3:
-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
vpop {d8-d15}
.endif
pop {r4, r5, r6, r7, r8, r9}
@@ -3236,7 +3238,7 @@ pixman_asm_function fname
.unreq TMP3
.unreq TMP4
.unreq STRIDE
-.endfunc
+ pixman_end_asm_function
.endm
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index bdcf6a9..06318d9 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -74,134 +74,134 @@
*/
.macro pixldst1 op, elem_size, reg1, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]!
.else
- op&.&elem_size {d&reg1}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]!
.endif
.endm
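The same \() rewrite also covers register names built from numeric macro arguments: d&reg1 becomes d\()\reg1, so a call with reg1=4 still assembles as d4. A made-up standalone example:

    .macro load_one reg
        @ with reg=4 this expands to: vld1.8 {d4}, [r0]!
        vld1.8  {d\()\reg}, [r0]!
    .endm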
.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!
.else
- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!
.endif
.endm
.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
-.if abits > 0
- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.if \abits > 0
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!
.else
- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!
.endif
.endm
.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
- op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!
.endm
.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
- op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!
.endm
.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
- op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+ \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!
.endm
.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
-.if numbytes == 32
- pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
- %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif numbytes == 16
- pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
-.elseif numbytes == 8
- pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
-.elseif numbytes == 4
- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
- pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
- .elseif elem_size == 16
- pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
- pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+.if \numbytes == 32
+ pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif \numbytes == 16
+ pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+.elseif \numbytes == 8
+ pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits
+.elseif \numbytes == 4
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
+ pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits
+ .elseif \elem_size == 16
+ pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits
+ pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits
.else
- pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+ pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits
.endif
-.elseif numbytes == 2
- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
- pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+.elseif \numbytes == 2
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
+ pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits
.else
- pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
- pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+ pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits
+ pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits
.endif
-.elseif numbytes == 1
- pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.elseif \numbytes == 1
+ pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
.macro pixld numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
- %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
- pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.else
- pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+ pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm
.macro pixst numpix, bpp, basereg, mem_operand, abits=0
-.if bpp > 0
-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
- %(basereg+6), %(basereg+7), mem_operand, abits
-.elseif (bpp == 24) && (numpix == 8)
- pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
-.elseif (bpp == 24) && (numpix == 4)
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
-.elseif (bpp == 24) && (numpix == 2)
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
-.elseif (bpp == 24) && (numpix == 1)
- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.else
- pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+ pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm
.macro pixld_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.if (\bpp * \numpix) <= 128
+ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
- pixld numpix, bpp, basereg, mem_operand, 128
+ pixld \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm
.macro pixst_a numpix, bpp, basereg, mem_operand
-.if (bpp * numpix) <= 128
- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.if (\bpp * \numpix) <= 128
+ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
- pixst numpix, bpp, basereg, mem_operand, 128
+ pixst \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm
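/* The abits argument ends up as the ":<abits>" alignment qualifier on the
 * vld1/vst1 address operand, so pixld_a/pixst_a simply request the strongest
 * hint a given access can honour: %(bpp * numpix) bits, capped at 128. */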
@@ -210,44 +210,44 @@
* aliases to be defined)
*/
.macro pixld1_s elem_size, reg1, mem_operand
-.if elem_size == 16
+.if \elem_size == 16
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #1
+ add TMP1, \mem_operand, TMP1, asl #1
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP2, mem_operand, TMP2, asl #1
- vld1.16 {d&reg1&[0]}, [TMP1, :16]
+ add TMP2, \mem_operand, TMP2, asl #1
+ vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #1
- vld1.16 {d&reg1&[1]}, [TMP2, :16]
+ add TMP1, \mem_operand, TMP1, asl #1
+ vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP2, mem_operand, TMP2, asl #1
- vld1.16 {d&reg1&[2]}, [TMP1, :16]
- vld1.16 {d&reg1&[3]}, [TMP2, :16]
-.elseif elem_size == 32
+ add TMP2, \mem_operand, TMP2, asl #1
+ vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
+ vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]
+.elseif \elem_size == 32
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #2
+ add TMP1, \mem_operand, TMP1, asl #2
mov TMP2, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP2, mem_operand, TMP2, asl #2
- vld1.32 {d&reg1&[0]}, [TMP1, :32]
- vld1.32 {d&reg1&[1]}, [TMP2, :32]
+ add TMP2, \mem_operand, TMP2, asl #2
+ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
+ vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]
.else
.error "unsupported"
.endif
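The VX/UNIT_X bookkeeping above repeats one small fixed-point step per loaded
pixel; the repeated subtraction wraps the coordinate at the source width, which
is how the repeat-capable nearest fast paths tile the source. A literal C
transcription of one step (the names mirror the register aliases; the helper
itself is illustrative, not pixman API) is shown below. Note also that unified
syntax puts the S-for-flags suffix before the condition, which is why every
"subpls" in this hunk becomes "subspl":

    #include <stdint.h>

    /* mov TMP, VX, asr #16 / adds VX, VX, UNIT_X /
     * 5: subspl VX, VX, SRC_WIDTH_FIXED / bpl 5b */
    static inline int32_t
    step_source_x (int32_t *vx, int32_t unit_x, int32_t src_width_fixed)
    {
        int32_t x = *vx >> 16;       /* 16.16 fixed point -> pixel index */
        *vx += unit_x;               /* advance one destination pixel    */
        while (*vx >= 0)             /* wrap when the biased coordinate  */
            *vx -= src_width_fixed;  /* crosses zero                     */
        return x;
    }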
@@ -257,110 +257,110 @@
.if 0 /* elem_size == 32 */
mov TMP1, VX, asr #16
add VX, VX, UNIT_X, asl #1
- add TMP1, mem_operand, TMP1, asl #2
+ add TMP1, \mem_operand, TMP1, asl #2
mov TMP2, VX, asr #16
sub VX, VX, UNIT_X
- add TMP2, mem_operand, TMP2, asl #2
- vld1.32 {d&reg1&[0]}, [TMP1, :32]
+ add TMP2, \mem_operand, TMP2, asl #2
+ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
mov TMP1, VX, asr #16
add VX, VX, UNIT_X, asl #1
- add TMP1, mem_operand, TMP1, asl #2
- vld1.32 {d&reg2&[0]}, [TMP2, :32]
+ add TMP1, \mem_operand, TMP1, asl #2
+ vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]
mov TMP2, VX, asr #16
add VX, VX, UNIT_X
- add TMP2, mem_operand, TMP2, asl #2
- vld1.32 {d&reg1&[1]}, [TMP1, :32]
- vld1.32 {d&reg2&[1]}, [TMP2, :32]
+ add TMP2, \mem_operand, TMP2, asl #2
+ vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]
+ vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]
.else
- pixld1_s elem_size, reg1, mem_operand
- pixld1_s elem_size, reg2, mem_operand
+ pixld1_s \elem_size, \reg1, \mem_operand
+ pixld1_s \elem_size, \reg2, \mem_operand
.endif
.endm
.macro pixld0_s elem_size, reg1, idx, mem_operand
-.if elem_size == 16
+.if \elem_size == 16
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #1
- vld1.16 {d&reg1&[idx]}, [TMP1, :16]
-.elseif elem_size == 32
+ add TMP1, \mem_operand, TMP1, asl #1
+ vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
+.elseif \elem_size == 32
mov TMP1, VX, asr #16
adds VX, VX, UNIT_X
-5: subpls VX, VX, SRC_WIDTH_FIXED
+5: subspl VX, VX, SRC_WIDTH_FIXED
bpl 5b
- add TMP1, mem_operand, TMP1, asl #2
- vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+ add TMP1, \mem_operand, TMP1, asl #2
+ vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
.endif
.endm
.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
-.if numbytes == 32
- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
- pixdeinterleave elem_size, %(basereg+4)
-.elseif numbytes == 16
- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
-.elseif numbytes == 8
- pixld1_s elem_size, %(basereg+1), mem_operand
-.elseif numbytes == 4
- .if elem_size == 32
- pixld0_s elem_size, %(basereg+0), 1, mem_operand
- .elseif elem_size == 16
- pixld0_s elem_size, %(basereg+0), 2, mem_operand
- pixld0_s elem_size, %(basereg+0), 3, mem_operand
+.if \numbytes == 32
+ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
+ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
+ pixdeinterleave \elem_size, %(\basereg+4)
+.elseif \numbytes == 16
+ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
+.elseif \numbytes == 8
+ pixld1_s \elem_size, %(\basereg+1), \mem_operand
+.elseif \numbytes == 4
+ .if \elem_size == 32
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+ .elseif \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
.else
- pixld0_s elem_size, %(basereg+0), 4, mem_operand
- pixld0_s elem_size, %(basereg+0), 5, mem_operand
- pixld0_s elem_size, %(basereg+0), 6, mem_operand
- pixld0_s elem_size, %(basereg+0), 7, mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
.endif
-.elseif numbytes == 2
- .if elem_size == 16
- pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.elseif \numbytes == 2
+ .if \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
.else
- pixld0_s elem_size, %(basereg+0), 2, mem_operand
- pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
.endif
-.elseif numbytes == 1
- pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.elseif \numbytes == 1
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
.macro pixld_s numpix, bpp, basereg, mem_operand
-.if bpp > 0
- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.if \bpp > 0
+ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
.endif
.endm
.macro vuzp8 reg1, reg2
- vuzp.8 d&reg1, d&reg2
+ vuzp.8 d\()\reg1, d\()\reg2
.endm
.macro vzip8 reg1, reg2
- vzip.8 d&reg1, d&reg2
+ vzip.8 d\()\reg1, d\()\reg2
.endm
/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- vuzp8 %(basereg+0), %(basereg+1)
- vuzp8 %(basereg+2), %(basereg+3)
- vuzp8 %(basereg+1), %(basereg+3)
- vuzp8 %(basereg+0), %(basereg+2)
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vuzp8 %(\basereg+0), %(\basereg+1)
+ vuzp8 %(\basereg+2), %(\basereg+3)
+ vuzp8 %(\basereg+1), %(\basereg+3)
+ vuzp8 %(\basereg+0), %(\basereg+2)
.endif
.endm
/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
- vzip8 %(basereg+0), %(basereg+2)
- vzip8 %(basereg+1), %(basereg+3)
- vzip8 %(basereg+2), %(basereg+3)
- vzip8 %(basereg+0), %(basereg+1)
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vzip8 %(\basereg+0), %(\basereg+2)
+ vzip8 %(\basereg+1), %(\basereg+3)
+ vzip8 %(\basereg+2), %(\basereg+3)
+ vzip8 %(\basereg+0), %(\basereg+1)
.endif
.endm
@@ -394,22 +394,22 @@
*/
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
- a x
+ \a \x
.endif
.endm
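/* Note that the instruction mnemonic is now a macro argument of its own,
 * separated from the operand list by a comma ("PF add, PF_X, ..." below);
 * with the \a \x expansion this presumably keeps the macro working on
 * assemblers, such as clang's integrated assembler, that only accept
 * comma-separated macro arguments. */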
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
- PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+ PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
-.if std_increment != 0
- PF add PF_X, PF_X, #std_increment
+.if \std_increment != 0
+ PF add, PF_X, PF_X, #\std_increment
.endif
- PF tst PF_CTL, #0xF
- PF addne PF_X, PF_X, #boost_increment
- PF subne PF_CTL, PF_CTL, #1
- PF cmp PF_X, ORIG_W
+ PF tst, PF_CTL, #0xF
+ PF addne, PF_X, PF_X, #\boost_increment
+ PF subne, PF_CTL, PF_CTL, #1
+ PF cmp, PF_X, ORIG_W
.if src_bpp_shift >= 0
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
@@ -419,16 +419,16 @@
.if mask_bpp_shift >= 0
PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
- PF subge PF_X, PF_X, ORIG_W
- PF subges PF_CTL, PF_CTL, #0x10
+ PF subge, PF_X, PF_X, ORIG_W
+ PF subsge, PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
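In rough C, the per-block bookkeeping that cache_preload performs looks like
the sketch below. This is a simplification for a single channel; the struct,
field and helper names are illustrative, and __builtin_prefetch stands in for
the PLD instruction:

    #include <stdint.h>

    struct pf_state {
        int32_t  pf_x;    /* prefetch position within the scanline (pixels) */
        int32_t  pf_ctl;  /* (height - 1) << 4, low nibble = boost credits  */
        uint8_t *pf_src;  /* scanline currently being prefetched            */
    };

    static void
    cache_preload_c (struct pf_state *p, int std_inc, int boost_inc,
                     int orig_w, int bpp_shift, intptr_t stride)
    {
        p->pf_x += std_inc;
        if (p->pf_ctl & 0xf)            /* tst PF_CTL, #0xF               */
        {
            p->pf_x += boost_inc;       /* addne                          */
            p->pf_ctl--;                /* subne                          */
        }
        __builtin_prefetch (p->pf_src + ((intptr_t) p->pf_x << bpp_shift));
        if (p->pf_x >= orig_w)          /* ran off the current scanline   */
        {
            p->pf_x -= orig_w;          /* subge                          */
            p->pf_ctl -= 0x10;          /* subsge: one scanline fewer     */
            if (p->pf_ctl >= 0)         /* ldrbge ...!: step to the next  */
            {                           /* scanline and touch it so its   */
                p->pf_src += stride << bpp_shift;  /* TLB entry is live   */
                (void) *(volatile uint8_t *) p->pf_src;
            }
        }
    }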
@@ -465,21 +465,20 @@
beq 2f
.irp lowbit, 1, 2, 4, 8, 16
-local skip1
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
- tst DST_R, #lowbit
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_R, #\lowbit
beq 1f
.endif
- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
- add DST_R, DST_R, #lowbit
+ add DST_R, DST_R, #\lowbit
.endif
- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
- sub W, W, #(lowbit * 8 / dst_w_bpp)
+ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
+ sub W, W, #(\lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
@@ -487,19 +486,19 @@ local skip1
pixdeinterleave mask_bpp, mask_basereg
pixdeinterleave dst_r_bpp, dst_r_basereg
- process_pixblock_head
+ \process_pixblock_head
cache_preload 0, pixblock_size
cache_preload_simple
- process_pixblock_tail
+ \process_pixblock_tail
pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
-.if lowbit < 16 /* we don't need more than 16-byte alignment */
- tst DST_W, #lowbit
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_W, #\lowbit
beq 1f
.endif
- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
@@ -530,18 +529,18 @@ local skip1
tst W, #(pixblock_size - 1)
beq 2f
.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
- tst W, #chunk_size
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
beq 1f
- pixld_src chunk_size, src_bpp, src_basereg, SRC
- pixld chunk_size, mask_bpp, mask_basereg, MASK
-.if dst_aligned_flag != 0
- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+ pixld_src \chunk_size, src_bpp, src_basereg, SRC
+ pixld \chunk_size, mask_bpp, mask_basereg, MASK
+.if \dst_aligned_flag != 0
+ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
-.if cache_preload_flag != 0
- PF add PF_X, PF_X, #chunk_size
+.if \cache_preload_flag != 0
+ PF add, PF_X, PF_X, #\chunk_size
.endif
1:
.endif
@@ -550,21 +549,21 @@ local skip1
pixdeinterleave mask_bpp, mask_basereg
pixdeinterleave dst_r_bpp, dst_r_basereg
- process_pixblock_head
-.if cache_preload_flag != 0
+ \process_pixblock_head
+.if \cache_preload_flag != 0
cache_preload 0, pixblock_size
cache_preload_simple
.endif
- process_pixblock_tail
+ \process_pixblock_tail
pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
-.if pixblock_size > chunk_size
- tst W, #chunk_size
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
beq 1f
-.if dst_aligned_flag != 0
- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.if \dst_aligned_flag != 0
+ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
@@ -604,7 +603,7 @@ local skip1
.if regs_shortage
str H, [sp, #4] /* save updated height to stack */
.endif
- bge start_of_loop_label
+ bge \start_of_loop_label
.endm
/*
@@ -631,7 +630,7 @@ local skip1
src_basereg_ = 0, \
mask_basereg_ = 24
- pixman_asm_function fname
+ pixman_asm_function \fname
push {r4-r12, lr} /* save all registers */
@@ -641,10 +640,10 @@ local skip1
* has to be used instead of ADVANCED.
*/
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
-.if prefetch_distance == 0
+.if \prefetch_distance == 0
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif
@@ -652,17 +651,17 @@ local skip1
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set pixblock_size, pixblock_size_
- .set dst_w_basereg, dst_w_basereg_
- .set dst_r_basereg, dst_r_basereg_
- .set src_basereg, src_basereg_
- .set mask_basereg, mask_basereg_
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
.macro pixld_src x:vararg
- pixld x
+ pixld \x
.endm
.macro fetch_src_pixblock
pixld_src pixblock_size, src_bpp, \
@@ -755,19 +754,19 @@ local skip1
.error "requested dst bpp (dst_w_bpp) is not supported"
.endif
-.if (((flags) & FLAG_DST_READWRITE) != 0)
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.else
.set dst_r_bpp, 0
.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
.set DEINTERLEAVE_32BPP_ENABLED, 1
.else
.set DEINTERLEAVE_32BPP_ENABLED, 0
.endif
-.if prefetch_distance < 0 || prefetch_distance > 15
- .error "invalid prefetch distance (prefetch_distance)"
+.if \prefetch_distance < 0 || \prefetch_distance > 15
+ .error "invalid prefetch distance (\prefetch_distance)"
.endif
.if src_bpp > 0
@@ -776,7 +775,7 @@ local skip1
.if mask_bpp > 0
ldr MASK, [sp, #48]
.endif
- PF mov PF_X, #0
+ PF mov, PF_X, #0
.if src_bpp > 0
ldr SRC_STRIDE, [sp, #44]
.endif
@@ -801,14 +800,14 @@ local skip1
/*
 * Set up the advanced prefetcher's initial state
*/
- PF mov PF_SRC, SRC
- PF mov PF_DST, DST_R
- PF mov PF_MASK, MASK
+ PF mov, PF_SRC, SRC
+ PF mov, PF_DST, DST_R
+ PF mov, PF_MASK, MASK
/* PF_CTL = prefetch_distance | ((h - 1) << 4) */
- PF mov PF_CTL, H, lsl #4
- PF add PF_CTL, #(prefetch_distance - 0x10)
+ PF mov, PF_CTL, H, lsl #4
+ PF add, PF_CTL, #(\prefetch_distance - 0x10)
- init
+ \init
.if regs_shortage
push {r0, r1}
.endif
@@ -826,9 +825,9 @@ local skip1
* long scanlines
*/
0:
- ensure_destination_ptr_alignment process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
/* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
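/* i.e. roughly:
 *     head (block 0);
 *     for (n = 1; n < nblocks; n++)
 *         tail_head ();   (tail of block n-1, store, load, head of block n)
 *     tail (last block); store;
 * so the memory accesses for one block overlap the arithmetic of its
 * neighbours. */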
pixld_a pixblock_size, dst_r_bpp, \
@@ -836,33 +835,33 @@ local skip1
fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- PF add PF_X, PF_X, #pixblock_size
- process_pixblock_head
+ PF add, PF_X, PF_X, #pixblock_size
+ \process_pixblock_head
cache_preload 0, pixblock_size
cache_preload_simple
subs W, W, #(pixblock_size * 2)
blt 2f
1:
- process_pixblock_tail_head
+ \process_pixblock_tail_head
cache_preload_simple
subs W, W, #pixblock_size
bge 1b
2:
- process_pixblock_tail
+ \process_pixblock_tail
pixst_a pixblock_size, dst_w_bpp, \
(dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
/* Process the remaining trailing pixels in the scanline */
process_trailing_pixels 1, 1, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
advance_to_next_scanline 0b
.if regs_shortage
pop {r0, r1}
.endif
- cleanup
+ \cleanup
pop {r4-r12, pc} /* exit */
/*
* This is the start of the loop, designed to process images with small width
@@ -878,22 +877,22 @@ local skip1
fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- process_pixblock_head
- process_pixblock_tail
+ \process_pixblock_head
+ \process_pixblock_tail
pixst pixblock_size, dst_w_bpp, \
(dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
/* Process the remaining trailing pixels in the scanline */
process_trailing_pixels 0, 0, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
advance_to_next_scanline 8b
9:
.if regs_shortage
pop {r0, r1}
.endif
- cleanup
+ \cleanup
pop {r4-r12, pc} /* exit */
.purgem fetch_src_pixblock
@@ -915,7 +914,7 @@ local skip1
.unreq PF_DST
.unreq PF_MASK
.unreq DUMMY
- .endfunc
+ pixman_end_asm_function
.endm
/*
@@ -939,23 +938,23 @@ local skip1
src_basereg_ = 0, \
mask_basereg_ = 24
- pixman_asm_function fname
+ pixman_asm_function \fname
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set pixblock_size, pixblock_size_
- .set dst_w_basereg, dst_w_basereg_
- .set dst_r_basereg, dst_r_basereg_
- .set src_basereg, src_basereg_
- .set mask_basereg, mask_basereg_
-
-.if use_nearest_scaling != 0
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
+
+.if \use_nearest_scaling != 0
/*
* Assign symbolic names to registers for nearest scaling
*/
@@ -971,7 +970,7 @@ local skip1
SRC_WIDTH_FIXED .req r7
.macro pixld_src x:vararg
- pixld_s x
+ pixld_s \x
.endm
ldr UNIT_X, [sp]
@@ -991,16 +990,16 @@ local skip1
MASK .req r3 /* mask pointer */
.macro pixld_src x:vararg
- pixld x
+ pixld \x
.endm
.endif
-.if (((flags) & FLAG_DST_READWRITE) != 0)
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
.set dst_r_bpp, dst_w_bpp
.else
.set dst_r_bpp, 0
.endif
-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
.set DEINTERLEAVE_32BPP_ENABLED, 1
.else
.set DEINTERLEAVE_32BPP_ENABLED, 0
@@ -1011,15 +1010,15 @@ local skip1
(src_basereg - pixblock_size * src_bpp / 64), SRC
.endm
- init
+ \init
mov DST_R, DST_W
cmp W, #pixblock_size
blt 8f
- ensure_destination_ptr_alignment process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
subs W, W, #pixblock_size
blt 7f
@@ -1030,26 +1029,26 @@ local skip1
fetch_src_pixblock
pixld pixblock_size, mask_bpp, \
(mask_basereg - pixblock_size * mask_bpp / 64), MASK
- process_pixblock_head
+ \process_pixblock_head
subs W, W, #pixblock_size
blt 2f
1:
- process_pixblock_tail_head
+ \process_pixblock_tail_head
subs W, W, #pixblock_size
bge 1b
2:
- process_pixblock_tail
+ \process_pixblock_tail
pixst_a pixblock_size, dst_w_bpp, \
(dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
/* Process the remaining trailing pixels in the scanline (dst aligned) */
process_trailing_pixels 0, 1, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
- cleanup
-.if use_nearest_scaling != 0
+ \cleanup
+.if \use_nearest_scaling != 0
pop {r4-r8, pc} /* exit */
.else
bx lr /* exit */
@@ -1057,13 +1056,13 @@ local skip1
8:
/* Process the remaining trailing pixels in the scanline (dst unaligned) */
process_trailing_pixels 0, 0, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
- cleanup
+ \cleanup
-.if use_nearest_scaling != 0
+.if \use_nearest_scaling != 0
pop {r4-r8, pc} /* exit */
.unreq DST_R
@@ -1090,15 +1089,15 @@ local skip1
.purgem fetch_src_pixblock
.purgem pixld_src
- .endfunc
+ pixman_end_asm_function
.endm
.macro generate_composite_function_single_scanline x:vararg
- generate_composite_function_scanline 0, x
+ generate_composite_function_scanline 0, \x
.endm
.macro generate_composite_function_nearest_scanline x:vararg
- generate_composite_function_scanline 1, x
+ generate_composite_function_scanline 1, \x
.endm
/* Default prologue/epilogue, nothing special needs to be done */
@@ -1134,22 +1133,22 @@ local skip1
* value (in) is lost.
*/
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
- vshrn.u16 out_r, in, #8
- vshrn.u16 out_g, in, #3
- vsli.u16 in, in, #5
- vmov.u8 out_a, #255
- vsri.u8 out_r, out_r, #5
- vsri.u8 out_g, out_g, #6
- vshrn.u16 out_b, in, #2
+ vshrn.u16 \out_r, \in, #8
+ vshrn.u16 \out_g, \in, #3
+ vsli.u16 \in, \in, #5
+ vmov.u8 \out_a, #255
+ vsri.u8 \out_r, \out_r, #5
+ vsri.u8 \out_g, \out_g, #6
+ vshrn.u16 \out_b, \in, #2
.endm
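For reference, the per-pixel arithmetic of the macro above written as plain C
(the NEON version performs it for eight pixels at once with narrowing shifts
and shift-with-insert, which is why the input register is clobbered):

    #include <stdint.h>

    static inline uint32_t
    convert_0565_to_8888_c (uint16_t s)
    {
        uint32_t r = (s >> 11) & 0x1f;
        uint32_t g = (s >> 5)  & 0x3f;
        uint32_t b =  s        & 0x1f;

        /* Replicate the top bits into the low bits so that 0x1f -> 0xff
         * and 0x00 -> 0x00 exactly (this is what vsri achieves). */
        r = (r << 3) | (r >> 2);
        g = (g << 2) | (g >> 4);
        b = (b << 3) | (b >> 2);

        return 0xff000000u | (r << 16) | (g << 8) | b;
    }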
.macro convert_0565_to_x888 in, out_r, out_g, out_b
- vshrn.u16 out_r, in, #8
- vshrn.u16 out_g, in, #3
- vsli.u16 in, in, #5
- vsri.u8 out_r, out_r, #5
- vsri.u8 out_g, out_g, #6
- vshrn.u16 out_b, in, #2
+ vshrn.u16 \out_r, \in, #8
+ vshrn.u16 \out_g, \in, #3
+ vsli.u16 \in, \in, #5
+ vsri.u8 \out_r, \out_r, #5
+ vsri.u8 \out_g, \out_g, #6
+ vshrn.u16 \out_b, \in, #2
.endm
/*
@@ -1159,11 +1158,11 @@ local skip1
* registers (tmp1, tmp2)
*/
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
- vshll.u8 tmp1, in_g, #8
- vshll.u8 out, in_r, #8
- vshll.u8 tmp2, in_b, #8
- vsri.u16 out, tmp1, #5
- vsri.u16 out, tmp2, #11
+ vshll.u8 \tmp1, \in_g, #8
+ vshll.u8 \out, \in_r, #8
+ vshll.u8 \tmp2, \in_b, #8
+ vsri.u16 \out, \tmp1, #5
+ vsri.u16 \out, \tmp2, #11
.endm
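The reverse direction is a plain truncation; a one-pixel C model of the
vshll/vsri sequence above:

    #include <stdint.h>

    static inline uint16_t
    convert_8888_to_0565_c (uint32_t s)
    {
        return (uint16_t) ((((s >> 16) & 0xf8) << 8) |  /* R: top 5 bits */
                           (((s >> 8)  & 0xfc) << 3) |  /* G: top 6 bits */
                           (( s        & 0xf8) >> 3));  /* B: top 5 bits */
    }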
/*
@@ -1173,12 +1172,12 @@ local skip1
* value from 'in' is lost
*/
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
- vshl.u16 out0, in, #5 /* G top 6 bits */
- vshl.u16 tmp, in, #11 /* B top 5 bits */
- vsri.u16 in, in, #5 /* R is ready in top bits */
- vsri.u16 out0, out0, #6 /* G is ready in top bits */
- vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
- vshr.u16 out1, in, #8 /* R is in place */
- vsri.u16 out0, tmp, #8 /* G & B is in place */
- vzip.u16 out0, out1 /* everything is in place */
+ vshl.u16 \out0, \in, #5 /* G top 6 bits */
+ vshl.u16 \tmp, \in, #11 /* B top 5 bits */
+ vsri.u16 \in, \in, #5 /* R is ready in top bits */
+ vsri.u16 \out0, \out0, #6 /* G is ready in top bits */
+ vsri.u16 \tmp, \tmp, #5 /* B is ready in top bits */
+ vshr.u16 \out1, \in, #8 /* R is in place */
+ vsri.u16 \out0, \tmp, #8 /* G & B is in place */
+ vzip.u16 \out0, \out1 /* everything is in place */
.endm
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 60e9c78..103f1c2 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -27,7 +27,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <string.h>
@@ -194,7 +194,7 @@ arm_neon_fill (pixman_implementation_t *imp,
uint32_t _xor)
{
 /* stride is always a multiple of 32-bit units in pixman */
- uint32_t byte_stride = stride * sizeof(uint32_t);
+ int32_t byte_stride = stride * sizeof(uint32_t);
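      /* Pixman strides are signed and may be negative (e.g. for bottom-up
       * images), so the byte stride must stay signed as well for the row
       * stepping below to move in the right direction. */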
switch (bpp)
{
@@ -331,6 +331,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, b5g6r5, neon_composite_over_8888_8_0565),
PIXMAN_STD_FAST_PATH (OVER, r5g6b5, a8, r5g6b5, neon_composite_over_0565_8_0565),
PIXMAN_STD_FAST_PATH (OVER, b5g6r5, a8, b5g6r5, neon_composite_over_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_over_8888_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, neon_composite_over_8888_0565),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, neon_composite_over_8888_0565),
@@ -341,17 +342,33 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, a8r8g8b8, neon_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, a8b8g8r8, neon_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, neon_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, neon_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, neon_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, neon_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, neon_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8),
PIXMAN_STD_FAST_PATH (ADD, r5g6b5, a8, r5g6b5, neon_composite_add_0565_8_0565),
PIXMAN_STD_FAST_PATH (ADD, b5g6r5, a8, b5g6r5, neon_composite_add_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, a8, x8r8g8b8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, x8r8g8b8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8b8g8r8, a8, x8b8g8r8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, x8b8g8r8, neon_composite_add_8888_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, neon_composite_add_8888_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, solid, x8r8g8b8, neon_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, x8r8g8b8, neon_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8b8g8r8, solid, x8b8g8r8, neon_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, x8b8g8r8, neon_composite_add_8888_n_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, a8r8g8b8, neon_composite_add_8888_n_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, a8b8g8r8, neon_composite_add_8888_n_8888),
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, null, x8r8g8b8, neon_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, x8r8g8b8, neon_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, x8b8g8r8, null, x8b8g8r8, neon_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, x8b8g8r8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (IN, solid, null, a8, neon_composite_in_n_8),
@@ -359,24 +376,26 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, neon_composite_out_reverse_8_0565),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, b5g6r5, neon_composite_out_reverse_8_0565),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, x8r8g8b8, neon_composite_out_reverse_8_8888),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8r8g8b8, neon_composite_out_reverse_8_8888),
+ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, x8b8g8r8, neon_composite_out_reverse_8_8888),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8b8g8r8, neon_composite_out_reverse_8_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
/* Note: NONE repeat is not supported yet */
SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
@@ -404,6 +423,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8888),
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
@@ -420,6 +440,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
{ PIXMAN_OP_NONE },
};
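/* The x8r8g8b8/x8b8g8r8 destination entries added above reuse the existing
 * a8r8g8b8 helpers: the alpha byte of an x8 destination is undefined, so a
 * routine that also computes and stores alpha yields the same visible
 * result and can safely be shared. */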
diff --git a/pixman/pixman-arm-simd-asm-scaled.S b/pixman/pixman-arm-simd-asm-scaled.S
index e050292..cc62c81 100644
--- a/pixman/pixman-arm-simd-asm-scaled.S
+++ b/pixman/pixman-arm-simd-asm-scaled.S
@@ -39,6 +39,8 @@
#include "pixman-arm-asm.h"
+ pixman_syntax_unified
+
/*
* Note: This code is only using armv5te instructions (not even armv6),
* but is scheduled for ARM Cortex-A8 pipeline. So it might need to
@@ -62,7 +64,7 @@
prefetch_distance, \
prefetch_braking_distance
-pixman_asm_function fname
+pixman_asm_function \fname
W .req r0
DST .req r1
SRC .req r2
@@ -76,39 +78,39 @@ pixman_asm_function fname
ldr UNIT_X, [sp]
push {r4, r5, r6, r7, r8, r10}
- mvn VXMASK, #((1 << bpp_shift) - 1)
+ mvn VXMASK, #((1 << \bpp_shift) - 1)
ldr SRC_WIDTH_FIXED, [sp, #28]
/* define helper macro */
.macro scale_2_pixels
- ldr&t TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+ ldr\()\t TMP1, [SRC, TMP1]
+ and TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
- str&t TMP1, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
+ str\()\t TMP1, [DST], #(1 << \bpp_shift)
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
- ldr&t TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ ldr\()\t TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
- str&t TMP2, [DST], #(1 << bpp_shift)
-9: subpls VX, VX, SRC_WIDTH_FIXED
+ str\()\t TMP2, [DST], #(1 << \bpp_shift)
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
.endm
/* now do the scaling */
- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
adds VX, VX, UNIT_X
-9: subpls VX, VX, SRC_WIDTH_FIXED
+9: subspl VX, VX, SRC_WIDTH_FIXED
bpl 9b
- subs W, W, #(8 + prefetch_braking_distance)
+ subs W, W, #(8 + \prefetch_braking_distance)
blt 2f
/* calculate prefetch offset */
- mov PF_OFFS, #prefetch_distance
+ mov PF_OFFS, #\prefetch_distance
mla PF_OFFS, UNIT_X, PF_OFFS, VX
1: /* main loop, process 8 pixels per iteration with prefetch */
- pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
- add PF_OFFS, UNIT_X, lsl #3
+ pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)]
+ add PF_OFFS, PF_OFFS, UNIT_X, lsl #3
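 /* (the old two-operand "add PF_OFFS, UNIT_X, lsl #3" relied on the
  * deprecated implicit-destination form; unified syntax requires the
  * destination register to be spelled out, as with the orr fixes below) */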
scale_2_pixels
scale_2_pixels
scale_2_pixels
@@ -116,7 +118,7 @@ pixman_asm_function fname
subs W, W, #8
bge 1b
2:
- subs W, W, #(4 - 8 - prefetch_braking_distance)
+ subs W, W, #(4 - 8 - \prefetch_braking_distance)
blt 2f
1: /* process the remaining pixels */
scale_2_pixels
@@ -129,8 +131,8 @@ pixman_asm_function fname
scale_2_pixels
2:
tst W, #1
- ldrne&t TMP1, [SRC, TMP1]
- strne&t TMP1, [DST]
+ ldr\()\t\()ne TMP1, [SRC, TMP1]
+ str\()\t\()ne TMP1, [DST]
/* cleanup helper macro */
.purgem scale_2_pixels
.unreq DST
@@ -146,7 +148,7 @@ pixman_asm_function fname
/* return */
pop {r4, r5, r6, r7, r8, r10}
bx lr
-.endfunc
+ pixman_end_asm_function
.endm
generate_nearest_scanline_func \
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 7b0727b..34d38f1 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -40,6 +40,8 @@
#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"
+ pixman_syntax_unified
+
/* A head macro should do all processing which results in an output of up to
* 16 bytes, as far as the final load instruction. The corresponding tail macro
* should complete the processing of the up-to-16 bytes. The calling macro will
@@ -57,7 +59,7 @@
.endm
.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld cond, numbytes, firstreg, SRC, unaligned_src
+ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm
.macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
@@ -65,8 +67,8 @@
WK5 .req STRIDE_S
WK6 .req MASK
WK7 .req STRIDE_M
-110: pixld , 16, 0, SRC, unaligned_src
- pixld , 16, 4, SRC, unaligned_src
+110: pixld , 16, 0, SRC, \unaligned_src
+ pixld , 16, 4, SRC, \unaligned_src
pld [SRC, SCRATCH]
pixst , 16, 0, DST
pixst , 16, 4, DST
@@ -122,7 +124,7 @@ generate_composite_function \
.macro src_n_0565_init
ldrh SRC, [sp, #ARGS_STACK_OFFSET]
- orr SRC, SRC, lsl #16
+ orr SRC, SRC, SRC, lsl #16
mov STRIDE_S, SRC
mov MASK, SRC
mov STRIDE_M, SRC
@@ -130,8 +132,8 @@ generate_composite_function \
.macro src_n_8_init
ldrb SRC, [sp, #ARGS_STACK_OFFSET]
- orr SRC, SRC, lsl #8
- orr SRC, SRC, lsl #16
+ orr SRC, SRC, SRC, lsl #8
+ orr SRC, SRC, SRC, lsl #16
mov STRIDE_S, SRC
mov MASK, SRC
mov STRIDE_M, SRC
@@ -142,7 +144,7 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req MASK
WK7 .req STRIDE_M
- pixst cond, numbytes, 4, DST
+ pixst \cond, \numbytes, 4, DST
.unreq WK4
.unreq WK5
.unreq WK6
@@ -182,20 +184,20 @@ generate_composite_function \
/******************************************************************************/
.macro src_x888_8888_pixel, cond, reg
- orr&cond WK&reg, WK&reg, #0xFF000000
+ orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
.endm
.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld cond, numbytes, firstreg, SRC, unaligned_src
+ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm
.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
- src_x888_8888_pixel cond, %(firstreg+0)
- .if numbytes >= 8
- src_x888_8888_pixel cond, %(firstreg+1)
- .if numbytes == 16
- src_x888_8888_pixel cond, %(firstreg+2)
- src_x888_8888_pixel cond, %(firstreg+3)
+ src_x888_8888_pixel \cond, %(\firstreg+0)
+ .if \numbytes >= 8
+ src_x888_8888_pixel \cond, %(\firstreg+1)
+ .if \numbytes == 16
+ src_x888_8888_pixel \cond, %(\firstreg+2)
+ src_x888_8888_pixel \cond, %(\firstreg+3)
.endif
.endif
.endm
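In scalar terms the whole x888-to-8888 conversion above is just:

    dst = src | 0xff000000;   /* force the alpha byte; RGB passes through */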
@@ -222,73 +224,73 @@ generate_composite_function \
.endm
.macro src_0565_8888_2pixels, reg1, reg2
- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
- bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
- mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
- mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
- bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
- pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
- sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
- mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
- pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
- sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
- orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
- orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
+ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
+ bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+ pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
+ sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
+ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
+ pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
+ sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+ orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm
/* This version doesn't need STRIDE_M, but is one instruction longer.
 It would, however, be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
- bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
- mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
- mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
- bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
- mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
- mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
- pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
- pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
- sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
- sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
- orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
- orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+ bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+ mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
+ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
+ bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+ mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
+ mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
+ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
+ pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+ sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
*/
.macro src_0565_8888_1pixel, reg
- bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
- and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
- mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
- mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
- orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
- orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
- pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
- sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
- orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb
+ and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000
+ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+ mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000
+ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+ orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
+ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+ sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
+ orr WK\()\reg, WK\()\reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm
.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 16
- pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
- .elseif numbytes == 8
- pixld , 4, firstreg, SRC, unaligned_src
- .elseif numbytes == 4
- pixld , 2, firstreg, SRC, unaligned_src
+ .if \numbytes == 16
+ pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
+ .elseif \numbytes == 8
+ pixld , 4, \firstreg, SRC, \unaligned_src
+ .elseif \numbytes == 4
+ pixld , 2, \firstreg, SRC, \unaligned_src
.endif
.endm
.macro src_0565_8888_process_tail cond, numbytes, firstreg
- .if numbytes == 16
- src_0565_8888_2pixels firstreg, %(firstreg+1)
- src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
- src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .if \numbytes == 16
+ src_0565_8888_2pixels \firstreg, %(\firstreg+1)
+ src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+ src_0565_8888_2pixels \firstreg, %(\firstreg+1)
.else
- src_0565_8888_1pixel firstreg
+ src_0565_8888_1pixel \firstreg
.endif
.endm
@@ -311,23 +313,23 @@ generate_composite_function \
.endm
.macro src_x888_0565_1pixel s, d
- and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
- and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
- orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
- orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+ and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+ and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000
+ orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+ orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
/* Top 16 bits are discarded during the following STRH */
.endm
.macro src_x888_0565_2pixels slo, shi, d, tmp
- and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
- and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
- and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
- orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
- orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
- and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
- orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
- orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
- pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+ and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
+ and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
+ and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+ orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
+ orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
+ and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000
+ orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+ orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+ pkhbt WK\()\d, WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm
.macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
@@ -335,33 +337,33 @@ generate_composite_function \
WK5 .req STRIDE_M
WK6 .req WK3
WK7 .req ORIG_W
- .if numbytes == 16
+ .if \numbytes == 16
pixld , 16, 4, SRC, 0
src_x888_0565_2pixels 4, 5, 0, 0
pixld , 8, 4, SRC, 0
src_x888_0565_2pixels 6, 7, 1, 1
pixld , 8, 6, SRC, 0
.else
- pixld , numbytes*2, 4, SRC, 0
+ pixld , \numbytes*2, 4, SRC, 0
.endif
.endm
.macro src_x888_0565_process_tail cond, numbytes, firstreg
- .if numbytes == 16
+ .if \numbytes == 16
src_x888_0565_2pixels 4, 5, 2, 2
src_x888_0565_2pixels 6, 7, 3, 4
- .elseif numbytes == 8
+ .elseif \numbytes == 8
src_x888_0565_2pixels 4, 5, 1, 1
src_x888_0565_2pixels 6, 7, 2, 2
- .elseif numbytes == 4
+ .elseif \numbytes == 4
src_x888_0565_2pixels 4, 5, 1, 1
.else
src_x888_0565_1pixel 4, 1
.endif
- .if numbytes == 16
- pixst , numbytes, 0, DST
+ .if \numbytes == 16
+ pixst , \numbytes, 0, DST
.else
- pixst , numbytes, 1, DST
+ pixst , \numbytes, 1, DST
.endif
.unreq WK4
.unreq WK5
@@ -382,37 +384,37 @@ generate_composite_function \
/******************************************************************************/
.macro add_8_8_8pixels cond, dst1, dst2
- uqadd8&cond WK&dst1, WK&dst1, MASK
- uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
+ uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
+ uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
.endm
.macro add_8_8_4pixels cond, dst
- uqadd8&cond WK&dst, WK&dst, MASK
+ uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
.endm
.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
WK4 .req MASK
WK5 .req STRIDE_M
- .if numbytes == 16
- pixld cond, 8, 4, SRC, unaligned_src
- pixld cond, 16, firstreg, DST, 0
- add_8_8_8pixels cond, firstreg, %(firstreg+1)
- pixld cond, 8, 4, SRC, unaligned_src
+ .if \numbytes == 16
+ pixld \cond, 8, 4, SRC, \unaligned_src
+ pixld \cond, 16, \firstreg, DST, 0
+ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
+ pixld \cond, 8, 4, SRC, \unaligned_src
.else
- pixld cond, numbytes, 4, SRC, unaligned_src
- pixld cond, numbytes, firstreg, DST, 0
+ pixld \cond, \numbytes, 4, SRC, \unaligned_src
+ pixld \cond, \numbytes, \firstreg, DST, 0
.endif
.unreq WK4
.unreq WK5
.endm
.macro add_8_8_process_tail cond, numbytes, firstreg
- .if numbytes == 16
- add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
- .elseif numbytes == 8
- add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .if \numbytes == 16
+ add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
+ .elseif \numbytes == 8
+ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
.else
- add_8_8_4pixels cond, firstreg
+ add_8_8_4pixels \cond, \firstreg
.endif
.endm
@@ -441,8 +443,8 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req STRIDE_M
WK7 .req ORIG_W
- pixld , numbytes, %(4+firstreg), SRC, unaligned_src
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src
+ pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.unreq WK5
.unreq WK6
@@ -451,44 +453,44 @@ generate_composite_function \
.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
 /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
- teq WK&reg0, #0
- .if numbytes > 4
- teqeq WK&reg1, #0
- .if numbytes > 8
- teqeq WK&reg2, #0
- teqeq WK&reg3, #0
+ teq WK\()\reg0, #0
+ .if \numbytes > 4
+ teqeq WK\()\reg1, #0
+ .if \numbytes > 8
+ teqeq WK\()\reg2, #0
+ teqeq WK\()\reg3, #0
.endif
.endif
.endm
.macro over_8888_8888_prepare next
- mov WK&next, WK&next, lsr #24
+ mov WK\()\next, WK\()\next, lsr #24
.endm
.macro over_8888_8888_1pixel src, dst, offset, next
/* src = destination component multiplier */
- rsb WK&src, WK&src, #255
+ rsb WK\()\src, WK\()\src, #255
/* Split even/odd bytes of dst into SCRATCH/dst */
- uxtb16 SCRATCH, WK&dst
- uxtb16 WK&dst, WK&dst, ror #8
+ uxtb16 SCRATCH, WK\()\dst
+ uxtb16 WK\()\dst, WK\()\dst, ror #8
/* Multiply through, adding 0.5 to the upper byte of result for rounding */
- mla SCRATCH, SCRATCH, WK&src, MASK
- mla WK&dst, WK&dst, WK&src, MASK
+ mla SCRATCH, SCRATCH, WK\()\src, MASK
+ mla WK\()\dst, WK\()\dst, WK\()\src, MASK
/* Where we would have had a stall between the result of the first MLA and the shifter input,
* reload the complete source pixel */
- ldr WK&src, [SRC, #offset]
+ ldr WK\()\src, [SRC, #\offset]
/* Multiply by 257/256 to approximate 256/255 */
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
/* In this stall, start processing the next pixel */
- .if offset < -4
- mov WK&next, WK&next, lsr #24
+ .if \offset < -4
+ mov WK\()\next, WK\()\next, lsr #24
.endif
- uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+ uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
/* Recombine even/odd bytes of multiplied destination */
mov SCRATCH, SCRATCH, ror #8
- sel WK&dst, SCRATCH, WK&dst
+ sel WK\()\dst, SCRATCH, WK\()\dst
/* Saturated add of source to multiplied destination */
- uqadd8 WK&dst, WK&dst, WK&src
+ uqadd8 WK\()\dst, WK\()\dst, WK\()\src
.endm
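The macro above is an unrolled, stall-scheduled rendering of the usual
one-pixel OVER computation. In plain C, the same arithmetic, including the
+0x80 rounding constant kept in MASK and the 257/256 correction done by the
uxtab16 instructions, reads roughly as follows (names are illustrative):

    #include <stdint.h>

    static inline uint32_t
    saturated_add_8888 (uint32_t a, uint32_t b)    /* what uqadd8 does */
    {
        uint32_t out = 0;
        int      i;
        for (i = 0; i < 32; i += 8)
        {
            uint32_t s = ((a >> i) & 0xff) + ((b >> i) & 0xff);
            out |= (s > 0xff ? 0xff : s) << i;
        }
        return out;
    }

    static inline uint32_t
    over_8888_8888_1pixel_c (uint32_t src, uint32_t dst)
    {
        uint32_t ia = 255 - (src >> 24);               /* rsb ..., #255 */
        uint32_t rb = (dst & 0x00ff00ffu) * ia         /* even bytes    */
                      + 0x00800080u;                   /* mla ..., MASK */
        uint32_t ag = ((dst >> 8) & 0x00ff00ffu) * ia  /* odd bytes     */
                      + 0x00800080u;

        rb += (rb >> 8) & 0x00ff00ffu;   /* uxtab16 ..., ror #8:        */
        ag += (ag >> 8) & 0x00ff00ffu;   /* multiply by 257/256 ~256/255 */

        rb = (rb >> 8) & 0x00ff00ffu;    /* recombine even/odd bytes    */
        ag =  ag       & 0xff00ff00u;    /* (mov ror #8 + sel)          */

        return saturated_add_8888 (rb | ag, src);      /* uqadd8        */
    }

The transparent-source early-out performed by over_8888_8888_check_transparent
is a separate optimisation layered on top of this per-pixel arithmetic.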
.macro over_8888_8888_process_tail cond, numbytes, firstreg
@@ -496,17 +498,17 @@ generate_composite_function \
WK5 .req STRIDE_S
WK6 .req STRIDE_M
WK7 .req ORIG_W
- over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+ over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
beq 10f
- over_8888_8888_prepare %(4+firstreg)
- .set PROCESS_REG, firstreg
- .set PROCESS_OFF, -numbytes
- .rept numbytes / 4
+ over_8888_8888_prepare %(4+\firstreg)
+ .set PROCESS_REG, \firstreg
+ .set PROCESS_OFF, -\numbytes
+ .rept \numbytes / 4
over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.set PROCESS_OFF, PROCESS_OFF+4
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.unreq WK5
@@ -536,16 +538,16 @@ generate_composite_function \
*/
.macro mul_8888_8 word, byte, tmp, half
/* Split even/odd bytes of word apart */
- uxtb16 tmp, word
- uxtb16 word, word, ror #8
+ uxtb16 \tmp, \word
+ uxtb16 \word, \word, ror #8
/* Multiply bytes together with rounding, then by 257/256 */
- mla tmp, tmp, byte, half
- mla word, word, byte, half /* 1 stall follows */
- uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
- uxtab16 word, word, word, ror #8
+ mla \tmp, \tmp, \byte, \half
+ mla \word, \word, \byte, \half /* 1 stall follows */
+ uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */
+ uxtab16 \word, \word, \word, ror #8
/* Recombine bytes */
- mov tmp, tmp, ror #8
- sel word, tmp, word
+ mov \tmp, \tmp, ror #8
+ sel \word, \tmp, \word
.endm
/******************************************************************************/
@@ -567,8 +569,8 @@ generate_composite_function \
WK5 .req STRIDE_D
WK6 .req STRIDE_S
WK7 .req ORIG_W
- pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
+ pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.unreq WK5
.unreq WK6
@@ -576,10 +578,10 @@ generate_composite_function \
.endm
.macro over_8888_n_8888_1pixel src, dst
- mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
- sub WK7, WK6, WK&src, lsr #24
- mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
- uqadd8 WK&dst, WK&dst, WK&src
+ mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M
+ sub WK7, WK6, WK\()\src, lsr #24
+ mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M
+ uqadd8 WK\()\dst, WK\()\dst, WK\()\src
.endm
.macro over_8888_n_8888_process_tail cond, numbytes, firstreg
@@ -587,12 +589,12 @@ generate_composite_function \
WK5 .req STRIDE_D
WK6 .req STRIDE_S
WK7 .req ORIG_W
- over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+ over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
beq 10f
mov WK6, #255
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
- .if numbytes == 16 && PROCESS_REG == 2
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+ .if \numbytes == 16 && PROCESS_REG == 2
/* We're using WK6 and WK7 as temporaries, so half way through
* 4 pixels, reload the second two source pixels but this time
* into WK4 and WK5 */
@@ -601,7 +603,7 @@ generate_composite_function \
over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.unreq WK5
@@ -642,13 +644,13 @@ generate_composite_function \
.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
WK4 .req STRIDE_M
- pixld , numbytes/4, 4, MASK, unaligned_mask
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes/4, 4, MASK, \unaligned_mask
+ pixld , \numbytes, \firstreg, DST, 0
.unreq WK4
.endm
.macro over_n_8_8888_1pixel src, dst
- uxtb Y, WK4, ror #src*8
+ uxtb Y, WK4, ror #\src*8
/* Trailing part of multiplication of source */
mla SCRATCH, STRIDE_S, Y, STRIDE_D
mla Y, SRC, Y, STRIDE_D
@@ -659,20 +661,20 @@ generate_composite_function \
sub ORIG_W, ORIG_W, Y, lsr #24
sel Y, SCRATCH, Y
/* Then multiply the destination */
- mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
- uqadd8 WK&dst, WK&dst, Y
+ mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
+ uqadd8 WK\()\dst, WK\()\dst, Y
.endm
.macro over_n_8_8888_process_tail cond, numbytes, firstreg
WK4 .req STRIDE_M
teq WK4, #0
beq 10f
- .set PROCESS_REG, firstreg
- .rept numbytes / 4
- over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+ over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
.set PROCESS_REG, PROCESS_REG+1
.endr
- pixst , numbytes, firstreg, DST
+ pixst , \numbytes, \firstreg, DST
10:
.unreq WK4
.endm
@@ -705,14 +707,14 @@ generate_composite_function \
.endm
.macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- pixld , numbytes, firstreg, DST, 0
+ pixld , \numbytes, \firstreg, DST, 0
.endm
.macro over_reverse_n_8888_1pixel d, is_only
- teq WK&d, #0
+ teq WK\()\d, #0
beq 8f /* replace with source */
- bics ORIG_W, STRIDE_D, WK&d, lsr #24
- .if is_only == 1
+ bics ORIG_W, STRIDE_D, WK\()\d, lsr #24
+ .if \is_only == 1
beq 49f /* skip store */
.else
beq 9f /* write same value back */
@@ -723,36 +725,36 @@ generate_composite_function \
uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
mov SCRATCH, SCRATCH, ror #8
sel ORIG_W, SCRATCH, ORIG_W
- uqadd8 WK&d, WK&d, ORIG_W
+ uqadd8 WK\()\d, WK\()\d, ORIG_W
b 9f
-8: mov WK&d, SRC
+8: mov WK\()\d, SRC
9:
.endm
.macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
- over_reverse_n_8888_1pixel reg1, 1
+ .if \numbytes == 4
+ over_reverse_n_8888_1pixel \reg1, 1
.else
- and SCRATCH, WK&reg1, WK&reg2
- .if numbytes == 16
- and SCRATCH, SCRATCH, WK&reg3
- and SCRATCH, SCRATCH, WK&reg4
+ and SCRATCH, WK\()\reg1, WK\()\reg2
+ .if \numbytes == 16
+ and SCRATCH, SCRATCH, WK\()\reg3
+ and SCRATCH, SCRATCH, WK\()\reg4
.endif
mvns SCRATCH, SCRATCH, asr #24
beq 49f /* skip store if all opaque */
- over_reverse_n_8888_1pixel reg1, 0
- over_reverse_n_8888_1pixel reg2, 0
- .if numbytes == 16
- over_reverse_n_8888_1pixel reg3, 0
- over_reverse_n_8888_1pixel reg4, 0
+ over_reverse_n_8888_1pixel \reg1, 0
+ over_reverse_n_8888_1pixel \reg2, 0
+ .if \numbytes == 16
+ over_reverse_n_8888_1pixel \reg3, 0
+ over_reverse_n_8888_1pixel \reg4, 0
.endif
.endif
- pixst , numbytes, reg1, DST
+ pixst , \numbytes, \reg1, DST
49:
.endm
.macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
- over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+ over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm
generate_composite_function \
@@ -794,20 +796,20 @@ generate_composite_function \
.macro over_white_8888_8888_ca_combine m, d
uxtb16 TMP1, TMP0 /* rb_notmask */
- uxtb16 TMP2, d /* rb_dest; 1 stall follows */
+ uxtb16 TMP2, \d /* rb_dest; 1 stall follows */
smlatt TMP3, TMP2, TMP1, HALF /* red */
smlabb TMP2, TMP2, TMP1, HALF /* blue */
uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
- uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
- smlatt d, TMP1, TMP0, HALF /* alpha */
+ uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */
+ smlatt \d, TMP1, TMP0, HALF /* alpha */
smlabb TMP1, TMP1, TMP0, HALF /* green */
pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
- pkhbt TMP1, TMP1, d, lsl #16 /* ag */
+ pkhbt TMP1, TMP1, \d, lsl #16 /* ag */
uxtab16 TMP0, TMP0, TMP0, ror #8
uxtab16 TMP1, TMP1, TMP1, ror #8
mov TMP0, TMP0, ror #8
- sel d, TMP0, TMP1
- uqadd8 d, d, m /* d is a late result */
+ sel \d, TMP0, TMP1
+ uqadd8 \d, \d, \m /* d is a late result */
.endm
.macro over_white_8888_8888_ca_1pixel_head
@@ -853,10 +855,10 @@ generate_composite_function \
.endm
.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .if numbytes == 4
+ .if \numbytes == 4
over_white_8888_8888_ca_1pixel_head
.else
- .if numbytes == 16
+ .if \numbytes == 16
over_white_8888_8888_ca_2pixels_head
over_white_8888_8888_ca_2pixels_tail
.endif
@@ -865,7 +867,7 @@ generate_composite_function \
.endm
.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
- .if numbytes == 4
+ .if \numbytes == 4
over_white_8888_8888_ca_1pixel_tail
.else
over_white_8888_8888_ca_2pixels_tail
@@ -1004,7 +1006,7 @@ generate_composite_function \
.endm
.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- .rept (numbytes / 4) - 1
+ .rept (\numbytes / 4) - 1
over_n_8888_8888_ca_1pixel_head
over_n_8888_8888_ca_1pixel_tail
.endr
@@ -1020,7 +1022,7 @@ pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
cmp ip, #-1
beq pixman_composite_over_white_8888_8888_ca_asm_armv6
/* else drop through... */
- .endfunc
+pixman_end_asm_function
generate_composite_function \
pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
@@ -1045,84 +1047,84 @@ generate_composite_function \
.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
ldrb ORIG_W, [SRC], #4
- .if numbytes >= 8
- ldrb WK&reg1, [SRC], #4
- .if numbytes == 16
- ldrb WK&reg2, [SRC], #4
- ldrb WK&reg3, [SRC], #4
+ .if \numbytes >= 8
+ ldrb WK\()\reg1, [SRC], #4
+ .if \numbytes == 16
+ ldrb WK\()\reg2, [SRC], #4
+ ldrb WK\()\reg3, [SRC], #4
.endif
.endif
- add DST, DST, #numbytes
+ add DST, DST, #\numbytes
.endm
.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
- in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+ in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
.endm
.macro in_reverse_8888_8888_1pixel s, d, offset, is_only
- .if is_only != 1
- movs s, ORIG_W
- .if offset != 0
- ldrb ORIG_W, [SRC, #offset]
+ .if \is_only != 1
+ movs \s, ORIG_W
+ .if \offset != 0
+ ldrb ORIG_W, [SRC, #\offset]
.endif
beq 01f
teq STRIDE_M, #0xFF
beq 02f
.endif
- uxtb16 SCRATCH, d /* rb_dest */
- uxtb16 d, d, ror #8 /* ag_dest */
- mla SCRATCH, SCRATCH, s, MASK
- mla d, d, s, MASK
+ uxtb16 SCRATCH, \d /* rb_dest */
+ uxtb16 \d, \d, ror #8 /* ag_dest */
+ mla SCRATCH, SCRATCH, \s, MASK
+ mla \d, \d, \s, MASK
uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
- uxtab16 d, d, d, ror #8
+ uxtab16 \d, \d, \d, ror #8
mov SCRATCH, SCRATCH, ror #8
- sel d, SCRATCH, d
+ sel \d, SCRATCH, \d
b 02f
- .if offset == 0
+ .if \offset == 0
48: /* Last mov d,#0 of the set - used as part of shortcut for
* source values all 0 */
.endif
-01: mov d, #0
+01: mov \d, #0
02:
.endm
.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
- .if numbytes == 4
+ .if \numbytes == 4
teq ORIG_W, ORIG_W, asr #32
- ldrne WK&reg1, [DST, #-4]
- .elseif numbytes == 8
- teq ORIG_W, WK&reg1
+ ldrne WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+ teq ORIG_W, WK\()\reg1
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
- ldmnedb DST, {WK&reg1-WK&reg2}
+ ldmdbne DST, {WK\()\reg1-WK\()\reg2}
.else
- teq ORIG_W, WK&reg1
- teqeq ORIG_W, WK&reg2
- teqeq ORIG_W, WK&reg3
+ teq ORIG_W, WK\()\reg1
+ teqeq ORIG_W, WK\()\reg2
+ teqeq ORIG_W, WK\()\reg3
teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
- ldmnedb DST, {WK&reg1-WK&reg4}
+ ldmdbne DST, {WK\()\reg1-WK\()\reg4}
.endif
cmnne DST, #0 /* clear C if NE */
bcs 49f /* no writes to dest if source all -1 */
beq 48f /* set dest to all 0 if source all 0 */
- .if numbytes == 4
- in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
- str WK&reg1, [DST, #-4]
- .elseif numbytes == 8
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
- stmdb DST, {WK&reg1-WK&reg2}
+ .if \numbytes == 4
+ in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
+ str WK\()\reg1, [DST, #-4]
+ .elseif \numbytes == 8
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
+ stmdb DST, {WK\()\reg1-WK\()\reg2}
.else
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
- stmdb DST, {WK&reg1-WK&reg4}
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
+ stmdb DST, {WK\()\reg1-WK\()\reg4}
.endif
49:
.endm
.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
- in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+ in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm
generate_composite_function \
@@ -1136,3 +1138,44 @@ generate_composite_function \
in_reverse_8888_8888_process_tail
/******************************************************************************/
+
+.macro over_n_8888_init
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ /* Hold loop invariant in MASK */
+ ldr MASK, =0x00800080
+ /* Hold multiplier for destination in STRIDE_M */
+ mov STRIDE_M, #255
+ sub STRIDE_M, STRIDE_M, SRC, lsr #24
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, MASK, MASK
+.endm
+
+.macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ pixld , \numbytes, \firstreg, DST, 0
+.endm
+
+.macro over_n_8888_1pixel dst
+ mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK
+ uqadd8 WK\()\dst, WK\()\dst, SRC
+.endm
+
+.macro over_n_8888_process_tail cond, numbytes, firstreg
+ .set PROCESS_REG, \firstreg
+ .rept \numbytes / 4
+ over_n_8888_1pixel %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+ pixst , \numbytes, \firstreg, DST
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_asm_armv6, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE, \
+ 2, /* prefetch distance */ \
+ over_n_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ over_n_8888_process_head, \
+ over_n_8888_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 8de060a..5ec19e0 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -112,64 +112,64 @@
*/
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
- .if numbytes == 16
- .if unaligned == 1
- op&r&cond WK&reg0, [base], #4
- op&r&cond WK&reg1, [base], #4
- op&r&cond WK&reg2, [base], #4
- op&r&cond WK&reg3, [base], #4
+ .if \numbytes == 16
+ .if \unaligned == 1
+ \op\()r\()\cond WK\()\reg0, [\base], #4
+ \op\()r\()\cond WK\()\reg1, [\base], #4
+ \op\()r\()\cond WK\()\reg2, [\base], #4
+ \op\()r\()\cond WK\()\reg3, [\base], #4
.else
- op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
.endif
- .elseif numbytes == 8
- .if unaligned == 1
- op&r&cond WK&reg0, [base], #4
- op&r&cond WK&reg1, [base], #4
+ .elseif \numbytes == 8
+ .if \unaligned == 1
+ \op\()r\()\cond WK\()\reg0, [\base], #4
+ \op\()r\()\cond WK\()\reg1, [\base], #4
.else
- op&m&cond&ia base!, {WK&reg0,WK&reg1}
- .endif
- .elseif numbytes == 4
- op&r&cond WK&reg0, [base], #4
- .elseif numbytes == 2
- op&r&cond&h WK&reg0, [base], #2
- .elseif numbytes == 1
- op&r&cond&b WK&reg0, [base], #1
+ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
+ .endif
+ .elseif \numbytes == 4
+ \op\()r\()\cond WK\()\reg0, [\base], #4
+ .elseif \numbytes == 2
+ \op\()rh\()\cond WK\()\reg0, [\base], #2
+ .elseif \numbytes == 1
+ \op\()rb\()\cond WK\()\reg0, [\base], #1
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
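/* Illustrative sketch, not part of the patch: how the \() separators expand.
 * With the hypothetical invocation
 *     pixldst ld, ne, 4, 0, 1, 2, 3, SRC
 * the line "\op\()r\()\cond WK\()\reg0, [\base], #4" becomes
 *     ldrne WK0, [SRC], #4
 * i.e. \() glues substituted arguments to the surrounding text without
 * inserting a space, which clang's integrated assembler requires in place
 * of the GNU-as-only "op&r&cond" concatenation used before.
 */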
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
- .if numbytes == 16
- stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
- .elseif numbytes == 8
- stm&cond&db base, {WK&reg0,WK&reg1}
- .elseif numbytes == 4
- str&cond WK&reg0, [base, #-4]
- .elseif numbytes == 2
- str&cond&h WK&reg0, [base, #-2]
- .elseif numbytes == 1
- str&cond&b WK&reg0, [base, #-1]
+ .if \numbytes == 16
+ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
+ .elseif \numbytes == 8
+ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
+ .elseif \numbytes == 4
+ str\()\cond WK\()\reg0, [\base, #-4]
+ .elseif \numbytes == 2
+ strh\()\cond WK\()\reg0, [\base, #-2]
+ .elseif \numbytes == 1
+ strb\()\cond WK\()\reg0, [\base, #-1]
.else
- .error "unsupported size: numbytes"
+ .error "unsupported size: \numbytes"
.endif
.endm
.macro pixld cond, numbytes, firstreg, base, unaligned
- pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+ pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
.endm
.macro pixst cond, numbytes, firstreg, base
.if (flags) & FLAG_DST_READWRITE
- pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
.else
- pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
.endif
.endm
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
- a x
+ \a \x
.endif
.endm
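/* Illustration, not part of the patch: PF passes its whole tail through as
 * one instruction when standard prefetches are selected, so
 *     PF pld, [SRC, #32]
 * assembles as "pld [SRC, #32]" for PREFETCH_TYPE_STANDARD and to nothing
 * otherwise; the a/x:vararg split merely keeps the mnemonic separate from
 * its operands.
 */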
@@ -179,11 +179,11 @@
* between 0 and prefetch_distance (inclusive) cache lines ahead so there
* are no gaps when the inner loop starts.
*/
- .if bpp > 0
- PF bic, ptr, base, #31
+ .if \bpp > 0
+ PF bic, \ptr, \base, #31
.set OFFSET, 0
.rept prefetch_distance+1
- PF pld, [ptr, #OFFSET]
+ PF pld, [\ptr, #OFFSET]
.set OFFSET, OFFSET+32
.endr
.endif
@@ -201,42 +201,42 @@
* and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
* possible when there are 4 src bytes for every 1 dst byte).
*/
- .if bpp > 0
- .ifc base,DST
+ .if \bpp > 0
+ .ifc \base,DST
/* The test can be simplified further when preloading the destination */
- PF tst, base, #16
+ PF tst, \base, #16
PF beq, 61f
.else
- .if bpp/dst_w_bpp == 4
- PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
+ .if \bpp/dst_w_bpp == 4
+ PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
PF and, SCRATCH, SCRATCH, #31
- PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
- PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
- PF movs, SCRATCH, SCRATCH, #32-6 /* so this sets NC / nc / Nc */
+ PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
+ PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
+ PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
PF bcs, 61f
PF bpl, 60f
- PF pld, [ptr, #32*(prefetch_distance+2)]
+ PF pld, [\ptr, #32*(prefetch_distance+2)]
.else
- PF mov, SCRATCH, base, lsl #32-5
- PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
- PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+ PF mov, SCRATCH, \base, lsl #32-5
+ PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
+ PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
PF bls, 61f
.endif
.endif
-60: PF pld, [ptr, #32*(prefetch_distance+1)]
+60: PF pld, [\ptr, #32*(prefetch_distance+1)]
61:
.endif
.endm
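/* Illustration, not part of the patch: the "lsl #32-5" trick above. Shifting
 * an address left by 27 bits keeps only its offset within a 32-byte cache
 * line, scaled to the top of the register. Adding two such values makes the
 * carry flag report a line-boundary crossing and the high bits count whole
 * lines, so the preload code can classify how many extra cache lines a
 * partially aligned run touches without an explicit divide.
 */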
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
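/* Illustration, not part of the patch: INDEX & ~(INDEX+1) isolates the run
 * of trailing 1 bits in INDEX, so ANDing with SIZE/2 tests whether that run
 * covers bit log2(SIZE)-1. The macro is therefore true exactly when
 * INDEX % SIZE == SIZE-1; e.g. for SIZE=4 it fires at INDEX = 3, 7, 11, ...,
 * marking the last STM of each prefetch group.
 */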
.macro preload_middle bpp, base, scratch_holds_offset
- .if bpp > 0
+ .if \bpp > 0
/* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
- .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
- .if scratch_holds_offset
- PF pld, [base, SCRATCH]
+ .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
+ .if \scratch_holds_offset
+ PF pld, [\base, SCRATCH]
.else
- PF bic, SCRATCH, base, #31
+ PF bic, SCRATCH, \base, #31
PF pld, [SCRATCH, #32*prefetch_distance]
.endif
.endif
@@ -244,28 +244,28 @@
.endm
.macro preload_trailing bpp, bpp_shift, base
- .if bpp > 0
- .if bpp*pix_per_block > 256
+ .if \bpp > 0
+ .if \bpp*pix_per_block > 256
/* Calculations are more complex if more than one fetch per block */
- PF and, WK1, base, #31
- PF add, WK1, WK1, WK0, lsl #bpp_shift
- PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
- PF bic, SCRATCH, base, #31
+ PF and, WK1, \base, #31
+ PF add, WK1, WK1, WK0, lsl #\bpp_shift
+ PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
+ PF bic, SCRATCH, \base, #31
80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
PF add, SCRATCH, SCRATCH, #32
PF subs, WK1, WK1, #32
PF bhi, 80b
.else
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
- PF mov, SCRATCH, base, lsl #32-5
- PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
- PF adceqs, SCRATCH, SCRATCH, #0
+ PF mov, SCRATCH, \base, lsl #32-5
+ PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
+ PF adcseq, SCRATCH, SCRATCH, #0
/* The instruction above has two effects: ensures Z is only
* set if C was clear (so Z indicates that both shifted quantities
* were 0), and clears C if Z was set (so C indicates that the sum
* of the shifted quantities was strictly greater than 32) */
PF beq, 82f
- PF bic, SCRATCH, base, #31
+ PF bic, SCRATCH, \base, #31
PF bcc, 81f
PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
@@ -288,12 +288,12 @@
* "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
* "base" - base address register of channel to preload (SRC, MASK or DST)
*/
- .if bpp > 0
- .if narrow_case && (bpp <= dst_w_bpp)
+ .if \bpp > 0
+ .if \narrow_case && (\bpp <= dst_w_bpp)
/* In these cases, each line for each channel is in either 1 or 2 cache lines */
- PF bic, WK0, base, #31
+ PF bic, WK0, \base, #31
PF pld, [WK0]
- PF add, WK1, base, X, LSL #bpp_shift
+ PF add, WK1, \base, X, LSL #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
@@ -301,9 +301,9 @@
PF pld, [WK1]
90:
.else
- PF bic, WK0, base, #31
+ PF bic, WK0, \base, #31
PF pld, [WK0]
- PF add, WK1, base, X, lsl #bpp_shift
+ PF add, WK1, \base, X, lsl #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
@@ -319,56 +319,56 @@
.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
- process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
- .if decrementx
- sub&cond X, X, #8*numbytes/dst_w_bpp
+ \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
+ .if \decrementx
+ sub\()\cond X, X, #8*\numbytes/dst_w_bpp
.endif
- process_tail cond, numbytes, firstreg
+ \process_tail \cond, \numbytes, \firstreg
.if !((flags) & FLAG_PROCESS_DOES_STORE)
- pixst cond, numbytes, firstreg, DST
+ pixst \cond, \numbytes, \firstreg, DST
.endif
.endm
.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
.if (flags) & FLAG_BRANCH_OVER
- .ifc cond,mi
+ .ifc \cond,mi
bpl 100f
.endif
- .ifc cond,cs
+ .ifc \cond,cs
bcc 100f
.endif
- .ifc cond,ne
+ .ifc \cond,ne
beq 100f
.endif
- conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
100:
.else
- conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
.endif
.endm
.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
.if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
/* Can't interleave reads and writes */
- test
- conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
+ \test
+ conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
.if (flags) & FLAG_PROCESS_CORRUPTS_PSR
- test
+ \test
.endif
- conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
+ conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
.else
/* Can interleave reads and writes for better scheduling */
- test
- process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
- process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
- .if decrementx
- sub&cond1 X, X, #8*numbytes1/dst_w_bpp
- sub&cond2 X, X, #8*numbytes2/dst_w_bpp
- .endif
- process_tail cond1, numbytes1, firstreg1
- process_tail cond2, numbytes2, firstreg2
- pixst cond1, numbytes1, firstreg1, DST
- pixst cond2, numbytes2, firstreg2, DST
+ \test
+ \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
+ \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
+ .if \decrementx
+ sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
+ sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
+ .endif
+ \process_tail \cond1, \numbytes1, \firstreg1
+ \process_tail \cond2, \numbytes2, \firstreg2
+ pixst \cond1, \numbytes1, \firstreg1, DST
+ pixst \cond2, \numbytes2, \firstreg2, DST
.endif
.endm
@@ -400,12 +400,12 @@
.endif
/* Use unaligned loads in all cases for simplicity */
.if dst_w_bpp == 8
- conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
+ conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
.elseif dst_w_bpp == 16
test_bits_1_0_ptr
- conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+ conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
.endif
- conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
.endif
@@ -424,12 +424,12 @@
.endm
.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
- conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
+ conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
.if dst_w_bpp == 16
test_bits_1_0_pix
- conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
+ conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
.elseif dst_w_bpp == 8
- conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
+ conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
.endif
.endm
@@ -438,7 +438,7 @@
110:
.set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
.rept pix_per_block*dst_w_bpp/128
- process_head , 16, 0, unaligned_src, unaligned_mask, 1
+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle src_bpp, SRC, 1
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -453,9 +453,9 @@
* preloads for, to achieve staggered prefetches for multiple channels, because there are
* always two STMs per prefetch, so there is always an opposite STM on which to put the
* preload. Note, no need to BIC the base register here */
- PF pld, [DST, #32*prefetch_distance - dst_alignment]
+ PF pld, [DST, #32*prefetch_distance - \dst_alignment]
.endif
- process_tail , 16, 0
+ \process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
@@ -470,11 +470,11 @@
.if dst_r_bpp > 0
tst DST, #16
bne 111f
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
@@ -487,13 +487,13 @@
.endif
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
- medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+ medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
.endm
.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
- process_head , 16, 0, unaligned_src, unaligned_mask, 0
- process_tail , 16, 0
+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0
+ \process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
@@ -501,16 +501,16 @@
bhs 120b
/* Trailing pixels */
tst X, #128/dst_w_bpp - 1
- beq exit_label
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ beq \exit_label
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm
.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
tst X, #16*8/dst_w_bpp
- conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+ conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
/* Trailing pixels */
/* In the narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm
.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
@@ -523,37 +523,37 @@
tst SRC, #3
bne 140f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
140:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
.endif
.if mask_bpp == 8 || mask_bpp == 16
- b exit_label
+ b \exit_label
141:
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 142f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
142:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
.endif
.endif
.endm
.macro end_of_line restore_x, vars_spilled, loop_label, last_one
- .if vars_spilled
+ .if \vars_spilled
/* Sadly, GAS doesn't seem to have an equivalent of the DCI directive */
/* This is ldmia sp,{} */
.word 0xE89D0000 | LINE_SAVED_REGS
.endif
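/* Illustration, not part of the patch: decoding the hand-assembled word.
 * 0xE89D0000 is cond=AL (0xE), LDM increment-after without writeback,
 * Rn=sp (0xD), with the low 16 bits as the register list, so ORing in
 * LINE_SAVED_REGS yields "ldmia sp, {saved regs}"; the 0xE92D0000 used for
 * spilling elsewhere in this file is the matching "stmdb sp!, {...}".
 */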
subs Y, Y, #1
- .if vars_spilled
+ .if \vars_spilled
.if (LINE_SAVED_REGS) & (1<<1)
str Y, [sp]
.endif
@@ -565,18 +565,18 @@
.if mask_bpp > 0
add MASK, MASK, STRIDE_M
.endif
- .if restore_x
+ .if \restore_x
mov X, ORIG_W
.endif
- bhs loop_label
- .ifc "last_one",""
- .if vars_spilled
+ bhs \loop_label
+ .ifc "\last_one",""
+ .if \vars_spilled
b 197f
.else
b 198f
.endif
.else
- .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+ .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
b 198f
.endif
.endif
@@ -596,17 +596,17 @@
process_tail, \
process_inner_loop
- pixman_asm_function fname
+ pixman_asm_function \fname
/*
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set flags, flags_
- .set prefetch_distance, prefetch_distance_
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set flags, \flags_
+ .set prefetch_distance, \prefetch_distance_
/*
* Select prefetch type for this function.
@@ -732,7 +732,7 @@
sub Y, Y, #1
#endif
- init
+ \init
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
/* Reserve a word in which to store X during leading pixels */
@@ -773,7 +773,7 @@
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
151: /* New line */
- newline
+ \newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -790,7 +790,7 @@
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
.endif
- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail
154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
@@ -800,10 +800,10 @@
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.endif
- .ifc "process_inner_loop",""
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+ .ifc "\process_inner_loop",""
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
.else
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
.endif
157: /* Check for another line */
@@ -825,7 +825,7 @@
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
- newline
+ \newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -837,10 +837,10 @@
beq 164f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail
164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
- switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f
167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
@@ -856,7 +856,7 @@
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
171: /* New line */
- newline
+ \newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
@@ -868,8 +868,8 @@
beq 174f
172: subs X, X, #1
blo 177f
- process_head , 1, 0, 1, 1, 0
- process_tail , 1, 0
+ \process_head , 1, 0, 1, 1, 0
+ \process_tail , 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 1, 0, DST
.endif
@@ -880,15 +880,15 @@
beq 174f
subs X, X, #1
blo 177f
- process_head , 2, 0, 1, 1, 0
- process_tail , 2, 0
+ \process_head , 2, 0, 1, 1, 0
+ \process_tail , 2, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 2, 0, DST
.endif
.endif
174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
- switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
@@ -908,7 +908,7 @@
add sp, sp, #4
.endif
- cleanup
+ \cleanup
#ifdef DEBUG_PARAMS
add sp, sp, #9*4 /* junk the debug copy of arguments */
@@ -932,13 +932,13 @@
.unreq WK3
.unreq SCRATCH
.unreq ORIG_W
- .endfunc
+ pixman_end_asm_function
.endm
.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
- .irp SAVED_REG,x
+ .irp SAVED_REG,\x
.ifc "SAVED_REG","Y"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index fa1ab5c..40f3a97 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -24,7 +24,7 @@
*
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -51,6 +51,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, armv6, over_n_8888,
+ uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
uint32_t, 1)
@@ -240,6 +242,10 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, armv6_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, armv6_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8b8g8r8, armv6_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8b8g8r8, armv6_composite_over_n_8888),
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, armv6_composite_over_reverse_n_8888),
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, armv6_composite_over_reverse_n_8888),
@@ -260,15 +266,15 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+ SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
- PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
{ PIXMAN_OP_NONE },
};
diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
index 23374e4..288172b 100644
--- a/pixman/pixman-arm.c
+++ b/pixman/pixman-arm.c
@@ -20,7 +20,7 @@
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -176,6 +176,31 @@ detect_cpu_features (void)
return features;
}
+#elif defined (_3DS) /* 3DS homebrew (devkitARM) */
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+ arm_cpu_features_t features = 0;
+
+ features |= ARM_V6;
+
+ return features;
+}
+
+#elif defined (PSP2) || defined (__SWITCH__)
+/* Vita (VitaSDK) or Switch (devkitA64) homebrew */
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+ arm_cpu_features_t features = 0;
+
+ features |= ARM_NEON;
+
+ return features;
+}
+
#else /* Unknown */
static arm_cpu_features_t
@@ -221,5 +246,11 @@ _pixman_arm_get_implementations (pixman_implementation_t *imp)
imp = _pixman_implementation_create_arm_neon (imp);
#endif
+#ifdef USE_ARM_A64_NEON
+ /* NEON is part of AArch64 */
+ if (!_pixman_disabled ("arm-neon"))
+ imp = _pixman_implementation_create_arm_neon (imp);
+#endif
+
return imp;
}
diff --git a/pixman/pixman-arma64-neon-asm-bilinear.S b/pixman/pixman-arma64-neon-asm-bilinear.S
new file mode 100644
index 0000000..7303bdc
--- /dev/null
+++ b/pixman/pixman-arma64-neon-asm-bilinear.S
@@ -0,0 +1,1276 @@
+/*
+ * Copyright © 2011 SCore Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ * Author: Taekyun Kim (tkq.kim@samsung.com)
+ */
+
+/*
+ * This file contains scaled bilinear scanline functions implemented
+ * using Siarhei's older bilinear macro template.
+ *
+ * << General scanline function procedures >>
+ * 1. bilinearly interpolate source pixels
+ * 2. load mask pixels
+ * 3. load destination pixels
+ * 4. duplicate mask to fill whole register
+ * 5. interleave source & destination pixels
+ * 6. apply mask to source pixels
+ * 7. combine source & destination pixels
+ * 8. deinterleave final result
+ * 9. store destination pixels
+ *
+ * All registers with a single number (e.g. src0, tmp0) are 64-bit registers.
+ * Registers with double numbers (src01, dst01) are 128-bit registers.
+ * All temp registers can be used freely outside the code block.
+ * Assume that the symbols (register aliases created with .req) OUT and MASK
+ * are defined by the callers of these macro blocks.
+ *
+ * Remarks
+ * There can be lots of pipeline stalls inside a code block and between code
+ * blocks. Further optimizations will be done by new macro templates using
+ * the head/tail_head/tail scheme.
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined (__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.arch armv8-a
+.altmacro
+.p2align 2
+
+#include "pixman-private.h"
+#include "pixman-arm-asm.h"
+#include "pixman-arma64-neon-asm.h"
+
+/*
+ * Bilinear macros from pixman-arm-neon-asm.S
+ */
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ ld1 {\()\reg1\().2s}, [TMP1], STRIDE
+ ld1 {\()\reg2\().2s}, [TMP1]
+.endm
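+
+/*
+ * Illustration, not part of the patch: X/UX form a 16.16 fixed-point
+ * horizontal coordinate. "asr WTMP1, X, #16" extracts the integer pixel
+ * index, "add X, X, UX" steps by the scale increment (e.g. UX = 0x8000
+ * for 2x upscaling), and the two ld1 loads fetch the same pixel pair from
+ * adjacent scanlines at TMP1 and TMP1+STRIDE for vertical interpolation.
+ */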
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\reg2\().s}[1], [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
+.endm
+
+.macro vzip reg1, reg2
+ zip1 v24.8b, \reg1, \reg2
+ zip2 \reg2, \reg1, \reg2
+ mov \reg1, v24.8b
+.endm
+
+.macro vuzp reg1, reg2
+ uzp1 v24.8b, \reg1, \reg2
+ uzp2 \reg2, \reg1, \reg2
+ mov \reg1, v24.8b
+.endm
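+
+/*
+ * Illustration, not part of the patch: AArch64 dropped the two-register
+ * in-place VZIP/VUZP of 32-bit NEON, so these helpers rebuild it from
+ * zip1/zip2 (uzp1/uzp2) through v24. Note that v24 is clobbered, so
+ * callers must treat it as a scratch register.
+ */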
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
+ ld1 {\()\acc2\().s}[1], [TMP1]
+ ld1 {\()\acc2\().s}[3], [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip \()\reg1\().8b, \()\reg3\().8b
+ vzip \()\reg2\().8b, \()\reg4\().8b
+ vzip \()\reg3\().8b, \()\reg4\().8b
+ vzip \()\reg1\().8b, \()\reg2\().8b
+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
+ ld1 {\()\xacc2\().s}[1], [TMP1]
+ ld1 {\()\xacc2\().s}[3], [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
+ vzip \()\xreg1\().8b, \()\xreg3\().8b
+ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
+ vzip \()\xreg2\().8b, \()\xreg4\().8b
+ ld1 {\()\yacc2\().s}[1], [TMP1]
+ vzip \()\xreg3\().8b, \()\xreg4\().8b
+ ld1 {\()\yacc2\().s}[3], [TMP2]
+ vzip \()\xreg1\().8b, \()\xreg2\().8b
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
+ vzip \()\yreg1\().8b, \()\yreg3\().8b
+ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
+ vzip \()\yreg2\().8b, \()\yreg4\().8b
+ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
+ vzip \()\yreg3\().8b, \()\yreg4\().8b
+ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
+ vzip \()\yreg1\().8b, \()\yreg2\().8b
+ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
+ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
+ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
+ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if \numpix == 4
+ st1 {v0.2s, v1.2s}, [OUT], #16
+.elseif \numpix == 2
+ st1 {v0.2s}, [OUT], #8
+.elseif \numpix == 1
+ st1 {v0.s}[0], [OUT], #4
+.else
+ .error "bilinear_store_8888 \numpix is unsupported"
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp v0.8b, v1.8b
+ vuzp v2.8b, v3.8b
+ vuzp v1.8b, v3.8b
+ vuzp v0.8b, v2.8b
+ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
+.if \numpix == 4
+ st1 {v1.4h}, [OUT], #8
+.elseif \numpix == 2
+ st1 {v1.s}[0], [OUT], #4
+.elseif \numpix == 1
+ st1 {v1.h}[0], [OUT], #2
+.else
+ .error "bilinear_store_0565 \numpix is unsupported"
+.endif
+.endm
+
+
+/*
+ * Macros for loading mask pixels into register 'mask'.
+ * The dup must be done somewhere else.
+ */
+.macro bilinear_load_mask_x numpix, mask
+.endm
+
+.macro bilinear_load_mask_8 numpix, mask
+.if \numpix == 4
+ ld1 {\()\mask\().s}[0], [MASK], #4
+.elseif \numpix == 2
+ ld1 {\()\mask\().h}[0], [MASK], #2
+.elseif \numpix == 1
+ ld1 {\()\mask\().b}[0], [MASK], #1
+.else
+ .error "bilinear_load_mask_8 \numpix is unsupported"
+.endif
+ prfum PREFETCH_MODE, [MASK, #(prefetch_offset)]
+.endm
+
+.macro bilinear_load_mask mask_fmt, numpix, mask
+ bilinear_load_mask_\mask_fmt \numpix, \mask
+.endm
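+
+/*
+ * Illustration, not part of the patch: this dispatch-by-name pattern picks
+ * the implementation at assembly time, e.g.
+ *     bilinear_load_mask 8, 2, v4
+ * expands to "bilinear_load_mask_8 2, v4", while mask_fmt=x selects the
+ * empty bilinear_load_mask_x and costs nothing in the generated code.
+ */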
+
+
+/*
+ * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
+ * Interleaving should be done somewhere else.
+ */
+.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.if \numpix == 4
+ ld1 {\()\dst0\().2s, \()\dst1\().2s}, [OUT]
+.elseif \numpix == 2
+ ld1 {\()\dst0\().2s}, [OUT]
+.elseif \numpix == 1
+ ld1 {\()\dst0\().s}[0], [OUT]
+.else
+ .error "bilinear_load_dst_8888 \numpix is unsupported"
+.endif
+ mov \()\dst01\().d[0], \()\dst0\().d[0]
+ mov \()\dst01\().d[1], \()\dst1\().d[0]
+ prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
+.endm
+
+.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
+ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
+.endm
+
+/*
+ * Macros for duplicating a partially loaded mask to fill an entire register.
+ * We will apply the mask to interleaved source pixels, that is
+ * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * so we need to duplicate the loaded mask across the whole register.
+ *
+ * For the two-pixel case:
+ * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * We can do some optimizations for this, including the last-pixel cases.
+ */
+.macro bilinear_duplicate_mask_x numpix, mask
+.endm
+
+.macro bilinear_duplicate_mask_8 numpix, mask
+.if \numpix == 4
+ dup \()\mask\().2s, \()\mask\().s[0]
+.elseif \numpix == 2
+ dup \()\mask\().4h, \()\mask\().h[0]
+.elseif \numpix == 1
+ dup \()\mask\().8b, \()\mask\().b[0]
+.else
+ .error "bilinear_duplicate_mask_8 \numpix is unsupported"
+.endif
+.endm
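+
+/*
+ * Illustration, not part of the patch: for four pixels the mask bytes
+ * (m0, m1, m2, m3) sit in lane s[0], and "dup v4.2s, v4.s[0]" repeats that
+ * 32-bit lane so the register holds m0..m3 twice, lining up with the
+ * interleaved (r0..r3, g0..g3) / (b0..b3, a0..a3) source layout described
+ * above.
+ */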
+
+.macro bilinear_duplicate_mask mask_fmt, numpix, mask
+ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
+.endm
+
+/*
+ * Macros for interleaving src and dst pixels into rrrr gggg bbbb aaaa form.
+ * Interleaving should be done when a mask is enabled or the operator is 'over'.
+ */
+.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
+ vuzp \()\src0\().8b, \()\src1\().8b
+ vuzp \()\dst0\().8b, \()\dst1\().8b
+ vuzp \()\src0\().8b, \()\src1\().8b
+ vuzp \()\dst0\().8b, \()\dst1\().8b
+ mov \()\src01\().d[1], \()\src1\().d[0]
+ mov \()\src01\().d[0], \()\src0\().d[0]
+ mov \()\dst01\().d[1], \()\dst1\().d[0]
+ mov \()\dst01\().d[0], \()\dst0\().d[0]
+.endm
+
+.macro bilinear_interleave_src_dst_x_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+.macro bilinear_interleave_src_dst_x_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst_x_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst_8_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst_8_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst_8_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_interleave_src_dst \
+ mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
+.endm
+
+
+/*
+ * Macros for applying masks to src pixels (see the combine_mask_u() function).
+ * src and dst should be in interleaved form.
+ * The mask register should be in the form (m0, m1, m2, m3).
+ */
+.macro bilinear_apply_mask_to_src_x \
+ numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+.endm
+
+.macro bilinear_apply_mask_to_src_8 \
+ numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+
+ umull \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b
+ umull \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b
+ /* bubbles */
+ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
+ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
+ /* bubbles */
+ raddhn \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
+ raddhn \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
+ mov \()\src01\().d[0], \()\src0\().d[0]
+ mov \()\src01\().d[1], \()\src1\().d[0]
+.endm
+
+.macro bilinear_apply_mask_to_src \
+ mask_fmt, numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+
+ bilinear_apply_mask_to_src_\()\mask_fmt \
+ \numpix, \src0, \src1, \src01, \mask, \
+ \tmp01, \tmp23, \tmp45, \tmp67
+.endm
+
+
+/*
+ * Macros for combining src and destination pixels.
+ * Whether to interleave depends on the operator 'op'.
+ */
+.macro bilinear_combine_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+.endm
+
+.macro bilinear_combine_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+ dup \()\tmp8\().2s, \()\src1\().s[1]
+ /* bubbles */
+ mvn \()\tmp8\().8b, \()\tmp8\().8b
+ /* bubbles */
+ umull \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b
+ /* bubbles */
+ umull \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b
+ /* bubbles */
+ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
+ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
+ /* bubbles */
+ raddhn \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
+ raddhn \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
+ mov \()\dst01\().d[0], \()\dst0\().d[0]
+ mov \()\dst01\().d[1], \()\dst1\().d[0]
+ /* bubbles */
+ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
+ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
+ mov \()\src01\().d[0], \()\src0\().d[0]
+ mov \()\src01\().d[1], \()\src1\().d[0]
+.endm
+
+.macro bilinear_combine_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
+ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
+ mov \()\src01\().d[0], \()\src0\().d[0]
+ mov \()\src01\().d[1], \()\src1\().d[0]
+.endm
+
+.macro bilinear_combine \
+ op, numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+ bilinear_combine_\()\op \
+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
+ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
+.endm
+
+/*
+ * Macros for final deinterleaving of destination pixels if needed.
+ */
+.macro bilinear_deinterleave numpix, dst0, dst1, dst01
+ vuzp \()\dst0\().8b, \()\dst1\().8b
+ /* bubbles */
+ vuzp \()\dst0\().8b, \()\dst1\().8b
+ mov \()\dst01\().d[0], \()\dst0\().d[0]
+ mov \()\dst01\().d[1], \()\dst1\().d[0]
+.endm
+
+.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+.endm
+
+.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
+ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
+.endm
+
+
+.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
+ bilinear_load_\()\src_fmt v0, v1, v2
+ bilinear_load_mask \mask_fmt, 1, v4
+ bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9
+ umull v2.8h, v0.8b, v28.8b
+ umlal v2.8h, v1.8b, v29.8b
+ /* 5 cycles bubble */
+ ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v2.4h, v15.h[0]
+ umlal2 v0.4s, v2.8h, v15.h[0]
+ /* 5 cycles bubble */
+ bilinear_duplicate_mask \mask_fmt, 1, v4
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ /* 3 cycles bubble */
+ xtn v0.8b, v0.8h
+ /* 1 cycle bubble */
+ bilinear_interleave_src_dst \
+ \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9
+ bilinear_apply_mask_to_src \
+ \mask_fmt, 1, v0, v1, v0, v4, \
+ v3, v8, v10, v11
+ bilinear_combine \
+ \op, 1, v0, v1, v0, v18, v19, v9, \
+ v3, v8, v10, v11, v5
+ bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0
+ bilinear_store_\()\dst_fmt 1, v17, v18
+.endm
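+
+/*
+ * Illustration, not part of the patch: the arithmetic above is the usual
+ * two-stage bilinear blend. With vertical weights wt/wb in v28/v29 and the
+ * horizontal weight wx taken from v15, the umull/umlal pair forms
+ *     t = top*wt + bottom*wb
+ * per channel, and the ushll/umlsl/umlal2 stage forms
+ *     result = (t_left*(N - wx) + t_right*wx) >> (2*BILINEAR_INTERPOLATION_BITS)
+ * where N = 1 << BILINEAR_INTERPOLATION_BITS, before the narrowing shrn/xtn.
+ */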
+
+.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
+ v1, v11, v18, v19, v20, v21, v22, v23
+ bilinear_load_mask \mask_fmt, 2, v4
+ bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ bilinear_duplicate_mask \mask_fmt, 2, v4
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ xtn v0.8b, v0.8h
+ bilinear_interleave_src_dst \
+ \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9
+ bilinear_apply_mask_to_src \
+ \mask_fmt, 2, v0, v1, v0, v4, \
+ v3, v8, v10, v11
+ bilinear_combine \
+ \op, 2, v0, v1, v0, v18, v19, v9, \
+ v3, v8, v10, v11, v5
+ bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0
+ bilinear_store_\()\dst_fmt 2, v16, v17
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
+ v1, v11, v4, v5, v6, v7, v22, v23, \
+ v3, v9, v16, v17, v20, v21, v18, v19
+ prfm PREFETCH_MODE, [TMP1, PF_OFFS]
+ sub TMP1, TMP1, STRIDE
+ prfm PREFETCH_MODE, [TMP1, PF_OFFS]
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v3.4h, v15.h[0]
+ umlal2 v2.4s, v3.8h, v15.h[0]
+ ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v8.4s, v9.4h, v15.h[4]
+ umlal2 v8.4s, v9.8h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ bilinear_load_mask \mask_fmt, 4, v4
+ bilinear_duplicate_mask \mask_fmt, 4, v4
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ xtn v0.8b, v0.8h
+ xtn v1.8b, v2.8h
+ add v12.8h, v12.8h, v13.8h
+ bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21
+ bilinear_interleave_src_dst \
+ \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11
+ bilinear_apply_mask_to_src \
+ \mask_fmt, 4, v0, v1, v0, v4, \
+ v6, v8, v9, v10
+ bilinear_combine \
+ \op, 4, v0, v1, v0, v2, v3, v1, \
+ v6, v8, v9, v10, v23
+ bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0
+ bilinear_store_\()\dst_fmt 4, v6, v7
+.endm
+
+.set BILINEAR_FLAG_USE_MASK, 1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+ * The bilinear scanline generator macro takes the following arguments:
+ *  fname - name of the function to generate
+ *  src_fmt - source color format (8888 or 0565)
+ *  dst_fmt - destination color format (8888 or 0565)
+ *  src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
+ *  process_last_pixel - code block that interpolates one pixel and does not
+ *                       update the horizontal weight
+ *  process_two_pixels - code block that interpolates two pixels and updates
+ *                       the horizontal weight
+ *  process_four_pixels - code block that interpolates four pixels and updates
+ *                        the horizontal weight
+ * process_pixblock_head - head part of middle loop
+ * process_pixblock_tail - tail part of middle loop
+ * process_pixblock_tail_head - tail_head of middle loop
+ * pixblock_size - number of pixels processed in a single middle loop
+ *  prefetch_distance - prefetch ahead in the source image by this many pixels
+ */
+
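+/*
+ * As a rough C sketch (illustrative only, not part of the generated
+ * code), each destination pixel is computed per color channel as:
+ *
+ *   t   = tl * wt + bl * wb;    // vertical pass, wt + wb == 1 << B
+ *   r   = tr * wt + br * wb;    // B = BILINEAR_INTERPOLATION_BITS
+ *   out = (t * ((1 << B) - distx) + r * distx) >> (2 * B);
+ *
+ * where tl/tr/bl/br are the four source pixels around the sampling
+ * point and distx is the top B bits of the 16-bit fractional part of X
+ * (extracted by the 'ushr v15.8h, v12.8h, #(16 - B)' steps below).
+ */
+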
+.macro generate_bilinear_scanline_func \
+ fname, \
+ src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+ bilinear_process_last_pixel, \
+ bilinear_process_two_pixels, \
+ bilinear_process_four_pixels, \
+ bilinear_process_pixblock_head, \
+ bilinear_process_pixblock_tail, \
+ bilinear_process_pixblock_tail_head, \
+ pixblock_size, \
+ prefetch_distance, \
+ flags
+
+pixman_asm_function \fname
+.if \pixblock_size == 8
+.elseif \pixblock_size == 4
+.else
+ .error unsupported pixblock size
+.endif
+
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
+ OUT .req x0
+ TOP .req x1
+ BOTTOM .req x2
+ WT .req x3
+ WWT .req w3
+ WB .req x4
+ WWB .req w4
+ X .req w5
+ UX .req w6
+ WIDTH .req x7
+ TMP1 .req x10
+ WTMP1 .req w10
+ TMP2 .req x11
+ WTMP2 .req w11
+ PF_OFFS .req x12
+ TMP3 .req x13
+ WTMP3 .req w13
+ TMP4 .req x14
+ WTMP4 .req w14
+ STRIDE .req x15
+ DUMMY .req x30
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 112
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ stp x10, x11, [x29, -80]
+ stp x12, x13, [x29, -96]
+ stp x14, x15, [x29, -112]
+.else
+ OUT .req x0
+ MASK .req x1
+ TOP .req x2
+ BOTTOM .req x3
+ WT .req x4
+ WWT .req w4
+ WB .req x5
+ WWB .req w5
+ X .req w6
+ UX .req w7
+ WIDTH .req x8
+ TMP1 .req x10
+ WTMP1 .req w10
+ TMP2 .req x11
+ WTMP2 .req w11
+ PF_OFFS .req x12
+ TMP3 .req x13
+ WTMP3 .req w13
+ TMP4 .req x14
+ WTMP4 .req w14
+ STRIDE .req x15
+ DUMMY .req x30
+
+ .set prefetch_offset, \prefetch_distance
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ stp x10, x11, [x29, -80]
+ stp x12, x13, [x29, -96]
+ stp x14, x15, [x29, -112]
+ str x8, [x29, -120]
+ ldr w8, [x29, 16]
+ sub sp, sp, 120
+.endif
+
+ mov WTMP1, #\prefetch_distance
+ umull PF_OFFS, WTMP1, UX
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 300f
+
+ dup v12.8h, X
+ dup v13.8h, UX
+ dup v28.8b, WWT
+ dup v29.8b, WWB
+ mov v25.d[0], v12.d[1]
+ mov v26.d[0], v13.d[0]
+ add v25.4h, v25.4h, v26.4h
+ mov v12.d[1], v25.d[0]
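+ /*
+  * v12 now holds the 16-bit fraction of X in lanes 0-3 and of X + UX in
+  * lanes 4-7 (the horizontal positions of two adjacent output pixels);
+  * v13 holds UX and is doubled once the loop starts stepping two
+  * pixels at a time.
+  */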
+
+ /*
+  * Ensure good destination alignment: peel off 1, 2 and (for 8-pixel
+  * blocks) 4 leading pixels until OUT is aligned to a whole pixel
+  * block boundary; each TST below checks the corresponding address bit.
+  */
+ cmp WIDTH, #1
+ blt 100f
+ tst OUT, #(1 << \dst_bpp_shift)
+ beq 100f
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ \bilinear_process_last_pixel
+ sub WIDTH, WIDTH, #1
+100:
+ add v13.8h, v13.8h, v13.8h
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+
+ cmp WIDTH, #2
+ blt 100f
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
+ beq 100f
+ \bilinear_process_two_pixels
+ sub WIDTH, WIDTH, #2
+100:
+.if \pixblock_size == 8
+ cmp WIDTH, #4
+ blt 100f
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
+ beq 100f
+ \bilinear_process_four_pixels
+ sub WIDTH, WIDTH, #4
+100:
+.endif
+ subs WIDTH, WIDTH, #\pixblock_size
+ blt 100f
+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
+ \bilinear_process_pixblock_head
+ subs WIDTH, WIDTH, #\pixblock_size
+ blt 500f
+0:
+ \bilinear_process_pixblock_tail_head
+ subs WIDTH, WIDTH, #\pixblock_size
+ bge 0b
+500:
+ \bilinear_process_pixblock_tail
+100:
+.if \pixblock_size == 8
+ tst WIDTH, #4
+ beq 200f
+ \bilinear_process_four_pixels
+200:
+.endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 200f
+ \bilinear_process_two_pixels
+200:
+ tst WIDTH, #1
+ beq 300f
+ \bilinear_process_last_pixel
+300:
+
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x10, x11, [x29, -80]
+ ldp x12, x13, [x29, -96]
+ ldp x14, x15, [x29, -112]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+.else
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x10, x11, [x29, -80]
+ ldp x12, x13, [x29, -96]
+ ldp x14, x15, [x29, -112]
+ ldr x8, [x29, -120]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+.endif
+ ret
+
+ .unreq OUT
+ .unreq TOP
+ .unreq WT
+ .unreq WWT
+ .unreq WB
+ .unreq WWB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq WTMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
+ .unreq MASK
+.endif
+
+pixman_end_asm_function
+
+.endm
+
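+/*
+ * Judging from the register aliases above, each generated function
+ * implements (approximately) the following C prototype; the names here
+ * are purely illustrative:
+ *
+ *   void fname (dst_t *out, [mask_t *mask,] src_t *top, src_t *bottom,
+ *               int wt, int wb, int32_t x, int32_t ux, int width);
+ *
+ * 'top' and 'bottom' point at the two source scanlines being
+ * interpolated, 'wt'/'wb' are the vertical weights and 'x'/'ux' are the
+ * 16.16 fixed point horizontal position and per-pixel increment. In the
+ * masked variant the ninth argument ('width') arrives on the stack,
+ * which is why the prologue performs 'ldr w8, [x29, 16]'.
+ */
+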
+/* src_8888_8_8888 */
+.macro bilinear_src_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_head
+ bilinear_src_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
+ bilinear_src_8888_8_8888_process_pixblock_tail
+ bilinear_src_8888_8_8888_process_pixblock_head
+.endm
+
+/* src_8888_8_0565 */
+.macro bilinear_src_8888_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_head
+ bilinear_src_8888_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
+ bilinear_src_8888_8_0565_process_pixblock_tail
+ bilinear_src_8888_8_0565_process_pixblock_head
+.endm
+
+/* src_0565_8_x888 */
+.macro bilinear_src_0565_8_x888_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_head
+ bilinear_src_0565_8_x888_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
+ bilinear_src_0565_8_x888_process_pixblock_tail
+ bilinear_src_0565_8_x888_process_pixblock_head
+.endm
+
+/* src_0565_8_0565 */
+.macro bilinear_src_0565_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_head
+ bilinear_src_0565_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
+ bilinear_src_0565_8_0565_process_pixblock_tail
+ bilinear_src_0565_8_0565_process_pixblock_head
+.endm
+
+/* over_8888_8888 */
+.macro bilinear_over_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_head
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+
+ ld1 {v22.2s}, [TMP1], STRIDE
+ ld1 {v23.2s}, [TMP1]
+ asr WTMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ umull v8.8h, v22.8b, v28.8b
+ umlal v8.8h, v23.8b, v29.8b
+
+ ld1 {v22.2s}, [TMP2], STRIDE
+ ld1 {v23.2s}, [TMP2]
+ asr WTMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umull v9.8h, v22.8b, v28.8b
+ umlal v9.8h, v23.8b, v29.8b
+
+ ld1 {v22.2s}, [TMP3], STRIDE
+ ld1 {v23.2s}, [TMP3]
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v8.4h, v15.h[0]
+ umlal2 v0.4s, v8.8h, v15.h[0]
+
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v1.4s, v9.4h, v15.h[4]
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v10.4h, v15.h[0]
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ xtn v6.8b, v0.8h
+ xtn v7.8b, v2.8h
+ ld1 {v2.2s, v3.2s}, [OUT]
+ prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ dup v4.2s, v7.s[1]
+ mvn v4.8b, v4.8b
+ umull v11.8h, v2.8b, v4.8b
+ umull v2.8h, v3.8b, v4.8b
+ urshr v1.8h, v11.8h, #8
+ urshr v10.8h, v2.8h, #8
+ raddhn v3.8b, v10.8h, v2.8h
+ raddhn v2.8b, v1.8h, v11.8h
+ uqadd v6.8b, v2.8b, v6.8b
+ uqadd v7.8b, v3.8b, v7.8b
+ vuzp v6.8b, v7.8b
+ vuzp v6.8b, v7.8b
+ add v12.8h, v12.8h, v13.8h
+ st1 {v6.2s, v7.2s}, [OUT], #16
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail_head
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ umlsl v2.4s, v10.4h, v15.h[0]
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ ld1 {v20.2s}, [TMP1], STRIDE
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ ld1 {v21.2s}, [TMP1]
+ umull v8.8h, v20.8b, v28.8b
+ umlal v8.8h, v21.8b, v29.8b
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ld1 {v22.2s}, [TMP2], STRIDE
+ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ xtn v6.8b, v0.8h
+ ld1 {v23.2s}, [TMP2]
+ umull v9.8h, v22.8b, v28.8b
+ asr WTMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ asr WTMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umlal v9.8h, v23.8b, v29.8b
+ xtn v7.8b, v2.8h
+ ld1 {v2.2s, v3.2s}, [OUT]
+ prfm PREFETCH_MODE, [OUT, PF_OFFS]
+ ld1 {v22.2s}, [TMP3], STRIDE
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ dup v4.2s, v7.s[1]
+ ld1 {v23.2s}, [TMP3]
+ mvn v4.8b, v4.8b
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+ umull v11.8h, v2.8b, v4.8b
+ umull v2.8h, v3.8b, v4.8b
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v8.4h, v15.h[0]
+ urshr v1.8h, v11.8h, #8
+ umlal2 v0.4s, v8.8h, v15.h[0]
+ urshr v8.8h, v2.8h, #8
+ raddhn v3.8b, v8.8h, v2.8h
+ raddhn v2.8b, v1.8h, v11.8h
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ uqadd v6.8b, v2.8b, v6.8b
+ uqadd v7.8b, v3.8b, v7.8b
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+ vuzp v6.8b, v7.8b
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ vuzp v6.8b, v7.8b
+ umlsl v1.4s, v9.4h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ st1 {v6.2s, v7.2s}, [OUT], #16
+.endm
+
+/* over_8888_8_8888 */
+.macro bilinear_over_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_four_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_head
+ bilinear_over_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
+ bilinear_over_8888_8_8888_process_pixblock_tail
+ bilinear_over_8888_8_8888_process_pixblock_head
+.endm
+
+/* add_8888_8888 */
+.macro bilinear_add_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_four_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_head
+ bilinear_add_8888_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail_head
+ bilinear_add_8888_8888_process_pixblock_tail
+ bilinear_add_8888_8888_process_pixblock_head
+.endm
+
+/* add_8888_8_8888 */
+.macro bilinear_add_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_head
+ bilinear_add_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
+ bilinear_add_8888_8_8888_process_pixblock_tail
+ bilinear_add_8888_8_8888_process_pixblock_head
+.endm
+
+
+/* Bilinear scanline functions */
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_src_8888_8_8888_process_last_pixel, \
+ bilinear_src_8888_8_8888_process_two_pixels, \
+ bilinear_src_8888_8_8888_process_four_pixels, \
+ bilinear_src_8888_8_8888_process_pixblock_head, \
+ bilinear_src_8888_8_8888_process_pixblock_tail, \
+ bilinear_src_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
+ 8888, 0565, 2, 1, \
+ bilinear_src_8888_8_0565_process_last_pixel, \
+ bilinear_src_8888_8_0565_process_two_pixels, \
+ bilinear_src_8888_8_0565_process_four_pixels, \
+ bilinear_src_8888_8_0565_process_pixblock_head, \
+ bilinear_src_8888_8_0565_process_pixblock_tail, \
+ bilinear_src_8888_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
+ 0565, 8888, 1, 2, \
+ bilinear_src_0565_8_x888_process_last_pixel, \
+ bilinear_src_0565_8_x888_process_two_pixels, \
+ bilinear_src_0565_8_x888_process_four_pixels, \
+ bilinear_src_0565_8_x888_process_pixblock_head, \
+ bilinear_src_0565_8_x888_process_pixblock_tail, \
+ bilinear_src_0565_8_x888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
+ 0565, 0565, 1, 1, \
+ bilinear_src_0565_8_0565_process_last_pixel, \
+ bilinear_src_0565_8_0565_process_two_pixels, \
+ bilinear_src_0565_8_0565_process_four_pixels, \
+ bilinear_src_0565_8_0565_process_pixblock_head, \
+ bilinear_src_0565_8_0565_process_pixblock_tail, \
+ bilinear_src_0565_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8888_process_last_pixel, \
+ bilinear_over_8888_8888_process_two_pixels, \
+ bilinear_over_8888_8888_process_four_pixels, \
+ bilinear_over_8888_8888_process_pixblock_head, \
+ bilinear_over_8888_8888_process_pixblock_tail, \
+ bilinear_over_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8_8888_process_last_pixel, \
+ bilinear_over_8888_8_8888_process_two_pixels, \
+ bilinear_over_8888_8_8888_process_four_pixels, \
+ bilinear_over_8888_8_8888_process_pixblock_head, \
+ bilinear_over_8888_8_8888_process_pixblock_tail, \
+ bilinear_over_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8888_process_last_pixel, \
+ bilinear_add_8888_8888_process_two_pixels, \
+ bilinear_add_8888_8888_process_four_pixels, \
+ bilinear_add_8888_8888_process_pixblock_head, \
+ bilinear_add_8888_8888_process_pixblock_tail, \
+ bilinear_add_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8_8888_process_last_pixel, \
+ bilinear_add_8888_8_8888_process_two_pixels, \
+ bilinear_add_8888_8_8888_process_four_pixels, \
+ bilinear_add_8888_8_8888_process_pixblock_head, \
+ bilinear_add_8888_8_8888_process_pixblock_tail, \
+ bilinear_add_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
new file mode 100644
index 0000000..107c133
--- /dev/null
+++ b/pixman/pixman-arma64-neon-asm.S
@@ -0,0 +1,3704 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for the following functions:
+ * - pixman_composite_over_8888_0565_asm_neon
+ * - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.arch armv8-a
+
+.altmacro
+.p2align 2
+
+#include "pixman-private.h"
+#include "pixman-arm-asm.h"
+#include "pixman-arma64-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance when handling leading/trailing pixels for each scanline.
+ * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0,
+ * for example on Linux, if unaligned memory accesses are not configured
+ * to generate exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to work around some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch introduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of functions can't support advanced prefetch and fall
+ * back to the simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
+ * buffer and performs the OVER compositing operation. The function
+ * fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * The template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * The recommended general NEON register allocation is the following:
+ * v0, v1, v2, v3 - contain loaded source pixel data
+ * v4, v5, v6, v7 - contain loaded destination pixels (if they are needed)
+ * v24, v25, v26, v27 - contain loaded mask pixel data (if mask is used)
+ * v28, v29, v30, v31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following register allocation:
+ * v0, v1, v2, v3 - contain loaded source pixel data
+ * v4, v5 - contain loaded destination pixels (they are needed)
+ * v28, v29 - place for storing the result (destination pixels)
+ */
+
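+/*
+ * A small C sketch (illustrative) of the division by 255 with rounding
+ * that the umull + urshr + raddhn instruction sequences in this file
+ * implement per color channel:
+ *
+ *   uint16_t p = a * b;                             // umull (a, b 8-bit)
+ *   uint8_t  q = (p + ((p + 128) >> 8) + 128) >> 8; // urshr + raddhn
+ *
+ * This is the standard pixman trick for computing (a * b) / 255,
+ * rounded to nearest, without an actual division.
+ */
+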
+/*
+ * Step one. We need to have some code to do arithmetic on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5},
+ * perform all the needed calculations and write the result to {v28, v29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. The common template macro can
+ * optionally make our life a bit easier by deinterleaving the R, G, B, A
+ * color components for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we
+ * actually use v0 register for blue channel (a vector of eight 8-bit
+ * values), v1 register for green, v2 for red and v3 for alpha. This
+ * simple conversion can also be done with a few NEON instructions:
+ *
+ * Packed to planar conversion: // vuzp8 is a wrapper macro
+ * vuzp8 v0, v1
+ * vuzp8 v2, v3
+ * vuzp8 v1, v3
+ * vuzp8 v0, v2
+ *
+ * Planar to packed conversion: // vzip8 is a wrapper macro
+ * vzip8 v0, v2
+ * vzip8 v1, v3
+ * vzip8 v2, v3
+ * vzip8 v0, v1
+ *
+ * But pixels can also be loaded directly in planar format using the LD4
+ * NEON instruction. It is 1 cycle slower than LD1, so this is not always
+ * desirable; that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
+ and put data into v6 - red, v7 - green, v30 - blue */
+ mov v4.d[1], v5.d[0]
+ shrn v6.8b, v4.8h, #8
+ shrn v7.8b, v4.8h, #3
+ sli v4.8h, v4.8h, #5
+ sri v6.8b, v6.8b, #5
+ mvn v3.8b, v3.8b /* invert source alpha */
+ sri v7.8b, v7.8b, #6
+ shrn v30.8b, v4.8h, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into v20 - red, v23 - green, v22 - blue */
+ umull v10.8h, v3.8b, v6.8b
+ umull v11.8h, v3.8b, v7.8b
+ umull v12.8h, v3.8b, v30.8b
+ urshr v17.8h, v10.8h, #8
+ urshr v18.8h, v11.8h, #8
+ urshr v19.8h, v12.8h, #8
+ raddhn v20.8b, v10.8h, v17.8h
+ raddhn v23.8b, v11.8h, v18.8h
+ raddhn v22.8b, v12.8h, v19.8h
+.endm
+
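+/*
+ * A note on the 0565 unpack above (bit-level sketch): 'shrn #8' leaves
+ * the 5 red bits at the top of each byte and 'shrn #3' the 6 green
+ * bits; the 'sri' instructions then replicate the top bits into the low
+ * ones, so a 5-bit red value r becomes the 8-bit value (r << 3) | (r >> 2).
+ * For blue, 'sli #5' moves the 5 blue bits up first, so that 'shrn #2'
+ * directly produces the replicated 8-bit value.
+ */
+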
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ uqadd v17.8b, v2.8b, v20.8b
+ uqadd v18.8b, v0.8b, v22.8b
+ uqadd v19.8b, v1.8b, v23.8b
+ /* convert the result to r5g6b5 and store it into {v14} */
+ ushll v14.8h, v17.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v8.8h, v19.8b, #7
+ sli v8.8h, v8.8h, #1
+ ushll v9.8h, v18.8b, #7
+ sli v9.8h, v9.8h, #1
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
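+/*
+ * A note on the 16bpp pack above (bit-level sketch): 'ushll #7'
+ * followed by 'sli #1' shifts each 8-bit channel left by 8 in two steps
+ * (USHLL cannot encode a shift of 8 for 8-bit elements), and the two
+ * 'sri' instructions then insert green at bit 5 and blue at bit 0:
+ *
+ *   out = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
+ */
+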
+/*
+ * OK, now we have almost everything that we need. Using the above two
+ * macros, the work can be done. But now we want to optimize
+ * it a bit. The ARM Cortex-A8 is an in-order core, and benefits
+ * greatly from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ * head
+ * while (...) {
+ * tail
+ * head
+ * }
+ * tail
+ *
+ * It may look a bit weird, but this setup makes it possible to hide
+ * instruction latencies better and also to utilize the dual-issue
+ * capability more efficiently (pairing load/store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ * pixman_composite_over_8888_0565_process_pixblock_tail
+ * st1 {v28.4h, v29.4h}, [DST_W], #32
+ * ld1 {v4.4h, v5.4h}, [DST_R], #16
+ * ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32
+ * pixman_composite_over_8888_0565_process_pixblock_head
+ * cache_preload 8, 8
+ *
+ * Now it also has some LD/ST instructions. We simply can't move from
+ * processing one block of pixels to the next with just arithmetic.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in bulk. Additionally, the destination buffer is already
+ * 16-byte aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * the destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is the 'cache_preload' macro. It is used for
+ * prefetching data into the CPU L2 cache to improve performance when
+ * dealing with images which are far larger than the cache size. It
+ * takes one argument (actually two, but they need to be the same here) -
+ * the number of pixels in a block. Looking into 'pixman-arm-neon-asm.h'
+ * can provide some details about this macro. Moreover, if good
+ * performance is needed, the code from this macro needs to be copied
+ * into the '*_tail_head' macro and mixed with the rest of the code for
+ * optimal instruction scheduling. We are actually doing that below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few LD/ST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ uqadd v17.8b, v2.8b, v20.8b
+ ld1 {v4.4h, v5.4h}, [DST_R], #16
+ mov v4.d[1], v5.d[0]
+ uqadd v18.8b, v0.8b, v22.8b
+ uqadd v19.8b, v1.8b, v23.8b
+ shrn v6.8b, v4.8h, #8
+ fetch_src_pixblock
+ shrn v7.8b, v4.8h, #3
+ sli v4.8h, v4.8h, #5
+ ushll v14.8h, v17.8b, #7
+ sli v14.8h, v14.8h, #1
+ PF add, PF_X, PF_X, #8
+ ushll v8.8h, v19.8b, #7
+ sli v8.8h, v8.8h, #1
+ PF tst, PF_CTL, #0xF
+ sri v6.8b, v6.8b, #5
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+10:
+ mvn v3.8b, v3.8b
+ PF beq, 10f
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ sri v7.8b, v7.8b, #6
+ shrn v30.8b, v4.8h, #2
+ umull v10.8h, v3.8b, v6.8b
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ umull v11.8h, v3.8b, v7.8b
+ umull v12.8h, v3.8b, v30.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ sri v14.8h, v8.8h, #5
+ PF cmp, PF_X, ORIG_W
+ ushll v9.8h, v18.8b, #7
+ sli v9.8h, v9.8h, #1
+ urshr v17.8h, v10.8h, #8
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ urshr v19.8h, v11.8h, #8
+ urshr v18.8h, v12.8h, #8
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+ raddhn v20.8b, v10.8h, v17.8h
+ raddhn v23.8b, v11.8h, v19.8h
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ raddhn v22.8b, v12.8h, v18.8h
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
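+/*
+ * The 'PF'-prefixed instructions interleaved above implement the
+ * advanced prefetch scheme from 'pixman-arm-neon-asm.h'. A rough,
+ * illustrative C model of the per-block bookkeeping:
+ *
+ *   pf_x += pixblock_size;                     // advance prefetch position
+ *   prefetch (pf_src + (pf_x << src_bpp_shift));
+ *   if (pf_x >= orig_w)                        // ran past the scanline end:
+ *       pf_x -= orig_w;                        // wrap, and touch the next
+ *                                              // scanline with a dummy load
+ *
+ * PF_CTL counts the scanlines still to be prefetched, and the dummy
+ * 'ldrsb' loads touch the next scanline early so that the page/TLB walk
+ * happens before the real accesses get there.
+ */
+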
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ pixman_composite_over_8888_0565_process_pixblock_tail
+ st1 {v14.8h}, [DST_W], #16
+ ld1 {v4.4h, v5.4h}, [DST_R], #16
+ fetch_src_pixblock
+ pixman_composite_over_8888_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using the 'generate_composite_function'
+ * macro to put all the stuff together. We specify the name of the function
+ * which we want to get, the number of bits per pixel for the source, mask
+ * and destination (0 if unused, like the mask in this case). Next come some bit
+ * flags:
+ * FLAG_DST_READWRITE - tells that the destination buffer is both read
+ * and written, for write-only buffer we would use
+ * FLAG_DST_WRITEONLY flag instead
+ * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ * and separate color channels for 32bpp format.
+ * The next things are:
+ * - the number of pixels processed per iteration (8 in this case, because
+ *   that's the maximum that can fit into four 64-bit NEON registers).
+ * - prefetch distance, measured in pixel blocks. In this case it is 5 blocks
+ *   of 8 pixels, i.e. 40 pixels, or up to 160 bytes. Optimal
+ * prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros; these are 'default_init' and
+ * 'default_cleanup' here, which are empty (but it is possible to have
+ * custom init/cleanup macros that save/restore some extra NEON registers
+ * like d8-d15, or do anything else), followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+ pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
+ and put data into v6 - red, v7 - green, v30 - blue */
+ mov v4.d[1], v5.d[0]
+ shrn v6.8b, v4.8h, #8
+ shrn v7.8b, v4.8h, #3
+ sli v4.8h, v4.8h, #5
+ sri v6.8b, v6.8b, #5
+ sri v7.8b, v7.8b, #6
+ shrn v30.8b, v4.8h, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into v20 - red, v23 - green, v22 - blue */
+ umull v10.8h, v3.8b, v6.8b
+ umull v11.8h, v3.8b, v7.8b
+ umull v12.8h, v3.8b, v30.8b
+ urshr v13.8h, v10.8h, #8
+ urshr v14.8h, v11.8h, #8
+ urshr v15.8h, v12.8h, #8
+ raddhn v20.8b, v10.8h, v13.8h
+ raddhn v23.8b, v11.8h, v14.8h
+ raddhn v22.8b, v12.8h, v15.8h
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ uqadd v17.8b, v2.8b, v20.8b
+ uqadd v18.8b, v0.8b, v22.8b
+ uqadd v19.8b, v1.8b, v23.8b
+ /* convert the result to r5g6b5 and store it into {v14} */
+ ushll v14.8h, v17.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v8.8h, v19.8b, #7
+ sli v8.8h, v8.8h, #1
+ ushll v9.8h, v18.8b, #7
+ sli v9.8h, v9.8h, #1
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+ pixman_composite_over_n_0565_process_pixblock_tail
+ ld1 {v4.4h, v5.4h}, [DST_R], #16
+ st1 {v14.8h}, [DST_W], #16
+ pixman_composite_over_n_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
+ mov v3.s[0], w4
+ dup v0.8b, v3.b[0]
+ dup v1.8b, v3.b[1]
+ dup v2.8b, v3.b[2]
+ dup v3.8b, v3.b[3]
+ mvn v3.8b, v3.8b /* invert source alpha */
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_0565_init, \
+ default_cleanup, \
+ pixman_composite_over_n_0565_process_pixblock_head, \
+ pixman_composite_over_n_0565_process_pixblock_tail, \
+ pixman_composite_over_n_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+ ushll v8.8h, v1.8b, #7
+ sli v8.8h, v8.8h, #1
+ ushll v14.8h, v2.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v9.8h, v0.8b, #7
+ sli v9.8h, v9.8h, #1
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+ sri v14.8h, v8.8h, #5
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ fetch_src_pixblock
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ ushll v8.8h, v1.8b, #7
+ sli v8.8h, v8.8h, #1
+ st1 {v14.8h}, [DST_W], #16
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ ushll v14.8h, v2.8b, #7
+ sli v14.8h, v14.8h, #1
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+ ushll v9.8h, v0.8b, #7
+ sli v9.8h, v9.8h, #1
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+ mov v0.d[1], v1.d[0]
+ shrn v30.8b, v0.8h, #8
+ shrn v29.8b, v0.8h, #3
+ sli v0.8h, v0.8h, #5
+ movi v31.8b, #255
+ sri v30.8b, v30.8b, #5
+ sri v29.8b, v29.8b, #6
+ shrn v28.8b, v0.8h, #2
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+ pixman_composite_src_0565_8888_process_pixblock_tail
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ fetch_src_pixblock
+ pixman_composite_src_0565_8888_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_process_pixblock_head
+ uqadd v28.8b, v0.8b, v4.8b
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
+
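+/*
+ * The ADD operator is simply a byte-wise saturating addition,
+ * dst = MIN (src + dst, 255), i.e. one UQADD per eight channels.
+ */
+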
+.macro pixman_composite_add_8_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add, PF_X, PF_X, #32
+ PF tst, PF_CTL, #0xF
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ PF beq, 10f
+ PF add, PF_X, PF_X, #32
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ uqadd v28.8b, v0.8b, v4.8b
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ uqadd v28.8b, v0.8b, v4.8b
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+ mvn v24.8b, v3.8b /* get inverted alpha */
+ /* do alpha blending */
+ umull v8.8h, v24.8b, v4.8b
+ umull v9.8h, v24.8b, v5.8b
+ umull v10.8h, v24.8b, v6.8b
+ umull v11.8h, v24.8b, v7.8b
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ urshr v14.8h, v8.8h, #8
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ PF cmp, PF_X, ORIG_W
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+ fetch_src_pixblock
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ mvn v22.8b, v3.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v8.8h, v22.8b, v4.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v9.8h, v22.8b, v5.8b
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+ umull v10.8h, v22.8b, v6.8b
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ umull v11.8h, v22.8b, v7.8b
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
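+/*
+ * With premultiplied alpha, OVER is OUT_REVERSE plus the source,
+ * per channel:
+ *
+ *   dst = src + dst * (255 - src_alpha) / 255
+ *
+ * hence the shared head macro and the four extra UQADD instructions in
+ * the tail.
+ */
+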
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ urshr v14.8h, v8.8h, #8
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ PF cmp, PF_X, ORIG_W
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ fetch_src_pixblock
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ mvn v22.8b, v3.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v8.8h, v22.8b, v4.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v9.8h, v22.8b, v5.8b
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+ umull v10.8h, v22.8b, v6.8b
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ umull v11.8h, v22.8b, v7.8b
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_process_pixblock_head
+ /* deinterleaved source pixels in {v0, v1, v2, v3} */
+ /* inverted alpha in {v24} */
+ /* destination pixels in {v4, v5, v6, v7} */
+ umull v8.8h, v24.8b, v4.8b
+ umull v9.8h, v24.8b, v5.8b
+ umull v10.8h, v24.8b, v6.8b
+ umull v11.8h, v24.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v16.8h, v10.8h, #8
+ urshr v17.8h, v11.8h, #8
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ raddhn v30.8b, v16.8h, v10.8h
+ raddhn v31.8b, v17.8h, v11.8h
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ uqadd v28.8b, v0.8b, v28.8b
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0x0F
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ PF cmp, PF_X, ORIG_W
+ umull v8.8h, v24.8b, v4.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ umull v9.8h, v24.8b, v5.8b
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v10.8h, v24.8b, v6.8b
+ PF subs, PF_CTL, PF_CTL, #0x10
+ umull v11.8h, v24.8b, v7.8b
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_over_n_8888_init
+ mov v3.s[0], w4
+ dup v0.8b, v3.b[0]
+ dup v1.8b, v3.b[1]
+ dup v2.8b, v3.b[2]
+ dup v3.8b, v3.b[3]
+ mvn v24.8b, v3.8b /* get inverted alpha */
+.endm
+
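+/*
+ * The solid source color arrives in w4 as a packed a8r8g8b8 value; the
+ * init macro above splits it into the per-channel vectors v0-v3 and
+ * precomputes the inverted alpha in v24.
+ */
+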
+generate_composite_function \
+ pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+ urshr v14.8h, v8.8h, #8
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ urshr v15.8h, v9.8h, #8
+ urshr v12.8h, v10.8h, #8
+ urshr v13.8h, v11.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ PF cmp, PF_X, ORIG_W
+ raddhn v30.8b, v12.8h, v10.8h
+ raddhn v31.8b, v13.8h, v11.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
+ mvn v22.8b, v3.8b
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF blt, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v8.8h, v22.8b, v4.8b
+ PF blt, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v9.8h, v22.8b, v5.8b
+ umull v10.8h, v22.8b, v6.8b
+ PF blt, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ umull v11.8h, v22.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
+ mov v7.s[0], w4
+ dup v4.8b, v7.b[0]
+ dup v5.8b, v7.b[1]
+ dup v6.8b, v7.b[2]
+ dup v7.8b, v7.b[3]
+.endm
+
+generate_composite_function \
+ pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_reverse_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 4, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
+ umull v0.8h, v24.8b, v8.8b /* IN for SRC pixels (part1) */
+ umull v1.8h, v24.8b, v9.8b
+ umull v2.8h, v24.8b, v10.8b
+ umull v3.8h, v24.8b, v11.8b
+ mov v4.d[1], v5.d[0]
+ shrn v25.8b, v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */
+ shrn v26.8b, v4.8h, #3
+ sli v4.8h, v4.8h, #5
+ urshr v17.8h, v0.8h, #8 /* IN for SRC pixels (part2) */
+ urshr v18.8h, v1.8h, #8
+ urshr v19.8h, v2.8h, #8
+ urshr v20.8h, v3.8h, #8
+ raddhn v0.8b, v0.8h, v17.8h
+ raddhn v1.8b, v1.8h, v18.8h
+ raddhn v2.8b, v2.8h, v19.8h
+ raddhn v3.8b, v3.8h, v20.8h
+ sri v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */
+ sri v26.8b, v26.8b, #6
+ mvn v3.8b, v3.8b
+ shrn v30.8b, v4.8h, #2
+ umull v18.8h, v3.8b, v25.8b /* now do alpha blending */
+ umull v19.8h, v3.8b, v26.8b
+ umull v20.8h, v3.8b, v30.8b
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
+ /* 3 cycle bubble (after umull) */
+ urshr v5.8h, v18.8h, #8
+ urshr v6.8h, v19.8h, #8
+ urshr v7.8h, v20.8h, #8
+ raddhn v17.8b, v18.8h, v5.8h
+ raddhn v19.8b, v19.8h, v6.8h
+ raddhn v18.8b, v20.8h, v7.8h
+ uqadd v5.8b, v2.8b, v17.8b
+ /* 1 cycle bubble */
+ uqadd v6.8b, v0.8b, v18.8b
+ uqadd v7.8b, v1.8b, v19.8b
+ ushll v14.8h, v5.8b, #7 /* convert to 16bpp */
+ sli v14.8h, v14.8h, #1
+ ushll v18.8h, v7.8b, #7
+ sli v18.8h, v18.8h, #1
+ ushll v19.8h, v6.8b, #7
+ sli v19.8h, v19.8h, #1
+ sri v14.8h, v18.8h, #5
+ /* 1 cycle bubble */
+ sri v14.8h, v19.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+#if 0
+ ld1 {v4.8h}, [DST_R], #16
+ shrn v25.8b, v4.8h, #8
+ fetch_mask_pixblock
+ shrn v26.8b, v4.8h, #3
+ fetch_src_pixblock
+ umull v22.8h, v24.8b, v10.8b
+ urshr v13.8h, v18.8h, #8
+ urshr v11.8h, v19.8h, #8
+ urshr v15.8h, v20.8h, #8
+ raddhn v17.8b, v18.8h, v13.8h
+ raddhn v19.8b, v19.8h, v11.8h
+ raddhn v18.8b, v20.8h, v15.8h
+ uqadd v17.8b, v2.8b, v17.8b
+ umull v21.8h, v24.8b, v9.8b
+ uqadd v18.8b, v0.8b, v18.8b
+ uqadd v19.8b, v1.8b, v19.8b
+ ushll v14.8h, v17.8b, #7
+ sli v14.8h, v14.8h, #1
+ umull v20.8h, v24.8b, v8.8b
+ ushll v18.8h, v18.8b, #7
+ sli v18.8h, v18.8h, #1
+ ushll v19.8h, v19.8b, #7
+ sli v19.8h, v19.8h, #1
+ sri v14.8h, v18.8h, #5
+ umull v23.8h, v24.8b, v11.8b
+ sri v14.8h, v19.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+
+ cache_preload 8, 8
+
+ sli v4.8h, v4.8h, #5
+ urshr v16.8h, v20.8h, #8
+ urshr v17.8h, v21.8h, #8
+ urshr v18.8h, v22.8h, #8
+ urshr v19.8h, v23.8h, #8
+ raddhn v0.8b, v20.8h, v16.8h
+ raddhn v1.8b, v21.8h, v17.8h
+ raddhn v2.8b, v22.8h, v18.8h
+ raddhn v3.8b, v23.8h, v19.8h
+ sri v25.8b, v25.8b, #5
+ sri v26.8b, v26.8b, #6
+ mvn v3.8b, v3.8b
+ shrn v30.8b, v4.8h, #2
+ st1 {v14.8h}, [DST_W], #16
+ umull v18.8h, v3.8b, v25.8b
+ umull v19.8h, v3.8b, v26.8b
+ umull v20.8h, v3.8b, v30.8b
+#else
+ pixman_composite_over_8888_8_0565_process_pixblock_tail
+ st1 {v28.4h, v29.4h}, [DST_W], #16
+ ld1 {v4.4h, v5.4h}, [DST_R], #16
+ fetch_mask_pixblock
+ fetch_src_pixblock
+ pixman_composite_over_8888_8_0565_process_pixblock_head
+#endif
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+/*
+ * This function needs a special initialization of solid mask.
+ * Solid source pixel data is passed in the third argument (w4),
+ * split into color components and replicated in the v8-v11
+ * registers. Additionally, this function needs all the NEON
+ * registers, so it has to save the d8-d15 registers, which are
+ * callee saved according to the ABI. These registers are restored
+ * from the 'cleanup' macro. All the other NEON registers are
+ * caller saved, so they can be clobbered freely without
+ * introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+ mov v11.s[0], w4
+ dup v8.8b, v11.b[0]
+ dup v9.8b, v11.b[1]
+ dup v10.8b, v11.b[2]
+ dup v11.8b, v11.b[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_0565_init, \
+ pixman_composite_over_n_8_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+ mov v24.s[0], w6
+ dup v24.8b, v24.b[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_0565_init, \
+ pixman_composite_over_8888_n_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
+ fetch_src_pixblock
+ cache_preload 16, 16
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_0565_process_pixblock_head, \
+ pixman_composite_src_0565_0565_process_pixblock_tail, \
+ pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+ st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_src_n_8_init
+ mov v0.s[0], w4
+ dup v3.8b, v0.b[0]
+ dup v2.8b, v0.b[0]
+ dup v1.8b, v0.b[0]
+ dup v0.8b, v0.b[0]
+.endm
+
+.macro pixman_composite_src_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8_init, \
+ pixman_composite_src_n_8_cleanup, \
+ pixman_composite_src_n_8_process_pixblock_head, \
+ pixman_composite_src_n_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
+.endm
+
+.macro pixman_composite_src_n_0565_init
+ mov v0.s[0], w4
+ dup v3.4h, v0.h[0]
+ dup v2.4h, v0.h[0]
+ dup v1.4h, v0.h[0]
+ dup v0.4h, v0.h[0]
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_0565_init, \
+ pixman_composite_src_n_0565_cleanup, \
+ pixman_composite_src_n_0565_process_pixblock_head, \
+ pixman_composite_src_n_0565_process_pixblock_tail, \
+ pixman_composite_src_n_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
+.endm
+
+.macro pixman_composite_src_n_8888_init
+ mov v0.s[0], w4
+ dup v3.2s, v0.s[0]
+ dup v2.2s, v0.s[0]
+ dup v1.2s, v0.s[0]
+ dup v0.2s, v0.s[0]
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8888_init, \
+ pixman_composite_src_n_8888_cleanup, \
+ pixman_composite_src_n_8888_process_pixblock_head, \
+ pixman_composite_src_n_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
+ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_8888_process_pixblock_head, \
+ pixman_composite_src_8888_8888_process_pixblock_tail, \
+ pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head
+ orr v0.8b, v0.8b, v4.8b
+ orr v1.8b, v1.8b, v4.8b
+ orr v2.8b, v2.8b, v4.8b
+ orr v3.8b, v3.8b, v4.8b
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
+ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
+ fetch_src_pixblock
+ orr v0.8b, v0.8b, v4.8b
+ orr v1.8b, v1.8b, v4.8b
+ orr v2.8b, v2.8b, v4.8b
+ orr v3.8b, v3.8b, v4.8b
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_x888_8888_init
+ movi v4.2s, #0xff, lsl #24
+.endm
+
+generate_composite_function \
+ pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_x888_8888_init, \
+ default_cleanup, \
+ pixman_composite_src_x888_8888_process_pixblock_head, \
+ pixman_composite_src_x888_8888_process_pixblock_tail, \
+ pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+ /* expecting solid source in {v0, v1, v2, v3} */
+ /* mask is in v24 (v25, v26, v27 are unused) */
+
+ /* in: src = src * mask / 255 */
+ umull v8.8h, v24.8b, v0.8b
+ umull v9.8h, v24.8b, v1.8b
+ umull v10.8h, v24.8b, v2.8b
+ umull v11.8h, v24.8b, v3.8b
+ ursra v8.8h, v8.8h, #8
+ ursra v9.8h, v9.8h, #8
+ ursra v10.8h, v10.8h, #8
+ ursra v11.8h, v11.8h, #8
+.endm
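+
+/*
+ * umull + 'ursra #8' leaves t + ((t + 128) >> 8) in each 16-bit
+ * accumulator, so the 'rshrn #8' in the tail produces
+ * (t + ((t + 128) >> 8) + 128) >> 8, the same rounded x*a/255
+ * arithmetic as MUL_UN8 in pixman-combine32.h.
+ */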
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+ rshrn v28.8b, v8.8h, #8
+ rshrn v29.8b, v9.8h, #8
+ rshrn v30.8b, v10.8h, #8
+ rshrn v31.8b, v11.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add, PF_X, PF_X, #8
+ rshrn v28.8b, v8.8h, #8
+ PF tst, PF_CTL, #0x0F
+ rshrn v29.8b, v9.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+10:
+ rshrn v30.8b, v10.8h, #8
+ PF beq, 10f
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ rshrn v31.8b, v11.8h, #8
+ PF cmp, PF_X, ORIG_W
+ umull v8.8h, v24.8b, v0.8b
+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
+ umull v9.8h, v24.8b, v1.8b
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v10.8h, v24.8b, v2.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v11.8h, v24.8b, v3.8b
+ PF ble, 10f
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
+ PF add, PF_MASK, PF_MASK, #1
+10:
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ursra v8.8h, v8.8h, #8
+ ursra v9.8h, v9.8h, #8
+ ursra v10.8h, v10.8h, #8
+ ursra v11.8h, v11.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+ mov v3.s[0], w4
+ dup v0.8b, v3.b[0]
+ dup v1.8b, v3.b[1]
+ dup v2.8b, v3.b[2]
+ dup v3.8b, v3.b[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8888_init, \
+ pixman_composite_src_n_8_8888_cleanup, \
+ pixman_composite_src_n_8_8888_process_pixblock_head, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+ umull v0.8h, v24.8b, v16.8b
+ umull v1.8h, v25.8b, v16.8b
+ umull v2.8h, v26.8b, v16.8b
+ umull v3.8h, v27.8b, v16.8b
+ ursra v0.8h, v0.8h, #8
+ ursra v1.8h, v1.8h, #8
+ ursra v2.8h, v2.8h, #8
+ ursra v3.8h, v3.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+ rshrn v28.8b, v0.8h, #8
+ rshrn v29.8b, v1.8h, #8
+ rshrn v30.8b, v2.8h, #8
+ rshrn v31.8b, v3.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add, PF_X, PF_X, #8
+ rshrn v28.8b, v0.8h, #8
+ PF tst, PF_CTL, #0x0F
+ rshrn v29.8b, v1.8h, #8
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+10:
+ rshrn v30.8b, v2.8h, #8
+ PF beq, 10f
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ rshrn v31.8b, v3.8h, #8
+ PF cmp, PF_X, ORIG_W
+ umull v0.8h, v24.8b, v16.8b
+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
+ umull v1.8h, v25.8b, v16.8b
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v2.8h, v26.8b, v16.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v3.8h, v27.8b, v16.8b
+ PF ble, 10f
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
+ PF add, PF_MASK, PF_MASK, #1
+10:
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ursra v0.8h, v0.8h, #8
+ ursra v1.8h, v1.8h, #8
+ ursra v2.8h, v2.8h, #8
+ ursra v3.8h, v3.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+ mov v16.s[0], w4
+ dup v16.8b, v16.b[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8_init, \
+ pixman_composite_src_n_8_8_cleanup, \
+ pixman_composite_src_n_8_8_process_pixblock_head, \
+ pixman_composite_src_n_8_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+ /* expecting deinterleaved source data in {v8, v9, v10, v11} */
+ /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
+ /* and destination data in {v4, v5, v6, v7} */
+ /* mask is in v24 (v25, v26, v27 are unused) */
+
+ /* in: src = src * mask / 255 */
+ umull v12.8h, v24.8b, v8.8b
+ umull v13.8h, v24.8b, v9.8b
+ umull v14.8h, v24.8b, v10.8b
+ umull v15.8h, v24.8b, v11.8b
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ urshr v18.8h, v14.8h, #8
+ urshr v19.8h, v15.8h, #8
+ raddhn v0.8b, v12.8h, v16.8h
+ raddhn v1.8b, v13.8h, v17.8h
+ raddhn v2.8b, v14.8h, v18.8h
+ raddhn v3.8b, v15.8h, v19.8h
+ mvn v25.8b, v3.8b /* get inverted alpha */
+ /* source: v0 - blue, v1 - green, v2 - red, v3 - alpha */
+ /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */
+ /* now do alpha blending */
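+ /* OVER: dest = src + (1 - src_alpha) * dest; v25 holds the inverted alpha */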
+ umull v12.8h, v25.8b, v4.8b
+ umull v13.8h, v25.8b, v5.8b
+ umull v14.8h, v25.8b, v6.8b
+ umull v15.8h, v25.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ urshr v18.8h, v14.8h, #8
+ urshr v19.8h, v15.8h, #8
+ raddhn v28.8b, v16.8h, v12.8h
+ raddhn v29.8b, v17.8h, v13.8h
+ raddhn v30.8b, v18.8h, v14.8h
+ raddhn v31.8b, v19.8h, v15.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+ urshr v16.8h, v12.8h, #8
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ urshr v17.8h, v13.8h, #8
+ fetch_mask_pixblock
+ urshr v18.8h, v14.8h, #8
+ PF add, PF_X, PF_X, #8
+ urshr v19.8h, v15.8h, #8
+ PF tst, PF_CTL, #0x0F
+ raddhn v28.8b, v16.8h, v12.8h
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+10:
+ raddhn v29.8b, v17.8h, v13.8h
+ PF beq, 10f
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v30.8b, v18.8h, v14.8h
+ PF cmp, PF_X, ORIG_W
+ raddhn v31.8b, v19.8h, v15.8h
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+ umull v16.8h, v24.8b, v8.8b
+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
+ umull v17.8h, v24.8b, v9.8b
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+10:
+ umull v18.8h, v24.8b, v10.8b
+ PF ble, 10f
+ PF subs, PF_CTL, PF_CTL, #0x10
+10:
+ umull v19.8h, v24.8b, v11.8b
+ PF ble, 10f
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+10:
+ uqadd v28.8b, v0.8b, v28.8b
+ PF ble, 10f
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
+ PF add, PF_MASK, PF_MASK, #1
+10:
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ urshr v12.8h, v16.8h, #8
+ urshr v13.8h, v17.8h, #8
+ urshr v14.8h, v18.8h, #8
+ urshr v15.8h, v19.8h, #8
+ raddhn v0.8b, v16.8h, v12.8h
+ raddhn v1.8b, v17.8h, v13.8h
+ raddhn v2.8b, v18.8h, v14.8h
+ raddhn v3.8b, v19.8h, v15.8h
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ mvn v25.8b, v3.8b
+ umull v12.8h, v25.8b, v4.8b
+ umull v13.8h, v25.8b, v5.8b
+ umull v14.8h, v25.8b, v6.8b
+ umull v15.8h, v25.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8_8888_init
+ mov v11.s[0], w4
+ dup v8.8b, v11.b[0]
+ dup v9.8b, v11.b[1]
+ dup v10.8b, v11.b[2]
+ dup v11.8b, v11.b[3]
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8888_init, \
+ pixman_composite_over_n_8_8888_cleanup, \
+ pixman_composite_over_n_8_8888_process_pixblock_head, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+ umull v0.8h, v24.8b, v8.8b
+ umull v1.8h, v25.8b, v8.8b
+ umull v2.8h, v26.8b, v8.8b
+ umull v3.8h, v27.8b, v8.8b
+ urshr v10.8h, v0.8h, #8
+ urshr v11.8h, v1.8h, #8
+ urshr v12.8h, v2.8h, #8
+ urshr v13.8h, v3.8h, #8
+ raddhn v0.8b, v0.8h, v10.8h
+ raddhn v1.8b, v1.8h, v11.8h
+ raddhn v2.8b, v2.8h, v12.8h
+ raddhn v3.8b, v3.8h, v13.8h
+ mvn v24.8b, v0.8b
+ mvn v25.8b, v1.8b
+ mvn v26.8b, v2.8b
+ mvn v27.8b, v3.8b
+ umull v10.8h, v24.8b, v4.8b
+ umull v11.8h, v25.8b, v5.8b
+ umull v12.8h, v26.8b, v6.8b
+ umull v13.8h, v27.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+ urshr v14.8h, v10.8h, #8
+ urshr v15.8h, v11.8h, #8
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ raddhn v28.8b, v14.8h, v10.8h
+ raddhn v29.8b, v15.8h, v11.8h
+ raddhn v30.8b, v16.8h, v12.8h
+ raddhn v31.8b, v17.8h, v13.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_over_n_8_8_process_pixblock_tail
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+ mov v8.s[0], w4
+ dup v8.8b, v8.b[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8_init, \
+ pixman_composite_over_n_8_8_cleanup, \
+ pixman_composite_over_n_8_8_process_pixblock_head, \
+ pixman_composite_over_n_8_8_process_pixblock_tail, \
+ pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {v8, v9, v10, v11}
+ * dest in {v4, v5, v6, v7 }
+ * mask in {v24, v25, v26, v27}
+ * output: updated src in {v0, v1, v2, v3 }
+ * updated mask in {v24, v25, v26, v3 }
+ */
+ umull v0.8h, v24.8b, v8.8b
+ umull v1.8h, v25.8b, v9.8b
+ umull v2.8h, v26.8b, v10.8b
+ umull v3.8h, v27.8b, v11.8b
+ umull v12.8h, v11.8b, v25.8b
+ umull v13.8h, v11.8b, v24.8b
+ umull v14.8h, v11.8b, v26.8b
+ urshr v15.8h, v0.8h, #8
+ urshr v16.8h, v1.8h, #8
+ urshr v17.8h, v2.8h, #8
+ raddhn v0.8b, v0.8h, v15.8h
+ raddhn v1.8b, v1.8h, v16.8h
+ raddhn v2.8b, v2.8h, v17.8h
+ urshr v15.8h, v13.8h, #8
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v14.8h, #8
+ urshr v18.8h, v3.8h, #8
+ raddhn v24.8b, v13.8h, v15.8h
+ raddhn v25.8b, v12.8h, v16.8h
+ raddhn v26.8b, v14.8h, v17.8h
+ raddhn v3.8b, v3.8h, v18.8h
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in {v28, v29, v30, v31}
+ */
+ mvn v24.8b, v24.8b
+ mvn v25.8b, v25.8b
+ mvn v26.8b, v26.8b
+ mvn v27.8b, v3.8b
+ umull v12.8h, v24.8b, v4.8b
+ umull v13.8h, v25.8b, v5.8b
+ umull v14.8h, v26.8b, v6.8b
+ umull v15.8h, v27.8b, v7.8b
+.endm
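+
+/*
+ * Component alpha: each mask channel scales the matching source
+ * channel (s' = m * s), while the source alpha scales the mask
+ * (m' = sa * m); the OVER step then uses the per-channel complement
+ * of m' instead of a single inverted alpha.
+ */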
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+ /* ... continue 'combine_over_ca' replacement */
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ urshr v18.8h, v14.8h, #8
+ urshr v19.8h, v15.8h, #8
+ raddhn v28.8b, v16.8h, v12.8h
+ raddhn v29.8b, v17.8h, v13.8h
+ raddhn v30.8b, v18.8h, v14.8h
+ raddhn v31.8b, v19.8h, v15.8h
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+ urshr v16.8h, v12.8h, #8
+ urshr v17.8h, v13.8h, #8
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ urshr v18.8h, v14.8h, #8
+ urshr v19.8h, v15.8h, #8
+ raddhn v28.8b, v16.8h, v12.8h
+ raddhn v29.8b, v17.8h, v13.8h
+ raddhn v30.8b, v18.8h, v14.8h
+ raddhn v31.8b, v19.8h, v15.8h
+ fetch_mask_pixblock
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+ cache_preload 8, 8
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init
+ mov v13.s[0], w4
+ dup v8.8b, v13.b[0]
+ dup v9.8b, v13.b[1]
+ dup v10.8b, v13.b[2]
+ dup v11.8b, v13.b[3]
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_8888_ca_init, \
+ pixman_composite_over_n_8888_8888_ca_cleanup, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
+ * mask in {v24, v25, v26} [B, G, R]
+ * output: updated src in {v0, v1, v2 } [B, G, R]
+ * updated mask in {v24, v25, v26} [B, G, R]
+ */
+ umull v0.8h, v24.8b, v8.8b
+ umull v1.8h, v25.8b, v9.8b
+ umull v2.8h, v26.8b, v10.8b
+ umull v12.8h, v11.8b, v24.8b
+ umull v13.8h, v11.8b, v25.8b
+ umull v14.8h, v11.8b, v26.8b
+ urshr v15.8h, v0.8h, #8
+ urshr v16.8h, v1.8h, #8
+ urshr v17.8h, v2.8h, #8
+ raddhn v0.8b, v0.8h, v15.8h
+ raddhn v1.8b, v1.8h, v16.8h
+ raddhn v2.8b, v2.8h, v17.8h
+ urshr v19.8h, v12.8h, #8
+ urshr v20.8h, v13.8h, #8
+ urshr v21.8h, v14.8h, #8
+ raddhn v24.8b, v12.8h, v19.8h
+ raddhn v25.8b, v13.8h, v20.8h
+ /*
+ * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
+ * and put data into v16 - blue, v17 - green, v18 - red
+ */
+ mov v4.d[1], v5.d[0]
+ shrn v17.8b, v4.8h, #3
+ shrn v18.8b, v4.8h, #8
+ raddhn v26.8b, v14.8h, v21.8h
+ sli v4.8h, v4.8h, #5
+ sri v18.8b, v18.8b, #5
+ sri v17.8b, v17.8b, #6
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in v16 - blue, v17 - green, v18 - red
+ */
+ mvn v24.8b, v24.8b
+ mvn v25.8b, v25.8b
+ shrn v16.8b, v4.8h, #2
+ mvn v26.8b, v26.8b
+ umull v5.8h, v16.8b, v24.8b
+ umull v6.8h, v17.8b, v25.8b
+ umull v7.8h, v18.8b, v26.8b
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
+ /* ... continue 'combine_over_ca' replacement */
+ urshr v13.8h, v5.8h, #8
+ urshr v14.8h, v6.8h, #8
+ urshr v15.8h, v7.8h, #8
+ raddhn v16.8b, v13.8h, v5.8h
+ raddhn v17.8b, v14.8h, v6.8h
+ raddhn v18.8b, v15.8h, v7.8h
+ uqadd v16.8b, v0.8b, v16.8b
+ uqadd v17.8b, v1.8b, v17.8b
+ uqadd v18.8b, v2.8b, v18.8b
+ /*
+ * convert the results in v16, v17, v18 to r5g6b5 and store
+ * them into {v14}
+ */
+ ushll v14.8h, v18.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v12.8h, v17.8b, #7
+ sli v12.8h, v12.8h, #1
+ ushll v13.8h, v16.8b, #7
+ sli v13.8h, v13.8h, #1
+ sri v14.8h, v12.8h, #5
+ sri v14.8h, v13.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+ fetch_mask_pixblock
+ urshr v13.8h, v5.8h, #8
+ urshr v14.8h, v6.8h, #8
+ ld1 {v4.8h}, [DST_R], #16
+ urshr v15.8h, v7.8h, #8
+ raddhn v16.8b, v13.8h, v5.8h
+ raddhn v17.8b, v14.8h, v6.8h
+ raddhn v18.8b, v15.8h, v7.8h
+ mov v5.d[0], v4.d[1]
+ /* process_pixblock_head */
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
+ * mask in {v24, v25, v26} [B, G, R]
+ * output: updated src in {v0, v1, v2 } [B, G, R]
+ * updated mask in {v24, v25, v26} [B, G, R]
+ */
+ uqadd v16.8b, v0.8b, v16.8b
+ uqadd v17.8b, v1.8b, v17.8b
+ uqadd v18.8b, v2.8b, v18.8b
+ umull v0.8h, v24.8b, v8.8b
+ umull v1.8h, v25.8b, v9.8b
+ umull v2.8h, v26.8b, v10.8b
+ /*
+ * convert the result in v16, v17, v18 to r5g6b5 and store
+ * it into {v14}
+ */
+ ushll v14.8h, v18.8b, #7
+ sli v14.8h, v14.8h, #1
+ ushll v18.8h, v16.8b, #7
+ sli v18.8h, v18.8h, #1
+ ushll v19.8h, v17.8b, #7
+ sli v19.8h, v19.8h, #1
+ umull v12.8h, v11.8b, v24.8b
+ sri v14.8h, v19.8h, #5
+ umull v13.8h, v11.8b, v25.8b
+ umull v15.8h, v11.8b, v26.8b
+ sri v14.8h, v18.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+ cache_preload 8, 8
+ urshr v16.8h, v0.8h, #8
+ urshr v17.8h, v1.8h, #8
+ urshr v18.8h, v2.8h, #8
+ raddhn v0.8b, v0.8h, v16.8h
+ raddhn v1.8b, v1.8h, v17.8h
+ raddhn v2.8b, v2.8h, v18.8h
+ urshr v19.8h, v12.8h, #8
+ urshr v20.8h, v13.8h, #8
+ urshr v21.8h, v15.8h, #8
+ raddhn v24.8b, v12.8h, v19.8h
+ raddhn v25.8b, v13.8h, v20.8h
+ /*
+ * convert 8 r5g6b5 pixel data from {v4, v5} to planar
+ * 8-bit format and put data into v16 - blue, v17 - green,
+ * v18 - red
+ */
+ mov v4.d[1], v5.d[0]
+ shrn v17.8b, v4.8h, #3
+ shrn v18.8b, v4.8h, #8
+ raddhn v26.8b, v15.8h, v21.8h
+ sli v4.8h, v4.8h, #5
+ sri v17.8b, v17.8b, #6
+ sri v18.8b, v18.8b, #5
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in v16 - blue, v17 - green, v18 - red
+ */
+ mvn v24.8b, v24.8b
+ mvn v25.8b, v25.8b
+ shrn v16.8b, v4.8h, #2
+ mvn v26.8b, v26.8b
+ umull v5.8h, v16.8b, v24.8b
+ umull v6.8h, v17.8b, v25.8b
+ umull v7.8h, v18.8b, v26.8b
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init
+ mov v13.s[0], w4
+ dup v8.8b, v13.b[0]
+ dup v9.8b, v13.b[1]
+ dup v10.8b, v13.b[2]
+ dup v11.8b, v13.b[3]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_0565_ca_init, \
+ pixman_composite_over_n_8888_0565_ca_cleanup, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_in_n_8_process_pixblock_head
+ /* expecting source data in {v0, v1, v2, v3} */
+ /* and destination data in {v4, v5, v6, v7} */
+ umull v8.8h, v4.8b, v3.8b
+ umull v9.8h, v5.8b, v3.8b
+ umull v10.8h, v6.8b, v3.8b
+ umull v11.8h, v7.8b, v3.8b
+.endm
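+
+/*
+ * IN: every destination byte is scaled by the solid source alpha
+ * (v3); the urshr/raddhn pair in the tail rounds the 16-bit products
+ * back to 8 bits.
+ */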
+
+.macro pixman_composite_in_n_8_process_pixblock_tail
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v12.8h, v10.8h, #8
+ urshr v13.8h, v11.8h, #8
+ raddhn v28.8b, v8.8h, v14.8h
+ raddhn v29.8b, v9.8h, v15.8h
+ raddhn v30.8b, v10.8h, v12.8h
+ raddhn v31.8b, v11.8h, v13.8h
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+ pixman_composite_in_n_8_process_pixblock_tail
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ cache_preload 32, 32
+ pixman_composite_in_n_8_process_pixblock_head
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_in_n_8_init
+ mov v3.s[0], w4
+ dup v3.8b, v3.b[3]
+.endm
+
+.macro pixman_composite_in_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_in_n_8_init, \
+ pixman_composite_in_n_8_cleanup, \
+ pixman_composite_in_n_8_process_pixblock_head, \
+ pixman_composite_in_n_8_process_pixblock_tail, \
+ pixman_composite_in_n_8_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+ /* expecting source data in {v8, v9, v10, v11} */
+ /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
+ /* and destination data in {v4, v5, v6, v7} */
+ /* mask is in v24, v25, v26, v27 */
+ umull v0.8h, v24.8b, v11.8b
+ umull v1.8h, v25.8b, v11.8b
+ umull v2.8h, v26.8b, v11.8b
+ umull v3.8h, v27.8b, v11.8b
+ urshr v12.8h, v0.8h, #8
+ urshr v13.8h, v1.8h, #8
+ urshr v14.8h, v2.8h, #8
+ urshr v15.8h, v3.8h, #8
+ raddhn v0.8b, v0.8h, v12.8h
+ raddhn v1.8b, v1.8h, v13.8h
+ raddhn v2.8b, v2.8h, v14.8h
+ raddhn v3.8b, v3.8h, v15.8h
+ uqadd v28.8b, v0.8b, v4.8b
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
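+
+/*
+ * ADD: dest = clamp(dest + src_alpha * mask / 255); the uqadd
+ * instructions provide the saturating clamp at 255.
+ */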
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+ pixman_composite_add_n_8_8_process_pixblock_tail
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init
+ mov v11.s[0], w4
+ dup v11.8b, v11.b[3]
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8_init, \
+ pixman_composite_add_n_8_8_cleanup, \
+ pixman_composite_add_n_8_8_process_pixblock_head, \
+ pixman_composite_add_n_8_8_process_pixblock_tail, \
+ pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+ /* expecting source data in {v0, v1, v2, v3} */
+ /* destination data in {v4, v5, v6, v7} */
+ /* mask in {v24, v25, v26, v27} */
+ umull v8.8h, v24.8b, v0.8b
+ umull v9.8h, v25.8b, v1.8b
+ umull v10.8h, v26.8b, v2.8b
+ umull v11.8h, v27.8b, v3.8b
+ urshr v0.8h, v8.8h, #8
+ urshr v1.8h, v9.8h, #8
+ urshr v12.8h, v10.8h, #8
+ urshr v13.8h, v11.8h, #8
+ raddhn v0.8b, v0.8h, v8.8h
+ raddhn v1.8b, v1.8h, v9.8h
+ raddhn v2.8b, v12.8h, v10.8h
+ raddhn v3.8b, v13.8h, v11.8h
+ uqadd v28.8b, v0.8b, v4.8b
+ uqadd v29.8b, v1.8b, v5.8b
+ uqadd v30.8b, v2.8b, v6.8b
+ uqadd v31.8b, v3.8b, v7.8b
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+ pixman_composite_add_8_8_8_process_pixblock_tail
+ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ fetch_mask_pixblock
+ fetch_src_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8_8_8_init, \
+ pixman_composite_add_8_8_8_cleanup, \
+ pixman_composite_add_8_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+ /* expecting source data in {v0, v1, v2, v3} */
+ /* destination data in {v4, v5, v6, v7} */
+ /* mask in {v24, v25, v26, v27} */
+ umull v8.8h, v27.8b, v0.8b
+ umull v9.8h, v27.8b, v1.8b
+ umull v10.8h, v27.8b, v2.8b
+ umull v11.8h, v27.8b, v3.8b
+ /* 1 cycle bubble */
+ ursra v8.8h, v8.8h, #8
+ ursra v9.8h, v9.8h, #8
+ ursra v10.8h, v10.8h, #8
+ ursra v11.8h, v11.8h, #8
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+ /* 2 cycle bubble */
+ rshrn v28.8b, v8.8h, #8
+ rshrn v29.8b, v9.8h, #8
+ rshrn v30.8b, v10.8h, #8
+ rshrn v31.8b, v11.8h, #8
+ uqadd v28.8b, v4.8b, v28.8b
+ uqadd v29.8b, v5.8b, v29.8b
+ uqadd v30.8b, v6.8b, v30.8b
+ uqadd v31.8b, v7.8b, v31.8b
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ rshrn v28.8b, v8.8h, #8
+ fetch_mask_pixblock
+ rshrn v29.8b, v9.8h, #8
+ umull v8.8h, v27.8b, v0.8b
+ rshrn v30.8b, v10.8h, #8
+ umull v9.8h, v27.8b, v1.8b
+ rshrn v31.8b, v11.8h, #8
+ umull v10.8h, v27.8b, v2.8b
+ umull v11.8h, v27.8b, v3.8b
+ uqadd v28.8b, v4.8b, v28.8b
+ uqadd v29.8b, v5.8b, v29.8b
+ uqadd v30.8b, v6.8b, v30.8b
+ uqadd v31.8b, v7.8b, v31.8b
+ ursra v8.8h, v8.8h, #8
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ ursra v9.8h, v9.8h, #8
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ ursra v10.8h, v10.8h, #8
+
+ cache_preload 8, 8
+
+ ursra v11.8h, v11.8h, #8
+.endm
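+
+/*
+ * As in the other tail_head macros, the loads, the store and the
+ * prefetch control are interleaved with the arithmetic so that memory
+ * latency overlaps the processing of neighbouring pixel blocks.
+ */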
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+generate_composite_function \
+ pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init
+ mov v3.s[0], w4
+ dup v0.8b, v3.b[0]
+ dup v1.8b, v3.b[1]
+ dup v2.8b, v3.b[2]
+ dup v3.8b, v3.b[3]
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8888_init, \
+ pixman_composite_add_n_8_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init
+ mov v27.s[0], w6
+ dup v27.8b, v27.b[3]
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8888_n_8888_init, \
+ pixman_composite_add_8888_n_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ /* expecting source data in {v0, v1, v2, v3} */
+ /* destination data in {v4, v5, v6, v7} */
+ /* solid mask is in v15 */
+
+ /* 'in': src = src * mask_alpha / 255 */
+ umull v11.8h, v15.8b, v3.8b
+ umull v10.8h, v15.8b, v2.8b
+ umull v9.8h, v15.8b, v1.8b
+ umull v8.8h, v15.8b, v0.8b
+ urshr v16.8h, v11.8h, #8
+ urshr v14.8h, v10.8h, #8
+ urshr v13.8h, v9.8h, #8
+ urshr v12.8h, v8.8h, #8
+ raddhn v3.8b, v11.8h, v16.8h
+ raddhn v2.8b, v10.8h, v14.8h
+ raddhn v1.8b, v9.8h, v13.8h
+ raddhn v0.8b, v8.8h, v12.8h
+ mvn v24.8b, v3.8b /* get inverted alpha */
+ /* now do alpha blending */
+ umull v8.8h, v24.8b, v4.8b
+ umull v9.8h, v24.8b, v5.8b
+ umull v10.8h, v24.8b, v6.8b
+ umull v11.8h, v24.8b, v7.8b
+.endm
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ urshr v16.8h, v8.8h, #8
+ urshr v17.8h, v9.8h, #8
+ urshr v18.8h, v10.8h, #8
+ urshr v19.8h, v11.8h, #8
+ raddhn v28.8b, v16.8h, v8.8h
+ raddhn v29.8b, v17.8h, v9.8h
+ raddhn v30.8b, v18.8h, v10.8h
+ raddhn v31.8b, v19.8h, v11.8h
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ uqadd v28.8b, v0.8b, v28.8b
+ uqadd v29.8b, v1.8b, v29.8b
+ uqadd v30.8b, v2.8b, v30.8b
+ uqadd v31.8b, v3.8b, v31.8b
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init
+ mov v15.s[0], w6
+ dup v15.8b, v15.b[3]
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_8888_init, \
+ pixman_composite_over_8888_n_8888_cleanup, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+ st3 {v0.8b, v1.8b, v2.8b}, [DST_W], #24
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0888_process_pixblock_head, \
+ pixman_composite_src_0888_0888_process_pixblock_tail, \
+ pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
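+ /* swap the R and B channels (v0 <-> v2), using v31 as a temporary */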
+ mov v31.8b, v2.8b
+ mov v2.8b, v0.8b
+ mov v0.8b, v31.8b
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+ st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
+ fetch_src_pixblock
+ mov v31.8b, v2.8b
+ mov v2.8b, v0.8b
+ mov v0.8b, v31.8b
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init
+ eor v3.8b, v3.8b, v3.8b
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_0888_8888_rev_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+ ushll v8.8h, v1.8b, #7
+ sli v8.8h, v8.8h, #1
+ ushll v9.8h, v2.8b, #7
+ sli v9.8h, v9.8h, #1
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+ ushll v14.8h, v0.8b, #7
+ sli v14.8h, v14.8h, #1
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+ ushll v14.8h, v0.8b, #7
+ sli v14.8h, v14.8h, #1
+ fetch_src_pixblock
+ sri v14.8h, v8.8h, #5
+ sri v14.8h, v9.8h, #11
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+ ushll v8.8h, v1.8b, #7
+ sli v8.8h, v8.8h, #1
+ st1 {v14.8h}, [DST_W], #16
+ ushll v9.8h, v2.8b, #7
+ sli v9.8h, v9.8h, #1
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
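+ /* premultiply: scale each colour channel (v0-v2) by the alpha in v3 */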
+ umull v8.8h, v3.8b, v0.8b
+ umull v9.8h, v3.8b, v1.8b
+ umull v10.8h, v3.8b, v2.8b
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+ urshr v11.8h, v8.8h, #8
+ mov v30.8b, v31.8b
+ mov v31.8b, v3.8b
+ mov v3.8b, v30.8b
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ raddhn v30.8b, v11.8h, v8.8h
+ raddhn v29.8b, v12.8h, v9.8h
+ raddhn v28.8b, v13.8h, v10.8h
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+ urshr v11.8h, v8.8h, #8
+ mov v30.8b, v31.8b
+ mov v31.8b, v3.8b
+ mov v3.8b, v30.8b /* complete the v3 <-> v31 swap via v30 */
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ fetch_src_pixblock
+ raddhn v30.8b, v11.8h, v8.8h
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v29.8b, v12.8h, v9.8h
+ raddhn v28.8b, v13.8h, v10.8h
+ umull v8.8h, v3.8b, v0.8b
+ umull v9.8h, v3.8b, v1.8b
+ umull v10.8h, v3.8b, v2.8b
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+.endm
+
+generate_composite_function \
+ pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+ umull v8.8h, v3.8b, v0.8b
+ umull v9.8h, v3.8b, v1.8b
+ umull v10.8h, v3.8b, v2.8b
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+ urshr v11.8h, v8.8h, #8
+ mov v30.8b, v31.8b
+ mov v31.8b, v3.8b
+ mov v3.8b, v30.8b
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ raddhn v28.8b, v11.8h, v8.8h
+ raddhn v29.8b, v12.8h, v9.8h
+ raddhn v30.8b, v13.8h, v10.8h
+.endm
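+
+/*
+ * Same arithmetic as pixman_composite_src_pixbuf_8888 above, but the
+ * blended channels land in v28..v30 in order, so R and B are not
+ * swapped on store.
+ */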
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+ urshr v11.8h, v8.8h, #8
+ mov v30.8b, v31.8b
+ mov v31.8b, v3.8b
+ mov v3.8b, v30.8b
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ fetch_src_pixblock
+ raddhn v28.8b, v11.8h, v8.8h
+ PF add, PF_X, PF_X, #8
+ PF tst, PF_CTL, #0xF
+ PF beq, 10f
+ PF add, PF_X, PF_X, #8
+ PF sub, PF_CTL, PF_CTL, #1
+10:
+ raddhn v29.8b, v12.8h, v9.8h
+ raddhn v30.8b, v13.8h, v10.8h
+ umull v8.8h, v3.8b, v0.8b
+ umull v9.8h, v3.8b, v1.8b
+ umull v10.8h, v3.8b, v2.8b
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+ PF cmp, PF_X, ORIG_W
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+ PF ble, 10f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+ PF ble, 10f
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+10:
+.endm
+
+generate_composite_function \
+ pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+ /* mask is in v15 */
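+ /* gather the src (v8, v9) and dst (v10, v11) halves into v4 and v13 */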
+ mov v4.d[0], v8.d[0]
+ mov v4.d[1], v9.d[0]
+ mov v13.d[0], v10.d[0]
+ mov v13.d[1], v11.d[0]
+ convert_0565_to_x888 v4, v2, v1, v0
+ convert_0565_to_x888 v13, v6, v5, v4
+ /* source pixel data is in {v0, v1, v2, XX} */
+ /* destination pixel data is in {v4, v5, v6, XX} */
+ mvn v7.8b, v15.8b
+ umull v10.8h, v15.8b, v2.8b
+ umull v9.8h, v15.8b, v1.8b
+ umull v8.8h, v15.8b, v0.8b
+ umull v11.8h, v7.8b, v4.8b
+ umull v12.8h, v7.8b, v5.8b
+ umull v13.8h, v7.8b, v6.8b
+ urshr v19.8h, v10.8h, #8
+ urshr v18.8h, v9.8h, #8
+ urshr v17.8h, v8.8h, #8
+ raddhn v2.8b, v10.8h, v19.8h
+ raddhn v1.8b, v9.8h, v18.8h
+ raddhn v0.8b, v8.8h, v17.8h
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+ urshr v17.8h, v11.8h, #8
+ urshr v18.8h, v12.8h, #8
+ urshr v19.8h, v13.8h, #8
+ raddhn v28.8b, v17.8h, v11.8h
+ raddhn v29.8b, v18.8h, v12.8h
+ raddhn v30.8b, v19.8h, v13.8h
+ uqadd v0.8b, v0.8b, v28.8b
+ uqadd v1.8b, v1.8b, v29.8b
+ uqadd v2.8b, v2.8b, v30.8b
+ /* 32bpp result is in {v0, v1, v2, XX} */
+ convert_8888_to_0565 v2, v1, v0, v14, v30, v13
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_over_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ ld1 {v10.4h, v11.4h}, [DST_R], #16
+ cache_preload 8, 8
+ pixman_composite_over_0565_8_0565_process_pixblock_head
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+ mov v15.s[0], w6
+ dup v15.8b, v15.b[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_0565_n_0565_init, \
+ pixman_composite_over_0565_n_0565_cleanup, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+ /* mask is in v15 */
+ mov v4.d[0], v8.d[0]
+ mov v4.d[1], v9.d[0]
+ mov v13.d[0], v10.d[0]
+ mov v13.d[1], v11.d[0]
+ convert_0565_to_x888 v4, v2, v1, v0
+ convert_0565_to_x888 v13, v6, v5, v4
+ /* source pixel data is in {v0, v1, v2, XX} */
+ /* destination pixel data is in {v4, v5, v6, XX} */
+ umull v9.8h, v15.8b, v2.8b
+ umull v8.8h, v15.8b, v1.8b
+ umull v7.8h, v15.8b, v0.8b
+ urshr v12.8h, v9.8h, #8
+ urshr v11.8h, v8.8h, #8
+ urshr v10.8h, v7.8h, #8
+ raddhn v2.8b, v9.8h, v12.8h
+ raddhn v1.8b, v8.8h, v11.8h
+ raddhn v0.8b, v7.8h, v10.8h
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+ uqadd v0.8b, v0.8b, v4.8b
+ uqadd v1.8b, v1.8b, v5.8b
+ uqadd v2.8b, v2.8b, v6.8b
+ /* 32bpp result is in {v0, v1, v2, XX} */
+ convert_8888_to_0565 v2, v1, v0, v14, v30, v13
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_add_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ ld1 {v10.4h, v11.4h}, [DST_R], #16
+ cache_preload 8, 8
+ pixman_composite_add_0565_8_0565_process_pixblock_head
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
+generate_composite_function \
+ pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_add_0565_8_0565_process_pixblock_head, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+ /* mask is in v15 */
+ mov v12.d[0], v10.d[0]
+ mov v12.d[1], v11.d[0]
+ convert_0565_to_x888 v12, v6, v5, v4
+ /* destination pixel data is in {v4, v5, v6, xx} */
+ mvn v24.8b, v15.8b /* get inverted alpha */
+ /* now do alpha blending */
+ umull v8.8h, v24.8b, v4.8b
+ umull v9.8h, v24.8b, v5.8b
+ umull v10.8h, v24.8b, v6.8b
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ urshr v11.8h, v8.8h, #8
+ urshr v12.8h, v9.8h, #8
+ urshr v13.8h, v10.8h, #8
+ raddhn v0.8b, v11.8h, v8.8h
+ raddhn v1.8b, v12.8h, v9.8h
+ raddhn v2.8b, v13.8h, v10.8h
+ /* 32bpp result is in {v0, v1, v2, XX} */
+ convert_8888_to_0565 v2, v1, v0, v14, v12, v3
+ mov v28.d[0], v14.d[0]
+ mov v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+ fetch_src_pixblock
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ ld1 {v10.4h, v11.4h}, [DST_R], #16
+ cache_preload 8, 8
+ pixman_composite_out_reverse_8_0565_process_pixblock_head
+ st1 {v14.8h}, [DST_W], #16
+.endm
+
+generate_composite_function \
+ pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 15, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
+ /* src is in v0 */
+ /* destination pixel data is in {v4, v5, v6, v7} */
+ mvn v1.8b, v0.8b /* get inverted alpha */
+ /* now do alpha blending */
+ umull v8.8h, v1.8b, v4.8b
+ umull v9.8h, v1.8b, v5.8b
+ umull v10.8h, v1.8b, v6.8b
+ umull v11.8h, v1.8b, v7.8b
+.endm
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
+ urshr v14.8h, v8.8h, #8
+ urshr v15.8h, v9.8h, #8
+ urshr v12.8h, v10.8h, #8
+ urshr v13.8h, v11.8h, #8
+ raddhn v28.8b, v14.8h, v8.8h
+ raddhn v29.8b, v15.8h, v9.8h
+ raddhn v30.8b, v12.8h, v10.8h
+ raddhn v31.8b, v13.8h, v11.8h
+ /* 32bpp result is in {v28, v29, v30, v31} */
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ pixman_composite_out_reverse_8_8888_process_pixblock_tail
+ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+ cache_preload 8, 8
+ pixman_composite_out_reverse_8_8888_process_pixblock_head
+ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function \
+ pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_out_reverse_8_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+/*
+ * Bilinear scaling support code which provides pixel fetching, color
+ * format conversion, and interpolation as separate macros that can be
+ * used as the basic building blocks for constructing bilinear scanline
+ * functions.
+ */
+
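+/*
+ * In C-like terms, one interpolated output pixel is computed per channel
+ * roughly as follows (a sketch; BASE = 1 << BILINEAR_INTERPOLATION_BITS,
+ * the vertical weights satisfy wt + wb == BASE, and wx is the fractional
+ * x position in [0, BASE)):
+ *
+ *     left  = tl * wt + bl * wb;                // vertical: umull + umlal
+ *     right = tr * wt + br * wb;
+ *     out   = (left * (BASE - wx) + right * wx)
+ *             >> (2 * BILINEAR_INTERPOLATION_BITS);
+ *                                // horizontal: ushll/umlsl/umlal2 + shrn
+ */
+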
+.macro bilinear_load_8888 reg1, reg2, tmp
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ ld1 {\()\reg1\().2s}, [TMP1], STRIDE
+ ld1 {\()\reg2\().2s}, [TMP1]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\reg2\().s}[1], [TMP1]
+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 \reg1, \reg2, \tmp1
+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
+ bilinear_load_8888 \reg3, \reg4, \tmp2
+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
+.endm
+
+.macro vzip reg1, reg2
+ umov TMP4, v31.d[0]
+ zip1 v31.8b, \reg1, \reg2
+ zip2 \reg2, \reg1, \reg2
+ mov \reg1, v31.8b
+ mov v31.d[0], TMP4
+.endm
+
+.macro vuzp reg1, reg2
+ umov TMP4, v31.d[0]
+ uzp1 v31.8b, \reg1, \reg2
+ uzp2 \reg2, \reg1, \reg2
+ mov \reg1, v31.8b
+ mov v31.d[0], TMP4
+.endm
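+
+/*
+ * The AArch64 zip/uzp instructions produce their two result halves in
+ * separate destination registers, so emulating the ARM32 in-place
+ * vzip/vuzp behavior needs a scratch register: v31 is borrowed for that,
+ * and its previous contents are preserved in TMP4 around the shuffle.
+ */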
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
+ ld1 {\()\acc2\().s}[1], [TMP1]
+ ld1 {\()\acc2\().s}[3], [TMP2]
+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
+ vzip \()\reg1\().8b, \()\reg3\().8b
+ vzip \()\reg2\().8b, \()\reg4\().8b
+ vzip \()\reg3\().8b, \()\reg4\().8b
+ vzip \()\reg1\().8b, \()\reg2\().8b
+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
+ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
+ ld1 {\()\xacc2\().s}[1], [TMP1]
+ ld1 {\()\xacc2\().s}[3], [TMP2]
+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
+ vzip \()\xreg1\().8b, \()\xreg3\().8b
+ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
+ vzip \()\xreg2\().8b, \()\xreg4\().8b
+ ld1 {\()\yacc2\().s}[1], [TMP1]
+ vzip \()\xreg3\().8b, \()\xreg4\().8b
+ ld1 {\()\yacc2\().s}[3], [TMP2]
+ vzip \()\xreg1\().8b, \()\xreg2\().8b
+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
+ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
+ vzip \()\yreg1\().8b, \()\yreg3\().8b
+ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
+ vzip \()\yreg2\().8b, \()\yreg4\().8b
+ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
+ vzip \()\yreg3\().8b, \()\yreg4\().8b
+ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
+ vzip \()\yreg1\().8b, \()\yreg2\().8b
+ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
+ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
+ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
+ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if \numpix == 4
+ st1 {v0.2s, v1.2s}, [OUT], #16
+.elseif \numpix == 2
+ st1 {v0.2s}, [OUT], #8
+.elseif \numpix == 1
+ st1 {v0.s}[0], [OUT], #4
+.else
+ .error "bilinear_store_8888 \numpix is unsupported"
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp v0.8b, v1.8b
+ vuzp v2.8b, v3.8b
+ vuzp v1.8b, v3.8b
+ vuzp v0.8b, v2.8b
+ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
+.if \numpix == 4
+ st1 {v1.4h}, [OUT], #8
+.elseif \numpix == 2
+ st1 {v1.s}[0], [OUT], #4
+.elseif \numpix == 1
+ st1 {v1.h}[0], [OUT], #2
+.else
+ .error "bilinear_store_0565 \numpix is unsupported"
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_load_\()\src_fmt v0, v1, v2
+ umull v2.8h, v0.8b, v28.8b
+ umlal v2.8h, v1.8b, v29.8b
+ /* 5 cycles bubble */
+ ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v2.4h, v15.h[0]
+ umlal2 v0.4s, v2.8h, v15.h[0]
+ /* 5 cycles bubble */
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ /* 3 cycles bubble */
+ xtn v0.8b, v0.8h
+ /* 1 cycle bubble */
+ bilinear_store_\()\dst_fmt 1, v3, v4
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
+ v1, v11, v2, v3, v20, v21, v22, v23
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ xtn v0.8b, v0.8h
+ bilinear_store_\()\dst_fmt 2, v3, v4
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
+ v1, v11, v14, v20, v16, v17, v22, v23, \
+ v3, v9, v24, v25, v26, v27, v18, v19
+ prfm PREFETCH_MODE, [TMP1, PF_OFFS]
+ sub TMP1, TMP1, STRIDE
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v3.4h, v15.h[0]
+ umlal2 v2.4s, v3.8h, v15.h[0]
+ ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ prfm PREFETCH_MODE, [TMP2, PF_OFFS]
+ umlsl v8.4s, v9.4h, v15.h[4]
+ umlal2 v8.4s, v9.8h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ xtn v0.8b, v0.8h
+ xtn v1.8b, v2.8h
+ add v12.8h, v12.8h, v13.8h
+ bilinear_store_\()\dst_fmt 4, v3, v4
+.endm
+
+.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
+.else
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
+.else
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
+.else
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
+.else
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
+.else
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4, 0
+.set BILINEAR_FLAG_UNROLL_8, 1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * The bilinear scanline scaler macro template uses the following arguments:
+ * fname             - name of the function to generate
+ * src_fmt           - source color format (8888 or 0565)
+ * dst_fmt           - destination color format (8888 or 0565)
+ * src_bpp_shift     - (1 << src_bpp_shift) is the size of a source pixel
+ *                     in bytes
+ * dst_bpp_shift     - (1 << dst_bpp_shift) is the size of a destination
+ *                     pixel in bytes
+ * prefetch_distance - prefetch in the source image by that many pixels
+ *                     ahead
+ * flags             - any combination of the BILINEAR_FLAG_* constants
+ *                     defined above
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+ src_bpp_shift, dst_bpp_shift, \
+ prefetch_distance, flags
+
+pixman_asm_function \fname
+ OUT .req x0
+ TOP .req x1
+ BOTTOM .req x2
+ WT .req x3
+ WB .req x4
+ X .req x5
+ UX .req x6
+ WIDTH .req x7
+ TMP1 .req x8
+ TMP2 .req x9
+ PF_OFFS .req x10
+ TMP3 .req x11
+ TMP4 .req x12
+ STRIDE .req x13
+
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
+ sxtw x6, w6
+ sxtw x7, w7
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 112 /* push all registers */
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
+ stp x8, x9, [x29, -80]
+ stp x10, x11, [x29, -96]
+ stp x12, x13, [x29, -112]
+
+ mov PF_OFFS, #\prefetch_distance
+ mul PF_OFFS, PF_OFFS, UX
+
+ subs STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 300f
+
+ dup v12.8h, w5
+ dup v13.8h, w6
+ dup v28.8b, w3
+ dup v29.8b, w4
+ mov v25.d[0], v12.d[1]
+ mov v26.d[0], v13.d[0]
+ add v25.4h, v25.4h, v26.4h
+ mov v12.d[1], v25.d[0]
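+
+ /*
+  * After the lane setup above, v12 holds the fractional part of the
+  * 16.16 x coordinate for the current pixel in its low four lanes and
+  * for the next pixel (x + ux) in its high four lanes, while v13 holds
+  * the per-step increment (doubled below once two pixels are processed
+  * per step). Shifting v12 right by (16 - BILINEAR_INTERPOLATION_BITS)
+  * then yields the horizontal weights in v15.
+  */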
+
+ /* ensure good destination alignment */
+ cmp WIDTH, #1
+ blt 100f
+ tst OUT, #(1 << \dst_bpp_shift)
+ beq 100f
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
+ sub WIDTH, WIDTH, #1
+100:
+ add v13.8h, v13.8h, v13.8h
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+
+ cmp WIDTH, #2
+ blt 100f
+ tst OUT, #(1 << (\dst_bpp_shift + 1))
+ beq 100f
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
+ sub WIDTH, WIDTH, #2
+100:
+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+ cmp WIDTH, #4
+ blt 100f
+ tst OUT, #(1 << (\dst_bpp_shift + 2))
+ beq 100f
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+ sub WIDTH, WIDTH, #4
+100:
+ subs WIDTH, WIDTH, #8
+ blt 100f
+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
+ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #8
+ blt 500f
+1000:
+ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #8
+ bge 1000b
+500:
+ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
+100:
+ tst WIDTH, #4
+ beq 200f
+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+200:
+.else
+/*********** 4 pixels per iteration *****************/
+ subs WIDTH, WIDTH, #4
+ blt 100f
+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #4
+ blt 500f
+1000:
+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #4
+ bge 1000b
+500:
+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
+100:
+/****************************************************/
+.endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 200f
+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
+200:
+ tst WIDTH, #1
+ beq 300f
+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
+300:
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
+ ldp x8, x9, [x29, -80]
+ ldp x10, x11, [x29, -96]
+ ldp x12, x13, [x29, -112]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret
+
+ .unreq OUT
+ .unreq TOP
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+pixman_end_asm_function
+
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_four_pixels_8888_8888, 1
+
+.macro bilinear_interpolate_four_pixels_8888_8888_head
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+
+ ld1 {v22.2s}, [TMP1], STRIDE
+ ld1 {v23.2s}, [TMP1]
+ asr TMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ umull v8.8h, v22.8b, v28.8b
+ umlal v8.8h, v23.8b, v29.8b
+
+ ld1 {v22.2s}, [TMP2], STRIDE
+ ld1 {v23.2s}, [TMP2]
+ asr TMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umull v9.8h, v22.8b, v28.8b
+ umlal v9.8h, v23.8b, v29.8b
+
+ ld1 {v22.2s}, [TMP3], STRIDE
+ ld1 {v23.2s}, [TMP3]
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v8.4h, v15.h[0]
+ umlal2 v0.4s, v8.8h, v15.h[0]
+
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v1.4s, v9.4h, v15.h[4]
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v10.4h, v15.h[0]
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ xtn v6.8b, v0.8h
+ xtn v7.8b, v2.8h
+ add v12.8h, v12.8h, v13.8h
+ st1 {v6.2s, v7.2s}, [OUT], #16
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
+ asr TMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ asr TMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v10.4h, v15.h[0]
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ ld1 {v20.2s}, [TMP1], STRIDE
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ ld1 {v21.2s}, [TMP1]
+ umull v8.8h, v20.8b, v28.8b
+ umlal v8.8h, v21.8b, v29.8b
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ld1 {v22.2s}, [TMP2], STRIDE
+ shrn2 v4.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ ld1 {v23.2s}, [TMP2]
+ umull v9.8h, v22.8b, v28.8b
+ asr TMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ asr TMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umlal v9.8h, v23.8b, v29.8b
+ ld1 {v22.2s}, [TMP3], STRIDE
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ld1 {v23.2s}, [TMP3]
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+ xtn v6.8b, v0.8h
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ xtn v7.8b, v4.8h
+ umlsl v0.4s, v8.4h, v15.h[0]
+ umlal2 v0.4s, v8.8h, v15.h[0]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ add v12.8h, v12.8h, v13.8h
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+ st1 {v6.2s, v7.2s}, [OUT], #16
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v1.4s, v9.4h, v15.h[4]
+.endm
+
+/*****************************************************************************/
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+ 2, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+ 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+ 1, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+ 1, 1, 28, BILINEAR_FLAG_UNROLL_4
diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
new file mode 100644
index 0000000..6aa6838
--- /dev/null
+++ b/pixman/pixman-arma64-neon-asm.h
@@ -0,0 +1,1310 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions based on a common template.
+ * Any combination of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp or 32bpp color formats is supported.
+ *
+ * This macro takes care of:
+ * - handling of leading and trailing unaligned pixels
+ * - doing most of the work related to L2 cache preload
+ * - encouraging the use of software pipelining for better instruction
+ *   scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros which implement the basic code chunks responsible for pixel
+ * processing. See the 'pixman-arma64-neon-asm.S' file for usage
+ * examples.
+ *
+ * TODO:
+ * - try overlapped pixel method (from Ian Rickards) when processing
+ * exactly two blocks of pixels
+ * - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY, 0
+.set FLAG_DST_READWRITE, 1
+.set FLAG_DEINTERLEAVE_32BPP, 2
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
+
+/*
+ * prefetch mode
+ * available modes are:
+ * pldl1keep
+ * pldl1strm
+ * pldl2keep
+ * pldl2strm
+ * pldl3keep
+ * pldl3strm
+ */
+#define PREFETCH_MODE pldl1keep
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+ \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
+ \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\()
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+.if \numbytes == 32
+ .if \elem_size==32
+ pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+ .elseif \elem_size==16
+ pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+ .else
+ pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+ .endif
+.elseif \numbytes == 16
+ .if \elem_size==32
+ pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+ .elseif \elem_size==16
+ pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+ .else
+ pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
+ .endif
+.elseif \numbytes == 8
+ .if \elem_size==32
+ pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits
+ .elseif \elem_size==16
+ pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits
+ .else
+ pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits
+ .endif
+.elseif \numbytes == 4
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
+ pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4
+ .elseif \elem_size == 16
+ pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2
+ pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2
+ .else
+ pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1
+ pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1
+ pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1
+ pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1
+ .endif
+.elseif \numbytes == 2
+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
+ pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2
+ .else
+ pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1
+ pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1
+ .endif
+.elseif \numbytes == 1
+ pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1
+.else
+ .error "unsupported size: \numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 ld3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
+.else
+ pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0
+.if \bpp > 0
+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \
+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
+.elseif (\bpp == 24) && (\numpix == 8)
+ pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
+.elseif (\bpp == 24) && (\numpix == 4)
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 2)
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
+.elseif (\bpp == 24) && (\numpix == 1)
+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
+.elseif \numpix * \bpp == 32 && \abits == 32
+ pixldst 4, st1, 32, \basereg, \mem_operand, \abits
+.elseif \numpix * \bpp == 16 && \abits == 16
+ pixldst 2, st1, 16, \basereg, \mem_operand, \abits
+.else
+ pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (\bpp * \numpix) <= 128
+ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
+.else
+ pixld \numpix, \bpp, \basereg, \mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (\bpp * \numpix) <= 128
+ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
+.else
+ pixst \numpix, \bpp, \basereg, \mem_operand, 128
+.endif
+.endm
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X and
+ * SRC_WIDTH_FIXED register aliases to be defined)
+ */
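+/*
+ * Each individual fetch below follows, in C-like terms, roughly this
+ * pattern (a sketch; the callers bias VX so that the negative range
+ * works out):
+ *
+ *     x = vx >> 16;                 // integer source pixel index
+ *     vx += unit_x;                 // advance in 16.16 fixed point
+ *     while (vx >= 0)
+ *         vx -= src_width_fixed;    // keep vx negative, wrapping the
+ *                                   // scanline for NORMAL repeat
+ *     pixel = src[x];               // scaled by the element size
+ */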
+.macro pixld1_s elem_size, reg1, mem_operand
+.if \elem_size == 16
+ asr TMP1, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #1
+ asr TMP2, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP2, \mem_operand, TMP2, lsl #1
+ ld1 {v\()\reg1\().h}[0], [TMP1]
+ asr TMP1, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #1
+ ld1 {v\()\reg1\().h}[1], [TMP2]
+ asr TMP2, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP2, \mem_operand, TMP2, lsl #1
+ ld1 {v\()\reg1\().h}[2], [TMP1]
+ ld1 {v\()\reg1\().h}[3], [TMP2]
+.elseif \elem_size == 32
+ asr TMP1, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #2
+ asr TMP2, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP2, \mem_operand, TMP2, lsl #2
+ ld1 {v\()\reg1\().s}[0], [TMP1]
+ ld1 {v\()\reg1\().s}[1], [TMP2]
+.else
+ .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if 0 /* \elem_size == 32 */
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+ add TMP1, \mem_operand, TMP1, asl #2
+ mov TMP2, VX, asr #16
+ sub VX, VX, UNIT_X
+ add TMP2, \mem_operand, TMP2, asl #2
+ ld1 {v\()\reg1\().s}[0], [TMP1]
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+ add TMP1, \mem_operand, TMP1, asl #2
+ ld1 {v\()\reg2\().s}[0], [TMP2, :32]
+ mov TMP2, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP2, \mem_operand, TMP2, asl #2
+ ld1 {v\()\reg1\().s}[1], [TMP1]
+ ld1 {v\()\reg2\().s}[1], [TMP2]
+.else
+ pixld1_s \elem_size, \reg1, \mem_operand
+ pixld1_s \elem_size, \reg2, \mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if \elem_size == 16
+ asr TMP1, VX, #16
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #1
+ ld1 {v\()\reg1\().h}[\idx], [TMP1]
+.elseif \elem_size == 32
+ asr DUMMY, VX, #16
+ mov TMP1, DUMMY
+ adds VX, VX, UNIT_X
+ bmi 55f
+5: subs VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+55:
+ add TMP1, \mem_operand, TMP1, lsl #2
+ ld1 {v\()\reg1\().s}[\idx], [TMP1]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if \numbytes == 32
+ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
+ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
+ pixdeinterleave \elem_size, %(\basereg+4)
+.elseif \numbytes == 16
+ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
+.elseif \numbytes == 8
+ pixld1_s \elem_size, %(\basereg+1), \mem_operand
+.elseif \numbytes == 4
+ .if \elem_size == 32
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+ .elseif \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
+ .else
+ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
+ .endif
+.elseif \numbytes == 2
+ .if \elem_size == 16
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+ .else
+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
+ .endif
+.elseif \numbytes == 1
+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+.else
+ .error "unsupported size: \numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if \bpp > 0
+ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2
+ umov DUMMY, v16.d[0]
+ uzp1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
+ uzp2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
+ mov v\()\reg1\().8b, v16.8b
+ mov v16.d[0], DUMMY
+.endm
+
+.macro vzip8 reg1, reg2
+ umov DUMMY, v16.d[0]
+ zip1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
+ zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
+ mov v\()\reg1\().8b, v16.8b
+ mov v16.d[0], DUMMY
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vuzp8 %(\basereg+0), %(\basereg+1)
+ vuzp8 %(\basereg+2), %(\basereg+3)
+ vuzp8 %(\basereg+1), %(\basereg+3)
+ vuzp8 %(\basereg+0), %(\basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vzip8 %(\basereg+0), %(\basereg+2)
+ vzip8 %(\basereg+1), %(\basereg+3)
+ vzip8 %(\basereg+2), %(\basereg+3)
+ vzip8 %(\basereg+0), %(\basereg+1)
+.endif
+.endm
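+
+/*
+ * With DEINTERLEAVE_32BPP_ENABLED, eight 32bpp pixels are kept in planar
+ * form: after pixdeinterleave the four registers hold the B, G, R and A
+ * bytes of all eight pixels respectively. A single 8-lane instruction can
+ * then process one whole channel at once, e.g. (illustrative registers):
+ *
+ *     umull v8.8h, v24.8b, v4.8b   // mask alpha * destination B, 8 pixels
+ *
+ * pixinterleave converts back to the interleaved layout before writeback.
+ */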
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * the cache preload logic is mostly independent of the rest of the pixel
+ * processing code. It starts at the top left pixel, moves forward across
+ * pixels, and can jump across scanlines. The prefetch distance is handled
+ * in an 'incremental' way: it starts from 0 and advances to the optimal
+ * distance over time. After reaching the optimal prefetch distance, it is
+ * kept constant. There are some checks which prevent prefetching unneeded
+ * pixel lines below the image (it can still prefetch a bit more data on
+ * the right side of the image - not a big issue, and it may actually be
+ * helpful when rendering text glyphs). An additional trick is the use of
+ * an LDR instruction for prefetch instead of PLD when moving to the next
+ * line: there is a high chance of a TLB miss in this case, which would
+ * make PLD useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working
+ * with fully cached data). But in reality, because the NEON unit in the
+ * ARM Cortex-A8 has its own pipeline and instruction queue, normal ARM
+ * code can execute simultaneously with NEON code and be completely
+ * shadowed by it. Thus we get no performance overhead at all (*). This is
+ * a very nice feature of the Cortex-A8 when used wisely. We don't have a
+ * hardware prefetcher, but we can still implement rather advanced prefetch
+ * logic in software for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixel processing such as a simple copy. Still, prefetch is a must when
+ * working with graphics data.
+ */
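+/*
+ * Roughly, in C-like terms, one cache_preload invocation does the
+ * following (a sketch only, using the register aliases defined by
+ * 'generate_composite_function' below):
+ *
+ *     pf_x += std_increment;             // nominal advance
+ *     if (pf_ctl & 0xf) {                // distance still ramping up
+ *         pf_x += boost_increment;
+ *         pf_ctl--;
+ *     }
+ *     prefetch(pf_src + (pf_x << src_bpp_shift));  // likewise dst/mask
+ *     if (pf_x > orig_w) {               // ran past the end of the line
+ *         pf_x -= orig_w;
+ *         pf_ctl -= 0x10;                // one scanline consumed
+ *         if (pf_ctl > 0)                // touch the next line with a
+ *                                        // real load (LDRSB), see above
+ *             dummy = *(int8_t *)(pf_src + (src_stride << src_bpp_shift));
+ *     }
+ */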
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+ \a \x
+.endif
+.endm
+
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if \std_increment != 0
+ PF add, PF_X, PF_X, #\std_increment
+.endif
+ PF tst, PF_CTL, #0xF
+ PF beq, 71f
+ PF add, PF_X, PF_X, #\boost_increment
+ PF sub, PF_CTL, PF_CTL, #1
+71:
+ PF cmp, PF_X, ORIG_W
+.if src_bpp_shift >= 0
+ PF lsl, DUMMY, PF_X, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
+.endif
+.if dst_r_bpp != 0
+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
+.endif
+.if mask_bpp_shift >= 0
+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
+.endif
+ PF ble, 71f
+ PF sub, PF_X, PF_X, ORIG_W
+ PF subs, PF_CTL, PF_CTL, #0x10
+71:
+ PF ble, 72f
+.if src_bpp_shift >= 0
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
+ PF add, PF_SRC, PF_SRC, #1
+.endif
+.if dst_r_bpp != 0
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
+ PF add, PF_DST, PF_DST, #1
+.endif
+.if mask_bpp_shift >= 0
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
+ PF add, PF_MASK, PF_MASK, #1
+.endif
+72:
+.endif
+.endm
+
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+.if src_bpp > 0
+ prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+.endif
+.if dst_r_bpp > 0
+ prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+ prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+.macro fetch_mask_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
+/*
+ * Macro which is used to process leading pixels until the destination
+ * pointer is properly aligned (at a 16 byte boundary). When the
+ * destination buffer uses a 24bpp format, the alignment pass is skipped
+ * (see the dst_w_bpp != 24 check below), since the chunked power-of-two
+ * loads used here cannot handle 3-byte pixels.
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+.if dst_w_bpp != 24
+ tst DST_R, #0xF
+ beq 52f
+
+.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
+.irp lowbit, 1, 2, 4, 8, 16
+
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_R, #\lowbit
+ beq 51f
+.endif
+ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+ add DST_R, DST_R, #\lowbit
+.endif
+ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
+ sub W, W, #(\lowbit * 8 / dst_w_bpp)
+51:
+.endif
+.endr
+.endif
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+ \process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ \process_pixblock_tail
+
+ pixinterleave dst_w_bpp, dst_w_basereg
+
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_W, #\lowbit
+ beq 51f
+.endif
+.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
+ sub W, W, #(\lowbit * 8 / dst_w_bpp)
+.endif
+ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+51:
+.endif
+.endr
+.endif
+52:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing operates on pixblock_size pixels
+ * at a time, anything smaller than this has to be loaded and stored in
+ * a special way. Loading and storing of pixel data is performed in such
+ * a way that we fill some 'slots' in the NEON registers (some slots
+ * naturally stay unused), then perform the compositing operation as
+ * usual. In the end, the data is taken from these 'slots' and saved to
+ * memory.
+ *
+ * cache_preload_flag - prefetch is suppressed when this is set to 0
+ * dst_aligned_flag   - selects whether the destination buffer is
+ *                      assumed to be aligned
+ */
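+/*
+ * In outline (a C-like sketch), the leftover count W & (pixblock_size - 1)
+ * is decomposed into power-of-two chunks; e.g. 7 trailing pixels with
+ * pixblock_size == 8 become:
+ *
+ *     if (W & 4) load 4 pixels into their slots;
+ *     if (W & 2) load 2 pixels;
+ *     if (W & 1) load 1 pixel;
+ *     run one head + tail pass over the (partially filled) block;
+ *     if (W & 4) store 4 pixels; ... and so on for 2 and 1;
+ */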
+.macro process_trailing_pixels cache_preload_flag, \
+ dst_aligned_flag, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ tst W, #(pixblock_size - 1)
+ beq 52f
+.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
+ beq 51f
+ pixld_src \chunk_size, src_bpp, src_basereg, SRC
+ pixld \chunk_size, mask_bpp, mask_basereg, MASK
+.if \dst_aligned_flag != 0
+ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if \cache_preload_flag != 0
+ PF add, PF_X, PF_X, #\chunk_size
+.endif
+51:
+.endif
+.endr
+.endif
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+ \process_pixblock_head
+.if \cache_preload_flag != 0
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+.endif
+ \process_pixblock_tail
+ pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > \chunk_size
+ tst W, #\chunk_size
+ beq 51f
+.if \dst_aligned_flag != 0
+ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+51:
+.endif
+.endr
+52:
+.endm
+
+/*
+ * Macro which performs all the operations needed to switch to the next
+ * scanline and start the next loop iteration, unless all the scanlines
+ * have already been processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+ mov W, ORIG_W
+ add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
+.if src_bpp != 0
+ add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+ add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24)
+ sub DST_W, DST_W, W, lsl #dst_bpp_shift
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+ sub SRC, SRC, W, lsl #src_bpp_shift
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+ sub MASK, MASK, W, lsl #mask_bpp_shift
+.endif
+ subs H, H, #1
+ mov DST_R, DST_W
+ bge \start_of_loop_label
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * v0, v1, v2, v3 - reserved for loading source pixel data
+ * v4, v5, v6, v7 - reserved for loading destination pixel data
+ * v24, v25, v26, v27 - reserved for loading mask pixel data
+ * v28, v29, v30, v31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags, \
+ pixblock_size_, \
+ prefetch_distance, \
+ init, \
+ cleanup, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+ pixman_asm_function \fname
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 232 /* push all registers */
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
+ stp x8, x9, [x29, -80]
+ stp x10, x11, [x29, -96]
+ stp x12, x13, [x29, -112]
+ stp x14, x15, [x29, -128]
+ stp x16, x17, [x29, -144]
+ stp x18, x19, [x29, -160]
+ stp x20, x21, [x29, -176]
+ stp x22, x23, [x29, -192]
+ stp x24, x25, [x29, -208]
+ stp x26, x27, [x29, -224]
+ str x28, [x29, -232]
+
+/*
+ * Select the prefetch type for this function. If the prefetch distance
+ * is set to 0, prefetch is disabled entirely; if one of the color
+ * formats is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
+ */
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if \prefetch_distance == 0
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
+
+ .macro pixld_src x:vararg
+ pixld \x
+ .endm
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+/*
+ * Assign symbolic names to registers
+ */
+ W .req x0 /* width (is updated during processing) */
+ H .req x1 /* height (is updated during processing) */
+ DST_W .req x2 /* destination buffer pointer for writes */
+ DST_STRIDE .req x3 /* destination image stride */
+ SRC .req x4 /* source buffer pointer */
+ SRC_STRIDE .req x5 /* source image stride */
+ MASK .req x6 /* mask pointer */
+ MASK_STRIDE .req x7 /* mask stride */
+
+ DST_R .req x8 /* destination buffer pointer for reads */
+
+ PF_CTL .req x9 /* combined lines counter and prefetch */
+ /* distance increment counter */
+ PF_X .req x10 /* pixel index in a scanline for current */
+ /* prefetch position */
+ PF_SRC .req x11 /* pointer to source scanline start */
+ /* for prefetch purposes */
+ PF_DST .req x12 /* pointer to destination scanline start */
+ /* for prefetch purposes */
+ PF_MASK .req x13 /* pointer to mask scanline start */
+ /* for prefetch purposes */
+
+ ORIG_W .req x14 /* saved original width */
+ DUMMY .req x15 /* temporary register */
+
+ sxtw x0, w0
+ sxtw x1, w1
+ sxtw x3, w3
+ sxtw x5, w5
+ sxtw x7, w7
+
+ .set mask_bpp_shift, -1
+.if src_bpp == 32
+ .set src_bpp_shift, 2
+.elseif src_bpp == 24
+ .set src_bpp_shift, 0
+.elseif src_bpp == 16
+ .set src_bpp_shift, 1
+.elseif src_bpp == 8
+ .set src_bpp_shift, 0
+.elseif src_bpp == 0
+ .set src_bpp_shift, -1
+.else
+ .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+ .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+ .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+ .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+ .set mask_bpp_shift, -1
+.else
+ .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+ .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+ .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+ .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+ .set dst_bpp_shift, 0
+.else
+ .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+.else
+ .set dst_r_bpp, 0
+.endif
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if \prefetch_distance < 0 || \prefetch_distance > 15
+ .error "invalid prefetch distance (\prefetch_distance)"
+.endif
+
+ PF mov, PF_X, #0
+ mov DST_R, DST_W
+
+.if src_bpp == 24
+ sub SRC_STRIDE, SRC_STRIDE, W
+ sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+ sub MASK_STRIDE, MASK_STRIDE, W
+ sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+ sub DST_STRIDE, DST_STRIDE, W
+ sub DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+ PF mov, PF_SRC, SRC
+ PF mov, PF_DST, DST_R
+ PF mov, PF_MASK, MASK
+ /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */
+ PF lsl, DUMMY, H, #4
+ PF mov, PF_CTL, DUMMY
+ PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10)
+
+ \init
+ subs H, H, #1
+ mov ORIG_W, W
+ blt 9f
+ cmp W, #(pixblock_size * 2)
+ blt 800f
+/*
+ * This is the start of the pipelined loop, which is optimized for
+ * long scanlines.
+ */
+0:
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
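+ /*
+  * In C-like terms, the pipelined loop below is (a sketch):
+  *
+  *     load(block 0); head(block 0);
+  *     while (more blocks)
+  *         tail_head();   // finish previous block, load and start next
+  *     tail(); store(last block);
+  *
+  * so the memory accesses of one block overlap with the arithmetic of
+  * its neighbours, helping hide both latencies.
+  */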
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ PF add, PF_X, PF_X, #pixblock_size
+ \process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ subs W, W, #(pixblock_size * 2)
+ blt 200f
+
+100:
+ \process_pixblock_tail_head
+ cache_preload_simple
+ subs W, W, #pixblock_size
+ bge 100b
+
+200:
+ \process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 1, 1, \
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+ advance_to_next_scanline 0b
+
+ \cleanup
+1000:
+ /* pop all registers */
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x8, x9, [x29, -80]
+ ldp x10, x11, [x29, -96]
+ ldp x12, x13, [x29, -112]
+ ldp x14, x15, [x29, -128]
+ ldp x16, x17, [x29, -144]
+ ldp x18, x19, [x29, -160]
+ ldp x20, x21, [x29, -176]
+ ldp x22, x23, [x29, -192]
+ ldp x24, x25, [x29, -208]
+ ldp x26, x27, [x29, -224]
+ ldr x28, [x29, -232]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+/*
+ * This is the start of the loop which processes images with small widths
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch is used.
+ */
+800:
+.if src_bpp_shift >= 0
+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
+ PF prfm, PREFETCH_MODE, [SRC, DUMMY]
+.endif
+.if dst_r_bpp != 0
+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
+ PF prfm, PREFETCH_MODE, [DST_R, DUMMY]
+.endif
+.if mask_bpp_shift >= 0
+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
+ PF prfm, PREFETCH_MODE, [MASK, DUMMY]
+.endif
+ /* Process exactly pixblock_size pixels if needed */
+ tst W, #pixblock_size
+ beq 100f
+ pixld pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ \process_pixblock_head
+ \process_pixblock_tail
+ pixst pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+100:
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 0, 0, \
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+ advance_to_next_scanline 800b
+9:
+ \cleanup
+ /* pop all registers */
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x8, x9, [x29, -80]
+ ldp x10, x11, [x29, -96]
+ ldp x12, x13, [x29, -112]
+ ldp x14, x15, [x29, -128]
+ ldp x16, x17, [x29, -144]
+ ldp x18, x19, [x29, -160]
+ ldp x20, x21, [x29, -176]
+ ldp x22, x23, [x29, -192]
+ ldp x24, x25, [x29, -208]
+ ldp x26, x27, [x29, -224]
+ ldr x28, [x29, -232]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+ .unreq SRC
+ .unreq MASK
+ .unreq DST_R
+ .unreq DST_W
+ .unreq ORIG_W
+ .unreq W
+ .unreq H
+ .unreq SRC_STRIDE
+ .unreq DST_STRIDE
+ .unreq MASK_STRIDE
+ .unreq PF_CTL
+ .unreq PF_X
+ .unreq PF_SRC
+ .unreq PF_DST
+ .unreq PF_MASK
+ .unreq DUMMY
+ pixman_end_asm_function
+.endm
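
The "head (tail_head) ... (tail_head) tail" structure used throughout this
template is classic software pipelining: head starts work on the first pixel
block, each tail_head finishes block n while already starting block n+1 (so
loads overlap computation), and the final tail drains the last in-flight
block. A minimal scalar C sketch of the same control flow, with toy per-block
work (all names are illustrative, not pixman API):

#include <stdio.h>

#define BLOCK 4

static int acc[BLOCK];            /* the block currently "in flight" */

static void head (const int *src) /* load + first half of the work   */
{
    int j;
    for (j = 0; j < BLOCK; j++)
        acc[j] = src[j] + 1;
}

static void tail (int *dst)       /* second half of the work + store */
{
    int j;
    for (j = 0; j < BLOCK; j++)
        dst[j] = acc[j] * 2;
}

/* w must be a multiple of BLOCK and at least 2 * BLOCK */
static void
process_scanline (int *dst, const int *src, int w)
{
    head (src);                   /* start block 0 */
    src += BLOCK;
    w -= 2 * BLOCK;

    while (w >= 0)
    {
        tail (dst);               /* tail_head: finish block n ...  */
        head (src);               /* ... while starting block n + 1 */
        dst += BLOCK;
        src += BLOCK;
        w -= BLOCK;
    }

    tail (dst);                   /* drain the final block */
}

int main (void)
{
    int src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }, dst[8], i;

    process_scanline (dst, src, 8);
    for (i = 0; i < 8; i++)
        printf ("%d ", dst[i]);   /* prints: 2 4 6 8 10 12 14 16 */
    printf ("\n");
    return 0;
}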
+
+/*
+ * A simplified variant of the function generation template for single
+ * scanline processing (for implementing pixman combine functions)
+ */
+.macro generate_composite_function_scanline use_nearest_scaling, \
+ fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags, \
+ pixblock_size_, \
+ init, \
+ cleanup, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+ pixman_asm_function \fname
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set pixblock_size, \pixblock_size_
+ .set dst_w_basereg, \dst_w_basereg_
+ .set dst_r_basereg, \dst_r_basereg_
+ .set src_basereg, \src_basereg_
+ .set mask_basereg, \mask_basereg_
+
+.if \use_nearest_scaling != 0
+ /*
+ * Assign symbolic names to registers for nearest scaling
+ */
+ W .req x0
+ DST_W .req x1
+ SRC .req x2
+ VX .req x3
+ UNIT_X .req x4
+ SRC_WIDTH_FIXED .req x5
+ MASK .req x6
+ TMP1 .req x8
+ TMP2 .req x9
+ DST_R .req x10
+ DUMMY .req x30
+
+ .macro pixld_src x:vararg
+ pixld_s \x
+ .endm
+
+ sxtw x0, w0
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 88
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ stp x8, x9, [x29, -80]
+ str x10, [x29, -88]
+.else
+ /*
+ * Assign symbolic names to registers
+ */
+ W .req x0 /* width (is updated during processing) */
+ DST_W .req x1 /* destination buffer pointer for writes */
+ SRC .req x2 /* source buffer pointer */
+ MASK .req x3 /* mask pointer */
+ DST_R .req x4 /* destination buffer pointer for reads */
+ DUMMY .req x30
+
+ .macro pixld_src x:vararg
+ pixld \x
+ .endm
+
+ sxtw x0, w0
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 64
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+.endif
+
+.if (((\flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+.else
+ .set dst_r_bpp, 0
+.endif
+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+
+ \init
+ mov DST_R, DST_W
+
+ cmp W, #pixblock_size
+ blt 800f
+
+ ensure_destination_ptr_alignment \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+
+ subs W, W, #pixblock_size
+ blt 700f
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ \process_pixblock_head
+ subs W, W, #pixblock_size
+ blt 200f
+100:
+ \process_pixblock_tail_head
+ subs W, W, #pixblock_size
+ bge 100b
+200:
+ \process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+700:
+ /* Process the remaining trailing pixels in the scanline (dst aligned) */
+ process_trailing_pixels 0, 1, \
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+
+ \cleanup
+.if \use_nearest_scaling != 0
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x8, x9, [x29, -80]
+ ldr x10, [x29, -88]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+.else
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+.endif
+800:
+ /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+ process_trailing_pixels 0, 0, \
+ \process_pixblock_head, \
+ \process_pixblock_tail, \
+ \process_pixblock_tail_head
+
+ \cleanup
+.if \use_nearest_scaling != 0
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x8, x9, [x29, -80]
+ ldr x10, [x29, -88]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+
+ .unreq DUMMY
+ .unreq DST_R
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq DST_W
+ .unreq MASK
+ .unreq SRC_WIDTH_FIXED
+
+.else
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+ ret /* exit */
+
+ .unreq DUMMY
+ .unreq SRC
+ .unreq MASK
+ .unreq DST_R
+ .unreq DST_W
+ .unreq W
+.endif
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+ pixman_end_asm_function
+.endm
+
+.macro generate_composite_function_single_scanline x:vararg
+ generate_composite_function_scanline 0, \x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+ generate_composite_function_scanline 1, \x
+.endm
+
+/* Default prologue/epilogue, nothing special needs to be done */
+
+.macro default_init
+.endm
+
+.macro default_cleanup
+.endm
+
+/*
+ * Prologue/epilogue variant which additionally saves/restores v8-v15
+ * registers (they need to be saved/restored by callee according to ABI).
+ * This is required if the code needs to use all the NEON registers.
+ */
+
+.macro default_init_need_all_regs
+.endm
+
+.macro default_cleanup_need_all_regs
+.endm
+
+/******************************************************************************/
+
+/*
+ * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
+ * into a planar a8r8g8b8 format (with a, r, g, b color components
+ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ * value (in) is lost.
+ */
+.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+ shrn \()\out_r\().8b, \()\in\().8h, #8
+ shrn \()\out_g\().8b, \()\in\().8h, #3
+ sli \()\in\().8h, \()\in\().8h, #5
+ movi \()\out_a\().8b, #255
+ sri \()\out_r\().8b, \()\out_r\().8b, #5
+ sri \()\out_g\().8b, \()\out_g\().8b, #6
+ shrn \()\out_b\().8b, \()\in\().8h, #2
+.endm
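
For reference, the shrn/sli/sri sequence above performs, eight pixels at a
time, the usual 565-to-8888 expansion in which each field's top bits are
replicated into its low bits so that full-scale 5- and 6-bit values map to
0xff. A scalar sketch assuming nothing beyond standard C (the macro keeps the
channels planar in separate registers; the sketch packs them for brevity):

#include <stdint.h>
#include <stdio.h>

static uint32_t
convert_0565_to_8888_scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    /* Replicate the top bits into the low bits, as the sri
     * instructions above do: 0x1f -> 0xff, 0x3f -> 0xff. */
    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000u | (r << 16) | (g << 8) | b;
}

int main (void)
{
    printf ("%08x\n", convert_0565_to_8888_scalar (0xffff)); /* ffffffff */
    printf ("%08x\n", convert_0565_to_8888_scalar (0xf800)); /* ffff0000 */
    return 0;
}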
+
+.macro convert_0565_to_x888 in, out_r, out_g, out_b
+ shrn \()\out_r\().8b, \()\in\().8h, #8
+ shrn \()\out_g\().8b, \()\in\().8h, #3
+ sli \()\in\().8h, \()\in\().8h, #5
+ sri \()\out_r\().8b, \()\out_r\().8b, #5
+ sri \()\out_g\().8b, \()\out_g\().8b, #6
+ shrn \()\out_b\().8b, \()\in\().8h, #2
+.endm
+
+/*
+ * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
+ * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
+ * pixels packed in a 128-bit register (out). Requires two temporary 128-bit
+ * registers (tmp1, tmp2).
+ */
+.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+ ushll \()\tmp1\().8h, \()\in_g\().8b, #7
+ shl \()\tmp1\().8h, \()\tmp1\().8h, #1
+ ushll \()\out\().8h, \()\in_r\().8b, #7
+ shl \()\out\().8h, \()\out\().8h, #1
+ ushll \()\tmp2\().8h, \()\in_b\().8b, #7
+ shl \()\tmp2\().8h, \()\tmp2\().8h, #1
+ sri \()\out\().8h, \()\tmp1\().8h, #5
+ sri \()\out\().8h, \()\tmp2\().8h, #11
+.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
+ * value of 'in' is lost.
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+ shl \()\out0\().4h, \()\in\().4h, #5 /* G top 6 bits */
+ shl \()\tmp\().4h, \()\in\().4h, #11 /* B top 5 bits */
+ sri \()\in\().4h, \()\in\().4h, #5 /* R is ready in top bits */
+ sri \()\out0\().4h, \()\out0\().4h, #6 /* G is ready in top bits */
+ sri \()\tmp\().4h, \()\tmp\().4h, #5 /* B is ready in top bits */
+ ushr \()\out1\().4h, \()\in\().4h, #8 /* R is in place */
+ sri \()\out0\().4h, \()\tmp\().4h, #8 /* G & B are in place */
+ zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h /* everything is in place */
+ zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h
+ mov \()\out0\().d[0], \()\tmp\().d[0]
+.endm
diff --git a/pixman/pixman-bits-image.c b/pixman/pixman-bits-image.c
index dcdcc69..20353cf 100644
--- a/pixman/pixman-bits-image.c
+++ b/pixman/pixman-bits-image.c
@@ -27,7 +27,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdio.h>
#include <stdlib.h>
@@ -35,44 +35,47 @@
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
+#include "dither/blue-noise-64x64.h"
-static uint32_t *
-_pixman_image_get_scanline_generic_float (pixman_iter_t * iter,
- const uint32_t *mask)
-{
- pixman_iter_get_scanline_t fetch_32 = iter->data;
- uint32_t *buffer = iter->buffer;
-
- fetch_32 (iter, NULL);
+/* Fetch functions */
- pixman_expand_to_float ((argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+static force_inline void
+fetch_pixel_no_alpha_32 (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds,
+ void *out)
+{
+ uint32_t *ret = out;
- return iter->buffer;
+ if (check_bounds &&
+ (x < 0 || x >= image->width || y < 0 || y >= image->height))
+ *ret = 0;
+ else
+ *ret = image->fetch_pixel_32 (image, x, y);
}
-/* Fetch functions */
-
-static force_inline uint32_t
-fetch_pixel_no_alpha (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds)
+static force_inline void
+fetch_pixel_no_alpha_float (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds,
+ void *out)
{
+ argb_t *ret = out;
+
if (check_bounds &&
(x < 0 || x >= image->width || y < 0 || y >= image->height))
- {
- return 0;
- }
-
- return image->fetch_pixel_32 (image, x, y);
+ ret->a = ret->r = ret->g = ret->b = 0.f;
+ else
+ *ret = image->fetch_pixel_float (image, x, y);
}
-typedef uint32_t (* get_pixel_t) (bits_image_t *image,
- int x, int y, pixman_bool_t check_bounds);
+typedef void (* get_pixel_t) (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds, void *out);
-static force_inline uint32_t
+static force_inline void
bits_image_fetch_pixel_nearest (bits_image_t *image,
pixman_fixed_t x,
pixman_fixed_t y,
- get_pixel_t get_pixel)
+ get_pixel_t get_pixel,
+ void *out)
{
int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
@@ -82,19 +85,20 @@ bits_image_fetch_pixel_nearest (bits_image_t *image,
repeat (image->common.repeat, &x0, image->width);
repeat (image->common.repeat, &y0, image->height);
- return get_pixel (image, x0, y0, FALSE);
+ get_pixel (image, x0, y0, FALSE, out);
}
else
{
- return get_pixel (image, x0, y0, TRUE);
+ get_pixel (image, x0, y0, TRUE, out);
}
}
-static force_inline uint32_t
-bits_image_fetch_pixel_bilinear (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
+static force_inline void
+bits_image_fetch_pixel_bilinear_32 (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel,
+ void *out)
{
pixman_repeat_t repeat_mode = image->common.repeat;
int width = image->width;
@@ -102,6 +106,7 @@ bits_image_fetch_pixel_bilinear (bits_image_t *image,
int x1, y1, x2, y2;
uint32_t tl, tr, bl, br;
int32_t distx, disty;
+ uint32_t *ret = out;
x1 = x - pixman_fixed_1 / 2;
y1 = y - pixman_fixed_1 / 2;
@@ -121,27 +126,142 @@ bits_image_fetch_pixel_bilinear (bits_image_t *image,
repeat (repeat_mode, &x2, width);
repeat (repeat_mode, &y2, height);
- tl = get_pixel (image, x1, y1, FALSE);
- bl = get_pixel (image, x1, y2, FALSE);
- tr = get_pixel (image, x2, y1, FALSE);
- br = get_pixel (image, x2, y2, FALSE);
+ get_pixel (image, x1, y1, FALSE, &tl);
+ get_pixel (image, x2, y1, FALSE, &tr);
+ get_pixel (image, x1, y2, FALSE, &bl);
+ get_pixel (image, x2, y2, FALSE, &br);
}
else
{
- tl = get_pixel (image, x1, y1, TRUE);
- tr = get_pixel (image, x2, y1, TRUE);
- bl = get_pixel (image, x1, y2, TRUE);
- br = get_pixel (image, x2, y2, TRUE);
+ get_pixel (image, x1, y1, TRUE, &tl);
+ get_pixel (image, x2, y1, TRUE, &tr);
+ get_pixel (image, x1, y2, TRUE, &bl);
+ get_pixel (image, x2, y2, TRUE, &br);
}
- return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+ *ret = bilinear_interpolation (tl, tr, bl, br, distx, disty);
}
-static force_inline uint32_t
+static force_inline void
+bits_image_fetch_pixel_bilinear_float (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel,
+ void *out)
+{
+ pixman_repeat_t repeat_mode = image->common.repeat;
+ int width = image->width;
+ int height = image->height;
+ int x1, y1, x2, y2;
+ argb_t tl, tr, bl, br;
+ float distx, disty;
+ argb_t *ret = out;
+
+ x1 = x - pixman_fixed_1 / 2;
+ y1 = y - pixman_fixed_1 / 2;
+
+ distx = ((float)pixman_fixed_fraction(x1)) / 65536.f;
+ disty = ((float)pixman_fixed_fraction(y1)) / 65536.f;
+
+ x1 = pixman_fixed_to_int (x1);
+ y1 = pixman_fixed_to_int (y1);
+ x2 = x1 + 1;
+ y2 = y1 + 1;
+
+ if (repeat_mode != PIXMAN_REPEAT_NONE)
+ {
+ repeat (repeat_mode, &x1, width);
+ repeat (repeat_mode, &y1, height);
+ repeat (repeat_mode, &x2, width);
+ repeat (repeat_mode, &y2, height);
+
+ get_pixel (image, x1, y1, FALSE, &tl);
+ get_pixel (image, x2, y1, FALSE, &tr);
+ get_pixel (image, x1, y2, FALSE, &bl);
+ get_pixel (image, x2, y2, FALSE, &br);
+ }
+ else
+ {
+ get_pixel (image, x1, y1, TRUE, &tl);
+ get_pixel (image, x2, y1, TRUE, &tr);
+ get_pixel (image, x1, y2, TRUE, &bl);
+ get_pixel (image, x2, y2, TRUE, &br);
+ }
+
+ *ret = bilinear_interpolation_float (tl, tr, bl, br, distx, disty);
+}
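
The bilinear_interpolation_float helper used here computes, per channel, the
standard weighted average of the four neighbours with weights derived from
distx/disty. A self-contained single-channel sketch of that formula
(illustrative only):

#include <stdio.h>

/* Weighted average of four neighbours; dx/dy are the fractional
 * offsets in [0, 1) that the caller derives from the 16.16
 * fixed-point coordinates. */
static float
bilerp (float tl, float tr, float bl, float br, float dx, float dy)
{
    float top = tl + (tr - tl) * dx;
    float bottom = bl + (br - bl) * dx;

    return top + (bottom - top) * dy;
}

int main (void)
{
    /* Sampling exactly between four pixels averages them: */
    printf ("%g\n", bilerp (0.f, 1.f, 0.f, 1.f, 0.5f, 0.5f)); /* 0.5 */
    /* At dx = dy = 0 the top-left pixel is returned unchanged: */
    printf ("%g\n", bilerp (0.25f, 1.f, 1.f, 1.f, 0.f, 0.f)); /* 0.25 */
    return 0;
}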
+
+static force_inline void accum_32(unsigned int *satot, unsigned int *srtot,
+ unsigned int *sgtot, unsigned int *sbtot,
+ const void *p, pixman_fixed_t f)
+{
+ uint32_t pixel = *(uint32_t *)p;
+
+ *srtot += (int)RED_8 (pixel) * f;
+ *sgtot += (int)GREEN_8 (pixel) * f;
+ *sbtot += (int)BLUE_8 (pixel) * f;
+ *satot += (int)ALPHA_8 (pixel) * f;
+}
+
+static force_inline void reduce_32(unsigned int satot, unsigned int srtot,
+ unsigned int sgtot, unsigned int sbtot,
+ void *p)
+{
+ uint32_t *ret = p;
+
+ satot = (int32_t)(satot + 0x8000) / 65536;
+ srtot = (int32_t)(srtot + 0x8000) / 65536;
+ sgtot = (int32_t)(sgtot + 0x8000) / 65536;
+ sbtot = (int32_t)(sbtot + 0x8000) / 65536;
+
+ satot = CLIP ((int32_t)satot, 0, 0xff);
+ srtot = CLIP ((int32_t)srtot, 0, 0xff);
+ sgtot = CLIP ((int32_t)sgtot, 0, 0xff);
+ sbtot = CLIP ((int32_t)sbtot, 0, 0xff);
+
+ *ret = ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
+}
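
A note on the arithmetic (sketch, not from the patch): the accumulators hold
sums of 8-bit channel values multiplied by 16.16 fixed-point filter weights,
so reduce_32 converts back with the usual round-to-nearest trick:
(x + 0x8000) / 65536 equals round(x / 65536.0) for non-negative x. For
example, x = 0x2A8000 (42.5 in 16.16) gives (0x2A8000 + 0x8000) / 0x10000 =
0x2B = 43. The casts through int32_t and the CLIP calls afterwards handle
sums that ring slightly out of range.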
+
+static force_inline void accum_float(unsigned int *satot, unsigned int *srtot,
+ unsigned int *sgtot, unsigned int *sbtot,
+ const void *p, pixman_fixed_t f)
+{
+ const argb_t *pixel = p;
+
+ *satot += pixel->a * f;
+ *srtot += pixel->r * f;
+ *sgtot += pixel->g * f;
+ *sbtot += pixel->b * f;
+}
+
+static force_inline void reduce_float(unsigned int satot, unsigned int srtot,
+ unsigned int sgtot, unsigned int sbtot,
+ void *p)
+{
+ argb_t *ret = p;
+
+ ret->a = CLIP ((int32_t)satot / 65536.f, 0.f, 1.f);
+ ret->r = CLIP ((int32_t)srtot / 65536.f, 0.f, 1.f);
+ ret->g = CLIP ((int32_t)sgtot / 65536.f, 0.f, 1.f);
+ ret->b = CLIP ((int32_t)sbtot / 65536.f, 0.f, 1.f);
+}
+
+typedef void (* accumulate_pixel_t) (unsigned int *satot, unsigned int *srtot,
+ unsigned int *sgtot, unsigned int *sbtot,
+ const void *pixel, pixman_fixed_t f);
+
+typedef void (* reduce_pixel_t) (unsigned int satot, unsigned int srtot,
+ unsigned int sgtot, unsigned int sbtot,
+ void *out);
+
+static force_inline void
bits_image_fetch_pixel_convolution (bits_image_t *image,
pixman_fixed_t x,
pixman_fixed_t y,
- get_pixel_t get_pixel)
+ get_pixel_t get_pixel,
+ void *out,
+ accumulate_pixel_t accum,
+ reduce_pixel_t reduce)
{
pixman_fixed_t *params = image->common.filter_params;
int x_off = (params[0] - pixman_fixed_1) >> 1;
@@ -152,7 +272,7 @@ bits_image_fetch_pixel_convolution (bits_image_t *image,
pixman_repeat_t repeat_mode = image->common.repeat;
int width = image->width;
int height = image->height;
- int srtot, sgtot, sbtot, satot;
+ unsigned int srtot, sgtot, sbtot, satot;
params += 2;
@@ -174,48 +294,39 @@ bits_image_fetch_pixel_convolution (bits_image_t *image,
if (f)
{
- uint32_t pixel;
+ /* Must be big enough to hold an argb_t */
+ argb_t pixel;
if (repeat_mode != PIXMAN_REPEAT_NONE)
{
repeat (repeat_mode, &rx, width);
repeat (repeat_mode, &ry, height);
- pixel = get_pixel (image, rx, ry, FALSE);
+ get_pixel (image, rx, ry, FALSE, &pixel);
}
else
{
- pixel = get_pixel (image, rx, ry, TRUE);
+ get_pixel (image, rx, ry, TRUE, &pixel);
}
- srtot += (int)RED_8 (pixel) * f;
- sgtot += (int)GREEN_8 (pixel) * f;
- sbtot += (int)BLUE_8 (pixel) * f;
- satot += (int)ALPHA_8 (pixel) * f;
+ accum (&satot, &srtot, &sgtot, &sbtot, &pixel, f);
}
params++;
}
}
- satot = (satot + 0x8000) >> 16;
- srtot = (srtot + 0x8000) >> 16;
- sgtot = (sgtot + 0x8000) >> 16;
- sbtot = (sbtot + 0x8000) >> 16;
-
- satot = CLIP (satot, 0, 0xff);
- srtot = CLIP (srtot, 0, 0xff);
- sgtot = CLIP (sgtot, 0, 0xff);
- sbtot = CLIP (sbtot, 0, 0xff);
-
- return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
+ reduce (satot, srtot, sgtot, sbtot, out);
}
-static uint32_t
-bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
- pixman_fixed_t x,
- pixman_fixed_t y,
- get_pixel_t get_pixel)
+static void
+bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
+ pixman_fixed_t x,
+ pixman_fixed_t y,
+ get_pixel_t get_pixel,
+ void *out,
+ accumulate_pixel_t accum,
+ reduce_pixel_t reduce)
{
pixman_fixed_t *params = image->common.filter_params;
pixman_repeat_t repeat_mode = image->common.repeat;
@@ -230,7 +341,7 @@ bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
pixman_fixed_t *y_params;
- int srtot, sgtot, sbtot, satot;
+ unsigned int srtot, sgtot, sbtot, satot;
int32_t x1, x2, y1, y2;
int32_t px, py;
int i, j;
@@ -270,82 +381,100 @@ bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
if (fx)
{
+ /* Must be big enough to hold an argb_t */
+ argb_t pixel;
pixman_fixed_t f;
- uint32_t pixel;
if (repeat_mode != PIXMAN_REPEAT_NONE)
{
repeat (repeat_mode, &rx, width);
repeat (repeat_mode, &ry, height);
- pixel = get_pixel (image, rx, ry, FALSE);
+ get_pixel (image, rx, ry, FALSE, &pixel);
}
else
{
- pixel = get_pixel (image, rx, ry, TRUE);
+ get_pixel (image, rx, ry, TRUE, &pixel);
}
f = (fy * fx + 0x8000) >> 16;
- srtot += (int)RED_8 (pixel) * f;
- sgtot += (int)GREEN_8 (pixel) * f;
- sbtot += (int)BLUE_8 (pixel) * f;
- satot += (int)ALPHA_8 (pixel) * f;
+ accum(&satot, &srtot, &sgtot, &sbtot, &pixel, f);
}
}
}
}
- satot = (satot + 0x8000) >> 16;
- srtot = (srtot + 0x8000) >> 16;
- sgtot = (sgtot + 0x8000) >> 16;
- sbtot = (sbtot + 0x8000) >> 16;
-
- satot = CLIP (satot, 0, 0xff);
- srtot = CLIP (srtot, 0, 0xff);
- sgtot = CLIP (sgtot, 0, 0xff);
- sbtot = CLIP (sbtot, 0, 0xff);
- return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot));
+ reduce(satot, srtot, sgtot, sbtot, out);
}
-static force_inline uint32_t
-bits_image_fetch_pixel_filtered (bits_image_t *image,
+static force_inline void
+bits_image_fetch_pixel_filtered (bits_image_t *image,
+ pixman_bool_t wide,
pixman_fixed_t x,
pixman_fixed_t y,
- get_pixel_t get_pixel)
+ get_pixel_t get_pixel,
+ void *out)
{
switch (image->common.filter)
{
case PIXMAN_FILTER_NEAREST:
case PIXMAN_FILTER_FAST:
- return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+ bits_image_fetch_pixel_nearest (image, x, y, get_pixel, out);
break;
case PIXMAN_FILTER_BILINEAR:
case PIXMAN_FILTER_GOOD:
case PIXMAN_FILTER_BEST:
- return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+ if (wide)
+ bits_image_fetch_pixel_bilinear_float (image, x, y, get_pixel, out);
+ else
+ bits_image_fetch_pixel_bilinear_32 (image, x, y, get_pixel, out);
break;
case PIXMAN_FILTER_CONVOLUTION:
- return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+ if (wide)
+ {
+ bits_image_fetch_pixel_convolution (image, x, y,
+ get_pixel, out,
+ accum_float,
+ reduce_float);
+ }
+ else
+ {
+ bits_image_fetch_pixel_convolution (image, x, y,
+ get_pixel, out,
+ accum_32, reduce_32);
+ }
break;
case PIXMAN_FILTER_SEPARABLE_CONVOLUTION:
- return bits_image_fetch_pixel_separable_convolution (image, x, y, get_pixel);
+ if (wide)
+ {
+ bits_image_fetch_pixel_separable_convolution (image, x, y,
+ get_pixel, out,
+ accum_float,
+ reduce_float);
+ }
+ else
+ {
+ bits_image_fetch_pixel_separable_convolution (image, x, y,
+ get_pixel, out,
+ accum_32, reduce_32);
+ }
break;
default:
+ assert (0);
break;
}
-
- return 0;
}
static uint32_t *
-bits_image_fetch_affine_no_alpha (pixman_iter_t * iter,
- const uint32_t * mask)
+__bits_image_fetch_affine_no_alpha (pixman_iter_t * iter,
+ pixman_bool_t wide,
+ const uint32_t * mask)
{
pixman_image_t *image = iter->image;
int offset = iter->x;
@@ -353,10 +482,13 @@ bits_image_fetch_affine_no_alpha (pixman_iter_t * iter,
int width = iter->width;
uint32_t * buffer = iter->buffer;
+ const uint32_t wide_zero[4] = {0};
pixman_fixed_t x, y;
pixman_fixed_t ux, uy;
pixman_vector_t v;
int i;
+ get_pixel_t get_pixel =
+ wide ? fetch_pixel_no_alpha_float : fetch_pixel_no_alpha_32;
/* reference point is the center of the pixel */
v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
@@ -382,29 +514,48 @@ bits_image_fetch_affine_no_alpha (pixman_iter_t * iter,
for (i = 0; i < width; ++i)
{
- if (!mask || mask[i])
+ if (!mask || (!wide && mask[i]) ||
+ (wide && memcmp(&mask[4 * i], wide_zero, 16) != 0))
{
- buffer[i] = bits_image_fetch_pixel_filtered (
- &image->bits, x, y, fetch_pixel_no_alpha);
+ bits_image_fetch_pixel_filtered (
+ &image->bits, wide, x, y, get_pixel, buffer);
}
x += ux;
y += uy;
+ buffer += wide ? 4 : 1;
}
- return buffer;
+ return iter->buffer;
+}
+
+static uint32_t *
+bits_image_fetch_affine_no_alpha_32 (pixman_iter_t *iter,
+ const uint32_t *mask)
+{
+ return __bits_image_fetch_affine_no_alpha(iter, FALSE, mask);
+}
+
+static uint32_t *
+bits_image_fetch_affine_no_alpha_float (pixman_iter_t *iter,
+ const uint32_t *mask)
+{
+ return __bits_image_fetch_affine_no_alpha(iter, TRUE, mask);
}
/* General fetcher */
-static force_inline uint32_t
-fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+static force_inline void
+fetch_pixel_general_32 (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds,
+ void *out)
{
- uint32_t pixel;
+ uint32_t pixel, *ret = out;
if (check_bounds &&
(x < 0 || x >= image->width || y < 0 || y >= image->height))
{
- return 0;
+ *ret = 0;
+ return;
}
pixel = image->fetch_pixel_32 (image, x, y);
@@ -433,19 +584,61 @@ fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_boun
pixel |= (pixel_a << 24);
}
- return pixel;
+ *ret = pixel;
+}
+
+static force_inline void
+fetch_pixel_general_float (bits_image_t *image,
+ int x, int y, pixman_bool_t check_bounds,
+ void *out)
+{
+ argb_t *ret = out;
+
+ if (check_bounds &&
+ (x < 0 || x >= image->width || y < 0 || y >= image->height))
+ {
+ ret->a = ret->r = ret->g = ret->b = 0;
+ return;
+ }
+
+ *ret = image->fetch_pixel_float (image, x, y);
+
+ if (image->common.alpha_map)
+ {
+ x -= image->common.alpha_origin_x;
+ y -= image->common.alpha_origin_y;
+
+ if (x < 0 || x >= image->common.alpha_map->width ||
+ y < 0 || y >= image->common.alpha_map->height)
+ {
+ ret->a = 0.f;
+ }
+ else
+ {
+ argb_t alpha;
+
+ alpha = image->common.alpha_map->fetch_pixel_float (
+ image->common.alpha_map, x, y);
+
+ ret->a = alpha.a;
+ }
+ }
}
static uint32_t *
-bits_image_fetch_general (pixman_iter_t *iter,
- const uint32_t *mask)
+__bits_image_fetch_general (pixman_iter_t *iter,
+ pixman_bool_t wide,
+ const uint32_t *mask)
{
pixman_image_t *image = iter->image;
int offset = iter->x;
int line = iter->y++;
int width = iter->width;
uint32_t * buffer = iter->buffer;
+ get_pixel_t get_pixel =
+ wide ? fetch_pixel_general_float : fetch_pixel_general_32;
+ const uint32_t wide_zero[4] = {0};
pixman_fixed_t x, y, w;
pixman_fixed_t ux, uy, uw;
pixman_vector_t v;
@@ -480,12 +673,13 @@ bits_image_fetch_general (pixman_iter_t *iter,
{
pixman_fixed_t x0, y0;
- if (!mask || mask[i])
+ if (!mask || (!wide && mask[i]) ||
+ (wide && memcmp(&mask[4 * i], wide_zero, 16) != 0))
{
if (w != 0)
{
- x0 = ((pixman_fixed_48_16_t)x << 16) / w;
- y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+ x0 = ((uint64_t)x << 16) / w;
+ y0 = ((uint64_t)y << 16) / w;
}
else
{
@@ -493,16 +687,31 @@ bits_image_fetch_general (pixman_iter_t *iter,
y0 = 0;
}
- buffer[i] = bits_image_fetch_pixel_filtered (
- &image->bits, x0, y0, fetch_pixel_general);
+ bits_image_fetch_pixel_filtered (
+ &image->bits, wide, x0, y0, get_pixel, buffer);
}
x += ux;
y += uy;
w += uw;
+ buffer += wide ? 4 : 1;
}
- return buffer;
+ return iter->buffer;
+}
+
+static uint32_t *
+bits_image_fetch_general_32 (pixman_iter_t *iter,
+ const uint32_t *mask)
+{
+ return __bits_image_fetch_general(iter, FALSE, mask);
+}
+
+static uint32_t *
+bits_image_fetch_general_float (pixman_iter_t *iter,
+ const uint32_t *mask)
+{
+ return __bits_image_fetch_general(iter, TRUE, mask);
}
static void
@@ -703,15 +912,15 @@ static const fetcher_info_t fetcher_info[] =
/* Affine, no alpha */
{ PIXMAN_any,
(FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
- bits_image_fetch_affine_no_alpha,
- _pixman_image_get_scanline_generic_float
+ bits_image_fetch_affine_no_alpha_32,
+ bits_image_fetch_affine_no_alpha_float,
},
/* General */
{ PIXMAN_any,
0,
- bits_image_fetch_general,
- _pixman_image_get_scanline_generic_float
+ bits_image_fetch_general_32,
+ bits_image_fetch_general_float,
},
{ PIXMAN_null },
@@ -741,7 +950,6 @@ _pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter)
}
else
{
- iter->data = info->get_scanline_32;
iter->get_scanline = info->get_scanline_float;
}
return;
@@ -847,6 +1055,119 @@ dest_write_back_narrow (pixman_iter_t *iter)
iter->y++;
}
+static float
+dither_factor_blue_noise_64 (int x, int y)
+{
+ float m = dither_blue_noise_64x64[((y & 0x3f) << 6) | (x & 0x3f)];
+ return m * (1. / 4096.f) + (1. / 8192.f);
+}
+
+static float
+dither_factor_bayer_8 (int x, int y)
+{
+ uint32_t m;
+
+ y ^= x;
+
+ /* Compute reverse(interleave(xor(x mod n, y mod n), x mod n))
+ * Here n = 8 and `mod n` is the bottom 3 bits.
+ */
+ m = ((y & 0x1) << 5) | ((x & 0x1) << 4) |
+ ((y & 0x2) << 2) | ((x & 0x2) << 1) |
+ ((y & 0x4) >> 1) | ((x & 0x4) >> 2);
+
+ /* m is in range [0, 63]. We scale it to [0, 63.0f/64.0f], then
+ * shift it to [1.0f/128.0f, 127.0f/128.0f] so that 0 < d < 1.
+ * This ensures exact values are not changed by dithering.
+ */
+ return (float)(m) * (1 / 64.0f) + (1.0f / 128.0f);
+}
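
The interleaving above is the standard recursive Bayer construction; since
(x, y) -> (x, y ^ x) is a bijection on the 8x8 tile and the six bits land in
distinct positions, every value 0..63 occurs exactly once. A small standalone
check (illustrative only, not pixman API):

#include <stdio.h>

static unsigned
bayer8 (int x, int y)
{
    unsigned m;

    y ^= x;
    m = ((y & 1) << 5) | ((x & 1) << 4) |
        ((y & 2) << 2) | ((x & 2) << 1) |
        ((y & 4) >> 1) | ((x & 4) >> 2);
    return m;
}

int main (void)
{
    int x, y;

    /* Prints the 8x8 ordered-dither matrix, a permutation of 0..63 */
    for (y = 0; y < 8; y++)
    {
        for (x = 0; x < 8; x++)
            printf ("%2u ", bayer8 (x, y));
        printf ("\n");
    }
    return 0;
}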
+
+typedef float (* dither_factor_t)(int x, int y);
+
+static force_inline float
+dither_apply_channel (float f, float d, float s)
+{
+ /* float_to_unorm splits the [0, 1] segment in (1 << n_bits)
+ * subsections of equal length; however unorm_to_float does not
+ * map to the center of those sections. In fact, pixel value u is
+ * mapped to:
+ *
+ * u u u 1
+ * -------------- = ---------- + -------------- * ----------
+ * 2^n_bits - 1 2^n_bits 2^n_bits - 1 2^n_bits
+ *
+ * Hence if f = u / (2^n_bits - 1) is exactly representable on a
+ * n_bits palette, all the numbers between
+ *
+ * u
+ * ---------- = f - f * 2^-n_bits = f + (0 - f) * 2^-n_bits
+ * 2^n_bits
+ *
+ * and
+ *
+ * u + 1
+ * ---------- = f - (f - 1) * 2^-n_bits = f + (1 - f) * 2^-n_bits
+ * 2^n_bits
+ *
+ * are also mapped back to u.
+ *
+ * Hence the following calculation ensures that we add as much
+ * noise as possible without perturbing values which are exactly
+ * representable in the target colorspace. Note that this corresponds to
+ * mixing the original color with noise with a ratio of `1 / 2^n_bits`.
+ */
+ return f + (d - f) * s;
+}
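
A quick check of the claim above (editorial sketch): with
f = u / (2^n_bits - 1) exactly representable and s = 1 / 2^n_bits, the
dithered value f + (d - f) * s for any d in (0, 1) lies strictly between
f + (0 - f) * s = u / 2^n_bits and f + (1 - f) * s = (u + 1) / 2^n_bits,
which is exactly the interval shown above to quantize back to u. So exactly
representable values pass through dithering unchanged, while everything else
is perturbed by less than one quantization step.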
+
+static force_inline float
+dither_compute_scale (int n_bits)
+{
+ // No dithering for wide formats
+ if (n_bits == 0 || n_bits >= 32)
+ return 0.f;
+
+ return 1.f / (float)(1 << n_bits);
+}
+
+static const uint32_t *
+dither_apply_ordered (pixman_iter_t *iter, dither_factor_t factor)
+{
+ bits_image_t *image = &iter->image->bits;
+ int x = iter->x + image->dither_offset_x;
+ int y = iter->y + image->dither_offset_y;
+ int width = iter->width;
+ argb_t *buffer = (argb_t *)iter->buffer;
+
+ pixman_format_code_t format = image->format;
+ int a_size = PIXMAN_FORMAT_A (format);
+ int r_size = PIXMAN_FORMAT_R (format);
+ int g_size = PIXMAN_FORMAT_G (format);
+ int b_size = PIXMAN_FORMAT_B (format);
+
+ float a_scale = dither_compute_scale (a_size);
+ float r_scale = dither_compute_scale (r_size);
+ float g_scale = dither_compute_scale (g_size);
+ float b_scale = dither_compute_scale (b_size);
+
+ int i;
+ float d;
+
+ for (i = 0; i < width; ++i)
+ {
+ d = factor (x + i, y);
+
+ buffer->a = dither_apply_channel (buffer->a, d, a_scale);
+ buffer->r = dither_apply_channel (buffer->r, d, r_scale);
+ buffer->g = dither_apply_channel (buffer->g, d, g_scale);
+ buffer->b = dither_apply_channel (buffer->b, d, b_scale);
+
+ buffer++;
+ }
+
+ return iter->buffer;
+}
+
static void
dest_write_back_wide (pixman_iter_t *iter)
{
@@ -856,6 +1177,23 @@ dest_write_back_wide (pixman_iter_t *iter)
int width = iter->width;
const uint32_t *buffer = iter->buffer;
+ switch (image->dither)
+ {
+ case PIXMAN_DITHER_NONE:
+ break;
+
+ case PIXMAN_DITHER_GOOD:
+ case PIXMAN_DITHER_BEST:
+ case PIXMAN_DITHER_ORDERED_BLUE_NOISE_64:
+ buffer = dither_apply_ordered (iter, dither_factor_blue_noise_64);
+ break;
+
+ case PIXMAN_DITHER_FAST:
+ case PIXMAN_DITHER_ORDERED_BAYER_8:
+ buffer = dither_apply_ordered (iter, dither_factor_bayer_8);
+ break;
+ }
+
image->store_scanline_float (image, x, y, width, buffer);
if (image->common.alpha_map)
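
For context, turning the new dithering on from client code might look like
the sketch below. This assumes the pixman_image_set_dither entry point added
elsewhere in this series; dithering only takes effect on this wide,
floating-point write-back path:

#include <pixman.h>

static pixman_image_t *
make_dithered_dest (int width, int height)
{
    /* A 16-bit destination benefits most from dithering */
    pixman_image_t *dest =
        pixman_image_create_bits (PIXMAN_r5g6b5, width, height, NULL, 0);

    pixman_image_set_dither (dest, PIXMAN_DITHER_ORDERED_BLUE_NOISE_64);
    return dest;
}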
@@ -932,7 +1270,7 @@ create_bits (pixman_format_code_t format,
*rowstride_bytes = stride;
if (clear)
- return calloc (buf_size, 1);
+ return calloc (1, buf_size);
else
return malloc (buf_size);
}
@@ -948,6 +1286,9 @@ _pixman_bits_image_init (pixman_image_t * image,
{
uint32_t *free_me = NULL;
+ if (PIXMAN_FORMAT_BPP (format) == 128)
+ return_val_if_fail(!(rowstride % 4), FALSE);
+
if (!bits && width && height)
{
int rowstride_bytes;
@@ -968,6 +1309,9 @@ _pixman_bits_image_init (pixman_image_t * image,
image->bits.height = height;
image->bits.bits = bits;
image->bits.free_me = free_me;
+ image->bits.dither = PIXMAN_DITHER_NONE;
+ image->bits.dither_offset_x = 0;
+ image->bits.dither_offset_y = 0;
image->bits.read_func = NULL;
image->bits.write_func = NULL;
image->bits.rowstride = rowstride;
diff --git a/pixman/pixman-combine-float.c b/pixman/pixman-combine-float.c
index f5145bc..27392d6 100644
--- a/pixman/pixman-combine-float.c
+++ b/pixman/pixman-combine-float.c
@@ -26,7 +26,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <math.h>
diff --git a/pixman/pixman-combine32.c b/pixman/pixman-combine32.c
index 4c484d3..de51f64 100644
--- a/pixman/pixman-combine32.c
+++ b/pixman/pixman-combine32.c
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <math.h>
@@ -568,7 +568,7 @@ combine_multiply_ca (pixman_implementation_t *imp,
uint8_t isa = ~sa; \
uint8_t da = ALPHA_8 (d); \
uint8_t ida = ~da; \
- int32_t ra, rr, rg, rb; \
+ uint32_t ra, rr, rg, rb; \
\
ra = da * 0xff + sa * 0xff - sa * da; \
rr = isa * RED_8 (d) + ida * RED_8 (s); \
@@ -609,7 +609,7 @@ combine_multiply_ca (pixman_implementation_t *imp,
uint32_t d = *(dest + i); \
uint8_t da = ALPHA_8 (d); \
uint8_t ida = ~da; \
- int32_t ra, rr, rg, rb; \
+ uint32_t ra, rr, rg, rb; \
uint8_t ira, iga, iba; \
\
combine_mask_ca (&s, &m); \
diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
index cdd56a6..59bb247 100644
--- a/pixman/pixman-combine32.h
+++ b/pixman/pixman-combine32.h
@@ -12,7 +12,7 @@
#define RB_MASK 0xff00ff
#define AG_MASK 0xff00ff00
#define RB_ONE_HALF 0x800080
-#define RB_MASK_PLUS_ONE 0x10000100
+#define RB_MASK_PLUS_ONE 0x1000100
#define ALPHA_8(x) ((x) >> A_SHIFT)
#define RED_8(x) (((x) >> R_SHIFT) & MASK)
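
The corrected constant is simply RB_MASK with 1 added in each packed 8-bit
channel: 0x00ff00ff + 0x00010001 = 0x01000100. The previous value,
0x10000100, carried the high channel's increment four bits too far (it equals
0x00ff00ff + 0x0f010001), corrupting arithmetic on the packed red/blue pair.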
diff --git a/pixman/pixman-compiler.h b/pixman/pixman-compiler.h
index 2489adc..6394156 100644
--- a/pixman/pixman-compiler.h
+++ b/pixman/pixman-compiler.h
@@ -95,6 +95,8 @@
/* Sun Studio 8 visibility */
#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
# define PIXMAN_EXPORT __global
+#elif defined (_MSC_VER) || defined(__MINGW32__)
+# define PIXMAN_EXPORT PIXMAN_API
#else
# define PIXMAN_EXPORT
#endif
@@ -107,14 +109,14 @@
#if defined(PIXMAN_NO_TLS)
# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static type name
+ static type name;
# define PIXMAN_GET_THREAD_LOCAL(name) \
(&name)
#elif defined(TLS)
# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static TLS type name
+ static TLS type name;
# define PIXMAN_GET_THREAD_LOCAL(name) \
(&name)
@@ -174,7 +176,7 @@
#elif defined(_MSC_VER)
# define PIXMAN_DEFINE_THREAD_LOCAL(type, name) \
- static __declspec(thread) type name
+ static __declspec(thread) type name;
# define PIXMAN_GET_THREAD_LOCAL(name) \
(&name)
diff --git a/pixman/pixman-conical-gradient.c b/pixman/pixman-conical-gradient.c
index 8bb46ae..37dfffd 100644
--- a/pixman/pixman-conical-gradient.c
+++ b/pixman/pixman-conical-gradient.c
@@ -25,7 +25,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
@@ -51,7 +51,10 @@ coordinates_to_parameter (double x, double y, double angle)
}
static uint32_t *
-conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+conical_get_scanline (pixman_iter_t *iter,
+ const uint32_t *mask,
+ int Bpp,
+ pixman_gradient_walker_write_t write_pixel)
{
pixman_image_t *image = iter->image;
int x = iter->x;
@@ -61,7 +64,7 @@ conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
gradient_t *gradient = (gradient_t *)image;
conical_gradient_t *conical = (conical_gradient_t *)image;
- uint32_t *end = buffer + width;
+ uint32_t *end = buffer + width * (Bpp / 4);
pixman_gradient_walker_t walker;
pixman_bool_t affine = TRUE;
double cx = 1.;
@@ -109,11 +112,12 @@ conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
{
double t = coordinates_to_parameter (rx, ry, conical->angle);
- *buffer = _pixman_gradient_walker_pixel (
- &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+ write_pixel (&walker,
+ (pixman_fixed_48_16_t)pixman_double_to_fixed (t),
+ buffer);
}
- ++buffer;
+ buffer += (Bpp / 4);
rx += cx;
ry += cy;
@@ -144,11 +148,12 @@ conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
t = coordinates_to_parameter (x, y, conical->angle);
- *buffer = _pixman_gradient_walker_pixel (
- &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+ write_pixel (&walker,
+ (pixman_fixed_48_16_t)pixman_double_to_fixed (t),
+ buffer);
}
- ++buffer;
+ buffer += (Bpp / 4);
rx += cx;
ry += cy;
@@ -161,14 +166,17 @@ conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
}
static uint32_t *
-conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
{
- uint32_t *buffer = conical_get_scanline_narrow (iter, NULL);
-
- pixman_expand_to_float (
- (argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+ return conical_get_scanline (iter, mask, 4,
+ _pixman_gradient_walker_write_narrow);
+}
- return buffer;
+static uint32_t *
+conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ return conical_get_scanline (iter, NULL, 16,
+ _pixman_gradient_walker_write_wide);
}
void
diff --git a/pixman/pixman-edge.c b/pixman/pixman-edge.c
index ad6dfc4..c324cd3 100644
--- a/pixman/pixman-edge.c
+++ b/pixman/pixman-edge.c
@@ -21,7 +21,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <string.h>
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index c6e43de..d510cac 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -24,7 +24,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <string.h>
#include <stdlib.h>
@@ -908,7 +908,7 @@ fast_composite_add_n_8_8 (pixman_implementation_t *imp,
#define CREATE_BITMASK(n) (0x80000000 >> (n))
#define UPDATE_BITMASK(n) ((n) >> 1)
#else
-#define CREATE_BITMASK(n) (1 << (n))
+#define CREATE_BITMASK(n) (1U << (n))
#define UPDATE_BITMASK(n) ((n) << 1)
#endif
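
The added U suffix matters because the little-endian branch can reach n = 31:
with 32-bit int, shifting a 1 into the sign bit overflows a signed type. A
two-line illustration (sketch):

unsigned ok = 1U << 31;   /* well defined: 0x80000000 */
/* int bad = 1 << 31;        undefined behaviour in C: the result
 *                           2^31 is not representable in int */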
@@ -2343,6 +2343,8 @@ fast_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
int32_t dist_y;
int i;
+ COMPILE_TIME_ASSERT (BILINEAR_INTERPOLATION_BITS < 8);
+
fx = info->x;
ux = iter->image->common.transform->matrix[0][0];
@@ -2798,7 +2800,7 @@ bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
repeat (repeat_mode, &rx, bits->width);
repeat (repeat_mode, &ry, bits->height);
- row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+ row = (uint8_t *)(bits->bits + bits->rowstride * ry);
pixel = convert_pixel (row, rx) | mask;
}
else
@@ -2809,7 +2811,7 @@ bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
}
else
{
- row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+ row = (uint8_t *)(bits->bits + bits->rowstride * ry);
pixel = convert_pixel (row, rx) | mask;
}
}
@@ -2842,7 +2844,7 @@ bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
}
}
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+static const uint32_t zero[2] = { 0, 0 };
static force_inline void
bits_image_fetch_bilinear_affine (pixman_image_t * image,
@@ -2911,8 +2913,8 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image,
repeat (repeat_mode, &x2, width);
repeat (repeat_mode, &y2, height);
- row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
- row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+ row1 = (uint8_t *)(bits->bits + bits->rowstride * y1);
+ row2 = (uint8_t *)(bits->bits + bits->rowstride * y2);
tl = convert_pixel (row1, x1) | mask;
tr = convert_pixel (row1, x2) | mask;
@@ -2942,12 +2944,12 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image,
if (y2 == 0)
{
- row1 = zero;
+ row1 = (const uint8_t *)zero;
mask1 = 0;
}
else
{
- row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+ row1 = (uint8_t *)(bits->bits + bits->rowstride * y1);
row1 += bpp / 8 * x1;
mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
@@ -2955,12 +2957,12 @@ bits_image_fetch_bilinear_affine (pixman_image_t * image,
if (y1 == height - 1)
{
- row2 = zero;
+ row2 = (const uint8_t *)zero;
mask2 = 0;
}
else
{
- row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+ row2 = (uint8_t *)(bits->bits + bits->rowstride * y2);
row2 += bpp / 8 * x1;
mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
@@ -3058,7 +3060,7 @@ bits_image_fetch_nearest_affine (pixman_image_t * image,
repeat (repeat_mode, &y0, height);
}
- row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
+ row = (uint8_t *)(bits->bits + bits->rowstride * y0);
buffer[i] = convert_pixel (row, x0) | mask;
}
@@ -3084,7 +3086,7 @@ convert_x8r8g8b8 (const uint8_t *row, int x)
static force_inline uint32_t
convert_a8 (const uint8_t *row, int x)
{
- return *(row + x) << 24;
+ return (uint32_t) *(row + x) << 24;
}
static force_inline uint32_t
@@ -3256,9 +3258,9 @@ static const pixman_iter_info_t fast_iters[] =
},
#define AFFINE_FAST_PATHS(name, format, repeat) \
- SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat) \
+ NEAREST_AFFINE_FAST_PATH(name, format, repeat) \
BILINEAR_AFFINE_FAST_PATH(name, format, repeat) \
- NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+ SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)
AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
diff --git a/pixman/pixman-filter.c b/pixman/pixman-filter.c
index b2bf53f..33327df 100644
--- a/pixman/pixman-filter.c
+++ b/pixman/pixman-filter.c
@@ -29,7 +29,7 @@
#include <math.h>
#include <assert.h>
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -109,14 +109,16 @@ general_cubic (double x, double B, double C)
if (ax < 1)
{
- return ((12 - 9 * B - 6 * C) * ax * ax * ax +
- (-18 + 12 * B + 6 * C) * ax * ax + (6 - 2 * B)) / 6;
+ return (((12 - 9 * B - 6 * C) * ax +
+ (-18 + 12 * B + 6 * C)) * ax * ax +
+ (6 - 2 * B)) / 6;
}
- else if (ax >= 1 && ax < 2)
+ else if (ax < 2)
{
- return ((-B - 6 * C) * ax * ax * ax +
- (6 * B + 30 * C) * ax * ax + (-12 * B - 48 * C) *
- ax + (8 * B + 24 * C)) / 6;
+ return ((((-B - 6 * C) * ax +
+ (6 * B + 30 * C)) * ax +
+ (-12 * B - 48 * C)) * ax +
+ (8 * B + 24 * C)) / 6;
}
else
{
@@ -141,7 +143,7 @@ static const filter_info_t filters[] =
{ PIXMAN_KERNEL_BOX, box_kernel, 1.0 },
{ PIXMAN_KERNEL_LINEAR, linear_kernel, 2.0 },
{ PIXMAN_KERNEL_CUBIC, cubic_kernel, 4.0 },
- { PIXMAN_KERNEL_GAUSSIAN, gaussian_kernel, 6 * SIGMA },
+ { PIXMAN_KERNEL_GAUSSIAN, gaussian_kernel, 5.0 },
{ PIXMAN_KERNEL_LANCZOS2, lanczos2_kernel, 4.0 },
{ PIXMAN_KERNEL_LANCZOS3, lanczos3_kernel, 6.0 },
{ PIXMAN_KERNEL_LANCZOS3_STRETCHED, nice_kernel, 8.0 },
@@ -160,18 +162,21 @@ integral (pixman_kernel_t kernel1, double x1,
pixman_kernel_t kernel2, double scale, double x2,
double width)
{
- /* If the integration interval crosses zero, break it into
- * two separate integrals. This ensures that filters such
- * as LINEAR that are not differentiable at 0 will still
- * integrate properly.
+ if (kernel1 == PIXMAN_KERNEL_BOX && kernel2 == PIXMAN_KERNEL_BOX)
+ {
+ return width;
+ }
+ /* The LINEAR filter is not differentiable at 0, so if the
+ * integration interval crosses zero, break it into two
+ * separate integrals.
*/
- if (x1 < 0 && x1 + width > 0)
+ else if (kernel1 == PIXMAN_KERNEL_LINEAR && x1 < 0 && x1 + width > 0)
{
return
integral (kernel1, x1, kernel2, scale, x2, - x1) +
integral (kernel1, 0, kernel2, scale, x2 - x1, width + x1);
}
- else if (x2 < 0 && x2 + width > 0)
+ else if (kernel2 == PIXMAN_KERNEL_LINEAR && x2 < 0 && x2 + width > 0)
{
return
integral (kernel1, x1, kernel2, scale, x2, - x2) +
@@ -189,13 +194,19 @@ integral (pixman_kernel_t kernel1, double x1,
}
else
{
- /* Integration via Simpson's rule */
-#define N_SEGMENTS 128
+ /* Integration via Simpson's rule
+ * See http://www.intmath.com/integration/6-simpsons-rule.php
+ * 12 segments (6 cubic approximations) seems to produce the best
+ * result for lanczos3.linear, which was the combination that
+ * showed the most errors. This makes sense as the lanczos3
+ * filter is 6 wide.
+ */
+#define N_SEGMENTS 12
#define SAMPLE(a1, a2) \
(filters[kernel1].func ((a1)) * filters[kernel2].func ((a2) * scale))
double s = 0.0;
- double h = width / (double)N_SEGMENTS;
+ double h = width / N_SEGMENTS;
int i;
s = SAMPLE (x1, x2);
@@ -204,11 +215,14 @@ integral (pixman_kernel_t kernel1, double x1,
{
double a1 = x1 + h * i;
double a2 = x2 + h * i;
+ s += 4 * SAMPLE (a1, a2);
+ }
+ for (i = 2; i < N_SEGMENTS; i += 2)
+ {
+ double a1 = x1 + h * i;
+ double a2 = x2 + h * i;
s += 2 * SAMPLE (a1, a2);
-
- if (i >= 2 && i < N_SEGMENTS - 1)
- s += 4 * SAMPLE (a1, a2);
}
s += SAMPLE (x1 + width, x2 + width);
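
These loops implement the classic composite Simpson weighting: endpoint
samples get weight 1, odd interior samples weight 4, even interior samples
weight 2, and composite Simpson then scales the sum by h/3. A standalone
sketch of the same pattern, assuming nothing from pixman:

#include <stdio.h>

#define N_SEGMENTS 12

/* Composite Simpson's rule over [a, b]: weights 1, 4, 2, 4, ..., 4, 1,
 * scaled by h / 3. Exact for polynomials up to degree 3. */
static double
simpson (double (*f) (double), double a, double b)
{
    double h = (b - a) / N_SEGMENTS;
    double s = f (a) + f (b);
    int i;

    for (i = 1; i < N_SEGMENTS; i += 2)
        s += 4 * f (a + h * i);
    for (i = 2; i < N_SEGMENTS; i += 2)
        s += 2 * f (a + h * i);

    return s * h / 3.0;
}

static double square (double x) { return x * x; }

int main (void)
{
    /* The integral of x^2 over [0, 1] is 1/3; Simpson is exact here. */
    printf ("%.12f\n", simpson (square, 0.0, 1.0));
    return 0;
}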
@@ -217,25 +231,20 @@ integral (pixman_kernel_t kernel1, double x1,
}
}
-static pixman_fixed_t *
-create_1d_filter (int *width,
+static void
+create_1d_filter (int width,
pixman_kernel_t reconstruct,
pixman_kernel_t sample,
double scale,
- int n_phases)
+ int n_phases,
+ pixman_fixed_t *pstart,
+ pixman_fixed_t *pend
+ )
{
- pixman_fixed_t *params, *p;
+ pixman_fixed_t *p = pstart;
double step;
- double size;
int i;
-
- size = scale * filters[sample].width + filters[reconstruct].width;
- *width = ceil (size);
-
- p = params = malloc (*width * n_phases * sizeof (pixman_fixed_t));
- if (!params)
- return NULL;
-
+ if (width <= 0) return;
step = 1.0 / n_phases;
for (i = 0; i < n_phases; ++i)
@@ -243,16 +252,16 @@ create_1d_filter (int *width,
double frac = step / 2.0 + i * step;
pixman_fixed_t new_total;
int x, x1, x2;
- double total;
+ double total, e;
/* Sample convolution of reconstruction and sampling
* filter. See rounding.txt regarding the rounding
* and sample positions.
*/
- x1 = ceil (frac - *width / 2.0 - 0.5);
- x2 = x1 + *width;
-
+ x1 = ceil (frac - width / 2.0 - 0.5);
+ x2 = x1 + width;
+ assert (p >= pstart && p + (x2 - x1) <= pend); /* assert validity of the following loop */
total = 0;
for (x = x1; x < x2; ++x)
{
@@ -274,29 +283,158 @@ create_1d_filter (int *width,
ihigh - ilow);
}
- total += c;
- *p++ = (pixman_fixed_t)(c * 65536.0 + 0.5);
+ *p = (pixman_fixed_t)floor (c * 65536.0 + 0.5);
+ total += *p;
+ p++;
}
- /* Normalize */
- p -= *width;
- total = 1 / total;
- new_total = 0;
+ /* Normalize, with error diffusion */
+ p -= width;
+ assert (p >= pstart && p + (x2 - x1) <= pend); /* assert validity of the following loop */
+
+ total = 65536.0 / total;
+ new_total = 0;
+ e = 0.0;
for (x = x1; x < x2; ++x)
{
- pixman_fixed_t t = (*p) * total + 0.5;
+ double v = (*p) * total + e;
+ pixman_fixed_t t = floor (v + 0.5);
+ e = v - t;
new_total += t;
*p++ = t;
}
- if (new_total != pixman_fixed_1)
- *(p - *width / 2) += (pixman_fixed_1 - new_total);
+ /* pixman_fixed_e's worth of error may remain; put it
+ * at the first sample, since that is the only one that
+ * hasn't had any error diffused into it.
+ */
+
+ assert (p - width >= pstart && p - width < pend); /* assert... */
+ *(p - width) += pixman_fixed_1 - new_total;
}
+}
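
The normalization pass above rescales the taps so they sum to pixman_fixed_1
and carries each rounding error into the next tap, leaving at most one
least-significant unit to patch onto the first sample. A standalone sketch of
that error-diffusion scheme (names illustrative, not from pixman):

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

typedef int32_t fixed_16_16;
#define FIXED_ONE 65536

/* Normalize n taps so they sum to exactly FIXED_ONE, diffusing the
 * rounding error forward as create_1d_filter does above. */
static void
normalize (fixed_16_16 *p, int n, double total)
{
    double scale = 65536.0 / total, e = 0.0;
    fixed_16_16 new_total = 0;
    int i;

    for (i = 0; i < n; i++)
    {
        double v = p[i] * scale + e;
        fixed_16_16 t = (fixed_16_16) floor (v + 0.5);

        e = v - t;
        new_total += t;
        p[i] = t;
    }
    /* at most one ulp of error remains; put it on the first tap */
    p[0] += FIXED_ONE - new_total;
}

int main (void)
{
    fixed_16_16 taps[3] = { 100, 100, 100 }; /* unnormalized weights */
    double total = taps[0] + taps[1] + taps[2];
    fixed_16_16 sum = 0;
    int i;

    normalize (taps, 3, total);
    for (i = 0; i < 3; i++)
        sum += taps[i];
    assert (sum == FIXED_ONE);
    printf ("%d %d %d (sum %d)\n", taps[0], taps[1], taps[2], sum);
    return 0;
}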
- return params;
+
+static int
+filter_width (pixman_kernel_t reconstruct, pixman_kernel_t sample, double size)
+{
+ return ceil (filters[reconstruct].width + size * filters[sample].width);
+}
+
+#ifdef PIXMAN_GNUPLOT
+
+/* If enable-gnuplot is configured, then you can pipe the output of a
+ * pixman-using program to gnuplot and get a continuously-updated plot
+ * of the horizontal filter. This works well with demos/scale to test
+ * the filter generation.
+ *
+ * The plot is all the different subposition filters shuffled
+ * together. This is misleading in a few cases:
+ *
+ * IMPULSE.BOX - goes up and down as the subfilters have different
+ * numbers of non-zero samples
+ * IMPULSE.TRIANGLE - somewhat crooked for the same reason
+ * 1-wide filters - look triangular, but a 1-wide box would be more
+ * accurate
+ */
+static void
+gnuplot_filter (int width, int n_phases, const pixman_fixed_t* p)
+{
+ double step;
+ int i, j;
+ int first;
+
+ step = 1.0 / n_phases;
+
+ printf ("set style line 1 lc rgb '#0060ad' lt 1 lw 0.5 pt 7 pi 1 ps 0.5\n");
+ printf ("plot [x=%g:%g] '-' with linespoints ls 1\n", -width*0.5, width*0.5);
+ /* Print a point at the origin so that y==0 line is included: */
+ printf ("0 0\n\n");
+
+ /* The position of the first sample of the phase corresponding to
+ * frac is given by:
+ *
+ * ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
+ *
+ * We have to find the frac that minimizes this expression.
+ *
+ * For odd widths, we have
+ *
+ * ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
+ * = ceil (frac) + K - frac
+ * = 1 + K - frac
+ *
+ * for some K, so this is minimized when frac is maximized and
+ * strictly growing with frac. So for odd widths, we can simply
+ * start at the last phase and go backwards.
+ *
+ * For even widths, we have
+ *
+ * ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
+ * = ceil (frac - 0.5) + K - frac
+ *
+ * The graph for this function (ignoring K) looks like this:
+ *
+ * 0.5
+ * | |\
+ * | | \
+ * | | \
+ * 0 | | \
+ * |\ |
+ * | \ |
+ * | \ |
+ * -0.5 | \|
+ * ---------------------------------
+ * 0 0.5 1
+ *
+ * So in this case we need to start with the phase whose frac is
+ * less than, but as close as possible to 0.5, then go backwards
+ * until we hit the first phase, then wrap around to the last
+ * phase and continue backwards.
+ *
+ * Which phase is as close as possible to 0.5? The location of the
+ * sampling point corresponding to the kth phase is given by
+ * 1/(2 * n_phases) + k / n_phases:
+ *
+ * 1/(2 * n_phases) + k / n_phases = 0.5
+ *
+ * from which it follows that
+ *
+ * k = (n_phases - 1) / 2
+ *
+ * rounded down is the phase in question.
+ */
+ if (width & 1)
+ first = n_phases - 1;
+ else
+ first = (n_phases - 1) / 2;
+
+ for (j = 0; j < width; ++j)
+ {
+ for (i = 0; i < n_phases; ++i)
+ {
+ int phase = first - i;
+ double frac, pos;
+
+ if (phase < 0)
+ phase = n_phases + phase;
+
+ frac = step / 2.0 + phase * step;
+ pos = ceil (frac - width / 2.0 - 0.5) + 0.5 - frac + j;
+
+ printf ("%g %g\n",
+ pos,
+ pixman_fixed_to_double (*(p + phase * width + j)));
+ }
+ }
+
+ printf ("e\n");
+ fflush (stdout);
}
+#endif
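
Usage sketch, assuming a build configured with the gnuplot option so that
PIXMAN_GNUPLOT is defined:

    $ demos/scale | gnuplot -persist

Each time the demo regenerates its filter parameters, gnuplot redraws the
plot of the horizontal filter.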
+
/* Create the parameter list for a SEPARABLE_CONVOLUTION filter
* with the given kernels and scale parameters
*/
@@ -313,38 +451,41 @@ pixman_filter_create_separable_convolution (int *n_values,
{
double sx = fabs (pixman_fixed_to_double (scale_x));
double sy = fabs (pixman_fixed_to_double (scale_y));
- pixman_fixed_t *horz = NULL, *vert = NULL, *params = NULL;
+ pixman_fixed_t *params;
int subsample_x, subsample_y;
int width, height;
+ width = filter_width (reconstruct_x, sample_x, sx);
subsample_x = (1 << subsample_bits_x);
- subsample_y = (1 << subsample_bits_y);
- horz = create_1d_filter (&width, reconstruct_x, sample_x, sx, subsample_x);
- vert = create_1d_filter (&height, reconstruct_y, sample_y, sy, subsample_y);
+ height = filter_width (reconstruct_y, sample_y, sy);
+ subsample_y = (1 << subsample_bits_y);
- if (!horz || !vert)
- goto out;
-
*n_values = 4 + width * subsample_x + height * subsample_y;
params = malloc (*n_values * sizeof (pixman_fixed_t));
if (!params)
- goto out;
+ return NULL;
params[0] = pixman_int_to_fixed (width);
params[1] = pixman_int_to_fixed (height);
params[2] = pixman_int_to_fixed (subsample_bits_x);
params[3] = pixman_int_to_fixed (subsample_bits_y);
- memcpy (params + 4, horz,
- width * subsample_x * sizeof (pixman_fixed_t));
- memcpy (params + 4 + width * subsample_x, vert,
- height * subsample_y * sizeof (pixman_fixed_t));
+ {
+ pixman_fixed_t
+ *xparams = params+4,
+ *yparams = xparams + width*subsample_x,
+ *endparams = params + *n_values;
+ create_1d_filter(width, reconstruct_x, sample_x, sx, subsample_x,
+ xparams, yparams);
+ create_1d_filter(height, reconstruct_y, sample_y, sy, subsample_y,
+ yparams, endparams);
+ }
-out:
- free (horz);
- free (vert);
+#ifdef PIXMAN_GNUPLOT
+ gnuplot_filter(width, subsample_x, params + 4);
+#endif
return params;
}
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 7cdea29..b4450cb 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -26,7 +26,7 @@
* SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
#include <string.h>
@@ -141,7 +141,8 @@ general_composite_rect (pixman_implementation_t *imp,
if ((src_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
(!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
(dest_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
- !(operator_needs_division (op)))
+ !(operator_needs_division (op)) &&
+ (dest_image->bits.dither == PIXMAN_DITHER_NONE))
{
width_flag = ITER_NARROW;
Bpp = 4;
@@ -155,23 +156,27 @@ general_composite_rect (pixman_implementation_t *imp,
#define ALIGN(addr) \
((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
- src_buffer = ALIGN (scanline_buffer);
- mask_buffer = ALIGN (src_buffer + width * Bpp);
- dest_buffer = ALIGN (mask_buffer + width * Bpp);
+ if (width <= 0 || _pixman_multiply_overflows_int (width, Bpp * 3))
+ return;
- if (ALIGN (dest_buffer + width * Bpp) >
- scanline_buffer + sizeof (stack_scanline_buffer))
+ if (width * Bpp * 3 > sizeof (stack_scanline_buffer) - 15 * 3)
{
- scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3);
+ scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 15 * 3);
if (!scanline_buffer)
return;
- src_buffer = ALIGN (scanline_buffer);
- mask_buffer = ALIGN (src_buffer + width * Bpp);
- dest_buffer = ALIGN (mask_buffer + width * Bpp);
+ memset (scanline_buffer, 0, width * Bpp * 3 + 15 * 3);
+ }
+ else
+ {
+ memset (stack_scanline_buffer, 0, sizeof (stack_scanline_buffer));
}
+ src_buffer = ALIGN (scanline_buffer);
+ mask_buffer = ALIGN (src_buffer + width * Bpp);
+ dest_buffer = ALIGN (mask_buffer + width * Bpp);
+
if (width_flag == ITER_WIDE)
{
/* To make sure there aren't any NANs in the buffers */
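
A note on the 15 * 3 slack above: ALIGN rounds an address up to the next
16-byte boundary via (addr + 15) & ~15 (for example, ((17 + 15) & ~15) = 32),
so each of the three scanline buffers can lose at most 15 bytes to alignment;
the 15 * 3 term in both the stack-size test and the heap allocation accounts
for exactly that worst case.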
diff --git a/pixman/pixman-glyph.c b/pixman/pixman-glyph.c
index 96a349a..dc90411 100644
--- a/pixman/pixman-glyph.c
+++ b/pixman/pixman-glyph.c
@@ -25,7 +25,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/pixman/pixman-gradient-walker.c b/pixman/pixman-gradient-walker.c
index 822f8e6..b31d5ad 100644
--- a/pixman/pixman-gradient-walker.c
+++ b/pixman/pixman-gradient-walker.c
@@ -24,7 +24,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -122,10 +122,9 @@ gradient_walker_reset (pixman_gradient_walker_t *walker,
left_c = right_c;
}
- /* The alpha channel is scaled to be in the [0, 255] interval,
- * and the red/green/blue channels are scaled to be in [0, 1].
+ /* The alpha/red/green/blue channels are scaled to be in [0, 1].
* This ensures that after premultiplication all channels will
- * be in the [0, 255] interval.
+ * be in the [0, 1] interval.
*/
la = (left_c->alpha * (1.0f/257.0f));
lr = (left_c->red * (1.0f/257.0f));
@@ -143,7 +142,7 @@ gradient_walker_reset (pixman_gradient_walker_t *walker,
if (FLOAT_IS_ZERO (rx - lx) || left_x == INT32_MIN || right_x == INT32_MAX)
{
walker->a_s = walker->r_s = walker->g_s = walker->b_s = 0.0f;
- walker->a_b = (la + ra) / 2.0f;
+ walker->a_b = (la + ra) / 510.0f;
walker->r_b = (lr + rr) / 510.0f;
walker->g_b = (lg + rg) / 510.0f;
walker->b_b = (lb + rb) / 510.0f;
@@ -152,12 +151,12 @@ gradient_walker_reset (pixman_gradient_walker_t *walker,
{
float w_rec = 1.0f / (rx - lx);
- walker->a_b = (la * rx - ra * lx) * w_rec;
+ walker->a_b = (la * rx - ra * lx) * w_rec * (1.0f/255.0f);
walker->r_b = (lr * rx - rr * lx) * w_rec * (1.0f/255.0f);
walker->g_b = (lg * rx - rg * lx) * w_rec * (1.0f/255.0f);
walker->b_b = (lb * rx - rb * lx) * w_rec * (1.0f/255.0f);
- walker->a_s = (ra - la) * w_rec;
+ walker->a_s = (ra - la) * w_rec * (1.0f/255.0f);
walker->r_s = (rr - lr) * w_rec * (1.0f/255.0f);
walker->g_s = (rg - lg) * w_rec * (1.0f/255.0f);
walker->b_s = (rb - lb) * w_rec * (1.0f/255.0f);
@@ -169,34 +168,97 @@ gradient_walker_reset (pixman_gradient_walker_t *walker,
walker->need_reset = FALSE;
}
-uint32_t
-_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
- pixman_fixed_48_16_t x)
+static argb_t
+pixman_gradient_walker_pixel_float (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x)
{
- float a, r, g, b;
- uint8_t a8, r8, g8, b8;
- uint32_t v;
+ argb_t f;
float y;
if (walker->need_reset || x < walker->left_x || x >= walker->right_x)
- gradient_walker_reset (walker, x);
+ gradient_walker_reset (walker, x);
y = x * (1.0f / 65536.0f);
- a = walker->a_s * y + walker->a_b;
- r = a * (walker->r_s * y + walker->r_b);
- g = a * (walker->g_s * y + walker->g_b);
- b = a * (walker->b_s * y + walker->b_b);
+ f.a = walker->a_s * y + walker->a_b;
+ f.r = f.a * (walker->r_s * y + walker->r_b);
+ f.g = f.a * (walker->g_s * y + walker->g_b);
+ f.b = f.a * (walker->b_s * y + walker->b_b);
- a8 = a + 0.5f;
- r8 = r + 0.5f;
- g8 = g + 0.5f;
- b8 = b + 0.5f;
+ return f;
+}
+
+static uint32_t
+pixman_gradient_walker_pixel_32 (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x)
+{
+ argb_t f;
+ float y;
+
+ if (walker->need_reset || x < walker->left_x || x >= walker->right_x)
+ gradient_walker_reset (walker, x);
+
+ y = x * (1.0f / 65536.0f);
+
+ /* Instead of [0...1] for ARGB we want [0...255], so multiply
+ * alpha by 255; the color channels then pick up the same factor
+ * through premultiplication by the scaled alpha.
+ *
+ * We don't use pixman_contract_from_float because doing so causes
+ * a 2x slowdown, and the values are already normalized, so we
+ * don't have to worry about values < 0.f or > 1.f
+ */
+ f.a = 255.f * (walker->a_s * y + walker->a_b);
+ f.r = f.a * (walker->r_s * y + walker->r_b);
+ f.g = f.a * (walker->g_s * y + walker->g_b);
+ f.b = f.a * (walker->b_s * y + walker->b_b);
- v = ((a8 << 24) & 0xff000000) |
- ((r8 << 16) & 0x00ff0000) |
- ((g8 << 8) & 0x0000ff00) |
- ((b8 >> 0) & 0x000000ff);
+ return (((uint32_t)(f.a + .5f) << 24) & 0xff000000) |
+ (((uint32_t)(f.r + .5f) << 16) & 0x00ff0000) |
+ (((uint32_t)(f.g + .5f) << 8) & 0x0000ff00) |
+ (((uint32_t)(f.b + .5f) >> 0) & 0x000000ff);
+}
+
+void
+_pixman_gradient_walker_write_narrow (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer)
+{
+ *buffer = pixman_gradient_walker_pixel_32 (walker, x);
+}
+
+void
+_pixman_gradient_walker_write_wide (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer)
+{
+ *(argb_t *)buffer = pixman_gradient_walker_pixel_float (walker, x);
+}
+
+void
+_pixman_gradient_walker_fill_narrow (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end)
+{
+ register uint32_t color;
+
+ color = pixman_gradient_walker_pixel_32 (walker, x);
+ while (buffer < end)
+ *buffer++ = color;
+}
+
+void
+_pixman_gradient_walker_fill_wide (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end)
+{
+ register argb_t color;
+ argb_t *buffer_wide = (argb_t *)buffer;
+ argb_t *end_wide = (argb_t *)end;
- return v;
+ color = pixman_gradient_walker_pixel_float (walker, x);
+ while (buffer_wide < end_wide)
+ *buffer_wide++ = color;
}
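
The packing in pixman_gradient_walker_pixel_32 relies on the channels already being premultiplied and scaled to [0, 255]; adding 0.5f before the integer conversion rounds to nearest. A standalone sketch of the same conversion (argb_t reduced to a plain struct here for illustration):

    #include <stdint.h>

    typedef struct { float a, r, g, b; } argb_t;  /* stand-in for pixman's */

    /* Pack premultiplied float channels, already in [0, 255],
     * into an a8r8g8b8 word; "+ 0.5f" rounds to nearest. */
    static uint32_t
    pack_argb (argb_t f)
    {
        return (((uint32_t)(f.a + 0.5f) << 24) & 0xff000000) |
               (((uint32_t)(f.r + 0.5f) << 16) & 0x00ff0000) |
               (((uint32_t)(f.g + 0.5f) <<  8) & 0x0000ff00) |
               (((uint32_t)(f.b + 0.5f) <<  0) & 0x000000ff);
    }
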
diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
index 1ff1a49..72796fc 100644
--- a/pixman/pixman-image.c
+++ b/pixman/pixman-image.c
@@ -21,7 +21,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
@@ -335,37 +335,47 @@ compute_image_info (pixman_image_t *image)
{
flags |= FAST_PATH_NEAREST_FILTER;
}
- else if (
- /* affine and integer translation components in matrix ... */
- ((flags & FAST_PATH_AFFINE_TRANSFORM) &&
- !pixman_fixed_frac (image->common.transform->matrix[0][2] |
- image->common.transform->matrix[1][2])) &&
- (
- /* ... combined with a simple rotation */
- (flags & (FAST_PATH_ROTATE_90_TRANSFORM |
- FAST_PATH_ROTATE_180_TRANSFORM |
- FAST_PATH_ROTATE_270_TRANSFORM)) ||
- /* ... or combined with a simple non-rotated translation */
- (image->common.transform->matrix[0][0] == pixman_fixed_1 &&
- image->common.transform->matrix[1][1] == pixman_fixed_1 &&
- image->common.transform->matrix[0][1] == 0 &&
- image->common.transform->matrix[1][0] == 0)
- )
- )
+ else if (flags & FAST_PATH_AFFINE_TRANSFORM)
{
- /* FIXME: there are some affine-test failures, showing that
- * handling of BILINEAR and NEAREST filter is not quite
- * equivalent when getting close to 32K for the translation
- * components of the matrix. That's likely some bug, but for
- * now just skip BILINEAR->NEAREST optimization in this case.
+ /* Suppose the transform is
+ *
+ * [ t00, t01, t02 ]
+ * [ t10, t11, t12 ]
+ * [ 0, 0, 1 ]
+ *
+ * and the destination coordinates are (n + 0.5, m + 0.5). Then
+ * the transformed x coordinate is:
+ *
+ * tx = t00 * (n + 0.5) + t01 * (m + 0.5) + t02
+ * = t00 * n + t01 * m + t02 + (t00 + t01) * 0.5
+ *
+ * which implies that if t00, t01 and t02 are all integers
+ * and (t00 + t01) is odd, then tx will be an integer plus 0.5,
+ * which means a BILINEAR filter will reduce to NEAREST. The same
+ * applies in the y direction.
*/
- pixman_fixed_t magic_limit = pixman_int_to_fixed (30000);
- if (image->common.transform->matrix[0][2] <= magic_limit &&
- image->common.transform->matrix[1][2] <= magic_limit &&
- image->common.transform->matrix[0][2] >= -magic_limit &&
- image->common.transform->matrix[1][2] >= -magic_limit)
+ pixman_fixed_t (*t)[3] = image->common.transform->matrix;
+
+ if ((pixman_fixed_frac (
+ t[0][0] | t[0][1] | t[0][2] |
+ t[1][0] | t[1][1] | t[1][2]) == 0) &&
+ (pixman_fixed_to_int (
+ (t[0][0] + t[0][1]) & (t[1][0] + t[1][1])) % 2) == 1)
{
- flags |= FAST_PATH_NEAREST_FILTER;
+ /* FIXME: there are some affine-test failures, showing that
+ * handling of BILINEAR and NEAREST filter is not quite
+ * equivalent when getting close to 32K for the translation
+ * components of the matrix. That's likely some bug, but for
+ * now just skip BILINEAR->NEAREST optimization in this case.
+ */
+ pixman_fixed_t magic_limit = pixman_int_to_fixed (30000);
+ if (image->common.transform->matrix[0][2] <= magic_limit &&
+ image->common.transform->matrix[1][2] <= magic_limit &&
+ image->common.transform->matrix[0][2] >= -magic_limit &&
+ image->common.transform->matrix[1][2] >= -magic_limit)
+ {
+ flags |= FAST_PATH_NEAREST_FILTER;
+ }
}
}
break;
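
A quick numeric check of the parity argument in the comment above, using illustrative values t00 = 2, t01 = 1, t02 = 5 (all integral, t00 + t01 odd):

    #include <stdio.h>

    int
    main (void)
    {
        double t00 = 2.0, t01 = 1.0, t02 = 5.0;
        int n, m;

        for (n = 0; n < 2; n++)
            for (m = 0; m < 2; m++)
            {
                /* tx = t00*n + t01*m + t02 + (t00 + t01) * 0.5
                 *    = integer + 1.5, so it always lands on a .5 */
                double tx = t00 * (n + 0.5) + t01 * (m + 0.5) + t02;
                printf ("n=%d m=%d tx=%.1f\n", n, m, tx);
            }
        return 0;
    }
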
@@ -557,7 +567,7 @@ _pixman_image_validate (pixman_image_t *image)
PIXMAN_EXPORT pixman_bool_t
pixman_image_set_clip_region32 (pixman_image_t * image,
- pixman_region32_t *region)
+ const pixman_region32_t *region)
{
image_common_t *common = (image_common_t *)image;
pixman_bool_t result;
@@ -581,7 +591,7 @@ pixman_image_set_clip_region32 (pixman_image_t * image,
PIXMAN_EXPORT pixman_bool_t
pixman_image_set_clip_region (pixman_image_t * image,
- pixman_region16_t *region)
+ const pixman_region16_t *region)
{
image_common_t *common = (image_common_t *)image;
pixman_bool_t result;
@@ -674,6 +684,41 @@ pixman_image_set_repeat (pixman_image_t *image,
image_property_changed (image);
}
+PIXMAN_EXPORT void
+pixman_image_set_dither (pixman_image_t *image,
+ pixman_dither_t dither)
+{
+ if (image->type == BITS)
+ {
+ if (image->bits.dither == dither)
+ return;
+
+ image->bits.dither = dither;
+
+ image_property_changed (image);
+ }
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_dither_offset (pixman_image_t *image,
+ int offset_x,
+ int offset_y)
+{
+ if (image->type == BITS)
+ {
+ if (image->bits.dither_offset_x == offset_x &&
+ image->bits.dither_offset_y == offset_y)
+ {
+ return;
+ }
+
+ image->bits.dither_offset_x = offset_x;
+ image->bits.dither_offset_y = offset_y;
+
+ image_property_changed (image);
+ }
+}
+
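
Typical use of the two new setters on a 16-bpp destination. The dither mode names follow the enum this series introduces (PIXMAN_DITHER_ORDERED_BLUE_NOISE_64 is assumed here), and the helper name is illustrative:

    #include <stdint.h>
    #include <pixman.h>

    static pixman_image_t *
    make_dithered_target (int width, int height, uint32_t *bits, int stride)
    {
        pixman_image_t *dst =
            pixman_image_create_bits (PIXMAN_r5g6b5, width, height,
                                      bits, stride);

        pixman_image_set_dither (dst, PIXMAN_DITHER_ORDERED_BLUE_NOISE_64);

        /* Offsetting the matrix keeps the pattern stable when the
         * surface is rendered in tiles. */
        pixman_image_set_dither_offset (dst, 0, 0);

        return dst;
    }
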
PIXMAN_EXPORT pixman_bool_t
pixman_image_set_filter (pixman_image_t * image,
pixman_filter_t filter,
@@ -832,6 +877,10 @@ pixman_image_set_accessors (pixman_image_t * image,
if (image->type == BITS)
{
+ /* Accessors only work for <= 32 bpp. */
+ if (PIXMAN_FORMAT_BPP(image->bits.format) > 32)
+ return_if_fail (!read_func && !write_func);
+
image->bits.read_func = read_func;
image->bits.write_func = write_func;
@@ -911,7 +960,7 @@ _pixman_image_get_solid (pixman_implementation_t *imp,
else if (image->bits.format == PIXMAN_x8r8g8b8)
result = image->bits.bits[0] | 0xff000000;
else if (image->bits.format == PIXMAN_a8)
- result = (*(uint8_t *)image->bits.bits) << 24;
+ result = (uint32_t)(*(uint8_t *)image->bits.bits) << 24;
else
goto otherwise;
}
diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c
index 5884054..69fa70b 100644
--- a/pixman/pixman-implementation.c
+++ b/pixman/pixman-implementation.c
@@ -22,7 +22,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
#include "pixman-private.h"
@@ -63,7 +63,7 @@ typedef struct
} cache [N_CACHED_FAST_PATHS];
} cache_t;
-PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
+PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache)
static void
dummy_composite_rect (pixman_implementation_t *imp,
@@ -380,6 +380,11 @@ _pixman_disabled (const char *name)
return FALSE;
}
+static const pixman_fast_path_t empty_fast_path[] =
+{
+ { PIXMAN_OP_NONE }
+};
+
pixman_implementation_t *
_pixman_choose_implementation (void)
{
@@ -397,5 +402,16 @@ _pixman_choose_implementation (void)
imp = _pixman_implementation_create_noop (imp);
+ if (_pixman_disabled ("wholeops"))
+ {
+ pixman_implementation_t *cur;
+
+ /* Disable all whole-operation paths except the general one,
+ * so that optimized iterators are used as much as possible.
+ */
+ for (cur = imp; cur->fallback; cur = cur->fallback)
+ cur->fast_paths = empty_fast_path;
+ }
+
return imp;
}
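
The "wholeops" knob, like the other names accepted by _pixman_disabled, is read from the PIXMAN_DISABLE environment variable. A simplified sketch of how such a knob can be consumed (an assumption modelled on _pixman_disabled elsewhere in pixman-implementation.c, which also reports the disable to stderr):

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    static int
    is_disabled (const char *name)
    {
        const char *env = getenv ("PIXMAN_DISABLE");

        if (env && strstr (env, name))
        {
            fprintf (stderr, "pixman: disabled %s implementation\n", name);
            return 1;
        }
        return 0;
    }
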
diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h
index dd1c2f1..f785910 100644
--- a/pixman/pixman-inlines.h
+++ b/pixman/pixman-inlines.h
@@ -222,6 +222,31 @@ bilinear_interpolation (uint32_t tl, uint32_t tr,
#endif
#endif // BILINEAR_INTERPOLATION_BITS <= 4
+static force_inline argb_t
+bilinear_interpolation_float (argb_t tl, argb_t tr,
+ argb_t bl, argb_t br,
+ float distx, float disty)
+{
+ float distxy, distxiy, distixy, distixiy;
+ argb_t r;
+
+ distxy = distx * disty;
+ distxiy = distx * (1.f - disty);
+ distixy = (1.f - distx) * disty;
+ distixiy = (1.f - distx) * (1.f - disty);
+
+ r.a = tl.a * distixiy + tr.a * distxiy +
+ bl.a * distixy + br.a * distxy;
+ r.r = tl.r * distixiy + tr.r * distxiy +
+ bl.r * distixy + br.r * distxy;
+ r.g = tl.g * distixiy + tr.g * distxiy +
+ bl.g * distixy + br.g * distxy;
+ r.b = tl.b * distixiy + tr.b * distxiy +
+ bl.b * distixy + br.b * distxy;
+
+ return r;
+}
+
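
The four weights in bilinear_interpolation_float always sum to 1, since (1-dx)(1-dy) + dx(1-dy) + (1-dx)dy + dx*dy = 1, so interpolating four equal pixels returns that value (up to rounding). A single-channel check:

    #include <assert.h>

    static float
    lerp2d (float tl, float tr, float bl, float br, float dx, float dy)
    {
        return tl * (1.f - dx) * (1.f - dy) + tr * dx * (1.f - dy) +
               bl * (1.f - dx) * dy         + br * dx * dy;
    }

    int
    main (void)
    {
        float v = lerp2d (0.25f, 0.25f, 0.25f, 0.25f, 0.3f, 0.7f);
        assert (v > 0.2499f && v < 0.2501f);  /* 0.25 up to rounding */
        return 0;
    }
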
/*
* For each scanline fetched from source image with PAD repeat:
* - calculate how many pixels need to be padded on the left side
@@ -747,7 +772,8 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp,
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func) \
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func), \
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
/*****************************************************************************/
diff --git a/pixman/pixman-linear-gradient.c b/pixman/pixman-linear-gradient.c
index 40c8c9f..014b69c 100644
--- a/pixman/pixman-linear-gradient.c
+++ b/pixman/pixman-linear-gradient.c
@@ -26,7 +26,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
#include "pixman-private.h"
@@ -89,8 +89,11 @@ linear_gradient_is_horizontal (pixman_image_t *image,
}
static uint32_t *
-linear_get_scanline_narrow (pixman_iter_t *iter,
- const uint32_t *mask)
+linear_get_scanline (pixman_iter_t *iter,
+ const uint32_t *mask,
+ int Bpp,
+ pixman_gradient_walker_write_t write_pixel,
+ pixman_gradient_walker_fill_t fill_pixel)
{
pixman_image_t *image = iter->image;
int x = iter->x;
@@ -103,7 +106,7 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
pixman_fixed_48_16_t dx, dy;
gradient_t *gradient = (gradient_t *)image;
linear_gradient_t *linear = (linear_gradient_t *)image;
- uint32_t *end = buffer + width;
+ uint32_t *end = buffer + width * (Bpp / 4);
pixman_gradient_walker_t walker;
_pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
@@ -137,7 +140,7 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
if (l == 0 || unit.vector[2] == 0)
{
/* affine transformation only */
- pixman_fixed_32_32_t t, next_inc;
+ pixman_fixed_32_32_t t, next_inc;
double inc;
if (l == 0 || v.vector[2] == 0)
@@ -152,7 +155,7 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
invden = pixman_fixed_1 * (double) pixman_fixed_1 /
(l * (double) v.vector[2]);
v2 = v.vector[2] * (1. / pixman_fixed_1);
- t = ((dx * v.vector[0] + dy * v.vector[1]) -
+ t = ((dx * v.vector[0] + dy * v.vector[1]) -
(dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
}
@@ -160,11 +163,7 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
if (((pixman_fixed_32_32_t )(inc * width)) == 0)
{
- register uint32_t color;
-
- color = _pixman_gradient_walker_pixel (&walker, t);
- while (buffer < end)
- *buffer++ = color;
+ fill_pixel (&walker, t, buffer, end);
}
else
{
@@ -175,12 +174,11 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
{
if (!mask || *mask++)
{
- *buffer = _pixman_gradient_walker_pixel (&walker,
- t + next_inc);
+ write_pixel (&walker, t + next_inc, buffer);
}
i++;
next_inc = inc * i;
- buffer++;
+ buffer += (Bpp / 4);
}
}
}
@@ -202,14 +200,14 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
invden = pixman_fixed_1 * (double) pixman_fixed_1 /
(l * (double) v.vector[2]);
v2 = v.vector[2] * (1. / pixman_fixed_1);
- t = ((dx * v.vector[0] + dy * v.vector[1]) -
+ t = ((dx * v.vector[0] + dy * v.vector[1]) -
(dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
}
- *buffer = _pixman_gradient_walker_pixel (&walker, t);
+ write_pixel (&walker, t, buffer);
}
- ++buffer;
+ buffer += (Bpp / 4);
v.vector[0] += unit.vector[0];
v.vector[1] += unit.vector[1];
@@ -223,14 +221,21 @@ linear_get_scanline_narrow (pixman_iter_t *iter,
}
static uint32_t *
-linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+linear_get_scanline_narrow (pixman_iter_t *iter,
+ const uint32_t *mask)
{
- uint32_t *buffer = linear_get_scanline_narrow (iter, NULL);
+ return linear_get_scanline (iter, mask, 4,
+ _pixman_gradient_walker_write_narrow,
+ _pixman_gradient_walker_fill_narrow);
+}
- pixman_expand_to_float (
- (argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
- return buffer;
+static uint32_t *
+linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ return linear_get_scanline (iter, NULL, 16,
+ _pixman_gradient_walker_write_wide,
+ _pixman_gradient_walker_fill_wide);
}
void
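
The refactoring above folds the narrow and wide scanline walkers into one routine parameterized by bytes-per-pixel and writer/filler callbacks; the buffer pointer stays a uint32_t *, so it advances by Bpp / 4 per pixel. The pattern in miniature (names illustrative, not pixman's):

    #include <stdint.h>

    typedef void (*write_fn) (uint32_t *dst, float t);

    static void
    walk_scanline (uint32_t *buffer, int width, int Bpp, write_fn write)
    {
        uint32_t *end = buffer + width * (Bpp / 4);  /* 4 narrow, 16 wide */
        float t = 0.f;

        while (buffer < end)
        {
            write (buffer, t);
            t += 1.f;
            buffer += Bpp / 4;
        }
    }
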
diff --git a/pixman/pixman-matrix.c b/pixman/pixman-matrix.c
index 4032c13..da5209c 100644
--- a/pixman/pixman-matrix.c
+++ b/pixman/pixman-matrix.c
@@ -25,7 +25,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <math.h>
@@ -273,7 +273,7 @@ pixman_transform_point_31_16 (const pixman_transform_t *t,
{
/* the divisor is small, we can actually keep all the bits */
int64_t hi, rhi, lo, rlo;
- int64_t div = (divint << 16) + divfrac;
+ int64_t div = ((uint64_t)divint << 16) + divfrac;
fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32);
rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
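
The cast matters because left-shifting a negative int64_t is undefined behaviour in C; doing the shift on the unsigned bit pattern and converting back yields the intended fixed-point value. In isolation (a sketch, assuming two's-complement conversion semantics):

    #include <stdint.h>

    /* Build a 48.16 fixed-point value from integer and fractional
     * parts without shifting a possibly negative signed value. */
    static int64_t
    make_fixed_64_16 (int64_t divint, int64_t divfrac)
    {
        return (int64_t) (((uint64_t) divint << 16) + divfrac);
    }
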
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 866e93e..9dad163 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * Author: Nemanja Lukic (nlukic@mips.com)
+ * Author: Nemanja Lukic (nemanja.lukic@rt-rk.com)
*/
#include "pixman-private.h"
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index cab122d..e238566 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * Author: Nemanja Lukic (nlukic@mips.com)
+ * Author: Nemanja Lukic (nemanja.lukic@rt-rk.com)
*/
#ifndef PIXMAN_MIPS_DSPR2_ASM_H
@@ -72,6 +72,7 @@
#define LEAF_MIPS32R2(symbol) \
.globl symbol; \
.align 2; \
+ .hidden symbol; \
.type symbol, @function; \
.ent symbol, 0; \
symbol: .frame sp, 0, ra; \
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index e10c9df..c43eb1e 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -26,11 +26,11 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * Author: Nemanja Lukic (nlukic@mips.com)
+ * Author: Nemanja Lukic (nemanja.lukic@rt-rk.com)
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -388,11 +388,11 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
- PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
- PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
- PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, mips_0565_8_0565),
- PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, mips_0565_8_0565),
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, mips_0565_8_0565),
+ SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, mips_0565_8_0565),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),
diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h
index 955ed70..57b3835 100644
--- a/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman-mips-dspr2.h
@@ -26,7 +26,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * Author: Nemanja Lukic (nlukic@mips.com)
+ * Author: Nemanja Lukic (nemanja.lukic@rt-rk.com)
*/
#ifndef PIXMAN_MIPS_DSPR2_H
@@ -328,12 +328,6 @@ FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_pad_##op, \
scaled_nearest_scanline_mips_##name##_##op, \
src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
-/* Provide entries for the fast path table */
-#define PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
- SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
-
/****************************************************************************/
#define PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST(flags, name, op, \
diff --git a/pixman/pixman-mips.c b/pixman/pixman-mips.c
index 3048813..7479a08 100644
--- a/pixman/pixman-mips.c
+++ b/pixman/pixman-mips.c
@@ -20,7 +20,7 @@
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index f9a92ce..52c70e9 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -30,7 +30,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
@@ -60,7 +60,7 @@ _mm_empty (void)
#endif
#ifdef USE_X86_MMX
-# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
+# if (defined(__SSE2__) || defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
# include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
@@ -89,21 +89,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
return __A;
}
-# ifdef __OPTIMIZE__
-extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
-{
- __m64 ret;
-
- asm ("pshufw %2, %1, %0\n\t"
- : "=y" (ret)
- : "y" (__A), "K" (__N)
- );
-
- return ret;
-}
-# else
-# define _mm_shuffle_pi16(A, N) \
+# define _mm_shuffle_pi16(A, N) \
({ \
__m64 ret; \
\
@@ -114,11 +100,10 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
\
ret; \
})
-# endif
# endif
#endif
-#ifndef _MSC_VER
+#ifndef _MM_SHUFFLE
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif
@@ -402,8 +387,10 @@ in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
- /* x86's alignment restrictions are very relaxed. */
- return *(__m64 *)p;
+ /* x86's alignment restrictions are very relaxed, but that's no excuse */
+ __m64 r;
+ memcpy(&r, p, sizeof(__m64));
+ return r;
#elif defined USE_ARM_IWMMXT
int align = (uintptr_t)p & 7;
__m64 *aligned_p;
@@ -422,7 +409,9 @@ static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
/* x86's alignment restrictions are very relaxed. */
- return *p;
+ uint32_t r;
+ memcpy(&r, p, sizeof(uint32_t));
+ return r;
#else
struct __una_u32 { uint32_t x __attribute__((packed)); };
const struct __una_u32 *ptr = (const struct __una_u32 *) p;
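
memcpy is the portable idiom for an unaligned load: it carries no alignment assumption, so there is no undefined behaviour, and x86 compilers lower it to a single mov. The same idiom in isolation:

    #include <stdint.h>
    #include <string.h>

    static inline uint32_t
    load_u32_unaligned (const void *p)
    {
        uint32_t v;
        memcpy (&v, p, sizeof v);  /* folded to one mov on x86 */
        return v;
    }
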
@@ -3555,6 +3544,105 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
_mm_empty ();
}
+static force_inline void
+scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t* pd,
+ const uint32_t* ps,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t src_width_fixed,
+ pixman_bool_t fully_transparent_src)
+{
+ if (fully_transparent_src)
+ return;
+
+ while (w)
+ {
+ __m64 d = load (pd);
+ __m64 s = load (ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
+ pd++;
+
+ w--;
+ }
+
+ _mm_empty ();
+}
+
+FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
+ scaled_nearest_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
+ scaled_nearest_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
+ scaled_nearest_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
+ scaled_nearest_scanline_mmx_8888_8888_OVER,
+ uint32_t, uint32_t, NORMAL)
+
+static force_inline void
+scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask,
+ uint32_t * dst,
+ const uint32_t * src,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t src_width_fixed,
+ pixman_bool_t zero_src)
+{
+ __m64 mm_mask;
+
+ if (zero_src || (*mask >> 24) == 0)
+ {
+ /* A workaround for https://gcc.gnu.org/PR47759 */
+ _mm_empty ();
+ return;
+ }
+
+ mm_mask = expand_alpha (load8888 (mask));
+
+ while (w)
+ {
+ uint32_t s = *(src + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ if (s)
+ {
+ __m64 ms = load8888 (&s);
+ __m64 alpha = expand_alpha (ms);
+ __m64 dest = load8888 (dst);
+
+ store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
+ }
+
+ dst++;
+ w--;
+ }
+
+ _mm_empty ();
+}
+
+FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
+ scaled_nearest_scanline_mmx_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
+ scaled_nearest_scanline_mmx_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
+ scaled_nearest_scanline_mmx_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
+ scaled_nearest_scanline_mmx_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
+
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
#define BMSK (BSHIFT - 1)
@@ -3866,7 +3954,7 @@ mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
while (w && (((uintptr_t)dst) & 15))
{
- *dst++ = *(src++) << 24;
+ *dst++ = (uint32_t)*(src++) << 24;
w--;
}
@@ -3893,7 +3981,7 @@ mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
while (w)
{
- *dst++ = *(src++) << 24;
+ *dst++ = (uint32_t)*(src++) << 24;
w--;
}
@@ -3995,6 +4083,16 @@ static const pixman_fast_path_t mmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
+
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888 ),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888 ),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888 ),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888 ),
+
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
diff --git a/pixman/pixman-noop.c b/pixman/pixman-noop.c
index e598904..e43199b 100644
--- a/pixman/pixman-noop.c
+++ b/pixman/pixman-noop.c
@@ -22,7 +22,7 @@
* DEALINGS IN THE SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <string.h>
#include <stdlib.h>
diff --git a/pixman/pixman-ppc.c b/pixman/pixman-ppc.c
index a6e7bb0..926eb44 100644
--- a/pixman/pixman-ppc.c
+++ b/pixman/pixman-ppc.c
@@ -20,7 +20,7 @@
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -68,6 +68,24 @@ pixman_have_vmx (void)
return have_vmx;
}
+#elif defined (__FreeBSD__)
+#include <machine/cpu.h>
+#include <sys/auxv.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+ unsigned long cpufeatures;
+ int have_vmx;
+
+ if (elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures)))
+ return FALSE;
+
+ have_vmx = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
+ return have_vmx;
+}
+
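
elf_aux_info() is FreeBSD's counterpart to glibc's getauxval(). On Linux with glibc >= 2.16 the equivalent check could be written as below; this is an alternative sketch, not the code this series uses, since the Linux path that follows reads the auxv data itself:

    #include <sys/auxv.h>      /* getauxval, AT_HWCAP */
    #include <asm/cputable.h>  /* PPC_FEATURE_HAS_ALTIVEC */

    static int
    have_altivec (void)
    {
        return (getauxval (AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC) != 0;
    }
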
#elif defined (__linux__)
#include <sys/types.h>
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index fdc966a..34fb69b 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -1,5 +1,3 @@
-#include <float.h>
-
#ifndef PIXMAN_PRIVATE_H
#define PIXMAN_PRIVATE_H
@@ -7,7 +5,7 @@
* The defines which are shared between C and assembly code
*/
-/* bilinear interpolation precision (must be <= 8) */
+/* bilinear interpolation precision (must be < 8) */
#define BILINEAR_INTERPOLATION_BITS 7
#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
@@ -30,6 +28,7 @@
#include <stdio.h>
#include <string.h>
#include <stddef.h>
+#include <float.h>
#include "pixman-compiler.h"
@@ -181,6 +180,10 @@ struct bits_image
uint32_t * free_me;
int rowstride; /* in number of uint32_t's */
+ pixman_dither_t dither;
+ uint32_t dither_offset_y;
+ uint32_t dither_offset_x;
+
fetch_scanline_t fetch_scanline_32;
fetch_pixel_32_t fetch_pixel_32;
store_scanline_t store_scanline_32;
@@ -364,9 +367,38 @@ void
_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker,
pixman_fixed_48_16_t pos);
-uint32_t
-_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
- pixman_fixed_48_16_t x);
+typedef void (*pixman_gradient_walker_write_t) (
+ pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer);
+
+void
+_pixman_gradient_walker_write_narrow(pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer);
+
+void
+_pixman_gradient_walker_write_wide(pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer);
+
+typedef void (*pixman_gradient_walker_fill_t) (
+ pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end);
+
+void
+_pixman_gradient_walker_fill_narrow(pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end);
+
+void
+_pixman_gradient_walker_fill_wide(pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x,
+ uint32_t *buffer,
+ uint32_t *end);
/*
* Edges
@@ -608,6 +640,11 @@ pixman_implementation_t *
_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
#endif
+#ifdef USE_ARM_A64_NEON
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+#endif
+
#ifdef USE_MIPS_DSPR2
pixman_implementation_t *
_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
@@ -819,11 +856,11 @@ pixman_contract_from_float (uint32_t *dst,
/* Region Helpers */
pixman_bool_t
pixman_region32_copy_from_region16 (pixman_region32_t *dst,
- pixman_region16_t *src);
+ const pixman_region16_t *src);
pixman_bool_t
pixman_region16_copy_from_region32 (pixman_region16_t *dst,
- pixman_region32_t *src);
+ const pixman_region32_t *src);
/* Doubly linked lists */
typedef struct pixman_link_t pixman_link_t;
@@ -1013,28 +1050,9 @@ float pixman_unorm_to_float (uint16_t u, int n_bits);
* Various debugging code
*/
-#undef DEBUG
-
#define COMPILE_TIME_ASSERT(x) \
do { typedef int compile_time_assertion [(x)?1:-1]; } while (0)
-/* Turn on debugging depending on what type of release this is
- */
-#if (((PIXMAN_VERSION_MICRO % 2) == 0) && ((PIXMAN_VERSION_MINOR % 2) == 1))
-
-/* Debugging gets turned on for development releases because these
- * are the things that end up in bleeding edge distributions such
- * as Rawhide etc.
- *
- * For performance reasons we don't turn it on for stable releases or
- * random git checkouts. (Random git checkouts are often used for
- * performance work).
- */
-
-# define DEBUG
-
-#endif
-
void
_pixman_log_error (const char *function, const char *message);
@@ -1074,16 +1092,19 @@ _pixman_log_error (const char *function, const char *message);
typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;
+PIXMAN_EXPORT
pixman_bool_t
pixman_transform_point_31_16 (const pixman_transform_t *t,
const pixman_vector_48_16_t *v,
pixman_vector_48_16_t *result);
+PIXMAN_EXPORT
void
pixman_transform_point_31_16_3d (const pixman_transform_t *t,
const pixman_vector_48_16_t *v,
pixman_vector_48_16_t *result);
+PIXMAN_EXPORT
void
pixman_transform_point_31_16_affine (const pixman_transform_t *t,
const pixman_vector_48_16_t *v,
diff --git a/pixman/pixman-radial-gradient.c b/pixman/pixman-radial-gradient.c
index 6a21796..38e1052 100644
--- a/pixman/pixman-radial-gradient.c
+++ b/pixman/pixman-radial-gradient.c
@@ -28,7 +28,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
#include <math.h>
@@ -66,15 +66,18 @@ fdot (double x1,
return x1 * x2 + y1 * y2 + z1 * z2;
}
-static uint32_t
-radial_compute_color (double a,
- double b,
- double c,
- double inva,
- double dr,
- double mindr,
- pixman_gradient_walker_t *walker,
- pixman_repeat_t repeat)
+static void
+radial_write_color (double a,
+ double b,
+ double c,
+ double inva,
+ double dr,
+ double mindr,
+ pixman_gradient_walker_t *walker,
+ pixman_repeat_t repeat,
+ int Bpp,
+ pixman_gradient_walker_write_t write_pixel,
+ uint32_t *buffer)
{
/*
* In this function error propagation can lead to bad results:
@@ -99,21 +102,31 @@ radial_compute_color (double a,
double t;
if (b == 0)
- return 0;
+ {
+ memset (buffer, 0, Bpp);
+ return;
+ }
t = pixman_fixed_1 / 2 * c / b;
if (repeat == PIXMAN_REPEAT_NONE)
{
if (0 <= t && t <= pixman_fixed_1)
- return _pixman_gradient_walker_pixel (walker, t);
+ {
+ write_pixel (walker, t, buffer);
+ return;
+ }
}
else
{
if (t * dr >= mindr)
- return _pixman_gradient_walker_pixel (walker, t);
+ {
+ write_pixel (walker, t, buffer);
+ return;
+ }
}
- return 0;
+ memset (buffer, 0, Bpp);
+ return;
}
discr = fdot (b, a, 0, b, -c, 0);
@@ -139,24 +152,40 @@ radial_compute_color (double a,
if (repeat == PIXMAN_REPEAT_NONE)
{
if (0 <= t0 && t0 <= pixman_fixed_1)
- return _pixman_gradient_walker_pixel (walker, t0);
+ {
+ write_pixel (walker, t0, buffer);
+ return;
+ }
else if (0 <= t1 && t1 <= pixman_fixed_1)
- return _pixman_gradient_walker_pixel (walker, t1);
+ {
+ write_pixel (walker, t1, buffer);
+ return;
+ }
}
else
{
if (t0 * dr >= mindr)
- return _pixman_gradient_walker_pixel (walker, t0);
+ {
+ write_pixel (walker, t0, buffer);
+ return;
+ }
else if (t1 * dr >= mindr)
- return _pixman_gradient_walker_pixel (walker, t1);
+ {
+ write_pixel (walker, t1, buffer);
+ return;
+ }
}
}
- return 0;
+ memset (buffer, 0, Bpp);
+ return;
}
static uint32_t *
-radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+radial_get_scanline (pixman_iter_t *iter,
+ const uint32_t *mask,
+ int Bpp,
+ pixman_gradient_walker_write_t write_pixel)
{
/*
* Implementation of radial gradients following the PDF specification.
@@ -247,7 +276,7 @@ radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
gradient_t *gradient = (gradient_t *)image;
radial_gradient_t *radial = (radial_gradient_t *)image;
- uint32_t *end = buffer + width;
+ uint32_t *end = buffer + width * (Bpp / 4);
pixman_gradient_walker_t walker;
pixman_vector_t v, unit;
@@ -330,18 +359,21 @@ radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
{
if (!mask || *mask++)
{
- *buffer = radial_compute_color (radial->a, b, c,
- radial->inva,
- radial->delta.radius,
- radial->mindr,
- &walker,
- image->common.repeat);
+ radial_write_color (radial->a, b, c,
+ radial->inva,
+ radial->delta.radius,
+ radial->mindr,
+ &walker,
+ image->common.repeat,
+ Bpp,
+ write_pixel,
+ buffer);
}
b += db;
c += dc;
dc += ddc;
- ++buffer;
+ buffer += (Bpp / 4);
}
}
else
@@ -375,20 +407,23 @@ radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
pdx, pdy, radial->c1.radius);
/* / pixman_fixed_1 / pixman_fixed_1 */
- *buffer = radial_compute_color (radial->a, b, c,
- radial->inva,
- radial->delta.radius,
- radial->mindr,
- &walker,
- image->common.repeat);
+ radial_write_color (radial->a, b, c,
+ radial->inva,
+ radial->delta.radius,
+ radial->mindr,
+ &walker,
+ image->common.repeat,
+ Bpp,
+ write_pixel,
+ buffer);
}
else
{
- *buffer = 0;
+ memset (buffer, 0, Bpp);
}
}
- ++buffer;
+ buffer += (Bpp / 4);
v.vector[0] += unit.vector[0];
v.vector[1] += unit.vector[1];
@@ -401,14 +436,17 @@ radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
}
static uint32_t *
-radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
{
- uint32_t *buffer = radial_get_scanline_narrow (iter, NULL);
-
- pixman_expand_to_float (
- (argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+ return radial_get_scanline (iter, mask, 4,
+ _pixman_gradient_walker_write_narrow);
+}
- return buffer;
+static uint32_t *
+radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+ return radial_get_scanline (iter, NULL, 16,
+ _pixman_gradient_walker_write_wide);
}
void
@@ -422,11 +460,11 @@ _pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
PIXMAN_EXPORT pixman_image_t *
pixman_image_create_radial_gradient (const pixman_point_fixed_t * inner,
- const pixman_point_fixed_t * outer,
- pixman_fixed_t inner_radius,
- pixman_fixed_t outer_radius,
- const pixman_gradient_stop_t *stops,
- int n_stops)
+ const pixman_point_fixed_t * outer,
+ pixman_fixed_t inner_radius,
+ pixman_fixed_t outer_radius,
+ const pixman_gradient_stop_t *stops,
+ int n_stops)
{
pixman_image_t *image;
radial_gradient_t *radial;
diff --git a/pixman/pixman-region.c b/pixman/pixman-region.c
index 59bc9c7..537d5fb 100644
--- a/pixman/pixman-region.c
+++ b/pixman/pixman-region.c
@@ -76,7 +76,7 @@
#define PIXREGION_SIZE(reg) ((reg)->data ? (reg)->data->size : 0)
#define PIXREGION_RECTS(reg) \
((reg)->data ? (box_type_t *)((reg)->data + 1) \
- : &(reg)->extents)
+ : (box_type_t *)&(reg)->extents)
#define PIXREGION_BOXPTR(reg) ((box_type_t *)((reg)->data + 1))
#define PIXREGION_BOX(reg, i) (&PIXREGION_BOXPTR (reg)[i])
#define PIXREGION_TOP(reg) PIXREGION_BOX (reg, (reg)->data->numRects)
@@ -292,7 +292,7 @@ alloc_data (size_t n)
} while (0)
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_equal) (region_type_t *reg1, region_type_t *reg2)
+PREFIX (_equal) (const region_type_t *reg1, const region_type_t *reg2)
{
int i;
box_type_t *rects1;
@@ -395,7 +395,7 @@ PREFIX (_init_rect) (region_type_t * region,
}
PIXMAN_EXPORT void
-PREFIX (_init_with_extents) (region_type_t *region, box_type_t *extents)
+PREFIX (_init_with_extents) (region_type_t *region, const box_type_t *extents)
{
if (!GOOD_RECT (extents))
{
@@ -417,13 +417,13 @@ PREFIX (_fini) (region_type_t *region)
}
PIXMAN_EXPORT int
-PREFIX (_n_rects) (region_type_t *region)
+PREFIX (_n_rects) (const region_type_t *region)
{
return PIXREGION_NUMRECTS (region);
}
PIXMAN_EXPORT box_type_t *
-PREFIX (_rectangles) (region_type_t *region,
+PREFIX (_rectangles) (const region_type_t *region,
int *n_rects)
{
if (n_rects)
@@ -505,7 +505,7 @@ pixman_rect_alloc (region_type_t * region,
}
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_copy) (region_type_t *dst, region_type_t *src)
+PREFIX (_copy) (region_type_t *dst, const region_type_t *src)
{
GOOD (dst);
GOOD (src);
@@ -746,8 +746,8 @@ typedef pixman_bool_t (*overlap_proc_ptr) (region_type_t *region,
static pixman_bool_t
pixman_op (region_type_t * new_reg, /* Place to store result */
- region_type_t * reg1, /* First region in operation */
- region_type_t * reg2, /* 2d region in operation */
+ const region_type_t * reg1, /* First region in operation */
+ const region_type_t * reg2, /* 2d region in operation */
overlap_proc_ptr overlap_func, /* Function to call for over-
* lapping bands */
int append_non1, /* Append non-overlapping bands
@@ -1155,8 +1155,8 @@ pixman_region_intersect_o (region_type_t *region,
PIXMAN_EXPORT pixman_bool_t
PREFIX (_intersect) (region_type_t * new_reg,
- region_type_t * reg1,
- region_type_t * reg2)
+ const region_type_t * reg1,
+ const region_type_t * reg2)
{
GOOD (reg1);
GOOD (reg2);
@@ -1321,7 +1321,7 @@ pixman_region_union_o (region_type_t *region,
PIXMAN_EXPORT pixman_bool_t
PREFIX(_intersect_rect) (region_type_t *dest,
- region_type_t *source,
+ const region_type_t *source,
int x, int y,
unsigned int width,
unsigned int height)
@@ -1342,7 +1342,7 @@ PREFIX(_intersect_rect) (region_type_t *dest,
*/
PIXMAN_EXPORT pixman_bool_t
PREFIX (_union_rect) (region_type_t *dest,
- region_type_t *source,
+ const region_type_t *source,
int x,
int y,
unsigned int width,
@@ -1368,9 +1368,9 @@ PREFIX (_union_rect) (region_type_t *dest,
}
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_union) (region_type_t *new_reg,
- region_type_t *reg1,
- region_type_t *reg2)
+PREFIX (_union) (region_type_t * new_reg,
+ const region_type_t *reg1,
+ const region_type_t *reg2)
{
/* Return TRUE if some overlap
* between reg1, reg2
@@ -1954,9 +1954,9 @@ pixman_region_subtract_o (region_type_t * region,
*-----------------------------------------------------------------------
*/
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_subtract) (region_type_t *reg_d,
- region_type_t *reg_m,
- region_type_t *reg_s)
+PREFIX (_subtract) (region_type_t * reg_d,
+ const region_type_t *reg_m,
+ const region_type_t *reg_s)
{
GOOD (reg_m);
GOOD (reg_s);
@@ -2019,9 +2019,9 @@ PREFIX (_subtract) (region_type_t *reg_d,
*-----------------------------------------------------------------------
*/
PIXMAN_EXPORT pixman_bool_t
-PREFIX (_inverse) (region_type_t *new_reg, /* Destination region */
- region_type_t *reg1, /* Region to invert */
- box_type_t * inv_rect) /* Bounding box for inversion */
+PREFIX (_inverse) (region_type_t * new_reg, /* Destination region */
+ const region_type_t *reg1, /* Region to invert */
+ const box_type_t * inv_rect) /* Bounding box for inversion */
{
region_type_t inv_reg; /* Quick and dirty region made from the
* bounding box */
@@ -2113,8 +2113,8 @@ find_box_for_y (box_type_t *begin, box_type_t *end, int y)
* that doesn't overlap the box at all and part_in is false)
*/
PIXMAN_EXPORT pixman_region_overlap_t
-PREFIX (_contains_rectangle) (region_type_t * region,
- box_type_t * prect)
+PREFIX (_contains_rectangle) (const region_type_t * region,
+ const box_type_t * prect)
{
box_type_t * pbox;
box_type_t * pbox_end;
@@ -2318,7 +2318,7 @@ PREFIX (_translate) (region_type_t *region, int x, int y)
}
PIXMAN_EXPORT void
-PREFIX (_reset) (region_type_t *region, box_type_t *box)
+PREFIX (_reset) (region_type_t *region, const box_type_t *box)
{
GOOD (region);
@@ -2343,7 +2343,7 @@ PREFIX (_clear) (region_type_t *region)
/* box is "return" value */
PIXMAN_EXPORT int
-PREFIX (_contains_point) (region_type_t * region,
+PREFIX (_contains_point) (const region_type_t * region,
int x, int y,
box_type_t * box)
{
@@ -2387,7 +2387,15 @@ PREFIX (_contains_point) (region_type_t * region,
}
PIXMAN_EXPORT int
-PREFIX (_not_empty) (region_type_t * region)
+PREFIX (_empty) (const region_type_t * region)
+{
+ GOOD (region);
+
+ return(PIXREGION_NIL (region));
+}
+
+PIXMAN_EXPORT int
+PREFIX (_not_empty) (const region_type_t * region)
{
GOOD (region);
@@ -2395,11 +2403,11 @@ PREFIX (_not_empty) (region_type_t * region)
}
PIXMAN_EXPORT box_type_t *
-PREFIX (_extents) (region_type_t * region)
+PREFIX (_extents) (const region_type_t * region)
{
GOOD (region);
- return(&region->extents);
+ return(box_type_t *)(&region->extents);
}
/*
diff --git a/pixman/pixman-region16.c b/pixman/pixman-region16.c
index d88d338..da4719e 100644
--- a/pixman/pixman-region16.c
+++ b/pixman/pixman-region16.c
@@ -23,7 +23,7 @@
* Author: Soren Sandmann <sandmann@redhat.com>
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#undef PIXMAN_DISABLE_DEPRECATED
diff --git a/pixman/pixman-region32.c b/pixman/pixman-region32.c
index abd6b1a..68b456b 100644
--- a/pixman/pixman-region32.c
+++ b/pixman/pixman-region32.c
@@ -23,7 +23,7 @@
* Author: Soren Sandmann <sandmann@redhat.com>
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/pixman/pixman-solid-fill.c b/pixman/pixman-solid-fill.c
index 5f9fef6..44f4de0 100644
--- a/pixman/pixman-solid-fill.c
+++ b/pixman/pixman-solid-fill.c
@@ -22,7 +22,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -30,10 +30,10 @@ static uint32_t
color_to_uint32 (const pixman_color_t *color)
{
return
- (color->alpha >> 8 << 24) |
- (color->red >> 8 << 16) |
- (color->green & 0xff00) |
- (color->blue >> 8);
+ ((unsigned int) color->alpha >> 8 << 24) |
+ ((unsigned int) color->red >> 8 << 16) |
+ ((unsigned int) color->green & 0xff00) |
+ ((unsigned int) color->blue >> 8);
}
static argb_t
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index a6e7808..9c9cff2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -27,7 +27,7 @@
* Based on work by Owen Taylor and Søren Sandmann
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
@@ -373,16 +373,6 @@ load_128_unaligned (const __m128i* src)
return _mm_loadu_si128 (src);
}
-/* save 4 pixels using Write Combining memory on a 16-byte
- * boundary aligned address
- */
-static force_inline void
-save_128_write_combining (__m128i* dst,
- __m128i data)
-{
- _mm_stream_si128 (dst, data);
-}
-
/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
@@ -391,14 +381,6 @@ save_128_aligned (__m128i* dst,
_mm_store_si128 (dst, data);
}
-/* save 4 pixels on a unaligned address */
-static force_inline void
-save_128_unaligned (__m128i* dst,
- __m128i data)
-{
- _mm_storeu_si128 (dst, data);
-}
-
static force_inline __m128i
load_32_1x128 (uint32_t data)
{
@@ -518,7 +500,8 @@ core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
- uint32_t s = *ps;
+ uint32_t s;
+ memcpy(&s, ps, sizeof(uint32_t));
if (pm)
{
@@ -3201,7 +3184,7 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
int32_t w;
- uint32_t m, d;
+ uint32_t d;
__m128i xmm_src, xmm_alpha, xmm_def;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
@@ -3256,7 +3239,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- m = *((uint32_t*)mask);
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
if (srca == 0xff && m == 0xffffffff)
{
@@ -3333,8 +3317,8 @@ sse2_fill (pixman_implementation_t *imp,
if (bpp == 8)
{
- uint8_t b;
- uint16_t w;
+ uint32_t b;
+ uint32_t w;
stride = stride * (int) sizeof (uint32_t) / 1;
byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
@@ -3476,7 +3460,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
int32_t w;
- uint32_t m;
__m128i xmm_src, xmm_def;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
@@ -3528,7 +3511,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- m = *((uint32_t*)mask);
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
if (srca == 0xff && m == 0xffffffff)
{
@@ -3594,7 +3578,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
int32_t w;
- uint32_t m;
__m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
__m128i xmm_src, xmm_alpha;
@@ -3626,7 +3609,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
while (w && (uintptr_t)dst & 15)
{
- m = *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -3646,11 +3629,13 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
while (w >= 8)
{
+ uint32_t m;
+
xmm_dst = load_128_aligned ((__m128i*) dst);
unpack_565_128_4x128 (xmm_dst,
&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
- m = *((uint32_t*)mask);
+ memcpy(&m, mask, sizeof(uint32_t));
mask += 4;
if (m)
@@ -3670,7 +3655,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
&xmm_dst0, &xmm_dst1);
}
- m = *((uint32_t*)mask);
+ memcpy(&m, mask, sizeof(uint32_t));
mask += 4;
if (m)
@@ -3699,7 +3684,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
while (w)
{
- m = *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -4061,7 +4046,7 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
uint8_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
- uint32_t d, m;
+ uint32_t d;
uint32_t src;
int32_t w;
@@ -4088,7 +4073,7 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
while (w && ((uintptr_t)dst & 15))
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
d = (uint32_t) *dst;
*dst++ = (uint8_t) pack_1x128_32 (
@@ -4125,7 +4110,7 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
while (w)
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
d = (uint32_t) *dst;
*dst++ = (uint8_t) pack_1x128_32 (
@@ -4302,7 +4287,7 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
int dst_stride, mask_stride;
int32_t w;
uint32_t src;
- uint32_t m, d;
+ uint32_t d;
__m128i xmm_alpha;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
@@ -4327,7 +4312,7 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
while (w && ((uintptr_t)dst & 15))
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
d = (uint32_t) *dst;
*dst++ = (uint8_t) pack_1x128_32 (
@@ -4363,7 +4348,7 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
while (w)
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
d = (uint32_t) *dst;
*dst++ = (uint8_t) pack_1x128_32 (
@@ -4636,7 +4621,9 @@ sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- uint32_t m = *(uint32_t*)mask;
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
+
if (m)
{
__m128i xmm_mask_lo, xmm_mask_hi;
@@ -4743,7 +4730,7 @@ sse2_blt (pixman_implementation_t *imp,
while (w >= 2 && ((uintptr_t)d & 3))
{
- *(uint16_t *)d = *(uint16_t *)s;
+ memmove(d, s, 2);
w -= 2;
s += 2;
d += 2;
@@ -4751,7 +4738,7 @@ sse2_blt (pixman_implementation_t *imp,
while (w >= 4 && ((uintptr_t)d & 15))
{
- *(uint32_t *)d = *(uint32_t *)s;
+ memmove(d, s, 4);
w -= 4;
s += 4;
@@ -4788,7 +4775,7 @@ sse2_blt (pixman_implementation_t *imp,
while (w >= 4)
{
- *(uint32_t *)d = *(uint32_t *)s;
+ memmove(d, s, 4);
w -= 4;
s += 4;
@@ -4797,7 +4784,7 @@ sse2_blt (pixman_implementation_t *imp,
if (w >= 2)
{
- *(uint16_t *)d = *(uint16_t *)s;
+ memmove(d, s, 2);
w -= 2;
s += 2;
d += 2;
@@ -4829,7 +4816,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
uint32_t *src, *src_line, s;
uint32_t *dst, *dst_line, d;
uint8_t *mask, *mask_line;
- uint32_t m;
int src_stride, mask_stride, dst_stride;
int32_t w;
__m128i ms;
@@ -4858,8 +4844,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
while (w && (uintptr_t)dst & 15)
{
+ uint8_t m = *mask++;
s = 0xff000000 | *src++;
- m = (uint32_t) *mask++;
d = *dst;
ms = unpack_32_1x128 (s);
@@ -4877,7 +4863,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- m = *(uint32_t*) mask;
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
xmm_src = _mm_or_si128 (
load_128_unaligned ((__m128i*)src), mask_ff000000);
@@ -4913,7 +4900,7 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
while (w)
{
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -4954,7 +4941,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
uint32_t *src, *src_line, s;
uint32_t *dst, *dst_line, d;
uint8_t *mask, *mask_line;
- uint32_t m;
int src_stride, mask_stride, dst_stride;
int32_t w;
@@ -4983,9 +4969,9 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
while (w && (uintptr_t)dst & 15)
{
uint32_t sa;
+ uint8_t m = *mask++;
s = *src++;
- m = (uint32_t) *mask++;
d = *dst;
sa = s >> 24;
@@ -5016,7 +5002,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
while (w >= 4)
{
- m = *(uint32_t *) mask;
+ uint32_t m;
+ memcpy(&m, mask, sizeof(uint32_t));
if (m)
{
@@ -5055,9 +5042,9 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
while (w)
{
uint32_t sa;
+ uint8_t m = *mask++;
s = *src++;
- m = (uint32_t) *mask++;
d = *dst;
sa = s >> 24;
@@ -5924,13 +5911,11 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
intptr_t unit_x = unit_x_;
BILINEAR_DECLARE_VARIABLES;
uint32_t pix1, pix2;
- uint32_t m;
while (w && ((uintptr_t)dst & 15))
{
uint32_t sa;
-
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -5966,11 +5951,13 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
while (w >= 4)
{
+ uint32_t m;
+
__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- m = *(uint32_t*)mask;
+ memcpy(&m, mask, sizeof(uint32_t));
if (m)
{
@@ -6012,8 +5999,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
while (w)
{
uint32_t sa;
-
- m = (uint32_t) *mask++;
+ uint8_t m = *mask++;
if (m)
{
@@ -6274,31 +6260,15 @@ static const pixman_fast_path_t sse2_fast_paths[] =
PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
- SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
- SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
@@ -6426,7 +6396,7 @@ sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
while (w && (((uintptr_t)dst) & 15))
{
- *dst++ = *(src++) << 24;
+ *dst++ = (uint32_t)(*(src++)) << 24;
w--;
}
@@ -6453,7 +6423,7 @@ sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
while (w)
{
- *dst++ = *(src++) << 24;
+ *dst++ = (uint32_t)(*(src++)) << 24;
w--;
}
diff --git a/pixman/pixman-ssse3.c b/pixman/pixman-ssse3.c
index 680d6b9..0359895 100644
--- a/pixman/pixman-ssse3.c
+++ b/pixman/pixman-ssse3.c
@@ -24,7 +24,7 @@
* Author: Soren Sandmann (soren.sandmann@gmail.com)
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
diff --git a/pixman/pixman-timer.c b/pixman/pixman-timer.c
index f5ae18e..656d900 100644
--- a/pixman/pixman-timer.c
+++ b/pixman/pixman-timer.c
@@ -20,7 +20,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdlib.h>
diff --git a/pixman/pixman-trap.c b/pixman/pixman-trap.c
index 91766fd..0ec73dc 100644
--- a/pixman/pixman-trap.c
+++ b/pixman/pixman-trap.c
@@ -22,7 +22,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdio.h>
@@ -74,7 +74,7 @@ pixman_sample_floor_y (pixman_fixed_t y,
if (f < Y_FRAC_FIRST (n))
{
- if (pixman_fixed_to_int (i) == 0x8000)
+ if (pixman_fixed_to_int (i) == 0xffff8000)
{
f = 0; /* saturate */
}
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index 4a3a835..302cd0c 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -23,7 +23,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <stdio.h>
#include <stdlib.h>
@@ -206,7 +206,7 @@ pixman_contract_from_float (uint32_t *dst,
for (i = 0; i < width; ++i)
{
- uint8_t a, r, g, b;
+ uint32_t a, r, g, b;
a = float_to_unorm (src[i].a, 8);
r = float_to_unorm (src[i].r, 8);
@@ -238,7 +238,7 @@ _pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *in
pixman_bool_t
pixman_region16_copy_from_region32 (pixman_region16_t *dst,
- pixman_region32_t *src)
+ const pixman_region32_t *src)
{
int n_boxes, i;
pixman_box32_t *boxes32;
@@ -268,7 +268,7 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst,
pixman_bool_t
pixman_region32_copy_from_region16 (pixman_region32_t *dst,
- pixman_region16_t *src)
+ const pixman_region16_t *src)
{
int n_boxes, i;
pixman_box16_t *boxes16;
diff --git a/pixman/pixman-version.h.in b/pixman/pixman-version.h.in
index 256b2e6..64778a5 100644
--- a/pixman/pixman-version.h.in
+++ b/pixman/pixman-version.h.in
@@ -47,4 +47,8 @@
PIXMAN_VERSION_MINOR, \
PIXMAN_VERSION_MICRO)
+#ifndef PIXMAN_API
+# define PIXMAN_API
+#endif
+
#endif /* PIXMAN_VERSION_H__ */
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index c33631c..d4b5dc8 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -26,21 +26,45 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
+#include "pixman-inlines.h"
#include <altivec.h>
#define AVV(x...) {x}
+static vector unsigned int mask_ff000000;
+static vector unsigned int mask_red;
+static vector unsigned int mask_green;
+static vector unsigned int mask_blue;
+static vector unsigned int mask_565_fix_rb;
+static vector unsigned int mask_565_fix_g;
+
static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
+#ifdef WORDS_BIGENDIAN
return vec_perm (pix, pix,
(vector unsigned char)AVV (
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
+#else
+ return vec_perm (pix, pix,
+ (vector unsigned char)AVV (
+ 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07,
+ 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F));
+#endif
+}
+
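
The two permutation patterns differ because in an a8r8g8b8 pixel the alpha byte sits at offset 0 of each 32-bit lane on big-endian but at offset 3 on little-endian. A scalar equivalent of splat_alpha, endian-independent because C shifts operate on values rather than bytes:

    #include <stdint.h>

    static uint32_t
    splat_alpha_scalar (uint32_t pix)
    {
        uint32_t a = pix >> 24;               /* alpha of a8r8g8b8 */
        return (a << 24) | (a << 16) | (a << 8) | a;
    }
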
+static force_inline vector unsigned int
+splat_pixel (vector unsigned int pix)
+{
+ return vec_perm (pix, pix,
+ (vector unsigned char)AVV (
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03));
}
static force_inline vector unsigned int
@@ -50,12 +74,22 @@ pix_multiply (vector unsigned int p, vector unsigned int a)
/* unpack to short */
hi = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergeh ((vector unsigned char)AVV (0),
(vector unsigned char)p);
+#else
+ vec_mergeh ((vector unsigned char) p,
+ (vector unsigned char) AVV (0));
+#endif
mod = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergeh ((vector unsigned char)AVV (0),
(vector unsigned char)a);
+#else
+ vec_mergeh ((vector unsigned char) a,
+ (vector unsigned char) AVV (0));
+#endif
hi = vec_mladd (hi, mod, (vector unsigned short)
AVV (0x0080, 0x0080, 0x0080, 0x0080,
@@ -67,11 +101,22 @@ pix_multiply (vector unsigned int p, vector unsigned int a)
/* unpack to short */
lo = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergel ((vector unsigned char)AVV (0),
(vector unsigned char)p);
+#else
+ vec_mergel ((vector unsigned char) p,
+ (vector unsigned char) AVV (0));
+#endif
+
mod = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
vec_mergel ((vector unsigned char)AVV (0),
(vector unsigned char)a);
+#else
+ vec_mergel ((vector unsigned char) a,
+ (vector unsigned char) AVV (0));
+#endif
lo = vec_mladd (lo, mod, (vector unsigned short)
AVV (0x0080, 0x0080, 0x0080, 0x0080,
@@ -129,6 +174,7 @@ over (vector unsigned int src,
over (pix_multiply (src, mask), \
pix_multiply (srca, mask), dest)
+#ifdef WORDS_BIGENDIAN
#define COMPUTE_SHIFT_MASK(source) \
source ## _mask = vec_lvsl (0, source);
@@ -140,36 +186,294 @@ over (vector unsigned int src,
mask ## _mask = vec_lvsl (0, mask); \
source ## _mask = vec_lvsl (0, source);
-/* notice you have to declare temp vars...
- * Note: tmp3 and tmp4 must remain untouched!
- */
-
-#define LOAD_VECTORS(dest, source) \
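+/* LOAD_VECTOR now scopes its tmp1/tmp2 temporaries inside a
+ * do { } while (0) block, so callers no longer have to declare them. */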
+#define LOAD_VECTOR(source) \
+do \
+{ \
+ vector unsigned char tmp1, tmp2; \
tmp1 = (typeof(tmp1))vec_ld (0, source); \
tmp2 = (typeof(tmp2))vec_ld (15, source); \
- v ## source = (typeof(v ## source)) \
+ v ## source = (typeof(v ## source)) \
vec_perm (tmp1, tmp2, source ## _mask); \
- v ## dest = (typeof(v ## dest))vec_ld (0, dest);
+} while (0)
-#define LOAD_VECTORSC(dest, source, mask) \
- tmp1 = (typeof(tmp1))vec_ld (0, source); \
- tmp2 = (typeof(tmp2))vec_ld (15, source); \
- v ## source = (typeof(v ## source)) \
- vec_perm (tmp1, tmp2, source ## _mask); \
- tmp1 = (typeof(tmp1))vec_ld (0, mask); \
+#define LOAD_VECTORS(dest, source) \
+do \
+{ \
+ LOAD_VECTOR(source); \
v ## dest = (typeof(v ## dest))vec_ld (0, dest); \
- tmp2 = (typeof(tmp2))vec_ld (15, mask); \
- v ## mask = (typeof(v ## mask)) \
- vec_perm (tmp1, tmp2, mask ## _mask);
+} while (0)
+
+#define LOAD_VECTORSC(dest, source, mask) \
+do \
+{ \
+ LOAD_VECTORS(dest, source); \
+ LOAD_VECTOR(mask); \
+} while (0)
+
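+/* Callers declare the permute-mask temporaries through these macros;
+ * the little-endian variants below expand to nothing. */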
+#define DECLARE_SRC_MASK_VAR vector unsigned char src_mask
+#define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask
+
+#else
+
+/* The COMPUTE_SHIFT_{MASK, MASKS, MASKC} macros below are no-ops.
+ * Little-endian AltiVec can do unaligned reads natively, so there is
+ * no need to construct the permutation pattern variables.
+ */
+#define COMPUTE_SHIFT_MASK(source)
+
+#define COMPUTE_SHIFT_MASKS(dest, source)
+
+#define COMPUTE_SHIFT_MASKC(dest, source, mask)
+
+# define LOAD_VECTOR(source) \
+ v ## source = (typeof(v ## source))vec_xl(0, source);
+
+# define LOAD_VECTORS(dest, source) \
+ LOAD_VECTOR(source); \
+ LOAD_VECTOR(dest);
+
+# define LOAD_VECTORSC(dest, source, mask) \
+ LOAD_VECTORS(dest, source); \
+ LOAD_VECTOR(mask);
+
+#define DECLARE_SRC_MASK_VAR
+#define DECLARE_MASK_MASK_VAR
+
+#endif /* WORDS_BIGENDIAN */
#define LOAD_VECTORSM(dest, source, mask) \
- LOAD_VECTORSC (dest, source, mask) \
+ LOAD_VECTORSC (dest, source, mask); \
v ## source = pix_multiply (v ## source, \
splat_alpha (v ## mask));
#define STORE_VECTOR(dest) \
vec_st ((vector unsigned int) v ## dest, 0, dest);
+/* load 4 pixels from a 16-byte aligned address */
+static force_inline vector unsigned int
+load_128_aligned (const uint32_t* src)
+{
+ return *((const vector unsigned int *) src);
+}
+
+/* load 4 pixels from an unaligned address */
+static force_inline vector unsigned int
+load_128_unaligned (const uint32_t* src)
+{
+ vector unsigned int vsrc;
+ DECLARE_SRC_MASK_VAR;
+
+ COMPUTE_SHIFT_MASK (src);
+ LOAD_VECTOR (src);
+
+ return vsrc;
+}
+
+/* store 4 pixels to a 16-byte aligned address */
+static force_inline void
+save_128_aligned (uint32_t* data,
+ vector unsigned int vdata)
+{
+ STORE_VECTOR(data)
+}
+
+static force_inline vector unsigned int
+create_mask_32_128 (uint32_t mask)
+{
+ return (vector unsigned int) {mask, mask, mask, mask};
+}
+
+static force_inline vector unsigned int
+unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned char lo;
+
+ /* unpack to short */
+ lo = (vector unsigned char)
+#ifdef WORDS_BIGENDIAN
+ vec_mergel ((vector unsigned char) data2,
+ (vector unsigned char) data1);
+#else
+ vec_mergel ((vector unsigned char) data1,
+ (vector unsigned char) data2);
+#endif
+
+ return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned char hi;
+
+ /* unpack to short */
+ hi = (vector unsigned char)
+#ifdef WORDS_BIGENDIAN
+ vec_mergeh ((vector unsigned char) data2,
+ (vector unsigned char) data1);
+#else
+ vec_mergeh ((vector unsigned char) data1,
+ (vector unsigned char) data2);
+#endif
+
+ return (vector unsigned int) hi;
+}
+
+static force_inline vector unsigned int
+unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned short lo;
+
+ /* unpack to int */
+ lo = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+ vec_mergel ((vector unsigned short) data2,
+ (vector unsigned short) data1);
+#else
+ vec_mergel ((vector unsigned short) data1,
+ (vector unsigned short) data2);
+#endif
+
+ return (vector unsigned int) lo;
+}
+
+static force_inline vector unsigned int
+unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2)
+{
+ vector unsigned short hi;
+
+ /* unpack to int */
+ hi = (vector unsigned short)
+#ifdef WORDS_BIGENDIAN
+ vec_mergeh ((vector unsigned short) data2,
+ (vector unsigned short) data1);
+#else
+ vec_mergeh ((vector unsigned short) data1,
+ (vector unsigned short) data2);
+#endif
+
+ return (vector unsigned int) hi;
+}
+
+static force_inline void
+unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2,
+ vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+ *data_lo = unpacklo_128_16x8(data1, data2);
+ *data_hi = unpackhi_128_16x8(data1, data2);
+}
+
+static force_inline void
+unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2,
+ vector unsigned int* data_lo, vector unsigned int* data_hi)
+{
+ *data_lo = unpacklo_128_8x16(data1, data2);
+ *data_hi = unpackhi_128_8x16(data1, data2);
+}
+
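+/* Expand r5g6b5 to x8r8g8b8: shift each field to the top of its 8-bit
+ * slot, then replicate the field's high bits into the freed low bits so
+ * that 0x1f/0x3f widen to 0xff (5-bit red r4..r0 becomes r4..r0 r4 r3 r2). */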
+static force_inline vector unsigned int
+unpack_565_to_8888 (vector unsigned int lo)
+{
+ vector unsigned int r, g, b, rb, t;
+
+ r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red);
+ g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green);
+ b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue);
+
+ rb = vec_or (r, b);
+ t = vec_and (rb, mask_565_fix_rb);
+ t = vec_sr (t, create_mask_32_128(5));
+ rb = vec_or (rb, t);
+
+ t = vec_and (g, mask_565_fix_g);
+ t = vec_sr (t, create_mask_32_128(6));
+ g = vec_or (g, t);
+
+ return vec_or (rb, g);
+}
+
+static force_inline int
+is_opaque (vector unsigned int x)
+{
+ /* vec_all_eq () yields a single 0/1 predicate rather than the
+ * per-byte bitmask of the SSE2 movemask idiom, so test the alpha
+ * byte of every pixel directly. */
+ return vec_all_eq (vec_and (x, mask_ff000000), mask_ff000000);
+}
+
+static force_inline int
+is_zero (vector unsigned int x)
+{
+ return vec_all_eq (x, (vector unsigned int) AVV (0));
+}
+
+static force_inline int
+is_transparent (vector unsigned int x)
+{
+ /* Transparent: every alpha byte is zero; the color channels may
+ * hold anything. */
+ return vec_all_eq (vec_and (x, mask_ff000000),
+ (vector unsigned int) AVV (0));
+}
+
+static force_inline uint32_t
+core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
+{
+ uint32_t a;
+
+ a = ALPHA_8(src);
+
+ if (a == 0xff)
+ {
+ return src;
+ }
+ else if (src)
+ {
+ UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src);
+ }
+
+ return dst;
+}
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+ uint32_t s = *ps;
+
+ if (pm)
+ UN8x4_MUL_UN8(s, ALPHA_8(*pm));
+
+ return s;
+}
+
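+/* Vector counterpart of combine1 (): load four source pixels and, when
+ * a mask is present, scale them by the mask's alpha; a fully
+ * transparent mask short-circuits to zero. */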
+static force_inline vector unsigned int
+combine4 (const uint32_t* ps, const uint32_t* pm)
+{
+ vector unsigned int src, msk;
+
+ if (pm)
+ {
+ msk = load_128_unaligned(pm);
+
+ if (is_transparent(msk))
+ return (vector unsigned int) AVV(0);
+ }
+
+ src = load_128_unaligned(ps);
+
+ if (pm)
+ {
+ /* Match combine1 (): only the mask's alpha scales the source. */
+ src = pix_multiply (src, splat_alpha (msk));
+ }
+
+ return src;
+}
+
static void
vmx_combine_over_u_no_mask (uint32_t * dest,
const uint32_t *src,
@@ -177,7 +481,7 @@ vmx_combine_over_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -227,7 +531,8 @@ vmx_combine_over_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -298,7 +603,7 @@ vmx_combine_over_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -346,7 +651,8 @@ vmx_combine_over_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -414,7 +720,7 @@ vmx_combine_in_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -459,7 +765,8 @@ vmx_combine_in_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -524,7 +831,7 @@ vmx_combine_in_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -571,7 +878,8 @@ vmx_combine_in_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -638,7 +946,7 @@ vmx_combine_out_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -685,7 +993,8 @@ vmx_combine_out_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -750,7 +1059,7 @@ vmx_combine_out_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -798,7 +1107,8 @@ vmx_combine_out_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -865,7 +1175,7 @@ vmx_combine_atop_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -917,7 +1227,8 @@ vmx_combine_atop_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -993,7 +1304,7 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1045,7 +1356,8 @@ vmx_combine_atop_reverse_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1121,7 +1433,7 @@ vmx_combine_xor_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1173,7 +1485,8 @@ vmx_combine_xor_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1249,7 +1562,7 @@ vmx_combine_add_u_no_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc;
- vector unsigned char tmp1, tmp2, src_mask;
+ DECLARE_SRC_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1295,7 +1608,8 @@ vmx_combine_add_u_mask (uint32_t * dest,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1363,7 +1677,8 @@ vmx_combine_src_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1413,7 +1728,8 @@ vmx_combine_over_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1471,7 +1787,8 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1527,7 +1844,8 @@ vmx_combine_in_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1581,7 +1899,8 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1636,7 +1955,8 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1693,7 +2013,8 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1750,7 +2071,8 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask, vsrca;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1816,7 +2138,8 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1879,7 +2202,8 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1942,7 +2266,8 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
{
int i;
vector unsigned int vdest, vsrc, vmask;
- vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+ DECLARE_SRC_MASK_VAR;
+ DECLARE_MASK_MASK_VAR;
while (width && ((uintptr_t)dest & 15))
{
@@ -1986,16 +2311,803 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
}
}
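+/* Solid source OVER an a8 mask: scalar pixels until dst reaches 16-byte
+ * alignment, four pixels per vector iteration, then a scalar tail. */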
+static void
+vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m, d, s, ia;
+
+ vector unsigned int vsrc, valpha, vmask, vdst;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ srca = ALPHA_8(src);
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+ vsrc = (vector unsigned int) {src, src, src, src};
+ valpha = splat_alpha(vsrc);
+
+ while (height--)
+ {
+ const uint8_t *pm = mask_line;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (uintptr_t)dst & 15)
+ {
+ s = src;
+ m = *pm++;
+
+ if (m)
+ {
+ d = *dst;
+ UN8x4_MUL_UN8 (s, m);
+ ia = ALPHA_8 (~s);
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+ *dst = d;
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ m = *((uint32_t*)pm);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned(dst, vsrc);
+ }
+ else if (m)
+ {
+ vmask = splat_pixel((vector unsigned int) {m, m, m, m});
+
+ /* dst is 16-byte aligned */
+ vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst));
+
+ save_128_aligned(dst, vdst);
+ }
+
+ w -= 4;
+ dst += 4;
+ pm += 4;
+ }
+
+ while (w)
+ {
+ s = src;
+ m = *pm++;
+
+ if (m)
+ {
+ d = *dst;
+ UN8x4_MUL_UN8 (s, m);
+ ia = ALPHA_8 (~s);
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+ *dst = d;
+ }
+
+ w--;
+ dst++;
+ }
+ }
+
+}
+
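+/* Memset-style fill: 8- and 16-bit fillers are replicated up to 32 bits,
+ * and once dst is 16-byte aligned the main loop stores up to 128 bytes
+ * per iteration. */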
+static pixman_bool_t
+vmx_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t filler)
+{
+ uint32_t byte_width;
+ uint8_t *byte_line;
+
+ vector unsigned int vfiller;
+
+ if (bpp == 8)
+ {
+ uint8_t b;
+ uint16_t w;
+
+ stride = stride * (int) sizeof (uint32_t) / 1;
+ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+ byte_width = width;
+ stride *= 1;
+
+ b = filler & 0xff;
+ w = (b << 8) | b;
+ filler = (w << 16) | w;
+ }
+ else if (bpp == 16)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 2;
+ byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+ byte_width = 2 * width;
+ stride *= 2;
+
+ filler = (filler & 0xffff) * 0x00010001;
+ }
+ else if (bpp == 32)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 4;
+ byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+ byte_width = 4 * width;
+ stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
+
+ vfiller = create_mask_32_128(filler);
+
+ while (height--)
+ {
+ int w;
+ uint8_t *d = byte_line;
+ byte_line += stride;
+ w = byte_width;
+
+ if (w >= 1 && ((uintptr_t)d & 1))
+ {
+ *(uint8_t *)d = filler;
+ w -= 1;
+ d += 1;
+ }
+
+ while (w >= 2 && ((uintptr_t)d & 3))
+ {
+ *(uint16_t *)d = filler;
+ w -= 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((uintptr_t)d & 15))
+ {
+ *(uint32_t *)d = filler;
+
+ w -= 4;
+ d += 4;
+ }
+
+ while (w >= 128)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+ vec_st(vfiller, 0, (uint32_t *) d + 8);
+ vec_st(vfiller, 0, (uint32_t *) d + 12);
+ vec_st(vfiller, 0, (uint32_t *) d + 16);
+ vec_st(vfiller, 0, (uint32_t *) d + 20);
+ vec_st(vfiller, 0, (uint32_t *) d + 24);
+ vec_st(vfiller, 0, (uint32_t *) d + 28);
+
+ d += 128;
+ w -= 128;
+ }
+
+ if (w >= 64)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+ vec_st(vfiller, 0, (uint32_t *) d + 8);
+ vec_st(vfiller, 0, (uint32_t *) d + 12);
+
+ d += 64;
+ w -= 64;
+ }
+
+ if (w >= 32)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+ vec_st(vfiller, 0, (uint32_t *) d + 4);
+
+ d += 32;
+ w -= 32;
+ }
+
+ if (w >= 16)
+ {
+ vec_st(vfiller, 0, (uint32_t *) d);
+
+ d += 16;
+ w -= 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = filler;
+
+ w -= 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = filler;
+ w -= 2;
+ d += 2;
+ }
+
+ if (w >= 1)
+ {
+ *(uint8_t *)d = filler;
+ w -= 1;
+ d += 1;
+ }
+ }
+
+ return TRUE;
+}
+
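+/* x888 -> 8888 copy: OR 0xff000000 into each pixel to force an opaque
+ * alpha byte, 16 pixels per vector iteration. */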
+static void
+vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int32_t w;
+ int dst_stride, src_stride;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (uintptr_t)dst & 15)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4;
+
+ vmx_src1 = load_128_unaligned (src);
+ vmx_src2 = load_128_unaligned (src + 4);
+ vmx_src3 = load_128_unaligned (src + 8);
+ vmx_src4 = load_128_unaligned (src + 12);
+
+ save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
+ save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
+ save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
+ save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+ }
+}
+
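+/* Solid OVER: dest = src + dest * (1 - src.alpha); the negated alpha
+ * vector is computed once outside the loops. */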
+static void
+vmx_composite_over_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t src, ia;
+ int i, w, dst_stride;
+ vector unsigned int vdst, vsrc, via;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ vsrc = (vector unsigned int){src, src, src, src};
+ via = negate (splat_alpha (vsrc));
+ ia = ALPHA_8 (~src);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((uintptr_t)dst & 15))
+ {
+ uint32_t d = *dst;
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
+ *dst++ = d;
+ w--;
+ }
+
+ for (i = w / 4; i > 0; i--)
+ {
+ vdst = pix_multiply (load_128_aligned (dst), via);
+ save_128_aligned (dst, pix_add (vsrc, vdst));
+ dst += 4;
+ }
+
+ for (i = w % 4; --i >= 0;)
+ {
+ uint32_t d = dst[i];
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
+ dst[i] = d;
+ }
+ }
+}
+
+static void
+vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ int dst_stride, src_stride;
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ dst = dst_line;
+ src = src_line;
+
+ while (height--)
+ {
+ vmx_combine_over_u (imp, op, dst, src, NULL, width);
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
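+/* Component-alpha OVER with a solid source: each mask channel scales
+ * the matching source channel (see in_over). */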
+static void
+vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src, ia;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ vector unsigned int vsrc, valpha, vmask, vdest;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+ vsrc = (vector unsigned int) {src, src, src, src};
+ valpha = splat_alpha(vsrc);
+ ia = ALPHA_8 (src);
+
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+ uint32_t s;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (uintptr_t)pd & 15)
+ {
+ s = src;
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ UN8x4_MUL_UN8x4 (s, m);
+ UN8x4_MUL_UN8 (m, ia);
+ m = ~m;
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
+ *pd = d;
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ /* pm is NOT necessarily 16-byte aligned */
+ vmask = load_128_unaligned (pm);
+
+ pack_cmp = vec_all_eq(vmask, (vector unsigned int) AVV(0));
+
+ /* vec_all_eq () is nonzero only when the whole mask vector is
+ * zero, so pack_cmp == 0 means at least one mask component
+ * is set. */
+ if (pack_cmp == 0)
+ {
+ /* pd is 16-byte aligned */
+ vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd));
+
+ save_128_aligned(pd, vdest);
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ s = src;
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ UN8x4_MUL_UN8x4 (s, m);
+ UN8x4_MUL_UN8 (m, ia);
+ m = ~m;
+ UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
+ *pd = d;
+ }
+
+ pd++;
+ w--;
+ }
+ }
+}
+
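+/* a8 + a8 saturating add: the scalar head/tail computes
+ * t | (0 - (t >> 8)), which ORs in 0xff whenever the 16-bit sum
+ * overflows a byte; the aligned middle goes through vmx_combine_add_u
+ * four bytes at a time. */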
+static void
+vmx_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Small head */
+ while (w && (uintptr_t)dst & 3)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+
+ vmx_combine_add_u (imp, op,
+ (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+ /* Small tail */
+ dst += w & 0xfffc;
+ src += w & 0xfffc;
+
+ w &= 3;
+
+ while (w)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+ }
+}
+
+static void
+vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+
+ vmx_combine_add_u (imp, op, dst, src, NULL, width);
+ }
+}
+
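+/* Nearest-neighbour scaled OVER scanline: vx steps through the source
+ * in fixed-point units and is wrapped by src_width_fixed so it keeps
+ * addressing the repeated source; aligned runs of four destination
+ * pixels take the vector over () path. */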
+static force_inline void
+scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
+ const uint32_t* ps,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t src_width_fixed,
+ pixman_bool_t fully_transparent_src)
+{
+ uint32_t s, d;
+ const uint32_t* pm = NULL;
+
+ vector unsigned int vsrc, vdst;
+
+ if (fully_transparent_src)
+ return;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((uintptr_t)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ *pd++ = core_combine_over_u_pixel_vmx (s, d);
+ if (pm)
+ pm++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ uint32_t tmp[4];
+
+ tmp[0] = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp[1] = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp[2] = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+ tmp[3] = *(ps + pixman_fixed_to_int (vx));
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ vsrc = combine4 (tmp, pm);
+
+ if (is_opaque (vsrc))
+ {
+ save_128_aligned (pd, vsrc);
+ }
+ else if (!is_zero (vsrc))
+ {
+ vdst = over(vsrc, splat_alpha(vsrc), load_128_aligned (pd));
+
+ save_128_aligned (pd, vdst);
+ }
+
+ w -= 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+ vx += unit_x;
+ while (vx >= 0)
+ vx -= src_width_fixed;
+
+ *pd++ = core_combine_over_u_pixel_vmx (s, d);
+ if (pm)
+ pm++;
+
+ w--;
+ }
+}
+
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
+ scaled_nearest_scanline_vmx_8888_8888_OVER,
+ uint32_t, uint32_t, NORMAL)
+
static const pixman_fast_path_t vmx_fast_paths[] =
{
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, vmx_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, vmx_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
+
+ /* PIXMAN_OP_ADD */
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
+
+ /* PIXMAN_OP_SRC */
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
+
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),
+
{ PIXMAN_OP_NONE },
};
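+/* Iterator fetchers: each call converts one scanline into the narrow
+ * a8r8g8b8 format. */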
+static uint32_t *
+vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+ int w = iter->width;
+ vector unsigned int ff000000 = mask_ff000000;
+ uint32_t *dst = iter->buffer;
+ uint32_t *src = (uint32_t *)iter->bits;
+
+ iter->bits += iter->stride;
+
+ while (w && ((uintptr_t)dst) & 0x0f)
+ {
+ *dst++ = (*src++) | 0xff000000;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000));
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ *dst++ = (*src++) | 0xff000000;
+ w--;
+ }
+
+ return iter->buffer;
+}
+
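+/* a8 fetch: widen 16 mask bytes to 16 a8r8g8b8 pixels with the alpha in
+ * the top byte, using two rounds of zero-interleaving unpacks. */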
+static uint32_t *
+vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+ int w = iter->width;
+ uint32_t *dst = iter->buffer;
+ uint8_t *src = iter->bits;
+ vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
+
+ iter->bits += iter->stride;
+
+ while (w && (((uintptr_t)dst) & 15))
+ {
+ *dst++ = *(src++) << 24;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ vmx0 = load_128_unaligned((uint32_t *) src);
+
+ unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
+ unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
+ unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);
+
+ save_128_aligned(dst, vmx6);
+ save_128_aligned((dst + 4), vmx5);
+ save_128_aligned((dst + 8), vmx4);
+ save_128_aligned((dst + 12), vmx3);
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *(src++) << 24;
+ w--;
+ }
+
+ return iter->buffer;
+}
+
+#define IMAGE_FLAGS \
+ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
+ FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t vmx_iters[] =
+{
+ { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+ _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
+ },
+ { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+ _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
+ },
+ { PIXMAN_null },
+};
+
pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
+ /* VMX constants */
+ mask_ff000000 = create_mask_32_128 (0xff000000);
+ mask_red = create_mask_32_128 (0x00f80000);
+ mask_green = create_mask_32_128 (0x0000fc00);
+ mask_blue = create_mask_32_128 (0x000000f8);
+ mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
+ mask_565_fix_g = create_mask_32_128 (0x0000c000);
+
/* Set up function pointers */
imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
@@ -2022,5 +3134,9 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
+ imp->fill = vmx_fill;
+
+ imp->iter_info = vmx_iters;
+
return imp;
}
diff --git a/pixman/pixman-x86.c b/pixman/pixman-x86.c
index 05297c4..2f688eb 100644
--- a/pixman/pixman-x86.c
+++ b/pixman/pixman-x86.c
@@ -20,7 +20,7 @@
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -74,69 +74,17 @@ detect_cpu_features (void)
#else
-#define _PIXMAN_X86_64 \
- (defined(__amd64__) || defined(__x86_64__) || defined(_M_AMD64))
-
-static pixman_bool_t
-have_cpuid (void)
-{
-#if _PIXMAN_X86_64 || defined (_MSC_VER)
-
- return TRUE;
-
-#elif defined (__GNUC__)
- uint32_t result;
-
- __asm__ volatile (
- "pushf" "\n\t"
- "pop %%eax" "\n\t"
- "mov %%eax, %%ecx" "\n\t"
- "xor $0x00200000, %%eax" "\n\t"
- "push %%eax" "\n\t"
- "popf" "\n\t"
- "pushf" "\n\t"
- "pop %%eax" "\n\t"
- "xor %%ecx, %%eax" "\n\t"
- "mov %%eax, %0" "\n\t"
- : "=r" (result)
- :
- : "%eax", "%ecx");
-
- return !!result;
-
-#else
-#error "Unknown compiler"
+#if defined (__GNUC__)
+#include <cpuid.h>
#endif
-}
static void
pixman_cpuid (uint32_t feature,
uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
#if defined (__GNUC__)
-
-#if _PIXMAN_X86_64
- __asm__ volatile (
- "cpuid" "\n\t"
- : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
- : "a" (feature));
-#else
- /* On x86-32 we need to be careful about the handling of %ebx
- * and %esp. We can't declare either one as clobbered
- * since they are special registers (%ebx is the "PIC
- * register" holding an offset to global data, %esp the
- * stack pointer), so we need to make sure that %ebx is
- * preserved, and that %esp has its original value when
- * accessing the output operands.
- */
- __asm__ volatile (
- "xchg %%ebx, %1" "\n\t"
- "cpuid" "\n\t"
- "xchg %%ebx, %1" "\n\t"
- : "=a" (*a), "=r" (*b), "=c" (*c), "=d" (*d)
- : "a" (feature));
-#endif
-
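+ /* __get_cpuid () returns 0 and leaves the outputs untouched when
+ * the leaf is unsupported, so clear them first. */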
+ *a = *b = *c = *d = 0;
+ __get_cpuid(feature, a, b, c, d);
#elif defined (_MSC_VER)
int info[4];
@@ -157,9 +105,6 @@ detect_cpu_features (void)
uint32_t a, b, c, d;
cpu_features_t features = 0;
- if (!have_cpuid())
- return features;
-
/* Get feature bits */
pixman_cpuid (0x01, &a, &b, &c, &d);
if (d & (1 << 15))
@@ -187,6 +132,7 @@ detect_cpu_features (void)
memcpy (vendor + 8, &c, 4);
if (strcmp (vendor, "AuthenticAMD") == 0 ||
+ strcmp (vendor, "HygonGenuine") == 0 ||
strcmp (vendor, "Geode by NSC") == 0)
{
pixman_cpuid (0x80000000, &a, &b, &c, &d);
diff --git a/pixman/pixman.c b/pixman/pixman.c
index 9555cea..82ec236 100644
--- a/pixman/pixman.c
+++ b/pixman/pixman.c
@@ -24,7 +24,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
@@ -182,7 +182,7 @@ clip_general_image (pixman_region32_t * region,
return FALSE;
}
}
- else if (!pixman_region32_not_empty (clip))
+ else if (pixman_region32_empty (clip))
{
return FALSE;
}
@@ -277,7 +277,7 @@ _pixman_compute_composite_region32 (pixman_region32_t * region,
{
return FALSE;
}
- if (!pixman_region32_not_empty (region))
+ if (pixman_region32_empty (region))
return FALSE;
if (dest_image->common.alpha_map->common.have_clip_region)
{
@@ -325,18 +325,20 @@ _pixman_compute_composite_region32 (pixman_region32_t * region,
return TRUE;
}
-typedef struct
+typedef struct box_48_16 box_48_16_t;
+
+struct box_48_16
{
- pixman_fixed_48_16_t x1;
- pixman_fixed_48_16_t y1;
- pixman_fixed_48_16_t x2;
- pixman_fixed_48_16_t y2;
-} box_48_16_t;
+ pixman_fixed_48_16_t x1;
+ pixman_fixed_48_16_t y1;
+ pixman_fixed_48_16_t x2;
+ pixman_fixed_48_16_t y2;
+};
static pixman_bool_t
-compute_transformed_extents (pixman_transform_t *transform,
+compute_transformed_extents (pixman_transform_t *transform,
const pixman_box32_t *extents,
- box_48_16_t *transformed)
+ box_48_16_t *transformed)
{
pixman_fixed_48_16_t tx1, ty1, tx2, ty2;
pixman_fixed_t x1, y1, x2, y2;
@@ -495,21 +497,12 @@ analyze_extent (pixman_image_t *image,
if (!compute_transformed_extents (transform, extents, &transformed))
return FALSE;
- /* Expand the source area by a tiny bit so account of different rounding that
- * may happen during sampling. Note that (8 * pixman_fixed_e) is very far from
- * 0.5 so this won't cause the area computed to be overly pessimistic.
- */
- transformed.x1 -= 8 * pixman_fixed_e;
- transformed.y1 -= 8 * pixman_fixed_e;
- transformed.x2 += 8 * pixman_fixed_e;
- transformed.y2 += 8 * pixman_fixed_e;
-
if (image->common.type == BITS)
{
- if (pixman_fixed_to_int (transformed.x1) >= 0 &&
- pixman_fixed_to_int (transformed.y1) >= 0 &&
- pixman_fixed_to_int (transformed.x2) < image->bits.width &&
- pixman_fixed_to_int (transformed.y2) < image->bits.height)
+ if (pixman_fixed_to_int (transformed.x1 - pixman_fixed_e) >= 0 &&
+ pixman_fixed_to_int (transformed.y1 - pixman_fixed_e) >= 0 &&
+ pixman_fixed_to_int (transformed.x2 - pixman_fixed_e) < image->bits.width &&
+ pixman_fixed_to_int (transformed.y2 - pixman_fixed_e) < image->bits.height)
{
*flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
}
@@ -784,6 +777,11 @@ color_to_pixel (const pixman_color_t *color,
{
uint32_t c = color_to_uint32 (color);
+ if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_RGBA_FLOAT)
+ {
+ return FALSE;
+ }
+
if (!(format == PIXMAN_a8r8g8b8 ||
format == PIXMAN_x8r8g8b8 ||
format == PIXMAN_a8b8g8r8 ||
@@ -1022,6 +1020,7 @@ pixman_format_supported_source (pixman_format_code_t format)
case PIXMAN_x2r10g10b10:
case PIXMAN_a8r8g8b8:
case PIXMAN_a8r8g8b8_sRGB:
+ case PIXMAN_r8g8b8_sRGB:
case PIXMAN_x8r8g8b8:
case PIXMAN_a8b8g8r8:
case PIXMAN_x8b8g8r8:
diff --git a/pixman/pixman.h b/pixman/pixman.h
index 509ba5e..d697b53 100644
--- a/pixman/pixman.h
+++ b/pixman/pixman.h
@@ -127,7 +127,7 @@ typedef pixman_fixed_16_16_t pixman_fixed_t;
#define pixman_fixed_1_minus_e (pixman_fixed_1 - pixman_fixed_e)
#define pixman_fixed_minus_1 (pixman_int_to_fixed(-1))
#define pixman_fixed_to_int(f) ((int) ((f) >> 16))
-#define pixman_int_to_fixed(i) ((pixman_fixed_t) ((i) << 16))
+#define pixman_int_to_fixed(i) ((pixman_fixed_t) ((uint32_t) (i) << 16))
#define pixman_fixed_to_double(f) (double) ((f) / (double) pixman_fixed_1)
#define pixman_double_to_fixed(d) ((pixman_fixed_t) ((d) * 65536.0))
#define pixman_fixed_frac(f) ((f) & pixman_fixed_1_minus_e)
@@ -184,42 +184,73 @@ struct pixman_transform
struct pixman_box16;
typedef union pixman_image pixman_image_t;
+PIXMAN_API
void pixman_transform_init_identity (struct pixman_transform *matrix);
+
+PIXMAN_API
pixman_bool_t pixman_transform_point_3d (const struct pixman_transform *transform,
struct pixman_vector *vector);
+
+PIXMAN_API
pixman_bool_t pixman_transform_point (const struct pixman_transform *transform,
struct pixman_vector *vector);
+
+PIXMAN_API
pixman_bool_t pixman_transform_multiply (struct pixman_transform *dst,
const struct pixman_transform *l,
const struct pixman_transform *r);
+
+PIXMAN_API
void pixman_transform_init_scale (struct pixman_transform *t,
pixman_fixed_t sx,
pixman_fixed_t sy);
+
+PIXMAN_API
pixman_bool_t pixman_transform_scale (struct pixman_transform *forward,
struct pixman_transform *reverse,
pixman_fixed_t sx,
pixman_fixed_t sy);
+
+PIXMAN_API
void pixman_transform_init_rotate (struct pixman_transform *t,
pixman_fixed_t cos,
pixman_fixed_t sin);
+
+PIXMAN_API
pixman_bool_t pixman_transform_rotate (struct pixman_transform *forward,
struct pixman_transform *reverse,
pixman_fixed_t c,
pixman_fixed_t s);
+
+PIXMAN_API
void pixman_transform_init_translate (struct pixman_transform *t,
pixman_fixed_t tx,
pixman_fixed_t ty);
+
+PIXMAN_API
pixman_bool_t pixman_transform_translate (struct pixman_transform *forward,
struct pixman_transform *reverse,
pixman_fixed_t tx,
pixman_fixed_t ty);
+
+PIXMAN_API
pixman_bool_t pixman_transform_bounds (const struct pixman_transform *matrix,
struct pixman_box16 *b);
+
+PIXMAN_API
pixman_bool_t pixman_transform_invert (struct pixman_transform *dst,
const struct pixman_transform *src);
+
+PIXMAN_API
pixman_bool_t pixman_transform_is_identity (const struct pixman_transform *t);
+
+PIXMAN_API
pixman_bool_t pixman_transform_is_scale (const struct pixman_transform *t);
+
+PIXMAN_API
pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t);
+
+PIXMAN_API
pixman_bool_t pixman_transform_is_inverse (const struct pixman_transform *a,
const struct pixman_transform *b);
@@ -239,42 +270,70 @@ struct pixman_f_transform
double m[3][3];
};
+
+PIXMAN_API
pixman_bool_t pixman_transform_from_pixman_f_transform (struct pixman_transform *t,
const struct pixman_f_transform *ft);
+
+PIXMAN_API
void pixman_f_transform_from_pixman_transform (struct pixman_f_transform *ft,
const struct pixman_transform *t);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_invert (struct pixman_f_transform *dst,
const struct pixman_f_transform *src);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_point (const struct pixman_f_transform *t,
struct pixman_f_vector *v);
+
+PIXMAN_API
void pixman_f_transform_point_3d (const struct pixman_f_transform *t,
struct pixman_f_vector *v);
+
+PIXMAN_API
void pixman_f_transform_multiply (struct pixman_f_transform *dst,
const struct pixman_f_transform *l,
const struct pixman_f_transform *r);
+
+PIXMAN_API
void pixman_f_transform_init_scale (struct pixman_f_transform *t,
double sx,
double sy);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_scale (struct pixman_f_transform *forward,
struct pixman_f_transform *reverse,
double sx,
double sy);
+
+PIXMAN_API
void pixman_f_transform_init_rotate (struct pixman_f_transform *t,
double cos,
double sin);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_rotate (struct pixman_f_transform *forward,
struct pixman_f_transform *reverse,
double c,
double s);
+
+PIXMAN_API
void pixman_f_transform_init_translate (struct pixman_f_transform *t,
double tx,
double ty);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_translate (struct pixman_f_transform *forward,
struct pixman_f_transform *reverse,
double tx,
double ty);
+
+PIXMAN_API
pixman_bool_t pixman_f_transform_bounds (const struct pixman_f_transform *t,
struct pixman_box16 *b);
+
+PIXMAN_API
void pixman_f_transform_init_identity (struct pixman_f_transform *t);
typedef enum
@@ -287,6 +346,16 @@ typedef enum
typedef enum
{
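+ /* NONE disables dithering; FAST, GOOD and BEST select increasing
+ * quality, and the ORDERED_* values request a specific matrix. */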
+ PIXMAN_DITHER_NONE,
+ PIXMAN_DITHER_FAST,
+ PIXMAN_DITHER_GOOD,
+ PIXMAN_DITHER_BEST,
+ PIXMAN_DITHER_ORDERED_BAYER_8,
+ PIXMAN_DITHER_ORDERED_BLUE_NOISE_64,
+} pixman_dither_t;
+
+typedef enum
+{
PIXMAN_FILTER_FAST,
PIXMAN_FILTER_GOOD,
PIXMAN_FILTER_BEST,
@@ -423,73 +492,123 @@ typedef enum
/* This function exists only to make it possible to preserve
* the X ABI - it should go away at first opportunity.
*/
+PIXMAN_API
void pixman_region_set_static_pointers (pixman_box16_t *empty_box,
pixman_region16_data_t *empty_data,
pixman_region16_data_t *broken_data);
/* creation/destruction */
+PIXMAN_API
void pixman_region_init (pixman_region16_t *region);
+
+PIXMAN_API
void pixman_region_init_rect (pixman_region16_t *region,
int x,
int y,
unsigned int width,
unsigned int height);
+
+PIXMAN_API
pixman_bool_t pixman_region_init_rects (pixman_region16_t *region,
const pixman_box16_t *boxes,
int count);
-void pixman_region_init_with_extents (pixman_region16_t *region,
- pixman_box16_t *extents);
+
+PIXMAN_API
+void pixman_region_init_with_extents (pixman_region16_t *region,
+ const pixman_box16_t *extents);
+
+PIXMAN_API
void pixman_region_init_from_image (pixman_region16_t *region,
pixman_image_t *image);
+
+PIXMAN_API
void pixman_region_fini (pixman_region16_t *region);
/* manipulation */
+PIXMAN_API
void pixman_region_translate (pixman_region16_t *region,
int x,
int y);
-pixman_bool_t pixman_region_copy (pixman_region16_t *dest,
- pixman_region16_t *source);
-pixman_bool_t pixman_region_intersect (pixman_region16_t *new_reg,
- pixman_region16_t *reg1,
- pixman_region16_t *reg2);
-pixman_bool_t pixman_region_union (pixman_region16_t *new_reg,
- pixman_region16_t *reg1,
- pixman_region16_t *reg2);
-pixman_bool_t pixman_region_union_rect (pixman_region16_t *dest,
- pixman_region16_t *source,
- int x,
- int y,
- unsigned int width,
- unsigned int height);
-pixman_bool_t pixman_region_intersect_rect (pixman_region16_t *dest,
- pixman_region16_t *source,
- int x,
- int y,
- unsigned int width,
- unsigned int height);
-pixman_bool_t pixman_region_subtract (pixman_region16_t *reg_d,
- pixman_region16_t *reg_m,
- pixman_region16_t *reg_s);
-pixman_bool_t pixman_region_inverse (pixman_region16_t *new_reg,
- pixman_region16_t *reg1,
- pixman_box16_t *inv_rect);
-pixman_bool_t pixman_region_contains_point (pixman_region16_t *region,
- int x,
- int y,
- pixman_box16_t *box);
-pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region,
- pixman_box16_t *prect);
-pixman_bool_t pixman_region_not_empty (pixman_region16_t *region);
-pixman_box16_t * pixman_region_extents (pixman_region16_t *region);
-int pixman_region_n_rects (pixman_region16_t *region);
-pixman_box16_t * pixman_region_rectangles (pixman_region16_t *region,
- int *n_rects);
-pixman_bool_t pixman_region_equal (pixman_region16_t *region1,
- pixman_region16_t *region2);
+
+PIXMAN_API
+pixman_bool_t pixman_region_copy (pixman_region16_t *dest,
+ const pixman_region16_t *source);
+
+PIXMAN_API
+pixman_bool_t pixman_region_intersect (pixman_region16_t *new_reg,
+ const pixman_region16_t *reg1,
+ const pixman_region16_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region_union (pixman_region16_t *new_reg,
+ const pixman_region16_t *reg1,
+ const pixman_region16_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region_union_rect (pixman_region16_t *dest,
+ const pixman_region16_t *source,
+ int x,
+ int y,
+ unsigned int width,
+ unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region_intersect_rect (pixman_region16_t *dest,
+ const pixman_region16_t *source,
+ int x,
+ int y,
+ unsigned int width,
+ unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region_subtract (pixman_region16_t *reg_d,
+ const pixman_region16_t *reg_m,
+ const pixman_region16_t *reg_s);
+
+PIXMAN_API
+pixman_bool_t pixman_region_inverse (pixman_region16_t *new_reg,
+ const pixman_region16_t *reg1,
+ const pixman_box16_t *inv_rect);
+
+PIXMAN_API
+pixman_bool_t pixman_region_contains_point (const pixman_region16_t *region,
+ int x,
+ int y,
+ pixman_box16_t *box);
+
+PIXMAN_API
+pixman_region_overlap_t pixman_region_contains_rectangle (const pixman_region16_t *region,
+ const pixman_box16_t *prect);
+
+PIXMAN_API
+pixman_bool_t pixman_region_empty (const pixman_region16_t *region);
+
+PIXMAN_API
+pixman_bool_t pixman_region_not_empty (const pixman_region16_t *region);
+
+PIXMAN_API
+pixman_box16_t * pixman_region_extents (const pixman_region16_t *region);
+
+PIXMAN_API
+int pixman_region_n_rects (const pixman_region16_t *region);
+
+PIXMAN_API
+pixman_box16_t * pixman_region_rectangles (const pixman_region16_t *region,
+ int *n_rects);
+
+PIXMAN_API
+pixman_bool_t pixman_region_equal (const pixman_region16_t *region1,
+ const pixman_region16_t *region2);
+
+PIXMAN_API
pixman_bool_t pixman_region_selfcheck (pixman_region16_t *region);
-void pixman_region_reset (pixman_region16_t *region,
- pixman_box16_t *box);
+
+PIXMAN_API
+void pixman_region_reset (pixman_region16_t *region,
+ const pixman_box16_t *box);
+
+PIXMAN_API
void pixman_region_clear (pixman_region16_t *region);
/*
* 32 bit regions
@@ -523,72 +642,122 @@ struct pixman_region32
};
/* creation/destruction */
+PIXMAN_API
void pixman_region32_init (pixman_region32_t *region);
+
+PIXMAN_API
void pixman_region32_init_rect (pixman_region32_t *region,
int x,
int y,
unsigned int width,
unsigned int height);
+
+PIXMAN_API
pixman_bool_t pixman_region32_init_rects (pixman_region32_t *region,
const pixman_box32_t *boxes,
int count);
-void pixman_region32_init_with_extents (pixman_region32_t *region,
- pixman_box32_t *extents);
+
+PIXMAN_API
+void pixman_region32_init_with_extents (pixman_region32_t *region,
+ const pixman_box32_t *extents);
+
+PIXMAN_API
void pixman_region32_init_from_image (pixman_region32_t *region,
pixman_image_t *image);
+
+PIXMAN_API
void pixman_region32_fini (pixman_region32_t *region);
/* manipulation */
+PIXMAN_API
void pixman_region32_translate (pixman_region32_t *region,
int x,
int y);
-pixman_bool_t pixman_region32_copy (pixman_region32_t *dest,
- pixman_region32_t *source);
-pixman_bool_t pixman_region32_intersect (pixman_region32_t *new_reg,
- pixman_region32_t *reg1,
- pixman_region32_t *reg2);
-pixman_bool_t pixman_region32_union (pixman_region32_t *new_reg,
- pixman_region32_t *reg1,
- pixman_region32_t *reg2);
-pixman_bool_t pixman_region32_intersect_rect (pixman_region32_t *dest,
- pixman_region32_t *source,
- int x,
- int y,
- unsigned int width,
- unsigned int height);
-pixman_bool_t pixman_region32_union_rect (pixman_region32_t *dest,
- pixman_region32_t *source,
- int x,
- int y,
- unsigned int width,
- unsigned int height);
-pixman_bool_t pixman_region32_subtract (pixman_region32_t *reg_d,
- pixman_region32_t *reg_m,
- pixman_region32_t *reg_s);
-pixman_bool_t pixman_region32_inverse (pixman_region32_t *new_reg,
- pixman_region32_t *reg1,
- pixman_box32_t *inv_rect);
-pixman_bool_t pixman_region32_contains_point (pixman_region32_t *region,
- int x,
- int y,
- pixman_box32_t *box);
-pixman_region_overlap_t pixman_region32_contains_rectangle (pixman_region32_t *region,
- pixman_box32_t *prect);
-pixman_bool_t pixman_region32_not_empty (pixman_region32_t *region);
-pixman_box32_t * pixman_region32_extents (pixman_region32_t *region);
-int pixman_region32_n_rects (pixman_region32_t *region);
-pixman_box32_t * pixman_region32_rectangles (pixman_region32_t *region,
- int *n_rects);
-pixman_bool_t pixman_region32_equal (pixman_region32_t *region1,
- pixman_region32_t *region2);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_copy (pixman_region32_t *dest,
+ const pixman_region32_t *source);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_intersect (pixman_region32_t *new_reg,
+ const pixman_region32_t *reg1,
+ const pixman_region32_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_union (pixman_region32_t *new_reg,
+ const pixman_region32_t *reg1,
+ const pixman_region32_t *reg2);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_intersect_rect (pixman_region32_t *dest,
+ const pixman_region32_t *source,
+ int x,
+ int y,
+ unsigned int width,
+ unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_union_rect (pixman_region32_t *dest,
+ const pixman_region32_t *source,
+ int x,
+ int y,
+ unsigned int width,
+ unsigned int height);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_subtract (pixman_region32_t *reg_d,
+ const pixman_region32_t *reg_m,
+ const pixman_region32_t *reg_s);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_inverse (pixman_region32_t *new_reg,
+ const pixman_region32_t *reg1,
+ const pixman_box32_t *inv_rect);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_contains_point (const pixman_region32_t *region,
+ int x,
+ int y,
+ pixman_box32_t *box);
+
+PIXMAN_API
+pixman_region_overlap_t pixman_region32_contains_rectangle (const pixman_region32_t *region,
+ const pixman_box32_t *prect);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_empty (const pixman_region32_t *region);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_not_empty (const pixman_region32_t *region);
+
+PIXMAN_API
+pixman_box32_t * pixman_region32_extents (const pixman_region32_t *region);
+
+PIXMAN_API
+int pixman_region32_n_rects (const pixman_region32_t *region);
+
+PIXMAN_API
+pixman_box32_t * pixman_region32_rectangles (const pixman_region32_t *region,
+ int *n_rects);
+
+PIXMAN_API
+pixman_bool_t pixman_region32_equal (const pixman_region32_t *region1,
+ const pixman_region32_t *region2);
+
+PIXMAN_API
pixman_bool_t pixman_region32_selfcheck (pixman_region32_t *region);
-void pixman_region32_reset (pixman_region32_t *region,
- pixman_box32_t *box);
+
+PIXMAN_API
+void pixman_region32_reset (pixman_region32_t *region,
+ const pixman_box32_t *box);
+
+PIXMAN_API
void pixman_region32_clear (pixman_region32_t *region);
/* Copy / Fill / Misc */
+PIXMAN_API
pixman_bool_t pixman_blt (uint32_t *src_bits,
uint32_t *dst_bits,
int src_stride,
@@ -601,6 +770,8 @@ pixman_bool_t pixman_blt (uint32_t *src_bits,
int dest_y,
int width,
int height);
+
+PIXMAN_API
pixman_bool_t pixman_fill (uint32_t *bits,
int stride,
int bpp,
@@ -610,7 +781,11 @@ pixman_bool_t pixman_fill (uint32_t *bits,
int height,
uint32_t _xor);
+
+PIXMAN_API
int pixman_version (void);
+
+PIXMAN_API
const char* pixman_version_string (void);
/*
@@ -654,12 +829,24 @@ struct pixman_indexed
((g) << 4) | \
((b)))
-#define PIXMAN_FORMAT_BPP(f) (((f) >> 24) )
-#define PIXMAN_FORMAT_TYPE(f) (((f) >> 16) & 0xff)
-#define PIXMAN_FORMAT_A(f) (((f) >> 12) & 0x0f)
-#define PIXMAN_FORMAT_R(f) (((f) >> 8) & 0x0f)
-#define PIXMAN_FORMAT_G(f) (((f) >> 4) & 0x0f)
-#define PIXMAN_FORMAT_B(f) (((f) ) & 0x0f)
+#define PIXMAN_FORMAT_BYTE(bpp,type,a,r,g,b) \
+ (((bpp >> 3) << 24) | \
+ (3 << 22) | ((type) << 16) | \
+ ((a >> 3) << 12) | \
+ ((r >> 3) << 8) | \
+ ((g >> 3) << 4) | \
+ ((b >> 3)))
+
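+/* PIXMAN_FORMAT_BYTE stores bpp and the channel widths divided by 8 and
+ * sets the 2-bit shift field (bits 22-23) to 3; PIXMAN_FORMAT_RESHIFT
+ * scales the stored value back up by that shift, leaving the classic
+ * PIXMAN_FORMAT encodings (shift 0) unchanged. */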
+#define PIXMAN_FORMAT_RESHIFT(val, ofs, num) \
+ (((val >> (ofs)) & ((1 << (num)) - 1)) << ((val >> 22) & 3))
+
+#define PIXMAN_FORMAT_BPP(f) PIXMAN_FORMAT_RESHIFT(f, 24, 8)
+#define PIXMAN_FORMAT_SHIFT(f) ((uint32_t)((f >> 22) & 3))
+#define PIXMAN_FORMAT_TYPE(f) (((f) >> 16) & 0x3f)
+#define PIXMAN_FORMAT_A(f) PIXMAN_FORMAT_RESHIFT(f, 12, 4)
+#define PIXMAN_FORMAT_R(f) PIXMAN_FORMAT_RESHIFT(f, 8, 4)
+#define PIXMAN_FORMAT_G(f) PIXMAN_FORMAT_RESHIFT(f, 4, 4)
+#define PIXMAN_FORMAT_B(f) PIXMAN_FORMAT_RESHIFT(f, 0, 4)
#define PIXMAN_FORMAT_RGB(f) (((f) ) & 0xfff)
#define PIXMAN_FORMAT_VIS(f) (((f) ) & 0xffff)
#define PIXMAN_FORMAT_DEPTH(f) (PIXMAN_FORMAT_A(f) + \
@@ -678,15 +865,22 @@ struct pixman_indexed
#define PIXMAN_TYPE_BGRA 8
#define PIXMAN_TYPE_RGBA 9
#define PIXMAN_TYPE_ARGB_SRGB 10
+#define PIXMAN_TYPE_RGBA_FLOAT 11
#define PIXMAN_FORMAT_COLOR(f) \
(PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB || \
PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR || \
PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA || \
- PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA)
+ PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA || \
+ PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA_FLOAT)
-/* 32bpp formats */
typedef enum {
+/* 128bpp formats */
+ PIXMAN_rgba_float = PIXMAN_FORMAT_BYTE(128,PIXMAN_TYPE_RGBA_FLOAT,32,32,32,32),
+/* 96bpp formats */
+ PIXMAN_rgb_float = PIXMAN_FORMAT_BYTE(96,PIXMAN_TYPE_RGBA_FLOAT,0,32,32,32),
+
+/* 32bpp formats */
PIXMAN_a8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
PIXMAN_x8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
PIXMAN_a8b8g8r8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8),
@@ -703,6 +897,7 @@ typedef enum {
/* sRGB formats */
PIXMAN_a8r8g8b8_sRGB = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB_SRGB,8,8,8,8),
+ PIXMAN_r8g8b8_sRGB = PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB_SRGB,0,8,8,8),
/* 24bpp formats */
PIXMAN_r8g8b8 = PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
@@ -757,30 +952,44 @@ typedef enum {
} pixman_format_code_t;
/* Querying supported format values. */
+PIXMAN_API
pixman_bool_t pixman_format_supported_destination (pixman_format_code_t format);
+
+PIXMAN_API
pixman_bool_t pixman_format_supported_source (pixman_format_code_t format);
/* Constructors */
+PIXMAN_API
pixman_image_t *pixman_image_create_solid_fill (const pixman_color_t *color);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_linear_gradient (const pixman_point_fixed_t *p1,
const pixman_point_fixed_t *p2,
const pixman_gradient_stop_t *stops,
int n_stops);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_radial_gradient (const pixman_point_fixed_t *inner,
const pixman_point_fixed_t *outer,
pixman_fixed_t inner_radius,
pixman_fixed_t outer_radius,
const pixman_gradient_stop_t *stops,
int n_stops);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_conical_gradient (const pixman_point_fixed_t *center,
pixman_fixed_t angle,
const pixman_gradient_stop_t *stops,
int n_stops);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_bits (pixman_format_code_t format,
int width,
int height,
uint32_t *bits,
int rowstride_bytes);
+
+PIXMAN_API
pixman_image_t *pixman_image_create_bits_no_clear (pixman_format_code_t format,
int width,
int height,
@@ -788,48 +997,99 @@ pixman_image_t *pixman_image_create_bits_no_clear (pixman_format_code_t forma
int rowstride_bytes);
/* Destructor */
+PIXMAN_API
pixman_image_t *pixman_image_ref (pixman_image_t *image);
+
+PIXMAN_API
pixman_bool_t pixman_image_unref (pixman_image_t *image);
+
+PIXMAN_API
void pixman_image_set_destroy_function (pixman_image_t *image,
pixman_image_destroy_func_t function,
void *data);
+
+PIXMAN_API
void * pixman_image_get_destroy_data (pixman_image_t *image);
/* Set properties */
+PIXMAN_API
pixman_bool_t pixman_image_set_clip_region (pixman_image_t *image,
- pixman_region16_t *region);
+ const pixman_region16_t *region);
+
+PIXMAN_API
pixman_bool_t pixman_image_set_clip_region32 (pixman_image_t *image,
- pixman_region32_t *region);
+ const pixman_region32_t *region);
+
+PIXMAN_API
void pixman_image_set_has_client_clip (pixman_image_t *image,
pixman_bool_t clien_clip);
+
+PIXMAN_API
pixman_bool_t pixman_image_set_transform (pixman_image_t *image,
const pixman_transform_t *transform);
+
+PIXMAN_API
void pixman_image_set_repeat (pixman_image_t *image,
pixman_repeat_t repeat);
+
+PIXMAN_API
+void pixman_image_set_dither (pixman_image_t *image,
+ pixman_dither_t dither);
+
+PIXMAN_API
+void pixman_image_set_dither_offset (pixman_image_t *image,
+ int offset_x,
+ int offset_y);
+
+PIXMAN_API
pixman_bool_t pixman_image_set_filter (pixman_image_t *image,
pixman_filter_t filter,
const pixman_fixed_t *filter_params,
int n_filter_params);
+
+PIXMAN_API
void pixman_image_set_source_clipping (pixman_image_t *image,
pixman_bool_t source_clipping);
+
+PIXMAN_API
void pixman_image_set_alpha_map (pixman_image_t *image,
pixman_image_t *alpha_map,
int16_t x,
int16_t y);
+
+PIXMAN_API
void pixman_image_set_component_alpha (pixman_image_t *image,
pixman_bool_t component_alpha);
+
+PIXMAN_API
pixman_bool_t pixman_image_get_component_alpha (pixman_image_t *image);
+
+PIXMAN_API
void pixman_image_set_accessors (pixman_image_t *image,
pixman_read_memory_func_t read_func,
pixman_write_memory_func_t write_func);
+
+PIXMAN_API
void pixman_image_set_indexed (pixman_image_t *image,
const pixman_indexed_t *indexed);
+
+PIXMAN_API
uint32_t *pixman_image_get_data (pixman_image_t *image);
+
+PIXMAN_API
int pixman_image_get_width (pixman_image_t *image);
+
+PIXMAN_API
int pixman_image_get_height (pixman_image_t *image);
+
+PIXMAN_API
int pixman_image_get_stride (pixman_image_t *image); /* in bytes */
+
+PIXMAN_API
int pixman_image_get_depth (pixman_image_t *image);
+
+PIXMAN_API
pixman_format_code_t pixman_image_get_format (pixman_image_t *image);
typedef enum
@@ -847,6 +1107,7 @@ typedef enum
/* Create the parameter list for a SEPARABLE_CONVOLUTION filter
* with the given kernels and scale parameters.
*/
+PIXMAN_API
pixman_fixed_t *
pixman_filter_create_separable_convolution (int *n_values,
pixman_fixed_t scale_x,
@@ -858,11 +1119,15 @@ pixman_filter_create_separable_convolution (int *n_values,
int subsample_bits_x,
int subsample_bits_y);
+
+PIXMAN_API
pixman_bool_t pixman_image_fill_rectangles (pixman_op_t op,
pixman_image_t *image,
const pixman_color_t *color,
int n_rects,
const pixman_rectangle16_t *rects);
+
+PIXMAN_API
pixman_bool_t pixman_image_fill_boxes (pixman_op_t op,
pixman_image_t *dest,
const pixman_color_t *color,
@@ -870,6 +1135,7 @@ pixman_bool_t pixman_image_fill_boxes (pixman_op_t
const pixman_box32_t *boxes);
/* Composite */
+PIXMAN_API
pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
pixman_image_t *src_image,
pixman_image_t *mask_image,
@@ -882,6 +1148,8 @@ pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
int16_t dest_y,
uint16_t width,
uint16_t height);
+
+PIXMAN_API
void pixman_image_composite (pixman_op_t op,
pixman_image_t *src,
pixman_image_t *mask,
@@ -894,6 +1162,8 @@ void pixman_image_composite (pixman_op_t op,
int16_t dest_y,
uint16_t width,
uint16_t height);
+
+PIXMAN_API
void pixman_image_composite32 (pixman_op_t op,
pixman_image_t *src,
pixman_image_t *mask,
@@ -925,6 +1195,7 @@ void pixman_image_composite32 (pixman_op_t op,
* Since 0.21.2, pixman doesn't do these workarounds anymore, so now this
* function is a no-op.
*/
+PIXMAN_API
void pixman_disable_out_of_bounds_workaround (void);
/*
@@ -937,29 +1208,48 @@ typedef struct
const void *glyph;
} pixman_glyph_t;
+PIXMAN_API
pixman_glyph_cache_t *pixman_glyph_cache_create (void);
+
+PIXMAN_API
void pixman_glyph_cache_destroy (pixman_glyph_cache_t *cache);
+
+PIXMAN_API
void pixman_glyph_cache_freeze (pixman_glyph_cache_t *cache);
+
+PIXMAN_API
void pixman_glyph_cache_thaw (pixman_glyph_cache_t *cache);
+
+PIXMAN_API
const void * pixman_glyph_cache_lookup (pixman_glyph_cache_t *cache,
void *font_key,
void *glyph_key);
+
+PIXMAN_API
const void * pixman_glyph_cache_insert (pixman_glyph_cache_t *cache,
void *font_key,
void *glyph_key,
int origin_x,
int origin_y,
pixman_image_t *glyph_image);
+
+PIXMAN_API
void pixman_glyph_cache_remove (pixman_glyph_cache_t *cache,
void *font_key,
void *glyph_key);
+
+PIXMAN_API
void pixman_glyph_get_extents (pixman_glyph_cache_t *cache,
int n_glyphs,
pixman_glyph_t *glyphs,
pixman_box32_t *extents);
+
+PIXMAN_API
pixman_format_code_t pixman_glyph_get_mask_format (pixman_glyph_cache_t *cache,
int n_glyphs,
const pixman_glyph_t *glyphs);
+
+PIXMAN_API
void pixman_composite_glyphs (pixman_op_t op,
pixman_image_t *src,
pixman_image_t *dest,
@@ -975,6 +1265,8 @@ void pixman_composite_glyphs (pixman_op_t op,
pixman_glyph_cache_t *cache,
int n_glyphs,
const pixman_glyph_t *glyphs);
+
+PIXMAN_API
void pixman_composite_glyphs_no_mask (pixman_op_t op,
pixman_image_t *src,
pixman_image_t *dest,
@@ -1042,12 +1334,19 @@ struct pixman_trap
pixman_span_fix_t top, bot;
};
+PIXMAN_API
pixman_fixed_t pixman_sample_ceil_y (pixman_fixed_t y,
int bpp);
+
+PIXMAN_API
pixman_fixed_t pixman_sample_floor_y (pixman_fixed_t y,
int bpp);
+
+PIXMAN_API
void pixman_edge_step (pixman_edge_t *e,
int n);
+
+PIXMAN_API
void pixman_edge_init (pixman_edge_t *e,
int bpp,
pixman_fixed_t y_start,
@@ -1055,31 +1354,43 @@ void pixman_edge_init (pixman_edge_t *e,
pixman_fixed_t y_top,
pixman_fixed_t x_bot,
pixman_fixed_t y_bot);
+
+PIXMAN_API
void pixman_line_fixed_edge_init (pixman_edge_t *e,
int bpp,
pixman_fixed_t y,
const pixman_line_fixed_t *line,
int x_off,
int y_off);
+
+PIXMAN_API
void pixman_rasterize_edges (pixman_image_t *image,
pixman_edge_t *l,
pixman_edge_t *r,
pixman_fixed_t t,
pixman_fixed_t b);
+
+PIXMAN_API
void pixman_add_traps (pixman_image_t *image,
int16_t x_off,
int16_t y_off,
int ntrap,
const pixman_trap_t *traps);
+
+PIXMAN_API
void pixman_add_trapezoids (pixman_image_t *image,
int16_t x_off,
int y_off,
int ntraps,
const pixman_trapezoid_t *traps);
+
+PIXMAN_API
void pixman_rasterize_trapezoid (pixman_image_t *image,
const pixman_trapezoid_t *trap,
int x_off,
int y_off);
+
+PIXMAN_API
void pixman_composite_trapezoids (pixman_op_t op,
pixman_image_t * src,
pixman_image_t * dst,
@@ -1090,6 +1401,8 @@ void pixman_composite_trapezoids (pixman_op_t op,
int y_dst,
int n_traps,
const pixman_trapezoid_t * traps);
+
+PIXMAN_API
void pixman_composite_triangles (pixman_op_t op,
pixman_image_t * src,
pixman_image_t * dst,
@@ -1100,6 +1413,8 @@ void pixman_composite_triangles (pixman_op_t op,
int y_dst,
int n_tris,
const pixman_triangle_t * tris);
+
+PIXMAN_API
void pixman_add_triangles (pixman_image_t *image,
int32_t x_off,
int32_t y_off,
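
A worked example helps with the widened format encoding above: PIXMAN_FORMAT_BYTE
stores each channel width divided by 8 and sets bits 22-23 to 3, and
PIXMAN_FORMAT_RESHIFT shifts every decoded field left by that amount, so 128bpp
formats fit the same 32-bit code as the classic formats (whose shift field is
zero and which therefore decode unchanged). A minimal standalone sketch -- the
macro bodies are copied from the hunk above, the driver and asserts are
illustrative only:

    #include <assert.h>
    #include <stdint.h>

    #define PIXMAN_TYPE_ARGB       2
    #define PIXMAN_TYPE_RGBA_FLOAT 11

    /* Classic encoding: widths stored verbatim, shift field left at 0. */
    #define PIXMAN_FORMAT(bpp,type,a,r,g,b)                 \
        (((bpp) << 24) | ((type) << 16) | ((a) << 12) |     \
         ((r) << 8) | ((g) << 4) | (b))

    /* New encoding: widths stored in bytes, decode shift set to 3. */
    #define PIXMAN_FORMAT_BYTE(bpp,type,a,r,g,b)    \
        (((bpp >> 3) << 24) |                       \
         (3 << 22) | ((type) << 16) |               \
         ((a >> 3) << 12) |                         \
         ((r >> 3) << 8) |                          \
         ((g >> 3) << 4) |                          \
         ((b >> 3)))

    #define PIXMAN_FORMAT_RESHIFT(val, ofs, num) \
        (((val >> (ofs)) & ((1 << (num)) - 1)) << ((val >> 22) & 3))

    #define PIXMAN_FORMAT_BPP(f) PIXMAN_FORMAT_RESHIFT(f, 24, 8)
    #define PIXMAN_FORMAT_A(f)   PIXMAN_FORMAT_RESHIFT(f, 12, 4)

    int
    main (void)
    {
        uint32_t argb32     = PIXMAN_FORMAT (32, PIXMAN_TYPE_ARGB, 8, 8, 8, 8);
        uint32_t rgba_float = PIXMAN_FORMAT_BYTE (128, PIXMAN_TYPE_RGBA_FLOAT,
                                                  32, 32, 32, 32);

        /* Classic format: shift field is 0, fields decode unchanged. */
        assert (PIXMAN_FORMAT_BPP (argb32) == 32);
        assert (PIXMAN_FORMAT_A (argb32) == 8);

        /* 128 >> 3 = 16 fits the 8-bit bpp field; decoding re-multiplies:
         * 16 << 3 = 128, and the 32-bit alpha round-trips as 4 << 3. */
        assert (PIXMAN_FORMAT_BPP (rgba_float) == 128);
        assert (PIXMAN_FORMAT_A (rgba_float) == 32);

        return 0;
    }
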
diff --git a/pixman/rounding.txt b/pixman/rounding.txt
index b52b084..1c00019 100644
--- a/pixman/rounding.txt
+++ b/pixman/rounding.txt
@@ -160,6 +160,7 @@ which means the contents of the matrix corresponding to (frac) should
contain width samplings of the function, with the first sample at:
floor (frac - (width - 1) / 2.0 - e) + 0.5 - frac
+ = ceil (frac - width / 2.0 - 0.5) + 0.5 - frac
This filter is called separable because each of the k x k convolution
matrices is specified with two k-wide vectors, one for each dimension,
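
The equality added above can be verified directly. Writing e for the smallest
representable fixed-point amount, floor (x - e) = ceil (x - 1) for any
representable x: both sides equal x - 1 when x is an integer, and floor (x)
otherwise. Applying this and collecting terms:

    floor (frac - (width - 1) / 2.0 - e)
      = ceil (frac - (width - 1) / 2.0 - 1)
      = ceil (frac - width / 2.0 - 0.5)

since - (width - 1) / 2.0 - 1 = - width / 2.0 - 0.5.
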
diff --git a/test/Makefile.am b/test/Makefile.am
deleted file mode 100644
index 88dc36d..0000000
--- a/test/Makefile.am
+++ /dev/null
@@ -1,13 +0,0 @@
-include $(top_srcdir)/test/Makefile.sources
-
-AM_CFLAGS = $(OPENMP_CFLAGS) $(PTHREAD_CFLAGS)
-AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS) $(PTHREAD_LDFLAGS)
-LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm $(PNG_LIBS) $(PTHREAD_LIBS)
-AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
-
-libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
-
-noinst_LTLIBRARIES = libutils.la
-noinst_PROGRAMS = $(TESTPROGRAMS) $(OTHERPROGRAMS)
-
-TESTS = $(TESTPROGRAMS)
diff --git a/test/Makefile.sources b/test/Makefile.sources
deleted file mode 100644
index c20c34b..0000000
--- a/test/Makefile.sources
+++ /dev/null
@@ -1,51 +0,0 @@
-# Tests (sorted by expected completion time)
-TESTPROGRAMS = \
- oob-test \
- infinite-loop \
- trap-crasher \
- region-translate-test \
- fetch-test \
- a1-trap-test \
- prng-test \
- radial-invalid \
- pdf-op-test \
- region-test \
- combiner-test \
- scaling-crash-test \
- alpha-loop \
- scaling-helpers-test \
- thread-test \
- rotate-test \
- alphamap \
- gradient-crash-test \
- pixel-test \
- matrix-test \
- composite-traps-test \
- region-contains-test \
- glyph-test \
- stress-test \
- blitters-test \
- affine-test \
- scaling-test \
- composite \
- tolerance-test \
- $(NULL)
-
-# Other programs
-OTHERPROGRAMS = \
- lowlevel-blt-bench \
- radial-perf-test \
- check-formats \
- scaling-bench \
- $(NULL)
-
-# Utility functions
-libutils_sources = \
- utils.c \
- utils-prng.c \
- $(NULL)
-
-libutils_headers = \
- utils.h \
- utils-prng.h \
- $(NULL)
diff --git a/test/Makefile.win32 b/test/Makefile.win32
deleted file mode 100644
index 6cfb4a7..0000000
--- a/test/Makefile.win32
+++ /dev/null
@@ -1,54 +0,0 @@
-default: all
-
-top_srcdir = ..
-include $(top_srcdir)/test/Makefile.sources
-include $(top_srcdir)/Makefile.win32.common
-
-TEST_LDADD = \
- $(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib \
- $(CFG_VAR)/libutils.lib \
- $(NULL)
-
-libutils_OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libutils_sources))
-
-SOURCES = $(patsubst %, %.c, $(TESTPROGRAMS) $(OTHERPROGRAMS))
-OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES))
-TESTS = $(patsubst %, $(CFG_VAR)/%.exe, $(TESTPROGRAMS))
-OTHERS = $(patsubst %, $(CFG_VAR)/%.exe, $(OTHERPROGRAMS))
-
-all: pixman inform $(TESTS) $(OTHERS)
-
-check: pixman inform $(TESTS)
- @failures=0 ; \
- total=0 ; \
- for test in $(TESTS) ; \
- do \
- total=`expr $$total + 1` ; \
- if ./$$test ; \
- then echo "PASS: $$test" ; \
- else echo "FAIL: $$test" ; \
- failures=`expr $$failures + 1` ; \
- fi ; \
- done ; \
- if test $$failures -eq 0 ; \
- then banner="All $$total tests passed" ; \
- else banner="$$failures of $$total tests failed" ; \
- fi ; \
- dashes=`echo "$$banner" | sed s/./=/g`; \
- echo "$$dashes" ; \
- echo "$$banner" ; \
- echo "$$dashes" ; \
- test $$failures -eq 0
-
-$(CFG_VAR)/libutils.lib: $(libutils_OBJECTS)
- @$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
-
-$(CFG_VAR)/%.exe: $(CFG_VAR)/%.obj $(TEST_LDADD)
- @$(LD) $(PIXMAN_LDFLAGS) -OUT:$@ $^
-
-$(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib: pixman
-
-pixman:
- @$(MAKE) -C $(top_builddir)/pixman -f Makefile.win32
-
-.PHONY: all check pixman
diff --git a/test/affine-bench.c b/test/affine-bench.c
new file mode 100644
index 0000000..86bf46e
--- /dev/null
+++ b/test/affine-bench.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright © 2014 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdint.h>
+#include "utils.h"
+
+#ifdef HAVE_GETTIMEOFDAY
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+
+#define WIDTH 1920
+#define HEIGHT 1080
+
+/* How much data to read to flush all cached data to RAM */
+#define MAX_L2CACHE_SIZE (8 * 1024 * 1024)
+
+#define PAGE_SIZE (4 * 1024)
+
+struct bench_info
+{
+ pixman_op_t op;
+ pixman_transform_t transform;
+ pixman_image_t *src_image;
+ pixman_image_t *mask_image;
+ pixman_image_t *dest_image;
+ int32_t src_x;
+ int32_t src_y;
+};
+
+typedef struct bench_info bench_info_t;
+
+struct box_48_16
+{
+ pixman_fixed_48_16_t x1;
+ pixman_fixed_48_16_t y1;
+ pixman_fixed_48_16_t x2;
+ pixman_fixed_48_16_t y2;
+};
+
+typedef struct box_48_16 box_48_16_t;
+
+/* This function is copied verbatim from pixman.c. */
+static pixman_bool_t
+compute_transformed_extents (pixman_transform_t *transform,
+ const pixman_box32_t *extents,
+ box_48_16_t *transformed)
+{
+ pixman_fixed_48_16_t tx1, ty1, tx2, ty2;
+ pixman_fixed_t x1, y1, x2, y2;
+ int i;
+
+ x1 = pixman_int_to_fixed (extents->x1) + pixman_fixed_1 / 2;
+ y1 = pixman_int_to_fixed (extents->y1) + pixman_fixed_1 / 2;
+ x2 = pixman_int_to_fixed (extents->x2) - pixman_fixed_1 / 2;
+ y2 = pixman_int_to_fixed (extents->y2) - pixman_fixed_1 / 2;
+
+ if (!transform)
+ {
+ transformed->x1 = x1;
+ transformed->y1 = y1;
+ transformed->x2 = x2;
+ transformed->y2 = y2;
+
+ return TRUE;
+ }
+
+ tx1 = ty1 = INT64_MAX;
+ tx2 = ty2 = INT64_MIN;
+
+ for (i = 0; i < 4; ++i)
+ {
+ pixman_fixed_48_16_t tx, ty;
+ pixman_vector_t v;
+
+ v.vector[0] = (i & 0x01)? x1 : x2;
+ v.vector[1] = (i & 0x02)? y1 : y2;
+ v.vector[2] = pixman_fixed_1;
+
+ if (!pixman_transform_point (transform, &v))
+ return FALSE;
+
+ tx = (pixman_fixed_48_16_t)v.vector[0];
+ ty = (pixman_fixed_48_16_t)v.vector[1];
+
+ if (tx < tx1)
+ tx1 = tx;
+ if (ty < ty1)
+ ty1 = ty;
+ if (tx > tx2)
+ tx2 = tx;
+ if (ty > ty2)
+ ty2 = ty;
+ }
+
+ transformed->x1 = tx1;
+ transformed->y1 = ty1;
+ transformed->x2 = tx2;
+ transformed->y2 = ty2;
+
+ return TRUE;
+}
+
+static void
+create_image (uint32_t width,
+ uint32_t height,
+ pixman_format_code_t format,
+ pixman_filter_t filter,
+ uint32_t **bits,
+ pixman_image_t **image)
+{
+ uint32_t stride = (width * PIXMAN_FORMAT_BPP (format) + 31) / 32 * 4;
+
+ *bits = aligned_malloc (PAGE_SIZE, stride * height);
+ memset (*bits, 0xCC, stride * height);
+ *image = pixman_image_create_bits (format, width, height, *bits, stride);
+ pixman_image_set_repeat (*image, PIXMAN_REPEAT_NORMAL);
+ pixman_image_set_filter (*image, filter, NULL, 0);
+}
+
+/* This needs to match the shortest cacheline length we expect to encounter */
+#define CACHE_CLEAN_INCREMENT 32
+
+static void
+flush_cache (void)
+{
+ static const char clean_space[MAX_L2CACHE_SIZE];
+ volatile const char *x = clean_space;
+ const char *clean_end = clean_space + sizeof clean_space;
+
+ while (x < clean_end)
+ {
+ (void) *x;
+ x += CACHE_CLEAN_INCREMENT;
+ }
+}
+
+/* Obtain current time in microseconds modulo 2^32 */
+uint32_t
+gettimei (void)
+{
+#ifdef HAVE_GETTIMEOFDAY
+ struct timeval tv;
+
+ gettimeofday (&tv, NULL);
+ return tv.tv_sec * 1000000 + tv.tv_usec;
+#else
+ return (uint64_t) clock () * 1000000 / CLOCKS_PER_SEC;
+#endif
+}
+
+static void
+pixman_image_composite_wrapper (const pixman_composite_info_t *info)
+{
+ pixman_image_composite (info->op,
+ info->src_image, info->mask_image, info->dest_image,
+ info->src_x, info->src_y,
+ info->mask_x, info->mask_y,
+ info->dest_x, info->dest_y,
+ info->width, info->height);
+}
+
+static void
+pixman_image_composite_empty (const pixman_composite_info_t *info)
+{
+ pixman_image_composite (info->op,
+ info->src_image, info->mask_image, info->dest_image,
+ info->src_x, info->src_y,
+ info->mask_x, info->mask_y,
+ info->dest_x, info->dest_y,
+ 1, 1);
+}
+
+static void
+bench (const bench_info_t *bi,
+ uint32_t max_n,
+ uint32_t max_time,
+ uint32_t *ret_n,
+ uint32_t *ret_time,
+ void (*func) (const pixman_composite_info_t *info))
+{
+ uint32_t n = 0;
+ uint32_t t0;
+ uint32_t t1;
+ uint32_t x = 0;
+ pixman_transform_t t;
+ pixman_composite_info_t info;
+
+ t = bi->transform;
+ info.op = bi->op;
+ info.src_image = bi->src_image;
+ info.mask_image = bi->mask_image;
+ info.dest_image = bi->dest_image;
+ info.src_x = 0;
+ info.src_y = 0;
+ info.mask_x = 0;
+ info.mask_y = 0;
+ /* info.dest_x set below */
+ info.dest_y = 0;
+ info.width = WIDTH;
+ info.height = HEIGHT;
+
+ t0 = gettimei ();
+
+ do
+ {
+
+ if (++x >= 64)
+ x = 0;
+
+ info.dest_x = 63 - x;
+
+ t.matrix[0][2] = pixman_int_to_fixed (bi->src_x + x);
+ t.matrix[1][2] = pixman_int_to_fixed (bi->src_y);
+ pixman_image_set_transform (bi->src_image, &t);
+
+ if (bi->mask_image)
+ pixman_image_set_transform (bi->mask_image, &t);
+
+ func (&info);
+ t1 = gettimei ();
+ }
+ while (++n < max_n && (t1 - t0) < max_time);
+
+ if (ret_n)
+ *ret_n = n;
+
+ *ret_time = t1 - t0;
+}
+
+int
+parse_fixed_argument (char *arg, pixman_fixed_t *value)
+{
+ char *tailptr;
+
+ *value = pixman_double_to_fixed (strtod (arg, &tailptr));
+
+ return *tailptr == '\0';
+}
+
+int
+parse_arguments (int argc,
+ char *argv[],
+ pixman_transform_t *t,
+ pixman_op_t *op,
+ pixman_format_code_t *src_format,
+ pixman_format_code_t *mask_format,
+ pixman_format_code_t *dest_format)
+{
+ if (!parse_fixed_argument (*argv, &t->matrix[0][0]))
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ if (!parse_fixed_argument (*argv, &t->matrix[0][1]))
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ if (!parse_fixed_argument (*argv, &t->matrix[1][0]))
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ if (!parse_fixed_argument (*argv, &t->matrix[1][1]))
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ *op = operator_from_string (*argv);
+ if (*op == PIXMAN_OP_NONE)
+ return 0;
+
+ if (*++argv == NULL)
+ return 1;
+
+ *src_format = format_from_string (*argv);
+ if (*src_format == PIXMAN_null)
+ return 0;
+
+ ++argv;
+ if (argv[0] && argv[1])
+ {
+ *mask_format = format_from_string (*argv);
+ if (*mask_format == PIXMAN_null)
+ return 0;
+ ++argv;
+ }
+ if (*argv)
+ {
+ *dest_format = format_from_string (*argv);
+ if (*dest_format == PIXMAN_null)
+ return 0;
+ }
+ return 1;
+}
+
+static void
+run_benchmark (const bench_info_t *bi)
+{
+ uint32_t n; /* number of iterations needed to run for at least 5 seconds */
+ uint32_t t1; /* time taken to do n iterations, microseconds */
+ uint32_t t2; /* calling overhead for n iterations, microseconds */
+
+ flush_cache ();
+ bench (bi, UINT32_MAX, 5000000, &n, &t1, pixman_image_composite_wrapper);
+ bench (bi, n, UINT32_MAX, NULL, &t2, pixman_image_composite_empty);
+
+ /* The result indicates the output rate in megapixels/second */
+ printf ("%6.2f\n", (double) n * WIDTH * HEIGHT / (t1 - t2));
+}
+
+
+int
+main (int argc, char *argv[])
+{
+ bench_info_t binfo;
+ pixman_filter_t filter = PIXMAN_FILTER_NEAREST;
+ pixman_format_code_t src_format = PIXMAN_a8r8g8b8;
+ pixman_format_code_t mask_format = 0;
+ pixman_format_code_t dest_format = PIXMAN_a8r8g8b8;
+ pixman_box32_t dest_box = { 0, 0, WIDTH, HEIGHT };
+ box_48_16_t transformed = { 0 };
+ int32_t xmin, ymin, xmax, ymax;
+ uint32_t *src, *mask, *dest;
+
+ binfo.op = PIXMAN_OP_SRC;
+ binfo.mask_image = NULL;
+ pixman_transform_init_identity (&binfo.transform);
+
+ ++argv;
+ if (*argv && (*argv)[0] == '-' && (*argv)[1] == 'n')
+ {
+ filter = PIXMAN_FILTER_NEAREST;
+ ++argv;
+ --argc;
+ }
+
+ if (*argv && (*argv)[0] == '-' && (*argv)[1] == 'b')
+ {
+ filter = PIXMAN_FILTER_BILINEAR;
+ ++argv;
+ --argc;
+ }
+
+ if (argc == 1 ||
+ !parse_arguments (argc, argv, &binfo.transform, &binfo.op,
+ &src_format, &mask_format, &dest_format))
+ {
+ printf ("Usage: affine-bench [-n] [-b] axx [axy] [ayx] [ayy] [combine type]\n");
+ printf (" [src format] [mask format] [dest format]\n");
+ printf (" -n : nearest scaling (default)\n");
+ printf (" -b : bilinear scaling\n");
+ printf (" axx : x_out:x_in factor\n");
+ printf (" axy : x_out:y_in factor (default 0)\n");
+ printf (" ayx : y_out:x_in factor (default 0)\n");
+ printf (" ayy : y_out:y_in factor (default 1)\n");
+ printf (" combine type : src, over, in etc (default src)\n");
+ printf (" src format : a8r8g8b8, r5g6b5 etc (default a8r8g8b8)\n");
+ printf (" mask format : as for src format, but no mask used if omitted\n");
+ printf (" dest format : as for src format (default a8r8g8b8)\n");
+ printf ("The output is a single number in megapixels/second.\n");
+
+ return EXIT_FAILURE;
+ }
+
+ /* Compute required extents for source and mask image so they qualify
+ * for COVER fast paths and get the flags in pixman.c:analyze_extent().
+ * These computations are for FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,
+ * but at the same time they also allow COVER_CLIP_NEAREST.
+ */
+ compute_transformed_extents (&binfo.transform, &dest_box, &transformed);
+ xmin = pixman_fixed_to_int (transformed.x1 - pixman_fixed_1 / 2);
+ ymin = pixman_fixed_to_int (transformed.y1 - pixman_fixed_1 / 2);
+ xmax = pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2);
+ ymax = pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2);
+ /* Note:
+ * The upper limits can be reduced to the following when fetchers
+ * are guaranteed to not access pixels with zero weight. This concerns
+ * particularly all bilinear samplers.
+ *
+ * xmax = pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2 - pixman_fixed_e);
+ * ymax = pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2 - pixman_fixed_e);
+ * This is equivalent to subtracting 0.5 and rounding up, rather than
+ * subtracting 0.5, rounding down and adding 1.
+ */
+ binfo.src_x = -xmin;
+ binfo.src_y = -ymin;
+
+ /* Always over-allocate width by 64 pixels for all src, mask and dst,
+ * so that we can iterate over an x-offset 0..63 in bench ().
+ * This is similar to lowlevel-blt-bench, which uses the same method
+ * to hit different cacheline misalignments.
+ */
+ create_image (xmax - xmin + 64, ymax - ymin + 1, src_format, filter,
+ &src, &binfo.src_image);
+
+ if (mask_format)
+ {
+ create_image (xmax - xmin + 64, ymax - ymin + 1, mask_format, filter,
+ &mask, &binfo.mask_image);
+
+ if ((PIXMAN_FORMAT_R(mask_format) ||
+ PIXMAN_FORMAT_G(mask_format) ||
+ PIXMAN_FORMAT_B(mask_format)))
+ {
+ pixman_image_set_component_alpha (binfo.mask_image, 1);
+ }
+ }
+
+ create_image (WIDTH + 64, HEIGHT, dest_format, filter,
+ &dest, &binfo.dest_image);
+
+ run_benchmark (&binfo);
+
+ return EXIT_SUCCESS;
+}
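
Two details of affine-bench are worth spelling out. First, run_benchmark ()
subtracts calling overhead: it times n full-size composites over at least five
seconds, then times the same n calls with a 1x1 area, and divides the pixel
count by the difference. Second, gettimei () returns microseconds modulo 2^32,
and because the counter is unsigned, t1 - t0 stays correct across wraparound
for any interval shorter than about 71 minutes. A minimal sketch of the same
pattern -- do_work and do_noop are illustrative stand-ins, not pixman API, and
the interval is shortened to 100 ms:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    static uint32_t
    now_usec (void)
    {
        /* Microseconds modulo 2^32; unsigned subtraction of two samples
         * is wraparound-safe for intervals below about 71 minutes. */
        return (uint64_t) clock () * 1000000 / CLOCKS_PER_SEC;
    }

    static volatile double sink;

    static void do_work (void) { int i; for (i = 0; i < 100000; i++) sink += i * 0.5; }
    static void do_noop (void) { sink += 1.0; }

    int
    main (void)
    {
        uint32_t n = 0, t0, work_time, overhead;
        uint32_t i;

        /* Pass 1: run the real operation for at least 100 ms
         * (the benchmark above uses five seconds). */
        t0 = now_usec ();
        do
            do_work ();
        while (++n < UINT32_MAX && now_usec () - t0 < 100000);
        work_time = now_usec () - t0;

        /* Pass 2: same call count, negligible work => calling overhead. */
        t0 = now_usec ();
        for (i = 0; i < n; i++)
            do_noop ();
        overhead = now_usec () - t0;

        printf ("%u calls, %u us total, %u us overhead => %.2f calls/us\n",
                n, work_time, overhead, (double) n / (work_time - overhead));
        return 0;
    }
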
diff --git a/test/affine-test.c b/test/affine-test.c
index 8e19023..f516856 100644
--- a/test/affine-test.c
+++ b/test/affine-test.c
@@ -171,7 +171,7 @@ test_composite (int testnum,
int i = prng_rand_n (2);
int j = prng_rand_n (3);
int bitnum = prng_rand_n (32);
- transform.matrix[i][j] ^= 1 << bitnum;
+ transform.matrix[i][j] ^= 1U << bitnum;
if (prng_rand_n (2))
break;
}
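
The 1U suffix is the whole fix here: transform.matrix[i][j] is flipped at a
random bit position, and when bitnum is 31, the plain int expression
1 << bitnum shifts into the sign bit, which is undefined behaviour in C. A
minimal illustration of the well-defined unsigned form:

    #include <stdint.h>

    /* Flip one bit of a 32-bit word; bitnum must be in [0, 31]. */
    static uint32_t
    flip_bit (uint32_t word, int bitnum)
    {
        return word ^ (1U << bitnum);   /* 1 << 31 as int would be UB */
    }
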
diff --git a/test/alphamap.c b/test/alphamap.c
index 4d09076..150d33e 100644
--- a/test/alphamap.c
+++ b/test/alphamap.c
@@ -10,7 +10,8 @@ static const pixman_format_code_t formats[] =
PIXMAN_a8r8g8b8,
PIXMAN_a2r10g10b10,
PIXMAN_a4r4g4b4,
- PIXMAN_a8
+ PIXMAN_a8,
+ PIXMAN_rgba_float,
};
static const pixman_format_code_t alpha_formats[] =
@@ -18,7 +19,8 @@ static const pixman_format_code_t alpha_formats[] =
PIXMAN_null,
PIXMAN_a8,
PIXMAN_a2r10g10b10,
- PIXMAN_a4r4g4b4
+ PIXMAN_a4r4g4b4,
+ PIXMAN_rgba_float,
};
static const int origins[] =
@@ -41,7 +43,10 @@ make_image (pixman_format_code_t format)
uint8_t bpp = PIXMAN_FORMAT_BPP (format) / 8;
pixman_image_t *image;
- bits = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * bpp);
+ if (format != PIXMAN_rgba_float)
+ bits = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * bpp);
+ else
+ bits = (uint32_t *)make_random_floats (WIDTH * HEIGHT * bpp);
image = pixman_image_create_bits (format, WIDTH, HEIGHT, bits, WIDTH * bpp);
@@ -51,11 +56,11 @@ make_image (pixman_format_code_t format)
return image;
}
-static uint8_t
+static float
get_alpha (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
{
uint8_t *bits;
- uint8_t r;
+ uint32_t r;
if (image->common.alpha_map)
{
@@ -69,7 +74,7 @@ get_alpha (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
}
else
{
- return 0;
+ return 0.f;
}
}
@@ -78,28 +83,32 @@ get_alpha (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
if (image->bits.format == PIXMAN_a8)
{
r = bits[y * WIDTH + x];
+ return r / 255.f;
}
else if (image->bits.format == PIXMAN_a2r10g10b10)
{
r = ((uint32_t *)bits)[y * WIDTH + x] >> 30;
- r |= r << 2;
- r |= r << 4;
+ return r / 3.f;
}
else if (image->bits.format == PIXMAN_a8r8g8b8)
{
r = ((uint32_t *)bits)[y * WIDTH + x] >> 24;
+ return r / 255.f;
}
else if (image->bits.format == PIXMAN_a4r4g4b4)
{
r = ((uint16_t *)bits)[y * WIDTH + x] >> 12;
- r |= r << 4;
+ return r / 15.f;
+ }
+ else if (image->bits.format == PIXMAN_rgba_float)
+ {
+ return ((float *)bits)[y * WIDTH * 4 + x * 4 + 3];
}
else
{
assert (0);
+ return 0.f;
}
-
- return r;
}
static uint16_t
@@ -133,6 +142,11 @@ get_red (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
r |= r << 4;
r |= r << 8;
}
+ else if (image->bits.format == PIXMAN_rgba_float)
+ {
+ double tmp = ((float *)bits)[y * WIDTH * 4 + x * 4];
+ return tmp * 65535.;
+ }
else
{
assert (0);
@@ -141,6 +155,23 @@ get_red (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
return r;
}
+static float get_alpha_err(pixman_format_code_t sf, pixman_format_code_t saf,
+ pixman_format_code_t df, pixman_format_code_t daf)
+{
+ pixman_format_code_t s = saf != PIXMAN_null ? saf : sf;
+ pixman_format_code_t d = daf != PIXMAN_null ? daf : df;
+
+ /* There are cases where we go through the 8 bit compositing
+ * path even with 10bpc and higher formats.
+ */
+ if (PIXMAN_FORMAT_A(s) == PIXMAN_FORMAT_A(d))
+ return 1.f / 255.f;
+ else if (PIXMAN_FORMAT_A(s) > PIXMAN_FORMAT_A(d))
+ return 1.f / ((1 << PIXMAN_FORMAT_A(d)) - 1);
+ else
+ return 1.f / ((1 << PIXMAN_FORMAT_A(s)) - 1);
+}
+
static int
run_test (int s, int d, int sa, int da, int soff, int doff)
{
@@ -151,15 +182,11 @@ run_test (int s, int d, int sa, int da, int soff, int doff)
pixman_image_t *src, *dst, *orig_dst, *alpha, *orig_alpha;
pixman_transform_t t1;
int j, k;
- int n_alpha_bits, n_red_bits;
+ int n_red_bits;
soff = origins[soff];
doff = origins[doff];
- n_alpha_bits = PIXMAN_FORMAT_A (df);
- if (daf != PIXMAN_null)
- n_alpha_bits = PIXMAN_FORMAT_A (daf);
-
n_red_bits = PIXMAN_FORMAT_R (df);
/* Source */
@@ -211,21 +238,25 @@ run_test (int s, int d, int sa, int da, int soff, int doff)
{
for (k = MAX (doff, 0); k < MIN (WIDTH, WIDTH + doff); ++k)
{
- uint8_t sa, da, oda, refa;
+ float sa, da, oda, refa;
uint16_t sr, dr, odr, refr;
+ float err;
+
+ err = get_alpha_err(sf, saf, df, daf);
sa = get_alpha (src, k, j, soff, soff);
da = get_alpha (dst, k, j, doff, doff);
oda = get_alpha (orig_dst, k, j, doff, doff);
- if (sa + oda > 255)
- refa = 255;
+ if (sa + oda > 1.f)
+ refa = 1.f;
else
refa = sa + oda;
- if (da >> (8 - n_alpha_bits) != refa >> (8 - n_alpha_bits))
+ if (da - err > refa ||
+ da + err < refa)
{
- printf ("\nWrong alpha value at (%d, %d). Should be 0x%x; got 0x%x. Source was 0x%x, original dest was 0x%x\n",
+ printf ("\nWrong alpha value at (%d, %d). Should be %g; got %g. Source was %g, original dest was %g\n",
k, j, refa, da, sa, oda);
printf ("src: %s, alpha: %s, origin %d %d\ndst: %s, alpha: %s, origin: %d %d\n\n",
diff --git a/test/check-formats.c b/test/check-formats.c
index 8eb263b..4e2633c 100644
--- a/test/check-formats.c
+++ b/test/check-formats.c
@@ -104,198 +104,6 @@ check_op (pixman_op_t op,
return retval;
}
-static const pixman_op_t op_list[] =
-{
- PIXMAN_OP_CLEAR,
- PIXMAN_OP_SRC,
- PIXMAN_OP_DST,
- PIXMAN_OP_OVER,
- PIXMAN_OP_OVER_REVERSE,
- PIXMAN_OP_IN,
- PIXMAN_OP_IN_REVERSE,
- PIXMAN_OP_OUT,
- PIXMAN_OP_OUT_REVERSE,
- PIXMAN_OP_ATOP,
- PIXMAN_OP_ATOP_REVERSE,
- PIXMAN_OP_XOR,
- PIXMAN_OP_ADD,
- PIXMAN_OP_SATURATE,
-
- PIXMAN_OP_DISJOINT_CLEAR,
- PIXMAN_OP_DISJOINT_SRC,
- PIXMAN_OP_DISJOINT_DST,
- PIXMAN_OP_DISJOINT_OVER,
- PIXMAN_OP_DISJOINT_OVER_REVERSE,
- PIXMAN_OP_DISJOINT_IN,
- PIXMAN_OP_DISJOINT_IN_REVERSE,
- PIXMAN_OP_DISJOINT_OUT,
- PIXMAN_OP_DISJOINT_OUT_REVERSE,
- PIXMAN_OP_DISJOINT_ATOP,
- PIXMAN_OP_DISJOINT_ATOP_REVERSE,
- PIXMAN_OP_DISJOINT_XOR,
-
- PIXMAN_OP_CONJOINT_CLEAR,
- PIXMAN_OP_CONJOINT_SRC,
- PIXMAN_OP_CONJOINT_DST,
- PIXMAN_OP_CONJOINT_OVER,
- PIXMAN_OP_CONJOINT_OVER_REVERSE,
- PIXMAN_OP_CONJOINT_IN,
- PIXMAN_OP_CONJOINT_IN_REVERSE,
- PIXMAN_OP_CONJOINT_OUT,
- PIXMAN_OP_CONJOINT_OUT_REVERSE,
- PIXMAN_OP_CONJOINT_ATOP,
- PIXMAN_OP_CONJOINT_ATOP_REVERSE,
- PIXMAN_OP_CONJOINT_XOR,
-
- PIXMAN_OP_MULTIPLY,
- PIXMAN_OP_SCREEN,
- PIXMAN_OP_OVERLAY,
- PIXMAN_OP_DARKEN,
- PIXMAN_OP_LIGHTEN,
- PIXMAN_OP_COLOR_DODGE,
- PIXMAN_OP_COLOR_BURN,
- PIXMAN_OP_HARD_LIGHT,
- PIXMAN_OP_SOFT_LIGHT,
- PIXMAN_OP_DIFFERENCE,
- PIXMAN_OP_EXCLUSION,
- PIXMAN_OP_HSL_HUE,
- PIXMAN_OP_HSL_SATURATION,
- PIXMAN_OP_HSL_COLOR,
- PIXMAN_OP_HSL_LUMINOSITY
-};
-
-static const pixman_format_code_t format_list[] =
-{
- PIXMAN_a8r8g8b8,
- PIXMAN_x8r8g8b8,
- PIXMAN_a8b8g8r8,
- PIXMAN_x8b8g8r8,
- PIXMAN_b8g8r8a8,
- PIXMAN_b8g8r8x8,
- PIXMAN_r8g8b8a8,
- PIXMAN_r8g8b8x8,
- PIXMAN_x14r6g6b6,
- PIXMAN_x2r10g10b10,
- PIXMAN_a2r10g10b10,
- PIXMAN_x2b10g10r10,
- PIXMAN_a2b10g10r10,
- PIXMAN_a8r8g8b8_sRGB,
- PIXMAN_r8g8b8,
- PIXMAN_b8g8r8,
- PIXMAN_r5g6b5,
- PIXMAN_b5g6r5,
- PIXMAN_a1r5g5b5,
- PIXMAN_x1r5g5b5,
- PIXMAN_a1b5g5r5,
- PIXMAN_x1b5g5r5,
- PIXMAN_a4r4g4b4,
- PIXMAN_x4r4g4b4,
- PIXMAN_a4b4g4r4,
- PIXMAN_x4b4g4r4,
- PIXMAN_a8,
- PIXMAN_r3g3b2,
- PIXMAN_b2g3r3,
- PIXMAN_a2r2g2b2,
- PIXMAN_a2b2g2r2,
- PIXMAN_x4a4,
- PIXMAN_a4,
- PIXMAN_r1g2b1,
- PIXMAN_b1g2r1,
- PIXMAN_a1r1g1b1,
- PIXMAN_a1b1g1r1,
- PIXMAN_a1,
-};
-
-static pixman_format_code_t
-format_from_string (const char *s)
-{
- int i;
-
- for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
- {
- if (strcasecmp (format_name (format_list[i]), s) == 0)
- return format_list[i];
- }
-
- return PIXMAN_null;
-}
-
-static void
-emit (const char *s, int *n_chars)
-{
- *n_chars += printf ("%s,", s);
- if (*n_chars > 60)
- {
- printf ("\n ");
- *n_chars = 0;
- }
- else
- {
- printf (" ");
- (*n_chars)++;
- }
-}
-
-static void
-list_formats (void)
-{
- int n_chars;
- int i;
-
- printf ("Formats:\n ");
-
- n_chars = 0;
- for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
- emit (format_name (format_list[i]), &n_chars);
-
- printf ("\n\n");
-}
-
-static void
-list_operators (void)
-{
- char short_name [128] = { 0 };
- int i, n_chars;
-
- printf ("Operators:\n ");
-
- n_chars = 0;
- for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
- {
- pixman_op_t op = op_list[i];
- int j;
-
- snprintf (short_name, sizeof (short_name) - 1, "%s",
- operator_name (op) + strlen ("PIXMAN_OP_"));
-
- for (j = 0; short_name[j] != '\0'; ++j)
- short_name[j] = tolower (short_name[j]);
-
- emit (short_name, &n_chars);
- }
-
- printf ("\n\n");
-}
-
-static pixman_op_t
-operator_from_string (const char *s)
-{
- char full_name[128] = { 0 };
- int i;
-
- snprintf (full_name, (sizeof full_name) - 1, "PIXMAN_OP_%s", s);
-
- for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
- {
- pixman_op_t op = op_list[i];
-
- if (strcasecmp (operator_name (op), full_name) == 0)
- return op;
- }
-
- return PIXMAN_OP_NONE;
-}
-
int
main (int argc, char **argv)
{
diff --git a/test/composite.c b/test/composite.c
index 594c697..8d95046 100644
--- a/test/composite.c
+++ b/test/composite.c
@@ -92,6 +92,7 @@ static const pixman_format_code_t formats[] =
/* sRGB formats */
PIXMAN_a8r8g8b8_sRGB,
+ PIXMAN_r8g8b8_sRGB,
/* 24 bpp formats */
PIXMAN_r8g8b8,
diff --git a/test/cover-test.c b/test/cover-test.c
new file mode 100644
index 0000000..83e2972
--- /dev/null
+++ b/test/cover-test.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright © 2015 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
+/*
+ * This test aims to verify both numerical correctness and the honouring of
+ * array bounds for scaled plots (both nearest-neighbour and bilinear) at or
+ * close to the boundary conditions for applicability of "cover" type fast paths
+ * and iter fetch routines.
+ *
+ * It has a secondary purpose: by setting the env var EXACT (to any value) it
+ * will only test plots that are exactly on the boundary condition. This makes
+ * it possible to ensure that "cover" routines are being used to the maximum,
+ * although this requires the use of a debugger or code instrumentation to
+ * verify.
+ */
+
+#include "utils.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+/* Approximate limits for random scale factor generation - these ensure we can
+ * get at least 8x reduction and 8x enlargement.
+ */
+#define LOG2_MAX_FACTOR (3)
+
+/* 1/sqrt(2) (or sqrt(0.5), or 2^-0.5) as a 0.32 fixed-point number */
+#define INV_SQRT_2_0POINT32_FIXED (0xB504F334u)
+
+/* The largest increment that can be generated by random_scale_factor().
+ * This occurs when the "mantissa" part is 0xFFFFFFFF and the "exponent"
+ * part is -LOG2_MAX_FACTOR.
+ */
+#define MAX_INC ((pixman_fixed_t) \
+ (INV_SQRT_2_0POINT32_FIXED >> (31 - 16 - LOG2_MAX_FACTOR)))
+
+/* Minimum source width (in pixels) based on a typical page size of 4K and
+ * maximum colour depth of 32bpp.
+ */
+#define MIN_SRC_WIDTH (4096 / 4)
+
+/* Derive the destination width so that at max increment we fit within source */
+#define DST_WIDTH (MIN_SRC_WIDTH * pixman_fixed_1 / MAX_INC)
+
+/* Calculate heights the other way round.
+ * No limits due to page alignment here.
+ */
+#define DST_HEIGHT 3
+#define SRC_HEIGHT ((DST_HEIGHT * MAX_INC + pixman_fixed_1 - 1) / pixman_fixed_1)
+
+/* At the time of writing, all the scaled fast paths use SRC, OVER or ADD
+ * Porter-Duff operators. XOR is included in the list to ensure good
+ * representation of iter scanline fetch routines.
+ */
+static const pixman_op_t op_list[] = {
+ PIXMAN_OP_SRC,
+ PIXMAN_OP_OVER,
+ PIXMAN_OP_ADD,
+ PIXMAN_OP_XOR,
+};
+
+/* At the time of writing, all the scaled fast paths use a8r8g8b8, x8r8g8b8
+ * or r5g6b5, or red-blue swapped versions of the same. When a mask channel is
+ * used, it is always a8 (and so implicitly not component alpha). a1r5g5b5 is
+ * included because it is the only other format to feature in any iters. */
+static const pixman_format_code_t img_fmt_list[] = {
+ PIXMAN_a8r8g8b8,
+ PIXMAN_x8r8g8b8,
+ PIXMAN_r5g6b5,
+ PIXMAN_a1r5g5b5
+};
+
+/* This is a flag reflecting the environment variable EXACT. It can be used
+ * to ensure that source coordinates corresponding exactly to the "cover" limits
+ * are used, rather than any "near misses". This can, for example, be used in
+ * conjunction with a debugger to ensure that only COVER fast paths are used.
+ */
+static int exact;
+
+static pixman_image_t *
+create_src_image (pixman_format_code_t fmt)
+{
+ pixman_image_t *tmp_img, *img;
+
+ /* We need the left-most and right-most MIN_SRC_WIDTH pixels to have
+ * predictable values, even though fence_image_create_bits() may allocate
+ * an image somewhat larger than that, by an amount that varies depending
+ * upon the page size on the current platform. The solution is to create a
+ * temporary non-fenced image that is exactly MIN_SRC_WIDTH wide and blit it
+ * into the fenced image.
+ */
+ tmp_img = pixman_image_create_bits (fmt, MIN_SRC_WIDTH, SRC_HEIGHT,
+ NULL, 0);
+ if (tmp_img == NULL)
+ return NULL;
+
+ img = fence_image_create_bits (fmt, MIN_SRC_WIDTH, SRC_HEIGHT, TRUE);
+ if (img == NULL)
+ {
+ pixman_image_unref (tmp_img);
+ return NULL;
+ }
+
+ prng_randmemset (tmp_img->bits.bits,
+ tmp_img->bits.rowstride * SRC_HEIGHT * sizeof (uint32_t),
+ 0);
+ image_endian_swap (tmp_img);
+
+ pixman_image_composite (PIXMAN_OP_SRC, tmp_img, NULL, img,
+ 0, 0, 0, 0, 0, 0,
+ MIN_SRC_WIDTH, SRC_HEIGHT);
+ pixman_image_composite (PIXMAN_OP_SRC, tmp_img, NULL, img,
+ 0, 0, 0, 0, img->bits.width - MIN_SRC_WIDTH, 0,
+ MIN_SRC_WIDTH, SRC_HEIGHT);
+
+ pixman_image_unref (tmp_img);
+
+ return img;
+}
+
+static pixman_fixed_t
+random_scale_factor(void)
+{
+ /* Get a random number with top bit set. */
+ uint32_t f = prng_rand () | 0x80000000u;
+
+ /* In log(2) space, this is still approximately evenly spread between 31
+ * and 32. Divide by sqrt(2) to centre the distribution on 2^31.
+ */
+ f = ((uint64_t) f * INV_SQRT_2_0POINT32_FIXED) >> 32;
+
+ /* Now shift right (ie divide by an integer power of 2) to spread the
+ * distribution between centres at 2^(16 +/- LOG2_MAX_FACTOR).
+ */
+ f >>= 31 - 16 + prng_rand_n (2 * LOG2_MAX_FACTOR + 1) - LOG2_MAX_FACTOR;
+
+ return f;
+}
+
+static pixman_fixed_t
+calc_translate (int dst_size,
+ int src_size,
+ pixman_fixed_t scale,
+ pixman_bool_t low_align,
+ pixman_bool_t bilinear)
+{
+ pixman_fixed_t ref_src, ref_dst, scaled_dst;
+
+ if (low_align)
+ {
+ ref_src = bilinear ? pixman_fixed_1 / 2 : pixman_fixed_e;
+ ref_dst = pixman_fixed_1 / 2;
+ }
+ else
+ {
+ ref_src = pixman_int_to_fixed (src_size) -
+ bilinear * pixman_fixed_1 / 2;
+ ref_dst = pixman_int_to_fixed (dst_size) - pixman_fixed_1 / 2;
+ }
+
+ scaled_dst = ((uint64_t) ref_dst * scale + pixman_fixed_1 / 2) /
+ pixman_fixed_1;
+
+ /* We need the translation to be set such that when ref_dst is fed through
+ * the transformation matrix, we get ref_src as the result.
+ */
+ return ref_src - scaled_dst;
+}
+
+static pixman_fixed_t
+random_offset (void)
+{
+ pixman_fixed_t offset = 0;
+
+ /* Ensure we test the exact case quite a lot */
+ if (prng_rand_n (2))
+ return offset;
+
+ /* What happens when we are close to the edge of the first
+ * interpolation step?
+ */
+ if (prng_rand_n (2))
+ offset += (pixman_fixed_1 >> BILINEAR_INTERPOLATION_BITS) - 16;
+
+ /* Try fine-grained variations */
+ offset += prng_rand_n (32);
+
+ /* Test in both directions */
+ if (prng_rand_n (2))
+ offset = -offset;
+
+ return offset;
+}
+
+static void
+check_transform (pixman_image_t *dst_img,
+ pixman_image_t *src_img,
+ pixman_transform_t *transform,
+ pixman_bool_t bilinear)
+{
+ pixman_vector_t v1, v2;
+
+ v1.vector[0] = pixman_fixed_1 / 2;
+ v1.vector[1] = pixman_fixed_1 / 2;
+ v1.vector[2] = pixman_fixed_1;
+ assert (pixman_transform_point (transform, &v1));
+
+ v2.vector[0] = pixman_int_to_fixed (dst_img->bits.width) -
+ pixman_fixed_1 / 2;
+ v2.vector[1] = pixman_int_to_fixed (dst_img->bits.height) -
+ pixman_fixed_1 / 2;
+ v2.vector[2] = pixman_fixed_1;
+ assert (pixman_transform_point (transform, &v2));
+
+ if (bilinear)
+ {
+ assert (v1.vector[0] >= pixman_fixed_1 / 2);
+ assert (v1.vector[1] >= pixman_fixed_1 / 2);
+ assert (v2.vector[0] <= pixman_int_to_fixed (src_img->bits.width) -
+ pixman_fixed_1 / 2);
+ assert (v2.vector[1] <= pixman_int_to_fixed (src_img->bits.height) -
+ pixman_fixed_1 / 2);
+ }
+ else
+ {
+ assert (v1.vector[0] >= pixman_fixed_e);
+ assert (v1.vector[1] >= pixman_fixed_e);
+ assert (v2.vector[0] <= pixman_int_to_fixed (src_img->bits.width));
+ assert (v2.vector[1] <= pixman_int_to_fixed (src_img->bits.height));
+ }
+}
+
+static uint32_t
+test_cover (int testnum, int verbose)
+{
+ pixman_fixed_t x_scale, y_scale;
+ pixman_bool_t left_align, top_align;
+ pixman_bool_t bilinear;
+ pixman_filter_t filter;
+ pixman_op_t op;
+ size_t src_fmt_index;
+ pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
+ pixman_image_t *src_img, *dst_img, *mask_img;
+ pixman_transform_t src_transform, mask_transform;
+ pixman_fixed_t fuzz[4];
+ uint32_t crc32;
+
+ /* We allocate one fenced image for each pixel format up-front. This is to
+ * avoid spending a lot of time on memory management rather than on testing
+ * Pixman optimisations. We need one per thread because the transformation
+ * matrices and filtering are properties of the source and mask images.
+ */
+ static pixman_image_t *src_imgs[ARRAY_LENGTH (img_fmt_list)];
+ static pixman_image_t *mask_bits_img;
+ static pixman_bool_t fence_images_created;
+#ifdef USE_OPENMP
+#pragma omp threadprivate (src_imgs)
+#pragma omp threadprivate (mask_bits_img)
+#pragma omp threadprivate (fence_images_created)
+#endif
+
+ if (!fence_images_created)
+ {
+ int i;
+
+ prng_srand (0);
+
+ for (i = 0; i < ARRAY_LENGTH (img_fmt_list); i++)
+ src_imgs[i] = create_src_image (img_fmt_list[i]);
+
+ mask_bits_img = create_src_image (PIXMAN_a8);
+
+ fence_images_created = TRUE;
+ }
+
+ prng_srand (testnum);
+
+ x_scale = random_scale_factor ();
+ y_scale = random_scale_factor ();
+ left_align = prng_rand_n (2);
+ top_align = prng_rand_n (2);
+ bilinear = prng_rand_n (2);
+ filter = bilinear ? PIXMAN_FILTER_BILINEAR : PIXMAN_FILTER_NEAREST;
+
+ op = op_list[prng_rand_n (ARRAY_LENGTH (op_list))];
+
+ dst_fmt = img_fmt_list[prng_rand_n (ARRAY_LENGTH (img_fmt_list))];
+ dst_img = pixman_image_create_bits (dst_fmt, DST_WIDTH, DST_HEIGHT,
+ NULL, 0);
+ prng_randmemset (dst_img->bits.bits,
+ dst_img->bits.rowstride * DST_HEIGHT * sizeof (uint32_t),
+ 0);
+ image_endian_swap (dst_img);
+
+ src_fmt_index = prng_rand_n (ARRAY_LENGTH (img_fmt_list));
+ src_fmt = img_fmt_list[src_fmt_index];
+ src_img = src_imgs[src_fmt_index];
+ pixman_image_set_filter (src_img, filter, NULL, 0);
+ pixman_transform_init_scale (&src_transform, x_scale, y_scale);
+ src_transform.matrix[0][2] = calc_translate (dst_img->bits.width,
+ src_img->bits.width,
+ x_scale, left_align, bilinear);
+ src_transform.matrix[1][2] = calc_translate (dst_img->bits.height,
+ src_img->bits.height,
+ y_scale, top_align, bilinear);
+
+ if (prng_rand_n (2))
+ {
+ /* No mask */
+ mask_fmt = PIXMAN_null;
+ mask_img = NULL;
+ }
+ else if (prng_rand_n (2))
+ {
+ /* a8 bitmap mask */
+ mask_fmt = PIXMAN_a8;
+ mask_img = mask_bits_img;
+ pixman_image_set_filter (mask_img, filter, NULL, 0);
+ pixman_transform_init_scale (&mask_transform, x_scale, y_scale);
+ mask_transform.matrix[0][2] = calc_translate (dst_img->bits.width,
+ mask_img->bits.width,
+ x_scale, left_align,
+ bilinear);
+ mask_transform.matrix[1][2] = calc_translate (dst_img->bits.height,
+ mask_img->bits.height,
+ y_scale, top_align,
+ bilinear);
+ }
+ else
+ {
+ /* Solid mask */
+ pixman_color_t color;
+ memset (&color, 0xAA, sizeof color);
+ mask_fmt = PIXMAN_solid;
+ mask_img = pixman_image_create_solid_fill (&color);
+ }
+
+ if (!exact)
+ {
+ int i = 0;
+
+ while (i < 4)
+ fuzz[i++] = random_offset ();
+
+ src_transform.matrix[0][2] += fuzz[0];
+ src_transform.matrix[1][2] += fuzz[1];
+ mask_transform.matrix[0][2] += fuzz[2];
+ mask_transform.matrix[1][2] += fuzz[3];
+ }
+
+ pixman_image_set_transform (src_img, &src_transform);
+ if (mask_fmt == PIXMAN_a8)
+ pixman_image_set_transform (mask_img, &mask_transform);
+
+ if (verbose)
+ {
+ printf ("op=%s\n", operator_name (op));
+ printf ("src_fmt=%s, dst_fmt=%s, mask_fmt=%s\n",
+ format_name (src_fmt), format_name (dst_fmt),
+ format_name (mask_fmt));
+ printf ("x_scale=0x%08X, y_scale=0x%08X, align %s/%s, %s\n",
+ x_scale, y_scale,
+ left_align ? "left" : "right", top_align ? "top" : "bottom",
+ bilinear ? "bilinear" : "nearest");
+
+ if (!exact)
+ {
+ int i = 0;
+
+ printf ("fuzz factors");
+ while (i < 4)
+ printf (" %d", fuzz[i++]);
+ printf ("\n");
+ }
+ }
+
+ if (exact)
+ {
+ check_transform (dst_img, src_img, &src_transform, bilinear);
+ if (mask_fmt == PIXMAN_a8)
+ check_transform (dst_img, mask_img, &mask_transform, bilinear);
+ }
+
+ pixman_image_composite (op, src_img, mask_img, dst_img,
+ 0, 0, 0, 0, 0, 0,
+ dst_img->bits.width, dst_img->bits.height);
+
+ if (verbose)
+ print_image (dst_img);
+
+ crc32 = compute_crc32_for_image (0, dst_img);
+
+ pixman_image_unref (dst_img);
+ if (mask_fmt == PIXMAN_solid)
+ pixman_image_unref (mask_img);
+
+ return crc32;
+}
+
+#if BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM_FUZZ 0x6B56F607
+#define CHECKSUM_EXACT 0xA669F4A3
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM_FUZZ 0x83119ED0
+#define CHECKSUM_EXACT 0x0D3382CD
+#else
+#define CHECKSUM_FUZZ 0x00000000
+#define CHECKSUM_EXACT 0x00000000
+#endif
+
+int
+main (int argc, const char *argv[])
+{
+ unsigned long page_size;
+
+ page_size = fence_get_page_size ();
+ if (page_size == 0 || page_size > 16 * 1024)
+ return 77; /* automake SKIP */
+
+ exact = getenv ("EXACT") != NULL;
+ if (exact)
+ printf ("Doing plots that are exactly aligned to boundaries\n");
+
+ return fuzzer_test_main ("cover", 2000000,
+ exact ? CHECKSUM_EXACT : CHECKSUM_FUZZ,
+ test_cover, argc, argv);
+}
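
The alignment arithmetic in calc_translate () encodes the bilinear COVER
condition: every destination sample centre, pushed through the transform, must
land within [0.5, src_size - 0.5] source pixels (for NEAREST the code above
uses pixman_fixed_e and src_size as the limits instead). A small sketch of the
low-aligned bilinear case in plain doubles rather than 16.16 fixed point --
the scale and sizes are illustrative:

    #include <assert.h>
    #include <stdio.h>

    int
    main (void)
    {
        double scale    = 0.75;   /* source pixels per destination pixel */
        int    dst_size = 100;
        int    src_size = 80;

        /* Low alignment: make the first destination sample centre (0.5)
         * map to exactly 0.5 in source space. */
        double translate = 0.5 - 0.5 * scale;

        double first = 0.5 * scale + translate;
        double last  = (dst_size - 0.5) * scale + translate;

        printf ("first sample -> %.4f, last sample -> %.4f\n", first, last);

        assert (first >= 0.5);             /* exactly on the low limit */
        assert (last <= src_size - 0.5);   /* inside on the high side */
        return 0;
    }
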
diff --git a/test/fence-image-self-test.c b/test/fence-image-self-test.c
new file mode 100644
index 0000000..2eb82ce
--- /dev/null
+++ b/test/fence-image-self-test.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright © 2015 Raspberry Pi Foundation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <pixman-config.h>
+#endif
+
+#include "utils.h"
+
+
+#if FENCE_MALLOC_ACTIVE && defined (HAVE_SIGACTION)
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+pixman_bool_t verbose;
+
+static void
+segv_handler (int sig, siginfo_t *si, void *unused)
+{
+ _exit (EXIT_SUCCESS);
+}
+
+static void
+die (const char *msg, int err)
+{
+ if (err)
+ perror (msg);
+ else
+ fprintf (stderr, "%s\n", msg);
+
+ abort ();
+}
+
+static void
+prinfo (const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!verbose)
+ return;
+
+ va_start (ap, fmt);
+ vfprintf (stderr, fmt, ap);
+ va_end (ap);
+}
+
+static void
+do_expect_signal (void (*fn)(void *), void *data)
+{
+ struct sigaction sa;
+
+ sa.sa_flags = SA_SIGINFO;
+ sigemptyset (&sa.sa_mask);
+ sa.sa_sigaction = segv_handler;
+ if (sigaction (SIGSEGV, &sa, NULL) == -1)
+ die ("sigaction failed", errno);
+ if (sigaction (SIGBUS, &sa, NULL) == -1)
+ die ("sigaction failed", errno);
+
+ (*fn)(data);
+
+ _exit (EXIT_FAILURE);
+}
+
+/* Check that calling fn(data) causes a segmentation fault.
+ *
+ * You cannot portably return from a SIGSEGV handler in any way,
+ * so we fork, and do the test in the child process. Child's
+ * exit status will reflect the result. Its SEGV handler causes it
+ * to exit with success; if no signal arrives, it exits with failure.
+ */
+static pixman_bool_t
+expect_signal (void (*fn)(void *), void *data)
+{
+ pid_t pid, wp;
+ int status;
+
+ pid = fork ();
+ if (pid == -1)
+ die ("fork failed", errno);
+
+ if (pid == 0)
+ do_expect_signal (fn, data); /* never returns */
+
+ wp = waitpid (pid, &status, 0);
+ if (wp != pid)
+ die ("waitpid did not work", wp == -1 ? errno : 0);
+
+ if (WIFEXITED (status) && WEXITSTATUS (status) == EXIT_SUCCESS)
+ return TRUE;
+
+ return FALSE;
+}
+
+static void
+read_u8 (void *data)
+{
+ volatile uint8_t *p = data;
+
+ *p;
+}
+
+static pixman_bool_t
+test_read_fault (uint8_t *p, int offset)
+{
+ prinfo ("*(uint8_t *)(%p + %d)", p, offset);
+
+ if (expect_signal (read_u8, p + offset))
+ {
+ prinfo ("\tsignal OK\n");
+
+ return TRUE;
+ }
+
+ prinfo ("\tFAILED\n");
+
+ return FALSE;
+}
+
+static void
+test_read_ok (uint8_t *p, int offset)
+{
+ prinfo ("*(uint8_t *)(%p + %d)", p, offset);
+
+ /* If fails, SEGV. */
+ read_u8 (p + offset);
+
+ prinfo ("\tOK\n");
+}
+
+static pixman_bool_t
+test_read_faults (pixman_image_t *image)
+{
+ pixman_bool_t ok = TRUE;
+ pixman_format_code_t format = pixman_image_get_format (image);
+ int width = pixman_image_get_width (image);
+ int height = pixman_image_get_height (image);
+ int stride = pixman_image_get_stride (image);
+ uint8_t *p = (void *)pixman_image_get_data (image);
+ int row_bytes = width * PIXMAN_FORMAT_BPP (format) / 8;
+
+ prinfo ("%s %dx%d, row %d B, stride %d B:\n",
+ format_name (format), width, height, row_bytes, stride);
+
+ assert (height > 3);
+
+ test_read_ok (p, 0);
+ test_read_ok (p, row_bytes - 1);
+ test_read_ok (p, stride);
+ test_read_ok (p, stride + row_bytes - 1);
+ test_read_ok (p, 2 * stride);
+ test_read_ok (p, 2 * stride + row_bytes - 1);
+ test_read_ok (p, 3 * stride);
+ test_read_ok (p, (height - 1) * stride + row_bytes - 1);
+
+ ok &= test_read_fault (p, -1);
+ ok &= test_read_fault (p, row_bytes);
+ ok &= test_read_fault (p, stride - 1);
+ ok &= test_read_fault (p, stride + row_bytes);
+ ok &= test_read_fault (p, 2 * stride - 1);
+ ok &= test_read_fault (p, 2 * stride + row_bytes);
+ ok &= test_read_fault (p, 3 * stride - 1);
+ ok &= test_read_fault (p, height * stride);
+
+ return ok;
+}
+
+static pixman_bool_t
+test_image_faults (pixman_format_code_t format, int min_width, int height)
+{
+ pixman_bool_t ok;
+ pixman_image_t *image;
+
+ image = fence_image_create_bits (format, min_width, height, TRUE);
+ ok = test_read_faults (image);
+ pixman_image_unref (image);
+
+ return ok;
+}
+
+int
+main (int argc, char **argv)
+{
+ pixman_bool_t ok = TRUE;
+
+ if (getenv ("VERBOSE") != NULL)
+ verbose = TRUE;
+
+ ok &= test_image_faults (PIXMAN_a8r8g8b8, 7, 5);
+ ok &= test_image_faults (PIXMAN_r8g8b8, 7, 5);
+ ok &= test_image_faults (PIXMAN_r5g6b5, 7, 5);
+ ok &= test_image_faults (PIXMAN_a8, 7, 5);
+ ok &= test_image_faults (PIXMAN_a4, 7, 5);
+ ok &= test_image_faults (PIXMAN_a1, 7, 5);
+
+ if (ok)
+ return EXIT_SUCCESS;
+
+ return EXIT_FAILURE;
+}
+
+#else /* FENCE_MALLOC_ACTIVE */
+
+int
+main (int argc, char **argv)
+{
+ /* Automake return code for test SKIP. */
+ return 77;
+}
+
+#endif /* FENCE_MALLOC_ACTIVE */
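
fence_image_create_bits () itself is not part of this diff (it lives in
test/utils.c), but the self-test above only makes sense given the usual
guard-page technique: inaccessible pages on both sides of the pixel buffer
turn any out-of-bounds access into the SIGSEGV/SIGBUS that expect_signal ()
looks for. A standalone sketch of that technique, assuming a POSIX
mmap/mprotect environment; this is an illustration of the idea, not the actual
utils.c implementation:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int
    main (void)
    {
        long page = sysconf (_SC_PAGESIZE);
        size_t len = 3 * page;              /* guard + payload + guard */
        unsigned char *base, *payload;

        /* MAP_ANONYMOUS may need a feature-test macro on some systems. */
        base = mmap (NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED)
            return 1;

        /* Revoke all access to the first and the last page. */
        mprotect (base, page, PROT_NONE);
        mprotect (base + 2 * page, page, PROT_NONE);

        payload = base + page;
        memset (payload, 0xCC, page);       /* in bounds: fine */
        printf ("payload at %p is usable\n", (void *) payload);

        /* payload[-1] or payload[page] would fault right here. */

        munmap (base, len);
        return 0;
    }
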
diff --git a/test/filter-reduction-test.c b/test/filter-reduction-test.c
new file mode 100644
index 0000000..705fa4b
--- /dev/null
+++ b/test/filter-reduction-test.c
@@ -0,0 +1,112 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+static const pixman_fixed_t entries[] =
+{
+ pixman_double_to_fixed (-1.0),
+ pixman_double_to_fixed (-0.5),
+ pixman_double_to_fixed (-1/3.0),
+ pixman_double_to_fixed (0.0),
+ pixman_double_to_fixed (0.5),
+ pixman_double_to_fixed (1.0),
+ pixman_double_to_fixed (1.5),
+ pixman_double_to_fixed (2.0),
+ pixman_double_to_fixed (3.0),
+};
+
+#define SIZE 12
+
+static uint32_t
+test_scale (const pixman_transform_t *xform, uint32_t crc)
+{
+ uint32_t *srcbuf, *dstbuf;
+ pixman_image_t *src, *dest;
+
+ srcbuf = malloc (SIZE * SIZE * 4);
+ prng_randmemset (srcbuf, SIZE * SIZE * 4, 0);
+ src = pixman_image_create_bits (
+ PIXMAN_a8r8g8b8, SIZE, SIZE, srcbuf, SIZE * 4);
+
+ dstbuf = malloc (SIZE * SIZE * 4);
+ prng_randmemset (dstbuf, SIZE * SIZE * 4, 0);
+ dest = pixman_image_create_bits (
+ PIXMAN_a8r8g8b8, SIZE, SIZE, dstbuf, SIZE * 4);
+
+ pixman_image_set_transform (src, xform);
+ pixman_image_set_repeat (src, PIXMAN_REPEAT_NORMAL);
+ pixman_image_set_filter (src, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+ image_endian_swap (src);
+ image_endian_swap (dest);
+
+ pixman_image_composite (PIXMAN_OP_SRC,
+ src, NULL, dest,
+ 0, 0, 0, 0, 0, 0,
+ SIZE, SIZE);
+
+ crc = compute_crc32_for_image (crc, dest);
+
+ pixman_image_unref (src);
+ pixman_image_unref (dest);
+
+ free (srcbuf);
+ free (dstbuf);
+
+ return crc;
+}
+
+#if BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0x02169677
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM 0xE44B29AC
+#else
+#define CHECKSUM 0x00000000
+#endif
+
+int
+main (int argc, const char *argv[])
+{
+ const pixman_fixed_t *end = entries + ARRAY_LENGTH (entries);
+ const pixman_fixed_t *t0, *t1, *t2, *t3, *t4, *t5;
+ uint32_t crc = 0;
+
+ prng_srand (0x56EA1DBD);
+
+ for (t0 = entries; t0 < end; ++t0)
+ {
+ for (t1 = entries; t1 < end; ++t1)
+ {
+ for (t2 = entries; t2 < end; ++t2)
+ {
+ for (t3 = entries; t3 < end; ++t3)
+ {
+ for (t4 = entries; t4 < end; ++t4)
+ {
+ for (t5 = entries; t5 < end; ++t5)
+ {
+ pixman_transform_t xform = {
+ { { *t0, *t1, *t2 },
+ { *t3, *t4, *t5 },
+ { 0, 0, pixman_fixed_1 } }
+ };
+
+ crc = test_scale (&xform, crc);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (crc != CHECKSUM)
+ {
+ printf ("filter-reduction-test failed! (checksum=0x%08X, expected 0x%08X)\n", crc, CHECKSUM);
+ return 1;
+ }
+ else
+ {
+ printf ("filter-reduction-test passed (checksum=0x%08X)\n", crc);
+ return 0;
+ }
+}
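
The two expected checksums exist because pixman quantises bilinear weights to
BILINEAR_INTERPOLATION_BITS bits, so a 7-bit build and a 4-bit build compute
different (but each internally consistent) pixels from identical inputs. A
one-dimensional sketch of that quantisation -- lerp_channel is illustrative,
not pixman's actual fetcher:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t
    lerp_channel (uint8_t a, uint8_t b, uint32_t frac16, int interp_bits)
    {
        /* Reduce the 16-bit fractional position to interp_bits of weight. */
        uint32_t w   = frac16 >> (16 - interp_bits);
        uint32_t max = 1u << interp_bits;

        return (a * (max - w) + b * w) >> interp_bits;
    }

    int
    main (void)
    {
        /* Same sample position, different weight precision => different
         * result, hence one reference CRC per precision. */
        printf ("7-bit: %u\n", lerp_channel (10, 200, 0x3333, 7));  /* 47 */
        printf ("4-bit: %u\n", lerp_channel (10, 200, 0x3333, 4));  /* 45 */
        return 0;
    }
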
diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
index 3da094a..7ba2986 100644
--- a/test/lowlevel-blt-bench.c
+++ b/test/lowlevel-blt-bench.c
@@ -55,7 +55,7 @@ uint32_t *dst;
uint32_t *src;
uint32_t *mask;
-double bandwidth = 0;
+double bandwidth = 0.0;
double
bench_memcpy ()
@@ -90,6 +90,7 @@ bench_memcpy ()
static pixman_bool_t use_scaling = FALSE;
static pixman_filter_t filter = PIXMAN_FILTER_NEAREST;
+static pixman_bool_t use_csv_output = FALSE;
/* nearly 1x scale factor */
static pixman_transform_t m =
@@ -165,7 +166,7 @@ call_func (pixman_composite_func_t func,
func (0, &info);
}
-void
+double
noinline
bench_L (pixman_op_t op,
pixman_image_t * src_img,
@@ -179,7 +180,6 @@ bench_L (pixman_op_t op,
int64_t i, j, k;
int x = 0;
int q = 0;
- volatile int qx;
for (i = 0; i < n; i++)
{
@@ -203,10 +203,11 @@ bench_L (pixman_op_t op,
x = 0;
call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 63 - x, 0, width, lines_count);
}
- qx = q;
+
+ return (double)n * lines_count * width;
}
-void
+double
noinline
bench_M (pixman_op_t op,
pixman_image_t * src_img,
@@ -224,6 +225,8 @@ bench_M (pixman_op_t op,
x = 0;
call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 1, 0, WIDTH - 64, HEIGHT);
}
+
+ return (double)n * (WIDTH - 64) * HEIGHT;
}
double
@@ -366,15 +369,24 @@ bench_RT (pixman_op_t op,
return pix_cnt;
}
+static double
+Mpx_per_sec (double pix_cnt, double t1, double t2, double t3)
+{
+ double overhead = t2 - t1;
+ double testtime = t3 - t2;
+
+ return pix_cnt / (testtime - overhead) / 1e6;
+}
+
void
-bench_composite (char * testname,
- int src_fmt,
- int src_flags,
- int op,
- int mask_fmt,
- int mask_flags,
- int dst_fmt,
- double npix)
+bench_composite (const char *testname,
+ int src_fmt,
+ int src_flags,
+ int op,
+ int mask_fmt,
+ int mask_flags,
+ int dst_fmt,
+ double npix)
{
pixman_image_t * src_img;
pixman_image_t * dst_img;
@@ -461,9 +473,9 @@ bench_composite (char * testname,
dst,
XWIDTH * 4);
-
- printf ("%24s %c", testname, func != pixman_image_composite_wrapper ?
- '-' : '=');
+ if (!use_csv_output)
+ printf ("%24s %c", testname, func != pixman_image_composite_wrapper ?
+ '-' : '=');
memcpy (dst, src, BUFSIZE);
memcpy (src, dst, BUFSIZE);
@@ -476,13 +488,15 @@ bench_composite (char * testname,
n = 1 + npix / (l1test_width * 8);
t1 = gettime ();
#if EXCLUDE_OVERHEAD
- bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, 1);
+ pix_cnt = bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, 1);
#endif
t2 = gettime ();
- bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, 1);
+ pix_cnt = bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, 1);
t3 = gettime ();
- printf (" L1:%7.2f", (double)n * l1test_width * 1 /
- ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" L1:%7.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -495,13 +509,15 @@ bench_composite (char * testname,
n = 1 + npix / (l1test_width * nlines);
t1 = gettime ();
#if EXCLUDE_OVERHEAD
- bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, nlines);
+ pix_cnt = bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, nlines);
#endif
t2 = gettime ();
- bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, nlines);
+ pix_cnt = bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, nlines);
t3 = gettime ();
- printf (" L2:%7.2f", (double)n * l1test_width * nlines /
- ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" L2:%7.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -510,14 +526,16 @@ bench_composite (char * testname,
n = 1 + npix / (WIDTH * HEIGHT);
t1 = gettime ();
#if EXCLUDE_OVERHEAD
- bench_M (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+ pix_cnt = bench_M (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
#endif
t2 = gettime ();
- bench_M (op, src_img, mask_img, dst_img, n, func);
+ pix_cnt = bench_M (op, src_img, mask_img, dst_img, n, func);
t3 = gettime ();
- printf (" M:%6.2f (%6.2f%%)",
- ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1))) / 1000000.,
- ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1)) * bytes_per_pix) * (100.0 / bandwidth) );
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" M:%6.2f (%6.2f%%)", Mpx_per_sec (pix_cnt, t1, t2, t3),
+ (pix_cnt / ((t3 - t2) - (t2 - t1)) * bytes_per_pix) * (100.0 / bandwidth) );
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -531,7 +549,10 @@ bench_composite (char * testname,
t2 = gettime ();
pix_cnt = bench_HT (op, src_img, mask_img, dst_img, n, func);
t3 = gettime ();
- printf (" HT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" HT:%6.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -545,7 +566,10 @@ bench_composite (char * testname,
t2 = gettime ();
pix_cnt = bench_VT (op, src_img, mask_img, dst_img, n, func);
t3 = gettime ();
- printf (" VT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" VT:%6.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -559,7 +583,10 @@ bench_composite (char * testname,
t2 = gettime ();
pix_cnt = bench_R (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
t3 = gettime ();
- printf (" R:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+ if (use_csv_output)
+ printf ("%g,", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" R:%6.2f", Mpx_per_sec (pix_cnt, t1, t2, t3));
fflush (stdout);
memcpy (dst, src, BUFSIZE);
@@ -573,7 +600,10 @@ bench_composite (char * testname,
t2 = gettime ();
pix_cnt = bench_RT (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
t3 = gettime ();
- printf (" RT:%6.2f (%4.0fKops/s)\n", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000., (double) n / ((t3 - t2) * 1000));
+ if (use_csv_output)
+ printf ("%g\n", Mpx_per_sec (pix_cnt, t1, t2, t3));
+ else
+ printf (" RT:%6.2f (%4.0fKops/s)\n", Mpx_per_sec (pix_cnt, t1, t2, t3), (double) n / ((t3 - t2) * 1000));
if (mask_img) {
pixman_image_unref (mask_img);
@@ -587,17 +617,20 @@ bench_composite (char * testname,
#define PIXMAN_OP_OUT_REV (PIXMAN_OP_OUT_REVERSE)
-struct
+struct test_entry
{
- char *testname;
- int src_fmt;
- int src_flags;
- int op;
- int mask_fmt;
- int mask_flags;
- int dst_fmt;
-}
-tests_tbl[] =
+ const char *testname;
+ int src_fmt;
+ int src_flags;
+ int op;
+ int mask_fmt;
+ int mask_flags;
+ int dst_fmt;
+};
+
+typedef struct test_entry test_entry_t;
+
+static const test_entry_t tests_tbl[] =
{
{ "add_8_8_8", PIXMAN_a8, 0, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 },
{ "add_n_8_8", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_ADD, PIXMAN_a8, 0, PIXMAN_a8 },
@@ -719,46 +752,286 @@ tests_tbl[] =
{ "rpixbuf", PIXMAN_x8b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, 0, PIXMAN_a8b8g8r8 },
};
-int
-main (int argc, char *argv[])
+static const test_entry_t special_patterns[] =
+{
+ { "add_n_2x10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_x2r10g10b10 },
+ { "add_n_2a10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD, PIXMAN_null, 0, PIXMAN_a2r10g10b10 },
+ { "src_n_2x10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_x2r10g10b10 },
+ { "src_n_2a10", PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_a2r10g10b10 },
+ { "src_0888_8888_rev", PIXMAN_b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_x8r8g8b8 },
+ { "src_0888_0565_rev", PIXMAN_b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_r5g6b5 },
+ { "src_n_8", PIXMAN_a8, 1, PIXMAN_OP_SRC, PIXMAN_null, 0, PIXMAN_a8 },
+ { "pixbuf", PIXMAN_x8b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, 0, PIXMAN_a8r8g8b8 },
+ { "rpixbuf", PIXMAN_x8b8g8r8, 0, PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, 0, PIXMAN_a8b8g8r8 },
+};
+
+/* Returns the sub-string's end pointer in string. */
+static const char *
+copy_sub_string (char *buf,
+ const char *string,
+ const char *scan_from,
+ const char *end)
{
- double x;
+ const char *delim;
+ size_t n;
+
+ delim = strchr (scan_from, '_');
+ if (!delim)
+ delim = end;
+
+ n = delim - string;
+ strncpy (buf, string, n);
+ buf[n] = '\0';
+
+ return delim;
+}
+
+static pixman_op_t
+parse_longest_operator (char *buf, const char **strp, const char *end)
+{
+ const char *p = *strp;
+ const char *sub_end;
+ const char *best_end = p;
+ pixman_op_t best_op = PIXMAN_OP_NONE;
+ pixman_op_t op;
+
+ while (p < end)
+ {
+ sub_end = copy_sub_string (buf, *strp, p, end);
+ op = operator_from_string (buf);
+ p = sub_end + 1;
+
+ if (op != PIXMAN_OP_NONE)
+ {
+ best_end = p;
+ best_op = op;
+ }
+ }
+
+ *strp = best_end;
+ return best_op;
+}
+
+static pixman_format_code_t
+parse_format (char *buf, const char **p, const char *end)
+{
+ pixman_format_code_t format;
+ const char *delim;
+
+ if (*p >= end)
+ return PIXMAN_null;
+
+ delim = copy_sub_string (buf, *p, *p, end);
+ format = format_from_string (buf);
+
+ if (format != PIXMAN_null)
+ *p = delim + 1;
+
+ return format;
+}
+
+static int
+parse_test_pattern (test_entry_t *test, const char *pattern)
+{
+ const char *p = pattern;
+ const char *end = pattern + strlen (pattern);
+ char buf[1024];
+ pixman_format_code_t format[3];
int i;
- const char *pattern = NULL;
- for (i = 1; i < argc; i++)
+
+ if (strlen (pattern) > sizeof (buf) - 1)
+ return -1;
+
+ /* Special-case names that the generic parser cannot handle. */
+ for (i = 0; i < ARRAY_LENGTH (special_patterns); i++)
{
- if (argv[i][0] == '-')
- {
- if (strchr (argv[i] + 1, 'b'))
- {
- use_scaling = TRUE;
- filter = PIXMAN_FILTER_BILINEAR;
- }
- else if (strchr (argv[i] + 1, 'n'))
- {
- use_scaling = TRUE;
- filter = PIXMAN_FILTER_NEAREST;
- }
- }
- else
- {
- pattern = argv[i];
- }
+ if (strcmp (pattern, special_patterns[i].testname) == 0)
+ {
+ *test = special_patterns[i];
+ return 0;
+ }
}
- if (!pattern)
+ test->testname = pattern;
+
+ /* Extract the operator. Operator names may themselves contain
+ * the '_' delimiter, so take the longest matching sub-string.
+ */
+ test->op = parse_longest_operator (buf, &p, end);
+ if (test->op == PIXMAN_OP_NONE)
+ return -1;
+
+ /* extract up to three pixel formats */
+ format[0] = parse_format (buf, &p, end);
+ format[1] = parse_format (buf, &p, end);
+ format[2] = parse_format (buf, &p, end);
+
+ if (format[0] == PIXMAN_null || format[1] == PIXMAN_null)
+ return -1;
+
+ /* recognize CA flag */
+ test->mask_flags = 0;
+ if (p < end)
{
- printf ("Usage: lowlevel-blt-bench [-b] [-n] pattern\n");
- printf (" -n : benchmark nearest scaling\n");
- printf (" -b : benchmark bilinear scaling\n");
- return 1;
+ if (strcmp (p, "ca") == 0)
+ test->mask_flags |= CA_FLAG;
+ else
+ return -1; /* trailing garbage */
}
- src = aligned_malloc (4096, BUFSIZE * 3);
- memset (src, 0xCC, BUFSIZE * 3);
- dst = src + (BUFSIZE / 4);
- mask = dst + (BUFSIZE / 4);
+ test->src_fmt = format[0];
+ if (format[2] == PIXMAN_null)
+ {
+ test->mask_fmt = PIXMAN_null;
+ test->dst_fmt = format[1];
+ }
+ else
+ {
+ test->mask_fmt = format[1];
+ test->dst_fmt = format[2];
+ }
+
+ test->src_flags = 0;
+ if (test->src_fmt == PIXMAN_solid)
+ {
+ test->src_fmt = PIXMAN_a8r8g8b8;
+ test->src_flags |= SOLID_FLAG;
+ }
+
+ if (test->mask_fmt == PIXMAN_solid)
+ {
+ if (test->mask_flags & CA_FLAG)
+ test->mask_fmt = PIXMAN_a8r8g8b8;
+ else
+ test->mask_fmt = PIXMAN_a8;
+
+ test->mask_flags |= SOLID_FLAG;
+ }
+
+ return 0;
+}
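
A hypothetical driver for the parser, using the same names that tests_tbl declares (parser_self_test () below verifies this mapping for every table entry):

/* Illustrative only: "add_8_8_8" resolves to PIXMAN_OP_ADD with
 * a8 source, mask and destination, matching its tests_tbl row.
 */
static void
demo_parse (void)
{
    test_entry_t t;

    if (parse_test_pattern (&t, "add_8_8_8") == 0)
        print_test_details (&t);
}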
+
+static int
+check_int (int got, int expected, const char *name, const char *field)
+{
+ if (got == expected)
+ return 0;
+
+ printf ("%s: %s failure: expected %d, got %d.\n",
+ name, field, expected, got);
+
+ return 1;
+}
+
+static int
+check_format (int got, int expected, const char *name, const char *field)
+{
+ if (got == expected)
+ return 0;
+
+ printf ("%s: %s failure: expected %s (%#x), got %s (%#x).\n",
+ name, field,
+ format_name (expected), expected,
+ format_name (got), got);
+
+ return 1;
+}
+
+static void
+parser_self_test (void)
+{
+ const test_entry_t *ent;
+ test_entry_t test;
+ int fails = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (tests_tbl); i++)
+ {
+ ent = &tests_tbl[i];
+
+ if (parse_test_pattern (&test, ent->testname) < 0)
+ {
+ printf ("parsing failed for '%s'\n", ent->testname);
+ fails++;
+ continue;
+ }
+
+ fails += check_format (test.src_fmt, ent->src_fmt,
+ ent->testname, "src_fmt");
+ fails += check_format (test.mask_fmt, ent->mask_fmt,
+ ent->testname, "mask_fmt");
+ fails += check_format (test.dst_fmt, ent->dst_fmt,
+ ent->testname, "dst_fmt");
+ fails += check_int (test.src_flags, ent->src_flags,
+ ent->testname, "src_flags");
+ fails += check_int (test.mask_flags, ent->mask_flags,
+ ent->testname, "mask_flags");
+ fails += check_int (test.op, ent->op, ent->testname, "op");
+ }
+
+ if (fails)
+ {
+ printf ("Parser self-test failed.\n");
+ exit (EXIT_FAILURE);
+ }
+
+ if (!use_csv_output)
+ printf ("Parser self-test complete.\n");
+}
+
+static void
+print_test_details (const test_entry_t *test)
+{
+ printf ("%s: %s, src %s%s, mask %s%s%s, dst %s\n",
+ test->testname,
+ operator_name (test->op),
+ format_name (test->src_fmt),
+ test->src_flags & SOLID_FLAG ? " solid" : "",
+ format_name (test->mask_fmt),
+ test->mask_flags & SOLID_FLAG ? " solid" : "",
+ test->mask_flags & CA_FLAG ? " CA" : "",
+ format_name (test->dst_fmt));
+}
+
+static void
+run_one_test (const char *pattern, double bandwidth_, pixman_bool_t prdetails)
+{
+ test_entry_t test;
+
+ if (parse_test_pattern (&test, pattern) < 0)
+ {
+ printf ("Error: Could not parse the test pattern '%s'.\n", pattern);
+ return;
+ }
+
+ if (prdetails)
+ {
+ print_test_details (&test);
+ printf ("---\n");
+ }
+
+ bench_composite (pattern,
+ test.src_fmt,
+ test.src_flags,
+ test.op,
+ test.mask_fmt,
+ test.mask_flags,
+ test.dst_fmt,
+ bandwidth_ / 8);
+}
+
+static void
+run_default_tests (double bandwidth_)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (tests_tbl); i++)
+ run_one_test (tests_tbl[i].testname, bandwidth_, FALSE);
+}
+
+static void
+print_explanation (void)
+{
printf ("Benchmark for a set of most commonly used functions\n");
printf ("---\n");
printf ("All results are presented in millions of pixels per second\n");
@@ -786,9 +1059,14 @@ main (int argc, char *argv[])
printf ("RT - as R, but %dx%d average sized rectangles are copied\n",
TINYWIDTH, TINYWIDTH);
printf ("---\n");
- bandwidth = x = bench_memcpy ();
+}
+
+static void
+print_speed_scaling (double bw)
+{
printf ("reference memcpy speed = %.1fMB/s (%.1fMP/s for 32bpp fills)\n",
- x / 1000000., x / 4000000);
+ bw / 1000000., bw / 4000000);
+
if (use_scaling)
{
printf ("---\n");
@@ -799,23 +1077,85 @@ main (int argc, char *argv[])
else
printf ("UNKNOWN scaling\n");
}
+
printf ("---\n");
+}
- for (i = 0; i < ARRAY_LENGTH (tests_tbl); i++)
+static void
+usage (const char *progname)
+{
+ printf ("Usage: %s [-b] [-n] [-c] [-m M] pattern\n", progname);
+ printf (" -n : benchmark nearest scaling\n");
+ printf (" -b : benchmark bilinear scaling\n");
+ printf (" -c : print output as CSV data\n");
+ printf (" -m M : set reference memcpy speed to M MB/s instead of measuring it\n");
+}
+
+int
+main (int argc, char *argv[])
+{
+ int i;
+ const char *pattern = NULL;
+
+ for (i = 1; i < argc; i++)
{
- if (strcmp (pattern, "all") == 0 || strcmp (tests_tbl[i].testname, pattern) == 0)
+ if (argv[i][0] == '-')
{
- bench_composite (tests_tbl[i].testname,
- tests_tbl[i].src_fmt,
- tests_tbl[i].src_flags,
- tests_tbl[i].op,
- tests_tbl[i].mask_fmt,
- tests_tbl[i].mask_flags,
- tests_tbl[i].dst_fmt,
- bandwidth/8);
+ if (strchr (argv[i] + 1, 'b'))
+ {
+ use_scaling = TRUE;
+ filter = PIXMAN_FILTER_BILINEAR;
+ }
+ else if (strchr (argv[i] + 1, 'n'))
+ {
+ use_scaling = TRUE;
+ filter = PIXMAN_FILTER_NEAREST;
+ }
+
+ if (strchr (argv[i] + 1, 'c'))
+ use_csv_output = TRUE;
+
+ if (strcmp (argv[i], "-m") == 0 && i + 1 < argc)
+ bandwidth = atof (argv[++i]) * 1e6;
}
+ else
+ {
+ if (pattern)
+ {
+ pattern = NULL;
+ printf ("Error: extra arguments given.\n");
+ break;
+ }
+ pattern = argv[i];
+ }
+ }
+
+ if (!pattern)
+ {
+ usage (argv[0]);
+ return 1;
}
+ parser_self_test ();
+
+ src = aligned_malloc (4096, BUFSIZE * 3);
+ memset (src, 0xCC, BUFSIZE * 3);
+ dst = src + (BUFSIZE / 4);
+ mask = dst + (BUFSIZE / 4);
+
+ if (!use_csv_output)
+ print_explanation ();
+
+ if (bandwidth < 1.0)
+ bandwidth = bench_memcpy ();
+ if (!use_csv_output)
+ print_speed_scaling (bandwidth);
+
+ if (strcmp (pattern, "all") == 0)
+ run_default_tests (bandwidth);
+ else
+ run_one_test (pattern, bandwidth, !use_csv_output);
+
free (src);
return 0;
}
diff --git a/test/matrix-test.c b/test/matrix-test.c
index 0a5f203..cd8820c 100644
--- a/test/matrix-test.c
+++ b/test/matrix-test.c
@@ -201,8 +201,8 @@ test_matrix (int testnum, int verbose)
{
for (j = 0; j < 3; j++)
{
- double diff = fabs (result_f.v[j] -
- pixman_fixed_to_float128 (result_i.v[j]));
+ double diff = fabsl (result_f.v[j] -
+ pixman_fixed_to_float128 (result_i.v[j]));
if (is_affine && diff > (0.51 / 65536.0))
{
diff --git a/test/meson.build b/test/meson.build
new file mode 100644
index 0000000..47dd33c
--- /dev/null
+++ b/test/meson.build
@@ -0,0 +1,90 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+tests = [
+ 'oob-test',
+ 'infinite-loop',
+ 'trap-crasher',
+ 'fence-image-self-test',
+ 'region-translate-test',
+ 'fetch-test',
+ 'a1-trap-test',
+ 'prng-test',
+ 'radial-invalid',
+ 'pdf-op-test',
+ 'region-test',
+ 'combiner-test',
+ 'scaling-crash-test',
+ 'alpha-loop',
+ 'scaling-helpers-test',
+ 'rotate-test',
+ 'alphamap',
+ 'gradient-crash-test',
+ 'pixel-test',
+ 'matrix-test',
+ 'filter-reduction-test',
+ 'composite-traps-test',
+ 'region-contains-test',
+ 'glyph-test',
+ 'solid-test',
+ 'stress-test',
+ 'cover-test',
+ 'blitters-test',
+ 'affine-test',
+ 'scaling-test',
+ 'composite',
+ 'tolerance-test',
+]
+
+# Remove/update this once thread-test.c supports threading methods
+# other than PThreads and Windows threads
+if pthreads_found or host_machine.system() == 'windows'
+ tests += 'thread-test'
+endif
+
+progs = [
+ 'lowlevel-blt-bench',
+ 'radial-perf-test',
+ 'check-formats',
+ 'scaling-bench',
+ 'affine-bench',
+]
+
+foreach t : tests
+ test(
+ t,
+ executable(
+ t,
+ [t + '.c', config_h],
+ dependencies : [idep_pixman, libtestutils_dep, dep_threads, dep_openmp, dep_png],
+ ),
+ timeout : 120,
+ is_parallel : true,
+ )
+endforeach
+
+foreach p : progs
+ executable(
+ p,
+ p + '.c',
+ dependencies : [idep_pixman, libtestutils_dep, dep_openmp],
+ )
+endforeach
+
diff --git a/test/scaling-test.c b/test/scaling-test.c
index e2f7fa9..0ece611 100644
--- a/test/scaling-test.c
+++ b/test/scaling-test.c
@@ -73,7 +73,7 @@ test_composite (int testnum,
pixman_op_t op;
pixman_repeat_t repeat = PIXMAN_REPEAT_NONE;
pixman_repeat_t mask_repeat = PIXMAN_REPEAT_NONE;
- pixman_format_code_t src_fmt, dst_fmt;
+ pixman_format_code_t src_fmt, mask_fmt, dst_fmt;
uint32_t * srcbuf;
uint32_t * dstbuf;
uint32_t * maskbuf;
@@ -145,6 +145,7 @@ test_composite (int testnum,
prng_randmemset (dstbuf, dst_stride * dst_height, 0);
src_fmt = get_format (src_bpp);
+ mask_fmt = PIXMAN_a8;
dst_fmt = get_format (dst_bpp);
if (prng_rand_n (2))
@@ -169,7 +170,7 @@ test_composite (int testnum,
src_fmt, src_width, src_height, srcbuf, src_stride);
mask_img = pixman_image_create_bits (
- PIXMAN_a8, mask_width, mask_height, maskbuf, mask_stride);
+ mask_fmt, mask_width, mask_height, maskbuf, mask_stride);
dst_img = pixman_image_create_bits (
dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
@@ -255,21 +256,6 @@ test_composite (int testnum,
else
pixman_image_set_filter (mask_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
- if (verbose)
- {
- printf ("src_fmt=%s, dst_fmt=%s\n",
- format_name (src_fmt), format_name (dst_fmt));
- printf ("op=%s, scale_x=%d, scale_y=%d, repeat=%d\n",
- operator_name (op), scale_x, scale_y, repeat);
- printf ("translate_x=%d, translate_y=%d\n",
- translate_x, translate_y);
- printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
- src_width, src_height, dst_width, dst_height);
- printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
- src_x, src_y, dst_x, dst_y);
- printf ("w=%d, h=%d\n", w, h);
- }
-
if (prng_rand_n (8) == 0)
{
pixman_box16_t clip_boxes[2];
@@ -352,10 +338,45 @@ test_composite (int testnum,
}
if (prng_rand_n (2) == 0)
- pixman_image_composite (op, src_img, NULL, dst_img,
- src_x, src_y, 0, 0, dst_x, dst_y, w, h);
- else
- pixman_image_composite (op, src_img, mask_img, dst_img,
+ {
+ mask_fmt = PIXMAN_null;
+ pixman_image_unref (mask_img);
+ mask_img = NULL;
+ mask_x = 0;
+ mask_y = 0;
+ }
+
+ if (verbose)
+ {
+ printf ("op=%s, src_fmt=%s, mask_fmt=%s, dst_fmt=%s\n",
+ operator_name (op), format_name (src_fmt),
+ format_name (mask_fmt), format_name (dst_fmt));
+ printf ("scale_x=%d, scale_y=%d, repeat=%d, filter=%d\n",
+ scale_x, scale_y, repeat, src_img->common.filter);
+ printf ("translate_x=%d, translate_y=%d\n",
+ translate_x, translate_y);
+ if (mask_fmt != PIXMAN_null)
+ {
+ printf ("mask_scale_x=%d, mask_scale_y=%d, "
+ "mask_repeat=%d, mask_filter=%d\n",
+ mask_scale_x, mask_scale_y, mask_repeat,
+ mask_img->common.filter);
+ printf ("mask_translate_x=%d, mask_translate_y=%d\n",
+ mask_translate_x, mask_translate_y);
+ }
+ printf ("src_width=%d, src_height=%d, src_x=%d, src_y=%d\n",
+ src_width, src_height, src_x, src_y);
+ if (mask_fmt != PIXMAN_null)
+ {
+ printf ("mask_width=%d, mask_height=%d, mask_x=%d, mask_y=%d\n",
+ mask_width, mask_height, mask_x, mask_y);
+ }
+ printf ("dst_width=%d, dst_height=%d, dst_x=%d, dst_y=%d\n",
+ dst_width, dst_height, dst_x, dst_y);
+ printf ("w=%d, h=%d\n", w, h);
+ }
+
+ pixman_image_composite (op, src_img, mask_img, dst_img,
src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
crc32 = compute_crc32_for_image (0, dst_img);
@@ -364,7 +385,8 @@ test_composite (int testnum,
print_image (dst_img);
pixman_image_unref (src_img);
- pixman_image_unref (mask_img);
+ if (mask_img != NULL)
+ pixman_image_unref (mask_img);
pixman_image_unref (dst_img);
if (src_stride < 0)
diff --git a/test/solid-test.c b/test/solid-test.c
new file mode 100644
index 0000000..b118d37
--- /dev/null
+++ b/test/solid-test.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright © 2015 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The copyright holders make no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
+#include "utils.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#define WIDTH 32
+#define HEIGHT 32
+
+static const pixman_op_t op_list[] = {
+ PIXMAN_OP_SRC,
+ PIXMAN_OP_OVER,
+ PIXMAN_OP_ADD,
+ PIXMAN_OP_CLEAR,
+ PIXMAN_OP_SRC,
+ PIXMAN_OP_DST,
+ PIXMAN_OP_OVER,
+ PIXMAN_OP_OVER_REVERSE,
+ PIXMAN_OP_IN,
+ PIXMAN_OP_IN_REVERSE,
+ PIXMAN_OP_OUT,
+ PIXMAN_OP_OUT_REVERSE,
+ PIXMAN_OP_ATOP,
+ PIXMAN_OP_ATOP_REVERSE,
+ PIXMAN_OP_XOR,
+ PIXMAN_OP_ADD,
+ PIXMAN_OP_MULTIPLY,
+ PIXMAN_OP_SCREEN,
+ PIXMAN_OP_OVERLAY,
+ PIXMAN_OP_DARKEN,
+ PIXMAN_OP_LIGHTEN,
+ PIXMAN_OP_HARD_LIGHT,
+ PIXMAN_OP_DIFFERENCE,
+ PIXMAN_OP_EXCLUSION,
+#if 0 /* these use floating-point math and are not always bit-exact across platforms */
+ PIXMAN_OP_SATURATE,
+ PIXMAN_OP_DISJOINT_CLEAR,
+ PIXMAN_OP_DISJOINT_SRC,
+ PIXMAN_OP_DISJOINT_DST,
+ PIXMAN_OP_DISJOINT_OVER,
+ PIXMAN_OP_DISJOINT_OVER_REVERSE,
+ PIXMAN_OP_DISJOINT_IN,
+ PIXMAN_OP_DISJOINT_IN_REVERSE,
+ PIXMAN_OP_DISJOINT_OUT,
+ PIXMAN_OP_DISJOINT_OUT_REVERSE,
+ PIXMAN_OP_DISJOINT_ATOP,
+ PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+ PIXMAN_OP_DISJOINT_XOR,
+ PIXMAN_OP_CONJOINT_CLEAR,
+ PIXMAN_OP_CONJOINT_SRC,
+ PIXMAN_OP_CONJOINT_DST,
+ PIXMAN_OP_CONJOINT_OVER,
+ PIXMAN_OP_CONJOINT_OVER_REVERSE,
+ PIXMAN_OP_CONJOINT_IN,
+ PIXMAN_OP_CONJOINT_IN_REVERSE,
+ PIXMAN_OP_CONJOINT_OUT,
+ PIXMAN_OP_CONJOINT_OUT_REVERSE,
+ PIXMAN_OP_CONJOINT_ATOP,
+ PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+ PIXMAN_OP_CONJOINT_XOR,
+ PIXMAN_OP_COLOR_DODGE,
+ PIXMAN_OP_COLOR_BURN,
+ PIXMAN_OP_SOFT_LIGHT,
+ PIXMAN_OP_HSL_HUE,
+ PIXMAN_OP_HSL_SATURATION,
+ PIXMAN_OP_HSL_COLOR,
+ PIXMAN_OP_HSL_LUMINOSITY,
+#endif
+};
+
+/* The first eight formats in the list are by far the most widely
+ * used, so we test those more often than the others
+ */
+#define N_MOST_LIKELY_FORMATS 8
+
+static const pixman_format_code_t img_fmt_list[] = {
+ PIXMAN_a8r8g8b8,
+ PIXMAN_a8b8g8r8,
+ PIXMAN_x8r8g8b8,
+ PIXMAN_x8b8g8r8,
+ PIXMAN_r5g6b5,
+ PIXMAN_b5g6r5,
+ PIXMAN_a8,
+ PIXMAN_a1,
+ PIXMAN_r3g3b2,
+ PIXMAN_b8g8r8a8,
+ PIXMAN_b8g8r8x8,
+ PIXMAN_r8g8b8a8,
+ PIXMAN_r8g8b8x8,
+ PIXMAN_x14r6g6b6,
+ PIXMAN_r8g8b8,
+ PIXMAN_b8g8r8,
+#if 0 /* These are going to use floating point in the near future */
+ PIXMAN_x2r10g10b10,
+ PIXMAN_a2r10g10b10,
+ PIXMAN_x2b10g10r10,
+ PIXMAN_a2b10g10r10,
+#endif
+ PIXMAN_a1r5g5b5,
+ PIXMAN_x1r5g5b5,
+ PIXMAN_a1b5g5r5,
+ PIXMAN_x1b5g5r5,
+ PIXMAN_a4r4g4b4,
+ PIXMAN_x4r4g4b4,
+ PIXMAN_a4b4g4r4,
+ PIXMAN_x4b4g4r4,
+ PIXMAN_r3g3b2,
+ PIXMAN_b2g3r3,
+ PIXMAN_a2r2g2b2,
+ PIXMAN_a2b2g2r2,
+ PIXMAN_c8,
+ PIXMAN_g8,
+ PIXMAN_x4c4,
+ PIXMAN_x4g4,
+ PIXMAN_c4,
+ PIXMAN_g4,
+ PIXMAN_g1,
+ PIXMAN_x4a4,
+ PIXMAN_a4,
+ PIXMAN_r1g2b1,
+ PIXMAN_b1g2r1,
+ PIXMAN_a1r1g1b1,
+ PIXMAN_a1b1g1r1,
+ PIXMAN_null
+};
+
+static const pixman_format_code_t mask_fmt_list[] = {
+ PIXMAN_a8r8g8b8,
+ PIXMAN_a8,
+ PIXMAN_a4,
+ PIXMAN_a1,
+ PIXMAN_null
+};
+
+static pixman_indexed_t rgb_palette[9];
+static pixman_indexed_t y_palette[9];
+
+static pixman_format_code_t
+random_format (const pixman_format_code_t *allowed_formats)
+{
+ int n = 0;
+
+ while (allowed_formats[n] != PIXMAN_null)
+ n++;
+
+ if (n > N_MOST_LIKELY_FORMATS && prng_rand_n (4) != 0)
+ n = N_MOST_LIKELY_FORMATS;
+
+ return allowed_formats[prng_rand_n (n)];
+}
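
The branch above biases selection towards the common formats: three draws in four (prng_rand_n (4) != 0) restrict the choice to the first eight entries. As a back-of-envelope check, not part of the test:

/* With n > 8 allowed formats, the first eight together are hit
 * with probability 3/4 + (1/4) * (8/n); each remaining format
 * only 1/(4n) of the time.
 */
static double
first_eight_probability (int n)
{
    return 3.0 / 4.0 + (1.0 / 4.0) * (8.0 / (double) n);
}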
+
+static pixman_image_t *
+create_multi_pixel_image (const pixman_format_code_t *allowed_formats,
+ uint32_t *buffer,
+ pixman_format_code_t *used_fmt)
+{
+ pixman_format_code_t fmt;
+ pixman_image_t *img;
+ int stride;
+
+ fmt = random_format (allowed_formats);
+ stride = (WIDTH * PIXMAN_FORMAT_BPP (fmt) + 31) / 32 * 4;
+ img = pixman_image_create_bits (fmt, WIDTH, HEIGHT, buffer, stride);
+
+ if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
+ pixman_image_set_indexed (img, &(rgb_palette[PIXMAN_FORMAT_BPP (fmt)]));
+ else if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
+ pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)]));
+
+ prng_randmemset (buffer, WIDTH * HEIGHT * 4, 0);
+ image_endian_swap (img);
+
+ if (used_fmt)
+ *used_fmt = fmt;
+
+ return img;
+}
+
+static pixman_image_t *
+create_solid_image (const pixman_format_code_t *allowed_formats,
+ uint32_t *buffer,
+ pixman_format_code_t *used_fmt)
+{
+ if (prng_rand_n (2))
+ {
+ /* Use a repeating 1x1 bitmap image for solid */
+ pixman_format_code_t fmt;
+ pixman_image_t *img, *dummy_img;
+ uint32_t bpp, dummy_buf;
+
+ fmt = random_format (allowed_formats);
+ bpp = PIXMAN_FORMAT_BPP (fmt);
+ img = pixman_image_create_bits (fmt, 1, 1, buffer, 4);
+ pixman_image_set_repeat (img, PIXMAN_REPEAT_NORMAL);
+
+ if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
+ pixman_image_set_indexed (img, &(rgb_palette[bpp]));
+ else if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
+ pixman_image_set_indexed (img, &(y_palette[bpp]));
+
+ /* Force the image flags to be computed while the bitmap's initial
+ * contents are 0 or 2^bpp-1, by compositing from it into a separate
+ * throwaway image. Since the stride must be a whole number of
+ * words, it is simplest to write all 0s or all 1s to the entire
+ * first word irrespective of colour depth, even though only the
+ * first pixel actually matters.
+ */
+ *buffer = prng_rand_n (2) ? 0xFFFFFFFFu : 0;
+ dummy_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, 1, 1,
+ &dummy_buf, 4);
+ pixman_image_composite (PIXMAN_OP_SRC, img, NULL, dummy_img,
+ 0, 0, 0, 0, 0, 0, 1, 1);
+ pixman_image_unref (dummy_img);
+
+ /* Now set the bitmap contents to a random value */
+ prng_randmemset (buffer, 4, 0);
+ image_endian_swap (img);
+
+ if (used_fmt)
+ *used_fmt = fmt;
+
+ return img;
+ }
+ else
+ {
+ /* Use a native solid image */
+ pixman_color_t color;
+ pixman_image_t *img;
+
+ color.alpha = prng_rand_n (UINT16_MAX + 1);
+ color.red = prng_rand_n (UINT16_MAX + 1);
+ color.green = prng_rand_n (UINT16_MAX + 1);
+ color.blue = prng_rand_n (UINT16_MAX + 1);
+ img = pixman_image_create_solid_fill (&color);
+
+ if (used_fmt)
+ *used_fmt = PIXMAN_solid;
+
+ return img;
+ }
+}
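
The 1x1 repeating bitmap branch above is the standard trick for exercising solid-colour code paths through the bits-image machinery rather than through pixman_image_create_solid_fill (). Stripped to its core (illustrative helper, a8r8g8b8 only):

static pixman_image_t *
solid_from_pixel (uint32_t *pixel)
{
    pixman_image_t *img;

    /* a 1x1 image with NORMAL repeat acts as a solid source */
    img = pixman_image_create_bits (PIXMAN_a8r8g8b8, 1, 1, pixel, 4);
    pixman_image_set_repeat (img, PIXMAN_REPEAT_NORMAL);

    return img;
}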
+
+static uint32_t
+test_solid (int testnum, int verbose)
+{
+ pixman_op_t op;
+ uint32_t src_buf[WIDTH * HEIGHT];
+ uint32_t dst_buf[WIDTH * HEIGHT];
+ uint32_t mask_buf[WIDTH * HEIGHT];
+ pixman_image_t *src_img;
+ pixman_image_t *dst_img;
+ pixman_image_t *mask_img = NULL;
+ pixman_format_code_t src_fmt, dst_fmt, mask_fmt = PIXMAN_null;
+ pixman_bool_t ca = 0;
+ uint32_t crc32;
+
+ prng_srand (testnum);
+
+ op = op_list[prng_rand_n (ARRAY_LENGTH (op_list))];
+
+ dst_img = create_multi_pixel_image (img_fmt_list, dst_buf, &dst_fmt);
+ switch (prng_rand_n (3))
+ {
+ case 0: /* Solid source, no mask */
+ src_img = create_solid_image (img_fmt_list, src_buf, &src_fmt);
+ break;
+ case 1: /* Solid source, bitmap mask */
+ src_img = create_solid_image (img_fmt_list, src_buf, &src_fmt);
+ mask_img = create_multi_pixel_image (mask_fmt_list, mask_buf, &mask_fmt);
+ break;
+ case 2: /* Bitmap image, solid mask */
+ src_img = create_multi_pixel_image (img_fmt_list, src_buf, &src_fmt);
+ mask_img = create_solid_image (mask_fmt_list, mask_buf, &mask_fmt);
+ break;
+ default:
+ abort ();
+ }
+
+ if (mask_img)
+ {
+ ca = prng_rand_n (2);
+ pixman_image_set_component_alpha (mask_img, ca);
+ }
+
+ if (verbose)
+ {
+ printf ("op=%s\n", operator_name (op));
+ printf ("src_fmt=%s, dst_fmt=%s, mask_fmt=%s\n",
+ format_name (src_fmt), format_name (dst_fmt),
+ format_name (mask_fmt));
+ printf ("src_size=%u, mask_size=%u, component_alpha=%u\n",
+ src_fmt == PIXMAN_solid ? 1 : src_img->bits.width,
+ !mask_img || mask_fmt == PIXMAN_solid ? 1 : mask_img->bits.width,
+ ca);
+ }
+
+ pixman_image_composite (op, src_img, mask_img, dst_img,
+ 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+ if (verbose)
+ print_image (dst_img);
+
+ crc32 = compute_crc32_for_image (0, dst_img);
+
+ pixman_image_unref (src_img);
+ pixman_image_unref (dst_img);
+ if (mask_img)
+ pixman_image_unref (mask_img);
+
+ return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+ int i;
+
+ prng_srand (0);
+
+ for (i = 1; i <= 8; i++)
+ {
+ initialize_palette (&(rgb_palette[i]), i, TRUE);
+ initialize_palette (&(y_palette[i]), i, FALSE);
+ }
+
+ return fuzzer_test_main ("solid", 500000,
+ 0xC30FD380,
+ test_solid, argc, argv);
+}
diff --git a/test/stress-test.c b/test/stress-test.c
index 1f03c75..8ee1896 100644
--- a/test/stress-test.c
+++ b/test/stress-test.c
@@ -11,6 +11,8 @@
static const pixman_format_code_t image_formats[] =
{
+ PIXMAN_rgba_float,
+ PIXMAN_rgb_float,
PIXMAN_a8r8g8b8,
PIXMAN_x8r8g8b8,
PIXMAN_r5g6b5,
@@ -26,6 +28,7 @@ static const pixman_format_code_t image_formats[] =
PIXMAN_r8g8b8,
PIXMAN_b8g8r8,
PIXMAN_a8r8g8b8_sRGB,
+ PIXMAN_r8g8b8_sRGB,
PIXMAN_r5g6b5,
PIXMAN_b5g6r5,
PIXMAN_x2r10g10b10,
@@ -100,6 +103,14 @@ get_size (void)
}
}
+static uint32_t
+real_reader (const void *src, int size);
+
+static void *xor_ptr(const void *ptr)
+{
+ return (void *)(((intptr_t)ptr) ^ (intptr_t)0x8000000080000000);
+}
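
xor_ptr () is an involution: applying it twice restores the original pointer. The scrambled value is stored in the image so that any code path dereferencing bits directly, instead of going through the read/write accessors, faults immediately; the accessors undo the XOR first. A standalone round-trip check (illustrative):

#include <assert.h>
#include <stdint.h>

static void *
scramble (const void *ptr)
{
    /* same constant as xor_ptr () above */
    return (void *)(((intptr_t)ptr) ^ (intptr_t)0x8000000080000000);
}

int
main (void)
{
    int x;

    assert (scramble (scramble (&x)) == (void *)&x);
    return 0;
}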
+
static void
destroy (pixman_image_t *image, void *data)
{
@@ -114,6 +125,9 @@ destroy (pixman_image_t *image, void *data)
if (image->bits.rowstride < 0)
bits -= (- image->bits.rowstride * (image->bits.height - 1));
+ if (image->bits.read_func == real_reader)
+ bits = xor_ptr(bits);
+
fence_free (bits);
}
}
@@ -124,6 +138,7 @@ destroy (pixman_image_t *image, void *data)
static uint32_t
real_reader (const void *src, int size)
{
+ src = xor_ptr(src);
switch (size)
{
case 1:
@@ -141,6 +156,7 @@ real_reader (const void *src, int size)
static void
real_writer (void *src, uint32_t value, int size)
{
+ src = xor_ptr(src);
switch (size)
{
case 1:
@@ -247,9 +263,20 @@ create_random_bits_image (alpha_preference_t alpha_preference)
pixman_filter_t filter;
pixman_fixed_t *coefficients = NULL;
int n_coefficients = 0;
+ int align_add, align_mask;
/* format */
format = random_format (alpha_preference);
+ switch (PIXMAN_FORMAT_BPP (format)) {
+ case 128:
+ align_mask = 15;
+ align_add = align_mask + prng_rand_n (65);
+ break;
+ default:
+ align_mask = 3;
+ align_add = align_mask + prng_rand_n (17);
+ break;
+ }
indexed = NULL;
if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
@@ -291,9 +318,12 @@ create_random_bits_image (alpha_preference_t alpha_preference)
{
default:
case 0:
- stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
- stride = (stride + 3) & (~3);
- bits = (uint32_t *)make_random_bytes (height * stride);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
+ if (format == PIXMAN_rgb_float || format == PIXMAN_rgba_float)
+ bits = (uint32_t *)make_random_floats (height * stride);
+ else
+ bits = (uint32_t *)make_random_bytes (height * stride);
break;
case 1:
@@ -302,8 +332,8 @@ create_random_bits_image (alpha_preference_t alpha_preference)
break;
case 2: /* Zero-filled */
- stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
- stride = (stride + 3) & (~3);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
bits = fence_malloc (height * stride);
if (!bits)
return NULL;
@@ -311,8 +341,8 @@ create_random_bits_image (alpha_preference_t alpha_preference)
break;
case 3: /* Filled with 0xFF */
- stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
- stride = (stride + 3) & (~3);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
bits = fence_malloc (height * stride);
if (!bits)
return NULL;
@@ -320,27 +350,35 @@ create_random_bits_image (alpha_preference_t alpha_preference)
break;
case 4: /* bits is a bad pointer, has read/write functions */
- stride = 232;
- bits = (void *)0x01;
- read_func = fake_reader;
- write_func = fake_writer;
- break;
+ if (PIXMAN_FORMAT_BPP (format) <= 32) {
+ stride = 232;
+ bits = (void *)0x01;
+ read_func = fake_reader;
+ write_func = fake_writer;
+ break;
+ }
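+ /* fall through: formats wider than 32 bpp use real memory below */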
case 5: /* bits is a real pointer, has read/write functions */
- stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
- stride = (stride + 3) & (~3);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
bits = fence_malloc (height * stride);
if (!bits)
return NULL;
memset (bits, 0xff, height * stride);
- read_func = real_reader;
- write_func = real_writer;
+ if (PIXMAN_FORMAT_BPP (format) <= 32) {
+ bits = xor_ptr(bits);
+ read_func = real_reader;
+ write_func = real_writer;
+ }
break;
case 6: /* bits is a real pointer, stride is negative */
- stride = (width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17));
- stride = (stride + 3) & (~3);
- bits = (uint32_t *)make_random_bytes (height * stride);
+ stride = (width * PIXMAN_FORMAT_BPP (format) + 7) / 8;
+ stride = (stride + align_add) & (~align_mask);
+ if (format == PIXMAN_rgb_float || format == PIXMAN_rgba_float)
+ bits = (uint32_t *)make_random_floats (height * stride);
+ else
+ bits = (uint32_t *)make_random_bytes (height * stride);
if (!bits)
return NULL;
bits += ((height - 1) * stride) / 4;
@@ -484,7 +522,7 @@ set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map)
if (image->type == BITS && prng_rand_n (8) != 0)
{
uint32_t width, height;
- int x, y;
+ uint32_t x, y;
int i;
/* Also add a couple of clip rectangles inside the image
diff --git a/test/thread-test.c b/test/thread-test.c
index 1c2f040..12c51e3 100644
--- a/test/thread-test.c
+++ b/test/thread-test.c
@@ -1,23 +1,34 @@
#include "utils.h"
-#ifndef HAVE_PTHREADS
+#if !defined (HAVE_PTHREADS) && !defined (_WIN32)
int main ()
{
- printf ("Skipped thread-test - pthreads not supported\n");
+ printf ("Skipped thread-test - pthreads or Windows Threads not supported\n");
return 0;
}
#else
#include <stdlib.h>
-#include <pthread.h>
+
+#ifdef HAVE_PTHREADS
+# include <pthread.h>
+#elif defined (_WIN32)
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+#endif
+
+#define THREADS 16
typedef struct
{
int thread_no;
uint32_t *dst_buf;
prng_t prng_state;
+#if defined (_WIN32) && !defined (HAVE_PTHREADS)
+ uint32_t crc32;
+#endif
} info_t;
static const pixman_op_t operators[] =
@@ -67,8 +78,13 @@ static const pixman_format_code_t formats[] =
#define DEST_WIDTH (7)
+#ifdef HAVE_PTHREADS
static void *
thread (void *data)
+#elif defined (_WIN32)
+DWORD WINAPI
+thread (LPVOID data)
+#endif
{
info_t *info = data;
uint32_t crc32 = 0x0;
@@ -112,7 +128,12 @@ thread (void *data)
pixman_image_unref (dst_img);
}
+#ifdef HAVE_PTHREADS
return (void *)(uintptr_t)crc32;
+#elif defined (_WIN32)
+ info->crc32 = crc32;
+ return 0;
+#endif
}
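
The two thread entry conventions differ in how the CRC travels back: pthreads can smuggle it through the void * exit value, while a Win32 thread returns a DWORD and therefore stores it in its info_t. The shape, reduced (do_work () is hypothetical):

#ifdef HAVE_PTHREADS
static void *
worker (void *data)
{
    info_t *info = data;
    return (void *)(uintptr_t) do_work (info);  /* via exit value */
}
#elif defined (_WIN32)
DWORD WINAPI
worker (LPVOID data)
{
    info_t *info = data;
    info->crc32 = do_work (info);               /* via struct field */
    return 0;
}
#endif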
static inline uint32_t
@@ -127,26 +148,34 @@ byteswap32 (uint32_t x)
int
main (void)
{
- uint32_t dest[16 * DEST_WIDTH];
- info_t info[16] = { { 0 } };
- pthread_t threads[16];
- void *retvals[16];
- uint32_t crc32s[16], crc32;
+ uint32_t dest[THREADS * DEST_WIDTH];
+ info_t info[THREADS] = { { 0 } };
+
+#ifdef HAVE_PTHREADS
+ pthread_t threads[THREADS];
+ void *retvals[THREADS];
+#elif defined (_WIN32)
+ HANDLE hThreadArray[THREADS];
+ DWORD dwThreadIdArray[THREADS];
+#endif
+
+ uint32_t crc32s[THREADS], crc32;
int i;
- for (i = 0; i < 16; ++i)
+ for (i = 0; i < THREADS; ++i)
{
info[i].thread_no = i;
info[i].dst_buf = &dest[i * DEST_WIDTH];
}
- for (i = 0; i < 16; ++i)
- pthread_create (&threads[i], NULL, thread, &info[i]);
+#ifdef HAVE_PTHREADS
+ for (i = 0; i < THREADS; ++i)
+ pthread_create (&threads[i], NULL, thread, &info[i]);
- for (i = 0; i < 16; ++i)
- pthread_join (threads[i], &retvals[i]);
+ for (i = 0; i < THREADS; ++i)
+ pthread_join (threads[i], &retvals[i]);
- for (i = 0; i < 16; ++i)
+ for (i = 0; i < THREADS; ++i)
{
crc32s[i] = (uintptr_t)retvals[i];
@@ -154,6 +183,36 @@ main (void)
crc32s[i] = byteswap32 (crc32s[i]);
}
+#elif defined (_WIN32)
+ for (i = 0; i < THREADS; ++i)
+ {
+ hThreadArray[i] = CreateThread(NULL,
+ 0,
+ thread,
+ &info[i],
+ 0,
+ &dwThreadIdArray[i]);
+ if (hThreadArray[i] == NULL)
+ {
+ printf ("Windows thread creation failed!\n");
+ return 1;
+ }
+ }
+ for (i = 0; i < THREADS; ++i)
+ {
+ WaitForSingleObject (hThreadArray[i], INFINITE);
+ CloseHandle(hThreadArray[i]);
+ }
+
+ for (i = 0; i < THREADS; ++i)
+ {
+ crc32s[i] = info[i].crc32;
+
+ if (is_little_endian())
+ crc32s[i] = byteswap32 (crc32s[i]);
+ }
+#endif
+
crc32 = compute_crc32 (0, crc32s, sizeof crc32s);
#define EXPECTED 0x82C4D9FB
diff --git a/test/tolerance-test.c b/test/tolerance-test.c
index 320bb7f..3c6e818 100644
--- a/test/tolerance-test.c
+++ b/test/tolerance-test.c
@@ -76,6 +76,12 @@ static const pixman_op_t operators[] =
PIXMAN_OP_EXCLUSION,
};
+static const pixman_dither_t dithers[] =
+{
+ PIXMAN_DITHER_ORDERED_BAYER_8,
+ PIXMAN_DITHER_ORDERED_BLUE_NOISE_64,
+};
+
#define RANDOM_ELT(array) \
(array[prng_rand_n (ARRAY_LENGTH (array))])
@@ -176,7 +182,8 @@ verify (int test_no,
pixman_image_t *orig_dest,
int x, int y,
int width, int height,
- pixman_bool_t component_alpha)
+ pixman_bool_t component_alpha,
+ pixman_dither_t dither)
{
pixel_checker_t dest_checker, src_checker, mask_checker;
int i, j;
@@ -185,6 +192,9 @@ verify (int test_no,
pixel_checker_init (&dest_checker, dest->bits.format);
pixel_checker_init (&mask_checker, mask->bits.format);
+ if (dest->bits.dither != PIXMAN_DITHER_NONE)
+ pixel_checker_allow_dither (&dest_checker);
+
assert (dest->bits.format == orig_dest->bits.format);
for (j = y; j < y + height; ++j)
@@ -220,6 +230,7 @@ verify (int test_no,
printf (" operator: %s (%s alpha)\n", operator_name (op),
component_alpha? "component" : "unified");
+ printf (" dither: %s\n", dither_name (dither));
printf (" dest_x, dest_y: %d %d\n", x, y);
printf (" width, height: %d %d\n", width, height);
printf (" source: format: %-14s size: %2d x %2d\n",
@@ -275,6 +286,7 @@ do_check (int i)
pixman_image_t *dest_copy;
pixman_bool_t result = TRUE;
pixman_bool_t component_alpha;
+ pixman_dither_t dither = PIXMAN_DITHER_NONE;
prng_srand (i);
op = RANDOM_ELT (operators);
@@ -296,6 +308,12 @@ do_check (int i)
if (y + height > dest->bits.height)
height = dest->bits.height - y;
+ if (prng_rand_n (2))
+ {
+ dither = RANDOM_ELT (dithers);
+ pixman_image_set_dither (dest, dither);
+ }
+
component_alpha = prng_rand_n (2);
pixman_image_set_component_alpha (mask, component_alpha);
@@ -305,7 +323,8 @@ do_check (int i)
x, y, width, height);
if (!verify (i, op, source, mask, dest, dest_copy,
- x, y, width, height, component_alpha))
+ x, y, width, height, component_alpha,
+ dither))
{
result = FALSE;
}
diff --git a/test/utils/meson.build b/test/utils/meson.build
new file mode 100644
index 0000000..9a6e820
--- /dev/null
+++ b/test/utils/meson.build
@@ -0,0 +1,31 @@
+# Copyright © 2018 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libtestutils = static_library(
+ 'testutils',
+ ['utils.c', 'utils-prng.c', config_h],
+ dependencies : [idep_pixman, dep_openmp, dep_m, dep_png],
+)
+
+libtestutils_dep = declare_dependency(
+ link_with: libtestutils,
+ include_directories: include_directories('.'),
+)
+
diff --git a/test/utils-prng.c b/test/utils/utils-prng.c
index c27b5be..0cf53dd 100644
--- a/test/utils-prng.c
+++ b/test/utils/utils-prng.c
@@ -199,12 +199,25 @@ randmemset_internal (prng_t *prng,
}
else
{
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
#ifdef HAVE_GCC_VECTOR_EXTENSIONS
- const uint8x16 bswap_shufflemask =
+# if __has_builtin(__builtin_shufflevector)
+ randdata.vb =
+ __builtin_shufflevector (randdata.vb, randdata.vb,
+ 3, 2, 1, 0, 7, 6 , 5, 4,
+ 11, 10, 9, 8, 15, 14, 13, 12);
+# else
+ static const uint8x16 bswap_shufflemask =
{
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
};
randdata.vb = __builtin_shuffle (randdata.vb, bswap_shufflemask);
+# endif
+
store_rand_128_data (buf, &randdata, aligned);
buf += 16;
#else
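
The __has_builtin probe used above is the portable way to feature-test clang-style builtins: define it away to 0 on compilers that lack it, then branch per builtin with a plain-C fallback. In isolation (illustrative, using the widely available __builtin_bswap32):

#ifndef __has_builtin
#define __has_builtin(x) 0   /* older compilers: claim nothing */
#endif

#if __has_builtin(__builtin_bswap32)
#define BSWAP32(x) __builtin_bswap32 (x)
#else
#define BSWAP32(x)                                          \
    ((((x) & 0xffu) << 24) | (((x) & 0xff00u) << 8) |       \
     (((x) >> 8) & 0xff00u) | (((x) >> 24) & 0xffu))
#endif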
diff --git a/test/utils-prng.h b/test/utils/utils-prng.h
index f9ae8dd..3cc3fbe 100644
--- a/test/utils-prng.h
+++ b/test/utils/utils-prng.h
@@ -72,7 +72,7 @@
*/
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include "pixman-private.h"
diff --git a/test/utils.c b/test/utils/utils.c
index ab3424f..23bf019 100644
--- a/test/utils.c
+++ b/test/utils/utils.c
@@ -5,6 +5,8 @@
#include <signal.h>
#include <stdlib.h>
#include <float.h>
+#include <ctype.h>
+#include <limits.h>
#ifdef HAVE_GETTIMEOFDAY
#include <sys/time.h>
@@ -28,11 +30,13 @@
#include <png.h>
#endif
+#define ROUND_UP(x, mult) (((x) + (mult) - 1) / (mult) * (mult))
+
/* Random number generator state
*/
-prng_t prng_state_data;
-prng_t *prng_state;
+prng_t prng_state_data = {0};
+prng_t *prng_state = NULL;
/*----------------------------------------------------------------------------*\
* CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
@@ -376,7 +380,16 @@ typedef struct
int n_bytes;
} info_t;
-#if defined(HAVE_MPROTECT) && defined(HAVE_GETPAGESIZE) && defined(HAVE_SYS_MMAN_H) && defined(HAVE_MMAP)
+#if FENCE_MALLOC_ACTIVE
+
+unsigned long
+fence_get_page_size ()
+{
+ /* You can fake a page size here if you want to test e.g. 64 kB
+ * pages on a 4 kB page system: just apply a multiplier below.
+ */
+ return getpagesize ();
+}
/* This is apparently necessary on at least OS X */
#ifndef MAP_ANONYMOUS
@@ -386,7 +399,7 @@ typedef struct
void *
fence_malloc (int64_t len)
{
- unsigned long page_size = getpagesize();
+ unsigned long page_size = fence_get_page_size ();
unsigned long page_mask = page_size - 1;
uint32_t n_payload_bytes = (len + page_mask) & ~page_mask;
uint32_t n_bytes =
@@ -435,7 +448,7 @@ fence_malloc (int64_t len)
void
fence_free (void *data)
{
- uint32_t page_size = getpagesize();
+ uint32_t page_size = fence_get_page_size ();
uint8_t *payload = data;
uint8_t *leading_protected = payload - N_LEADING_PROTECTED * page_size;
uint8_t *initial_page = leading_protected - page_size;
@@ -444,7 +457,98 @@ fence_free (void *data)
munmap (info->addr, info->n_bytes);
}
-#else
+static void
+fence_image_destroy (pixman_image_t *image, void *data)
+{
+ fence_free (data);
+}
+
+/* Create an image with fence pages.
+ *
+ * Creates an image, where the data area is allocated with fence_malloc ().
+ * Each row has an additional page in the stride.
+ *
+ * min_width is only a minimum width for the image. The width is rounded
+ * up so that the row size is divisible by both the page size and the
+ * pixel size.
+ *
+ * If stride_fence is true, the additional page on each row will be
+ * armed to cause SIGSEGV or SIGBUS on all accesses. This should catch
+ * all accesses outside the valid row pixels.
+ */
+pixman_image_t *
+fence_image_create_bits (pixman_format_code_t format,
+ int min_width,
+ int height,
+ pixman_bool_t stride_fence)
+{
+ unsigned page_size = fence_get_page_size ();
+ unsigned page_mask = page_size - 1;
+ unsigned bitspp = PIXMAN_FORMAT_BPP (format);
+ unsigned bits_boundary;
+ unsigned row_bits;
+ int width; /* pixels */
+ unsigned stride; /* bytes */
+ void *pixels;
+ pixman_image_t *image;
+ int i;
+
+ /* must be power of two */
+ assert (page_size && (page_size & page_mask) == 0);
+
+ if (bitspp < 1 || min_width < 1 || height < 1)
+ abort ();
+
+ /* least common multiple of the page size in bits and bitspp */
+ bits_boundary = bitspp;
+ while (! (bits_boundary & 1))
+ bits_boundary >>= 1;
+ bits_boundary *= page_size * 8;
+
+ /* round up to bits_boundary */
+ row_bits = ROUND_UP ( (unsigned)min_width * bitspp, bits_boundary);
+ width = row_bits / bitspp;
+
+ stride = row_bits / 8;
+ if (stride_fence)
+ stride += page_size; /* add fence page */
+
+ if (UINT_MAX / stride < (unsigned)height)
+ abort ();
+
+ pixels = fence_malloc (stride * (unsigned)height);
+ if (!pixels)
+ return NULL;
+
+ if (stride_fence)
+ {
+ uint8_t *guard = (uint8_t *)pixels + stride - page_size;
+
+ /* arm row end fence pages */
+ for (i = 0; i < height; i++)
+ {
+ if (mprotect (guard + i * stride, page_size, PROT_NONE) == -1)
+ goto out_fail;
+ }
+ }
+
+ assert (width >= min_width);
+
+ image = pixman_image_create_bits_no_clear (format, width, height,
+ pixels, stride);
+ if (!image)
+ goto out_fail;
+
+ pixman_image_set_destroy_function (image, fence_image_destroy, pixels);
+
+ return image;
+
+out_fail:
+ fence_free (pixels);
+
+ return NULL;
+}
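
The row fences above rest on the classic guard-page idiom: allocate page-aligned memory, then strip all permissions from any page that must never be touched, so a stray access raises SIGSEGV on the spot. The core operation, isolated (POSIX; illustrative):

#include <sys/mman.h>
#include <unistd.h>

/* Make one page at 'addr' (page-aligned) trap on any access. */
static int
arm_guard_page (void *addr)
{
    return mprotect (addr, getpagesize (), PROT_NONE);
}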
+
+#else /* FENCE_MALLOC_ACTIVE */
void *
fence_malloc (int64_t len)
@@ -458,7 +562,25 @@ fence_free (void *data)
free (data);
}
-#endif
+pixman_image_t *
+fence_image_create_bits (pixman_format_code_t format,
+ int min_width,
+ int height,
+ pixman_bool_t stride_fence)
+{
+ return pixman_image_create_bits (format, min_width, height, NULL, 0);
+ /* Implicitly allocated storage does not need a destroy function
+ * to get freed on refcount hitting zero.
+ */
+}
+
+unsigned long
+fence_get_page_size ()
+{
+ return 0;
+}
+
+#endif /* FENCE_MALLOC_ACTIVE */
uint8_t *
make_random_bytes (int n_bytes)
@@ -473,6 +595,21 @@ make_random_bytes (int n_bytes)
return bytes;
}
+float *
+make_random_floats (int n_bytes)
+{
+ uint8_t *bytes = fence_malloc (n_bytes);
+ float *vals = (float *)bytes;
+
+ if (!bytes)
+ return 0;
+
+ for (n_bytes /= 4; n_bytes; vals++, n_bytes--)
+ *vals = (float)rand() / (float)RAND_MAX;
+
+ return (float *)bytes;
+}
+
void
a8r8g8b8_to_rgba_np (uint32_t *dst, uint32_t *src, int n_pixels)
{
@@ -844,9 +981,11 @@ enable_divbyzero_exceptions (void)
{
#ifdef HAVE_FENV_H
#ifdef HAVE_FEENABLEEXCEPT
+#ifdef HAVE_FEDIVBYZERO
feenableexcept (FE_DIVBYZERO);
#endif
#endif
+#endif
}
void
@@ -854,9 +993,11 @@ enable_invalid_exceptions (void)
{
#ifdef HAVE_FENV_H
#ifdef HAVE_FEENABLEEXCEPT
+#ifdef FE_INVALID
feenableexcept (FE_INVALID);
#endif
#endif
+#endif
}
void *
@@ -948,168 +1089,438 @@ initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb)
}
}
-const char *
-operator_name (pixman_op_t op)
+struct operator_entry {
+ pixman_op_t op;
+ const char *name;
+ pixman_bool_t is_alias;
+};
+
+typedef struct operator_entry operator_entry_t;
+
+static const operator_entry_t op_list[] =
{
- switch (op)
- {
- case PIXMAN_OP_CLEAR: return "PIXMAN_OP_CLEAR";
- case PIXMAN_OP_SRC: return "PIXMAN_OP_SRC";
- case PIXMAN_OP_DST: return "PIXMAN_OP_DST";
- case PIXMAN_OP_OVER: return "PIXMAN_OP_OVER";
- case PIXMAN_OP_OVER_REVERSE: return "PIXMAN_OP_OVER_REVERSE";
- case PIXMAN_OP_IN: return "PIXMAN_OP_IN";
- case PIXMAN_OP_IN_REVERSE: return "PIXMAN_OP_IN_REVERSE";
- case PIXMAN_OP_OUT: return "PIXMAN_OP_OUT";
- case PIXMAN_OP_OUT_REVERSE: return "PIXMAN_OP_OUT_REVERSE";
- case PIXMAN_OP_ATOP: return "PIXMAN_OP_ATOP";
- case PIXMAN_OP_ATOP_REVERSE: return "PIXMAN_OP_ATOP_REVERSE";
- case PIXMAN_OP_XOR: return "PIXMAN_OP_XOR";
- case PIXMAN_OP_ADD: return "PIXMAN_OP_ADD";
- case PIXMAN_OP_SATURATE: return "PIXMAN_OP_SATURATE";
-
- case PIXMAN_OP_DISJOINT_CLEAR: return "PIXMAN_OP_DISJOINT_CLEAR";
- case PIXMAN_OP_DISJOINT_SRC: return "PIXMAN_OP_DISJOINT_SRC";
- case PIXMAN_OP_DISJOINT_DST: return "PIXMAN_OP_DISJOINT_DST";
- case PIXMAN_OP_DISJOINT_OVER: return "PIXMAN_OP_DISJOINT_OVER";
- case PIXMAN_OP_DISJOINT_OVER_REVERSE: return "PIXMAN_OP_DISJOINT_OVER_REVERSE";
- case PIXMAN_OP_DISJOINT_IN: return "PIXMAN_OP_DISJOINT_IN";
- case PIXMAN_OP_DISJOINT_IN_REVERSE: return "PIXMAN_OP_DISJOINT_IN_REVERSE";
- case PIXMAN_OP_DISJOINT_OUT: return "PIXMAN_OP_DISJOINT_OUT";
- case PIXMAN_OP_DISJOINT_OUT_REVERSE: return "PIXMAN_OP_DISJOINT_OUT_REVERSE";
- case PIXMAN_OP_DISJOINT_ATOP: return "PIXMAN_OP_DISJOINT_ATOP";
- case PIXMAN_OP_DISJOINT_ATOP_REVERSE: return "PIXMAN_OP_DISJOINT_ATOP_REVERSE";
- case PIXMAN_OP_DISJOINT_XOR: return "PIXMAN_OP_DISJOINT_XOR";
-
- case PIXMAN_OP_CONJOINT_CLEAR: return "PIXMAN_OP_CONJOINT_CLEAR";
- case PIXMAN_OP_CONJOINT_SRC: return "PIXMAN_OP_CONJOINT_SRC";
- case PIXMAN_OP_CONJOINT_DST: return "PIXMAN_OP_CONJOINT_DST";
- case PIXMAN_OP_CONJOINT_OVER: return "PIXMAN_OP_CONJOINT_OVER";
- case PIXMAN_OP_CONJOINT_OVER_REVERSE: return "PIXMAN_OP_CONJOINT_OVER_REVERSE";
- case PIXMAN_OP_CONJOINT_IN: return "PIXMAN_OP_CONJOINT_IN";
- case PIXMAN_OP_CONJOINT_IN_REVERSE: return "PIXMAN_OP_CONJOINT_IN_REVERSE";
- case PIXMAN_OP_CONJOINT_OUT: return "PIXMAN_OP_CONJOINT_OUT";
- case PIXMAN_OP_CONJOINT_OUT_REVERSE: return "PIXMAN_OP_CONJOINT_OUT_REVERSE";
- case PIXMAN_OP_CONJOINT_ATOP: return "PIXMAN_OP_CONJOINT_ATOP";
- case PIXMAN_OP_CONJOINT_ATOP_REVERSE: return "PIXMAN_OP_CONJOINT_ATOP_REVERSE";
- case PIXMAN_OP_CONJOINT_XOR: return "PIXMAN_OP_CONJOINT_XOR";
-
- case PIXMAN_OP_MULTIPLY: return "PIXMAN_OP_MULTIPLY";
- case PIXMAN_OP_SCREEN: return "PIXMAN_OP_SCREEN";
- case PIXMAN_OP_OVERLAY: return "PIXMAN_OP_OVERLAY";
- case PIXMAN_OP_DARKEN: return "PIXMAN_OP_DARKEN";
- case PIXMAN_OP_LIGHTEN: return "PIXMAN_OP_LIGHTEN";
- case PIXMAN_OP_COLOR_DODGE: return "PIXMAN_OP_COLOR_DODGE";
- case PIXMAN_OP_COLOR_BURN: return "PIXMAN_OP_COLOR_BURN";
- case PIXMAN_OP_HARD_LIGHT: return "PIXMAN_OP_HARD_LIGHT";
- case PIXMAN_OP_SOFT_LIGHT: return "PIXMAN_OP_SOFT_LIGHT";
- case PIXMAN_OP_DIFFERENCE: return "PIXMAN_OP_DIFFERENCE";
- case PIXMAN_OP_EXCLUSION: return "PIXMAN_OP_EXCLUSION";
- case PIXMAN_OP_HSL_HUE: return "PIXMAN_OP_HSL_HUE";
- case PIXMAN_OP_HSL_SATURATION: return "PIXMAN_OP_HSL_SATURATION";
- case PIXMAN_OP_HSL_COLOR: return "PIXMAN_OP_HSL_COLOR";
- case PIXMAN_OP_HSL_LUMINOSITY: return "PIXMAN_OP_HSL_LUMINOSITY";
-
- case PIXMAN_OP_NONE:
- return "<invalid operator 'none'>";
- };
+#define ENTRY(op) \
+ { PIXMAN_OP_##op, "PIXMAN_OP_" #op, FALSE }
+#define ALIAS(op, nam) \
+ { PIXMAN_OP_##op, nam, TRUE }
+
+ /* operator_name () will return the first hit in this table,
+ * so keep the list properly ordered between entries and aliases.
+ * Aliases are not listed by list_operators ().
+ */
- return "<unknown operator>";
-}
+ ENTRY (CLEAR),
+ ENTRY (SRC),
+ ENTRY (DST),
+ ENTRY (OVER),
+ ENTRY (OVER_REVERSE),
+ ALIAS (OVER_REVERSE, "overrev"),
+ ENTRY (IN),
+ ENTRY (IN_REVERSE),
+ ALIAS (IN_REVERSE, "inrev"),
+ ENTRY (OUT),
+ ENTRY (OUT_REVERSE),
+ ALIAS (OUT_REVERSE, "outrev"),
+ ENTRY (ATOP),
+ ENTRY (ATOP_REVERSE),
+ ALIAS (ATOP_REVERSE, "atoprev"),
+ ENTRY (XOR),
+ ENTRY (ADD),
+ ENTRY (SATURATE),
+
+ ENTRY (DISJOINT_CLEAR),
+ ENTRY (DISJOINT_SRC),
+ ENTRY (DISJOINT_DST),
+ ENTRY (DISJOINT_OVER),
+ ENTRY (DISJOINT_OVER_REVERSE),
+ ENTRY (DISJOINT_IN),
+ ENTRY (DISJOINT_IN_REVERSE),
+ ENTRY (DISJOINT_OUT),
+ ENTRY (DISJOINT_OUT_REVERSE),
+ ENTRY (DISJOINT_ATOP),
+ ENTRY (DISJOINT_ATOP_REVERSE),
+ ENTRY (DISJOINT_XOR),
+
+ ENTRY (CONJOINT_CLEAR),
+ ENTRY (CONJOINT_SRC),
+ ENTRY (CONJOINT_DST),
+ ENTRY (CONJOINT_OVER),
+ ENTRY (CONJOINT_OVER_REVERSE),
+ ENTRY (CONJOINT_IN),
+ ENTRY (CONJOINT_IN_REVERSE),
+ ENTRY (CONJOINT_OUT),
+ ENTRY (CONJOINT_OUT_REVERSE),
+ ENTRY (CONJOINT_ATOP),
+ ENTRY (CONJOINT_ATOP_REVERSE),
+ ENTRY (CONJOINT_XOR),
+
+ ENTRY (MULTIPLY),
+ ENTRY (SCREEN),
+ ENTRY (OVERLAY),
+ ENTRY (DARKEN),
+ ENTRY (LIGHTEN),
+ ENTRY (COLOR_DODGE),
+ ENTRY (COLOR_BURN),
+ ENTRY (HARD_LIGHT),
+ ENTRY (SOFT_LIGHT),
+ ENTRY (DIFFERENCE),
+ ENTRY (EXCLUSION),
+ ENTRY (HSL_HUE),
+ ENTRY (HSL_SATURATION),
+ ENTRY (HSL_COLOR),
+ ENTRY (HSL_LUMINOSITY),
+
+ ALIAS (NONE, "<invalid operator 'none'>")
+
+#undef ENTRY
+#undef ALIAS
+};
-const char *
-format_name (pixman_format_code_t format)
+typedef struct {
+ pixman_dither_t dither;
+ const char *name;
+ pixman_bool_t is_alias;
+} dither_entry_t;
+
+static const dither_entry_t dither_list[] =
{
- switch (format)
- {
+#define ENTRY(dither) \
+ { PIXMAN_DITHER_##dither, "PIXMAN_DITHER_" #dither, FALSE }
+#define ALIAS(dither, nam) \
+ { PIXMAN_DITHER_##dither, nam, TRUE }
+
+ /* dither_name () will return the first hit in this table,
+ * so keep the list properly ordered between entries and aliases.
+ * Aliases are not listed by list_dithers ().
+ */
+
+ ENTRY (ORDERED_BAYER_8),
+ ENTRY (ORDERED_BLUE_NOISE_64),
+ ENTRY (NONE),
+
+#undef ENTRY
+#undef ALIAS
+};
+
+struct format_entry
+{
+ pixman_format_code_t format;
+ const char *name;
+ pixman_bool_t is_alias;
+};
+
+typedef struct format_entry format_entry_t;
+
+static const format_entry_t format_list[] =
+{
+#define ENTRY(f) \
+ { PIXMAN_##f, #f, FALSE }
+#define ALIAS(f, nam) \
+ { PIXMAN_##f, nam, TRUE }
+
+ /* format_name () will return the first hit in this table,
+ * so keep the list properly ordered between entries and aliases.
+ * Aliases are not listed by list_formats ().
+ */
+
+/* 128bpp formats */
+ ENTRY (rgba_float),
+/* 96bpp formats */
+ ENTRY (rgb_float),
+
/* 32bpp formats */
- case PIXMAN_a8r8g8b8: return "a8r8g8b8";
- case PIXMAN_x8r8g8b8: return "x8r8g8b8";
- case PIXMAN_a8b8g8r8: return "a8b8g8r8";
- case PIXMAN_x8b8g8r8: return "x8b8g8r8";
- case PIXMAN_b8g8r8a8: return "b8g8r8a8";
- case PIXMAN_b8g8r8x8: return "b8g8r8x8";
- case PIXMAN_r8g8b8a8: return "r8g8b8a8";
- case PIXMAN_r8g8b8x8: return "r8g8b8x8";
- case PIXMAN_x14r6g6b6: return "x14r6g6b6";
- case PIXMAN_x2r10g10b10: return "x2r10g10b10";
- case PIXMAN_a2r10g10b10: return "a2r10g10b10";
- case PIXMAN_x2b10g10r10: return "x2b10g10r10";
- case PIXMAN_a2b10g10r10: return "a2b10g10r10";
+ ENTRY (a8r8g8b8),
+ ALIAS (a8r8g8b8, "8888"),
+ ENTRY (x8r8g8b8),
+ ALIAS (x8r8g8b8, "x888"),
+ ENTRY (a8b8g8r8),
+ ENTRY (x8b8g8r8),
+ ENTRY (b8g8r8a8),
+ ENTRY (b8g8r8x8),
+ ENTRY (r8g8b8a8),
+ ENTRY (r8g8b8x8),
+ ENTRY (x14r6g6b6),
+ ENTRY (x2r10g10b10),
+ ALIAS (x2r10g10b10, "2x10"),
+ ENTRY (a2r10g10b10),
+ ALIAS (a2r10g10b10, "2a10"),
+ ENTRY (x2b10g10r10),
+ ENTRY (a2b10g10r10),
/* sRGB formats */
- case PIXMAN_a8r8g8b8_sRGB: return "a8r8g8b8_sRGB";
+ ENTRY (a8r8g8b8_sRGB),
+ ENTRY (r8g8b8_sRGB),
/* 24bpp formats */
- case PIXMAN_r8g8b8: return "r8g8b8";
- case PIXMAN_b8g8r8: return "b8g8r8";
-
-/* 16bpp formats */
- case PIXMAN_r5g6b5: return "r5g6b5";
- case PIXMAN_b5g6r5: return "b5g6r5";
-
- case PIXMAN_a1r5g5b5: return "a1r5g5b5";
- case PIXMAN_x1r5g5b5: return "x1r5g5b5";
- case PIXMAN_a1b5g5r5: return "a1b5g5r5";
- case PIXMAN_x1b5g5r5: return "x1b5g5r5";
- case PIXMAN_a4r4g4b4: return "a4r4g4b4";
- case PIXMAN_x4r4g4b4: return "x4r4g4b4";
- case PIXMAN_a4b4g4r4: return "a4b4g4r4";
- case PIXMAN_x4b4g4r4: return "x4b4g4r4";
+ ENTRY (r8g8b8),
+ ALIAS (r8g8b8, "0888"),
+ ENTRY (b8g8r8),
+
+/* 16bpp formats */
+ ENTRY (r5g6b5),
+ ALIAS (r5g6b5, "0565"),
+ ENTRY (b5g6r5),
+
+ ENTRY (a1r5g5b5),
+ ALIAS (a1r5g5b5, "1555"),
+ ENTRY (x1r5g5b5),
+ ENTRY (a1b5g5r5),
+ ENTRY (x1b5g5r5),
+ ENTRY (a4r4g4b4),
+ ALIAS (a4r4g4b4, "4444"),
+ ENTRY (x4r4g4b4),
+ ENTRY (a4b4g4r4),
+ ENTRY (x4b4g4r4),
/* 8bpp formats */
- case PIXMAN_a8: return "a8";
- case PIXMAN_r3g3b2: return "r3g3b2";
- case PIXMAN_b2g3r3: return "b2g3r3";
- case PIXMAN_a2r2g2b2: return "a2r2g2b2";
- case PIXMAN_a2b2g2r2: return "a2b2g2r2";
-
-#if 0
- case PIXMAN_x4c4: return "x4c4";
- case PIXMAN_g8: return "g8";
-#endif
- case PIXMAN_c8: return "x4c4 / c8";
- case PIXMAN_x4g4: return "x4g4 / g8";
+ ENTRY (a8),
+ ALIAS (a8, "8"),
+ ENTRY (r3g3b2),
+ ENTRY (b2g3r3),
+ ENTRY (a2r2g2b2),
+ ALIAS (a2r2g2b2, "2222"),
+ ENTRY (a2b2g2r2),
+
+ ALIAS (c8, "x4c4 / c8"),
+ /* ENTRY (c8), */
+ ALIAS (g8, "x4g4 / g8"),
+ /* ENTRY (g8), */
+
+ ENTRY (x4a4),
+
+ /* These format codes are identical to c8 and g8, respectively. */
+ /* ENTRY (x4c4), */
+ /* ENTRY (x4g4), */
+
+/* 4bpp formats */
+ ENTRY (a4),
+ ENTRY (r1g2b1),
+ ENTRY (b1g2r1),
+ ENTRY (a1r1g1b1),
+ ENTRY (a1b1g1r1),
+
+ ALIAS (c4, "c4"),
+ /* ENTRY (c4), */
+ ALIAS (g4, "g4"),
+ /* ENTRY (g4), */
+
+/* 1bpp formats */
+ ENTRY (a1),
+
+ ALIAS (g1, "g1"),
+ /* ENTRY (g1), */
- case PIXMAN_x4a4: return "x4a4";
+/* YUV formats */
+ ALIAS (yuy2, "yuy2"),
+ /* ENTRY (yuy2), */
+ ALIAS (yv12, "yv12"),
+ /* ENTRY (yv12), */
+
+/* Fake formats, not in pixman_format_code_t enum */
+ ALIAS (null, "null"),
+ ALIAS (solid, "solid"),
+ ALIAS (solid, "n"),
+ ALIAS (pixbuf, "pixbuf"),
+ ALIAS (rpixbuf, "rpixbuf"),
+ ALIAS (unknown, "unknown"),
+
+#undef ENTRY
+#undef ALIAS
+};
-/* 4bpp formats */
- case PIXMAN_a4: return "a4";
- case PIXMAN_r1g2b1: return "r1g2b1";
- case PIXMAN_b1g2r1: return "b1g2r1";
- case PIXMAN_a1r1g1b1: return "a1r1g1b1";
- case PIXMAN_a1b1g1r1: return "a1b1g1r1";
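+/* Map a format name or alias, case-insensitively, to its format
+ * code: both "r5g6b5" and "0565" yield PIXMAN_r5g6b5, while an
+ * unknown name yields PIXMAN_null.
+ */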
+pixman_format_code_t
+format_from_string (const char *s)
+{
+ int i;
- case PIXMAN_c4: return "c4";
- case PIXMAN_g4: return "g4";
+ for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
+ {
+ const format_entry_t *ent = &format_list[i];
-/* 1bpp formats */
- case PIXMAN_a1: return "a1";
+ if (strcasecmp (ent->name, s) == 0)
+ return ent->format;
+ }
- case PIXMAN_g1: return "g1";
+ return PIXMAN_null;
+}
-/* YUV formats */
- case PIXMAN_yuy2: return "yuy2";
- case PIXMAN_yv12: return "yv12";
- };
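+/* Print @s followed by a comma, wrapping to a fresh indented line
+ * once more than 60 characters have accumulated, so the listings
+ * below stay readable.
+ */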
+static void
+emit (const char *s, int *n_chars)
+{
+ *n_chars += printf ("%s,", s);
+ if (*n_chars > 60)
+ {
+ printf ("\n ");
+ *n_chars = 0;
+ }
+ else
+ {
+ printf (" ");
+ (*n_chars)++;
+ }
+}
- /* Fake formats.
- *
- * This is separate switch to prevent GCC from complaining
- * that the values are not in the pixman_format_code_t enum.
- */
- switch ((uint32_t)format)
+void
+list_formats (void)
+{
+ int n_chars;
+ int i;
+
+ printf ("Formats:\n ");
+
+ n_chars = 0;
+ for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
{
- case PIXMAN_null: return "null";
- case PIXMAN_solid: return "solid";
- case PIXMAN_pixbuf: return "pixbuf";
- case PIXMAN_rpixbuf: return "rpixbuf";
- case PIXMAN_unknown: return "unknown";
- };
+ const format_entry_t *ent = &format_list[i];
+
+ if (ent->is_alias)
+ continue;
+
+ emit (ent->name, &n_chars);
+ }
+
+ printf ("\n\n");
+}
+
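+/* List the canonical operator names, lower-cased and without the
+ * PIXMAN_OP_ prefix: PIXMAN_OP_OVER_REVERSE, for example, is listed
+ * as "over_reverse".
+ */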
+void
+list_operators (void)
+{
+ char short_name [128] = { 0 };
+ int i, n_chars;
+
+ printf ("Operators:\n ");
+
+ n_chars = 0;
+ for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
+ {
+ const operator_entry_t *ent = &op_list[i];
+ int j;
+
+ if (ent->is_alias)
+ continue;
+
+ snprintf (short_name, sizeof (short_name) - 1, "%s",
+ ent->name + strlen ("PIXMAN_OP_"));
+
+ for (j = 0; short_name[j] != '\0'; ++j)
+ short_name[j] = tolower (short_name[j]);
+
+ emit (short_name, &n_chars);
+ }
+
+ printf ("\n\n");
+}
+
+void
+list_dithers (void)
+{
+ int n_chars;
+ int i;
+
+ printf ("Dithers:\n ");
+
+ n_chars = 0;
+ for (i = 0; i < ARRAY_LENGTH (dither_list); ++i)
+ {
+ const dither_entry_t *ent = &dither_list[i];
+
+ if (ent->is_alias)
+ continue;
+
+ emit (ent->name, &n_chars);
+ }
+
+ printf ("\n\n");
+}
+
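+/* Accept either the suffix of a canonical name or an alias,
+ * case-insensitively: "over", "OVER" and "outrev" map to
+ * PIXMAN_OP_OVER, PIXMAN_OP_OVER and PIXMAN_OP_OUT_REVERSE; an
+ * unrecognized string maps to PIXMAN_OP_NONE.
+ */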
+pixman_op_t
+operator_from_string (const char *s)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
+ {
+ const operator_entry_t *ent = &op_list[i];
+
+ if (ent->is_alias)
+ {
+ if (strcasecmp (ent->name, s) == 0)
+ return ent->op;
+ }
+ else
+ {
+ if (strcasecmp (ent->name + strlen ("PIXMAN_OP_"), s) == 0)
+ return ent->op;
+ }
+ }
+
+ return PIXMAN_OP_NONE;
+}
+
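+/* Note that, unlike operator_from_string (), this expects the full
+ * table name, e.g. "PIXMAN_DITHER_ORDERED_BAYER_8"; a bare suffix is
+ * not recognized.
+ */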
+pixman_dither_t
+dither_from_string (const char *s)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (dither_list); ++i)
+ {
+ const dither_entry_t *ent = &dither_list[i];
+
+ if (strcasecmp (ent->name, s) == 0)
+ return ent->dither;
+ }
+
+ return PIXMAN_DITHER_NONE;
+}
+
+const char *
+operator_name (pixman_op_t op)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
+ {
+ const operator_entry_t *ent = &op_list[i];
+
+ if (ent->op == op)
+ return ent->name;
+ }
+
+ return "<unknown operator>";
+}
+
+const char *
+format_name (pixman_format_code_t format)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
+ {
+ const format_entry_t *ent = &format_list[i];
+
+ if (ent->format == format)
+ return ent->name;
+ }
return "<unknown format>";
};
+
+const char *
+dither_name (pixman_dither_t dither)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_LENGTH (dither_list); ++i)
+ {
+ const dither_entry_t *ent = &dither_list[i];
+
+ if (ent->dither == dither)
+ return ent->name;
+ }
+
+ return "<unknown dither>";
+}
+
#define IS_ZERO(f) (-DBL_MIN < (f) && (f) < DBL_MIN)
typedef double (* blend_func_t) (double as, double s, double ad, double d);
@@ -1596,6 +2007,10 @@ round_color (pixman_format_code_t format, color_t *color)
color->a = round_channel (color->a, PIXMAN_FORMAT_A (format));
}
+/* The acceptable per-channel deviation, on the normalized
+ * [0.0, 1.0] scale.
+ */
+#define DEVIATION (0.0128)
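+/* On an 8-bit channel this is 0.0128 * 255 ~ 3.3 quantization levels
+ * of slack in either direction; on a 5-bit channel it is
+ * 0.0128 * 31 ~ 0.4 of a level.
+ */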
+
/* Check whether @pixel is a valid quantization of the a, r, g, b
* parameters. Some slack is permitted.
*/
@@ -1606,6 +2021,10 @@ pixel_checker_init (pixel_checker_t *checker, pixman_format_code_t format)
checker->format = format;
+ if (format == PIXMAN_rgba_float ||
+ format == PIXMAN_rgb_float)
+ return;
+
switch (PIXMAN_FORMAT_TYPE (format))
{
case PIXMAN_TYPE_A:
@@ -1651,21 +2070,46 @@ pixel_checker_init (pixel_checker_t *checker, pixman_format_code_t format)
break;
}
- checker->am = ((1 << PIXMAN_FORMAT_A (format)) - 1) << checker->as;
- checker->rm = ((1 << PIXMAN_FORMAT_R (format)) - 1) << checker->rs;
- checker->gm = ((1 << PIXMAN_FORMAT_G (format)) - 1) << checker->gs;
- checker->bm = ((1 << PIXMAN_FORMAT_B (format)) - 1) << checker->bs;
+ checker->am = ((1U << PIXMAN_FORMAT_A (format)) - 1) << checker->as;
+ checker->rm = ((1U << PIXMAN_FORMAT_R (format)) - 1) << checker->rs;
+ checker->gm = ((1U << PIXMAN_FORMAT_G (format)) - 1) << checker->gs;
+ checker->bm = ((1U << PIXMAN_FORMAT_B (format)) - 1) << checker->bs;
checker->aw = PIXMAN_FORMAT_A (format);
checker->rw = PIXMAN_FORMAT_R (format);
checker->gw = PIXMAN_FORMAT_G (format);
checker->bw = PIXMAN_FORMAT_B (format);
+
+ checker->ad = DEVIATION;
+ checker->rd = DEVIATION;
+ checker->gd = DEVIATION;
+ checker->bd = DEVIATION;
+}
+
+/* When dithering is enabled, we allow one extra quantization step
+ * of tolerance per channel. Channels of width zero are skipped to
+ * avoid dividing by zero.
+ */
+void
+pixel_checker_allow_dither (pixel_checker_t *checker)
+{
+ if (checker->aw)
+ checker->ad += 1 / (double)((1 << checker->aw) - 1);
+ if (checker->rw)
+ checker->rd += 1 / (double)((1 << checker->rw) - 1);
+ if (checker->gw)
+ checker->gd += 1 / (double)((1 << checker->gw) - 1);
+ if (checker->bw)
+ checker->bd += 1 / (double)((1 << checker->bw) - 1);
+}
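+
+/* For r5g6b5, for example, pixel_checker_allow_dither () grows the
+ * red and blue tolerances by 1/31 ~ 0.032 (to ~0.045 in total) and
+ * the green tolerance by 1/63 ~ 0.016: exactly one representable
+ * step per channel.
+ */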
+
+static void
+pixel_checker_require_uint32_format (const pixel_checker_t *checker)
+{
+ assert (checker->format != PIXMAN_rgba_float &&
+ checker->format != PIXMAN_rgb_float);
}
void
pixel_checker_split_pixel (const pixel_checker_t *checker, uint32_t pixel,
int *a, int *r, int *g, int *b)
{
+ pixel_checker_require_uint32_format (checker);
+
*a = (pixel & checker->am) >> checker->as;
*r = (pixel & checker->rm) >> checker->rs;
*g = (pixel & checker->gm) >> checker->gs;
@@ -1679,6 +2123,8 @@ pixel_checker_get_masks (const pixel_checker_t *checker,
uint32_t *gm,
uint32_t *bm)
{
+ pixel_checker_require_uint32_format (checker);
+
if (am)
*am = checker->am;
if (rm)
@@ -1695,6 +2141,8 @@ pixel_checker_convert_pixel_to_color (const pixel_checker_t *checker,
{
int a, r, g, b;
+ pixel_checker_require_uint32_format (checker);
+
pixel_checker_split_pixel (checker, pixel, &a, &r, &g, &b);
if (checker->am == 0)
@@ -1740,7 +2188,7 @@ convert (double v, uint32_t width, uint32_t mask, uint32_t shift, double def)
}
static void
-get_limits (const pixel_checker_t *checker, double limit,
+get_limits (const pixel_checker_t *checker, double sign,
color_t *color,
int *ao, int *ro, int *go, int *bo)
{
@@ -1756,28 +2204,32 @@ get_limits (const pixel_checker_t *checker, double limit,
color = &tmp;
}
- *ao = convert (color->a + limit, checker->aw, checker->am, checker->as, 1.0);
- *ro = convert (color->r + limit, checker->rw, checker->rm, checker->rs, 0.0);
- *go = convert (color->g + limit, checker->gw, checker->gm, checker->gs, 0.0);
- *bo = convert (color->b + limit, checker->bw, checker->bm, checker->bs, 0.0);
+ *ao = convert (color->a + sign * checker->ad,
+ checker->aw, checker->am, checker->as, 1.0);
+ *ro = convert (color->r + sign * checker->rd,
+ checker->rw, checker->rm, checker->rs, 0.0);
+ *go = convert (color->g + sign * checker->gd,
+ checker->gw, checker->gm, checker->gs, 0.0);
+ *bo = convert (color->b + sign * checker->bd,
+ checker->bw, checker->bm, checker->bs, 0.0);
}
-/* The acceptable deviation in units of [0.0, 1.0]
- */
-#define DEVIATION (0.0128)
-
void
pixel_checker_get_max (const pixel_checker_t *checker, color_t *color,
int *am, int *rm, int *gm, int *bm)
{
- get_limits (checker, DEVIATION, color, am, rm, gm, bm);
+ pixel_checker_require_uint32_format (checker);
+
+ get_limits (checker, 1, color, am, rm, gm, bm);
}
void
pixel_checker_get_min (const pixel_checker_t *checker, color_t *color,
int *am, int *rm, int *gm, int *bm)
{
- get_limits (checker, - DEVIATION, color, am, rm, gm, bm);
+ pixel_checker_require_uint32_format (checker);
+
+ get_limits (checker, - 1, color, am, rm, gm, bm);
}
pixman_bool_t
@@ -1788,6 +2240,8 @@ pixel_checker_check (const pixel_checker_t *checker, uint32_t pixel,
int32_t ai, ri, gi, bi;
pixman_bool_t result;
+ pixel_checker_require_uint32_format (checker);
+
pixel_checker_get_min (checker, color, &a_lo, &r_lo, &g_lo, &b_lo);
pixel_checker_get_max (checker, color, &a_hi, &r_hi, &g_hi, &b_hi);
pixel_checker_split_pixel (checker, pixel, &ai, &ri, &gi, &bi);
@@ -1800,3 +2254,36 @@ pixel_checker_check (const pixel_checker_t *checker, uint32_t pixel,
return result;
}
+
+static void
+color_limits (const pixel_checker_t *checker,
+ double limit, const color_t *color, color_t *out)
+{
+ if (PIXMAN_FORMAT_A (checker->format))
+ out->a = color->a + limit;
+ else
+ out->a = 1.0;
+
+ out->r = color->r + limit;
+ out->g = color->g + limit;
+ out->b = color->b + limit;
+}
+
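+/* Compare an actual color directly against a reference, allowing
+ * DEVIATION in each channel; this is the intended check for the
+ * float formats, which the uint32-based helpers above reject.
+ * A sketch of intended use:
+ *
+ * pixel_checker_t checker;
+ * pixel_checker_init (&checker, PIXMAN_rgba_float);
+ * if (!pixel_checker_check_color (&checker, &actual, &reference))
+ * ... report the failure ...
+ */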
+pixman_bool_t
+pixel_checker_check_color (const pixel_checker_t *checker,
+ const color_t *actual, const color_t *reference)
+{
+ color_t min, max;
+ pixman_bool_t result;
+
+ color_limits (checker, -DEVIATION, reference, &min);
+ color_limits (checker, DEVIATION, reference, &max);
+
+ result =
+ actual->a >= min.a && actual->a <= max.a &&
+ actual->r >= min.r && actual->r <= max.r &&
+ actual->g >= min.g && actual->g <= max.g &&
+ actual->b >= min.b && actual->b <= max.b;
+
+ return result;
+}
diff --git a/test/utils.h b/test/utils/utils.h
index 6804334..d3e1ba4 100644
--- a/test/utils.h
+++ b/test/utils/utils.h
@@ -1,5 +1,5 @@
#ifdef HAVE_CONFIG_H
-#include <config.h>
+#include <pixman-config.h>
#endif
#include <assert.h>
@@ -86,6 +86,17 @@ is_little_endian (void)
void
image_endian_swap (pixman_image_t *img);
+#if defined (HAVE_MPROTECT) && defined (HAVE_GETPAGESIZE) && \
+ defined (HAVE_SYS_MMAN_H) && defined (HAVE_MMAP)
+/* When this is set, fence_malloc and friends have a working fence
+ * implementation. Without it, fence_malloc still allocates, but
+ * out-of-bounds accesses are not caught.
+ */
+#define FENCE_MALLOC_ACTIVE 1
+#else
+#define FENCE_MALLOC_ACTIVE 0
+#endif
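+
+/* Usage sketch: when the fence is active, accesses that stray onto
+ * the protected guard pages fault immediately, e.g.
+ *
+ * uint8_t *buf = fence_malloc (len);
+ * buf[-1] = 0; -- underrun, lands on a leading guard page
+ * fence_free (buf);
+ */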
+
/* Allocate memory that is bounded by protected pages,
* so that out-of-bounds access will cause segfaults
*/
@@ -95,9 +106,21 @@ fence_malloc (int64_t len);
void
fence_free (void *data);
+pixman_image_t *
+fence_image_create_bits (pixman_format_code_t format,
+ int min_width,
+ int height,
+ pixman_bool_t stride_fence);
+
+/* Return the page size if FENCE_MALLOC_ACTIVE, or zero otherwise */
+unsigned long
+fence_get_page_size (void);
+
/* Generate n_bytes random bytes in fence_malloced memory */
uint8_t *
make_random_bytes (int n_bytes);
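+
+/* Generate n_bytes worth of random floats in fence_malloced memory */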
+float *
+make_random_floats (int n_bytes);
/* Return current time in seconds */
double
@@ -187,12 +210,32 @@ convert_linear_to_srgb (double component);
void
initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb);
+pixman_format_code_t
+format_from_string (const char *s);
+
+void
+list_formats (void);
+
+void
+list_operators (void);
+
+void
+list_dithers (void);
+
+pixman_op_t
+operator_from_string (const char *s);
+
+pixman_dither_t
+dither_from_string (const char *s);
+
const char *
operator_name (pixman_op_t op);
const char *
format_name (pixman_format_code_t format);
+const char *
+dither_name (pixman_dither_t dither);
+
typedef struct
{
double r, g, b, a;
@@ -215,12 +258,16 @@ typedef struct
uint32_t am, rm, gm, bm;
uint32_t as, rs, gs, bs;
uint32_t aw, rw, gw, bw;
+ double ad, rd, gd, bd;
} pixel_checker_t;
void
pixel_checker_init (pixel_checker_t *checker, pixman_format_code_t format);
void
+pixel_checker_allow_dither (pixel_checker_t *checker);
+
+void
pixel_checker_split_pixel (const pixel_checker_t *checker, uint32_t pixel,
int *a, int *r, int *g, int *b);