summaryrefslogtreecommitdiff
path: root/gst/monoscope/convolve.c
diff options
context:
space:
mode:
Diffstat (limited to 'gst/monoscope/convolve.c')
-rw-r--r--gst/monoscope/convolve.c316
1 files changed, 316 insertions, 0 deletions
diff --git a/gst/monoscope/convolve.c b/gst/monoscope/convolve.c
new file mode 100644
index 000000000..aefffa3ed
--- /dev/null
+++ b/gst/monoscope/convolve.c
@@ -0,0 +1,316 @@
+/* Karatsuba convolution
+ *
+ * Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ *
+ * $Id$
+ *
+ */
+
+/* The algorithm is based on the following. For the convolution of a pair
+ * of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four
+ * multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c -
+ * b.d. A similar relation enables us to compute a 2n by 2n convolution
+ * using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n
+ * multiplications (as opposed to the 4^n that the quadratic algorithm
+ * takes. */
+
+/* For large n, this is slower than the O(n log n) that the FFT method
+ * takes, but we avoid using complex numbers, and we only have to compute
+ * one convolution, as opposed to 3 FFTs. We have good locality-of-
+ * reference as well, which will help on CPUs with tiny caches. */
+
+/* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160
+ * (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba
+ * algorithm. We actually want 257 outputs of a 256 x 512 convolution;
+ * that doesn't appear to give an easy advantage for the FFT algorithm, but
+ * for the Karatsuba algorithm, it's easy to use two 256 x 256
+ * convolutions, taking 2 x 3^8 = 12312 multiplications. [This difference
+ * is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n,
+ * while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n
+ * - 1]. */
+
+/* There's a big lie above, actually... for a 4x4 convolution, it's quicker
+ * to do it using 16 multiplications than the more complex Karatsuba
+ * algorithm... So the recursion bottoms out at 4x4s. This increases the
+ * number of multiplications by a factor of 16/9, but reduces the overheads
+ * dramatically. */
+
+/* The convolution algorithm is implemented as a stack machine. We have a
+ * stack of commands, each in one of the forms "do a 2^n x 2^n
+ * convolution", or "combine these three length 2^n outputs into one
+ * 2^{n+1} output." */
+
+#include <stdlib.h>
+#include "convolve.h"
+
+typedef union stack_entry_s {
+ struct {const double * left, * right; double * out;} v;
+ struct {double * main, * null;} b;
+
+} stack_entry;
+
+#define STACK_SIZE (CONVOLVE_DEPTH * 3)
+
+struct _struct_convolve_state {
+ double left [CONVOLVE_BIG];
+ double right [CONVOLVE_SMALL * 3];
+ double scratch [CONVOLVE_SMALL * 3];
+ stack_entry stack[STACK_SIZE];
+};
+
+/*
+ * Initialisation routine - sets up tables and space to work in.
+ * Returns a pointer to internal state, to be used when performing calls.
+ * On error, returns NULL.
+ * The pointer should be freed when it is finished with, by convolve_close().
+ */
+convolve_state *convolve_init(void)
+{
+ return (convolve_state *) malloc (sizeof(convolve_state));
+}
+
+/*
+ * Free the state allocated with convolve_init().
+ */
+void convolve_close(convolve_state *state)
+{
+ if (state)
+ free(state);
+}
+
+static void convolve_4 (double * out, const double * left, const double * right)
+/* This does a 4x4 -> 7 convolution. For what it's worth, the slightly odd
+ * ordering gives about a 1% speed up on my Pentium II. */
+{
+ double l0, l1, l2, l3, r0, r1, r2, r3;
+ double a;
+ l0 = left[0];
+ r0 = right[0];
+ a = l0 * r0;
+ l1 = left[1];
+ r1 = right[1];
+ out[0] = a;
+ a = (l0 * r1) + (l1 * r0);
+ l2 = left[2];
+ r2 = right[2];
+ out[1] = a;
+ a = (l0 * r2) + (l1 * r1) + (l2 * r0);
+ l3 = left[3];
+ r3 = right[3];
+ out[2] = a;
+
+ out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0);
+ out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1);
+ out[5] = (l2 * r3) + (l3 * r2);
+ out[6] = l3 * r3;
+}
+
+static void convolve_run (stack_entry * top, unsigned size, double * scratch)
+/* Interpret a stack of commands. The stack starts with two entries; the
+ * convolution to do, and an illegal entry used to mark the stack top. The
+ * size is the number of entries in each input, and must be a power of 2,
+ * and at least 8. It is OK to have out equal to left and/or right.
+ * scratch must have length 3*size. The number of stack entries needed is
+ * 3n-4 where size=2^n. */
+{
+ do {
+ const double * left;
+ const double * right;
+ double * out;
+
+ /* When we get here, the stack top is always a convolve,
+ * with size > 4. So we will split it. We repeatedly split
+ * the top entry until we get to size = 4. */
+
+ left = top->v.left;
+ right = top->v.right;
+ out = top->v.out;
+ top++;
+
+ do {
+ double * s_left, * s_right;
+ int i;
+
+ /* Halve the size. */
+ size >>= 1;
+
+ /* Allocate the scratch areas. */
+ s_left = scratch + size * 3;
+ /* s_right is a length 2*size buffer also used for
+ * intermediate output. */
+ s_right = scratch + size * 4;
+
+ /* Create the intermediate factors. */
+ for (i = 0; i < size; i++) {
+ double l = left[i] + left[i + size];
+ double r = right[i] + right[i + size];
+ s_left[i + size] = r;
+ s_left[i] = l;
+ }
+
+ /* Push the combine entry onto the stack. */
+ top -= 3;
+ top[2].b.main = out;
+ top[2].b.null = NULL;
+
+ /* Push the low entry onto the stack. This must be
+ * the last of the three sub-convolutions, because
+ * it may overwrite the arguments. */
+ top[1].v.left = left;
+ top[1].v.right = right;
+ top[1].v.out = out;
+
+ /* Push the mid entry onto the stack. */
+ top[0].v.left = s_left;
+ top[0].v.right = s_right;
+ top[0].v.out = s_right;
+
+ /* Leave the high entry in variables. */
+ left += size;
+ right += size;
+ out += size * 2;
+
+ } while (size > 4);
+
+ /* When we get here, the stack top is a group of 3
+ * convolves, with size = 4, followed by some combines. */
+ convolve_4 (out, left, right);
+ convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right);
+ convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right);
+ top += 2;
+
+ /* Now process combines. */
+ do {
+ /* b.main is the output buffer, mid is the middle
+ * part which needs to be adjusted in place, and
+ * then folded back into the output. We do this in
+ * a slightly strange way, so as to avoid having
+ * two loops. */
+ double * out = top->b.main;
+ double * mid = scratch + size * 4;
+ unsigned int i;
+ top++;
+ out[size * 2 - 1] = 0;
+ for (i = 0; i < size-1; i++) {
+ double lo;
+ double hi;
+ lo = mid[0] - (out[0] + out[2 * size]) + out[size];
+ hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size];
+ out[size] = lo;
+ out[2 * size] = hi;
+ out++;
+ mid++;
+ }
+ size <<= 1;
+ } while (top->b.null == NULL);
+ } while (top->b.main != NULL);
+}
+
+int convolve_match (const int * lastchoice,
+ const short * input,
+ convolve_state * state)
+/* lastchoice is a 256 sized array. input is a 512 array. We find the
+ * contiguous length 256 sub-array of input that best matches lastchoice.
+ * A measure of how good a sub-array is compared with the lastchoice is
+ * given by the sum of the products of each pair of entries. We maximise
+ * that, by taking an appropriate convolution, and then finding the maximum
+ * entry in the convolutions. state is a (non-NULL) pointer returned by
+ * convolve_init. */
+{
+ double avg;
+ double best;
+ int p = 0;
+ int i;
+ double * left = state->left;
+ double * right = state->right;
+ double * scratch = state->scratch;
+ stack_entry * top = state->stack + STACK_SIZE - 1;
+#if 1
+ for (i = 0; i < 512; i++)
+ left[i] = input[i];
+
+ avg = 0;
+ for (i = 0; i < 256; i++) {
+ double a = lastchoice[255 - i];
+ right[i] = a;
+ avg += a;
+ }
+#endif
+ /* We adjust the smaller of the two input arrays to have average
+ * value 0. This makes the eventual result insensitive to both
+ * constant offsets and positive multipliers of the inputs. */
+ avg /= 256;
+ for (i = 0; i < 256; i++)
+ right[i] -= avg;
+ /* End-of-stack marker. */
+#if 0 /* The following line produces a CRASH, need to figure out why?!! */
+ top[1].b.null = scratch;
+#endif
+ top[1].b.main = NULL;
+ /* The low 256x256, of which we want the high 256 outputs. */
+ top->v.left = left;
+ top->v.right = right;
+ top->v.out = right + 256;
+ convolve_run (top, 256, scratch);
+
+ /* The high 256x256, of which we want the low 256 outputs. */
+ top->v.left = left + 256;
+ top->v.right = right;
+ top->v.out = right;
+ convolve_run (top, 256, scratch);
+
+ /* Now find the best position amoungs this. Apart from the first
+ * and last, the required convolution outputs are formed by adding
+ * outputs from the two convolutions above. */
+ best = right[511];
+ right[767] = 0;
+ p = -1;
+ for (i = 0; i < 256; i++) {
+ double a = right[i] + right[i + 512];
+ if (a > best) {
+ best = a;
+ p = i;
+ }
+ }
+ p++;
+
+#if 0
+ {
+ /* This is some debugging code... */
+ int bad = 0;
+ best = 0;
+ for (i = 0; i < 256; i++)
+ best += ((double) input[i+p]) * ((double) lastchoice[i] - avg);
+
+ for (i = 0; i < 257; i++) {
+ double tot = 0;
+ unsigned int j;
+ for (j = 0; j < 256; j++)
+ tot += ((double) input[i+j]) * ((double) lastchoice[j] - avg);
+ if (tot > best)
+ printf ("(%i)", i);
+ if (tot != left[i + 255])
+ printf ("!");
+ }
+
+ printf ("%i\n", p);
+ }
+#endif
+
+ return p;
+}