diff options
| author | Michael Vrhel <michael.vrhel@artifex.com> | 2011-03-04 06:45:29 +0000 |
|---|---|---|
| committer | Michael Vrhel <michael.vrhel@artifex.com> | 2011-03-04 06:45:29 +0000 |
| commit | fdc21fee6c1679b641d9f296fafac9c1a4fff19d (patch) | |
| tree | 67a0d3620f075ce3f0baca742ad394464e20050c /gs | |
| parent | bf07d980ff2349ca540f9e87fd99e10729815b08 (diff) | |
Reorganization of threshold code to move all the thresholding operations into a new file.
git-svn-id: http://svn.ghostscript.com/ghostscript/trunk@12238 a1074d23-0009-0410-80fe-cf8c14f379e6
Diffstat (limited to 'gs')
| -rw-r--r-- | gs/base/gsiparam.h | 13 | ||||
| -rw-r--r-- | gs/base/gxht_thresh.c | 328 | ||||
| -rw-r--r-- | gs/base/gxht_thresh.h | 35 | ||||
| -rw-r--r-- | gs/base/gximage.h | 12 | ||||
| -rw-r--r-- | gs/base/gximono.c | 327 | ||||
| -rw-r--r-- | gs/base/lib.mak | 10 |
6 files changed, 388 insertions, 337 deletions
diff --git a/gs/base/gsiparam.h b/gs/base/gsiparam.h index f31f2477f..eb6eed3bb 100644 --- a/gs/base/gsiparam.h +++ b/gs/base/gsiparam.h @@ -288,6 +288,19 @@ void gs_image_t_init_mask_adjust(gs_image_t * pim, bool write_1s, #define gs_image_t_init_mask(pim, write_1s)\ gs_image_t_init_mask_adjust(pim, write_1s, true) +/* Used for bookkeeping ht buffer information in lanscape mode */ +typedef struct ht_landscape_info_s { + int count; + int widths[16]; + int xstart; + int curr_pos; + int index; + int num_contones; + bool offset_set; + bool flipy; + int y_pos; +} ht_landscape_info_t; + /****** REMAINDER OF FILE UNDER CONSTRUCTION. PROCEED AT YOUR OWN RISK. ******/ diff --git a/gs/base/gxht_thresh.c b/gs/base/gxht_thresh.c new file mode 100644 index 000000000..0deb156fe --- /dev/null +++ b/gs/base/gxht_thresh.c @@ -0,0 +1,328 @@ +/* Copyright (C) 2011-2012 Artifex Software, Inc.
+ All Rights Reserved.
+
+ This software is provided AS-IS with no warranty, either express or
+ implied.
+
+ This software is distributed under license and may not be copied, modified
+ or distributed except as expressly authorized under the terms of that
+ license. Refer to licensing information at http://www.artifex.com/
+ or contact Artifex Software, Inc., 7 Mt. Lassen Drive - Suite A-134,
+ San Rafael, CA 94903, U.S.A., +1(415)492-9861, for further information.
+*/
+
+/*$Id: gxht_thresh.c $ */
+/* Halftone thresholding code */
+
+#include "memory_.h"
+#include "gx.h"
+#include "gsiparam.h"
+#include "gxht_thresh.h"
+#include "math_.h"
+
+#ifndef __WIN32__
+#define __align16 __attribute__((align(16)))
+#else
+#define __align16 __declspec(align(16))
+#endif
+
+#ifdef HAVE_SSE2
+
+#include <emmintrin.h>
+
+static const byte bitreverse[] =
+{ 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0,
+ 0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8,
+ 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4,
+ 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
+ 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC,
+ 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2,
+ 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
+ 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
+ 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6,
+ 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE,
+ 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1,
+ 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
+ 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9,
+ 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
+ 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD,
+ 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
+ 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3,
+ 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB,
+ 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7,
+ 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
+ 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
+ 0x3F, 0xBF, 0x7F, 0xFF};
+#endif
+
+#if RAW_HT_DUMP
+/* This is slow thresholding, byte output for debug only */
+/* Debug-only thresholding: emits one full BYTE (0 or 255) per pixel
+   instead of packed bits, so the output can be dumped and viewed as a
+   raw 8-bit grayscale image.  Compiled only when RAW_HT_DUMP is set. */
+void
+gx_ht_threshold_row_byte(byte *contone, byte *threshold_strip, int contone_stride,
+                  byte *halftone, int dithered_stride, int width,
+                  int num_rows)
+{
+    int k, j;
+    byte *contone_ptr;
+    byte *thresh_ptr;
+    byte *halftone_ptr;
+
+    /* For the moment just do a very slow compare until we
+       get this working */
+    for (j = 0; j < num_rows; j++) {
+        /* NOTE(review): the same contone row is reused for every output
+           row; only the threshold and halftone rows advance by their
+           strides.  Presumably intentional (replicated contone data) --
+           confirm against the caller. */
+        contone_ptr = contone;
+        thresh_ptr = threshold_strip + contone_stride * j;
+        halftone_ptr = halftone + dithered_stride * j;
+        for (k = 0; k < width; k++) {
+            if (contone_ptr[k] < thresh_ptr[k]) {
+                halftone_ptr[k] = 0;    /* below threshold */
+            } else {
+                halftone_ptr[k] = 255;  /* at or above threshold */
+            }
+        }
+    }
+}
+#endif
+
+#ifndef HAVE_SSE2
+
+/* A simple case for use in the landscape mode. Could probably be coded up
+ faster */
+/* A simple case for use in the landscape mode.  Could probably be coded up
+   faster.
+   Scalar (non-SSE2) fallback: compares 16 contone bytes against 16
+   threshold bytes and packs the 16 results MSB-first into ht_data[0]
+   and ht_data[1].  A bit is set when contone < threshold, matching the
+   sense of the SSE2 path.  Each output bit is explicitly set or cleared
+   (|= / &= ~), so any stale contents of ht_data are fully overwritten. */
+static void
+threshold_16_bit(byte *contone_ptr_in, byte *thresh_ptr_in, byte *ht_data)
+{
+    int k, j;
+    byte *contone_ptr = contone_ptr_in;
+    byte *thresh_ptr = thresh_ptr_in;
+    byte bit_init;
+
+    for (j = 0; j < 2; j++) {           /* one output byte per 8 pixels */
+        bit_init = 0x80;                /* leftmost pixel -> MSB */
+        for (k = 0; k < 8; k++) {
+            if (contone_ptr[k] < thresh_ptr[k]) {
+                ht_data[j] |= bit_init;
+            } else {
+                ht_data[j] &= ~bit_init;
+            }
+            bit_init >>= 1;
+        }
+        contone_ptr += 8;
+        thresh_ptr += 8;
+    }
+}
+#else
+/* Note this function has strict data alignment needs */
+/* Note this function has strict data alignment needs: both contone_ptr
+   and thresh_ptr must be 16-byte aligned for _mm_load_si128.
+   Thresholds 16 pixels at once.  XOR-ing both inputs with 0x80 maps the
+   unsigned bytes onto signed range so that the signed saturating
+   subtract yields a negative result exactly when contone < threshold;
+   _mm_movemask_epi8 then gathers the 16 sign bits (lane 0 in bit 0),
+   and the bitreverse table flips each byte to the MSB-first bit order
+   the halftone buffer expects. */
+static void
+threshold_16_SSE(byte *contone_ptr, byte *thresh_ptr, byte *ht_data)
+{
+    __m128i input1;
+    __m128i input2;
+    register int result_int;
+    const unsigned int mask1 = 0x80808080;
+    __m128i sign_fix = _mm_set_epi32(mask1, mask1, mask1, mask1);
+
+    /* Load (requires 128-bit alignment) */
+    input1 = _mm_load_si128((const __m128i *)contone_ptr);
+    input2 = _mm_load_si128((const __m128i *) thresh_ptr);
+    /* Unsigned subtraction does Unsigned saturation so we
+       have to use the signed operation */
+    input1 = _mm_xor_si128(input1, sign_fix);
+    input2 = _mm_xor_si128(input2, sign_fix);
+    /* Subtract the two; sign of each lane is (contone < threshold) */
+    input2 = _mm_subs_epi8(input1, input2);
+    /* Grab the sign mask */
+    result_int = _mm_movemask_epi8(input2);
+    /* bit wise reversal on 16 bit word */
+    ht_data[0] = bitreverse[(result_int & 0xff)];
+    ht_data[1] = bitreverse[((result_int >> 8) & 0xff)];
+}
+
+/* Not so fussy on its alignment */
+/* Not so fussy on its alignment: same operation as threshold_16_SSE but
+   uses unaligned loads, for the left-edge remainder where the pointers
+   are not yet 16-byte aligned. */
+static void
+threshold_16_SSE_unaligned(byte *contone_ptr, byte *thresh_ptr, byte *ht_data)
+{
+    __m128i input1;
+    __m128i input2;
+    int result_int;
+    byte *sse_data;
+    const unsigned int mask1 = 0x80808080;
+    __m128i sign_fix = _mm_set_epi32(mask1, mask1, mask1, mask1);
+
+    /* NOTE(review): reading result_int through a byte pointer picks up
+       its two low-order bytes only on a little-endian host; on a
+       big-endian target sse_data[0]/[1] would be the high bytes.  The
+       aligned variant masks/shifts instead -- verify this path is only
+       built for little-endian platforms, or use the same masking here. */
+    sse_data = (byte*) &(result_int);
+    /* Load (no alignment requirement) */
+    input1 = _mm_loadu_si128((const __m128i *)contone_ptr);
+    input2 = _mm_loadu_si128((const __m128i *) thresh_ptr);
+    /* Unsigned subtraction does Unsigned saturation so we
+       have to use the signed operation */
+    input1 = _mm_xor_si128(input1, sign_fix);
+    input2 = _mm_xor_si128(input2, sign_fix);
+    /* Subtract the two */
+    input2 = _mm_subs_epi8(input1, input2);
+    /* Grab the sign mask */
+    result_int = _mm_movemask_epi8(input2);
+    /* bit wise reversal on 16 bit word */
+    ht_data[0] = bitreverse[sse_data[0]];
+    ht_data[1] = bitreverse[sse_data[1]];
+}
+#endif
+
+/* SSE2 and non-SSE2 implementation of thresholding a row */
+/* Threshold num_rows rows of width pixels, packing the result one bit
+   per pixel into halftone.  offset_bits is the count of left-edge
+   pixels that precede the 16-byte-aligned portion of each row; they are
+   handled first into the leading (up to) 16 output bits, after which
+   the remainder of the row is processed 16-byte aligned.
+   NOTE(review): the offset_bits > 0 case always advances halftone_ptr
+   by exactly 2 bytes, so it assumes offset_bits <= 16 -- confirm the
+   caller guarantees this. */
+void
+gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stride,
+                  byte *halftone, int dithered_stride, int width,
+                  int num_rows, int offset_bits)
+{
+#ifndef HAVE_SSE2
+    int k, j;
+    byte *contone_ptr;
+    byte *thresh_ptr;
+    byte *halftone_ptr;
+    byte bit_init;
+    int ht_index;
+
+    /* For the moment just do a very slow compare until we
+       get this working.  This could use some serious optimization */
+    for (j = 0; j < num_rows; j++) {
+        contone_ptr = contone;
+        thresh_ptr = threshold_strip + contone_stride * j;
+        halftone_ptr = halftone + dithered_stride * j;
+        /* First get the left remainder portion.  Put into MSBs of first byte */
+        bit_init = 0x80;
+        ht_index = -1;                  /* pre-incremented on k % 8 == 0 */
+        for (k = 0; k < offset_bits; k++) {
+            if ( (k % 8) == 0) {
+                ht_index++;
+            }
+            if (contone_ptr[k] < thresh_ptr[k]) {
+                halftone_ptr[ht_index] |= bit_init;
+            } else {
+                halftone_ptr[ht_index] &= ~bit_init;
+            }
+            if (bit_init == 1) {
+                bit_init = 0x80;        /* wrap to MSB of the next byte */
+            } else {
+                bit_init >>= 1;
+            }
+        }
+        bit_init = 0x80;
+        ht_index = -1;
+        if (offset_bits > 0) {
+            halftone_ptr += 2; /* Point to the next 16 bits of data */
+        }
+        /* Now get the rest, which will be 16 bit aligned. */
+        for (k = offset_bits; k < width; k++) {
+            if (((k - offset_bits) % 8) == 0) {
+                ht_index++;
+            }
+            if (contone_ptr[k] < thresh_ptr[k]) {
+                halftone_ptr[ht_index] |= bit_init;
+            } else {
+                halftone_ptr[ht_index] &= ~bit_init;
+            }
+            if (bit_init == 1) {
+                bit_init = 0x80;
+            } else {
+                bit_init >>= 1;
+            }
+        }
+    }
+#else
+    byte *contone_ptr;
+    byte *thresh_ptr;
+    byte *halftone_ptr;
+    /* Number of 16-pixel tiles in the aligned portion of a row */
+    int num_tiles = (int) ceil((float) (width - offset_bits)/16.0);
+    int k, j;
+
+    for (j = 0; j < num_rows; j++) {
+        /* contone and thresh_ptr are 128 bit aligned.  We do need to do this in
+           two steps to ensure that we pack the bits in an aligned fashion
+           into halftone_ptr. */
+        contone_ptr = contone;
+        thresh_ptr = threshold_strip + contone_stride * j;
+        halftone_ptr = halftone + dithered_stride * j;
+        if (offset_bits > 0) {
+            /* Since we allowed for 16 bits in our left remainder
+               we can go directly in to the destination.  threshold_16_SSE
+               requires 128 bit alignment.  contone_ptr and thresh_ptr
+               are set up so that after we move in by offset_bits elements
+               then we are 128 bit aligned. */
+            threshold_16_SSE_unaligned(contone_ptr, thresh_ptr,
+                                       halftone_ptr);
+            halftone_ptr += 2;
+            thresh_ptr += offset_bits;
+            contone_ptr += offset_bits;
+        }
+        /* Now we should have 128 bit aligned with our input data.  Iterate
+           over sets of 16 going directly into our HT buffer.  Sources and
+           halftone_ptr buffers should be padded to allow 15 bit overrun */
+        for (k = 0; k < num_tiles; k++) {
+            threshold_16_SSE(contone_ptr, thresh_ptr, halftone_ptr);
+            thresh_ptr += 16;
+            contone_ptr += 16;
+            halftone_ptr += 2;
+        }
+    }
+#endif
+}
+
+
+/* This thresholds a buffer that is 16 wide by data_length tall */
+/* This thresholds a buffer that is 16 wide by data_length tall.
+   Each source row holds num_contones distinct contone values; each value
+   is replicated local_widths[i] times to expand the row to 16 pixels
+   before thresholding, producing 2 output bytes per row.
+   NOTE(review): ht_landscape is deliberately(?) passed by value -- the
+   widths are copied again into local_widths anyway, so a const pointer
+   would avoid the struct copy.  Confirm before changing the signature.
+   NOTE(review): __align16 expands to __attribute__((align(16))) on
+   non-Win32 builds; GCC spells this attribute "aligned", so the
+   16-byte alignment threshold_16_SSE requires may not actually be
+   applied to contone[] -- verify. */
+void
+gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,
+                    ht_landscape_info_t ht_landscape, byte *halftone,
+                    int data_length)
+{
+    __align16 byte contone[16];
+    int position_start, position, curr_position;
+    int *widths = &(ht_landscape.widths[0]);
+    int local_widths[16];
+    int num_contone = ht_landscape.num_contones;
+    int k, j, w, contone_out_posit;
+    byte *contone_ptr, *thresh_ptr, *halftone_ptr;
+
+    /* Work through chunks of 16. */
+    /* Data may have come in left to right or right to left. */
+    if (ht_landscape.index > 0) {
+        position = position_start = 0;
+    } else {
+        position = position_start = ht_landscape.curr_pos + 1;
+    }
+    thresh_ptr = thresh_align;
+    halftone_ptr = halftone;
+    /* Copy the widths to a local array, and truncate the last one (which may
+     * be the first one!) if required, so the widths sum to at most 16. */
+    k = 0;
+    for (j = 0; j < num_contone; j++)
+        k += (local_widths[j] = widths[position_start+j]);
+    if (k > 16) {
+        if (ht_landscape.index > 0) {
+            local_widths[num_contone-1] -= k-16;
+        } else {
+            local_widths[0] -= k-16;
+        }
+    }
+    for (k = data_length; k > 0; k--) { /* Loop on rows */
+        contone_ptr = &(contone_align[position]); /* Point us to our row start */
+        curr_position = 0; /* We use this in keeping track of widths */
+        contone_out_posit = 0; /* Our index out */
+        for (j = num_contone; j > 0; j--) {
+            byte c = *contone_ptr;
+            /* Replicate this contone value across its width */
+            for (w = local_widths[curr_position]; w > 0; w--) {
+                contone[contone_out_posit] = c;
+                contone_out_posit++;
+            }
+            curr_position++; /* Move us to the next position in our width array */
+            contone_ptr++; /* Move us to a new location in our contone buffer */
+        }
+        /* Now we have our left justified and expanded contone data for a single
+           set of 16.  Go ahead and threshold these */
+#ifdef HAVE_SSE2
+        threshold_16_SSE(&(contone[0]), thresh_ptr, halftone_ptr);
+#else
+        threshold_16_bit(&(contone[0]), thresh_ptr, halftone_ptr);
+#endif
+        thresh_ptr += 16;
+        position += 16;   /* Next source row (rows are 16 bytes apart) */
+        halftone_ptr += 2;
+    }
+}
diff --git a/gs/base/gxht_thresh.h b/gs/base/gxht_thresh.h new file mode 100644 index 000000000..b7b6b6d40 --- /dev/null +++ b/gs/base/gxht_thresh.h @@ -0,0 +1,35 @@ +/* Copyright (C) 2001-2006 Artifex Software, Inc.
+ All Rights Reserved.
+
+ This software is provided AS-IS with no warranty, either express or
+ implied.
+
+ This software is distributed under license and may not be copied, modified
+ or distributed except as expressly authorized under the terms of that
+ license. Refer to licensing information at http://www.artifex.com/
+ or contact Artifex Software, Inc., 7 Mt. Lassen Drive - Suite A-134,
+ San Rafael, CA 94903, U.S.A., +1(415)492-9861, for further information.
+*/
+
+/* $Id: gxht_thresh.h $ */
+/* Threshold based halftoning prototypes */
+
+#ifndef gsht_thresh_INCLUDED
+# define gsht_thresh_INCLUDED
+
+#define RAW_HT_DUMP 0
+
+#if RAW_HT_DUMP
+void gx_ht_threshold_row_byte(byte *contone, byte *threshold_strip,
+ int contone_stride, byte *halftone,
+ int dithered_stride, int width, int num_rows);
+#endif
+void gx_ht_threshold_row_bit(byte *contone, byte *threshold_strip,
+ int contone_stride, byte *halftone,
+ int dithered_stride, int width, int num_rows,
+ int offset_bits);
+void gx_ht_threshold_landscape(byte *contone_align, byte *thresh_align,
+ ht_landscape_info_t ht_landscape, byte *halftone,
+ int data_length);
+#endif /* gsht_thresh_INCLUDED */
+
diff --git a/gs/base/gximage.h b/gs/base/gximage.h index bc7a2fa92..f086db3e9 100644 --- a/gs/base/gximage.h +++ b/gs/base/gximage.h @@ -88,18 +88,6 @@ struct sample_map_s { bool inverted; }; -/* Used for bookkeeping ht buffer information in lanscape mode */ -typedef struct ht_landscape_info_s { - int count; - int widths[16]; - int xstart; - int curr_pos; - int index; - int num_contones; - bool offset_set; - bool flipy; - int y_pos; -} ht_landscape_info_t; #ifndef sample_map_DEFINED #define sample_map_DEFINED diff --git a/gs/base/gximono.c b/gs/base/gximono.c index 8cf9ced57..fbd46f471 100644 --- a/gs/base/gximono.c +++ b/gs/base/gximono.c @@ -38,47 +38,11 @@ #include "gsicc_littlecms.h" #include "gxcie.h" #include "gscie.h" +#include "gxht_thresh.h" -#define RAW_HT_DUMP 0 #define USE_FAST_CODE 1 #define fastfloor(x) (((int)(x)) - (((x)<0) && ((x) != (float)(int)(x)))) -/* This should be moved someplace else later */ -#ifndef __WIN32__ -#define __align16 __attribute__((align(16))) -#else -#define __align16 __declspec(align(16)) -#endif - -#ifdef HAVE_SSE2 - -#include <emmintrin.h> - -static const byte bitreverse[] = -{ 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, - 0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, - 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, - 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, - 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, - 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, - 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, - 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, - 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, - 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, - 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, - 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 
0x31, 0xB1, 0x71, 0xF1, - 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, - 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, - 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, - 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, - 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, - 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, - 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, - 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, - 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, - 0x3F, 0xBF, 0x7F, 0xFF}; -#endif - /* ------ Strategy procedure ------ */ /* Check the prototype. */ @@ -942,283 +906,6 @@ fill_threshhold_buffer(byte *dest_strip, byte *src_strip, int src_width, memcpy(ptr_out_temp, src_strip, right_width); } -#if RAW_HT_DUMP -/* This is slow thresholding, byte output for debug only */ -static void -threshold_row_byte(byte *contone, byte *threshold_strip, int contone_stride, - byte *halftone, int dithered_stride, int width, - int num_rows) -{ - int k, j; - byte *contone_ptr; - byte *thresh_ptr; - byte *halftone_ptr; - - /* For the moment just do a very slow compare until we get - get this working */ - for (j = 0; j < num_rows; j++) { - contone_ptr = contone; - thresh_ptr = threshold_strip + contone_stride * j; - halftone_ptr = halftone + dithered_stride * j; - for (k = 0; k < width; k++) { - if (contone_ptr[k] < thresh_ptr[k]) { - halftone_ptr[k] = 0; - } else { - halftone_ptr[k] = 255; - } - } - } -} -#endif - -#ifndef HAVE_SSE2 -/* This is slow thresholding bit output */ -static void -threshold_row_bit(byte *contone, byte *threshold_strip, int contone_stride, - byte *halftone, int dithered_stride, int width, - int num_rows, int offset_bits) -{ - int k, j; - byte *contone_ptr; - byte *thresh_ptr; - byte *halftone_ptr; - byte bit_init; - int ht_index; 
- - /* For the moment just do a very slow compare until we get - get this working. This could use some serious optimization */ - for (j = 0; j < num_rows; j++) { - contone_ptr = contone; - thresh_ptr = threshold_strip + contone_stride * j; - halftone_ptr = halftone + dithered_stride * j; - /* First get the left remainder portion. Put into MSBs of first byte */ - bit_init = 0x80; - ht_index = -1; - for (k = 0; k < offset_bits; k++) { - if ( (k % 8) == 0) { - ht_index++; - } - if (contone_ptr[k] < thresh_ptr[k]) { - halftone_ptr[ht_index] |= bit_init; - } else { - halftone_ptr[ht_index] &= ~bit_init; - } - if (bit_init == 1) { - bit_init = 0x80; - } else { - bit_init >>= 1; - } - } - bit_init = 0x80; - ht_index = -1; - if (offset_bits > 0) { - halftone_ptr += 2; /* Point to the next 16 bits of data */ - } - /* Now get the rest, which will be 16 bit aligned. */ - for (k = offset_bits; k < width; k++) { - if (((k - offset_bits) % 8) == 0) { - ht_index++; - } - if (contone_ptr[k] < thresh_ptr[k]) { - halftone_ptr[ht_index] |= bit_init; - } else { - halftone_ptr[ht_index] &= ~bit_init; - } - if (bit_init == 1) { - bit_init = 0x80; - } else { - bit_init >>= 1; - } - } - } -} - -/* A simple case for use in the landscape mode. 
Could probably be coded up - faster */ -static void -threshold_16_bit(byte *contone_ptr_in, byte *thresh_ptr_in, byte *ht_data) -{ - int k, j; - byte *contone_ptr = contone_ptr_in; - byte *thresh_ptr = thresh_ptr_in; - byte bit_init; - - for (j = 0; j < 2; j++) { - bit_init = 0x80; - for (k = 0; k < 8; k++) { - if (contone_ptr[k] < thresh_ptr[k]) { - ht_data[j] |= bit_init; - } else { - ht_data[j] &= ~bit_init; - } - bit_init >>= 1; - } - contone_ptr += 8; - thresh_ptr += 8; - } -} -#else -/* Note this function has strict data alignment needs */ -static void -threshold_16_SSE(byte *contone_ptr, byte *thresh_ptr, byte *ht_data) -{ - __m128i input1; - __m128i input2; - register int result_int; - const unsigned int mask1 = 0x80808080; - __m128i sign_fix = _mm_set_epi32(mask1, mask1, mask1, mask1); - - /* Load */ - input1 = _mm_load_si128((const __m128i *)contone_ptr); - input2 = _mm_load_si128((const __m128i *) thresh_ptr); - /* Unsigned subtraction does Unsigned saturation so we - have to use the signed operation */ - input1 = _mm_xor_si128(input1, sign_fix); - input2 = _mm_xor_si128(input2, sign_fix); - /* Subtract the two */ - input2 = _mm_subs_epi8(input1, input2); - /* Grab the sign mask */ - result_int = _mm_movemask_epi8(input2); - /* bit wise reversal on 16 bit word */ - ht_data[0] = bitreverse[(result_int & 0xff)]; - ht_data[1] = bitreverse[((result_int >> 8) & 0xff)]; -} - -/* Not so fussy on its alignment */ -static void -threshold_16_SSE_unaligned(byte *contone_ptr, byte *thresh_ptr, byte *ht_data) -{ - __m128i input1; - __m128i input2; - int result_int; - byte *sse_data; - const unsigned int mask1 = 0x80808080; - __m128i sign_fix = _mm_set_epi32(mask1, mask1, mask1, mask1); - - sse_data = (byte*) &(result_int); - /* Load */ - input1 = _mm_loadu_si128((const __m128i *)contone_ptr); - input2 = _mm_loadu_si128((const __m128i *) thresh_ptr); - /* Unsigned subtraction does Unsigned saturation so we - have to use the signed operation */ - input1 = 
_mm_xor_si128(input1, sign_fix); - input2 = _mm_xor_si128(input2, sign_fix); - /* Subtract the two */ - input2 = _mm_subs_epi8(input1, input2); - /* Grab the sign mask */ - result_int = _mm_movemask_epi8(input2); - /* bit wise reversal on 16 bit word */ - ht_data[0] = bitreverse[sse_data[0]]; - ht_data[1] = bitreverse[sse_data[1]]; -} - -/* This uses SSE2 simd operations to perform the thresholding operation. - Intrinsics are used since in-line assm is not supported in Visual - Studio on 64 bit machines, plus instrinsics are easily ported between - Visual Studio and gcc. requires <emmintrin.h> */ -static void -threshold_row_SSE(byte *contone, byte *threshold_strip, int contone_stride, - byte *halftone, int dithered_stride, int width, - int num_rows, int offset_bits) -{ - byte *contone_ptr; - byte *thresh_ptr; - byte *halftone_ptr; - int num_tiles = (int) ceil((float) (width - offset_bits)/16.0); - int k, j; - - for (j = 0; j < num_rows; j++) { - /* contone and thresh_ptr are 128 bit aligned. We do need to do this in - two steps to ensure that we pack the bits in an aligned fashion - into halftone_ptr. */ - contone_ptr = contone; - thresh_ptr = threshold_strip + contone_stride * j; - halftone_ptr = halftone + dithered_stride * j; - if (offset_bits > 0) { - /* Since we allowed for 16 bits in our left remainder - we can go directly in to the destination. threshold_16_SSE - requires 128 bit alignment. contone_ptr and thresh_ptr - are set up so that after we move in by offset_bits elements - then we are 128 bit aligned. */ - threshold_16_SSE_unaligned(contone_ptr, thresh_ptr, - halftone_ptr); - halftone_ptr += 2; - thresh_ptr += offset_bits; - contone_ptr += offset_bits; - } - /* Now we should have 128 bit aligned with our input data. Iterate - over sets of 16 going directly into our HT buffer. 
Sources and - halftone_ptr buffers should be padded to allow 15 bit overrun */ - for (k = 0; k < num_tiles; k++) { - threshold_16_SSE(contone_ptr, thresh_ptr, halftone_ptr); - thresh_ptr += 16; - contone_ptr += 16; - halftone_ptr += 2; - } - } -} -#endif - -/* This thresholds a buffer that is 16 wide by data_length tall */ -static void -threshold_landscape(byte *contone_align, byte *thresh_align, - ht_landscape_info_t ht_landscape, byte *halftone, - int data_length) -{ - __align16 byte contone[16]; - int position_start, position, curr_position; - int *widths = &(ht_landscape.widths[0]); - int local_widths[16]; - int num_contone = ht_landscape.num_contones; - int k, j, w, contone_out_posit; - byte *contone_ptr, *thresh_ptr, *halftone_ptr; - - /* Work through chunks of 16. */ - /* Data may have come in left to right or right to left. */ - if (ht_landscape.index > 0) { - position = position_start = 0; - } else { - position = position_start = ht_landscape.curr_pos + 1; - } - thresh_ptr = thresh_align; - halftone_ptr = halftone; - /* Copy the widths to a local array, and truncate the last one (which may - * be the first one!) if required. 
*/ - k = 0; - for (j = 0; j < num_contone; j++) - k += (local_widths[j] = widths[position_start+j]); - if (k > 16) { - if (ht_landscape.index > 0) { - local_widths[num_contone-1] -= k-16; - } else { - local_widths[0] -= k-16; - } - } - for (k = data_length; k > 0; k--) { /* Loop on rows */ - contone_ptr = &(contone_align[position]); /* Point us to our row start */ - curr_position = 0; /* We use this in keeping track of widths */ - contone_out_posit = 0; /* Our index out */ - for (j = num_contone; j > 0; j--) { - byte c = *contone_ptr; - for (w = local_widths[curr_position]; w > 0; w--) { - contone[contone_out_posit] = c; - contone_out_posit++; - } - curr_position++; /* Move us to the next position in our width array */ - contone_ptr++; /* Move us to a new location in our contone buffer */ - } - /* Now we have our left justified and expanded contone data for a single - set of 16. Go ahead and threshold these */ -#ifdef HAVE_SSE2 - threshold_16_SSE(&(contone[0]), thresh_ptr, halftone_ptr); -#else - threshold_16_bit(&(contone[0]), thresh_ptr, halftone_ptr); -#endif - thresh_ptr += 16; - position += 16; - halftone_ptr += 2; - } -} /* If we are in here, we had data left over. 
Move it to the proper position and get ht_landscape_info_t set properly */ @@ -1599,7 +1286,7 @@ flush: } /* Apply the threshold operation */ #if RAW_HT_DUMP - threshold_row_byte(contone_align, thresh_align, contone_stride, + gx_ht_threshold_row_byte(contone_align, thresh_align, contone_stride, halftone, dithered_stride, dest_width, vdi); sprintf(file_name,"HT_Portrait_%d_%dx%dx%d.raw", penum->id, dest_width, dest_height, spp_out); @@ -1607,15 +1294,9 @@ flush: fwrite(halftone,1,dest_width * vdi,fid); fclose(fid); #else -#ifdef HAVE_SSE2 - threshold_row_SSE(contone_align, thresh_align, contone_stride, + gx_ht_threshold_row_bit(contone_align, thresh_align, contone_stride, halftone, dithered_stride, dest_width, vdi, offset_bits); -#else - threshold_row_bit(contone_align, thresh_align, contone_stride, - halftone, dithered_stride, dest_width, vdi, - offset_bits); -#endif /* Now do the copy mono operation */ /* First the left remainder bits */ if (offset_bits > 0) { @@ -1712,7 +1393,7 @@ flush: memcpy(ptr_out, thresh_align, 16 * tile_remainder); } /* Apply the threshold operation */ - threshold_landscape(contone_align, thresh_align, + gx_ht_threshold_landscape(contone_align, thresh_align, penum->ht_landscape, halftone, data_length); /* Perform the copy mono */ penum->ht_landscape.offset_set = false; diff --git a/gs/base/lib.mak b/gs/base/lib.mak index c0ad2e749..2a9b4663d 100644 --- a/gs/base/lib.mak +++ b/gs/base/lib.mak @@ -458,6 +458,7 @@ gxcspace_h=$(GLSRC)gxcspace.h\ $(gscspace_h) $(gsccolor_h) $(gscsel_h) $(gxfrac_h) $(gxcindex_h) gxht_h=$(GLSRC)gxht.h $(gsht1_h) $(gsrefct_h) $(gxhttype_h) $(gxtmap_h) $(gscspace_h) gxcie_h=$(GLSRC)gxcie.h $(gscie_h) +gxht_thresh_h=$(GLSRC)gxht_thresh.h gxpcolor_h=$(GLSRC)gxpcolor.h\ $(gspcolor_h) $(gxcspace_h) $(gxdevice_h) $(gxdevmem_h) $(gxpcache_h) $(gxblend_h)\ $(gxcpath_h) $(gxdcolor_h) $(gxiclass_h) @@ -650,6 +651,10 @@ $(GLOBJ)gxhtbit.$(OBJ) : $(GLSRC)gxhtbit.c $(GXERR) $(memory__h)\ $(gxbitmap_h) $(gxdht_h) $(gxdhtres_h) 
$(gxhttile_h) $(gxtmap_h) $(GLCC) $(GLO_)gxhtbit.$(OBJ) $(C_) $(GLSRC)gxhtbit.c +$(GLOBJ)gxht_thresh.$(OBJ) : $(GLSRC)gxht_thresh.c $(GXERR) $(memory__h)\ + $(gx_h) $(gsiparam_h) $(gxht_thresh_h) $(math__h) + $(GLCC) $(GLO_)gxht_thresh.$(OBJ) $(C_) $(GLSRC)gxht_thresh.c + $(GLOBJ)gxwts.$(OBJ) : $(GLSRC)gxwts.c $(GXERR) $(gxwts_h)\ $(stdpre_h) $(memory__h) $(gxstate_h) $(gxht_h) $(math__h) $(gxdevcli_h)\ $(gxdht_h) @@ -685,7 +690,7 @@ $(GLOBJ)gximono.$(OBJ) : $(GLSRC)gximono.c $(GXERR) $(memory__h) $(gpcheck_h)\ $(gxarith_h) $(gxcmap_h) $(gxcpath_h) $(gxdcolor_h) $(gxdevice_h)\ $(gxdevmem_h) $(gxfixed_h) $(gximage_h) $(gxistate_h) $(gxmatrix_h)\ $(gzht_h) $(vdtrace_h) $(gsicc_h) $(gsicc_cache_h) $(gsicc_littlecms_h)\ - $(gxcie_h) $(gscie_h) + $(gxcie_h) $(gscie_h) $(gxht_thresh_h) $(GLCC) $(GLO_)gximono.$(OBJ) $(C_) $(GLSRC)gximono.c $(GLOBJ)gximask.$(OBJ) : $(GLSRC)gximask.c $(GXERR) $(memory__h) $(gserrors_h)\ @@ -1131,7 +1136,8 @@ LIB1x=$(GLOBJ)gxacpath.$(OBJ) $(GLOBJ)gxbcache.$(OBJ) $(GLOBJ)gxccache.$(OBJ) LIB2x=$(GLOBJ)gxccman.$(OBJ) $(GLOBJ)gxchar.$(OBJ) $(GLOBJ)gxcht.$(OBJ) LIB3x=$(GLOBJ)gxclip.$(OBJ) $(GLOBJ)gxcmap.$(OBJ) $(GLOBJ)gxcpath.$(OBJ) LIB4x=$(GLOBJ)gxdcconv.$(OBJ) $(GLOBJ)gxdcolor.$(OBJ) $(GLOBJ)gxhldevc.$(OBJ) -LIB5x=$(GLOBJ)gxfill.$(OBJ) $(GLOBJ)gxfdrop.$(OBJ) $(GLOBJ)gxht.$(OBJ) $(GLOBJ)gxhtbit.$(OBJ) +LIB5x=$(GLOBJ)gxfill.$(OBJ) $(GLOBJ)gxfdrop.$(OBJ) $(GLOBJ)gxht.$(OBJ) $(GLOBJ)gxhtbit.$(OBJ)\ + $(GLOBJ)gxht_thresh.$(OBJ) LIB6x=$(GLOBJ)gxwts.$(OBJ) $(GLOBJ)gxidata.$(OBJ) $(GLOBJ)gxifast.$(OBJ) $(GLOBJ)gximage.$(OBJ) LIB7x=$(GLOBJ)gximage1.$(OBJ) $(GLOBJ)gximono.$(OBJ) $(GLOBJ)gxipixel.$(OBJ) $(GLOBJ)gximask.$(OBJ) LIB8x=$(GLOBJ)gxi12bit.$(OBJ) $(GLOBJ)gxi16bit.$(OBJ) $(GLOBJ)gxiscale.$(OBJ) $(GLOBJ)gxpaint.$(OBJ) $(GLOBJ)gxpath.$(OBJ) $(GLOBJ)gxpath2.$(OBJ) |
