/**************************************************************************
 *
 * Copyright 2010-2021 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 **************************************************************************/


#include "util/detect.h"

#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_pack_color.h"
#include "util/u_rect.h"
#include "util/u_sse.h"

#include "lp_jit.h"
#include "lp_debug.h"
#include "lp_state_fs.h"
#include "lp_linear_priv.h"

#if DETECT_ARCH_SSE

#define FIXED16_SHIFT  16
#define FIXED16_ONE    (1<<16)
#define FIXED16_HALF   (1<<15)

/*
 * Color tolerance.  Allow 1 bit of error in 8 bit unorm colors.
 */
#define FIXED16_TOL (FIXED16_ONE >> 7)

/*
 * Tolerance for texture coordinate derivatives when doing linear filtering.
 *
 * (Note that extra care needs to be taken when doing linear filtering as
 * coordinates may snap up to neighbour texels inside the tile).
 */
#define FIXED16_TOL_DERIV (FIXED16_TOL / TILE_SIZE)


static inline int
float_to_fixed16(float f)
{
   return f * (float)FIXED16_ONE;
}


static inline int
fixed16_frac(int x)
{
   return x & (FIXED16_ONE - 1);
}


static inline int
fixed16_approx(int x, int y, int tol)
{
   return y - tol <= x && x <= y + tol;
}

/* set alpha channel of rgba value to 0xff. */
static inline uint32_t
rgbx(uint32_t src_val)
{
   return src_val | 0xff000000;
}

/* swap red/blue channels of a 32-bit rgba value. */
static inline uint32_t
rb_swap(uint32_t src_val)
{
   uint32_t dst_val = src_val & 0xff00ff00;
   dst_val |= (src_val & 0xff) << 16;
   dst_val |= (src_val & 0xff0000) >> 16;
   return dst_val;
}

/* swap red/blue channels and set alpha to 0xff
 * of a 32-bit rgbx value. */
static inline uint32_t
rbx_swap(uint32_t src_val)
{
   uint32_t dst_val = 0xff000000;
   dst_val |= src_val & 0xff00;
   dst_val |= (src_val & 0xff) << 16;
   dst_val |= (src_val & 0xff0000) >> 16;
   return dst_val;
}

/* set alpha channel of 128-bit 4xrgba values to 0xff. */
static inline __m128i
rgbx_128(const __m128i src_val)
{
   const __m128i mask = _mm_set1_epi32(0xff000000);
   __m128i bgrx = _mm_or_si128(src_val, mask);
   return bgrx;
}

/* swap red/blue channels of a 128-bit 4xrgba value. */
/* ssse3 could use pshufb */
static inline __m128i
rb_swap_128(const __m128i src_val)
{
   const __m128i mask = _mm_set1_epi32(0xff00ff00);
   const __m128i mask_r = _mm_set1_epi32(0xff);

   __m128i rgba = _mm_and_si128(src_val, mask);
   __m128i r = _mm_srli_epi32(src_val, 16);
   __m128i b = _mm_and_si128(src_val, mask_r);
   r = _mm_and_si128(r, mask_r);
   b = _mm_slli_epi32(b, 16);
   rgba = _mm_or_si128(rgba, r);
   rgba = _mm_or_si128(rgba, b);
   return rgba;
}

/* swap red/blue channels and set alpha to 0xff
 * of a 128-bit 4xrgbx value. */
static inline __m128i
rbx_swap_128(const __m128i src_val)
{
   const __m128i mask_a = _mm_set1_epi32(0xff000000);
   const __m128i mask_g = _mm_set1_epi32(0xff00);
   const __m128i mask_r = _mm_set1_epi32(0xff);

   __m128i rgbx = _mm_and_si128(src_val, mask_g);
   __m128i r = _mm_srli_epi32(src_val, 16);
   __m128i b = _mm_and_si128(src_val, mask_r);
   r = _mm_and_si128(r, mask_r);
   b = _mm_slli_epi32(b, 16);
   rgbx = _mm_or_si128(rgbx, mask_a);
   rgbx = _mm_or_si128(rgbx, r);
   rgbx = _mm_or_si128(rgbx, b);
   return rgbx;
}

/*
 * Unstretched blit of a bgra texture.
 */
static const uint32_t *
fetch_memcpy_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
   const int s     = samp->s;
   const int width = samp->width;
   const uint32_t *row;

   src_row = &src_row[s >> FIXED16_SHIFT];

   if (((uintptr_t)src_row & 0xf) == 0) {
      /* The source texels are already aligned. Return them */
      row = src_row;
   } else {
      memcpy(samp->row, src_row, width * sizeof *row);
      row = samp->row;
   }

   samp->t += samp->dtdy;
   return row;
}

/**
 * Fetch and stretch one row.
 */
static inline const uint32_t *
fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
                           int y)
{
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int width = samp->width;

   /*
    * Search the stretched row cache first.
    */

   if (y == samp->stretched_row_y[0]) {
      samp->stretched_row_index = 1;
      return samp->stretched_row[0];
   }

   if (y == samp->stretched_row_y[1]) {
      samp->stretched_row_index = 0;
      return samp->stretched_row[1];
   }

   /*
    * Replace one entry.
    */

   const uint32_t * restrict src_row = data + y * stride;
   uint32_t * restrict dst_row = samp->stretched_row[samp->stretched_row_index];

   if (fixed16_frac(samp->s) == 0 &&
       samp->dsdx == FIXED16_ONE) { // TODO: could be relaxed
      /*
       * 1:1 blit on the x direction.
       */
      src_row += samp->s >> FIXED16_SHIFT;

      if (((uintptr_t)src_row & 0xf) == 0) {
         /* The source texture is already aligned. Return it */
         return src_row;
      }

      /* Copy the source texture */
      for (int i = 0; i < width; i += 4) {
         __m128i src = _mm_loadu_si128((const __m128i *)&src_row[i]);
         *(__m128i *)&dst_row[i] = src;
      }
   } else {
      util_sse2_stretch_row_8unorm((__m128i *)dst_row,
                                   align(width, 4),
                                   src_row, samp->s, samp->dsdx);
   }

   samp->stretched_row_y[samp->stretched_row_index] = y;
   samp->stretched_row_index ^= 1;

   return dst_row;
}


/* Maximise only as we fetch unscaled pixels linearly into a size-64
 * temporary.  For minimise, we will want to either have a bigger
 * temporary or fetch sparsely.
 */
static const uint32_t *
fetch_axis_aligned_linear_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
   uint32_t * restrict row = samp->row;
   const int y = samp->t >> FIXED16_SHIFT;
   const int w = (samp->t >> 8) & 0xff;

   samp->t += samp->dtdy;

   const uint32_t * restrict src_row0 = fetch_and_stretch_bgra_row(samp, y);

   if (w == 0) {
      return src_row0;
   }

   const uint32_t * restrict src_row1 = fetch_and_stretch_bgra_row(samp, y + 1);

   __m128i wt = _mm_set1_epi16(w);

   /* Combine the two rows using a constant weight.
    */
   for (int i = 0; i < width; i += 4) {
      __m128i srca = _mm_load_si128((const __m128i *)&src_row0[i]);
      __m128i srcb = _mm_load_si128((const __m128i *)&src_row1[i]);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed88(srca, srcb, &wt, &wt);
   }

   return row;
}


/* Non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 */
static const uint32_t *
fetch_linear_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t *data  = (const uint32_t *)texture->base;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i += 4) {
      union m128i si0, si1, si2, si3, ws, wt;
      __m128i si02, si13;

      for (int j = 0; j < 4; j++) {
         const uint32_t *src = data + (t >> 16) * stride + (s >> 16);

         si0.ui[j] = src[0];
         si1.ui[j] = src[1];
         si2.ui[j] = src[stride + 0];
         si3.ui[j] = src[stride + 1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }

      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 16));
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 8));

      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 16));
      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 8));

      si02 = util_sse2_lerp_epi8_fixed08(si0.m, si2.m, wt.m);
      si13 = util_sse2_lerp_epi8_fixed08(si1.m, si3.m, wt.m);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed08(si02, si13, ws.m);
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}


/* Clamped, non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 */
static const uint32_t *
fetch_clamp_linear_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data  = (const uint32_t *)texture->base;
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const int tex_height = texture->height - 1;
   const int tex_width  = texture->width - 1;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* width, height, stride (in pixels) must be smaller than 32768 */
   __m128i dsdx4, dtdx4, s4, t4, stride4, w4, h4, zero, one;
   s4 = _mm_set1_epi32(s);
   t4 = _mm_set1_epi32(t);
   s4 = _mm_add_epi32(s4, _mm_set_epi32(3*dsdx, 2*dsdx, dsdx, 0));
   t4 =  _mm_add_epi32(t4, _mm_set_epi32(3*dtdx, 2*dtdx, dtdx, 0));
   dsdx4 = _mm_set1_epi32(4*dsdx);
   dtdx4 = _mm_set1_epi32(4*dtdx);
   stride4 = _mm_set1_epi32(stride);
   w4 = _mm_set1_epi32(tex_width);
   h4 = _mm_set1_epi32(tex_height);
   zero = _mm_setzero_si128();
   one = _mm_set1_epi32(1);

   for (int i = 0; i < width; i += 4) {
      union m128i addr[4];
      __m128i ws, wt, wsl, wsh, wtl, wth;
      __m128i s4s, t4s, cs0, cs1, ct0, ct1, tmp, si[4];

      s4s = _mm_srli_epi32(s4, 16);
      t4s = _mm_srli_epi32(t4, 16);
      cs0 = _mm_min_epi16(_mm_max_epi16(s4s, zero), w4);
      cs1 = _mm_add_epi16(s4s, one);
      cs1 = _mm_min_epi16(_mm_max_epi16(cs1, zero), w4);
      ct0 = _mm_min_epi16(_mm_max_epi16(t4s, zero), h4);
      ct1 = _mm_add_epi16(t4s, one);
      ct1 = _mm_min_epi16(_mm_max_epi16(ct1, zero), h4);
      tmp = _mm_madd_epi16(ct0, stride4);
      addr[0].m = _mm_add_epi32(tmp, cs0);
      addr[1].m = _mm_add_epi32(tmp, cs1);
      tmp = _mm_madd_epi16(ct1, stride4);
      addr[2].m = _mm_add_epi32(tmp, cs0);
      addr[3].m = _mm_add_epi32(tmp, cs1);

      for (int j = 0; j < 4; j++) {
         __m128i ld1, ld2, ld3;
         si[j] = _mm_cvtsi32_si128(data[addr[j].ui[0]]);
         ld1 = _mm_cvtsi32_si128(data[addr[j].ui[1]]);
         si[j] = _mm_unpacklo_epi32(si[j], ld1);
         ld2 = _mm_cvtsi32_si128(data[addr[j].ui[2]]);
         ld3 = _mm_cvtsi32_si128(data[addr[j].ui[3]]);
         ld2 = _mm_unpacklo_epi32(ld2, ld3);
         si[j] =  _mm_unpacklo_epi64(si[j], ld2);
      }

      ws = _mm_srli_epi32(s4, 8);
      ws = _mm_and_si128(ws, _mm_set1_epi32(0xFF));
      wt = _mm_srli_epi32(t4, 8);
      wt = _mm_and_si128(wt, _mm_set1_epi32(0xFF));

      s4 = _mm_add_epi32(s4, dsdx4);
      t4 = _mm_add_epi32(t4, dtdx4);

#if 0
/* scalar code for reference */
      for (int j = 0; j < 4; j++) {
         int s0 = s >> FIXED16_SHIFT;
         int t0 = t >> FIXED16_SHIFT;
         int cs0 = CLAMP(s0    , 0, tex_width);
         int cs1 = CLAMP(s0 + 1, 0, tex_width);
         int ct0 = CLAMP(t0    , 0, tex_height);
         int ct1 = CLAMP(t0 + 1, 0, tex_height);

         si0.ui[j] = data[ct0 * stride + cs0];
         si1.ui[j] = data[ct0 * stride + cs1];
         si2.ui[j] = data[ct1 * stride + cs0];
         si3.ui[j] = data[ct1 * stride + cs1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }
#endif

      ws = _mm_or_si128(ws, _mm_slli_epi32(ws, 16));
      wsl = _mm_shuffle_epi32(ws, _MM_SHUFFLE(1,1,0,0));
      wsh = _mm_shuffle_epi32(ws, _MM_SHUFFLE(3,3,2,2));

      wt = _mm_or_si128(wt, _mm_slli_epi32(wt, 16));
      wtl = _mm_shuffle_epi32(wt, _MM_SHUFFLE(1,1,0,0));
      wth = _mm_shuffle_epi32(wt, _MM_SHUFFLE(3,3,2,2));

      *(__m128i *)&row[i] = util_sse2_lerp_2d_epi8_fixed88(si[0], si[2],
                                                           &si[1], &si[3],
                                                           &wtl, &wth,
                                                           &wsl, &wsh);
   }

   samp->s += samp->dsdy;
   samp->t += samp->dtdy;

   return row;
}

/* don't generate bgra 128-bits or memcpy ops they have their own path */
#define FETCH_TYPE bgra
#define OP
#define NO_MEMCPY
#include "lp_linear_sampler_tmp.h"

#define FETCH_TYPE bgrx
#define OP rgbx
#define OP128 rgbx_128
#include "lp_linear_sampler_tmp.h"

#define FETCH_TYPE bgra_swapped
#define OP rb_swap
#define OP128 rb_swap_128
#include "lp_linear_sampler_tmp.h"

#define FETCH_TYPE bgrx_swapped
#define OP rbx_swap
#define OP128 rbx_swap_128
#include "lp_linear_sampler_tmp.h"

static bool
sampler_is_nearest(const struct lp_linear_sampler *samp,
                   const struct lp_sampler_static_state *sampler_state,
                   bool minify)
{
   unsigned img_filter;

   if (minify)
      img_filter = sampler_state->sampler_state.min_img_filter;
   else
      img_filter = sampler_state->sampler_state.mag_img_filter;

   /* Is it obviously nearest?
    */
   if (img_filter == PIPE_TEX_FILTER_NEAREST)
      return true;

   /* Otherwise look for linear samplers which devolve to nearest.
    */

   /* Needs to be axis aligned.
    */
   if (!samp->axis_aligned)
      return false;

   if (0) {
      /* For maximizing shaders, revert to nearest
       */
      if (samp->dsdx < -FIXED16_HALF && samp->dsdx < FIXED16_HALF &&
          samp->dtdy < -FIXED16_HALF && samp->dtdy < FIXED16_HALF)
         return true;

      /* For severely minimising shaders, revert to nearest:
       */
      if ((samp->dsdx < 2 * FIXED16_ONE || samp->dsdx > 2 * FIXED16_ONE) &&
          (samp->dtdy < 2 * FIXED16_ONE || samp->dtdy > 2 * FIXED16_ONE))
         return true;
   }

   /*
    * Must be near a pixel center:
    */
   if (!fixed16_approx(fixed16_frac(samp->s), FIXED16_HALF, FIXED16_TOL) ||
       !fixed16_approx(fixed16_frac(samp->t), FIXED16_HALF, FIXED16_TOL))
      return false;

   /*
    * Must make a full step between pixels:
    */
   if (!fixed16_approx(samp->dsdx, FIXED16_ONE, FIXED16_TOL_DERIV) ||
       !fixed16_approx(samp->dtdy, FIXED16_ONE, FIXED16_TOL_DERIV))
      return false;

   /* Treat it as nearest!
    */
   return true;
}


/* XXX: Lots of static-state parameters being passed in here but very
 * little info is extracted from each one.  Consolidate it all down to
 * something succinct in the prepare phase?
 */
bool
lp_linear_init_sampler(struct lp_linear_sampler *samp,
                       const struct lp_tgsi_texture_info *info,
                       const struct lp_sampler_static_state *sampler_state,
                       const struct lp_jit_texture *texture,
                       int x0, int y0, int width, int height,
                       const float (*a0)[4],
                       const float (*dadx)[4],
                       const float (*dady)[4],
                       bool rgba_order)
{
   const struct lp_tgsi_channel_info *schan = &info->coord[0];
   const struct lp_tgsi_channel_info *tchan = &info->coord[1];

   assert(schan->file == TGSI_FILE_INPUT);
   assert(tchan->file == TGSI_FILE_INPUT);

   float w0   =   a0[0][3];

   int foo = 1;
   float s0   =   a0[schan->u.index+foo][schan->swizzle];
   float dsdx = dadx[schan->u.index+foo][schan->swizzle];
   float dsdy = dady[schan->u.index+foo][schan->swizzle];

   float t0   =   a0[tchan->u.index+foo][tchan->swizzle];
   float dtdx = dadx[tchan->u.index+foo][tchan->swizzle];
   float dtdy = dady[tchan->u.index+foo][tchan->swizzle];

   int mins, mint, maxs, maxt;
   float oow = 1.0f / w0;
   float width_oow = texture->width * oow;
   float height_oow = texture->height * oow;
   float fdsdx = dsdx * width_oow;
   float fdsdy = dsdy * width_oow;
   float fdtdx = dtdx * height_oow;
   float fdtdy = dtdy * height_oow;
   int fetch_width;
   int fetch_height;
   bool minify;
   bool need_wrap;
   bool is_nearest;

   samp->texture = texture;
   samp->width = width;

   samp->s = float_to_fixed16(fdsdx * x0 +
                              fdsdy * y0 +
                              s0 * width_oow);

   samp->t = float_to_fixed16(fdtdx * x0 +
                              fdtdy * y0 +
                              t0 * height_oow);

   samp->dsdx = float_to_fixed16(fdsdx);
   samp->dsdy = float_to_fixed16(fdsdy);
   samp->dtdx = float_to_fixed16(fdtdx);
   samp->dtdy = float_to_fixed16(fdtdy);


   samp->axis_aligned = (samp->dsdy == 0 &&
                         samp->dtdx == 0); // TODO: could be relaxed

   {
      int dsdx = samp->dsdx >= 0 ? samp->dsdx : -samp->dsdx;
      int dsdy = samp->dsdy >= 0 ? samp->dsdy : -samp->dsdy;
      int dtdx = samp->dtdx >= 0 ? samp->dtdx : -samp->dtdx;
      int dtdy = samp->dtdy >= 0 ? samp->dtdy : -samp->dtdy;
      int rho = MAX4(dsdx, dsdy, dtdx, dtdy);

      minify = (rho > FIXED16_ONE);
   }

   is_nearest = sampler_is_nearest(samp, sampler_state, minify);

   if (!is_nearest) {
      samp->s -= FIXED16_HALF;
      samp->t -= FIXED16_HALF;
   }

   /* Check for clamping.  This rarely happens as we're rejecting interpolants
    * which fall outside the 0..1 range.
    */

   if (is_nearest) {
      /* Nearest fetch routines don't employ SSE and always operate one pixel
       * at a time.
       */
      fetch_width = width - 1;
   } else {
      /* Linear fetch routines employ SSE, and always fetch groups of four
       * texels.
       */
      fetch_width = align(width, 4) - 1;
   }
   fetch_height = height - 1;

   if (samp->axis_aligned) {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width  * samp->dsdx;
      int t0 = samp->t;
      int t1 = samp->t + fetch_height * samp->dtdy;

      mins = MIN2(s0, s1);
      mint = MIN2(t0, t1);
      maxs = MAX2(s0, s1);
      maxt = MAX2(t0, t1);
   } else {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width  * samp->dsdx;
      int s2 = samp->s + fetch_height * samp->dsdy;
      int s3 = samp->s + fetch_width  * samp->dsdx + fetch_height * samp->dsdy;
      int t0 = samp->t;
      int t1 = samp->t + fetch_width  * samp->dtdx;
      int t2 = samp->t + fetch_height * samp->dtdy;
      int t3 = samp->t + fetch_width  * samp->dtdx + fetch_height * samp->dtdy;

      mins = MIN4(s0, s1, s2, s3);
      mint = MIN4(t0, t1, t2, t3);
      maxs = MAX4(s0, s1, s2, s3);
      maxt = MAX4(t0, t1, t2, t3);
   }

   if (is_nearest) {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs >= (texture->width  << FIXED16_SHIFT) ||
                   maxt >= (texture->height << FIXED16_SHIFT));
   } else {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs + FIXED16_ONE >= (texture->width  << FIXED16_SHIFT) ||
                   maxt + FIXED16_ONE >= (texture->height << FIXED16_SHIFT));
   }

   if (0 && need_wrap) {
      debug_printf("%u x %u %s\n",
                   texture->width, texture->height,
                   is_nearest ? "nearest" : "linear");
      debug_printf("mins = %f\n", mins*1.0f/FIXED16_ONE);
      debug_printf("mint = %f\n", mint*1.0f/FIXED16_ONE);
      debug_printf("maxs = %f\n", maxs*1.0f/FIXED16_ONE);
      debug_printf("maxt = %f\n", maxt*1.0f/FIXED16_ONE);
      debug_printf("\n");
   }

   /* We accept any mode below, but we only implement clamping.
    */
   if (need_wrap &&
       (sampler_state->sampler_state.wrap_s != PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
        sampler_state->sampler_state.wrap_t != PIPE_TEX_WRAP_CLAMP_TO_EDGE)) {
       return false;
   }

   if (is_nearest) {
      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra;
            else
               samp->base.fetch = fetch_memcpy_bgra;
         }
         return true;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx;
            else
               samp->base.fetch = fetch_memcpy_bgrx;
         }
         return true;
      case PIPE_FORMAT_R8G8B8A8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra;
            else
               samp->base.fetch = fetch_memcpy_bgra;
         }
         return true;
      case PIPE_FORMAT_R8G8B8X8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx;
            else
               samp->base.fetch = fetch_memcpy_bgrx;
         }
         return true;
      default:
         break;
      }

      FAIL("unknown format for nearest");
   } else {
      samp->stretched_row_y[0] = -1;
      samp->stretched_row_y[1] = -1;
      samp->stretched_row_index = 0;

      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra;
         }
         return true;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx;
         }
         return true;
      case PIPE_FORMAT_R8G8B8A8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra;
         }
         return true;
      case PIPE_FORMAT_R8G8B8X8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx;
         }
         return true;
      default:
         break;
      }

      FAIL("unknown format");
   }
}


static const uint32_t *
fetch_noop(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   return samp->row;
}


void
lp_linear_init_noop_sampler(struct lp_linear_sampler *samp)
{
   samp->base.fetch = fetch_noop;
}


/*
 * Check the given sampler and texture info for linear path compatibility.
 */
bool
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   if (tex->modifier != LP_BLD_TEX_MODIFIER_NONE)
      return false;

   if (tex->target != TGSI_TEXTURE_2D)
      return false;

   if (tex->coord[0].file != TGSI_FILE_INPUT ||
       tex->coord[1].file != TGSI_FILE_INPUT)
      return false;

   /* These are the only sampling modes we support at the moment.
    *
    * Actually we'll accept any mode as we're failing on any
    * interpolant which exceeds 0..1.  Clamping is applied only to
    * avoid invalid reads.
    */
   if (!is_nearest_sampler(sampler) &&
       !is_linear_sampler(sampler))
      return false;

   /* These are the only texture formats we support at the moment
    */
   if (sampler->texture_state.format != PIPE_FORMAT_B8G8R8A8_UNORM &&
       sampler->texture_state.format != PIPE_FORMAT_B8G8R8X8_UNORM &&
       sampler->texture_state.format != PIPE_FORMAT_R8G8B8A8_UNORM &&
       sampler->texture_state.format != PIPE_FORMAT_R8G8B8X8_UNORM)
      return false;

   /* We don't support sampler view swizzling on the linear path */
   if (sampler->texture_state.swizzle_r != PIPE_SWIZZLE_X ||
       sampler->texture_state.swizzle_g != PIPE_SWIZZLE_Y ||
       sampler->texture_state.swizzle_b != PIPE_SWIZZLE_Z ||
       sampler->texture_state.swizzle_a != PIPE_SWIZZLE_W) {
      return false;
   }

   return true;
}

#else  // DETECT_ARCH_SSE

bool
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   return false;
}

#endif  // DETECT_ARCH_SSE