/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


#include "pipe/p_defines.h"

#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_gather.h"
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
#include "lp_bld_arit.h"


void
lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
                            struct lp_build_context *bld,
                            const LLVMValueRef *unswizzled,
                            LLVMValueRef swizzled_out[4])
{
   assert(UTIL_FORMAT_SWIZZLE_0 == PIPE_SWIZZLE_ZERO);
   assert(UTIL_FORMAT_SWIZZLE_1 == PIPE_SWIZZLE_ONE);

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
      enum util_format_swizzle swizzle;
      LLVMValueRef depth_or_stencil;

      if (util_format_has_stencil(format_desc) &&
          !util_format_has_depth(format_desc)) {
         assert(!bld->type.floating);
         swizzle = format_desc->swizzle[1];
      }
      else {
         assert(bld->type.floating);
         swizzle = format_desc->swizzle[0];
      }
      /*
       * Return zzz1 or sss1 for depth-stencil formats here.
       * Correct swizzling will be handled by apply_sampler_swizzle() later.
       */
      depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);

      swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
      swizzled_out[3] = bld->one;
   }
   else {
      unsigned chan;
      for (chan = 0; chan < 4; ++chan) {
         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
         swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
      }
   }
}


/**
 * Unpack several pixels in SoA.
 *
 * It takes a vector of packed pixels:
 *
 *   packed = {P0, P1, P2, P3, ..., Pn}
 *
 * And will produce four vectors:
 *
 *   red    = {R0, R1, R2, R3, ..., Rn}
 *   green  = {G0, G1, G2, G3, ..., Gn}
 *   blue   = {B0, B1, B2, B3, ..., Bn}
 *   alpha  = {A0, A1, A2, A3, ..., An}
 *
 * It requires that a packed pixel fits into an element of the output
 * channels. The common case is when converting pixel with a depth of 32 bit or
 * less into floats.
 *
 * \param format_desc  the format of the 'packed' incoming pixel vector
 * \param type  the desired type for rgba_out (type.length = n, above)
 * \param packed  the incoming vector of packed pixels
 * \param rgba_out  returns the SoA R,G,B,A vectors
 */
void
lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
                         LLVMValueRef packed,
                         LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   LLVMValueRef inputs[4];
   unsigned chan;

   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);

   /* Decode the input vector components */
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      const unsigned width = format_desc->channel[chan].size;
      const unsigned start = format_desc->channel[chan].shift;
      const unsigned stop = start + width;
      LLVMValueRef input;

      input = packed;

      switch(format_desc->channel[chan].type) {
      case UTIL_FORMAT_TYPE_VOID:
         input = lp_build_undef(gallivm, type);
         break;

      case UTIL_FORMAT_TYPE_UNSIGNED:
         /*
          * Align the LSB
          */

         if (start) {
            input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), "");
         }

         /*
          * Zero the MSBs
          */

         if (stop < format_desc->block.bits) {
            unsigned mask = ((unsigned long long)1 << width) - 1;
            input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), "");
         }

         /*
          * Type conversion
          */

         if (type.floating) {
            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
               assert(width == 8);
               if (format_desc->swizzle[3] == chan) {
                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
               }
               else {
                  struct lp_type conv_type = lp_uint_type(type);
                  input = lp_build_srgb_to_linear(gallivm, conv_type, input);
               }
            }
            else {
               if(format_desc->channel[chan].normalized)
                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
               else
                  input = LLVMBuildSIToFP(builder, input,
                                          lp_build_vec_type(gallivm, type), "");
            }
         }
         else if (format_desc->channel[chan].pure_integer) {
            /* Nothing to do */
         } else {
             /* FIXME */
             assert(0);
         }

         break;

      case UTIL_FORMAT_TYPE_SIGNED:
         /*
          * Align the sign bit first.
          */

         if (stop < type.width) {
            unsigned bits = type.width - stop;
            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
            input = LLVMBuildShl(builder, input, bits_val, "");
         }

         /*
          * Align the LSB (with an arithmetic shift to preserve the sign)
          */

         if (format_desc->channel[chan].size < type.width) {
            unsigned bits = type.width - format_desc->channel[chan].size;
            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
            input = LLVMBuildAShr(builder, input, bits_val, "");
         }

         /*
          * Type conversion
          */

         if (type.floating) {
            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
            if (format_desc->channel[chan].normalized) {
               double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
               LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
               input = LLVMBuildFMul(builder, input, scale_val, "");
               /* the formula above will produce value below -1.0 for most negative
                * value but everything seems happy with that hence disable for now */
               if (0)
                  input = lp_build_max(&bld, input,
                                       lp_build_const_vec(gallivm, type, -1.0f));
            }
         }
         else if (format_desc->channel[chan].pure_integer) {
            /* Nothing to do */
         } else {
             /* FIXME */
             assert(0);
         }

         break;

      case UTIL_FORMAT_TYPE_FLOAT:
         if (type.floating) {
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
            input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), "");
         }
         else {
            /* FIXME */
            assert(0);
            input = lp_build_undef(gallivm, type);
         }
         break;

      case UTIL_FORMAT_TYPE_FIXED:
         if (type.floating) {
            double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
            input = LLVMBuildFMul(builder, input, scale_val, "");
         }
         else {
            /* FIXME */
            assert(0);
            input = lp_build_undef(gallivm, type);
         }
         break;

      default:
         assert(0);
         input = lp_build_undef(gallivm, type);
         break;
      }

      inputs[chan] = input;
   }

   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
}


/**
 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
 *
 * \param dst_type  The desired return type. For pure integer formats
 *                  this should be a 32bit wide int or uint vector type,
 *                  otherwise a float vector type.
 *
 * \param packed    The rgba8 values to pack.
 *
 * \param rgba      The 4 SoA return vectors.
 */
void
lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
                           struct lp_type dst_type,
                           LLVMValueRef packed,
                           LLVMValueRef *rgba)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
   unsigned chan;

   /* XXX technically shouldn't use that for uint dst_type */
   packed = LLVMBuildBitCast(builder, packed,
                             lp_build_int_vec_type(gallivm, dst_type), "");

   /* Decode the input vector components */
   for (chan = 0; chan < 4; ++chan) {
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      unsigned start = chan*8;
#else
      unsigned start = (3-chan)*8;
#endif
      unsigned stop = start + 8;
      LLVMValueRef input;

      input = packed;

      if (start)
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, dst_type, start), "");

      if (stop < 32)
         input = LLVMBuildAnd(builder, input, mask, "");

      if (dst_type.floating)
         input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);

      rgba[chan] = input;
   }
}


/**
 * Fetch a texels from a texture, returning them in SoA layout.
 *
 * \param type  the desired return type for 'rgba'.  The vector length
 *              is the number of texels to fetch
 *
 * \param base_ptr  points to the base of the texture mip tree.
 * \param offset    offset to start of the texture image block.  For non-
 *                  compressed formats, this simply is an offset to the texel.
 *                  For compressed formats, it is an offset to the start of the
 *                  compressed data block.
 *
 * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
 *              these will always be (0,0).  For compressed formats, i will
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               type.width,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }

   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               type.width, base_ptr, offset,
                               FALSE);
      if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;

      if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length,
                                  32, type.width, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length,
                                  32, type.width, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    */

   if (util_format_fits_8unorm(format_desc) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      LLVMValueRef tmp;

      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                    base_ptr, offset, i, j);

      lp_build_rgba8_to_fi32_soa(gallivm,
                                type,
                                tmp,
                                rgba_out);

      return;
   }

   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    */

   {
      unsigned k, chan;
      struct lp_type tmp_type;

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: scalar unpacking of %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      for (chan = 0; chan < 4; ++chan) {
         rgba_out[chan] = lp_build_undef(gallivm, type);
      }

      /* loop over number of pixels */
      for(k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;
         LLVMValueRef tmp;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                       base_ptr, offset_elem,
                                       i_elem, j_elem);

         /*
          * Insert the AoS tmp value channels into the SoA result vectors at
          * position = 'index'.
          */
         for (chan = 0; chan < 4; ++chan) {
            LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan),
            tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
            rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
                                                    tmp_chan, index, "");
         }
      }
   }
}