/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nv50.h"

namespace nv50_ir {

// nv50 doesn't support 32 bit integer multiplication
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
//      -------------------
//       al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
//    ah*bh 00 00                 (           carry1) << 16 + (           carry2)
//       al*bl
//    ah*bl 00
//
// fffe0001 + fffe0001
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;

   DataType fTy = mul->sType; // full type
   DataType hTy;
   switch (fTy) {
   case TYPE_S32: hTy = TYPE_S16; break;
   case TYPE_U32: hTy = TYPE_U16; break;
   case TYPE_U64: hTy = TYPE_U32; break;
   case TYPE_S64: hTy = TYPE_S32; break;
   default:
      return false;
   }
   unsigned int fullSize = typeSizeof(fTy);
   unsigned int halfSize = typeSizeof(hTy);

   Instruction *i[9];

   bld->setPosition(mul, true);

   Value *a[2], *b[2];
   Value *c[2];
   Value *t[4];
   for (int j = 0; j < 4; ++j)
      t[j] = bld->getSSA(fullSize);

   // split sources into halves
   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));

   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);

   if (highResult) {
      Value *r[4];
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
      c[0] = bld->getSSA(1, FILE_FLAGS);
      c[1] = bld->getSSA(1, FILE_FLAGS);
      for (int j = 0; j < 4; ++j)
         r[j] = bld->getSSA(fullSize);

      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);

      // set carry defs / sources
      i[3]->setFlagsDef(1, c[0]);
      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
      i[6]->setPredicate(CC_C, c[0]);
      i[5]->setFlagsSrc(3, c[1]);
   } else {
      bld->mkMov(mul->getDef(0), t[3]);
   }
   delete_Instruction(bld->getProgram(), mul);

   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
      if (i[j])
         i[j]->sType = hTy;

   return true;
}
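
// Worked example (illustrative only) for the operands mentioned in the
// comment above, 0xfffe0001 and 0xfffe0001, treated as a multiplication:
// al = bl = 0x0001, ah = bh = 0xfffe, so
//    al*bl = 0x00000001, al*bh + ah*bl = 0x0001fffc, ah*bh = 0xfffc0004,
//    LO32  = ((0x0001fffc << 16) & 0xffffffff) + 1 = 0xfffc0001,
//    HI32  = (0x0001fffc >> 16) + 0xfffc0004       = 0xfffc0005
// (no carries occur for these inputs), which matches the full 64-bit
// product 0xfffc0005fffc0001.
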
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))

class NV50LegalizePostRA : public Pass
{
private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   void handlePRERET(FlowInstruction *);
   void replaceZero(Instruction *);

   LValue *r63;
};

bool
NV50LegalizePostRA::visit(Function *fn)
{
   Program *prog = fn->getProgram();

   r63 = new_LValue(fn, FILE_GPR);
   r63->reg.data.id = 63;

   // this is actually per-program, but we can do it all on visiting main()
   std::list<Instruction *> *outWrites =
      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);

   if (outWrites) {
      for (std::list<Instruction *>::iterator it = outWrites->begin();
           it != outWrites->end(); ++it)
         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
      // instructions will be deleted on exit
      outWrites->clear();
   }

   return true;
}

void
NV50LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm && imm->reg.data.u64 == 0)
         i->setSrc(s, r63);
   }
}

// Emulate PRERET: jump to the target and call to the origin from there
//
// WARNING: atm only works if BBs are affected by at most a single PRERET
//
// BB:0
// preret BB:3
// (...)
// BB:3
// (...)
// --->
// BB:0
// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
// (...)
// BB:3
// bra BB:3 + n1 (skip the call)
// call BB:0 + n2 (skip bra at beginning of BB:0)
// (...)
void
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
{
   BasicBlock *bbE = pre->bb;
   BasicBlock *bbT = pre->target.bb;

   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
   bbE->remove(pre);
   bbE->insertHead(pre);

   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);

   bbT->insertHead(call);
   bbT->insertHead(skip);

   // NOTE: maybe split blocks to prevent the instructions from moving ?

   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
}
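
// Note: this pass runs after register allocation, so zero immediates can no
// longer simply be materialized wherever they appear; replaceZero() above
// substitutes $r63, which is used as a zero source here.  64-bit operations
// are also split into 32-bit halves at this point (split64BitOpPostRA).
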
bool
NV50LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->isNop()) {
         bb->remove(i);
      } else if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
         handlePRERET(i->asFlow());
      } else {
         // TODO: We will want to do this before register allocation,
         // since we have to use a $c register for the carry flag.
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
            if (hi)
               next = hi;
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH &&
             i->op != OP_BAR &&
             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   return true;
}

class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   virtual bool visit(BasicBlock *bb);

private:
   void propagateWriteToOutput(Instruction *);
   void handleDIV(Instruction *);
   void handleMOD(Instruction *);
   void handleMUL(Instruction *);
   void handleAddrDef(Instruction *);

   inline bool isARL(const Instruction *) const;

   BuildUtil bld;
   std::list<Instruction *> *outWrites;
};

NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
{
   bld.setProgram(prog);

   if (prog->optLevel >= 2 &&
       (prog->getType() == Program::TYPE_GEOMETRY ||
        prog->getType() == Program::TYPE_VERTEX))
      outWrites =
         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   else
      outWrites = NULL;
}

void
NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
{
   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
      return;

   // check def instruction can store
   Instruction *di = st->getSrc(1)->defs.front()->getInsn();

   // TODO: move exports (if beneficial) in common opt pass
   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
      return;

   for (int s = 0; di->srcExists(s); ++s)
      if (di->src(s).getFile() == FILE_IMMEDIATE)
         return;

   if (prog->getType() == Program::TYPE_GEOMETRY) {
      // Only propagate output writes in geometry shaders when we can be sure
      // that we are propagating to the same output vertex.
      if (di->bb != st->bb)
         return;
      Instruction *i;
      for (i = di; i != st; i = i->next) {
         if (i->op == OP_EMIT || i->op == OP_RESTART)
            return;
      }
      assert(i); // st after di
   }

   // We cannot set defs to non-lvalues before register allocation, so
   // save & remove (to save registers) the exports and replace later.
   outWrites->push_back(st);
   st->bb->remove(st);
}
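
// The exports collected in outWrites above are re-attached after register
// allocation: NV50LegalizePostRA::visit(Function *) rewrites the defining
// instruction of each saved export's value to write the shader output
// directly, so the separate export instruction is no longer needed.
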
bool
NV50LegalizeSSA::isARL(const Instruction *i) const
{
   ImmediateValue imm;

   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
      return false;
   if (!i->src(1).getImmediate(imm))
      return false;
   return imm.isInteger(0);
}

void
NV50LegalizeSSA::handleAddrDef(Instruction *i)
{
   Instruction *arl;

   i->getDef(0)->reg.size = 2; // $aX are only 16 bit

   // PFETCH can always write to $a
   if (i->op == OP_PFETCH)
      return;

   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
         return;
      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
         return;
   }

   // turn $a sources into $r sources (can't operate on $a)
   for (int s = 0; i->srcExists(s); ++s) {
      Value *a = i->getSrc(s);
      Value *r;
      if (a->reg.file == FILE_ADDRESS) {
         if (a->getInsn() && isARL(a->getInsn())) {
            i->setSrc(s, a->getInsn()->getSrc(0));
         } else {
            bld.setPosition(i, false);
            r = bld.getSSA();
            bld.mkMov(r, a);
            i->setSrc(s, r);
         }
      }
   }

   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
      return;

   // turn result back into $a
   bld.setPosition(i, true);
   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
   i->setDef(0, arl->getSrc(0));
}

void
NV50LegalizeSSA::handleMUL(Instruction *mul)
{
   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
      return;
   Value *def = mul->getDef(0);
   Value *pred = mul->getPredicate();
   CondCode cc = mul->cc;
   if (pred)
      mul->setPredicate(CC_ALWAYS, NULL);

   if (mul->op == OP_MAD) {
      Instruction *add = mul;
      bld.setPosition(add, false);
      Value *res = cloneShallow(func, mul->getDef(0));
      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
      add->op = OP_ADD;
      add->setSrc(0, mul->getDef(0));
      add->setSrc(1, add->getSrc(2));
      for (int s = 2; add->srcExists(s); ++s)
         add->setSrc(s, NULL);
      mul->subOp = add->subOp;
      add->subOp = 0;
   }
   expandIntegerMUL(&bld, mul);
   if (pred)
      def->getInsn()->setPredicate(cc, pred);
}
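
// In handleMUL above, an integer OP_MAD is first split so that only the
// multiply has to be expanded, roughly
//    mad $d, $a, $b, $c  ->  mul $t, $a, $b;  add $d, $t, $c
// (any MUL_HIGH subOp moves to the new multiply), and the 32-bit multiply is
// then decomposed into 16-bit operations by expandIntegerMUL().
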
// Use f32 division: first compute an approximate result, use it to reduce
// the dividend, which should then be representable as f32, divide the reduced
// dividend, and add the quotients.
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
   const DataType ty = div->sType;

   if (ty != TYPE_U32 && ty != TYPE_S32)
      return;

   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;

   bld.setPosition(div, false);

   Value *a, *af = bld.getSSA();
   Value *b, *bf = bld.getSSA();

   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));

   if (isSignedType(ty)) {
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      a = bld.getSSA();
      b = bld.getSSA();
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
   } else {
      a = div->getSrc(0);
      b = div->getSrc(1);
   }

   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));

   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;

   // get error of 1st result
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);

   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);

   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
      ->rnd = ROUND_Z;
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients

   // correction: if modulus >= divisor, add 1
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
   if (!isSignedType(ty)) {
      div->op = OP_SUB;
      div->setSrc(0, q);
      div->setSrc(1, s);
   } else {
      t = q;
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
      s = bld.getSSA();
      t = bld.getSSA();
      // fix the sign
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);

      div->op = OP_UNION;
      div->setSrc(0, s);
      div->setSrc(1, t);
   }
}

void
NV50LegalizeSSA::handleMOD(Instruction *mod)
{
   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
      return;
   bld.setPosition(mod, false);

   Value *q = bld.getSSA();
   Value *m = bld.getSSA();

   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
   handleDIV(q->getInsn());

   bld.setPosition(mod, false);
   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));

   mod->op = OP_SUB;
   mod->setSrc(1, m);
}
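
// handleMOD above relies on the usual identity  a % b == a - (a / b) * b:
// the quotient comes from the DIV lowering, is multiplied back with the
// divisor (again via expandIntegerMUL) and subtracted from the dividend.
// The "+1" correction in handleDIV is done by subtracting the OP_SET result,
// which evidently yields an all-ones mask (i.e. -1) when the condition holds.
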
bool
NV50LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *insn, *next;
   // skipping PHIs (don't pass them to handleAddrDef) !
   for (insn = bb->getEntry(); insn; insn = next) {
      next = insn->next;

      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
         handleAddrDef(insn);

      switch (insn->op) {
      case OP_EXPORT:
         if (outWrites)
            propagateWriteToOutput(insn);
         break;
      case OP_DIV:
         handleDIV(insn);
         break;
      case OP_MOD:
         handleMOD(insn);
         break;
      case OP_MAD:
      case OP_MUL:
         handleMUL(insn);
         break;
      default:
         break;
      }
   }
   return true;
}

class NV50LoweringPreSSA : public Pass
{
public:
   NV50LoweringPreSSA(Program *);

private:
   virtual bool visit(Instruction *);
   virtual bool visit(Function *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);

   bool handlePFETCH(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleLOAD(Instruction *);

   bool handleDIV(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);

   bool handleSET(Instruction *);
   bool handleSLCT(CmpInstruction *);
   bool handleSELP(Instruction *);

   bool handleTEX(TexInstruction *);
   bool handleTXB(TexInstruction *); // I really
   bool handleTXL(TexInstruction *); // hate
   bool handleTXD(TexInstruction *); // these 3
   bool handleTXLQ(TexInstruction *);

   bool handleCALL(Instruction *);
   bool handlePRECONT(Instruction *);
   bool handleCONT(Instruction *);

   void checkPredicate(Instruction *);
   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);

private:
   const Target *const targ;

   BuildUtil bld;

   Value *tid;
};

NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
   targ(prog->getTarget()), tid(NULL)
{
   bld.setProgram(prog);
}

bool
NV50LoweringPreSSA::visit(Function *f)
{
   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());

   if (prog->getType() == Program::TYPE_COMPUTE) {
      // Add implicit "thread id" argument in $r0 to the function
      Value *arg = new_LValue(func, FILE_GPR);
      arg->reg.data.id = 0;
      f->ins.push_back(arg);

      bld.setPosition(root, false);
      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
   }

   return true;
}

void
NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
                                  Value **ms_x, Value **ms_y)
{
   // This loads the texture-indexed ms setting from the constant buffer
   Value *tmp = new_LValue(func, FILE_GPR);
   uint8_t b = prog->driver->io.resInfoCBSlot;
   off += prog->driver->io.suInfoBase;
   if (prog->getType() > Program::TYPE_VERTEX)
      off += 16 * 2 * 4;
   if (prog->getType() > Program::TYPE_GEOMETRY)
      off += 16 * 2 * 4;
   *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
   *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
   *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
}

void
NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy)
{
   // Given a MS level, and a sample id, compute the delta x/y
   uint8_t b = prog->driver->io.msInfoCBSlot;
   Value *off = new_LValue(func, FILE_ADDRESS);
   Value *t = new_LValue(func, FILE_GPR);

   // The required information is at mslevel * 16 * 4 + sample * 8
   // = (mslevel * 8 + sample) * 8
   bld.mkOp2(OP_SHL, TYPE_U32, off,
             bld.mkOp2v(OP_ADD, TYPE_U32, t,
                        bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)), s),
             bld.mkImm(3));
   *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase), off);
   *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase + 4), off);
}
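
// For multisampled targets, handleTEX() below rewrites the integer
// coordinates: the per-texture sample-grid shifts (ms_x, ms_y) come from
// loadTexMsInfo(), the x/y coordinates are scaled by them and the per-sample
// offsets from loadMsInfo() are added, roughly
//    tx = (x << ms_x) + dx;   ty = (y << ms_y) + dy;
// after which the sample argument itself is replaced by 0.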
bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
   const int arg = i->tex.target.getArgCount();
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

   // handle MS, which means looking up the MS params for this texture, and
   // adjusting the input coordinates to point at the right sample.
   if (i->tex.target.isMS()) {
      Value *x = i->getSrc(0);
      Value *y = i->getSrc(1);
      Value *s = i->getSrc(arg - 1);
      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
         *ms, *ms_x, *ms_y, *dx, *dy;

      i->tex.target.clearMS();

      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
      loadMsInfo(ms, s, &dx, &dy);

      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
      i->setSrc(0, tx);
      i->setSrc(1, ty);
      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
   }

   // dref comes before bias/lod
   if (i->tex.target.isShadow())
      if (i->op == OP_TXB || i->op == OP_TXL)
         i->swapSources(dref, lod);

   if (i->tex.target.isArray()) {
      if (i->op != OP_TXF) {
         // array index must be converted to u32, but it's already an integer
         // for TXF
         Value *layer = i->getSrc(arg - 1);
         LValue *src = new_LValue(func, FILE_GPR);
         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
         i->setSrc(arg - 1, src);
      }
      if (i->tex.target.isCube() && i->srcCount() > 4) {
         std::vector<Value *> acube, a2d;
         int c;

         acube.resize(4);
         for (c = 0; c < 4; ++c)
            acube[c] = i->getSrc(c);
         a2d.resize(4);
         for (c = 0; c < 3; ++c)
            a2d[c] = new_LValue(func, FILE_GPR);
         a2d[3] = NULL;

         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
                   a2d, acube)->asTex()->tex.mask = 0x7;

         for (c = 0; c < 3; ++c)
            i->setSrc(c, a2d[c]);
         for (; i->srcExists(c + 1); ++c)
            i->setSrc(c, i->getSrc(c + 1));
         i->setSrc(c, NULL);
         assert(c <= 4);

         i->tex.target = i->tex.target.isShadow() ?
            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
      }
   }

   // texel offsets are 3 immediate fields in the instruction,
   // nv50 cannot do textureGatherOffsets
   assert(i->tex.useOffsets <= 1);
   if (i->tex.useOffsets) {
      for (int c = 0; c < 3; ++c) {
         ImmediateValue val;
         assert(i->offset[0][c].getImmediate(val));
         i->tex.offset[c] = val.reg.data.u32;
         i->offset[0][c].set(NULL);
      }
   }

   return true;
}
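
// The QUADOP(SUBR, SUBR, SUBR, SUBR) operations used below effectively
// compare each lane's bias/LOD value against the value taken from the
// reference lane l: equal values produce zero and thus an EQ condition,
// which is what handleTXB uses to group lanes and handleTXL uses to decide
// when a branch to the texture block can be taken.
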
// Bias must be equal for all threads of a quad or lod calculation will fail.
//
// The lanes of a quad are grouped by the bit in the condition register they
// have set, which is selected by differing bias values.
// Move the input values for TEX into a new register set for each group and
// execute TEX only for a specific group.
// We always need to use 4 new registers for the inputs/outputs because the
// implicitly calculated derivatives must be correct.
//
// TODO: move to SSA phase so we can easily determine whether bias is constant
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   handleTEX(i);
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   if (bias->isUniform())
      return true;

   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));

   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }

   delete_Instruction(prog, i);
   return true;
}

// LOD must be equal for all threads of a quad.
// Unlike with TXB, here we can just diverge since there's no LOD calculation
// that would require all 4 threads' sources to be set up properly.
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
   handleTEX(i);
   Value *lod = i->getSrc(i->tex.target.getArgCount());
   if (lod->isUniform())
      return true;

   BasicBlock *currBB = i->bb;
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
   BasicBlock *joinBB = i->bb->splitAfter(i);

   bld.setPosition(currBB, true);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   for (int l = 0; l <= 3; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      bld.setPosition(currBB, true);
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      if (l <= 2) {
         BasicBlock *laneBB = new BasicBlock(func);
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
         currBB = laneBB;
      }
   }
   bld.setPosition(joinBB, false);
   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
   return true;
}
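
// handleTXD() below implements explicit derivatives with quadops: inside a
// QUADON/QUADPOP region, for each lane l the coordinates of lane l are
// broadcast to the whole quad, lane l's dPdx/dPdy are added to the lanes in
// the +x/+y direction, and a plain TEX is issued so the hardware's implicit
// derivative calculation reproduces the requested gradients; the per-lane
// results are copied out with moves restricted to that lane (lanes = 1 << l)
// and merged with OP_UNION.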
bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
{
   handleTEX(i);
   bld.setPosition(i, true);

   /* The returned values are not quite what we want:
    * (a) convert from s32 to f32
    * (b) multiply by 1/256
    */
   for (int def = 0; def < 2; ++def) {
      if (!i->defExists(def))
         continue;
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   }
   return true;
}

bool
NV50LoweringPreSSA::handleSET(Instruction *i)
{
   if (i->dType == TYPE_F32) {
      bld.setPosition(i, true);
      i->dType = TYPE_U32;
      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
   }
   return true;
}

bool
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();
   Value *pred = bld.getScratch(1, FILE_FLAGS);

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   // XXX: these probably shouldn't be immediates in the first place ...
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   bld.setPosition(i, true);
   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);

   bld.setPosition(i, false);
   i->op = OP_SET;
   i->setFlagsDef(0, pred);
   i->dType = TYPE_U8;
   i->setSrc(0, i->getSrc(2));
   i->setSrc(2, NULL);
   i->setSrc(1, bld.loadImm(NULL, 0));

   return true;
}

bool
NV50LoweringPreSSA::handleSELP(Instruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
   delete_Instruction(prog, i);
   return true;
}

bool
NV50LoweringPreSSA::handleWRSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();

   // these are all shader outputs, $sreg are not writeable
   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
   if (addr >= 0x400)
      return false;
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);

   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));

   bld.getBB()->remove(i);
   return true;
}

bool
NV50LoweringPreSSA::handleCALL(Instruction *i)
{
   if (prog->getType() == Program::TYPE_COMPUTE) {
      // Add implicit "thread id" argument in $r0 to the function
      i->setSrc(i->srcCount(), tid);
   }
   return true;
}

bool
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
{
   delete_Instruction(prog, i);
   return true;
}

bool
NV50LoweringPreSSA::handleCONT(Instruction *i)
{
   i->op = OP_BRA;
   return true;
}
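
// handleRDSV() below lowers system-value reads that do not map to a system
// register (an address of 0x400 or above means the value can be read from a
// $sreg and the RDSV is left untouched):
//  - fragment position and face are read through interpolants,
//  - block/grid dimensions are loaded as 16-bit values from shared memory,
//  - the thread id is unpacked from the packed $r0 value set up in
//    visit(Function *): x in bits 0..15, y in bits 16..25, z in bits 26..31,
//  - the sample position is fetched from the driver's constant buffer using
//    the sample index as an offset.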
bool
NV50LoweringPreSSA::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   Value *def = i->getDef(0);
   SVSemantic sv = sym->reg.data.sv.sv;
   int idx = sym->reg.data.sv.index;

   if (addr >= 0x400) // mov $sreg
      return true;

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
      if (i->dType == TYPE_F32) {
         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
      }
      break;
   case SV_NCTAID:
   case SV_CTAID:
   case SV_NTID:
      if ((sv == SV_NCTAID && idx >= 2) ||
          (sv == SV_NTID && idx >= 3)) {
         bld.mkMov(def, bld.mkImm(1));
      } else if (sv == SV_CTAID && idx >= 2) {
         bld.mkMov(def, bld.mkImm(0));
      } else {
         Value *x = bld.getSSA(2);
         bld.mkOp1(OP_LOAD, TYPE_U16, x,
                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
      }
      break;
   case SV_TID:
      if (idx == 0) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
      } else if (idx == 1) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
      } else if (idx == 2) {
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
      } else {
         bld.mkMov(def, bld.mkImm(0));
      }
      break;
   case SV_SAMPLE_POS: {
      Value *off = new_LValue(func, FILE_ADDRESS);
      bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
      bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
      bld.mkLoad(TYPE_F32, def,
                 bld.mkSymbol(
                       FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
                 off);
      break;
   }
   default:
      bld.mkFetch(i->getDef(0), i->dType,
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
      break;
   }
   bld.getBB()->remove(i);
   return true;
}

bool
NV50LoweringPreSSA::handleDIV(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   bld.setPosition(i, false);
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
   i->op = OP_MUL;
   i->setSrc(1, rcp->getDef(0));
   return true;
}

bool
NV50LoweringPreSSA::handleSQRT(Instruction *i)
{
   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, bld.getSSA(), i->getSrc(0));
   i->op = OP_MUL;
   i->setSrc(1, rsq->getDef(0));

   return true;
}

bool
NV50LoweringPreSSA::handlePOW(Instruction *i)
{
   LValue *val = bld.getScratch();

   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);

   i->op = OP_EX2;
   i->setSrc(0, val);
   i->setSrc(1, NULL);

   return true;
}

bool
NV50LoweringPreSSA::handleEXPORT(Instruction *i)
{
   if (prog->getType() == Program::TYPE_FRAGMENT) {
      if (i->getIndirect(0, 0)) {
         // TODO: redirect to l[] here, load to GPRs at exit
         return false;
      } else {
         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units

         i->op = OP_MOV;
         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
         i->src(0).set(i->src(1));
         i->setSrc(1, NULL);
         i->setDef(0, new_LValue(func, FILE_GPR));
         i->getDef(0)->reg.data.id = id;

         prog->maxGPR = MAX2(prog->maxGPR, id);
      }
   }
   return true;
}
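
// The lowerings above use the standard identities:
//    a / b    = a * rcp(b)
//    sqrt(x)  = x * rsq(x)
//    pow(x,y) = ex2(y * lg2(x))   (the EX2 operand goes through PREEX2 first)
// and fragment shader exports become final moves into fixed GPRs, since on
// nv50 the fragment outputs are taken from specific registers at program
// exit.
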
// Handle indirect addressing in geometry shaders:
//
// ld $r0 a[$a1][$a2+k] ->
// ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
//
bool
NV50LoweringPreSSA::handleLOAD(Instruction *i)
{
   ValueRef src = i->src(0);

   if (src.isIndirect(1)) {
      assert(prog->getType() == Program::TYPE_GEOMETRY);
      Value *addr = i->getIndirect(0, 1);

      if (src.isIndirect(0)) {
         // base address is in an address register, so move to a GPR
         Value *base = bld.getScratch();
         bld.mkMov(base, addr);

         Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
         Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
         Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                    i->getIndirect(0, 0), bld.mkImm(2));

         // Calculate final address: addr = base + attr*vstride; use 16-bit
         // multiplication since 32-bit would be lowered to multiple
         // instructions, and we only need the low 16 bits of the result
         Value *a[2], *b[2];
         bld.mkSplit(a, 2, attrib);
         bld.mkSplit(b, 2, vstride);
         Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
                                 base);

         // move address from GPR into an address register
         addr = bld.getSSA(2, FILE_ADDRESS);
         bld.mkMov(addr, sum);
      }

      i->setIndirect(0, 1, NULL);
      i->setIndirect(0, 0, addr);
   }

   return true;
}

bool
NV50LoweringPreSSA::handlePFETCH(Instruction *i)
{
   assert(prog->getType() == Program::TYPE_GEOMETRY);

   // NOTE: cannot use getImmediate here, not in SSA form yet, move to
   // later phase if that assertion ever triggers:
   ImmediateValue *imm = i->getSrc(0)->asImm();
   assert(imm);

   assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens

   if (i->srcExists(1)) {
      // indirect addressing of vertex in primitive space

      LValue *val = bld.getScratch();
      Value *ptr = bld.getSSA(2, FILE_ADDRESS);
      bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
      bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);

      // NOTE: PFETCH directly to an $aX only works with direct addressing
      i->op = OP_SHL;
      i->setSrc(0, val);
      i->setSrc(1, bld.mkImm(0));
   }

   return true;
}

// Set flags according to predicate and make the instruction read $cX.
void
NV50LoweringPreSSA::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();
   Value *cdst;

   if (!pred || pred->reg.file == FILE_FLAGS)
      return;
   cdst = bld.getSSA(1, FILE_FLAGS);

   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType,
             bld.loadImm(NULL, 0), pred);

   insn->setPredicate(insn->cc, cdst);
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NV50LoweringPreSSA::visit(Instruction *i)
{
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXB:
      return handleTXB(i->asTex());
   case OP_TXL:
      return handleTXL(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXLQ:
      return handleTXLQ(i->asTex());
   case OP_EX2:
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_SET:
      return handleSET(i);
   case OP_SLCT:
      return handleSLCT(i->asCmp());
   case OP_SELP:
      return handleSELP(i);
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      return handleEXPORT(i);
   case OP_LOAD:
      return handleLOAD(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_CALL:
      return handleCALL(i);
   case OP_PRECONT:
      return handlePRECONT(i);
   case OP_CONT:
      return handleCONT(i);
   case OP_PFETCH:
      return handlePFETCH(i);
   default:
      break;
   }
   return true;
}
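
// runLegalizePass() below selects the pass for each code generation stage:
// NV50LoweringPreSSA before SSA construction (texturing, system values,
// instruction-sequence conversion), NV50LegalizeSSA on SSA form (integer
// mul/div/mod expansion, $a register constraints, export propagation), and
// NV50LegalizePostRA after register allocation (zero replacement, 64-bit
// splitting, PRERET emulation); the targetPriv list carries the saved
// exports between the SSA and post-RA stages.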
bool
TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
{
   bool ret = false;

   if (stage == CG_STAGE_PRE_SSA) {
      NV50LoweringPreSSA pass(prog);
      ret = pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_SSA) {
      if (!prog->targetPriv)
         prog->targetPriv = new std::list<Instruction *>();
      NV50LegalizeSSA pass(prog);
      ret = pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_POST_RA) {
      NV50LegalizePostRA pass;
      ret = pass.run(prog, false, true);
      if (prog->targetPriv)
         delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   }
   return ret;
}

} // namespace nv50_ir