4 files changed, 202 insertions, 54 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 6eefe8f0025..e244bd0d610 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -122,6 +122,8 @@ private:
    void emitSAM();
    void emitRAM();
 
+   void emitPSETP();
+
    void emitMOV();
    void emitS2R();
    void emitCS2R();
@@ -690,6 +692,31 @@ CodeEmitterGM107::emitRAM()
  * predicate/cc
  ******************************************************************************/
 
+void
+CodeEmitterGM107::emitPSETP()
+{
+   // Encodes OP_AND/OP_OR/OP_XOR; emitInstruction picks this for predicate defs.
+   emitInsn(0x50900000);
+
+   switch (insn->op) {
+   case OP_AND: emitField(0x18, 3, 0); break;
+   case OP_OR:  emitField(0x18, 3, 1); break;
+   case OP_XOR: emitField(0x18, 3, 2); break;
+   default:
+      assert(!"unexpected operation");
+      break;
+   }
+
+   // emitINV (0x2a); -- presumably the invert bit of the unused third source
+   emitPRED(0x27); // TODO: support 3-arg
+   emitINV (0x20, insn->src(1));
+   emitPRED(0x1d, insn->src(1));
+   emitINV (0x0f, insn->src(0));
+   emitPRED(0x0c, insn->src(0));
+   emitPRED(0x03, insn->def(0));
+   emitPRED(0x00); // no value passed; NOTE(review): presumably "unused" - confirm
+}
+
 /*******************************************************************************
  * movement / conversion
  ******************************************************************************/
@@ -3557,7 +3584,12 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
    case OP_AND:
    case OP_OR:
    case OP_XOR:
-      emitLOP();
+      switch (insn->def(0).getFile()) {
+      case FILE_GPR: emitLOP(); break;
+      case FILE_PREDICATE: emitPSETP(); break;
+      default:
+         assert(!"invalid bool op");
+      }
       break;
    case OP_NOT:
       emitNOT();
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 1f702a987d8..a76d6c60cda 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1802,6 +1802,9 @@ NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless
 {
    uint32_t base = slot * NVC0_SU_INFO__STRIDE;
 
+   // We don't upload surface info for bindless for GM107+
+   assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET);
+
    if (ptr) {
       ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
       if (bindless)
@@ -2204,7 +2207,7 @@ getDestType(const ImgType type) {
 }
 
 void
-NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
+NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded)
 {
    const TexInstruction::ImgFormatDesc *format = su->tex.format;
    int width = format->bits[0] + format->bits[1] +
@@ -2223,21 +2226,38 @@ NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
    if (width < 32)
       untypedDst[0] = bld.getSSA();
 
-   for (int i = 0; i < 4; i++) {
-      typedDst[i] = su->getDef(i);
+   if (loaded && loaded[0]) {
+      for (int i = 0; i < 4; i++) {
+         if (loaded[i])
+            typedDst[i] = loaded[i]->getDef(0);
+      }
+   } else {
+      for (int i = 0; i < 4; i++) {
+         typedDst[i] = su->getDef(i);
+      }
    }
 
    // Set the untyped dsts as the su's destinations
-   for (int i = 0; i < 4; i++)
-      su->setDef(i, untypedDst[i]);
+   if (loaded && loaded[0]) {
+      for (int i = 0; i < 4; i++)
+         if (loaded[i])
+            loaded[i]->setDef(0, untypedDst[i]);
+   } else {
+      for (int i = 0; i < 4; i++)
+         su->setDef(i, untypedDst[i]);
 
-   bld.setPosition(su, true);
+      bld.setPosition(su, true);
+   }
 
    // Unpack each component into the typed dsts
    int bits = 0;
    for (int i = 0; i < 4; bits += format->bits[i], i++) {
       if (!typedDst[i])
          continue;
+
+      if (loaded && loaded[0])
+         bld.setPosition(loaded[i], true);
+
       if (i >= format->components) {
          if (format->type == FLOAT ||
              format->type == UNORM ||
@@ -2308,7 +2328,7 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
    processSurfaceCoordsNVE4(su);
 
    if (su->op == OP_SULDP) {
-      convertSurfaceFormat(su);
+      convertSurfaceFormat(su, NULL);
       insertOOBSurfaceOpResult(su);
    }
 
@@ -2421,7 +2441,7 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
    processSurfaceCoordsNVC0(su);
 
    if (su->op == OP_SULDP) {
-      convertSurfaceFormat(su);
+      convertSurfaceFormat(su, NULL);
       insertOOBSurfaceOpResult(su);
    }
 
@@ -2463,14 +2483,16 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
    }
 }
 
-void
-NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
+TexInstruction *
+NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4])
 {
    const int slot = su->tex.r;
    const int dim = su->tex.target.getDim();
-   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
+   const bool array = su->tex.target.isArray() || su->tex.target.isCube();
+   const int arg = dim + array;
    Value *ind = su->getIndirectR();
    Value *handle;
+   Instruction *pred = NULL, *pred2d = NULL;
    int pos = 0;
 
    bld.setPosition(su, false);
@@ -2489,67 +2511,153 @@ NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
       assert(pos == 0);
       break;
    }
+
+   if (dim == 2 && !array) {
+      // This might be a 2d slice of a 3d texture, try to load the z
+      // coordinate in.
+      Value *v;
+      if (!su->tex.bindless)
+         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
+      else
+         v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11));
+      Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1));
+      pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+                         TYPE_U32, bld.mkImm(0), is_3d);
+
+      bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16));
+      su->moveSources(dim, 1);
+      su->setSrc(dim, v);
+      su->tex.target = nv50_ir::TEX_TARGET_3D;
+      pos++;
+   }
+
    if (su->tex.bindless)
-      handle = ind;
+      handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047));
    else
       handle = loadTexHandle(ind, slot + 32);
+
    su->setSrc(arg + pos, handle);
 
    // The address check doesn't make sense here. The format check could make
    // sense but it's a bit of a pain.
-   if (su->tex.bindless)
-      return;
+   if (!su->tex.bindless) {
+      // prevent read fault when the image is not actually bound
+      pred =
+         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+                   TYPE_U32, bld.mkImm(0),
+                   loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
+      if (su->op != OP_SUSTP && su->tex.format) {
+         const TexInstruction::ImgFormatDesc *format = su->tex.format;
+         int blockwidth = format->bits[0] + format->bits[1] +
+            format->bits[2] + format->bits[3];
+
+         assert(format->components != 0);
+         // make sure that the format doesn't mismatch when it's not FMT_NONE
+         bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
+                   TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
+                   loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
+                   pred->getDef(0));
+      }
+   }
 
-   // prevent read fault when the image is not actually bound
-   CmpInstruction *pred =
-      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
-                TYPE_U32, bld.mkImm(0),
-                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
-   if (su->op != OP_SUSTP && su->tex.format) {
-      const TexInstruction::ImgFormatDesc *format = su->tex.format;
-      int blockwidth = format->bits[0] + format->bits[1] +
-                       format->bits[2] + format->bits[3];
+   // Now we have "pred" which (optionally) contains whether to do the surface
+   // op at all, and a "pred2d" which indicates that, in case of doing the
+   // surface op, we have to create a 2d and 3d version, conditioned on pred2d.
+   TexInstruction *su2d = NULL;
+   if (pred2d) {
+      su2d = cloneForward(func, su)->asTex();
+      for (unsigned i = 0; su->defExists(i); ++i)
+         su2d->setDef(i, bld.getSSA());
+      su2d->moveSources(dim + 1, -1);
+      su2d->tex.target = nv50_ir::TEX_TARGET_2D;
+   }
+   if (pred2d && pred) {
+      Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8,
+                                      bld.getSSA(1, FILE_PREDICATE),
+                                      pred->getDef(0), pred2d->getDef(0));
+      pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
+      pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT);
+      su->setPredicate(CC_P, pred3d->getDef(0));
+      pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE),
+                         pred->getDef(0), pred2d->getDef(0));
+      pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
+   } else if (pred) {
+      su->setPredicate(CC_NOT_P, pred->getDef(0));
+   } else if (pred2d) {
+      su->setPredicate(CC_NOT_P, pred2d->getDef(0));
+   }
+   if (su2d) {
+      su2d->setPredicate(CC_P, pred2d->getDef(0));
+      bld.insert(su2d);
+
+      // Create a UNION so that RA assigns the same registers
+      bld.setPosition(su, true);
+      for (unsigned i = 0; su->defExists(i); ++i) {
+         assert(i < 4);
 
-      assert(format->components != 0);
-      // make sure that the format doesn't mismatch when it's not FMT_NONE
-      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
-                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
-                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
-                pred->getDef(0));
+         ValueDef &def = su->def(i);
+         ValueDef &def2 = su2d->def(i);
+         Instruction *mov = NULL;
+
+         if (pred) {
+            mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
+            mov->setPredicate(CC_P, pred->getDef(0));
+         }
+
+         Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
+                                      bld.getSSA(),
+                                      NULL, def2.get());
+         def.replace(uni->getDef(0), false);
+         uni->setSrc(0, def.get());
+         if (mov)
+            uni->setSrc(2, mov->getDef(0));
+      }
+   } else if (pred) {
+      // Create a UNION so that RA assigns the same registers
+      bld.setPosition(su, true);
+      for (unsigned i = 0; su->defExists(i); ++i) {
+         assert(i < 4);
+
+         ValueDef &def = su->def(i);
+
+         Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
+         mov->setPredicate(CC_P, pred->getDef(0));
+
+         Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
+                                      bld.getSSA(),
+                                      NULL, mov->getDef(0));
+         def.replace(uni->getDef(0), false);
+         uni->setSrc(0, def.get());
+      }
    }
-   su->setPredicate(CC_NOT_P, pred->getDef(0));
+
+   return su2d;
 }
 
 void
 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
 {
-   processSurfaceCoordsGM107(su);
+   // processSurfaceCoordsGM107 also fixes up the outputs, union'ing them with
+   // 0 as necessary (the unions are recorded in loaded[]). It may also return
+   // a second, 2d surface op (su2) that needs the same op/type fixups below.
+
+   Instruction *loaded[4] = {};
+   TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded);
 
    if (su->op == OP_SULDP) {
-      convertSurfaceFormat(su);
-      insertOOBSurfaceOpResult(su);
+      convertSurfaceFormat(su, loaded);
    }
 
    if (su->op == OP_SUREDP) {
-      Value *def = su->getDef(0);
-
       su->op = OP_SUREDB;
+   }
 
-      // There may not be a predicate in the bindless case.
-      if (su->getPredicate()) {
-         su->setDef(0, bld.getSSA());
-
-         bld.setPosition(su, true);
-
-         // make sure to initialize dst value when the atomic operation is not
-         // performed
-         Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
-
-         assert(su->cc == CC_NOT_P);
-         mov->setPredicate(CC_P, su->getPredicate());
-
-         bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
-      }
+   // If we fixed up the type of the regular surface load instruction, we also
+   // have to fix up the copy.
+   if (su2) {
+      su2->op = su->op;
+      su2->dType = su->dType;
+      su2->sType = su->sType;
    }
 }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 0ce2a4b80f8..b4c405a9ea5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -171,10 +171,10 @@ private:
    Value *loadMsInfo32(Value *ptr, uint32_t off);
 
    void adjustCoordinatesMS(TexInstruction *);
-   void processSurfaceCoordsGM107(TexInstruction *);
+   TexInstruction *processSurfaceCoordsGM107(TexInstruction *, Instruction *[4]);
    void processSurfaceCoordsNVE4(TexInstruction *);
    void processSurfaceCoordsNVC0(TexInstruction *);
-   void convertSurfaceFormat(TexInstruction *);
+   void convertSurfaceFormat(TexInstruction *, Instruction **);
    void insertOOBSurfaceOpResult(TexInstruction *);
    Value *calculateSampleOffset(Value *sampleID);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index f62e508258b..4948a8f4cea 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -1433,7 +1433,15 @@ gm107_create_image_handle(struct pipe_context *pipe,
 
    nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
 
-   return 0x100000000ULL | tic->id;
+   // Compute handle: TIC id in the low 11 bits, bit 11 flags a 3d resource and
+   // the bound layer starts at bit 27 (matches the shader-side decode).
+   uint64_t handle = 0x100000000ULL | tic->id;
+   struct nv04_resource *res = nv04_resource(view->resource);
+   if (res->base.target == PIPE_TEXTURE_3D) {
+      handle |= 1 << 11;
+      handle |= (uint32_t)view->u.tex.first_layer << (11 + 16); /* no sign-ext */
+   }
+   return handle;
 
 fail:
    FREE(tic);