diff options
| field | value | date |
|---|---|---|
| author | Ilia Mirkin <imirkin@alum.mit.edu> | 2021-03-20 23:40:00 -0400 |
| committer | Marge Bot <eric+marge@anholt.net> | 2021-05-01 20:04:21 +0000 |
| commit | b53b96a86a13cc321ef8812b9ec96c2c229e8e5c (patch) | |
| tree | 5670bcdc202eecfa52ba81d0c92db7633d1605aa | |
| parent | f451854f39f580c6c95a574428498e32ffb6e840 (diff) | |
nv50: add support for doing membars
Global memory barriers are implemented by issuing a series of reads at different addresses, so this requires an address that's safe to read from.
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10164>
7 files changed, 63 insertions, 6 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index 8bf0809fa10..83aae296fbb 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -125,6 +125,9 @@ struct nv50_ir_prog_info uint8_t msInfoCBSlot; /* cX[] used for multisample info */ uint16_t msInfoBase; /* base address for multisample info */ uint16_t uboInfoBase; /* base address for compute UBOs (gk104+) */ + + uint16_t membarOffset; /* base address for membar reads (nv50) */ + uint8_t gmemMembar; /* gX[] on which to perform membar reads (nv50) */ } io; /* driver callback to assign input/output locations */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 9ebb780bfff..2b09855b19f 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -647,6 +647,7 @@ private: bool handleEXPORT(Instruction *); bool handleLOAD(Instruction *); bool handleLDST(Instruction *); + bool handleMEMBAR(Instruction *); bool handleSharedATOM(Instruction *); bool handleSULDP(TexInstruction *); bool handleSUREDP(TexInstruction *); @@ -1619,6 +1620,42 @@ NV50LoweringPreSSA::handleLDST(Instruction *i) return true; } +bool +NV50LoweringPreSSA::handleMEMBAR(Instruction *i) +{ + // For global memory, apparently doing a bunch of reads at different + // addresses forces things to get sufficiently flushed. 
+ if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) { + uint8_t b = prog->driver->io.auxCBSlot; + Value *base = + bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, + prog->driver->io.membarOffset), NULL); + Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0)); + Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), + bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), + physid, bld.loadImm(NULL, 0x1f)), + bld.loadImm(NULL, 2)); + base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off); + Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0); + for (int i = 0; i < 8; i++) { + if (i != 0) { + base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100)); + } + bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base) + ->fixed = 1; + } + } + + // Both global and shared memory barriers also need a regular control bar + // TODO: double-check this is the case + i->op = OP_BAR; + i->subOp = NV50_IR_SUBOP_BAR_SYNC; + i->setSrc(0, bld.mkImm(0u)); + i->setSrc(1, bld.mkImm(0u)); + + return true; +} + // The type that bests represents how each component can be stored when packed. 
static DataType getPackedType(const TexInstruction::ImgFormatDesc *t, int c) @@ -2188,6 +2225,8 @@ NV50LoweringPreSSA::visit(Instruction *i) return handleEXPORT(i); case OP_LOAD: return handleLOAD(i); + case OP_MEMBAR: + return handleMEMBAR(i); case OP_ATOM: case OP_STORE: return handleLDST(i); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 6a6ad01ce40..f4340014396 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -2569,6 +2569,13 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb) addHazard(i, i->src(0).getIndirect(0)); if (i->src(0).isIndirect(1) && typeSizeof(i->dType) >= 8) addHazard(i, i->src(0).getIndirect(1)); + if (i->op == OP_LOAD && i->fixed && targ->getChipset() < 0xc0) { + // Add a hazard to make sure we keep the op around. These are used + // for membars. + Instruction *nop = new_Instruction(func, OP_NOP, i->dType); + nop->setSrc(0, i->getDef(0)); + i->bb->insertAfter(i, nop); + } } else if (i->op == OP_UNION || i->op == OP_MERGE || diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index ccb14535b55..e5a3a995505 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -297,11 +297,6 @@ CodeEmitter::prepareEmission(BasicBlock *bb) for (i = bb->getEntry(); i; i = next) { next = i->next; - if (i->op == OP_MEMBAR && !targ->isOpSupported(OP_MEMBAR, TYPE_NONE)) { - bb->remove(i); - continue; - } - i->encSize = getMinEncodingSize(i); if (next && i->encSize < 8) ++nShort; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h index af8a290db71..ed89d54afe4 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h @@ -123,7 +123,11 @@ /* Compute 
buffer info: 16 surfaces, 12 32-bit integers each */ #define NV50_CB_AUX_BUF_INFO(i) (0x3c4 + (i) * 12 * 4) #define NV50_CB_AUX_BUF_SIZE (NV50_MAX_GLOBALS * 12 * 4) -/* next spot: 0x644 */ +/* Compute membar mapped area */ +#define NV50_CB_AUX_MEMBAR_OFFSET 0x6c4 +/* next spot: 0x6c8 */ +/* 0x800 from the end for compute shader membars, reads only. */ +#define NV50_CB_AUX_MEMBAR (NV50_CB_AUX_SIZE - 0x800) /* 4 32-bit floats for the vertex runout, put at the end */ #define NV50_CB_AUX_RUNOUT_OFFSET (NV50_CB_AUX_SIZE - 0x10) diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c index d818d21ef19..90349a92b01 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c @@ -364,6 +364,9 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset, info->io.msInfoCBSlot = 15; info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET; + info->io.membarOffset = NV50_CB_AUX_MEMBAR_OFFSET; + info->io.gmemMembar = 15; + info->assignSlots = nv50_program_assign_varying_slots; prog->vp.bfc[0] = 0xff; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 2dffde2b2ad..7948a6d37cc 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -797,6 +797,12 @@ nv50_screen_init_hwctx(struct nv50_screen *screen) PUSH_DATAh(push, screen->uniforms->offset + (4 << 16) + NV50_CB_AUX_RUNOUT_OFFSET); PUSH_DATA (push, screen->uniforms->offset + (4 << 16) + NV50_CB_AUX_RUNOUT_OFFSET); + /* set the membar offset */ + BEGIN_NV04(push, NV50_3D(CB_ADDR), 1); + PUSH_DATA (push, (NV50_CB_AUX_MEMBAR_OFFSET << (8 - 2)) | NV50_CB_AUX); + BEGIN_NI04(push, NV50_3D(CB_DATA(0)), 1); + PUSH_DATA (push, screen->uniforms->offset + (4 << 16) + NV50_CB_AUX_MEMBAR_OFFSET); + nv50_upload_ms_info(push); /* max TIC (bits 4:8) & TSC bindings, per program type */ |