summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Christoph Bumiller <e0425955@student.tuwien.ac.at> 2010-09-09 19:12:54 +0200
committer: Christoph Bumiller <e0425955@student.tuwien.ac.at> 2010-09-09 19:21:34 +0200
commit: f30810cb68a53c4fef360778a230126ed0ee0ee3 (patch)
tree: 6b4f2fe6919261a5e356f5d75078c78a941f6cb7
parent: d8dcff79702860eae92d3d35b461c9b71114c1c5 (diff)
nv50: use actual loads/stores if TEMPs are accessed indirectly
-rw-r--r-- src/gallium/drivers/nv50/nv50_pc.c | 2
-rw-r--r-- src/gallium/drivers/nv50/nv50_pc.h | 3
-rw-r--r-- src/gallium/drivers/nv50/nv50_pc_emit.c | 28
-rw-r--r-- src/gallium/drivers/nv50/nv50_pc_optimize.c | 19
-rw-r--r-- src/gallium/drivers/nv50/nv50_pc_print.c | 3
-rw-r--r-- src/gallium/drivers/nv50/nv50_program.c | 7
-rw-r--r-- src/gallium/drivers/nv50/nv50_program.h | 1
-rw-r--r-- src/gallium/drivers/nv50/nv50_screen.c | 25
-rw-r--r-- src/gallium/drivers/nv50/nv50_screen.h | 3
-rw-r--r-- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 54
10 files changed, 122 insertions, 23 deletions
diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index c54f16e4c53..637b3cf2fe3 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -414,6 +414,8 @@ nv50_generate_code(struct nv50_translation_info *ti)
nv_print_program(pc);
#endif
+ pc->opt_reload_elim = ti->store_to_memory ? FALSE : TRUE;
+
/* optimization */
ret = nv_pc_exec_pass0(pc);
if (ret)
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index d9cc775572e..ba32ab08ab2 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -345,6 +345,9 @@ struct nv_pc {
struct nv_fixup *fixups;
int num_fixups;
+
+ /* optimization enables */
+ boolean opt_reload_elim;
};
void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *);
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index bb0a6f32d1e..8c64b198756 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -412,25 +412,25 @@ emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask)
}
static void
-set_ld_st_size(struct nv_pc *pc, ubyte type)
+set_ld_st_size(struct nv_pc *pc, int s, ubyte type)
{
switch (type) {
case NV_TYPE_F64:
- pc->emit[1] |= 0x8000;
+ pc->emit[1] |= 0x8000 << s;
break;
case NV_TYPE_F32:
case NV_TYPE_S32:
case NV_TYPE_U32:
- pc->emit[1] |= 0xc000;
+ pc->emit[1] |= 0xc000 << s;
break;
case NV_TYPE_S16:
- pc->emit[1] |= 0x6000;
+ pc->emit[1] |= 0x6000 << s;
break;
case NV_TYPE_U16:
- pc->emit[1] |= 0x4000;
+ pc->emit[1] |= 0x4000 << s;
break;
case NV_TYPE_S8:
- pc->emit[1] |= 0x2000;
+ pc->emit[1] |= 0x2000 << s;
break;
default:
break;
@@ -473,12 +473,14 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i)
if (sf == NV_FILE_MEM_L) {
pc->emit[0] = 0xd0000001;
pc->emit[1] = 0x40000000;
+
+ set_addr(pc, i);
} else {
NOUVEAU_ERR("invalid ld source file\n");
abort();
}
- set_ld_st_size(pc, STYPE(i, 0));
+ set_ld_st_size(pc, (sf == NV_FILE_MEM_L) ? 8 : 0, STYPE(i, 0));
set_dst(pc, i->def[0]);
set_pred_wr(pc, i);
@@ -495,7 +497,19 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i)
static void
emit_st(struct nv_pc *pc, struct nv_instruction *i)
{
+ assert(SFILE(i, 1) == NV_FILE_GPR);
+ assert(SFILE(i, 0) == NV_FILE_MEM_L);
+
+ pc->emit[0] = 0xd0000001;
+ pc->emit[1] = 0x60000000;
+ SID(pc, i->src[1], 2);
+ SID(pc, i->src[0], 9);
+
+ set_ld_st_size(pc, 8, STYPE(i, 1));
+
+ set_addr(pc, i);
+ set_pred(pc, i);
}
static int
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 4f5bdc1f9fb..09d232abda0 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -82,6 +82,8 @@ inst_commutation_legal(struct nv_instruction *a,
static INLINE boolean
inst_cullable(struct nv_instruction *nvi)
{
+ if (nvi->opcode == NV_OP_STA)
+ return FALSE;
return (!(nvi->is_terminator || nvi->is_join ||
nvi->target ||
nvi->fixed ||
@@ -739,6 +741,7 @@ struct nv_pass_reld_elim {
int alloc;
};
+/* TODO: properly handle loads from l[] memory in the presence of stores */
static int
nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
{
@@ -1074,13 +1077,15 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
if (ret)
return ret;
- reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
- reldelim->pc = pc;
- pc->pass_seq++;
- ret = nv_pass_reload_elim(reldelim, root);
- FREE(reldelim);
- if (ret)
- return ret;
+ if (pc->opt_reload_elim) {
+ reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
+ reldelim->pc = pc;
+ pc->pass_seq++;
+ ret = nv_pass_reload_elim(reldelim, root);
+ FREE(reldelim);
+ if (ret)
+ return ret;
+ }
pc->pass_seq++;
ret = nv_pass_cse(&pass, root);
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index 01a6f009979..74c3970f404 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -217,6 +217,9 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type)
case NV_FILE_FLAGS:
PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value));
break;
+ case NV_FILE_MEM_L:
+ nv_print_address('l', -1, ind, 4 * nv_value_id(value));
+ break;
case NV_FILE_MEM_S:
nv_print_address('s', -1, ind, 4 * nv_value_id(value));
break;
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 925028700cd..24952f70f14 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -168,10 +168,17 @@ prog_inst(struct nv50_translation_info *ti,
inst->Src[0].Register.File == TGSI_FILE_INPUT &&
dst->Index == ti->edgeflag_out)
ti->p->vp.edgeflag = inst->Src[0].Register.Index;
+ } else
+ if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) {
+ if (inst->Dst[0].Register.Indirect)
+ ti->store_to_memory = TRUE;
}
for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
src = &inst->Src[s].Register;
+ if (src->File == TGSI_FILE_TEMPORARY)
+ if (inst->Src[s].Register.Indirect)
+ ti->store_to_memory = TRUE;
if (src->File != TGSI_FILE_INPUT)
continue;
mask = nv50_tgsi_src_mask(inst, s);
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 918baf325f5..a1b2bde97bf 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -116,6 +116,7 @@ struct nv50_translation_info {
int output_access[PIPE_MAX_SHADER_OUTPUTS][4];
boolean indirect_inputs;
boolean indirect_outputs;
+ boolean store_to_memory;
struct tgsi_shader_info scan;
uint32_t *immd32;
unsigned immd32_nr;
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index c1efa443daf..24a6d8055c8 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -274,7 +274,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
uint64_t value;
unsigned chipset = dev->chipset;
unsigned tesla_class = 0;
- unsigned stack_size;
+ unsigned stack_size, local_size, max_warps;
int ret, i;
const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
@@ -495,9 +495,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
/* shader stack */
nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
- stack_size = util_bitcount(value & 0xffff);
- stack_size *= util_bitcount((value >> 24) & 0xf);
- stack_size *= 32 * 64 * 8;
+ max_warps = util_bitcount(value & 0xffff);
+ max_warps *= util_bitcount((value >> 24) & 0xf) * 32;
+
+ stack_size = max_warps * 64 * 8;
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
stack_size, &screen->stack_bo);
@@ -510,6 +511,22 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
OUT_RING (chan, 4);
+ local_size = (NV50_CAP_MAX_PROGRAM_TEMPS * 16) * max_warps * 32;
+
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
+ local_size, &screen->local_bo);
+ if (ret) {
+ nv50_screen_destroy(pscreen);
+ return NULL;
+ }
+
+ local_size = NV50_CAP_MAX_PROGRAM_TEMPS * 16;
+
+ BEGIN_RING(chan, screen->tesla, NV50TCL_LOCAL_ADDRESS_HIGH, 3);
+ OUT_RELOCh(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ OUT_RELOCl(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ OUT_RING (chan, util_unsigned_logbase2(local_size / 8));
+
/* Vertex array limits - max them out */
for (i = 0; i < 16; i++) {
BEGIN_RING(chan, screen->tesla,
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index 1517f5608f2..ad6bdeb27c8 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -25,7 +25,8 @@ struct nv50_screen {
struct nouveau_bo *tic;
struct nouveau_bo *tsc;
- struct nouveau_bo *stack_bo;
+ struct nouveau_bo *stack_bo; /* control flow stack */
+ struct nouveau_bo *local_bo; /* l[] memory */
boolean force_push;
};
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 983fcb2fbf3..f4fee4e0f23 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -558,6 +558,38 @@ bld_insn_3(struct bld_context *bld, uint opcode,
return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type));
}
+static void
+bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst,
+ struct nv_value *val)
+{
+ struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_STA);
+ struct nv_value *loc;
+
+ loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32);
+
+ loc->reg.id = ofst * 4;
+
+ nv_reference(bld->pc, &insn->src[0], loc);
+ nv_reference(bld->pc, &insn->src[1], val);
+ nv_reference(bld->pc, &insn->src[4], ptr);
+}
+
+static struct nv_value *
+bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst)
+{
+ struct nv_value *loc, *val;
+
+ loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32);
+
+ loc->reg.id = ofst * 4;
+
+ val = bld_insn_1(bld, NV_OP_LDA, loc);
+
+ nv_reference(bld->pc, &val->insn->src[4], ptr);
+
+ return val;
+}
+
#define BLD_INSN_1_EX(d, op, dt, s0, s0t) \
do { \
(d) = bld_insn_1(bld, (NV_OP_##op), (s0)); \
@@ -854,10 +886,18 @@ infer_dst_type(unsigned opcode)
static void
emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
- unsigned chan, struct nv_value *value)
+ unsigned chan, struct nv_value *value)
{
+ struct nv_value *ptr;
const struct tgsi_full_dst_register *reg = &inst->Dst[0];
+ if (reg->Register.Indirect) {
+ ptr = FETCH_ADDR(reg->Indirect.Index,
+ tgsi_util_get_src_register_swizzle(&reg->Indirect, 0));
+ } else {
+ ptr = NULL;
+ }
+
assert(chan < 4);
if (inst->Instruction.Opcode != TGSI_OPCODE_MOV)
@@ -893,7 +933,11 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
value->reg.file = NV_FILE_GPR;
if (value->insn->bb != bld->pc->current_block)
value = bld_insn_1(bld, NV_OP_MOV, value);
- STORE_TEMP(reg->Register.Index, chan, value);
+
+ if (bld->ti->store_to_memory)
+ bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value);
+ else
+ STORE_TEMP(reg->Register.Index, chan, value);
break;
case TGSI_FILE_ADDRESS:
assert(reg->Register.Index < BLD_MAX_ADDRS);
@@ -1064,8 +1108,10 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
bld->saved_inputs[bld->ti->input_map[idx][swz]] = res;
break;
case TGSI_FILE_TEMPORARY:
- /* this should be load from l[], with reload elimination later on */
- res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
+ if (bld->ti->store_to_memory)
+ res = bld_lmem_load(bld, ptr, idx * 4 + swz);
+ else
+ res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
break;
case TGSI_FILE_ADDRESS:
res = bld_fetch_global(bld, &bld->avs[idx][swz]);