Diffstat (limited to 'src/broadcom/compiler/vir_to_qpu.c')
-rw-r--r--  src/broadcom/compiler/vir_to_qpu.c | 202
1 file changed, 133 insertions, 69 deletions
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index aa33545420e..605c3e4c7d5 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -45,12 +45,6 @@ qpu_magic(enum v3d_qpu_waddr waddr)
         return reg;
 }
 
-static inline struct qpu_reg
-qpu_acc(int acc)
-{
-        return qpu_magic(V3D_QPU_WADDR_R0 + acc);
-}
-
 struct v3d_qpu_instr
 v3d_qpu_nop(void)
 {
@@ -92,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst)
         return q;
 }
 
+static void
+v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
+{
+        /* If we have a small immediate, move it from inst->raddr_b to the
+         * corresponding raddr.
+         */
+        if (src.smimm) {
+                assert(instr->sig.small_imm_a || instr->sig.small_imm_b ||
+                       instr->sig.small_imm_c || instr->sig.small_imm_d);
+                *raddr = instr->raddr_b;
+                return;
+        }
+
+        assert(!src.magic);
+        *raddr = src.index;
+}
+
 /**
  * Allocates the src register (accumulator or register file) into the RADDR
  * fields of the instruction.
  */
 static void
-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+v3d42_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
 {
         if (src.smimm) {
-                assert(instr->sig.small_imm);
+                assert(instr->sig.small_imm_b);
                 *mux = V3D_QPU_MUX_B;
                 return;
         }
@@ -112,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
                 return;
         }
 
-        if (instr->alu.add.a != V3D_QPU_MUX_A &&
-            instr->alu.add.b != V3D_QPU_MUX_A &&
-            instr->alu.mul.a != V3D_QPU_MUX_A &&
-            instr->alu.mul.b != V3D_QPU_MUX_A) {
+        if (instr->alu.add.a.mux != V3D_QPU_MUX_A &&
+            instr->alu.add.b.mux != V3D_QPU_MUX_A &&
+            instr->alu.mul.a.mux != V3D_QPU_MUX_A &&
+            instr->alu.mul.b.mux != V3D_QPU_MUX_A) {
                 instr->raddr_a = src.index;
                 *mux = V3D_QPU_MUX_A;
         } else {
                 if (instr->raddr_a == src.index) {
                         *mux = V3D_QPU_MUX_A;
                 } else {
-                        assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
-                                 instr->alu.add.b == V3D_QPU_MUX_B &&
-                                 instr->alu.mul.a == V3D_QPU_MUX_B &&
-                                 instr->alu.mul.b == V3D_QPU_MUX_B) ||
+                        assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B &&
+                                 instr->alu.add.b.mux == V3D_QPU_MUX_B &&
+                                 instr->alu.mul.a.mux == V3D_QPU_MUX_B &&
+                                 instr->alu.mul.b.mux == V3D_QPU_MUX_B) ||
                                src.index == instr->raddr_b);
 
                         instr->raddr_b = src.index;
@@ -134,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
         }
 }
 
-static bool
-is_no_op_mov(struct qinst *qinst)
+/*
+ * The main purpose of the following wrapper is to make calling set_src
+ * cleaner. This is the reason it receives both mux and raddr pointers;
+ * which one gets filled depends on the device version.
+ */
+static void
+set_src(struct v3d_qpu_instr *instr,
+        enum v3d_qpu_mux *mux,
+        uint8_t *raddr,
+        struct qpu_reg src,
+        const struct v3d_device_info *devinfo)
 {
-        static const struct v3d_qpu_sig no_sig = {0};
-
-        /* Make sure it's just a lone MOV. */
-        if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
-            qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
-            qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
-            memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
-                return false;
-        }
+        if (devinfo->ver < 71)
+                return v3d42_set_src(instr, mux, src);
+        else
+                return v3d71_set_src(instr, raddr, src);
+}
 
-        /* Check if it's a MOV from a register to itself. */
+static bool
+v3d42_mov_src_and_dst_equal(struct qinst *qinst)
+{
         enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
         if (qinst->qpu.alu.mul.magic_write) {
                 if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
                         return false;
 
-                if (qinst->qpu.alu.mul.a !=
+                if (qinst->qpu.alu.mul.a.mux !=
                     V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
                         return false;
                 }
         } else {
                 int raddr;
 
-                switch (qinst->qpu.alu.mul.a) {
+                switch (qinst->qpu.alu.mul.a.mux) {
                 case V3D_QPU_MUX_A:
                         raddr = qinst->qpu.raddr_a;
                         break;
@@ -174,10 +192,61 @@ is_no_op_mov(struct qinst *qinst)
                         return false;
         }
 
+        return true;
+}
+
+static bool
+v3d71_mov_src_and_dst_equal(struct qinst *qinst)
+{
+        if (qinst->qpu.alu.mul.magic_write)
+                return false;
+
+        enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+        int raddr;
+
+        raddr = qinst->qpu.alu.mul.a.raddr;
+        if (raddr != waddr)
+                return false;
+
+        return true;
+}
+
+static bool
+mov_src_and_dst_equal(struct qinst *qinst,
+                      const struct v3d_device_info *devinfo)
+{
+        if (devinfo->ver < 71)
+                return v3d42_mov_src_and_dst_equal(qinst);
+        else
+                return v3d71_mov_src_and_dst_equal(qinst);
+}
+
+
+static bool
+is_no_op_mov(struct qinst *qinst,
+             const struct v3d_device_info *devinfo)
+{
+        static const struct v3d_qpu_sig no_sig = {0};
+
+        /* Make sure it's just a lone MOV. We only check for M_MOV. Although
+         * V3D 7.x also has A_MOV, we don't need to check for it because we
+         * always emit MOVs using M_MOV. We could use A_MOV later during
+         * scheduling to improve performance.
+         */
+        if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+            qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
+            qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
+            memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
+                return false;
+        }
+
+        if (!mov_src_and_dst_equal(qinst, devinfo))
+                return false;
+
         /* No packing or flags updates, or we need to execute the
          * instruction.
          */
-        if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+        if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
             qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
             qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
             qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
@@ -193,8 +262,6 @@ v3d_generate_code_block(struct v3d_compile *c,
                         struct qblock *block,
                         struct qpu_reg *temp_registers)
 {
-        int last_vpm_read_index = -1;
-
         vir_for_each_inst_safe(qinst, block) {
 #if 0
                 fprintf(stderr, "translating qinst to qpu: ");
@@ -202,8 +269,6 @@ v3d_generate_code_block(struct v3d_compile *c,
                 fprintf(stderr, "\n");
 #endif
 
-                struct qinst *temp;
-
                 if (vir_has_uniform(qinst))
                         c->num_uniforms++;
 
@@ -219,8 +284,14 @@ v3d_generate_code_block(struct v3d_compile *c,
                                 src[i] = qpu_magic(qinst->src[i].index);
                                 break;
                         case QFILE_NULL:
+                                /* QFILE_NULL is an undef, so we can load
+                                 * anything. We use a register that doesn't
+                                 * have scheduling restrictions.
+                                 */
+                                src[i] = qpu_reg(5);
+                                break;
                         case QFILE_LOAD_IMM:
-                                src[i] = qpu_acc(0);
+                                assert(!"not reached");
                                 break;
                         case QFILE_TEMP:
                                 src[i] = temp_registers[index];
@@ -228,18 +299,6 @@ v3d_generate_code_block(struct v3d_compile *c,
                         case QFILE_SMALL_IMM:
                                 src[i].smimm = true;
                                 break;
-
-                        case QFILE_VPM:
-                                assert((int)qinst->src[i].index >=
-                                       last_vpm_read_index);
-                                (void)last_vpm_read_index;
-                                last_vpm_read_index = qinst->src[i].index;
-
-                                temp = new_qpu_nop_before(qinst);
-                                temp->qpu.sig.ldvpm = true;
-
-                                src[i] = qpu_acc(3);
-                                break;
                         }
                 }
 
@@ -261,10 +320,6 @@ v3d_generate_code_block(struct v3d_compile *c,
                         dst = temp_registers[qinst->dst.index];
                         break;
 
-                case QFILE_VPM:
-                        dst = qpu_magic(V3D_QPU_WADDR_VPM);
-                        break;
-
                 case QFILE_SMALL_IMM:
                 case QFILE_LOAD_IMM:
                         assert(!"not reached");
@@ -276,10 +331,15 @@ v3d_generate_code_block(struct v3d_compile *c,
                         assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
                         assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
 
-                        if (!dst.magic ||
-                            dst.index != V3D_QPU_WADDR_R5) {
-                                assert(c->devinfo->ver >= 40);
+                        bool use_rf;
+                        if (c->devinfo->has_accumulators) {
+                                use_rf = !dst.magic ||
+                                         dst.index != V3D_QPU_WADDR_R5;
+                        } else {
+                                use_rf = dst.magic || dst.index != 0;
+                        }
 
+                        if (use_rf) {
                                 if (qinst->qpu.sig.ldunif) {
                                         qinst->qpu.sig.ldunif = false;
                                         qinst->qpu.sig.ldunifrf = true;
@@ -299,13 +359,18 @@ v3d_generate_code_block(struct v3d_compile *c,
                         qinst->qpu.sig_magic = dst.magic;
                 } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
                         assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
                         if (nsrc >= 1) {
                                 set_src(&qinst->qpu,
-                                        &qinst->qpu.alu.add.a, src[0]);
+                                        &qinst->qpu.alu.add.a.mux,
+                                        &qinst->qpu.alu.add.a.raddr,
+                                        src[0], c->devinfo);
                         }
                         if (nsrc >= 2) {
                                 set_src(&qinst->qpu,
-                                        &qinst->qpu.alu.add.b, src[1]);
+                                        &qinst->qpu.alu.add.b.mux,
+                                        &qinst->qpu.alu.add.b.raddr,
+                                        src[1], c->devinfo);
                         }
 
                         qinst->qpu.alu.add.waddr = dst.index;
@@ -313,17 +378,21 @@ v3d_generate_code_block(struct v3d_compile *c,
                 } else {
                         if (nsrc >= 1) {
                                 set_src(&qinst->qpu,
-                                        &qinst->qpu.alu.mul.a, src[0]);
+                                        &qinst->qpu.alu.mul.a.mux,
+                                        &qinst->qpu.alu.mul.a.raddr,
+                                        src[0], c->devinfo);
                         }
                         if (nsrc >= 2) {
                                 set_src(&qinst->qpu,
-                                        &qinst->qpu.alu.mul.b, src[1]);
+                                        &qinst->qpu.alu.mul.b.mux,
+                                        &qinst->qpu.alu.mul.b.raddr,
+                                        src[1], c->devinfo);
                         }
 
                         qinst->qpu.alu.mul.waddr = dst.index;
                         qinst->qpu.alu.mul.magic_write = dst.magic;
 
-                        if (is_no_op_mov(qinst)) {
+                        if (is_no_op_mov(qinst, c->devinfo)) {
                                 vir_remove_instruction(c, qinst);
                                 continue;
                         }
@@ -378,11 +447,7 @@ v3d_dump_qpu(struct v3d_compile *c)
                 const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]);
                 fprintf(stderr, "0x%016"PRIx64" %s", c->qpu_insts[i], str);
 
-                /* We can only do this on 4.x, because we're not tracking TMU
-                 * implicit uniforms here on 3.x.
-                 */
-                if (c->devinfo->ver >= 40 &&
-                    reads_uniform(c->devinfo, c->qpu_insts[i])) {
+                if (reads_uniform(c->devinfo, c->qpu_insts[i])) {
                         fprintf(stderr, " (");
                         vir_dump_uniform(c->uniform_contents[next_uniform],
                                          c->uniform_data[next_uniform]);
@@ -394,8 +459,7 @@ v3d_dump_qpu(struct v3d_compile *c)
         }
 
         /* Make sure our dumping lined up. */
-        if (c->devinfo->ver >= 40)
-                assert(next_uniform == c->num_uniforms);
+        assert(next_uniform == c->num_uniforms);
 
         fprintf(stderr, "\n");
 }
@@ -431,8 +495,8 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
         }
         assert(i == c->qpu_inst_count);
 
-        if (V3D_DEBUG & (V3D_DEBUG_QPU |
-                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
+        if (V3D_DBG(QPU) ||
+            v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
                 v3d_dump_qpu(c);
         }
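The core pattern of this patch is the pair of thin wrappers, set_src and mov_src_and_dst_equal, that pick a v3d42_* or v3d71_* implementation from devinfo->ver: V3D 4.2 addresses ALU sources through a mux selector backed by raddr_a/raddr_b, while V3D 7.x has no mux and uses plain register-file addresses. Below is a minimal standalone sketch of that dispatch shape; the struct and function names are invented for illustration and are not the Mesa types.

#include <stdint.h>
#include <stdio.h>

/* Invented, simplified stand-ins for the Mesa types: v3d 4.2 selects a
 * source through a mux field, v3d 7.x through a register-file address.
 */
struct dev { int ver; };
struct ins { int mux_a; uint8_t raddr_a; };

static void v3d42_pick(struct ins *i, int reg) { i->mux_a = reg; }
static void v3d71_pick(struct ins *i, int reg) { i->raddr_a = (uint8_t)reg; }

/* Mirrors the patched set_src(): the wrapper receives both destinations
 * and fills only the one the target generation actually encodes.
 */
static void pick(struct ins *i, const struct dev *d, int reg)
{
        if (d->ver < 71)
                v3d42_pick(i, reg);
        else
                v3d71_pick(i, reg);
}

int main(void)
{
        struct dev v42 = { .ver = 42 }, v71 = { .ver = 71 };
        struct ins a = {0}, b = {0};

        pick(&a, &v42, 3);   /* fills the mux field */
        pick(&b, &v71, 3);   /* fills the raddr field */
        printf("v42 mux=%d  v71 raddr=%u\n", a.mux_a, b.raddr_a);
        return 0;
}

Passing both output pointers keeps a single call site per operand in v3d_generate_code_block; each backend simply ignores the field it does not encode.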
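For small immediates, v3d71_set_src relies on the 7.x encoding where the immediate value always travels in raddr_b and one of the per-operand small_imm_a..d signal bits marks which slot consumes it, so the slot's raddr is just a copy of instr->raddr_b. A simplified sketch of that routing, again with invented struct names rather than the real QPU encoding:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented, simplified instruction layout for illustration. */
struct sig { bool small_imm_a, small_imm_b, small_imm_c, small_imm_d; };
struct qpu { struct sig sig; uint8_t raddr_b; };

/* 7.x style: the encoded immediate always sits in raddr_b; a per-operand
 * small_imm_* signal says which source slot consumes it, so that slot's
 * raddr becomes a copy of raddr_b.
 */
static void route_small_imm(const struct qpu *q, uint8_t *slot_raddr)
{
        assert(q->sig.small_imm_a || q->sig.small_imm_b ||
               q->sig.small_imm_c || q->sig.small_imm_d);
        *slot_raddr = q->raddr_b;
}

int main(void)
{
        struct qpu q = { .sig = { .small_imm_c = true }, .raddr_b = 17 };
        uint8_t raddr_c = 0;

        route_small_imm(&q, &raddr_c);  /* raddr_c now carries the imm */
        printf("raddr_c = %u\n", raddr_c);
        return 0;
}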
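The reworked is_no_op_mov splits the src-equals-dst test per generation: on 4.2, a magic write to r0-r4 counts only when the source mux names the same accumulator, while on 7.x, which has no accumulators, only a register-file write back to the same raddr qualifies. A condensed sketch of the 7.x-style check under those assumptions; the field names below are invented and the pack/flag tests are collapsed into one flag for brevity:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented, flattened view of a MUL-channel MOV, for illustration. */
struct mov {
        uint8_t waddr;             /* destination RF address */
        uint8_t raddr;             /* source RF address (7.x style) */
        bool magic_write;          /* targets a magic waddr, not the RF */
        bool packs_or_sets_flags;  /* any unpack/pack/cond/flag update */
};

/* 7.x-style check: with no accumulators, a MOV is dead only when it is
 * a plain RF-to-same-RF copy with no packing and no flag side effects.
 */
static bool mov_is_no_op(const struct mov *m)
{
        if (m->magic_write)
                return false;
        if (m->raddr != m->waddr)
                return false;
        return !m->packs_or_sets_flags;
}

int main(void)
{
        struct mov m = { .waddr = 9, .raddr = 9 };
        printf("dead mov: %d\n", mov_is_no_op(&m));  /* prints 1 */
        return 0;
}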