Diffstat (limited to 'src/freedreno/ir3/ir3_ra.c')
-rw-r--r-- | src/freedreno/ir3/ir3_ra.c | 775 |
1 file changed, 607 insertions(+), 168 deletions(-)
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index f03ceef4b24..2debdeab167 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -193,6 +193,8 @@ void
 ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
                         struct ir3_reg_interval *interval)
 {
+   assert(interval->inserted);
+
    if (interval->parent) {
       rb_tree_remove(&interval->parent->children, &interval->node);
    } else {
@@ -538,8 +540,6 @@ ra_file_init(struct ra_file *file)
       BITSET_SET(file->available_to_evict, i);
    }
 
-   file->start = 0;
-
    rb_tree_init(&file->reg_ctx.intervals);
    rb_tree_init(&file->physreg_intervals);
 
@@ -686,6 +686,8 @@ ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
                 struct ra_interval *interval)
 {
    assert(!interval->interval.parent);
+   /* shared live splitting is not allowed! */
+   assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
 
    /* Check if we've already moved this reg before */
    unsigned pcopy_index;
@@ -720,6 +722,10 @@ ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
    interval->physreg_start = dst;
    interval->physreg_end = dst + removed->size;
 
+   assert(interval->physreg_end <= file->size);
+   if (interval->interval.reg->flags & IR3_REG_HALF)
+      assert(interval->physreg_end <= RA_HALF_SIZE);
+
    ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
 }
 
@@ -732,16 +738,76 @@ ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
    ra_push_interval(ctx, file, &temp, dst);
 }
 
+static struct ra_file *
+ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
+{
+   if (reg->flags & IR3_REG_SHARED)
+      return &ctx->shared;
+   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
+      return &ctx->full;
+   else
+      return &ctx->half;
+}
+
+/* Returns true if the proposed spot for "dst" or a killed source overlaps a
+ * destination that's been allocated.
+ */
+static bool
+check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
+                  struct ir3_register *dst, physreg_t start,
+                  physreg_t end)
+{
+   struct ir3_instruction *instr = dst->instr;
+
+   ra_foreach_dst (other_dst, instr) {
+      /* We assume only destinations before the current one have been allocated.
+       */
+      if (other_dst == dst)
+         break;
+
+      if (ra_get_file(ctx, other_dst) != file)
+         continue;
+
+      struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
+      assert(!other_interval->interval.parent);
+      physreg_t other_start = other_interval->physreg_start;
+      physreg_t other_end = other_interval->physreg_end;
+
+      if (other_end > start && end > other_start)
+         return true;
+   }
+
+   return false;
+}
+
+/* True if the destination is "early-clobber," meaning that it cannot be
+ * allocated over killed sources. Some destinations always require it, but it
+ * also is implicitly true for tied destinations whose source is live-through.
+ * If the source is killed, then we skip allocating a register for the
+ * destination altogether so we don't need to worry about that case here.
+ */
+static bool
+is_early_clobber(struct ir3_register *reg)
+{
+   return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
+}
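The overlap test at the heart of check_dst_overlap() above is the standard half-open interval check. For illustration only, here is an editorial sketch (not part of the commit; the types are simplified stand-ins for ir3's physreg intervals):

#include <stdio.h>
#include <stdbool.h>

/* Half-open physreg ranges, as in ir3: [start, end). */
struct interval { unsigned start, end; };

/* Two half-open ranges overlap iff each one starts before the other ends.
 * This is the same test check_dst_overlap() applies between a candidate
 * placement and every destination allocated so far.
 */
static bool overlaps(struct interval a, struct interval b)
{
   return a.end > b.start && b.end > a.start;
}

int main(void)
{
   struct interval allocated = { 4, 8 };  /* an already-placed dst */
   struct interval partial = { 6, 10 };   /* overlaps [4, 8) */
   struct interval adjacent = { 8, 12 };  /* merely touches; no overlap */

   printf("partial:  %d\n", overlaps(partial, allocated));  /* 1 */
   printf("adjacent: %d\n", overlaps(adjacent, allocated)); /* 0 */
   return 0;
}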
 
 static bool
-get_reg_specified(struct ra_file *file, struct ir3_register *reg,
-                  physreg_t physreg, bool is_source)
+get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
+                  struct ir3_register *reg, physreg_t physreg, bool is_source)
 {
    for (unsigned i = 0; i < reg_size(reg); i++) {
-      if (!BITSET_TEST(is_source ? file->available_to_evict : file->available,
+      if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
+                          file->available_to_evict : file->available,
                        physreg + i))
          return false;
    }
 
+   if (!is_source &&
+       check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
+      return false;
+
    return true;
 }
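The two bitsets consulted above differ only in how killed sources are treated. A minimal editorial model (plain bool arrays instead of Mesa's BITSET; the slot layout is invented):

#include <stdbool.h>
#include <stdio.h>

#define N 8

int main(void)
{
   /* Slots 4-5 hold a killed source: still "available" to an ordinary
    * destination, which may be placed on top of it, but not present in the
    * stricter eviction set that early-clobber destinations and shuffled
    * sources must consult.
    */
   bool available[N]          = { 1, 1, 0, 0, 1, 1, 1, 1 };
   bool available_to_evict[N] = { 1, 1, 0, 0, 0, 0, 1, 1 };

   unsigned slot = 4;
   printf("normal dst may use slot %u: %d\n", slot, available[slot]);
   printf("early-clobber dst may use slot %u: %d\n", slot,
          available_to_evict[slot]);
   return 0;
}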
 
@@ -759,8 +825,13 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
    memcpy(available_to_evict, file->available_to_evict,
           sizeof(available_to_evict));
 
-   for (unsigned i = 0; i < reg_size(reg); i++)
+   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
+   memcpy(available, file->available, sizeof(available));
+
+   for (unsigned i = 0; i < reg_size(reg); i++) {
       BITSET_CLEAR(available_to_evict, physreg + i);
+      BITSET_CLEAR(available, physreg + i);
+   }
 
    unsigned eviction_count = 0;
    /* Iterate over each range conflicting with physreg */
@@ -769,7 +840,7 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
         conflicting != NULL &&
         conflicting->physreg_start < physreg + reg_size(reg);
         conflicting = next, next = ra_interval_next_or_null(next)) {
-      if (!is_source && conflicting->is_killed)
+      if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
          continue;
 
       if (conflicting->frozen) {
@@ -777,10 +848,12 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
          return false;
       }
 
+      unsigned conflicting_file_size =
+         reg_file_size(file, conflicting->interval.reg);
       unsigned avail_start, avail_end;
       bool evicted = false;
       BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
-                            reg_file_size(file, conflicting->interval.reg)) {
+                            conflicting_file_size) {
         unsigned size = avail_end - avail_start;
 
         /* non-half registers must be aligned */
@@ -790,7 +863,11 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
            size--;
         }
 
-        if (size >= conflicting->physreg_end - conflicting->physreg_start) {
+        unsigned conflicting_size =
+           conflicting->physreg_end - conflicting->physreg_start;
+        if (size >= conflicting_size &&
+            !check_dst_overlap(ctx, file, reg, avail_start, avail_start +
+                               conflicting_size)) {
           for (unsigned i = 0;
                i < conflicting->physreg_end - conflicting->physreg_start; i++)
              BITSET_CLEAR(available_to_evict, avail_start + i);
@@ -803,6 +880,76 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
        }
     }
 
+      if (evicted)
+         continue;
+
+      /* If we couldn't evict this range, but the register we're allocating is
+       * allowed to overlap with a killed range, then we may be able to swap it
+       * with a killed range to achieve the same effect.
+       */
+      if (is_early_clobber(reg) || is_source)
+         return false;
+
+      foreach_interval (killed, file) {
+         if (!killed->is_killed)
+            continue;
+
+         if (killed->physreg_end - killed->physreg_start !=
+             conflicting->physreg_end - conflicting->physreg_start)
+            continue;
+
+         if (killed->physreg_end > conflicting_file_size ||
+             conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
+            continue;
+
+         /* We can't swap the killed range if it partially/fully overlaps the
+          * space we're trying to allocate or (in speculative mode) if it's
+          * already been swapped and will overlap when we actually evict.
+          */
+         bool killed_available = true;
+         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
+            if (!BITSET_TEST(available, i)) {
+               killed_available = false;
+               break;
+            }
+         }
+
+         if (!killed_available)
+            continue;
+
+         if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
+                               killed->physreg_end))
+            continue;
+
+         /* Check for alignment if one is a full reg */
+         if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
+              !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
+             (killed->physreg_start % 2 != 0 ||
+              conflicting->physreg_start % 2 != 0))
+            continue;
+
+         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
+            BITSET_CLEAR(available, i);
+         }
+         /* Because this will generate swaps instead of moves, multiply the
+          * cost by 2.
+          */
+         eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
+         if (!speculative) {
+            physreg_t killed_start = killed->physreg_start,
+                      conflicting_start = conflicting->physreg_start;
+            struct ra_removed_interval killed_removed =
+               ra_pop_interval(ctx, file, killed);
+            struct ra_removed_interval conflicting_removed =
+               ra_pop_interval(ctx, file, conflicting);
+            ra_push_interval(ctx, file, &killed_removed, conflicting_start);
+            ra_push_interval(ctx, file, &conflicting_removed, killed_start);
+         }
+
+         evicted = true;
+         break;
+      }
+
       if (!evicted)
          return false;
    }
@@ -819,15 +966,16 @@ removed_interval_cmp(const void *_i1, const void *_i2)
 
 /* We sort the registers as follows:
  *
- * |--------------------------------------------------------------------|
- * |                   |             |             |                    |
- * | Half live-through | Half killed | Full killed | Full live-through  |
- * |                   |             |             |                    |
- * |--------------------------------------------------------------------|
- * |                 |
- * |   Destination   |
- * |                 |
- * |-----------------|
+ * |------------------------------------------------------------------------------------------|
+ * |              |                    |        |        |                    |                |
+ * | Half         | Half early-clobber | Half   | Full   | Full early-clobber | Full           |
+ * | live-through | destination        | killed | killed | destination        | live-through   |
+ * |              |                    |        |        |                    |                |
+ * |------------------------------------------------------------------------------------------|
+ * |                 |
+ * |   Destination   |
+ * |                 |
+ * |-----------------|
  *
  * Half-registers have to be first so that they stay in the low half of
  * the register file. Then half and full killed must stay together so that
@@ -859,6 +1007,37 @@ removed_interval_cmp(const void *_i1, const void *_i2)
    return 0;
 }
 
+static int
+dsts_cmp(const void *_i1, const void *_i2)
+{
+   struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
+   struct ir3_register *i2 = *(struct ir3_register *const *) _i2;
+
+   /* Treat tied destinations as if they are live-through sources, and normal
+    * destinations as killed sources.
+    */
+   unsigned i1_align = reg_elem_size(i1);
+   unsigned i2_align = reg_elem_size(i2);
+   if (i1_align > i2_align)
+      return 1;
+   if (i1_align < i2_align)
+      return -1;
+
+   if (i1_align == 1) {
+      if (!is_early_clobber(i2))
+         return -1;
+      if (!is_early_clobber(i1))
+         return 1;
+   } else {
+      if (!is_early_clobber(i2))
+         return 1;
+      if (!is_early_clobber(i1))
+         return -1;
+   }
+
+   return 0;
+}
+
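The six-column ordering in the diagram can equivalently be expressed as a single rank per interval; a self-contained editorial sketch (plain C, invented struct, not ir3's comparator):

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* Simplified stand-ins for the properties the sort keys use. */
struct item {
   bool half;          /* lives in the half-register file */
   bool killed;        /* killed source */
   bool early_clobber; /* early-clobber destination */
};

/* Rank per the diagram: half live-through < half early-clobber dst <
 * half killed < full killed < full early-clobber dst < full live-through.
 */
static int rank(const struct item *it)
{
   if (it->half)
      return it->killed ? 2 : (it->early_clobber ? 1 : 0);
   return it->killed ? 3 : (it->early_clobber ? 4 : 5);
}

static int cmp(const void *a, const void *b)
{
   int ra = rank((const struct item *)a);
   int rb = rank((const struct item *)b);
   return (ra > rb) - (ra < rb);
}

int main(void)
{
   struct item items[] = {
      { .half = false, .killed = false },       /* full live-through */
      { .half = true, .killed = true },         /* half killed */
      { .half = false, .early_clobber = true }, /* full early-clobber dst */
      { .half = true, .killed = false },        /* half live-through */
   };
   qsort(items, 4, sizeof(items[0]), cmp);
   for (int i = 0; i < 4; i++)
      printf("rank %d\n", rank(&items[i]));
   return 0;
}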
 /* "Compress" all the live intervals so that there is enough space for the
  * destination register. As there can be gaps when a more-aligned interval
  * follows a less-aligned interval, this also sorts them to remove such
@@ -869,29 +1048,100 @@ removed_interval_cmp(const void *_i1, const void *_i2)
  * Return the physreg to use.
  */
 static physreg_t
-compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
-                   unsigned align, bool is_source)
+compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
+                   struct ir3_register *reg)
 {
+   unsigned reg_align = reg_elem_size(reg);
    DECLARE_ARRAY(struct ra_removed_interval, intervals);
    intervals_count = intervals_sz = 0;
    intervals = NULL;
 
+   DECLARE_ARRAY(struct ir3_register *, dsts);
+   dsts_count = dsts_sz = 0;
+   dsts = NULL;
+   array_insert(ctx, dsts, reg);
+   bool dst_inserted[reg->instr->dsts_count];
+
+   unsigned dst_size = reg->tied ? 0 : reg_size(reg);
+   unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
+   unsigned half_dst_size = 0, ec_half_dst_size = 0;
+   if (reg_align == 1) {
+      half_dst_size = dst_size;
+      ec_half_dst_size = ec_dst_size;
+   }
+
+   unsigned removed_size = 0, removed_half_size = 0;
+   unsigned removed_killed_size = 0, removed_killed_half_size = 0;
    unsigned file_size =
-      align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
+      reg_align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
    physreg_t start_reg = 0;
 
    foreach_interval_rev_safe (interval, file) {
+      /* We'll check if we can compact the intervals starting here. */
+      physreg_t candidate_start = interval->physreg_end;
+
+      /* Check if there are any other destinations we need to compact. */
+      ra_foreach_dst_n (other_dst, n, reg->instr) {
+         if (other_dst == reg)
+            break;
+         if (ra_get_file(ctx, other_dst) != file)
+            continue;
+         if (dst_inserted[n])
+            continue;
+
+         struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
+         /* if the destination partially overlaps this interval, we need to
+          * extend candidate_start to the end.
+          */
+         if (other_interval->physreg_start < candidate_start) {
+            candidate_start = MAX2(candidate_start,
+                                   other_interval->physreg_end);
+            continue;
+         }
+
+         dst_inserted[n] = true;
+
+         /* dst intervals with a tied killed source are considered attached to
+          * that source. Don't actually insert them. This means we have to
+          * update them below if their tied source moves.
+          */
+         if (other_dst->tied) {
+            struct ra_interval *tied_interval =
+               &ctx->intervals[other_dst->tied->def->name];
+            if (tied_interval->is_killed)
+               continue;
+         }
+
+         d("popping destination %u physreg %u\n",
+           other_interval->interval.reg->name,
+           other_interval->physreg_start);
+
+         array_insert(ctx, dsts, other_dst);
+         unsigned interval_size = reg_size(other_dst);
+         if (is_early_clobber(other_dst)) {
+            ec_dst_size += interval_size;
+            if (other_interval->interval.reg->flags & IR3_REG_HALF)
+               ec_half_dst_size += interval_size;
+         } else {
+            dst_size += interval_size;
+            if (other_interval->interval.reg->flags & IR3_REG_HALF)
+               half_dst_size += interval_size;
+         }
+      }
+
       /* Check if we can sort the intervals *after* this one and have enough
-       * space leftover to accomodate "size" units. Also check that we have
-       * enough space leftover for half-registers, if we're inserting a
-       * half-register (otherwise we only shift any half-registers down so they
-       * should be safe).
+       * space leftover to accommodate all intervals, keeping in mind that
+       * killed sources overlap non-tied destinations. Also check that we have
+       * enough space leftover for half-registers, if we're inserting a
+       * half-register (otherwise we only shift any half-registers down so
+       * they should be safe).
        */
-      if (interval->physreg_end + size + removed_size <= file->size &&
-          (align != 1 ||
-           interval->physreg_end + size + removed_half_size <= file_size)) {
-         start_reg = interval->physreg_end;
+      if (candidate_start + removed_size + ec_dst_size +
+             MAX2(removed_killed_size, dst_size) <= file->size &&
+          (reg_align != 1 ||
+           candidate_start + removed_half_size + ec_half_dst_size +
+              MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
+         start_reg = candidate_start;
          break;
       }
 
@@ -900,20 +1150,23 @@ compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
       */
      assert(!interval->frozen);
 
-      /* Killed sources don't count because they go at the end and can
+      /* Killed sources are different because they go at the end and can
       * overlap the register we're trying to add.
       */
-      if (!interval->is_killed && !is_source) {
-         removed_size += interval->physreg_end - interval->physreg_start;
-         if (interval->interval.reg->flags & IR3_REG_HALF) {
-            removed_half_size += interval->physreg_end -
-               interval->physreg_start;
-         }
+      unsigned interval_size = interval->physreg_end - interval->physreg_start;
+      if (interval->is_killed) {
+         removed_killed_size += interval_size;
+         if (interval->interval.reg->flags & IR3_REG_HALF)
+            removed_killed_half_size += interval_size;
+      } else {
+         removed_size += interval_size;
+         if (interval->interval.reg->flags & IR3_REG_HALF)
+            removed_half_size += interval_size;
      }
 
      /* Now that we've done the accounting, pop this off */
-      d("popping interval %u physreg %u\n", interval->interval.reg->name,
-        interval->physreg_start);
+      d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
+        interval->physreg_start, interval->is_killed ? ", killed" : "");
      array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
   }
 
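The space check above adds up four bookkeeping sums; killed sources and ordinary destinations may share space, so only the larger of the two contributes. A worked numeric example, editorial only (the quantities are invented; units are half-register slots):

#include <stdio.h>

int main(void)
{
   unsigned file_size = 16;

   unsigned candidate_start = 4;     /* end of the last interval we keep */
   unsigned removed_size = 6;        /* live-through intervals popped */
   unsigned ec_dst_size = 2;         /* early-clobber destinations */
   unsigned removed_killed_size = 4; /* killed sources popped */
   unsigned dst_size = 3;            /* normal destinations */

   /* Early-clobber dsts may not overlap anything, so they always add on
    * top; killed sources and normal dsts can be laid over each other.
    */
   unsigned killed_or_dst =
      removed_killed_size > dst_size ? removed_killed_size : dst_size;
   unsigned needed =
      candidate_start + removed_size + ec_dst_size + killed_or_dst;

   printf("need %u of %u slots: %s\n", needed, file_size,
          needed <= file_size ? "fits" : "keep compacting");
   return 0;
}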
@@ -922,55 +1175,138 @@ compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
     */
    qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
+   qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);
 
-   physreg_t physreg = start_reg;
+   physreg_t live_reg = start_reg;
+   physreg_t dst_reg = (physreg_t)~0;
    physreg_t ret_reg = (physreg_t)~0;
-   for (unsigned i = 0; i < intervals_count; i++) {
-      if (ret_reg == (physreg_t)~0 &&
-          ((intervals[i].interval->is_killed && !is_source) ||
-           !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
-         ret_reg = ALIGN(physreg, align);
-      }
+   unsigned dst_index = 0;
+   unsigned live_index = 0;
+
+   /* We have two lists of intervals to process, live intervals and destination
+    * intervals. Process them in the order of the diagram in insert_cmp().
+    */
+   while (live_index < intervals_count || dst_index < dsts_count) {
+      bool process_dst;
+      if (live_index == intervals_count) {
+         process_dst = true;
+      } else if (dst_index == dsts_count) {
+         process_dst = false;
+      } else {
+         struct ir3_register *dst = dsts[dst_index];
+         struct ra_interval *live_interval = intervals[live_index].interval;
+
+         bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
+         bool live_killed = live_interval->is_killed;
+         bool dst_half = dst->flags & IR3_REG_HALF;
+         bool dst_early_clobber = is_early_clobber(dst);
+
+         if (live_half && !live_killed) {
+            /* far-left of diagram. */
+            process_dst = false;
+         } else if (dst_half && dst_early_clobber) {
+            /* mid-left of diagram. */
+            process_dst = true;
+         } else if (!dst_early_clobber) {
+            /* bottom of diagram. */
+            process_dst = true;
+         } else if (live_killed) {
+            /* middle of diagram. */
+            process_dst = false;
+         } else if (!dst_half && dst_early_clobber) {
+            /* mid-right of diagram. */
+            process_dst = true;
+         } else {
+            /* far right of diagram. */
+            assert(!live_killed && !live_half);
+            process_dst = false;
+         }
+      }
 
-      if (ret_reg != (physreg_t)~0 &&
-          (is_source || !intervals[i].interval->is_killed)) {
-         physreg = MAX2(physreg, ret_reg + size);
+      struct ir3_register *cur_reg =
+         process_dst ? dsts[dst_index] :
+         intervals[live_index].interval->interval.reg;
+
+      physreg_t physreg;
+      if (process_dst && !is_early_clobber(cur_reg)) {
+         if (dst_reg == (physreg_t)~0)
+            dst_reg = live_reg;
+         physreg = dst_reg;
+      } else {
+         physreg = live_reg;
+         struct ra_interval *live_interval = intervals[live_index].interval;
+         bool live_killed = live_interval->is_killed;
+         /* If this is live-through and we've processed the destinations, we
+          * need to make sure we take into account any overlapping destinations.
+          */
+         if (!live_killed && dst_reg != (physreg_t)~0)
+            physreg = MAX2(physreg, dst_reg);
       }
 
-      if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
+      if (!(cur_reg->flags & IR3_REG_HALF))
          physreg = ALIGN(physreg, 2);
-      }
 
-      if (physreg + intervals[i].size >
-          reg_file_size(file, intervals[i].interval->interval.reg)) {
+      d("pushing reg %u physreg %u\n", cur_reg->name, physreg);
+
+      unsigned interval_size = reg_size(cur_reg);
+      if (physreg + interval_size >
+          reg_file_size(file, cur_reg)) {
          d("ran out of room for interval %u!\n",
-           intervals[i].interval->interval.reg->name);
+           cur_reg->name);
 
          unreachable("reg pressure calculation was wrong!");
          return 0;
       }
 
-      d("pushing interval %u physreg %u\n",
-        intervals[i].interval->interval.reg->name, physreg);
-      ra_push_interval(ctx, file, &intervals[i], physreg);
+      if (process_dst) {
+         if (cur_reg == reg) {
+            ret_reg = physreg;
+         } else {
+            struct ra_interval *interval = &ctx->intervals[cur_reg->name];
+            interval->physreg_start = physreg;
+            interval->physreg_end = physreg + interval_size;
+         }
+         dst_index++;
+      } else {
+         ra_push_interval(ctx, file, &intervals[live_index], physreg);
+         live_index++;
+      }
+
+      physreg += interval_size;
 
-      physreg += intervals[i].size;
+      if (process_dst && !is_early_clobber(cur_reg)) {
+         dst_reg = physreg;
+      } else {
+         live_reg = physreg;
+      }
    }
 
-   if (ret_reg == (physreg_t)~0)
-      ret_reg = physreg;
+   /* If we shuffled around a tied source that is killed, we may have to update
+    * its corresponding destination since we didn't insert it above.
+    */
+   ra_foreach_dst (dst, reg->instr) {
+      if (dst == reg)
+         break;
 
-   ret_reg = ALIGN(ret_reg, align);
-   if (ret_reg + size > file_size) {
-      d("ran out of room for the new interval!\n");
-      unreachable("reg pressure calculation was wrong!");
-      return 0;
+      struct ir3_register *tied = dst->tied;
+      if (!tied)
+         continue;
+
+      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
+      if (!tied_interval->is_killed)
+         continue;
+
+      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
+      unsigned dst_size = reg_size(dst);
+      dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
+      dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
    }
 
    return ret_reg;
 }
 
 static void
-update_affinity(struct ir3_register *reg, physreg_t physreg)
+update_affinity(struct ra_file *file, struct ir3_register *reg,
+                physreg_t physreg)
 {
    if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
       return;
@@ -978,6 +1314,9 @@ update_affinity(struct ir3_register *reg, physreg_t physreg)
    if (physreg < reg->merge_set_offset)
       return;
 
+   if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
+      return;
+
    reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
 }
 
@@ -985,8 +1324,9 @@ update_affinity(struct ir3_register *reg, physreg_t physreg)
  * a round-robin algorithm to reduce false dependencies.
  */
 static physreg_t
-find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
-              unsigned align, bool is_source)
+find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
+              struct ir3_register *dst, unsigned file_size, unsigned size,
+              unsigned alignment)
 {
    /* This can happen if we create a very large merge set. Just bail out in that
     * case.
@@ -995,9 +1335,9 @@ find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
       return (physreg_t) ~0;
 
    BITSET_WORD *available =
-      is_source ? file->available_to_evict : file->available;
+      is_early_clobber(dst) ? file->available_to_evict : file->available;
 
-   unsigned start = ALIGN(file->start, align) % (file_size - size + align);
+   unsigned start = ALIGN(file->start, alignment) % (file_size - size + alignment);
    unsigned candidate = start;
    do {
       bool is_available = true;
@@ -1009,11 +1349,16 @@ find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
       }
 
       if (is_available) {
+         is_available =
+            !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
+      }
+
+      if (is_available) {
          file->start = (candidate + size) % file_size;
          return candidate;
       }
 
-      candidate += align;
+      candidate += alignment;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);
 
   return (physreg_t)~0;
 }
 
-static struct ra_file *
-ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
-{
-   if (reg->flags & IR3_REG_SHARED)
-      return &ctx->shared;
-   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
-      return &ctx->full;
-   else
-      return &ctx->half;
-}
-
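The round-robin scan in find_best_gap() starts at file->start, which advances after every allocation and wraps, spreading allocations across the file to reduce false dependencies. A standalone editorial sketch (availability modeled as a plain bool array; names and sizes are invented):

#include <stdio.h>
#include <stdbool.h>

#define FILE_SIZE 8

static int find_gap(const bool *available, unsigned *start_ptr,
                    unsigned size, unsigned alignment)
{
   /* round start up to the alignment, then wrap into the searchable range */
   unsigned start = ((*start_ptr + alignment - 1) / alignment) * alignment;
   start %= (FILE_SIZE - size + alignment);
   unsigned candidate = start;
   do {
      bool ok = true;
      for (unsigned i = 0; i < size; i++)
         ok &= available[candidate + i];
      if (ok) {
         /* advance the round-robin pointer past this allocation */
         *start_ptr = (candidate + size) % FILE_SIZE;
         return (int)candidate;
      }
      candidate += alignment;
      if (candidate + size > FILE_SIZE)
         candidate = 0;
   } while (candidate != start);
   return -1; /* no gap */
}

int main(void)
{
   bool available[FILE_SIZE] = { false, false, true, true,
                                 true, true, false, true };
   unsigned start = 0;
   printf("first gap of 2: %d\n", find_gap(available, &start, 2, 2)); /* 2 */
   printf("next gap of 2:  %d\n", find_gap(available, &start, 2, 2)); /* 4 */
   return 0;
}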
 /* This is the main entrypoint for picking a register. Pick a free register
  * for "reg", shuffling around sources if necessary. In the normal case where
  * "is_source" is false, this register can overlap with killed sources
@@ -1042,16 +1376,15 @@ ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
  */
 static physreg_t
-get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
-        bool is_source)
+get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
 {
    unsigned file_size = reg_file_size(file, reg);
    if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
       physreg_t preferred_reg =
          reg->merge_set->preferred_reg + reg->merge_set_offset;
-      if (preferred_reg < file_size &&
+      if (preferred_reg + reg_size(reg) <= file_size &&
           preferred_reg % reg_elem_size(reg) == 0 &&
-          get_reg_specified(file, reg, preferred_reg, is_source))
+          get_reg_specified(ctx, file, reg, preferred_reg, false))
          return preferred_reg;
    }
 
@@ -1062,8 +1395,9 @@ get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
    unsigned size = reg_size(reg);
    if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
        size < reg->merge_set->size) {
-      physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size,
-                                         reg->merge_set->alignment, is_source);
+      physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
+                                         reg->merge_set->size,
+                                         reg->merge_set->alignment);
       if (best_reg != (physreg_t)~0u) {
          best_reg += reg->merge_set_offset;
          return best_reg;
@@ -1085,14 +1419,14 @@ get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
          physreg_t src_physreg = ra_interval_get_physreg(src_interval);
          if (src_physreg % reg_elem_size(reg) == 0 &&
              src_physreg + size <= file_size &&
-             get_reg_specified(file, reg, src_physreg, is_source))
+             get_reg_specified(ctx, file, reg, src_physreg, false))
             return src_physreg;
       }
    }
 
    physreg_t best_reg =
-      find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
+      find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
    if (best_reg != (physreg_t)~0u) {
       return best_reg;
    }
@@ -1104,7 +1438,7 @@ get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
    unsigned best_eviction_count = ~0;
    for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
       unsigned eviction_count;
-      if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
+      if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
         if (eviction_count < best_eviction_count) {
            best_eviction_count = eviction_count;
            best_reg = i;
@@ -1114,14 +1448,13 @@ get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
 
    if (best_eviction_count != ~0) {
       ASSERTED bool result = try_evict_regs(
-         ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
+         ctx, file, reg, best_reg, &best_eviction_count, false, false);
      assert(result);
      return best_reg;
   }
 
    /* Use the dumb fallback only if try_evict_regs() fails.
     */
-   return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg),
-                             is_source);
+   return compress_regs_left(ctx, file, reg);
 }
 
 static void
@@ -1170,14 +1503,47 @@ allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
                    physreg_t physreg)
 {
+   struct ra_file *file = ra_get_file(ctx, dst);
    struct ra_interval *interval = &ctx->intervals[dst->name];
-   update_affinity(dst, physreg);
+   update_affinity(file, dst, physreg);
 
    ra_interval_init(interval, dst);
    interval->physreg_start = physreg;
    interval->physreg_end = physreg + reg_size(dst);
 }
 
+/* If a tied destination interferes with its source register, we have to insert
+ * a copy beforehand to copy the source to the destination. Because we are using
+ * the parallel_copies array and not creating a separate copy, this copy will
+ * happen in parallel with any shuffling around of the tied source, so we have
+ * to copy the source *as it exists before it is shuffled around*. We do this by
+ * inserting the copy early, before any other copies are inserted. We don't
+ * actually know the destination of the copy, but that's ok because the
+ * dst_interval will be filled out later.
+ */
+static void
+insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
+{
+   struct ir3_register *tied = dst->tied;
+
+   if (!tied)
+      return;
+
+   struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
+   struct ra_interval *dst_interval = &ctx->intervals[dst->name];
+
+   if (tied_interval->is_killed)
+      return;
+
+   physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
+
+   array_insert(ctx, ctx->parallel_copies,
+                (struct ra_parallel_copy){
+                   .interval = dst_interval,
+                   .src = tied_physreg,
+                });
+}
+
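The comment above hinges on parallel-copy semantics: every entry reads its source simultaneously, before any of the moves land, which is why capturing the tied source's current physreg is enough. A minimal editorial sketch of such an executor (plain C, invented register model, not ir3's pcopy lowering):

#include <stdio.h>

/* Execute {regs[dst[i]] <- regs[src[i]]} as a parallel copy: snapshot all
 * sources first, then write, so later writes cannot corrupt earlier reads.
 * Assumes n <= 8 for the scratch buffer.
 */
static void parallel_copy(int *regs, const int *dst, const int *src, int n)
{
   int tmp[8];
   for (int i = 0; i < n; i++)
      tmp[i] = regs[src[i]];
   for (int i = 0; i < n; i++)
      regs[dst[i]] = tmp[i];
}

int main(void)
{
   int regs[4] = { 10, 20, 30, 40 };
   /* swap r0/r1 while also copying the old r0 into r2 */
   int dst[] = { 0, 1, 2 };
   int src[] = { 1, 0, 0 };
   parallel_copy(regs, dst, src, 3);
   printf("%d %d %d\n", regs[0], regs[1], regs[2]); /* 20 10 10 */
   return 0;
}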
 static void
 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
 {
@@ -1186,33 +1552,17 @@ allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
    struct ir3_register *tied = dst->tied;
    if (tied) {
       struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
-      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
-      physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
       if (tied_interval->is_killed) {
          /* The easy case: the source is killed, so we can just reuse it
           * for the destination.
           */
          allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
-      } else {
-         /* The source is live-through, so we need to get a free register
-          * (which is free for both the source and destination!), copy the
-          * original source to it, then use that for the source and
-          * destination.
-          */
-         physreg_t physreg = get_reg(ctx, file, dst, true);
-         allocate_dst_fixed(ctx, dst, physreg);
-         array_insert(ctx, ctx->parallel_copies,
-                      (struct ra_parallel_copy){
-                         .interval = dst_interval,
-                         .src = tied_physreg,
-                      });
+         return;
       }
-
-      return;
    }
 
    /* All the hard work is done by get_reg here. */
-   physreg_t physreg = get_reg(ctx, file, dst, false);
+   physreg_t physreg = get_reg(ctx, file, dst);
    allocate_dst_fixed(ctx, dst, physreg);
 }
 
@@ -1256,7 +1606,8 @@ insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
       struct ir3_register *reg =
          ir3_dst_create(pcopy, INVALID_REG,
-                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
+                        entry->interval->interval.reg->flags &
+                        (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
       reg->size = entry->interval->interval.reg->size;
       reg->wrmask = entry->interval->interval.reg->wrmask;
       assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
@@ -1266,7 +1617,8 @@ insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
       struct ir3_register *reg =
         ir3_src_create(pcopy, INVALID_REG,
-                        entry->interval->interval.reg->flags & ~IR3_REG_SSA);
+                        entry->interval->interval.reg->flags &
+                        (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
       reg->size = entry->interval->interval.reg->size;
       reg->wrmask = entry->interval->interval.reg->wrmask;
       assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
@@ -1285,6 +1637,11 @@ handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
       mark_src_killed(ctx, src);
    }
 
+   /* Pre-insert tied dst copies. */
+   ra_foreach_dst (dst, instr) {
+      insert_tied_dst_copy(ctx, dst);
+   }
+
    /* Allocate the destination. */
    ra_foreach_dst (dst, instr) {
       allocate_dst(ctx, dst);
@@ -1312,6 +1669,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
    struct ir3_register *dst = instr->dsts[0];
    struct ir3_register *src = instr->srcs[0];
 
+   if (!(dst->flags & IR3_REG_SSA))
+      return;
+
    if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
       handle_normal_instr(ctx, instr);
       return;
@@ -1330,6 +1690,9 @@ handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
 static void
 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
+   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
+      return;
+
    struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
    unsigned dst_offset = instr->dsts[0]->merge_set_offset;
 
@@ -1362,8 +1725,14 @@ handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
 
       struct ra_interval *interval = &ctx->intervals[src->def->name];
 
-      if (src->def->merge_set != dst_set || interval->is_killed)
+      /* We only need special handling if the source's interval overlaps with
+       * the destination's interval.
+       */
+      if (src->def->interval_start >= instr->dsts[0]->interval_end ||
+          instr->dsts[0]->interval_start >= src->def->interval_end ||
+          interval->is_killed)
          continue;
+
       while (interval->interval.parent != NULL) {
          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
       }
@@ -1445,13 +1814,19 @@ handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
 
 static void
 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
-   if (instr->dsts[0]->num == INVALID_REG)
+   if (instr->dsts[0]->num == INVALID_REG ||
+       !(instr->dsts[0]->flags & IR3_REG_SSA))
       return;
 
+   struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
    physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
    allocate_dst_fixed(ctx, instr->dsts[0], physreg);
-   insert_dst(ctx, instr->dsts[0]);
+
+   d("insert precolored dst %u physreg %u", instr->dsts[0]->name,
+     ra_interval_get_physreg(interval));
+
+   ra_file_insert(file, interval);
    interval->frozen = true;
 }
 
@@ -1471,6 +1846,9 @@ handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 static void
 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
 {
+   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
+      return;
+
    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
 
@@ -1517,7 +1895,7 @@ handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
     * anything unless it overlaps with our precolored physreg, so we don't
     * have to worry about evicting other precolored sources.
     */
-   if (!get_reg_specified(file, src, physreg, true)) {
+   if (!get_reg_specified(ctx, file, src, physreg, true)) {
       unsigned eviction_count;
       if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
                           false)) {
@@ -1615,6 +1993,9 @@ handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
 static void
 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
 {
+   if (!(def->flags & IR3_REG_SSA))
+      return;
+
    struct ra_file *file = ra_get_file(ctx, def);
    struct ra_interval *interval = &ctx->intervals[def->name];
 
@@ -1630,7 +2011,7 @@ handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
       physreg = ra_interval_get_physreg(parent) +
         (def->interval_start - parent_ir3->reg->interval_start);
    } else {
-      physreg = get_reg(ctx, file, def, false);
+      physreg = get_reg(ctx, file, def);
    }
 
    allocate_dst_fixed(ctx, def, physreg);
@@ -1641,6 +2022,9 @@ handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
 static void
 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
 {
+   if (!(phi->dsts[0]->flags & IR3_REG_SSA))
+      return;
+
    struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
    struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
    assert(!interval->interval.parent);
@@ -1686,12 +2070,10 @@ insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
                     struct ir3_register *reg)
 {
    struct ir3_instruction *old_pcopy = NULL;
-   if (!list_is_empty(&block->instr_list)) {
-      struct ir3_instruction *last =
-         LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
-      if (last->opc == OPC_META_PARALLEL_COPY)
-         old_pcopy = last;
-   }
+   struct ir3_instruction *last = ir3_block_get_last_non_terminator(block);
+
+   if (last && last->opc == OPC_META_PARALLEL_COPY)
+      old_pcopy = last;
 
    unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
 
    struct ir3_instruction *pcopy = ir3_instr_create(
@@ -1702,8 +2084,9 @@ insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
       pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
    }
 
-   struct ir3_register *dst_reg =
-      ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
+   unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
+
+   struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags);
    dst_reg->wrmask = reg->wrmask;
    dst_reg->size = reg->size;
    assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
@@ -1712,8 +2095,7 @@ insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
       pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
    }
 
-   struct ir3_register *src_reg =
-      ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
+   struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags);
    src_reg->wrmask = reg->wrmask;
    src_reg->size = reg->size;
    assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
@@ -1727,15 +2109,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
 {
    physreg_t physreg = ra_interval_get_physreg(interval);
 
-   bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
-   struct ir3_block **predecessors =
-      shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
-   unsigned predecessors_count = shared
-      ? ctx->block->physical_predecessors_count
-      : ctx->block->predecessors_count;
-
-   for (unsigned i = 0; i < predecessors_count; i++) {
-      struct ir3_block *pred = predecessors[i];
+   for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
+      struct ir3_block *pred = ctx->block->predecessors[i];
       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
 
       if (!pred_state->visited)
@@ -1743,28 +2118,8 @@ insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
 
       physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
       if (pred_reg != physreg) {
+         assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
          insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
-
-         /* This is a bit tricky, but when visiting the destination of a
-          * physical-only edge, we have two predecessors (the if and the
-          * header block) and both have multiple successors. We pick the
-          * register for all live-ins from the normal edge, which should
-          * guarantee that there's no need for shuffling things around in
-          * the normal predecessor as long as there are no phi nodes, but
-          * we still may need to insert fixup code in the physical
-          * predecessor (i.e. the last block of the if) and that has
-          * another successor (the block after the if) so we need to update
-          * the renames state for when we process the other successor. This
-          * crucially depends on the other successor getting processed
-          * after this.
-          *
-          * For normal (non-physical) edges we disallow critical edges so
-          * that hacks like this aren't necessary.
-          */
-         if (!pred_state->renames)
-            pred_state->renames = _mesa_pointer_hash_table_create(ctx);
-         _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
-                                 (void *)(uintptr_t)physreg);
       }
    }
 }
 
@@ -1977,13 +2332,13 @@ calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
    unsigned reg_independent_max_waves =
       ir3_get_reg_independent_max_waves(v, double_threadsize);
    unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
-      v->shader->compiler, reg_count, double_threadsize);
+      v->compiler, reg_count, double_threadsize);
    unsigned target_waves =
       MIN2(reg_independent_max_waves, reg_dependent_max_waves);
 
    while (target <= RA_FULL_SIZE / (2 * 4) &&
          ir3_should_double_threadsize(v, target) == double_threadsize &&
-          ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
+          ir3_get_reg_dependent_max_waves(v->compiler, target,
                                           double_threadsize) >= target_waves)
       target++;
 
@@ -2136,17 +2491,68 @@ calc_min_limit_pressure(struct ir3_shader_variant *v,
    ralloc_free(ctx);
 }
 
+/*
+ * If barriers are used, it must be possible for all waves in the workgroup
+ * to execute concurrently. Thus we may have to reduce the registers limit.
+ */
+static void
+calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
+                                        struct ir3_pressure *limit_pressure)
+{
+   const struct ir3_compiler *compiler = v->compiler;
+
+   unsigned threads_per_wg;
+   if (v->local_size_variable) {
+      /* We have to expect the worst case. */
+      threads_per_wg = compiler->max_variable_workgroup_size;
+   } else {
+      threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+   }
+
+   /* The register file is grouped into reg_size_vec4 number of parts.
+    * Each part has enough registers to add a single vec4 register to
+    * each thread of a single-sized wave-pair. With double threadsize
+    * each wave-pair would consume two parts of the register file to get
+    * a single vec4 for a thread. The more active wave-pairs the less
+    * parts each could get.
+    */
+
+   bool double_threadsize = ir3_should_double_threadsize(v, 0);
+   unsigned waves_per_wg = DIV_ROUND_UP(
+      threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
+                         compiler->wave_granularity);
+
+   uint32_t vec4_regs_per_thread =
+      compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
+   assert(vec4_regs_per_thread > 0);
+
+   uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
+
+   if (limit_pressure->full > half_regs_per_thread) {
+      if (v->mergedregs) {
+         limit_pressure->full = half_regs_per_thread;
+      } else {
+         /* TODO: Handle !mergedregs case, probably we would have to do this
+          * after the first register pressure pass.
+          */
+      }
+   }
+}
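A worked numeric example of the arithmetic above, editorial only (the hardware parameters are hypothetical values in the spirit of an a6xx part, not taken from the commit):

#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

int main(void)
{
   unsigned threads_per_wg = 1024;
   unsigned threadsize_base = 64; /* threads per single-size wave */
   unsigned wave_granularity = 2; /* waves are allocated in pairs */
   unsigned reg_size_vec4 = 96;   /* register-file parts, in vec4 units */
   int double_threadsize = 1;

   unsigned waves_per_wg = DIV_ROUND_UP(
      threads_per_wg,
      threadsize_base * (double_threadsize ? 2 : 1) * wave_granularity);

   unsigned vec4_regs_per_thread =
      reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));

   /* one vec4 = 4 full registers = 8 half-register slots */
   unsigned half_regs_per_thread = vec4_regs_per_thread * 4 * 2;

   printf("waves/wg=%u -> vec4/thread=%u -> half-reg limit=%u\n",
          waves_per_wg, vec4_regs_per_thread, half_regs_per_thread);
   /* prints: waves/wg=4 -> vec4/thread=12 -> half-reg limit=96 */
   return 0;
}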
 
 int
 ir3_ra(struct ir3_shader_variant *v)
 {
    ir3_calc_dominance(v->ir);
 
+   /* Predicate RA needs dominance. */
+   ir3_ra_predicates(v);
+
    ir3_create_parallel_copies(v->ir);
 
    struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
 
    ctx->merged_regs = v->mergedregs;
-   ctx->compiler = v->shader->compiler;
+   ctx->compiler = v->compiler;
    ctx->stage = v->type;
 
    struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
@@ -2155,6 +2561,18 @@ ir3_ra(struct ir3_shader_variant *v)
 
    ir3_merge_regs(live, v->ir);
 
+   bool has_shared_vectors = false;
+   foreach_block (block, &v->ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         ra_foreach_dst (dst, instr) {
+            if ((dst->flags & IR3_REG_SHARED) && reg_elems(dst) > 1) {
+               has_shared_vectors = true;
+               break;
+            }
+         }
+      }
+   }
+
    struct ir3_pressure max_pressure;
    ir3_calc_pressure(v, live, &max_pressure);
    d("max pressure:");
@@ -2162,26 +2580,45 @@ ir3_ra(struct ir3_shader_variant *v)
    d("\thalf: %u", max_pressure.half);
    d("\tshared: %u", max_pressure.shared);
 
-   /* TODO: calculate half/full limit correctly for CS with barrier */
    struct ir3_pressure limit_pressure;
    limit_pressure.full = RA_FULL_SIZE;
    limit_pressure.half = RA_HALF_SIZE;
    limit_pressure.shared = RA_SHARED_SIZE;
 
+   if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
+      calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
+   }
+
+   /* If the user forces a doubled threadsize, we may have to lower the limit
+    * because on some gens the register file is not big enough to hold a
+    * double-size wave with all 48 registers in use.
+    */
+   if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY) {
+      limit_pressure.full =
+         MAX2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
+   }
+
    /* If requested, lower the limit so that spilling happens more often. */
    if (ir3_shader_debug & IR3_DBG_SPILLALL)
       calc_min_limit_pressure(v, live, &limit_pressure);
 
-   if (max_pressure.shared > limit_pressure.shared) {
-      /* TODO shared reg -> normal reg spilling */
-      d("shared max pressure exceeded!");
-      goto fail;
+   if (max_pressure.shared > limit_pressure.shared || has_shared_vectors) {
+      ir3_ra_shared(v, live);
+
+      /* Recalculate liveness and register pressure now that additional values
+       * have been added.
+       */
+      ralloc_free(live);
+      live = ir3_calc_liveness(ctx, v->ir);
+      ir3_calc_pressure(v, live, &max_pressure);
+
+      ir3_debug_print(v->ir, "AFTER: shared register allocation");
    }
 
    bool spilled = false;
    if (max_pressure.full > limit_pressure.full ||
        max_pressure.half > limit_pressure.half) {
-      if (!v->shader->compiler->has_pvtmem) {
+      if (!v->compiler->has_pvtmem) {
         d("max pressure exceeded!");
         goto fail;
      }
@@ -2206,10 +2643,12 @@ ir3_ra(struct ir3_shader_variant *v)
 
    ctx->shared.size = RA_SHARED_SIZE;
 
+   ctx->full.start = ctx->half.start = ctx->shared.start = 0;
+
    foreach_block (block, &v->ir->block_list)
       handle_block(ctx, block);
 
-   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
+   ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count, false);
 
    /* Strip array-ness and SSA-ness at the end, because various helpers still
    * need to work even on definitions that have already been assigned. For