author     Rhys Perry <pendingchaos02@gmail.com>   2020-11-06 19:27:09 +0000
committer  Marge Bot <eric+marge@anholt.net>       2021-06-10 13:17:22 +0000
commit     2e7bceb2204fe66a3232f78ed4a43965eb50529a (patch)
tree       66dc1af96e0adc86cbb8c16626fbea5e11c31c92 /src/compiler
parent     674b0af3b343acd546d64d118af56de2b9f5b77f (diff)
nir/load_store_vectorizer: fix check_for_robustness() with indirect loads
fossil-db (GFX10.3, robustness2 enabled):
Totals from 13958 (9.54% of 146267) affected shaders:
VGPRs: 609168 -> 624304 (+2.48%); split: -0.05%, +2.53%
CodeSize: 48229504 -> 48488392 (+0.54%); split: -0.02%, +0.56%
MaxWaves: 354426 -> 349448 (-1.40%); split: +0.00%, -1.41%
Instrs: 9332093 -> 9375053 (+0.46%); split: -0.03%, +0.49%
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7295>
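Why indirect loads need this check: with robustness2, each load is bounds-checked against the buffer on its own offset. If an indirect offset puts the low load out of bounds, the high load at low+4 can still be in bounds when the 32-bit offset addition wraps around, so fusing the pair into one wider load based at the low offset would wrongly discard the high load's data. A minimal standalone C sketch of the hazard (illustrative only, not Mesa code; BUFFER_SIZE and the in_bounds() helper are assumptions):

#include <stdint.h>
#include <stdio.h>

#define BUFFER_SIZE 256u /* assumed bound buffer range, for illustration */

/* Per-access bounds check on a 32-bit offset, written to be wrap-safe. */
static int in_bounds(uint32_t offset, uint32_t size)
{
   return offset + size >= offset && offset + size <= BUFFER_SIZE;
}

int main(void)
{
   uint32_t low = UINT32_MAX - 3; /* indirect offset: out of bounds */
   uint32_t high = low + 4;       /* wraps around to 0: in bounds */

   printf("low  in bounds: %d\n", in_bounds(low, 4));  /* prints 0 */
   printf("high in bounds: %d\n", in_bounds(high, 4)); /* prints 1 */
   /* A fused 8-byte load based at "low" would be treated as entirely
    * out of bounds, changing what the high load returns. */
   return 0;
}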
Diffstat (limited to 'src/compiler')
-rw-r--r--   src/compiler/nir/nir_opt_load_store_vectorize.c         67
-rw-r--r--   src/compiler/nir/tests/load_store_vectorizer_tests.cpp  76
2 files changed, 132 insertions(+), 11 deletions(-)
diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index 10c260e46d3..070da47e670 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -997,20 +997,65 @@ check_for_aliasing(struct vectorize_ctx *ctx, struct entry *first, struct entry
    return false;
 }
 
+static uint64_t
+calc_gcd(uint64_t a, uint64_t b)
+{
+   while (b != 0) {
+      uint64_t tmp_a = a;
+      a = b;
+      b = tmp_a % b;
+   }
+   return a;
+}
+
+static uint64_t
+round_down(uint64_t a, uint64_t b)
+{
+   return a / b * b;
+}
+
 static bool
-check_for_robustness(struct vectorize_ctx *ctx, struct entry *low)
+addition_wraps(uint64_t a, uint64_t b, unsigned bits)
+{
+   uint64_t mask = BITFIELD64_MASK(bits);
+   return ((a + b) & mask) < (a & mask);
+}
+
+/* Return true if the addition of "low"'s offset and "high_offset" could wrap
+ * around.
+ *
+ * This is to prevent a situation where the hardware considers the high load
+ * out-of-bounds after vectorization if the low load is out-of-bounds, even if
+ * the wrap-around from the addition could make the high load in-bounds.
+ */
+static bool
+check_for_robustness(struct vectorize_ctx *ctx, struct entry *low, uint64_t high_offset)
 {
    nir_variable_mode mode = get_variable_mode(low);
-   if (mode & ctx->options->robust_modes) {
-      unsigned low_bit_size = get_bit_size(low);
-      unsigned low_size = low->intrin->num_components * low_bit_size;
+   if (!(mode & ctx->options->robust_modes))
+      return false;
 
-      /* don't attempt to vectorize accesses if the offset can overflow. */
-      /* TODO: handle indirect accesses. */
-      return low->offset_signed < 0 && low->offset_signed + low_size >= 0;
-   }
+   /* First, try to use alignment information in case the application provided some. If the addition
+    * of the maximum offset of the low load and "high_offset" wraps around, we can't combine the low
+    * and high loads.
+    */
+   uint64_t max_low = round_down(UINT64_MAX, low->align_mul) + low->align_offset;
+   if (!addition_wraps(max_low, high_offset, 64))
+      return false;
 
-   return false;
+   /* Second, use information about the factors from address calculation (offset_defs_mul). These
+    * are not guaranteed to be power-of-2.
+    */
+   uint64_t stride = 0;
+   for (unsigned i = 0; i < low->key->offset_def_count; i++)
+      stride = calc_gcd(low->key->offset_defs_mul[i], stride);
+
+   unsigned addition_bits = low->intrin->src[low->info->base_src].ssa->bit_size;
+   /* low's offset must be a multiple of "stride" plus "low->offset". */
+   max_low = low->offset;
+   if (stride)
+      max_low = round_down(BITFIELD64_MASK(addition_bits), stride) + (low->offset % stride);
+   return addition_wraps(max_low, high_offset, addition_bits);
 }
 
 static bool
@@ -1037,7 +1082,8 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    if (check_for_aliasing(ctx, first, second))
       return false;
 
-   if (check_for_robustness(ctx, low))
+   uint64_t diff = high->offset_signed - low->offset_signed;
+   if (check_for_robustness(ctx, low, diff))
       return false;
 
    /* we can only vectorize non-volatile loads/stores of the same type and with
@@ -1055,7 +1101,6 @@ try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
    }
 
    /* gather information */
-   uint64_t diff = high->offset_signed - low->offset_signed;
    unsigned low_bit_size = get_bit_size(low);
    unsigned high_bit_size = get_bit_size(high);
    unsigned low_size = low->intrin->num_components * low_bit_size;
diff --git a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
index 5a841a2ceb7..575290e595a 100644
--- a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
+++ b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
@@ -1859,6 +1859,82 @@ TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust)
    ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
 }
 
+TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust_indirect_stride1)
+{
+   nir_ssa_def *offset = nir_load_local_invocation_index(b);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 4), 0x2);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
+
+   EXPECT_FALSE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
+}
+
+TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust_indirect_stride8)
+{
+   nir_ssa_def *offset = nir_load_local_invocation_index(b);
+   offset = nir_imul_imm(b, offset, 8);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 4), 0x2);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
+
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1);
+}
+
+TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust_indirect_stride12)
+{
+   nir_ssa_def *offset = nir_load_local_invocation_index(b);
+   offset = nir_imul_imm(b, offset, 12);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1);
+   nir_ssa_def *offset_4 = nir_iadd_imm(b, offset, 4);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset_4, 0x2);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 8), 0x3);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 3);
+
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 2);
+
+   nir_intrinsic_instr *load = get_intrinsic(nir_intrinsic_load_ssbo, 0);
+   ASSERT_EQ(load->dest.ssa.bit_size, 32);
+   ASSERT_EQ(load->dest.ssa.num_components, 1);
+   ASSERT_EQ(load->src[1].ssa, offset);
+   EXPECT_INSTR_SWIZZLES(movs[0x1], load, "x");
+
+   load = get_intrinsic(nir_intrinsic_load_ssbo, 1);
+   ASSERT_EQ(load->dest.ssa.bit_size, 32);
+   ASSERT_EQ(load->dest.ssa.num_components, 2);
+   ASSERT_EQ(load->src[1].ssa, offset_4);
+   EXPECT_INSTR_SWIZZLES(movs[0x2], load, "x");
+   EXPECT_INSTR_SWIZZLES(movs[0x3], load, "y");
+}
+
+TEST_F(nir_load_store_vectorize_test, ssbo_offset_overflow_robust_indirect_stride16)
+{
+   nir_ssa_def *offset = nir_load_local_invocation_index(b);
+   offset = nir_imul_imm(b, offset, 16);
+   create_indirect_load(nir_var_mem_ssbo, 0, offset, 0x1);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 4), 0x2);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 8), 0x3);
+   create_indirect_load(nir_var_mem_ssbo, 0, nir_iadd_imm(b, offset, 12), 0x4);
+
+   nir_validate_shader(b->shader, NULL);
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 4);
+
+   EXPECT_TRUE(run_vectorizer(nir_var_mem_ssbo, false, nir_var_mem_ssbo));
+
+   ASSERT_EQ(count_intrinsics(nir_intrinsic_load_ssbo), 1);
+}
+
 TEST_F(nir_load_store_vectorize_test, ubo_alignment_16_4)
 {
    nir_ssa_def *offset = nir_load_local_invocation_index(b);
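The stride tests above follow directly from the GCD bound in check_for_robustness(). The sketch below reproduces that bound standalone: calc_gcd(), round_down() and addition_wraps() are copied from the diff, while BITFIELD64_MASK is a local stand-in for Mesa's macro and may_wrap() is a hypothetical wrapper (not part of the pass) that evaluates the largest offset of the form k*stride + base on a 32-bit offset source, then checks whether adding the distance to the high load can wrap:

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Local stand-in for Mesa's BITFIELD64_MASK macro. */
#define BITFIELD64_MASK(bits) ((bits) == 64 ? UINT64_MAX : (1ull << (bits)) - 1)

static uint64_t
calc_gcd(uint64_t a, uint64_t b)
{
   while (b != 0) {
      uint64_t tmp_a = a;
      a = b;
      b = tmp_a % b;
   }
   return a;
}

static uint64_t
round_down(uint64_t a, uint64_t b)
{
   return a / b * b;
}

static bool
addition_wraps(uint64_t a, uint64_t b, unsigned bits)
{
   uint64_t mask = BITFIELD64_MASK(bits);
   return ((a + b) & mask) < (a & mask);
}

/* Hypothetical wrapper: can "k * stride + base" plus "high_offset" wrap? */
static bool
may_wrap(uint64_t stride, uint64_t base, uint64_t high_offset, unsigned bits)
{
   uint64_t max_low = base;
   if (stride)
      max_low = round_down(BITFIELD64_MASK(bits), stride) + (base % stride);
   return addition_wraps(max_low, high_offset, bits);
}

int main(void)
{
   printf("stride 1,  +4: %d\n", may_wrap(1, 0, 4, 32));  /* 1: not vectorized */
   printf("stride 8,  +4: %d\n", may_wrap(8, 0, 4, 32));  /* 0: vectorized */
   printf("stride 12, +4: %d\n", may_wrap(12, 0, 4, 32)); /* 1: first pair rejected */
   printf("stride 12, base 4, +4: %d\n", may_wrap(12, 4, 4, 32)); /* 0: second pair fused */
   /* With several address terms, the pass takes their GCD as the stride: */
   printf("gcd(12, 8) = %" PRIu64 "\n", calc_gcd(12, 8)); /* 4 */
   return 0;
}

This matches the stride12 test: round_down(0xffffffff, 12) is 0xfffffffc, so adding 4 can wrap and the load at the base offset stays scalar, while for the load at base+4 the maximum offset is 0x100000000, whose low 32 bits are 0, so adding 4 cannot wrap and the +4/+8 pair fuses into the expected 2-component load.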