summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason Ekstrand <jason@jlekstrand.net>2019-10-31 15:57:52 -0500
committerJason Ekstrand <jason@jlekstrand.net>2019-11-18 18:35:14 +0000
commitd1c4e64a69e49c64148529024ecb700d18d3c1c8 (patch)
tree728c7cfc80cfd4ab9c47f8419f7ff09014a1bec5
parentaecde235198f6c8dccb0d26b6397f1efb5e22bfe (diff)
intel/compiler: Add a flag to avoid compacting push constants
In vec4, we can just not run the pass. In fs, things are a bit more deeply intertwined. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
-rw-r--r--src/gallium/drivers/iris/iris_screen.c1
-rw-r--r--src/intel/compiler/brw_compiler.h6
-rw-r--r--src/intel/compiler/brw_fs.cpp303
-rw-r--r--src/intel/compiler/brw_vec4.cpp3
-rw-r--r--src/intel/vulkan/anv_device.c1
-rw-r--r--src/mesa/drivers/dri/i965/intel_screen.c1
6 files changed, 170 insertions, 145 deletions
diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c
index f1bb03de8f9..cbefa9dfb13 100644
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -673,6 +673,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
screen->compiler->shader_perf_log = iris_shader_perf_log;
screen->compiler->supports_pull_constants = false;
screen->compiler->supports_shader_constants = true;
+ screen->compiler->compact_params = false;
iris_disk_cache_init(screen);
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index dd2de2094e7..6e6610a26af 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -119,6 +119,12 @@ struct brw_compiler {
* whether nir_opt_large_constants will be run.
*/
bool supports_shader_constants;
+
+ /**
+ * Whether or not the driver wants uniform params to be compacted by the
+ * back-end compiler.
+ */
+ bool compact_params;
};
/**
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 37a90d0d489..0d3d6137058 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2307,159 +2307,190 @@ fs_visitor::assign_constant_locations()
return;
}
- struct uniform_slot_info slots[uniforms];
- memset(slots, 0, sizeof(slots));
+ if (compiler->compact_params) {
+ struct uniform_slot_info slots[uniforms];
+ memset(slots, 0, sizeof(slots));
- foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
- for (int i = 0 ; i < inst->sources; i++) {
- if (inst->src[i].file != UNIFORM)
- continue;
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ for (int i = 0 ; i < inst->sources; i++) {
+ if (inst->src[i].file != UNIFORM)
+ continue;
- /* NIR tightly packs things so the uniform number might not be
- * aligned (if we have a double right after a float, for instance).
- * This is fine because the process of re-arranging them will ensure
- * that things are properly aligned. The offset into that uniform,
- * however, must be aligned.
- *
- * In Vulkan, we have explicit offsets but everything is crammed
- * into a single "variable" so inst->src[i].nr will always be 0.
- * Everything will be properly aligned relative to that one base.
- */
- assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
+ /* NIR tightly packs things so the uniform number might not be
+ * aligned (if we have a double right after a float, for
+ * instance). This is fine because the process of re-arranging
+ * them will ensure that things are properly aligned. The offset
+ * into that uniform, however, must be aligned.
+ *
+ * In Vulkan, we have explicit offsets but everything is crammed
+ * into a single "variable" so inst->src[i].nr will always be 0.
+ * Everything will be properly aligned relative to that one base.
+ */
+ assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
- unsigned u = inst->src[i].nr +
- inst->src[i].offset / UNIFORM_SLOT_SIZE;
+ unsigned u = inst->src[i].nr +
+ inst->src[i].offset / UNIFORM_SLOT_SIZE;
- if (u >= uniforms)
- continue;
+ if (u >= uniforms)
+ continue;
- unsigned slots_read;
- if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
- slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
- } else {
- unsigned bytes_read = inst->components_read(i) *
- type_sz(inst->src[i].type);
- slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
- }
+ unsigned slots_read;
+ if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+ slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
+ } else {
+ unsigned bytes_read = inst->components_read(i) *
+ type_sz(inst->src[i].type);
+ slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
+ }
- assert(u + slots_read <= uniforms);
- mark_uniform_slots_read(&slots[u], slots_read,
- type_sz(inst->src[i].type));
+ assert(u + slots_read <= uniforms);
+ mark_uniform_slots_read(&slots[u], slots_read,
+ type_sz(inst->src[i].type));
+ }
}
- }
- int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
+ int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
- /* Only allow 16 registers (128 uniform components) as push constants.
- *
- * Just demote the end of the list. We could probably do better
- * here, demoting things that are rarely used in the program first.
- *
- * If changing this value, note the limitation about total_regs in
- * brw_curbe.c.
- */
- unsigned int max_push_components = 16 * 8;
- if (subgroup_id_index >= 0)
- max_push_components--; /* Save a slot for the thread ID */
+ /* Only allow 16 registers (128 uniform components) as push constants.
+ *
+ * Just demote the end of the list. We could probably do better
+ * here, demoting things that are rarely used in the program first.
+ *
+ * If changing this value, note the limitation about total_regs in
+ * brw_curbe.c.
+ */
+ unsigned int max_push_components = 16 * 8;
+ if (subgroup_id_index >= 0)
+ max_push_components--; /* Save a slot for the thread ID */
- /* We push small arrays, but no bigger than 16 floats. This is big enough
- * for a vec4 but hopefully not large enough to push out other stuff. We
- * should probably use a better heuristic at some point.
- */
- const unsigned int max_chunk_size = 16;
+ /* We push small arrays, but no bigger than 16 floats. This is big
+ * enough for a vec4 but hopefully not large enough to push out other
+ * stuff. We should probably use a better heuristic at some point.
+ */
+ const unsigned int max_chunk_size = 16;
- unsigned int num_push_constants = 0;
- unsigned int num_pull_constants = 0;
+ unsigned int num_push_constants = 0;
+ unsigned int num_pull_constants = 0;
- push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
- /* Default to -1 meaning no location */
- memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
- memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+ /* Default to -1 meaning no location */
+ memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+ memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
- int chunk_start = -1;
- struct cplx_align align;
- for (unsigned u = 0; u < uniforms; u++) {
- if (!slots[u].is_live) {
- assert(chunk_start == -1);
- continue;
- }
+ int chunk_start = -1;
+ struct cplx_align align;
+ for (unsigned u = 0; u < uniforms; u++) {
+ if (!slots[u].is_live) {
+ assert(chunk_start == -1);
+ continue;
+ }
- /* Skip subgroup_id_index to put it in the last push register. */
- if (subgroup_id_index == (int)u)
- continue;
+ /* Skip subgroup_id_index to put it in the last push register. */
+ if (subgroup_id_index == (int)u)
+ continue;
- if (chunk_start == -1) {
- chunk_start = u;
- align = slots[u].align;
- } else {
- /* Offset into the chunk */
- unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
+ if (chunk_start == -1) {
+ chunk_start = u;
+ align = slots[u].align;
+ } else {
+ /* Offset into the chunk */
+ unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
- /* Shift the slot alignment down by the chunk offset so it is
- * comparable with the base chunk alignment.
- */
- struct cplx_align slot_align = slots[u].align;
- slot_align.offset =
- (slot_align.offset - chunk_offset) & (align.mul - 1);
+ /* Shift the slot alignment down by the chunk offset so it is
+ * comparable with the base chunk alignment.
+ */
+ struct cplx_align slot_align = slots[u].align;
+ slot_align.offset =
+ (slot_align.offset - chunk_offset) & (align.mul - 1);
- align = cplx_align_combine(align, slot_align);
- }
+ align = cplx_align_combine(align, slot_align);
+ }
- /* Sanity check the alignment */
- cplx_align_assert_sane(align);
+ /* Sanity check the alignment */
+ cplx_align_assert_sane(align);
- if (slots[u].contiguous)
- continue;
+ if (slots[u].contiguous)
+ continue;
- /* Adjust the alignment to be in terms of slots, not bytes */
- assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
- assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
- align.mul /= UNIFORM_SLOT_SIZE;
- align.offset /= UNIFORM_SLOT_SIZE;
-
- unsigned push_start_align = cplx_align_apply(align, num_push_constants);
- unsigned chunk_size = u - chunk_start + 1;
- if ((!compiler->supports_pull_constants && u < UBO_START) ||
- (chunk_size < max_chunk_size &&
- push_start_align + chunk_size <= max_push_components)) {
- /* Align up the number of push constants */
- num_push_constants = push_start_align;
- for (unsigned i = 0; i < chunk_size; i++)
- push_constant_loc[chunk_start + i] = num_push_constants++;
- } else {
- /* We need to pull this one */
- num_pull_constants = cplx_align_apply(align, num_pull_constants);
- for (unsigned i = 0; i < chunk_size; i++)
- pull_constant_loc[chunk_start + i] = num_pull_constants++;
+ /* Adjust the alignment to be in terms of slots, not bytes */
+ assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
+ assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
+ align.mul /= UNIFORM_SLOT_SIZE;
+ align.offset /= UNIFORM_SLOT_SIZE;
+
+ unsigned push_start_align = cplx_align_apply(align, num_push_constants);
+ unsigned chunk_size = u - chunk_start + 1;
+ if ((!compiler->supports_pull_constants && u < UBO_START) ||
+ (chunk_size < max_chunk_size &&
+ push_start_align + chunk_size <= max_push_components)) {
+ /* Align up the number of push constants */
+ num_push_constants = push_start_align;
+ for (unsigned i = 0; i < chunk_size; i++)
+ push_constant_loc[chunk_start + i] = num_push_constants++;
+ } else {
+ /* We need to pull this one */
+ num_pull_constants = cplx_align_apply(align, num_pull_constants);
+ for (unsigned i = 0; i < chunk_size; i++)
+ pull_constant_loc[chunk_start + i] = num_pull_constants++;
+ }
+
+ /* Reset the chunk and start again */
+ chunk_start = -1;
}
- /* Reset the chunk and start again */
- chunk_start = -1;
- }
+ /* Add the CS local thread ID uniform at the end of the push constants */
+ if (subgroup_id_index >= 0)
+ push_constant_loc[subgroup_id_index] = num_push_constants++;
- /* Add the CS local thread ID uniform at the end of the push constants */
- if (subgroup_id_index >= 0)
- push_constant_loc[subgroup_id_index] = num_push_constants++;
+ /* As the uniforms are going to be reordered, stash the old array and
+ * create two new arrays for push/pull params.
+ */
+ uint32_t *param = stage_prog_data->param;
+ stage_prog_data->nr_params = num_push_constants;
+ if (num_push_constants) {
+ stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
+ num_push_constants);
+ } else {
+ stage_prog_data->param = NULL;
+ }
+ assert(stage_prog_data->nr_pull_params == 0);
+ assert(stage_prog_data->pull_param == NULL);
+ if (num_pull_constants > 0) {
+ stage_prog_data->nr_pull_params = num_pull_constants;
+ stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
+ num_pull_constants);
+ }
- /* As the uniforms are going to be reordered, stash the old array and
- * create two new arrays for push/pull params.
- */
- uint32_t *param = stage_prog_data->param;
- stage_prog_data->nr_params = num_push_constants;
- if (num_push_constants) {
- stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
- num_push_constants);
+ /* Up until now, the param[] array has been indexed by reg + offset
+ * of UNIFORM registers. Move pull constants into pull_param[] and
+ * condense param[] to only contain the uniforms we chose to push.
+ *
+ * NOTE: Because we are condensing the params[] array, we know that
+ * push_constant_loc[i] <= i and we can do it in one smooth loop without
+ * having to make a copy.
+ */
+ for (unsigned int i = 0; i < uniforms; i++) {
+ uint32_t value = param[i];
+ if (pull_constant_loc[i] != -1) {
+ stage_prog_data->pull_param[pull_constant_loc[i]] = value;
+ } else if (push_constant_loc[i] != -1) {
+ stage_prog_data->param[push_constant_loc[i]] = value;
+ }
+ }
+ ralloc_free(param);
} else {
- stage_prog_data->param = NULL;
- }
- assert(stage_prog_data->nr_pull_params == 0);
- assert(stage_prog_data->pull_param == NULL);
- if (num_pull_constants > 0) {
- stage_prog_data->nr_pull_params = num_pull_constants;
- stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
- num_pull_constants);
+ /* If we don't want to compact anything, just set up dummy push/pull
+ * arrays. All the rest of the compiler cares about are these arrays.
+ */
+ push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+ pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+
+ for (unsigned u = 0; u < uniforms; u++)
+ push_constant_loc[u] = u;
+
+ memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
}
/* Now that we know how many regular uniforms we'll push, reduce the
@@ -2475,24 +2506,6 @@ fs_visitor::assign_constant_locations()
push_length += range->length;
}
assert(push_length <= 64);
-
- /* Up until now, the param[] array has been indexed by reg + offset
- * of UNIFORM registers. Move pull constants into pull_param[] and
- * condense param[] to only contain the uniforms we chose to push.
- *
- * NOTE: Because we are condensing the params[] array, we know that
- * push_constant_loc[i] <= i and we can do it in one smooth loop without
- * having to make a copy.
- */
- for (unsigned int i = 0; i < uniforms; i++) {
- uint32_t value = param[i];
- if (pull_constant_loc[i] != -1) {
- stage_prog_data->pull_param[pull_constant_loc[i]] = value;
- } else if (push_constant_loc[i] != -1) {
- stage_prog_data->param[push_constant_loc[i]] = value;
- }
- }
- ralloc_free(param);
}
bool
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index 9fa946ccdb4..fb4cf365cb4 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -633,6 +633,9 @@ set_push_constant_loc(const int nr_uniforms, int *new_uniform_count,
void
vec4_visitor::pack_uniform_registers()
{
+ if (!compiler->compact_params)
+ return;
+
uint8_t chans_used[this->uniforms];
int new_loc[this->uniforms];
int new_chan[this->uniforms];
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 28a652867d9..0fc507c12a8 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -560,6 +560,7 @@ anv_physical_device_init(struct anv_physical_device *device,
device->compiler->constant_buffer_0_is_relative =
device->info.gen < 8 || !device->has_context_isolation;
device->compiler->supports_shader_constants = true;
+ device->compiler->compact_params = true;
/* Broadwell PRM says:
*
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 6040161d8f9..c690a8ede7c 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -2799,6 +2799,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen)
!(screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION);
screen->compiler->supports_pull_constants = true;
+ screen->compiler->compact_params = true;
screen->has_exec_fence =
intel_get_boolean(screen, I915_PARAM_HAS_EXEC_FENCE);