summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Eric Anholt <eric@anholt.net>, 2011-03-11 19:19:01 -0800
committer: Eric Anholt <eric@anholt.net>, 2011-04-26 12:19:46 -0700
commit: 662f1b48bd1a02907bb42ecda889a3aa52a5755d (patch)
tree: 3291e2e1fde69576de2dc7c84c80e5f77656ea5c
parent: 76b7a0c1af23838cb5100424a2a88d621b881d05 (diff)
i965/fs: Add initial support for 16-wide dispatch on gen6.
At this point it doesn't do uniforms, which have to be laid out the same between 8 and 16. Other than that, it supports everything but flow control, which was the thing that forced us to choose 8-wide for general GLSL support.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
-rw-r--r-- src/mesa/drivers/dri/i965/brw_context.h 3
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.cpp 242
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.h 13
-rw-r--r-- src/mesa/drivers/dri/i965/brw_wm.c 13
-rw-r--r-- src/mesa/drivers/dri/i965/brw_wm.h 4
-rw-r--r-- src/mesa/drivers/dri/i965/gen6_wm_state.c 16
6 files changed, 210 insertions, 81 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 1daa49abfb3..6bf8a1c83c7 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -204,13 +204,16 @@ struct brw_wm_prog_data {
GLuint urb_read_length;
GLuint first_curbe_grf;
+ GLuint first_curbe_grf_16;
GLuint total_grf;
+ GLuint total_grf_16;
GLuint total_scratch;
GLuint nr_params; /**< number of float params/constants */
GLuint nr_pull_params;
GLboolean error;
int dispatch_width;
+ uint32_t prog_offset_16;
/* Pointer to tracked values (only valid once
* _mesa_load_state_parameters has been called at runtime).
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index bb71463bebc..8785957b6e6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -194,6 +194,32 @@ fs_visitor::fail(const char *format, ...)
}
}
+void
+fs_visitor::push_force_uncompressed()
+{
+ force_uncompressed_stack++;
+}
+
+void
+fs_visitor::pop_force_uncompressed()
+{
+ force_uncompressed_stack--;
+ assert(force_uncompressed_stack >= 0);
+}
+
+void
+fs_visitor::push_force_sechalf()
+{
+ force_sechalf_stack++;
+}
+
+void
+fs_visitor::pop_force_sechalf()
+{
+ force_sechalf_stack--;
+ assert(force_sechalf_stack >= 0);
+}
+
/**
* Returns how many MRFs an FS opcode will write over.
*
@@ -1738,6 +1764,10 @@ fs_visitor::visit(ir_if *ir)
{
fs_inst *inst;
+ if (c->dispatch_width == 16) {
+ fail("Can't support (non-uniform) control flow on 16-wide\n");
+ }
+
/* Don't point the annotation at the if statement, because then it plus
* the then and else blocks get printed.
*/
@@ -1778,6 +1808,10 @@ fs_visitor::visit(ir_loop *ir)
{
fs_reg counter = reg_undef;
+ if (c->dispatch_width == 16) {
+ fail("Can't support (non-uniform) control flow on 16-wide\n");
+ }
+
if (ir->counter) {
this->base_ir = ir->counter;
ir->counter->accept(this);
@@ -1881,6 +1915,11 @@ fs_visitor::emit(fs_inst inst)
fs_inst *list_inst = new(mem_ctx) fs_inst;
*list_inst = inst;
+ if (force_uncompressed_stack > 0)
+ list_inst->force_uncompressed = true;
+ else if (force_sechalf_stack > 0)
+ list_inst->force_sechalf = true;
+
list_inst->annotation = this->current_annotation;
list_inst->ir = this->base_ir;
@@ -2006,6 +2045,7 @@ fs_visitor::emit_fb_writes()
this->current_annotation = "FB write header";
GLboolean header_present = GL_TRUE;
int nr = 0;
+ int reg_width = c->dispatch_width / 8;
if (intel->gen >= 6 &&
!this->kill_emitted &&
@@ -2019,31 +2059,44 @@ fs_visitor::emit_fb_writes()
}
if (c->aa_dest_stencil_reg) {
+ push_force_uncompressed();
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
+ pop_force_uncompressed();
}
/* Reserve space for color. It'll be filled in per MRT below. */
int color_mrf = nr;
- nr += 4;
+ nr += 4 * reg_width;
if (c->source_depth_to_render_target) {
+ if (intel->gen == 6 && c->dispatch_width == 16) {
+ /* For outputting oDepth on gen6, SIMD8 writes have to be
+ * used. This would require 8-wide moves of each half to
+ * message regs, kind of like pre-gen5 SIMD16 FB writes.
+ * Just bail on doing so for now.
+ */
+ fail("Missing support for simd16 depth writes on gen6\n");
+ }
+
if (c->computes_depth) {
/* Hand over gl_FragDepth. */
assert(this->frag_depth);
fs_reg depth = *(variable_storage(this->frag_depth));
- emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
+ emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
} else {
/* Pass through the payload depth. */
- emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
+ emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
}
+ nr += reg_width;
}
if (c->dest_depth_reg) {
- emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
+ emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
+ nr += reg_width;
}
fs_reg color = reg_undef;
@@ -2060,7 +2113,7 @@ fs_visitor::emit_fb_writes()
target);
if (this->frag_color || this->frag_data) {
for (int i = 0; i < 4; i++) {
- emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
+ emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i * reg_width), color);
color.reg_offset++;
}
}
@@ -2144,7 +2197,7 @@ fs_visitor::generate_fb_write(fs_inst *inst)
brw_pop_insn_state(p);
brw_fb_WRITE(p,
- 8, /* dispatch_width */
+ c->dispatch_width,
inst->base_mrf,
implied_header,
inst->target,
@@ -2608,8 +2661,12 @@ fs_visitor::setup_paramvalues_refs()
void
fs_visitor::assign_curb_setup()
{
- c->prog_data.first_curbe_grf = c->nr_payload_regs;
c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
+ if (c->dispatch_width == 8) {
+ c->prog_data.first_curbe_grf = c->nr_payload_regs;
+ } else {
+ c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
+ }
/* Map the offsets in the UNIFORM file to fixed HW regs. */
foreach_iter(exec_list_iterator, iter, this->instructions) {
@@ -2618,7 +2675,7 @@ fs_visitor::assign_curb_setup()
for (unsigned int i = 0; i < 3; i++) {
if (inst->src[i].file == UNIFORM) {
int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
- struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
+ struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
constant_nr / 8,
constant_nr % 8);
@@ -2670,7 +2727,7 @@ fs_visitor::calculate_urb_setup()
void
fs_visitor::assign_urb_setup()
{
- int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
+ int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
/* Offset all the urb_setup[] index by the actual position of the
* setup regs, now that the location of the constants has been chosen.
@@ -3516,7 +3573,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
void
fs_visitor::generate_code()
{
- int last_native_inst = 0;
+ int last_native_inst = p->nr_insn;
const char *last_annotation_string = NULL;
ir_instruction *last_annotation_ir = NULL;
@@ -3532,8 +3589,8 @@ fs_visitor::generate_code()
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
- printf("Native code for fragment shader %d:\n",
- ctx->Shader.CurrentFragmentProgram->Name);
+ printf("Native code for fragment shader %d (%d-wide dispatch):\n",
+ ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
}
foreach_iter(exec_list_iterator, iter, this->instructions) {
@@ -3566,6 +3623,14 @@ fs_visitor::generate_code()
brw_set_predicate_inverse(p, inst->predicate_inverse);
brw_set_saturate(p, inst->saturate);
+ if (inst->force_uncompressed || c->dispatch_width == 8) {
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ } else if (inst->force_sechalf) {
+ brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ } else {
+ brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ }
+
switch (inst->opcode) {
case BRW_OPCODE_MOV:
brw_MOV(p, dst, src[0]);
@@ -3804,108 +3869,149 @@ fs_visitor::generate_code()
}
}
-GLboolean
-brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
+bool
+fs_visitor::run()
{
- struct intel_context *intel = &brw->intel;
- struct gl_context *ctx = &intel->ctx;
- struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
+ uint32_t prog_offset_16 = 0;
- if (!prog)
- return GL_FALSE;
+ brw_wm_payload_setup(brw, c);
- struct brw_shader *shader =
- (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
- if (!shader)
- return GL_FALSE;
+ if (c->dispatch_width == 16) {
+ if (c->prog_data.curb_read_length) {
+ /* Haven't hooked in support for uniforms through the 16-wide
+ * version yet.
+ */
+ return GL_FALSE;
+ }
- /* We always use 8-wide mode, at least for now. For one, flow
- * control only works in 8-wide. Also, when we're fragment shader
- * bound, we're almost always under register pressure as well, so
- * 8-wide would save us from the performance cliff of spilling
- * regs.
- */
- c->dispatch_width = 8;
+ /* align to 64 byte boundary. */
+ while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
+ brw_NOP(p);
+ }
- if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
- printf("GLSL IR for native fragment shader %d:\n", prog->Name);
- _mesa_print_ir(shader->ir, NULL);
- printf("\n");
- }
+ /* Save off the start of this 16-wide program in case we succeed. */
+ prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
- /* Now the main event: Visit the shader IR and generate our FS IR for it.
- */
- fs_visitor v(c, shader);
+ brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ }
if (0) {
- v.emit_dummy_fs();
+ emit_dummy_fs();
} else {
- v.calculate_urb_setup();
+ calculate_urb_setup();
if (intel->gen < 6)
- v.emit_interpolation_setup_gen4();
+ emit_interpolation_setup_gen4();
else
- v.emit_interpolation_setup_gen6();
+ emit_interpolation_setup_gen6();
/* Generate FS IR for main(). (the visitor only descends into
* functions called "main").
*/
foreach_iter(exec_list_iterator, iter, *shader->ir) {
ir_instruction *ir = (ir_instruction *)iter.get();
- v.base_ir = ir;
- ir->accept(&v);
+ base_ir = ir;
+ ir->accept(this);
}
- v.emit_fb_writes();
+ emit_fb_writes();
- v.split_virtual_grfs();
+ split_virtual_grfs();
- v.setup_paramvalues_refs();
- v.setup_pull_constants();
+ setup_paramvalues_refs();
+ setup_pull_constants();
bool progress;
do {
progress = false;
- progress = v.remove_duplicate_mrf_writes() || progress;
+ progress = remove_duplicate_mrf_writes() || progress;
- progress = v.propagate_constants() || progress;
- progress = v.register_coalesce() || progress;
- progress = v.compute_to_mrf() || progress;
- progress = v.dead_code_eliminate() || progress;
+ progress = propagate_constants() || progress;
+ progress = register_coalesce() || progress;
+ progress = compute_to_mrf() || progress;
+ progress = dead_code_eliminate() || progress;
} while (progress);
- v.schedule_instructions();
+ schedule_instructions();
- v.assign_curb_setup();
- v.assign_urb_setup();
+ assign_curb_setup();
+ assign_urb_setup();
if (0) {
/* Debug of register spilling: Go spill everything. */
- int virtual_grf_count = v.virtual_grf_next;
+ int virtual_grf_count = virtual_grf_next;
for (int i = 1; i < virtual_grf_count; i++) {
- v.spill_reg(i);
+ spill_reg(i);
}
}
if (0)
- v.assign_regs_trivial();
+ assign_regs_trivial();
else {
- while (!v.assign_regs()) {
- if (v.failed)
+ while (!assign_regs()) {
+ if (failed)
break;
}
}
}
+ assert(force_uncompressed_stack == 0);
+ assert(force_sechalf_stack == 0);
- if (!v.failed)
- v.generate_code();
-
- assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */
+ if (!failed)
+ generate_code();
- if (v.failed)
+ if (failed)
return GL_FALSE;
- c->prog_data.total_grf = v.grf_used;
+ if (c->dispatch_width == 8) {
+ c->prog_data.total_grf = grf_used;
+ } else {
+ c->prog_data.total_grf_16 = grf_used;
+ c->prog_data.prog_offset_16 = prog_offset_16;
+ }
+
+ return !failed;
+}
- return GL_TRUE;
+bool
+brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+ struct intel_context *intel = &brw->intel;
+ struct gl_context *ctx = &intel->ctx;
+ struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
+
+ if (!prog)
+ return false;
+
+ struct brw_shader *shader =
+ (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
+ if (!shader)
+ return false;
+
+ if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+ printf("GLSL IR for native fragment shader %d:\n", prog->Name);
+ _mesa_print_ir(shader->ir, NULL);
+ printf("\n");
+ }
+
+ /* Now the main event: Visit the shader IR and generate our FS IR for it.
+ */
+ c->dispatch_width = 8;
+
+ fs_visitor v(c, shader);
+ if (!v.run()) {
+ /* FINISHME: Cleanly fail, test at link time, etc. */
+ assert(!"not reached");
+ return false;
+ }
+
+ if (intel->gen >= 6) {
+ c->dispatch_width = 16;
+ fs_visitor v2(c, shader);
+ v2.run();
+ }
+
+ c->prog_data.dispatch_width = 8;
+
+ return true;
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index fd83fcb3829..b158992071e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -343,6 +343,8 @@ public:
bool eot;
bool header_present;
bool shadow_compare;
+ bool force_uncompressed;
+ bool force_sechalf;
uint32_t offset; /* spill/unspill offset */
/** @{
@@ -405,6 +407,8 @@ public:
this->live_intervals_valid = false;
this->kill_emitted = false;
+ this->force_uncompressed_stack = 0;
+ this->force_sechalf_stack = 0;
}
~fs_visitor()
@@ -461,6 +465,7 @@ public:
return emit(fs_inst(opcode, dst, src0, src1, src2));
}
+ bool run();
void setup_paramvalues_refs();
void assign_curb_setup();
void calculate_urb_setup();
@@ -481,6 +486,11 @@ public:
void schedule_instructions();
void fail(const char *msg, ...);
+ void push_force_uncompressed();
+ void pop_force_uncompressed();
+ void push_force_sechalf();
+ void pop_force_sechalf();
+
void generate_code();
void generate_fb_write(fs_inst *inst);
void generate_pixel_xy(struct brw_reg dst, bool is_x);
@@ -568,6 +578,9 @@ public:
fs_reg reg_null_cmp;
int grf_used;
+
+ int force_uncompressed_stack;
+ int force_sechalf_stack;
};
GLboolean brw_do_channel_expressions(struct exec_list *instructions);
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index c4b2157db55..4564fb6b1ad 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -120,7 +120,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
brw_wm_emit(c);
}
-static void
+void
brw_wm_payload_setup(struct brw_context *brw,
struct brw_wm_compile *c)
{
@@ -225,18 +225,13 @@ static void do_wm_prog( struct brw_context *brw,
brw_init_compile(brw, &c->func);
- brw_wm_payload_setup(brw, c);
-
if (!brw_wm_fs_emit(brw, c)) {
- /*
- * Shader which use GLSL features such as flow control are handled
- * differently from "simple" shaders.
- */
+ /* Fallback for fixed function and ARB_fp shaders. */
c->dispatch_width = 16;
brw_wm_payload_setup(brw, c);
brw_wm_non_glsl_emit(brw, c);
+ c->prog_data.dispatch_width = 16;
}
- c->prog_data.dispatch_width = c->dispatch_width;
/* Scratch space is used for register spilling */
if (c->last_scratch) {
@@ -467,7 +462,7 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
struct brw_wm_prog_key key;
struct brw_fragment_program *fp = (struct brw_fragment_program *)
brw->fragment_program;
-
+
brw_wm_populate_key(brw, &key);
/* Make an early check for the key.
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 5d1e4045928..8e5a9cdb86c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -314,7 +314,7 @@ void brw_wm_print_program( struct brw_wm_compile *c,
void brw_wm_lookup_iz(struct intel_context *intel,
struct brw_wm_compile *c);
-GLboolean brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
+bool brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
/* brw_wm_emit.c */
void emit_alu1(struct brw_compile *p,
@@ -474,5 +474,7 @@ struct gl_shader_program *brw_new_shader_program(struct gl_context *ctx, GLuint
bool brw_color_buffer_write_enabled(struct brw_context *brw);
bool brw_render_target_supported(gl_format format);
+void brw_wm_payload_setup(struct brw_context *brw,
+ struct brw_wm_compile *c);
#endif
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 8215cb15a9c..d4fca788cb9 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -143,14 +143,19 @@ upload_wm_state(struct brw_context *brw)
dw2 |= (ALIGN(brw->wm.sampler_count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT;
dw4 |= (brw->wm.prog_data->first_curbe_grf <<
GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
+ dw4 |= (brw->wm.prog_data->first_curbe_grf_16 <<
+ GEN6_WM_DISPATCH_START_GRF_SHIFT_2);
dw5 |= (brw->wm_max_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
/* CACHE_NEW_WM_PROG */
- if (brw->wm.prog_data->dispatch_width == 8)
+ if (brw->wm.prog_data->dispatch_width == 8) {
dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
- else
+ if (brw->wm.prog_data->prog_offset_16)
+ dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
+ } else {
dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
+ }
/* _NEW_LINE */
if (ctx->Line.StippleFlag)
@@ -194,7 +199,12 @@ upload_wm_state(struct brw_context *brw)
OUT_BATCH(dw5);
OUT_BATCH(dw6);
OUT_BATCH(0); /* kernel 1 pointer */
- OUT_BATCH(0); /* kernel 2 pointer */
+ if (brw->wm.prog_data->prog_offset_16) {
+ OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+ brw->wm.prog_data->prog_offset_16);
+ } else {
+ OUT_BATCH(0); /* kernel 2 pointer */
+ }
ADVANCE_BATCH();
}