summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChia-I Wu <olv@lunarg.com>2013-09-12 13:00:52 +0800
committerChia-I Wu <olv@lunarg.com>2013-10-02 15:26:40 +0800
commit848c0e72f36d0e1e460193a2d30b2f631529156f (patch)
treea68300ffa0b3a4a30f584f5ea5733328a52a33a8
parent72edba16592e6b1589f2db410b9ab2939e196a07 (diff)
i965: compute DDX in a subspan based only on top row
Consider only the top-left and top-right pixels to approximate DDX in a 2x2 subspan, unless the application requests a more accurate approximation via GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled from the new driconf option disable_derivative_optimization. This results in a less accurate approximation. However, it improves the performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at 95.0% confidence) on Haswell. No noticeable image quality difference observed. The improvement comes from faster sample_d. It seems, on Haswell, some optimizations are introduced to allow faster sample_d when all pixels in a subspan have the same derivative. I considered SAMPLE_STATE too, which allows one to control the quality of sample_d on Haswell. But it gave much worse image quality without giving better performance comparing to this change. No piglit quick.tests regression on Haswell (tested with v1). v2: better guess for precompile program key Signed-off-by: Chia-I Wu <olv@lunarg.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
-rw-r--r--src/mesa/drivers/dri/i965/brw_context.c2
-rw-r--r--src/mesa/drivers/dri/i965/brw_context.h1
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.cpp6
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_generator.cpp33
-rw-r--r--src/mesa/drivers/dri/i965/brw_wm.c10
-rw-r--r--src/mesa/drivers/dri/i965/brw_wm.h1
-rw-r--r--src/mesa/drivers/dri/i965/intel_screen.c4
7 files changed, 49 insertions, 8 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 5f58a291d8d..18b8e573b59 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -478,6 +478,8 @@ brwCreateContext(int api,
brw_draw_init( brw );
brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
+ brw->disable_derivative_optimization =
+ driQueryOptionb(&brw->optionCache, "disable_derivative_optimization");
ctx->Const.ContextFlags = 0;
if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 0f88bad2475..0ec12185ee4 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1005,6 +1005,7 @@ struct brw_context
bool always_flush_cache;
bool disable_throttling;
bool precompile;
+ bool disable_derivative_optimization;
driOptionCache optionCache;
/** @} */
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index bcb15ee0f92..0b441d451f1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3213,6 +3213,12 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
key.nr_color_regions = 1;
+ /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
+ * quality of the derivatives is likely to be determined by the driconf
+ * option.
+ */
+ key.high_quality_derivatives = brw->disable_derivative_optimization;
+
key.program_string_id = bfp->id;
uint32_t old_prog_offset = brw->wm.base.prog_offset;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 7ce42c4b9dc..9eb5e177928 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
*
* arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
*
- * and we're trying to produce:
+ * Ideally, we want to produce:
*
* DDX DDY
* dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
@@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
*
* For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
* for each pair, and vertstride = 2 jumps us 2 elements after processing a
- * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
- * between each other. We could probably do it like ddx and swizzle the right
- * order later, but bail for now and just produce
+ * pair. But the ideal approximation may impose a huge performance cost on
+ * sample_d. On at least Haswell, sample_d instruction does some
+ * optimizations if the same LOD is used for all pixels in the subspan.
+ *
+ * For DDY, it's harder, as we want to produce the pairs swizzled between each
+ * other. We could probably do it like ddx and swizzle the right order later,
+ * but bail for now and just produce
* ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
*/
void
fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
+ unsigned vstride, width;
+
+ if (c->key.high_quality_derivatives) {
+ /* produce accurate derivatives */
+ vstride = BRW_VERTICAL_STRIDE_2;
+ width = BRW_WIDTH_2;
+ }
+ else {
+ /* replicate the derivative at the top-left pixel to other pixels */
+ vstride = BRW_VERTICAL_STRIDE_4;
+ width = BRW_WIDTH_4;
+ }
+
struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
BRW_REGISTER_TYPE_F,
- BRW_VERTICAL_STRIDE_2,
- BRW_WIDTH_2,
+ vstride,
+ width,
BRW_HORIZONTAL_STRIDE_0,
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
BRW_REGISTER_TYPE_F,
- BRW_VERTICAL_STRIDE_2,
- BRW_WIDTH_2,
+ vstride,
+ width,
BRW_HORIZONTAL_STRIDE_0,
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
brw_ADD(p, dst, src0, negate(src1));
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 3d7ca2a4d03..9745edac6ca 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -416,6 +416,15 @@ static void brw_wm_populate_key( struct brw_context *brw,
key->line_aa = line_aa;
+ /* _NEW_HINT */
+ if (brw->disable_derivative_optimization) {
+ key->high_quality_derivatives =
+ ctx->Hint.FragmentShaderDerivative != GL_FASTEST;
+ } else {
+ key->high_quality_derivatives =
+ ctx->Hint.FragmentShaderDerivative == GL_NICEST;
+ }
+
if (brw->gen < 6)
key->stats_wm = brw->stats_wm;
@@ -503,6 +512,7 @@ const struct brw_tracked_state brw_wm_prog = {
_NEW_STENCIL |
_NEW_POLYGON |
_NEW_LINE |
+ _NEW_HINT |
_NEW_LIGHT |
_NEW_FRAG_CLAMP |
_NEW_BUFFERS |
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index f7a2c5f234d..aa786def463 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -66,6 +66,7 @@ struct brw_wm_prog_key {
GLuint render_to_fbo:1;
GLuint clamp_fragment_color:1;
GLuint line_aa:2;
+ GLuint high_quality_derivatives:1;
GLushort drawable_height;
GLbitfield64 input_slots_valid;
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index de80a00ebb3..cddc8e81330 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -58,6 +58,10 @@ PUBLIC const char __driConfigOptions[] =
DRI_CONF_DESC(en, "Enable Hierarchical Z on gen6+")
DRI_CONF_OPT_END
+ DRI_CONF_OPT_BEGIN_B(disable_derivative_optimization, "false")
+ DRI_CONF_DESC(en, "Derivatives with finer granularity by default")
+ DRI_CONF_OPT_END
+
DRI_CONF_SECTION_END
DRI_CONF_SECTION_QUALITY
DRI_CONF_FORCE_S3TC_ENABLE("false")