sna/gen6: Prefer the BLT ring, except for copies on behalf of DRI

As demonstrated by the all-important trap300, using the BLT ring is 2x faster than using the RENDER ring for the simple case of solid fills. (Though note that performing the relocations costs 3x as much CPU for 2x GPU performance.) One case that may regress from this change is copywinpix, which should benefit from the batching in the RENDER commands and might warrant revisiting in the future (with realistic and synthetic benchmarks in hand!). However, due to the forced stall when switching rings, we still want to perform RENDER copies on behalf of DRI clients and before page-flips. Checking against cairo-perf-trace indicated no major impact -- I had worried that setting the BLT flag for some clears might have had a knock-on effect, causing too many operations that could be pipelined on the RENDER ring to be sent to the BLT ring instead. Reported-by: Michael Larabel <Michael@phoronix.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
author: Chris Wilson <chris@chris-wilson.co.uk> 2011-08-31 23:21:54 +0100
committer: Chris Wilson <chris@chris-wilson.co.uk> 2011-08-31 23:58:39 +0100
commit: 32fc0c896e0dfd06617c12beda1ccacedf69fb4a (patch)
tree: 8045fd9c3edb9c4282e3f3717af53b3de4ce6a73
parent: 5586dd729b153e37f942a285d328a07f3fe7ae16 (diff)
2 files changed, 22 insertions, 4 deletions
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index 5641b3ca..eb67fc67 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2304,13 +2304,14 @@ gen6_render_copy_boxes(struct sna *sna, uint8_t alu,
 #endif
 
 	DBG(("%s (%d, %d)->(%d, %d) x %d, alu=%x, self-copy=%d\n",
 	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n, alu,
 	     src_bo == dst_bo));
 
-	if (sna->kgem.mode == KGEM_BLT &&
+	/* XXX benchmark me! */
+	if (sna->kgem.mode != KGEM_RENDER &&
 	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy_boxes(sna, alu,
 			       src_bo, src_dx, src_dy,
 			       dst_bo, dst_dx, dst_dy,
 			       dst->drawable.bitsPerPixel,
 			       box, n))
@@ -2461,13 +2462,14 @@ gen6_render_copy(struct sna *sna, uint8_t alu,
 
 	DBG(("%s (alu=%d, src=(%dx%d), dst=(%dx%d))\n",
 	     __FUNCTION__, alu,
 	     src->drawable.width, src->drawable.height,
 	     dst->drawable.width, dst->drawable.height));
 
-	if (sna->kgem.mode == KGEM_BLT &&
+	/* XXX benchmark me! */
+	if (sna->kgem.mode != KGEM_RENDER &&
 	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
 	    sna_blt_copy(sna, alu,
 			 src_bo, dst_bo,
 			 dst->drawable.bitsPerPixel,
 			 op))
 		return TRUE;
@@ -2574,13 +2576,13 @@ gen6_render_fill_boxes(struct sna *sna,
 	if (op >= ARRAY_SIZE(gen6_blend_op)) {
 		DBG(("%s: fallback due to unhandled blend op: %d\n",
 		     __FUNCTION__, op));
 		return FALSE;
 	}
 
-	if (sna->kgem.mode == KGEM_BLT ||
+	if (sna->kgem.mode != KGEM_RENDER ||
 	    dst->drawable.width > 8192 ||
 	    dst->drawable.height > 8192 ||
 	    !gen6_check_dst_format(format)) {
 		uint8_t alu = GXcopy;
 
 		if (op == PictOpClear) {
@@ -2731,13 +2733,13 @@ gen6_render_fill(struct sna *sna, uint8_t alu,
 	return sna_blt_fill(sna, alu,
 			    dst_bo, dst->drawable.bitsPerPixel,
 			    color,
 			    op);
 #endif
 
-	if (sna->kgem.mode == KGEM_BLT &&
+	if (sna->kgem.mode != KGEM_RENDER &&
 	    sna_blt_fill(sna, alu,
 			 dst_bo, dst->drawable.bitsPerPixel,
 			 color,
 			 op))
 		return TRUE;
 
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 0a01f8af..f4049f19 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -458,12 +458,28 @@ sna_dri_copy(struct sna *sna, DrawablePtr draw, RegionPtr region,
 			flush = sna_wait_for_scanline(sna, dst, NULL,
 						      &region->extents);
 
 		get_drawable_deltas(draw, dst, &dx, &dy);
 	}
 
+	if (sna->kgem.gen >= 60) {
+		/* Sandybridge introduced a separate ring which it uses to
+		 * perform blits. Switching rendering between rings incurs
+		 * a stall as we wait upon the old ring to finish and
+		 * flush its render cache before we can proceed on with
+		 * the operation on the new ring.
+		 *
+		 * As this buffer, we presume, has just been written to by
+		 * the DRI client using the RENDER ring, we want to perform
+		 * our operation on the same ring, and ideally on the same
+		 * ring as we will flip from (which should be the RENDER ring
+		 * as well).
+		 */
+		kgem_set_mode(&sna->kgem, KGEM_RENDER);
+	}
+
 	if (region) {
 		boxes = REGION_RECTS(region);
 		n = REGION_NUM_RECTS(region);
 		assert(n);
 	} else {
 		boxes = &box;
author	Chris Wilson <chris@chris-wilson.co.uk>	2011-08-31 23:21:54 +0100
committer	Chris Wilson <chris@chris-wilson.co.uk>	2011-08-31 23:58:39 +0100
commit	32fc0c896e0dfd06617c12beda1ccacedf69fb4a (patch)
tree	8045fd9c3edb9c4282e3f3717af53b3de4ce6a73
parent	5586dd729b153e37f942a285d328a07f3fe7ae16 (diff)