author    Scott D Phillips <scott.d.phillips@intel.com>    2018-01-26 16:07:15 -0800
committer Scott D Phillips <scott.d.phillips@intel.com>    2018-02-14 12:38:34 -0800
commit    ecaad89525338ae637bd03004da78b9d937fb05b (patch)
tree      3f23cc85fc0b62d147bb18c8d1ff9e05fbf296df
parent    eb2e17e2d15bf58b60460437330d719131fb859e (diff)
i965/tiled_memcpy: linear_to_ytiled a cache line at a time
TileY's low 6 address bits are: v1 v0 u3 u2 u1 u0

Thus a cache line in the tiled surface is composed of a 2d area of
16x4 bytes of the linear surface.

Add a special case where the area being copied is 4-line aligned and
a multiple of 4 lines, so that entire cache lines will be written at
a time.

On Apollolake, this increases tiling throughput to wc maps by
84.0103% +/- 0.862818%.

v2: Split [y0, y1) and [y2, y3) loops apart for clarity (Jason Ekstrand)
v3: Don't reset src var (Jason), ensure y0 <= y1 <= y2 <= y3

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
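For reference (not part of the patch), the stated bit layout maps linear
(u, v) coordinates into a 64-byte cache line as sketched below; the helper
name is invented for illustration:

    /* Bits 3:0 pick the byte within a 16-byte span; bits 5:4 pick one of
     * four rows. One cache line therefore covers a 16x4-byte area of the
     * linear surface, per the v1 v0 u3 u2 u1 u0 layout above.
     */
    static inline unsigned
    ytile_cacheline_offset(unsigned u, unsigned v)
    {
       return ((v & 3) << 4) | (u & 15);
    }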
-rw-r--r--  src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 72
1 file changed, 66 insertions(+), 6 deletions(-)
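The diff below splits the copied row range [y0, y3) into an unaligned head,
a body of whole 4-row cache lines, and a tail. A minimal standalone sketch
of that split, assuming power-of-two ALIGN_UP/ALIGN_DOWN and MIN2/MAX2
helpers as in Mesa (the demo scaffolding is invented):

    #include <stdio.h>

    #define ALIGN_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))
    #define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
    #define MIN2(a, b)       ((a) < (b) ? (a) : (b))
    #define MAX2(a, b)       ((a) > (b) ? (a) : (b))

    int main(void)
    {
       unsigned y0 = 3, y3 = 13;  /* uint32_t in the driver */

       /* Clamping with MIN2/MAX2 guarantees y0 <= y1 <= y2 <= y3 (the v3
        * fix), even when [y0, y3) never reaches a 4-row boundary.
        */
       unsigned y1 = MIN2(y3, ALIGN_UP(y0, 4));   /* head: [y0, y1), row at a time  */
       unsigned y2 = MAX2(y1, ALIGN_DOWN(y3, 4)); /* body: [y1, y2), 4 rows at once */

       printf("head [%u,%u) body [%u,%u) tail [%u,%u)\n",
              y0, y1, y1, y2, y2, y3);  /* head [3,4) body [4,12) tail [12,13) */
       return 0;
    }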
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 53a56796915..a362891d7e7 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -287,7 +287,7 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
  */
 static inline void
 linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
-                 uint32_t y0, uint32_t y1,
+                 uint32_t y0, uint32_t y3,
                  char *dst, const char *src,
                  int32_t src_pitch,
                  uint32_t swizzle_bit,
@@ -306,6 +306,9 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    const uint32_t column_width = ytile_span;
    const uint32_t bytes_per_column = column_width * ytile_height;
 
+   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
+   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
+
    uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
    uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
 
@@ -321,24 +324,81 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 
    src += (ptrdiff_t)y0 * src_pitch;
 
-   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
+   if (y0 != y1) {
+      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
+         uint32_t xo = xo1;
+         uint32_t swizzle = swizzle1;
+
+         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
+
+         /* Step by spans/columns. As it happens, the swizzle bit flips
+          * at each step so we don't need to calculate it explicitly.
+          */
+         for (x = x1; x < x2; x += ytile_span) {
+            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
+            xo += bytes_per_column;
+            swizzle ^= swizzle_bit;
+         }
+
+         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+
+         src += src_pitch;
+      }
+   }
+
+   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
       uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;
 
-      mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
+      if (x0 != x1) {
+         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
+         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
+         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
+         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
+      }
 
       /* Step by spans/columns. As it happens, the swizzle bit flips
        * at each step so we don't need to calculate it explicitly.
        */
       for (x = x1; x < x2; x += ytile_span) {
-         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
+         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
+         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
+         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
+         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }
 
-      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+      if (x2 != x3) {
+         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
+         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
+         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
+         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
+      }
 
-      src += src_pitch;
+      src += 4 * src_pitch;
+   }
+
+   if (y2 != y3) {
+      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
+         uint32_t xo = xo1;
+         uint32_t swizzle = swizzle1;
+
+         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
+
+         /* Step by spans/columns. As it happens, the swizzle bit flips
+          * at each step so we don't need to calculate it explicitly.
+          */
+         for (x = x1; x < x2; x += ytile_span) {
+            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
+            xo += bytes_per_column;
+            swizzle ^= swizzle_bit;
+         }
+
+         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+
+         src += src_pitch;
+      }
    }
 }
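For intuition on the unrolled body loop above: each step of the column loop
now fills one complete 64-byte cache line from a 16x4-byte region of the
linear source. An invented, standalone illustration (not driver code, and
ignoring the swizzle):

    #include <stddef.h>
    #include <string.h>

    /* Rows v = 0..3 land at offsets 0, 16, 32 and 48 inside the cache
     * line, matching the v1 v0 u3 u2 u1 u0 bit layout. Writing all 64
     * bytes in one go is what lets a write-combining buffer flush the
     * line as a single full write, the likely source of the measured
     * throughput gain on wc maps.
     */
    static void
    copy_one_ytile_cacheline(char *dst_cacheline, const char *src,
                             ptrdiff_t src_pitch)
    {
       for (int row = 0; row < 4; row++)
          memcpy(dst_cacheline + row * 16, src + row * src_pitch, 16);
    }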