diff options
Diffstat (limited to 'drivers/gpu/drm/i915/gt/intel_migrate.c')
-rw-r--r-- | drivers/gpu/drm/i915/gt/intel_migrate.c | 282 |
1 file changed, 269 insertions, 13 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 05262f1b438e..6555cc92dc48 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -497,9 +497,123 @@ static bool wa_1209644611_applies(int ver, u32 size) * location. */ -static int emit_copy(struct i915_request *rq, - u32 dst_offset, u32 src_offset, int size) +static inline u32 *i915_flush_dw(u32 *cmd, u32 flags) { + *cmd++ = MI_FLUSH_DW | flags; + *cmd++ = 0; + *cmd++ = 0; + + return cmd; +} + +static u32 calc_ctrl_surf_instr_size(struct drm_i915_private *i915, int size) +{ + u32 num_cmds, num_blks, total_size; + + if (!GET_CCS_BYTES(i915, size)) + return 0; + + /* + * XY_CTRL_SURF_COPY_BLT transfers CCS in 256 byte + * blocks. One XY_CTRL_SURF_COPY_BLT command can + * transfer up to 1024 blocks. + */ + num_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size), + NUM_CCS_BYTES_PER_BLOCK); + num_cmds = DIV_ROUND_UP(num_blks, NUM_CCS_BLKS_PER_XFER); + total_size = XY_CTRL_SURF_INSTR_SIZE * num_cmds; + + /* + * Adding a flush before and after XY_CTRL_SURF_COPY_BLT + */ + total_size += 2 * MI_FLUSH_DW_SIZE; + + return total_size; +} + +static u32 *_i915_ctrl_surf_copy_blt(u32 *cmd, u64 src_addr, u64 dst_addr, + u8 src_mem_access, u8 dst_mem_access, + int src_mocs, int dst_mocs, + u32 ccs_blocks) +{ + /* + * The XY_CTRL_SURF_COPY_BLT instruction is used to copy the CCS + * data in and out of the CCS region. + * + * We can copy at most 1024 blocks of 256 bytes using one + * XY_CTRL_SURF_COPY_BLT instruction. + * + * In case we need to copy more than 1024 blocks, we need to add + * another instruction to the same batch buffer. + * + * 1024 blocks of 256 bytes of CCS represent a total 256KB of CCS. + * + * 256 KB of CCS represents 256 * 256 KB = 64 MB of LMEM. + */ + do { + int blks_per_copy; + + blks_per_copy = ccs_blocks >= NUM_CCS_BLKS_PER_XFER ? 
+ NUM_CCS_BLKS_PER_XFER : ccs_blocks; + *cmd++ = XY_CTRL_SURF_COPY_BLT | + src_mem_access << SRC_ACCESS_TYPE_SHIFT | + dst_mem_access << DST_ACCESS_TYPE_SHIFT | + FIELD_PREP(CCS_SIZE_MASK, blks_per_copy - 1); + *cmd++ = lower_32_bits(src_addr); + *cmd++ = (upper_32_bits(src_addr) & 0xFFFF) | + FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, src_mocs); + *cmd++ = lower_32_bits(dst_addr); + *cmd++ = (upper_32_bits(dst_addr) & 0xFFFF) | + FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, dst_mocs); + src_addr += SZ_64M; + dst_addr += SZ_64M; + ccs_blocks -= blks_per_copy; + } while (ccs_blocks > 0); + + return cmd; +} + +static int emit_ccs_copy(struct i915_request *rq, + bool dst_is_lmem, u32 dst_offset, + bool src_is_lmem, u32 src_offset, int size) +{ + struct drm_i915_private *i915 = rq->engine->i915; + int mocs = rq->engine->gt->mocs.uc_index << 1; + u32 num_ccs_blks, ccs_ring_size; + u8 src_access, dst_access; + u32 *cs; + + GEM_BUG_ON(!(src_is_lmem ^ dst_is_lmem) || !HAS_FLAT_CCS(i915)); + + ccs_ring_size = calc_ctrl_surf_instr_size(i915, size); + WARN_ON(!ccs_ring_size); + + cs = intel_ring_begin(rq, round_up(ccs_ring_size, 2)); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + num_ccs_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size), + NUM_CCS_BYTES_PER_BLOCK); + + src_access = !src_is_lmem && dst_is_lmem; + dst_access = !src_access; + + cs = i915_flush_dw(cs, MI_FLUSH_LLC | MI_FLUSH_CCS); + cs = _i915_ctrl_surf_copy_blt(cs, src_offset, dst_offset, + src_access, dst_access, + mocs, mocs, num_ccs_blks); + cs = i915_flush_dw(cs, MI_FLUSH_LLC | MI_FLUSH_CCS); + if (ccs_ring_size & 1) + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int emit_main_copy(struct i915_request *rq, + bool dst_is_lmem, u32 dst_offset, + bool src_is_lmem, u32 src_offset, int size) { const int ver = GRAPHICS_VER(rq->engine->i915); u32 instance = rq->engine->instance; u32 *cs; @@ -544,6 +658,91 @@ static int emit_copy(struct i915_request *rq, return 0; } +static int scatter_list_length(struct 
scatterlist *sg) +{ + int len = 0; + + while (sg && sg_dma_len(sg)) { + len += sg_dma_len(sg); + sg = sg_next(sg); + }; + + return len; +} + +static void +calculate_src_dst_sz(struct drm_i915_private *i915, bool src_is_lmem, + u32 *src_sz, u32 *dst_sz, u32 bytes_to_cpy, + bool ccs_copy, u32 ccs_bytes) +{ + u32 ccs_sz, smem_sz; + + if (ccs_copy) { + /* + * We can only copy the ccs data corresponding to + * the CHUNK_SZ of lmem which is + * GET_CCS_BYTES(i915, CHUNK_SZ)) + */ + ccs_sz = min_t(int, bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ)); + + /* Flat-CCS: CCS data copy */ + if (!src_is_lmem) { /* src is smem */ + *src_sz = ccs_sz; + *dst_sz = CHUNK_SZ; + } else { + *src_sz = CHUNK_SZ; + *dst_sz = ccs_sz; + } + } else if (!ccs_copy && ccs_bytes) { + /* + * When CHUNK_SZ is passed all the pages up to CHUNK_SZ will + * be taken for the blt. In Flat-CCS supported platform Smem + * obj will have more pages than required for main memory + * hence limit it to the required size for main memory + */ + smem_sz = min_t(int, bytes_to_cpy, CHUNK_SZ); + /* Flat-CCS: Main memory copy */ + if (!src_is_lmem) { + *src_sz = smem_sz; + *dst_sz = CHUNK_SZ; + } else { + *dst_sz = smem_sz; + *src_sz = CHUNK_SZ; + } + } else { /* ccs handling is not required */ + *src_sz = CHUNK_SZ; + } +} + +static int emit_copy(struct i915_request *rq, bool dst_is_lmem, u32 dst_offset, + bool src_is_lmem, u32 src_offset, u32 src_sz, u32 dst_sz, + int *size, bool ccs_copy) +{ + int err; + + if (ccs_copy) { + /* + * Using max of src_sz and dst_sz, as we need to + * pass the lmem size corresponding to the ccs + * blocks we need to handle. 
+ */ + *size = max_t(int, src_sz, dst_sz); + err = emit_ccs_copy(rq, dst_is_lmem, dst_offset, + src_is_lmem, src_offset, + *size); + + /* Converting back to ccs bytes */ + *size = GET_CCS_BYTES(rq->engine->i915, *size); + } else { + WARN(src_sz != dst_sz, "%d != %d", src_sz, dst_sz); + *size = src_sz; + err = emit_main_copy(rq, dst_is_lmem, dst_offset, + src_is_lmem, src_offset, *size); + } + + return err; +} + int intel_context_migrate_copy(struct intel_context *ce, const struct i915_deps *deps, @@ -556,7 +755,10 @@ intel_context_migrate_copy(struct intel_context *ce, struct i915_request **out) { struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst); + struct drm_i915_private *i915 = ce->engine->i915; + u32 src_sz, dst_sz, ccs_bytes = 0, bytes_to_cpy; struct i915_request *rq; + bool ccs_copy = false; int err; GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm); @@ -564,9 +766,28 @@ intel_context_migrate_copy(struct intel_context *ce, GEM_BUG_ON(ce->ring->size < SZ_64K); + if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) { + src_sz = scatter_list_length(src); + dst_sz = scatter_list_length(dst); + + if (src_is_lmem) + bytes_to_cpy = src_sz; + else if (dst_is_lmem) + bytes_to_cpy = dst_sz; + + /* + * When there is an eviction of ccs needed smem will have the + * extra pages for the ccs data + * + * TO-DO: Want to move the size mismatch check to a WARN_ON, + * but still we have some requests of smem->lmem with same size. + * Need to fix it. + */ + ccs_bytes = src_sz != dst_sz ? 
GET_CCS_BYTES(i915, bytes_to_cpy) : 0; + } + do { - u32 src_offset, dst_offset; - int len; + u32 src_offset, dst_offset, copy_sz; rq = i915_request_create(ce); if (IS_ERR(rq)) { @@ -606,27 +827,38 @@ intel_context_migrate_copy(struct intel_context *ce, dst_offset = 2 * CHUNK_SZ; } - len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem, - src_offset, CHUNK_SZ); - if (len <= 0) { - err = len; + calculate_src_dst_sz(i915, src_is_lmem, &src_sz, &dst_sz, + bytes_to_cpy, ccs_copy, ccs_bytes); + + src_sz = emit_pte(rq, &it_src, src_cache_level, src_is_lmem, + src_offset, src_sz); + if (src_sz <= 0) { + err = src_sz; goto out_rq; } + if (!ccs_bytes) + dst_sz = src_sz; + err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem, - dst_offset, len); + dst_offset, dst_sz); if (err < 0) goto out_rq; - if (err < len) { + if (err < dst_sz && !ccs_bytes) { err = -EINVAL; goto out_rq; } + dst_sz = err; + err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); if (err) goto out_rq; - err = emit_copy(rq, dst_offset, src_offset, len); + err = emit_copy(rq, dst_is_lmem, dst_offset, src_is_lmem, + src_offset, src_sz, dst_sz, &copy_sz, ccs_copy); + if (!err && ccs_bytes) + bytes_to_cpy -= copy_sz; /* Arbitration is re-enabled between requests. */ out_rq: @@ -634,9 +866,33 @@ out_rq: i915_request_put(*out); *out = i915_request_get(rq); i915_request_add(rq); - if (err || !it_src.sg || !sg_dma_len(it_src.sg)) - break; + if (err || !it_src.sg || !sg_dma_len(it_src.sg) || + !it_dst.sg || !sg_dma_len(it_src.sg)) { + if (err || !ccs_bytes) + break; + + GEM_BUG_ON(bytes_to_cpy); + if (ccs_copy) { + break; + } else if (ccs_bytes) { + if (src_is_lmem) { + WARN_ON(it_src.sg && sg_dma_len(it_src.sg)); + it_src = sg_sgt(src); + } else { + WARN_ON(it_dst.sg && sg_dma_len(it_dst.sg)); + it_dst = sg_sgt(dst); + } + bytes_to_cpy = ccs_bytes; + ccs_copy = true; + + continue; + } else { + DRM_ERROR("Invalid state\n"); + err = -EINVAL; + break; + } + } cond_resched(); } while (1); |