summaryrefslogtreecommitdiff
authorChris Wilson <chris@chris-wilson.co.uk>2011-12-10 22:45:25 (GMT)
committer Chris Wilson <chris@chris-wilson.co.uk>2011-12-11 00:52:54 (GMT)
commit051a18063df075536cb1ac0dc4dfc3c1306ab74e (patch) (side-by-side diff)
treec485da7f3349fe814b863482a54d642ca9a4a92b
parent735a15208dd600eefa3090f344186df9cac0462d (diff)
downloadxf86-video-intel-051a18063df075536cb1ac0dc4dfc3c1306ab74e.zip
xf86-video-intel-051a18063df075536cb1ac0dc4dfc3c1306ab74e.tar.gz
sna: Implement a VMA cache
A VMA cache appears unavoidable thanks to compiz and an excruciatingly slow GTT pagefault, though it does look like it will be ineffectual during everyday usage. Compiz (and presumably other compositing managers) appears to be undoing all the pagefault minimisation as demonstrated on gen5 with large XPutImage. It also appears the CPU to memory bandwidth ratio plays a crucial role in determining whether going straight to GTT or through the CPU cache is a win - so no trivial heuristic. x11perf -putimage10 -putimage500 on i5-2467m: Before: bare: 1150,000 2,410 compiz: 438,000 2,670 After: bare: 1190,000 2,730 compiz: 437,000 2,690 UXA: bare: 658,000 2,670 compiz: 389,000 2,520 On i3-330m Before: bare: 537,000 1,080 compiz: 263,000 398 After: bare: 606,000 1,360 compiz: 203,000 985 UXA: bare: 294,000 1,070 compiz: 197,000 821 On pnv: Before: bare: 179,000 213 compiz: 106,000 123 After: bare: 181,000 246 compiz: 103,000 197 UXA: bare: 114,000 312 compiz: 75,700 191 Reported-by: Michael Larabel <Michael@phoronix.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Diffstat (more/less context) (ignore whitespace changes)
-rw-r--r--src/sna/kgem.c90
-rw-r--r--src/sna/kgem.h5
-rw-r--r--src/sna/kgem_debug_gen3.c4
-rw-r--r--src/sna/kgem_debug_gen4.c8
-rw-r--r--src/sna/kgem_debug_gen5.c8
-rw-r--r--src/sna/kgem_debug_gen6.c10
-rw-r--r--src/sna/kgem_debug_gen7.c10
-rw-r--r--src/sna/sna_accel.c9
-rw-r--r--src/sna/sna_io.c5
-rw-r--r--src/sna/sna_video.c1
10 files changed, 107 insertions, 43 deletions
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 68a1831..3609a6f 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -45,6 +45,12 @@ static inline void list_move(struct list *list, struct list *head)
list_add(list, head);
}
+static inline void list_move_tail(struct list *list, struct list *head)
+{
+ __list_del(list->prev, list->next);
+ list_add_tail(list, head);
+}
+
static inline void list_replace(struct list *old,
struct list *new)
{
@@ -75,6 +81,7 @@ static inline void list_replace(struct list *old,
#endif
#define PAGE_SIZE 4096
+#define MAX_VMA_CACHE 128
struct kgem_partial_bo {
struct kgem_bo base;
@@ -125,7 +132,6 @@ static int gem_set_tiling(int fd, uint32_t handle, int tiling, int stride)
static void *gem_mmap(int fd, uint32_t handle, int size, int prot)
{
struct drm_i915_gem_mmap_gtt mmap_arg;
- struct drm_i915_gem_set_domain set_domain;
void *ptr;
DBG(("%s(handle=%d, size=%d, prot=%s)\n", __FUNCTION__,
@@ -144,12 +150,6 @@ static void *gem_mmap(int fd, uint32_t handle, int size, int prot)
ptr = NULL;
}
- VG_CLEAR(set_domain);
- set_domain.handle = handle;
- set_domain.read_domains = I915_GEM_DOMAIN_GTT;
- set_domain.write_domain = prot & PROT_WRITE ? I915_GEM_DOMAIN_GTT : 0;
- drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
-
return ptr;
}
@@ -274,6 +274,7 @@ static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo,
bo->cpu_write = true;
list_init(&bo->request);
list_init(&bo->list);
+ list_init(&bo->vma);
return bo;
}
@@ -352,6 +353,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
list_init(&kgem->partial);
list_init(&kgem->requests);
list_init(&kgem->flushing);
+ list_init(&kgem->vma_cache);
for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++)
list_init(&kgem->inactive[i]);
for (i = 0; i < ARRAY_SIZE(kgem->active); i++)
@@ -594,6 +596,12 @@ static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo)
b = next;
}
+ if (bo->map) {
+ munmap(bo->map, bo->size);
+ list_del(&bo->vma);
+ kgem->vma_count--;
+ }
+
list_del(&bo->list);
list_del(&bo->request);
gem_close(kgem->fd, bo->handle);
@@ -620,6 +628,7 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
base->reusable = true;
list_init(&base->list);
list_replace(&bo->request, &base->request);
+ list_replace(&bo->vma, &base->vma);
free(bo);
bo = base;
}
@@ -1814,19 +1823,76 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot)
{
void *ptr;
- ptr = gem_mmap(kgem->fd, bo->handle, bo->size, prot);
- if (ptr == NULL)
- return NULL;
+ ptr = bo->map;
+ if (ptr == NULL) {
+ /* vma are limited on a per-process basis to around 64k.
+ * This includes all malloc arenas as well as other file
+ * mappings. In order to be fair and not hog the cache,
+ * and more importantly not to exhaust that limit and to
+ * start failing mappings, we keep our own number of open
+ * vma to within a conservative value.
+ */
+ while (kgem->vma_count > MAX_VMA_CACHE) {
+ struct kgem_bo *old;
+
+ old = list_first_entry(&kgem->vma_cache,
+ struct kgem_bo,
+ vma);
+ DBG(("%s: discarding vma cache for %d\n",
+ __FUNCTION__, old->handle));
+ munmap(old->map, old->size);
+ old->map = NULL;
+ list_del(&old->vma);
+ kgem->vma_count--;
+ }
+
+ ptr = gem_mmap(kgem->fd, bo->handle, bo->size,
+ PROT_READ | PROT_WRITE);
+ if (ptr == NULL)
+ return NULL;
+
+ /* Cache this mapping to avoid the overhead of an
+ * excruciatingly slow GTT pagefault. This is more an
+ * issue with compositing managers which need to frequently
+ * flush CPU damage to their GPU bo.
+ */
+ bo->map = ptr;
+ kgem->vma_count++;
+
+ DBG(("%s: caching vma for %d\n",
+ __FUNCTION__, bo->handle));
+ }
+
+ if (bo->needs_flush | bo->gpu) {
+ struct drm_i915_gem_set_domain set_domain;
+
+ VG_CLEAR(set_domain);
+ set_domain.handle = bo->handle;
+ set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+ set_domain.write_domain = prot & PROT_WRITE ? I915_GEM_DOMAIN_GTT : 0;
+ drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
- if (prot & PROT_WRITE) {
bo->needs_flush = false;
if (bo->gpu)
kgem_retire(kgem);
}
+ list_move_tail(&bo->vma, &kgem->vma_cache);
+
return ptr;
}
+void kgem_bo_unmap(struct kgem *kgem, struct kgem_bo *bo)
+{
+ assert(bo->map);
+
+ munmap(bo->map, bo->size);
+ bo->map = NULL;
+
+ list_del(&bo->vma);
+ kgem->vma_count--;
+}
+
uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
{
struct drm_gem_flink flink;
@@ -2151,6 +2217,8 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
&bo->base.request);
else
list_init(&bo->base.request);
+ list_replace(&old->vma,
+ &bo->base.vma);
free(old);
bo->base.refcnt = 1;
} else {
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index e9e7cdc..0d85f64 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -47,7 +47,9 @@ struct kgem_bo {
struct list list;
struct list request;
+ struct list vma;
+ void *map;
struct kgem_request *rq;
struct drm_i915_gem_exec_object2 *exec;
@@ -103,6 +105,7 @@ struct kgem {
struct list flushing, active[16], inactive[16];
struct list partial;
struct list requests;
+ struct list vma_cache;
struct kgem_request *next_request;
uint16_t nbatch;
@@ -110,6 +113,7 @@ struct kgem {
uint16_t nexec;
uint16_t nreloc;
uint16_t nfence;
+ uint16_t vma_count;
uint32_t flush:1;
uint32_t sync:1;
@@ -314,6 +318,7 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
uint32_t delta);
void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot);
+void kgem_bo_unmap(struct kgem *kgem, struct kgem_bo *bo);
uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
diff --git a/src/sna/kgem_debug_gen3.c b/src/sna/kgem_debug_gen3.c
index d152b60..0238b73 100644
--- a/src/sna/kgem_debug_gen3.c
+++ b/src/sna/kgem_debug_gen3.c
@@ -102,7 +102,7 @@ static void gen3_update_vertex_buffer_addr(struct kgem *kgem,
ptr = (char *)base + kgem->reloc[i].delta;
if (state.vb.current)
- munmap(state.vb.base, state.vb.current->size);
+ kgem_bo_unmap(kgem, state.vb.current);
state.vb.current = bo;
state.vb.base = base;
@@ -1613,7 +1613,7 @@ int kgem_gen3_decode_3d(struct kgem *kgem, uint32_t offset)
void kgem_gen3_finish_state(struct kgem *kgem)
{
if (state.vb.current)
- munmap(state.vb.base, state.vb.current->size);
+ kgem_bo_unmap(kgem, state.vb.current);
memset(&state, 0, sizeof(state));
}
diff --git a/src/sna/kgem_debug_gen4.c b/src/sna/kgem_debug_gen4.c
index d736cbd..0f91d29 100644
--- a/src/sna/kgem_debug_gen4.c
+++ b/src/sna/kgem_debug_gen4.c
@@ -90,7 +90,7 @@ static void gen4_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
i = data[0] >> 27;
if (state.vb[i].current)
- munmap(state.vb[i].base, state.vb[i].current->size);
+ kgem_bo_unmap(kgem, state.vb[i].current);
state.vb[i].current = bo;
state.vb[i].base = base;
@@ -420,7 +420,7 @@ static void
put_reloc(struct kgem *kgem, struct reloc *r)
{
if (r->bo != NULL)
- munmap(r->base, r->bo->size);
+ kgem_bo_unmap(kgem, r->bo);
}
#endif
@@ -697,7 +697,7 @@ static void finish_vertex_buffers(struct kgem *kgem)
for (i = 0; i < ARRAY_SIZE(state.vb); i++)
if (state.vb[i].current)
- munmap(state.vb[i].base, state.vb[i].current->size);
+ kgem_bo_unmap(kgem, state.vb[i].current);
}
void kgem_gen4_finish_state(struct kgem *kgem)
@@ -705,7 +705,7 @@ void kgem_gen4_finish_state(struct kgem *kgem)
finish_vertex_buffers(kgem);
if (state.dynamic_state.current)
- munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+ kgem_bo_unmap(kgem, state.dynamic_state.current);
memset(&state, 0, sizeof(state));
}
diff --git a/src/sna/kgem_debug_gen5.c b/src/sna/kgem_debug_gen5.c
index 78ba443..c4f5df1 100644
--- a/src/sna/kgem_debug_gen5.c
+++ b/src/sna/kgem_debug_gen5.c
@@ -85,7 +85,7 @@ static void gen5_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
i = data[0] >> 27;
if (state.vb[i].current)
- munmap(state.vb[i].base, state.vb[i].current->size);
+ kgem_bo_unmap(kgem, state.vb[i].current);
state.vb[i].handle = reloc->target_handle;
state.vb[i].current = bo;
@@ -394,7 +394,7 @@ static void
put_reloc(struct kgem *kgem, struct reloc *r)
{
if (r->bo != NULL)
- munmap(r->base, r->bo->size);
+ kgem_bo_unmap(kgem, r->bo);
}
#endif
@@ -673,7 +673,7 @@ static void finish_vertex_buffers(struct kgem *kgem)
for (i = 0; i < ARRAY_SIZE(state.vb); i++)
if (state.vb[i].current)
- munmap(state.vb[i].base, state.vb[i].current->size);
+ kgem_bo_unmap(kgem, state.vb[i].current);
}
void kgem_gen5_finish_state(struct kgem *kgem)
@@ -681,7 +681,7 @@ void kgem_gen5_finish_state(struct kgem *kgem)
finish_vertex_buffers(kgem);
if (state.dynamic_state.current)
- munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+ kgem_bo_unmap(kgem,state. dynamic_state.current);
memset(&state, 0, sizeof(state));
}
diff --git a/src/sna/kgem_debug_gen6.c b/src/sna/kgem_debug_gen6.c
index d441b53..5bcd85d 100644
--- a/src/sna/kgem_debug_gen6.c
+++ b/src/sna/kgem_debug_gen6.c
@@ -89,7 +89,7 @@ static void gen6_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
i = data[0] >> 26;
if (state.vb[i].current)
- munmap(state.vb[i].base, state.vb[i].current->size);
+ kgem_bo_unmap(kgem, state.vb[i].current);
state.vb[i].current = bo;
state.vb[i].base = base;
@@ -130,7 +130,7 @@ static void gen6_update_dynamic_buffer(struct kgem *kgem, const uint32_t offset)
}
if (state.dynamic_state.current)
- munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+ kgem_bo_unmap(kgem, state.dynamic_state.current);
state.dynamic_state.current = bo;
state.dynamic_state.base = base;
@@ -306,7 +306,7 @@ static void finish_vertex_buffers(struct kgem *kgem)
for (i = 0; i < ARRAY_SIZE(state.vb); i++)
if (state.vb[i].current)
- munmap(state.vb[i].base, state.vb[i].current->size);
+ kgem_bo_unmap(kgem, state.vb[i].current);
}
static void finish_state(struct kgem *kgem)
@@ -314,7 +314,7 @@ static void finish_state(struct kgem *kgem)
finish_vertex_buffers(kgem);
if (state.dynamic_state.current)
- munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+ kgem_bo_unmap(kgem, state.dynamic_state.current);
memset(&state, 0, sizeof(state));
}
@@ -482,7 +482,7 @@ static void
put_reloc(struct kgem *kgem, struct reloc *r)
{
if (r->bo != NULL)
- munmap(r->base, r->bo->size);
+ kgem_bo_unmap(kgem, r->bo);
}
static const char *
diff --git a/src/sna/kgem_debug_gen7.c b/src/sna/kgem_debug_gen7.c
index f6a4975..a33a918 100644
--- a/src/sna/kgem_debug_gen7.c
+++ b/src/sna/kgem_debug_gen7.c
@@ -89,7 +89,7 @@ static void gen7_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
i = data[0] >> 26;
if (state.vb[i].current)
- munmap(state.vb[i].base, state.vb[i].current->size);
+ kgem_bo_unmap(kgem, state.vb[i].current);
state.vb[i].current = bo;
state.vb[i].base = base;
@@ -130,7 +130,7 @@ static void gen7_update_dynamic_buffer(struct kgem *kgem, const uint32_t offset)
}
if (state.dynamic_state.current)
- munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+ kgem_bo_unmap(kgem, state.dynamic_state.current);
state.dynamic_state.current = bo;
state.dynamic_state.base = base;
@@ -306,7 +306,7 @@ static void finish_vertex_buffers(struct kgem *kgem)
for (i = 0; i < ARRAY_SIZE(state.vb); i++)
if (state.vb[i].current)
- munmap(state.vb[i].base, state.vb[i].current->size);
+ kgem_bo_unmap(kgem, state.vb[i].current);
}
static void finish_state(struct kgem *kgem)
@@ -314,7 +314,7 @@ static void finish_state(struct kgem *kgem)
finish_vertex_buffers(kgem);
if (state.dynamic_state.current)
- munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+ kgem_bo_unmap(kgem, state.dynamic_state.current);
memset(&state, 0, sizeof(state));
}
@@ -482,7 +482,7 @@ static void
put_reloc(struct kgem *kgem, struct reloc *r)
{
if (r->bo != NULL)
- munmap(r->base, r->bo->size);
+ kgem_bo_unmap(kgem, r->bo);
}
static const char *
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index bb52770..44580be 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -187,9 +187,6 @@ static Bool sna_destroy_private(PixmapPtr pixmap, struct sna_pixmap *priv)
sna_damage_destroy(&priv->gpu_damage);
sna_damage_destroy(&priv->cpu_damage);
- if (priv->mapped)
- munmap(pixmap->devPrivate.ptr, priv->gpu_bo->size);
-
/* Always release the gpu bo back to the lower levels of caching */
if (priv->gpu_bo)
kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
@@ -1407,9 +1404,10 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
/* XXX performing the upload inplace is currently about 20x slower
* for putimage10 on gen6 -- mostly due to slow page faulting in kernel.
+ * So we try again with vma caching and only for pixmaps that will be
+ * immediately flushed...
*/
-#if 0
- if (priv->gpu_bo->rq == NULL &&
+ if (priv->flush &&
sna_put_image_upload_blt(drawable, gc, region,
x, y, w, h, bits, stride)) {
if (region_subsumes_drawable(region, &pixmap->drawable)) {
@@ -1425,7 +1423,6 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
return true;
}
-#endif
if (priv->cpu_bo)
kgem_bo_sync(&sna->kgem, priv->cpu_bo, true);
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index aba636c..767824f 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -80,8 +80,6 @@ static void read_boxes_inplace(struct kgem *kgem,
box->x2 - box->x1, box->y2 - box->y1);
box++;
} while (--n);
-
- munmap(src, bo->size);
}
void sna_read_boxes(struct sna *sna,
@@ -283,8 +281,6 @@ static void write_boxes_inplace(struct kgem *kgem,
box->x2 - box->x1, box->y2 - box->y1);
box++;
} while (--n);
-
- munmap(dst, bo->size);
}
void sna_write_boxes(struct sna *sna,
@@ -464,7 +460,6 @@ struct kgem_bo *sna_replace(struct sna *sna,
0, 0,
pixmap->drawable.width,
pixmap->drawable.height);
- munmap(dst, bo->size);
}
}
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index bd5ff14..d6d56f4 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -481,7 +481,6 @@ sna_video_copy_data(struct sna *sna,
else
sna_copy_packed_data(video, frame, buf, dst);
- munmap(dst, frame->bo->size);
return TRUE;
}