author     Chris Wilson <chris@chris-wilson.co.uk>   2011-12-10 22:45:25 +0000
committer  Chris Wilson <chris@chris-wilson.co.uk>   2011-12-11 00:52:54 +0000
commit     051a18063df075536cb1ac0dc4dfc3c1306ab74e (patch)
tree       c485da7f3349fe814b863482a54d642ca9a4a92b
parent     735a15208dd600eefa3090f344186df9cac0462d (diff)
sna: Implement a VMA cache
A VMA cache appears unavoidable thanks to compiz and an excruciatingly
slow GTT pagefault, though it looks like it will be ineffectual during
everyday usage. Compiz (and presumably other compositing managers)
appears to undo all of the pagefault minimisation, as demonstrated on
gen5 with large XPutImage. It also appears that the CPU-to-memory
bandwidth ratio plays a crucial role in determining whether going
straight to the GTT or through the CPU cache is a win - so there is no
trivial heuristic.
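For illustration only, a minimal sketch in plain C of the capped-LRU idea
behind the VMA cache (struct vma_cache, struct mapping and vma_cache_trim()
are hypothetical names, not the driver's kgem code): keep mappings alive on
a list and unmap the least-recently-used ones once a conservative cap is
exceeded, mirroring the MAX_VMA_CACHE loop added to kgem_bo_map() below.

/* Sketch only: a capped LRU cache of mmap()ed buffers. */
#include <stddef.h>
#include <sys/mman.h>

#define MAX_VMA_CACHE 128  /* conservative cap, well below the vma limit */

struct mapping {
	void *ptr;                   /* result of an earlier mmap, or NULL */
	size_t size;
	struct mapping *prev, *next; /* LRU links, most recent at tail */
};

struct vma_cache {
	struct mapping *head, *tail; /* least recent at head */
	int count;
};

/* Unmap the least-recently-used entries until we are under the cap. */
static void vma_cache_trim(struct vma_cache *cache)
{
	while (cache->count > MAX_VMA_CACHE) {
		struct mapping *old = cache->head;

		cache->head = old->next;
		if (cache->head)
			cache->head->prev = NULL;
		else
			cache->tail = NULL;

		munmap(old->ptr, old->size);
		old->ptr = NULL;
		old->prev = old->next = NULL;
		cache->count--;
	}
}

The patch applies the same policy to GTT mappings: kgem_bo_map() reuses
bo->map while it is still cached and only evicts once kgem->vma_count
passes MAX_VMA_CACHE.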
x11perf -putimage10 -putimage500 on i5-2467m:

             putimage10   putimage500
  Before:
    bare:     1,150,000         2,410
    compiz:     438,000         2,670
  After:
    bare:     1,190,000         2,730
    compiz:     437,000         2,690
  UXA:
    bare:       658,000         2,670
    compiz:     389,000         2,520

On i3-330m:

             putimage10   putimage500
  Before:
    bare:       537,000         1,080
    compiz:     263,000           398
  After:
    bare:       606,000         1,360
    compiz:     203,000           985
  UXA:
    bare:       294,000         1,070
    compiz:     197,000           821

On pnv:

             putimage10   putimage500
  Before:
    bare:       179,000           213
    compiz:     106,000           123
  After:
    bare:       181,000           246
    compiz:     103,000           197
  UXA:
    bare:       114,000           312
    compiz:      75,700           191
Reported-by: Michael Larabel <Michael@phoronix.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-rw-r--r--  src/sna/kgem.c            | 90
-rw-r--r--  src/sna/kgem.h            |  5
-rw-r--r--  src/sna/kgem_debug_gen3.c |  4
-rw-r--r--  src/sna/kgem_debug_gen4.c |  8
-rw-r--r--  src/sna/kgem_debug_gen5.c |  8
-rw-r--r--  src/sna/kgem_debug_gen6.c | 10
-rw-r--r--  src/sna/kgem_debug_gen7.c | 10
-rw-r--r--  src/sna/sna_accel.c       |  9
-rw-r--r--  src/sna/sna_io.c          |  5
-rw-r--r--  src/sna/sna_video.c       |  1
10 files changed, 107 insertions, 43 deletions
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 68a1831b..3609a6f3 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -45,6 +45,12 @@ static inline void list_move(struct list *list, struct list *head)
 	list_add(list, head);
 }
 
+static inline void list_move_tail(struct list *list, struct list *head)
+{
+	__list_del(list->prev, list->next);
+	list_add_tail(list, head);
+}
+
 static inline void list_replace(struct list *old,
 				struct list *new)
 {
@@ -75,6 +81,7 @@ static inline void list_replace(struct list *old,
 #endif
 
 #define PAGE_SIZE 4096
+#define MAX_VMA_CACHE 128
 
 struct kgem_partial_bo {
 	struct kgem_bo base;
@@ -125,7 +132,6 @@ static int gem_set_tiling(int fd, uint32_t handle, int tiling, int stride)
 static void *gem_mmap(int fd, uint32_t handle, int size, int prot)
 {
 	struct drm_i915_gem_mmap_gtt mmap_arg;
-	struct drm_i915_gem_set_domain set_domain;
 	void *ptr;
 
 	DBG(("%s(handle=%d, size=%d, prot=%s)\n", __FUNCTION__,
@@ -144,12 +150,6 @@ static void *gem_mmap(int fd, uint32_t handle, int size, int prot)
 		ptr = NULL;
 	}
 
-	VG_CLEAR(set_domain);
-	set_domain.handle = handle;
-	set_domain.read_domains = I915_GEM_DOMAIN_GTT;
-	set_domain.write_domain = prot & PROT_WRITE ? I915_GEM_DOMAIN_GTT : 0;
-	drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
-
 	return ptr;
 }
 
@@ -274,6 +274,7 @@ static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo,
 	bo->cpu_write = true;
 	list_init(&bo->request);
 	list_init(&bo->list);
+	list_init(&bo->vma);
 
 	return bo;
 }
@@ -352,6 +353,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
 	list_init(&kgem->partial);
 	list_init(&kgem->requests);
 	list_init(&kgem->flushing);
+	list_init(&kgem->vma_cache);
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++)
 		list_init(&kgem->inactive[i]);
 	for (i = 0; i < ARRAY_SIZE(kgem->active); i++)
@@ -594,6 +596,12 @@ static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo)
 		b = next;
 	}
 
+	if (bo->map) {
+		munmap(bo->map, bo->size);
+		list_del(&bo->vma);
+		kgem->vma_count--;
+	}
+
 	list_del(&bo->list);
 	list_del(&bo->request);
 	gem_close(kgem->fd, bo->handle);
@@ -620,6 +628,7 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 			base->reusable = true;
 			list_init(&base->list);
 			list_replace(&bo->request, &base->request);
+			list_replace(&bo->vma, &base->vma);
 			free(bo);
 			bo = base;
 		}
@@ -1814,19 +1823,76 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot)
 {
 	void *ptr;
 
-	ptr = gem_mmap(kgem->fd, bo->handle, bo->size, prot);
-	if (ptr == NULL)
-		return NULL;
+	ptr = bo->map;
+	if (ptr == NULL) {
+		/* vma are limited on a per-process basis to around 64k.
+		 * This includes all malloc arenas as well as other file
+		 * mappings. In order to be fair and not hog the cache,
+		 * and more importantly not to exhaust that limit and to
+		 * start failing mappings, we keep our own number of open
+		 * vma to within a conservative value.
+		 */
+		while (kgem->vma_count > MAX_VMA_CACHE) {
+			struct kgem_bo *old;
+
+			old = list_first_entry(&kgem->vma_cache,
+					       struct kgem_bo,
+					       vma);
+			DBG(("%s: discarding vma cache for %d\n",
+			     __FUNCTION__, old->handle));
+			munmap(old->map, old->size);
+			old->map = NULL;
+			list_del(&old->vma);
+			kgem->vma_count--;
+		}
+
+		ptr = gem_mmap(kgem->fd, bo->handle, bo->size,
+			       PROT_READ | PROT_WRITE);
+		if (ptr == NULL)
+			return NULL;
+
+		/* Cache this mapping to avoid the overhead of an
+		 * excruciatingly slow GTT pagefault. This is more an
+		 * issue with compositing managers which need to frequently
+		 * flush CPU damage to their GPU bo.
+		 */
+		bo->map = ptr;
+		kgem->vma_count++;
+
+		DBG(("%s: caching vma for %d\n",
+		     __FUNCTION__, bo->handle));
+	}
+
+	if (bo->needs_flush | bo->gpu) {
+		struct drm_i915_gem_set_domain set_domain;
+
+		VG_CLEAR(set_domain);
+		set_domain.handle = bo->handle;
+		set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+		set_domain.write_domain = prot & PROT_WRITE ? I915_GEM_DOMAIN_GTT : 0;
+		drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
 
-	if (prot & PROT_WRITE) {
 		bo->needs_flush = false;
 		if (bo->gpu)
 			kgem_retire(kgem);
 	}
 
+	list_move_tail(&bo->vma, &kgem->vma_cache);
+
 	return ptr;
 }
 
+void kgem_bo_unmap(struct kgem *kgem, struct kgem_bo *bo)
+{
+	assert(bo->map);
+
+	munmap(bo->map, bo->size);
+	bo->map = NULL;
+
+	list_del(&bo->vma);
+	kgem->vma_count--;
+}
+
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
 {
 	struct drm_gem_flink flink;
@@ -2151,6 +2217,8 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
 					     &bo->base.request);
 		else
 			list_init(&bo->base.request);
+		list_replace(&old->vma,
+			     &bo->base.vma);
 		free(old);
 		bo->base.refcnt = 1;
 	} else {
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index e9e7cdcb..0d85f643 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -47,7 +47,9 @@ struct kgem_bo {
 	struct list list;
 	struct list request;
+	struct list vma;
 
+	void *map;
 	struct kgem_request *rq;
 	struct drm_i915_gem_exec_object2 *exec;
 
@@ -103,6 +105,7 @@ struct kgem {
 	struct list flushing, active[16], inactive[16];
 	struct list partial;
 	struct list requests;
+	struct list vma_cache;
 	struct kgem_request *next_request;
 
 	uint16_t nbatch;
@@ -110,6 +113,7 @@ struct kgem {
 	uint16_t nexec;
 	uint16_t nreloc;
 	uint16_t nfence;
+	uint16_t vma_count;
 
 	uint32_t flush:1;
 	uint32_t sync:1;
@@ -314,6 +318,7 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
 			uint32_t delta);
 
 void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot);
+void kgem_bo_unmap(struct kgem *kgem, struct kgem_bo *bo);
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
 
 Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
diff --git a/src/sna/kgem_debug_gen3.c b/src/sna/kgem_debug_gen3.c
index d152b608..0238b734 100644
--- a/src/sna/kgem_debug_gen3.c
+++ b/src/sna/kgem_debug_gen3.c
@@ -102,7 +102,7 @@ static void gen3_update_vertex_buffer_addr(struct kgem *kgem,
 	ptr = (char *)base + kgem->reloc[i].delta;
 
 	if (state.vb.current)
-		munmap(state.vb.base, state.vb.current->size);
+		kgem_bo_unmap(kgem, state.vb.current);
 
 	state.vb.current = bo;
 	state.vb.base = base;
@@ -1613,7 +1613,7 @@ int kgem_gen3_decode_3d(struct kgem *kgem, uint32_t offset)
 void kgem_gen3_finish_state(struct kgem *kgem)
 {
 	if (state.vb.current)
-		munmap(state.vb.base, state.vb.current->size);
+		kgem_bo_unmap(kgem, state.vb.current);
 
 	memset(&state, 0, sizeof(state));
 }
diff --git a/src/sna/kgem_debug_gen4.c b/src/sna/kgem_debug_gen4.c
index d736cbd9..0f91d29a 100644
--- a/src/sna/kgem_debug_gen4.c
+++ b/src/sna/kgem_debug_gen4.c
@@ -90,7 +90,7 @@ static void gen4_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	i = data[0] >> 27;
 
 	if (state.vb[i].current)
-		munmap(state.vb[i].base, state.vb[i].current->size);
+		kgem_bo_unmap(kgem, state.vb[i].current);
 
 	state.vb[i].current = bo;
 	state.vb[i].base = base;
@@ -420,7 +420,7 @@ static void put_reloc(struct kgem *kgem, struct reloc *r)
 {
 	if (r->bo != NULL)
-		munmap(r->base, r->bo->size);
+		kgem_bo_unmap(kgem, r->bo);
 }
 #endif
@@ -697,7 +697,7 @@ static void finish_vertex_buffers(struct kgem *kgem)
 	for (i = 0; i < ARRAY_SIZE(state.vb); i++)
 		if (state.vb[i].current)
-			munmap(state.vb[i].base, state.vb[i].current->size);
+			kgem_bo_unmap(kgem, state.vb[i].current);
 }
 
 void kgem_gen4_finish_state(struct kgem *kgem)
@@ -705,7 +705,7 @@ void kgem_gen4_finish_state(struct kgem *kgem)
 	finish_vertex_buffers(kgem);
 
 	if (state.dynamic_state.current)
-		munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+		kgem_bo_unmap(kgem, state.dynamic_state.current);
 
 	memset(&state, 0, sizeof(state));
 }
diff --git a/src/sna/kgem_debug_gen5.c b/src/sna/kgem_debug_gen5.c
index 78ba4432..c4f5df15 100644
--- a/src/sna/kgem_debug_gen5.c
+++ b/src/sna/kgem_debug_gen5.c
@@ -85,7 +85,7 @@ static void gen5_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	i = data[0] >> 27;
 
 	if (state.vb[i].current)
-		munmap(state.vb[i].base, state.vb[i].current->size);
+		kgem_bo_unmap(kgem, state.vb[i].current);
 
 	state.vb[i].handle = reloc->target_handle;
 	state.vb[i].current = bo;
@@ -394,7 +394,7 @@ static void put_reloc(struct kgem *kgem, struct reloc *r)
 {
 	if (r->bo != NULL)
-		munmap(r->base, r->bo->size);
+		kgem_bo_unmap(kgem, r->bo);
 }
 #endif
@@ -673,7 +673,7 @@ static void finish_vertex_buffers(struct kgem *kgem)
 	for (i = 0; i < ARRAY_SIZE(state.vb); i++)
 		if (state.vb[i].current)
-			munmap(state.vb[i].base, state.vb[i].current->size);
+			kgem_bo_unmap(kgem, state.vb[i].current);
 }
 
 void kgem_gen5_finish_state(struct kgem *kgem)
@@ -681,7 +681,7 @@ void kgem_gen5_finish_state(struct kgem *kgem)
 	finish_vertex_buffers(kgem);
 
 	if (state.dynamic_state.current)
-		munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+		kgem_bo_unmap(kgem, state.dynamic_state.current);
 
 	memset(&state, 0, sizeof(state));
 }
diff --git a/src/sna/kgem_debug_gen6.c b/src/sna/kgem_debug_gen6.c
index d441b536..5bcd85dc 100644
--- a/src/sna/kgem_debug_gen6.c
+++ b/src/sna/kgem_debug_gen6.c
@@ -89,7 +89,7 @@ static void gen6_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	i = data[0] >> 26;
 
 	if (state.vb[i].current)
-		munmap(state.vb[i].base, state.vb[i].current->size);
+		kgem_bo_unmap(kgem, state.vb[i].current);
 
 	state.vb[i].current = bo;
 	state.vb[i].base = base;
@@ -130,7 +130,7 @@ static void gen6_update_dynamic_buffer(struct kgem *kgem, const uint32_t offset)
 	}
 
 	if (state.dynamic_state.current)
-		munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+		kgem_bo_unmap(kgem, state.dynamic_state.current);
 
 	state.dynamic_state.current = bo;
 	state.dynamic_state.base = base;
@@ -306,7 +306,7 @@ static void finish_vertex_buffers(struct kgem *kgem)
 	for (i = 0; i < ARRAY_SIZE(state.vb); i++)
 		if (state.vb[i].current)
-			munmap(state.vb[i].base, state.vb[i].current->size);
+			kgem_bo_unmap(kgem, state.vb[i].current);
 }
 
 static void finish_state(struct kgem *kgem)
@@ -314,7 +314,7 @@ static void finish_state(struct kgem *kgem)
 	finish_vertex_buffers(kgem);
 
 	if (state.dynamic_state.current)
-		munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+		kgem_bo_unmap(kgem, state.dynamic_state.current);
 
 	memset(&state, 0, sizeof(state));
 }
@@ -482,7 +482,7 @@ static void put_reloc(struct kgem *kgem, struct reloc *r)
 {
 	if (r->bo != NULL)
-		munmap(r->base, r->bo->size);
+		kgem_bo_unmap(kgem, r->bo);
 }
 
 static const char *
diff --git a/src/sna/kgem_debug_gen7.c b/src/sna/kgem_debug_gen7.c
index f6a49752..a33a918d 100644
--- a/src/sna/kgem_debug_gen7.c
+++ b/src/sna/kgem_debug_gen7.c
@@ -89,7 +89,7 @@ static void gen7_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
 	i = data[0] >> 26;
 
 	if (state.vb[i].current)
-		munmap(state.vb[i].base, state.vb[i].current->size);
+		kgem_bo_unmap(kgem, state.vb[i].current);
 
 	state.vb[i].current = bo;
 	state.vb[i].base = base;
@@ -130,7 +130,7 @@ static void gen7_update_dynamic_buffer(struct kgem *kgem, const uint32_t offset)
 	}
 
 	if (state.dynamic_state.current)
-		munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+		kgem_bo_unmap(kgem, state.dynamic_state.current);
 
 	state.dynamic_state.current = bo;
 	state.dynamic_state.base = base;
@@ -306,7 +306,7 @@ static void finish_vertex_buffers(struct kgem *kgem)
 	for (i = 0; i < ARRAY_SIZE(state.vb); i++)
 		if (state.vb[i].current)
-			munmap(state.vb[i].base, state.vb[i].current->size);
+			kgem_bo_unmap(kgem, state.vb[i].current);
 }
 
 static void finish_state(struct kgem *kgem)
@@ -314,7 +314,7 @@ static void finish_state(struct kgem *kgem)
 	finish_vertex_buffers(kgem);
 
 	if (state.dynamic_state.current)
-		munmap(state.dynamic_state.base, state.dynamic_state.current->size);
+		kgem_bo_unmap(kgem, state.dynamic_state.current);
 
 	memset(&state, 0, sizeof(state));
 }
@@ -482,7 +482,7 @@ static void put_reloc(struct kgem *kgem, struct reloc *r)
 {
 	if (r->bo != NULL)
-		munmap(r->base, r->bo->size);
+		kgem_bo_unmap(kgem, r->bo);
 }
 
 static const char *
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index bb52770b..44580be1 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -187,9 +187,6 @@ static Bool sna_destroy_private(PixmapPtr pixmap, struct sna_pixmap *priv)
 	sna_damage_destroy(&priv->gpu_damage);
 	sna_damage_destroy(&priv->cpu_damage);
 
-	if (priv->mapped)
-		munmap(pixmap->devPrivate.ptr, priv->gpu_bo->size);
-
 	/* Always release the gpu bo back to the lower levels of caching */
 	if (priv->gpu_bo)
 		kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
@@ -1407,9 +1404,10 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 
 	/* XXX performing the upload inplace is currently about 20x slower
 	 * for putimage10 on gen6 -- mostly due to slow page faulting in kernel.
+	 * So we try again with vma caching and only for pixmaps that will be
+	 * immediately flushed...
 	 */
-#if 0
-	if (priv->gpu_bo->rq == NULL &&
+	if (priv->flush &&
 	    sna_put_image_upload_blt(drawable, gc, region,
 				     x, y, w, h, bits, stride)) {
 		if (region_subsumes_drawable(region, &pixmap->drawable)) {
@@ -1425,7 +1423,6 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 		return true;
 	}
-#endif
 
 	if (priv->cpu_bo)
 		kgem_bo_sync(&sna->kgem, priv->cpu_bo, true);
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index aba636cc..767824fa 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -80,8 +80,6 @@ static void read_boxes_inplace(struct kgem *kgem,
 			   box->x2 - box->x1, box->y2 - box->y1);
 		box++;
 	} while (--n);
-
-	munmap(src, bo->size);
 }
 
 void sna_read_boxes(struct sna *sna,
@@ -283,8 +281,6 @@ static void write_boxes_inplace(struct kgem *kgem,
 			   box->x2 - box->x1, box->y2 - box->y1);
 		box++;
 	} while (--n);
-
-	munmap(dst, bo->size);
 }
 
 void sna_write_boxes(struct sna *sna,
@@ -464,7 +460,6 @@ struct kgem_bo *sna_replace(struct sna *sna,
 			   0, 0,
 			   pixmap->drawable.width,
 			   pixmap->drawable.height);
-		munmap(dst, bo->size);
 	}
 }
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index bd5ff14a..d6d56f40 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -481,7 +481,6 @@ sna_video_copy_data(struct sna *sna,
 	else
 		sna_copy_packed_data(video, frame, buf, dst);
 
-	munmap(dst, frame->bo->size);
 	return TRUE;
 }
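Background note, not part of the patch: the "around 64k" per-process vma limit
mentioned in the kgem_bo_map() comment corresponds to the Linux
vm.max_map_count sysctl, which defaults to 65530. A small standalone C snippet
(illustrative only) to inspect the limit on a running system:

/* Read the per-process mapping limit that motivates MAX_VMA_CACHE. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/max_map_count", "r");
	long limit = -1;

	if (f) {
		if (fscanf(f, "%ld", &limit) != 1)
			limit = -1;
		fclose(f);
	}

	printf("vm.max_map_count = %ld\n", limit);
	return 0;
}

Capping the cache at MAX_VMA_CACHE (128) leaves almost all of that budget for
malloc arenas and other file mappings, as the comment in kgem_bo_map() intends.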