import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
sns.set(style='whitegrid')
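# Load the Kaggle Health Insurance Cross-Sell Prediction train and test files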
train=pd.read_csv('../input/health-insurance-cross-sell-prediction/train.csv')
test=pd.read_csv('../input/health-insurance-cross-sell-prediction/test.csv')
test.head()
train.head()
train.shape
train.isnull().sum()
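# Column groups used in the exploratory analysis below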
numerical_columns=['Age', 'Region_Code','Annual_Premium','Vintage']
categorical_columns=['Gender','Driving_License','Previously_Insured','Vehicle_Age','Vehicle_Damage','Response']
train[numerical_columns].describe()
train
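# Target distribution: the two Response classes are imbalanced, as the counts below show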
sns.countplot(x='Response', data=train)
train.Response.value_counts()
sns.histplot(train.Age, kde=True)
sns.boxplot(y='Age', data=train, palette='Accent')
sns.scatterplot(x=train['Age'], y=train['Annual_Premium'])
sns.countplot(x='Gender', data=train)
df=train.groupby(['Gender','Response'])['id'].count().to_frame().rename(columns={'id':'count'}).reset_index()

g = sns.catplot(x="Gender", y="count",col="Response",
                data=df, kind="bar",
                height=4, aspect=.7);
# Number of licence holders per gender (Driving_License is a 0/1 flag, so sum it)
df=train.groupby(['Gender'])['Driving_License'].sum().to_frame().reset_index()
df
sns.catplot(x="Gender", y="Driving_License",
                data=df, kind="bar");
sns.countplot(x='Previously_Insured', data=train)
sns.countplot(x='Vehicle_Age', data=train)
df=train.groupby(['Vehicle_Age','Response'])['id'].count().to_frame().rename(columns={'id':'count'}).reset_index()
df
g = sns.catplot(x="Vehicle_Age", y="count",col="Response",
                data=df, kind="bar",
                height=4, aspect=.7);
sns.countplot(x='Vehicle_Damage', data=train)
df=train.groupby(['Vehicle_Damage','Response'])['id'].count().to_frame().rename(columns={'id':'count'}).reset_index()
g = sns.catplot(x="Vehicle_Damage", y="count",col="Response",
                data=df, kind="bar",
                height=4, aspect=.7);
sns.histplot(train.Annual_Premium, kde=True)
sns.boxplot(y='Annual_Premium', data=train, palette='Accent')
sns.histplot(train.Vintage, kde=True)
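# Feature lists for modelling; the cat_feat names refer to the dummy columns created below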
num_feat = ['Age','Vintage']
cat_feat = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age_lt_1_Year','Vehicle_Age_gt_2_Years','Vehicle_Damage_Yes','Region_Code','Policy_Sales_Channel']
train['Gender'] = train['Gender'].map( {'Female': 0, 'Male': 1} ).astype(int)
train=pd.get_dummies(train,drop_first=True)
train=train.rename(columns={"Vehicle_Age_< 1 Year": "Vehicle_Age_lt_1_Year", "Vehicle_Age_> 2 Years": "Vehicle_Age_gt_2_Years"})
train['Vehicle_Age_lt_1_Year']=train['Vehicle_Age_lt_1_Year'].astype('int')
train['Vehicle_Age_gt_2_Years']=train['Vehicle_Age_gt_2_Years'].astype('int')
train['Vehicle_Damage_Yes']=train['Vehicle_Damage_Yes'].astype('int')
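# Scale the numeric features: standardize Age and Vintage, min-max scale Annual_Premium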
from sklearn.preprocessing import StandardScaler, MinMaxScaler
ss = StandardScaler()
train[num_feat] = ss.fit_transform(train[num_feat])


mm = MinMaxScaler()
train[['Annual_Premium']] = mm.fit_transform(train[['Annual_Premium']])
train=train.drop('id',axis=1)
for column in cat_feat:
    train[column] = train[column].astype('str')
train
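# Apply the same preprocessing to the test set, reusing the scalers fitted on train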
test['Gender'] = test['Gender'].map( {'Female': 0, 'Male': 1} ).astype(int)
test=pd.get_dummies(test,drop_first=True)
test=test.rename(columns={"Vehicle_Age_< 1 Year": "Vehicle_Age_lt_1_Year", "Vehicle_Age_> 2 Years": "Vehicle_Age_gt_2_Years"})
test['Vehicle_Age_lt_1_Year']=test['Vehicle_Age_lt_1_Year'].astype('int')
test['Vehicle_Age_gt_2_Years']=test['Vehicle_Age_gt_2_Years'].astype('int')
test['Vehicle_Damage_Yes']=test['Vehicle_Damage_Yes'].astype('int')
# Reuse the scalers that were fitted on the training data (do not refit on test)
test[num_feat] = ss.transform(test[num_feat])
test[['Annual_Premium']] = mm.transform(test[['Annual_Premium']])
for column in cat_feat:
    test[column] = test[column].astype('str')
from sklearn.model_selection import train_test_split

train_target=train['Response']
train=train.drop(['Response'], axis = 1)
x_train,x_test,y_train,y_test = train_test_split(train,train_target, random_state = 0)
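# RandomOverSampler is imported at the top but never applied. A minimal, optional
# sketch of how it could balance the Response classes in the training split; the
# resampled frames (illustrative names) are not used by the models below.
ros = RandomOverSampler(random_state=0)
x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)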
test_id = test.id
test = test.drop('id', axis=1)
x_train.columns
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from scipy.stats import randint
import pickle
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score,accuracy_score,confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report 
%matplotlib inline
x_train.dtypes
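# Randomized search over a small random-forest grid, evaluated with 4-fold cross-validation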
random_search = {'criterion': ['entropy', 'gini'],
               'max_depth': [2,3,4,5,6,7,10],
               'min_samples_leaf': [4, 6, 8],
               'min_samples_split': [5, 7,10],
               'n_estimators': [300]}

clf = RandomForestClassifier()
model = RandomizedSearchCV(estimator = clf, param_distributions = random_search, n_iter = 10, 
                               cv = 4, verbose= 1, random_state= 101, n_jobs = -1)
model.fit(x_train,y_train)
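# Save the fitted search to disk with pickle (and reload it as rf_load)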
filename = 'rf_model.sav'
pickle.dump(model, open(filename, 'wb'))
filename = 'rf_model.sav'
rf_load = pickle.load(open(filename, 'rb'))
y_pred=model.predict(x_test)
print (classification_report(y_test, y_pred))
y_score = model.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)

plt.title('Random Forest ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
roc_auc_score(y_test, y_score)
# Cast the categorical columns back to integers (the values are strings like '28.0'),
# and give `test` the same dtypes so later predictions on it work
for column in cat_feat:
    x_train[column] = x_train[column].astype('float').astype('int')
    x_test[column] = x_test[column].astype('float').astype('int')
    test[column] = test[column].astype('float').astype('int')
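# Hyperopt search space for XGBoost; quniform draws are floats, so they are cast to int where needed in the objective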
space={ 'max_depth': hp.quniform("max_depth", 3,18,1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 300,
        'seed': 0
      }
def objective(space):
    clf = xgb.XGBClassifier(
        n_estimators=space['n_estimators'], max_depth=int(space['max_depth']),
        gamma=space['gamma'], reg_alpha=int(space['reg_alpha']),
        reg_lambda=space['reg_lambda'], min_child_weight=int(space['min_child_weight']),
        colsample_bytree=space['colsample_bytree'])

    evaluation = [(x_train, y_train), (x_test, y_test)]

    clf.fit(x_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10, verbose=False)

    # Score the candidate that was just fitted (not the earlier random-forest model)
    pred = clf.predict(x_test)
    y_score = clf.predict_proba(x_test)[:, 1]
    accuracy = accuracy_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, y_score)
    print("ROC-AUC Score:", roc_auc)
    print("SCORE:", accuracy)
    # hyperopt minimises the loss, so return the negative ROC-AUC
    return {'loss': -roc_auc, 'status': STATUS_OK}

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)
print("The best hyperparameters are : ","\n")
print(best_hyperparams)
# Refit XGBoost with the tuned values; max_depth is taken from best_hyperparams, the
# other numbers are kept from the original notebook (presumably an earlier fmin run)
xgb_model = xgb.XGBClassifier(n_estimators=space['n_estimators'],
                              max_depth=int(best_hyperparams['max_depth']),
                              gamma=4.0388607178326605, reg_lambda=0.26955899476862166,
                              reg_alpha=66.0, min_child_weight=4.0,
                              colsample_bytree=0.8844758548525424)
xgb_model.fit(x_train,y_train)
filename = 'xgboost_model.sav'
pickle.dump(xgb_model, open(filename, 'wb'))
y_score = xgb_model.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)

plt.title('XGBoost ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
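# LightGBM: tune hyperparameters with hyperopt, scoring mean ROC-AUC over a 2-fold CV of the training split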
random_state=42
n_iter=50
num_folds=2
kf = KFold(n_splits=num_folds, random_state=random_state,shuffle=True)
def gb_mse_cv(params, random_state=random_state, cv=kf, X=x_train, y=y_train):
    # the function gets a set of variable parameters in "params"
    params = {'n_estimators': int(params['n_estimators']),
              'max_depth': int(params['max_depth']),
              'learning_rate': params['learning_rate'],
              'min_split_gain': params['min_split_gain'],
              'reg_alpha': params['reg_alpha'],
              'reg_lambda': params['reg_lambda'],
              'colsample_bytree': params['colsample_bytree'],
              'min_child_weight': params['min_child_weight']
             }

    # use these params to create a new LGBMClassifier
    model = lgb.LGBMClassifier(random_state=42, **params)

    # cross-validate with the KFold splitter defined above; hyperopt minimises,
    # so return the negative mean ROC-AUC
    score = -cross_val_score(model, X, y, cv=cv, scoring="roc_auc", n_jobs=-1).mean()

    return score
%%time

# possible values of parameters
space={'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
       'max_depth': hp.quniform('max_depth', 2, 8, 1),
       'learning_rate': hp.loguniform('learning_rate', -4, -1),
       # LightGBM's counterpart of XGBoost's gamma is min_split_gain
       'min_split_gain': hp.quniform('min_split_gain', 0.1, 0.5, 0.1),
       'reg_alpha': hp.quniform('reg_alpha', 1.1, 1.5, 0.1),
       'reg_lambda': hp.uniform('reg_lambda', 1.1, 1.5),
       'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 0.5),
       'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
      }

# trials will contain logging information
trials = Trials()

best=fmin(fn=gb_mse_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
         )

# computing the score on the test set
model = lgb.LGBMClassifier(random_state=random_state, n_estimators=int(best['n_estimators']),
                           max_depth=int(best['max_depth']), learning_rate=best['learning_rate'],
                           min_split_gain=best['min_split_gain'], reg_alpha=best['reg_alpha'],
                           reg_lambda=best['reg_lambda'], colsample_bytree=best['colsample_bytree'],
                           min_child_weight=best['min_child_weight'])
model.fit(x_train,y_train)

preds = [pred[1] for pred in model.predict_proba(x_test)]
score = roc_auc_score(y_test, preds, average = 'weighted')

best
print("auc-roc score on Test data",score)
y_score = model.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)

plt.title('LGBM ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
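# CatBoost: cat_features tells the model which columns to treat as categorical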
#X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(X_cat, Y_cat, test_size = 0.22, random_state = 22, stratify = Y_cat, shuffle = True)

cat_model = CatBoostClassifier()
cat_model = cat_model.fit(x_train, y_train, cat_features = cat_feat, eval_set = (x_test, y_test), early_stopping_rounds = 10, verbose = 100)

predictions = [pred[1] for pred in cat_model.predict_proba(x_test)]
print('Validation ROC AUC Score:', roc_auc_score(y_test, predictions, average = 'weighted'))
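# Build the CatBoost submission from the already-preprocessed test set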
test
Preds = [pred[1] for pred in cat_model.predict_proba(test)]
submission = pd.DataFrame(data = {'id': test_id, 'Response': Preds})
submission.to_csv('vehicle_insurance_catboost.csv', index = False)
submission.head()
# `test` was already preprocessed above and its ids saved in `test_id` before the
# column was dropped, so the LightGBM submission can reuse them directly.
Preds = [pred[1] for pred in model.predict_proba(test)]
submission = pd.DataFrame(data = {'id': test_id, 'Response': Preds})
submission.to_csv('vehicle_insurance.csv', index = False)
submission.head()