1 files changed, 52 insertions, 38 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 64f5904ac68..7673dd36eb9 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -35,7 +35,6 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "brw_hal.h"
 
 #define VS 0
 #define GS 1
@@ -43,7 +42,44 @@
 #define SF 3
 #define CS 4
 
-/* XXX: Are the min_entry_size numbers useful?
+/** @file brw_urb.c
+ *
+ * Manages the division of the URB space between the various fixed-function
+ * units.
+ *
+ * See the Thread Initiation Management section of the GEN4 B-Spec, and
+ * the individual *_STATE structures for restrictions on numbers of
+ * entries and threads.
+ */
+
+/*
+ * Generally, a unit requires a min_nr_entries based on how many entries
+ * it produces before the downstream unit gets unblocked and can use and
+ * dereference some of its handles.
+ *
+ * The SF unit preallocates a PUE at the start of thread dispatch, and only
+ * uses that one.  So it requires one entry per thread.
+ *
+ * For CLIP, the SF unit will hold the previous primitive while the
+ * next is getting assembled, meaning that linestrips require 3 CLIP VUEs
+ * (vertices) to ensure continued processing, trifans require 4, and tristrips
+ * require 5.  There can be 1 or 2 threads, and each has the same requirement.
+ *
+ * GS has the same requirement as CLIP, but it never handles tristrips,
+ * so we can lower the minimum to 4 for the POLYGONs (trifans) it produces.
+ * We only run it single-threaded.
+ *
+ * For VS, the number of entries may be 8, 12, 16, or 32 (or 64 on G4X).
+ * Each thread processes 2 preallocated VUEs (vertices) at a time, and they
+ * get streamed down as soon as threads processing earlier vertices get
+ * theirs accepted.
+ *
+ * Each unit will take the number of URB entries we give it (based on the
+ * entry size calculated in brw_vs_emit.c for VUEs, brw_sf_emit.c for PUEs,
+ * and brw_curbe.c for the CURBEs) and decide its maximum number of
+ * threads it can support based on that. in brw_*_state.c.
+ *
+ * XXX: Are the min_entry_size numbers useful?
  * XXX: Verify min_nr_entries, esp for VS.
  * XXX: Verify SF min_entry_size.
  */
@@ -53,9 +89,9 @@ static const struct {
    GLuint min_entry_size;
    GLuint max_entry_size;
 } limits[CS+1] = {
-   { 8, 32, 1, 5 },			/* vs */
+   { 16, 32, 1, 5 },			/* vs */
    { 4, 8,  1, 5 },			/* gs */
-   { 6, 8,  1, 5 },			/* clp */
+   { 5, 10,  1, 5 },			/* clp */
    { 1, 8,  1, 12 },		        /* sf */
    { 1, 4,  1, 32 }			/* cs */
 };
@@ -69,7 +105,7 @@ static GLboolean check_urb_layout( struct brw_context *brw )
    brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize;
    brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize;
 
-   return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= 256;
+   return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= URB_SIZES(brw);
 }
 
 /* Most minimal update, forces re-emit of URB fence packet after GS
@@ -81,20 +117,6 @@ static void recalculate_urb_fence( struct brw_context *brw )
    GLuint vsize = brw->vs.prog_data->urb_entry_size;
    GLuint sfsize = brw->sf.prog_data->urb_entry_size;
 
-   static GLboolean (*hal_recalculate_urb_fence) (struct brw_context *brw);
-   static GLboolean hal_tried;
-
-   if (!hal_tried)
-   {
-      hal_recalculate_urb_fence = brw_hal_find_symbol ("intel_hal_recalculate_urb_fence");
-      hal_tried = 1;
-   }
-   if (hal_recalculate_urb_fence)
-   {
-      if ((*hal_recalculate_urb_fence) (brw))
-	 return;
-   }
-   
    if (csize < limits[CS].min_entry_size)
       csize = limits[CS].min_entry_size;
 
@@ -107,9 +129,9 @@ static void recalculate_urb_fence( struct brw_context *brw )
    if (brw->urb.vsize < vsize ||
        brw->urb.sfsize < sfsize ||
        brw->urb.csize < csize ||
-       (brw->urb.constrained && (brw->urb.vsize > brw->urb.vsize ||
-				 brw->urb.sfsize > brw->urb.sfsize ||
-				 brw->urb.csize > brw->urb.csize))) {
+       (brw->urb.constrained && (brw->urb.vsize > vsize ||
+				 brw->urb.sfsize > sfsize ||
+				 brw->urb.csize > csize))) {
       
 
       brw->urb.csize = csize;
@@ -129,6 +151,10 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	 brw->urb.nr_sf_entries = limits[SF].min_nr_entries;	
 	 brw->urb.nr_cs_entries = limits[CS].min_nr_entries;	
 
+	 /* Mark us as operating with constrained nr_entries, so that next
+	  * time we recalculate we'll resize the fences in the hope of
+	  * escaping constrained mode and getting back to normal performance.
+	  */
 	 brw->urb.constrained = 1;
 	 
 	 if (!check_urb_layout(brw)) {
@@ -153,7 +179,7 @@ static void recalculate_urb_fence( struct brw_context *brw )
 		      brw->urb.clip_start,
 		      brw->urb.sf_start,
 		      brw->urb.cs_start, 
-		      256);
+		      URB_SIZES(brw));
       
       brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
    }
@@ -167,7 +193,7 @@ const struct brw_tracked_state brw_recalculate_urb_fence = {
       .cache = (CACHE_NEW_VS_PROG |
 		CACHE_NEW_SF_PROG)
    },
-   .update = recalculate_urb_fence
+   .prepare = recalculate_urb_fence
 };
 
 
@@ -191,25 +217,13 @@ void brw_upload_urb_fence(struct brw_context *brw)
    /* The ordering below is correct, not the layout in the
     * instruction.
     *
-    * There are 256 urb reg pairs in total.
+    * There are 256/384 urb reg pairs in total.
     */
    uf.bits0.vs_fence  = brw->urb.gs_start;
    uf.bits0.gs_fence  = brw->urb.clip_start; 
    uf.bits0.clp_fence = brw->urb.sf_start; 
    uf.bits1.sf_fence  = brw->urb.cs_start; 
-   uf.bits1.cs_fence  = 256;
+   uf.bits1.cs_fence  = URB_SIZES(brw);
 
    BRW_BATCH_STRUCT(brw, &uf);
 }
-
-
-#if 0
-const struct brw_tracked_state brw_urb_fence = {
-   .dirty = {
-      .mesa = 0,
-      .brw = BRW_NEW_URB_FENCE | BRW_NEW_PSP,
-      .cache = 0
-   },
-   .update = brw_upload_urb_fence
-};
-#endif