diff options
Diffstat (limited to 'src/mesa/drivers/dri/i965/brw_urb.c')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_urb.c | 90 |
1 files changed, 52 insertions, 38 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c index 64f5904ac68..7673dd36eb9 100644 --- a/src/mesa/drivers/dri/i965/brw_urb.c +++ b/src/mesa/drivers/dri/i965/brw_urb.c @@ -35,7 +35,6 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "brw_hal.h" #define VS 0 #define GS 1 @@ -43,7 +42,44 @@ #define SF 3 #define CS 4 -/* XXX: Are the min_entry_size numbers useful? +/** @file brw_urb.c + * + * Manages the division of the URB space between the various fixed-function + * units. + * + * See the Thread Initiation Management section of the GEN4 B-Spec, and + * the individual *_STATE structures for restrictions on numbers of + * entries and threads. + */ + +/* + * Generally, a unit requires a min_nr_entries based on how many entries + * it produces before the downstream unit gets unblocked and can use and + * dereference some of its handles. + * + * The SF unit preallocates a PUE at the start of thread dispatch, and only + * uses that one. So it requires one entry per thread. + * + * For CLIP, the SF unit will hold the previous primitive while the + * next is getting assembled, meaning that linestrips require 3 CLIP VUEs + * (vertices) to ensure continued processing, trifans require 4, and tristrips + * require 5. There can be 1 or 2 threads, and each has the same requirement. + * + * GS has the same requirement as CLIP, but it never handles tristrips, + * so we can lower the minimum to 4 for the POLYGONs (trifans) it produces. + * We only run it single-threaded. + * + * For VS, the number of entries may be 8, 12, 16, or 32 (or 64 on G4X). + * Each thread processes 2 preallocated VUEs (vertices) at a time, and they + * get streamed down as soon as threads processing earlier vertices get + * theirs accepted. 
+ * + * Each unit will take the number of URB entries we give it (based on the + * entry size calculated in brw_vs_emit.c for VUEs, brw_sf_emit.c for PUEs, + * and brw_curbe.c for the CURBEs) and decide its maximum number of + * threads it can support based on that, in brw_*_state.c. + * + * XXX: Are the min_entry_size numbers useful? * XXX: Verify min_nr_entries, esp for VS. * XXX: Verify SF min_entry_size. */ @@ -53,9 +89,9 @@ static const struct { GLuint min_entry_size; GLuint max_entry_size; } limits[CS+1] = { - { 8, 32, 1, 5 }, /* vs */ + { 16, 32, 1, 5 }, /* vs */ { 4, 8, 1, 5 }, /* gs */ - { 6, 8, 1, 5 }, /* clp */ + { 5, 10, 1, 5 }, /* clp */ { 1, 8, 1, 12 }, /* sf */ { 1, 4, 1, 32 } /* cs */ }; @@ -69,7 +105,7 @@ static GLboolean check_urb_layout( struct brw_context *brw ) brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize; brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize; - return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= 256; + return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= URB_SIZES(brw); } /* Most minimal update, forces re-emit of URB fence packet after GS @@ -81,20 +117,6 @@ static void recalculate_urb_fence( struct brw_context *brw ) GLuint vsize = brw->vs.prog_data->urb_entry_size; GLuint sfsize = brw->sf.prog_data->urb_entry_size; - static GLboolean (*hal_recalculate_urb_fence) (struct brw_context *brw); - static GLboolean hal_tried; - - if (!hal_tried) - { - hal_recalculate_urb_fence = brw_hal_find_symbol ("intel_hal_recalculate_urb_fence"); - hal_tried = 1; - } - if (hal_recalculate_urb_fence) - { - if ((*hal_recalculate_urb_fence) (brw)) - return; - } - if (csize < limits[CS].min_entry_size) csize = limits[CS].min_entry_size; @@ -107,9 +129,9 @@ static void recalculate_urb_fence( struct brw_context *brw ) if (brw->urb.vsize < vsize || brw->urb.sfsize < sfsize || brw->urb.csize < csize || - (brw->urb.constrained && (brw->urb.vsize > 
brw->urb.vsize || - brw->urb.sfsize > brw->urb.sfsize || - brw->urb.csize > brw->urb.csize))) { + (brw->urb.constrained && (brw->urb.vsize > vsize || + brw->urb.sfsize > sfsize || + brw->urb.csize > csize))) { brw->urb.csize = csize; @@ -129,6 +151,10 @@ static void recalculate_urb_fence( struct brw_context *brw ) brw->urb.nr_sf_entries = limits[SF].min_nr_entries; brw->urb.nr_cs_entries = limits[CS].min_nr_entries; + /* Mark us as operating with constrained nr_entries, so that next + * time we recalculate we'll resize the fences in the hope of + * escaping constrained mode and getting back to normal performance. + */ brw->urb.constrained = 1; if (!check_urb_layout(brw)) { @@ -153,7 +179,7 @@ static void recalculate_urb_fence( struct brw_context *brw ) brw->urb.clip_start, brw->urb.sf_start, brw->urb.cs_start, - 256); + URB_SIZES(brw)); brw->state.dirty.brw |= BRW_NEW_URB_FENCE; } @@ -167,7 +193,7 @@ const struct brw_tracked_state brw_recalculate_urb_fence = { .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_SF_PROG) }, - .update = recalculate_urb_fence + .prepare = recalculate_urb_fence }; @@ -191,25 +217,13 @@ void brw_upload_urb_fence(struct brw_context *brw) /* The ordering below is correct, not the layout in the * instruction. * - * There are 256 urb reg pairs in total. + * There are 256/384 urb reg pairs in total. */ uf.bits0.vs_fence = brw->urb.gs_start; uf.bits0.gs_fence = brw->urb.clip_start; uf.bits0.clp_fence = brw->urb.sf_start; uf.bits1.sf_fence = brw->urb.cs_start; - uf.bits1.cs_fence = 256; + uf.bits1.cs_fence = URB_SIZES(brw); BRW_BATCH_STRUCT(brw, &uf); } - - -#if 0 -const struct brw_tracked_state brw_urb_fence = { - .dirty = { - .mesa = 0, - .brw = BRW_NEW_URB_FENCE | BRW_NEW_PSP, - .cache = 0 - }, - .update = brw_upload_urb_fence -}; -#endif |