/* src/panfrost/encoder/pan_tiler.c */

/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "util/u_math.h"
#include "util/macros.h"
#include "pan_encoder.h"

/* Mali GPUs are tiled-mode renderers, rather than immediate-mode.
 * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run.
 * Then, a fixed-function hardware block (the tiler) consumes the gl_Position
 * results. For each triangle specified, it marks each containing tile as
 * containing that triangle. This set of "triangles per tile" form the "polygon
 * list". Finally, the rasterization unit consumes the polygon list to invoke
 * the fragment shader.
 *
 * In practice, it's a bit more complicated than this. On Midgard chips with an
 * "advanced tiling unit" (all except T720/T820/T830), 16x16 is the logical
 * tile size, but Midgard features "hierarchical tiling", where power-of-two
 * multiples of the base tile size can be used: hierarchy level 0 (16x16),
 * level 1 (32x32), level 2 (64x64), per public information about Midgard's
 * tiling. In fact, tiling goes up to 4096x4096 (!), although in practice
 * 128x128 is the largest usually used (though higher modes are enabled).  The
 * idea behind hierarchical tiling is to use low tiling levels for small
 * triangles and high levels for large triangles, to minimize memory bandwidth
 * and repeated fragment shader invocations (the former issue inherent to
 * immediate-mode rendering and the latter common in traditional tilers).
 *
 * The tiler itself works by reading varyings in and writing a polygon list
 * out. Unfortunately (for us), both of these buffers are managed in main
 * memory; although they ideally will be cached, it is the drivers'
 * responsibility to allocate these buffers. Varying buffer allocation is
 * handled elsewhere, as it is not tiler specific; the real issue is allocating
 * the polygon list.
 *
 * This is hard, because from the driver's perspective, we have no information
 * about what geometry will actually look like on screen; that information is
 * only gained from running the vertex shader. (Theoretically, we could run the
 * vertex shaders in software as a prepass, or in hardware with transform
 * feedback as a prepass, but either idea is ludicrous on so many levels).
 *
 * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list
 * into three distinct pieces. First, the driver statically determines which
 * tile hierarchy levels to use (more on that later). At this point, we know the
 * framebuffer dimensions and all the possible tilings of the framebuffer, so
 * we know exactly how many tiles exist across all hierarchy levels. The first
 * piece of the polygon list is the header, which is exactly 8 bytes per tile,
 * plus padding and a small 64-byte prologue. (If that doesn't remind you of
 * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is
 * the polygon list body, which seems to contain 512 bytes per tile, again
 * across every level of the hierarchy. These two parts form the polygon list
 * buffer. This buffer has a statically determinable size, approximately equal
 * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus
 * alignment / minimum restrictions / etc.
 *
 * The third piece is the easy one (for us): the tiler heap. In essence, the
 * tiler heap is a gigantic slab that's as big as could possibly be necessary
 * in the worst case imaginable. Just... a gigantic allocation that we give a
 * start and end pointer to. What's the catch? The tiler heap is lazily
 * allocated; that is, a huge amount of memory is _reserved_, but only a tiny
 * bit is actually allocated upfront. The GPU just keeps using the
 * unallocated-but-reserved portions as it goes along, generating page faults
 * if it goes beyond the allocation, and then the kernel is instructed to
 * expand the allocation on page fault (known in the vendor kernel as growable
 * memory). This is quite a bit of bookkeeping of its own, but that task is
 * pushed to kernel space and we can mostly ignore it here, just remembering to
 * set the GROWABLE flag so the kernel actually uses this path rather than
 * allocating a gigantic amount up front and burning a hole in RAM.
 *
 * As far as determining which hierarchy levels to use, the simple answer is
 * that right now, we don't. In the tiler configuration fields (consistent from
 * the earliest Midgard's SFBD through the latest Bifrost traces we have),
 * there is a hierarchy_mask field, controlling which levels (tile sizes) are
 * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to
 * big tiles and small polygons to small tiles -- would be realized here as
 * well. As long as there are polygons at all needing tiling, we always have to
 * have big tiles available, in case there are big polygons. But we don't
 * necessarily need small tiles available. Ideally, when there are small
 * polygons, small tiles are enabled (to avoid waste from putting small
 * triangles in the big tiles); when there are not, small tiles are disabled to
 * avoid enabling more levels than necessary, which potentially costs in memory
 * bandwidth / power / tiler performance.
 *
 * Of course, the driver has to figure this out statically. When tile
 * hierarchies are actually established, this occurs by the tiler in
 * fixed-function hardware, after the vertex shaders have run and there is
 * sufficient information to figure out the size of triangles. The driver has
 * no such luxury, again barring insane hacks like additionally running the
 * vertex shaders in software or in hardware via transform feedback. Thus, for
 * the driver, we need a heuristic approach.
 *
 * There are lots of heuristics to guess triangle size statically you could
 * imagine, but one approach shines as particularly simple-stupid: assume all
 * on-screen triangles are equal size and spread equidistantly throughout the
 * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with
 * it, then we see:
 *
 *      Triangle Area   = (Screen Area / # of triangles)
 *                      = (Width * Height) / (# of triangles)
 *
 * Or if you prefer, we can also make a third CRAZY assumption that we only draw
 * right triangles with edges parallel/perpendicular to the sides of the screen
 * with no overdraw, forming a triangle grid across the screen:
 *
 * |--w--|
 *  _____   |
 * | /| /|  |
 * |/_|/_|  h
 * | /| /|  |
 * |/_|/_|  |
 *
 * Then you can use some middle school geometry and algebra to work out the
 * triangle dimensions. I started working on this, but realised I didn't need
 * to make my point, but couldn't bear to erase that ASCII art. Anyway.
 *
 * POINT IS, by considering the ratio of screen area and triangle count, we can
 * estimate the triangle size. For a small size, use small bins; for a large
 * size, use large bins. Intuitively, this metric makes sense: when there are
 * few triangles on a large screen, you're probably compositing a UI and
 * therefore the triangles are large; when there are a lot of triangles on a
 * small screen, you're probably rendering a 3D mesh and therefore the
 * triangles are tiny. (Or better said -- there will be tiny triangles, even if
 * there are also large triangles. There have to be unless you expect crazy
 * overdraw. Generally, it's better to allow more small bin sizes than
 * necessary than not allow enough.)
 *
 * From this heuristic (or whatever), we determine the minimum allowable tile
 * size, and we use that to decide the hierarchy masking, selecting from the
 * minimum "ideal" tile size to the maximum tile size (2048x2048 in practice).
 *
 * Once we have that mask and the framebuffer dimensions, we can compute the
 * size of the statically-sized polygon list structures, allocate them, and go!
 *
 * -----
 *
 * On T720, T820, and T830, there is no support for hierarchical tiling.
 * Instead, the hardware allows the driver to select the tile size dynamically
 * on a per-framebuffer basis, including allowing rectangular/non-square tiles.
 * Rules for tile size selection are as follows:
 *
 *  - Dimensions must be powers-of-two.
 *  - The smallest tile is 16x16.
 *  - The tile width/height is at most the framebuffer w/h (clamp up to 16 pix)
 *  - There must be no more than 64 tiles in either dimension.
 *
 * Within these constraints, the driver is free to pick a tile size according
 * to some heuristic, similar to units with an advanced tiling unit.
 *
 * To pick a size without any heuristics, we may satisfy the constraints by
 * defaulting to 16x16 (a power-of-two). This fits the minimum. For the size
 * constraint, consider:
 *
 *      # of tiles < 64
 *      ceil (fb / tile) < 64
 *      (fb / tile) <= (64 - 1)
 *      tile >= fb / (64 - 1)
 *
 * Since next_power_of_two(fb / (64 - 1)) >= fb / (64 - 1), that power-of-two
 * satisfies the bound. Hence we clamp up to align_pot(fb / (64 - 1)).
 *
 * Extending to use a selection heuristic left for future work.
 *
 * Once the tile size (w, h) is chosen, we compute the hierarchy "mask":
 *
 *      hierarchy_mask = (log2(h / 16) << 6) | log2(w / 16)
 *
 * Of course with no hierarchical tiling, this is not a mask; it's just a field
 * specifying the tile size. But I digress.
 *
 * We also compute the polygon list sizes (with framebuffer size W, H) as:
 *
 *      full_size = 0x200 + 0x200 * ceil(W / w) * ceil(H / h)
 *      offset = 8 * ceil(W / w) * ceil(H / h)
 *
 * It further appears necessary to round down offset to the nearest 0x200.
 * Possibly we would also round down full_size to the nearest 0x200 but
 * full_size/0x200 = (1 + ceil(W / w) * ceil(H / h)) is an integer so there's
 * nothing to do.
 */

/* Hierarchical tiling spans from 16x16 to 4096x4096 tiles. These are the
 * tile edge lengths, in pixels, at the two ends of that range. */

#define MIN_TILE_SIZE 16
#define MAX_TILE_SIZE 4096

/* Constants as shifts for easier power-of-two iteration. Note these expand to
 * util_logbase2() calls at every use rather than to integer constant
 * expressions. NOTE(review): presumably they evaluate to 4 and 12 -- confirm
 * against util_logbase2 in util/u_math.h */

#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE)
#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE)

/* The hierarchy has a 64-byte prologue, counted once regardless of how many
 * levels are enabled */
#define PROLOGUE_SIZE 0x40

/* For each tile (across all hierarchy levels), there is 8 bytes of header */
#define HEADER_BYTES_PER_TILE 0x8

/* Likewise, each tile per level has 512 bytes of body */
#define FULL_BYTES_PER_TILE 0x200

/* Counts how many tile_width-x-tile_height tiles are needed to cover a
 * width-x-height framebuffer. Each axis is rounded up, so a tile that is only
 * partially covered at an edge still counts as a whole tile. Both tile
 * dimensions must be powers of two. For 16x16 tiles this agrees with the
 * usual Midgard tile count. This is intentionally separate from the
 * AFBC/checksum helpers: those care about the stride (not just the total) and
 * only deal with a single fixed tile size rather than any power of two. */

static unsigned
pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned tile_height)
{
        /* Pad each axis to a whole number of tiles, then count the tiles */
        unsigned tiles_across = ALIGN_POT(width, tile_width) / tile_width;
        unsigned tiles_down = ALIGN_POT(height, tile_height) / tile_height;

        return tiles_across * tiles_down;
}

/* Computes the size in bytes of a polygon list structure on GPUs with
 * hierarchical tiling. Bit b of `mask` enables hierarchy level b, whose tiles
 * are squares with an edge of MIN_TILE_SIZE << b pixels. For every enabled
 * level we count the tiles covering the width-x-height framebuffer and charge
 * `bytes_per_tile` for each of them. The per-level byte counts are summed on
 * top of the fixed prologue, and the total is aligned up to 0x200 because the
 * result is also used as an offset. Only the first
 * (MAX_TILE_SHIFT - MIN_TILE_SHIFT) bits of `mask` are examined. */

static unsigned
panfrost_hierarchy_size(
                unsigned width,
                unsigned height,
                unsigned mask,
                unsigned bytes_per_tile)
{
        const unsigned level_count = MAX_TILE_SHIFT - MIN_TILE_SHIFT;

        /* Every hierarchy starts with the prologue */
        unsigned total = PROLOGUE_SIZE;

        /* Tile edge of the current level; doubles from one level to the next */
        unsigned tile_size = MIN_TILE_SIZE;

        for (unsigned level = 0; level < level_count; ++level, tile_size <<= 1) {
                /* Disabled levels contribute nothing */
                if ((mask >> level) & 1u)
                        total += bytes_per_tile * pan_tile_count(width, height, tile_size, tile_size);
        }

        /* The size doubles as an offset, so keep it aligned */
        return ALIGN_POT(total, 0x200);
}

/* Sizing for GPUs without hierarchical tiling, where there is effectively a
 * single level. Implements
 *
 *      0x200 + round_down(bytes_per_tile * ceil(W / w) * ceil(H / h), 0x200)
 *
 * and is shared by the header and body size computations. `dim` is the packed
 * tile size field: bits 0-2 hold the width exponent and bits 6-8 the height
 * exponent, and each tile edge is decoded here as 8 << exponent.
 *
 * NOTE(review): panfrost_choose_tile_size encodes the exponents as
 * log2(size / 16), whereas the decode here uses a base of 8, so the tiles
 * assumed here are half as large per axis as the ones chosen there. Confirm
 * whether that is intentional before changing it.
 */

static unsigned
panfrost_flat_size(unsigned width, unsigned height, unsigned dim, unsigned bytes_per_tile)
{
        /* Unpack the two 3-bit exponents */
        unsigned exp_w = dim & 0x7;
        unsigned exp_h = (dim >> 6) & 0x7;

        /* Turn them into tile dimensions */
        unsigned tw = 8u << exp_w;
        unsigned th = 8u << exp_h;

        /* Bytes for ceil(W/w) * ceil(H/h) tiles */
        unsigned raw = bytes_per_tile * pan_tile_count(width, height, tw, th);

        /* Drop the remainder below 0x200, then add the fixed 0x200 offset */
        return 0x200 + (raw - (raw % 0x200));
}

/* Returns the polygon list header size for a width-x-height framebuffer,
 * charging HEADER_BYTES_PER_TILE per tile. With `hierarchy` set, `mask` is a
 * hierarchy level mask; otherwise it is the packed tile size field used on
 * GPUs without hierarchical tiling. */

unsigned
panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy)
{
        if (!hierarchy)
                return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE);

        return panfrost_hierarchy_size(width, height, mask, HEADER_BYTES_PER_TILE);
}

/* Returns the size of the combined header/body for a width-x-height
 * framebuffer. It is computed exactly like the header size, only with the
 * (much larger) FULL_BYTES_PER_TILE charged per tile. As with the header,
 * `mask` is a hierarchy level mask when `hierarchy` is set and the packed
 * tile size field otherwise. No special case is made here for a zero mask;
 * the fixed terms added by the sizing helpers still apply.
 */

unsigned
panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy)
{
        if (!hierarchy)
                return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE);

        return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE);
}

/* On GPUs without hierarchical tiling, we choose a tile size directly and
 * stuff it into the field otherwise known as hierarchy mask (not a mask).
 *
 * width, height: framebuffer dimensions.
 * vertex_count: currently unused; kept for a future selection heuristic.
 *
 * Returns log2(tile_width / 16) in the low bits, OR'd with
 * log2(tile_height / 16) shifted left by 6. */

static unsigned
panfrost_choose_tile_size(
        unsigned width, unsigned height, unsigned vertex_count)
{
        /* Most tiles tolerated along one axis: strictly fewer than 64 */
        const unsigned max_tiles = 64 - 1;

        /* Start from the smallest legal tile. Eventually a heuristic should
         * be used for this */

        unsigned best_w = 16;
        unsigned best_h = 16;

        /* Clamp so there are less than 64 tiles in each direction. We need
         * ceil(fb / tile) <= max_tiles, i.e. tile >= ceil(fb / max_tiles).
         * The division must round up: if it truncates, a 1025 pixel wide
         * framebuffer keeps 16 pixel tiles and ends up with 65 tiles across,
         * breaking the very limit this clamp is meant to enforce */

        unsigned min_w = (width / max_tiles) + ((width % max_tiles) ? 1 : 0);
        unsigned min_h = (height / max_tiles) + ((height % max_tiles) ? 1 : 0);

        best_w = MAX2(best_w, util_next_power_of_two(min_w));
        best_h = MAX2(best_h, util_next_power_of_two(min_h));

        /* We have our tile size, so encode each dimension as its
         * power-of-two exponent relative to 16 */

        unsigned exp_w = util_logbase2(best_w / 16);
        unsigned exp_h = util_logbase2(best_h / 16);

        return exp_w | (exp_h << 6);
}

/* Picks the value for the tiler's hierarchy mask field. A smarter heuristic
 * for hierarchical GPUs would eventually live here; for now every level is
 * switched on (0xFF). That performs well overall but presumably costs memory
 * bandwidth / power on small scenes that do not need the smaller levels.
 *
 * Returns 0 when there is no geometry, the packed tile size from
 * panfrost_choose_tile_size when `hierarchy` is false, and 0xFF otherwise. */

unsigned
panfrost_choose_hierarchy_mask(
        unsigned width, unsigned height,
        unsigned vertex_count, bool hierarchy)
{
        /* Nothing to tile, so nothing gets enabled */
        if (vertex_count == 0)
                return 0x00;

        /* Hierarchical GPUs get everything on (TODO: Proper tests); the
         * others get a directly chosen tile size instead of a mask */
        return hierarchy ? 0xFF
                         : panfrost_choose_tile_size(width, height, vertex_count);
}