src/panfrost/include/panfrost-job.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803

/*
 * © Copyright 2017-2018 Alyssa Rosenzweig
 * © Copyright 2017-2018 Connor Abbott
 * © Copyright 2017-2018 Lyude Paul
 * © Copyright 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#ifndef __PANFROST_JOB_H__
#define __PANFROST_JOB_H__

#include <stdint.h>
#include <stdbool.h>
#include <inttypes.h>

/* Short aliases for the <stdint.h> fixed-width unsigned types; the descriptor
 * structures in this header declare their fields with these. */
typedef uint8_t  u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
/* 64-bit type used for address-like fields in the descriptors below (e.g. the
 * `shader` member of struct mali_shader_meta). NOTE(review): presumably a GPU
 * virtual address -- confirm against the users of this header. */
typedef uint64_t mali_ptr;

/* Applies to tiler_gl_enables. Each macro is a single-bit flag (bits 3..7);
 * the field itself is not declared in this part of the header. */

/* NOTE(review): going by the names, these enable occlusion queries and select
 * the precise variant -- confirm against the driver code that sets them. */
#define MALI_OCCLUSION_QUERY    (1 << 3)
#define MALI_OCCLUSION_PRECISE  (1 << 4)

/* Set for a glFrontFace(GL_CCW) in a Y=0=TOP coordinate system (like Gallium).
 * In OpenGL, this would correspond to glFrontFace(GL_CW). Mesa and the blob
 * disagree about how to do viewport flipping, so the blob actually sets this
 * for GL_CW but then has a negative viewport stride */

#define MALI_FRONT_CCW_TOP      (1 << 5)

/* Face culling selection: one bit per face (front, back) */
#define MALI_CULL_FACE_FRONT    (1 << 6)
#define MALI_CULL_FACE_BACK     (1 << 7)

/* Flags apply to unknown2_3? */

#define MALI_HAS_MSAA           (1 << 0)

/* Execute fragment shader per-sample if set (e.g. to implement gl_SampleID
 * reads) */
#define MALI_PER_SAMPLE         (1 << 2)
#define MALI_CAN_DISCARD        (1 << 5)

/* Applies on SFBD systems, specifying that programmable blending is in use */
#define MALI_HAS_BLEND_SHADER   (1 << 6)

/* Depth comparison function: a 3-bit field occupying bits 8..10 of the same
 * flags word. func is mali_func.
 *
 *  - MALI_DEPTH_FUNC(func) shifts a function value into the field.
 *  - MALI_GET_DEPTH_FUNC(flags) extracts the function from a flags word.
 *  - MALI_DEPTH_FUNC_MASK covers the whole field (0x700).
 *
 * The arguments are parenthesized so that compound expressions such as
 * MALI_DEPTH_FUNC(a | b) or MALI_GET_DEPTH_FUNC(x | y) expand with the
 * intended precedence (<< and >> bind tighter than | and &). */
#define MALI_DEPTH_FUNC(func)      ((func) << 8)
#define MALI_GET_DEPTH_FUNC(flags) (((flags) >> 8) & 0x7)
#define MALI_DEPTH_FUNC_MASK       MALI_DEPTH_FUNC(0x7)

#define MALI_DEPTH_WRITEMASK    (1 << 11)

#define MALI_DEPTH_CLIP_NEAR    (1 << 12)
#define MALI_DEPTH_CLIP_FAR     (1 << 13)

/* Single-bit flags for unknown2_4 (declared as a u16 in struct
 * mali_shader_meta; the highest bit used here is bit 14) */
#define MALI_STENCIL_TEST       (1 << 0)
#define MALI_ALPHA_TO_COVERAGE  (1 << 1)
#define MALI_NO_DITHER          (1 << 9)
#define MALI_DEPTH_RANGE_A      (1 << 12)
#define MALI_DEPTH_RANGE_B      (1 << 13)
#define MALI_NO_MSAA            (1 << 14)

/* Complete stencil test state for one face, packed into a single 32-bit word
 * (8 + 8 + 3 + 3 + 3 + 3 + 4 bits). Field order and widths define the layout
 * and must not change. */

struct mali_stencil_test {
        /* Reference value and mask, one byte each */
        unsigned ref : 8;
        unsigned mask : 8;

        /* Comparison function */
        enum mali_func func : 3;

        /* Operations to apply -- presumably on stencil fail, depth fail and
         * depth pass respectively, going by the names (TODO confirm) */
        enum mali_stencil_op sfail : 3;
        enum mali_stencil_op dpfail : 3;
        enum mali_stencil_op dppass : 3;

        /* Remaining 4 bits of the word; named as always-zero */
        unsigned zero : 4;
} __attribute__((packed));

/* One bit per colour channel (R, G, B, A in bits 0..3). The 4-bit color_mask
 * field of struct mali_blend_equation is documented as taking these values. */
#define MALI_MASK_R (1 << 0)
#define MALI_MASK_G (1 << 1)
#define MALI_MASK_B (1 << 2)
#define MALI_MASK_A (1 << 3)

/* Values for the 1-bit nondominant_mode field of struct mali_blend_mode.
 * NOTE(review): presumably selects how the non-dominant blend factor is
 * derived (mirroring the dominant one vs. zero) -- confirm with the blend
 * lowering code. */
enum mali_nondominant_mode {
        MALI_BLEND_NON_MIRROR = 0,
        MALI_BLEND_NON_ZERO   = 1,
};

/* Values for the 1-bit dominant field of struct mali_blend_mode: which side
 * of the blend (source or destination) is the dominant one. */
enum mali_dominant_blend {
        MALI_BLEND_DOM_SOURCE      = 0,
        MALI_BLEND_DOM_DESTINATION = 1,
};

/* Values for the 3-bit dominant_factor field of struct mali_blend_mode. All
 * eight encodings are enumerated; UNK0 and UNK4 are encodings whose meaning is
 * not known (hence the names). */
enum mali_dominant_factor {
        MALI_DOMINANT_UNK0 = 0,
        MALI_DOMINANT_ZERO = 1,
        MALI_DOMINANT_SRC_COLOR = 2,
        MALI_DOMINANT_DST_COLOR = 3,
        MALI_DOMINANT_UNK4 = 4,
        MALI_DOMINANT_SRC_ALPHA = 5,
        MALI_DOMINANT_DST_ALPHA = 6,
        MALI_DOMINANT_CONSTANT = 7,
};

/* Values for the 2-bit clip_modifier field of struct mali_blend_mode. UNK0 is
 * an encoding whose meaning is not known (hence the name). */
enum mali_blend_modifier {
        MALI_BLEND_MOD_UNK0 = 0,
        MALI_BLEND_MOD_NORMAL = 1,
        MALI_BLEND_MOD_SOURCE_ONE = 2,
        MALI_BLEND_MOD_DEST_ONE = 3,
};

/* Blend mode for one half (RGB or alpha) of a blend equation. The bitfields
 * add up to 12 bits (2+1+1+1+1+1+1+3+1), matching the 12-bit rgb_mode and
 * alpha_mode fields of struct mali_blend_equation, which are documented as
 * being of this type. Field order and widths define the encoding. */
struct mali_blend_mode {
        /* One of enum mali_blend_modifier */
        enum mali_blend_modifier clip_modifier : 2;
        unsigned unused_0 : 1;
        /* NOTE(review): presumably negates the source term -- confirm */
        unsigned negate_source : 1;

        /* Which side (source/destination) is dominant */
        enum mali_dominant_blend dominant : 1;

        enum mali_nondominant_mode nondominant_mode : 1;

        unsigned unused_1 : 1;

        /* NOTE(review): presumably negates the destination term -- confirm */
        unsigned negate_dest : 1;

        /* Factor applied for the dominant side, and a flag that, going by its
         * name, complements that factor (TODO confirm) */
        enum mali_dominant_factor dominant_factor : 3;
        unsigned complement_dominant : 1;
} __attribute__((packed));

/* Fixed-function blend equation, packed into 32 bits (12 + 12 + 4 + 4): one
 * blend mode for the RGB channels, one for alpha, and a colour write mask. */
struct mali_blend_equation {
        /* Of type mali_blend_mode */
        unsigned rgb_mode : 12;
        unsigned alpha_mode : 12;

        /* Named as always-zero */
        unsigned zero1 : 4;

        /* Corresponds to MALI_MASK_* above and glColorMask arguments */

        unsigned color_mask : 4;
} __attribute__((packed));

/* Source selector for one output component of a channel swizzle: one of the
 * four colour channels, a constant zero or one, or a reserved encoding. All
 * eight values of the 3-bit field are listed. */
enum mali_channel {
        MALI_CHANNEL_RED        = 0,
        MALI_CHANNEL_GREEN      = 1,
        MALI_CHANNEL_BLUE       = 2,
        MALI_CHANNEL_ALPHA      = 3,
        MALI_CHANNEL_ZERO       = 4,
        MALI_CHANNEL_ONE        = 5,
        MALI_CHANNEL_RESERVED_0 = 6,
        MALI_CHANNEL_RESERVED_1 = 7,
};

/* Four-component swizzle: a 3-bit enum mali_channel selector for each of the
 * r, g, b and a outputs, 12 bits in total. */
struct mali_channel_swizzle {
        enum mali_channel r : 3;
        enum mali_channel g : 3;
        enum mali_channel b : 3;
        enum mali_channel a : 3;
} __attribute__((packed));

/* Compressed per-pixel formats. Each of these formats expands to one to four
 * floating-point or integer numbers, as defined by the OpenGL specification.
 * There are various places in OpenGL where the user can specify a compressed
 * format in memory, which all use the same 8-bit enum in the various
 * descriptors, although different hardware units support different formats.
 *
 * Layout of the 8-bit value as encoded by the macros below:
 *   bits 7..5  type (MALI_FORMAT_*)
 *   bits 4..3  number of channels minus one (types 3 to 6 only)
 *   bits 2..0  channel size (MALI_CHANNEL_*, types 3 to 6 only)
 *
 * All function-like macros parenthesize their arguments so that compound
 * expressions expand with the intended precedence.
 */

/* The top 3 bits specify how the bits of each component are interpreted. */

/* e.g. ETC2_RGB8 */
#define MALI_FORMAT_COMPRESSED (0 << 5)

/* e.g. R11F_G11F_B10F */
#define MALI_FORMAT_SPECIAL (2 << 5)

/* signed normalized, e.g. RGBA8_SNORM */
#define MALI_FORMAT_SNORM (3 << 5)

/* e.g. RGBA8UI */
#define MALI_FORMAT_UINT (4 << 5)

/* e.g. RGBA8 and RGBA32F */
#define MALI_FORMAT_UNORM (5 << 5)

/* e.g. RGBA8I and RGBA16F */
#define MALI_FORMAT_SINT (6 << 5)

/* These formats seem to largely duplicate the others. They're used at least
 * for Bifrost framebuffer output.
 */
#define MALI_FORMAT_SPECIAL2 (7 << 5)

/* Extract the type (top 3 bits, still in place) from a format value */
#define MALI_EXTRACT_TYPE(fmt) ((fmt) & 0xe0)

/* If the high 3 bits are 3 to 6 these two bits say how many components
 * there are. MALI_NR_CHANNELS(n) encodes a count n in 1..4;
 * MALI_EXTRACT_CHANNELS(fmt) decodes it back to 1..4.
 */
#define MALI_NR_CHANNELS(n) (((n) - 1) << 3)
#define MALI_EXTRACT_CHANNELS(fmt) ((((fmt) >> 3) & 3) + 1)

/* If the high 3 bits are 3 to 6, then the low 3 bits say how big each
 * component is, except the special MALI_CHANNEL_FLOAT which overrides what the
 * bits mean.
 */

#define MALI_CHANNEL_4 2

#define MALI_CHANNEL_8 3

#define MALI_CHANNEL_16 4

#define MALI_CHANNEL_32 5

/* For MALI_FORMAT_SINT it means a half-float (e.g. RG16F). For
 * MALI_FORMAT_UNORM, it means a 32-bit float.
 */
#define MALI_CHANNEL_FLOAT 7

/* Extract the channel-size bits (low 3 bits) from a format value */
#define MALI_EXTRACT_BITS(fmt) ((fmt) & 0x7)

/* The 8-bit format values, built from a MALI_FORMAT_* type in the top 3 bits
 * plus either a per-type index (compressed and special formats) or a channel
 * count and channel size (SNORM/UINT/UNORM/SINT formats). */
enum mali_format {
        /* Not every format is necessarily available; availability has to be
         * queried dynamically. The factory settings for Juno enable only ETC2
         * and ASTC, with no DXT/RGTC formats. */

        /* Compressed formats. 0x0 is invalid. */
        MALI_ETC2_RGB8       = MALI_FORMAT_COMPRESSED | 0x1,
        MALI_ETC2_R11_UNORM  = MALI_FORMAT_COMPRESSED | 0x2,
        MALI_ETC2_RGBA8      = MALI_FORMAT_COMPRESSED | 0x3,
        MALI_ETC2_RG11_UNORM = MALI_FORMAT_COMPRESSED | 0x4,
        /* 0x5 reserved */
        MALI_NXR             = MALI_FORMAT_COMPRESSED | 0x6,  /* Nokia eXtended Range */
        MALI_BC1_UNORM       = MALI_FORMAT_COMPRESSED | 0x7,  /* DXT1 */
        MALI_BC2_UNORM       = MALI_FORMAT_COMPRESSED | 0x8,  /* DXT3 */
        MALI_BC3_UNORM       = MALI_FORMAT_COMPRESSED | 0x9,  /* DXT5 */
        MALI_BC4_UNORM       = MALI_FORMAT_COMPRESSED | 0xa,  /* RGTC1_UNORM */
        MALI_BC4_SNORM       = MALI_FORMAT_COMPRESSED | 0xb,  /* RGTC1_SNORM */
        MALI_BC5_UNORM       = MALI_FORMAT_COMPRESSED | 0xc,  /* RGTC2_UNORM */
        MALI_BC5_SNORM       = MALI_FORMAT_COMPRESSED | 0xd,  /* RGTC2_SNORM */
        MALI_BC6H_UF16       = MALI_FORMAT_COMPRESSED | 0xe,
        MALI_BC6H_SF16       = MALI_FORMAT_COMPRESSED | 0xf,
        MALI_BC7_UNORM       = MALI_FORMAT_COMPRESSED | 0x10,
        MALI_ETC2_R11_SNORM  = MALI_FORMAT_COMPRESSED | 0x11, /* EAC_SNORM */
        MALI_ETC2_RG11_SNORM = MALI_FORMAT_COMPRESSED | 0x12, /* EAC_SNORM */
        MALI_ETC2_RGB8A1     = MALI_FORMAT_COMPRESSED | 0x13,
        MALI_ASTC_3D_LDR     = MALI_FORMAT_COMPRESSED | 0x14,
        MALI_ASTC_3D_HDR     = MALI_FORMAT_COMPRESSED | 0x15,
        MALI_ASTC_2D_LDR     = MALI_FORMAT_COMPRESSED | 0x16,
        MALI_ASTC_2D_HDR     = MALI_FORMAT_COMPRESSED | 0x17,

        /* Special (irregularly packed) formats */
        MALI_RGB565          = MALI_FORMAT_SPECIAL | 0x0,
        MALI_RGB5_X1_UNORM   = MALI_FORMAT_SPECIAL | 0x1,
        MALI_RGB5_A1_UNORM   = MALI_FORMAT_SPECIAL | 0x2,
        MALI_RGB10_A2_UNORM  = MALI_FORMAT_SPECIAL | 0x3,
        MALI_RGB10_A2_SNORM  = MALI_FORMAT_SPECIAL | 0x5,
        MALI_RGB10_A2UI      = MALI_FORMAT_SPECIAL | 0x7,
        MALI_RGB10_A2I       = MALI_FORMAT_SPECIAL | 0x9,

        MALI_RGB332_UNORM    = MALI_FORMAT_SPECIAL | 0xb,
        MALI_RGB233_UNORM    = MALI_FORMAT_SPECIAL | 0xc,

        MALI_Z24X8_UNORM     = MALI_FORMAT_SPECIAL | 0xd,
        MALI_R32_FIXED       = MALI_FORMAT_SPECIAL | 0x11,
        MALI_RG32_FIXED      = MALI_FORMAT_SPECIAL | 0x12,
        MALI_RGB32_FIXED     = MALI_FORMAT_SPECIAL | 0x13,
        MALI_RGBA32_FIXED    = MALI_FORMAT_SPECIAL | 0x14,
        MALI_R11F_G11F_B10F  = MALI_FORMAT_SPECIAL | 0x19,
        MALI_R9F_G9F_B9F_E5F = MALI_FORMAT_SPECIAL | 0x1b,
        /* Varyings only: marks the transformed gl_Position */
        MALI_VARYING_POS     = MALI_FORMAT_SPECIAL | 0x1e,
        /* Varyings only: marks a write that should be discarded */
        MALI_VARYING_DISCARD = MALI_FORMAT_SPECIAL | 0x1f,

        /* Signed normalized */
        MALI_R8_SNORM     = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
        MALI_R16_SNORM    = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
        MALI_R32_SNORM    = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
        MALI_RG8_SNORM    = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
        MALI_RG16_SNORM   = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
        MALI_RG32_SNORM   = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
        MALI_RGB8_SNORM   = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
        MALI_RGB16_SNORM  = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
        MALI_RGB32_SNORM  = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
        MALI_RGBA8_SNORM  = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
        MALI_RGBA16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
        MALI_RGBA32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,

        /* Unsigned integer */
        MALI_R8UI     = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
        MALI_R16UI    = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
        MALI_R32UI    = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
        MALI_RG8UI    = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
        MALI_RG16UI   = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
        MALI_RG32UI   = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
        MALI_RGB8UI   = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
        MALI_RGB16UI  = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
        MALI_RGB32UI  = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
        MALI_RGBA8UI  = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
        MALI_RGBA16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
        MALI_RGBA32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,

        /* Unsigned normalized; MALI_CHANNEL_FLOAT here means 32-bit float */
        MALI_R8_UNORM     = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
        MALI_R16_UNORM    = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
        MALI_R32_UNORM    = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
        MALI_R32F         = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT,
        MALI_RG8_UNORM    = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
        MALI_RG16_UNORM   = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
        MALI_RG32_UNORM   = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
        MALI_RG32F        = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT,
        MALI_RGB8_UNORM   = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
        MALI_RGB16_UNORM  = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
        MALI_RGB32_UNORM  = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
        MALI_RGB32F       = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT,
        MALI_RGBA4_UNORM  = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_4,
        MALI_RGBA8_UNORM  = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
        MALI_RGBA16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
        MALI_RGBA32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
        MALI_RGBA32F      = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT,

        /* Signed integer; MALI_CHANNEL_FLOAT here means half-float */
        MALI_R8I     = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
        MALI_R16I    = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
        MALI_R32I    = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
        MALI_R16F    = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT,
        MALI_RG8I    = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
        MALI_RG16I   = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
        MALI_RG32I   = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
        MALI_RG16F   = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT,
        MALI_RGB8I   = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
        MALI_RGB16I  = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
        MALI_RGB32I  = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
        MALI_RGB16F  = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT,
        MALI_RGBA8I  = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
        MALI_RGBA16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
        MALI_RGBA32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
        MALI_RGBA16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT,

        /* Second set of special formats */
        MALI_RGBA4      = MALI_FORMAT_SPECIAL2 | 0x8,
        MALI_RGBA8_2    = MALI_FORMAT_SPECIAL2 | 0xd,
        MALI_RGB10_A2_2 = MALI_FORMAT_SPECIAL2 | 0xe,
};


/* Applies to midgard1.flags_lo (a 12-bit field of struct mali_shader_meta;
 * the flags below use bits 4..10) */

/* Should be set when the fragment shader updates the depth value. */
#define MALI_WRITES_Z (1 << 4)

/* Should the hardware perform early-Z testing? Set if the shader does not use
 * discard, alpha-to-coverage, shader depth writes, and if the shader has no
 * side effects (writes to global memory or images) unless early-z testing is
 * forced in the shader.
 */

#define MALI_EARLY_Z (1 << 6)

/* Should the hardware calculate derivatives (via helper invocations)? Set in a
 * fragment shader that uses texturing or derivative functions */

#define MALI_HELPER_INVOCATIONS (1 << 7)

/* Flags denoting the fragment shader's use of tilebuffer readback. If the
 * shader might read any part of the tilebuffer, set MALI_READS_TILEBUFFER. If
 * it might read depth/stencil in particular, also set MALI_READS_ZS */

#define MALI_READS_ZS (1 << 8)

/* The shader might write to global memory (via OpenCL, SSBOs, or images).
 * Reading is okay, as are ordinary writes to the tilebuffer/varyings. Setting
 * incurs a performance penalty. On a fragment shader, this bit implies there
 * are side effects, hence it interacts with early-z. */
#define MALI_WRITES_GLOBAL (1 << 9)

/* Companion of MALI_READS_ZS: set whenever the shader might read any part of
 * the tilebuffer (see the tilebuffer readback note above MALI_READS_ZS) */
#define MALI_READS_TILEBUFFER (1 << 10)

/* Applies to midgard1.flags_hi (a 6-bit field of struct mali_shader_meta; the
 * flags below use bits 2 and 3) */

/* Should be set when the fragment shader updates the stencil value. */
#define MALI_WRITES_S (1 << 2)

/* Mode to suppress generation of Infinity and NaN values by clamping inf
 * (-inf) to MAX_FLOAT (-MIN_FLOAT) and flushing NaN to 0.0
 *
 * Compare suppress_inf/suppress_nan flags on the Bifrost clause header for the
 * same functionality.
 *
 * This is not conformant on GLES3 or OpenCL, but is optional on GLES2, where
 * it works around app bugs (e.g. in glmark2-es2 -bterrain with FP16).
 */
#define MALI_SUPPRESS_INF_NAN (1 << 3)

/* Flags for bifrost1.unk1 (a 28-bit field of struct mali_shader_meta; the
 * flags below use bits 9, 15 and 26) */

/* Shader uses less than 32 registers, partitioned as [R0, R15] U [R48, R63],
 * allowing for full thread count. If clear, the full [R0, R63] register set is
 * available at half thread count */
#define MALI_BIFROST_FULL_THREAD (1 << 9)

/* Enable early-z testing (presumably). This flag may not be set if the shader:
 *
 *  - Uses blending
 *  - Uses discard
 *  - Writes gl_FragDepth
 *
 * This differs from Midgard which sets the MALI_EARLY_Z flag even with
 * blending, although I've begun to suspect that flag does not in fact enable
 * EARLY_Z alone. */
#define MALI_BIFROST_EARLY_Z (1 << 15)

/* First clause type is ATEST */
#define MALI_BIFROST_FIRST_ATEST (1 << 26)

/* The raw Midgard blend payload can either be an equation or a shader
 * address, depending on the context. The two alternatives overlay each other:
 * either a 64-bit blend shader pointer, or a packed 32-bit fixed-function
 * equation followed by a float blend constant. */

union midgard_blend {
        /* Blend shader address (programmable blending) */
        mali_ptr shader;

        /* Fixed-function blending: the equation and its constant */
        struct {
                struct mali_blend_equation equation;
                float constant;
        };
};

/* Values for the `flags` word of struct midgard_blend_rt (defined right after
 * these macros). */

/* We need to load the tilebuffer to blend (i.e. the destination factor is not
 * ZERO) */

#define MALI_BLEND_LOAD_TIB (0x1)

/* A blend shader is used to blend this render target */
#define MALI_BLEND_MRT_SHADER (0x2)

/* On MRT Midgard systems (using an MFBD), each render target gets its own
 * blend descriptor (struct midgard_blend_rt). The following flag marks the
 * render target for implicit sRGB. */

#define MALI_BLEND_SRGB (0x400)

/* Dithering is specified here for MFBD, otherwise NO_DITHER for SFBD */
#define MALI_BLEND_NO_DITHER (0x800)

/* Per-render-target blend descriptor: a 64-bit flags word followed by the
 * blend payload (union midgard_blend). */
struct midgard_blend_rt {
        /* Flags base value of 0x200 to enable the render target.
         * OR with 0x1 for blending (anything other than REPLACE).
         * OR with 0x2 for programmable blending
         * OR with MALI_BLEND_SRGB for implicit sRGB
         *
         * 0x1 and 0x2 are the values of MALI_BLEND_LOAD_TIB and
         * MALI_BLEND_MRT_SHADER respectively.
         */

        u64 flags;
        /* Equation + constant, or blend shader address */
        union midgard_blend blend;
} __attribute__((packed));

/* On Bifrost systems (all MRT), each render target gets one struct
 * bifrost_blend_rt descriptor (defined below). The enum here lists the values
 * of that descriptor's 3-bit shader_type field, i.e. the type of the shader
 * output variable (F/I/U followed by the bit width, going by the names). */

enum bifrost_shader_type {
        BIFROST_BLEND_F16 = 0,
        BIFROST_BLEND_F32 = 1,
        BIFROST_BLEND_I32 = 2,
        BIFROST_BLEND_U32 = 3,
        BIFROST_BLEND_I16 = 4,
        BIFROST_BLEND_U16 = 5,
};

/* NOTE(review): presumably the maximum number of render targets (and hence of
 * struct bifrost_blend_rt descriptors) on Bifrost -- confirm against users */
#define BIFROST_MAX_RENDER_TARGET_COUNT 8

/* Bifrost per-render-target blend descriptor. The last 32 bits are either a
 * packed description of the fixed-function output (swizzle, format, shader
 * type) or the low half of a blend shader address. */
struct bifrost_blend_rt {
        /* This is likely an analogue of the flags on
         * midgard_blend_rt */

        u16 flags; // = 0x200

        /* Single-channel blend constants are encoded in a sort of
         * fixed-point. Basically, the float is mapped to a byte, becoming
         * a high byte, and then the lower-byte is added for precision.
         * For the original float f:
         *
         * f = (constant_hi / 255) + (constant_lo / 65535)
         *
         * constant_hi = int(f * 255)
         * constant_lo = 65535*f - (65535/255) * constant_hi
         *
         * NOTE(review): this comment previously read "int(f / 255)", which
         * does not agree with the first equation (it would always be 0 for
         * f in [0, 1]); corrected to "f * 255" -- confirm against the code
         * that packs the constant.
         */
        u16 constant;

        struct mali_blend_equation equation;

        /*
         * - 0x19 normally
         * - 0x3 when this slot is unused (everything else is 0 except the index)
         * - 0x11 when this is the fourth slot (and it's used)
         * - 0 when there is a blend shader
         */
        u16 unk2;

        /* increments from 0 to 3 */
        u16 index;

        union {
                /* Fixed-function case: 12 + 8 + 4 + 3 + 5 = 32 bits */
                struct {
                        /* So far, I've only seen:
                         * - R001 for 1-component formats
                         * - RG01 for 2-component formats
                         * - RGB1 for 3-component formats
                         * - RGBA for 4-component formats
                         */
                        u32 swizzle : 12;
                        enum mali_format format : 8;

                        /* shader_type (below, after zero1) is the type of the
                         * shader output variable. Note, this can
                          * be different from the format.
                          * enum bifrost_shader_type
                         */
                        u32 zero1 : 4;
                        u32 shader_type : 3;
                        u32 zero2 : 5;
                };

                /* Only the low 32 bits of the blend shader are stored, the
                 * high 32 bits are implicitly the same as the original shader.
                 * According to the kernel driver, the program counter for
                 * shaders is actually only 24 bits, so shaders cannot cross
                 * the 2^24-byte boundary, and neither can the blend shader.
                 * The blob handles this by allocating a 2^24 byte pool for
                 * shaders, and making sure that any blend shaders are stored
                 * in the same pool as the original shader. The kernel will
                 * make sure this allocation is aligned to 2^24 bytes.
                 */
                u32 shader;
        };
} __attribute__((packed));

/* Descriptor for the shader. Following this is at least one, up to four blend
 * descriptors for each active render target.
 *
 * Two of the 32-bit words are interpreted differently on Bifrost and Midgard;
 * each is a union with one member per architecture (bifrost1/midgard1 and
 * bifrost2/midgard2). */

struct mali_shader_meta {
        /* Address of the shader */
        mali_ptr shader;
        /* Resource counts */
        u16 sampler_count;
        u16 texture_count;
        u16 attribute_count;
        u16 varying_count;

        /* First architecture-specific word (32 bits either way) */
        union {
                struct {
                        u32 uniform_buffer_count : 4;
                        /* See the MALI_BIFROST_* flags */
                        u32 unk1 : 28; // = 0x800000 for vertex, 0x958020 for tiler
                } bifrost1;
                struct {
                        unsigned uniform_buffer_count : 4;
                        /* See the "Applies to midgard1.flags_lo" flags */
                        unsigned flags_lo : 12;

                        /* vec4 units */
                        unsigned work_count : 5;
                        unsigned uniform_count : 5;
                        /* See the "Applies to midgard1.flags_hi" flags */
                        unsigned flags_hi : 6;
                } midgard1;
        };

        /* Same as glPolygonOffset() arguments */
        float depth_units;
        float depth_factor;

        u32 unknown2_2;

        /* Generated from SAMPLE_COVERAGE_VALUE and SAMPLE_COVERAGE_INVERT. See
         * 13.8.3 ("Multisample Fragment Operations") in the OpenGL ES 3.2
         * specification. Only matters when multisampling is enabled. */
        u16 coverage_mask;

        /* Flag words; see the MALI_* flag groups near the top of this header
         * that are marked as applying to unknown2_3 and unknown2_4 */
        u16 unknown2_3;

        u8 stencil_mask_front;
        u8 stencil_mask_back;
        u16 unknown2_4;

        /* Stencil test state for front and back faces, 32 bits each */
        struct mali_stencil_test stencil_front;
        struct mali_stencil_test stencil_back;

        /* Second architecture-specific word (32 bits either way) */
        union {
                struct {
                        u32 unk3 : 7;
                        /* On Bifrost, some system values are preloaded in
                         * registers R55-R62 by the thread dispatcher prior to
                         * the start of shader execution. This is a bitfield
                         * with one entry for each register saying which
                         * registers need to be preloaded. Right now, the known
                         * values are:
                         *
                         * Vertex/compute:
                         * - R55 : gl_LocalInvocationID.xy
                         * - R56 : gl_LocalInvocationID.z + unknown in high 16 bits
                         * - R57 : gl_WorkGroupID.x
                         * - R58 : gl_WorkGroupID.y
                         * - R59 : gl_WorkGroupID.z
                         * - R60 : gl_GlobalInvocationID.x
                         * - R61 : gl_GlobalInvocationID.y/gl_VertexID (without base)
                         * - R62 : gl_GlobalInvocationID.z/gl_InstanceID (without base)
                         *
                         * Fragment:
                         * - R55 : unknown, never seen (but the bit for this is
                         *   always set?)
                         * - R56 : unknown (bit always unset)
                         * - R57 : gl_PrimitiveID
                         * - R58 : gl_FrontFacing in low bit, potentially other stuff
                         * - R59 : u16 fragment coordinates (used to compute
                         *   gl_FragCoord.xy, together with sample positions)
                         * - R60 : gl_SampleMask (used in epilog, so pretty
                         *   much always used, but the bit is always 0 -- is
                         *   this just always pushed?)
                         * - R61 : gl_SampleMaskIn and gl_SampleID, used by
                         *   varying interpolation.
                         * - R62 : unknown (bit always unset).
                         *
                         * Later GPUs (starting with Mali-G52?) support
                         * preloading float varyings into r0-r7. This is
                         * indicated by setting 0x40. There is no distinction
                         * here between 1 varying and 2.
                         */
                        u32 preload_regs : 8;
                        /* In units of 8 bytes or 64 bits, since the
                         * uniform/const port loads 64 bits at a time.
                         */
                        u32 uniform_count : 7;
                        u32 unk4 : 10; // = 2
                } bifrost2;
                struct {
                        u32 unknown2_7;
                } midgard2;
        };

        u32 padding;

        /* Blending information for the older non-MRT Midgard HW. Check for
         * MALI_HAS_BLEND_SHADER to decide how to interpret.
         */

        union midgard_blend blend;
} __attribute__((packed));

/* This only concerns hardware jobs */

/* Possible values for job_descriptor_size, the 1-bit field of struct
 * mali_job_descriptor_header. NOTE(review): the 32/64 suffix presumably
 * refers to the descriptor's pointer width -- confirm. */

#define MALI_JOB_32 0
#define MALI_JOB_64 1

/* Common header of a hardware job descriptor. Declared packed so that the
 * in-memory layout follows the declaration order with no padding. */

struct mali_job_descriptor_header {
        /* Fault information; see enum mali_exception_access for the access
         * type encoding */
        u32 exception_status;
        u32 first_incomplete_task;
        u64 fault_pointer;
        /* MALI_JOB_32 or MALI_JOB_64 */
        u8 job_descriptor_size : 1;
        enum mali_job_type job_type : 7;
        u8 job_barrier : 1;
        u8 unknown_flags : 7;
        u16 job_index;
        /* Up to two dependencies, named by index. Presumably these refer to
         * the job_index of other jobs -- TODO confirm */
        u16 job_dependency_index_1;
        u16 job_dependency_index_2;
        /* Presumably the GPU address of the next job in the chain -- TODO
         * confirm */
        u64 next_job;
} __attribute__((packed));

/* Definitions used to decode exception_status */

/* Kind of memory access that triggered a fault. The encoding parallels the
 * AS_FAULTSTATUS_* entries in the kernel. */

enum mali_exception_access {
        /* The kernel uses this encoding for atomic accesses in MMU faults.
         * That makes no sense for a job fault, so here it is simply unused. */
        MALI_EXCEPTION_ACCESS_NONE = 0x0,

        MALI_EXCEPTION_ACCESS_EXECUTE = 0x1,
        MALI_EXCEPTION_ACCESS_READ = 0x2,
        MALI_EXCEPTION_ACCESS_WRITE = 0x3,
};

/* Details about write_value from panfrost igt tests which use it as a generic
 * dword write primitive */

/* A value for value_descriptor. Judging by the name it requests a write of
 * zero -- TODO confirm against the igt tests. */
#define MALI_WRITE_VALUE_ZERO 3

/* Payload of a write_value job */
struct mali_payload_write_value {
        /* Destination of the write */
        u64 address;
        /* e.g. MALI_WRITE_VALUE_ZERO */
        u32 value_descriptor;
        u32 reserved;
        /* Presumably the value written when the descriptor selects an
         * immediate -- TODO confirm */
        u64 immediate;
} __attribute__((packed));

/*
 * Mali Attributes
 *
 * This structure lets the attribute unit compute the address of an attribute
 * given the vertex and instance ID. Unfortunately, the way this works is
 * rather complicated when instancing is enabled.
 *
 * To explain this, first we need to explain how compute and vertex threads are
 * dispatched. This is a guess (although a pretty firm guess!) since the
 * details are mostly hidden from the driver, except for attribute instancing.
 * When a quad is dispatched, it receives a single, linear index. However, we
 * need to translate that index into a (vertex id, instance id) pair, or a
 * (local id x, local id y, local id z) triple for compute shaders (although
 * vertex shaders and compute shaders are handled almost identically).
 * Focusing on vertex shaders, one option would be to do:
 *
 * vertex_id = linear_id % num_vertices
 * instance_id = linear_id / num_vertices
 *
 * but this involves a costly division and modulus by an arbitrary number.
 * Instead, we could pad num_vertices. We dispatch padded_num_vertices *
 * num_instances threads instead of num_vertices * num_instances, which results
 * in some "extra" threads with vertex_id >= num_vertices, which we have to
 * discard.  The more we pad num_vertices, the more "wasted" threads we
 * dispatch, but the division is potentially easier.
 *
 * One straightforward choice is to pad num_vertices to the next power of two,
 * which means that the division and modulus are just simple bit shifts and
 * masking. But the actual algorithm is a bit more complicated. The thread
 * dispatcher has special support for dividing by 3, 5, 7, and 9, in addition
 * to dividing by a power of two. This is possibly using the technique
 * described in patent US20170010862A1. As a result, padded_num_vertices can be
 * 1, 3, 5, 7, or 9 times a power of two. This results in less wasted threads,
 * since we need less padding.
 *
 * padded_num_vertices is picked by the hardware. The driver just specifies the
 * actual number of vertices. At least for Mali G71, the first few cases are
 * given by:
 *
 * num_vertices	| padded_num_vertices
 * 3		| 4
 * 4-7		| 8
 * 8-11		| 12 (3 * 4)
 * 12-15	| 16
 * 16-19	| 20 (5 * 4)
 *
 * Note that padded_num_vertices is a multiple of four (presumably because
 * threads are dispatched in groups of 4). Also, padded_num_vertices is always
 * at least one more than num_vertices, which seems like a quirk of the
 * hardware. For larger num_vertices, the hardware uses the following
 * algorithm: using the binary representation of num_vertices, we look at the
 * most significant set bit as well as the following 3 bits. Let n be the
 * number of bits after those 4 bits. Then we set padded_num_vertices according
 * to the following table:
 *
 * high bits	| padded_num_vertices
 * 1000		| 9 * 2^n
 * 1001		| 5 * 2^(n+1)
 * 101x		| 3 * 2^(n+2)
 * 110x		| 7 * 2^(n+1)
 * 111x		| 2^(n+4)
 *
 * For example, if num_vertices = 70 is passed to glDraw(), its binary
 * representation is 1000110, so n = 3 and the high bits are 1000, and
 * therefore padded_num_vertices = 9 * 2^3 = 72.
 *
 * The attribute unit works in terms of the original linear_id. If
 * num_instances = 1, then they are the same, and everything is simple.
 * However, with instancing things get more complicated. There are four
 * possible modes, two of them we can group together:
 *
 * 1. Use the linear_id directly. Only used when there is no instancing.
 *
 * 2. Use the linear_id modulo a constant. This is used for per-vertex
 * attributes with instancing enabled by making the constant equal
 * padded_num_vertices. Because the modulus is always padded_num_vertices, this
 * mode only supports a modulus that is a power of 2 times 1, 3, 5, 7, or 9.
 * The shift field specifies the power of two, while the extra_flags field
 * specifies the odd number. If shift = n and extra_flags = m, then the modulus
 * is (2m + 1) * 2^n. As an example, if num_vertices = 70, then as computed
 * above, padded_num_vertices = 9 * 2^3, so we should set extra_flags = 4 and
 * shift = 3. Note that we must exactly follow the hardware algorithm used to
 * get padded_num_vertices in order to correctly implement per-vertex
 * attributes.
 *
 * 3. Divide the linear_id by a constant. In order to correctly implement
 * instance divisors, we have to divide linear_id by padded_num_vertices times
 * the user-specified divisor. So first we compute padded_num_vertices, again
 * following the exact same algorithm that the hardware uses, then multiply it
 * by the GL-level divisor to get the hardware-level divisor. This case is
 * further divided into two more cases. If the hardware-level divisor is a
 * power of two, then we just need to shift. The shift amount is specified by
 * the shift field, so that the hardware-level divisor is just 2^shift.
 *
 * If it isn't a power of two, then we have to divide by an arbitrary integer.
 * For that, we use the well-known technique of multiplying by an approximation
 * of the inverse. The driver must compute the magic multiplier and shift
 * amount, and then the hardware does the multiplication and shift. The
 * hardware and driver also use the "round-down" optimization as described in
 * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf.
 * The hardware further assumes the multiplier is between 2^31 and 2^32, so the
 * high bit is implicitly set to 1 even though it is set to 0 by the driver --
 * presumably this simplifies the hardware multiplier a little. The hardware
 * first multiplies linear_id by the multiplier and takes the high 32 bits,
 * then applies the round-down correction if extra_flags = 1, then finally
 * shifts right by the shift field.
 *
 * There are some differences between ridiculousfish's algorithm and the Mali
 * hardware algorithm, which means that the reference code from ridiculousfish
 * doesn't always produce the right constants. Mali does not use the pre-shift
 * optimization, since that would make a hardware implementation slower (it
 * would have to always do the pre-shift, multiply, and post-shift operations).
 * It also forces the multiplier to be at least 2^31, which means that the
 * exponent is entirely fixed, so there is no trial-and-error. Altogether,
 * given the divisor d, the algorithm the driver must follow is:
 *
 * 1. Set shift = floor(log2(d)).
 * 2. Compute m = ceil(2^(shift + 32) / d) and e = 2^(shift + 32) % d.
 * 3. If e <= 2^shift, then we need to use the round-down algorithm. Set
 * magic_divisor = m - 1 and extra_flags = 1.
 * 4. Otherwise, set magic_divisor = m and extra_flags = 0.
 *
 * Unrelated to instancing/actual attributes, images (the OpenCL kind) are
 * implemented as special attributes, denoted by MALI_ATTR_IMAGE. For images,
 * let shift=extra_flags=0. Stride is set to the image format's bytes-per-pixel
 * (*NOT the row stride*). Size is set to the size of the image itself.
 *
 * Special internal attributes and varyings (gl_VertexID, gl_FrontFacing, etc)
 * use particular fixed addresses with modified structures.
 */

/* Addressing mode of an attribute record, stored in the bottom 3 bits of
 * mali_attr.elements */

enum mali_attr_mode {
        MALI_ATTR_UNUSED = 0x0,
        MALI_ATTR_LINEAR = 0x1,
        MALI_ATTR_POT_DIVIDE = 0x2,
        MALI_ATTR_MODULO = 0x3,
        MALI_ATTR_NPOT_DIVIDE = 0x4,
        MALI_ATTR_IMAGE = 0x5,
};

/* Pseudo-addresses for gl_VertexID, gl_InstanceID, gl_FragCoord and
 * gl_FrontFacing. These are used in place of a real buffer address. */

#define MALI_ATTR_VERTEXID (0x22)
#define MALI_ATTR_INSTANCEID (0x24)
#define MALI_VARYING_FRAG_COORD (0x25)
#define MALI_VARYING_FRONT_FACING (0x26)

/* This magic "pseudo-address" is used as `elements` to implement
 * gl_PointCoord. When read from a fragment shader, it generates a point
 * coordinate per the OpenGL ES 2.0 specification. Flipped coordinate spaces
 * require an affine transformation in the shader. */

#define MALI_VARYING_POINT_COORD (0x61)

/* Used for comparison to check if an address is special: every
 * pseudo-address above is smaller than this value. Mostly a guess, but it
 * doesn't really matter. */

#define MALI_RECORD_SPECIAL (0x100)

/* One 16-byte attribute buffer record. An NPOT_DIVIDE record is followed by
 * a second record that uses the alternate layout below. */
union mali_attr {
	/* This is used for actual attributes. */
	struct {
		/* The bottom 3 bits are the mode */
		mali_ptr elements : 64 - 8;
		/* Power-of-two part of the modulus or divisor */
		u32 shift : 5;
		/* Odd factor selector for MODULO, round-down flag for
		 * NPOT_DIVIDE */
		u32 extra_flags : 3;
		u32 stride;
		u32 size;
	};
	/* The entry after an NPOT_DIVIDE entry has this format. It stores
	 * extra information that wouldn't fit in a normal entry.
	 */
	struct {
		u32 unk; /* = 0x20 */
		/* Magic multiplier approximating the inverse of the
		 * hardware-level divisor */
		u32 magic_divisor;
		u32 zero;
		/* This is the original, GL-level divisor. */
		u32 divisor;
	};
} __attribute__((packed));

/* Per-attribute metadata: which buffer the attribute is fetched from, how
 * its data is formatted, and where it starts within the buffer. */
struct mali_attr_meta {
        /* Vertex buffer index */
        u8 index;

        /* The four bit-fields below add up to 24 bits, so together with
         * index they fill one 32-bit word */
        unsigned unknown1 : 2;
        unsigned swizzle : 12;
        enum mali_format format : 8;

        /* Always observed to be zero at the moment */
        unsigned unknown3 : 2;

        /* When packing multiple attributes in a buffer, offset addresses by
         * this value. Obscurely, this is signed. */
        int32_t src_offset;
} __attribute__((packed));

/* Mask that clears the low 6 bits of a value. Presumably applied to a
 * framebuffer descriptor address to strip the tag bits below -- TODO
 * confirm against the users. */
#define FBD_MASK (~0x3f)

/* MFBD, rather than SFBD */
#define MALI_MFBD (0x1)

/* ORed into an MFBD address to specify the fbx section is included */
#define MALI_MFBD_TAG_EXTRA (0x2)

/* A uniform buffer object is described by a single 64-bit word laid out as:
 *
 *      u64 size : 10;
 *      mali_ptr ptr : 64 - 10;
 *
 * size holds the buffer size minus 1 (MALI_POSITIVE) in units of 16 bytes,
 * for a maximum of 2^14 bytes. That happens to be the GL minimum-maximum for
 * GL_MAX_UNIFORM_BLOCK_SIZE.
 *
 * ptr is stored without its bottom 2 bits and top 8 bits. According to
 * https://lwn.net/Articles/718895/ the top 8 bits should be 0 for userspace
 * pointers. Reusing those bits keeps each table entry at 64 bits.
 *
 * MALI_MAKE_UBO(elements, ptr) builds such a word from the number of 16-byte
 * elements and the buffer address.
 */

#define MALI_MAKE_UBO(elements, ptr) \
        ((((ptr) >> 2) << 10) | MALI_POSITIVE((elements)))

/* On Bifrost, these fields are the same between the vertex and tiler payloads.
 * They also seem to be the same between Bifrost and Midgard. They're shared in
 * fused payloads.
 */

/* Applies to unknown_draw */

/* Index size selector: a 2-bit field at MALI_DRAW_INDEXED_SHIFT.
 * MALI_DRAW_INDEXED_SIZE is the mask covering that field. */
#define MALI_DRAW_INDEXED_UINT8  (0x10)
#define MALI_DRAW_INDEXED_UINT16 (0x20)
#define MALI_DRAW_INDEXED_UINT32 (0x30)
#define MALI_DRAW_INDEXED_SIZE   (0x30)
#define MALI_DRAW_INDEXED_SHIFT  (4)

/* Set when the point size / line width is supplied per vertex through a
 * pointer rather than as a constant; see union midgard_primitive_size */
#define MALI_DRAW_VARYING_SIZE   (0x100)

/* Set to use first vertex as the provoking vertex for flatshading. Clear to
 * use the last vertex. This is the default in DX and VK, but not in GL. */

#define MALI_DRAW_FLATSHADE_FIRST (0x800)

#define MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX (0x10000)

/* First part of a vertex/tiler payload: describes how many invocations to
 * dispatch and what is being drawn. */
struct mali_vertex_tiler_prefix {
        /* This is a dynamic bitfield containing the following things in this order:
         *
         * - gl_WorkGroupSize.x
         * - gl_WorkGroupSize.y
         * - gl_WorkGroupSize.z
         * - gl_NumWorkGroups.x
         * - gl_NumWorkGroups.y
         * - gl_NumWorkGroups.z
         *
         * The number of bits allocated for each number is based on the *_shift
         * fields below. For example, workgroups_y_shift gives the bit that
         * gl_NumWorkGroups.y starts at, and workgroups_z_shift gives the bit
         * that gl_NumWorkGroups.z starts at (and therefore one after the bit
         * that gl_NumWorkGroups.y ends at). The actual value for each gl_*
         * value is one more than the stored value, since if any of the values
         * are zero, then there would be no invocations (and hence no job). If
         * there were 0 bits allocated to a given field, then it must be zero,
         * and hence the real value is one.
         *
         * Vertex jobs reuse the same job dispatch mechanism as compute jobs,
         * effectively doing glDispatchCompute(1, vertex_count, instance_count)
         * where vertex count is the number of vertices.
         */
        u32 invocation_count;

        /* Bitfield for shifts (32 bits in total):
         *
         * size_y_shift : 5
         * size_z_shift : 5
         * workgroups_x_shift : 6
         * workgroups_y_shift : 6
         * workgroups_z_shift : 6
         * workgroups_x_shift_2 : 4
         */
        u32 invocation_shifts;

        u32 draw_mode : 4;
        /* Holds the MALI_DRAW_* flags */
        u32 unknown_draw : 22;

        /* This is the same as workgroups_x_shift_2 in compute shaders, but
         * always 5 for vertex jobs and 6 for tiler jobs. I suspect this has
         * something to do with how many quads get put in the same execution
         * engine, which is a balance (you don't want to starve the engine, but
         * you also want to distribute work evenly).
         */
        u32 workgroups_x_shift_3 : 6;


        /* Negative of min_index. This is used to compute
         * the unbiased index in tiler/fragment shader runs.
         *
         * The hardware adds offset_bias_correction in each run,
         * so that absent an index bias, the first vertex processed is
         * genuinely the first vertex (0). But with an index bias,
         * the first vertex processed is numbered the same as the bias.
         *
         * To represent this more conveniently:
         * unbiased_index = lower_bound_index +
         *                  index_bias +
         *                  offset_bias_correction
         *
         * This is done since the hardware doesn't accept an index_bias
         * and this allows it to recover the unbiased index.
         */
        int32_t offset_bias_correction;
        u32 zero1;

        /* Like many other strictly nonzero quantities, index_count is
         * subtracted by one. For an indexed cube, this is equal to 35 = 6
         * faces * 2 triangles/per face * 3 vertices/per triangle - 1. That is,
         * for an indexed draw, index_count is the number of actual vertices
         * rendered whereas invocation_count is the number of unique vertices
         * rendered (the number of times the vertex shader must be invoked).
         * For non-indexed draws, this is just equal to invocation_count. */

        u32 index_count;

        /* No hidden structure; literally just a pointer to an array of uint
         * indices (width depends on flags). Thanks, guys, for not making my
         * life insane for once! NULL for non-indexed draws. */

        u64 indices;
} __attribute__((packed));

/* Point size / line width can either be specified as a 32-bit float (for
 * constant size) or as a [machine word size]-bit GPU pointer (for varying size). If a pointer
 * is selected, by setting the appropriate MALI_DRAW_VARYING_SIZE bit in the tiler
 * payload, the contents of varying_pointer will be interpreted as an array of
 * fp16 sizes, one for each vertex. gl_PointSize is therefore implemented by
 * creating a special MALI_R16F varying writing to varying_pointer.
 *
 * In this union the constant form is `constant` and the pointer form is
 * `pointer`. */

union midgard_primitive_size {
        float constant;
        u64 pointer;
};

/* Describes the tiler heap on Bifrost. bifrost_tiler_meta points at this
 * structure through its tiler_heap_meta field. */
struct bifrost_tiler_heap_meta {
        u32 zero;
        u32 heap_size;
        /* note: these are just guesses! */
        mali_ptr tiler_heap_start;
        mali_ptr tiler_heap_free;
        mali_ptr tiler_heap_end;

        /* hierarchy weights? but they're still 0 after the job has run... */
        u32 zeros[10];
        u32 unk1;
        /* Named after the value it holds, presumably 0x7e007e -- TODO
         * confirm */
        u32 unk7e007e;
} __attribute__((packed));

/* Tiler configuration on Bifrost. bifrost_tiler_only points at this
 * structure through its tiler_meta field. */
struct bifrost_tiler_meta {
        u32 tiler_heap_next_start;  /* To be written by the GPU */
        u32 used_hierarchy_mask;  /* To be written by the GPU */
        u16 hierarchy_mask; /* Five values observed: 0xa, 0x14, 0x28, 0x50, 0xa0 */
        u16 flags;
        /* Presumably the framebuffer dimensions -- TODO confirm, including
         * whether MALI_POSITIVE applies */
        u16 width;
        u16 height;
        u64 zero0;
        /* Address of a struct bifrost_tiler_heap_meta */
        mali_ptr tiler_heap_meta;
        /* TODO what is this used for? */
        u64 zeros[20];
} __attribute__((packed));

/* Section found only in Bifrost tiler (and fused) payloads, placed between
 * the prefix and the postfix. */
struct bifrost_tiler_only {
        /* 0x20 */
        union midgard_primitive_size primitive_size;

        /* Address of a struct bifrost_tiler_meta */
        mali_ptr tiler_meta;

        u64 zero1, zero2, zero3, zero4, zero5, zero6;
} __attribute__((packed));

/* Second part of a vertex/tiler payload: instancing parameters followed by
 * pointers to the descriptors and buffers the job reads. */
struct mali_vertex_tiler_postfix {
        u16 gl_enables; // 0x6 on Midgard, 0x2 on Bifrost

        /* Both zero for non-instanced draws. For instanced draws, a
         * decomposition of padded_num_vertices. See the comments about the
         * corresponding fields in mali_attr for context. */

        unsigned instance_shift : 5;
        unsigned instance_odd : 3;

        u8 zero4;

        /* Offset for first vertex in buffer */
        u32 offset_start;

	u64 zero5;

        /* Zero for vertex jobs. Pointer to the position (gl_Position) varying
         * output from the vertex shader for tiler jobs.
         */

        u64 position_varying;

        /* An array of mali_uniform_buffer_meta's. The size is given by the
         * shader_meta.
         */
        u64 uniform_buffers;

        /* On Bifrost, this is a pointer to an array of bifrost_texture_descriptor.
         * On Midgard, this is a pointer to an array of pointers to the texture
         * descriptors, number of pointers bounded by number of textures. The
         * indirection is needed to accommodate varying numbers and sizes of
         * texture descriptors */
        u64 textures;

        /* For OpenGL, from what I've seen, this is intimately connected to
         * texture_meta. cwabbott says this is not the case under Vulkan, hence
         * why this field is separate (Midgard is Vulkan capable). Pointer to
         * array of sampler descriptors (which are uniform in size) */
        u64 sampler_descriptor;

        u64 uniforms;
        u64 shader;
        u64 attributes; /* struct attribute_buffer[] */
        u64 attribute_meta; /* attribute_meta[] */
        u64 varyings; /* struct attr */
        u64 varying_meta; /* pointer */
        u64 viewport;
        u64 occlusion_counter; /* A single bit as far as I can tell */

        /* On Bifrost, this points directly to a mali_shared_memory structure.
         * On Midgard, this points to a framebuffer (either SFBD or MFBD as
         * tagged), which embeds a mali_shared_memory structure */
        mali_ptr shared_memory;
} __attribute__((packed));

/* Midgard payload: one layout serves both vertex and tiler jobs. The
 * primitive size trails the postfix, unlike Bifrost where it sits in
 * bifrost_tiler_only before the postfix. */
struct midgard_payload_vertex_tiler {
        struct mali_vertex_tiler_prefix prefix;
        struct mali_vertex_tiler_postfix postfix;

        union midgard_primitive_size primitive_size;
} __attribute__((packed));

/* Bifrost vertex job payload: prefix followed directly by the postfix */
struct bifrost_payload_vertex {
        struct mali_vertex_tiler_prefix prefix;
        struct mali_vertex_tiler_postfix postfix;
} __attribute__((packed));

/* Bifrost tiler job payload: like the vertex payload, with the tiler-only
 * section inserted between prefix and postfix */
struct bifrost_payload_tiler {
        struct mali_vertex_tiler_prefix prefix;
        struct bifrost_tiler_only tiler;
        struct mali_vertex_tiler_postfix postfix;
} __attribute__((packed));

/* Bifrost fused payload: a single prefix and tiler-only section shared by a
 * tiler postfix and a vertex postfix */
struct bifrost_payload_fused {
        struct mali_vertex_tiler_prefix prefix;
        struct bifrost_tiler_only tiler;
        struct mali_vertex_tiler_postfix tiler_postfix;
        u64 padding; /* zero */
        struct mali_vertex_tiler_postfix vertex_postfix;
} __attribute__((packed));

/* Purposeful off-by-one in width, height fields. For example, a (64, 64)
 * texture is stored as (63, 63) in these fields. This adjusts for that.
 * There's an identical pattern in the framebuffer descriptor. Even vertex
 * count fields work this way, hence the generic name -- integral fields that
 * are strictly positive generally need this adjustment.
 *
 * The argument is parenthesized so that expressions built from operators
 * binding looser than `-` (shifts, comparisons, ?:, bitwise ops) are
 * decremented as a whole. */

#define MALI_POSITIVE(dim) ((dim) - 1)

/* Texture dimensionality. The encoding is shared across both command stream
 * and Midgard, and even with Bifrost. */

enum mali_texture_type {
        MALI_TEX_CUBE = 0,
        MALI_TEX_1D = 1,
        MALI_TEX_2D = 2,
        MALI_TEX_3D = 3,
};

/* 8192x8192 */
/* NOTE(review): a full mip chain from 8192 down to 1 has 14 levels, while 13
 * levels correspond to 4096 -- confirm which limit is intended. */
#define MAX_MIP_LEVELS (13)

/* Cubemap bloats everything up */
#define MAX_CUBE_FACES (6)

/* For each pointer, there is an address and optionally also a stride */
#define MAX_ELEMENTS (2)

/* Memory layout of a texture. The field holding this is 4 bits wide and it
 * is not known why, so this list is almost certainly incomplete. */

enum mali_texture_layout {
        /* Tiled; a Z/S texture using this value is actually linear */
        MALI_TEXTURE_TILED = 1,

        /* Linear; Z/S textures cannot be tiled */
        MALI_TEXTURE_LINEAR = 2,

        /* AFBC, 16x16 sparse */
        MALI_TEXTURE_AFBC = 12,
};

/* Corresponds to the type passed to glTexImage2D and so forth. The
 * bit-fields add up to one 32-bit word. */

struct mali_texture_format {
        unsigned swizzle : 12;
        enum mali_format format : 8;

        unsigned srgb : 1;
        unsigned unknown1 : 1;

        enum mali_texture_type type : 2;
        enum mali_texture_layout layout : 4;

        /* Always set */
        unsigned unknown2 : 1;

        /* Set to allow packing an explicit stride */
        unsigned manual_stride : 1;

        unsigned zero : 2;
} __attribute__((packed));

/* Fixed-size head of a Midgard texture descriptor. The dimension fields
 * presumably use the MALI_POSITIVE encoding (a 64x64 texture stored as
 * 63x63) -- TODO confirm for depth and array_size. */
struct mali_texture_descriptor {
        uint16_t width;
        uint16_t height;
        uint16_t depth;
        uint16_t array_size;

        struct mali_texture_format format;

        uint16_t unknown3;

        /* One for non-mipmapped, zero for mipmapped */
        uint8_t unknown3A;

        /* Zero for non-mipmapped, (number of levels - 1) for mipmapped */
        uint8_t levels;

        /* Swizzling is a single 32-bit word, broken up here for convenience.
         * Here, swizzling refers to the ES 3.0 texture parameters for channel
         * level swizzling, not the internal pixel-level swizzling which is
         * below OpenGL's reach */

        unsigned swizzle : 12;
        unsigned swizzle_zero       : 20;

        uint32_t unknown5;
        uint32_t unknown6;
        uint32_t unknown7;
} __attribute__((packed));

/* While Midgard texture descriptors are variable length, Bifrost descriptors
 * are fixed like samplers with more pointers to expand if necessary */

struct bifrost_texture_descriptor {
        /* First 32-bit word: format description */
        unsigned format_unk : 4; /* 2 */
        enum mali_texture_type type : 2;
        unsigned zero : 4;
        unsigned format_swizzle : 12;
        enum mali_format format : 8;
        unsigned srgb : 1;
        unsigned format_unk3 : 1; /* 0 */

        uint16_t width; /* MALI_POSITIVE */
        uint16_t height; /* MALI_POSITIVE */

        /* OpenGL swizzle */
        unsigned swizzle : 12;
        enum mali_texture_layout layout : 4;
        uint8_t levels : 8; /* Number of levels-1 if mipmapped, 0 if not */
        unsigned unk1 : 8;

        unsigned levels_unk : 24; /* 0 */
        unsigned level_2 : 8; /* Number of levels, again? */

        /* Presumably the address of the texture data or of further
         * pointers -- TODO confirm */
        mali_ptr payload;

        uint16_t array_size;
        uint16_t unk4;

        uint16_t depth;
        uint16_t unk5;
} __attribute__((packed));

/* Flags for the filter_mode field of struct mali_sampler_descriptor */

#define MALI_SAMP_MAG_NEAREST (1 << 0)
#define MALI_SAMP_MIN_NEAREST (1 << 1)

/* TODO: What do these bits mean individually? Only seen set together */

#define MALI_SAMP_MIP_LINEAR_1 (1 << 3)
#define MALI_SAMP_MIP_LINEAR_2 (1 << 4)

/* Flag in filter_mode, corresponding to OpenCL's NORMALIZED_COORDS_TRUE
 * sampler_t flag. For typical OpenGL textures, this is always set. */

#define MALI_SAMP_NORM_COORDS (1 << 5)

/* Used for lod encoding. Thanks @urjaman for pointing out these routines can
 * be cleaned up a lot.
 *
 * DECODE_FIXED_16 converts a fixed-point value with 8 fractional bits back to
 * a float. The argument is parenthesized so that a compound expression is
 * divided as a whole rather than only its last operand. */

#define DECODE_FIXED_16(x) ((float) ((x) / 256.0))

/* Encode a LOD as signed fixed point with 8 fractional bits.
 *
 * x is clamped to [0, limit], or to [-limit, limit] when allow_negative is
 * set, where limit sits just below 32 to account for float error. The scaled
 * value is truncated toward zero. */
static inline int16_t
FIXED_16(float x, bool allow_negative)
{
        const float upper = (32.0 - (1.0 / 512.0));
        const float lower = allow_negative ? -upper : 0.0;

        if (x > upper)
                x = upper;
        else if (x < lower)
                x = lower;

        return (int) (x * 256.0);
}

/* Midgard sampler descriptor */
struct mali_sampler_descriptor {
        /* Combination of MALI_SAMP_* flags */
        uint16_t filter_mode;

        /* Fixed point, signed.
         * Upper 7 bits before the decimal point, although it caps [0-31].
         * Lower 8 bits after the decimal point: int(round(x * 256)).
         * NOTE(review): the FIXED_16 helper truncates rather than rounds --
         * confirm which the hardware expects. */

        int16_t lod_bias;
        int16_t min_lod;
        int16_t max_lod;

        /* All one word in reality, but packed a bit. Comparisons are flipped
         * from OpenGL. */

        enum mali_wrap_mode wrap_s : 4;
        enum mali_wrap_mode wrap_t : 4;
        enum mali_wrap_mode wrap_r : 4;
        enum mali_func compare_func : 3;

        /* No effect on 2D textures. For cubemaps, set for ES3 and clear for
         * ES2, controlling seamless cubemapping */
        unsigned seamless_cube_map : 1;

        unsigned zero : 16;

        uint32_t zero2;
        float border_color[4];
} __attribute__((packed));

/* Bifrost sampler descriptors look pretty similar */

/* Presumably values for the 1-bit min_filter and mag_filter fields of
 * struct bifrost_sampler_descriptor. Note the opposite polarity: 1 means
 * nearest for min but linear for mag -- TODO confirm. */
#define BIFROST_SAMP_MIN_NEAREST        (1)
#define BIFROST_SAMP_MAG_LINEAR         (1)

/* Bifrost sampler descriptor */
struct bifrost_sampler_descriptor {
        uint8_t unk1;

        /* Wrap modes, stored in r, t, s order (the reverse of
         * mali_sampler_descriptor) */
        enum mali_wrap_mode wrap_r : 4;
        enum mali_wrap_mode wrap_t : 4;
        enum mali_wrap_mode wrap_s : 4;
        uint8_t unk8 : 4;

        /* NOTE(review): the flags below total 7 bits, so one bit before
         * min_lod is left unnamed -- confirm the intended layout. */
        uint8_t unk2 : 1;
        uint8_t norm_coords : 1;
        uint8_t unk3 : 1;
        uint8_t min_filter : 1;
        uint8_t zero1 : 1;
        uint8_t mag_filter : 1;
        uint8_t mip_filter : 1;

        int16_t min_lod;
        int16_t max_lod;

        uint64_t zero2;
        uint64_t zero3;
        uint64_t zero4;
} __attribute__((packed));

/* From presentations, 16x16 tiles externally. Use shift for fast computation
 * of tile numbers. */

#define MALI_TILE_SHIFT 4
#define MALI_TILE_LENGTH (1 << MALI_TILE_SHIFT)

/* Tile coordinates are stored as a compact u32, as only 12 bits are needed to
 * each component. Notice that this provides a theoretical upper bound of (1 <<
 * 12) = 4096 tiles in each direction, addressing a maximum framebuffer of size
 * 65536x65536. Multiplying that together, times another four given that Mali
 * framebuffers are 32-bit ARGB8888, means that this upper bound would take 16
 * gigabytes of RAM just to store the uncompressed framebuffer itself, let
 * alone rendering in real-time to such a buffer.
 *
 * Nice job, guys. */

/* From mali_kbase_10969_workaround.c */
#define MALI_X_COORD_MASK 0x00000FFF
#define MALI_Y_COORD_MASK 0x0FFF0000

/* Extract parts of a tile coordinate */

#define MALI_TILE_COORD_X(coord) ((coord) & MALI_X_COORD_MASK)
#define MALI_TILE_COORD_Y(coord) (((coord) & MALI_Y_COORD_MASK) >> 16)

/* Helpers to generate tile coordinates based on the boundary coordinates in
 * screen space. So, with the bounds (0, 0) to (128, 128) for the screen, these
 * functions would convert it to the bounding tiles (0, 0) to (7, 7).
 * Intentional "off-by-one"; finding the tile number is a form of fencepost
 * problem.
 *
 * MALI_BOUND_TO_TILE subtracts bias from a screen-space bound and converts
 * the result to a tile index. The _MIN variant uses bias 0 and the _MAX
 * variant uses bias 1. Arguments are parenthesized so that compound
 * expressions (shifts, bitwise ops, ?:) are evaluated as a whole before the
 * subtraction. */

#define MALI_MAKE_TILE_COORDS(X, Y) ((X) | ((Y) << 16))
#define MALI_BOUND_TO_TILE(B, bias) (((B) - (bias)) >> MALI_TILE_SHIFT)
#define MALI_COORDINATE_TO_TILE(W, H, bias) MALI_MAKE_TILE_COORDS(MALI_BOUND_TO_TILE(W, bias), MALI_BOUND_TO_TILE(H, bias))
#define MALI_COORDINATE_TO_TILE_MIN(W, H) MALI_COORDINATE_TO_TILE(W, H, 0)
#define MALI_COORDINATE_TO_TILE_MAX(W, H) MALI_COORDINATE_TO_TILE(W, H, 1)

/* Payload of a fragment job: the range of tiles to render and the
 * framebuffer descriptor to render into. */
struct mali_payload_fragment {
        /* Packed tile coordinates; see MALI_COORDINATE_TO_TILE_MIN/MAX */
        u32 min_tile_coord;
        u32 max_tile_coord;
        /* Presumably a tagged SFBD/MFBD address (see MALI_MFBD) -- TODO
         * confirm */
        mali_ptr framebuffer;
} __attribute__((packed));

/* Single Framebuffer Descriptor */

/* Flags apply to format. With just MSAA_A and MSAA_B, the framebuffer is
 * configured for 4x. With MSAA_8, it is configured for 8x.
 *
 * NOTE(review): MSAA_A and MSAA_B are defined as the same bit (1 << 4), so
 * ORing both is no different from setting one. Confirm whether one of them
 * should be a different bit. */

#define MALI_SFBD_FORMAT_MSAA_8 (1 << 3)
#define MALI_SFBD_FORMAT_MSAA_A (1 << 4)
#define MALI_SFBD_FORMAT_MSAA_B (1 << 4)
#define MALI_SFBD_FORMAT_SRGB 	(1 << 5)

/* Flags for clear_flags. Fast/slow based on whether all three buffers are
 * cleared at once.
 *
 * MALI_CLEAR_SLOW_STENCIL uses an unsigned constant: shifting a signed 1
 * into bit 31 is undefined behaviour in C and, where it happens to work,
 * yields a negative value that sign-extends when widened to 64 bits. */

#define MALI_CLEAR_FAST         (1 << 18)
#define MALI_CLEAR_SLOW         (1 << 28)
#define MALI_CLEAR_SLOW_STENCIL (1u << 31)

/* Configures hierarchical tiling on Midgard for both SFBD/MFBD (embedded
 * within the larger framebuffer descriptor). Analogous to
 * bifrost_tiler_heap_meta and bifrost_tiler_meta */

/* See pan_tiler.c for derivation. Covers the low 9 bits (0x1ff). */
#define MALI_HIERARCHY_MASK ((1 << 9) - 1)

/* Flag disabling the tiler for clear-only jobs, with
   hierarchical tiling */
#define MALI_TILER_DISABLED (1 << 12)

/* Flag selecting userspace-generated polygon list, for clear-only jobs without
 * hierarchical tiling. */
#define MALI_TILER_USER 0xFFF

/* Absent any geometry, the minimum size of the polygon list header */
#define MALI_TILER_MINIMUM_HEADER_SIZE 0x200

/* Midgard tiler configuration, embedded in the framebuffer descriptor.
 * NOTE(review): unlike most descriptors in this file, this struct is not
 * declared packed -- confirm that natural alignment gives the intended
 * layout. */
struct midgard_tiler_descriptor {
        /* Size of the entire polygon list; see pan_tiler.c for the
         * computation. It's based on hierarchical tiling */

        u32 polygon_list_size;

        /* Name known from the replay workaround in the kernel. What exactly is
         * flagged here is less known. We do know that (tiler_hierarchy_mask & 0x1ff)
         * specifies a mask of hierarchy weights, which explains some of the
         * performance mysteries around setting it. We also see the bottom bit
         * of tiler_flags set in the kernel, but no comment why.
         *
         * hierarchy_mask can have the TILER_DISABLED flag */

        u16 hierarchy_mask;
        u16 flags;

        /* See mali_tiler.c for an explanation. NOTE(review): the comment at
         * the top of this struct refers to pan_tiler.c -- confirm which
         * file name is current. */
        mali_ptr polygon_list;
        mali_ptr polygon_list_body;

        /* Names based on the symmetry we see with replay jobs, which name
         * these explicitly */

        mali_ptr heap_start; /* tiler heap_free_address */
        mali_ptr heap_end;

        /* Hierarchy weights. We know these are weights based on the kernel,
         * but I've never seen them be anything other than zero */
        u32 weights[8];
};

/* Block layout of a render target; occupies the 2-bit block field of
 * struct mali_sfbd_format */
enum mali_block_format {
        MALI_BLOCK_TILED = 0,
        MALI_BLOCK_UNKNOWN = 1,
        MALI_BLOCK_LINEAR = 2,
        MALI_BLOCK_AFBC = 3,
};

/* Format word of a single framebuffer descriptor. The bit-fields add up to
 * 32 bits. The hex values in the comments are the values noted for the
 * unknown fields. */
struct mali_sfbd_format {
        /* 0x1 */
        unsigned unk1 : 6;

        /* mali_channel_swizzle */
        unsigned swizzle : 12;

        /* MALI_POSITIVE */
        unsigned nr_channels : 2;

        /* 0x4 */
        unsigned unk2 : 6;

        enum mali_block_format block : 2;

        /* 0xb */
        unsigned unk3 : 4;
};

/* Shared structure at the start of framebuffer descriptors, or used bare for
 * compute jobs, configuring stack and shared memory */

struct mali_shared_memory {
        u32 stack_shift : 4;
        u32 unk0 : 28;

        /* Configuration for shared memory for compute shaders.
         * shared_workgroup_count is logarithmic and may be computed for a
         * compute shader using shared memory as:
         *
         *  shared_workgroup_count = MAX2(ceil(log2(count_x)) + ... + ceil(log2(count_z)), 10)
         *
         * For compute shaders that don't use shared memory, or non-compute
         * shaders, this is set to ~0
         */

        u32 shared_workgroup_count : 5;
        u32 shared_unk1 : 3;
        u32 shared_shift : 4;
        u32 shared_zero : 20;

        mali_ptr scratchpad;

        /* For compute shaders, the RAM backing of workgroup-shared memory. For
         * fragment shaders on Bifrost, apparently multisampling locations */

        mali_ptr shared_memory;
        mali_ptr unknown1;
} __attribute__((packed));

/* Configures multisampling on Bifrost fragment jobs. Apart from the sample
 * locations pointer, every field is named as zero. */

struct bifrost_multisampling {
        u64 zero1;
        u64 zero2;
        mali_ptr sample_locations;
        u64 zero4;
} __attribute__((packed));

/* Framebuffer descriptor in the "single" layout: the shared memory block and
 * a format word, then dimensions, checksum, one colour buffer, one depth and
 * one stencil buffer, clear values, and finally the Midgard tiler descriptor.
 * Fields named zero* are presumably always zero -- TODO confirm. */

struct mali_single_framebuffer {
        /* Common header configuring stack and shared memory */
        struct mali_shared_memory shared_memory;

        /* 32 bits of format bitfields, see struct mali_sfbd_format */
        struct mali_sfbd_format format;

        u32 clear_flags;
        u32 zero2;

        /* Purposeful off-by-one in these fields should be accounted for by the
         * MALI_DIMENSION macro */

        u16 width;
        u16 height;

        u32 zero3[4];

        /* Checksum buffer and its stride */
        mali_ptr checksum;
        u32 checksum_stride;
        u32 zero5;

        /* By default, the framebuffer is upside down from OpenGL's
         * perspective. Set framebuffer to the end and negate the stride to
         * flip in the Y direction */

        mali_ptr framebuffer;
        int32_t stride; /* signed, so that it can be negated as described above */

        u32 zero4;

        /* Depth and stencil buffers are interleaved, it appears, as they are
         * set to the same address in captures. Both fields set to zero if the
         * buffer is not being cleared. Depending on GL_ENABLE magic, you might
         * get a zero enable despite the buffer being present; that still is
         * disabled. */

        /* For both buffers the stride word is split into a 4-bit low part
         * (named *_stride_zero) and a 28-bit stride */
        mali_ptr depth_buffer; // not SAME_VA
        u32 depth_stride_zero : 4;
        u32 depth_stride : 28;
        u32 zero7;

        mali_ptr stencil_buffer; // not SAME_VA
        u32 stencil_stride_zero : 4;
        u32 stencil_stride : 28;
        u32 zero8;

        u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware
        u32 clear_color_2; // always equal, but unclear function?
        u32 clear_color_3; // always equal, but unclear function?
        u32 clear_color_4; // always equal, but unclear function?

        /* Set to zero if not cleared */

        float clear_depth_1; // float32, ditto
        float clear_depth_2; // float32, ditto
        float clear_depth_3; // float32, ditto
        float clear_depth_4; // float32, ditto

        u32 clear_stencil; // Exactly as it appears in OpenGL

        u32 zero6[7];

        /* Tiler configuration comes last in the part of the descriptor that
         * is described here */
        struct midgard_tiler_descriptor tiler;

        /* More below this, maybe */
} __attribute__((packed));


/* Multisampling mode of a render target. Stored in a 2-bit field (`msaa` in
 * struct mali_rt_format), so all four encodings are listed. */

enum mali_msaa_mode {
        /* Multisampling disabled */
        MALI_MSAA_SINGLE   = 0x0,

        /* EXT_multisampled_render_to_texture operation: multiple tilebuffer
         * samples are implicitly resolved before writeout */
        MALI_MSAA_AVERAGE  = 0x1,

        /* Multiple samples are written inline */
        MALI_MSAA_MULTIPLE = 0x2,

        /* ES3-style multisampling, with each sample in a different buffer */
        MALI_MSAA_LAYERED  = 0x3,
};

/* sRGB flag (bit 0) for MFBD render target formats.
 * NOTE(review): presumably meant for mali_rt_format.flags -- confirm against
 * the users of this macro. */
#define MALI_MFBD_FORMAT_SRGB (1 << 0)

/* 64-bit format descriptor of a render target, embedded as the `format`
 * member of struct mali_render_target. The first 32 bits are unknown; the
 * bitfields after them add up to the second 32 bits. */

struct mali_rt_format {
        unsigned unk1 : 32;
        unsigned unk2 : 3;

        unsigned nr_channels : 2; /* MALI_POSITIVE */

        unsigned unk3 : 4;
        unsigned unk4 : 1;

        /* Block format and multisampling mode, 2 bits each */
        enum mali_block_format block : 2;
        enum mali_msaa_mode msaa : 2;

        /* NOTE(review): presumably takes MALI_MFBD_FORMAT_* bits -- confirm */
        unsigned flags : 2;

        /* NOTE(review): presumably a mali_channel_swizzle, as for the field of
         * the same name and width in mali_sfbd_format -- confirm */
        unsigned swizzle : 12;

        unsigned zero : 3;

        /* Disables MFBD preload. When this bit is set, the render target will
         * be cleared every frame. When this bit is clear, the hardware will
         * automatically wallpaper the render target back from main memory.
         * Unfortunately, MFBD preload is very broken on Midgard, so in
         * practice, this is a chicken bit that should always be set.
         * Discovered by accident, as all good chicken bits are. */

        unsigned no_preload : 1;
} __attribute__((packed));

/* Values for the AFBC flags words: the `flags` member of
 * mali_render_target.afbc and of mali_framebuffer_extra.ds_afbc */

/* Composite constant with bits 0, 3 and 16 set */
#define MALI_AFBC_FLAGS (0x10009)

/* Bit 17: lossless RGB and RGBA colorspace transform */
#define MALI_AFBC_YTR (1 << 17)

/* Descriptor of one render target. An array of these follows struct
 * mali_framebuffer, after the optional struct mali_framebuffer_extra. */

struct mali_render_target {
        /* 64 bits of format bitfields, see struct mali_rt_format */
        struct mali_rt_format format;

        u64 zero1;

        struct {
                /* Stuff related to ARM Framebuffer Compression. When AFBC is enabled,
                 * there is an extra metadata buffer that contains 16 bytes per tile.
                 * The framebuffer needs to be the same size as before, since we don't
                 * know ahead of time how much space it will take up. The
                 * framebuffer_stride is set to 0, since the data isn't stored linearly
                 * anymore.
                 *
                 * When AFBC is disabled, these fields are zero.
                 */

                mali_ptr metadata;
                u32 stride; // stride in units of tiles
                u32 flags; // = 0x20000 (the same value as MALI_AFBC_YTR)
        } afbc;

        /* Address of the colour buffer itself */
        mali_ptr framebuffer;

        /* The stride word is split into a 4-bit low part (zero2) and a 28-bit
         * stride */
        u32 zero2 : 4;
        u32 framebuffer_stride : 28; // in units of bytes, row to next
        u32 layer_stride; /* For multisample rendering */

        u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware
        u32 clear_color_2; // always equal, but unclear function?
        u32 clear_color_3; // always equal, but unclear function?
        u32 clear_color_4; // always equal, but unclear function?
} __attribute__((packed));

/* An optional part of mali_framebuffer. It comes between the main structure
 * and the array of render targets. It must be included if any of these are
 * enabled:
 *
 * - Transaction Elimination
 * - Depth/stencil
 * - TODO: Anything else?
 */

/* Bit for the flags_hi field (see struct mali_framebuffer_extra); the value
 * is relative to that bitfield */
#define MALI_EXTRA_PRESENT      (1 << 0)

/* Bit for the flags_lo field (see struct mali_framebuffer_extra); the value
 * is relative to that bitfield */
#define MALI_EXTRA_ZS           (1 << 2)

struct mali_framebuffer_extra  {
        /* Checksum buffer */
        mali_ptr checksum;
        /* Each tile has an 8 byte checksum, so the stride is "width in tiles * 8" */
        u32 checksum_stride;

        /* One 32-bit word of bitfields: 4 + 2 + 4 + 22 bits. flags_lo takes
         * the flags_lo defines (MALI_EXTRA_ZS) and flags_hi the flags_hi
         * defines (MALI_EXTRA_PRESENT). */
        unsigned flags_lo : 4;
        enum mali_block_format zs_block : 2;

        /* Number of samples in Z/S attachment, MALI_POSITIVE. So zero for
         * 1-sample (non-MSAA), 0x3 for MSAA 4x, etc */
        unsigned zs_samples : 4;
        unsigned flags_hi : 22;

        /* Two alternative descriptions of the depth/stencil attachment.
         * NOTE(review): presumably zs_block decides which one applies --
         * confirm against the code filling this in. */
        union {
                /* Note: AFBC is only allowed for 24/8 combined depth/stencil. */
                struct {
                        mali_ptr depth_stencil_afbc_metadata;
                        u32 depth_stencil_afbc_stride; // in units of tiles
                        u32 flags;

                        mali_ptr depth_stencil;

                        u64 padding;
                } ds_afbc;

                struct {
                        /* Depth becomes depth/stencil in case of combined D/S */
                        mali_ptr depth;
                        u32 depth_stride_zero : 4;
                        u32 depth_stride : 28;
                        u32 depth_layer_stride;

                        /* Same shape as the depth fields above */
                        mali_ptr stencil;
                        u32 stencil_stride_zero : 4;
                        u32 stencil_stride : 28;
                        u32 stencil_layer_stride;
                } ds_linear;
        };


        u32 clear_color_1;
        u32 clear_color_2;
        u64 zero3;
} __attribute__((packed));

/* Flags for mfbd_flags, the 24-bit field of struct mali_framebuffer. The bit
 * positions are relative to that field. */

/* Enables writing depth results back to main memory (rather than keeping them
 * on-chip in the tile buffer and then discarding) */

#define MALI_MFBD_DEPTH_WRITE (1 << 10)

/* The MFBD contains the extra struct mali_framebuffer_extra section */

#define MALI_MFBD_EXTRA (1 << 13)

/* Main structure of the MFBD framebuffer descriptor. In memory it is followed
 * by the optional struct mali_framebuffer_extra and then by the array of
 * struct mali_render_target. The 0x20 and 0x30 comments below are byte
 * offsets from the start of the structure. */

struct mali_framebuffer {
        /* Two views of the leading part of the descriptor: stack and shared
         * memory configuration, or Bifrost multisampling configuration */
        union {
                struct mali_shared_memory shared_memory;
                struct bifrost_multisampling msaa;
        };

        /* 0x20 */
        /* The dimensions are stored twice */
        u16 width1, height1;
        u32 zero3;
        u16 width2, height2;

        /* One 32-bit word: 19 + 3 + 2 + 3 + 5 bits. The render target count
         * is stored twice, once off-by-one and once as-is. */
        u32 unk1 : 19; // = 0x01000
        u32 rt_count_1 : 3; // off-by-one (use MALI_POSITIVE)
        u32 unk2 : 2; // = 0
        u32 rt_count_2 : 3; // no off-by-one
        u32 zero4 : 5;
        /* 0x30 */
        u32 clear_stencil : 8;
        u32 mfbd_flags : 24; // = 0x100; see the MALI_MFBD_* flags
        float clear_depth;

        /* Tiler configuration: either the Midgard tiler descriptor, or a
         * pointer to tiler metadata followed by sixteen 32-bit words named
         * zeros.
         * NOTE(review): the second form is presumably the one used on
         * non-Midgard hardware -- confirm. */
        union {
                struct midgard_tiler_descriptor tiler;
                struct {
                        mali_ptr tiler_meta;
                        u32 zeros[16];
                };
        };

        /* optional: struct mali_framebuffer_extra  extra */
        /* struct mali_render_target rts[] */
} __attribute__((packed));

#endif /* __PANFROST_JOB_H__ */