src/broadcom/compiler/v3d_compiler.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474

/*
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef V3D_COMPILER_H
#define V3D_COMPILER_H

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#include "util/macros.h"
#include "common/v3d_debug.h"
#include "common/v3d_device_info.h"
#include "common/v3d_limits.h"
#include "compiler/nir/nir.h"
#include "util/list.h"
#include "util/u_math.h"

#include "qpu/qpu_instr.h"
#include "pipe/p_state.h"

/**
 * Maximum number of outstanding TMU operations we can queue for execution.
 *
 * This is mostly limited by the size of the TMU fifos. The Input and Config
 * fifos can stall, but we prefer that than injecting TMU flushes manually
 * in the driver, so we can ignore these, but we can't overflow the Output fifo,
 * which has 16 / threads per-thread entries, meaning that the maximum number
 * of outstanding LDTMUs we can ever have is 8, for a 2-way threaded shader.
 * This means that at most we can have 8 outstanding TMU loads, if each load
 * is just one component.
 *
 * NOTE: we could actually have a larger value here because TMU stores don't
 * consume any entries in the Output fifo (so we could have any number of
 * outstanding stores) and the driver keeps track of used Output fifo entries
 * and will flush if we ever needs more than 8, but since loads are much more
 * common than stores, it is probably not worth it.
 */
#define MAX_TMU_QUEUE_SIZE 8

/**
 * Maximum offset distance in bytes between two consecutive constant UBO loads
 * for the same UBO where we would favor updating the unifa address by emitting
 * dummy ldunifa instructions to avoid writing the unifa register.
 */
#define MAX_UNIFA_SKIP_DISTANCE 16

struct nir_builder;

struct v3d_fs_inputs {
        /**
         * Array of the meanings of the VPM inputs this shader needs.
         *
         * It doesn't include those that aren't part of the VPM, like
         * point/line coordinates.
         */
        struct v3d_varying_slot *input_slots;
        uint32_t num_inputs;
};

enum qfile {
        /** An unused source or destination register. */
        QFILE_NULL,

        /** A physical register, such as the W coordinate payload. */
        QFILE_REG,
        /** One of the regsiters for fixed function interactions. */
        QFILE_MAGIC,

        /**
         *  A virtual register, that will be allocated to actual accumulator
         * or physical registers later.
         */
        QFILE_TEMP,

        /**
         * VPM reads use this with an index value to say what part of the VPM
         * is being read.
         */
        QFILE_VPM,

        /**
         * Stores an immediate value in the index field that will be used
         * directly by qpu_load_imm().
         */
        QFILE_LOAD_IMM,

        /**
         * Stores an immediate value in the index field that can be turned
         * into a small immediate field by qpu_encode_small_immediate().
         */
        QFILE_SMALL_IMM,
};

/**
 * A reference to a QPU register or a virtual temp register.
 */
struct qreg {
        enum qfile file;
        uint32_t index;
};

static inline struct qreg vir_reg(enum qfile file, uint32_t index)
{
        return (struct qreg){file, index};
}

static inline struct qreg vir_magic_reg(uint32_t index)
{
        return (struct qreg){QFILE_MAGIC, index};
}

static inline struct qreg vir_nop_reg(void)
{
        return (struct qreg){QFILE_NULL, 0};
}

/**
 * A reference to an actual register at the QPU level, for register
 * allocation.
 */
struct qpu_reg {
        bool magic;
        bool smimm;
        int index;
};

struct qinst {
        /** Entry in qblock->instructions */
        struct list_head link;

        /**
         * The instruction being wrapped.  Its condition codes, pack flags,
         * signals, etc. will all be used, with just the register references
         * being replaced by the contents of qinst->dst and qinst->src[].
         */
        struct v3d_qpu_instr qpu;

        /* Pre-register-allocation references to src/dst registers */
        struct qreg dst;
        struct qreg src[3];
        bool is_last_thrsw;

        /* If the instruction reads a uniform (other than through src[i].file
         * == QFILE_UNIF), that uniform's index in c->uniform_contents.  ~0
         * otherwise.
         */
        int uniform;
};

enum quniform_contents {
        /**
         * Indicates that a constant 32-bit value is copied from the program's
         * uniform contents.
         */
        QUNIFORM_CONSTANT,
        /**
         * Indicates that the program's uniform contents are used as an index
         * into the GL uniform storage.
         */
        QUNIFORM_UNIFORM,

        /** @{
         * Scaling factors from clip coordinates to relative to the viewport
         * center.
         *
         * This is used by the coordinate and vertex shaders to produce the
         * 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed
         * point offsets from the viewport ccenter.
         */
        QUNIFORM_VIEWPORT_X_SCALE,
        QUNIFORM_VIEWPORT_Y_SCALE,
        /** @} */

        QUNIFORM_VIEWPORT_Z_OFFSET,
        QUNIFORM_VIEWPORT_Z_SCALE,

        QUNIFORM_USER_CLIP_PLANE,

        /**
         * A reference to a V3D 3.x texture config parameter 0 uniform.
         *
         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
         * defines texture type, miplevels, and such.  It will be found as a
         * parameter to the first QOP_TEX_[STRB] instruction in a sequence.
         */
        QUNIFORM_TEXTURE_CONFIG_P0_0,
        QUNIFORM_TEXTURE_CONFIG_P0_1,
        QUNIFORM_TEXTURE_CONFIG_P0_2,
        QUNIFORM_TEXTURE_CONFIG_P0_3,
        QUNIFORM_TEXTURE_CONFIG_P0_4,
        QUNIFORM_TEXTURE_CONFIG_P0_5,
        QUNIFORM_TEXTURE_CONFIG_P0_6,
        QUNIFORM_TEXTURE_CONFIG_P0_7,
        QUNIFORM_TEXTURE_CONFIG_P0_8,
        QUNIFORM_TEXTURE_CONFIG_P0_9,
        QUNIFORM_TEXTURE_CONFIG_P0_10,
        QUNIFORM_TEXTURE_CONFIG_P0_11,
        QUNIFORM_TEXTURE_CONFIG_P0_12,
        QUNIFORM_TEXTURE_CONFIG_P0_13,
        QUNIFORM_TEXTURE_CONFIG_P0_14,
        QUNIFORM_TEXTURE_CONFIG_P0_15,
        QUNIFORM_TEXTURE_CONFIG_P0_16,
        QUNIFORM_TEXTURE_CONFIG_P0_17,
        QUNIFORM_TEXTURE_CONFIG_P0_18,
        QUNIFORM_TEXTURE_CONFIG_P0_19,
        QUNIFORM_TEXTURE_CONFIG_P0_20,
        QUNIFORM_TEXTURE_CONFIG_P0_21,
        QUNIFORM_TEXTURE_CONFIG_P0_22,
        QUNIFORM_TEXTURE_CONFIG_P0_23,
        QUNIFORM_TEXTURE_CONFIG_P0_24,
        QUNIFORM_TEXTURE_CONFIG_P0_25,
        QUNIFORM_TEXTURE_CONFIG_P0_26,
        QUNIFORM_TEXTURE_CONFIG_P0_27,
        QUNIFORM_TEXTURE_CONFIG_P0_28,
        QUNIFORM_TEXTURE_CONFIG_P0_29,
        QUNIFORM_TEXTURE_CONFIG_P0_30,
        QUNIFORM_TEXTURE_CONFIG_P0_31,
        QUNIFORM_TEXTURE_CONFIG_P0_32,

        /**
         * A reference to a V3D 3.x texture config parameter 1 uniform.
         *
         * This is a uniform implicitly loaded with a QPU_W_TMU* write, which
         * has the pointer to the indirect texture state.  Our data[] field
         * will have a packed p1 value, but the address field will be just
         * which texture unit's texture should be referenced.
         */
        QUNIFORM_TEXTURE_CONFIG_P1,

        /* A V3D 4.x texture config parameter.  The high 8 bits will be
         * which texture or sampler is being sampled, and the driver must
         * replace the address field with the appropriate address.
         */
        QUNIFORM_TMU_CONFIG_P0,
        QUNIFORM_TMU_CONFIG_P1,

        QUNIFORM_IMAGE_TMU_CONFIG_P0,

        QUNIFORM_TEXTURE_FIRST_LEVEL,

        QUNIFORM_TEXTURE_WIDTH,
        QUNIFORM_TEXTURE_HEIGHT,
        QUNIFORM_TEXTURE_DEPTH,
        QUNIFORM_TEXTURE_ARRAY_SIZE,
        QUNIFORM_TEXTURE_LEVELS,
        QUNIFORM_TEXTURE_SAMPLES,

        QUNIFORM_UBO_ADDR,

        QUNIFORM_TEXRECT_SCALE_X,
        QUNIFORM_TEXRECT_SCALE_Y,

        /* Returns the base offset of the SSBO given by the data value. */
        QUNIFORM_SSBO_OFFSET,

        /* Returns the size of the SSBO or UBO given by the data value. */
        QUNIFORM_GET_SSBO_SIZE,
        QUNIFORM_GET_UBO_SIZE,

        /* Sizes (in pixels) of a shader image given by the data value. */
        QUNIFORM_IMAGE_WIDTH,
        QUNIFORM_IMAGE_HEIGHT,
        QUNIFORM_IMAGE_DEPTH,
        QUNIFORM_IMAGE_ARRAY_SIZE,

        QUNIFORM_LINE_WIDTH,

        /* The line width sent to hardware. This includes the expanded width
         * when anti-aliasing is enabled.
         */
        QUNIFORM_AA_LINE_WIDTH,

        /* Number of workgroups passed to glDispatchCompute in the dimension
         * selected by the data value.
         */
        QUNIFORM_NUM_WORK_GROUPS,

        /* Base workgroup offset passed to vkCmdDispatchBase in the dimension
         * selected by the data value.
         */
        QUNIFORM_WORK_GROUP_BASE,

        /**
         * Returns the the offset of the scratch buffer for register spilling.
         */
        QUNIFORM_SPILL_OFFSET,
        QUNIFORM_SPILL_SIZE_PER_THREAD,

        /**
         * Returns the offset of the shared memory for compute shaders.
         *
         * This will be accessed using TMU general memory operations, so the
         * L2T cache will effectively be the shared memory area.
         */
        QUNIFORM_SHARED_OFFSET,

        /**
         * Returns the number of layers in the framebuffer.
         *
         * This is used to cap gl_Layer in geometry shaders to avoid
         * out-of-bounds accesses into the tile state during binning.
         */
        QUNIFORM_FB_LAYERS,
};

static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
{
        assert(value < (1 << 24));
        return unit << 24 | value;
}

static inline uint32_t v3d_unit_data_get_unit(uint32_t data)
{
        return data >> 24;
}

static inline uint32_t v3d_unit_data_get_offset(uint32_t data)
{
        return data & 0xffffff;
}

struct v3d_varying_slot {
        uint8_t slot_and_component;
};

static inline struct v3d_varying_slot
v3d_slot_from_slot_and_component(uint8_t slot, uint8_t component)
{
        assert(slot < 255 / 4);
        return (struct v3d_varying_slot){ (slot << 2) + component };
}

static inline uint8_t v3d_slot_get_slot(struct v3d_varying_slot slot)
{
        return slot.slot_and_component >> 2;
}

static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
{
        return slot.slot_and_component & 3;
}

enum v3d_execution_environment {
   V3D_ENVIRONMENT_OPENGL = 0,
   V3D_ENVIRONMENT_VULKAN,
};

struct v3d_key {
        void *shader_state;
        struct {
                uint8_t swizzle[4];
        } tex[V3D_MAX_TEXTURE_SAMPLERS];
        struct {
                uint8_t return_size;
                uint8_t return_channels;
        } sampler[V3D_MAX_TEXTURE_SAMPLERS];

        uint8_t num_tex_used;
        uint8_t num_samplers_used;
        uint8_t ucp_enables;
        bool is_last_geometry_stage;
        bool robust_buffer_access;

        enum v3d_execution_environment environment;
};

struct v3d_fs_key {
        struct v3d_key base;
        bool is_points;
        bool is_lines;
        bool line_smoothing;
        bool point_coord_upper_left;
        bool msaa;
        bool sample_coverage;
        bool sample_alpha_to_coverage;
        bool sample_alpha_to_one;
        /* Mask of which color render targets are present. */
        uint8_t cbufs;
        uint8_t swap_color_rb;
        /* Mask of which render targets need to be written as 32-bit floats */
        uint8_t f32_color_rb;
        /* Masks of which render targets need to be written as ints/uints.
         * Used by gallium to work around lost information in TGSI.
         */
        uint8_t int_color_rb;
        uint8_t uint_color_rb;

        /* Color format information per render target. Only set when logic
         * operations are enabled.
         */
        struct {
                enum pipe_format format;
                const uint8_t *swizzle;
        } color_fmt[V3D_MAX_DRAW_BUFFERS];

        uint8_t logicop_func;
        uint32_t point_sprite_mask;

        struct pipe_rt_blend_state blend;

        /* If the fragment shader reads gl_PrimitiveID then we have 2 scenarios:
         *
         * - If there is a geometry shader, then gl_PrimitiveID must be written
         *   by it and the fragment shader loads it as a regular explicit input
         *   varying. This is the only valid use case in GLES 3.1.
         *
         * - If there is not a geometry shader (allowed since GLES 3.2 and
         *   Vulkan 1.0), then gl_PrimitiveID must be implicitly written by
         *   hardware and is considered an implicit input varying in the
         *   fragment shader.
         */
        bool has_gs;
};

struct v3d_gs_key {
        struct v3d_key base;

        struct v3d_varying_slot used_outputs[V3D_MAX_FS_INPUTS];
        uint8_t num_used_outputs;

        bool is_coord;
        bool per_vertex_point_size;
};

struct v3d_vs_key {
        struct v3d_key base;

        struct v3d_varying_slot used_outputs[V3D_MAX_ANY_STAGE_INPUTS];
        uint8_t num_used_outputs;

        /* A bit-mask indicating if we need to swap the R/B channels for
         * vertex attributes. Since the hardware doesn't provide any
         * means to swizzle vertex attributes we need to do it in the shader.
         */
        uint32_t va_swap_rb_mask;

        bool is_coord;
        bool per_vertex_point_size;
        bool clamp_color;
};

/** A basic block of VIR intructions. */
struct qblock {
        struct list_head link;

        struct list_head instructions;

        struct set *predecessors;
        struct qblock *successors[2];

        int index;

        /* Instruction IPs for the first and last instruction of the block.
         * Set by qpu_schedule.c.
         */
        uint32_t start_qpu_ip;
        uint32_t end_qpu_ip;

        /* Instruction IP for the branch instruction of the block.  Set by
         * qpu_schedule.c.
         */
        uint32_t branch_qpu_ip;

        /** Offset within the uniform stream at the start of the block. */
        uint32_t start_uniform;
        /** Offset within the uniform stream of the branch instruction */
        uint32_t branch_uniform;

        /**
         * Has the terminating branch of this block already been emitted
         * by a break or continue?
         */
        bool branch_emitted;

        /** @{ used by v3d_vir_live_variables.c */
        BITSET_WORD *def;
        BITSET_WORD *defin;
        BITSET_WORD *defout;
        BITSET_WORD *use;
        BITSET_WORD *live_in;
        BITSET_WORD *live_out;
        int start_ip, end_ip;
        /** @} */
};

/** Which util/list.h add mode we should use when inserting an instruction. */
enum vir_cursor_mode {
        vir_cursor_add,
        vir_cursor_addtail,
};

/**
 * Tracking structure for where new instructions should be inserted.  Create
 * with one of the vir_after_inst()-style helper functions.
 *
 * This does not protect against removal of the block or instruction, so we
 * have an assert in instruction removal to try to catch it.
 */
struct vir_cursor {
        enum vir_cursor_mode mode;
        struct list_head *link;
};

static inline struct vir_cursor
vir_before_inst(struct qinst *inst)
{
        return (struct vir_cursor){ vir_cursor_addtail, &inst->link };
}

static inline struct vir_cursor
vir_after_inst(struct qinst *inst)
{
        return (struct vir_cursor){ vir_cursor_add, &inst->link };
}

static inline struct vir_cursor
vir_before_block(struct qblock *block)
{
        return (struct vir_cursor){ vir_cursor_add, &block->instructions };
}

static inline struct vir_cursor
vir_after_block(struct qblock *block)
{
        return (struct vir_cursor){ vir_cursor_addtail, &block->instructions };
}

enum v3d_compilation_result {
        V3D_COMPILATION_SUCCEEDED,
        V3D_COMPILATION_FAILED_REGISTER_ALLOCATION,
        V3D_COMPILATION_FAILED,
};

/**
 * Compiler state saved across compiler invocations, for any expensive global
 * setup.
 */
struct v3d_compiler {
        const struct v3d_device_info *devinfo;
        struct ra_regs *regs;
        struct ra_class *reg_class_any[3];
        struct ra_class *reg_class_r5[3];
        struct ra_class *reg_class_phys[3];
        struct ra_class *reg_class_phys_or_acc[3];
};

/**
 * This holds partially interpolated inputs as provided by hardware
 * (The Vp = A*(x - x0) + B*(y - y0) term), as well as the C coefficient
 * required to compute the final interpolated value.
 */
struct v3d_interp_input {
   struct qreg vp;
   struct qreg C;
   unsigned mode; /* interpolation mode */
};

struct v3d_compile {
        const struct v3d_device_info *devinfo;
        nir_shader *s;
        nir_function_impl *impl;
        struct exec_list *cf_node_list;
        const struct v3d_compiler *compiler;

        void (*debug_output)(const char *msg,
                             void *debug_output_data);
        void *debug_output_data;

        /**
         * Mapping from nir_register * or nir_ssa_def * to array of struct
         * qreg for the values.
         */
        struct hash_table *def_ht;

        /* For each temp, the instruction generating its value. */
        struct qinst **defs;
        uint32_t defs_array_size;

        /* TMU pipelining tracking */
        struct {
                /* NIR registers that have been updated with a TMU operation
                 * that has not been flushed yet.
                 */
                struct set *outstanding_regs;

                uint32_t output_fifo_size;

                struct {
                        nir_dest *dest;
                        uint8_t num_components;
                        uint8_t component_mask;
                } flush[MAX_TMU_QUEUE_SIZE];
                uint32_t flush_count;
        } tmu;

        /**
         * Inputs to the shader, arranged by TGSI declaration order.
         *
         * Not all fragment shader QFILE_VARY reads are present in this array.
         */
        struct qreg *inputs;
        /**
         * Partially interpolated inputs to the shader.
         */
        struct v3d_interp_input *interp;
        struct qreg *outputs;
        bool msaa_per_sample_output;
        struct qreg color_reads[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
        struct qreg sample_colors[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
        uint32_t inputs_array_size;
        uint32_t outputs_array_size;
        uint32_t uniforms_array_size;

        /* Booleans for whether the corresponding QFILE_VARY[i] is
         * flat-shaded.  This includes gl_FragColor flat-shading, which is
         * customized based on the shademodel_flat shader key.
         */
        uint32_t flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];

        uint32_t noperspective_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];

        uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];

        bool uses_center_w;
        bool writes_z;
        bool uses_implicit_point_line_varyings;

        /* True if a fragment shader reads gl_PrimitiveID */
        bool fs_uses_primitive_id;

        /* If the fragment shader does anything that requires to force
         * per-sample MSAA, such as reading gl_SampleID.
         */
        bool force_per_sample_msaa;

        /* Whether we are using the fallback scheduler. This will be set after
         * register allocation has failed once.
         */
        bool fallback_scheduler;

        /* Disable TMU pipelining. This may increase the chances of being able
         * to compile shaders with high register pressure that require to emit
         * TMU spills.
         */
        bool disable_tmu_pipelining;
        bool pipelined_any_tmu;

        /* Disable sorting of UBO loads with constant offset. This may
         * increase the chances of being able to compile shaders with high
         * register pressure.
         */
        bool disable_constant_ubo_load_sorting;
        bool sorted_any_ubo_loads;

        /* Emits ldunif for each new uniform, even if the uniform was already
         * emitted in the same block. Useful to compile shaders with high
         * register pressure or to disable the optimization during uniform
         * spills.
         */
        bool disable_ldunif_opt;

        /* Disables loop unrolling to reduce register pressure. */
        bool disable_loop_unrolling;
        bool unrolled_any_loops;

        /* Minimum number of threads we are willing to use to register allocate
         * a shader with the current compilation strategy. This only prevents
         * us from lowering the thread count to register allocate successfully,
         * which can be useful when we prefer doing other changes to the
         * compilation strategy before dropping thread count.
         */
        uint32_t min_threads_for_reg_alloc;

        /* Whether TMU spills are allowed. If this is disabled it may cause
         * register allocation to fail. We set this to favor other compilation
         * strategies that can reduce register pressure and hopefully reduce or
         * eliminate TMU spills in the shader.
         */
        bool tmu_spilling_allowed;

        /* The UBO index and block used with the last unifa load, as well as the
         * current unifa offset *after* emitting that load. This is used to skip
         * unifa writes (and their 3 delay slot) when the next UBO load reads
         * right after the previous one in the same block.
         */
        struct qblock *current_unifa_block;
        int32_t current_unifa_index;
        uint32_t current_unifa_offset;

        /* State for whether we're executing on each channel currently.  0 if
         * yes, otherwise a block number + 1 that the channel jumped to.
         */
        struct qreg execute;
        bool in_control_flow;

        struct qreg line_x, point_x, point_y, primitive_id;

        /**
         * Instance ID, which comes in before the vertex attribute payload if
         * the shader record requests it.
         */
        struct qreg iid;

        /**
         * Base Instance ID, which comes in before the vertex attribute payload
         * (after Instance ID) if the shader record requests it.
         */
        struct qreg biid;

        /**
         * Vertex ID, which comes in before the vertex attribute payload
         * (after Base Instance) if the shader record requests it.
         */
        struct qreg vid;

        /* Fragment shader payload regs. */
        struct qreg payload_w, payload_w_centroid, payload_z;

        struct qreg cs_payload[2];
        struct qreg cs_shared_offset;
        int local_invocation_index_bits;

        /* If the shader uses subgroup functionality */
        bool has_subgroups;

        uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
        uint32_t vpm_output_size;

        /* Size in bytes of registers that have been spilled. This is how much
         * space needs to be available in the spill BO per thread per QPU.
         */
        uint32_t spill_size;
        /* Shader-db stats */
        uint32_t spills, fills, loops;
        /**
         * Register spilling's per-thread base address, shared between each
         * spill/fill's addressing calculations.
         */
        struct qreg spill_base;
        /* Bit vector of which temps may be spilled */
        BITSET_WORD *spillable;

        /**
         * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
         *
         * This includes those that aren't part of the VPM varyings, like
         * point/line coordinates.
         */
        struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];

        /**
         * An entry per outputs[] in the VS indicating what the VARYING_SLOT_*
         * of the output is.  Used to emit from the VS in the order that the
         * FS needs.
         */
        struct v3d_varying_slot *output_slots;

        struct pipe_shader_state *shader_state;
        struct v3d_key *key;
        struct v3d_fs_key *fs_key;
        struct v3d_gs_key *gs_key;
        struct v3d_vs_key *vs_key;

        /* Live ranges of temps. */
        int *temp_start, *temp_end;
        bool live_intervals_valid;

        uint32_t *uniform_data;
        enum quniform_contents *uniform_contents;
        uint32_t uniform_array_size;
        uint32_t num_uniforms;
        uint32_t output_position_index;
        nir_variable *output_color_var[4];
        uint32_t output_sample_mask_index;

        struct qreg undef;
        uint32_t num_temps;

        struct vir_cursor cursor;
        struct list_head blocks;
        int next_block_index;
        struct qblock *cur_block;
        struct qblock *loop_cont_block;
        struct qblock *loop_break_block;
        /**
         * Which temp, if any, do we currently have in the flags?
         * This is set when processing a comparison instruction, and
         * reset to -1 by anything else that touches the flags.
         */
        int32_t flags_temp;
        enum v3d_qpu_cond flags_cond;

        uint64_t *qpu_insts;
        uint32_t qpu_inst_count;
        uint32_t qpu_inst_size;
        uint32_t qpu_inst_stalled_count;
        uint32_t nop_count;

        /* For the FS, the number of varying inputs not counting the
         * point/line varyings payload
         */
        uint32_t num_inputs;

        uint32_t program_id;
        uint32_t variant_id;

        /* Set to compile program in in 1x, 2x, or 4x threaded mode, where
         * SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
         * limiting ourselves to the part of the physical reg space.
         *
         * On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x.  On
         * V3D 4.x, all shaders are 2x threaded, and 4x only divides the
         * physical reg space in half.
         */
        uint8_t threads;
        struct qinst *last_thrsw;
        bool last_thrsw_at_top_level;

        bool emitted_tlb_load;
        bool lock_scoreboard_on_first_thrsw;

        /* Total number of spilled registers in the program */
        uint32_t spill_count;

        enum v3d_compilation_result compilation_result;

        bool tmu_dirty_rcl;
};

struct v3d_uniform_list {
        enum quniform_contents *contents;
        uint32_t *data;
        uint32_t count;
};

struct v3d_prog_data {
        struct v3d_uniform_list uniforms;

        uint32_t spill_size;

        uint8_t threads;

        /* For threads > 1, whether the program should be dispatched in the
         * after-final-THRSW state.
         */
        bool single_seg;

        bool tmu_dirty_rcl;

        bool has_control_barrier;
};

struct v3d_vs_prog_data {
        struct v3d_prog_data base;

        bool uses_iid, uses_biid, uses_vid;

        /* Number of components read from each vertex attribute. */
        uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];

        /* Total number of components read, for the shader state record. */
        uint32_t vpm_input_size;

        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;

        /* Set if there should be separate VPM segments for input and output.
         * If unset, vpm_input_size will be 0.
         */
        bool separate_segments;

        /* Value to be programmed in VCM_CACHE_SIZE. */
        uint8_t vcm_cache_size;

        /* Maps the nir->data.location to its
         * nir->data.driver_location. In general we are using the
         * driver location as index (like vattr_sizes above), so this
         * map is useful when what we have is the location
         *
         * Returns -1 if the location is not used
         */
        int32_t driver_location_map[V3D_MAX_VS_INPUTS];
};

struct v3d_gs_prog_data {
        struct v3d_prog_data base;

        /* Whether the program reads gl_PrimitiveIDIn */
        bool uses_pid;

        /* Number of components read from each input varying. */
        uint8_t input_sizes[V3D_MAX_GS_INPUTS / 4];

        /* Number of inputs */
        uint8_t num_inputs;
        struct v3d_varying_slot input_slots[V3D_MAX_GS_INPUTS];

        /* Total number of components written, for the shader state record. */
        uint32_t vpm_output_size;

        /* Maximum SIMD dispatch width to not exceed VPM output size limits
         * in the geometry shader. Notice that the final dispatch width has to
         * be decided at draw time and could be lower based on the VPM pressure
         * added by other shader stages.
         */
        uint8_t simd_width;

        /* Output primitive type */
        uint8_t out_prim_type;

        /* Number of GS invocations */
        uint8_t num_invocations;

        bool writes_psiz;
};

struct v3d_fs_prog_data {
        struct v3d_prog_data base;

        /* Whether the program reads gl_PrimitiveID */
        bool uses_pid;

        struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];

        /* Array of flat shade flags.
         *
         * Each entry is only 24 bits (high 8 bits 0), to match the hardware
         * packet layout.
         */
        uint32_t flat_shade_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];

        uint32_t noperspective_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];

        uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];

        uint8_t num_inputs;
        bool writes_z;
        bool disable_ez;
        bool uses_center_w;
        bool uses_implicit_point_line_varyings;
        bool lock_scoreboard_on_first_thrsw;
        bool force_per_sample_msaa;
};

struct v3d_compute_prog_data {
        struct v3d_prog_data base;
        /* Size in bytes of the workgroup's shared space. */
        uint32_t shared_size;
        uint16_t local_size[3];
        /* If the shader uses subgroup functionality */
        bool has_subgroups;
};

struct vpm_config {
   uint32_t As;
   uint32_t Vc;
   uint32_t Gs;
   uint32_t Gd;
   uint32_t Gv;
   uint32_t Ve;
   uint32_t gs_width;
};

bool
v3d_compute_vpm_config(struct v3d_device_info *devinfo,
                       struct v3d_vs_prog_data *vs_bin,
                       struct v3d_vs_prog_data *vs,
                       struct v3d_gs_prog_data *gs_bin,
                       struct v3d_gs_prog_data *gs,
                       struct vpm_config *vpm_cfg_bin,
                       struct vpm_config *vpm_cfg);

static inline bool
vir_has_uniform(struct qinst *inst)
{
        return inst->uniform != ~0;
}

const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
void v3d_compiler_free(const struct v3d_compiler *compiler);
void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);

uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                      struct v3d_key *key,
                      struct v3d_prog_data **prog_data,
                      nir_shader *s,
                      void (*debug_output)(const char *msg,
                                           void *debug_output_data),
                      void *debug_output_data,
                      int program_id, int variant_id,
                      uint32_t *final_assembly_size);

uint32_t v3d_prog_data_size(gl_shader_stage stage);
void v3d_nir_to_vir(struct v3d_compile *c);

void vir_compile_destroy(struct v3d_compile *c);
const char *vir_get_stage_name(struct v3d_compile *c);
struct qblock *vir_new_block(struct v3d_compile *c);
void vir_set_emit_block(struct v3d_compile *c, struct qblock *block);
void vir_link_blocks(struct qblock *predecessor, struct qblock *successor);
struct qblock *vir_entry_block(struct v3d_compile *c);
struct qblock *vir_exit_block(struct v3d_compile *c);
struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst,
                           struct qreg src0, struct qreg src1);
struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst,
                           struct qreg src0, struct qreg src1);
struct qinst *vir_branch_inst(struct v3d_compile *c,
                              enum v3d_qpu_branch_cond cond);
void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst);
uint32_t vir_get_uniform_index(struct v3d_compile *c,
                               enum quniform_contents contents,
                               uint32_t data);
struct qreg vir_uniform(struct v3d_compile *c,
                        enum quniform_contents contents,
                        uint32_t data);
void vir_schedule_instructions(struct v3d_compile *c);
void v3d_setup_spill_base(struct v3d_compile *c);
struct v3d_qpu_instr v3d_qpu_nop(void);

struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst);
struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst);
void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond);
void vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf);
void vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf);
void vir_set_unpack(struct qinst *inst, int src,
                    enum v3d_qpu_input_unpack unpack);
void vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack);

struct qreg vir_get_temp(struct v3d_compile *c);
void vir_emit_last_thrsw(struct v3d_compile *c);
void vir_calculate_live_intervals(struct v3d_compile *c);
int vir_get_nsrc(struct qinst *inst);
bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst);
bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op);
bool vir_get_mul_op(struct qinst *inst, enum v3d_qpu_mul_op *op);
bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                    struct qreg result);
bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components);
void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
                               uint32_t component_mask);
void ntq_flush_tmu(struct v3d_compile *c);
void vir_emit_thrsw(struct v3d_compile *c);

void vir_dump(struct v3d_compile *c);
void vir_dump_inst(struct v3d_compile *c, struct qinst *inst);
void vir_dump_uniform(enum quniform_contents contents, uint32_t data);

void vir_validate(struct v3d_compile *c);

void vir_optimize(struct v3d_compile *c);
bool vir_opt_algebraic(struct v3d_compile *c);
bool vir_opt_constant_folding(struct v3d_compile *c);
bool vir_opt_copy_propagate(struct v3d_compile *c);
bool vir_opt_dead_code(struct v3d_compile *c);
bool vir_opt_peephole_sf(struct v3d_compile *c);
bool vir_opt_redundant_flags(struct v3d_compile *c);
bool vir_opt_small_immediates(struct v3d_compile *c);
bool vir_opt_vpm(struct v3d_compile *c);
bool vir_opt_constant_alu(struct v3d_compile *c);
void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_line_smooth(nir_shader *shader);
void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile *c);
void v3d_nir_lower_scratch(nir_shader *s);
void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_image_load_store(nir_shader *s);
void vir_lower_uniforms(struct v3d_compile *c);

void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
void v3d33_vir_vpm_write_setup(struct v3d_compile *c);
void v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
void v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
void v3d40_vir_emit_image_load_store(struct v3d_compile *c,
                                     nir_intrinsic_instr *instr);

void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
void qpu_validate(struct v3d_compile *c);
struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled);
bool vir_init_reg_sets(struct v3d_compiler *compiler);

int v3d_shaderdb_dump(struct v3d_compile *c, char **shaderdb_str);

bool v3d_gl_format_is_return_32(GLenum format);

uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src);

static inline bool
quniform_contents_is_texture_p0(enum quniform_contents contents)
{
        return (contents >= QUNIFORM_TEXTURE_CONFIG_P0_0 &&
                contents < (QUNIFORM_TEXTURE_CONFIG_P0_0 +
                            V3D_MAX_TEXTURE_SAMPLERS));
}

static inline bool
vir_in_nonuniform_control_flow(struct v3d_compile *c)
{
        return c->execute.file != QFILE_NULL;
}

static inline struct qreg
vir_uniform_ui(struct v3d_compile *c, uint32_t ui)
{
        return vir_uniform(c, QUNIFORM_CONSTANT, ui);
}

static inline struct qreg
vir_uniform_f(struct v3d_compile *c, float f)
{
        return vir_uniform(c, QUNIFORM_CONSTANT, fui(f));
}

#define VIR_ALU0(name, vir_inst, op)                                     \
static inline struct qreg                                                \
vir_##name(struct v3d_compile *c)                                        \
{                                                                        \
        return vir_emit_def(c, vir_inst(op, c->undef,                    \
                                        c->undef, c->undef));            \
}                                                                        \
static inline struct qinst *                                             \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest)               \
{                                                                        \
        return vir_emit_nondef(c, vir_inst(op, dest,                     \
                                           c->undef, c->undef));         \
}

#define VIR_ALU1(name, vir_inst, op)                                     \
static inline struct qreg                                                \
vir_##name(struct v3d_compile *c, struct qreg a)                         \
{                                                                        \
        return vir_emit_def(c, vir_inst(op, c->undef,                    \
                                        a, c->undef));                   \
}                                                                        \
static inline struct qinst *                                             \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest,               \
                  struct qreg a)                                         \
{                                                                        \
        return vir_emit_nondef(c, vir_inst(op, dest, a,          \
                                           c->undef));                   \
}

#define VIR_ALU2(name, vir_inst, op)                                       \
static inline struct qreg                                                \
vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b)          \
{                                                                        \
        return vir_emit_def(c, vir_inst(op, c->undef, a, b));    \
}                                                                        \
static inline struct qinst *                                             \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest,               \
                  struct qreg a, struct qreg b)                          \
{                                                                        \
        return vir_emit_nondef(c, vir_inst(op, dest, a, b));     \
}

#define VIR_NODST_0(name, vir_inst, op)                                 \
static inline struct qinst *                                            \
vir_##name(struct v3d_compile *c)                                       \
{                                                                       \
        return vir_emit_nondef(c, vir_inst(op, c->undef,                \
                                           c->undef, c->undef));        \
}

#define VIR_NODST_1(name, vir_inst, op)                                               \
static inline struct qinst *                                            \
vir_##name(struct v3d_compile *c, struct qreg a)                        \
{                                                                       \
        return vir_emit_nondef(c, vir_inst(op, c->undef,        \
                                           a, c->undef));               \
}

#define VIR_NODST_2(name, vir_inst, op)                                               \
static inline struct qinst *                                            \
vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b)         \
{                                                                       \
        return vir_emit_nondef(c, vir_inst(op, c->undef,                \
                                           a, b));                      \
}

#define VIR_SFU(name)                                                      \
static inline struct qreg                                                \
vir_##name(struct v3d_compile *c, struct qreg a)                         \
{                                                                        \
        if (c->devinfo->ver >= 41) {                                     \
                return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name,    \
                                                    c->undef,            \
                                                    a, c->undef));       \
        } else {                                                         \
                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
                return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
        }                                                                \
}                                                                        \
static inline struct qinst *                                             \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest,               \
                  struct qreg a)                                         \
{                                                                        \
        if (c->devinfo->ver >= 41) {                                     \
                return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name, \
                                                       dest,             \
                                                       a, c->undef));    \
        } else {                                                         \
                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
                return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
        }                                                                \
}

#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_ALU1(name) VIR_ALU1(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_ALU0(name) VIR_ALU0(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_ALU0(name) VIR_ALU0(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_NODST_2(name) VIR_NODST_2(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_NODST_0(name) VIR_NODST_0(name, vir_add_inst, V3D_QPU_A_##name)

VIR_A_ALU2(FADD)
VIR_A_ALU2(VFPACK)
VIR_A_ALU2(FSUB)
VIR_A_ALU2(FMIN)
VIR_A_ALU2(FMAX)

VIR_A_ALU2(ADD)
VIR_A_ALU2(SUB)
VIR_A_ALU2(SHL)
VIR_A_ALU2(SHR)
VIR_A_ALU2(ASR)
VIR_A_ALU2(ROR)
VIR_A_ALU2(MIN)
VIR_A_ALU2(MAX)
VIR_A_ALU2(UMIN)
VIR_A_ALU2(UMAX)
VIR_A_ALU2(AND)
VIR_A_ALU2(OR)
VIR_A_ALU2(XOR)
VIR_A_ALU2(VADD)
VIR_A_ALU2(VSUB)
VIR_A_NODST_2(STVPMV)
VIR_A_NODST_2(STVPMD)
VIR_A_ALU1(NOT)
VIR_A_ALU1(NEG)
VIR_A_ALU1(FLAPUSH)
VIR_A_ALU1(FLBPUSH)
VIR_A_ALU1(FLPOP)
VIR_A_ALU0(FLAFIRST)
VIR_A_ALU0(FLNAFIRST)
VIR_A_ALU1(SETMSF)
VIR_A_ALU1(SETREVF)
VIR_A_ALU0(TIDX)
VIR_A_ALU0(EIDX)
VIR_A_ALU1(LDVPMV_IN)
VIR_A_ALU1(LDVPMV_OUT)
VIR_A_ALU1(LDVPMD_IN)
VIR_A_ALU1(LDVPMD_OUT)
VIR_A_ALU2(LDVPMG_IN)
VIR_A_ALU2(LDVPMG_OUT)
VIR_A_ALU0(TMUWT)

VIR_A_ALU0(IID)
VIR_A_ALU0(FXCD)
VIR_A_ALU0(XCD)
VIR_A_ALU0(FYCD)
VIR_A_ALU0(YCD)
VIR_A_ALU0(MSF)
VIR_A_ALU0(REVF)
VIR_A_ALU0(BARRIERID)
VIR_A_ALU0(SAMPID)
VIR_A_NODST_1(VPMSETUP)
VIR_A_NODST_0(VPMWT)
VIR_A_ALU2(FCMP)
VIR_A_ALU2(VFMAX)

VIR_A_ALU1(FROUND)
VIR_A_ALU1(FTOIN)
VIR_A_ALU1(FTRUNC)
VIR_A_ALU1(FTOIZ)
VIR_A_ALU1(FFLOOR)
VIR_A_ALU1(FTOUZ)
VIR_A_ALU1(FCEIL)
VIR_A_ALU1(FTOC)

VIR_A_ALU1(FDX)
VIR_A_ALU1(FDY)

VIR_A_ALU1(ITOF)
VIR_A_ALU1(CLZ)
VIR_A_ALU1(UTOF)

VIR_M_ALU2(UMUL24)
VIR_M_ALU2(FMUL)
VIR_M_ALU2(SMUL24)
VIR_M_NODST_2(MULTOP)

VIR_M_ALU1(MOV)
VIR_M_ALU1(FMOV)

VIR_SFU(RECIP)
VIR_SFU(RSQRT)
VIR_SFU(EXP)
VIR_SFU(LOG)
VIR_SFU(SIN)
VIR_SFU(RSQRT2)

static inline struct qinst *
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
             struct qreg dest, struct qreg src)
{
        struct qinst *mov = vir_MOV_dest(c, dest, src);
        vir_set_cond(mov, cond);
        return mov;
}

static inline struct qreg
vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond,
        struct qreg src0, struct qreg src1)
{
        struct qreg t = vir_get_temp(c);
        vir_MOV_dest(c, t, src1);
        vir_MOV_cond(c, cond, t, src0);
        return t;
}

static inline struct qinst *
vir_NOP(struct v3d_compile *c)
{
        return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP,
                                               c->undef, c->undef, c->undef));
}

static inline struct qreg
vir_LDTMU(struct v3d_compile *c)
{
        if (c->devinfo->ver >= 41) {
                struct qinst *ldtmu = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                                   c->undef, c->undef);
                ldtmu->qpu.sig.ldtmu = true;

                return vir_emit_def(c, ldtmu);
        } else {
                vir_NOP(c)->qpu.sig.ldtmu = true;
                return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
        }
}

static inline struct qreg
vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1)
{
        vir_MULTOP(c, src0, src1);
        return vir_UMUL24(c, src0, src1);
}

static inline struct qreg
vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
{
        assert(c->devinfo->ver >= 41); /* XXX */
        assert((config & 0xffffff00) == 0xffffff00);

        struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                           c->undef, c->undef);
        ldtlb->qpu.sig.ldtlbu = true;
        ldtlb->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
        return vir_emit_def(c, ldtlb);
}

static inline struct qreg
vir_TLB_COLOR_READ(struct v3d_compile *c)
{
        assert(c->devinfo->ver >= 41); /* XXX */

        struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                           c->undef, c->undef);
        ldtlb->qpu.sig.ldtlb = true;
        return vir_emit_def(c, ldtlb);
}

/*
static inline struct qreg
vir_LOAD_IMM(struct v3d_compile *c, uint32_t val)
{
        return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef,
                                        vir_reg(QFILE_LOAD_IMM, val), c->undef));
}

static inline struct qreg
vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val)
{
        return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef,
                                        vir_reg(QFILE_LOAD_IMM, val),
                                        c->undef));
}
static inline struct qreg
vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val)
{
        return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef,
                                        vir_reg(QFILE_LOAD_IMM, val),
                                        c->undef));
}
*/

static inline struct qinst *
vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
        /* The actual uniform_data value will be set at scheduling time */
        return vir_emit_nondef(c, vir_branch_inst(c, cond));
}

#define vir_for_each_block(block, c)                                    \
        list_for_each_entry(struct qblock, block, &c->blocks, link)

#define vir_for_each_block_rev(block, c)                                \
        list_for_each_entry_rev(struct qblock, block, &c->blocks, link)

/* Loop over the non-NULL members of the successors array. */
#define vir_for_each_successor(succ, block)                             \
        for (struct qblock *succ = block->successors[0];                \
             succ != NULL;                                              \
             succ = (succ == block->successors[1] ? NULL :              \
                     block->successors[1]))

#define vir_for_each_inst(inst, block)                                  \
        list_for_each_entry(struct qinst, inst, &block->instructions, link)

#define vir_for_each_inst_rev(inst, block)                                  \
        list_for_each_entry_rev(struct qinst, inst, &block->instructions, link)

#define vir_for_each_inst_safe(inst, block)                             \
        list_for_each_entry_safe(struct qinst, inst, &block->instructions, link)

#define vir_for_each_inst_inorder(inst, c)                              \
        vir_for_each_block(_block, c)                                   \
                vir_for_each_inst(inst, _block)

#define vir_for_each_inst_inorder_safe(inst, c)                         \
        vir_for_each_block(_block, c)                                   \
                vir_for_each_inst_safe(inst, _block)

#endif /* V3D_COMPILER_H */