summaryrefslogtreecommitdiff
path: root/src/r600_shader.h
blob: d79ed4005213bf2aadbf72b070e88a48055c9d62 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
/*
 * RadeonHD R6xx, R7xx DRI driver
 *
 * Copyright (C) 2008-2009  Alexander Deucher
 * Copyright (C) 2008-2009  Matthias Hopf
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * Shader macros
 */

#ifndef __SHADER_H__
#define __SHADER_H__

#include "radeon.h"

/* Restrictions of ALU instructions
 * order of scalar ops is always x,y,z,w,t(rans), last to be indicated by last==1.
 * max of 3 different src GPRs per instr.
 * max of 4 different cfile constant components per instr.
 * max of 2 (different) constants (any type) for t.
 * bank swizzle (see below).
 * GPR write stalls read of same register. Auto-replaced by PV/PS, NOP needed if registers are relative to
 * different indices (gpr,loop,nothing).
 * may use constant registers or constant cache, but not both.
 */

/* Bank_swizzle: (pp. 297ff)
 * Only one of each x,y,z,w GPR component can be loaded per cycle (3 cycles per instr, called 0-2).
 * per scalar instruction bank_swizzle can select which cycle each operand comes from. e.g.:
 *   SRC0 SRC1 SRC2  SWIZZLE  cycle0 cycle1 cycle2
 *   1.x  2.x          012     1.x    2.x     -
 *   3.x  1.y          201     1.y     -     3.x
 *   2.x  1.y          102    (1.y)  (2.x)    -
 * If data is read in a cycle, multiple scalar instructions can reference it.
 * Special case: square() - i.e. same component in src0+src1 doesn't need read port -> ignores swizzle for src1.
 * No restrictions for constants or PV/PS.
 * t can load multiple components in a single cycle slot, but has to share cycles with xyzw.
 * t with single constant may not load GPRs or PV/PS in cycle 0 (carefull with ALU_TRANS_210).
 * t with two constants may only load GPRs or PV/PS in cycle 2.
 */


/* Oder of instructions: All CF, All ALU, All Tex/Vtx fetches */


// CF insts
// addr
#define ADDR(x)  (x)
// pc
#define POP_COUNT(x)      (x)
// const
#define CF_CONST(x)       (x)
// cond
#define COND(x)        (x)		// SQ_COND_*
// count
#define I_COUNT(x)        ((x) ? ((x) - 1) : 0)
//r7xx
#define COUNT_3(x)        (x)
// call count
#define CALL_COUNT(x)     (x)
// eop
#define END_OF_PROGRAM(x)   (x)
// vpm
#define VALID_PIXEL_MODE(x) (x)
// cf inst
#define CF_INST(x)        (x)		// SQ_CF_INST_*

// wqm
#define WHOLE_QUAD_MODE(x)  (x)
// barrier
#define BARRIER(x)          (x)
//kb0
#define KCACHE_BANK0(x)          (x)
//kb1
#define KCACHE_BANK1(x)          (x)
// km0/1
#define KCACHE_MODE0(x)          (x)
#define KCACHE_MODE1(x)          (x)	// SQ_CF_KCACHE_*
//
#define KCACHE_ADDR0(x)          (x)
#define KCACHE_ADDR1(x)          (x)
// uw
#define USES_WATERFALL(x)        (x)

#define ARRAY_BASE(x)        (x)
// export pixel
#define CF_PIXEL_MRT0         0
#define CF_PIXEL_MRT1         1
#define CF_PIXEL_MRT2         2
#define CF_PIXEL_MRT3         3
#define CF_PIXEL_MRT4         4
#define CF_PIXEL_MRT5         5
#define CF_PIXEL_MRT6         6
#define CF_PIXEL_MRT7         7
// *_FOG: r6xx only
#define CF_PIXEL_MRT0_FOG     16
#define CF_PIXEL_MRT1_FOG     17
#define CF_PIXEL_MRT2_FOG     18
#define CF_PIXEL_MRT3_FOG     19
#define CF_PIXEL_MRT4_FOG     20
#define CF_PIXEL_MRT5_FOG     21
#define CF_PIXEL_MRT6_FOG     22
#define CF_PIXEL_MRT7_FOG     23
#define CF_PIXEL_Z            61
// export pos
#define CF_POS0               60
#define CF_POS1               61
#define CF_POS2               62
#define CF_POS3               63
// export param
// 0...31
#define TYPE(x)              (x)	// SQ_EXPORT_*
#if 0
// type export
#define SQ_EXPORT_PIXEL              0
#define SQ_EXPORT_POS                1
#define SQ_EXPORT_PARAM              2
// reserved 3
// type mem
#define SQ_EXPORT_WRITE              0
#define SQ_EXPORT_WRITE_IND          1
#define SQ_EXPORT_WRITE_ACK          2
#define SQ_EXPORT_WRITE_IND_ACK      3
#endif

#define RW_GPR(x)            (x)
#define RW_REL(x)            (x)
#define ABSOLUTE                  0
#define RELATIVE                  1
#define INDEX_GPR(x)            (x)
#define ELEM_SIZE(x)            (x ? (x - 1) : 0)
#define COMP_MASK(x)            (x)
#define R6xx_ELEM_LOOP(x)            (x)
#define BURST_COUNT(x)          (x ? (x - 1) : 0)

// swiz
#define SRC_SEL_X(x)    (x)		// SQ_SEL_* each
#define SRC_SEL_Y(x)    (x)
#define SRC_SEL_Z(x)    (x)
#define SRC_SEL_W(x)    (x)

#define CF_DWORD0(addr) cpu_to_le32((addr))
// R7xx has another entry (COUNT3), but that is only used for adding a bit to count.
// We allow one more bit for count in the argument of the macro on R7xx instead.
// R6xx: [0,7]  R7xx: [1,16]
#define CF_DWORD1(pc, cf_const, cond, count, call_count, eop, vpm, cf_inst, wqm, b) \
    cpu_to_le32((((pc) << 0) | ((cf_const) << 3) | ((cond) << 8) | (((count) & 7) << 10) | (((count) >> 3) << 19) | \
		 ((call_count) << 13) | ((eop) << 21) | ((vpm) << 22) | ((cf_inst) << 23) | ((wqm) << 30) | ((b) << 31)))

#define CF_ALU_DWORD0(addr, kb0, kb1, km0) cpu_to_le32((((addr) << 0) | ((kb0) << 22) | ((kb1) << 26) | ((km0) << 30)))
#define CF_ALU_DWORD1(km1, kcache_addr0, kcache_addr1, count, uw, cf_inst, wqm, b) \
    cpu_to_le32((((km1) << 0) | ((kcache_addr0) << 2) | ((kcache_addr1) << 10) | \
		 ((count) << 18) | ((uw) << 25) | ((cf_inst) << 26) | ((wqm) << 30) | ((b) << 31)))

#define CF_ALLOC_IMP_EXP_DWORD0(array_base, type, rw_gpr, rr, index_gpr, es) \
    cpu_to_le32((((array_base) << 0) | ((type) << 13) | ((rw_gpr) << 15) | ((rr) << 22) | ((index_gpr) << 23) | \
		 ((es) << 30)))
// R7xx apparently doesn't have the ELEM_LOOP entry any more
// We still expose it, but ELEM_LOOP is explicitely R6xx now.
// TODO: is this just forgotten in the docs, or really not available any more?
#define CF_ALLOC_IMP_EXP_DWORD1_BUF(array_size, comp_mask, el, bc, eop, vpm, cf_inst, wqm, b) \
    cpu_to_le32((((array_size) << 0) | ((comp_mask) << 12) | ((el) << 16) | ((bc) << 17) | \
		 ((eop) << 21) | ((vpm) << 22) | ((cf_inst) << 23) | ((wqm) << 30) | ((b) << 31)))
#define CF_ALLOC_IMP_EXP_DWORD1_SWIZ(sel_x, sel_y, sel_z, sel_w, el, bc, eop, vpm, cf_inst, wqm, b) \
    cpu_to_le32((((sel_x) << 0) | ((sel_y) << 3) | ((sel_z) << 6) | ((sel_w) << 9) | ((el) << 16) | \
		 ((bc) << 17) | ((eop) << 21) | ((vpm) << 22) | ((cf_inst) << 23) | \
		 ((wqm) << 30) | ((b) << 31)))

// ALU clause insts
#define SRC0_SEL(x)        (x)
#define SRC1_SEL(x)        (x)
#define SRC2_SEL(x)        (x)
// src[0-2]_sel
//   0-127 GPR
// 128-159 kcache constants bank 0
// 160-191 kcache constants bank 1
// 248-255 special SQ_ALU_SRC_* (0, 1, etc.)
#define ALU_SRC_GPR_BASE        0
#define ALU_SRC_KCACHE0_BASE  128
#define ALU_SRC_KCACHE1_BASE  160
#define ALU_SRC_CFILE_BASE    256

#define SRC0_REL(x)        (x)
#define SRC1_REL(x)        (x)
#define SRC2_REL(x)        (x)
// elem
#define SRC0_ELEM(x)        (x)
#define SRC1_ELEM(x)        (x)
#define SRC2_ELEM(x)        (x)
#define ELEM_X        0
#define ELEM_Y        1
#define ELEM_Z        2
#define ELEM_W        3
// neg
#define SRC0_NEG(x)        (x)
#define SRC1_NEG(x)        (x)
#define SRC2_NEG(x)        (x)
// im
#define INDEX_MODE(x)    (x)		// SQ_INDEX_*
// ps
#define PRED_SEL(x)      (x)		// SQ_PRED_SEL_*
// last
#define LAST(x)          (x)
// abs
#define SRC0_ABS(x)       (x)
#define SRC1_ABS(x)       (x)
// uem
#define UPDATE_EXECUTE_MASK(x) (x)
// up
#define UPDATE_PRED(x)      (x)
// wm
#define WRITE_MASK(x)   (x)
// fm
#define FOG_MERGE(x)    (x)
// omod
#define OMOD(x)        (x)		// SQ_ALU_OMOD_*
// alu inst
#define ALU_INST(x)        (x)		// SQ_ALU_INST_*
//bs
#define BANK_SWIZZLE(x)        (x)	// SQ_ALU_VEC_*
#define DST_GPR(x)        (x)
#define DST_REL(x)        (x)
#define DST_ELEM(x)       (x)
#define CLAMP(x)          (x)

#define ALU_DWORD0(src0_sel, s0r, s0e, s0n, src1_sel, s1r, s1e, s1n, im, ps, last) \
    cpu_to_le32((((src0_sel) << 0) | ((s0r) << 9) | ((s0e) << 10) | ((s0n) << 12) | \
		 ((src1_sel) << 13) | ((s1r) << 22) | ((s1e) << 23) | ((s1n) << 25) | \
		 ((im) << 26) | ((ps) << 29) | ((last) << 31)))
// R7xx has alu_inst at a different slot, and no fog merge any more (no fix function fog any more)
#define R6xx_ALU_DWORD1_OP2(s0a, s1a, uem, up, wm, fm, omod, alu_inst, bs, dst_gpr, dr, de, clamp) \
    cpu_to_le32((((s0a) << 0) | ((s1a) << 1) | ((uem) << 2) | ((up) << 3) | ((wm) << 4) | \
		 ((fm) << 5) | ((omod) << 6) | ((alu_inst) << 8) | ((bs) << 18) | ((dst_gpr) << 21) | \
		 ((dr) << 28) | ((de) << 29) | ((clamp) << 31)))
#define R7xx_ALU_DWORD1_OP2(s0a, s1a, uem, up, wm, omod, alu_inst, bs, dst_gpr, dr, de, clamp) \
    cpu_to_le32((((s0a) << 0) | ((s1a) << 1) | ((uem) << 2) | ((up) << 3) | ((wm) << 4) | \
		 ((omod) << 5) | ((alu_inst) << 7) | ((bs) << 18) | ((dst_gpr) << 21) | \
		 ((dr) << 28) | ((de) << 29) | ((clamp) << 31)))
// This is a general chipset macro, but due to selection by chipid typically not usable in static arrays
// Fog is NOT USED on R7xx, even if specified.
#define ALU_DWORD1_OP2(chipfamily, s0a, s1a, uem, up, wm, fm, omod, alu_inst, bs, dst_gpr, dr, de, clamp) \
    ((chipfamily) < CHIP_FAMILY_RV770 ? \
     R6xx_ALU_DWORD1_OP2(s0a, s1a, uem, up, wm, fm, omod, alu_inst, bs, dst_gpr, dr, de, clamp) : \
     R7xx_ALU_DWORD1_OP2(s0a, s1a, uem, up, wm, omod, alu_inst, bs, dst_gpr, dr, de, clamp))
#define ALU_DWORD1_OP3(src2_sel, s2r, s2e, s2n, alu_inst, bs, dst_gpr, dr, de, clamp) \
    cpu_to_le32((((src2_sel) << 0) | ((s2r) << 9) | ((s2e) << 10) | ((s2n) << 12) | \
		 ((alu_inst) << 13) | ((bs) << 18) | ((dst_gpr) << 21) | ((dr) << 28) | \
		 ((de) << 29) | ((clamp) << 31)))

// VTX clause insts
// vxt insts
#define VTX_INST(x)        (x)		// SQ_VTX_INST_*

// fetch type
#define FETCH_TYPE(x)        (x)	// SQ_VTX_FETCH_*

#define FETCH_WHOLE_QUAD(x)        (x)
#define BUFFER_ID(x)        (x)
#define SRC_GPR(x)          (x)
#define SRC_REL(x)          (x)
#define MEGA_FETCH_COUNT(x)        ((x) ? ((x) - 1) : 0)

#define SEMANTIC_ID(x)        (x)
#define DST_SEL_X(x)          (x)
#define DST_SEL_Y(x)          (x)
#define DST_SEL_Z(x)          (x)
#define DST_SEL_W(x)          (x)
#define USE_CONST_FIELDS(x)   (x)
#define DATA_FORMAT(x)        (x)
// num format
#define NUM_FORMAT_ALL(x)     (x)	// SQ_NUM_FORMAT_*
// format comp
#define FORMAT_COMP_ALL(x)     (x)	// SQ_FORMAT_COMP_*
// sma
#define SRF_MODE_ALL(x)     (x)
#define SRF_MODE_ZERO_CLAMP_MINUS_ONE      0
#define SRF_MODE_NO_ZERO                   1
#define OFFSET(x)     (x)
// endian swap
#define ENDIAN_SWAP(x)     (x)		// SQ_ENDIAN_*
#define CONST_BUF_NO_STRIDE(x)     (x)
// mf
#define MEGA_FETCH(x)     (x)

#define VTX_DWORD0(vtx_inst, ft, fwq, buffer_id, src_gpr, sr, ssx, mfc) \
    cpu_to_le32((((vtx_inst) << 0) | ((ft) << 5) | ((fwq) << 7) | ((buffer_id) << 8) | \
		 ((src_gpr) << 16) | ((sr) << 23) | ((ssx) << 24) | ((mfc) << 26)))
#define VTX_DWORD1_SEM(semantic_id, dsx, dsy, dsz, dsw, ucf, data_format, nfa, fca, sma) \
    cpu_to_le32((((semantic_id) << 0) | ((dsx) << 9) | ((dsy) << 12) | ((dsz) << 15) | ((dsw) << 18) | \
		 ((ucf) << 21) | ((data_format) << 22) | ((nfa) << 28) | ((fca) << 30) | ((sma) << 31)))
#define VTX_DWORD1_GPR(dst_gpr, dr, dsx, dsy, dsz, dsw, ucf, data_format, nfa, fca, sma) \
    cpu_to_le32((((dst_gpr) << 0) | ((dr) << 7) | ((dsx) << 9) | ((dsy) << 12) | ((dsz) << 15) | ((dsw) << 18) | \
		 ((ucf) << 21) | ((data_format) << 22) | ((nfa) << 28) | ((fca) << 30) | ((sma) << 31)))
#define VTX_DWORD2(offset, es, cbns, mf) \
    cpu_to_le32((((offset) << 0) | ((es) << 16) | ((cbns) << 18) | ((mf) << 19)))
#define VTX_DWORD_PAD cpu_to_le32(0x00000000)

// TEX clause insts
// tex insts
#define TEX_INST(x)     (x)		// SQ_TEX_INST_*

#define BC_FRAC_MODE(x)         (x)
#define FETCH_WHOLE_QUAD(x)     (x)
#define RESOURCE_ID(x)          (x)
#define R7xx_ALT_CONST(x)            (x)

#define LOD_BIAS(x)     (x)
//ct
#define COORD_TYPE_X(x)     (x)
#define COORD_TYPE_Y(x)     (x)
#define COORD_TYPE_Z(x)     (x)
#define COORD_TYPE_W(x)     (x)
#define TEX_UNNORMALIZED                0
#define TEX_NORMALIZED                  1
#define OFFSET_X(x) (((int)(x) * 2) & 0x1f) /* 4:1-bits 2's-complement fixed-point: [-8.0..7.5] */
#define OFFSET_Y(x) (((int)(x) * 2) & 0x1f)
#define OFFSET_Z(x) (((int)(x) * 2) & 0x1f)
#define SAMPLER_ID(x)     (x)

// R7xx has an additional parameter ALT_CONST. We always expose it, but ALT_CONST is R7xx only
#define TEX_DWORD0(tex_inst, bfm, fwq, resource_id, src_gpr, sr, ac) \
    cpu_to_le32((((tex_inst) << 0) | ((bfm) << 5) | ((fwq) << 7) | ((resource_id) << 8) | \
		 ((src_gpr) << 16) | ((sr) << 23) | ((ac) << 24)))
#define TEX_DWORD1(dst_gpr, dr, dsx, dsy, dsz, dsw, lod_bias, ctx, cty, ctz, ctw) \
    cpu_to_le32((((dst_gpr) << 0) | ((dr) << 7) | ((dsx) << 9) | ((dsy) << 12) | ((dsz) << 15) | ((dsw) << 18) | \
		 ((lod_bias) << 21) | ((ctx) << 28) | ((cty) << 29) | ((ctz) << 30) | ((ctw) << 31)))
#define TEX_DWORD2(offset_x, offset_y, offset_z, sampler_id, ssx, ssy, ssz, ssw) \
    cpu_to_le32((((offset_x) << 0) | ((offset_y) << 5) | ((offset_z) << 10) | ((sampler_id) << 15) | \
		 ((ssx) << 20) | ((ssy) << 23) | ((ssz) << 26) | ((ssw) << 29)))
#define TEX_DWORD_PAD cpu_to_le32(0x00000000)

extern int R600_solid_vs(RADEONChipFamily ChipSet, uint32_t* vs);
extern int R600_solid_ps(RADEONChipFamily ChipSet, uint32_t* ps);

extern int R600_copy_vs(RADEONChipFamily ChipSet, uint32_t* vs);
extern int R600_copy_ps(RADEONChipFamily ChipSet, uint32_t* ps);

extern int R600_xv_vs(RADEONChipFamily ChipSet, uint32_t* shader);
extern int R600_xv_ps(RADEONChipFamily ChipSet, uint32_t* shader);

extern int R600_comp_vs(RADEONChipFamily ChipSet, uint32_t* vs);
extern int R600_comp_ps(RADEONChipFamily ChipSet, uint32_t* ps);

#endif