#ifndef INTEL_BATCHBUFFER_H
#define INTEL_BATCHBUFFER_H

#include "main/mtypes.h"

#include "brw_context.h"
#include "brw_bufmgr.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Number of bytes to reserve for commands necessary to complete a batch.
 *
 * This includes:
 * - MI_BATCHBUFFER_END (4 bytes)
 * - Optional MI_NOOP for ensuring the batch length is qword aligned (4 bytes)
 * - Any state emitted by vtbl->finish_batch():
 *   - Gen4-5 record ending occlusion query values (4 * 4 = 16 bytes)
 *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
 *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
 *     - Two sets of PIPE_CONTROLs, which become 4 PIPE_CONTROLs each on SNB,
 *       which are 5 DWords each ==> 2 * 4 * 5 * 4 = 160 bytes
 *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
 *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
 *       Sandybridge PIPE_CONTROL madness.
 *   - CC_STATE workaround on HSW (17 * 4 = 68 bytes)
 *     - 10 dwords for initial mi_flush
 *     - 2 dwords for CC state setup
 *     - 5 dwords for the required pipe control at the end
 *   - Restoring L3 configuration: (24 dwords = 96 bytes)
 *     - 2*6 dwords for two PIPE_CONTROL flushes.
 *     - 7 dwords for L3 configuration set-up.
 *     - 5 dwords for L3 atomic set-up (on HSW).
 */
#define BATCH_RESERVED 308

struct intel_batchbuffer;

void intel_batchbuffer_init(struct intel_batchbuffer *batch,
                            struct brw_bufmgr *bufmgr,
                            bool has_llc);
void intel_batchbuffer_free(struct intel_batchbuffer *batch);
void intel_batchbuffer_save_state(struct brw_context *brw);
void intel_batchbuffer_reset_to_saved(struct brw_context *brw);
void intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
                                     enum brw_gpu_ring ring);
int _intel_batchbuffer_flush_fence(struct brw_context *brw,
                                   int in_fence_fd, int *out_fence_fd,
                                   const char *file, int line);

#define intel_batchbuffer_flush(brw) \
   _intel_batchbuffer_flush_fence((brw), -1, NULL, __FILE__, __LINE__)

#define intel_batchbuffer_flush_fence(brw, in_fence_fd, out_fence_fd) \
   _intel_batchbuffer_flush_fence((brw), (in_fence_fd), (out_fence_fd), \
                                  __FILE__, __LINE__)
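
/* Illustrative usage sketch for the flush helpers above (the error-handling
 * convention shown is an assumption, not something this header defines):
 *
 *    int out_fd = -1;
 *    int ret = intel_batchbuffer_flush_fence(brw, -1, &out_fd);
 *    // If ret indicates success, out_fd is a sync-file FD owned by the
 *    // caller, which must eventually be close()d.
 *
 * Most callers simply use intel_batchbuffer_flush(brw), which passes no
 * fence FDs and records __FILE__/__LINE__ automatically.
 */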

/* Unlike bmBufferData, this currently requires the buffer be mapped.
 * Consider it a convenience function wrapping multiple
 * intel_buffer_dword() calls.
 */
void intel_batchbuffer_data(struct brw_context *brw,
                            const void *data, GLuint bytes,
                            enum brw_gpu_ring ring);

bool brw_batch_has_aperture_space(struct brw_context *brw,
                                  unsigned extra_space_in_bytes);

bool brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo);

uint64_t brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
                        struct brw_bo *target, uint32_t target_offset,
                        uint32_t read_domains, uint32_t write_domain);

static inline uint32_t
brw_program_reloc(struct brw_context *brw, uint32_t state_offset,
		  uint32_t prog_offset)
{
   if (brw->gen >= 5) {
      /* Using state base address. */
      return prog_offset;
   }

   brw_emit_reloc(&brw->batch, state_offset, brw->cache.bo, prog_offset,
                  I915_GEM_DOMAIN_INSTRUCTION, 0);

   return brw->cache.bo->offset64 + prog_offset;
}
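
/* Illustrative sketch of how brw_program_reloc() is typically consumed (the
 * surrounding state-emission code and field layout here are hypothetical):
 *
 *    uint32_t *state = ...;   // state dwords being built at state_offset
 *    state[1] = brw_program_reloc(brw, state_offset + 4, prog_offset);
 *
 * On Gen5+ the returned value is simply an offset from the instruction base
 * address; on Gen4 it is the cache BO's presumed address plus prog_offset,
 * with a relocation emitted so the kernel can fix it up if the BO moves.
 */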

#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))

static inline uint32_t float_as_int(float f)
{
   union {
      float f;
      uint32_t d;
   } fi;

   fi.f = f;
   return fi.d;
}

/* Inline functions - might actually be better off with these
 * non-inlined.  Certainly better off switching all command packets to
 * be passed as structs rather than dwords, but that's a little bit of
 * work...
 */
static inline unsigned
intel_batchbuffer_space(struct intel_batchbuffer *batch)
{
   return (batch->state_batch_offset - batch->reserved_space)
      - USED_BATCH(*batch) * 4;
}
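
/* A rough sketch of the batch BO layout these fields imply (the actual
 * bookkeeping lives elsewhere; this is inferred from how the fields are
 * used here):
 *
 *    0 .......................... state_batch_offset ............. BO size
 *    [ commands grow upward ->  ]   [ <- indirect state grows downward ]
 *
 * Free space is whatever lies between the command write pointer and the
 * bottom of the state section, minus the reserved_space bytes held back
 * for finish_batch() work and the final MI_BATCHBUFFER_END.
 */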


static inline void
intel_batchbuffer_emit_dword(struct intel_batchbuffer *batch, GLuint dword)
{
#ifdef DEBUG
   assert(intel_batchbuffer_space(batch) >= 4);
#endif
   *batch->map_next++ = dword;
   assert(batch->ring != UNKNOWN_RING);
}

static inline void
intel_batchbuffer_emit_float(struct intel_batchbuffer *batch, float f)
{
   intel_batchbuffer_emit_dword(batch, float_as_int(f));
}

static inline void
intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
{
   intel_batchbuffer_require_space(brw, n * 4, ring);

#ifdef DEBUG
   brw->batch.emit = USED_BATCH(brw->batch);
   brw->batch.total = n;
#endif
}

static inline void
intel_batchbuffer_advance(struct brw_context *brw)
{
#ifdef DEBUG
   struct intel_batchbuffer *batch = &brw->batch;
   unsigned int _n = USED_BATCH(*batch) - batch->emit;
   assert(batch->total != 0);
   if (_n != batch->total) {
      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",
	      _n, batch->total);
      abort();
   }
   batch->total = 0;
#else
   (void) brw;
#endif
}

#define BEGIN_BATCH(n) do {                            \
   intel_batchbuffer_begin(brw, (n), RENDER_RING);     \
   uint32_t *__map = brw->batch.map_next;              \
   brw->batch.map_next += (n)

#define BEGIN_BATCH_BLT(n) do {                        \
   intel_batchbuffer_begin(brw, (n), BLT_RING);        \
   uint32_t *__map = brw->batch.map_next;              \
   brw->batch.map_next += (n)

#define OUT_BATCH(d) *__map++ = (d)
#define OUT_BATCH_F(f) OUT_BATCH(float_as_int((f)))

#define OUT_RELOC(buf, read_domains, write_domain, delta) do {          \
   uint32_t __offset = (__map - brw->batch.map) * 4;                    \
   uint32_t reloc =                                                     \
      brw_emit_reloc(&brw->batch, __offset, (buf), (delta),             \
                     (read_domains), (write_domain));                   \
   OUT_BATCH(reloc);                                                    \
} while (0)

/* Handle 48-bit address relocations for Gen8+ */
#define OUT_RELOC64(buf, read_domains, write_domain, delta) do {        \
   uint32_t __offset = (__map - brw->batch.map) * 4;                    \
   uint64_t reloc64 =                                                   \
      brw_emit_reloc(&brw->batch, __offset, (buf), (delta),             \
                     (read_domains), (write_domain));                   \
   OUT_BATCH(reloc64);                                                  \
   OUT_BATCH(reloc64 >> 32);                                            \
} while (0)

#define ADVANCE_BATCH()                  \
   assert(__map == brw->batch.map_next); \
   intel_batchbuffer_advance(brw);       \
} while (0)
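
/* Typical emission sequence built from the macros above (illustrative only;
 * the opcode is a made-up placeholder, not a real hardware command):
 *
 *    BEGIN_BATCH(3);
 *    OUT_BATCH(HYPOTHETICAL_CMD << 16 | (3 - 2));   // DWord 0: header
 *    OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER,          // DWord 1: buffer address
 *              I915_GEM_DOMAIN_RENDER, 0);
 *    OUT_BATCH(0);                                  // DWord 2: flags
 *    ADVANCE_BATCH();
 *
 * BEGIN_BATCH() reserves room for n dwords on the render ring (flushing the
 * current batch first if necessary); the OUT_* macros fill them in; and
 * ADVANCE_BATCH() checks that exactly n dwords were written.
 */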

#ifdef __cplusplus
}
#endif

#endif