summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKenneth Graunke <kenneth@whitecape.org>2013-04-11 13:22:29 -0700
committerKenneth Graunke <kenneth@whitecape.org>2013-09-26 16:55:18 -0700
commit0f2da773070c06b6d20ad264d3abb19c4dfd9761 (patch)
treea8000bf054ee72091b11d3b4a6a0a7dc253a7301
parentb2e327e08f8519da131dd382adcc99240d433404 (diff)
i965: Add support for GL_AMD_performance_monitor on Ironlake.
Ironlake's counters are always enabled; userspace can simply send a MI_REPORT_PERF_COUNT packet to take a snapshot of them. This makes it easy to implement. The counters are documented in the source code for the intel-gpu-tools intel_perf_counters utility. v2: Adjust for core data structure changes. Add a table mapping buffer object offsets to exposed counters (which changes each generation). Finally, add report ID assertions to sanity check the BO layout (thanks to Carl Worth). v3: Update for core BeginPerfMonitor hook changes (requested by Brian). Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
-rw-r--r--src/mesa/drivers/dri/i965/Makefile.sources1
-rw-r--r--src/mesa/drivers/dri/i965/brw_context.c4
-rw-r--r--src/mesa/drivers/dri/i965/brw_context.h14
-rw-r--r--src/mesa/drivers/dri/i965/brw_defines.h7
-rw-r--r--src/mesa/drivers/dri/i965/brw_performance_monitor.c391
-rw-r--r--src/mesa/drivers/dri/i965/intel_extensions.c3
6 files changed, 420 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index f521daaf719..e2193164382 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -69,6 +69,7 @@ i965_FILES = \
brw_lower_texture_gradients.cpp \
brw_misc_state.c \
brw_object_purgeable.c \
+ brw_performance_monitor.c \
brw_program.c \
brw_primitive_restart.c \
brw_queryobj.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index f60d4df5d32..5f58a291d8d 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -503,6 +503,10 @@ brwCreateContext(int api,
_mesa_initialize_dispatch_tables(ctx);
_mesa_initialize_vbo_vtxfmt(ctx);
+ if (ctx->Extensions.AMD_performance_monitor) {
+ brw_init_performance_monitors(brw);
+ }
+
return true;
}
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 656fb3c0504..0f88bad2475 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -128,6 +128,7 @@ struct brw_vs_prog_key;
struct brw_vec4_prog_key;
struct brw_wm_prog_key;
struct brw_wm_prog_data;
+struct brw_perf_bo_layout;
enum brw_state_id {
BRW_STATE_URB_FENCE,
@@ -1313,6 +1314,16 @@ struct brw_context
bool begin_emitted;
} query;
+ struct {
+ /* A map describing which counters are stored at a particular 32-bit
+ * offset in the buffer object.
+ */
+ const struct brw_perf_bo_layout *bo_layout;
+
+ /* Number of 32-bit entries in the buffer object. */
+ int entries_in_bo;
+ } perfmon;
+
int num_atoms;
const struct brw_tracked_state **atoms;
@@ -1485,6 +1496,9 @@ bool brw_is_hiz_depth_format(struct brw_context *ctx, gl_format format);
bool brw_render_target_supported(struct brw_context *brw,
struct gl_renderbuffer *rb);
+/* brw_performance_monitor.c */
+void brw_init_performance_monitors(struct brw_context *brw);
+
/* gen6_sol.c */
void
brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index e9e0c4a0e3a..e2838059050 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1817,6 +1817,13 @@ enum brw_wm_barycentric_interp_mode {
#define CMD_MI_FLUSH 0x0200
+#define GEN5_MI_REPORT_PERF_COUNT ((0x26 << 23) | (3 - 2))
+/* DW0 */
+# define GEN5_MI_COUNTER_SET_0 (0 << 6)
+# define GEN5_MI_COUNTER_SET_1 (1 << 6)
+/* DW1 */
+# define MI_COUNTER_ADDRESS_GTT (1 << 0)
+/* DW2: a user-defined report ID (written to the buffer but can be anything) */
/* Bitfields for the URB_WRITE message, DW2 of message header: */
#define URB_WRITE_PRIM_END 0x1
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
new file mode 100644
index 00000000000..87c4a633bfd
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -0,0 +1,391 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_performance_monitor.c
+ *
+ * Implementation of the GL_AMD_performance_monitor extension.
+ *
+ * Currently only for Ironlake.
+ */
+
+#include <limits.h>
+
+#include "main/bitset.h"
+#include "main/macros.h"
+#include "main/mtypes.h"
+#include "main/performance_monitor.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "intel_batchbuffer.h"
+
+/**
+ * i965 representation of a performance monitor object.
+ */
+struct brw_perf_monitor_object
+{
+ /** The base class. */
+ struct gl_perf_monitor_object base;
+
+ /**
+ * BO containing raw counter data in a hardware specific form.
+ */
+ drm_intel_bo *bo;
+};
+
+/** Downcasting convenience macro. */
+static inline struct brw_perf_monitor_object *
+brw_perf_monitor(struct gl_perf_monitor_object *m)
+{
+ return (struct brw_perf_monitor_object *) m;
+}
+
+#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048
+
+/* Two random values used to ensure we're getting valid snapshots. */
+#define FIRST_SNAPSHOT_REPORT_ID 0xd2e9c607
+#define SECOND_SNAPSHOT_REPORT_ID 0xad584b1d
+
+/******************************************************************************/
+
+#define COUNTER(name) \
+ { \
+ .Name = name, \
+ .Type = GL_UNSIGNED_INT, \
+ .Minimum = { .u32 = 0 }, \
+ .Maximum = { .u32 = ~0 }, \
+ }
+
+#define GROUP(name, max_active, counter_list) \
+ { \
+ .Name = name, \
+ .MaxActiveCounters = max_active, \
+ .Counters = counter_list, \
+ .NumCounters = ARRAY_SIZE(counter_list), \
+ }
+
+struct brw_perf_bo_layout {
+ int group;
+ int counter;
+};
+
+/**
+ * Ironlake:
+ * @{
+ */
+const static struct gl_perf_monitor_counter gen5_raw_aggregating_counters[] = {
+ COUNTER("cycles the CS unit is starved"),
+ COUNTER("cycles the CS unit is stalled"),
+ COUNTER("cycles the VF unit is starved"),
+ COUNTER("cycles the VF unit is stalled"),
+ COUNTER("cycles the VS unit is starved"),
+ COUNTER("cycles the VS unit is stalled"),
+ COUNTER("cycles the GS unit is starved"),
+ COUNTER("cycles the GS unit is stalled"),
+ COUNTER("cycles the CL unit is starved"),
+ COUNTER("cycles the CL unit is stalled"),
+ COUNTER("cycles the SF unit is starved"),
+ COUNTER("cycles the SF unit is stalled"),
+ COUNTER("cycles the WZ unit is starved"),
+ COUNTER("cycles the WZ unit is stalled"),
+ COUNTER("Z buffer read/write"),
+ COUNTER("cycles each EU was active"),
+ COUNTER("cycles each EU was suspended"),
+ COUNTER("cycles threads loaded all EUs"),
+ COUNTER("cycles filtering active"),
+ COUNTER("cycles PS threads executed"),
+ COUNTER("subspans written to RC"),
+ COUNTER("bytes read for texture reads"),
+ COUNTER("texels returned from sampler"),
+ COUNTER("polygons not culled"),
+ COUNTER("clocks MASF has valid message"),
+ COUNTER("64b writes/reads from RC"),
+ COUNTER("reads on dataport"),
+ COUNTER("clocks MASF has valid msg not consumed by sampler"),
+ COUNTER("cycles any EU is stalled for math"),
+};
+
+const static struct gl_perf_monitor_group gen5_groups[] = {
+ GROUP("Aggregating Counters", INT_MAX, gen5_raw_aggregating_counters),
+};
+
+const static struct brw_perf_bo_layout gen5_perf_bo_layout[] =
+{
+ { -1, -1, }, /* Report ID */
+ { -1, -1, }, /* TIMESTAMP (64-bit) */
+ { -1, -1, }, /* ...second half... */
+ { 0, 0, }, /* cycles the CS unit is starved */
+ { 0, 1, }, /* cycles the CS unit is stalled */
+ { 0, 2, }, /* cycles the VF unit is starved */
+ { 0, 3, }, /* cycles the VF unit is stalled */
+ { 0, 4, }, /* cycles the VS unit is starved */
+ { 0, 5, }, /* cycles the VS unit is stalled */
+ { 0, 6, }, /* cycles the GS unit is starved */
+ { 0, 7, }, /* cycles the GS unit is stalled */
+ { 0, 8, }, /* cycles the CL unit is starved */
+ { 0, 9, }, /* cycles the CL unit is stalled */
+ { 0, 10, }, /* cycles the SF unit is starved */
+ { 0, 11, }, /* cycles the SF unit is stalled */
+ { 0, 12, }, /* cycles the WZ unit is starved */
+ { 0, 13, }, /* cycles the WZ unit is stalled */
+ { 0, 14, }, /* Z buffer read/write */
+ { 0, 15, }, /* cycles each EU was active */
+ { 0, 16, }, /* cycles each EU was suspended */
+ { 0, 17, }, /* cycles threads loaded all EUs */
+ { 0, 18, }, /* cycles filtering active */
+ { 0, 19, }, /* cycles PS threads executed */
+ { 0, 20, }, /* subspans written to RC */
+ { 0, 21, }, /* bytes read for texture reads */
+ { 0, 22, }, /* texels returned from sampler */
+ { 0, 23, }, /* polygons not culled */
+ { 0, 24, }, /* clocks MASF has valid message */
+ { 0, 25, }, /* 64b writes/reads from RC */
+ { 0, 26, }, /* reads on dataport */
+ { 0, 27, }, /* clocks MASF has valid msg not consumed by sampler */
+ { 0, 28, }, /* cycles any EU is stalled for math */
+};
+
+/** @} */
+
+/******************************************************************************/
+
+static void
+snapshot_aggregating_counters(struct brw_context *brw,
+ drm_intel_bo *bo, uint32_t offset_in_bytes)
+{
+ uint32_t report_id = offset_in_bytes == 0 ? FIRST_SNAPSHOT_REPORT_ID
+ : SECOND_SNAPSHOT_REPORT_ID;
+
+ if (brw->gen == 5) {
+ /* Ironlake requires two MI_REPORT_PERF_COUNT commands to write all
+ * the counters. The report ID is ignored in the second set.
+ */
+ BEGIN_BATCH(6);
+ OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0);
+ OUT_RELOC(bo,
+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ offset_in_bytes);
+ OUT_BATCH(report_id);
+
+ OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1);
+ OUT_RELOC(bo,
+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ offset_in_bytes + 64);
+ OUT_BATCH(report_id);
+ ADVANCE_BATCH();
+ } else {
+ assert(!"Unsupported generation for performance counters.");
+ }
+}
+
+static bool
+aggregating_counters_needed(struct brw_context *brw,
+ struct gl_perf_monitor_object *m)
+{
+ return m->ActiveGroups[0];
+}
+
+/******************************************************************************/
+
+/**
+ * Create a new performance monitor object.
+ */
+static struct gl_perf_monitor_object *
+brw_new_perf_monitor(struct gl_context *ctx)
+{
+ return calloc(1, sizeof(struct brw_perf_monitor_object));
+}
+
+/**
+ * Delete a performance monitor object.
+ */
+static void
+brw_delete_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
+{
+ struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+ if (monitor->bo)
+ drm_intel_bo_unreference(monitor->bo);
+
+ free(monitor);
+}
+
+/**
+ * Driver hook for glBeginPerformanceMonitorAMD().
+ */
+static GLboolean
+brw_begin_perf_monitor(struct gl_context *ctx,
+ struct gl_perf_monitor_object *m)
+{
+ struct brw_context *brw = brw_context(ctx);
+ struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+ /* If the BO already exists, throw it away. It contains old results
+ * that we're not interested in any more.
+ */
+ if (monitor->bo)
+ drm_intel_bo_unreference(monitor->bo);
+
+ /* Create a new BO. */
+ monitor->bo =
+ drm_intel_bo_alloc(brw->bufmgr, "performance monitor", 4096, 64);
+ drm_intel_bo_map(monitor->bo, true);
+ memset((char *) monitor->bo->virtual, 0xff, 4096);
+ drm_intel_bo_unmap(monitor->bo);
+
+ /* Take a shapshot of all active counters */
+ if (aggregating_counters_needed(brw, m)) {
+ snapshot_aggregating_counters(brw, monitor->bo, 0);
+ }
+
+ return true;
+}
+
+/**
+ * Driver hook for glEndPerformanceMonitorAMD().
+ */
+static void
+brw_end_perf_monitor(struct gl_context *ctx,
+ struct gl_perf_monitor_object *m)
+{
+ struct brw_context *brw = brw_context(ctx);
+ struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+ if (aggregating_counters_needed(brw, m)) {
+ snapshot_aggregating_counters(brw, monitor->bo,
+ SECOND_SNAPSHOT_OFFSET_IN_BYTES);
+ }
+}
+
+/**
+ * Reset a performance monitor, throwing away any results.
+ */
+static void
+brw_reset_perf_monitor(struct gl_context *ctx,
+ struct gl_perf_monitor_object *m)
+{
+ struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+ if (monitor->bo) {
+ drm_intel_bo_unreference(monitor->bo);
+ monitor->bo = NULL;
+ }
+
+ if (m->Active) {
+ brw_begin_perf_monitor(ctx, m);
+ }
+}
+
+/**
+ * Is a performance monitor result available?
+ */
+static GLboolean
+brw_is_perf_monitor_result_available(struct gl_context *ctx,
+ struct gl_perf_monitor_object *m)
+{
+ struct brw_context *brw = brw_context(ctx);
+ struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+ return !m->Active && monitor->bo &&
+ !drm_intel_bo_references(brw->batch.bo, monitor->bo) &&
+ !drm_intel_bo_busy(monitor->bo);
+}
+
+/**
+ * Get the performance monitor result.
+ */
+static void
+brw_get_perf_monitor_result(struct gl_context *ctx,
+ struct gl_perf_monitor_object *m,
+ GLsizei data_size,
+ GLuint *data,
+ GLint *bytes_written)
+{
+ struct brw_context *brw = brw_context(ctx);
+ struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+ /* This hook should only be called when results are available. */
+ assert(monitor->bo != NULL);
+
+ drm_intel_bo_map(monitor->bo, false);
+ unsigned *gpu_bo = monitor->bo->virtual;
+
+ /* Copy data from the BO to the supplied array.
+ *
+ * The output data format is: <group ID, counter ID, value> for each
+ * active counter. The API allows counters to appear in any order.
+ */
+ GLsizei offset = 0;
+
+ /* Look for expected report ID values to ensure data is present. */
+ assert(gpu_bo[0] == FIRST_SNAPSHOT_REPORT_ID);
+ assert(gpu_bo[SECOND_SNAPSHOT_OFFSET_IN_BYTES/4] == SECOND_SNAPSHOT_REPORT_ID);
+
+ for (int i = 0; i < brw->perfmon.entries_in_bo; i++) {
+ int group = brw->perfmon.bo_layout[i].group;
+ int counter = brw->perfmon.bo_layout[i].counter;
+
+ if (group < 0 || !BITSET_TEST(m->ActiveCounters[group], counter))
+ continue;
+
+ const struct gl_perf_monitor_group *group_obj =
+ &ctx->PerfMonitor.Groups[group];
+
+ const struct gl_perf_monitor_counter *c = &group_obj->Counters[counter];
+
+ data[offset++] = group;
+ data[offset++] = counter;
+
+ uint32_t second_snapshot_index =
+ SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint32_t) + i;
+
+ /* Won't work for uint64_t values, but we don't expose any yet. */
+ data[offset] = gpu_bo[second_snapshot_index] - gpu_bo[i];
+ offset += _mesa_perf_monitor_counter_size(c) / sizeof(uint32_t);
+ }
+
+ drm_intel_bo_unmap(monitor->bo);
+
+ if (bytes_written)
+ *bytes_written = offset * sizeof(uint32_t);
+}
+
+void
+brw_init_performance_monitors(struct brw_context *brw)
+{
+ struct gl_context *ctx = &brw->ctx;
+
+ ctx->Driver.NewPerfMonitor = brw_new_perf_monitor;
+ ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor;
+ ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor;
+ ctx->Driver.EndPerfMonitor = brw_end_perf_monitor;
+ ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor;
+ ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available;
+ ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result;
+
+ if (brw->gen == 5) {
+ ctx->PerfMonitor.Groups = gen5_groups;
+ ctx->PerfMonitor.NumGroups = ARRAY_SIZE(gen5_groups);
+ brw->perfmon.bo_layout = gen5_perf_bo_layout;
+ brw->perfmon.entries_in_bo = ARRAY_SIZE(gen5_perf_bo_layout);
+ }
+}
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index aef78051e62..0fc5aad23c0 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -160,6 +160,9 @@ intelInitExtensions(struct gl_context *ctx)
ctx->Extensions.EXT_shader_integer_mix = ctx->Const.GLSLVersion >= 130;
}
+ if (brw->gen == 5)
+ ctx->Extensions.AMD_performance_monitor = true;
+
if (ctx->API == API_OPENGL_CORE)
ctx->Extensions.ARB_base_instance = true;
if (ctx->API != API_OPENGL_CORE)