/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * Expose V3D HW perf counters.
 *
 * We also have code to fake support for occlusion queries.
 * Since we expose support for GL 2.0, we have to expose occlusion queries,
 * but the spec allows you to expose 0 query counter bits, so we just return 0
 * as the result of all our queries.
 */
#include "vc4_context.h"

/**
 * Driver-private object behind the opaque struct pipe_query pointers handed
 * out by this file.
 */
struct vc4_query
{
        /* Number of HW counters in the batch.  Only set when hwperfmon is
         * allocated; stays 0 (from calloc) for non-HW queries.
         */
        unsigned num_queries;
        /* Perfmon state for HW counter queries.  NULL for non-HW queries,
         * which are faked and always report 0.
         */
        struct vc4_hwperfmon *hwperfmon;
};

/*
 * Names of the V3D performance counters exposed as driver-specific queries.
 *
 * The position of a name in this table is the counter index: query type
 * PIPE_QUERY_DRIVER_SPECIFIC + i is reported under v3d_counter_names[i], and
 * i is the event number stored in the perfmon request.  Do not reorder.
 *
 * The table is never written, so both the strings and the pointer slots are
 * const.
 *
 * NOTE: "discared" below is a typo for "discarded"; it is left untouched
 * because these strings are visible to applications at runtime.
 */
static const char *const v3d_counter_names[] = {
        "FEP-valid-primitives-no-rendered-pixels",
        "FEP-valid-primitives-rendered-pixels",
        "FEP-clipped-quads",
        "FEP-valid-quads",
        "TLB-quads-not-passing-stencil-test",
        "TLB-quads-not-passing-z-and-stencil-test",
        "TLB-quads-passing-z-and-stencil-test",
        "TLB-quads-with-zero-coverage",
        "TLB-quads-with-non-zero-coverage",
        "TLB-quads-written-to-color-buffer",
        "PTB-primitives-discarded-outside-viewport",
        "PTB-primitives-need-clipping",
        "PTB-primitives-discared-reversed",
        "QPU-total-idle-clk-cycles",
        "QPU-total-clk-cycles-vertex-coord-shading",
        "QPU-total-clk-cycles-fragment-shading",
        "QPU-total-clk-cycles-executing-valid-instr",
        "QPU-total-clk-cycles-waiting-TMU",
        "QPU-total-clk-cycles-waiting-scoreboard",
        "QPU-total-clk-cycles-waiting-varyings",
        "QPU-total-instr-cache-hit",
        "QPU-total-instr-cache-miss",
        "QPU-total-uniform-cache-hit",
        "QPU-total-uniform-cache-miss",
        "TMU-total-text-quads-processed",
        "TMU-total-text-cache-miss",
        "VPM-total-clk-cycles-VDW-stalled",
        "VPM-total-clk-cycles-VCD-stalled",
        "L2C-total-cache-hit",
        "L2C-total-cache-miss",
};

int vc4_get_driver_query_group_info(struct pipe_screen *pscreen,
                                    unsigned index,
                                    struct pipe_driver_query_group_info *info)
{
        struct vc4_screen *screen = vc4_screen(pscreen);

        if (!screen->has_perfmon_ioctl)
                return 0;

        if (!info)
                return 1;

        if (index > 0)
                return 0;

        info->name = "V3D counters";
        info->max_active_queries = DRM_VC4_MAX_PERF_COUNTERS;
        info->num_queries = ARRAY_SIZE(v3d_counter_names);
        return 1;
}

int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
                              struct pipe_driver_query_info *info)
{
        struct vc4_screen *screen = vc4_screen(pscreen);

        if (!screen->has_perfmon_ioctl)
                return 0;

        if (!info)
                return ARRAY_SIZE(v3d_counter_names);

        if (index >= ARRAY_SIZE(v3d_counter_names))
                return 0;

        info->group_id = 0;
        info->name = v3d_counter_names[index];
        info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index;
        info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
        info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
        info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
        return 1;
}

static struct pipe_query *
vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
                       unsigned *query_types)
{
        struct vc4_query *query = calloc(1, sizeof(*query));
        struct vc4_hwperfmon *hwperfmon;
        unsigned i, nhwqueries = 0;

        if (!query)
                return NULL;

        for (i = 0; i < num_queries; i++) {
                if (query_types[i] >= PIPE_QUERY_DRIVER_SPECIFIC)
                        nhwqueries++;
        }

        /* We can't mix HW and non-HW queries. */
        if (nhwqueries && nhwqueries != num_queries)
                return NULL;

        if (!nhwqueries)
                return (struct pipe_query *)query;

        hwperfmon = calloc(1, sizeof(*hwperfmon));
        if (!hwperfmon)
                goto err_free_query;

        for (i = 0; i < num_queries; i++)
                hwperfmon->events[i] = query_types[i] -
                                       PIPE_QUERY_DRIVER_SPECIFIC;

        query->hwperfmon = hwperfmon;
        query->num_queries = num_queries;

        /* Note that struct pipe_query isn't actually defined anywhere. */
        return (struct pipe_query *)query;

err_free_query:
        free(query);

        return NULL;
}

/**
 * Create a single query of type \p query_type.
 *
 * Implemented as a batch of one; \p index is not used.
 */
static struct pipe_query *
vc4_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
        unsigned single_type[1] = { query_type };

        return vc4_create_batch_query(ctx, 1, single_type);
}

static void
vc4_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery)
{
        struct vc4_context *ctx = vc4_context(pctx);
        struct vc4_query *query = (struct vc4_query *)pquery;

        if (query->hwperfmon && query->hwperfmon->id) {
                if (query->hwperfmon->id) {
                        struct drm_vc4_perfmon_destroy req = { };

                        req.id = query->hwperfmon->id;
                        vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_DESTROY,
                                  &req);
                }

                free(query->hwperfmon);
        }

        free(query);
}

/**
 * Start a query.
 *
 * Non-HW queries have nothing to start and always succeed.  For HW counter
 * queries a fresh kernel perfmon is created (destroying the one left over
 * from a previous begin, which resets the counters), pending jobs are
 * flushed, and the perfmon becomes the context's active one.
 *
 * Returns false if another perfmon is already active on the context or if
 * the kernel refuses to create the perfmon.
 */
static boolean
vc4_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
{
        struct vc4_query *query = (struct vc4_query *)pquery;
        struct vc4_context *ctx = vc4_context(pctx);
        struct drm_vc4_perfmon_create req = { };
        unsigned i;
        int ret;

        if (!query->hwperfmon)
                return true;

        /* Only one perfmon can be activated per context. */
        if (ctx->perfmon)
                return false;

        /* Reset the counters by destroying the previously allocated perfmon */
        if (query->hwperfmon->id) {
                struct drm_vc4_perfmon_destroy destroyreq = { };

                destroyreq.id = query->hwperfmon->id;
                vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_DESTROY, &destroyreq);

                /* Forget the id right away: if the create below fails we
                 * must not keep a handle to a perfmon that no longer exists,
                 * or it would be destroyed a second time later on.
                 */
                query->hwperfmon->id = 0;
        }

        for (i = 0; i < query->num_queries; i++)
                req.events[i] = query->hwperfmon->events[i];

        req.ncounters = query->num_queries;
        ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_CREATE, &req);
        if (ret)
                return false;

        query->hwperfmon->id = req.id;

        /* Make sure all pending jobs are flushed before activating the
         * perfmon.
         */
        vc4_flush(pctx);
        ctx->perfmon = query->hwperfmon;
        return true;
}

/**
 * Stop a query.
 *
 * Non-HW queries always succeed.  A HW counter query must be the one whose
 * perfmon is currently active on the context; otherwise false is returned
 * and nothing changes.
 */
static bool
vc4_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
{
        struct vc4_context *ctx = vc4_context(pctx);
        struct vc4_hwperfmon *hwperfmon =
                ((struct vc4_query *)pquery)->hwperfmon;

        /* Nothing was started for a non-HW query. */
        if (hwperfmon == NULL)
                return true;

        /* Refuse to end a query that is not the active one. */
        if (hwperfmon != ctx->perfmon)
                return false;

        /* Flush every pending job while the perfmon is still attached, then
         * detach it from the context.
         */
        vc4_flush(pctx);
        ctx->perfmon = NULL;
        return true;
}

/**
 * Fetch the result of a query.
 *
 * Non-HW queries are faked: the result is always 0.  For HW counter queries
 * this waits for the perfmon's last seqno (indefinitely when \p wait is set,
 * without blocking otherwise), reads the counter values back from the kernel
 * and stores one value per counter in vresult->batch[].
 *
 * Returns false if the wait does not succeed or the ioctl fails.
 */
static boolean
vc4_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
                     boolean wait, union pipe_query_result *vresult)
{
        struct vc4_context *ctx = vc4_context(pctx);
        struct vc4_query *query = (struct vc4_query *)pquery;
        /* Zero the request like every other ioctl request in this file, so
         * no uninitialized stack bytes are handed to the kernel.
         */
        struct drm_vc4_perfmon_get_values req = { };
        unsigned i;
        int ret;

        if (!query->hwperfmon) {
                vresult->u64 = 0;
                return true;
        }

        if (!vc4_wait_seqno(ctx->screen, query->hwperfmon->last_seqno,
                            wait ? PIPE_TIMEOUT_INFINITE : 0, "perfmon"))
                return false;

        req.id = query->hwperfmon->id;
        /* The kernel writes the values straight into our counters array. */
        req.values_ptr = (uintptr_t)query->hwperfmon->counters;
        ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_GET_VALUES, &req);
        if (ret)
                return false;

        for (i = 0; i < query->num_queries; i++)
                vresult->batch[i].u64 = query->hwperfmon->counters[i];

        return true;
}

/**
 * pipe_context::set_active_query_state hook (installed by vc4_query_init()).
 *
 * Intentionally a no-op: the body is empty and \p enable is ignored.
 */
static void
vc4_set_active_query_state(struct pipe_context *pctx, boolean enable)
{
}

/**
 * Install the query entry points of this file into \p pctx.
 */
void
vc4_query_init(struct pipe_context *pctx)
{
        /* Object lifetime. */
        pctx->create_batch_query = vc4_create_batch_query;
        pctx->create_query = vc4_create_query;
        pctx->destroy_query = vc4_destroy_query;

        /* Measurement. */
        pctx->begin_query = vc4_begin_query;
        pctx->end_query = vc4_end_query;
        pctx->get_query_result = vc4_get_query_result;

        /* State. */
        pctx->set_active_query_state = vc4_set_active_query_state;
}