17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2021 Google, Inc.
37ec681f3Smrg *
47ec681f3Smrg * SPDX-License-Identifier: MIT
57ec681f3Smrg */
67ec681f3Smrg
77ec681f3Smrg#include "fd_pps_driver.h"
87ec681f3Smrg
97ec681f3Smrg#include <cstring>
107ec681f3Smrg#include <iostream>
117ec681f3Smrg#include <perfetto.h>
127ec681f3Smrg
137ec681f3Smrg#include "pps/pps.h"
147ec681f3Smrg#include "pps/pps_algorithm.h"
157ec681f3Smrg
167ec681f3Smrgnamespace pps
177ec681f3Smrg{
187ec681f3Smrg
197ec681f3Smrguint64_t
207ec681f3SmrgFreedrenoDriver::get_min_sampling_period_ns()
217ec681f3Smrg{
227ec681f3Smrg   return 100000;
237ec681f3Smrg}
247ec681f3Smrg
257ec681f3Smrg/*
267ec681f3SmrgTODO this sees like it would be largely the same for a5xx as well
277ec681f3Smrg(ie. same countable names)..
287ec681f3Smrg */
297ec681f3Smrgvoid
307ec681f3SmrgFreedrenoDriver::setup_a6xx_counters()
317ec681f3Smrg{
327ec681f3Smrg   /* TODO is there a reason to want more than one group? */
337ec681f3Smrg   CounterGroup group = {};
347ec681f3Smrg   group.name = "counters";
357ec681f3Smrg   groups.clear();
367ec681f3Smrg   counters.clear();
377ec681f3Smrg   countables.clear();
387ec681f3Smrg   enabled_counters.clear();
397ec681f3Smrg   groups.emplace_back(std::move(group));
407ec681f3Smrg
417ec681f3Smrg   /*
427ec681f3Smrg    * Create the countables that we'll be using.
437ec681f3Smrg    */
447ec681f3Smrg
457ec681f3Smrg   auto PERF_CP_ALWAYS_COUNT = countable("PERF_CP_ALWAYS_COUNT");
467ec681f3Smrg   auto PERF_CP_BUSY_CYCLES  = countable("PERF_CP_BUSY_CYCLES");
477ec681f3Smrg   auto PERF_RB_3D_PIXELS    = countable("PERF_RB_3D_PIXELS");
487ec681f3Smrg   auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS");
497ec681f3Smrg   auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS");
507ec681f3Smrg   auto PERF_TP_L1_CACHELINE_MISSES = countable("PERF_TP_L1_CACHELINE_MISSES");
517ec681f3Smrg   auto PERF_SP_BUSY_CYCLES  = countable("PERF_SP_BUSY_CYCLES");
527ec681f3Smrg
537ec681f3Smrg   /*
547ec681f3Smrg    * And then setup the derived counters that we are exporting to
557ec681f3Smrg    * pps based on the captured countable values
567ec681f3Smrg    */
577ec681f3Smrg
587ec681f3Smrg   counter("GPU Frequency", Counter::Units::Hertz, [=]() {
597ec681f3Smrg         return PERF_CP_ALWAYS_COUNT / time;
607ec681f3Smrg      }
617ec681f3Smrg   );
627ec681f3Smrg
637ec681f3Smrg   counter("GPU % Utilization", Counter::Units::Percent, [=]() {
647ec681f3Smrg         return 100.0 * (PERF_CP_BUSY_CYCLES / time) / max_freq;
657ec681f3Smrg      }
667ec681f3Smrg   );
677ec681f3Smrg
687ec681f3Smrg   // This one is a bit of a guess, but seems plausible..
697ec681f3Smrg   counter("ALU / Fragment", Counter::Units::None, [=]() {
707ec681f3Smrg         return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
717ec681f3Smrg               PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2) / PERF_RB_3D_PIXELS;
727ec681f3Smrg      }
737ec681f3Smrg   );
747ec681f3Smrg
757ec681f3Smrg   counter("TP L1 Cache Misses", Counter::Units::None, [=]() {
767ec681f3Smrg         return PERF_TP_L1_CACHELINE_MISSES / time;
777ec681f3Smrg      }
787ec681f3Smrg   );
797ec681f3Smrg
807ec681f3Smrg   counter("Shader Core Utilization", Counter::Units::Percent, [=]() {
817ec681f3Smrg         return 100.0 * (PERF_SP_BUSY_CYCLES / time) / (max_freq * info->num_sp_cores);
827ec681f3Smrg      }
837ec681f3Smrg   );
847ec681f3Smrg
857ec681f3Smrg   // TODO add more.. see https://gpuinspector.dev/docs/gpu-counters/qualcomm
867ec681f3Smrg   // for what blob exposes
877ec681f3Smrg}
887ec681f3Smrg
897ec681f3Smrg/**
907ec681f3Smrg * Generate an submit the cmdstream to configure the counter/countable
917ec681f3Smrg * muxing
927ec681f3Smrg */
937ec681f3Smrgvoid
947ec681f3SmrgFreedrenoDriver::configure_counters(bool reset, bool wait)
957ec681f3Smrg{
967ec681f3Smrg   struct fd_submit *submit = fd_submit_new(pipe);
977ec681f3Smrg   enum fd_ringbuffer_flags flags =
987ec681f3Smrg      (enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
997ec681f3Smrg   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0x1000, flags);
1007ec681f3Smrg
1017ec681f3Smrg   for (auto countable : countables)
1027ec681f3Smrg      countable.configure(ring, reset);
1037ec681f3Smrg
1047ec681f3Smrg   struct fd_submit_fence fence = {};
1057ec681f3Smrg   util_queue_fence_init(&fence.ready);
1067ec681f3Smrg
1077ec681f3Smrg   fd_submit_flush(submit, -1, &fence);
1087ec681f3Smrg
1097ec681f3Smrg   util_queue_fence_wait(&fence.ready);
1107ec681f3Smrg
1117ec681f3Smrg   fd_ringbuffer_del(ring);
1127ec681f3Smrg   fd_submit_del(submit);
1137ec681f3Smrg
1147ec681f3Smrg   if (wait)
1157ec681f3Smrg      fd_pipe_wait(pipe, &fence.fence);
1167ec681f3Smrg}
1177ec681f3Smrg
1187ec681f3Smrg/**
1197ec681f3Smrg * Read the current counter values and record the time.
1207ec681f3Smrg */
1217ec681f3Smrgvoid
1227ec681f3SmrgFreedrenoDriver::collect_countables()
1237ec681f3Smrg{
1247ec681f3Smrg   last_dump_ts = perfetto::base::GetBootTimeNs().count();
1257ec681f3Smrg
1267ec681f3Smrg   for (auto countable : countables)
1277ec681f3Smrg      countable.collect();
1287ec681f3Smrg}
1297ec681f3Smrg
1307ec681f3Smrgbool
1317ec681f3SmrgFreedrenoDriver::init_perfcnt()
1327ec681f3Smrg{
1337ec681f3Smrg   uint64_t val;
1347ec681f3Smrg
1357ec681f3Smrg   dev = fd_device_new(drm_device.fd);
1367ec681f3Smrg   pipe = fd_pipe_new(dev, FD_PIPE_3D);
1377ec681f3Smrg   dev_id = fd_pipe_dev_id(pipe);
1387ec681f3Smrg
1397ec681f3Smrg   if (fd_pipe_get_param(pipe, FD_MAX_FREQ, &val)) {
1407ec681f3Smrg      PERFETTO_FATAL("Could not get MAX_FREQ");
1417ec681f3Smrg      return false;
1427ec681f3Smrg   }
1437ec681f3Smrg   max_freq = val;
1447ec681f3Smrg
1457ec681f3Smrg   if (fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val)) {
1467ec681f3Smrg      PERFETTO_ILOG("Could not get SUSPEND_COUNT");
1477ec681f3Smrg   } else {
1487ec681f3Smrg      suspend_count = val;
1497ec681f3Smrg      has_suspend_count = true;
1507ec681f3Smrg   }
1517ec681f3Smrg
1527ec681f3Smrg   perfcntrs = fd_perfcntrs(fd_pipe_dev_id(pipe), &num_perfcntrs);
1537ec681f3Smrg   if (num_perfcntrs == 0) {
1547ec681f3Smrg      PERFETTO_FATAL("No hw counters available");
1557ec681f3Smrg      return false;
1567ec681f3Smrg   }
1577ec681f3Smrg
1587ec681f3Smrg   assigned_counters.resize(num_perfcntrs);
1597ec681f3Smrg   assigned_counters.assign(assigned_counters.size(), 0);
1607ec681f3Smrg
1617ec681f3Smrg   switch (fd_dev_gen(dev_id)) {
1627ec681f3Smrg   case 6:
1637ec681f3Smrg      setup_a6xx_counters();
1647ec681f3Smrg      break;
1657ec681f3Smrg   default:
1667ec681f3Smrg      PERFETTO_FATAL("Unsupported GPU: a%03u", fd_dev_gpu_id(dev_id));
1677ec681f3Smrg      return false;
1687ec681f3Smrg   }
1697ec681f3Smrg
1707ec681f3Smrg   state.resize(next_countable_id);
1717ec681f3Smrg
1727ec681f3Smrg   for (auto countable : countables)
1737ec681f3Smrg      countable.resolve();
1747ec681f3Smrg
1757ec681f3Smrg   info = fd_dev_info(dev_id);
1767ec681f3Smrg
1777ec681f3Smrg   io = fd_dt_find_io();
1787ec681f3Smrg   if (!io) {
1797ec681f3Smrg      PERFETTO_FATAL("Could not map GPU I/O space");
1807ec681f3Smrg      return false;
1817ec681f3Smrg   }
1827ec681f3Smrg
1837ec681f3Smrg   configure_counters(true, true);
1847ec681f3Smrg   collect_countables();
1857ec681f3Smrg
1867ec681f3Smrg   return true;
1877ec681f3Smrg}
1887ec681f3Smrg
1897ec681f3Smrgvoid
1907ec681f3SmrgFreedrenoDriver::enable_counter(const uint32_t counter_id)
1917ec681f3Smrg{
1927ec681f3Smrg   enabled_counters.push_back(counters[counter_id]);
1937ec681f3Smrg}
1947ec681f3Smrg
1957ec681f3Smrgvoid
1967ec681f3SmrgFreedrenoDriver::enable_all_counters()
1977ec681f3Smrg{
1987ec681f3Smrg   enabled_counters.reserve(counters.size());
1997ec681f3Smrg   for (auto &counter : counters) {
2007ec681f3Smrg      enabled_counters.push_back(counter);
2017ec681f3Smrg   }
2027ec681f3Smrg}
2037ec681f3Smrg
/**
 * Intentionally empty: this implementation does nothing and ignores the
 * requested sampling period.
 */
void
FreedrenoDriver::enable_perfcnt(const uint64_t /* sampling_period_ns */)
{
}
2087ec681f3Smrg
2097ec681f3Smrgbool
2107ec681f3SmrgFreedrenoDriver::dump_perfcnt()
2117ec681f3Smrg{
2127ec681f3Smrg   if (has_suspend_count) {
2137ec681f3Smrg      uint64_t val;
2147ec681f3Smrg
2157ec681f3Smrg      fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val);
2167ec681f3Smrg
2177ec681f3Smrg      if (suspend_count != val) {
2187ec681f3Smrg         PERFETTO_ILOG("Device had suspended!");
2197ec681f3Smrg
2207ec681f3Smrg         suspend_count = val;
2217ec681f3Smrg
2227ec681f3Smrg         configure_counters(true, true);
2237ec681f3Smrg         collect_countables();
2247ec681f3Smrg
2257ec681f3Smrg         /* We aren't going to have anything sensible by comparing
2267ec681f3Smrg          * current values to values from prior to the suspend, so
2277ec681f3Smrg          * just skip this sampling period.
2287ec681f3Smrg          */
2297ec681f3Smrg         return false;
2307ec681f3Smrg      }
2317ec681f3Smrg   }
2327ec681f3Smrg
2337ec681f3Smrg   auto last_ts = last_dump_ts;
2347ec681f3Smrg
2357ec681f3Smrg   /* Capture the timestamp from the *start* of the sampling period: */
2367ec681f3Smrg   last_capture_ts = last_dump_ts;
2377ec681f3Smrg
2387ec681f3Smrg   collect_countables();
2397ec681f3Smrg
2407ec681f3Smrg   auto elapsed_time_ns = last_dump_ts - last_ts;
2417ec681f3Smrg
2427ec681f3Smrg   time = (float)elapsed_time_ns / 1000000000.0;
2437ec681f3Smrg
2447ec681f3Smrg   /* On older kernels that dont' support querying the suspend-
2457ec681f3Smrg    * count, just send configuration cmdstream regularly to keep
2467ec681f3Smrg    * the GPU alive and correctly configured for the countables
2477ec681f3Smrg    * we want
2487ec681f3Smrg    */
2497ec681f3Smrg   if (!has_suspend_count) {
2507ec681f3Smrg      configure_counters(false, false);
2517ec681f3Smrg   }
2527ec681f3Smrg
2537ec681f3Smrg   return true;
2547ec681f3Smrg}
2557ec681f3Smrg
2567ec681f3Smrguint64_t FreedrenoDriver::next()
2577ec681f3Smrg{
2587ec681f3Smrg   auto ret = last_capture_ts;
2597ec681f3Smrg   last_capture_ts = 0;
2607ec681f3Smrg   return ret;
2617ec681f3Smrg}
2627ec681f3Smrg
/** Intentionally a no-op; the body contains only the explanation. */
void FreedrenoDriver::disable_perfcnt()
{
   /* There isn't really any disable, only reconfiguring which countables
    * get muxed to which counters
    */
}
2697ec681f3Smrg
2707ec681f3Smrg/*
2717ec681f3Smrg * Countable
2727ec681f3Smrg */
2737ec681f3Smrg
2747ec681f3SmrgFreedrenoDriver::Countable
2757ec681f3SmrgFreedrenoDriver::countable(std::string name)
2767ec681f3Smrg{
2777ec681f3Smrg   auto countable = Countable(this, name);
2787ec681f3Smrg   countables.emplace_back(countable);
2797ec681f3Smrg   return countable;
2807ec681f3Smrg}
2817ec681f3Smrg
2827ec681f3SmrgFreedrenoDriver::Countable::Countable(FreedrenoDriver *d, std::string name)
2837ec681f3Smrg   : id {d->next_countable_id++}, d {d}, name {name}
2847ec681f3Smrg{
2857ec681f3Smrg}
2867ec681f3Smrg
2877ec681f3Smrg/* Emit register writes on ring to configure counter/countable muxing: */
2887ec681f3Smrgvoid
2897ec681f3SmrgFreedrenoDriver::Countable::configure(struct fd_ringbuffer *ring, bool reset)
2907ec681f3Smrg{
2917ec681f3Smrg   const struct fd_perfcntr_countable *countable = d->state[id].countable;
2927ec681f3Smrg   const struct fd_perfcntr_counter   *counter   = d->state[id].counter;
2937ec681f3Smrg
2947ec681f3Smrg   OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
2957ec681f3Smrg
2967ec681f3Smrg   if (counter->enable && reset) {
2977ec681f3Smrg      OUT_PKT4(ring, counter->enable, 1);
2987ec681f3Smrg      OUT_RING(ring, 0);
2997ec681f3Smrg   }
3007ec681f3Smrg
3017ec681f3Smrg   if (counter->clear && reset) {
3027ec681f3Smrg      OUT_PKT4(ring, counter->clear, 1);
3037ec681f3Smrg      OUT_RING(ring, 1);
3047ec681f3Smrg
3057ec681f3Smrg      OUT_PKT4(ring, counter->clear, 1);
3067ec681f3Smrg      OUT_RING(ring, 0);
3077ec681f3Smrg   }
3087ec681f3Smrg
3097ec681f3Smrg   OUT_PKT4(ring, counter->select_reg, 1);
3107ec681f3Smrg   OUT_RING(ring, countable->selector);
3117ec681f3Smrg
3127ec681f3Smrg   if (counter->enable && reset) {
3137ec681f3Smrg      OUT_PKT4(ring, counter->enable, 1);
3147ec681f3Smrg      OUT_RING(ring, 1);
3157ec681f3Smrg   }
3167ec681f3Smrg}
3177ec681f3Smrg
3187ec681f3Smrg/* Collect current counter value and calculate delta since last sample: */
3197ec681f3Smrgvoid
3207ec681f3SmrgFreedrenoDriver::Countable::collect()
3217ec681f3Smrg{
3227ec681f3Smrg   const struct fd_perfcntr_counter *counter = d->state[id].counter;
3237ec681f3Smrg
3247ec681f3Smrg   d->state[id].last_value = d->state[id].value;
3257ec681f3Smrg
3267ec681f3Smrg   uint32_t *reg_lo = (uint32_t *)d->io + counter->counter_reg_lo;
3277ec681f3Smrg   uint32_t *reg_hi = (uint32_t *)d->io + counter->counter_reg_hi;
3287ec681f3Smrg
3297ec681f3Smrg   uint32_t lo = *reg_lo;
3307ec681f3Smrg   uint32_t hi = *reg_hi;
3317ec681f3Smrg
3327ec681f3Smrg   d->state[id].value = lo | ((uint64_t)hi << 32);
3337ec681f3Smrg}
3347ec681f3Smrg
3357ec681f3Smrg/* Resolve the countable and assign next counter from it's group: */
3367ec681f3Smrgvoid
3377ec681f3SmrgFreedrenoDriver::Countable::resolve()
3387ec681f3Smrg{
3397ec681f3Smrg   for (unsigned i = 0; i < d->num_perfcntrs; i++) {
3407ec681f3Smrg      const struct fd_perfcntr_group *g = &d->perfcntrs[i];
3417ec681f3Smrg      for (unsigned j = 0; j < g->num_countables; j++) {
3427ec681f3Smrg         const struct fd_perfcntr_countable *c = &g->countables[j];
3437ec681f3Smrg         if (name == c->name) {
3447ec681f3Smrg            d->state[id].countable = c;
3457ec681f3Smrg
3467ec681f3Smrg            /* Assign a counter from the same group: */
3477ec681f3Smrg            assert(d->assigned_counters[i] < g->num_counters);
3487ec681f3Smrg            d->state[id].counter = &g->counters[d->assigned_counters[i]++];
3497ec681f3Smrg
3507ec681f3Smrg            std::cout << "Countable: " << name << ", group=" << g->name <<
3517ec681f3Smrg                  ", counter=" << d->assigned_counters[i] - 1 << "\n";
3527ec681f3Smrg
3537ec681f3Smrg            return;
3547ec681f3Smrg         }
3557ec681f3Smrg      }
3567ec681f3Smrg   }
3577ec681f3Smrg   unreachable("no such countable!");
3587ec681f3Smrg}
3597ec681f3Smrg
3607ec681f3Smrguint64_t
3617ec681f3SmrgFreedrenoDriver::Countable::get_value() const
3627ec681f3Smrg{
3637ec681f3Smrg   return d->state[id].value - d->state[id].last_value;
3647ec681f3Smrg}
3657ec681f3Smrg
3667ec681f3Smrg/*
3677ec681f3Smrg * DerivedCounter
3687ec681f3Smrg */
3697ec681f3Smrg
3707ec681f3SmrgFreedrenoDriver::DerivedCounter::DerivedCounter(FreedrenoDriver *d, std::string name,
3717ec681f3Smrg                                                Counter::Units units,
3727ec681f3Smrg                                                std::function<int64_t()> derive)
3737ec681f3Smrg   : Counter(d->next_counter_id++, name, 0)
3747ec681f3Smrg{
3757ec681f3Smrg   std::cout << "DerivedCounter: " << name << ", id=" << id << "\n";
3767ec681f3Smrg   this->units = units;
3777ec681f3Smrg   set_getter([=](const Counter &c, const Driver &d) {
3787ec681f3Smrg         return derive();
3797ec681f3Smrg      }
3807ec681f3Smrg   );
3817ec681f3Smrg}
3827ec681f3Smrg
3837ec681f3SmrgFreedrenoDriver::DerivedCounter
3847ec681f3SmrgFreedrenoDriver::counter(std::string name, Counter::Units units,
3857ec681f3Smrg                         std::function<int64_t()> derive)
3867ec681f3Smrg{
3877ec681f3Smrg   auto counter = DerivedCounter(this, name, units, derive);
3887ec681f3Smrg   counters.emplace_back(counter);
3897ec681f3Smrg   return counter;
3907ec681f3Smrg}
3917ec681f3Smrg
3927ec681f3Smrg} // namespace pps
393