17ec681f3Smrg/* 27ec681f3Smrg * Copyright © 2021 Google, Inc. 37ec681f3Smrg * 47ec681f3Smrg * SPDX-License-Identifier: MIT 57ec681f3Smrg */ 67ec681f3Smrg 77ec681f3Smrg#include "fd_pps_driver.h" 87ec681f3Smrg 97ec681f3Smrg#include <cstring> 107ec681f3Smrg#include <iostream> 117ec681f3Smrg#include <perfetto.h> 127ec681f3Smrg 137ec681f3Smrg#include "pps/pps.h" 147ec681f3Smrg#include "pps/pps_algorithm.h" 157ec681f3Smrg 167ec681f3Smrgnamespace pps 177ec681f3Smrg{ 187ec681f3Smrg 197ec681f3Smrguint64_t 207ec681f3SmrgFreedrenoDriver::get_min_sampling_period_ns() 217ec681f3Smrg{ 227ec681f3Smrg return 100000; 237ec681f3Smrg} 247ec681f3Smrg 257ec681f3Smrg/* 267ec681f3SmrgTODO this sees like it would be largely the same for a5xx as well 277ec681f3Smrg(ie. same countable names).. 287ec681f3Smrg */ 297ec681f3Smrgvoid 307ec681f3SmrgFreedrenoDriver::setup_a6xx_counters() 317ec681f3Smrg{ 327ec681f3Smrg /* TODO is there a reason to want more than one group? */ 337ec681f3Smrg CounterGroup group = {}; 347ec681f3Smrg group.name = "counters"; 357ec681f3Smrg groups.clear(); 367ec681f3Smrg counters.clear(); 377ec681f3Smrg countables.clear(); 387ec681f3Smrg enabled_counters.clear(); 397ec681f3Smrg groups.emplace_back(std::move(group)); 407ec681f3Smrg 417ec681f3Smrg /* 427ec681f3Smrg * Create the countables that we'll be using. 437ec681f3Smrg */ 447ec681f3Smrg 457ec681f3Smrg auto PERF_CP_ALWAYS_COUNT = countable("PERF_CP_ALWAYS_COUNT"); 467ec681f3Smrg auto PERF_CP_BUSY_CYCLES = countable("PERF_CP_BUSY_CYCLES"); 477ec681f3Smrg auto PERF_RB_3D_PIXELS = countable("PERF_RB_3D_PIXELS"); 487ec681f3Smrg auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS"); 497ec681f3Smrg auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS"); 507ec681f3Smrg auto PERF_TP_L1_CACHELINE_MISSES = countable("PERF_TP_L1_CACHELINE_MISSES"); 517ec681f3Smrg auto PERF_SP_BUSY_CYCLES = countable("PERF_SP_BUSY_CYCLES"); 527ec681f3Smrg 537ec681f3Smrg /* 547ec681f3Smrg * And then setup the derived counters that we are exporting to 557ec681f3Smrg * pps based on the captured countable values 567ec681f3Smrg */ 577ec681f3Smrg 587ec681f3Smrg counter("GPU Frequency", Counter::Units::Hertz, [=]() { 597ec681f3Smrg return PERF_CP_ALWAYS_COUNT / time; 607ec681f3Smrg } 617ec681f3Smrg ); 627ec681f3Smrg 637ec681f3Smrg counter("GPU % Utilization", Counter::Units::Percent, [=]() { 647ec681f3Smrg return 100.0 * (PERF_CP_BUSY_CYCLES / time) / max_freq; 657ec681f3Smrg } 667ec681f3Smrg ); 677ec681f3Smrg 687ec681f3Smrg // This one is a bit of a guess, but seems plausible.. 697ec681f3Smrg counter("ALU / Fragment", Counter::Units::None, [=]() { 707ec681f3Smrg return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + 717ec681f3Smrg PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2) / PERF_RB_3D_PIXELS; 727ec681f3Smrg } 737ec681f3Smrg ); 747ec681f3Smrg 757ec681f3Smrg counter("TP L1 Cache Misses", Counter::Units::None, [=]() { 767ec681f3Smrg return PERF_TP_L1_CACHELINE_MISSES / time; 777ec681f3Smrg } 787ec681f3Smrg ); 797ec681f3Smrg 807ec681f3Smrg counter("Shader Core Utilization", Counter::Units::Percent, [=]() { 817ec681f3Smrg return 100.0 * (PERF_SP_BUSY_CYCLES / time) / (max_freq * info->num_sp_cores); 827ec681f3Smrg } 837ec681f3Smrg ); 847ec681f3Smrg 857ec681f3Smrg // TODO add more.. see https://gpuinspector.dev/docs/gpu-counters/qualcomm 867ec681f3Smrg // for what blob exposes 877ec681f3Smrg} 887ec681f3Smrg 897ec681f3Smrg/** 907ec681f3Smrg * Generate an submit the cmdstream to configure the counter/countable 917ec681f3Smrg * muxing 927ec681f3Smrg */ 937ec681f3Smrgvoid 947ec681f3SmrgFreedrenoDriver::configure_counters(bool reset, bool wait) 957ec681f3Smrg{ 967ec681f3Smrg struct fd_submit *submit = fd_submit_new(pipe); 977ec681f3Smrg enum fd_ringbuffer_flags flags = 987ec681f3Smrg (enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE); 997ec681f3Smrg struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0x1000, flags); 1007ec681f3Smrg 1017ec681f3Smrg for (auto countable : countables) 1027ec681f3Smrg countable.configure(ring, reset); 1037ec681f3Smrg 1047ec681f3Smrg struct fd_submit_fence fence = {}; 1057ec681f3Smrg util_queue_fence_init(&fence.ready); 1067ec681f3Smrg 1077ec681f3Smrg fd_submit_flush(submit, -1, &fence); 1087ec681f3Smrg 1097ec681f3Smrg util_queue_fence_wait(&fence.ready); 1107ec681f3Smrg 1117ec681f3Smrg fd_ringbuffer_del(ring); 1127ec681f3Smrg fd_submit_del(submit); 1137ec681f3Smrg 1147ec681f3Smrg if (wait) 1157ec681f3Smrg fd_pipe_wait(pipe, &fence.fence); 1167ec681f3Smrg} 1177ec681f3Smrg 1187ec681f3Smrg/** 1197ec681f3Smrg * Read the current counter values and record the time. 1207ec681f3Smrg */ 1217ec681f3Smrgvoid 1227ec681f3SmrgFreedrenoDriver::collect_countables() 1237ec681f3Smrg{ 1247ec681f3Smrg last_dump_ts = perfetto::base::GetBootTimeNs().count(); 1257ec681f3Smrg 1267ec681f3Smrg for (auto countable : countables) 1277ec681f3Smrg countable.collect(); 1287ec681f3Smrg} 1297ec681f3Smrg 1307ec681f3Smrgbool 1317ec681f3SmrgFreedrenoDriver::init_perfcnt() 1327ec681f3Smrg{ 1337ec681f3Smrg uint64_t val; 1347ec681f3Smrg 1357ec681f3Smrg dev = fd_device_new(drm_device.fd); 1367ec681f3Smrg pipe = fd_pipe_new(dev, FD_PIPE_3D); 1377ec681f3Smrg dev_id = fd_pipe_dev_id(pipe); 1387ec681f3Smrg 1397ec681f3Smrg if (fd_pipe_get_param(pipe, FD_MAX_FREQ, &val)) { 1407ec681f3Smrg PERFETTO_FATAL("Could not get MAX_FREQ"); 1417ec681f3Smrg return false; 1427ec681f3Smrg } 1437ec681f3Smrg max_freq = val; 1447ec681f3Smrg 1457ec681f3Smrg if (fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val)) { 1467ec681f3Smrg PERFETTO_ILOG("Could not get SUSPEND_COUNT"); 1477ec681f3Smrg } else { 1487ec681f3Smrg suspend_count = val; 1497ec681f3Smrg has_suspend_count = true; 1507ec681f3Smrg } 1517ec681f3Smrg 1527ec681f3Smrg perfcntrs = fd_perfcntrs(fd_pipe_dev_id(pipe), &num_perfcntrs); 1537ec681f3Smrg if (num_perfcntrs == 0) { 1547ec681f3Smrg PERFETTO_FATAL("No hw counters available"); 1557ec681f3Smrg return false; 1567ec681f3Smrg } 1577ec681f3Smrg 1587ec681f3Smrg assigned_counters.resize(num_perfcntrs); 1597ec681f3Smrg assigned_counters.assign(assigned_counters.size(), 0); 1607ec681f3Smrg 1617ec681f3Smrg switch (fd_dev_gen(dev_id)) { 1627ec681f3Smrg case 6: 1637ec681f3Smrg setup_a6xx_counters(); 1647ec681f3Smrg break; 1657ec681f3Smrg default: 1667ec681f3Smrg PERFETTO_FATAL("Unsupported GPU: a%03u", fd_dev_gpu_id(dev_id)); 1677ec681f3Smrg return false; 1687ec681f3Smrg } 1697ec681f3Smrg 1707ec681f3Smrg state.resize(next_countable_id); 1717ec681f3Smrg 1727ec681f3Smrg for (auto countable : countables) 1737ec681f3Smrg countable.resolve(); 1747ec681f3Smrg 1757ec681f3Smrg info = fd_dev_info(dev_id); 1767ec681f3Smrg 1777ec681f3Smrg io = fd_dt_find_io(); 1787ec681f3Smrg if (!io) { 1797ec681f3Smrg PERFETTO_FATAL("Could not map GPU I/O space"); 1807ec681f3Smrg return false; 1817ec681f3Smrg } 1827ec681f3Smrg 1837ec681f3Smrg configure_counters(true, true); 1847ec681f3Smrg collect_countables(); 1857ec681f3Smrg 1867ec681f3Smrg return true; 1877ec681f3Smrg} 1887ec681f3Smrg 1897ec681f3Smrgvoid 1907ec681f3SmrgFreedrenoDriver::enable_counter(const uint32_t counter_id) 1917ec681f3Smrg{ 1927ec681f3Smrg enabled_counters.push_back(counters[counter_id]); 1937ec681f3Smrg} 1947ec681f3Smrg 1957ec681f3Smrgvoid 1967ec681f3SmrgFreedrenoDriver::enable_all_counters() 1977ec681f3Smrg{ 1987ec681f3Smrg enabled_counters.reserve(counters.size()); 1997ec681f3Smrg for (auto &counter : counters) { 2007ec681f3Smrg enabled_counters.push_back(counter); 2017ec681f3Smrg } 2027ec681f3Smrg} 2037ec681f3Smrg 2047ec681f3Smrgvoid 2057ec681f3SmrgFreedrenoDriver::enable_perfcnt(const uint64_t /* sampling_period_ns */) 2067ec681f3Smrg{ 2077ec681f3Smrg} 2087ec681f3Smrg 2097ec681f3Smrgbool 2107ec681f3SmrgFreedrenoDriver::dump_perfcnt() 2117ec681f3Smrg{ 2127ec681f3Smrg if (has_suspend_count) { 2137ec681f3Smrg uint64_t val; 2147ec681f3Smrg 2157ec681f3Smrg fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val); 2167ec681f3Smrg 2177ec681f3Smrg if (suspend_count != val) { 2187ec681f3Smrg PERFETTO_ILOG("Device had suspended!"); 2197ec681f3Smrg 2207ec681f3Smrg suspend_count = val; 2217ec681f3Smrg 2227ec681f3Smrg configure_counters(true, true); 2237ec681f3Smrg collect_countables(); 2247ec681f3Smrg 2257ec681f3Smrg /* We aren't going to have anything sensible by comparing 2267ec681f3Smrg * current values to values from prior to the suspend, so 2277ec681f3Smrg * just skip this sampling period. 2287ec681f3Smrg */ 2297ec681f3Smrg return false; 2307ec681f3Smrg } 2317ec681f3Smrg } 2327ec681f3Smrg 2337ec681f3Smrg auto last_ts = last_dump_ts; 2347ec681f3Smrg 2357ec681f3Smrg /* Capture the timestamp from the *start* of the sampling period: */ 2367ec681f3Smrg last_capture_ts = last_dump_ts; 2377ec681f3Smrg 2387ec681f3Smrg collect_countables(); 2397ec681f3Smrg 2407ec681f3Smrg auto elapsed_time_ns = last_dump_ts - last_ts; 2417ec681f3Smrg 2427ec681f3Smrg time = (float)elapsed_time_ns / 1000000000.0; 2437ec681f3Smrg 2447ec681f3Smrg /* On older kernels that dont' support querying the suspend- 2457ec681f3Smrg * count, just send configuration cmdstream regularly to keep 2467ec681f3Smrg * the GPU alive and correctly configured for the countables 2477ec681f3Smrg * we want 2487ec681f3Smrg */ 2497ec681f3Smrg if (!has_suspend_count) { 2507ec681f3Smrg configure_counters(false, false); 2517ec681f3Smrg } 2527ec681f3Smrg 2537ec681f3Smrg return true; 2547ec681f3Smrg} 2557ec681f3Smrg 2567ec681f3Smrguint64_t FreedrenoDriver::next() 2577ec681f3Smrg{ 2587ec681f3Smrg auto ret = last_capture_ts; 2597ec681f3Smrg last_capture_ts = 0; 2607ec681f3Smrg return ret; 2617ec681f3Smrg} 2627ec681f3Smrg 2637ec681f3Smrgvoid FreedrenoDriver::disable_perfcnt() 2647ec681f3Smrg{ 2657ec681f3Smrg /* There isn't really any disable, only reconfiguring which countables 2667ec681f3Smrg * get muxed to which counters 2677ec681f3Smrg */ 2687ec681f3Smrg} 2697ec681f3Smrg 2707ec681f3Smrg/* 2717ec681f3Smrg * Countable 2727ec681f3Smrg */ 2737ec681f3Smrg 2747ec681f3SmrgFreedrenoDriver::Countable 2757ec681f3SmrgFreedrenoDriver::countable(std::string name) 2767ec681f3Smrg{ 2777ec681f3Smrg auto countable = Countable(this, name); 2787ec681f3Smrg countables.emplace_back(countable); 2797ec681f3Smrg return countable; 2807ec681f3Smrg} 2817ec681f3Smrg 2827ec681f3SmrgFreedrenoDriver::Countable::Countable(FreedrenoDriver *d, std::string name) 2837ec681f3Smrg : id {d->next_countable_id++}, d {d}, name {name} 2847ec681f3Smrg{ 2857ec681f3Smrg} 2867ec681f3Smrg 2877ec681f3Smrg/* Emit register writes on ring to configure counter/countable muxing: */ 2887ec681f3Smrgvoid 2897ec681f3SmrgFreedrenoDriver::Countable::configure(struct fd_ringbuffer *ring, bool reset) 2907ec681f3Smrg{ 2917ec681f3Smrg const struct fd_perfcntr_countable *countable = d->state[id].countable; 2927ec681f3Smrg const struct fd_perfcntr_counter *counter = d->state[id].counter; 2937ec681f3Smrg 2947ec681f3Smrg OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0); 2957ec681f3Smrg 2967ec681f3Smrg if (counter->enable && reset) { 2977ec681f3Smrg OUT_PKT4(ring, counter->enable, 1); 2987ec681f3Smrg OUT_RING(ring, 0); 2997ec681f3Smrg } 3007ec681f3Smrg 3017ec681f3Smrg if (counter->clear && reset) { 3027ec681f3Smrg OUT_PKT4(ring, counter->clear, 1); 3037ec681f3Smrg OUT_RING(ring, 1); 3047ec681f3Smrg 3057ec681f3Smrg OUT_PKT4(ring, counter->clear, 1); 3067ec681f3Smrg OUT_RING(ring, 0); 3077ec681f3Smrg } 3087ec681f3Smrg 3097ec681f3Smrg OUT_PKT4(ring, counter->select_reg, 1); 3107ec681f3Smrg OUT_RING(ring, countable->selector); 3117ec681f3Smrg 3127ec681f3Smrg if (counter->enable && reset) { 3137ec681f3Smrg OUT_PKT4(ring, counter->enable, 1); 3147ec681f3Smrg OUT_RING(ring, 1); 3157ec681f3Smrg } 3167ec681f3Smrg} 3177ec681f3Smrg 3187ec681f3Smrg/* Collect current counter value and calculate delta since last sample: */ 3197ec681f3Smrgvoid 3207ec681f3SmrgFreedrenoDriver::Countable::collect() 3217ec681f3Smrg{ 3227ec681f3Smrg const struct fd_perfcntr_counter *counter = d->state[id].counter; 3237ec681f3Smrg 3247ec681f3Smrg d->state[id].last_value = d->state[id].value; 3257ec681f3Smrg 3267ec681f3Smrg uint32_t *reg_lo = (uint32_t *)d->io + counter->counter_reg_lo; 3277ec681f3Smrg uint32_t *reg_hi = (uint32_t *)d->io + counter->counter_reg_hi; 3287ec681f3Smrg 3297ec681f3Smrg uint32_t lo = *reg_lo; 3307ec681f3Smrg uint32_t hi = *reg_hi; 3317ec681f3Smrg 3327ec681f3Smrg d->state[id].value = lo | ((uint64_t)hi << 32); 3337ec681f3Smrg} 3347ec681f3Smrg 3357ec681f3Smrg/* Resolve the countable and assign next counter from it's group: */ 3367ec681f3Smrgvoid 3377ec681f3SmrgFreedrenoDriver::Countable::resolve() 3387ec681f3Smrg{ 3397ec681f3Smrg for (unsigned i = 0; i < d->num_perfcntrs; i++) { 3407ec681f3Smrg const struct fd_perfcntr_group *g = &d->perfcntrs[i]; 3417ec681f3Smrg for (unsigned j = 0; j < g->num_countables; j++) { 3427ec681f3Smrg const struct fd_perfcntr_countable *c = &g->countables[j]; 3437ec681f3Smrg if (name == c->name) { 3447ec681f3Smrg d->state[id].countable = c; 3457ec681f3Smrg 3467ec681f3Smrg /* Assign a counter from the same group: */ 3477ec681f3Smrg assert(d->assigned_counters[i] < g->num_counters); 3487ec681f3Smrg d->state[id].counter = &g->counters[d->assigned_counters[i]++]; 3497ec681f3Smrg 3507ec681f3Smrg std::cout << "Countable: " << name << ", group=" << g->name << 3517ec681f3Smrg ", counter=" << d->assigned_counters[i] - 1 << "\n"; 3527ec681f3Smrg 3537ec681f3Smrg return; 3547ec681f3Smrg } 3557ec681f3Smrg } 3567ec681f3Smrg } 3577ec681f3Smrg unreachable("no such countable!"); 3587ec681f3Smrg} 3597ec681f3Smrg 3607ec681f3Smrguint64_t 3617ec681f3SmrgFreedrenoDriver::Countable::get_value() const 3627ec681f3Smrg{ 3637ec681f3Smrg return d->state[id].value - d->state[id].last_value; 3647ec681f3Smrg} 3657ec681f3Smrg 3667ec681f3Smrg/* 3677ec681f3Smrg * DerivedCounter 3687ec681f3Smrg */ 3697ec681f3Smrg 3707ec681f3SmrgFreedrenoDriver::DerivedCounter::DerivedCounter(FreedrenoDriver *d, std::string name, 3717ec681f3Smrg Counter::Units units, 3727ec681f3Smrg std::function<int64_t()> derive) 3737ec681f3Smrg : Counter(d->next_counter_id++, name, 0) 3747ec681f3Smrg{ 3757ec681f3Smrg std::cout << "DerivedCounter: " << name << ", id=" << id << "\n"; 3767ec681f3Smrg this->units = units; 3777ec681f3Smrg set_getter([=](const Counter &c, const Driver &d) { 3787ec681f3Smrg return derive(); 3797ec681f3Smrg } 3807ec681f3Smrg ); 3817ec681f3Smrg} 3827ec681f3Smrg 3837ec681f3SmrgFreedrenoDriver::DerivedCounter 3847ec681f3SmrgFreedrenoDriver::counter(std::string name, Counter::Units units, 3857ec681f3Smrg std::function<int64_t()> derive) 3867ec681f3Smrg{ 3877ec681f3Smrg auto counter = DerivedCounter(this, name, units, derive); 3887ec681f3Smrg counters.emplace_back(counter); 3897ec681f3Smrg return counter; 3907ec681f3Smrg} 3917ec681f3Smrg 3927ec681f3Smrg} // namespace pps 393