17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2020-2021 Collabora, Ltd.
37ec681f3Smrg * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
47ec681f3Smrg * Author: Corentin Noël <corentin.noel@collabora.com>
57ec681f3Smrg *
67ec681f3Smrg * SPDX-License-Identifier: MIT
77ec681f3Smrg */
87ec681f3Smrg
97ec681f3Smrg#include "intel_pps_driver.h"
107ec681f3Smrg
117ec681f3Smrg#include <dirent.h>
127ec681f3Smrg#include <fcntl.h>
137ec681f3Smrg#include <math.h>
147ec681f3Smrg#include <poll.h>
157ec681f3Smrg#include <strings.h>
167ec681f3Smrg#include <sys/ioctl.h>
177ec681f3Smrg#include <unistd.h>
187ec681f3Smrg
197ec681f3Smrg#include <i915_drm.h>
207ec681f3Smrg#include <intel/perf/intel_perf_query.h>
217ec681f3Smrg
227ec681f3Smrg#include <pps/pps.h>
237ec681f3Smrg#include <pps/pps_algorithm.h>
247ec681f3Smrg
257ec681f3Smrg#include "intel_pps_perf.h"
267ec681f3Smrg
277ec681f3Smrgnamespace pps
287ec681f3Smrg{
297ec681f3Smrguint64_t IntelDriver::get_min_sampling_period_ns()
307ec681f3Smrg{
317ec681f3Smrg   return 500000;
327ec681f3Smrg}
337ec681f3Smrg
347ec681f3Smrgvoid IntelDriver::enable_counter(uint32_t counter_id)
357ec681f3Smrg{
367ec681f3Smrg   auto &counter = counters[counter_id];
377ec681f3Smrg   auto &group = groups[counter.group];
387ec681f3Smrg   if (perf->query) {
397ec681f3Smrg      if (perf->query->symbol_name != group.name) {
407ec681f3Smrg         PPS_LOG_ERROR(
417ec681f3Smrg            "Unable to enable metrics from different sets: %u "
427ec681f3Smrg            "belongs to %s but %s is currently in use.",
437ec681f3Smrg            counter_id,
447ec681f3Smrg            perf->query->symbol_name,
457ec681f3Smrg            group.name.c_str());
467ec681f3Smrg         return;
477ec681f3Smrg      }
487ec681f3Smrg   }
497ec681f3Smrg
507ec681f3Smrg   enabled_counters.emplace_back(counter);
517ec681f3Smrg   if (!perf->query) {
527ec681f3Smrg      perf->query = perf->find_query_by_name(group.name);
537ec681f3Smrg   }
547ec681f3Smrg}
557ec681f3Smrg
567ec681f3Smrgvoid IntelDriver::enable_all_counters()
577ec681f3Smrg{
587ec681f3Smrg   // We can only enable one metric set at a time so at least enable one.
597ec681f3Smrg   for (auto &group : groups) {
607ec681f3Smrg      if (group.name == "RenderBasic") {
617ec681f3Smrg         for (uint32_t counter_id : group.counters) {
627ec681f3Smrg            auto &counter = counters[counter_id];
637ec681f3Smrg            enabled_counters.emplace_back(counter);
647ec681f3Smrg         }
657ec681f3Smrg
667ec681f3Smrg         perf->query = perf->find_query_by_name(group.name);
677ec681f3Smrg         break;
687ec681f3Smrg      }
697ec681f3Smrg   }
707ec681f3Smrg}
717ec681f3Smrg
/// @brief Elapsed time between two timespec values
/// @param begin Start of the interval
/// @param end End of the interval
/// @return `end - begin` expressed in nanoseconds, using unsigned arithmetic
static uint64_t timespec_diff(timespec *begin, timespec *end)
{
   constexpr unsigned long long ns_per_second = 1000000000ull;
   const unsigned long long whole_seconds_ns = ns_per_second * (end->tv_sec - begin->tv_sec);
   return whole_seconds_ns + end->tv_nsec - begin->tv_nsec;
}
767ec681f3Smrg
777ec681f3Smrg/// @brief This function tries to correlate CPU time with GPU time
787ec681f3Smrgstd::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const
797ec681f3Smrg{
807ec681f3Smrg   TimestampCorrelation corr = {};
817ec681f3Smrg
827ec681f3Smrg   clock_t correlation_clock_id = CLOCK_BOOTTIME;
837ec681f3Smrg
847ec681f3Smrg   drm_i915_reg_read reg_read = {};
857ec681f3Smrg   const uint64_t render_ring_timestamp = 0x2358;
867ec681f3Smrg   reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
877ec681f3Smrg
887ec681f3Smrg   constexpr size_t attempt_count = 3;
897ec681f3Smrg   struct {
907ec681f3Smrg      timespec cpu_ts_begin;
917ec681f3Smrg      timespec cpu_ts_end;
927ec681f3Smrg      uint64_t gpu_ts;
937ec681f3Smrg   } attempts[attempt_count] = {};
947ec681f3Smrg
957ec681f3Smrg   uint32_t best = 0;
967ec681f3Smrg
977ec681f3Smrg   // Gather 3 correlations
987ec681f3Smrg   for (uint32_t i = 0; i < attempt_count; i++) {
997ec681f3Smrg      clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);
1007ec681f3Smrg      if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, &reg_read) < 0) {
1017ec681f3Smrg         return std::nullopt;
1027ec681f3Smrg      }
1037ec681f3Smrg      clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);
1047ec681f3Smrg
1057ec681f3Smrg      attempts[i].gpu_ts = reg_read.val;
1067ec681f3Smrg   }
1077ec681f3Smrg
1087ec681f3Smrg   // Now select the best
1097ec681f3Smrg   for (uint32_t i = 1; i < attempt_count; i++) {
1107ec681f3Smrg      if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <
1117ec681f3Smrg         timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {
1127ec681f3Smrg         best = i;
1137ec681f3Smrg      }
1147ec681f3Smrg   }
1157ec681f3Smrg
1167ec681f3Smrg   corr.cpu_timestamp =
1177ec681f3Smrg      (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +
1187ec681f3Smrg      timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;
1197ec681f3Smrg   corr.gpu_timestamp = attempts[best].gpu_ts;
1207ec681f3Smrg
1217ec681f3Smrg   return corr;
1227ec681f3Smrg}
1237ec681f3Smrg
1247ec681f3Smrgvoid IntelDriver::get_new_correlation()
1257ec681f3Smrg{
1267ec681f3Smrg   // Rotate left correlations by one position so to make space at the end
1277ec681f3Smrg   std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());
1287ec681f3Smrg
1297ec681f3Smrg   // Then we overwrite the last correlation with a new one
1307ec681f3Smrg   if (auto corr = query_correlation_timestamps()) {
1317ec681f3Smrg      correlations.back() = *corr;
1327ec681f3Smrg   } else {
1337ec681f3Smrg      PPS_LOG_FATAL("Failed to get correlation timestamps");
1347ec681f3Smrg   }
1357ec681f3Smrg}
1367ec681f3Smrg
1377ec681f3Smrgbool IntelDriver::init_perfcnt()
1387ec681f3Smrg{
1397ec681f3Smrg   assert(!perf && "Intel perf should not be initialized at this point");
1407ec681f3Smrg
1417ec681f3Smrg   perf = std::make_unique<IntelPerf>(drm_device.fd);
1427ec681f3Smrg
1437ec681f3Smrg   for (auto &query : perf->get_queries()) {
1447ec681f3Smrg      // Create group
1457ec681f3Smrg      CounterGroup group = {};
1467ec681f3Smrg      group.id = groups.size();
1477ec681f3Smrg      group.name = query->symbol_name;
1487ec681f3Smrg
1497ec681f3Smrg      for (int i = 0; i < query->n_counters; ++i) {
1507ec681f3Smrg         intel_perf_query_counter &counter = query->counters[i];
1517ec681f3Smrg
1527ec681f3Smrg         // Create counter
1537ec681f3Smrg         Counter counter_desc = {};
1547ec681f3Smrg         counter_desc.id = counters.size();
1557ec681f3Smrg         counter_desc.name = counter.symbol_name;
1567ec681f3Smrg         counter_desc.group = group.id;
1577ec681f3Smrg         counter_desc.getter = [counter, query, this](
1587ec681f3Smrg                                  const Counter &c, const Driver &dri) -> Counter::Value {
1597ec681f3Smrg            switch (counter.data_type) {
1607ec681f3Smrg            case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
1617ec681f3Smrg            case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
1627ec681f3Smrg            case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
1637ec681f3Smrg               return (int64_t)counter.oa_counter_read_uint64(perf->cfg, query, &result);
1647ec681f3Smrg               break;
1657ec681f3Smrg            case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
1667ec681f3Smrg            case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
1677ec681f3Smrg               return counter.oa_counter_read_float(perf->cfg, query, &result);
1687ec681f3Smrg               break;
1697ec681f3Smrg            }
1707ec681f3Smrg
1717ec681f3Smrg            return {};
1727ec681f3Smrg         };
1737ec681f3Smrg
1747ec681f3Smrg         // Add counter id to the group
1757ec681f3Smrg         group.counters.emplace_back(counter_desc.id);
1767ec681f3Smrg
1777ec681f3Smrg         // Store counter
1787ec681f3Smrg         counters.emplace_back(std::move(counter_desc));
1797ec681f3Smrg      }
1807ec681f3Smrg
1817ec681f3Smrg      // Store group
1827ec681f3Smrg      groups.emplace_back(std::move(group));
1837ec681f3Smrg   }
1847ec681f3Smrg
1857ec681f3Smrg   assert(groups.size() && "Failed to query groups");
1867ec681f3Smrg   assert(counters.size() && "Failed to query counters");
1877ec681f3Smrg
1887ec681f3Smrg   // Clear accumulations
1897ec681f3Smrg   intel_perf_query_result_clear(&result);
1907ec681f3Smrg
1917ec681f3Smrg   return true;
1927ec681f3Smrg}
1937ec681f3Smrg
1947ec681f3Smrgvoid IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
1957ec681f3Smrg{
1967ec681f3Smrg   this->sampling_period_ns = sampling_period_ns;
1977ec681f3Smrg
1987ec681f3Smrg   // Fill correlations with an initial one
1997ec681f3Smrg   if (auto corr = query_correlation_timestamps()) {
2007ec681f3Smrg      correlations.fill(*corr);
2017ec681f3Smrg   } else {
2027ec681f3Smrg      PPS_LOG_FATAL("Failed to get correlation timestamps");
2037ec681f3Smrg   }
2047ec681f3Smrg
2057ec681f3Smrg   if (!perf->open(sampling_period_ns)) {
2067ec681f3Smrg      PPS_LOG_FATAL("Failed to open intel perf");
2077ec681f3Smrg   }
2087ec681f3Smrg}
2097ec681f3Smrg
2107ec681f3Smrg/// @brief Transforms the GPU timestop into a CPU timestamp equivalent
2117ec681f3Smrguint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)
2127ec681f3Smrg{
2137ec681f3Smrg   auto &corr_a = correlations[0];
2147ec681f3Smrg   auto &corr_b = correlations[correlations.size() - 1];
2157ec681f3Smrg
2167ec681f3Smrg   // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts
2177ec681f3Smrg   uint64_t mask = 0xffffffff;
2187ec681f3Smrg   uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;
2197ec681f3Smrg   uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask;
2207ec681f3Smrg
2217ec681f3Smrg   // Make sure it is within the interval [a,b)
2227ec681f3Smrg   assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");
2237ec681f3Smrg   assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");
2247ec681f3Smrg
2257ec681f3Smrg   uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;
2267ec681f3Smrg   // Factor to convert gpu time to cpu time
2277ec681f3Smrg   double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) /
2287ec681f3Smrg      double(corr_b.gpu_timestamp - corr_a.gpu_timestamp);
2297ec681f3Smrg   uint64_t cpu_delta = gpu_delta * gpu_to_cpu;
2307ec681f3Smrg   return corr_a.cpu_timestamp + cpu_delta;
2317ec681f3Smrg}
2327ec681f3Smrg
2337ec681f3Smrgvoid IntelDriver::disable_perfcnt()
2347ec681f3Smrg{
2357ec681f3Smrg   perf = nullptr;
2367ec681f3Smrg   groups.clear();
2377ec681f3Smrg   counters.clear();
2387ec681f3Smrg   enabled_counters.clear();
2397ec681f3Smrg}
2407ec681f3Smrg
/// @brief Leading fields of the data that follows a sample record header
///
/// Within this file the struct is only used to view raw record bytes, and
/// only `timestamp` is read.
struct Report {
   /// First 32-bit word of the report
   uint32_t version;
   /// GPU timestamp, the value handed to the CPU/GPU correlation
   uint32_t timestamp;
   /// Third 32-bit word of the report
   uint32_t id;
};
2467ec681f3Smrg
/// @brief Some perf record durations can be really short
/// @param duration Time elapsed since the previously accepted record
/// @param sampling_period Configured sampling period, same unit as duration
/// @return True if the duration exceeds the sampling period minus a fixed
/// tolerance of 100000
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   constexpr uint64_t tolerance = 100000;

   // With a period shorter than the tolerance the threshold would be
   // negative, which every duration exceeds. Checking first avoids the
   // unsigned wrap-around that used to reject all records in that case.
   if (sampling_period < tolerance) {
      return true;
   }

   return duration > sampling_period - tolerance;
}
2537ec681f3Smrg
2547ec681f3Smrg/// @brief Transforms the raw data received in from the driver into records
2557ec681f3Smrgstd::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
2567ec681f3Smrg   const size_t byte_count)
2577ec681f3Smrg{
2587ec681f3Smrg   std::vector<PerfRecord> records;
2597ec681f3Smrg   records.reserve(128);
2607ec681f3Smrg
2617ec681f3Smrg   PerfRecord record;
2627ec681f3Smrg   record.reserve(512);
2637ec681f3Smrg
2647ec681f3Smrg   const uint8_t *iter = data.data();
2657ec681f3Smrg   const uint8_t *end = iter + byte_count;
2667ec681f3Smrg
2677ec681f3Smrg   uint64_t prev_cpu_timestamp = last_cpu_timestamp;
2687ec681f3Smrg
2697ec681f3Smrg   while (iter < end) {
2707ec681f3Smrg      // Iterate a record at a time
2717ec681f3Smrg      auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
2727ec681f3Smrg
2737ec681f3Smrg      if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
2747ec681f3Smrg         // Report is next to the header
2757ec681f3Smrg         auto report = reinterpret_cast<const Report *>(header + 1);
2767ec681f3Smrg         auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);
2777ec681f3Smrg         auto duration = cpu_timestamp - prev_cpu_timestamp;
2787ec681f3Smrg
2797ec681f3Smrg         // Skip perf-records that are too short by checking
2807ec681f3Smrg         // the distance between last report and this one
2817ec681f3Smrg         if (close_enough(duration, sampling_period_ns)) {
2827ec681f3Smrg            prev_cpu_timestamp = cpu_timestamp;
2837ec681f3Smrg
2847ec681f3Smrg            // Add the new record to the list
2857ec681f3Smrg            record.resize(header->size); // Possibly 264?
2867ec681f3Smrg            memcpy(record.data(), iter, header->size);
2877ec681f3Smrg            records.emplace_back(record);
2887ec681f3Smrg         }
2897ec681f3Smrg      }
2907ec681f3Smrg
2917ec681f3Smrg      // Go to the next record
2927ec681f3Smrg      iter += header->size;
2937ec681f3Smrg   }
2947ec681f3Smrg
2957ec681f3Smrg   return records;
2967ec681f3Smrg}
2977ec681f3Smrg
2987ec681f3Smrg/// @brief Read all the available data from the metric set currently in use
2997ec681f3Smrgvoid IntelDriver::read_data_from_metric_set()
3007ec681f3Smrg{
3017ec681f3Smrg   assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
3027ec681f3Smrg
3037ec681f3Smrg   ssize_t bytes_read = 0;
3047ec681f3Smrg   while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
3057ec681f3Smrg              metric_buffer.size() - total_bytes_read)) > 0 ||
3067ec681f3Smrg      errno == EINTR) {
3077ec681f3Smrg      total_bytes_read += std::max(ssize_t(0), bytes_read);
3087ec681f3Smrg
3097ec681f3Smrg      // Increase size of the buffer for the next read
3107ec681f3Smrg      if (metric_buffer.size() / 2 < total_bytes_read) {
3117ec681f3Smrg         metric_buffer.resize(metric_buffer.size() * 2);
3127ec681f3Smrg      }
3137ec681f3Smrg   }
3147ec681f3Smrg
3157ec681f3Smrg   assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
3167ec681f3Smrg}
3177ec681f3Smrg
3187ec681f3Smrgbool IntelDriver::dump_perfcnt()
3197ec681f3Smrg{
3207ec681f3Smrg   if (!perf->oa_stream_ready()) {
3217ec681f3Smrg      return false;
3227ec681f3Smrg   }
3237ec681f3Smrg
3247ec681f3Smrg   read_data_from_metric_set();
3257ec681f3Smrg
3267ec681f3Smrg   get_new_correlation();
3277ec681f3Smrg
3287ec681f3Smrg   auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
3297ec681f3Smrg   if (new_records.empty()) {
3307ec681f3Smrg      PPS_LOG("No new records");
3317ec681f3Smrg      // No new records from the GPU yet
3327ec681f3Smrg      return false;
3337ec681f3Smrg   } else {
3347ec681f3Smrg      PPS_LOG("Records parsed bytes: %lu", total_bytes_read);
3357ec681f3Smrg      // Records are parsed correctly, so we can reset the
3367ec681f3Smrg      // number of bytes read so far from the metric set
3377ec681f3Smrg      total_bytes_read = 0;
3387ec681f3Smrg   }
3397ec681f3Smrg
3407ec681f3Smrg   APPEND(records, new_records);
3417ec681f3Smrg
3427ec681f3Smrg   if (records.size() < 2) {
3437ec681f3Smrg      // Not enough records to accumulate
3447ec681f3Smrg      return false;
3457ec681f3Smrg   }
3467ec681f3Smrg
3477ec681f3Smrg   return true;
3487ec681f3Smrg}
3497ec681f3Smrg
3507ec681f3Smrguint32_t IntelDriver::gpu_next()
3517ec681f3Smrg{
3527ec681f3Smrg   if (records.size() < 2) {
3537ec681f3Smrg      // Not enough records to accumulate
3547ec681f3Smrg      return 0;
3557ec681f3Smrg   }
3567ec681f3Smrg
3577ec681f3Smrg   // Get first and second
3587ec681f3Smrg   auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());
3597ec681f3Smrg   auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data());
3607ec681f3Smrg
3617ec681f3Smrg   intel_perf_query_result_accumulate_fields(&result,
3627ec681f3Smrg      &perf->query.value(),
3637ec681f3Smrg      &perf->devinfo,
3647ec681f3Smrg      record_a + 1,
3657ec681f3Smrg      record_b + 1,
3667ec681f3Smrg      false /* no_oa_accumulate */);
3677ec681f3Smrg
3687ec681f3Smrg   // Get last timestamp
3697ec681f3Smrg   auto report_b = reinterpret_cast<const Report *>(record_b + 1);
3707ec681f3Smrg   auto gpu_timestamp = report_b->timestamp;
3717ec681f3Smrg
3727ec681f3Smrg   // Consume first record
3737ec681f3Smrg   records.erase(std::begin(records), std::begin(records) + 1);
3747ec681f3Smrg
3757ec681f3Smrg   return gpu_timestamp;
3767ec681f3Smrg}
3777ec681f3Smrg
3787ec681f3Smrguint64_t IntelDriver::cpu_next()
3797ec681f3Smrg{
3807ec681f3Smrg   if (auto gpu_timestamp = gpu_next()) {
3817ec681f3Smrg      auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);
3827ec681f3Smrg
3837ec681f3Smrg      last_cpu_timestamp = cpu_timestamp;
3847ec681f3Smrg      return cpu_timestamp;
3857ec681f3Smrg   }
3867ec681f3Smrg
3877ec681f3Smrg   return 0;
3887ec681f3Smrg}
3897ec681f3Smrg
3907ec681f3Smrguint64_t IntelDriver::next()
3917ec681f3Smrg{
3927ec681f3Smrg   // Reset accumulation
3937ec681f3Smrg   intel_perf_query_result_clear(&result);
3947ec681f3Smrg   return cpu_next();
3957ec681f3Smrg}
3967ec681f3Smrg
3977ec681f3Smrg} // namespace pps
398