1/*
2 * Copyright © 2020-2021 Collabora, Ltd.
3 * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
4 * Author: Corentin Noël <corentin.noel@collabora.com>
5 *
6 * SPDX-License-Identifier: MIT
7 */
8
9#include "intel_pps_driver.h"
10
11#include <dirent.h>
12#include <fcntl.h>
13#include <math.h>
14#include <poll.h>
15#include <strings.h>
16#include <sys/ioctl.h>
17#include <unistd.h>
18
19#include <i915_drm.h>
20#include <intel/perf/intel_perf_query.h>
21
22#include <pps/pps.h>
23#include <pps/pps_algorithm.h>
24
25#include "intel_pps_perf.h"
26
27namespace pps
28{
29uint64_t IntelDriver::get_min_sampling_period_ns()
30{
31   return 500000;
32}
33
34void IntelDriver::enable_counter(uint32_t counter_id)
35{
36   auto &counter = counters[counter_id];
37   auto &group = groups[counter.group];
38   if (perf->query) {
39      if (perf->query->symbol_name != group.name) {
40         PPS_LOG_ERROR(
41            "Unable to enable metrics from different sets: %u "
42            "belongs to %s but %s is currently in use.",
43            counter_id,
44            perf->query->symbol_name,
45            group.name.c_str());
46         return;
47      }
48   }
49
50   enabled_counters.emplace_back(counter);
51   if (!perf->query) {
52      perf->query = perf->find_query_by_name(group.name);
53   }
54}
55
56void IntelDriver::enable_all_counters()
57{
58   // We can only enable one metric set at a time so at least enable one.
59   for (auto &group : groups) {
60      if (group.name == "RenderBasic") {
61         for (uint32_t counter_id : group.counters) {
62            auto &counter = counters[counter_id];
63            enabled_counters.emplace_back(counter);
64         }
65
66         perf->query = perf->find_query_by_name(group.name);
67         break;
68      }
69   }
70}
71
/// @brief Time elapsed between two timespecs
/// @param begin Earlier point in time
/// @param end Later point in time
/// @return Nanoseconds from begin to end
static uint64_t timespec_diff(timespec *begin, timespec *end)
{
   // Whole seconds first; the nanosecond fields are then added and subtracted
   // in unsigned arithmetic, so a smaller end->tv_nsec borrows correctly.
   const uint64_t whole_seconds_ns = 1000000000ull * (end->tv_sec - begin->tv_sec);
   return whole_seconds_ns + end->tv_nsec - begin->tv_nsec;
}
76
77/// @brief This function tries to correlate CPU time with GPU time
78std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const
79{
80   TimestampCorrelation corr = {};
81
82   clock_t correlation_clock_id = CLOCK_BOOTTIME;
83
84   drm_i915_reg_read reg_read = {};
85   const uint64_t render_ring_timestamp = 0x2358;
86   reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
87
88   constexpr size_t attempt_count = 3;
89   struct {
90      timespec cpu_ts_begin;
91      timespec cpu_ts_end;
92      uint64_t gpu_ts;
93   } attempts[attempt_count] = {};
94
95   uint32_t best = 0;
96
97   // Gather 3 correlations
98   for (uint32_t i = 0; i < attempt_count; i++) {
99      clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);
100      if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, &reg_read) < 0) {
101         return std::nullopt;
102      }
103      clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);
104
105      attempts[i].gpu_ts = reg_read.val;
106   }
107
108   // Now select the best
109   for (uint32_t i = 1; i < attempt_count; i++) {
110      if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <
111         timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {
112         best = i;
113      }
114   }
115
116   corr.cpu_timestamp =
117      (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +
118      timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;
119   corr.gpu_timestamp = attempts[best].gpu_ts;
120
121   return corr;
122}
123
124void IntelDriver::get_new_correlation()
125{
126   // Rotate left correlations by one position so to make space at the end
127   std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());
128
129   // Then we overwrite the last correlation with a new one
130   if (auto corr = query_correlation_timestamps()) {
131      correlations.back() = *corr;
132   } else {
133      PPS_LOG_FATAL("Failed to get correlation timestamps");
134   }
135}
136
137bool IntelDriver::init_perfcnt()
138{
139   assert(!perf && "Intel perf should not be initialized at this point");
140
141   perf = std::make_unique<IntelPerf>(drm_device.fd);
142
143   for (auto &query : perf->get_queries()) {
144      // Create group
145      CounterGroup group = {};
146      group.id = groups.size();
147      group.name = query->symbol_name;
148
149      for (int i = 0; i < query->n_counters; ++i) {
150         intel_perf_query_counter &counter = query->counters[i];
151
152         // Create counter
153         Counter counter_desc = {};
154         counter_desc.id = counters.size();
155         counter_desc.name = counter.symbol_name;
156         counter_desc.group = group.id;
157         counter_desc.getter = [counter, query, this](
158                                  const Counter &c, const Driver &dri) -> Counter::Value {
159            switch (counter.data_type) {
160            case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
161            case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
162            case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
163               return (int64_t)counter.oa_counter_read_uint64(perf->cfg, query, &result);
164               break;
165            case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
166            case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
167               return counter.oa_counter_read_float(perf->cfg, query, &result);
168               break;
169            }
170
171            return {};
172         };
173
174         // Add counter id to the group
175         group.counters.emplace_back(counter_desc.id);
176
177         // Store counter
178         counters.emplace_back(std::move(counter_desc));
179      }
180
181      // Store group
182      groups.emplace_back(std::move(group));
183   }
184
185   assert(groups.size() && "Failed to query groups");
186   assert(counters.size() && "Failed to query counters");
187
188   // Clear accumulations
189   intel_perf_query_result_clear(&result);
190
191   return true;
192}
193
194void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
195{
196   this->sampling_period_ns = sampling_period_ns;
197
198   // Fill correlations with an initial one
199   if (auto corr = query_correlation_timestamps()) {
200      correlations.fill(*corr);
201   } else {
202      PPS_LOG_FATAL("Failed to get correlation timestamps");
203   }
204
205   if (!perf->open(sampling_period_ns)) {
206      PPS_LOG_FATAL("Failed to open intel perf");
207   }
208}
209
210/// @brief Transforms the GPU timestop into a CPU timestamp equivalent
211uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)
212{
213   auto &corr_a = correlations[0];
214   auto &corr_b = correlations[correlations.size() - 1];
215
216   // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts
217   uint64_t mask = 0xffffffff;
218   uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;
219   uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask;
220
221   // Make sure it is within the interval [a,b)
222   assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");
223   assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");
224
225   uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;
226   // Factor to convert gpu time to cpu time
227   double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) /
228      double(corr_b.gpu_timestamp - corr_a.gpu_timestamp);
229   uint64_t cpu_delta = gpu_delta * gpu_to_cpu;
230   return corr_a.cpu_timestamp + cpu_delta;
231}
232
233void IntelDriver::disable_perfcnt()
234{
235   perf = nullptr;
236   groups.clear();
237   counters.clear();
238   enabled_counters.clear();
239}
240
/// @brief First three 32-bit words of the report that follows a perf record header
///
/// The driver overlays this struct on the bytes right after a
/// drm_i915_perf_record_header; only the timestamp field is read in this file.
struct Report
{
   /// First word of the report
   uint32_t version;
   /// 32-bit GPU timestamp of the report
   uint32_t timestamp;
   /// Third word of the report
   uint32_t id;
};
246
/// @brief Some perf record durations can be really short
/// @param duration Time elapsed since the previously accepted record
/// @param sampling_period Expected time between two records, in the same unit
/// @return True if the duration is at least close to the sampling period,
/// i.e. strictly greater than the sampling period minus a fixed tolerance
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   constexpr uint64_t tolerance = 100000;

   // With a period shorter than the tolerance the threshold would be
   // negative: every duration qualifies. Subtracting anyway would wrap the
   // unsigned value around and reject practically every record.
   if (sampling_period < tolerance) {
      return true;
   }

   return duration > sampling_period - tolerance;
}
253
254/// @brief Transforms the raw data received in from the driver into records
255std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
256   const size_t byte_count)
257{
258   std::vector<PerfRecord> records;
259   records.reserve(128);
260
261   PerfRecord record;
262   record.reserve(512);
263
264   const uint8_t *iter = data.data();
265   const uint8_t *end = iter + byte_count;
266
267   uint64_t prev_cpu_timestamp = last_cpu_timestamp;
268
269   while (iter < end) {
270      // Iterate a record at a time
271      auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
272
273      if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
274         // Report is next to the header
275         auto report = reinterpret_cast<const Report *>(header + 1);
276         auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);
277         auto duration = cpu_timestamp - prev_cpu_timestamp;
278
279         // Skip perf-records that are too short by checking
280         // the distance between last report and this one
281         if (close_enough(duration, sampling_period_ns)) {
282            prev_cpu_timestamp = cpu_timestamp;
283
284            // Add the new record to the list
285            record.resize(header->size); // Possibly 264?
286            memcpy(record.data(), iter, header->size);
287            records.emplace_back(record);
288         }
289      }
290
291      // Go to the next record
292      iter += header->size;
293   }
294
295   return records;
296}
297
298/// @brief Read all the available data from the metric set currently in use
299void IntelDriver::read_data_from_metric_set()
300{
301   assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
302
303   ssize_t bytes_read = 0;
304   while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
305              metric_buffer.size() - total_bytes_read)) > 0 ||
306      errno == EINTR) {
307      total_bytes_read += std::max(ssize_t(0), bytes_read);
308
309      // Increase size of the buffer for the next read
310      if (metric_buffer.size() / 2 < total_bytes_read) {
311         metric_buffer.resize(metric_buffer.size() * 2);
312      }
313   }
314
315   assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
316}
317
318bool IntelDriver::dump_perfcnt()
319{
320   if (!perf->oa_stream_ready()) {
321      return false;
322   }
323
324   read_data_from_metric_set();
325
326   get_new_correlation();
327
328   auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
329   if (new_records.empty()) {
330      PPS_LOG("No new records");
331      // No new records from the GPU yet
332      return false;
333   } else {
334      PPS_LOG("Records parsed bytes: %lu", total_bytes_read);
335      // Records are parsed correctly, so we can reset the
336      // number of bytes read so far from the metric set
337      total_bytes_read = 0;
338   }
339
340   APPEND(records, new_records);
341
342   if (records.size() < 2) {
343      // Not enough records to accumulate
344      return false;
345   }
346
347   return true;
348}
349
350uint32_t IntelDriver::gpu_next()
351{
352   if (records.size() < 2) {
353      // Not enough records to accumulate
354      return 0;
355   }
356
357   // Get first and second
358   auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());
359   auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data());
360
361   intel_perf_query_result_accumulate_fields(&result,
362      &perf->query.value(),
363      &perf->devinfo,
364      record_a + 1,
365      record_b + 1,
366      false /* no_oa_accumulate */);
367
368   // Get last timestamp
369   auto report_b = reinterpret_cast<const Report *>(record_b + 1);
370   auto gpu_timestamp = report_b->timestamp;
371
372   // Consume first record
373   records.erase(std::begin(records), std::begin(records) + 1);
374
375   return gpu_timestamp;
376}
377
378uint64_t IntelDriver::cpu_next()
379{
380   if (auto gpu_timestamp = gpu_next()) {
381      auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);
382
383      last_cpu_timestamp = cpu_timestamp;
384      return cpu_timestamp;
385   }
386
387   return 0;
388}
389
390uint64_t IntelDriver::next()
391{
392   // Reset accumulation
393   intel_perf_query_result_clear(&result);
394   return cpu_next();
395}
396
397} // namespace pps
398