1/* 2 * Copyright © 2020-2021 Collabora, Ltd. 3 * Author: Antonio Caggiano <antonio.caggiano@collabora.com> 4 * Author: Corentin Noël <corentin.noel@collabora.com> 5 * 6 * SPDX-License-Identifier: MIT 7 */ 8 9#include "intel_pps_driver.h" 10 11#include <dirent.h> 12#include <fcntl.h> 13#include <math.h> 14#include <poll.h> 15#include <strings.h> 16#include <sys/ioctl.h> 17#include <unistd.h> 18 19#include <i915_drm.h> 20#include <intel/perf/intel_perf_query.h> 21 22#include <pps/pps.h> 23#include <pps/pps_algorithm.h> 24 25#include "intel_pps_perf.h" 26 27namespace pps 28{ 29uint64_t IntelDriver::get_min_sampling_period_ns() 30{ 31 return 500000; 32} 33 34void IntelDriver::enable_counter(uint32_t counter_id) 35{ 36 auto &counter = counters[counter_id]; 37 auto &group = groups[counter.group]; 38 if (perf->query) { 39 if (perf->query->symbol_name != group.name) { 40 PPS_LOG_ERROR( 41 "Unable to enable metrics from different sets: %u " 42 "belongs to %s but %s is currently in use.", 43 counter_id, 44 perf->query->symbol_name, 45 group.name.c_str()); 46 return; 47 } 48 } 49 50 enabled_counters.emplace_back(counter); 51 if (!perf->query) { 52 perf->query = perf->find_query_by_name(group.name); 53 } 54} 55 56void IntelDriver::enable_all_counters() 57{ 58 // We can only enable one metric set at a time so at least enable one. 59 for (auto &group : groups) { 60 if (group.name == "RenderBasic") { 61 for (uint32_t counter_id : group.counters) { 62 auto &counter = counters[counter_id]; 63 enabled_counters.emplace_back(counter); 64 } 65 66 perf->query = perf->find_query_by_name(group.name); 67 break; 68 } 69 } 70} 71 72static uint64_t timespec_diff(timespec *begin, timespec *end) 73{ 74 return 1000000000ull * (end->tv_sec - begin->tv_sec) + end->tv_nsec - begin->tv_nsec; 75} 76 77/// @brief This function tries to correlate CPU time with GPU time 78std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const 79{ 80 TimestampCorrelation corr = {}; 81 82 clock_t correlation_clock_id = CLOCK_BOOTTIME; 83 84 drm_i915_reg_read reg_read = {}; 85 const uint64_t render_ring_timestamp = 0x2358; 86 reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA; 87 88 constexpr size_t attempt_count = 3; 89 struct { 90 timespec cpu_ts_begin; 91 timespec cpu_ts_end; 92 uint64_t gpu_ts; 93 } attempts[attempt_count] = {}; 94 95 uint32_t best = 0; 96 97 // Gather 3 correlations 98 for (uint32_t i = 0; i < attempt_count; i++) { 99 clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin); 100 if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, ®_read) < 0) { 101 return std::nullopt; 102 } 103 clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end); 104 105 attempts[i].gpu_ts = reg_read.val; 106 } 107 108 // Now select the best 109 for (uint32_t i = 1; i < attempt_count; i++) { 110 if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) < 111 timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) { 112 best = i; 113 } 114 } 115 116 corr.cpu_timestamp = 117 (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) + 118 timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2; 119 corr.gpu_timestamp = attempts[best].gpu_ts; 120 121 return corr; 122} 123 124void IntelDriver::get_new_correlation() 125{ 126 // Rotate left correlations by one position so to make space at the end 127 std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end()); 128 129 // Then we overwrite the last correlation with a new one 130 if (auto corr = query_correlation_timestamps()) { 131 correlations.back() = *corr; 132 } else { 133 PPS_LOG_FATAL("Failed to get correlation timestamps"); 134 } 135} 136 137bool IntelDriver::init_perfcnt() 138{ 139 assert(!perf && "Intel perf should not be initialized at this point"); 140 141 perf = std::make_unique<IntelPerf>(drm_device.fd); 142 143 for (auto &query : perf->get_queries()) { 144 // Create group 145 CounterGroup group = {}; 146 group.id = groups.size(); 147 group.name = query->symbol_name; 148 149 for (int i = 0; i < query->n_counters; ++i) { 150 intel_perf_query_counter &counter = query->counters[i]; 151 152 // Create counter 153 Counter counter_desc = {}; 154 counter_desc.id = counters.size(); 155 counter_desc.name = counter.symbol_name; 156 counter_desc.group = group.id; 157 counter_desc.getter = [counter, query, this]( 158 const Counter &c, const Driver &dri) -> Counter::Value { 159 switch (counter.data_type) { 160 case INTEL_PERF_COUNTER_DATA_TYPE_UINT64: 161 case INTEL_PERF_COUNTER_DATA_TYPE_UINT32: 162 case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32: 163 return (int64_t)counter.oa_counter_read_uint64(perf->cfg, query, &result); 164 break; 165 case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE: 166 case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT: 167 return counter.oa_counter_read_float(perf->cfg, query, &result); 168 break; 169 } 170 171 return {}; 172 }; 173 174 // Add counter id to the group 175 group.counters.emplace_back(counter_desc.id); 176 177 // Store counter 178 counters.emplace_back(std::move(counter_desc)); 179 } 180 181 // Store group 182 groups.emplace_back(std::move(group)); 183 } 184 185 assert(groups.size() && "Failed to query groups"); 186 assert(counters.size() && "Failed to query counters"); 187 188 // Clear accumulations 189 intel_perf_query_result_clear(&result); 190 191 return true; 192} 193 194void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns) 195{ 196 this->sampling_period_ns = sampling_period_ns; 197 198 // Fill correlations with an initial one 199 if (auto corr = query_correlation_timestamps()) { 200 correlations.fill(*corr); 201 } else { 202 PPS_LOG_FATAL("Failed to get correlation timestamps"); 203 } 204 205 if (!perf->open(sampling_period_ns)) { 206 PPS_LOG_FATAL("Failed to open intel perf"); 207 } 208} 209 210/// @brief Transforms the GPU timestop into a CPU timestamp equivalent 211uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts) 212{ 213 auto &corr_a = correlations[0]; 214 auto &corr_b = correlations[correlations.size() - 1]; 215 216 // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts 217 uint64_t mask = 0xffffffff; 218 uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask; 219 uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask; 220 221 // Make sure it is within the interval [a,b) 222 assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a"); 223 assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b"); 224 225 uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts; 226 // Factor to convert gpu time to cpu time 227 double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) / 228 double(corr_b.gpu_timestamp - corr_a.gpu_timestamp); 229 uint64_t cpu_delta = gpu_delta * gpu_to_cpu; 230 return corr_a.cpu_timestamp + cpu_delta; 231} 232 233void IntelDriver::disable_perfcnt() 234{ 235 perf = nullptr; 236 groups.clear(); 237 counters.clear(); 238 enabled_counters.clear(); 239} 240 241struct Report { 242 uint32_t version; 243 uint32_t timestamp; 244 uint32_t id; 245}; 246 247/// @brief Some perf record durations can be really short 248/// @return True if the duration is at least close to the sampling period 249static bool close_enough(uint64_t duration, uint64_t sampling_period) 250{ 251 return duration > sampling_period - 100000; 252} 253 254/// @brief Transforms the raw data received in from the driver into records 255std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data, 256 const size_t byte_count) 257{ 258 std::vector<PerfRecord> records; 259 records.reserve(128); 260 261 PerfRecord record; 262 record.reserve(512); 263 264 const uint8_t *iter = data.data(); 265 const uint8_t *end = iter + byte_count; 266 267 uint64_t prev_cpu_timestamp = last_cpu_timestamp; 268 269 while (iter < end) { 270 // Iterate a record at a time 271 auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter); 272 273 if (header->type == DRM_I915_PERF_RECORD_SAMPLE) { 274 // Report is next to the header 275 auto report = reinterpret_cast<const Report *>(header + 1); 276 auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp); 277 auto duration = cpu_timestamp - prev_cpu_timestamp; 278 279 // Skip perf-records that are too short by checking 280 // the distance between last report and this one 281 if (close_enough(duration, sampling_period_ns)) { 282 prev_cpu_timestamp = cpu_timestamp; 283 284 // Add the new record to the list 285 record.resize(header->size); // Possibly 264? 286 memcpy(record.data(), iter, header->size); 287 records.emplace_back(record); 288 } 289 } 290 291 // Go to the next record 292 iter += header->size; 293 } 294 295 return records; 296} 297 298/// @brief Read all the available data from the metric set currently in use 299void IntelDriver::read_data_from_metric_set() 300{ 301 assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading"); 302 303 ssize_t bytes_read = 0; 304 while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read, 305 metric_buffer.size() - total_bytes_read)) > 0 || 306 errno == EINTR) { 307 total_bytes_read += std::max(ssize_t(0), bytes_read); 308 309 // Increase size of the buffer for the next read 310 if (metric_buffer.size() / 2 < total_bytes_read) { 311 metric_buffer.resize(metric_buffer.size() * 2); 312 } 313 } 314 315 assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough"); 316} 317 318bool IntelDriver::dump_perfcnt() 319{ 320 if (!perf->oa_stream_ready()) { 321 return false; 322 } 323 324 read_data_from_metric_set(); 325 326 get_new_correlation(); 327 328 auto new_records = parse_perf_records(metric_buffer, total_bytes_read); 329 if (new_records.empty()) { 330 PPS_LOG("No new records"); 331 // No new records from the GPU yet 332 return false; 333 } else { 334 PPS_LOG("Records parsed bytes: %lu", total_bytes_read); 335 // Records are parsed correctly, so we can reset the 336 // number of bytes read so far from the metric set 337 total_bytes_read = 0; 338 } 339 340 APPEND(records, new_records); 341 342 if (records.size() < 2) { 343 // Not enough records to accumulate 344 return false; 345 } 346 347 return true; 348} 349 350uint32_t IntelDriver::gpu_next() 351{ 352 if (records.size() < 2) { 353 // Not enough records to accumulate 354 return 0; 355 } 356 357 // Get first and second 358 auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data()); 359 auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data()); 360 361 intel_perf_query_result_accumulate_fields(&result, 362 &perf->query.value(), 363 &perf->devinfo, 364 record_a + 1, 365 record_b + 1, 366 false /* no_oa_accumulate */); 367 368 // Get last timestamp 369 auto report_b = reinterpret_cast<const Report *>(record_b + 1); 370 auto gpu_timestamp = report_b->timestamp; 371 372 // Consume first record 373 records.erase(std::begin(records), std::begin(records) + 1); 374 375 return gpu_timestamp; 376} 377 378uint64_t IntelDriver::cpu_next() 379{ 380 if (auto gpu_timestamp = gpu_next()) { 381 auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp); 382 383 last_cpu_timestamp = cpu_timestamp; 384 return cpu_timestamp; 385 } 386 387 return 0; 388} 389 390uint64_t IntelDriver::next() 391{ 392 // Reset accumulation 393 intel_perf_query_result_clear(&result); 394 return cpu_next(); 395} 396 397} // namespace pps 398