17ec681f3Smrg/* 27ec681f3Smrg * Copyright © 2020-2021 Collabora, Ltd. 37ec681f3Smrg * Author: Antonio Caggiano <antonio.caggiano@collabora.com> 47ec681f3Smrg * Author: Corentin Noël <corentin.noel@collabora.com> 57ec681f3Smrg * 67ec681f3Smrg * SPDX-License-Identifier: MIT 77ec681f3Smrg */ 87ec681f3Smrg 97ec681f3Smrg#include "intel_pps_driver.h" 107ec681f3Smrg 117ec681f3Smrg#include <dirent.h> 127ec681f3Smrg#include <fcntl.h> 137ec681f3Smrg#include <math.h> 147ec681f3Smrg#include <poll.h> 157ec681f3Smrg#include <strings.h> 167ec681f3Smrg#include <sys/ioctl.h> 177ec681f3Smrg#include <unistd.h> 187ec681f3Smrg 197ec681f3Smrg#include <i915_drm.h> 207ec681f3Smrg#include <intel/perf/intel_perf_query.h> 217ec681f3Smrg 227ec681f3Smrg#include <pps/pps.h> 237ec681f3Smrg#include <pps/pps_algorithm.h> 247ec681f3Smrg 257ec681f3Smrg#include "intel_pps_perf.h" 267ec681f3Smrg 277ec681f3Smrgnamespace pps 287ec681f3Smrg{ 297ec681f3Smrguint64_t IntelDriver::get_min_sampling_period_ns() 307ec681f3Smrg{ 317ec681f3Smrg return 500000; 327ec681f3Smrg} 337ec681f3Smrg 347ec681f3Smrgvoid IntelDriver::enable_counter(uint32_t counter_id) 357ec681f3Smrg{ 367ec681f3Smrg auto &counter = counters[counter_id]; 377ec681f3Smrg auto &group = groups[counter.group]; 387ec681f3Smrg if (perf->query) { 397ec681f3Smrg if (perf->query->symbol_name != group.name) { 407ec681f3Smrg PPS_LOG_ERROR( 417ec681f3Smrg "Unable to enable metrics from different sets: %u " 427ec681f3Smrg "belongs to %s but %s is currently in use.", 437ec681f3Smrg counter_id, 447ec681f3Smrg perf->query->symbol_name, 457ec681f3Smrg group.name.c_str()); 467ec681f3Smrg return; 477ec681f3Smrg } 487ec681f3Smrg } 497ec681f3Smrg 507ec681f3Smrg enabled_counters.emplace_back(counter); 517ec681f3Smrg if (!perf->query) { 527ec681f3Smrg perf->query = perf->find_query_by_name(group.name); 537ec681f3Smrg } 547ec681f3Smrg} 557ec681f3Smrg 567ec681f3Smrgvoid IntelDriver::enable_all_counters() 577ec681f3Smrg{ 587ec681f3Smrg // We can only enable one metric set at a time so at least enable one. 597ec681f3Smrg for (auto &group : groups) { 607ec681f3Smrg if (group.name == "RenderBasic") { 617ec681f3Smrg for (uint32_t counter_id : group.counters) { 627ec681f3Smrg auto &counter = counters[counter_id]; 637ec681f3Smrg enabled_counters.emplace_back(counter); 647ec681f3Smrg } 657ec681f3Smrg 667ec681f3Smrg perf->query = perf->find_query_by_name(group.name); 677ec681f3Smrg break; 687ec681f3Smrg } 697ec681f3Smrg } 707ec681f3Smrg} 717ec681f3Smrg 727ec681f3Smrgstatic uint64_t timespec_diff(timespec *begin, timespec *end) 737ec681f3Smrg{ 747ec681f3Smrg return 1000000000ull * (end->tv_sec - begin->tv_sec) + end->tv_nsec - begin->tv_nsec; 757ec681f3Smrg} 767ec681f3Smrg 777ec681f3Smrg/// @brief This function tries to correlate CPU time with GPU time 787ec681f3Smrgstd::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const 797ec681f3Smrg{ 807ec681f3Smrg TimestampCorrelation corr = {}; 817ec681f3Smrg 827ec681f3Smrg clock_t correlation_clock_id = CLOCK_BOOTTIME; 837ec681f3Smrg 847ec681f3Smrg drm_i915_reg_read reg_read = {}; 857ec681f3Smrg const uint64_t render_ring_timestamp = 0x2358; 867ec681f3Smrg reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA; 877ec681f3Smrg 887ec681f3Smrg constexpr size_t attempt_count = 3; 897ec681f3Smrg struct { 907ec681f3Smrg timespec cpu_ts_begin; 917ec681f3Smrg timespec cpu_ts_end; 927ec681f3Smrg uint64_t gpu_ts; 937ec681f3Smrg } attempts[attempt_count] = {}; 947ec681f3Smrg 957ec681f3Smrg uint32_t best = 0; 967ec681f3Smrg 977ec681f3Smrg // Gather 3 correlations 987ec681f3Smrg for (uint32_t i = 0; i < attempt_count; i++) { 997ec681f3Smrg clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin); 1007ec681f3Smrg if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, ®_read) < 0) { 1017ec681f3Smrg return std::nullopt; 1027ec681f3Smrg } 1037ec681f3Smrg clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end); 1047ec681f3Smrg 1057ec681f3Smrg attempts[i].gpu_ts = reg_read.val; 1067ec681f3Smrg } 1077ec681f3Smrg 1087ec681f3Smrg // Now select the best 1097ec681f3Smrg for (uint32_t i = 1; i < attempt_count; i++) { 1107ec681f3Smrg if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) < 1117ec681f3Smrg timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) { 1127ec681f3Smrg best = i; 1137ec681f3Smrg } 1147ec681f3Smrg } 1157ec681f3Smrg 1167ec681f3Smrg corr.cpu_timestamp = 1177ec681f3Smrg (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) + 1187ec681f3Smrg timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2; 1197ec681f3Smrg corr.gpu_timestamp = attempts[best].gpu_ts; 1207ec681f3Smrg 1217ec681f3Smrg return corr; 1227ec681f3Smrg} 1237ec681f3Smrg 1247ec681f3Smrgvoid IntelDriver::get_new_correlation() 1257ec681f3Smrg{ 1267ec681f3Smrg // Rotate left correlations by one position so to make space at the end 1277ec681f3Smrg std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end()); 1287ec681f3Smrg 1297ec681f3Smrg // Then we overwrite the last correlation with a new one 1307ec681f3Smrg if (auto corr = query_correlation_timestamps()) { 1317ec681f3Smrg correlations.back() = *corr; 1327ec681f3Smrg } else { 1337ec681f3Smrg PPS_LOG_FATAL("Failed to get correlation timestamps"); 1347ec681f3Smrg } 1357ec681f3Smrg} 1367ec681f3Smrg 1377ec681f3Smrgbool IntelDriver::init_perfcnt() 1387ec681f3Smrg{ 1397ec681f3Smrg assert(!perf && "Intel perf should not be initialized at this point"); 1407ec681f3Smrg 1417ec681f3Smrg perf = std::make_unique<IntelPerf>(drm_device.fd); 1427ec681f3Smrg 1437ec681f3Smrg for (auto &query : perf->get_queries()) { 1447ec681f3Smrg // Create group 1457ec681f3Smrg CounterGroup group = {}; 1467ec681f3Smrg group.id = groups.size(); 1477ec681f3Smrg group.name = query->symbol_name; 1487ec681f3Smrg 1497ec681f3Smrg for (int i = 0; i < query->n_counters; ++i) { 1507ec681f3Smrg intel_perf_query_counter &counter = query->counters[i]; 1517ec681f3Smrg 1527ec681f3Smrg // Create counter 1537ec681f3Smrg Counter counter_desc = {}; 1547ec681f3Smrg counter_desc.id = counters.size(); 1557ec681f3Smrg counter_desc.name = counter.symbol_name; 1567ec681f3Smrg counter_desc.group = group.id; 1577ec681f3Smrg counter_desc.getter = [counter, query, this]( 1587ec681f3Smrg const Counter &c, const Driver &dri) -> Counter::Value { 1597ec681f3Smrg switch (counter.data_type) { 1607ec681f3Smrg case INTEL_PERF_COUNTER_DATA_TYPE_UINT64: 1617ec681f3Smrg case INTEL_PERF_COUNTER_DATA_TYPE_UINT32: 1627ec681f3Smrg case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32: 1637ec681f3Smrg return (int64_t)counter.oa_counter_read_uint64(perf->cfg, query, &result); 1647ec681f3Smrg break; 1657ec681f3Smrg case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE: 1667ec681f3Smrg case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT: 1677ec681f3Smrg return counter.oa_counter_read_float(perf->cfg, query, &result); 1687ec681f3Smrg break; 1697ec681f3Smrg } 1707ec681f3Smrg 1717ec681f3Smrg return {}; 1727ec681f3Smrg }; 1737ec681f3Smrg 1747ec681f3Smrg // Add counter id to the group 1757ec681f3Smrg group.counters.emplace_back(counter_desc.id); 1767ec681f3Smrg 1777ec681f3Smrg // Store counter 1787ec681f3Smrg counters.emplace_back(std::move(counter_desc)); 1797ec681f3Smrg } 1807ec681f3Smrg 1817ec681f3Smrg // Store group 1827ec681f3Smrg groups.emplace_back(std::move(group)); 1837ec681f3Smrg } 1847ec681f3Smrg 1857ec681f3Smrg assert(groups.size() && "Failed to query groups"); 1867ec681f3Smrg assert(counters.size() && "Failed to query counters"); 1877ec681f3Smrg 1887ec681f3Smrg // Clear accumulations 1897ec681f3Smrg intel_perf_query_result_clear(&result); 1907ec681f3Smrg 1917ec681f3Smrg return true; 1927ec681f3Smrg} 1937ec681f3Smrg 1947ec681f3Smrgvoid IntelDriver::enable_perfcnt(uint64_t sampling_period_ns) 1957ec681f3Smrg{ 1967ec681f3Smrg this->sampling_period_ns = sampling_period_ns; 1977ec681f3Smrg 1987ec681f3Smrg // Fill correlations with an initial one 1997ec681f3Smrg if (auto corr = query_correlation_timestamps()) { 2007ec681f3Smrg correlations.fill(*corr); 2017ec681f3Smrg } else { 2027ec681f3Smrg PPS_LOG_FATAL("Failed to get correlation timestamps"); 2037ec681f3Smrg } 2047ec681f3Smrg 2057ec681f3Smrg if (!perf->open(sampling_period_ns)) { 2067ec681f3Smrg PPS_LOG_FATAL("Failed to open intel perf"); 2077ec681f3Smrg } 2087ec681f3Smrg} 2097ec681f3Smrg 2107ec681f3Smrg/// @brief Transforms the GPU timestop into a CPU timestamp equivalent 2117ec681f3Smrguint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts) 2127ec681f3Smrg{ 2137ec681f3Smrg auto &corr_a = correlations[0]; 2147ec681f3Smrg auto &corr_b = correlations[correlations.size() - 1]; 2157ec681f3Smrg 2167ec681f3Smrg // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts 2177ec681f3Smrg uint64_t mask = 0xffffffff; 2187ec681f3Smrg uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask; 2197ec681f3Smrg uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask; 2207ec681f3Smrg 2217ec681f3Smrg // Make sure it is within the interval [a,b) 2227ec681f3Smrg assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a"); 2237ec681f3Smrg assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b"); 2247ec681f3Smrg 2257ec681f3Smrg uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts; 2267ec681f3Smrg // Factor to convert gpu time to cpu time 2277ec681f3Smrg double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) / 2287ec681f3Smrg double(corr_b.gpu_timestamp - corr_a.gpu_timestamp); 2297ec681f3Smrg uint64_t cpu_delta = gpu_delta * gpu_to_cpu; 2307ec681f3Smrg return corr_a.cpu_timestamp + cpu_delta; 2317ec681f3Smrg} 2327ec681f3Smrg 2337ec681f3Smrgvoid IntelDriver::disable_perfcnt() 2347ec681f3Smrg{ 2357ec681f3Smrg perf = nullptr; 2367ec681f3Smrg groups.clear(); 2377ec681f3Smrg counters.clear(); 2387ec681f3Smrg enabled_counters.clear(); 2397ec681f3Smrg} 2407ec681f3Smrg 2417ec681f3Smrgstruct Report { 2427ec681f3Smrg uint32_t version; 2437ec681f3Smrg uint32_t timestamp; 2447ec681f3Smrg uint32_t id; 2457ec681f3Smrg}; 2467ec681f3Smrg 2477ec681f3Smrg/// @brief Some perf record durations can be really short 2487ec681f3Smrg/// @return True if the duration is at least close to the sampling period 2497ec681f3Smrgstatic bool close_enough(uint64_t duration, uint64_t sampling_period) 2507ec681f3Smrg{ 2517ec681f3Smrg return duration > sampling_period - 100000; 2527ec681f3Smrg} 2537ec681f3Smrg 2547ec681f3Smrg/// @brief Transforms the raw data received in from the driver into records 2557ec681f3Smrgstd::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data, 2567ec681f3Smrg const size_t byte_count) 2577ec681f3Smrg{ 2587ec681f3Smrg std::vector<PerfRecord> records; 2597ec681f3Smrg records.reserve(128); 2607ec681f3Smrg 2617ec681f3Smrg PerfRecord record; 2627ec681f3Smrg record.reserve(512); 2637ec681f3Smrg 2647ec681f3Smrg const uint8_t *iter = data.data(); 2657ec681f3Smrg const uint8_t *end = iter + byte_count; 2667ec681f3Smrg 2677ec681f3Smrg uint64_t prev_cpu_timestamp = last_cpu_timestamp; 2687ec681f3Smrg 2697ec681f3Smrg while (iter < end) { 2707ec681f3Smrg // Iterate a record at a time 2717ec681f3Smrg auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter); 2727ec681f3Smrg 2737ec681f3Smrg if (header->type == DRM_I915_PERF_RECORD_SAMPLE) { 2747ec681f3Smrg // Report is next to the header 2757ec681f3Smrg auto report = reinterpret_cast<const Report *>(header + 1); 2767ec681f3Smrg auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp); 2777ec681f3Smrg auto duration = cpu_timestamp - prev_cpu_timestamp; 2787ec681f3Smrg 2797ec681f3Smrg // Skip perf-records that are too short by checking 2807ec681f3Smrg // the distance between last report and this one 2817ec681f3Smrg if (close_enough(duration, sampling_period_ns)) { 2827ec681f3Smrg prev_cpu_timestamp = cpu_timestamp; 2837ec681f3Smrg 2847ec681f3Smrg // Add the new record to the list 2857ec681f3Smrg record.resize(header->size); // Possibly 264? 2867ec681f3Smrg memcpy(record.data(), iter, header->size); 2877ec681f3Smrg records.emplace_back(record); 2887ec681f3Smrg } 2897ec681f3Smrg } 2907ec681f3Smrg 2917ec681f3Smrg // Go to the next record 2927ec681f3Smrg iter += header->size; 2937ec681f3Smrg } 2947ec681f3Smrg 2957ec681f3Smrg return records; 2967ec681f3Smrg} 2977ec681f3Smrg 2987ec681f3Smrg/// @brief Read all the available data from the metric set currently in use 2997ec681f3Smrgvoid IntelDriver::read_data_from_metric_set() 3007ec681f3Smrg{ 3017ec681f3Smrg assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading"); 3027ec681f3Smrg 3037ec681f3Smrg ssize_t bytes_read = 0; 3047ec681f3Smrg while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read, 3057ec681f3Smrg metric_buffer.size() - total_bytes_read)) > 0 || 3067ec681f3Smrg errno == EINTR) { 3077ec681f3Smrg total_bytes_read += std::max(ssize_t(0), bytes_read); 3087ec681f3Smrg 3097ec681f3Smrg // Increase size of the buffer for the next read 3107ec681f3Smrg if (metric_buffer.size() / 2 < total_bytes_read) { 3117ec681f3Smrg metric_buffer.resize(metric_buffer.size() * 2); 3127ec681f3Smrg } 3137ec681f3Smrg } 3147ec681f3Smrg 3157ec681f3Smrg assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough"); 3167ec681f3Smrg} 3177ec681f3Smrg 3187ec681f3Smrgbool IntelDriver::dump_perfcnt() 3197ec681f3Smrg{ 3207ec681f3Smrg if (!perf->oa_stream_ready()) { 3217ec681f3Smrg return false; 3227ec681f3Smrg } 3237ec681f3Smrg 3247ec681f3Smrg read_data_from_metric_set(); 3257ec681f3Smrg 3267ec681f3Smrg get_new_correlation(); 3277ec681f3Smrg 3287ec681f3Smrg auto new_records = parse_perf_records(metric_buffer, total_bytes_read); 3297ec681f3Smrg if (new_records.empty()) { 3307ec681f3Smrg PPS_LOG("No new records"); 3317ec681f3Smrg // No new records from the GPU yet 3327ec681f3Smrg return false; 3337ec681f3Smrg } else { 3347ec681f3Smrg PPS_LOG("Records parsed bytes: %lu", total_bytes_read); 3357ec681f3Smrg // Records are parsed correctly, so we can reset the 3367ec681f3Smrg // number of bytes read so far from the metric set 3377ec681f3Smrg total_bytes_read = 0; 3387ec681f3Smrg } 3397ec681f3Smrg 3407ec681f3Smrg APPEND(records, new_records); 3417ec681f3Smrg 3427ec681f3Smrg if (records.size() < 2) { 3437ec681f3Smrg // Not enough records to accumulate 3447ec681f3Smrg return false; 3457ec681f3Smrg } 3467ec681f3Smrg 3477ec681f3Smrg return true; 3487ec681f3Smrg} 3497ec681f3Smrg 3507ec681f3Smrguint32_t IntelDriver::gpu_next() 3517ec681f3Smrg{ 3527ec681f3Smrg if (records.size() < 2) { 3537ec681f3Smrg // Not enough records to accumulate 3547ec681f3Smrg return 0; 3557ec681f3Smrg } 3567ec681f3Smrg 3577ec681f3Smrg // Get first and second 3587ec681f3Smrg auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data()); 3597ec681f3Smrg auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data()); 3607ec681f3Smrg 3617ec681f3Smrg intel_perf_query_result_accumulate_fields(&result, 3627ec681f3Smrg &perf->query.value(), 3637ec681f3Smrg &perf->devinfo, 3647ec681f3Smrg record_a + 1, 3657ec681f3Smrg record_b + 1, 3667ec681f3Smrg false /* no_oa_accumulate */); 3677ec681f3Smrg 3687ec681f3Smrg // Get last timestamp 3697ec681f3Smrg auto report_b = reinterpret_cast<const Report *>(record_b + 1); 3707ec681f3Smrg auto gpu_timestamp = report_b->timestamp; 3717ec681f3Smrg 3727ec681f3Smrg // Consume first record 3737ec681f3Smrg records.erase(std::begin(records), std::begin(records) + 1); 3747ec681f3Smrg 3757ec681f3Smrg return gpu_timestamp; 3767ec681f3Smrg} 3777ec681f3Smrg 3787ec681f3Smrguint64_t IntelDriver::cpu_next() 3797ec681f3Smrg{ 3807ec681f3Smrg if (auto gpu_timestamp = gpu_next()) { 3817ec681f3Smrg auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp); 3827ec681f3Smrg 3837ec681f3Smrg last_cpu_timestamp = cpu_timestamp; 3847ec681f3Smrg return cpu_timestamp; 3857ec681f3Smrg } 3867ec681f3Smrg 3877ec681f3Smrg return 0; 3887ec681f3Smrg} 3897ec681f3Smrg 3907ec681f3Smrguint64_t IntelDriver::next() 3917ec681f3Smrg{ 3927ec681f3Smrg // Reset accumulation 3937ec681f3Smrg intel_perf_query_result_clear(&result); 3947ec681f3Smrg return cpu_next(); 3957ec681f3Smrg} 3967ec681f3Smrg 3977ec681f3Smrg} // namespace pps 398