17ec681f3Smrg/* 27ec681f3Smrg * Copyright © 2020 Google, Inc. 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217ec681f3Smrg * SOFTWARE. 227ec681f3Smrg */ 237ec681f3Smrg 247ec681f3Smrg#ifndef _U_TRACE_H 257ec681f3Smrg#define _U_TRACE_H 267ec681f3Smrg 277ec681f3Smrg#include <stdbool.h> 287ec681f3Smrg#include <stdint.h> 297ec681f3Smrg#include <stdio.h> 307ec681f3Smrg 317ec681f3Smrg#include "util/u_queue.h" 327ec681f3Smrg 337ec681f3Smrg#ifdef __cplusplus 347ec681f3Smrgextern "C" { 357ec681f3Smrg#endif 367ec681f3Smrg 377ec681f3Smrg/* A trace mechanism (very) loosely inspired by the linux kernel tracepoint 387ec681f3Smrg * mechanism, in that it allows for defining driver specific (or common) 397ec681f3Smrg * tracepoints, which generate 'trace_$name()' functions that can be 407ec681f3Smrg * called at various points in commandstream emit. 417ec681f3Smrg * 427ec681f3Smrg * Currently a printf backend is implemented, but the expectation is to 437ec681f3Smrg * also implement a perfetto backend for shipping out traces to a tool like 447ec681f3Smrg * AGI. 457ec681f3Smrg * 467ec681f3Smrg * Notable differences: 477ec681f3Smrg * 487ec681f3Smrg * - GPU timestamps! A driver provided callback is used to emit timestamps 497ec681f3Smrg * to a buffer. At a later point in time (when stalling to wait for the 507ec681f3Smrg * GPU is not required), the timestamps are re-united with the trace 517ec681f3Smrg * payload. This makes the trace mechanism suitable for profiling. 527ec681f3Smrg * 537ec681f3Smrg * - Instead of a systemwide trace ringbuffer, buffering of un-retired 547ec681f3Smrg * tracepoints is split into two stages. Traces are emitted to a 557ec681f3Smrg * 'u_trace' instance, and at a later time flushed to a 'u_trace_context' 567ec681f3Smrg * instance. This avoids the requirement that commandstream containing 577ec681f3Smrg * tracepoints is emitted in the same order as it is generated. 587ec681f3Smrg * 597ec681f3Smrg * If the hw has multiple parallel "engines" (for example, 3d/blit/compute) 607ec681f3Smrg * then a `u_trace_context` per-engine should be used. 617ec681f3Smrg * 627ec681f3Smrg * - Unlike kernel tracepoints, u_trace tracepoints are defined in py 637ec681f3Smrg * from which header and src files are generated. Since we already have 647ec681f3Smrg * a build dependency on python+mako, this gives more flexibility than 657ec681f3Smrg * clunky preprocessor macro magic. 667ec681f3Smrg * 677ec681f3Smrg */ 687ec681f3Smrg 697ec681f3Smrgstruct u_trace_context; 707ec681f3Smrgstruct u_trace; 717ec681f3Smrgstruct u_trace_chunk; 727ec681f3Smrg 737ec681f3Smrg/** 747ec681f3Smrg * Special reserved value to indicate that no timestamp was captured, 757ec681f3Smrg * and that the timestamp of the previous trace should be reused. 767ec681f3Smrg */ 777ec681f3Smrg#define U_TRACE_NO_TIMESTAMP ((uint64_t)0) 787ec681f3Smrg 797ec681f3Smrg/** 807ec681f3Smrg * Driver provided callback to create a timestamp buffer which will be 817ec681f3Smrg * read by u_trace_read_ts function. 827ec681f3Smrg */ 837ec681f3Smrgtypedef void* (*u_trace_create_ts_buffer)(struct u_trace_context *utctx, 847ec681f3Smrg uint32_t timestamps_count); 857ec681f3Smrg 867ec681f3Smrg/** 877ec681f3Smrg * Driver provided callback to delete a timestamp buffer. 887ec681f3Smrg */ 897ec681f3Smrgtypedef void (*u_trace_delete_ts_buffer)(struct u_trace_context *utctx, 907ec681f3Smrg void *timestamps); 917ec681f3Smrg 927ec681f3Smrg/** 937ec681f3Smrg * Driver provided callback to emit commands into the soecified command 947ec681f3Smrg * stream to capture a 64b timestamp into the specified timestamps buffer, 957ec681f3Smrg * at the specified index. 967ec681f3Smrg * 977ec681f3Smrg * The hw counter that the driver records should be something that runs at 987ec681f3Smrg * a fixed rate, even as the GPU freq changes. The same source used for 997ec681f3Smrg * GL_TIMESTAMP queries should be appropriate. 1007ec681f3Smrg */ 1017ec681f3Smrgtypedef void (*u_trace_record_ts)(struct u_trace *ut, void *cs, 1027ec681f3Smrg void *timestamps, unsigned idx); 1037ec681f3Smrg 1047ec681f3Smrg/** 1057ec681f3Smrg * Driver provided callback to read back a previously recorded timestamp. 1067ec681f3Smrg * If necessary, this should block until the GPU has finished writing back 1077ec681f3Smrg * the timestamps. (The timestamps will be read back in order, so it is 1087ec681f3Smrg * safe to only synchronize on idx==0.) 1097ec681f3Smrg * 1107ec681f3Smrg * flush_data is data provided by the driver via u_trace_flush. 1117ec681f3Smrg * 1127ec681f3Smrg * The returned timestamp should be in units of nanoseconds. The same 1137ec681f3Smrg * timebase as GL_TIMESTAMP queries should be used. 1147ec681f3Smrg * 1157ec681f3Smrg * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate 1167ec681f3Smrg * that no timestamp was captured and the timestamp from the previous trace 1177ec681f3Smrg * will be re-used. (The first trace in the u_trace buf may not do this.) 1187ec681f3Smrg * This allows the driver to detect cases where multiple tracepoints are 1197ec681f3Smrg * emitted with no other intervening cmdstream, to avoid pointlessly 1207ec681f3Smrg * capturing the same timestamp multiple times in a row. 1217ec681f3Smrg */ 1227ec681f3Smrgtypedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx, 1237ec681f3Smrg void *timestamps, unsigned idx, void *flush_data); 1247ec681f3Smrg 1257ec681f3Smrg/** 1267ec681f3Smrg * Driver provided callback to delete flush data. 1277ec681f3Smrg */ 1287ec681f3Smrgtypedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx, 1297ec681f3Smrg void *flush_data); 1307ec681f3Smrg 1317ec681f3Smrg/** 1327ec681f3Smrg * The trace context provides tracking for "in-flight" traces, once the 1337ec681f3Smrg * cmdstream that records timestamps has been flushed. 1347ec681f3Smrg */ 1357ec681f3Smrgstruct u_trace_context { 1367ec681f3Smrg void *pctx; 1377ec681f3Smrg 1387ec681f3Smrg u_trace_create_ts_buffer create_timestamp_buffer; 1397ec681f3Smrg u_trace_delete_ts_buffer delete_timestamp_buffer; 1407ec681f3Smrg u_trace_record_ts record_timestamp; 1417ec681f3Smrg u_trace_read_ts read_timestamp; 1427ec681f3Smrg u_trace_delete_flush_data delete_flush_data; 1437ec681f3Smrg 1447ec681f3Smrg FILE *out; 1457ec681f3Smrg 1467ec681f3Smrg /* Once u_trace_flush() is called u_trace_chunk's are queued up to 1477ec681f3Smrg * render tracepoints on a queue. The per-chunk queue jobs block until 1487ec681f3Smrg * timestamps are available. 1497ec681f3Smrg */ 1507ec681f3Smrg struct util_queue queue; 1517ec681f3Smrg 1527ec681f3Smrg#ifdef HAVE_PERFETTO 1537ec681f3Smrg /* node in global list of trace contexts. */ 1547ec681f3Smrg struct list_head node; 1557ec681f3Smrg#endif 1567ec681f3Smrg 1577ec681f3Smrg /* State to accumulate time across N chunks associated with a single 1587ec681f3Smrg * batch (u_trace). 1597ec681f3Smrg */ 1607ec681f3Smrg uint64_t last_time_ns; 1617ec681f3Smrg uint64_t first_time_ns; 1627ec681f3Smrg 1637ec681f3Smrg uint32_t frame_nr; 1647ec681f3Smrg 1657ec681f3Smrg /* list of unprocessed trace chunks in fifo order: */ 1667ec681f3Smrg struct list_head flushed_trace_chunks; 1677ec681f3Smrg}; 1687ec681f3Smrg 1697ec681f3Smrg/** 1707ec681f3Smrg * The u_trace ptr is passed as the first arg to generated tracepoints. 1717ec681f3Smrg * It provides buffering for tracepoint payload until the corresponding 1727ec681f3Smrg * driver cmdstream containing the emitted commands to capture is 1737ec681f3Smrg * flushed. 1747ec681f3Smrg * 1757ec681f3Smrg * Individual tracepoints emitted to u_trace are expected to be "executed" 1767ec681f3Smrg * (ie. timestamp captured) in FIFO order with respect to other tracepoints 1777ec681f3Smrg * emitted to the same u_trace. But the order WRT other u_trace instances 1787ec681f3Smrg * is undefined util u_trace_flush(). 1797ec681f3Smrg */ 1807ec681f3Smrgstruct u_trace { 1817ec681f3Smrg struct u_trace_context *utctx; 1827ec681f3Smrg 1837ec681f3Smrg struct list_head trace_chunks; /* list of unflushed trace chunks in fifo order */ 1847ec681f3Smrg 1857ec681f3Smrg bool enabled; 1867ec681f3Smrg}; 1877ec681f3Smrg 1887ec681f3Smrgvoid u_trace_context_init(struct u_trace_context *utctx, 1897ec681f3Smrg void *pctx, 1907ec681f3Smrg u_trace_create_ts_buffer create_timestamp_buffer, 1917ec681f3Smrg u_trace_delete_ts_buffer delete_timestamp_buffer, 1927ec681f3Smrg u_trace_record_ts record_timestamp, 1937ec681f3Smrg u_trace_read_ts read_timestamp, 1947ec681f3Smrg u_trace_delete_flush_data delete_flush_data); 1957ec681f3Smrgvoid u_trace_context_fini(struct u_trace_context *utctx); 1967ec681f3Smrg 1977ec681f3Smrg/** 1987ec681f3Smrg * Flush (trigger processing) of traces previously flushed to the trace-context 1997ec681f3Smrg * by u_trace_flush(). 2007ec681f3Smrg * 2017ec681f3Smrg * This should typically be called in the driver's pctx->flush(). 2027ec681f3Smrg */ 2037ec681f3Smrgvoid u_trace_context_process(struct u_trace_context *utctx, bool eof); 2047ec681f3Smrg 2057ec681f3Smrgvoid u_trace_init(struct u_trace *ut, struct u_trace_context *utctx); 2067ec681f3Smrgvoid u_trace_fini(struct u_trace *ut); 2077ec681f3Smrg 2087ec681f3Smrgbool u_trace_has_points(struct u_trace *ut); 2097ec681f3Smrg 2107ec681f3Smrgstruct u_trace_iterator 2117ec681f3Smrg{ 2127ec681f3Smrg struct u_trace *ut; 2137ec681f3Smrg struct u_trace_chunk *chunk; 2147ec681f3Smrg uint32_t event_idx; 2157ec681f3Smrg}; 2167ec681f3Smrg 2177ec681f3Smrgstruct u_trace_iterator 2187ec681f3Smrgu_trace_begin_iterator(struct u_trace *ut); 2197ec681f3Smrg 2207ec681f3Smrgstruct u_trace_iterator 2217ec681f3Smrgu_trace_end_iterator(struct u_trace *ut); 2227ec681f3Smrg 2237ec681f3Smrgbool 2247ec681f3Smrgu_trace_iterator_equal(struct u_trace_iterator a, 2257ec681f3Smrg struct u_trace_iterator b); 2267ec681f3Smrg 2277ec681f3Smrgtypedef void (*u_trace_copy_ts_buffer)(struct u_trace_context *utctx, 2287ec681f3Smrg void *cmdstream, 2297ec681f3Smrg void *ts_from, uint32_t from_offset, 2307ec681f3Smrg void *ts_to, uint32_t to_offset, 2317ec681f3Smrg uint32_t count); 2327ec681f3Smrg 2337ec681f3Smrg/** 2347ec681f3Smrg * Clones tracepoints range into target u_trace. 2357ec681f3Smrg * Provides callback for driver to copy timestamps on GPU from 2367ec681f3Smrg * one buffer to another. 2377ec681f3Smrg * 2387ec681f3Smrg * It allows: 2397ec681f3Smrg * - Tracing re-usable command buffer in Vulkan, by copying tracepoints 2407ec681f3Smrg * each time it is submitted. 2417ec681f3Smrg * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints 2427ec681f3Smrg * corresponding to a tile. 2437ec681f3Smrg */ 2447ec681f3Smrgvoid u_trace_clone_append(struct u_trace_iterator begin_it, 2457ec681f3Smrg struct u_trace_iterator end_it, 2467ec681f3Smrg struct u_trace *into, 2477ec681f3Smrg void *cmdstream, 2487ec681f3Smrg u_trace_copy_ts_buffer copy_ts_buffer); 2497ec681f3Smrg 2507ec681f3Smrgvoid u_trace_disable_event_range(struct u_trace_iterator begin_it, 2517ec681f3Smrg struct u_trace_iterator end_it); 2527ec681f3Smrg 2537ec681f3Smrg/** 2547ec681f3Smrg * Flush traces to the parent trace-context. At this point, the expectation 2557ec681f3Smrg * is that all the tracepoints are "executed" by the GPU following any previously 2567ec681f3Smrg * flushed u_trace batch. 2577ec681f3Smrg * 2587ec681f3Smrg * flush_data is a way for driver to pass additional data, which becomes available 2597ec681f3Smrg * only at the point of flush, to the u_trace_read_ts callback and perfetto. 2607ec681f3Smrg * The typical example of such data would be a fence to wait on in u_trace_read_ts, 2617ec681f3Smrg * and a submission_id to pass into perfetto. 2627ec681f3Smrg * The destruction of the data is done via u_trace_delete_flush_data. 2637ec681f3Smrg * 2647ec681f3Smrg * This should typically be called when the corresponding cmdstream (containing 2657ec681f3Smrg * the timestamp reads) is flushed to the kernel. 2667ec681f3Smrg */ 2677ec681f3Smrgvoid u_trace_flush(struct u_trace *ut, void *flush_data, bool free_data); 2687ec681f3Smrg 2697ec681f3Smrg#ifdef HAVE_PERFETTO 2707ec681f3Smrgextern int ut_perfetto_enabled; 2717ec681f3Smrg 2727ec681f3Smrgvoid u_trace_perfetto_start(void); 2737ec681f3Smrgvoid u_trace_perfetto_stop(void); 2747ec681f3Smrg#else 2757ec681f3Smrg# define ut_perfetto_enabled 0 2767ec681f3Smrg#endif 2777ec681f3Smrg 2787ec681f3Smrgstatic inline bool 2797ec681f3Smrgu_trace_context_tracing(struct u_trace_context *utctx) 2807ec681f3Smrg{ 2817ec681f3Smrg return !!utctx->out || (ut_perfetto_enabled > 0); 2827ec681f3Smrg} 2837ec681f3Smrg 2847ec681f3Smrg#ifdef __cplusplus 2857ec681f3Smrg} 2867ec681f3Smrg#endif 2877ec681f3Smrg 2887ec681f3Smrg#endif /* _U_TRACE_H */ 289