/*
 * Copyright © 2020 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

247ec681f3Smrg#ifndef _U_TRACE_H
257ec681f3Smrg#define _U_TRACE_H
267ec681f3Smrg
277ec681f3Smrg#include <stdbool.h>
287ec681f3Smrg#include <stdint.h>
297ec681f3Smrg#include <stdio.h>
307ec681f3Smrg
317ec681f3Smrg#include "util/u_queue.h"
327ec681f3Smrg
337ec681f3Smrg#ifdef  __cplusplus
347ec681f3Smrgextern "C" {
357ec681f3Smrg#endif
367ec681f3Smrg
/* A trace mechanism (very) loosely inspired by the linux kernel tracepoint
 * mechanism, in that it allows for defining driver specific (or common)
 * tracepoints, which generate 'trace_$name()' functions that can be
 * called at various points in commandstream emit.
 *
 * Currently a printf backend is implemented, but the expectation is to
 * also implement a perfetto backend for shipping out traces to a tool like
 * AGI.
 *
 * Notable differences:
 *
 *  - GPU timestamps!  A driver provided callback is used to emit timestamps
 *    to a buffer.  At a later point in time (when stalling to wait for the
 *    GPU is not required), the timestamps are re-united with the trace
 *    payload.  This makes the trace mechanism suitable for profiling.
 *
 *  - Instead of a systemwide trace ringbuffer, buffering of un-retired
 *    tracepoints is split into two stages.  Traces are emitted to a
 *    'u_trace' instance, and at a later time flushed to a 'u_trace_context'
 *    instance.  This avoids the requirement that commandstream containing
 *    tracepoints is emitted in the same order as it is generated.
 *
 *    If the hw has multiple parallel "engines" (for example, 3d/blit/compute)
 *    then a `u_trace_context` per-engine should be used.
 *
 *  - Unlike kernel tracepoints, u_trace tracepoints are defined in Python,
 *    from which header and source files are generated.  Since we already
 *    have a build dependency on python+mako, this gives more flexibility
 *    than clunky preprocessor macro magic.
 *
 */

/* Forward declarations.  u_trace_chunk is only referenced through pointers
 * in this header and is not defined here.
 */
struct u_trace_context;
struct u_trace;
struct u_trace_chunk;

/**
 * Special reserved value to indicate that no timestamp was captured,
 * and that the timestamp of the previous trace should be reused.
 */
#define U_TRACE_NO_TIMESTAMP ((uint64_t)0)

/**
 * Driver provided callback to create a timestamp buffer which will be
 * read by the u_trace_read_ts callback.
 *
 * The returned pointer is opaque to this header; it is what the other
 * timestamp callbacks receive as their 'timestamps' argument.
 *
 * NOTE(review): timestamps_count is presumably the number of timestamp
 * slots the buffer must hold -- confirm against the implementation.
 */
typedef void* (*u_trace_create_ts_buffer)(struct u_trace_context *utctx,
      uint32_t timestamps_count);

/**
 * Driver provided callback to delete a timestamp buffer.
 *
 * NOTE(review): 'timestamps' is presumably a buffer previously returned
 * by the u_trace_create_ts_buffer callback -- confirm against callers.
 */
typedef void (*u_trace_delete_ts_buffer)(struct u_trace_context *utctx,
      void *timestamps);

/**
 * Driver provided callback to emit commands into the specified command
 * stream to capture a 64b timestamp into the specified timestamps buffer,
 * at the specified index.
 *
 * The hw counter that the driver records should be something that runs at
 * a fixed rate, even as the GPU freq changes.  The same source used for
 * GL_TIMESTAMP queries should be appropriate.
 */
typedef void (*u_trace_record_ts)(struct u_trace *ut, void *cs,
      void *timestamps, unsigned idx);

/**
 * Driver provided callback to read back a previously recorded timestamp.
 * If necessary, this should block until the GPU has finished writing back
 * the timestamps.  (The timestamps will be read back in order, so it is
 * safe to only synchronize on idx==0.)
 *
 * flush_data is data provided by the driver via u_trace_flush.
 *
 * The returned timestamp should be in units of nanoseconds.  The same
 * timebase as GL_TIMESTAMP queries should be used.
 *
 * The driver can return the special U_TRACE_NO_TIMESTAMP value to indicate
 * that no timestamp was captured and the timestamp from the previous trace
 * will be re-used.  (The first trace in the u_trace buf may not do this.)
 * This allows the driver to detect cases where multiple tracepoints are
 * emitted with no other intervening cmdstream, to avoid pointlessly
 * capturing the same timestamp multiple times in a row.
 */
typedef uint64_t (*u_trace_read_ts)(struct u_trace_context *utctx,
      void *timestamps, unsigned idx, void *flush_data);

/**
 * Driver provided callback to delete flush data, i.e. the driver data
 * handed to u_trace_flush() as its flush_data argument.
 */
typedef void (*u_trace_delete_flush_data)(struct u_trace_context *utctx,
      void *flush_data);

1317ec681f3Smrg/**
1327ec681f3Smrg * The trace context provides tracking for "in-flight" traces, once the
1337ec681f3Smrg * cmdstream that records timestamps has been flushed.
1347ec681f3Smrg */
1357ec681f3Smrgstruct u_trace_context {
1367ec681f3Smrg   void *pctx;
1377ec681f3Smrg
1387ec681f3Smrg   u_trace_create_ts_buffer  create_timestamp_buffer;
1397ec681f3Smrg   u_trace_delete_ts_buffer  delete_timestamp_buffer;
1407ec681f3Smrg   u_trace_record_ts         record_timestamp;
1417ec681f3Smrg   u_trace_read_ts           read_timestamp;
1427ec681f3Smrg   u_trace_delete_flush_data delete_flush_data;
1437ec681f3Smrg
1447ec681f3Smrg   FILE *out;
1457ec681f3Smrg
1467ec681f3Smrg   /* Once u_trace_flush() is called u_trace_chunk's are queued up to
1477ec681f3Smrg    * render tracepoints on a queue.  The per-chunk queue jobs block until
1487ec681f3Smrg    * timestamps are available.
1497ec681f3Smrg    */
1507ec681f3Smrg   struct util_queue queue;
1517ec681f3Smrg
1527ec681f3Smrg#ifdef HAVE_PERFETTO
1537ec681f3Smrg   /* node in global list of trace contexts. */
1547ec681f3Smrg   struct list_head node;
1557ec681f3Smrg#endif
1567ec681f3Smrg
1577ec681f3Smrg   /* State to accumulate time across N chunks associated with a single
1587ec681f3Smrg    * batch (u_trace).
1597ec681f3Smrg    */
1607ec681f3Smrg   uint64_t last_time_ns;
1617ec681f3Smrg   uint64_t first_time_ns;
1627ec681f3Smrg
1637ec681f3Smrg   uint32_t frame_nr;
1647ec681f3Smrg
1657ec681f3Smrg   /* list of unprocessed trace chunks in fifo order: */
1667ec681f3Smrg   struct list_head flushed_trace_chunks;
1677ec681f3Smrg};
1687ec681f3Smrg
1697ec681f3Smrg/**
1707ec681f3Smrg * The u_trace ptr is passed as the first arg to generated tracepoints.
1717ec681f3Smrg * It provides buffering for tracepoint payload until the corresponding
1727ec681f3Smrg * driver cmdstream containing the emitted commands to capture is
1737ec681f3Smrg * flushed.
1747ec681f3Smrg *
1757ec681f3Smrg * Individual tracepoints emitted to u_trace are expected to be "executed"
1767ec681f3Smrg * (ie. timestamp captured) in FIFO order with respect to other tracepoints
1777ec681f3Smrg * emitted to the same u_trace.  But the order WRT other u_trace instances
1787ec681f3Smrg * is undefined util u_trace_flush().
1797ec681f3Smrg */
1807ec681f3Smrgstruct u_trace {
1817ec681f3Smrg   struct u_trace_context *utctx;
1827ec681f3Smrg
1837ec681f3Smrg   struct list_head trace_chunks;  /* list of unflushed trace chunks in fifo order */
1847ec681f3Smrg
1857ec681f3Smrg   bool enabled;
1867ec681f3Smrg};
1877ec681f3Smrg
1887ec681f3Smrgvoid u_trace_context_init(struct u_trace_context *utctx,
1897ec681f3Smrg      void *pctx,
1907ec681f3Smrg      u_trace_create_ts_buffer   create_timestamp_buffer,
1917ec681f3Smrg      u_trace_delete_ts_buffer   delete_timestamp_buffer,
1927ec681f3Smrg      u_trace_record_ts          record_timestamp,
1937ec681f3Smrg      u_trace_read_ts            read_timestamp,
1947ec681f3Smrg      u_trace_delete_flush_data  delete_flush_data);
1957ec681f3Smrgvoid u_trace_context_fini(struct u_trace_context *utctx);
1967ec681f3Smrg
/**
 * Flush (trigger processing) of traces previously flushed to the trace-context
 * by u_trace_flush().
 *
 * This should typically be called in the driver's pctx->flush().
 *
 * NOTE(review): 'eof' presumably signals end-of-frame -- confirm against
 * the implementation.
 */
void u_trace_context_process(struct u_trace_context *utctx, bool eof);

/* Set up / tear down a u_trace; u_trace_init() takes the trace context
 * the u_trace is associated with.
 */
void u_trace_init(struct u_trace *ut, struct u_trace_context *utctx);
void u_trace_fini(struct u_trace *ut);

/* NOTE(review): presumably true when at least one tracepoint has been
 * emitted to 'ut' -- confirm against the implementation.
 */
bool u_trace_has_points(struct u_trace *ut);

/**
 * Position within the tracepoints of a u_trace, passed by value to the
 * range based helpers (u_trace_clone_append(),
 * u_trace_disable_event_range()).
 */
struct u_trace_iterator
{
   struct u_trace *ut;          /* the u_trace being iterated */
   struct u_trace_chunk *chunk; /* a chunk of 'ut' */
   uint32_t event_idx;          /* NOTE(review): presumably an event index
                                 * within 'chunk' -- confirm */
};

/* NOTE(review): presumably returns an iterator at the first tracepoint of
 * 'ut' -- confirm against the implementation.
 */
struct u_trace_iterator
u_trace_begin_iterator(struct u_trace *ut);

/* NOTE(review): presumably returns an iterator just past the last
 * tracepoint of 'ut' -- confirm against the implementation.
 */
struct u_trace_iterator
u_trace_end_iterator(struct u_trace *ut);

/* Compare two iterators. */
bool
u_trace_iterator_equal(struct u_trace_iterator a,
                       struct u_trace_iterator b);

/**
 * Driver provided callback used by u_trace_clone_append() to copy
 * timestamps on the GPU from one timestamp buffer to another.
 *
 * NOTE(review): from_offset / to_offset / count are presumably expressed
 * in timestamp entries rather than bytes -- confirm against the
 * implementation.
 */
typedef void (*u_trace_copy_ts_buffer)(struct u_trace_context *utctx,
      void *cmdstream,
      void *ts_from, uint32_t from_offset,
      void *ts_to, uint32_t to_offset,
      uint32_t count);

2337ec681f3Smrg/**
2347ec681f3Smrg * Clones tracepoints range into target u_trace.
2357ec681f3Smrg * Provides callback for driver to copy timestamps on GPU from
2367ec681f3Smrg * one buffer to another.
2377ec681f3Smrg *
2387ec681f3Smrg * It allows:
2397ec681f3Smrg * - Tracing re-usable command buffer in Vulkan, by copying tracepoints
2407ec681f3Smrg *   each time it is submitted.
2417ec681f3Smrg * - Per-tile tracing for tiling GPUs, by copying a range of tracepoints
2427ec681f3Smrg *   corresponding to a tile.
2437ec681f3Smrg */
2447ec681f3Smrgvoid u_trace_clone_append(struct u_trace_iterator begin_it,
2457ec681f3Smrg                          struct u_trace_iterator end_it,
2467ec681f3Smrg                          struct u_trace *into,
2477ec681f3Smrg                          void *cmdstream,
2487ec681f3Smrg                          u_trace_copy_ts_buffer copy_ts_buffer);
2497ec681f3Smrg
/* NOTE(review): presumably disables the tracepoints between begin_it and
 * end_it -- confirm against the implementation.
 */
void u_trace_disable_event_range(struct u_trace_iterator begin_it,
                                 struct u_trace_iterator end_it);

/**
 * Flush traces to the parent trace-context.  At this point, the expectation
 * is that all the tracepoints are "executed" by the GPU following any previously
 * flushed u_trace batch.
 *
 * flush_data is a way for driver to pass additional data, which becomes available
 * only at the point of flush, to the u_trace_read_ts callback and perfetto.
 * The typical example of such data would be a fence to wait on in u_trace_read_ts,
 * and a submission_id to pass into perfetto.
 * The destruction of the data is done via u_trace_delete_flush_data.
 *
 * NOTE(review): free_data presumably selects whether flush_data is handed to
 * u_trace_delete_flush_data once it is no longer needed -- confirm against
 * the implementation.
 *
 * This should typically be called when the corresponding cmdstream (containing
 * the timestamp reads) is flushed to the kernel.
 */
void u_trace_flush(struct u_trace *ut, void *flush_data, bool free_data);

/* With perfetto support, ut_perfetto_enabled is a global int.  Without it,
 * the name expands to the constant 0 so that code can test it
 * unconditionally.
 */
#ifdef HAVE_PERFETTO
extern int ut_perfetto_enabled;

void u_trace_perfetto_start(void);
void u_trace_perfetto_stop(void);
#else
#  define ut_perfetto_enabled 0
#endif

2787ec681f3Smrgstatic inline bool
2797ec681f3Smrgu_trace_context_tracing(struct u_trace_context *utctx)
2807ec681f3Smrg{
2817ec681f3Smrg   return !!utctx->out || (ut_perfetto_enabled > 0);
2827ec681f3Smrg}
2837ec681f3Smrg
2847ec681f3Smrg#ifdef  __cplusplus
2857ec681f3Smrg}
2867ec681f3Smrg#endif
2877ec681f3Smrg
2887ec681f3Smrg#endif  /* _U_TRACE_H */
289