tu_perfetto.cc revision 7ec681f3
1/*
2 * Copyright © 2021 Google, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24#include <perfetto.h>
25
26#include "tu_perfetto.h"
27
28#include "util/u_perfetto.h"
29#include "util/hash_table.h"
30
31#include "tu_tracepoints.h"
32#include "tu_tracepoints_perfetto.h"
33
/* Clock id used to tag GPU timestamps in the packets we emit; assigned in
 * TuRenderpassDataSource::OnStart().
 */
static uint32_t gpu_clock_id;
static uint64_t next_clock_sync_ns; /* cpu time of next clk sync */

/**
 * The GPU timestamp recorded when a clock_sync was emitted.  For the first
 * sync this will be a *later* timestamp than the first GPU traces (since
 * we capture the first clock_sync from the CPU *after* the first GPU
 * tracepoints happen).  To avoid confusing perfetto we need to drop
 * the GPU traces with timestamps before this.
 *
 * Stays zero until sync_timestamp() has emitted a clock snapshot; stage_end()
 * skips events while it is zero.
 */
static uint64_t sync_gpu_ts;
45
/**
 * Incremental state hooked into the data source via TuRenderpassTraits.
 * was_cleared starts out true; stage_end() sends the render-stage descriptors
 * when it sees the flag set and then resets it to false.
 */
struct TuRenderpassIncrementalState {
   bool was_cleared{true};
};
49
50struct TuRenderpassTraits : public perfetto::DefaultDataSourceTraits {
51   using IncrementalStateType = TuRenderpassIncrementalState;
52};
53
54class TuRenderpassDataSource : public perfetto::DataSource<TuRenderpassDataSource, TuRenderpassTraits> {
55public:
56   void OnSetup(const SetupArgs &) override
57   {
58      // Use this callback to apply any custom configuration to your data source
59      // based on the TraceConfig in SetupArgs.
60   }
61
62   void OnStart(const StartArgs &) override
63   {
64      // This notification can be used to initialize the GPU driver, enable
65      // counters, etc. StartArgs will contains the DataSourceDescriptor,
66      // which can be extended.
67      u_trace_perfetto_start();
68      PERFETTO_LOG("Tracing started");
69
70      /* Note: clock_id's below 128 are reserved.. for custom clock sources,
71       * using the hash of a namespaced string is the recommended approach.
72       * See: https://perfetto.dev/docs/concepts/clock-sync
73       */
74      gpu_clock_id =
75         _mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000;
76   }
77
78   void OnStop(const StopArgs &) override
79   {
80      PERFETTO_LOG("Tracing stopped");
81
82      // Undo any initialization done in OnStart.
83      u_trace_perfetto_stop();
84      // TODO we should perhaps block until queued traces are flushed?
85
86      Trace([](TuRenderpassDataSource::TraceContext ctx) {
87         auto packet = ctx.NewTracePacket();
88         packet->Finalize();
89         ctx.Flush();
90      });
91   }
92};
93
/* Declare and define the static members perfetto needs for
 * TuRenderpassDataSource.
 */
PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
96
97static void
98send_descriptors(TuRenderpassDataSource::TraceContext &ctx, uint64_t ts_ns)
99{
100   PERFETTO_LOG("Sending renderstage descriptors");
101
102   auto packet = ctx.NewTracePacket();
103
104   packet->set_timestamp(0);
105
106   auto event = packet->set_gpu_render_stage_event();
107   event->set_gpu_id(0);
108
109   auto spec = event->set_specifications();
110
111   for (unsigned i = 0; i < ARRAY_SIZE(queues); i++) {
112      auto desc = spec->add_hw_queue();
113
114      desc->set_name(queues[i].name);
115      desc->set_description(queues[i].desc);
116   }
117
118   for (unsigned i = 0; i < ARRAY_SIZE(stages); i++) {
119      auto desc = spec->add_stage();
120
121      desc->set_name(stages[i].name);
122      if (stages[i].desc)
123         desc->set_description(stages[i].desc);
124   }
125}
126
127static void
128stage_start(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage)
129{
130   struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev);
131
132   p->start_ts[stage] = ts_ns;
133}
134
135typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*);
136
137static void
138stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage,
139          uint32_t submission_id, const void* payload = nullptr,
140          trace_payload_as_extra_func payload_as_extra = nullptr)
141{
142   struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev);
143
144   /* If we haven't managed to calibrate the alignment between GPU and CPU
145    * timestamps yet, then skip this trace, otherwise perfetto won't know
146    * what to do with it.
147    */
148   if (!sync_gpu_ts)
149      return;
150
151   TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
152      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
153         send_descriptors(tctx, p->start_ts[stage]);
154         state->was_cleared = false;
155      }
156
157      auto packet = tctx.NewTracePacket();
158
159      packet->set_timestamp(p->start_ts[stage]);
160      packet->set_timestamp_clock_id(gpu_clock_id);
161
162      auto event = packet->set_gpu_render_stage_event();
163      event->set_event_id(0); // ???
164      event->set_hw_queue_id(DEFAULT_HW_QUEUE_ID);
165      event->set_duration(ts_ns - p->start_ts[stage]);
166      event->set_stage_id(stage);
167      event->set_context((uintptr_t)dev);
168      event->set_submission_id(submission_id);
169
170      if (payload && payload_as_extra) {
171         payload_as_extra(event, payload);
172      }
173   });
174}
175
176#ifdef __cplusplus
177extern "C" {
178#endif
179
180void
181tu_perfetto_init(void)
182{
183   util_perfetto_init();
184
185   perfetto::DataSourceDescriptor dsd;
186   dsd.set_name("gpu.renderstages.msm");
187   TuRenderpassDataSource::Register(dsd);
188}
189
190static void
191sync_timestamp(struct tu_device *dev)
192{
193   uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
194   uint64_t gpu_ts = 0;
195
196   if (cpu_ts < next_clock_sync_ns)
197      return;
198
199    if (tu_device_get_timestamp(dev, &gpu_ts)) {
200      PERFETTO_ELOG("Could not sync CPU and GPU clocks");
201      return;
202    }
203
204   /* convert GPU ts into ns: */
205   gpu_ts = tu_device_ticks_to_ns(dev, gpu_ts);
206
207   TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
208      auto packet = tctx.NewTracePacket();
209
210      packet->set_timestamp(cpu_ts);
211
212      auto event = packet->set_clock_snapshot();
213
214      {
215         auto clock = event->add_clocks();
216
217         clock->set_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
218         clock->set_timestamp(cpu_ts);
219      }
220
221      {
222         auto clock = event->add_clocks();
223
224         clock->set_clock_id(gpu_clock_id);
225         clock->set_timestamp(gpu_ts);
226      }
227
228      sync_gpu_ts = gpu_ts;
229      next_clock_sync_ns = cpu_ts + 30000000;
230   });
231}
232
233static void
234emit_submit_id(uint32_t submission_id)
235{
236   TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
237      auto packet = tctx.NewTracePacket();
238
239      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
240
241      auto event = packet->set_vulkan_api_event();
242      auto submit = event->set_vk_queue_submit();
243
244      submit->set_submission_id(submission_id);
245   });
246}
247
/* Per-submit hook: refresh the CPU/GPU clock sync if one is due, then emit
 * the submission id event.
 */
void
tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id)
{
   sync_timestamp(dev);
   emit_submit_id(submission_id);
}
254
255/*
256 * Trace callbacks, called from u_trace once the timestamps from GPU have been
257 * collected.
258 */
259
260#define CREATE_EVENT_CALLBACK(event_name, stage)                              \
261void                                                                          \
262tu_start_##event_name(struct tu_device *dev, uint64_t ts_ns,                  \
263                   const void *flush_data,                                    \
264                   const struct trace_start_##event_name *payload)            \
265{                                                                             \
266   stage_start(dev, ts_ns, stage);                                            \
267}                                                                             \
268                                                                              \
269void                                                                          \
270tu_end_##event_name(struct tu_device *dev, uint64_t ts_ns,                    \
271                   const void *flush_data,                                    \
272                   const struct trace_end_##event_name *payload)              \
273{                                                                             \
274   auto trace_flush_data = (const struct tu_u_trace_flush_data *) flush_data; \
275   uint32_t submission_id =                                                   \
276      tu_u_trace_flush_data_get_submit_id(trace_flush_data);                  \
277   stage_end(dev, ts_ns, stage, submission_id, payload,                       \
278      (trace_payload_as_extra_func) &trace_payload_as_extra_end_##event_name);\
279}
280
281CREATE_EVENT_CALLBACK(render_pass, SURFACE_STAGE_ID)
282CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID)
283CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID)
284CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID)
285CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID)
286CREATE_EVENT_CALLBACK(compute, COMPUTE_STAGE_ID)
287CREATE_EVENT_CALLBACK(gmem_clear, CLEAR_GMEM_STAGE_ID)
288CREATE_EVENT_CALLBACK(sysmem_clear, CLEAR_SYSMEM_STAGE_ID)
289CREATE_EVENT_CALLBACK(sysmem_clear_all, CLEAR_SYSMEM_STAGE_ID)
290CREATE_EVENT_CALLBACK(gmem_load, GMEM_LOAD_STAGE_ID)
291CREATE_EVENT_CALLBACK(gmem_store, GMEM_STORE_STAGE_ID)
292CREATE_EVENT_CALLBACK(sysmem_resolve, SYSMEM_RESOLVE_STAGE_ID)
293
294#ifdef __cplusplus
295}
296#endif
297