/*
 * Copyright © 2021 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef FREEDRENO_AUTOTUNE_H
#define FREEDRENO_AUTOTUNE_H

#include "util/hash_table.h"
#include "util/list.h"

#include "freedreno_util.h"

struct fd_autotune_results;

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on historical
 * data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimable without having some additional information:
 *
 *  (1) If you know you are touching every pixel (ie. there is a glClear()),
 *      then the GMEM path will at least not cost more memory bandwidth than
 *      sysmem[1]
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      due to sysmem->GMEM restore pass.
 *
 *  (3) If you see a high draw count, that is an indication that there will be
 *      enough pixels accessed multiple times to benefit from the reduced
 *      memory bandwidth that GMEM brings
 *
 *  (4) But high draw count where there is not much overdraw can actually be
 *      faster in bypass mode if it is pushing a lot of state change, due to
 *      not having to go thru the state changes per-tile[2]
 *
 * The approach taken is to measure the samples-passed for the batch to estimate
 * the amount of overdraw to detect cases where the number of pixels touched is
 * low.
 *
 * Note however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE}
 * performance countables, which give a more direct measurement of what we want
 * to know (ie. is framebuffer memory access high enough to prefer GMEM), but
 * with the downside of consuming half of the available RB counters.  With the
 * additional complication that external perfcntr collection (fdperf, perfetto)
 * and the driver could be stomping on each other's feet.  (Also reading the
 * perfcntrs accurately requires a WFI.)
 *
 * [1] ignoring UBWC
 * [2] ignoring early-tile-exit optimizations, but any draw that touches all/
 *     most of the tiles late in the tile-pass can defeat that
 */
727ec681f3Smrgstruct fd_autotune {
737ec681f3Smrg
747ec681f3Smrg   /**
757ec681f3Smrg    * Cache to map batch->key (also used for batch-cache) to historical
767ec681f3Smrg    * information about rendering to that particular render target.
777ec681f3Smrg    */
787ec681f3Smrg   struct hash_table *ht;
797ec681f3Smrg
807ec681f3Smrg   /**
817ec681f3Smrg    * List of recently used historical results (to age out old results)
827ec681f3Smrg    */
837ec681f3Smrg   struct list_head lru;
847ec681f3Smrg
857ec681f3Smrg   /**
867ec681f3Smrg    * GPU buffer used to communicate back results to the CPU
877ec681f3Smrg    */
887ec681f3Smrg   struct fd_bo *results_mem;
897ec681f3Smrg   struct fd_autotune_results *results;
907ec681f3Smrg
917ec681f3Smrg   /**
927ec681f3Smrg    * List of per-batch results that we are waiting for the GPU to finish
937ec681f3Smrg    * with before reading back the results.
947ec681f3Smrg    */
957ec681f3Smrg   struct list_head pending_results;
967ec681f3Smrg
977ec681f3Smrg   uint32_t fence_counter;
987ec681f3Smrg   uint32_t idx_counter;
997ec681f3Smrg};

/**
 * The layout of the memory used to read back per-batch results from the
 * GPU
 *
 * Note this struct is intentionally aligned to 4k.  And hw requires the
 * sample start/stop locations to be 128b aligned.
 */
/* NOTE: this layout is consumed by the GPU via results_ptr() offsets, so
 * field order, padding, and sizes must not change.
 */
struct fd_autotune_results {

   /**
    * The GPU writes back a "fence" seqno value from the cmdstream after
    * it finishes writing its result slot, so that the CPU knows when
    * results are valid
    */
   uint32_t fence;

   uint32_t __pad0;
   uint64_t __pad1;

   /**
    * From the cmdstream, the captured samples-passed values are recorded
    * at the start and end of the batch.
    *
    * Note that we do the math on the CPU to avoid a WFI.  But pre-emption
    * may force us to revisit that.
    */
   struct {
      uint64_t samples_start;
      uint64_t __pad0;      /* pads each entry to 32 bytes (128b alignment) */
      uint64_t samples_end;
      uint64_t __pad1;
   } result[127];
};
1347ec681f3Smrg
/* Byte offset of `ptr` from `base` (both viewed as raw bytes). */
#define offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base))
/* Expands to the bo + offset (+ two trailing zeros, presumably or-val/shift
 * for a reloc-style emit helper -- TODO confirm) locating `member` within
 * the autotune results buffer.
 */
#define results_ptr(at, member)                                                \
   (at)->results_mem, offset((at)->results, &(at)->results->member), 0, 0
1387ec681f3Smrg
struct fd_batch_history;

/**
 * Tracks the results from an individual batch.  Initially created per batch,
 * and appended to the tail of at->pending_results.  At a later time, when
 * the GPU has finished writing the results, they are read back and folded
 * into the associated fd_batch_history.
 *
 * ralloc parent is the associated fd_batch_history
 */
1487ec681f3Smrgstruct fd_batch_result {
1497ec681f3Smrg
1507ec681f3Smrg   /**
1517ec681f3Smrg    * The index/slot in fd_autotune_results::result[] to write start/end
1527ec681f3Smrg    * counter to
1537ec681f3Smrg    */
1547ec681f3Smrg   unsigned idx;
1557ec681f3Smrg
1567ec681f3Smrg   /**
1577ec681f3Smrg    * Fence value to write back to fd_autotune_results::fence after both
1587ec681f3Smrg    * start/end values written
1597ec681f3Smrg    */
1607ec681f3Smrg   uint32_t fence;
1617ec681f3Smrg
1627ec681f3Smrg   /*
1637ec681f3Smrg    * Below here, only used internally within autotune
1647ec681f3Smrg    */
1657ec681f3Smrg   struct fd_batch_history *history;
1667ec681f3Smrg   struct list_head node;
1677ec681f3Smrg   uint32_t cost;
1687ec681f3Smrg   uint64_t samples_passed;
1697ec681f3Smrg};
1707ec681f3Smrg
1717ec681f3Smrgvoid fd_autotune_init(struct fd_autotune *at, struct fd_device *dev);
1727ec681f3Smrgvoid fd_autotune_fini(struct fd_autotune *at);
1737ec681f3Smrg
1747ec681f3Smrgstruct fd_batch;
1757ec681f3Smrgbool fd_autotune_use_bypass(struct fd_autotune *at,
1767ec681f3Smrg                            struct fd_batch *batch) assert_dt;
1777ec681f3Smrg
1787ec681f3Smrg#endif /* FREEDRENO_AUTOTUNE_H */
179