17ec681f3Smrg/* 27ec681f3Smrg * Copyright © 2021 Google, Inc. 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217ec681f3Smrg * SOFTWARE. 227ec681f3Smrg */ 237ec681f3Smrg 247ec681f3Smrg#ifndef FREEDRENO_AUTOTUNE_H 257ec681f3Smrg#define FREEDRENO_AUTOTUNE_H 267ec681f3Smrg 277ec681f3Smrg#include "util/hash_table.h" 287ec681f3Smrg#include "util/list.h" 297ec681f3Smrg 307ec681f3Smrg#include "freedreno_util.h" 317ec681f3Smrg 327ec681f3Smrgstruct fd_autotune_results; 337ec681f3Smrg 347ec681f3Smrg/** 357ec681f3Smrg * "autotune" our decisions about bypass vs GMEM rendering, based on historical 367ec681f3Smrg * data about a given render target. 377ec681f3Smrg * 387ec681f3Smrg * In deciding which path to take there are tradeoffs, including some that 397ec681f3Smrg * are not reasonably estimateable without having some additional information: 407ec681f3Smrg * 417ec681f3Smrg * (1) If you know you are touching every pixel (ie. there is a glClear()), 427ec681f3Smrg * then the GMEM path will at least not cost more memory bandwidth than 437ec681f3Smrg * sysmem[1] 447ec681f3Smrg * 457ec681f3Smrg * (2) If there is no clear, GMEM could potentially cost *more* bandwidth 467ec681f3Smrg * due to sysmem->GMEM restore pass. 477ec681f3Smrg * 487ec681f3Smrg * (3) If you see a high draw count, that is an indication that there will be 497ec681f3Smrg * enough pixels accessed multiple times to benefit from the reduced 507ec681f3Smrg * memory bandwidth that GMEM brings 517ec681f3Smrg * 527ec681f3Smrg * (4) But high draw count where there is not much overdraw can actually be 537ec681f3Smrg * faster in bypass mode if it is pushing a lot of state change, due to 547ec681f3Smrg * not having to go thru the state changes per-tile[2] 557ec681f3Smrg * 567ec681f3Smrg * The approach taken is to measure the samples-passed for the batch to estimate 577ec681f3Smrg * the amount of overdraw to detect cases where the number of pixels touched is 587ec681f3Smrg * low. 597ec681f3Smrg * 607ec681f3Smrg * Note however, that (at least since a5xx) we have PERF_RB_{Z,C}_{READ,WRITE} 617ec681f3Smrg * performance countables, which give a more direct measurement of what we want 627ec681f3Smrg * to know (ie. is framebuffer memory access high enough to prefer GMEM), but 637ec681f3Smrg * with the downside of consuming half of the available RB counters. With the 647ec681f3Smrg * additional complication that external perfcntr collection (fdperf, perfetto) 657ec681f3Smrg * and the drive could be stomping on each other's feet. (Also reading the 667ec681f3Smrg * perfcntrs accurately requires a WFI.) 677ec681f3Smrg * 687ec681f3Smrg * [1] ignoring UBWC 697ec681f3Smrg * [2] ignoring early-tile-exit optimizations, but any draw that touches all/ 707ec681f3Smrg * most of the tiles late in the tile-pass can defeat that 717ec681f3Smrg */ 727ec681f3Smrgstruct fd_autotune { 737ec681f3Smrg 747ec681f3Smrg /** 757ec681f3Smrg * Cache to map batch->key (also used for batch-cache) to historical 767ec681f3Smrg * information about rendering to that particular render target. 777ec681f3Smrg */ 787ec681f3Smrg struct hash_table *ht; 797ec681f3Smrg 807ec681f3Smrg /** 817ec681f3Smrg * List of recently used historical results (to age out old results) 827ec681f3Smrg */ 837ec681f3Smrg struct list_head lru; 847ec681f3Smrg 857ec681f3Smrg /** 867ec681f3Smrg * GPU buffer used to communicate back results to the CPU 877ec681f3Smrg */ 887ec681f3Smrg struct fd_bo *results_mem; 897ec681f3Smrg struct fd_autotune_results *results; 907ec681f3Smrg 917ec681f3Smrg /** 927ec681f3Smrg * List of per-batch results that we are waiting for the GPU to finish 937ec681f3Smrg * with before reading back the results. 947ec681f3Smrg */ 957ec681f3Smrg struct list_head pending_results; 967ec681f3Smrg 977ec681f3Smrg uint32_t fence_counter; 987ec681f3Smrg uint32_t idx_counter; 997ec681f3Smrg}; 1007ec681f3Smrg 1017ec681f3Smrg/** 1027ec681f3Smrg * The layout of the memory used to read back per-batch results from the 1037ec681f3Smrg * GPU 1047ec681f3Smrg * 1057ec681f3Smrg * Note this struct is intentionally aligned to 4k. And hw requires the 1067ec681f3Smrg * sample start/stop locations to be 128b aligned. 1077ec681f3Smrg */ 1087ec681f3Smrgstruct fd_autotune_results { 1097ec681f3Smrg 1107ec681f3Smrg /** 1117ec681f3Smrg * The GPU writes back a "fence" seqno value from the cmdstream after 1127ec681f3Smrg * it finishes writing it's result slot, so that the CPU knows when 1137ec681f3Smrg * results are valid 1147ec681f3Smrg */ 1157ec681f3Smrg uint32_t fence; 1167ec681f3Smrg 1177ec681f3Smrg uint32_t __pad0; 1187ec681f3Smrg uint64_t __pad1; 1197ec681f3Smrg 1207ec681f3Smrg /** 1217ec681f3Smrg * From the cmdstream, the captured samples-passed values are recorded 1227ec681f3Smrg * at the start and end of the batch. 1237ec681f3Smrg * 1247ec681f3Smrg * Note that we do the math on the CPU to avoid a WFI. But pre-emption 1257ec681f3Smrg * may force us to revisit that. 1267ec681f3Smrg */ 1277ec681f3Smrg struct { 1287ec681f3Smrg uint64_t samples_start; 1297ec681f3Smrg uint64_t __pad0; 1307ec681f3Smrg uint64_t samples_end; 1317ec681f3Smrg uint64_t __pad1; 1327ec681f3Smrg } result[127]; 1337ec681f3Smrg}; 1347ec681f3Smrg 1357ec681f3Smrg#define offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base)) 1367ec681f3Smrg#define results_ptr(at, member) \ 1377ec681f3Smrg (at)->results_mem, offset((at)->results, &(at)->results->member), 0, 0 1387ec681f3Smrg 1397ec681f3Smrgstruct fd_batch_history; 1407ec681f3Smrg 1417ec681f3Smrg/** 1427ec681f3Smrg * Tracks the results from an individual batch. Initially created per batch, 1437ec681f3Smrg * and appended to the tail of at->pending_results. At a later time, when 1447ec681f3Smrg * the GPU has finished writing the results, 1457ec681f3Smrg * 1467ec681f3Smrg * ralloc parent is the associated fd_batch_history 1477ec681f3Smrg */ 1487ec681f3Smrgstruct fd_batch_result { 1497ec681f3Smrg 1507ec681f3Smrg /** 1517ec681f3Smrg * The index/slot in fd_autotune_results::result[] to write start/end 1527ec681f3Smrg * counter to 1537ec681f3Smrg */ 1547ec681f3Smrg unsigned idx; 1557ec681f3Smrg 1567ec681f3Smrg /** 1577ec681f3Smrg * Fence value to write back to fd_autotune_results::fence after both 1587ec681f3Smrg * start/end values written 1597ec681f3Smrg */ 1607ec681f3Smrg uint32_t fence; 1617ec681f3Smrg 1627ec681f3Smrg /* 1637ec681f3Smrg * Below here, only used internally within autotune 1647ec681f3Smrg */ 1657ec681f3Smrg struct fd_batch_history *history; 1667ec681f3Smrg struct list_head node; 1677ec681f3Smrg uint32_t cost; 1687ec681f3Smrg uint64_t samples_passed; 1697ec681f3Smrg}; 1707ec681f3Smrg 1717ec681f3Smrgvoid fd_autotune_init(struct fd_autotune *at, struct fd_device *dev); 1727ec681f3Smrgvoid fd_autotune_fini(struct fd_autotune *at); 1737ec681f3Smrg 1747ec681f3Smrgstruct fd_batch; 1757ec681f3Smrgbool fd_autotune_use_bypass(struct fd_autotune *at, 1767ec681f3Smrg struct fd_batch *batch) assert_dt; 1777ec681f3Smrg 1787ec681f3Smrg#endif /* FREEDRENO_AUTOTUNE_H */ 179