/*
 * Copyright © 2021 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
237ec681f3Smrg
247ec681f3Smrg#include "freedreno_autotune.h"
257ec681f3Smrg#include "freedreno_batch.h"
267ec681f3Smrg#include "freedreno_util.h"
277ec681f3Smrg
/**
 * Tracks recent sample-passed results for a given batch key (which maps to
 * a FBO/framebuffer state), so that future bypass-vs-GMEM decisions can be
 * based on observed rendering history for that render target.
 *
 * ralloc parent is fd_autotune::ht
 */
struct fd_batch_history {
   /* Cloned copy of the batch key; also used as the hash-table key: */
   struct fd_batch_key *key;

   /* Entry in fd_autotune::lru: */
   struct list_head node;

   /* Number of entries currently on the results list (capped at
    * MAX_RESULTS):
    */
   unsigned num_results;

   /**
    * List of recent fd_batch_result's, most recent at the head.
    */
   struct list_head results;
#define MAX_RESULTS 5
};
477ec681f3Smrg
487ec681f3Smrgstatic struct fd_batch_history *
497ec681f3Smrgget_history(struct fd_autotune *at, struct fd_batch *batch)
507ec681f3Smrg{
517ec681f3Smrg   struct fd_batch_history *history;
527ec681f3Smrg
537ec681f3Smrg   /* draw batches should still have their key at this point. */
547ec681f3Smrg   assert(batch->key || batch->nondraw);
557ec681f3Smrg   if (!batch->key)
567ec681f3Smrg      return NULL;
577ec681f3Smrg
587ec681f3Smrg   struct hash_entry *entry =
597ec681f3Smrg      _mesa_hash_table_search_pre_hashed(at->ht, batch->hash, batch->key);
607ec681f3Smrg
617ec681f3Smrg   if (entry) {
627ec681f3Smrg      history = entry->data;
637ec681f3Smrg      goto found;
647ec681f3Smrg   }
657ec681f3Smrg
667ec681f3Smrg   history = rzalloc_size(at->ht, sizeof(*history));
677ec681f3Smrg
687ec681f3Smrg   history->key = fd_batch_key_clone(history, batch->key);
697ec681f3Smrg   list_inithead(&history->node);
707ec681f3Smrg   list_inithead(&history->results);
717ec681f3Smrg
727ec681f3Smrg   /* Note: We cap # of cached GMEM states at 20.. so assuming double-
737ec681f3Smrg    * buffering, 40 should be a good place to cap cached autotune state
747ec681f3Smrg    */
757ec681f3Smrg   if (at->ht->entries >= 40) {
767ec681f3Smrg      struct fd_batch_history *last =
777ec681f3Smrg         list_last_entry(&at->lru, struct fd_batch_history, node);
787ec681f3Smrg      _mesa_hash_table_remove_key(at->ht, last->key);
797ec681f3Smrg      list_del(&last->node);
807ec681f3Smrg      ralloc_free(last);
817ec681f3Smrg   }
827ec681f3Smrg
837ec681f3Smrg   _mesa_hash_table_insert_pre_hashed(at->ht, batch->hash, history->key,
847ec681f3Smrg                                      history);
857ec681f3Smrg
867ec681f3Smrgfound:
877ec681f3Smrg   /* Move to the head of the LRU: */
887ec681f3Smrg   list_delinit(&history->node);
897ec681f3Smrg   list_add(&history->node, &at->lru);
907ec681f3Smrg
917ec681f3Smrg   return history;
927ec681f3Smrg}
937ec681f3Smrg
947ec681f3Smrgstatic void
957ec681f3Smrgresult_destructor(void *r)
967ec681f3Smrg{
977ec681f3Smrg   struct fd_batch_result *result = r;
987ec681f3Smrg
997ec681f3Smrg   /* Just in case we manage to somehow still be on the pending_results list: */
1007ec681f3Smrg   list_del(&result->node);
1017ec681f3Smrg}
1027ec681f3Smrg
1037ec681f3Smrgstatic struct fd_batch_result *
1047ec681f3Smrgget_result(struct fd_autotune *at, struct fd_batch_history *history)
1057ec681f3Smrg{
1067ec681f3Smrg   struct fd_batch_result *result = rzalloc_size(history, sizeof(*result));
1077ec681f3Smrg
1087ec681f3Smrg   result->fence =
1097ec681f3Smrg      ++at->fence_counter; /* pre-increment so zero isn't valid fence */
1107ec681f3Smrg   result->idx = at->idx_counter++;
1117ec681f3Smrg
1127ec681f3Smrg   if (at->idx_counter >= ARRAY_SIZE(at->results->result))
1137ec681f3Smrg      at->idx_counter = 0;
1147ec681f3Smrg
1157ec681f3Smrg   result->history = history;
1167ec681f3Smrg   list_addtail(&result->node, &at->pending_results);
1177ec681f3Smrg
1187ec681f3Smrg   ralloc_set_destructor(result, result_destructor);
1197ec681f3Smrg
1207ec681f3Smrg   return result;
1217ec681f3Smrg}
1227ec681f3Smrg
/**
 * Drain completed entries off the pending_results list: any result whose
 * fence the GPU has reached gets its sample count computed and is moved
 * onto its history's results list (capped at MAX_RESULTS entries).
 */
static void
process_results(struct fd_autotune *at)
{
   /* Last fence value the GPU has written back: */
   uint32_t current_fence = at->results->fence;

   list_for_each_entry_safe (struct fd_batch_result, result,
                             &at->pending_results, node) {
      /* Pending results are queued in fence order (see get_result), so
       * we can stop at the first one the GPU hasn't reached yet:
       */
      if (result->fence > current_fence)
         break;

      struct fd_batch_history *history = result->history;

      result->samples_passed = at->results->result[result->idx].samples_end -
                               at->results->result[result->idx].samples_start;

      /* Move from pending_results to the head of the history's list: */
      list_delinit(&result->node);
      list_add(&result->node, &history->results);

      if (history->num_results < MAX_RESULTS) {
         history->num_results++;
      } else {
         /* Once above a limit, start popping old results off the
          * tail of the list:
          */
         struct fd_batch_result *old_result =
            list_last_entry(&history->results, struct fd_batch_result, node);
         list_delinit(&old_result->node);
         ralloc_free(old_result);
      }
   }
}
1547ec681f3Smrg
1557ec681f3Smrgstatic bool
1567ec681f3Smrgfallback_use_bypass(struct fd_batch *batch)
1577ec681f3Smrg{
1587ec681f3Smrg   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
1597ec681f3Smrg
1607ec681f3Smrg   /* Fallback logic if we have no historical data about the rendertarget: */
1617ec681f3Smrg   if (batch->cleared || batch->gmem_reason ||
1627ec681f3Smrg       (batch->num_draws > 5) || (pfb->samples > 1)) {
1637ec681f3Smrg      return false;
1647ec681f3Smrg   }
1657ec681f3Smrg
1667ec681f3Smrg   return true;
1677ec681f3Smrg}
1687ec681f3Smrg
/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 */
bool
fd_autotune_use_bypass(struct fd_autotune *at, struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   /* Harvest any results the GPU has finished since the last call: */
   process_results(at);

   /* Only enable on gen's that opt-in (and actually have sample-passed
    * collection wired up):
    */
   if (!batch->ctx->screen->gmem_reason_mask)
      return fallback_use_bypass(batch);

   /* If the batch needs GMEM for a reason outside the mask the screen
    * allows the autotuner to override, fall back to the heuristic:
    */
   if (batch->gmem_reason & ~batch->ctx->screen->gmem_reason_mask)
      return fallback_use_bypass(batch);

   for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
      /* If ms-rtt is involved, force GMEM, as we don't currently
       * implement a temporary render target that we can MSAA resolve
       * from
       */
      if (pfb->cbufs[i] && pfb->cbufs[i]->nr_samples)
         return fallback_use_bypass(batch);
   }

   struct fd_batch_history *history = get_history(at, batch);
   if (!history)
      return fallback_use_bypass(batch);

   /* Queue a result slot so this batch's sample count gets collected: */
   batch->autotune_result = get_result(at, history);
   batch->autotune_result->cost = batch->cost;

   bool use_bypass = fallback_use_bypass(batch);

   if (use_bypass)
      return true;

   if (history->num_results > 0) {
      uint32_t total_samples = 0;

      // TODO we should account for clears somehow
      // TODO should we try to notice if there is a drastic change from
      // frame to frame?
      list_for_each_entry (struct fd_batch_result, result, &history->results,
                           node) {
         total_samples += result->samples_passed;
      }

      float avg_samples = (float)total_samples / (float)history->num_results;

      /* Low sample count could mean there was only a clear.. or there was
       * a clear plus draws that touch no or few samples
       */
      if (avg_samples < 500.0)
         return true;

      /* Cost-per-sample is an estimate for the average number of reads+
       * writes for a given passed sample.
       * NOTE(review): the code below actually computes batch->cost
       * divided by num_draws (a per-draw cost), then divides by
       * num_draws again for total_draw_cost — confirm this matches the
       * intended heuristic.  It also presumes num_draws != 0 on this
       * path (use_bypass was false) — verify that a cleared-only batch
       * with zero draws cannot reach here.
       */
      float sample_cost = batch->cost;
      sample_cost /= batch->num_draws;

      float total_draw_cost = (avg_samples * sample_cost) / batch->num_draws;
      DBG("%08x:%u\ttotal_samples=%u, avg_samples=%f, sample_cost=%f, "
          "total_draw_cost=%f\n",
          batch->hash, batch->num_draws, total_samples, avg_samples,
          sample_cost, total_draw_cost);

      /* Cheap enough renders still get bypass; otherwise prefer GMEM: */
      if (total_draw_cost < 3000.0)
         return true;
   }

   return use_bypass;
}
2477ec681f3Smrg
2487ec681f3Smrgvoid
2497ec681f3Smrgfd_autotune_init(struct fd_autotune *at, struct fd_device *dev)
2507ec681f3Smrg{
2517ec681f3Smrg   at->ht =
2527ec681f3Smrg      _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals);
2537ec681f3Smrg   list_inithead(&at->lru);
2547ec681f3Smrg
2557ec681f3Smrg   at->results_mem = fd_bo_new(dev, sizeof(struct fd_autotune_results),
2567ec681f3Smrg                               0, "autotune");
2577ec681f3Smrg   at->results = fd_bo_map(at->results_mem);
2587ec681f3Smrg
2597ec681f3Smrg   list_inithead(&at->pending_results);
2607ec681f3Smrg}
2617ec681f3Smrg
/**
 * Tear down autotune state.
 */
void
fd_autotune_fini(struct fd_autotune *at)
{
   /* Histories are ralloc'd against at->ht (see get_history), so
    * destroying the table should also free them and their results —
    * whose destructors unlink them from pending_results.
    */
   _mesa_hash_table_destroy(at->ht, NULL);
   fd_bo_del(at->results_mem);
}
268