/* freedreno_autotune.c revision 7ec681f3 */
1/*
2 * Copyright © 2021 Google, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24#include "freedreno_autotune.h"
25#include "freedreno_batch.h"
26#include "freedreno_util.h"
27
28/**
29 * Tracks, for a given batch key (which maps to a FBO/framebuffer state),
30 *
31 * ralloc parent is fd_autotune::ht
32 */
struct fd_batch_history {
   /* The framebuffer-state key this history is for; cloned from the
    * batch and ralloc-parented to this history (see get_history()).
    */
   struct fd_batch_key *key;

   /* Entry in fd_autotune::lru: */
   struct list_head node;

   /* Count of entries on the results list below, capped at MAX_RESULTS: */
   unsigned num_results;

   /**
    * List of recent fd_batch_result's
    */
   struct list_head results;
#define MAX_RESULTS 5
};
47
48static struct fd_batch_history *
49get_history(struct fd_autotune *at, struct fd_batch *batch)
50{
51   struct fd_batch_history *history;
52
53   /* draw batches should still have their key at this point. */
54   assert(batch->key || batch->nondraw);
55   if (!batch->key)
56      return NULL;
57
58   struct hash_entry *entry =
59      _mesa_hash_table_search_pre_hashed(at->ht, batch->hash, batch->key);
60
61   if (entry) {
62      history = entry->data;
63      goto found;
64   }
65
66   history = rzalloc_size(at->ht, sizeof(*history));
67
68   history->key = fd_batch_key_clone(history, batch->key);
69   list_inithead(&history->node);
70   list_inithead(&history->results);
71
72   /* Note: We cap # of cached GMEM states at 20.. so assuming double-
73    * buffering, 40 should be a good place to cap cached autotune state
74    */
75   if (at->ht->entries >= 40) {
76      struct fd_batch_history *last =
77         list_last_entry(&at->lru, struct fd_batch_history, node);
78      _mesa_hash_table_remove_key(at->ht, last->key);
79      list_del(&last->node);
80      ralloc_free(last);
81   }
82
83   _mesa_hash_table_insert_pre_hashed(at->ht, batch->hash, history->key,
84                                      history);
85
86found:
87   /* Move to the head of the LRU: */
88   list_delinit(&history->node);
89   list_add(&history->node, &at->lru);
90
91   return history;
92}
93
94static void
95result_destructor(void *r)
96{
97   struct fd_batch_result *result = r;
98
99   /* Just in case we manage to somehow still be on the pending_results list: */
100   list_del(&result->node);
101}
102
103static struct fd_batch_result *
104get_result(struct fd_autotune *at, struct fd_batch_history *history)
105{
106   struct fd_batch_result *result = rzalloc_size(history, sizeof(*result));
107
108   result->fence =
109      ++at->fence_counter; /* pre-increment so zero isn't valid fence */
110   result->idx = at->idx_counter++;
111
112   if (at->idx_counter >= ARRAY_SIZE(at->results->result))
113      at->idx_counter = 0;
114
115   result->history = history;
116   list_addtail(&result->node, &at->pending_results);
117
118   ralloc_set_destructor(result, result_destructor);
119
120   return result;
121}
122
123static void
124process_results(struct fd_autotune *at)
125{
126   uint32_t current_fence = at->results->fence;
127
128   list_for_each_entry_safe (struct fd_batch_result, result,
129                             &at->pending_results, node) {
130      if (result->fence > current_fence)
131         break;
132
133      struct fd_batch_history *history = result->history;
134
135      result->samples_passed = at->results->result[result->idx].samples_end -
136                               at->results->result[result->idx].samples_start;
137
138      list_delinit(&result->node);
139      list_add(&result->node, &history->results);
140
141      if (history->num_results < MAX_RESULTS) {
142         history->num_results++;
143      } else {
144         /* Once above a limit, start popping old results off the
145          * tail of the list:
146          */
147         struct fd_batch_result *old_result =
148            list_last_entry(&history->results, struct fd_batch_result, node);
149         list_delinit(&old_result->node);
150         ralloc_free(old_result);
151      }
152   }
153}
154
155static bool
156fallback_use_bypass(struct fd_batch *batch)
157{
158   struct pipe_framebuffer_state *pfb = &batch->framebuffer;
159
160   /* Fallback logic if we have no historical data about the rendertarget: */
161   if (batch->cleared || batch->gmem_reason ||
162       (batch->num_draws > 5) || (pfb->samples > 1)) {
163      return false;
164   }
165
166   return true;
167}
168
169/**
170 * A magic 8-ball that tells the gmem code whether we should do bypass mode
171 * for moar fps.
172 */
/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 *
 * Queues a sample-passed query for the batch (batch->autotune_result) so
 * that future calls for the same framebuffer state have data to consult.
 */
bool
fd_autotune_use_bypass(struct fd_autotune *at, struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   /* First harvest any GPU results that have landed since last call: */
   process_results(at);

   /* Only enable on gen's that opt-in (and actually have sample-passed
    * collection wired up:
    */
   if (!batch->ctx->screen->gmem_reason_mask)
      return fallback_use_bypass(batch);

   /* If the batch needs GMEM for a reason the autotuner cannot account
    * for, fall back to the static heuristic:
    */
   if (batch->gmem_reason & ~batch->ctx->screen->gmem_reason_mask)
      return fallback_use_bypass(batch);

   for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
      /* If ms-rtt is involved, force GMEM, as we don't currently
       * implement a temporary render target that we can MSAA resolve
       * from
       */
      if (pfb->cbufs[i] && pfb->cbufs[i]->nr_samples)
         return fallback_use_bypass(batch);
   }

   struct fd_batch_history *history = get_history(at, batch);
   if (!history)
      return fallback_use_bypass(batch);

   /* Queue a query for this batch so we have data next time around: */
   batch->autotune_result = get_result(at, history);
   batch->autotune_result->cost = batch->cost;

   bool use_bypass = fallback_use_bypass(batch);

   if (use_bypass)
      return true;

   /* The fallback said GMEM; see if historical sample counts suggest
    * bypass is cheap enough anyway:
    */
   if (history->num_results > 0) {
      uint32_t total_samples = 0;

      // TODO we should account for clears somehow
      // TODO should we try to notice if there is a drastic change from
      // frame to frame?
      list_for_each_entry (struct fd_batch_result, result, &history->results,
                           node) {
         total_samples += result->samples_passed;
      }

      float avg_samples = (float)total_samples / (float)history->num_results;

      /* Low sample count could mean there was only a clear.. or there was
       * a clear plus draws that touch no or few samples
       */
      if (avg_samples < 500.0)
         return true;

      /* Cost-per-sample is an estimate for the average number of reads+
       * writes for a given passed sample.
       */
      float sample_cost = batch->cost;
      /* NOTE(review): if num_draws == 0 (e.g. a clear-only batch where
       * use_bypass came back false) this is a float divide by zero —
       * presumably IEEE inf keeps the comparison below sane, but worth
       * confirming no such batch reaches this point.
       */
      sample_cost /= batch->num_draws;

      float total_draw_cost = (avg_samples * sample_cost) / batch->num_draws;
      DBG("%08x:%u\ttotal_samples=%u, avg_samples=%f, sample_cost=%f, "
          "total_draw_cost=%f\n",
          batch->hash, batch->num_draws, total_samples, avg_samples,
          sample_cost, total_draw_cost);

      if (total_draw_cost < 3000.0)
         return true;
   }

   return use_bypass;
}
247
248void
249fd_autotune_init(struct fd_autotune *at, struct fd_device *dev)
250{
251   at->ht =
252      _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals);
253   list_inithead(&at->lru);
254
255   at->results_mem = fd_bo_new(dev, sizeof(struct fd_autotune_results),
256                               0, "autotune");
257   at->results = fd_bo_map(at->results_mem);
258
259   list_inithead(&at->pending_results);
260}
261
262void
263fd_autotune_fini(struct fd_autotune *at)
264{
265   _mesa_hash_table_destroy(at->ht, NULL);
266   fd_bo_del(at->results_mem);
267}
268