1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
/** @file brw_program_cache.c
 *
 * This file implements a simple program cache for 965.  The consumers can
 * query the hash table of programs using a cache_id and program key, and
 * receive the corresponding program buffer object (plus associated auxiliary
 * data) in return.  Objects in the cache may not have relocations
 * (pointers to other BOs) in them.
 *
 * The inner workings are a simple hash table based on a CRC of the
 * key data.
 *
 * Replacement is not implemented.  Instead, when the cache gets too
 * big we throw out all of the cache data and let it get regenerated.
 */
46
47#include "main/imports.h"
48#include "main/streaming-load-memcpy.h"
49#include "x86/common_x86_asm.h"
50#include "intel_batchbuffer.h"
51#include "brw_state.h"
52#include "brw_wm.h"
53#include "brw_gs.h"
54#include "brw_cs.h"
55#include "brw_program.h"
56#include "compiler/brw_eu.h"
57
58#define FILE_DEBUG_FLAG DEBUG_STATE
59
/**
 * A single entry in the program cache hash table.
 *
 * The key and its prog_data live in one malloc'd allocation owned by the
 * item (key first, prog_data immediately after — see brw_upload_cache);
 * the compiled program itself lives in the cache BO at [offset, offset+size).
 */
struct brw_cache_item {
   /**
    * Effectively part of the key, cache_id identifies what kind of state
    * buffer is involved, and also which dirty flag should be set.
    */
   enum brw_cache_id cache_id;

   /** 32-bit hash of the key data */
   GLuint hash;

   /** for variable-sized keys */
   GLuint key_size;
   GLuint prog_data_size;
   /** Owned copy of the key; prog_data is stored right after it. */
   const void *key;

   /** Byte offset of the program in the cache BO. */
   uint32_t offset;
   /** Size of the program binary in the cache BO. */
   uint32_t size;

   /** Next item in this hash bucket's chain. */
   struct brw_cache_item *next;
};
80
81enum brw_cache_id
82brw_stage_cache_id(gl_shader_stage stage)
83{
84   static const enum brw_cache_id stage_ids[] = {
85      BRW_CACHE_VS_PROG,
86      BRW_CACHE_TCS_PROG,
87      BRW_CACHE_TES_PROG,
88      BRW_CACHE_GS_PROG,
89      BRW_CACHE_FS_PROG,
90      BRW_CACHE_CS_PROG,
91   };
92   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_ids));
93   return stage_ids[stage];
94}
95
96static unsigned
97get_program_string_id(enum brw_cache_id cache_id, const void *key)
98{
99   switch (cache_id) {
100   case BRW_CACHE_VS_PROG:
101      return ((struct brw_vs_prog_key *) key)->program_string_id;
102   case BRW_CACHE_TCS_PROG:
103      return ((struct brw_tcs_prog_key *) key)->program_string_id;
104   case BRW_CACHE_TES_PROG:
105      return ((struct brw_tes_prog_key *) key)->program_string_id;
106   case BRW_CACHE_GS_PROG:
107      return ((struct brw_gs_prog_key *) key)->program_string_id;
108   case BRW_CACHE_CS_PROG:
109      return ((struct brw_cs_prog_key *) key)->program_string_id;
110   case BRW_CACHE_FS_PROG:
111      return ((struct brw_wm_prog_key *) key)->program_string_id;
112   default:
113      unreachable("no program string id for this kind of program");
114   }
115}
116
117static GLuint
118hash_key(struct brw_cache_item *item)
119{
120   GLuint *ikey = (GLuint *)item->key;
121   GLuint hash = item->cache_id, i;
122
123   assert(item->key_size % 4 == 0);
124
125   /* I'm sure this can be improved on:
126    */
127   for (i = 0; i < item->key_size/4; i++) {
128      hash ^= ikey[i];
129      hash = (hash << 5) | (hash >> 27);
130   }
131
132   return hash;
133}
134
135static int
136brw_cache_item_equals(const struct brw_cache_item *a,
137                      const struct brw_cache_item *b)
138{
139   return a->cache_id == b->cache_id &&
140      a->hash == b->hash &&
141      a->key_size == b->key_size &&
142      (memcmp(a->key, b->key, a->key_size) == 0);
143}
144
145static struct brw_cache_item *
146search_cache(struct brw_cache *cache, GLuint hash,
147             struct brw_cache_item *lookup)
148{
149   struct brw_cache_item *c;
150
151#if 0
152   int bucketcount = 0;
153
154   for (c = cache->items[hash % cache->size]; c; c = c->next)
155      bucketcount++;
156
157   fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
158           cache->size, bucketcount, cache->n_items);
159#endif
160
161   for (c = cache->items[hash % cache->size]; c; c = c->next) {
162      if (brw_cache_item_equals(lookup, c))
163         return c;
164   }
165
166   return NULL;
167}
168
169
170static void
171rehash(struct brw_cache *cache)
172{
173   struct brw_cache_item **items;
174   struct brw_cache_item *c, *next;
175   GLuint size, i;
176
177   size = cache->size * 3;
178   items = calloc(size, sizeof(*items));
179
180   for (i = 0; i < cache->size; i++)
181      for (c = cache->items[i]; c; c = next) {
182         next = c->next;
183         c->next = items[c->hash % size];
184         items[c->hash % size] = c;
185      }
186
187   free(cache->items);
188   cache->items = items;
189   cache->size = size;
190}
191
192
193/**
194 * Returns the buffer object matching cache_id and key, or NULL.
195 */
196bool
197brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
198                 const void *key, GLuint key_size, uint32_t *inout_offset,
199                 void *inout_prog_data, bool flag_state)
200{
201   struct brw_cache_item *item;
202   struct brw_cache_item lookup;
203   GLuint hash;
204
205   lookup.cache_id = cache_id;
206   lookup.key = key;
207   lookup.key_size = key_size;
208   hash = hash_key(&lookup);
209   lookup.hash = hash;
210
211   item = search_cache(cache, hash, &lookup);
212
213   if (item == NULL)
214      return false;
215
216   void *prog_data = ((char *) item->key) + item->key_size;
217
218   if (item->offset != *inout_offset ||
219       prog_data != *((void **) inout_prog_data)) {
220      if (likely(flag_state))
221         cache->brw->ctx.NewDriverState |= (1 << cache_id);
222      *inout_offset = item->offset;
223      *((void **) inout_prog_data) = prog_data;
224   }
225
226   return true;
227}
228
/**
 * Replace the cache's backing BO with one of \p new_size bytes,
 * preserving the currently-used contents ([0, next_offset)).
 *
 * The new BO is mapped persistently and, where supported, flagged for
 * error-capture dumps.  After the swap, state that depends on the program
 * cache's location is flagged for re-emission.
 */
static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   struct brw_bo *new_bo;

   perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
              (unsigned) cache->bo->size / 1024, new_size / 1024);

   new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size,
                         BRW_MEMZONE_SHADER);
   if (can_do_exec_capture(brw->screen))
      new_bo->kflags |= EXEC_OBJECT_CAPTURE;

   void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
                                       MAP_ASYNC | MAP_PERSISTENT);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
#ifdef USE_SSE41
      /* NOTE(review): presumably the non-coherent map is write-combined,
       * making streaming loads much faster for the read-back — confirm.
       */
      if (!cache->bo->cache_coherent && cpu_has_sse4_1)
         _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
      else
#endif
         memcpy(map, cache->map, cache->next_offset);
   }

   brw_bo_unmap(cache->bo);
   brw_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->map = map;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
   brw->batch.state_base_address_emitted = false;
}
267
268/**
269 * Attempts to find an item in the cache with identical data.
270 */
271static const struct brw_cache_item *
272brw_lookup_prog(const struct brw_cache *cache,
273                enum brw_cache_id cache_id,
274                const void *data, unsigned data_size)
275{
276   unsigned i;
277   const struct brw_cache_item *item;
278
279   for (i = 0; i < cache->size; i++) {
280      for (item = cache->items[i]; item; item = item->next) {
281         if (item->cache_id != cache_id || item->size != data_size ||
282             memcmp(cache->map + item->offset, data, item->size) != 0)
283            continue;
284
285         return item;
286      }
287   }
288
289   return NULL;
290}
291
292static uint32_t
293brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
294{
295   uint32_t offset;
296
297   /* Allocate space in the cache BO for our new program. */
298   if (cache->next_offset + size > cache->bo->size) {
299      uint32_t new_size = cache->bo->size * 2;
300
301      while (cache->next_offset + size > new_size)
302         new_size *= 2;
303
304      brw_cache_new_bo(cache, new_size);
305   }
306
307   offset = cache->next_offset;
308
309   /* Programs are always 64-byte aligned, so set up the next one now */
310   cache->next_offset = ALIGN(offset + size, 64);
311
312   return offset;
313}
314
315const void *
316brw_find_previous_compile(struct brw_cache *cache,
317                          enum brw_cache_id cache_id,
318                          unsigned program_string_id)
319{
320   for (unsigned i = 0; i < cache->size; i++) {
321      for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
322         if (c->cache_id == cache_id &&
323             get_program_string_id(cache_id, c->key) == program_string_id) {
324            return c->key;
325         }
326      }
327   }
328
329   return NULL;
330}
331
332void
333brw_upload_cache(struct brw_cache *cache,
334                 enum brw_cache_id cache_id,
335                 const void *key,
336                 GLuint key_size,
337                 const void *data,
338                 GLuint data_size,
339                 const void *prog_data,
340                 GLuint prog_data_size,
341                 uint32_t *out_offset,
342                 void *out_prog_data)
343{
344   struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
345   const struct brw_cache_item *matching_data =
346      brw_lookup_prog(cache, cache_id, data, data_size);
347   GLuint hash;
348   void *tmp;
349
350   item->cache_id = cache_id;
351   item->size = data_size;
352   item->key = key;
353   item->key_size = key_size;
354   item->prog_data_size = prog_data_size;
355   hash = hash_key(item);
356   item->hash = hash;
357
358   /* If we can find a matching prog in the cache already, then reuse the
359    * existing stuff without creating new copy into the underlying buffer
360    * object. This is notably useful for programs generating shaders at
361    * runtime, where multiple shaders may compile to the same thing in our
362    * backend.
363    */
364   if (matching_data) {
365      item->offset = matching_data->offset;
366   } else {
367      item->offset = brw_alloc_item_data(cache, data_size);
368
369      /* Copy data to the buffer */
370      memcpy(cache->map + item->offset, data, data_size);
371   }
372
373   /* Set up the memory containing the key and prog_data */
374   tmp = malloc(key_size + prog_data_size);
375
376   memcpy(tmp, key, key_size);
377   memcpy(tmp + key_size, prog_data, prog_data_size);
378
379   item->key = tmp;
380
381   if (cache->n_items > cache->size * 1.5f)
382      rehash(cache);
383
384   hash %= cache->size;
385   item->next = cache->items[hash];
386   cache->items[hash] = item;
387   cache->n_items++;
388
389   *out_offset = item->offset;
390   *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
391   cache->brw->ctx.NewDriverState |= 1 << cache_id;
392}
393
394void
395brw_init_caches(struct brw_context *brw)
396{
397   struct brw_cache *cache = &brw->cache;
398
399   cache->brw = brw;
400
401   cache->size = 7;
402   cache->n_items = 0;
403   cache->items =
404      calloc(cache->size, sizeof(struct brw_cache_item *));
405
406   cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384,
407                            BRW_MEMZONE_SHADER);
408   if (can_do_exec_capture(brw->screen))
409      cache->bo->kflags |= EXEC_OBJECT_CAPTURE;
410
411   cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
412                                           MAP_ASYNC | MAP_PERSISTENT);
413}
414
/**
 * Free every item in the cache and reset it to empty.
 *
 * The backing BO is kept; next_offset simply rewinds to 0.  Because any
 * program offsets held elsewhere in the context are now stale, all GL and
 * driver state is flagged dirty and the per-stage prog_data pointers are
 * NULLed out.
 */
static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
   struct brw_cache_item *c, *next;
   GLuint i;

   DBG("%s\n", __func__);

   for (i = 0; i < cache->size; i++) {
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         /* Shader-stage items carry a prog_data (stored right after the
          * key) whose internal allocations need a deep free.
          */
         if (c->cache_id == BRW_CACHE_VS_PROG ||
             c->cache_id == BRW_CACHE_TCS_PROG ||
             c->cache_id == BRW_CACHE_TES_PROG ||
             c->cache_id == BRW_CACHE_GS_PROG ||
             c->cache_id == BRW_CACHE_FS_PROG ||
             c->cache_id == BRW_CACHE_CS_PROG) {
            const void *item_prog_data = c->key + c->key_size;
            brw_stage_prog_data_free(item_prog_data);
         }
         free((void *)c->key);
         free(c);
      }
      cache->items[i] = NULL;
   }

   cache->n_items = 0;

   /* Start putting programs into the start of the BO again, since
    * we'll never find the old results.
    */
   cache->next_offset = 0;

   /* We need to make sure that the programs get regenerated, since
    * any offsets leftover in brw_context will no longer be valid.
    */
   brw->NewGLState = ~0;
   brw->ctx.NewDriverState = ~0ull;
   brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;

   /* Also, NULL out any stale program pointers. */
   brw->vs.base.prog_data = NULL;
   brw->tcs.base.prog_data = NULL;
   brw->tes.base.prog_data = NULL;
   brw->gs.base.prog_data = NULL;
   brw->wm.base.prog_data = NULL;
   brw->cs.base.prog_data = NULL;

   /* NOTE(review): presumably flushed so no in-flight batch still
    * references programs whose cache entries were just discarded —
    * confirm.
    */
   intel_batchbuffer_flush(brw);
}
468
469void
470brw_program_cache_check_size(struct brw_context *brw)
471{
472   /* un-tuned guess.  Each object is generally a page, so 2000 of them is 8 MB of
473    * state cache.
474    */
475   if (brw->cache.n_items > 2000) {
476      perf_debug("Exceeded state cache size limit.  Clearing the set "
477                 "of compiled programs, which will trigger recompiles\n");
478      brw_clear_cache(brw, &brw->cache);
479      brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
480   }
481}
482
483
484static void
485brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
486{
487
488   DBG("%s\n", __func__);
489
490   /* This can be NULL if context creation failed early on */
491   if (cache->bo) {
492      brw_bo_unmap(cache->bo);
493      brw_bo_unreference(cache->bo);
494      cache->bo = NULL;
495      cache->map = NULL;
496   }
497   brw_clear_cache(brw, cache);
498   free(cache->items);
499   cache->items = NULL;
500   cache->size = 0;
501}
502
503
/** Free all program cache resources owned by the context. */
void
brw_destroy_caches(struct brw_context *brw)
{
   brw_destroy_cache(brw, &brw->cache);
}
509
510static const char *
511cache_name(enum brw_cache_id cache_id)
512{
513   switch (cache_id) {
514   case BRW_CACHE_VS_PROG:
515      return "VS kernel";
516   case BRW_CACHE_TCS_PROG:
517      return "TCS kernel";
518   case BRW_CACHE_TES_PROG:
519      return "TES kernel";
520   case BRW_CACHE_FF_GS_PROG:
521      return "Fixed-function GS kernel";
522   case BRW_CACHE_GS_PROG:
523      return "GS kernel";
524   case BRW_CACHE_CLIP_PROG:
525      return "CLIP kernel";
526   case BRW_CACHE_SF_PROG:
527      return "SF kernel";
528   case BRW_CACHE_FS_PROG:
529      return "FS kernel";
530   case BRW_CACHE_CS_PROG:
531      return "CS kernel";
532   default:
533      return "unknown";
534   }
535}
536
537void
538brw_print_program_cache(struct brw_context *brw)
539{
540   const struct brw_cache *cache = &brw->cache;
541   struct brw_cache_item *item;
542
543   for (unsigned i = 0; i < cache->size; i++) {
544      for (item = cache->items[i]; item; item = item->next) {
545         fprintf(stderr, "%s:\n", cache_name(i));
546         brw_disassemble(&brw->screen->devinfo, cache->map,
547                         item->offset, item->size, stderr);
548      }
549   }
550}
551