/*
 * Copyright (c) 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * The aux map provides a multi-level lookup from the main surface address to
 * information about the auxiliary surface data, including the address where
 * the auxiliary data resides.
 *
 * The 48-bit VMA (GPU) address of the main surface is split to do the address
 * lookup:
 *
 *  48 bit address of main surface
 * +--------+--------+--------+------+
 * | 47:36  | 35:24  | 23:16  | 15:0 |
 * | L3-idx | L2-idx | L1-idx | ...  |
 * +--------+--------+--------+------+
 *
 * The GFX_AUX_TABLE_BASE_ADDR points to a buffer. The L3 Table Entry is
 * located by indexing into this buffer as a uint64_t array using the L3-idx
 * value. The 64-bit L3 entry is defined as:
 *
 * +-------+-------------+------+---+
 * | 63:48 | 47:15       | 14:1 | 0 |
 * |  ...  | L2-tbl-addr | ...  | V |
 * +-------+-------------+------+---+
 *
 * If the `V` (valid) bit is set, then the L2-tbl-addr gives the address for
 * the level-2 table entries, with the lower address bits filled with zero.
 * The L2 Table Entry is located by indexing into this buffer as a uint64_t
 * array using the L2-idx value. The 64-bit L2 entry is similar to the L3
 * entry, except with 2 additional address bits:
 *
 * +-------+-------------+------+---+
 * | 63:48 | 47:13       | 12:1 | 0 |
 * |  ...  | L1-tbl-addr | ...  | V |
 * +-------+-------------+------+---+
 *
 * If the `V` bit is set, then the L1-tbl-addr gives the address for the
 * level-1 table entries, with the lower address bits filled with zero. The L1
 * Table Entry is located by indexing into this buffer as a uint64_t array
 * using the L1-idx value. The 64-bit L1 entry is defined as:
 *
 * +--------+------+-------+-------+-------+---------------+-----+---+
 * | 63:58  | 57   | 56:54 | 53:52 | 51:48 | 47:8          | 7:1 | 0 |
 * | Format | Y/Cr | Depth |  TM   |  ...  | aux-data-addr | ... | V |
 * +--------+------+-------+-------+-------+---------------+-----+---+
 *
 * Where:
 *  - Format: See `isl_format_get_aux_map_encoding`
 *  - Y/Cr: 0=Y(Luma), 1=Cr(Chroma)
 *  - (bit) Depth: See `get_bpp_encoding`
 *  - TM (Tile-mode): 0=Ys, 1=Y, 2=rsvd, 3=rsvd
 *  - aux-data-addr: VMA/GPU address for the aux-data
 *  - V: entry is valid
 */
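
/*
 * As a worked example (the address is made up for illustration), the main
 * surface address 0x123456780000 splits into:
 *
 *    L3-idx = (0x123456780000 >> 36) & 0xfff = 0x123
 *    L2-idx = (0x123456780000 >> 24) & 0xfff = 0x456
 *    L1-idx = (0x123456780000 >> 16) & 0xff  = 0x78
 *
 * matching the shifts and masks used by get_aux_entry() below.
 */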

#include "intel_aux_map.h"
#include "intel_gem.h"

#include "dev/intel_device_info.h"
#include "isl/isl.h"

#include "drm-uapi/i915_drm.h"
#include "util/list.h"
#include "util/ralloc.h"
#include "util/u_atomic.h"
#include "main/macros.h"

#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>

static const bool aux_map_debug = false;

struct aux_map_buffer {
   struct list_head link;
   struct intel_buffer *buffer;
};

struct intel_aux_map_context {
   void *driver_ctx;
   pthread_mutex_t mutex;
   struct intel_mapped_pinned_buffer_alloc *buffer_alloc;
   uint32_t num_buffers;
   struct list_head buffers;
   uint64_t level3_base_addr;
   uint64_t *level3_map;
   uint32_t tail_offset, tail_remaining;
   uint32_t state_num;
};

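/**
 * Append a new 1 MiB pinned buffer to the pool that backs the translation
 * tables. Sub-tables are then carved out of its tail by add_sub_table().
 */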
static bool
add_buffer(struct intel_aux_map_context *ctx)
{
   struct aux_map_buffer *buf = ralloc(ctx, struct aux_map_buffer);
   if (!buf)
      return false;

   const uint32_t size = 0x100000;
   buf->buffer = ctx->buffer_alloc->alloc(ctx->driver_ctx, size);
   if (!buf->buffer) {
      ralloc_free(buf);
      return false;
   }

   assert(buf->buffer->map != NULL);

   list_addtail(&buf->link, &ctx->buffers);
   ctx->tail_offset = 0;
   ctx->tail_remaining = size;
   p_atomic_inc(&ctx->num_buffers);

   return true;
}

static void
advance_current_pos(struct intel_aux_map_context *ctx, uint32_t size)
{
   assert(ctx->tail_remaining >= size);
   ctx->tail_remaining -= size;
   ctx->tail_offset += size;
}

static bool
align_and_verify_space(struct intel_aux_map_context *ctx, uint32_t size,
                       uint32_t align)
{
   if (ctx->tail_remaining < size)
      return false;

   struct aux_map_buffer *tail =
      list_last_entry(&ctx->buffers, struct aux_map_buffer, link);
   uint64_t gpu = tail->buffer->gpu + ctx->tail_offset;
   uint64_t aligned = align64(gpu, align);

   if ((aligned - gpu) + size > ctx->tail_remaining) {
      return false;
   } else {
      if (aligned - gpu > 0)
         advance_current_pos(ctx, aligned - gpu);
      return true;
   }
}

static void
get_current_pos(struct intel_aux_map_context *ctx, uint64_t *gpu, uint64_t **map)
{
   assert(!list_is_empty(&ctx->buffers));
   struct aux_map_buffer *tail =
      list_last_entry(&ctx->buffers, struct aux_map_buffer, link);
   if (gpu)
      *gpu = tail->buffer->gpu + ctx->tail_offset;
   if (map)
      *map = (uint64_t*)((uint8_t*)tail->buffer->map + ctx->tail_offset);
}

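/**
 * Carve a zeroed, aligned sub-table of `size` bytes out of the current
 * buffer, appending a fresh buffer first if the tail cannot fit it.
 */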
static bool
add_sub_table(struct intel_aux_map_context *ctx, uint32_t size,
              uint32_t align, uint64_t *gpu, uint64_t **map)
{
   if (!align_and_verify_space(ctx, size, align)) {
      if (!add_buffer(ctx))
         return false;
      UNUSED bool aligned = align_and_verify_space(ctx, size, align);
      assert(aligned);
   }
   get_current_pos(ctx, gpu, map);
   memset(*map, 0, size);
   advance_current_pos(ctx, size);
   return true;
}

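/**
 * The state number increments whenever a mapping change forces the aux-map
 * tables to be flushed; callers can compare values returned here to decide
 * when such a flush is needed.
 */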
uint32_t
intel_aux_map_get_state_num(struct intel_aux_map_context *ctx)
{
   return p_atomic_read(&ctx->state_num);
}

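/**
 * Create an aux-map context and allocate the top-level (L3) table. Returns
 * NULL on platforms before Gfx12 (which have no aux map) or on allocation
 * failure.
 *
 * A minimal usage sketch; `driver_ctx`, `driver_buffer_alloc`, and `devinfo`
 * are assumed to be supplied by the caller:
 *
 *    struct intel_aux_map_context *aux_ctx =
 *       intel_aux_map_init(driver_ctx, &driver_buffer_alloc, devinfo);
 *    if (aux_ctx) {
 *       // Program GFX_AUX_TABLE_BASE_ADDR with
 *       // intel_aux_map_get_base(aux_ctx).
 *    }
 */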
struct intel_aux_map_context *
intel_aux_map_init(void *driver_ctx,
                   struct intel_mapped_pinned_buffer_alloc *buffer_alloc,
                   const struct intel_device_info *devinfo)
{
   struct intel_aux_map_context *ctx;
   if (devinfo->ver < 12)
      return NULL;

   ctx = ralloc(NULL, struct intel_aux_map_context);
   if (!ctx)
      return NULL;

   if (pthread_mutex_init(&ctx->mutex, NULL)) {
      ralloc_free(ctx);
      return NULL;
   }

   ctx->driver_ctx = driver_ctx;
   ctx->buffer_alloc = buffer_alloc;
   ctx->num_buffers = 0;
   list_inithead(&ctx->buffers);
   ctx->tail_offset = 0;
   ctx->tail_remaining = 0;
   ctx->state_num = 0;

   if (add_sub_table(ctx, 32 * 1024, 32 * 1024, &ctx->level3_base_addr,
                     &ctx->level3_map)) {
      if (aux_map_debug)
         fprintf(stderr, "AUX-MAP L3: 0x%"PRIx64", map=%p\n",
                 ctx->level3_base_addr, ctx->level3_map);
      p_atomic_inc(&ctx->state_num);
      return ctx;
   } else {
      ralloc_free(ctx);
      return NULL;
   }
}

void
intel_aux_map_finish(struct intel_aux_map_context *ctx)
{
   if (!ctx)
      return;

   pthread_mutex_destroy(&ctx->mutex);
   list_for_each_entry_safe(struct aux_map_buffer, buf, &ctx->buffers, link) {
      ctx->buffer_alloc->free(ctx->driver_ctx, buf->buffer);
      list_del(&buf->link);
      p_atomic_dec(&ctx->num_buffers);
      ralloc_free(buf);
   }

   ralloc_free(ctx);
}

uint64_t
intel_aux_map_get_base(struct intel_aux_map_context *ctx)
{
   /**
    * This gets initialized in intel_aux_map_init, and never changes, so
    * there is no need to lock the mutex.
    */
   return ctx->level3_base_addr;
}

static struct aux_map_buffer *
find_buffer(struct intel_aux_map_context *ctx, uint64_t addr)
{
   list_for_each_entry(struct aux_map_buffer, buf, &ctx->buffers, link) {
      if (buf->buffer->gpu <= addr && buf->buffer->gpu_end > addr) {
         return buf;
      }
   }
   return NULL;
}

static uint64_t *
get_u64_entry_ptr(struct intel_aux_map_context *ctx, uint64_t addr)
{
   struct aux_map_buffer *buf = find_buffer(ctx, addr);
   assert(buf);
   uintptr_t map_offset = addr - buf->buffer->gpu;
   return (uint64_t*)((uint8_t*)buf->buffer->map + map_offset);
}

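/**
 * Hardware encoding for the (bit) Depth field (bits 56:54) of an L1 entry.
 * Planar YUV formats use dedicated encodings; everything else is keyed off
 * the format's bits per block.
 */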
static uint8_t
get_bpp_encoding(enum isl_format format)
{
   if (isl_format_is_yuv(format)) {
      switch (format) {
      case ISL_FORMAT_YCRCB_NORMAL:
      case ISL_FORMAT_YCRCB_SWAPY:
      case ISL_FORMAT_PLANAR_420_8: return 3;
      case ISL_FORMAT_PLANAR_420_12: return 2;
      case ISL_FORMAT_PLANAR_420_10: return 1;
      case ISL_FORMAT_PLANAR_420_16: return 0;
      default:
         unreachable("Unsupported format!");
         return 0;
      }
   } else {
      switch (isl_format_get_layout(format)->bpb) {
      case 16:  return 0;
      case 8:   return 4;
      case 32:  return 5;
      case 64:  return 6;
      case 128: return 7;
      default:
         unreachable("Unsupported bpp!");
         return 0;
      }
   }
}

#define INTEL_AUX_MAP_ENTRY_Y_TILED_BIT  (0x1ull << 52)

uint64_t
intel_aux_map_format_bits(enum isl_tiling tiling, enum isl_format format,
                          uint8_t plane)
{
   if (aux_map_debug)
      fprintf(stderr, "AUX-MAP entry %s, format_enc=%d, bpp_enc=%d\n",
              isl_format_get_name(format),
              isl_format_get_aux_map_encoding(format),
              get_bpp_encoding(format));

   assert(isl_tiling_is_any_y(tiling));

   uint64_t format_bits =
      ((uint64_t)isl_format_get_aux_map_encoding(format) << 58) |
      ((uint64_t)(plane > 0) << 57) |
      ((uint64_t)get_bpp_encoding(format) << 54) |
      INTEL_AUX_MAP_ENTRY_Y_TILED_BIT;

   assert((format_bits & INTEL_AUX_MAP_FORMAT_BITS_MASK) == format_bits);

   return format_bits;
}

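/*
 * As a worked example of how intel_aux_map_format_bits() composes the upper
 * bits of an L1 entry (the format encoding 0xC below is illustrative, not
 * taken from the hardware tables): a single-plane, Y-tiled, 32 bpp surface
 * yields
 *
 *      (0xCull << 58)                    Format        (bits 63:58)
 *    | (0ull   << 57)                    Y/Cr = Luma   (bit 57)
 *    | (5ull   << 54)                    Depth = 32bpp (bits 56:54)
 *    | INTEL_AUX_MAP_ENTRY_Y_TILED_BIT   TM = Y        (bits 53:52)
 */
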
uint64_t
intel_aux_map_format_bits_for_isl_surf(const struct isl_surf *isl_surf)
{
   assert(!isl_format_is_planar(isl_surf->format));
   return intel_aux_map_format_bits(isl_surf->tiling, isl_surf->format, 0);
}

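/**
 * Walk the L3 -> L2 -> L1 translation for `address`, allocating any missing
 * L2/L1 sub-tables along the way, and return the location of the L1 entry.
 */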
static void
get_aux_entry(struct intel_aux_map_context *ctx, uint64_t address,
              uint32_t *l1_index_out, uint64_t *l1_entry_addr_out,
              uint64_t **l1_entry_map_out)
{
   uint32_t l3_index = (address >> 36) & 0xfff;
   uint64_t *l3_entry = &ctx->level3_map[l3_index];

   uint64_t *l2_map;
   if ((*l3_entry & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      uint64_t l2_gpu;
      if (add_sub_table(ctx, 32 * 1024, 32 * 1024, &l2_gpu, &l2_map)) {
         if (aux_map_debug)
            fprintf(stderr, "AUX-MAP L3[0x%x]: 0x%"PRIx64", map=%p\n",
                    l3_index, l2_gpu, l2_map);
      } else {
         unreachable("Failed to add L2 Aux-Map Page Table!");
      }
      *l3_entry = (l2_gpu & 0xffffffff8000ULL) | 1;
   } else {
      uint64_t l2_addr = intel_canonical_address(*l3_entry & ~0x7fffULL);
      l2_map = get_u64_entry_ptr(ctx, l2_addr);
   }
   uint32_t l2_index = (address >> 24) & 0xfff;
   uint64_t *l2_entry = &l2_map[l2_index];

   uint64_t l1_addr, *l1_map;
   if ((*l2_entry & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      if (add_sub_table(ctx, 8 * 1024, 8 * 1024, &l1_addr, &l1_map)) {
         if (aux_map_debug)
            fprintf(stderr, "AUX-MAP L2[0x%x]: 0x%"PRIx64", map=%p\n",
                    l2_index, l1_addr, l1_map);
      } else {
         unreachable("Failed to add L1 Aux-Map Page Table!");
      }
      *l2_entry = (l1_addr & 0xffffffffe000ULL) | 1;
   } else {
      l1_addr = intel_canonical_address(*l2_entry & ~0x1fffULL);
      l1_map = get_u64_entry_ptr(ctx, l1_addr);
   }
   uint32_t l1_index = (address >> 16) & 0xff;
   if (l1_index_out)
      *l1_index_out = l1_index;
   if (l1_entry_addr_out)
      *l1_entry_addr_out = l1_addr + l1_index * sizeof(*l1_map);
   if (l1_entry_map_out)
      *l1_entry_map_out = &l1_map[l1_index];
}

static void
add_mapping(struct intel_aux_map_context *ctx, uint64_t address,
            uint64_t aux_address, uint64_t format_bits,
            bool *state_changed)
{
   if (aux_map_debug)
      fprintf(stderr, "AUX-MAP 0x%"PRIx64" => 0x%"PRIx64"\n", address,
              aux_address);

   uint32_t l1_index;
   uint64_t *l1_entry;
   get_aux_entry(ctx, address, &l1_index, NULL, &l1_entry);

   const uint64_t l1_data =
      (aux_address & INTEL_AUX_MAP_ADDRESS_MASK) |
      format_bits |
      INTEL_AUX_MAP_ENTRY_VALID_BIT;

   const uint64_t current_l1_data = *l1_entry;
   if ((current_l1_data & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      assert((aux_address & 0xffULL) == 0);
      if (aux_map_debug)
         fprintf(stderr, "AUX-MAP L1[0x%x] 0x%"PRIx64" -> 0x%"PRIx64"\n",
                 l1_index, current_l1_data, l1_data);
      /**
       * We use non-zero bits in 63:1 to indicate the entry had been filled
       * previously. If these bits are non-zero and they don't exactly match
       * what we want to program into the entry, then we must force the
       * aux-map tables to be flushed.
       */
      if (current_l1_data != 0 &&
          (current_l1_data | INTEL_AUX_MAP_ENTRY_VALID_BIT) != l1_data)
         *state_changed = true;
      *l1_entry = l1_data;
   } else {
      if (aux_map_debug)
         fprintf(stderr, "AUX-MAP L1[0x%x] is already marked valid!\n",
                 l1_index);
      assert(*l1_entry == l1_data);
   }
}

uint64_t *
intel_aux_map_get_entry(struct intel_aux_map_context *ctx,
                        uint64_t address,
                        uint64_t *entry_address)
{
   pthread_mutex_lock(&ctx->mutex);
   uint64_t *l1_entry_map;
   get_aux_entry(ctx, address, NULL, entry_address, &l1_entry_map);
   pthread_mutex_unlock(&ctx->mutex);

   return l1_entry_map;
}

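/**
 * Map `main_size_B` bytes of main surface starting at `address` to the aux
 * data starting at `aux_address`, writing one L1 entry per 64 KiB main page.
 *
 * For example (a sketch with caller-supplied addresses and format bits),
 * mapping 128 KiB of main surface writes two L1 entries, each advancing the
 * aux address by INTEL_AUX_MAP_AUX_PAGE_SIZE:
 *
 *    intel_aux_map_add_mapping(ctx, main_addr, aux_addr, 128 * 1024,
 *                              format_bits);
 */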
void
intel_aux_map_add_mapping(struct intel_aux_map_context *ctx, uint64_t address,
                          uint64_t aux_address, uint64_t main_size_B,
                          uint64_t format_bits)
{
   bool state_changed = false;
   pthread_mutex_lock(&ctx->mutex);
   uint64_t map_addr = address;
   uint64_t dest_aux_addr = aux_address;
   assert(align64(address, INTEL_AUX_MAP_MAIN_PAGE_SIZE) == address);
   assert(align64(aux_address, INTEL_AUX_MAP_AUX_PAGE_SIZE) == aux_address);
   while (map_addr - address < main_size_B) {
      add_mapping(ctx, map_addr, dest_aux_addr, format_bits, &state_changed);
      map_addr += INTEL_AUX_MAP_MAIN_PAGE_SIZE;
      dest_aux_addr += INTEL_AUX_MAP_AUX_PAGE_SIZE;
   }
   pthread_mutex_unlock(&ctx->mutex);
   if (state_changed)
      p_atomic_inc(&ctx->state_num);
}

/**
 * We mark the leaf entry as invalid, but we don't attempt to clean up the
 * other levels of translation mappings. Since we attempt to re-use VMA
 * ranges, hopefully this will not lead to unbounded growth of the
 * translation tables.
 */
static void
remove_mapping(struct intel_aux_map_context *ctx, uint64_t address,
               bool *state_changed)
{
   uint32_t l3_index = (address >> 36) & 0xfff;
   uint64_t *l3_entry = &ctx->level3_map[l3_index];

   uint64_t *l2_map;
   if ((*l3_entry & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      return;
   } else {
      uint64_t l2_addr = intel_canonical_address(*l3_entry & ~0x7fffULL);
      l2_map = get_u64_entry_ptr(ctx, l2_addr);
   }
   uint32_t l2_index = (address >> 24) & 0xfff;
   uint64_t *l2_entry = &l2_map[l2_index];

   uint64_t *l1_map;
   if ((*l2_entry & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      return;
   } else {
      uint64_t l1_addr = intel_canonical_address(*l2_entry & ~0x1fffULL);
      l1_map = get_u64_entry_ptr(ctx, l1_addr);
   }
   uint32_t l1_index = (address >> 16) & 0xff;
   uint64_t *l1_entry = &l1_map[l1_index];

   const uint64_t current_l1_data = *l1_entry;
   const uint64_t l1_data = current_l1_data & ~1ull;

   if ((current_l1_data & INTEL_AUX_MAP_ENTRY_VALID_BIT) == 0) {
      return;
   } else {
      if (aux_map_debug)
         fprintf(stderr, "AUX-MAP [0x%x][0x%x][0x%x] L1 entry removed!\n",
                 l3_index, l2_index, l1_index);
      /**
       * We use non-zero bits in 63:1 to indicate the entry had been filled
       * previously. In the unlikely event that these are all zero, we force a
       * flush of the aux-map tables.
       */
      if (unlikely(l1_data == 0))
         *state_changed = true;
      *l1_entry = l1_data;
   }
}

void
intel_aux_map_unmap_range(struct intel_aux_map_context *ctx, uint64_t address,
                          uint64_t size)
{
   bool state_changed = false;
   pthread_mutex_lock(&ctx->mutex);
   if (aux_map_debug)
      fprintf(stderr, "AUX-MAP remove 0x%"PRIx64"-0x%"PRIx64"\n", address,
              address + size);

   uint64_t map_addr = address;
   assert(align64(address, INTEL_AUX_MAP_MAIN_PAGE_SIZE) == address);
   while (map_addr - address < size) {
      remove_mapping(ctx, map_addr, &state_changed);
      map_addr += INTEL_AUX_MAP_MAIN_PAGE_SIZE;
   }
   pthread_mutex_unlock(&ctx->mutex);
   if (state_changed)
      p_atomic_inc(&ctx->state_num);
}

uint32_t
intel_aux_map_get_num_buffers(struct intel_aux_map_context *ctx)
{
   return p_atomic_read(&ctx->num_buffers);
}

void
intel_aux_map_fill_bos(struct intel_aux_map_context *ctx, void **driver_bos,
                       uint32_t max_bos)
{
   assert(p_atomic_read(&ctx->num_buffers) >= max_bos);
   uint32_t i = 0;
   list_for_each_entry(struct aux_map_buffer, buf, &ctx->buffers, link) {
      if (i >= max_bos)
         return;
      driver_bos[i++] = buf->buffer->driver_bo;
   }
}

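/*
 * A typical caller (a sketch; the array handling is up to the driver) sizes
 * the output array with intel_aux_map_get_num_buffers() before filling it:
 *
 *    uint32_t num = intel_aux_map_get_num_buffers(ctx);
 *    void **bos = malloc(num * sizeof(void *));
 *    if (bos)
 *       intel_aux_map_fill_bos(ctx, bos, num);
 */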