/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The addition is skipped if the reloc is already present in the list, but
    any newly-referenced domains are still accounted for.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints a nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
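
/*
    A minimal usage sketch (illustrative only, not part of the driver): how a
    pipe driver is expected to drive the hooks described above, using the
    function pointers installed by radeon_drm_cs_init_functions() at the end
    of this file. 'buf', 'prio' (any enum radeon_bo_priority value) and the
    flush flags are placeholders.

       ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
                         RADEON_DOMAIN_VRAM, prio);
       if (!ws->cs_validate(cs)) {
          // Flush to make room, then re-add the buffer and retry once.
          ws->cs_flush(cs, 0, NULL);
          ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
                            RADEON_DOMAIN_VRAM, prio);
          if (!ws->cs_validate(cs))
             fprintf(stderr, "radeon: dropping the operation\n");
       }
       // Emit packets referencing 'buf', then ws->cs_flush() ends the IB.
*/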

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
   struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
   if (!ctx)
      return NULL;

   ctx->ws = (struct radeon_drm_winsys*)ws;
   ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
   return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   FREE(ctx);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
                                  bool *needs_reset)
{
   struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

   unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

   if (ctx->gpu_reset_counter == latest) {
      if (needs_reset)
         *needs_reset = false;
      return PIPE_NO_RESET;
   }

   if (needs_reset)
      *needs_reset = true;

   ctx->gpu_reset_counter = latest;
   return PIPE_UNKNOWN_CONTEXT_RESET;
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
   int i;

   csc->fd = ws->fd;

   csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
   csc->chunks[0].length_dw = 0;
   csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
   csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
   csc->chunks[1].length_dw = 0;
   csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
   csc->chunks[2].length_dw = 2;
   csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

   csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
   csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
   csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

   csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
   return true;
}
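
/*
    For reference, the layout handed to the DRM_RADEON_CS ioctl by the setup
    above (a sketch of the data flow, not additional code):

       csc->cs.chunks -> csc->chunk_array[3] (user pointers cast to uint64_t)
          chunk_array[0] -> chunks[0]: RADEON_CHUNK_ID_IB     -> csc->buf
          chunk_array[1] -> chunks[1]: RADEON_CHUNK_ID_RELOCS -> csc->relocs
          chunk_array[2] -> chunks[2]: RADEON_CHUNK_ID_FLAGS  -> csc->flags

    length_dw of the IB chunk is filled in at flush time, the RELOCS chunk
    grows by RELOC_DWORDS per added buffer, and the FLAGS chunk is always
    2 dwords.
*/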

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
   unsigned i;

   for (i = 0; i < csc->num_relocs; i++) {
      p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->relocs_bo[i].bo, NULL);
   }
   for (i = 0; i < csc->num_slab_buffers; ++i) {
      p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
      radeon_ws_bo_reference(&csc->slab_buffers[i].bo, NULL);
   }

   csc->num_relocs = 0;
   csc->num_validated_relocs = 0;
   csc->num_slab_buffers = 0;
   csc->chunks[0].length_dw = 0;
   csc->chunks[1].length_dw = 0;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
   radeon_cs_context_cleanup(csc);
   FREE(csc->slab_buffers);
   FREE(csc->relocs_bo);
   FREE(csc->relocs);
}


static bool
radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
                     struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx,
                     bool stop_exec_on_failure)
{
   struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
   struct radeon_drm_cs *cs;

   cs = CALLOC_STRUCT(radeon_drm_cs);
   if (!cs) {
      return false;
   }
   util_queue_fence_init(&cs->flush_completed);

   cs->ws = ws;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;

   if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
      FREE(cs);
      return false;
   }
   if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
      radeon_destroy_cs_context(&cs->csc1);
      FREE(cs);
      return false;
   }

   /* Set the first command buffer as current. */
   cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;
   cs->ring_type = ring_type;

   memset(rcs, 0, sizeof(*rcs));
   rcs->current.buf = cs->csc->buf;
   rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
   rcs->priv = cs;

   p_atomic_inc(&ws->num_cs);
   return true;
}
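
/*
    Note on the two contexts set up above: 'csc' is the command stream being
    recorded and 'cst' is the one whose submission may still be in flight on
    the cs_queue thread. radeon_drm_cs_flush() swaps them, so the next IB can
    be recorded while the previous one is submitted asynchronously, roughly:

       record into csc1 -> flush: swap, submit csc1 on a thread -> record into csc2 -> ...
*/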

int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   struct radeon_bo_item *buffers;
   unsigned num_buffers;
   int i = csc->reloc_indices_hashlist[hash];

   if (bo->handle) {
      buffers = csc->relocs_bo;
      num_buffers = csc->num_relocs;
   } else {
      buffers = csc->slab_buffers;
      num_buffers = csc->num_slab_buffers;
   }
   /* Fast path: empty hash slot (not found) or the slot already points at this BO (found). */
   if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of relocs linearly. */
   for (i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this reloc in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of relocs:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         csc->reloc_indices_hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}
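
/*
    The hash list above acts as a direct-mapped cache from bo->hash to the
    index of that BO's entry, with -1 meaning "empty slot". In short:

       i = reloc_indices_hashlist[bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1)];
       buffers[i].bo == bo  -> hit, O(1)
       otherwise            -> linear scan; on a match the slot is refreshed
                               to point at it, else -1 is returned
*/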

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   struct drm_radeon_cs_reloc *reloc;
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   int i = -1;

   i = radeon_lookup_buffer(csc, bo);

   if (i >= 0) {
      /* For async DMA, every add_buffer call must add a buffer to the list
       * no matter how many duplicates there are. This is due to the fact
       * the DMA CS checker doesn't use NOP packets for offset patching,
       * but always uses the i-th buffer from the list to patch the i-th
       * offset. If there are N offsets in a DMA CS, there must also be N
       * buffers in the relocation list.
       *
       * This doesn't have to be done if virtual memory is enabled,
       * because there is no offset patching with virtual memory.
       */
      if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
         return i;
      }
   }

   /* New relocation, check if the backing array is large enough. */
   if (csc->num_relocs >= csc->max_relocs) {
      uint32_t size;
      csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

      size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
      csc->relocs_bo = realloc(csc->relocs_bo, size);

      size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
      csc->relocs = realloc(csc->relocs, size);

      csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   }

   /* Initialize the new relocation. */
   csc->relocs_bo[csc->num_relocs].bo = NULL;
   csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
   radeon_ws_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
   p_atomic_inc(&bo->num_cs_references);
   reloc = &csc->relocs[csc->num_relocs];
   reloc->handle = bo->handle;
   reloc->read_domains = 0;
   reloc->write_domain = 0;
   reloc->flags = 0;

   csc->reloc_indices_hashlist[hash] = csc->num_relocs;

   csc->chunks[1].length_dw += RELOC_DWORDS;

   return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   unsigned hash;
   struct radeon_bo_item *item;
   int idx;
   int real_idx;

   idx = radeon_lookup_buffer(csc, bo);
   if (idx >= 0)
      return idx;

   real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

   /* Check if the backing array is large enough. */
   if (csc->num_slab_buffers >= csc->max_slab_buffers) {
      unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                              (unsigned)(csc->max_slab_buffers * 1.3));
      struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
         return -1;
      }

      csc->max_slab_buffers = new_max;
      csc->slab_buffers = new_buffers;
   }

   /* Initialize the new relocation. */
   idx = csc->num_slab_buffers++;
   item = &csc->slab_buffers[idx];

   item->bo = NULL;
   item->u.slab.real_idx = real_idx;
   radeon_ws_bo_reference(&item->bo, bo);
   p_atomic_inc(&bo->num_cs_references);

   hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   csc->reloc_indices_hashlist[hash] = idx;

   return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)buf;
   enum radeon_bo_domain added_domains;

   /* If VRAM is just stolen system memory, allow both VRAM and
    * GTT, whichever has free space. If a buffer is evicted from
    * VRAM to GTT, it will stay there.
    */
   if (!cs->ws->info.has_dedicated_vram)
      domains |= RADEON_DOMAIN_GTT;

   enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
   enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
   struct drm_radeon_cs_reloc *reloc;
   int index;

   if (!bo->handle) {
      index = radeon_lookup_or_add_slab_buffer(cs, bo);
      if (index < 0)
         return 0;

      index = cs->csc->slab_buffers[index].u.slab.real_idx;
   } else {
      index = radeon_lookup_or_add_real_buffer(cs, bo);
   }

   reloc = &cs->csc->relocs[index];
   added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
   reloc->read_domains |= rd;
   reloc->write_domain |= wd;
   reloc->flags = MAX2(reloc->flags, priority);
   cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;

   if (added_domains & RADEON_DOMAIN_VRAM)
      rcs->used_vram_kb += bo->base.size / 1024;
   else if (added_domains & RADEON_DOMAIN_GTT)
      rcs->used_gart_kb += bo->base.size / 1024;

   return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer *buf)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   bool status =
         rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
         rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;

   if (status) {
      cs->csc->num_validated_relocs = cs->csc->num_relocs;
   } else {
      /* Remove recently-added buffers. The validation failed with them
       * and the CS is about to be flushed because of that. Keep only
       * the already-validated buffers. */
      unsigned i;

      for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
         p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
         radeon_ws_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
      }
      cs->csc->num_relocs = cs->csc->num_validated_relocs;

      /* Flush if there are any relocs. Clean up otherwise. */
      if (cs->csc->num_relocs) {
         cs->flush_cs(cs->flush_data,
                      RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      } else {
         radeon_cs_context_cleanup(cs->csc);
         rcs->used_vram_kb = 0;
         rcs->used_gart_kb = 0;

         assert(rcs->current.cdw == 0);
         if (rcs->current.cdw != 0) {
            fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
         }
      }
   }
   return status;
}
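
/*
    A worked example of the 0.8 check in radeon_drm_cs_validate (numbers are
    made up): with vram_size_kb = 1048576 (1 GiB), validation starts failing
    once used_vram_kb reaches about 838861 kB (80% of VRAM), leaving ~20% as
    slack for fragmentation. The pipe driver is then expected to flush and
    retry the failing operation once, as described at the top of this file.
*/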

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
                                      bool force_chaining)
{
   assert(rcs->current.cdw <= rcs->current.max_dw);
   return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   int i;

   if (list) {
      for (i = 0; i < cs->csc->num_relocs; i++) {
         list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
         list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
         list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
      }
   }
   return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
{
   struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
   unsigned i;
   int r;

   r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                           &csc->cs, sizeof(struct drm_radeon_cs));
   if (r) {
      if (r == -ENOMEM)
         fprintf(stderr, "radeon: Not enough memory for command submission.\n");
      else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
         unsigned i;

         fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
         for (i = 0; i < csc->chunks[0].length_dw; i++) {
            fprintf(stderr, "0x%08X\n", csc->buf[i]);
         }
      } else {
         fprintf(stderr, "radeon: The kernel rejected CS, "
                         "see dmesg for more information (%i).\n", r);
      }
   }

   for (i = 0; i < csc->num_relocs; i++)
      p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
   for (i = 0; i < csc->num_slab_buffers; i++)
      p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

   radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   if (util_queue_is_initialized(&cs->ws->cs_queue))
      util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
   unsigned dst;

   assert(fence->num_cs_references);

   /* Cleanup older fences */
   dst = 0;
   for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
      if (bo->u.slab.fences[src]->num_cs_references) {
         bo->u.slab.fences[dst] = bo->u.slab.fences[src];
         dst++;
      } else {
         radeon_ws_bo_reference(&bo->u.slab.fences[src], NULL);
      }
   }
   bo->u.slab.num_fences = dst;

   /* Check available space for the new fence */
   if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
      unsigned new_max_fences = bo->u.slab.max_fences + 1;
      struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                              bo->u.slab.max_fences * sizeof(*new_fences),
                                              new_max_fences * sizeof(*new_fences));
      if (!new_fences) {
         fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
         return;
      }

      bo->u.slab.fences = new_fences;
      bo->u.slab.max_fences = new_max_fences;
   }

   /* Add the new fence */
   bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
   radeon_ws_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
   bo->u.slab.num_fences++;
}

static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_cs_context *tmp;

   switch (cs->ring_type) {
   case RING_DMA:
      /* pad DMA ring to 8 DWs */
      if (cs->ws->info.chip_class <= GFX6) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case RING_GFX:
      /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
       * r6xx requires at least 4 dw alignment to avoid a hw bug.
       */
      if (cs->ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
      }
      break;
   case RING_UVD:
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "radeon: command stream overflowed\n");
   }

   if (pfence || cs->csc->num_slab_buffers) {
      struct pipe_fence_handle *fence;

      if (cs->next_fence) {
         fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         fence = radeon_cs_create_fence(rcs);
      }

      if (fence) {
         if (pfence)
            radeon_fence_reference(pfence, fence);

         mtx_lock(&cs->ws->bo_fence_lock);
         for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
            p_atomic_inc(&bo->num_active_ioctls);
            radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
         }
         mtx_unlock(&cs->ws->bo_fence_lock);

         radeon_fence_reference(&fence, NULL);
      }
   } else {
      radeon_fence_reference(&cs->next_fence, NULL);
   }

   radeon_drm_cs_sync_flush(rcs);

   /* Swap command streams. */
   tmp = cs->csc;
   cs->csc = cs->cst;
   cs->cst = tmp;

   /* If the CS is neither empty nor overflowed, emit it in a separate thread. */
   if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
       !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
      unsigned i, num_relocs;

      num_relocs = cs->cst->num_relocs;

      cs->cst->chunks[0].length_dw = rcs->current.cdw;

      for (i = 0; i < num_relocs; i++) {
         /* Update the number of active asynchronous CS ioctls for the buffer. */
         p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
      }

      switch (cs->ring_type) {
      case RING_DMA:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_DMA;
         cs->cst->cs.num_chunks = 3;
         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
         }
         break;

      case RING_UVD:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_UVD;
         cs->cst->cs.num_chunks = 3;
         break;

      case RING_VCE:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_VCE;
         cs->cst->cs.num_chunks = 3;
         break;

      default:
      case RING_GFX:
      case RING_COMPUTE:
         cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
         cs->cst->flags[1] = RADEON_CS_RING_GFX;
         cs->cst->cs.num_chunks = 3;

         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
         }
         if (flags & PIPE_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
         }
         if (cs->ring_type == RING_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
         }
         break;
      }

      if (util_queue_is_initialized(&cs->ws->cs_queue)) {
         util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                            radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
         if (!(flags & PIPE_FLUSH_ASYNC))
            radeon_drm_cs_sync_flush(rcs);
      } else {
         radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
      }
   } else {
      radeon_cs_context_cleanup(cs->cst);
   }

   /* Prepare a new CS. */
   rcs->current.buf = cs->csc->buf;
   rcs->current.cdw = 0;
   rcs->used_vram_kb = 0;
   rcs->used_gart_kb = 0;

   if (cs->ring_type == RING_GFX)
      cs->ws->num_gfx_IBs++;
   else if (cs->ring_type == RING_DMA)
      cs->ws->num_sdma_IBs++;
   return 0;
}

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   if (!cs)
      return;

   radeon_drm_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   radeon_cs_context_cleanup(&cs->csc1);
   radeon_cs_context_cleanup(&cs->csc2);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_destroy_cs_context(&cs->csc1);
   radeon_destroy_cs_context(&cs->csc2);
   radeon_fence_reference(&cs->next_fence, NULL);
   FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)_buf;
   int index;

   if (!bo->num_cs_references)
      return false;

   index = radeon_lookup_buffer(cs->csc, bo);
   if (index == -1)
      return false;

   if (!bo->handle)
      index = cs->csc->slab_buffers[index].u.slab.real_idx;

   if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
      return true;
   if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
      return true;

   return false;
}

/* FENCES */

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pb_buffer *fence;

   /* Create a fence, which is a dummy BO. */
   fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                      RADEON_DOMAIN_GTT,
                                      RADEON_FLAG_NO_SUBALLOC
                                      | RADEON_FLAG_NO_INTERPROCESS_SHARING);
   if (!fence)
      return NULL;

   /* Add the fence as a dummy relocation. */
   cs->ws->base.cs_add_buffer(rcs, fence,
                              RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                              RADEON_PRIO_FENCE);
   return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
   return ws->buffer_wait(ws, (struct pb_buffer*)fence, timeout,
                          RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->next_fence) {
      radeon_fence_reference(&fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->next_fence, fence);
   return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1                   Thread 2 / Context 2
    *  --------------------                   --------------------
    *  f = cs_get_next_fence()
    *                                         cs_add_fence_dependency(f)
    *                                         cs_flush()
    *  cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
   ws->base.ctx_create = radeon_drm_ctx_create;
   ws->base.ctx_destroy = radeon_drm_ctx_destroy;
   ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
   ws->base.cs_create = radeon_drm_cs_create;
   ws->base.cs_destroy = radeon_drm_cs_destroy;
   ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
   ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
   ws->base.cs_validate = radeon_drm_cs_validate;
   ws->base.cs_check_space = radeon_drm_cs_check_space;
   ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
   ws->base.cs_flush = radeon_drm_cs_flush;
   ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
   ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
   ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
   ws->base.fence_wait = radeon_fence_wait;
   ws->base.fence_reference = radeon_fence_reference;
}