v3dvx_meta_common.c revision 7ec681f3
1/*
2 * Copyright © 2021 Raspberry Pi
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "v3dv_private.h"
25#include "v3dv_meta_common.h"
26
27#include "broadcom/common/v3d_macros.h"
28#include "broadcom/cle/v3dx_pack.h"
29#include "broadcom/compiler/v3d_compiler.h"
30
31#include "vk_format_info.h"
32
/* Parameters describing a clear performed as part of an RCL meta job.
 * Passed to emit_rcl_prologue(); may be NULL there when no clear is needed.
 */
struct rcl_clear_info {
   const union v3dv_clear_value *clear_value; /* Color or Z/S clear value */
   struct v3dv_image *image;  /* Target image; may be NULL (only used to
                               * compute UIF clear padding).
                               */
   VkImageAspectFlags aspects; /* Which aspects (color/depth/stencil) to clear */
   uint32_t level;             /* Mip level, used to select image->slices[] */
};
39
/* Emits the common prologue of a meta-operation render control list:
 * tile rendering mode configuration, optional clear color / Z-S clear
 * values, render target setup and the initial tile list block size.
 *
 * Returns the job's RCL on success, or NULL if the command buffer hit an
 * out-of-memory condition while reserving CL space.
 */
static struct v3dv_cl *
emit_rcl_prologue(struct v3dv_job *job,
                  struct v3dv_meta_framebuffer *fb,
                  const struct rcl_clear_info *clear_info)
{
   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   struct v3dv_cl *rcl = &job->rcl;
   /* Conservative reservation: fixed overhead plus room for the per-layer
    * supertile coordinate packets that will be emitted into this RCL later.
    */
   v3dv_cl_ensure_space_with_branch(rcl, 200 +
                                    tiling->layers * 256 *
                                    cl_packet_length(SUPERTILE_COORDINATES));
   if (job->cmd_buffer->state.oom)
      return NULL;

   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
      config.early_z_disable = true;
      config.image_width_pixels = tiling->width;
      config.image_height_pixels = tiling->height;
      config.number_of_render_targets = 1;
      config.multisample_mode_4x = tiling->msaa;
      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
      config.internal_depth_type = fb->internal_depth_type;
   }

   if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
      /* For UIF-tiled images, if the slice's padded height exceeds the
       * implicit padding by too much we must program the padded height
       * explicitly in the PART3 clear packet below.
       * NOTE(review): the >= 15 threshold presumably matches the width of
       * the implicit-padding field in the packet — confirm against HW docs.
       */
      uint32_t clear_pad = 0;
      if (clear_info->image) {
         const struct v3dv_image *image = clear_info->image;
         const struct v3d_resource_slice *slice =
            &image->slices[clear_info->level];
         if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
             slice->tiling == V3D_TILING_UIF_XOR) {
            int uif_block_height = v3d_utile_height(image->cpp) * 2;

            uint32_t implicit_padded_height =
               align(tiling->height, uif_block_height) / uif_block_height;

            if (slice->padded_height_of_output_image_in_uif_blocks -
                implicit_padded_height >= 15) {
               clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
            }
         }
      }

      /* The 128-bit clear color is split across up to three packets; how
       * many are needed depends on the internal bpp of the render target.
       */
      const uint32_t *color = &clear_info->clear_value->color[0];
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
         clear.clear_color_low_32_bits = color[0];
         clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
         clear.render_target_number = 0;
      };

      if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
            clear.clear_color_mid_low_32_bits =
              ((color[1] >> 24) | (color[2] << 8));
            clear.clear_color_mid_high_24_bits =
              ((color[2] >> 24) | ((color[3] & 0xffff) << 8));
            clear.render_target_number = 0;
         };
      }

      /* PART3 is also required whenever an explicit UIF padded height must
       * be programmed, even for narrower internal formats.
       */
      if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
            clear.uif_padded_height_in_uif_blocks = clear_pad;
            clear.clear_color_high_16_bits = color[3] >> 16;
            clear.render_target_number = 0;
         };
      }
   }

   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
      rt.render_target_0_internal_bpp = tiling->internal_bpp;
      rt.render_target_0_internal_type = fb->internal_type;
      rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
   }

   /* Z/S clear values default to depth 1.0 and stencil 0 when this job is
    * not performing a clear.
    */
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
      clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
      clear.stencil_clear_value = clear_info ? clear_info->clear_value->s : 0;
   };

   cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
      init.use_auto_chained_tile_lists = true;
      init.size_of_first_block_in_chained_tile_lists =
         TILE_ALLOCATION_BLOCK_SIZE_64B;
   }

   return rcl;
}
129
/* Emits the per-frame RCL setup after the prologue: the tile list base
 * address (offset to start at min_layer), the supertile configuration and
 * the initial dummy tile stores required by the GFXH-1742 workaround.
 *
 * If clear_value is not NULL, a tile buffer clear is folded into the first
 * workaround iteration.
 */
static void
emit_frame_setup(struct v3dv_job *job,
                 uint32_t min_layer,
                 const union v3dv_clear_value *clear_value)
{
   v3dv_return_if_oom(NULL, job);

   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   struct v3dv_cl *rcl = &job->rcl;

   /* 64 bytes of tile allocation memory per tile, skipping the tiles of
    * the layers below min_layer.
    */
   const uint32_t tile_alloc_offset =
      64 * min_layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
   cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
   }

   cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
      config.number_of_bin_tile_lists = 1;
      config.total_frame_width_in_tiles = tiling->draw_tiles_x;
      config.total_frame_height_in_tiles = tiling->draw_tiles_y;

      config.supertile_width_in_tiles = tiling->supertile_width;
      config.supertile_height_in_tiles = tiling->supertile_height;

      config.total_frame_width_in_supertiles =
         tiling->frame_width_in_supertiles;
      config.total_frame_height_in_supertiles =
         tiling->frame_height_in_supertiles;
   }

   /* Implement GFXH-1742 workaround. Also, if we are clearing we have to do
    * it here.
    */
   for (int i = 0; i < 2; i++) {
      cl_emit(rcl, TILE_COORDINATES, coords);
      cl_emit(rcl, END_OF_LOADS, end);
      cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
         store.buffer_to_store = NONE;
      }
      /* Only clear on the first iteration of the workaround loop. */
      if (clear_value && i == 0) {
         cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
            clear.clear_z_stencil_buffer = true;
            clear.clear_all_render_targets = true;
         }
      }
      cl_emit(rcl, END_OF_TILE_MARKER, end);
   }

   cl_emit(rcl, FLUSH_VCD_CACHE, flush);
}
181
182static void
183emit_supertile_coordinates(struct v3dv_job *job,
184                           struct v3dv_meta_framebuffer *framebuffer)
185{
186   v3dv_return_if_oom(NULL, job);
187
188   struct v3dv_cl *rcl = &job->rcl;
189
190   const uint32_t min_y = framebuffer->min_y_supertile;
191   const uint32_t max_y = framebuffer->max_y_supertile;
192   const uint32_t min_x = framebuffer->min_x_supertile;
193   const uint32_t max_x = framebuffer->max_x_supertile;
194
195   for (int y = min_y; y <= max_y; y++) {
196      for (int x = min_x; x <= max_x; x++) {
197         cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
198            coords.column_number_in_supertiles = x;
199            coords.row_number_in_supertiles = y;
200         }
201      }
202   }
203}
204
205static void
206emit_linear_load(struct v3dv_cl *cl,
207                 uint32_t buffer,
208                 struct v3dv_bo *bo,
209                 uint32_t offset,
210                 uint32_t stride,
211                 uint32_t format)
212{
213   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
214      load.buffer_to_load = buffer;
215      load.address = v3dv_cl_address(bo, offset);
216      load.input_image_format = format;
217      load.memory_format = V3D_TILING_RASTER;
218      load.height_in_ub_or_stride = stride;
219      load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
220   }
221}
222
223static void
224emit_linear_store(struct v3dv_cl *cl,
225                  uint32_t buffer,
226                  struct v3dv_bo *bo,
227                  uint32_t offset,
228                  uint32_t stride,
229                  bool msaa,
230                  uint32_t format)
231{
232   cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
233      store.buffer_to_store = RENDER_TARGET_0;
234      store.address = v3dv_cl_address(bo, offset);
235      store.clear_buffer_being_stored = false;
236      store.output_image_format = format;
237      store.memory_format = V3D_TILING_RASTER;
238      store.height_in_ub_or_stride = stride;
239      store.decimate_mode = msaa ? V3D_DECIMATE_MODE_ALL_SAMPLES :
240                                   V3D_DECIMATE_MODE_SAMPLE_0;
241   }
242}
243
244/* This chooses a tile buffer format that is appropriate for the copy operation.
245 * Typically, this is the image render target type, however, if we are copying
246 * depth/stencil to/from a buffer the hardware can't do raster loads/stores, so
247 * we need to load and store to/from a tile color buffer using a compatible
248 * color format.
249 */
250static uint32_t
251choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer,
252                  VkImageAspectFlags aspect,
253                  bool for_store,
254                  bool is_copy_to_buffer,
255                  bool is_copy_from_buffer)
256{
257   if (is_copy_to_buffer || is_copy_from_buffer) {
258      switch (framebuffer->vk_format) {
259      case VK_FORMAT_D16_UNORM:
260         return V3D_OUTPUT_IMAGE_FORMAT_R16UI;
261      case VK_FORMAT_D32_SFLOAT:
262         return V3D_OUTPUT_IMAGE_FORMAT_R32F;
263      case VK_FORMAT_X8_D24_UNORM_PACK32:
264         return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
265      case VK_FORMAT_D24_UNORM_S8_UINT:
266         /* When storing the stencil aspect of a combined depth/stencil image
267          * to a buffer, the Vulkan spec states that the output buffer must
268          * have packed stencil values, so we choose an R8UI format for our
269          * store outputs. For the load input we still want RGBA8UI since the
270          * source image contains 4 channels (including the 3 channels
271          * containing the 24-bit depth value).
272          *
273          * When loading the stencil aspect of a combined depth/stencil image
274          * from a buffer, we read packed 8-bit stencil values from the buffer
275          * that we need to put into the LSB of the 32-bit format (the R
276          * channel), so we use R8UI. For the store, if we used R8UI then we
277          * would write 8-bit stencil values consecutively over depth channels,
278          * so we need to use RGBA8UI. This will write each stencil value in
279          * its correct position, but will overwrite depth values (channels G
280          * B,A) with undefined values. To fix this,  we will have to restore
281          * the depth aspect from the Z tile buffer, which we should pre-load
282          * from the image before the store).
283          */
284         if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) {
285            return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
286         } else {
287            assert(aspect & VK_IMAGE_ASPECT_STENCIL_BIT);
288            if (is_copy_to_buffer) {
289               return for_store ? V3D_OUTPUT_IMAGE_FORMAT_R8UI :
290                                  V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
291            } else {
292               assert(is_copy_from_buffer);
293               return for_store ? V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI :
294                                  V3D_OUTPUT_IMAGE_FORMAT_R8UI;
295            }
296         }
297      default: /* Color formats */
298         return framebuffer->format->rt_type;
299         break;
300      }
301   } else {
302      return framebuffer->format->rt_type;
303   }
304}
305
306static inline bool
307format_needs_rb_swap(struct v3dv_device *device,
308                     VkFormat format)
309{
310   const uint8_t *swizzle = v3dv_get_format_swizzle(device, format);
311   return swizzle[0] == PIPE_SWIZZLE_Z;
312}
313
/* Emits a tile buffer load from one layer/mip level of an image, handling
 * tiling mode, tile buffer selection for depth/stencil copies and the
 * D24 channel-order workaround described below.
 */
static void
emit_image_load(struct v3dv_device *device,
                struct v3dv_cl *cl,
                struct v3dv_meta_framebuffer *framebuffer,
                struct v3dv_image *image,
                VkImageAspectFlags aspect,
                uint32_t layer,
                uint32_t mip_level,
                bool is_copy_to_buffer,
                bool is_copy_from_buffer)
{
   uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);

   /* For image to/from buffer copies we always load to and store from RT0,
    * even for depth/stencil aspects, because the hardware can't do raster
    * stores or loads from/to the depth/stencil tile buffers.
    */
   bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
                            aspect == VK_IMAGE_ASPECT_COLOR_BIT;

   const struct v3d_resource_slice *slice = &image->slices[mip_level];
   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
      load.buffer_to_load = load_to_color_tlb ?
         RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);

      load.address = v3dv_cl_address(image->mem->bo, layer_offset);

      load.input_image_format = choose_tlb_format(framebuffer, aspect, false,
                                                  is_copy_to_buffer,
                                                  is_copy_from_buffer);
      load.memory_format = slice->tiling;

      /* When copying depth/stencil images to a buffer, for D24 formats Vulkan
       * expects the depth value in the LSB bits of each 32-bit pixel.
       * Unfortunately, the hardware seems to put the S8/X8 bits there and the
       * depth bits on the MSB. To work around that we can reverse the channel
       * order and then swap the R/B channels to get what we want.
       *
       * NOTE: reversing and swapping only gets us the behavior we want if the
       * operations happen in that exact order, which seems to be the case when
       * done on the tile buffer load operations. On the store, it seems the
       * order is not the same. The order on the store is probably reversed so
       * that reversing and swapping on both the load and the store preserves
       * the original order of the channels in memory.
       *
       * Notice that we only need to do this when copying to a buffer, where
       * depth and stencil aspects are copied as separate regions and
       * the spec expects them to be tightly packed.
       */
      bool needs_rb_swap = false;
      bool needs_chan_reverse = false;
      if (is_copy_to_buffer &&
         (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
          (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
           (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
         needs_rb_swap = true;
         needs_chan_reverse = true;
      } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
                 (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
         /* This is not a raw data copy (i.e. we are clearing the image),
          * so we need to make sure we respect the format swizzle.
          */
         needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
      }

      load.r_b_swap = needs_rb_swap;
      load.channel_reverse = needs_chan_reverse;

      /* UIF tilings use the padded height in UIF blocks; raster uses the
       * byte stride. Other (utile-based) tilings leave the field unset.
       */
      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         load.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         load.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else
         load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}
396
/* Emits a tile buffer store into one layer/mip level of an image. Mirrors
 * emit_image_load(): same tile buffer selection and D24 channel-order
 * workaround, plus 4x decimation when this store implements a multisample
 * resolve.
 */
static void
emit_image_store(struct v3dv_device *device,
                 struct v3dv_cl *cl,
                 struct v3dv_meta_framebuffer *framebuffer,
                 struct v3dv_image *image,
                 VkImageAspectFlags aspect,
                 uint32_t layer,
                 uint32_t mip_level,
                 bool is_copy_to_buffer,
                 bool is_copy_from_buffer,
                 bool is_multisample_resolve)
{
   uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);

   /* Buffer copies always go through the color tile buffer (RT0), since
    * the hardware can't do raster stores from the Z/S tile buffers.
    */
   bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
                               aspect == VK_IMAGE_ASPECT_COLOR_BIT;

   const struct v3d_resource_slice *slice = &image->slices[mip_level];
   cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
      store.buffer_to_store = store_from_color_tlb ?
         RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);

      store.address = v3dv_cl_address(image->mem->bo, layer_offset);
      store.clear_buffer_being_stored = false;

      /* See rationale in emit_image_load() */
      bool needs_rb_swap = false;
      bool needs_chan_reverse = false;
      if (is_copy_from_buffer &&
         (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
          (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
           (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
         needs_rb_swap = true;
         needs_chan_reverse = true;
      } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
                 (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
         /* Not a raw copy: respect the format's swizzle. */
         needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
      }

      store.r_b_swap = needs_rb_swap;
      store.channel_reverse = needs_chan_reverse;

      store.output_image_format = choose_tlb_format(framebuffer, aspect, true,
                                                    is_copy_to_buffer,
                                                    is_copy_from_buffer);
      store.memory_format = slice->tiling;
      /* UIF tilings use the padded height in UIF blocks; raster uses the
       * byte stride. Other (utile-based) tilings leave the field unset.
       */
      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         store.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         store.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else if (is_multisample_resolve)
         store.decimate_mode = V3D_DECIMATE_MODE_4X;
      else
         store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}
459
/* Builds the generic (per-supertile) tile list for copying one layer of an
 * image into a buffer: load the image layer into the TLB, then store the
 * TLB linearly at the buffer offset corresponding to this layer, and link
 * the list into the job's RCL.
 */
static void
emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job,
                                        struct v3dv_meta_framebuffer *framebuffer,
                                        struct v3dv_buffer *buffer,
                                        struct v3dv_image *image,
                                        uint32_t layer_offset,
                                        const VkBufferImageCopy2KHR *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   /* Load image to TLB */
   assert((image->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->imageSubresource.layerCount) ||
          layer_offset < image->vk.extent.depth);

   /* For 3D images layer_offset indexes depth slices from imageOffset.z;
    * otherwise it indexes array layers from baseArrayLayer.
    */
   const uint32_t image_layer = image->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->imageSubresource.baseArrayLayer + layer_offset :
      region->imageOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, image,
                   region->imageSubresource.aspectMask,
                   image_layer,
                   region->imageSubresource.mipLevel,
                   true, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Store TLB to buffer */
   /* Per the Vulkan spec, bufferRowLength/bufferImageHeight of 0 mean
    * "tightly packed", i.e. equal to the image extent.
    */
   uint32_t width, height;
   if (region->bufferRowLength == 0)
      width = region->imageExtent.width;
   else
      width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      height = region->imageExtent.height;
   else
      height = region->bufferImageHeight;

   /* Handle copy from compressed format */
   width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format));
   height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format));

   /* If we are storing stencil from a combined depth/stencil format the
    * Vulkan spec states that the output buffer must have packed stencil
    * values, where each stencil value is 1 byte.
    */
   uint32_t cpp =
      region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
         1 : image->cpp;
   uint32_t buffer_stride = width * cpp;
   /* Each layer is stored at the next whole-layer offset in the buffer. */
   uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset +
                            height * buffer_stride * layer_offset;

   uint32_t format = choose_tlb_format(framebuffer,
                                       region->imageSubresource.aspectMask,
                                       true, true, false);
   bool msaa = image->vk.samples > VK_SAMPLE_COUNT_1_BIT;

   emit_linear_store(cl, RENDER_TARGET_0, buffer->mem->bo,
                     buffer_offset, buffer_stride, msaa, format);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Link the generic tile list we just built into the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
539
540static void
541emit_copy_layer_to_buffer(struct v3dv_job *job,
542                          struct v3dv_buffer *buffer,
543                          struct v3dv_image *image,
544                          struct v3dv_meta_framebuffer *framebuffer,
545                          uint32_t layer,
546                          const VkBufferImageCopy2KHR *region)
547{
548   emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer,
549                                           image, layer, region);
550   emit_supertile_coordinates(job, framebuffer);
551}
552
553void
554v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job,
555                                         struct v3dv_buffer *buffer,
556                                         struct v3dv_image *image,
557                                         struct v3dv_meta_framebuffer *framebuffer,
558                                         const VkBufferImageCopy2KHR *region)
559{
560   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
561   v3dv_return_if_oom(NULL, job);
562
563   emit_frame_setup(job, 0, NULL);
564   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
565      emit_copy_layer_to_buffer(job, buffer, image, framebuffer, layer, region);
566   cl_emit(rcl, END_OF_RENDERING, end);
567}
568
/* Builds the generic tile list for resolving one layer of a multisampled
 * source image into the destination image: load the source layer into the
 * TLB and store it to the destination with multisample-resolve decimation.
 */
static void
emit_resolve_image_layer_per_tile_list(struct v3dv_job *job,
                                       struct v3dv_meta_framebuffer *framebuffer,
                                       struct v3dv_image *dst,
                                       struct v3dv_image *src,
                                       uint32_t layer_offset,
                                       const VkImageResolve2KHR *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((src->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->srcSubresource.layerCount) ||
          layer_offset < src->vk.extent.depth);

   /* For 3D images layer_offset indexes depth slices from srcOffset.z;
    * otherwise it indexes array layers from baseArrayLayer.
    */
   const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer + layer_offset :
      region->srcOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, src,
                   region->srcSubresource.aspectMask,
                   src_layer,
                   region->srcSubresource.mipLevel,
                   false, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   assert((dst->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->dstSubresource.layerCount) ||
          layer_offset < dst->vk.extent.depth);

   const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer + layer_offset :
      region->dstOffset.z + layer_offset;

   /* Last argument (is_multisample_resolve = true) selects 4x decimation
    * on the store, which performs the actual resolve.
    */
   emit_image_store(job->device, cl, framebuffer, dst,
                    region->dstSubresource.aspectMask,
                    dst_layer,
                    region->dstSubresource.mipLevel,
                    false, false, true);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Link the generic tile list we just built into the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
626
627static void
628emit_resolve_image_layer(struct v3dv_job *job,
629                         struct v3dv_image *dst,
630                         struct v3dv_image *src,
631                         struct v3dv_meta_framebuffer *framebuffer,
632                         uint32_t layer,
633                         const VkImageResolve2KHR *region)
634{
635   emit_resolve_image_layer_per_tile_list(job, framebuffer,
636                                          dst, src, layer, region);
637   emit_supertile_coordinates(job, framebuffer);
638}
639
640void
641v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job,
642                                  struct v3dv_image *dst,
643                                  struct v3dv_image *src,
644                                  struct v3dv_meta_framebuffer *framebuffer,
645                                  const VkImageResolve2KHR *region)
646{
647   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
648   v3dv_return_if_oom(NULL, job);
649
650   emit_frame_setup(job, 0, NULL);
651   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
652      emit_resolve_image_layer(job, dst, src, framebuffer, layer, region);
653   cl_emit(rcl, END_OF_RENDERING, end);
654}
655
/* Builds the generic tile list for a buffer-to-buffer copy: linear load
 * from the source BO into RT0, then linear store from RT0 into the
 * destination BO, and links the list into the job's RCL.
 */
static void
emit_copy_buffer_per_tile_list(struct v3dv_job *job,
                               struct v3dv_bo *dst,
                               struct v3dv_bo *src,
                               uint32_t dst_offset,
                               uint32_t src_offset,
                               uint32_t stride,
                               uint32_t format)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   emit_linear_load(cl, RENDER_TARGET_0, src, src_offset, stride, format);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Buffer copies are never multisampled (msaa = false). */
   emit_linear_store(cl, RENDER_TARGET_0,
                     dst, dst_offset, stride, false, format);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Link the generic tile list we just built into the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
691
692void
693v3dX(meta_emit_copy_buffer)(struct v3dv_job *job,
694                            struct v3dv_bo *dst,
695                            struct v3dv_bo *src,
696                            uint32_t dst_offset,
697                            uint32_t src_offset,
698                            struct v3dv_meta_framebuffer *framebuffer,
699                            uint32_t format,
700                            uint32_t item_size)
701{
702   const uint32_t stride = job->frame_tiling.width * item_size;
703   emit_copy_buffer_per_tile_list(job, dst, src,
704                                  dst_offset, src_offset,
705                                  stride, format);
706   emit_supertile_coordinates(job, framebuffer);
707}
708
709void
710v3dX(meta_emit_copy_buffer_rcl)(struct v3dv_job *job,
711                                struct v3dv_bo *dst,
712                                struct v3dv_bo *src,
713                                uint32_t dst_offset,
714                                uint32_t src_offset,
715                                struct v3dv_meta_framebuffer *framebuffer,
716                                uint32_t format,
717                                uint32_t item_size)
718{
719   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
720   v3dv_return_if_oom(NULL, job);
721
722   emit_frame_setup(job, 0, NULL);
723
724   v3dX(meta_emit_copy_buffer)(job, dst, src, dst_offset, src_offset,
725                               framebuffer, format, item_size);
726
727   cl_emit(rcl, END_OF_RENDERING, end);
728}
729
/* Builds the generic tile list for copying one layer of the source image
 * into the destination image: load the source layer into the TLB and
 * store it to the destination layer, then link the list into the RCL.
 */
static void
emit_copy_image_layer_per_tile_list(struct v3dv_job *job,
                                    struct v3dv_meta_framebuffer *framebuffer,
                                    struct v3dv_image *dst,
                                    struct v3dv_image *src,
                                    uint32_t layer_offset,
                                    const VkImageCopy2KHR *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((src->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->srcSubresource.layerCount) ||
          layer_offset < src->vk.extent.depth);

   /* For 3D images layer_offset indexes depth slices from srcOffset.z;
    * otherwise it indexes array layers from baseArrayLayer.
    */
   const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer + layer_offset :
      region->srcOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, src,
                   region->srcSubresource.aspectMask,
                   src_layer,
                   region->srcSubresource.mipLevel,
                   false, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   assert((dst->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->dstSubresource.layerCount) ||
          layer_offset < dst->vk.extent.depth);

   const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer + layer_offset :
      region->dstOffset.z + layer_offset;

   emit_image_store(job->device, cl, framebuffer, dst,
                    region->dstSubresource.aspectMask,
                    dst_layer,
                    region->dstSubresource.mipLevel,
                    false, false, false);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Link the generic tile list we just built into the job's RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
787
/* Emits the per-tile copy list for a single layer followed by the
 * supertile coordinates that reference it. Called once per layer.
 */
static void
emit_copy_image_layer(struct v3dv_job *job,
                      struct v3dv_image *dst,
                      struct v3dv_image *src,
                      struct v3dv_meta_framebuffer *framebuffer,
                      uint32_t layer,
                      const VkImageCopy2KHR *region)
{
   emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region);
   emit_supertile_coordinates(job, framebuffer);
}
799
800void
801v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job,
802                               struct v3dv_image *dst,
803                               struct v3dv_image *src,
804                               struct v3dv_meta_framebuffer *framebuffer,
805                               const VkImageCopy2KHR *region)
806{
807   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
808   v3dv_return_if_oom(NULL, job);
809
810   emit_frame_setup(job, 0, NULL);
811   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
812      emit_copy_image_layer(job, dst, src, framebuffer, layer, region);
813   cl_emit(rcl, END_OF_RENDERING, end);
814}
815
/* Emits a TFU (Texture Formatting Unit) job that copies a width x height
 * region from one mip level/layer of src to one mip level/layer of dst,
 * converting between the tiling layouts of the two slices.
 */
void
v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
                        struct v3dv_image *dst,
                        uint32_t dst_mip_level,
                        uint32_t dst_layer,
                        struct v3dv_image *src,
                        uint32_t src_mip_level,
                        uint32_t src_layer,
                        uint32_t width,
                        uint32_t height,
                        const struct v3dv_format *format)
{
   const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level];
   const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level];

   assert(dst->mem && dst->mem->bo);
   const struct v3dv_bo *dst_bo = dst->mem->bo;

   assert(src->mem && src->mem->bo);
   const struct v3dv_bo *src_bo = src->mem->bo;

   struct drm_v3d_submit_tfu tfu = {
      /* IOS packs height in the high 16 bits and width in the low 16. */
      .ios = (height << 16) | width,
      .bo_handles = {
         dst_bo->handle,
         /* Don't list the same BO handle twice when src and dst share it. */
         src_bo->handle != dst_bo->handle ? src_bo->handle : 0
      },
   };

   /* Input image address: start of the source mip level/layer. */
   const uint32_t src_offset =
      src_bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer);
   tfu.iia |= src_offset;

   /* Input format: raster, or one of the tiled layouts expressed as an
    * offset from LINEARTILE (the tiling enums are laid out so this works).
    */
   uint32_t icfg;
   if (src_slice->tiling == V3D_TILING_RASTER) {
      icfg = V3D_TFU_ICFG_FORMAT_RASTER;
   } else {
      icfg = V3D_TFU_ICFG_FORMAT_LINEARTILE +
             (src_slice->tiling - V3D_TILING_LINEARTILE);
   }
   tfu.icfg |= icfg << V3D_TFU_ICFG_FORMAT_SHIFT;

   /* Output image address: start of the destination mip level/layer. */
   const uint32_t dst_offset =
      dst_bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer);
   tfu.ioa |= dst_offset;

   /* Output format uses the same LINEARTILE-relative encoding; note there
    * is no raster case for the output here.
    */
   tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE +
               (dst_slice->tiling - V3D_TILING_LINEARTILE)) <<
                V3D_TFU_IOA_FORMAT_SHIFT;
   tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT;

   /* Input image stride: for UIF layouts it is the padded height in UIF
    * blocks (a UIF block is 2 utiles tall); for raster it is the row
    * stride in pixels.
    */
   switch (src_slice->tiling) {
   case V3D_TILING_UIF_NO_XOR:
   case V3D_TILING_UIF_XOR:
      tfu.iis |= src_slice->padded_height / (2 * v3d_utile_height(src->cpp));
      break;
   case V3D_TILING_RASTER:
      tfu.iis |= src_slice->stride / src->cpp;
      break;
   default:
      break;
   }

   /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
    * OPAD field for the destination (how many extra UIF blocks beyond
    * those necessary to cover the height).
    */
   if (dst_slice->tiling == V3D_TILING_UIF_NO_XOR ||
       dst_slice->tiling == V3D_TILING_UIF_XOR) {
      uint32_t uif_block_h = 2 * v3d_utile_height(dst->cpp);
      uint32_t implicit_padded_height = align(height, uif_block_h);
      uint32_t icfg =
         (dst_slice->padded_height - implicit_padded_height) / uif_block_h;
      tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT;
   }

   v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
}
894
/* Emits the per-tile command list that clears one layer of the image:
 * no loads are emitted, so the store writes out the tile buffer as
 * initialized from the clear values programmed during frame setup.
 */
static void
emit_clear_image_layer_per_tile_list(struct v3dv_job *job,
                                     struct v3dv_meta_framebuffer *framebuffer,
                                     struct v3dv_image *image,
                                     VkImageAspectFlags aspects,
                                     uint32_t layer,
                                     uint32_t level)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   /* No loads: the tile buffer keeps the clear value. */
   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   emit_image_store(job->device, cl, framebuffer, image, aspects,
                    layer, level, false, false, false);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Link the generic tile list we just built into the RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
927
928static void
929emit_clear_image_layers(struct v3dv_job *job,
930                 struct v3dv_image *image,
931                 struct v3dv_meta_framebuffer *framebuffer,
932                 VkImageAspectFlags aspects,
933                 uint32_t min_layer,
934                 uint32_t max_layer,
935                 uint32_t level)
936{
937   for (uint32_t layer = min_layer; layer < max_layer; layer++) {
938      emit_clear_image_layer_per_tile_list(job, framebuffer, image, aspects,
939                                           layer, level);
940      emit_supertile_coordinates(job, framebuffer);
941   }
942}
943
/* Emits a complete render command list that clears the given aspects of
 * layers [min_layer, max_layer) at mip 'level' of the image to
 * 'clear_value'. Assumes the job's frame tiling is already configured.
 */
void
v3dX(meta_emit_clear_image_rcl)(struct v3dv_job *job,
                                struct v3dv_image *image,
                                struct v3dv_meta_framebuffer *framebuffer,
                                const union v3dv_clear_value *clear_value,
                                VkImageAspectFlags aspects,
                                uint32_t min_layer,
                                uint32_t max_layer,
                                uint32_t level)
{
   /* Passing clear info to the prologue so the RCL config carries the
    * clear values for the frame.
    */
   const struct rcl_clear_info clear_info = {
      .clear_value = clear_value,
      .image = image,
      .aspects = aspects,
      .level = level,
   };

   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, clear_value);
   emit_clear_image_layers(job, image, framebuffer, aspects,
                           min_layer, max_layer, level);
   cl_emit(rcl, END_OF_RENDERING, end);
}
969
/* Emits the per-tile command list that fills a buffer range: with no
 * loads, the tile buffer holds the clear color programmed during frame
 * setup, which is then stored linearly to the BO as RGBA8UI (4 bytes
 * per "pixel").
 */
static void
emit_fill_buffer_per_tile_list(struct v3dv_job *job,
                               struct v3dv_bo *bo,
                               uint32_t offset,
                               uint32_t stride)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   /* No loads: the tile buffer keeps the fill value. */
   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   emit_linear_store(cl, RENDER_TARGET_0, bo, offset, stride, false,
                     V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Link the generic tile list we just built into the RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
1000
/* Emits the per-tile fill list and matching supertile coordinates.
 * The row stride is width * 4 because the store format is RGBA8UI
 * (4 bytes per pixel).
 */
static void
emit_fill_buffer(struct v3dv_job *job,
                 struct v3dv_bo *bo,
                 uint32_t offset,
                 struct v3dv_meta_framebuffer *framebuffer)
{
   const uint32_t stride = job->frame_tiling.width * 4;
   emit_fill_buffer_per_tile_list(job, bo, offset, stride);
   emit_supertile_coordinates(job, framebuffer);
}
1011
/* Emits a complete render command list that fills a buffer range with a
 * 32-bit pattern 'data'. The pattern is fed in as the red channel of the
 * clear color; since the store format is RGBA8UI the raw 32 bits land in
 * memory for every "pixel" of the fill framebuffer.
 */
void
v3dX(meta_emit_fill_buffer_rcl)(struct v3dv_job *job,
                                struct v3dv_bo *bo,
                                uint32_t offset,
                                struct v3dv_meta_framebuffer *framebuffer,
                                uint32_t data)
{
   const union v3dv_clear_value clear_value = {
       .color = { data, 0, 0, 0 },
   };

   const struct rcl_clear_info clear_info = {
      .clear_value = &clear_value,
      .image = NULL,
      .aspects = VK_IMAGE_ASPECT_COLOR_BIT,
      .level = 0,
   };

   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, &clear_value);
   emit_fill_buffer(job, bo, offset, framebuffer);
   cl_emit(rcl, END_OF_RENDERING, end);
}
1037
1038
/* Emits the per-tile command list that uploads one layer (or 3D slice)
 * of a buffer-to-image copy: loads buffer data into the tile buffer with
 * a raster (linear) load, then stores the tile buffer to the image.
 * For combined depth/stencil destinations, additional load/store pairs
 * preserve the aspect that is not being written (see comment below).
 */
static void
emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
                                        struct v3dv_meta_framebuffer *framebuffer,
                                        struct v3dv_image *image,
                                        struct v3dv_buffer *buffer,
                                        uint32_t layer,
                                        const VkBufferImageCopy2KHR *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   const VkImageSubresourceLayers *imgrsc = &region->imageSubresource;
   assert((image->vk.image_type != VK_IMAGE_TYPE_3D && layer < imgrsc->layerCount) ||
          layer < image->vk.extent.depth);

   /* Load TLB from buffer */
   /* Per the Vulkan spec, bufferRowLength/bufferImageHeight of 0 mean
    * the buffer is tightly packed to imageExtent.
    */
   uint32_t width, height;
   if (region->bufferRowLength == 0)
      width = region->imageExtent.width;
   else
      width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      height = region->imageExtent.height;
   else
      height = region->bufferImageHeight;

   /* Handle copy to compressed format using a compatible format */
   width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format));
   height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format));

   /* Stencil-only copies address the buffer as 1 byte per texel. */
   uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
                  1 : image->cpp;
   uint32_t buffer_stride = width * cpp;
   uint32_t buffer_offset =
      buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer;

   uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask,
                                       false, false, true);

   emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo,
                    buffer_offset, buffer_stride, format);

   /* Because we can't do raster loads/stores of Z/S formats we need to
    * use a color tile buffer with a compatible RGBA color format instead.
    * However, when we are uploading a single aspect to a combined
    * depth/stencil image we have the problem that our tile buffer stores don't
    * allow us to mask out the other aspect, so we always write all four RGBA
    * channels to the image and we end up overwriting that other aspect with
    * undefined values. To work around that, we first load the aspect we are
    * not copying from the image memory into a proper Z/S tile buffer. Then we
    * do our store from the color buffer for the aspect we are copying, and
    * after that, we do another store from the Z/S tile buffer to restore the
    * other aspect to its original value.
    */
   if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         emit_image_load(job->device, cl, framebuffer, image,
                         VK_IMAGE_ASPECT_STENCIL_BIT,
                         imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                         false, false);
      } else {
         assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
         emit_image_load(job->device, cl, framebuffer, image,
                         VK_IMAGE_ASPECT_DEPTH_BIT,
                         imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                         false, false);
      }
   }

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Store TLB to image */
   emit_image_store(job->device, cl, framebuffer, image, imgrsc->aspectMask,
                    imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                    false, true, false);

   /* Restore the aspect we were not copying (see comment above). */
   if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         emit_image_store(job->device, cl, framebuffer, image,
                          VK_IMAGE_ASPECT_STENCIL_BIT,
                          imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                          false, false, false);
      } else {
         assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
         emit_image_store(job->device, cl, framebuffer, image,
                          VK_IMAGE_ASPECT_DEPTH_BIT,
                          imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
                          false, false, false);
      }
   }

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   /* Link the generic tile list we just built into the RCL. */
   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}
1147
/* Emits the per-tile buffer-to-image list for a single layer followed by
 * the supertile coordinates that reference it. Called once per layer.
 */
static void
emit_copy_buffer_to_layer(struct v3dv_job *job,
                          struct v3dv_image *image,
                          struct v3dv_buffer *buffer,
                          struct v3dv_meta_framebuffer *framebuffer,
                          uint32_t layer,
                          const VkBufferImageCopy2KHR *region)
{
   emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer,
                                           layer, region);
   emit_supertile_coordinates(job, framebuffer);
}
1160
1161void
1162v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job,
1163                                         struct v3dv_image *image,
1164                                         struct v3dv_buffer *buffer,
1165                                         struct v3dv_meta_framebuffer *framebuffer,
1166                                         const VkBufferImageCopy2KHR *region)
1167{
1168   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
1169   v3dv_return_if_oom(NULL, job);
1170
1171   emit_frame_setup(job, 0, NULL);
1172   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
1173      emit_copy_buffer_to_layer(job, image, buffer, framebuffer, layer, region);
1174   cl_emit(rcl, END_OF_RENDERING, end);
1175}
1176
/* Figure out a TLB size configuration for a number of pixels to process.
 * Beware that we can't "render" more than 4096x4096 pixels in a single job,
 * if the pixel count is larger than this, the caller might need to split
 * the job and call this function multiple times.
 *
 * The result always satisfies width * height <= num_pixels, so the caller
 * can use width * height as the number of items actually processed.
 */
static void
framebuffer_size_for_pixel_count(uint32_t num_pixels,
                                 uint32_t *width,
                                 uint32_t *height)
{
   assert(num_pixels > 0);

   const uint32_t max_dim = 4096;

   uint32_t fb_w, fb_h;
   if (num_pixels > max_dim * max_dim) {
      /* Can't cover everything in one frame: use the maximum size and
       * let the caller loop.
       */
      fb_w = max_dim;
      fb_h = max_dim;
   } else {
      /* Start with a 1-pixel-tall strip and fold it in half (halving the
       * width, doubling the height) while it is too wide for the hardware
       * or markedly wider than tall.
       */
      fb_w = num_pixels;
      fb_h = 1;
      while (fb_w > max_dim || ((fb_w % 2) == 0 && fb_w > 2 * fb_h)) {
         fb_w /= 2;
         fb_h *= 2;
      }
   }

   assert(fb_w <= max_dim && fb_h <= max_dim);
   assert(fb_w * fb_h <= num_pixels);
   assert(fb_w > 0 && fb_h > 0);

   *width = fb_w;
   *height = fb_h;
}
1211
/* Copies region->size bytes from src (+src_offset) to dst (+dst_offset)
 * by "rendering" the data through the TLB, splitting the copy into as
 * many jobs as needed to respect the 4096x4096 framebuffer limit.
 *
 * Returns the last job emitted, or NULL if a job could not be started.
 */
struct v3dv_job *
v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
                       struct v3dv_bo *dst,
                       uint32_t dst_offset,
                       struct v3dv_bo *src,
                       uint32_t src_offset,
                       const VkBufferCopy2KHR *region)
{
   const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
   const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;

   /* Select appropriate pixel format for the copy operation based on the
    * size to copy and the alignment of the source and destination offsets.
    */
   src_offset += region->srcOffset;
   dst_offset += region->dstOffset;
   /* Largest item size (bytes per "pixel") that both offsets are aligned
    * to: 4, 2 or 1.
    */
   uint32_t item_size = 4;
   while (item_size > 1 &&
          (src_offset % item_size != 0 || dst_offset % item_size != 0)) {
      item_size /= 2;
   }

   /* The copy size must also be a whole number of items. */
   while (item_size > 1 && region->size % item_size != 0)
      item_size /= 2;

   assert(region->size % item_size == 0);
   uint32_t num_items = region->size / item_size;
   assert(num_items > 0);

   /* Pick a TLB format whose texel size matches item_size. */
   uint32_t format;
   VkFormat vk_format;
   switch (item_size) {
   case 4:
      format = V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
      vk_format = VK_FORMAT_R8G8B8A8_UINT;
      break;
   case 2:
      format = V3D_OUTPUT_IMAGE_FORMAT_RG8UI;
      vk_format = VK_FORMAT_R8G8_UINT;
      break;
   default:
      format = V3D_OUTPUT_IMAGE_FORMAT_R8UI;
      vk_format = VK_FORMAT_R8_UINT;
      break;
   }

   /* Emit one job per framebuffer-sized chunk of items. */
   struct v3dv_job *job = NULL;
   while (num_items > 0) {
      job = v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
      if (!job)
         return NULL;

      uint32_t width, height;
      framebuffer_size_for_pixel_count(num_items, &width, &height);

      v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false);

      struct v3dv_meta_framebuffer framebuffer;
      v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
                                  &job->frame_tiling);

      v3dX(job_emit_binning_flush)(job);

      v3dX(meta_emit_copy_buffer_rcl)(job, dst, src, dst_offset, src_offset,
                                      &framebuffer, format, item_size);

      v3dv_cmd_buffer_finish_job(cmd_buffer);

      /* width * height <= num_items is guaranteed by
       * framebuffer_size_for_pixel_count, so this cannot underflow.
       */
      const uint32_t items_copied = width * height;
      const uint32_t bytes_copied = items_copied * item_size;
      num_items -= items_copied;
      src_offset += bytes_copied;
      dst_offset += bytes_copied;
   }

   return job;
}
1289
/* Fills 'size' bytes of 'bo' starting at 'offset' with the 32-bit pattern
 * 'data' by "rendering" through the TLB, splitting into as many jobs as
 * needed to respect the 4096x4096 framebuffer limit. The size must be a
 * positive multiple of 4 and the range must fit inside the BO.
 */
void
v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
                       struct v3dv_bo *bo,
                       uint32_t offset,
                       uint32_t size,
                       uint32_t data)
{
   assert(size > 0 && size % 4 == 0);
   assert(offset + size <= bo->size);

   const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
   const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
   /* One RGBA8UI "pixel" covers 4 bytes of the buffer. */
   uint32_t num_items = size / 4;

   while (num_items > 0) {
      struct v3dv_job *job =
         v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
      if (!job)
         return;

      uint32_t width, height;
      framebuffer_size_for_pixel_count(num_items, &width, &height);

      v3dv_job_start_frame(job, width, height, 1, true, 1, internal_bpp, false);

      struct v3dv_meta_framebuffer framebuffer;
      v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
                                  internal_type, &job->frame_tiling);

      v3dX(job_emit_binning_flush)(job);

      v3dX(meta_emit_fill_buffer_rcl)(job, bo, offset, &framebuffer, data);

      v3dv_cmd_buffer_finish_job(cmd_buffer);

      /* width * height <= num_items is guaranteed by
       * framebuffer_size_for_pixel_count, so this cannot underflow.
       */
      const uint32_t items_copied = width * height;
      const uint32_t bytes_copied = items_copied * 4;
      num_items -= items_copied;
      offset += bytes_copied;
   }
}
1331
1332void
1333v3dX(meta_framebuffer_init)(struct v3dv_meta_framebuffer *fb,
1334                            VkFormat vk_format,
1335                            uint32_t internal_type,
1336                            const struct v3dv_frame_tiling *tiling)
1337{
1338   fb->internal_type = internal_type;
1339
1340   /* Supertile coverage always starts at 0,0  */
1341   uint32_t supertile_w_in_pixels =
1342      tiling->tile_width * tiling->supertile_width;
1343   uint32_t supertile_h_in_pixels =
1344      tiling->tile_height * tiling->supertile_height;
1345
1346   fb->min_x_supertile = 0;
1347   fb->min_y_supertile = 0;
1348   fb->max_x_supertile = (tiling->width - 1) / supertile_w_in_pixels;
1349   fb->max_y_supertile = (tiling->height - 1) / supertile_h_in_pixels;
1350
1351   fb->vk_format = vk_format;
1352   fb->format = v3dX(get_format)(vk_format);
1353
1354   fb->internal_depth_type = V3D_INTERNAL_TYPE_DEPTH_32F;
1355   if (vk_format_is_depth_or_stencil(vk_format))
1356      fb->internal_depth_type = v3dX(get_internal_depth_type)(vk_format);
1357}
1358