tu_clear_blit.c revision 7ec681f3
/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_private.h"

#include "tu_cs.h"
#include "vk_format.h"

#include "ir3/ir3_nir.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"

#include "tu_tracepoints.h"

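/* Round-to-nearest-even packing of a [0, 1] float into an n-bit unorm value,
 * e.g. with bits = 8: 1.0f -> 0xff, and 0.5f -> lroundeven(127.5) = 128.
 */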
static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}

/* r2d_ = BLIT_OP_SCALE operations */

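/* The 2D engine's "ifmt" seems to encode only how sample values are
 * interpreted (width and int/float/unorm), not the exact format, so it can
 * be derived from the red channel alone: e.g. the 5-bit red channel of
 * VK_FORMAT_R5G6B5_UNORM_PACK16 maps to R2D_UNORM8.
 */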
static enum a6xx_2d_ifmt
format_to_ifmt(VkFormat format)
{
   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
       format == VK_FORMAT_X8_D24_UNORM_PACK32)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)
      return R2D_FLOAT32;
   if (format == VK_FORMAT_S8_UINT)
      return R2D_INT8;

   /* use the size of the red channel to find the corresponding "ifmt" */
   bool is_int = vk_format_is_int(format);
   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (vk_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
      return 0;
   }
}

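/* GRAS_2D_DST_BR/SRC_BR take inclusive bottom-right coordinates, hence the
 * "- 1" on the extents below.
 */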
static void
r2d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst->x,                     .y = dst->y),
      A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));

   if (!src)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src->x),
                   A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src->y),
                   A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
}

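/* The clear color goes out through RB_2D_SRC_SOLID_C0..C3, encoded the way
 * the 2D engine will read it back for the chosen ifmt: byte values for
 * R2D_UNORM8 (D24S8 spreads the 24-bit depth across three unorm8 channels),
 * half floats for R2D_FLOAT16, and raw 32-bit words otherwise.
 */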
static void
r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case VK_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!vk_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = vk_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
                      format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));

      for (unsigned i = 0; i < desc->nr_channels; i++) {
         const struct util_format_channel_description *ch = &desc->channel[i];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}

static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
   tu_cs_emit(cs, iview->stencil_PITCH << 9);
}

static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format),
                      .unk20 = 1,
                      .unk22 = 1),
                   A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
                   A6XX_SP_PS_2D_SRC(.qword = va),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
}

static void
r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, iview->RB_2D_DST_INFO);
   tu_cs_image_ref_2d(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, iview->stencil_PITCH);
}

static void
r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format)),
                   A6XX_RB_2D_DST(.qword = va),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   enum a6xx_format format = tu6_base_format(vk_format);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format);
   uint32_t unknown_8c01 = 0;

   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   /* note: the only format with partial clearing is D24S8 */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01);

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
         .scissor = scissor,
         .rotate = blit_param,
         .solid_color = clear,
         .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
         .color_format = format,
         .mask = 0xf,
         .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
      ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (format == FMT6_10_10_10_2_UNORM_DEST)
      format = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
         .sint = vk_format_is_sint(vk_format),
         .uint = vk_format_is_uint(vk_format),
         .color_format = format,
         .srgb = vk_format_is_srgb(vk_format),
         .mask = 0xf));
}

static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   assert(samples == VK_SAMPLE_COUNT_1_BIT);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);

   r2d_setup_common(cmd, cs, vk_format, aspect_mask, blit_param, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
}

/* r3d_ = shader path operations */

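/* Blit/clear parameters reach the shaders as ir3 uniforms (const registers),
 * with base given in 32-bit components. For the blit VS the layout uploaded
 * by r3d_coords_raw()/r3d_coord_z() below is: c0.xy/c0.zw = first vertex
 * dst/src, c1.xy/c1.zw = second vertex dst/src, c2.x = src z (z-scale path
 * only). The clear VS instead packs depth and layer at c0.z and c0.w.
 */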
static nir_ssa_def *
load_const(nir_builder *b, unsigned base, unsigned components)
{
   return nir_load_uniform(b, components, 32, nir_imm_int(b, 0),
                           .base = base);
}

static nir_shader *
build_blit_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_ssa_def *vert0_pos = load_const(b, 0, 2);
   nir_ssa_def *vert1_pos = load_const(b, 4, 2);
   nir_ssa_def *vertex = nir_load_vertex_id(b);

   nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     nir_imm_float(b, 0.0),
                     nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_coords =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
                          "coords");
   out_coords->data.location = VARYING_SLOT_VAR0;

   nir_ssa_def *vert0_coords = load_const(b, 2, 2);
   nir_ssa_def *vert1_coords = load_const(b, 6, 2);

   /* Only used with "z scale" blit path which uses a 3d texture */
   nir_ssa_def *z_coord = load_const(b, 8, 1);

   nir_ssa_def *coords = nir_bcsel(b, nir_i2b1(b, vertex), vert1_coords, vert0_coords);
   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
                     z_coord);

   nir_store_var(b, out_coords, coords, 0x7);

   return b->shader;
}

static nir_shader *
build_clear_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "clear vs");
   nir_builder *b = &_b;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_ssa_def *vert0_pos = load_const(b, 0, 2);
   nir_ssa_def *vert1_pos = load_const(b, 4, 2);
   /* c0.z is used to clear depth */
   nir_ssa_def *depth = load_const(b, 2, 1);
   nir_ssa_def *vertex = nir_load_vertex_id(b);

   nir_ssa_def *pos = nir_bcsel(b, nir_i2b1(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     depth, nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_layer =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
                          "gl_Layer");
   out_layer->data.location = VARYING_SLOT_LAYER;
   nir_ssa_def *layer = load_const(b, 3, 1);
   nir_store_var(b, out_layer, layer, 1);

   return b->shader;
}

static nir_shader *
build_blit_fs_shader(bool zscale)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     zscale ? "zscale blit fs" : "blit fs");
   nir_builder *b = &_b;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   unsigned coord_components = zscale ? 3 : 2;
   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(coord_components),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);

   tex->src[0].src_type = nir_tex_src_coord;
   tex->src[0].src = nir_src_for_ssa(nir_load_var(b, in_coords));
   tex->coord_components = coord_components;

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->dest.ssa, 0xf);

   return b->shader;
}

/* We can only read multisample textures via txf_ms, so we need a separate
 * variant for them.
 */
static nir_shader *
build_ms_copy_fs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "multisample copy fs");
   nir_builder *b = &_b;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(2),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);

   tex->op = nir_texop_txf_ms;

   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);
   BITSET_SET(b->shader->info.textures_used_by_txf, 0);

   nir_ssa_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));

   tex->src[0].src_type = nir_tex_src_coord;
   tex->src[0].src = nir_src_for_ssa(coord);
   tex->coord_components = 2;

   tex->src[1].src_type = nir_tex_src_ms_index;
   tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b));

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->dest.ssa, 0xf);

   return b->shader;
}

static nir_shader *
build_clear_fs_shader(unsigned mrts)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "mrt%u clear fs", mrts);
   nir_builder *b = &_b;

   for (unsigned i = 0; i < mrts; i++) {
      nir_variable *out_color =
         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                             "color");
      out_color->data.location = FRAG_RESULT_DATA0 + i;

      nir_ssa_def *color = load_const(b, 4 * i, 4);
      nir_store_var(b, out_color, color, 0xf);
   }

   return b->shader;
}

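/* Compile one of the built-in shaders: finalize the NIR for ir3, compile a
 * variant, then stash the binary in the global BO at *offset (in dwords) and
 * record its GPU address, advancing *offset with 32-dword alignment.
 */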
static void
compile_shader(struct tu_device *dev, struct nir_shader *nir,
               unsigned consts, unsigned *offset, enum global_shader idx)
{
   nir->options = ir3_get_compiler_options(dev->compiler);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);

   ir3_finalize_nir(dev->compiler, nir);

   struct ir3_shader *sh = ir3_shader_from_nir(dev->compiler, nir,
                                               align(consts, 4), NULL);

   struct ir3_shader_key key = {};
   bool created;
   struct ir3_shader_variant *so =
      ir3_shader_get_variant(sh, &key, false, false, &created);

   struct tu6_global *global = dev->global_bo.map;

   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
   dev->global_shaders[idx] = so;
   memcpy(&global->shaders[*offset], so->bin,
          sizeof(uint32_t) * so->info.sizedwords);
   dev->global_shader_va[idx] = dev->global_bo.iova +
      gb_offset(shaders[*offset]);
   *offset += align(so->info.sizedwords, 32);
}

void
tu_init_clear_blit_shaders(struct tu_device *dev)
{
   unsigned offset = 0;
   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
   compile_shader(dev, build_ms_copy_fs_shader(), 0, &offset, GLOBAL_SH_FS_COPY_MS);

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
                     GLOBAL_SH_FS_CLEAR0 + num_rts);
   }
}

void
tu_destroy_clear_blit_shaders(struct tu_device *dev)
{
   for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
      if (dev->global_shaders[i])
         ir3_shader_destroy(dev->global_shaders[i]->shader);
   }
}

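/* Emit the common 3D (shader path) state: select the built-in VS/FS pair,
 * invalidate stale shader state, and set up a rasterizer-only pipeline that
 * draws a two-vertex RECTLIST with viewport transform and clipping disabled.
 */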
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit,
           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
{
   enum global_shader vs_id =
      blit ? GLOBAL_SH_VS_BLIT : GLOBAL_SH_VS_CLEAR;

   struct ir3_shader_variant *vs = cmd->device->global_shaders[vs_id];
   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];

   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;

   if (z_scale)
      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
   else if (samples != VK_SAMPLE_COUNT_1_BIT)
      fs_id = GLOBAL_SH_FS_COPY_MS;

   unsigned num_rts = util_bitcount(rts_mask);
   if (!blit)
      fs_id = GLOBAL_SH_FS_CLEAR0 + num_rts;

   struct ir3_shader_variant *fs = cmd->device->global_shaders[fs_id];
   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .gfx_ibo = true,
         .cs_ibo = true,
         .gfx_shared_const = true,
         .gfx_bindless = 0x1f,
         .cs_bindless = 0x1f));

   tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, vs);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL);
   tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL);
   tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, fs);

   struct tu_pvtmem_config pvtmem = {};
   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());

   if (cmd->device->physical_device->info->a6xx.has_cp_reg_write) {
      /* Copy what the blob does here. This will emit an extra 0x3f
       * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
       * this is working around yet.
       */
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
      tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
      tu_cs_emit(cs, 0);
   } else {
      tu_cs_emit_regs(cs, A6XX_PC_MULTIVIEW_CNTL());
   }
   tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());

   tu6_emit_vpc(cs, vs, NULL, NULL, NULL, fs, 0);

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_fs_inputs(cs, fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .persp_division_disable = 1,
                      .vp_xform_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .clip_disable = 1));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL());
   tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());

   if (rts_mask) {
      unsigned rts_count = util_last_bit(rts_mask);
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
      unsigned rt = 0;
      for (unsigned i = 0; i < rts_count; i++) {
         unsigned regid = 0;
         if (rts_mask & (1u << i))
            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid));
      }
   }

   cmd->state.line_mode = RECTANGULAR;
   tu6_emit_msaa(cs, samples, cmd->state.line_mode);
}

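/* Upload the 8 blit coordinates as two vec4 constants to the VS:
 * c0.xy/c0.zw = first vertex dst/src, c1.xy/c1.zw = second vertex dst/src
 * (see the load_const() calls in build_blit_vs_shader()).
 */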
static void
r3d_coords_raw(struct tu_cs *cs, const float *coords)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(2));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
}

/* z coordinate for "z scale" blit path which uses a 3d texture */
static void
r3d_coord_z(struct tu_cs *cs, float z)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 4);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(2) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   tu_cs_emit(cs, fui(z));
   tu_cs_emit(cs, 0);
   tu_cs_emit(cs, 0);
   tu_cs_emit(cs, 0);
}

static void
r3d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   int32_t src_x1 = src ? src->x : 0;
   int32_t src_y1 = src ? src->y : 0;
   r3d_coords_raw(cs, (float[]) {
      dst->x,                 dst->y,
      src_x1,                 src_y1,
      dst->x + extent->width, dst->y + extent->height,
      src_x1 + extent->width, src_y1 + extent->height,
   });
}

static void
r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
   } break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      tu_cs_emit(cs, fui(val->depthStencil.depth));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   case VK_FORMAT_S8_UINT:
      tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   default:
      /* color formats use the clear value as-is */
      assert(!vk_format_is_depth_or_stencil(format));
      tu_cs_emit_array(cs, val->color.uint32, 4);
      break;
   }
}

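/* Emit a source texture + sampler for the shader path. The caller passes a
 * prebuilt A6XX_TEX_CONST descriptor; the base and UBWC addresses (dwords
 * 4-5 and 7-8) are patched here to select the given layer, and a sampler
 * with unnormalized coordinates is appended right after the descriptor.
 */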
static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      0x1 | /* XXX used by blob, doesn't seem necessary */
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
               CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
               CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
      CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
      CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}

static void
r3d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   r3d_src_common(cmd, cs, iview->descriptor,
                  iview->layer_size * layer,
                  iview->ubwc_layer_size * layer,
                  filter);
}

static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   desc[0] =
      COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
      A6XX_TEX_CONST_0_FMT(format.fmt) |
      A6XX_TEX_CONST_0_SWAP(format.swap) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      // XXX to swizzle into .w for stencil buffer_to_image
      A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = 0;
   desc[4] = va;
   desc[5] = va >> 32;
   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_src_gmem(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             VkFormat format,
             uint32_t gmem_offset,
             uint32_t cpp)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];
   memcpy(desc, iview->descriptor, sizeof(desc));

   /* patch the format so that depth/stencil get the right format */
   desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
   desc[0] |= A6XX_TEX_CONST_0_FMT(tu6_format_texture(format, TILE6_2).fmt);

   /* patched for gmem */
   desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
   desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
   desc[2] =
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
      A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
   desc[3] = 0;
   desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
   desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
   tu_cs_image_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, iview, layer);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
}

static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
   tu_cs_image_stencil_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}

static void
r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
                   A6XX_RB_MRT_PITCH(0, pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE(0, .qword = va),
                   A6XX_RB_MRT_BASE_GMEM(0, 0));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}

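/* RB_MRT_CONTROL component_enable bits select the RGBA channels, and D24S8
 * is accessed as r8g8b8a8: depth occupies .rgb (mask 0x7), stencil .a (0x8).
 */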
static uint8_t
aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
{
   uint8_t mask = 0xf;
   assert(aspect_mask);
   /* note: the only format with partial writing is D24S8,
    * clear/blit uses the _AS_R8G8B8A8 format to access it
    */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         mask = 0x7;
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         mask = 0x8;
   }
   return mask;
}

static void
r3d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   enum a6xx_format format = tu6_base_format(vk_format);

   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
      tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
   tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));

   r3d_common(cmd, cs, !clear, 1, blit_param, samples);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL0(),
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));

   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
   tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));

   tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
                        .color_format = format,
                        .color_sint = vk_format_is_sint(vk_format),
                        .color_uint = vk_format_is_uint(vk_format)));

   tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
      .component_enable = aspect_write_mask(vk_format, aspect_mask)));
   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));

   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 0);
   }
}

static void
r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
   tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
                  CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
                  CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
   tu_cs_emit(cs, 1); /* instance count */
   tu_cs_emit(cs, 2); /* vertex count */
}

static void
r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 1);
   }
}

/* blit ops - common interface for 2d/shader paths */

struct blit_ops {
   void (*coords)(struct tu_cs *cs,
                  const VkOffset2D *dst,
                  const VkOffset2D *src,
                  const VkExtent2D *extent);
   void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
   void (*src)(
        struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter);
   void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      VkFormat vk_format,
                      uint64_t va, uint32_t pitch,
                      uint32_t width, uint32_t height);
   void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
   void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
   void (*setup)(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
                 bool clear,
                 bool ubwc,
                 VkSampleCountFlagBits samples);
   void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
   void (*teardown)(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs);
};

static const struct blit_ops r2d_ops = {
   .coords = r2d_coords,
   .clear_value = r2d_clear_value,
   .src = r2d_src,
   .src_buffer = r2d_src_buffer,
   .dst = r2d_dst,
   .dst_buffer = r2d_dst_buffer,
   .setup = r2d_setup,
   .run = r2d_run,
   .teardown = r2d_teardown,
};

static const struct blit_ops r3d_ops = {
   .coords = r3d_coords,
   .clear_value = r3d_clear_value,
   .src = r3d_src,
   .src_buffer = r3d_src_buffer,
   .dst = r3d_dst,
   .dst_buffer = r3d_dst_buffer,
   .setup = r3d_setup,
   .run = r3d_run,
   .teardown = r3d_teardown,
};

/* passthrough set coords from 3D extents */
static void
coords(const struct blit_ops *ops,
       struct tu_cs *cs,
       const VkOffset3D *dst,
       const VkOffset3D *src,
       const VkExtent3D *extent)
{
   ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
}

/* Decides the VK format to treat our data as for a memcpy-style blit. We have
 * to be a bit careful because we have to pick a format with matching UBWC
 * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT
 * for everything.
 */
static VkFormat
copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
{
   if (vk_format_is_compressed(format)) {
      switch (vk_format_get_blocksize(format)) {
      case 1: return VK_FORMAT_R8_UINT;
      case 2: return VK_FORMAT_R16_UINT;
      case 4: return VK_FORMAT_R32_UINT;
      case 8: return VK_FORMAT_R32G32_UINT;
      case 16: return VK_FORMAT_R32G32B32A32_UINT;
      default:
         unreachable("unhandled format size");
      }
   }

   switch (format) {
   /* For SNORM formats, copy them as the equivalent UNORM format.  If we treat
    * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
    * (also -1.0), when we're supposed to be memcpying the bits. See
    * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
    */
   case VK_FORMAT_R8_SNORM:
      return VK_FORMAT_R8_UNORM;
   case VK_FORMAT_R8G8_SNORM:
      return VK_FORMAT_R8G8_UNORM;
   case VK_FORMAT_R8G8B8_SNORM:
      return VK_FORMAT_R8G8B8_UNORM;
   case VK_FORMAT_B8G8R8_SNORM:
      return VK_FORMAT_B8G8R8_UNORM;
   case VK_FORMAT_R8G8B8A8_SNORM:
      return VK_FORMAT_R8G8B8A8_UNORM;
   case VK_FORMAT_B8G8R8A8_SNORM:
      return VK_FORMAT_B8G8R8A8_UNORM;
   case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
      return VK_FORMAT_A8B8G8R8_UNORM_PACK32;
   case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
      return VK_FORMAT_A2R10G10B10_UNORM_PACK32;
   case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
      return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
   case VK_FORMAT_R16_SNORM:
      return VK_FORMAT_R16_UNORM;
   case VK_FORMAT_R16G16_SNORM:
      return VK_FORMAT_R16G16_UNORM;
   case VK_FORMAT_R16G16B16_SNORM:
      return VK_FORMAT_R16G16B16_UNORM;
   case VK_FORMAT_R16G16B16A16_SNORM:
      return VK_FORMAT_R16G16B16A16_UNORM;

   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      return VK_FORMAT_R32_UINT;

   case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
      if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
         return VK_FORMAT_R8G8_UNORM;
      else
         return VK_FORMAT_R8_UNORM;
   case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
      return VK_FORMAT_R8_UNORM;

   case VK_FORMAT_D24_UNORM_S8_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
         return VK_FORMAT_R8_UNORM;
      else
         return format;

   case VK_FORMAT_D32_SFLOAT_S8_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         return VK_FORMAT_S8_UINT;
      assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
      return VK_FORMAT_D32_SFLOAT;

   default:
      return format;
   }
}

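/* Clear the image's LRZ buffer by treating it as a linear D16 destination of
 * lrz_pitch x lrz_height texels, hence the pitch in bytes of lrz_pitch * 2.
 */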
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              struct tu_image *image,
              const VkClearValue *value)
{
   const struct blit_ops *ops = &r2d_ops;

   ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
              VK_SAMPLE_COUNT_1_BIT);
   ops->clear_value(cs, VK_FORMAT_D16_UNORM, value);
   ops->dst_buffer(cs, VK_FORMAT_D16_UNORM,
                   image->bo->iova + image->bo_offset + image->lrz_offset,
                   image->lrz_pitch * 2);
   ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
   ops->run(cmd, cs);
   ops->teardown(cmd, cs);
}

static void
tu_image_view_copy_blit(struct tu_image_view *iview,
                        struct tu_image *image,
                        VkFormat format,
                        const VkImageSubresourceLayers *subres,
                        uint32_t layer,
                        bool stencil_read,
                        bool z_scale)
{
   VkImageAspectFlags aspect_mask = subres->aspectMask;

   /* always use the AS_R8G8B8A8 format for these */
   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
       format == VK_FORMAT_X8_D24_UNORM_PACK32) {
      aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
   }

   tu_image_view_init(iview, &(VkImageViewCreateInfo) {
      .image = tu_image_to_handle(image),
      .viewType = z_scale ? VK_IMAGE_VIEW_TYPE_3D : VK_IMAGE_VIEW_TYPE_2D,
      .format = format,
      /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
      .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
      .subresourceRange = {
         .aspectMask = aspect_mask,
         .baseMipLevel = subres->mipLevel,
         .levelCount = 1,
         .baseArrayLayer = subres->baseArrayLayer + layer,
         .layerCount = 1,
      },
   }, false);
}

static void
tu_image_view_copy(struct tu_image_view *iview,
                   struct tu_image *image,
                   VkFormat format,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer,
                   bool stencil_read)
{
   format = copy_format(format, subres->aspectMask, false);
   tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read, false);
}

static void
tu_image_view_blit(struct tu_image_view *iview,
                   struct tu_image *image,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer)
{
   tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false, false);
}

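/* Mirroring is detected by comparing the src and dst coordinate directions
 * per axis, and is expressed to the 2D engine as one of its four rotation
 * modes. The 3D (shader) path takes over when the 2D engine can't do the
 * job: multisample destinations, cubic filtering, z scaling, and the
 * BC1_RGB_* formats (see below).
 */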
static void
tu6_blit_image(struct tu_cmd_buffer *cmd,
               struct tu_image *src_image,
               struct tu_image *dst_image,
               const VkImageBlit *info,
               VkFilter filter)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;
   bool z_scale = false;
   uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;

   /* The 2D blitter can't mirror from coordinates alone; mirroring has to be
    * expressed as a rotation mode:
    */
   static const enum a6xx_rotation rotate[2][2] = {
      {ROTATE_0, ROTATE_HFLIP},
      {ROTATE_VFLIP, ROTATE_180},
   };

   bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
                   (info->dstOffsets[1].x < info->dstOffsets[0].x);
   bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
                   (info->dstOffsets[1].y < info->dstOffsets[0].y);

   int32_t src0_z = info->srcOffsets[0].z;
   int32_t src1_z = info->srcOffsets[1].z;

   if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
        info->dstOffsets[1].z - info->dstOffsets[0].z) ||
       info->srcOffsets[1].z < info->srcOffsets[0].z) {
      z_scale = true;
   }

   if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
      layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
      src0_z = info->srcOffsets[1].z;
      src1_z = info->srcOffsets[0].z;
   }

   if (info->dstSubresource.layerCount > 1) {
      assert(layers <= 1);
      layers = info->dstSubresource.layerCount;
   }

   /* BC1_RGB_* formats need to have their last components overridden with 1
    * when sampling, which is normally handled with the texture descriptor
    * swizzle. The 2d path can't handle that, so use the 3d path.
    *
    * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
    * the 2d path.
    */

   unsigned blit_param = rotate[mirror_y][mirror_x];
   if (dst_image->layout[0].nr_samples > 1 ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
       filter == VK_FILTER_CUBIC_EXT ||
       z_scale) {
      ops = &r3d_ops;
      blit_param = z_scale;
   }

   /* use the right format in setup() for D32_S8
    * TODO: this probably should use a helper
    */
   VkFormat format = dst_image->vk_format;
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
         format = VK_FORMAT_D32_SFLOAT;
      else if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
         format = VK_FORMAT_S8_UINT;
      else
         unreachable("unexpected D32_S8 aspect mask in blit_image");
   }

   trace_start_blit(&cmd->trace, cs);

   ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
              blit_param, false, dst_image->layout[0].ubwc,
              dst_image->layout[0].nr_samples);

   if (ops == &r3d_ops) {
      r3d_coords_raw(cs, (float[]) {
         info->dstOffsets[0].x, info->dstOffsets[0].y,
         info->srcOffsets[0].x, info->srcOffsets[0].y,
         info->dstOffsets[1].x, info->dstOffsets[1].y,
         info->srcOffsets[1].x, info->srcOffsets[1].y
      });
   } else {
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
                             .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
         A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
                             .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
         A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
         A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
         A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
   }

   struct tu_image_view dst, src;
   tu_image_view_blit(&dst, dst_image, &info->dstSubresource,
                      MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));

   if (z_scale) {
      tu_image_view_copy_blit(&src, src_image, src_image->vk_format,
                              &info->srcSubresource, 0, false, true);
      ops->src(cmd, cs, &src, 0, filter);
   } else {
      tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
   }

   for (uint32_t i = 0; i < layers; i++) {
      if (z_scale) {
         float t = ((float) i + 0.5f) / (float) layers;
         r3d_coord_z(cs, t * (src1_z - src0_z) + src0_z);
      } else {
         ops->src(cmd, cs, &src, i, filter);
      }
      ops->dst(cs, &dst, i);
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_blit(&cmd->trace, cs,
                  ops == &r3d_ops,
                  src_image->vk_format,
                  dst_image->vk_format,
                  layers);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBlitImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage dstImage,
                VkImageLayout dstImageLayout,
                uint32_t regionCount,
                const VkImageBlit *pRegions,
                VkFilter filter)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);

   for (uint32_t i = 0; i < regionCount; ++i) {
      /* can't blit both depth and stencil at once with D32_S8
       * TODO: more advanced 3D blit path to support it instead?
       */
      if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
          dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         VkImageBlit region = pRegions[i];
         u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
            region.srcSubresource.aspectMask = BIT(b);
            region.dstSubresource.aspectMask = BIT(b);
            tu6_blit_image(cmd, src_image, dst_image, &region, filter);
         }
         continue;
      }
      tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
   }
}

static void
copy_compressed(VkFormat format,
                VkOffset3D *offset,
                VkExtent3D *extent,
                uint32_t *width,
                uint32_t *height)
{
   if (!vk_format_is_compressed(format))
      return;

   uint32_t block_width = vk_format_get_blockwidth(format);
   uint32_t block_height = vk_format_get_blockheight(format);

   offset->x /= block_width;
   offset->y /= block_height;

   if (extent) {
      extent->width = DIV_ROUND_UP(extent->width, block_width);
      extent->height = DIV_ROUND_UP(extent->height, block_height);
   }
   if (width)
      *width = DIV_ROUND_UP(*width, block_width);
   if (height)
      *height = DIV_ROUND_UP(*height, block_height);
}

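/* The blitter seems to require the source base address and pitch to be
 * 64-byte aligned (note the & 63 checks below). Unaligned copies fall back
 * to blitting one row at a time from the aligned-down address, folding the
 * misalignment into the source x offset.
 */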
1493static void
1494tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
1495                        struct tu_buffer *src_buffer,
1496                        struct tu_image *dst_image,
1497                        const VkBufferImageCopy *info)
1498{
1499   struct tu_cs *cs = &cmd->cs;
1500   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1501   VkFormat src_format =
1502      copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
1503   const struct blit_ops *ops = &r2d_ops;
1504
1505   /* special case for buffer to stencil */
1506   if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1507       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1508      ops = &r3d_ops;
1509   }
1510
1511   /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
1512    * which matters for UBWC. buffer_to_image/etc can fail because of this
1513    */
1514
1515   VkOffset3D offset = info->imageOffset;
1516   VkExtent3D extent = info->imageExtent;
1517   uint32_t src_width = info->bufferRowLength ?: extent.width;
1518   uint32_t src_height = info->bufferImageHeight ?: extent.height;
1519
1520   copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
1521
1522   uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
1523   uint32_t layer_size = src_height * pitch;
1524
1525   ops->setup(cmd, cs,
1526              copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
1527              info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc,
1528              dst_image->layout[0].nr_samples);
1529
1530   struct tu_image_view dst;
1531   tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
1532
1533   for (uint32_t i = 0; i < layers; i++) {
1534      ops->dst(cs, &dst, i);
1535
1536      uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
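      /* The r2d path requires the source base address and pitch to be
       * 64-byte aligned (hence the masking with 63 below). If either is
       * not, copy one row at a time, aligning the address down and folding
       * the remainder into an x offset in texels: e.g. src_va % 64 == 16
       * with a 4-byte block size gives x = 4.
       */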
1537      if ((src_va & 63) || (pitch & 63)) {
1538         for (uint32_t y = 0; y < extent.height; y++) {
1539            uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
1540            ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
1541                            x + extent.width, 1);
1542            ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
1543                        &(VkExtent2D) {extent.width, 1});
1544            ops->run(cmd, cs);
1545            src_va += pitch;
1546         }
1547      } else {
1548         ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
1549         coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
1550         ops->run(cmd, cs);
1551      }
1552   }
1553
1554   ops->teardown(cmd, cs);
1555}
1556
1557VKAPI_ATTR void VKAPI_CALL
1558tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
1559                        VkBuffer srcBuffer,
1560                        VkImage dstImage,
1561                        VkImageLayout dstImageLayout,
1562                        uint32_t regionCount,
1563                        const VkBufferImageCopy *pRegions)
1564{
1565   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1566   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1567   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1568
1569   for (unsigned i = 0; i < regionCount; ++i)
1570      tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
1571}
1572
1573static void
1574tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
1575                        struct tu_image *src_image,
1576                        struct tu_buffer *dst_buffer,
1577                        const VkBufferImageCopy *info)
1578{
1579   struct tu_cs *cs = &cmd->cs;
1580   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
1581   VkFormat dst_format =
1582      copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
1583   bool stencil_read = false;
1584
1585   if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
1586       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
1587      stencil_read = true;
1588   }
1589
1590   const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
1591   VkOffset3D offset = info->imageOffset;
1592   VkExtent3D extent = info->imageExtent;
1593   uint32_t dst_width = info->bufferRowLength ?: extent.width;
1594   uint32_t dst_height = info->bufferImageHeight ?: extent.height;
1595
1596   copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
1597
1598   uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
1599   uint32_t layer_size = pitch * dst_height;
1600
1601   ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
1602              VK_SAMPLE_COUNT_1_BIT);
1603
1604   struct tu_image_view src;
1605   tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
1606
1607   for (uint32_t i = 0; i < layers; i++) {
1608      ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1609
1610      uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
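      /* Same 64-byte alignment requirement as in tu_copy_buffer_to_image,
       * here applied to the destination buffer address and pitch.
       */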
1611      if ((dst_va & 63) || (pitch & 63)) {
1612         for (uint32_t y = 0; y < extent.height; y++) {
1613            uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
1614            ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
1615            ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
1616                        &(VkExtent2D) {extent.width, 1});
1617            ops->run(cmd, cs);
1618            dst_va += pitch;
1619         }
1620      } else {
1621         ops->dst_buffer(cs, dst_format, dst_va, pitch);
1622         coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
1623         ops->run(cmd, cs);
1624      }
1625   }
1626
1627   ops->teardown(cmd, cs);
1628}
1629
1630VKAPI_ATTR void VKAPI_CALL
1631tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1632                        VkImage srcImage,
1633                        VkImageLayout srcImageLayout,
1634                        VkBuffer dstBuffer,
1635                        uint32_t regionCount,
1636                        const VkBufferImageCopy *pRegions)
1637{
1638   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1639   TU_FROM_HANDLE(tu_image, src_image, srcImage);
1640   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1641
1642   for (unsigned i = 0; i < regionCount; ++i)
1643      tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
1644}
1645
1646/* Tiled formats don't support swapping, which means that we can't support
1647 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
1648 * formats like B5G5R5A1 have a separate linear-only format when sampling.
1649 * Currently we fake support for tiled swapped formats and use the unswapped
1650 * format instead, but this means that reinterpreting copies to and from
1651 * swapped formats can't be performed correctly unless we can swizzle the
1652 * components by reinterpreting the other image as the "correct" swapped
1653 * format, i.e. only when the other image is linear.
1654 */
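
/* Illustrative consequence: a tiled B8G8R8A8_UNORM image is stored
 * unswapped (as if it were R8G8B8A8), so a reinterpreting copy between it
 * and another tiled image would produce swapped channels; such copies take
 * the staging-blit path in tu_copy_image_to_image below.
 */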
1655
1656static bool
1657is_swapped_format(VkFormat format)
1658{
1659   struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
1660   struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
1661   return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
1662}
1663
1664/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
1665 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
1666 * versa). This should mirror the logic in fdl6_layout.
1667 */
1668static bool
1669image_is_r8g8(struct tu_image *image)
1670{
1671   return image->layout[0].cpp == 2 &&
1672      vk_format_get_nr_components(image->vk_format) == 2;
1673}
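
/* e.g. R8G8_UNORM and R8G8_UINT are "r8g8" here, while R16_UNORM (also
 * cpp=2, but single-component) is not, so tiled R8G8 <-> R16 copies must go
 * through the staging path below.
 */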
1674
1675static void
1676tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
1677                       struct tu_image *src_image,
1678                       struct tu_image *dst_image,
1679                       const VkImageCopy *info)
1680{
1681   const struct blit_ops *ops = &r2d_ops;
1682   struct tu_cs *cs = &cmd->cs;
1683
1684   if (dst_image->layout[0].nr_samples > 1)
1685      ops = &r3d_ops;
1686
1687   VkFormat format = VK_FORMAT_UNDEFINED;
1688   VkOffset3D src_offset = info->srcOffset;
1689   VkOffset3D dst_offset = info->dstOffset;
1690   VkExtent3D extent = info->extent;
1691   uint32_t layers_to_copy = MAX2(info->extent.depth, info->srcSubresource.layerCount);
1692
1693   /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
1694    * Images":
1695    *
1696    *    When copying between compressed and uncompressed formats the extent
1697    *    members represent the texel dimensions of the source image and not
1698    *    the destination. When copying from a compressed image to an
1699    *    uncompressed image the image texel dimensions written to the
1700    *    uncompressed image will be source extent divided by the compressed
1701    *    texel block dimensions. When copying from an uncompressed image to a
1702    *    compressed image the image texel dimensions written to the compressed
1703    *    image will be the source extent multiplied by the compressed texel
1704    *    block dimensions.
1705    *
1706    * This means we only have to adjust the extent if the source image is
1707    * compressed.
1708    */
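   /* e.g. a 64x64 copy from a BC1 (4x4 blocks) source to an uncompressed
    * destination shrinks the extent to 16x16 here.
    */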
1709   copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
1710   copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
1711
1712   VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
1713   VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
1714
1715   bool use_staging_blit = false;
1716
1717   if (src_format == dst_format) {
1718      /* Images that share a format can always be copied directly because it's
1719       * the same as a blit.
1720       */
1721      format = src_format;
1722   } else if (!src_image->layout[0].tile_mode) {
1723      /* If an image is linear, we can always safely reinterpret it with the
1724       * other image's format and then do a regular blit.
1725       */
1726      format = dst_format;
1727   } else if (!dst_image->layout[0].tile_mode) {
1728      format = src_format;
1729   } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
1730      /* We can't currently copy r8g8 images to/from other cpp=2 images,
1731       * due to the different tile layout.
1732       */
1733      use_staging_blit = true;
1734   } else if (is_swapped_format(src_format) ||
1735              is_swapped_format(dst_format)) {
1736      /* If either format has a non-identity swap, then we can't copy
1737       * to/from it.
1738       */
1739      use_staging_blit = true;
1740   } else if (!src_image->layout[0].ubwc) {
1741      format = dst_format;
1742   } else if (!dst_image->layout[0].ubwc) {
1743      format = src_format;
1744   } else {
1745      /* Both formats use UBWC and so neither can be reinterpreted.
1746       * TODO: We could do an in-place decompression of the dst instead.
1747       */
1748      use_staging_blit = true;
1749   }
1750
1751   struct tu_image_view dst, src;
1752
1753   if (use_staging_blit) {
1754      tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
1755      tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
1756
1757      struct tu_image staging_image = {
1758         .base.type = VK_OBJECT_TYPE_IMAGE,
1759         .vk_format = src_format,
1760         .level_count = 1,
1761         .layer_count = info->srcSubresource.layerCount,
1762         .bo_offset = 0,
1763      };
1764
1765      VkImageSubresourceLayers staging_subresource = {
1766         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1767         .mipLevel = 0,
1768         .baseArrayLayer = 0,
1769         .layerCount = info->srcSubresource.layerCount,
1770      };
1771
1772      VkOffset3D staging_offset = { 0 };
1773
1774      staging_image.layout[0].tile_mode = TILE6_LINEAR;
1775      staging_image.layout[0].ubwc = false;
1776
1777      fdl6_layout(&staging_image.layout[0],
1778                  vk_format_to_pipe_format(staging_image.vk_format),
1779                  src_image->layout[0].nr_samples,
1780                  extent.width,
1781                  extent.height,
1782                  extent.depth,
1783                  staging_image.level_count,
1784                  staging_image.layer_count,
1785                  extent.depth > 1,
1786                  NULL);
1787
1788      VkResult result = tu_get_scratch_bo(cmd->device,
1789                                          staging_image.layout[0].size,
1790                                          &staging_image.bo);
1791      if (result != VK_SUCCESS) {
1792         cmd->record_result = result;
1793         return;
1794      }
1795
1796      struct tu_image_view staging;
1797      tu_image_view_copy(&staging, &staging_image, src_format,
1798                         &staging_subresource, 0, false);
1799
1800      ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
1801                 dst_image->layout[0].nr_samples);
1802      coords(ops, cs, &staging_offset, &src_offset, &extent);
1803
1804      for (uint32_t i = 0; i < layers_to_copy; i++) {
1805         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1806         ops->dst(cs, &staging, i);
1807         ops->run(cmd, cs);
1808      }
1809
1810      /* When executed by the user there has to be a pipeline barrier here,
1811       * but since we're doing it manually we'll have to flush ourselves.
1812       */
1813      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
1814      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
1815      tu_cs_emit_wfi(cs);
1816
1817      tu_image_view_copy(&staging, &staging_image, dst_format,
1818                         &staging_subresource, 0, false);
1819
1820      ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
1821                 0, false, dst_image->layout[0].ubwc,
1822                 dst_image->layout[0].nr_samples);
1823      coords(ops, cs, &dst_offset, &staging_offset, &extent);
1824
1825      for (uint32_t i = 0; i < layers_to_copy; i++) {
1826         ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
1827         ops->dst(cs, &dst, i);
1828         ops->run(cmd, cs);
1829      }
1830   } else {
1831      tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
1832      tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
1833
1834      ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
1835                 0, false, dst_image->layout[0].ubwc,
1836                 dst_image->layout[0].nr_samples);
1837      coords(ops, cs, &dst_offset, &src_offset, &extent);
1838
1839      for (uint32_t i = 0; i < layers_to_copy; i++) {
1840         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
1841         ops->dst(cs, &dst, i);
1842         ops->run(cmd, cs);
1843      }
1844   }
1845
1846   ops->teardown(cmd, cs);
1847}
1848
1849VKAPI_ATTR void VKAPI_CALL
1850tu_CmdCopyImage(VkCommandBuffer commandBuffer,
1851                VkImage srcImage,
1852                VkImageLayout srcImageLayout,
1853                VkImage destImage,
1854                VkImageLayout destImageLayout,
1855                uint32_t regionCount,
1856                const VkImageCopy *pRegions)
1857{
1858   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1859   TU_FROM_HANDLE(tu_image, src_image, srcImage);
1860   TU_FROM_HANDLE(tu_image, dst_image, destImage);
1861
1862   for (uint32_t i = 0; i < regionCount; ++i) {
1863      if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1864         VkImageCopy info = pRegions[i];
1865         u_foreach_bit(b, pRegions[i].dstSubresource.aspectMask) {
1866            info.srcSubresource.aspectMask = BIT(b);
1867            info.dstSubresource.aspectMask = BIT(b);
1868            tu_copy_image_to_image(cmd, src_image, dst_image, &info);
1869         }
1870         continue;
1871      }
1872
1873      tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
1874   }
1875}
1876
1877static void
1878copy_buffer(struct tu_cmd_buffer *cmd,
1879            uint64_t dst_va,
1880            uint64_t src_va,
1881            uint64_t size,
1882            uint32_t block_size)
1883{
1884   const struct blit_ops *ops = &r2d_ops;
1885   struct tu_cs *cs = &cmd->cs;
1886   VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
1887   uint64_t blocks = size / block_size;
1888
1889   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
1890              VK_SAMPLE_COUNT_1_BIT);
1891
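   /* Copy in runs: both source and destination base addresses are aligned
    * down to 64 bytes with the remainders folded into the x coordinates (in
    * block_size units), and each run's width is capped so that neither
    * x + width exceeds 0x4000.
    */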
1892   while (blocks) {
1893      uint32_t src_x = (src_va & 63) / block_size;
1894      uint32_t dst_x = (dst_va & 63) / block_size;
1895      uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
1896
1897      ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
1898      ops->dst_buffer(     cs, format, dst_va & ~63, 0);
1899      ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
1900      ops->run(cmd, cs);
1901
1902      src_va += width * block_size;
1903      dst_va += width * block_size;
1904      blocks -= width;
1905   }
1906
1907   ops->teardown(cmd, cs);
1908}
1909
1910VKAPI_ATTR void VKAPI_CALL
1911tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1912                 VkBuffer srcBuffer,
1913                 VkBuffer dstBuffer,
1914                 uint32_t regionCount,
1915                 const VkBufferCopy *pRegions)
1916{
1917   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1918   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
1919   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
1920
1921   for (unsigned i = 0; i < regionCount; ++i) {
1922      copy_buffer(cmd,
1923                  tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
1924                  tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
1925                  pRegions[i].size, 1);
1926   }
1927}
1928
1929VKAPI_ATTR void VKAPI_CALL
1930tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1931                   VkBuffer dstBuffer,
1932                   VkDeviceSize dstOffset,
1933                   VkDeviceSize dataSize,
1934                   const void *pData)
1935{
1936   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1937   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1938
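   /* Stage the data in command-stream memory: the allocation below is
    * DIV_ROUND_UP(dataSize, 64) units of 16 dwords (64 bytes) each, and the
    * staged bytes are then written out through the regular copy_buffer path
    * in 4-byte blocks (the spec requires dstOffset and dataSize to be
    * multiples of 4).
    */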
1939   struct tu_cs_memory tmp;
1940   VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
1941   if (result != VK_SUCCESS) {
1942      cmd->record_result = result;
1943      return;
1944   }
1945
1946   memcpy(tmp.map, pData, dataSize);
1947   copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
1948}
1949
1950VKAPI_ATTR void VKAPI_CALL
1951tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
1952                 VkBuffer dstBuffer,
1953                 VkDeviceSize dstOffset,
1954                 VkDeviceSize fillSize,
1955                 uint32_t data)
1956{
1957   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1958   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
1959   const struct blit_ops *ops = &r2d_ops;
1960   struct tu_cs *cs = &cmd->cs;
1961
1962   if (fillSize == VK_WHOLE_SIZE)
1963      fillSize = buffer->size - dstOffset;
1964
1965   uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
1966   uint32_t blocks = fillSize / 4;
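   /* fillSize is a multiple of 4 (required by the spec) unless it came from
    * VK_WHOLE_SIZE above, in which case the truncating division rounds the
    * fill down to whole 32-bit words, as the spec prescribes.
    */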
1967
1968   ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
1969              VK_SAMPLE_COUNT_1_BIT);
1970   ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
1971
1972   while (blocks) {
1973      uint32_t dst_x = (dst_va & 63) / 4;
1974      uint32_t width = MIN2(blocks, 0x4000 - dst_x);
1975
1976      ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
1977      ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
1978      ops->run(cmd, cs);
1979
1980      dst_va += width * 4;
1981      blocks -= width;
1982   }
1983
1984   ops->teardown(cmd, cs);
1985}
1986
1987VKAPI_ATTR void VKAPI_CALL
1988tu_CmdResolveImage(VkCommandBuffer commandBuffer,
1989                   VkImage srcImage,
1990                   VkImageLayout srcImageLayout,
1991                   VkImage dstImage,
1992                   VkImageLayout dstImageLayout,
1993                   uint32_t regionCount,
1994                   const VkImageResolve *pRegions)
1995{
1996   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1997   TU_FROM_HANDLE(tu_image, src_image, srcImage);
1998   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
1999   const struct blit_ops *ops = &r2d_ops;
2000   struct tu_cs *cs = &cmd->cs;
2001
2002   ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
2003              0, false, dst_image->layout[0].ubwc, VK_SAMPLE_COUNT_1_BIT);
2004
2005   for (uint32_t i = 0; i < regionCount; ++i) {
2006      const VkImageResolve *info = &pRegions[i];
2007      uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
2008
2009      assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
2010      /* TODO: aspect masks possible? */
2011
2012      coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
2013
2014      struct tu_image_view dst, src;
2015      tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
2016      tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
2017
2018      for (uint32_t layer = 0; layer < layers; layer++) {
2019         ops->src(cmd, cs, &src, layer, VK_FILTER_NEAREST);
2020         ops->dst(cs, &dst, layer);
2021         ops->run(cmd, cs);
2022      }
2023   }
2024
2025   ops->teardown(cmd, cs);
2026}
2027
2028#define for_each_layer(layer, layer_mask, layers) \
2029   for (uint32_t layer = 0; \
2030        layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
2031        layer++) \
2032      if (!layer_mask || (layer_mask & BIT(layer)))
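
/* Usage sketch: for_each_layer(i, 0b101, 8) visits layers 0 and 2 (mask
 * form, used for multiview), while for_each_layer(i, 0, 3) visits layers
 * 0, 1 and 2.
 */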
2033
2034static void
2035resolve_sysmem(struct tu_cmd_buffer *cmd,
2036               struct tu_cs *cs,
2037               VkFormat format,
2038               const struct tu_image_view *src,
2039               const struct tu_image_view *dst,
2040               uint32_t layer_mask,
2041               uint32_t layers,
2042               const VkRect2D *rect,
2043               bool separate_stencil)
2044{
2045   const struct blit_ops *ops = &r2d_ops;
2046
2047   trace_start_sysmem_resolve(&cmd->trace, cs);
2048
2049   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT,
2050              0, false, dst->ubwc_enabled, VK_SAMPLE_COUNT_1_BIT);
2051   ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
2052
2053   for_each_layer(i, layer_mask, layers) {
2054      if (separate_stencil) {
2055         r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
2056         r2d_dst_stencil(cs, dst, i);
2057      } else {
2058         ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
2059         ops->dst(cs, dst, i);
2060      }
2061      ops->run(cmd, cs);
2062   }
2063
2064   ops->teardown(cmd, cs);
2065
2066   trace_end_sysmem_resolve(&cmd->trace, cs, format);
2067}
2068
2069void
2070tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
2071                  struct tu_cs *cs,
2072                  const struct tu_image_view *src,
2073                  const struct tu_image_view *dst,
2074                  uint32_t layer_mask,
2075                  uint32_t layers,
2076                  const VkRect2D *rect)
2077{
2078   assert(src->image->vk_format == dst->image->vk_format);
2079
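   /* D32_SFLOAT_S8_UINT keeps depth and stencil in separate planes, so
    * resolve it in two passes: the depth plane as D32_SFLOAT and the
    * stencil plane through the separate-stencil variants.
    */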
2080   if (dst->image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2081      resolve_sysmem(cmd, cs, VK_FORMAT_D32_SFLOAT,
2082                     src, dst, layer_mask, layers, rect, false);
2083      resolve_sysmem(cmd, cs, VK_FORMAT_S8_UINT,
2084                     src, dst, layer_mask, layers, rect, true);
2085   } else {
2086      resolve_sysmem(cmd, cs, dst->image->vk_format,
2087                     src, dst, layer_mask, layers, rect, false);
2088   }
2089}
2090
2091static void
2092clear_image(struct tu_cmd_buffer *cmd,
2093            struct tu_image *image,
2094            const VkClearValue *clear_value,
2095            const VkImageSubresourceRange *range,
2096            VkImageAspectFlags aspect_mask)
2097{
2098   uint32_t level_count = tu_get_levelCount(image, range);
2099   uint32_t layer_count = tu_get_layerCount(image, range);
2100   struct tu_cs *cs = &cmd->cs;
2101   VkFormat format = image->vk_format;
2102   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
2103      format = copy_format(format, aspect_mask, false);
2104
2105   if (image->layout[0].depth0 > 1) {
2106      assert(layer_count == 1);
2107      assert(range->baseArrayLayer == 0);
2108   }
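
   /* For 3D images the "layers" are depth slices, so layer_count is
    * recomputed per level below from the minified depth.
    */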
2109
2110   const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;
2111
2112   ops->setup(cmd, cs, format, aspect_mask, 0, true, image->layout[0].ubwc,
2113              image->layout[0].nr_samples);
2114   if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
2115      ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
2116   else
2117      ops->clear_value(cs, format, clear_value);
2118
2119   for (unsigned j = 0; j < level_count; j++) {
2120      if (image->layout[0].depth0 > 1)
2121         layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
2122
2123      ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
2124                     u_minify(image->layout[0].width0, range->baseMipLevel + j),
2125                     u_minify(image->layout[0].height0, range->baseMipLevel + j)
2126                  });
2127
2128      struct tu_image_view dst;
2129      tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
2130         .aspectMask = aspect_mask,
2131         .mipLevel = range->baseMipLevel + j,
2132         .baseArrayLayer = range->baseArrayLayer,
2133         .layerCount = 1,
2134      }, 0, false, false);
2135
2136      for (uint32_t i = 0; i < layer_count; i++) {
2137         ops->dst(cs, &dst, i);
2138         ops->run(cmd, cs);
2139      }
2140   }
2141
2142   ops->teardown(cmd, cs);
2143}
2144
2145VKAPI_ATTR void VKAPI_CALL
2146tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
2147                      VkImage image_h,
2148                      VkImageLayout imageLayout,
2149                      const VkClearColorValue *pColor,
2150                      uint32_t rangeCount,
2151                      const VkImageSubresourceRange *pRanges)
2152{
2153   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2154   TU_FROM_HANDLE(tu_image, image, image_h);
2155
2156   for (unsigned i = 0; i < rangeCount; i++)
2157      clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
2158}
2159
2160VKAPI_ATTR void VKAPI_CALL
2161tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
2162                             VkImage image_h,
2163                             VkImageLayout imageLayout,
2164                             const VkClearDepthStencilValue *pDepthStencil,
2165                             uint32_t rangeCount,
2166                             const VkImageSubresourceRange *pRanges)
2167{
2168   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2169   TU_FROM_HANDLE(tu_image, image, image_h);
2170
2171   for (unsigned i = 0; i < rangeCount; i++) {
2172      const VkImageSubresourceRange *range = &pRanges[i];
2173
2174      if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2175         /* can't clear both depth and stencil at once, split up the aspect mask */
2176         u_foreach_bit(b, range->aspectMask)
2177            clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
2178         continue;
2179      }
2180
2181      clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
2182   }
2183}
2184
2185static void
2186tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2187                            uint32_t attachment_count,
2188                            const VkClearAttachment *attachments,
2189                            uint32_t rect_count,
2190                            const VkClearRect *rects)
2191{
2192   /* the shader path here is special, it avoids changing MRT/etc state */
2193   const struct tu_subpass *subpass = cmd->state.subpass;
2194   const uint32_t mrt_count = subpass->color_count;
2195   struct tu_cs *cs = &cmd->draw_cs;
2196   uint32_t clear_value[MAX_RTS][4];
2197   float z_clear_val = 0.0f;
2198   uint8_t s_clear_val = 0;
2199   uint32_t clear_rts = 0, clear_components = 0;
2200   bool z_clear = false;
2201   bool s_clear = false;
2202
2203   trace_start_sysmem_clear_all(&cmd->trace, cs);
2204
2205   for (uint32_t i = 0; i < attachment_count; i++) {
2206      uint32_t a;
2207      if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2208         uint32_t c = attachments[i].colorAttachment;
2209         a = subpass->color_attachments[c].attachment;
2210         if (a == VK_ATTACHMENT_UNUSED)
2211            continue;
2212
2213         clear_rts |= 1 << c;
2214         clear_components |= 0xf << (c * 4);
2215         memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2216      } else {
2217         a = subpass->depth_stencil_attachment.attachment;
2218         if (a == VK_ATTACHMENT_UNUSED)
2219            continue;
2220
2221         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2222            z_clear = true;
2223            z_clear_val = attachments[i].clearValue.depthStencil.depth;
2224         }
2225
2226         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2227            s_clear = true;
2228            s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2229         }
2230      }
2231   }
2232
2233   /* We may not know the multisample count if there are no attachments, so
2234    * just bail early to avoid corner cases later.
2235    */
2236   if (clear_rts == 0 && !z_clear && !s_clear)
2237      return;
2238
2239   /* Disable all draw states so they don't interfere.
2240    * TODO: use and re-use draw states.
2241    * We have to disable draw states individually to preserve
2242    * input attachment states, because a secondary command buffer
2243    * won't be able to restore them.
2244    */
2245   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
2246   for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
2247      if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
2248          i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
2249         continue;
2250      tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
2251                     CP_SET_DRAW_STATE__0_DISABLE);
2252      tu_cs_emit_qw(cs, 0);
2253   }
2254   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2255
2256   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2257   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2258                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2259                  0xfc000000);
2260   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2261
2262   r3d_common(cmd, cs, false, clear_rts, false, cmd->state.subpass->samples);
2263
2264   tu_cs_emit_regs(cs,
2265                   A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2266   tu_cs_emit_regs(cs,
2267                   A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2268
2269   tu_cs_emit_regs(cs,
2270                   A6XX_RB_FS_OUTPUT_CNTL0(),
2271                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2272
2273   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2274   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2275   for (uint32_t i = 0; i < mrt_count; i++) {
2276      tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2277            .component_enable = COND(clear_rts & (1 << i), 0xf)));
2278   }
2279
2280   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
2281   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
2282
2283   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2284   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2285         .z_test_enable = z_clear,
2286         .z_write_enable = z_clear,
2287         .zfunc = FUNC_ALWAYS));
2288   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2289   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2290         .stencil_enable = s_clear,
2291         .func = FUNC_ALWAYS,
2292         .zpass = STENCIL_REPLACE));
2293   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2294   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2295   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2296
2297   unsigned num_rts = util_bitcount(clear_rts);
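   /* The clear color(s) reach the shader as FS constants, one vec4 per
    * render target being cleared, uploaded inline below.
    */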
2298   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
2299   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
2300                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
2301                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
2302                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
2303                  CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
2304   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
2305   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
2306   u_foreach_bit(b, clear_rts)
2307      tu_cs_emit_array(cs, clear_value[b], 4);
2308
2309   for (uint32_t i = 0; i < rect_count; i++) {
2310      /* This should be true because of this valid usage for
2311       * vkCmdClearAttachments:
2312       *
2313       *    "If the render pass instance this is recorded in uses multiview,
2314       *    then baseArrayLayer must be zero and layerCount must be one"
2315       */
2316      assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
2317
2318      /* a630 doesn't support multiview masks, which means that we can't use
2319       * the normal multiview path without potentially recompiling a shader
2320       * on-demand or using a more complicated variant that takes the mask as
2321       * a const. Just use the layered path instead, since it shouldn't be
2322       * much worse.
2323       */
2324      for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
2325         r3d_coords_raw(cs, (float[]) {
2326            rects[i].rect.offset.x, rects[i].rect.offset.y,
2327            z_clear_val, uif(rects[i].baseArrayLayer + layer),
2328            rects[i].rect.offset.x + rects[i].rect.extent.width,
2329            rects[i].rect.offset.y + rects[i].rect.extent.height,
2330            z_clear_val, 1.0f,
2331         });
2332         r3d_run(cmd, cs);
2333      }
2334   }
2335
2336   trace_end_sysmem_clear_all(&cmd->trace,
2337                              cs, mrt_count, rect_count);
2338}
2339
2340static void
2341pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
2342{
2343   switch (format) {
2344   case VK_FORMAT_X8_D24_UNORM_PACK32:
2345   case VK_FORMAT_D24_UNORM_S8_UINT:
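      /* depth goes in bits 0..23 as 24-bit UNORM, stencil in bits 24..31:
       * e.g. depth 1.0f with stencil 0x80 packs to 0x80ffffff
       */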
2346      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
2347                       val->depthStencil.stencil << 24;
2348      return;
2349   case VK_FORMAT_D16_UNORM:
2350      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
2351      return;
2352   case VK_FORMAT_D32_SFLOAT:
2353      clear_value[0] = fui(val->depthStencil.depth);
2354      return;
2355   case VK_FORMAT_S8_UINT:
2356      clear_value[0] = val->depthStencil.stencil;
2357      return;
2358   default:
2359      break;
2360   }
2361
2362   float tmp[4];
2363   memcpy(tmp, val->color.float32, 4 * sizeof(float));
2364   if (vk_format_is_srgb(format)) {
2365      for (int i = 0; i < 3; i++)
2366         tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
2367   }
2368
2369#define PACK_F(type) util_format_##type##_pack_rgba_float \
2370   ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
2371   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
2372   case 4:
2373      PACK_F(r4g4b4a4_unorm);
2374      break;
2375   case 5:
2376      if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
2377         PACK_F(r5g6b5_unorm);
2378      else
2379         PACK_F(r5g5b5a1_unorm);
2380      break;
2381   case 8:
2382      if (vk_format_is_snorm(format))
2383         PACK_F(r8g8b8a8_snorm);
2384      else if (vk_format_is_unorm(format))
2385         PACK_F(r8g8b8a8_unorm);
2386      else
2387         pack_int8(clear_value, val->color.uint32);
2388      break;
2389   case 10:
2390      if (vk_format_is_int(format))
2391         pack_int10_2(clear_value, val->color.uint32);
2392      else
2393         PACK_F(r10g10b10a2_unorm);
2394      break;
2395   case 11:
2396      clear_value[0] = float3_to_r11g11b10f(val->color.float32);
2397      break;
2398   case 16:
2399      if (vk_format_is_snorm(format))
2400         PACK_F(r16g16b16a16_snorm);
2401      else if (vk_format_is_unorm(format))
2402         PACK_F(r16g16b16a16_unorm);
2403      else if (vk_format_is_float(format))
2404         PACK_F(r16g16b16a16_float);
2405      else
2406         pack_int16(clear_value, val->color.uint32);
2407      break;
2408   case 32:
2409      memcpy(clear_value, val->color.float32, 4 * sizeof(float));
2410      break;
2411   default:
2412      unreachable("unexpected channel size");
2413   }
2414#undef PACK_F
2415}
2416
2417static void
2418clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2419                      struct tu_cs *cs,
2420                      VkFormat format,
2421                      uint8_t clear_mask,
2422                      uint32_t gmem_offset,
2423                      const VkClearValue *value)
2424{
2425   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
2426   tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));
2427
2428   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
2429
2430   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
2431   tu_cs_emit(cs, gmem_offset);
2432
2433   tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
2434   tu_cs_emit(cs, 0);
2435
2436   uint32_t clear_vals[4] = {};
2437   pack_gmem_clear_value(value, format, clear_vals);
2438
2439   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2440   tu_cs_emit_array(cs, clear_vals, 4);
2441
2442   tu6_emit_event_write(cmd, cs, BLIT);
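   /* the BLIT event performs the actual GMEM clear, using the RB_BLIT_*
    * state programmed above
    */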
2443}
2444
2445static void
2446tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2447                              struct tu_cs *cs,
2448                              uint32_t attachment,
2449                              VkImageAspectFlags mask,
2450                              const VkClearValue *value)
2451{
2452   const struct tu_render_pass_attachment *att =
2453      &cmd->state.pass->attachments[attachment];
2454
2455   trace_start_gmem_clear(&cmd->trace, cs);
2456
2457   if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2458      if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
2459         clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
2460      if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
2461         clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
2462      return;
2463   }
2464
2465   clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);
2466
2467   trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples);
2468}
2469
2470static void
2471tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
2472                          uint32_t attachment_count,
2473                          const VkClearAttachment *attachments,
2474                          uint32_t rect_count,
2475                          const VkClearRect *rects)
2476{
2477   const struct tu_subpass *subpass = cmd->state.subpass;
2478   struct tu_cs *cs = &cmd->draw_cs;
2479
2480   /* TODO: swap the loops for smaller cmdstream */
2481   for (unsigned i = 0; i < rect_count; i++) {
2482      unsigned x1 = rects[i].rect.offset.x;
2483      unsigned y1 = rects[i].rect.offset.y;
2484      unsigned x2 = x1 + rects[i].rect.extent.width - 1;
2485      unsigned y2 = y1 + rects[i].rect.extent.height - 1;
2486
2487      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
2488      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
2489      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
2490
2491      for (unsigned j = 0; j < attachment_count; j++) {
2492         uint32_t a;
2493         if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
2494            a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
2495         else
2496            a = subpass->depth_stencil_attachment.attachment;
2497
2498         if (a == VK_ATTACHMENT_UNUSED)
2499            continue;
2500
2501         tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
2502                                       &attachments[j].clearValue);
2503      }
2504   }
2505}
2506
2507VKAPI_ATTR void VKAPI_CALL
2508tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
2509                       uint32_t attachmentCount,
2510                       const VkClearAttachment *pAttachments,
2511                       uint32_t rectCount,
2512                       const VkClearRect *pRects)
2513{
2514   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2515   struct tu_cs *cs = &cmd->draw_cs;
2516
2517   /* The sysmem path behaves like a draw; note that we don't have a way of using
2518    * different flushes for sysmem/gmem, so this needs to be outside of the cond_exec
2519    */
2520   tu_emit_cache_flush_renderpass(cmd, cs);
2521
2522   for (uint32_t j = 0; j < attachmentCount; j++) {
2523      if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
2524         continue;
2525      cmd->state.lrz.valid = false;
2526      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
2527   }
2528
2529   /* vkCmdClearAttachments is supposed to respect the predicate if active.
2530    * The easiest way to do this is to always use the 3d path, which always
2531    * works even with GMEM because it's just a simple draw using the existing
2532    * attachment state. However it seems that IGNORE_VISIBILITY draws must be
2533    * skipped in the binning pass, since otherwise they produce binning data
2534    * which isn't consumed and leads to the wrong binning data being read, so
2535    * condition on GMEM | SYSMEM.
2536    */
2537   if (cmd->state.predication_active) {
2538      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
2539                             CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2540      tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2541      tu_cond_exec_end(cs);
2542      return;
2543   }
2544
2545   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
2546   tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2547   tu_cond_exec_end(cs);
2548
2549   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
2550   tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
2551   tu_cond_exec_end(cs);
2552}
2553
2554static void
2555clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2556                        struct tu_cs *cs,
2557                        VkFormat format,
2558                        VkImageAspectFlags clear_mask,
2559                        const VkRenderPassBeginInfo *info,
2560                        uint32_t a,
2561                        bool separate_stencil)
2562{
2563   const struct tu_framebuffer *fb = cmd->state.framebuffer;
2564   const struct tu_image_view *iview = cmd->state.attachments[a];
2565   const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
2566   const struct blit_ops *ops = &r2d_ops;
2567   if (cmd->state.pass->attachments[a].samples > 1)
2568      ops = &r3d_ops;
2569
2570   trace_start_sysmem_clear(&cmd->trace, cs);
2571
2572   ops->setup(cmd, cs, format, clear_mask, 0, true, iview->ubwc_enabled,
2573              cmd->state.pass->attachments[a].samples);
2574   ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
2575   ops->clear_value(cs, format, &info->pClearValues[a]);
2576
2577   for_each_layer(i, clear_views, fb->layers) {
2578      if (separate_stencil) {
2579         if (ops == &r3d_ops)
2580            r3d_dst_stencil(cs, iview, i);
2581         else
2582            r2d_dst_stencil(cs, iview, i);
2583      } else {
2584         ops->dst(cs, iview, i);
2585      }
2586      ops->run(cmd, cs);
2587   }
2588
2589   ops->teardown(cmd, cs);
2590
2591   trace_end_sysmem_clear(&cmd->trace, cs,
2592                          format, ops == &r3d_ops,
2593                          cmd->state.pass->attachments[a].samples);
2594}
2595
2596void
2597tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
2598                           struct tu_cs *cs,
2599                           uint32_t a,
2600                           const VkRenderPassBeginInfo *info)
2601{
2602   const struct tu_render_pass_attachment *attachment =
2603      &cmd->state.pass->attachments[a];
2604
2605   if (!attachment->clear_mask)
2606      return;
2607
2608   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2609      if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2610         clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
2611                                 info, a, false);
2612      }
2613      if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2614         clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
2615                                 info, a, true);
2616      }
2617   } else {
2618      clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
2619                              info, a, false);
2620   }
2621
2622   /* The spec doesn't explicitly say, but presumably the initial renderpass
2623    * clear is considered part of the renderpass, and therefore barriers
2624    * aren't required inside the subpass/renderpass.  Therefore we need to
2625    * flush CCU color into CCU depth here, just like with
2626    * vkCmdClearAttachments(). Note that because this only happens at the
2627    * beginning of a renderpass, and renderpass writes are considered
2628    * "incoherent", we shouldn't have to worry about syncing depth into color
2629    * beforehand as depth should already be flushed.
2630    */
2631   if (vk_format_is_depth_or_stencil(attachment->format)) {
2632      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2633      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
2634   } else {
2635      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2636      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
2637   }
2638
2639   if (cmd->device->physical_device->info->a6xx.has_ccu_flush_bug)
2640      tu_cs_emit_wfi(cs);
2641}
2642
2643void
2644tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
2645                         struct tu_cs *cs,
2646                         uint32_t a,
2647                         const VkRenderPassBeginInfo *info)
2648{
2649   const struct tu_render_pass_attachment *attachment =
2650      &cmd->state.pass->attachments[a];
2651
2652   if (!attachment->clear_mask)
2653      return;
2654
2655   tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2656
2657   tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
2658                                 &info->pClearValues[a]);
2659}
2660
2661static void
2662tu_emit_blit(struct tu_cmd_buffer *cmd,
2663             struct tu_cs *cs,
2664             const struct tu_image_view *iview,
2665             const struct tu_render_pass_attachment *attachment,
2666             bool resolve,
2667             bool separate_stencil)
2668{
2669   tu_cs_emit_regs(cs,
2670                   A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
2671
2672   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
2673      .unk0 = !resolve,
2674      .gmem = !resolve,
2675      .sample_0 = vk_format_is_int(attachment->format) |
2676         vk_format_is_depth_or_stencil(attachment->format)));
2677
2678   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
2679   if (separate_stencil) {
2680      tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
2681      tu_cs_emit_qw(cs, iview->stencil_base_addr);
2682      tu_cs_emit(cs, iview->stencil_PITCH);
2683
2684      tu_cs_emit_regs(cs,
2685                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
2686   } else {
2687      tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
2688      tu_cs_image_ref_2d(cs, iview, 0, false);
2689
2690      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
2691      tu_cs_image_flag_ref(cs, iview, 0);
2692
2693      tu_cs_emit_regs(cs,
2694                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
2695   }
2696
2697   tu6_emit_event_write(cmd, cs, BLIT);
2698}
2699
2700static bool
2701blit_can_resolve(VkFormat format)
2702{
2703   const struct util_format_description *desc = vk_format_description(format);
2704
2705   /* blit event can only do resolve for simple cases:
2706    * averaging samples as unsigned integers or choosing only one sample
2707    */
2708   if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
2709      return false;
2710
2711   /* can't do formats with larger channel sizes
2712    * note: this includes all float formats
2713    * note2: single channel integer formats seem OK
2714    */
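   /* (e.g. R16G16B16A16_SFLOAT has 16-bit channels and is rejected, while
    * R10G10B10A2_UNORM's 10-bit channels pass)
    */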
2715   if (desc->channel[0].size > 10)
2716      return false;
2717
2718   switch (format) {
2719   /* for unknown reasons blit event can't msaa resolve these formats when tiled
2720    * likely related to these formats having different layout from other cpp=2 formats
2721    */
2722   case VK_FORMAT_R8G8_UNORM:
2723   case VK_FORMAT_R8G8_UINT:
2724   case VK_FORMAT_R8G8_SINT:
2725   /* TODO: this one should be able to work? */
2726   case VK_FORMAT_D24_UNORM_S8_UINT:
2727      return false;
2728   default:
2729      break;
2730   }
2731
2732   return true;
2733}
2734
2735void
2736tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
2737                        struct tu_cs *cs,
2738                        uint32_t a,
2739                        bool force_load)
2740{
2741   const struct tu_image_view *iview = cmd->state.attachments[a];
2742   const struct tu_render_pass_attachment *attachment =
2743      &cmd->state.pass->attachments[a];
2744
2745   trace_start_gmem_load(&cmd->trace, cs);
2746
2747   if (attachment->load || force_load)
2748      tu_emit_blit(cmd, cs, iview, attachment, false, false);
2749
2750   if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
2751      tu_emit_blit(cmd, cs, iview, attachment, false, true);
2752
2753   trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load);
2754}
2755
2756static void
2757store_cp_blit(struct tu_cmd_buffer *cmd,
2758              struct tu_cs *cs,
2759              const struct tu_image_view *iview,
2760              uint32_t samples,
2761              bool separate_stencil,
2762              VkFormat format,
2763              uint32_t gmem_offset,
2764              uint32_t cpp)
2765{
2766   r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
2767                    iview->ubwc_enabled, true);
2768   if (separate_stencil)
2769      r2d_dst_stencil(cs, iview, 0);
2770   else
2771      r2d_dst(cs, iview, 0);
2772
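   /* The source is the current tile's contents in GMEM, read as a TILE6_2
    * source at gmem_base + gmem_offset with a pitch of one tile row
    * (tile0.width * cpp bytes).
    */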
2773   tu_cs_emit_regs(cs,
2774                   A6XX_SP_PS_2D_SRC_INFO(
2775                      .color_format = tu6_format_texture(format, TILE6_2).fmt,
2776                      .tile_mode = TILE6_2,
2777                      .srgb = vk_format_is_srgb(format),
2778                      .samples = tu_msaa_samples(samples),
2779                      .samples_average = !vk_format_is_int(format) &&
2780                                         !vk_format_is_depth_or_stencil(format),
2781                      .unk20 = 1,
2782                      .unk22 = 1),
2783                   /* note: src size does not matter when not scaling */
2784                   A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
2785                   A6XX_SP_PS_2D_SRC(.qword = cmd->device->physical_device->gmem_base + gmem_offset),
2786                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));
2787
2788   /* sync GMEM writes with CACHE. */
2789   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2790
2791   /* Wait for CACHE_INVALIDATE to land */
2792   tu_cs_emit_wfi(cs);
2793
2794   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
2795   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
2796
2797   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2798    * sysmem, and we generally assume that GMEM renderpasses leave their
2799    * results in sysmem, so we need to flush manually here.
2800    */
2801   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2802}
2803
2804static void
2805store_3d_blit(struct tu_cmd_buffer *cmd,
2806              struct tu_cs *cs,
2807              const struct tu_image_view *iview,
2808              uint32_t dst_samples,
2809              bool separate_stencil,
2810              VkFormat format,
2811              const VkRect2D *render_area,
2812              uint32_t gmem_offset,
2813              uint32_t cpp)
2814{
2815   r3d_setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
2816             iview->ubwc_enabled, dst_samples);
2817
2818   r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2819
2820   if (separate_stencil)
2821      r3d_dst_stencil(cs, iview, 0);
2822   else
2823      r3d_dst(cs, iview, 0);
2824
2825   r3d_src_gmem(cmd, cs, iview, format, gmem_offset, cpp);
2826
2827   /* sync GMEM writes with CACHE. */
2828   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
2829
2830   r3d_run(cmd, cs);
2831
2832   /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
2833    * sysmem, and we generally assume that GMEM renderpasses leave their
2834    * results in sysmem, so we need to flush manually here. The 3d blit path
2835    * writes to depth images as a color RT, so there's no need to flush depth.
2836    */
2837   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
2838}
2839
2840void
2841tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
2842                         struct tu_cs *cs,
2843                         uint32_t a,
2844                         uint32_t gmem_a)
2845{
2846   struct tu_physical_device *phys_dev = cmd->device->physical_device;
2847   const VkRect2D *render_area = &cmd->state.render_area;
2848   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
2849   const struct tu_image_view *iview = cmd->state.attachments[a];
2850   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
2851
2852   if (!dst->store && !dst->store_stencil)
2853      return;
2854
2855   uint32_t x1 = render_area->offset.x;
2856   uint32_t y1 = render_area->offset.y;
2857   uint32_t x2 = x1 + render_area->extent.width;
2858   uint32_t y2 = y1 + render_area->extent.height;
2859   /* x2/y2 can be unaligned if equal to the size of the image, since the
2860    * store will then write into padding space. The one exception is linear
2861    * levels, which don't have the required y padding in the layout
2862    * (except for the last level).
2863    */
2864   bool need_y2_align =
2865      y2 != iview->extent.height || iview->need_y2_align;
2866
2867   bool unaligned =
2868      x1 % phys_dev->info->gmem_align_w ||
2869      (x2 % phys_dev->info->gmem_align_w && x2 != iview->extent.width) ||
2870      y1 % phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align);
2871
2872   /* D32_SFLOAT_S8_UINT is quite a special format: it has two planes,
2873    * one for depth and the other for stencil. When resolving MSAA
2874    * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
2875    */
2876   bool resolve_d32s8_s8 =
2877      src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
2878      dst->format == VK_FORMAT_S8_UINT;
2879
2880   trace_start_gmem_store(&cmd->trace, cs);
2881
2882   /* use fast path when render area is aligned, except for unsupported resolve cases */
2883   if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
2884      if (dst->store)
2885         tu_emit_blit(cmd, cs, iview, src, true, resolve_d32s8_s8);
2886      if (dst->store_stencil)
2887         tu_emit_blit(cmd, cs, iview, src, true, true);
2888
2889      trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false);
2890      return;
2891   }
2892
2893   VkFormat format = src->format;
2894   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
2895      format = VK_FORMAT_D32_SFLOAT;
2896
2897   if (dst->samples > 1) {
2898      /* If we hit this path, we have to disable draw states after every tile
2899       * instead of once at the end of the renderpass, so that they aren't
2900       * executed when calling CP_DRAW.
2901       *
2902       * TODO: store a flag somewhere so we don't do this more than once and
2903       * don't do it after the renderpass when this happens.
2904       */
2905      if (dst->store || dst->store_stencil)
2906         tu_disable_draw_states(cmd, cs);
2907
2908      if (dst->store) {
2909         store_3d_blit(cmd, cs, iview, dst->samples, resolve_d32s8_s8, format,
2910                       render_area, src->gmem_offset, src->cpp);
2911      }
2912      if (dst->store_stencil) {
2913         store_3d_blit(cmd, cs, iview, dst->samples, true, VK_FORMAT_S8_UINT,
2914                       render_area, src->gmem_offset_stencil, src->samples);
2915      }
2916   } else {
2917      r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
2918
2919      if (dst->store) {
2920         store_cp_blit(cmd, cs, iview, src->samples, resolve_d32s8_s8, format,
2921                       src->gmem_offset, src->cpp);
2922      }
2923      if (dst->store_stencil) {
2924         store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
2925                       src->gmem_offset_stencil, src->samples);
2926      }
2927   }
2928
2929   trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned);
2930}
2931