1/*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28#include "common/freedreno_guardband.h"
29#include "tu_private.h"
30
31#include "ir3/ir3_nir.h"
32#include "main/menums.h"
33#include "nir/nir.h"
34#include "nir/nir_builder.h"
35#include "spirv/nir_spirv.h"
36#include "util/debug.h"
37#include "util/mesa-sha1.h"
38#include "util/u_atomic.h"
39#include "vk_format.h"
40#include "vk_util.h"
41
42#include "tu_cs.h"
43
44/* Emit IB that preloads the descriptors that the shader uses */
45
46static void
47emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
48                enum a6xx_state_block sb, unsigned base, unsigned offset,
49                unsigned count)
50{
51   /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
52    * clear if emitting more packets will even help anything. Presumably the
53    * descriptor cache is relatively small, and these packets stop doing
54    * anything when there are too many descriptors.
55    */
56   tu_cs_emit_pkt7(cs, opcode, 3);
57   tu_cs_emit(cs,
58              CP_LOAD_STATE6_0_STATE_TYPE(st) |
59              CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
60              CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
61              CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
62   tu_cs_emit_qw(cs, offset | (base << 28));
63}
64
65static unsigned
66tu6_load_state_size(struct tu_pipeline *pipeline, bool compute)
67{
68   const unsigned load_state_size = 4;
69   unsigned size = 0;
70   for (unsigned i = 0; i < pipeline->layout->num_sets; i++) {
71      if (!(pipeline->active_desc_sets & (1u << i)))
72         continue;
73
74      struct tu_descriptor_set_layout *set_layout = pipeline->layout->set[i].layout;
75      for (unsigned j = 0; j < set_layout->binding_count; j++) {
76         struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
77         unsigned count = 0;
78         /* Note: some users, like amber for example, pass in
79          * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
80          * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
81          */
82         VkShaderStageFlags stages = compute ?
83            binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
84            binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
85         unsigned stage_count = util_bitcount(stages);
86
87         if (!binding->array_size)
88            continue;
89
90         switch (binding->type) {
91         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
92         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
93         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
94         case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
95            /* IBO-backed resources only need one packet for all graphics stages */
96            if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT)
97               count += 1;
98            if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
99               count += 1;
100            break;
101         case VK_DESCRIPTOR_TYPE_SAMPLER:
102         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
103         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
104         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
105         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
106            /* Textures and UBO's needs a packet for each stage */
107            count = stage_count;
108            break;
109         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
110            /* Because of how we pack combined images and samplers, we
111             * currently can't use one packet for the whole array.
112             */
113            count = stage_count * binding->array_size * 2;
114            break;
115         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
116         case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
117            break;
118         default:
119            unreachable("bad descriptor type");
120         }
121         size += count * load_state_size;
122      }
123   }
124   return size;
125}
126
/* Build the descriptor-preload IB for the pipeline: for every binding that
 * the pipeline's shaders actually use, emit CP_LOAD_STATE6 packets (via
 * emit_load_state()) so the CP prefetches those bindless descriptors. The
 * resulting draw state is stored in pipeline->load_state. The stream is
 * sized up-front by tu6_load_state_size(), which must stay in sync with the
 * packets emitted here.
 */
static void
tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
{
   unsigned size = tu6_load_state_size(pipeline, compute);
   if (size == 0)
      return;

   struct tu_cs cs;
   tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);

   struct tu_pipeline_layout *layout = pipeline->layout;
   for (unsigned i = 0; i < layout->num_sets; i++) {
      /* From 13.2.7. Descriptor Set Binding:
       *
       *    A compatible descriptor set must be bound for all set numbers that
       *    any shaders in a pipeline access, at the time that a draw or
       *    dispatch command is recorded to execute using that pipeline.
       *    However, if none of the shaders in a pipeline statically use any
       *    bindings with a particular set number, then no descriptor set need
       *    be bound for that set number, even if the pipeline layout includes
       *    a non-trivial descriptor set layout for that set number.
       *
       * This means that descriptor sets unused by the pipeline may have a
       * garbage or 0 BINDLESS_BASE register, which will cause context faults
       * when prefetching descriptors from these sets. Skip prefetching for
       * descriptors from them to avoid this. This is also an optimization,
       * since these prefetches would be useless.
       */
      if (!(pipeline->active_desc_sets & (1u << i)))
         continue;

      struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
      for (unsigned j = 0; j < set_layout->binding_count; j++) {
         struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
         /* base/offset locate the binding's descriptors in the bindless
          * space; dynamic descriptors are redirected to the reserved
          * MAX_SETS set below.
          */
         unsigned base = i;
         unsigned offset = binding->offset / 4;
         /* Note: some users, like amber for example, pass in
          * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
          * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
          */
         VkShaderStageFlags stages = compute ?
            binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
            binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
         unsigned count = binding->array_size;
         if (count == 0 || stages == 0)
            continue;
         switch (binding->type) {
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
            /* Dynamic descriptors live in the dedicated dynamic set. */
            base = MAX_SETS;
            offset = (layout->set[i].dynamic_offset_start +
                      binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
            FALLTHROUGH;
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
         case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
            /* IBO-backed resources only need one packet for all graphics stages */
            if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
               emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
                               base, offset, count);
            }
            if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
               emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
                               base, offset, count);
            }
            break;
         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
         case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
            /* nothing - input attachment doesn't use bindless */
            break;
         case VK_DESCRIPTOR_TYPE_SAMPLER:
         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
            /* One packet per using stage; samplers use the shader state
             * type, textures the constants state type.
             */
            tu_foreach_stage(stage, stages) {
               emit_load_state(&cs, tu6_stage2opcode(stage),
                               binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
                               ST6_SHADER : ST6_CONSTANTS,
                               tu6_stage2texsb(stage), base, offset, count);
            }
            break;
         }
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
            /* Same dynamic-set redirection as storage-buffer-dynamic above. */
            base = MAX_SETS;
            offset = (layout->set[i].dynamic_offset_start +
                      binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
            FALLTHROUGH;
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
            tu_foreach_stage(stage, stages) {
               emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
                               tu6_stage2shadersb(stage), base, offset, count);
            }
            break;
         }
         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
            tu_foreach_stage(stage, stages) {
               /* TODO: We could emit less CP_LOAD_STATE6 if we used
                * struct-of-arrays instead of array-of-structs.
                */
               for (unsigned i = 0; i < count; i++) {
                  /* Each array element interleaves a texture descriptor and
                   * a sampler descriptor, hence the factor-of-two stride.
                   */
                  unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
                  unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
                  emit_load_state(&cs, tu6_stage2opcode(stage),
                                  ST6_CONSTANTS, tu6_stage2texsb(stage),
                                  base, tex_offset, 1);
                  emit_load_state(&cs, tu6_stage2opcode(stage),
                                  ST6_SHADER, tu6_stage2texsb(stage),
                                  base, sam_offset, 1);
               }
            }
            break;
         }
         default:
            unreachable("bad descriptor type");
         }
      }
   }

   pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
}
245
/* Scratch state accumulated while translating a VkGraphicsPipelineCreateInfo
 * into a tu_pipeline. Built up from the create info and consumed by the
 * various tu_pipeline_builder_* stages.
 */
struct tu_pipeline_builder
{
   struct tu_device *device;
   struct tu_pipeline_cache *cache;
   struct tu_pipeline_layout *layout;
   const VkAllocationCallbacks *alloc;
   const VkGraphicsPipelineCreateInfo *create_info;

   /* Compiled shaders/variants, indexed by gl_shader_stage up to fragment. */
   struct tu_shader *shaders[MESA_SHADER_FRAGMENT + 1];
   struct ir3_shader_variant *variants[MESA_SHADER_FRAGMENT + 1];
   struct ir3_shader_variant *binning_variant;
   /* GPU addresses of the uploaded shader binaries. */
   uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1];
   uint64_t binning_vs_iova;

   /* Extra command-stream space reserved for unbound-length shader state. */
   uint32_t additional_cs_reserve_size;

   struct tu_pvtmem_config pvtmem;

   bool rasterizer_discard;
   /* these states are affected by rasterizer_discard */
   bool emit_msaa_state;
   VkSampleCountFlagBits samples;
   bool use_color_attachments;
   bool use_dual_src_blend;
   bool alpha_to_coverage;
   uint32_t color_attachment_count;
   VkFormat color_attachment_formats[MAX_RTS];
   VkFormat depth_attachment_format;
   uint32_t render_components;
   uint32_t multiview_mask;
};
277
278static bool
279tu_logic_op_reads_dst(VkLogicOp op)
280{
281   switch (op) {
282   case VK_LOGIC_OP_CLEAR:
283   case VK_LOGIC_OP_COPY:
284   case VK_LOGIC_OP_COPY_INVERTED:
285   case VK_LOGIC_OP_SET:
286      return false;
287   default:
288      return true;
289   }
290}
291
292static VkBlendFactor
293tu_blend_factor_no_dst_alpha(VkBlendFactor factor)
294{
295   /* treat dst alpha as 1.0 and avoid reading it */
296   switch (factor) {
297   case VK_BLEND_FACTOR_DST_ALPHA:
298      return VK_BLEND_FACTOR_ONE;
299   case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
300      return VK_BLEND_FACTOR_ZERO;
301   default:
302      return factor;
303   }
304}
305
306static bool tu_blend_factor_is_dual_src(VkBlendFactor factor)
307{
308   switch (factor) {
309   case VK_BLEND_FACTOR_SRC1_COLOR:
310   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
311   case VK_BLEND_FACTOR_SRC1_ALPHA:
312   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
313      return true;
314   default:
315      return false;
316   }
317}
318
319static bool
320tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info)
321{
322   if (!info)
323      return false;
324
325   for (unsigned i = 0; i < info->attachmentCount; i++) {
326      const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i];
327      if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) ||
328          tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) ||
329          tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) ||
330          tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor))
331         return true;
332   }
333
334   return false;
335}
336
/* Per-shader-stage register addresses. The register *layouts* are shared
 * across stages, so the emit code uses the VS field-packing macros with
 * the stage-specific addresses looked up from this table.
 */
static const struct xs_config {
   uint16_t reg_sp_xs_ctrl;                    /* SP_xS_CTRL_REG0 */
   uint16_t reg_sp_xs_config;                  /* SP_xS_CONFIG */
   uint16_t reg_sp_xs_instrlen;                /* SP_xS_INSTRLEN */
   uint16_t reg_hlsq_xs_ctrl;                  /* HLSQ_xS_CNTL */
   uint16_t reg_sp_xs_first_exec_offset;       /* SP_xS_OBJ_FIRST_EXEC_OFFSET */
   uint16_t reg_sp_xs_pvt_mem_hw_stack_offset; /* SP_xS_PVT_MEM_HW_STACK_OFFSET */
} xs_config[] = {
   [MESA_SHADER_VERTEX] = {
      REG_A6XX_SP_VS_CTRL_REG0,
      REG_A6XX_SP_VS_CONFIG,
      REG_A6XX_SP_VS_INSTRLEN,
      REG_A6XX_HLSQ_VS_CNTL,
      REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_TESS_CTRL] = {
      REG_A6XX_SP_HS_CTRL_REG0,
      REG_A6XX_SP_HS_CONFIG,
      REG_A6XX_SP_HS_INSTRLEN,
      REG_A6XX_HLSQ_HS_CNTL,
      REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_TESS_EVAL] = {
      REG_A6XX_SP_DS_CTRL_REG0,
      REG_A6XX_SP_DS_CONFIG,
      REG_A6XX_SP_DS_INSTRLEN,
      REG_A6XX_HLSQ_DS_CNTL,
      REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_GEOMETRY] = {
      REG_A6XX_SP_GS_CTRL_REG0,
      REG_A6XX_SP_GS_CONFIG,
      REG_A6XX_SP_GS_INSTRLEN,
      REG_A6XX_HLSQ_GS_CNTL,
      REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_FRAGMENT] = {
      REG_A6XX_SP_FS_CTRL_REG0,
      REG_A6XX_SP_FS_CONFIG,
      REG_A6XX_SP_FS_INSTRLEN,
      REG_A6XX_HLSQ_FS_CNTL,
      REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_COMPUTE] = {
      REG_A6XX_SP_CS_CTRL_REG0,
      REG_A6XX_SP_CS_CONFIG,
      REG_A6XX_SP_CS_INSTRLEN,
      REG_A6XX_HLSQ_CS_CNTL,
      REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
   },
};
394
395static uint32_t
396tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
397{
398   const struct ir3_const_state *const_state = ir3_const_state(xs);
399   uint32_t base = const_state->offsets.immediate;
400   int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
401
402   /* truncate size to avoid writing constants that shader
403    * does not use:
404    */
405   size = MIN2(size + base, xs->constlen) - base;
406
407   return MAX2(size, 0) * 4;
408}
409
410/* We allocate fixed-length substreams for shader state, however some
411 * parts of the state may have unbound length. Their additional space
412 * requirements should be calculated here.
413 */
414static uint32_t
415tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
416{
417   uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
418   return size;
419}
420
/* Program the per-stage SP_xS_CONFIG and HLSQ_xS_CNTL registers. A NULL
 * variant disables the stage by zeroing both registers. The A6XX_SP_VS_*
 * and A6XX_HLSQ_VS_* field macros are used for every stage because the
 * bitfield layout is shared; only the register address (from xs_config[])
 * differs per stage.
 */
void
tu6_emit_xs_config(struct tu_cs *cs,
                   gl_shader_stage stage, /* xs->type, but xs may be NULL */
                   const struct ir3_shader_variant *xs)
{
   const struct xs_config *cfg = &xs_config[stage];

   if (!xs) {
      /* shader stage disabled */
      tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
      tu_cs_emit(cs, 0);

      tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
      tu_cs_emit(cs, 0);
      return;
   }

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
   tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
                  COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
                  COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
                  COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
                  COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
                  A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
                  A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));

   tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
   tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
                  A6XX_HLSQ_VS_CNTL_ENABLED);
}
451
/* Emit all remaining per-stage shader state for one variant: the CTRL_REG0
 * register footprint/threading config, instruction length, program binary
 * address, private-memory layout, the shader binary load, immediates, and
 * (if present) the inline constant-data UBO. A NULL variant emits nothing
 * (the stage-disable registers are handled by tu6_emit_xs_config()).
 */
void
tu6_emit_xs(struct tu_cs *cs,
            gl_shader_stage stage, /* xs->type, but xs may be NULL */
            const struct ir3_shader_variant *xs,
            const struct tu_pvtmem_config *pvtmem,
            uint64_t binary_iova)
{
   const struct xs_config *cfg = &xs_config[stage];

   if (!xs) {
      /* shader stage disabled */
      return;
   }

   /* FS/CS can run at double thread size when the compiler allows it. */
   enum a6xx_threadsize thrsz =
      xs->info.double_threadsize ? THREAD128 : THREAD64;
   /* CTRL_REG0 has slightly different valid fields per stage, so each stage
    * gets its own typed register builder.
    */
   switch (stage) {
   case MESA_SHADER_VERTEX:
      tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
               .mergedregs = xs->mergedregs,
      ));
      break;
   case MESA_SHADER_TESS_CTRL:
      tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
      ));
      break;
   case MESA_SHADER_TESS_EVAL:
      tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
               .mergedregs = xs->mergedregs,
      ));
      break;
   case MESA_SHADER_GEOMETRY:
      tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
      ));
      break;
   case MESA_SHADER_FRAGMENT:
      tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
               .mergedregs = xs->mergedregs,
               .threadsize = thrsz,
               .pixlodenable = xs->need_pixlod,
               .diff_fine = xs->need_fine_derivatives,
               .varying = xs->total_in != 0,
               /* unknown bit, seems unnecessary */
               .unk24 = true,
      ));
      break;
   case MESA_SHADER_COMPUTE:
      tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
               .mergedregs = xs->mergedregs,
               .threadsize = thrsz,
      ));
      break;
   default:
      unreachable("bad shader stage");
   }

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
   tu_cs_emit(cs, xs->instrlen);

   /* emit program binary & private memory layout
    * binary_iova should be aligned to 1 instrlen unit (128 bytes)
    */

   assert((binary_iova & 0x7f) == 0);
   assert((pvtmem->iova & 0x1f) == 0);

   /* SP_xS_OBJ_FIRST_EXEC_OFFSET .. SP_xS_PVT_MEM_SIZE are consecutive,
    * so they are written with one 7-dword pkt4.
    */
   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
   tu_cs_emit(cs, 0);
   tu_cs_emit_qw(cs, binary_iova);
   tu_cs_emit(cs,
              A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
   tu_cs_emit_qw(cs, pvtmem->iova);
   tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
                  COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
   tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));

   /* Kick off the indirect load of the shader binary itself. */
   tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                  CP_LOAD_STATE6_0_NUM_UNIT(xs->instrlen));
   tu_cs_emit_qw(cs, binary_iova);

   /* emit immediates */

   const struct ir3_const_state *const_state = ir3_const_state(xs);
   uint32_t base = const_state->offsets.immediate;
   unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);

   if (immediate_size > 0) {
      /* Inline (SS6_DIRECT) upload of the immediates into the const file. */
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));

      tu_cs_emit_array(cs, const_state->immediates, immediate_size);
   }

   if (const_state->constant_data_ubo != -1) {
      /* Constant data is appended to the shader binary in memory. */
      uint64_t iova = binary_iova + xs->info.constant_data_offset;

      /* Upload UBO state for the constant data. */
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
      tu_cs_emit(cs,
                 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(1));
      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
      int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
      tu_cs_emit_qw(cs,
                    iova |
                    (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);

      /* Upload the constant data to the const file if needed. */
      const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;

      for (int i = 0; i < ubo_state->num_enabled; i++) {
         if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
             ubo_state->range[i].ubo.bindless) {
            continue;
         }

         uint32_t start = ubo_state->range[i].start;
         uint32_t end = ubo_state->range[i].end;
         /* Clamp so the upload never runs past the shader's constlen
          * (16 bytes per vec4 const slot).
          */
         uint32_t size = MIN2(end - start,
                              (16 * xs->constlen) - ubo_state->range[i].offset);

         tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
         tu_cs_emit(cs,
                    CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
         tu_cs_emit_qw(cs, iova + start);
      }
   }
}
618
/* Emit the full compute-stage setup: invalidate stale CS state, program the
 * common shader registers via tu6_emit_xs_config()/tu6_emit_xs(), then the
 * CS-specific shared-memory size and system-value register IDs.
 */
static void
tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
                   const struct ir3_shader_variant *v,
                   const struct tu_pvtmem_config *pvtmem,
                   uint64_t binary_iova)
{
   /* Invalidate cached CS shader/IBO state before reprogramming it. */
   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .cs_state = true,
         .cs_ibo = true));

   tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v);
   tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);

   /* Shared-memory size field; minimum encoded value is 1 even for tiny or
    * zero shared_size.
    */
   uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
   tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
                  A6XX_SP_CS_UNKNOWN_A9B1_UNK6);

   /* GPUs with LPAC (a separate low-priority async compute block) mirror
    * the CS config in an additional register set.
    */
   if (cs->device->physical_device->info->a6xx.has_lpac) {
      tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
      tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
                     A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
   }

   /* Registers the shader expects system values in; regid(63, 0) marks
    * "unused" for the constant/register IDs below.
    */
   uint32_t local_invocation_id =
      ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
   uint32_t work_group_id =
      ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);

   enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
   tu_cs_emit(cs,
              A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
              A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
              A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
              A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
   tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
                  A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));

   if (cs->device->physical_device->info->a6xx.has_lpac) {
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
      tu_cs_emit(cs,
                 A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
                 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
                 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
                 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
      tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
                     A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
   }
}
669
/* Program VFD_CONTROL_1..6 with the register IDs where the vertex-frontend
 * stages (VS/HS/DS/GS) expect their system values. A stage that is absent,
 * or a sysval the shader does not read, gets regid(63, 0) ("unused").
 */
static void
tu6_emit_vs_system_values(struct tu_cs *cs,
                          const struct ir3_shader_variant *vs,
                          const struct ir3_shader_variant *hs,
                          const struct ir3_shader_variant *ds,
                          const struct ir3_shader_variant *gs,
                          bool primid_passthru)
{
   const uint32_t vertexid_regid =
         ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
   const uint32_t instanceid_regid =
         ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
   /* Tess coord is a vec2; y lives in the register right after x. */
   const uint32_t tess_coord_x_regid = hs ?
         ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) :
         regid(63, 0);
   const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
         tess_coord_x_regid + 1 :
         regid(63, 0);
   const uint32_t hs_rel_patch_regid = hs ?
         ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
         regid(63, 0);
   const uint32_t ds_rel_patch_regid = hs ?
         ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
         regid(63, 0);
   const uint32_t hs_invocation_regid = hs ?
         ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) :
         regid(63, 0);
   const uint32_t gs_primitiveid_regid = gs ?
         ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) :
         regid(63, 0);
   /* With tess, the primitive ID feeding the VS slot comes from HS;
    * otherwise reuse whatever GS wants (or "unused").
    */
   const uint32_t vs_primitiveid_regid = hs ?
         ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) :
         gs_primitiveid_regid;
   const uint32_t ds_primitiveid_regid = ds ?
         ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) :
         regid(63, 0);
   const uint32_t gsheader_regid = gs ?
         ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) :
         regid(63, 0);

   /* Note: we currently don't support multiview with tess or GS. If we did,
    * and the HW actually works, then we'd have to somehow share this across
    * stages. Note that the blob doesn't support this either.
    */
   const uint32_t viewid_regid =
      ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);

   tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6);
   tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
   tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
                  A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
   tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
                  A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
   tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
   tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
                  0xfc00); /* VFD_CONTROL_5 */
   tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */
}
733
/* Emit stream-output (transform feedback) state for the last
 * pre-rasterization stage.  The SO "program" is a table in HW RAM, written
 * through VPC_SO_CNTL/VPC_SO_PROG, that maps VPC output locations to dword
 * offsets in the four streamout buffers.  Everything is wrapped in a
 * CP_CONTEXT_REG_BUNCH packet so it only takes effect for draws that
 * actually enable streamout.
 *
 * v: the last pre-rasterization shader variant (VS, DS or GS)
 * l: its linkage map, already filled in by ir3_link_shaders() /
 *    ir3_link_stream_out()
 */
static void
tu6_setup_streamout(struct tu_cs *cs,
                    const struct ir3_shader_variant *v,
                    struct ir3_shader_linkage *l)
{
   const struct ir3_stream_output_info *info = &v->shader->stream_output;
   /* Note: 64 here comes from the HW layout of the program RAM. The program
    * for stream N is at DWORD 64 * N.
    */
#define A6XX_SO_PROG_DWORDS 64
   uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
   BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
   uint32_t ncomp[IR3_MAX_SO_BUFFERS] = {};

   /* TODO: streamout state should be in a non-GMEM draw state */

   /* no streamout: */
   if (info->num_outputs == 0) {
      tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
      tu_cs_emit(cs, 0);
      return;
   }

   /* is there something to do with info->stride[i]? */

   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct ir3_stream_output *out = &info->output[i];
      unsigned k = out->register_index;
      unsigned idx;

      /* Skip it, if it's an output that was never assigned a register. */
      if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
         continue;

      /* Per-buffer component counts, used below for VPC_SO_NCOMP and to
       * decide which BUFn_STREAM fields to enable.
       */
      ncomp[out->output_buffer] += out->num_components;

      /* linkage map sorted by order frag shader wants things, so
       * a bit less ideal here..
       */
      for (idx = 0; idx < l->cnt; idx++)
         if (l->var[idx].regid == v->outputs[k].regid)
            break;

      debug_assert(idx < l->cnt);

      /* Each SO_PROG dword holds two entries (A for even VPC locations, B
       * for odd ones).  Track which dwords are populated so the emit loop
       * below can skip unused ranges of the program RAM.
       */
      for (unsigned j = 0; j < out->num_components; j++) {
         unsigned c   = j + out->start_component;
         unsigned loc = l->var[idx].loc + c;
         unsigned off = j + out->dst_offset;  /* in dwords */

         assert(loc < A6XX_SO_PROG_DWORDS * 2);
         unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
         if (loc & 1) {
            prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
                           A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
                           A6XX_VPC_SO_PROG_B_OFF(off * 4);
         } else {
            prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
                           A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
                           A6XX_VPC_SO_PROG_A_OFF(off * 4);
         }
         BITSET_SET(valid_dwords, dword);
      }
   }

   /* Size the packet: each populated range costs one VPC_SO_CNTL write
    * (the "+ 1") plus one VPC_SO_PROG write per dword in the range; every
    * write is a reg/value pair, hence "2 * prog_count" below.  The fixed
    * 10 dwords are VPC_SO_STREAM_CNTL (2) + 4x VPC_SO_NCOMP (8).
    */
   unsigned prog_count = 0;
   unsigned start, end;
   BITSET_FOREACH_RANGE(start, end, valid_dwords,
                        A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
      prog_count += end - start + 1;
   }

   tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
   tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
   tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
                  COND(ncomp[0] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
                  COND(ncomp[1] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
                  COND(ncomp[2] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
                  COND(ncomp[3] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
   for (uint32_t i = 0; i < 4; i++) {
      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(i));
      tu_cs_emit(cs, ncomp[i]);
   }
   /* RESET is set only on the first VPC_SO_CNTL write; subsequent writes
    * just reposition the program write address with ADDR before streaming
    * in the PROG dwords for that range.
    */
   bool first = true;
   BITSET_FOREACH_RANGE(start, end, valid_dwords,
                        A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
      tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
                     A6XX_VPC_SO_CNTL_ADDR(start));
      for (unsigned i = start; i < end; i++) {
         tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
         tu_cs_emit(cs, prog[i]);
      }
      first = false;
   }
}
837
838static void
839tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
840               enum a6xx_state_block block, uint32_t offset,
841               uint32_t size, const uint32_t *dwords) {
842   assert(size % 4 == 0);
843
844   tu_cs_emit_pkt7(cs, opcode, 3 + size);
845   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
846         CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
847         CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
848         CP_LOAD_STATE6_0_STATE_BLOCK(block) |
849         CP_LOAD_STATE6_0_NUM_UNIT(size / 4));
850
851   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
852   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
853   dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
854
855   tu_cs_emit_array(cs, dwords, size);
856}
857
858static void
859tu6_emit_link_map(struct tu_cs *cs,
860                  const struct ir3_shader_variant *producer,
861                  const struct ir3_shader_variant *consumer,
862                  enum a6xx_state_block sb)
863{
864   const struct ir3_const_state *const_state = ir3_const_state(consumer);
865   uint32_t base = const_state->offsets.primitive_map;
866   int size = DIV_ROUND_UP(consumer->input_size, 4);
867
868   size = (MIN2(size + base, consumer->constlen) - base) * 4;
869   if (size <= 0)
870      return;
871
872   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
873                         producer->output_loc);
874}
875
876static uint16_t
877gl_primitive_to_tess(uint16_t primitive) {
878   switch (primitive) {
879   case GL_POINTS:
880      return TESS_POINTS;
881   case GL_LINE_STRIP:
882      return TESS_LINES;
883   case GL_TRIANGLE_STRIP:
884      return TESS_CW_TRIS;
885   default:
886      unreachable("");
887   }
888}
889
/* Emit all VPC/SP/PC/GRAS state describing how the outputs of the last
 * pre-rasterization stage (VS, DS or GS) are laid out in the VPC and
 * consumed by the rasterizer/FS: the linkage (output reg -> VPC location)
 * tables, position/psize/layer/viewport/clip-dist locations, streamout,
 * and the tessellation/geometry plumbing that depends on the same linkage.
 */
void
tu6_emit_vpc(struct tu_cs *cs,
             const struct ir3_shader_variant *vs,
             const struct ir3_shader_variant *hs,
             const struct ir3_shader_variant *ds,
             const struct ir3_shader_variant *gs,
             const struct ir3_shader_variant *fs,
             uint32_t patch_control_points)
{
   /* note: doesn't compile as static because of the array regs.. */
   /* Per-stage register addresses: the same logical state lives at
    * different register offsets depending on which stage is last, so look
    * them up by stage type.  HS only has PC_HS_OUT_CNTL since it can never
    * be the last stage before rasterization.
    */
   const struct reg_config {
      uint16_t reg_sp_xs_out_reg;
      uint16_t reg_sp_xs_vpc_dst_reg;
      uint16_t reg_vpc_xs_pack;
      uint16_t reg_vpc_xs_clip_cntl;
      uint16_t reg_gras_xs_cl_cntl;
      uint16_t reg_pc_xs_out_cntl;
      uint16_t reg_sp_xs_primitive_cntl;
      uint16_t reg_vpc_xs_layer_cntl;
      uint16_t reg_gras_xs_layer_cntl;
   } reg_config[] = {
      [MESA_SHADER_VERTEX] = {
         REG_A6XX_SP_VS_OUT_REG(0),
         REG_A6XX_SP_VS_VPC_DST_REG(0),
         REG_A6XX_VPC_VS_PACK,
         REG_A6XX_VPC_VS_CLIP_CNTL,
         REG_A6XX_GRAS_VS_CL_CNTL,
         REG_A6XX_PC_VS_OUT_CNTL,
         REG_A6XX_SP_VS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_VS_LAYER_CNTL,
         REG_A6XX_GRAS_VS_LAYER_CNTL
      },
      [MESA_SHADER_TESS_CTRL] = {
         0,
         0,
         0,
         0,
         0,
         REG_A6XX_PC_HS_OUT_CNTL,
         0,
         0,
         0
      },
      [MESA_SHADER_TESS_EVAL] = {
         REG_A6XX_SP_DS_OUT_REG(0),
         REG_A6XX_SP_DS_VPC_DST_REG(0),
         REG_A6XX_VPC_DS_PACK,
         REG_A6XX_VPC_DS_CLIP_CNTL,
         REG_A6XX_GRAS_DS_CL_CNTL,
         REG_A6XX_PC_DS_OUT_CNTL,
         REG_A6XX_SP_DS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_DS_LAYER_CNTL,
         REG_A6XX_GRAS_DS_LAYER_CNTL
      },
      [MESA_SHADER_GEOMETRY] = {
         REG_A6XX_SP_GS_OUT_REG(0),
         REG_A6XX_SP_GS_VPC_DST_REG(0),
         REG_A6XX_VPC_GS_PACK,
         REG_A6XX_VPC_GS_CLIP_CNTL,
         REG_A6XX_GRAS_GS_CL_CNTL,
         REG_A6XX_PC_GS_OUT_CNTL,
         REG_A6XX_SP_GS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_GS_LAYER_CNTL,
         REG_A6XX_GRAS_GS_LAYER_CNTL
      },
   };

   /* GS > DS > VS: the last enabled stage feeds the rasterizer.  Note that
    * hs implies ds (tess always has both).
    */
   const struct ir3_shader_variant *last_shader;
   if (gs) {
      last_shader = gs;
   } else if (hs) {
      last_shader = ds;
   } else {
      last_shader = vs;
   }

   const struct reg_config *cfg = &reg_config[last_shader->type];

   /* 0xff is the "no location assigned" sentinel for the special locs. */
   struct ir3_shader_linkage linkage = {
      .primid_loc = 0xff,
      .clip0_loc = 0xff,
      .clip1_loc = 0xff,
   };
   if (fs)
      ir3_link_shaders(&linkage, last_shader, fs, true);

   if (last_shader->shader->stream_output.num_outputs)
      ir3_link_stream_out(&linkage, last_shader);

   /* We do this after linking shaders in order to know whether PrimID
    * passthrough needs to be enabled.
    */
   bool primid_passthru = linkage.primid_loc != 0xff;
   tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru);

   /* Disable every VPC varying component the linkage doesn't use
    * (VPC_VAR_DISABLE takes the inverted varmask).
    */
   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
   tu_cs_emit(cs, ~linkage.varmask[0]);
   tu_cs_emit(cs, ~linkage.varmask[1]);
   tu_cs_emit(cs, ~linkage.varmask[2]);
   tu_cs_emit(cs, ~linkage.varmask[3]);

   /* a6xx finds position/pointsize at the end */
   /* regid(63, 0) is the "output not written by the shader" sentinel. */
   const uint32_t pointsize_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
   const uint32_t layer_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
   const uint32_t view_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
   const uint32_t clip0_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
   const uint32_t clip1_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
   uint32_t flags_regid = gs ?
      ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;

   uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;

   if (layer_regid != regid(63, 0)) {
      layer_loc = linkage.max_loc;
      ir3_link_add(&linkage, layer_regid, 0x1, linkage.max_loc);
   }

   if (view_regid != regid(63, 0)) {
      view_loc = linkage.max_loc;
      ir3_link_add(&linkage, view_regid, 0x1, linkage.max_loc);
   }

   /* Multiview: link one position output per view (each at
    * position_loc + 4 * view); EXTRAPOS below tells the HW how many extra
    * position slots follow the first one.
    */
   unsigned extra_pos = 0;

   for (unsigned i = 0; i < last_shader->outputs_count; i++) {
      if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
         continue;

      if (position_loc == 0xff)
         position_loc = linkage.max_loc;

      ir3_link_add(&linkage, last_shader->outputs[i].regid,
                   0xf, position_loc + 4 * last_shader->outputs[i].view);
      extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
   }

   if (pointsize_regid != regid(63, 0)) {
      pointsize_loc = linkage.max_loc;
      ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
   }

   uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;

   /* Handle the case where clip/cull distances aren't read by the FS */
   uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
   if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
      clip0_loc = linkage.max_loc;
      ir3_link_add(&linkage, clip0_regid, clip_cull_mask & 0xf, linkage.max_loc);
   }
   if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
      clip1_loc = linkage.max_loc;
      ir3_link_add(&linkage, clip1_regid, clip_cull_mask >> 4, linkage.max_loc);
   }

   tu6_setup_streamout(cs, last_shader, &linkage);

   /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
    * at least when a DS is the last stage, so add a dummy output to keep it
    * happy if there aren't any. We do this late in order to avoid emitting
    * any unused code and make sure that optimizations don't remove it.
    */
   if (linkage.cnt == 0)
      ir3_link_add(&linkage, 0, 0x1, linkage.max_loc);

   /* map outputs of the last shader to VPC */
   /* Two 16-bit OUT_REG entries per dword, four 8-bit VPC_DST locations
    * per dword — hence the uint16_t/uint8_t views of the dword arrays.
    */
   assert(linkage.cnt <= 32);
   const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
   const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
   uint32_t sp_out[16] = {0};
   uint32_t sp_vpc_dst[8] = {0};
   for (uint32_t i = 0; i < linkage.cnt; i++) {
      ((uint16_t *) sp_out)[i] =
         A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
         A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
      ((uint8_t *) sp_vpc_dst)[i] =
         A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
   }

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
   tu_cs_emit_array(cs, sp_out, sp_out_count);

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
   tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);

   tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
   tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
                  A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
                  A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
                  A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));

   tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
   tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));

   tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
   tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
                  A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));

   /* PC_xS_OUT_CNTL is emitted for every present geometry stage; only the
    * last one gets the full output info, earlier stages just carry the
    * PrimID flag.
    */
   const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };

   for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
      const struct ir3_shader_variant *shader = geom_shaders[i];
      if (!shader)
         continue;

      bool primid = shader->type != MESA_SHADER_VERTEX &&
         VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));

      tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
      if (shader == last_shader) {
         tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
                        CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
                        CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
                        CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
                        COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
                        A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
      } else {
         tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
      }
   }

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
   tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
                  A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));

   tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
   tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
                  A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));

   tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
   tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
                  CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));

   tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru));

   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
   tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? fs->total_in : 0) |
                  COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
                  A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
                  A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));

   if (hs) {
      shader_info *hs_info = &hs->shader->nir->info;

      tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
      tu_cs_emit(cs, hs_info->tess.tcs_vertices_out);

      /* Total attribute slots in HS incoming patch. */
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
      tu_cs_emit(cs, patch_control_points * vs->output_size / 4);

      const uint32_t wavesize = 64;
      const uint32_t max_wave_input_size = 64;

      /* note: if HS is really just the VS extended, then this
       * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out)
       * however that doesn't match the blob, and fails some dEQP tests.
       */
      uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out;
      uint32_t max_prims_per_wave =
         max_wave_input_size * wavesize / (vs->output_size * patch_control_points);
      prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave);

      uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave;
      uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize);

      tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
      tu_cs_emit(cs, wave_input_size);

      /* In SPIR-V generated from GLSL, the tessellation primitive params are
       * are specified in the tess eval shader, but in SPIR-V generated from
       * HLSL, they are specified in the tess control shader. */
      shader_info *tess_info =
            ds->shader->nir->info.tess.spacing == TESS_SPACING_UNSPECIFIED ?
            &hs->shader->nir->info : &ds->shader->nir->info;
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1);
      uint32_t output;
      if (tess_info->tess.point_mode)
         output = TESS_POINTS;
      else if (tess_info->tess.primitive_mode == GL_ISOLINES)
         output = TESS_LINES;
      else if (tess_info->tess.ccw)
         output = TESS_CCW_TRIS;
      else
         output = TESS_CW_TRIS;

      enum a6xx_tess_spacing spacing;
      switch (tess_info->tess.spacing) {
      case TESS_SPACING_EQUAL:
         spacing = TESS_EQUAL;
         break;
      case TESS_SPACING_FRACTIONAL_ODD:
         spacing = TESS_FRACTIONAL_ODD;
         break;
      case TESS_SPACING_FRACTIONAL_EVEN:
         spacing = TESS_FRACTIONAL_EVEN;
         break;
      case TESS_SPACING_UNSPECIFIED:
      default:
         unreachable("invalid tess spacing");
      }
      tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) |
            A6XX_PC_TESS_CNTL_OUTPUT(output));

      tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
      tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
   }


   if (gs) {
      uint32_t vertices_out, invocations, output, vec4_size;
      uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;

      /* this detects the tu_clear_blit path, which doesn't set ->nir */
      if (gs->shader->nir) {
         if (hs) {
            tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
         } else {
            tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
         }
         vertices_out = gs->shader->nir->info.gs.vertices_out - 1;
         output = gl_primitive_to_tess(gs->shader->nir->info.gs.output_primitive);
         invocations = gs->shader->nir->info.gs.invocations - 1;
         /* Size of per-primitive alloction in ldlw memory in vec4s. */
         vec4_size = gs->shader->nir->info.gs.vertices_in *
                     DIV_ROUND_UP(prev_stage_output_size, 4);
      } else {
         vertices_out = 3;
         output = TESS_CW_TRIS;
         invocations = 0;
         vec4_size = 0;
      }

      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
      tu_cs_emit(cs,
            A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
            A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
            A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));

      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
      tu_cs_emit(cs, 0xff);

      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
      tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));

      /* NOTE(review): this clamp looks odd — sizes > 64 are written as 64
       * while exactly 64 becomes 63; it matches existing driver behavior,
       * but confirm the intended encoding of SP_GS_PRIM_SIZE for sizes
       * >= 64 before changing it.
       */
      uint32_t prim_size = prev_stage_output_size;
      if (prim_size > 64)
         prim_size = 64;
      else if (prim_size == 64)
         prim_size = 63;
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
      tu_cs_emit(cs, prim_size);
   }
}
1250
1251static int
1252tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
1253                     uint32_t index,
1254                     uint8_t *interp_mode,
1255                     uint8_t *ps_repl_mode)
1256{
1257   enum
1258   {
1259      INTERP_SMOOTH = 0,
1260      INTERP_FLAT = 1,
1261      INTERP_ZERO = 2,
1262      INTERP_ONE = 3,
1263   };
1264   enum
1265   {
1266      PS_REPL_NONE = 0,
1267      PS_REPL_S = 1,
1268      PS_REPL_T = 2,
1269      PS_REPL_ONE_MINUS_T = 3,
1270   };
1271
1272   const uint32_t compmask = fs->inputs[index].compmask;
1273
1274   /* NOTE: varyings are packed, so if compmask is 0xb then first, second, and
1275    * fourth component occupy three consecutive varying slots
1276    */
1277   int shift = 0;
1278   *interp_mode = 0;
1279   *ps_repl_mode = 0;
1280   if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
1281      if (compmask & 0x1) {
1282         *ps_repl_mode |= PS_REPL_S << shift;
1283         shift += 2;
1284      }
1285      if (compmask & 0x2) {
1286         *ps_repl_mode |= PS_REPL_T << shift;
1287         shift += 2;
1288      }
1289      if (compmask & 0x4) {
1290         *interp_mode |= INTERP_ZERO << shift;
1291         shift += 2;
1292      }
1293      if (compmask & 0x8) {
1294         *interp_mode |= INTERP_ONE << 6;
1295         shift += 2;
1296      }
1297   } else if (fs->inputs[index].flat) {
1298      for (int i = 0; i < 4; i++) {
1299         if (compmask & (1 << i)) {
1300            *interp_mode |= INTERP_FLAT << shift;
1301            shift += 2;
1302         }
1303      }
1304   }
1305
1306   return shift;
1307}
1308
1309static void
1310tu6_emit_vpc_varying_modes(struct tu_cs *cs,
1311                           const struct ir3_shader_variant *fs)
1312{
1313   uint32_t interp_modes[8] = { 0 };
1314   uint32_t ps_repl_modes[8] = { 0 };
1315
1316   if (fs) {
1317      for (int i = -1;
1318           (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
1319
1320         /* get the mode for input i */
1321         uint8_t interp_mode;
1322         uint8_t ps_repl_mode;
1323         const int bits =
1324            tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode);
1325
1326         /* OR the mode into the array */
1327         const uint32_t inloc = fs->inputs[i].inloc * 2;
1328         uint32_t n = inloc / 32;
1329         uint32_t shift = inloc % 32;
1330         interp_modes[n] |= interp_mode << shift;
1331         ps_repl_modes[n] |= ps_repl_mode << shift;
1332         if (shift + bits > 32) {
1333            n++;
1334            shift = 32 - shift;
1335
1336            interp_modes[n] |= interp_mode >> shift;
1337            ps_repl_modes[n] |= ps_repl_mode >> shift;
1338         }
1339      }
1340   }
1341
1342   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
1343   tu_cs_emit_array(cs, interp_modes, 8);
1344
1345   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
1346   tu_cs_emit_array(cs, ps_repl_modes, 8);
1347}
1348
/* Emit FS input state: texture-prefetch commands, the HLSQ register ids
 * for sysvals and barycentrics, and the GRAS/RB interpolation-enable bits
 * derived from which barycentric registers the FS actually reads.
 */
void
tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
{
   uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
   uint32_t ij_regid[IJ_COUNT];
   uint32_t smask_in_regid;

   bool sample_shading = fs->per_samp | fs->key.sample_shading;
   bool enable_varyings = fs->total_in > 0;

   /* Register ids assigned by the compiler; regid(63, 0) means "unused". */
   samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
   smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
   face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
   coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
   /* z/w of gl_FragCoord live in the two registers after x/y. */
   zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
   for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
      ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);

   if (fs->num_sampler_prefetch > 0) {
      assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL]));
      /* also, it seems like ij_pix is *required* to be r0.x */
      assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
   tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
         A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) |
         0x7000);    // XXX);
   for (int i = 0; i < fs->num_sampler_prefetch; i++) {
      const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
      tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) |
                     A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) |
                     A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) |
                     A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) |
                     A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) |
                     COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) |
                     A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
   }

   /* Bindless sampler/texture ids for each prefetch slot (used with
    * bindless descriptor sets).
    */
   if (fs->num_sampler_prefetch > 0) {
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
      for (int i = 0; i < fs->num_sampler_prefetch; i++) {
         const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
         tu_cs_emit(cs,
                    A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
                    A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
      }
   }

   /* HLSQ_CONTROL_1..5: wire up the sysval and barycentric register ids.
    * The bare 0x7 (CONTROL_1) and 0xfcfc (CONTROL_5) are magic values —
    * presumably 0xfc is the "unused regid" encoding; TODO: document.
    */
   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
   tu_cs_emit(cs, 0x7);
   tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
                  A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
                  A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) |
                  A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE]));
   tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
                  A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
                  A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) |
                  A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID]));
   tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
                  A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
                  A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) |
                  A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE]));
   tu_cs_emit(cs, 0xfcfc);

   enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
   tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) |
                  COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS));

   /* IJ_PERSP_SIZE, faceness and fragcoord apparently piggyback on the
    * linear pixel/sample enables below (per-sample when sample shading is
    * active) — there is no dedicated enable bit for them.
    */
   bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
   bool need_size_persamp = false;
   if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) {
      if (sample_shading)
         need_size_persamp = true;
      else
         need_size = true;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
   tu_cs_emit(cs,
         CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
         CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
         CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
         CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
         CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
         CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
         COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
         COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
         COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));

   /* RB_RENDER_CONTROL0 mirrors the GRAS_CNTL enables. */
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
   tu_cs_emit(cs,
         CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
         CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
         CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
         CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
         CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
         CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
         COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
         COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
         COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
         COND(fs->fragcoord_compmask != 0,
                           A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
   tu_cs_emit(cs,
         A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
            sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
         CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
         CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
         CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) |
         COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
   tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
   tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
              A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
                 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
   tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
}
1472
/* Emit FS output state: the register ids for depth/sample-mask/stencil-ref
 * and the per-MRT color outputs, the render-component masks, and (when a
 * pipeline is given) the LRZ early/late-Z decisions derived from the FS.
 */
static void
tu6_emit_fs_outputs(struct tu_cs *cs,
                    const struct ir3_shader_variant *fs,
                    uint32_t mrt_count, bool dual_src_blend,
                    uint32_t render_components,
                    bool no_earlyz,
                    struct tu_pipeline *pipeline)
{
   uint32_t smask_regid, posz_regid, stencilref_regid;

   posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
   smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
   stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);

   /* With gl_FragColor-style broadcast (color0_mrt), the single COLOR
    * output register drives every MRT slot.
    */
   uint32_t fragdata_regid[8];
   if (fs->color0_mrt) {
      fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR);
      for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++)
         fragdata_regid[i] = fragdata_regid[0];
   } else {
      for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++)
         fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
                  A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
                  COND(dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));

   uint32_t fs_render_components = 0;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
   for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
      tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
                     (COND(fragdata_regid[i] & HALF_REG_ID,
                           A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));

      /* Enable all four components of every MRT the FS actually writes. */
      if (VALIDREG(fragdata_regid[i])) {
         fs_render_components |= 0xf << (i * 4);
      }
   }

   /* dual source blending has an extra fs output in the 2nd slot */
   if (dual_src_blend) {
      fs_render_components |= 0xf << 4;
   }

   /* There is no point in having component enabled which is not written
    * by the shader. Per VK spec it is an UB, however a few apps depend on
    * attachment not being changed if FS doesn't have corresponding output.
    */
   fs_render_components &= render_components;

   tu_cs_emit_regs(cs,
                   A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
                  COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
                  COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
                  COND(dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
   tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count));

   tu_cs_emit_regs(cs,
                   A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));

   /* Record what LRZ needs to know: any FS behavior that makes early-Z
    * unsafe (discard, depth/stencil-ref/sample-mask writes) forces late-Z
    * unless the shader declares early_fragment_tests.
    *
    * NOTE(review): fs->shader is null-checked in the condition below but
    * dereferenced unconditionally two lines above — confirm whether
    * fs->shader can actually be NULL on this path.
    */
   if (pipeline) {
      pipeline->lrz.fs_has_kill = fs->has_kill;
      pipeline->lrz.early_fragment_tests = fs->shader->nir->info.fs.early_fragment_tests;

      if ((fs->shader && !fs->shader->nir->info.fs.early_fragment_tests) &&
          (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) {
         pipeline->lrz.force_late_z = true;
      }
   }
}
1551
/* Upload the "primitive param" constants consumed by the geometry/tess
 * stages: per-stage primitive stride, vertex stride, and (for tess) the HS
 * output size and control points per patch. Callers only invoke this when
 * at least one of hs/gs is present (see tu6_emit_program).
 */
static void
tu6_emit_geom_tess_consts(struct tu_cs *cs,
                          const struct ir3_shader_variant *vs,
                          const struct ir3_shader_variant *hs,
                          const struct ir3_shader_variant *ds,
                          const struct ir3_shader_variant *gs,
                          uint32_t cps_per_patch)
{
   /* With tessellation the VS feeds patches of cps_per_patch vertices;
    * otherwise the VS feeds the GS input primitive directly. Note gs is
    * dereferenced here when hs is NULL, so one of the two must exist.
    */
   uint32_t num_vertices =
         hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in;

   uint32_t vs_params[4] = {
      vs->output_size * num_vertices * 4,  /* vs primitive stride */
      vs->output_size * 4,                 /* vs vertex stride */
      0,
      0,
   };
   /* NOTE(review): the VS path uses the ir3_const_state() accessor while the
    * hs/ds/gs paths below read ->const_state directly — presumably
    * equivalent for non-binning variants; confirm against ir3.
    */
   uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param;
   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
                  ARRAY_SIZE(vs_params), vs_params);

   if (hs) {
      assert(ds->type != MESA_SHADER_NONE);
      /* HS sees the same vertex layout the VS wrote. */
      uint32_t hs_params[4] = {
         vs->output_size * num_vertices * 4,  /* hs primitive stride */
         vs->output_size * 4,                 /* hs vertex stride */
         hs->output_size,
         cps_per_patch,
      };

      uint32_t hs_base = hs->const_state->offsets.primitive_param;
      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
                     ARRAY_SIZE(hs_params), hs_params);
      /* From the DS onward the relevant vertex count is the GS input
       * primitive size, not the patch size.
       */
      if (gs)
         num_vertices = gs->shader->nir->info.gs.vertices_in;

      uint32_t ds_params[4] = {
         ds->output_size * num_vertices * 4,  /* ds primitive stride */
         ds->output_size * 4,                 /* ds vertex stride */
         hs->output_size,                     /* hs vertex stride (dwords) */
         hs->shader->nir->info.tess.tcs_vertices_out
      };

      uint32_t ds_base = ds->const_state->offsets.primitive_param;
      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
                     ARRAY_SIZE(ds_params), ds_params);
   }

   if (gs) {
      /* The GS input stride comes from whichever stage feeds it: DS when
       * tessellating, VS otherwise.
       */
      const struct ir3_shader_variant *prev = ds ? ds : vs;
      uint32_t gs_params[4] = {
         prev->output_size * num_vertices * 4,  /* gs primitive stride */
         prev->output_size * 4,                 /* gs vertex stride */
         0,
         0,
      };
      uint32_t gs_base = gs->const_state->offsets.primitive_param;
      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
                     ARRAY_SIZE(gs_params), gs_params);
   }
}
1613
1614static void
1615tu6_emit_program_config(struct tu_cs *cs,
1616                        struct tu_pipeline_builder *builder)
1617{
1618   gl_shader_stage stage = MESA_SHADER_VERTEX;
1619
1620   STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1621
1622   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
1623         .vs_state = true,
1624         .hs_state = true,
1625         .ds_state = true,
1626         .gs_state = true,
1627         .fs_state = true,
1628         .gfx_ibo = true));
1629   for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
1630      tu6_emit_xs_config(cs, stage, builder->variants[stage]);
1631   }
1632}
1633
/* Emit the full shader-program state for either the binning or rendering
 * pass: per-stage shader objects, multiview controls, VPC/varying setup,
 * and FS input/output state (a dummy FS is substituted when fs is NULL).
 */
static void
tu6_emit_program(struct tu_cs *cs,
                 struct tu_pipeline_builder *builder,
                 bool binning_pass,
                 struct tu_pipeline *pipeline)
{
   const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
   const struct ir3_shader_variant *bs = builder->binning_variant;
   const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
   const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
   const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY];
   const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT];
   gl_shader_stage stage = MESA_SHADER_VERTEX;
   uint32_t cps_per_patch = builder->create_info->pTessellationState ?
      builder->create_info->pTessellationState->patchControlPoints : 0;
   bool multi_pos_output = builder->shaders[MESA_SHADER_VERTEX]->multi_pos_output;

  /* Don't use the binning pass variant when GS is present because we don't
   * support compiling correct binning pass variants with GS.
   */
   if (binning_pass && !gs) {
      vs = bs;
      tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova);
      /* VS already emitted above; start the loop below at the next stage. */
      stage++;
   }

   for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
      const struct ir3_shader_variant *xs = builder->variants[stage];

      /* The binning pass rasterizes nothing, so no fragment shader. */
      if (stage == MESA_SHADER_FRAGMENT && binning_pass)
         fs = xs = NULL;

      tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]);
   }

   /* View count is derived from the highest set bit of the view mask. */
   uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1;
   uint32_t multiview_cntl = builder->multiview_mask ?
      A6XX_PC_MULTIVIEW_CNTL_ENABLE |
      A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
      COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
      : 0;

   /* Copy what the blob does here. This will emit an extra 0x3f
    * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
    * this is working around yet.
    */
   if (builder->device->physical_device->info->a6xx.has_cp_reg_write) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
      tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
   } else {
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
   }
   tu_cs_emit(cs, multiview_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
   tu_cs_emit(cs, multiview_cntl);

   /* The explicit view mask register only exists on some GPUs. */
   if (multiview_cntl &&
       builder->device->physical_device->info->a6xx.supports_multiview_mask) {
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
      tu_cs_emit(cs, builder->multiview_mask);
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
   tu_cs_emit(cs, 0);

   tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch);
   tu6_emit_vpc_varying_modes(cs, fs);

   /* Stencil-only depth attachments cannot use early-Z. */
   bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT;
   uint32_t mrt_count = builder->color_attachment_count;
   uint32_t render_components = builder->render_components;

   if (builder->alpha_to_coverage) {
      /* alpha to coverage can behave like a discard */
      no_earlyz = true;
      /* alpha value comes from first mrt */
      render_components |= 0xf;
      if (!mrt_count) {
         mrt_count = 1;
         /* Disable memory write for dummy mrt because it doesn't get set otherwise */
         tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = 0));
      }
   }

   if (fs) {
      tu6_emit_fs_inputs(cs, fs);
      tu6_emit_fs_outputs(cs, fs, mrt_count,
                          builder->use_dual_src_blend,
                          render_components,
                          no_earlyz,
                          pipeline);
   } else {
      /* TODO: check if these can be skipped if fs is disabled */
      struct ir3_shader_variant dummy_variant = {};
      tu6_emit_fs_inputs(cs, &dummy_variant);
      tu6_emit_fs_outputs(cs, &dummy_variant, mrt_count,
                          builder->use_dual_src_blend,
                          render_components,
                          no_earlyz,
                          NULL);
   }

   if (gs || hs) {
      tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch);
   }
}
1742
/* Program the vertex-fetch (VFD) state from the pipeline's vertex input
 * description: per-binding strides and step rates, then one
 * VFD_DECODE/VFD_DEST_CNTL pair for each attribute the VS actually reads.
 */
static void
tu6_emit_vertex_input(struct tu_pipeline *pipeline,
                      struct tu_cs *cs,
                      const struct ir3_shader_variant *vs,
                      const VkPipelineVertexInputStateCreateInfo *info)
{
   uint32_t vfd_decode_idx = 0;
   uint32_t binding_instanced = 0; /* bitmask of instanced bindings */
   /* Only entries for bindings listed in pVertexBindingDescriptions are
    * written; attributes below only read step_rate for those bindings.
    */
   uint32_t step_rate[MAX_VBS];

   for (uint32_t i = 0; i < info->vertexBindingDescriptionCount; i++) {
      const VkVertexInputBindingDescription *binding =
         &info->pVertexBindingDescriptions[i];

      /* The stride comes from dynamic state instead when VB_STRIDE is
       * dynamic, so don't bake it into the pipeline here.
       */
      if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
         tu_cs_emit_regs(cs,
                        A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride));
      }

      if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
         binding_instanced |= 1 << binding->binding;

      step_rate[binding->binding] = 1;
   }

   /* VK_EXT_vertex_attribute_divisor overrides the default step rate of 1. */
   const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state =
      vk_find_struct_const(info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
   if (div_state) {
      for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) {
         const VkVertexInputBindingDivisorDescriptionEXT *desc =
            &div_state->pVertexBindingDivisors[i];
         step_rate[desc->binding] = desc->divisor;
      }
   }

   /* TODO: emit all VFD_DECODE/VFD_DEST_CNTL in same (two) pkt4 */

   for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription *attr =
         &info->pVertexAttributeDescriptions[i];
      uint32_t input_idx;

      /* Find the VS input whose generic slot matches this attribute's
       * location.
       */
      for (input_idx = 0; input_idx < vs->inputs_count; input_idx++) {
         if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) == attr->location)
            break;
      }

      /* attribute not used, skip it */
      if (input_idx == vs->inputs_count)
         continue;

      const struct tu_native_format format = tu6_format_vtx(attr->format);
      tu_cs_emit_regs(cs,
                      A6XX_VFD_DECODE_INSTR(vfd_decode_idx,
                        .idx = attr->binding,
                        .offset = attr->offset,
                        .instanced = binding_instanced & (1 << attr->binding),
                        .format = format.fmt,
                        .swap = format.swap,
                        .unk30 = 1,
                        ._float = !vk_format_is_int(attr->format)),
                      A6XX_VFD_DECODE_STEP_RATE(vfd_decode_idx, step_rate[attr->binding]));

      /* Route the fetched value to the VS input register/components. */
      tu_cs_emit_regs(cs,
                      A6XX_VFD_DEST_CNTL_INSTR(vfd_decode_idx,
                        .writemask = vs->inputs[input_idx].compmask,
                        .regid = vs->inputs[input_idx].regid));

      vfd_decode_idx++;
   }

   tu_cs_emit_regs(cs,
                   A6XX_VFD_CONTROL_0(
                     .fetch_cnt = vfd_decode_idx, /* decode_cnt for binning pass ? */
                     .decode_cnt = vfd_decode_idx));
}
1819
/* Emit viewport transform state: per-viewport offset/scale, the derived
 * viewport scissors, Z clamp ranges, and a single guardband that is the
 * minimum over all viewports.
 */
void
tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport)
{
   /* Start from the maximum guardband and shrink it per viewport below. */
   VkExtent2D guardband = {511, 511};

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6);
   for (uint32_t i = 0; i < num_viewport; i++) {
      const VkViewport *viewport = &viewports[i];
      float offsets[3];
      float scales[3];
      /* Standard Vulkan viewport transform: center + half-extent for x/y,
       * minDepth + depth range for z.
       */
      scales[0] = viewport->width / 2.0f;
      scales[1] = viewport->height / 2.0f;
      scales[2] = viewport->maxDepth - viewport->minDepth;
      offsets[0] = viewport->x + scales[0];
      offsets[1] = viewport->y + scales[1];
      offsets[2] = viewport->minDepth;
      for (uint32_t j = 0; j < 3; j++) {
         tu_cs_emit(cs, fui(offsets[j]));
         tu_cs_emit(cs, fui(scales[j]));
      }

      guardband.width =
         MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
      guardband.height =
         MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
   }

   /* Viewport scissor: clamp rasterization to the viewport rectangle,
    * handling negative-height (y-flipped) viewports.
    */
   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2);
   for (uint32_t i = 0; i < num_viewport; i++) {
      const VkViewport *viewport = &viewports[i];
      VkOffset2D min;
      VkOffset2D max;
      min.x = (int32_t) viewport->x;
      max.x = (int32_t) ceilf(viewport->x + viewport->width);
      if (viewport->height >= 0.0f) {
         min.y = (int32_t) viewport->y;
         max.y = (int32_t) ceilf(viewport->y + viewport->height);
      } else {
         min.y = (int32_t)(viewport->y + viewport->height);
         max.y = (int32_t) ceilf(viewport->y);
      }
      /* the spec allows viewport->height to be 0.0f */
      if (min.y == max.y)
         max.y++;
      /* allow viewport->width = 0.0f for un-initialized viewports: */
      if (min.x == max.x)
         max.x++;

      min.x = MAX2(min.x, 0);
      min.y = MAX2(min.y, 0);

      assert(min.x < max.x);
      assert(min.y < max.y);
      tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
                     A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
      tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(max.x - 1) |
                     A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(max.y - 1));
   }

   /* Per-viewport Z clamp; min/max are ordered since minDepth may exceed
    * maxDepth.
    */
   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2);
   for (uint32_t i = 0; i < num_viewport; i++) {
      const VkViewport *viewport = &viewports[i];
      tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
      tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
   }
   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
   tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
                  A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));

   /* TODO: what to do about this and multi viewport ? */
   float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
   float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0;

   tu_cs_emit_regs(cs,
                   A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
                   A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
}
1897
1898void
1899tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count)
1900{
1901   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2);
1902
1903   for (uint32_t i = 0; i < scissor_count; i++) {
1904      const VkRect2D *scissor = &scissors[i];
1905
1906      uint32_t min_x = scissor->offset.x;
1907      uint32_t min_y = scissor->offset.y;
1908      uint32_t max_x = min_x + scissor->extent.width - 1;
1909      uint32_t max_y = min_y + scissor->extent.height - 1;
1910
1911      if (!scissor->extent.width || !scissor->extent.height) {
1912         min_x = min_y = 1;
1913         max_x = max_y = 0;
1914      } else {
1915         /* avoid overflow */
1916         uint32_t scissor_max = BITFIELD_MASK(15);
1917         min_x = MIN2(scissor_max, min_x);
1918         min_y = MIN2(scissor_max, min_y);
1919         max_x = MIN2(scissor_max, max_x);
1920         max_y = MIN2(scissor_max, max_y);
1921      }
1922
1923      tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
1924                     A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
1925      tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
1926                     A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
1927   }
1928}
1929
1930void
1931tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc)
1932{
1933   if (!samp_loc) {
1934      tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
1935      tu_cs_emit(cs, 0);
1936
1937      tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
1938      tu_cs_emit(cs, 0);
1939
1940      tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
1941      tu_cs_emit(cs, 0);
1942      return;
1943   }
1944
1945   assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount);
1946   assert(samp_loc->sampleLocationGridSize.width == 1);
1947   assert(samp_loc->sampleLocationGridSize.height == 1);
1948
1949   uint32_t sample_config =
1950      A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE;
1951   uint32_t sample_locations = 0;
1952   for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) {
1953      sample_locations |=
1954         (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) |
1955          A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8;
1956   }
1957
1958   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2);
1959   tu_cs_emit(cs, sample_config);
1960   tu_cs_emit(cs, sample_locations);
1961
1962   tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2);
1963   tu_cs_emit(cs, sample_config);
1964   tu_cs_emit(cs, sample_locations);
1965
1966   tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2);
1967   tu_cs_emit(cs, sample_config);
1968   tu_cs_emit(cs, sample_locations);
1969}
1970
1971static uint32_t
1972tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info,
1973                 enum a5xx_line_mode line_mode,
1974                 bool multiview)
1975{
1976   uint32_t gras_su_cntl = 0;
1977
1978   if (rast_info->cullMode & VK_CULL_MODE_FRONT_BIT)
1979      gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
1980   if (rast_info->cullMode & VK_CULL_MODE_BACK_BIT)
1981      gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
1982
1983   if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE)
1984      gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
1985
1986   gras_su_cntl |=
1987      A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f);
1988
1989   if (rast_info->depthBiasEnable)
1990      gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
1991
1992   gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode);
1993
1994   if (multiview) {
1995      gras_su_cntl |=
1996         A6XX_GRAS_SU_CNTL_UNK17 |
1997         A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE;
1998   }
1999
2000   return gras_su_cntl;
2001}
2002
/* Emit depth bias (polygon offset) state: slope factor, constant factor
 * and clamp, written as three consecutive GRAS_SU_POLY_OFFSET registers.
 */
void
tu6_emit_depth_bias(struct tu_cs *cs,
                    float constant_factor,
                    float clamp,
                    float slope_factor)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
   tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value);
   tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value);
   tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value);
}
2014
2015static uint32_t
2016tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att,
2017                         bool has_alpha)
2018{
2019   const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->colorBlendOp);
2020   const enum adreno_rb_blend_factor src_color_factor = tu6_blend_factor(
2021      has_alpha ? att->srcColorBlendFactor
2022                : tu_blend_factor_no_dst_alpha(att->srcColorBlendFactor));
2023   const enum adreno_rb_blend_factor dst_color_factor = tu6_blend_factor(
2024      has_alpha ? att->dstColorBlendFactor
2025                : tu_blend_factor_no_dst_alpha(att->dstColorBlendFactor));
2026   const enum a3xx_rb_blend_opcode alpha_op = tu6_blend_op(att->alphaBlendOp);
2027   const enum adreno_rb_blend_factor src_alpha_factor =
2028      tu6_blend_factor(att->srcAlphaBlendFactor);
2029   const enum adreno_rb_blend_factor dst_alpha_factor =
2030      tu6_blend_factor(att->dstAlphaBlendFactor);
2031
2032   return A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(src_color_factor) |
2033          A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(color_op) |
2034          A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(dst_color_factor) |
2035          A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(src_alpha_factor) |
2036          A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(alpha_op) |
2037          A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(dst_alpha_factor);
2038}
2039
2040static uint32_t
2041tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att,
2042                   uint32_t rb_mrt_control_rop,
2043                   bool has_alpha)
2044{
2045   uint32_t rb_mrt_control =
2046      A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask);
2047
2048   rb_mrt_control |= rb_mrt_control_rop;
2049
2050   if (att->blendEnable) {
2051      rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND;
2052
2053      if (has_alpha)
2054         rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND2;
2055   }
2056
2057   return rb_mrt_control;
2058}
2059
/* Emit RB_MRT_CONTROL/RB_MRT_BLEND_CONTROL pairs for every color
 * attachment and report, via blend_enable_mask, which attachments read
 * the destination (blending enabled or a dst-reading logic op).
 */
static void
tu6_emit_rb_mrt_controls(struct tu_cs *cs,
                         const VkPipelineColorBlendStateCreateInfo *blend_info,
                         const VkFormat attachment_formats[MAX_RTS],
                         uint32_t *blend_enable_mask)
{
   *blend_enable_mask = 0;

   /* Logic op, when enabled, applies the same ROP bits to every MRT. */
   bool rop_reads_dst = false;
   uint32_t rb_mrt_control_rop = 0;
   if (blend_info->logicOpEnable) {
      rop_reads_dst = tu_logic_op_reads_dst(blend_info->logicOp);
      rb_mrt_control_rop =
         A6XX_RB_MRT_CONTROL_ROP_ENABLE |
         A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp));
   }

   for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
      const VkPipelineColorBlendAttachmentState *att =
         &blend_info->pAttachments[i];
      const VkFormat format = attachment_formats[i];

      /* Unused attachments (VK_FORMAT_UNDEFINED) get zeroed controls. */
      uint32_t rb_mrt_control = 0;
      uint32_t rb_mrt_blend_control = 0;
      if (format != VK_FORMAT_UNDEFINED) {
         const bool has_alpha = vk_format_has_alpha(format);

         rb_mrt_control =
            tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
         rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);

         if (att->blendEnable || rop_reads_dst)
            *blend_enable_mask |= 1 << i;
      }

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2);
      tu_cs_emit(cs, rb_mrt_control);
      tu_cs_emit(cs, rb_mrt_blend_control);
   }
}
2100
/* Emit the global blend controls (SP_BLEND_CNTL / RB_BLEND_CNTL): the
 * per-MRT blend-enable mask, dual-source blend, sample mask and
 * alpha-to-coverage/one state.
 */
static void
tu6_emit_blend_control(struct tu_cs *cs,
                       uint32_t blend_enable_mask,
                       bool dual_src_blend,
                       const VkPipelineMultisampleStateCreateInfo *msaa_info)
{
   /* Use the app-provided sample mask (low 16 bits) or enable all
    * rasterization samples by default.
    */
   const uint32_t sample_mask =
      msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff)
                             : ((1 << msaa_info->rasterizationSamples) - 1);

   tu_cs_emit_regs(cs,
                   A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
                                      .dual_color_in_enable = dual_src_blend,
                                      .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
                                      .unk8 = true));

   /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
   tu_cs_emit_regs(cs,
                   A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
                                      .independent_blend = true,
                                      .sample_mask = sample_mask,
                                      .dual_color_in_enable = dual_src_blend,
                                      .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
                                      .alpha_to_one = msaa_info->alphaToOneEnable));
}
2126
2127static uint32_t
2128calc_pvtmem_size(struct tu_device *dev, struct tu_pvtmem_config *config,
2129                 uint32_t pvtmem_bytes)
2130{
2131   uint32_t per_fiber_size = ALIGN(pvtmem_bytes, 512);
2132   uint32_t per_sp_size =
2133      ALIGN(per_fiber_size * dev->physical_device->info->a6xx.fibers_per_sp, 1 << 12);
2134
2135   if (config) {
2136      config->per_fiber_size = per_fiber_size;
2137      config->per_sp_size = per_sp_size;
2138   }
2139
2140   return dev->physical_device->info->num_sp_cores * per_sp_size;
2141}
2142
2143static VkResult
2144tu_setup_pvtmem(struct tu_device *dev,
2145                struct tu_pipeline *pipeline,
2146                struct tu_pvtmem_config *config,
2147                uint32_t pvtmem_bytes, bool per_wave)
2148{
2149   if (!pvtmem_bytes) {
2150      memset(config, 0, sizeof(*config));
2151      return VK_SUCCESS;
2152   }
2153
2154   uint32_t total_size = calc_pvtmem_size(dev, config, pvtmem_bytes);
2155   config->per_wave = per_wave;
2156
2157   VkResult result =
2158      tu_bo_init_new(dev, &pipeline->pvtmem_bo, total_size,
2159                     TU_BO_ALLOC_NO_FLAGS);
2160   if (result != VK_SUCCESS)
2161      return result;
2162
2163   config->iova = pipeline->pvtmem_bo.iova;
2164
2165   return result;
2166}
2167
2168
/* Size and initialize the pipeline's command stream. The size (in dwords)
 * accounts for a fixed overhead, the LOAD_STATE packets, every shader
 * binary (byte sizes divided by 4), private memory, and any extra
 * per-variant CS space. Either builder (graphics) or compute is used.
 */
static VkResult
tu_pipeline_allocate_cs(struct tu_device *dev,
                        struct tu_pipeline *pipeline,
                        struct tu_pipeline_builder *builder,
                        struct ir3_shader_variant *compute)
{
   uint32_t size = 2048 + tu6_load_state_size(pipeline, compute);

   /* graphics case: */
   if (builder) {
      /* Private memory is sized for the worst-case variant. */
      uint32_t pvtmem_bytes = 0;
      for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
         if (builder->variants[i]) {
            size += builder->variants[i]->info.size / 4;
            pvtmem_bytes = MAX2(pvtmem_bytes, builder->variants[i]->pvtmem_size);
         }
      }

      size += builder->binning_variant->info.size / 4;
      pvtmem_bytes = MAX2(pvtmem_bytes, builder->binning_variant->pvtmem_size);

      size += calc_pvtmem_size(dev, NULL, pvtmem_bytes) / 4;

      /* Accumulate extra CS space needed per variant (and its binning
       * counterpart) so tu6_emit_xs never runs out.
       */
      builder->additional_cs_reserve_size = 0;
      for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) {
         struct ir3_shader_variant *variant = builder->variants[i];
         if (variant) {
            builder->additional_cs_reserve_size +=
               tu_xs_get_additional_cs_size_dwords(variant);

            if (variant->binning) {
               builder->additional_cs_reserve_size +=
                  tu_xs_get_additional_cs_size_dwords(variant->binning);
            }
         }
      }

      size += builder->additional_cs_reserve_size;
   } else {
      size += compute->info.size / 4;
      size += calc_pvtmem_size(dev, NULL, compute->pvtmem_size) / 4;

      size += tu_xs_get_additional_cs_size_dwords(compute);
   }

   tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);

   /* Reserve the space now such that tu_cs_begin_sub_stream never fails. Note
    * that LOAD_STATE can potentially take up a large amount of space so we
    * calculate its size explicitly.
   */
   return tu_cs_reserve_space(&pipeline->cs, size);
}
2222
/* Initialize the ir3 shader key from the pipeline create info: GS
 * presence, MSAA/sample-shading state, and a tessellation placeholder.
 */
static void
tu_pipeline_shader_key_init(struct ir3_shader_key *key,
                            const struct tu_pipeline *pipeline,
                            const VkGraphicsPipelineCreateInfo *pipeline_info)
{
   for (uint32_t i = 0; i < pipeline_info->stageCount; i++) {
      if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) {
         key->has_gs = true;
         break;
      }
   }

   /* With static rasterizer discard the MSAA-related key bits below don't
    * matter, so return early.
    */
   if (pipeline_info->pRasterizationState->rasterizerDiscardEnable &&
       !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD)))
      return;

   const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState;
   const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
      vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
   if (msaa_info->rasterizationSamples > 1 ||
       /* also set msaa key when sample location is not the default
        * since this affects varying interpolation */
       (sample_locations && sample_locations->sampleLocationsEnable)) {
      key->msaa = true;
   }

   /* note: not actually used by ir3, just checked in tu6_emit_fs_inputs */
   if (msaa_info->sampleShadingEnable)
      key->sample_shading = true;

   /* We set this after we compile to NIR because we need the prim mode */
   key->tessellation = IR3_TESS_NONE;
}
2256
2257static uint32_t
2258tu6_get_tessmode(struct tu_shader* shader)
2259{
2260   uint32_t primitive_mode = shader->ir3_shader->nir->info.tess.primitive_mode;
2261   switch (primitive_mode) {
2262   case GL_ISOLINES:
2263      return IR3_TESS_ISOLINES;
2264   case GL_TRIANGLES:
2265      return IR3_TESS_TRIANGLES;
2266   case GL_QUADS:
2267      return IR3_TESS_QUADS;
2268   case GL_NONE:
2269      return IR3_TESS_NONE;
2270   default:
2271      unreachable("bad tessmode");
2272   }
2273}
2274
2275static uint64_t
2276tu_upload_variant(struct tu_pipeline *pipeline,
2277                  const struct ir3_shader_variant *variant)
2278{
2279   struct tu_cs_memory memory;
2280
2281   if (!variant)
2282      return 0;
2283
2284   /* this expects to get enough alignment because shaders are allocated first
2285    * and total size is always aligned correctly
2286    * note: an assert in tu6_emit_xs_config validates the alignment
2287    */
2288   tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
2289
2290   memcpy(memory.map, variant->bin, variant->info.size);
2291   return memory.iova;
2292}
2293
/* Record a compiled variant in the pipeline's executables array for
 * VK_KHR_pipeline_executable_properties. Takes ownership of the variant's
 * NIR/disasm strings (and of nir_from_spirv, which the caller allocated
 * from the same context).
 */
static void
tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant *variant,
                     char *nir_from_spirv)
{
   /* Re-parent the disasm strings so they live as long as the pipeline. */
   ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.nir);
   ralloc_steal(pipeline->executables_mem_ctx, variant->disasm_info.disasm);

   struct tu_pipeline_executable exe = {
      .stage = variant->shader->type,
      .nir_from_spirv = nir_from_spirv,
      .nir_final = variant->disasm_info.nir,
      .disasm = variant->disasm_info.disasm,
      .stats = variant->info,
      .is_binning = variant->binning_pass,
   };

   util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
}
2312
/* Compile every SPIR-V stage of the pipeline into ir3 shader variants.
 *
 * Populates builder->shaders, builder->variants and
 * builder->binning_variant, and records shader-derived pipeline state
 * (active descriptor sets, tessellation patch type, executables).
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY when any compilation step fails.
 */
static VkResult
tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
                                    struct tu_pipeline *pipeline)
{
   const struct ir3_compiler *compiler = builder->device->compiler;
   /* Index the create-info stages by gl_shader_stage for direct lookup. */
   const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
      NULL
   };
   for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
      gl_shader_stage stage =
         vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
      stage_infos[stage] = &builder->create_info->pStages[i];
   }

   struct ir3_shader_key key = {};
   tu_pipeline_shader_key_init(&key, pipeline, builder->create_info);

   nir_shader *nir[ARRAY_SIZE(builder->shaders)] = { NULL };

   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage < ARRAY_SIZE(nir); stage++) {
      const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
      if (!stage_info)
         continue;

      nir[stage] = tu_spirv_to_nir(builder->device, stage_info, stage);
      if (!nir[stage])
         return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   /* Substitute an empty no-op fragment shader when the app provided no
    * FS stage.
    */
   if (!nir[MESA_SHADER_FRAGMENT]) {
         const nir_shader_compiler_options *nir_options =
            ir3_get_compiler_options(builder->device->compiler);
         nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
                                                           nir_options,
                                                           "noop_fs");
         nir[MESA_SHADER_FRAGMENT] = fs_b.shader;
   }

   const bool executable_info = builder->create_info->flags &
      VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;

   char *nir_initial_disasm[ARRAY_SIZE(builder->shaders)] = { NULL };

   /* Capture the pre-lowering NIR text when the app asked for internal
    * representations (VK_KHR_pipeline_executable_properties).
    */
   if (executable_info) {
      for (gl_shader_stage stage = MESA_SHADER_VERTEX;
            stage < ARRAY_SIZE(nir); stage++) {
         if (!nir[stage])
            continue;

         nir_initial_disasm[stage] =
            nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx);
      }
   }

   /* TODO do intra-stage linking here */

   uint32_t desc_sets = 0;
   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage < ARRAY_SIZE(nir); stage++) {
      if (!nir[stage])
         continue;

      struct tu_shader *shader =
         tu_shader_create(builder->device, nir[stage],
                          builder->multiview_mask, builder->layout,
                          builder->alloc);
      if (!shader)
         return VK_ERROR_OUT_OF_HOST_MEMORY;

      /* In SPIR-V generated from GLSL, the primitive mode is specified in the
       * tessellation evaluation shader, but in SPIR-V generated from HLSL,
       * the mode is specified in the tessellation control shader. */
      if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) &&
          key.tessellation == IR3_TESS_NONE) {
         key.tessellation = tu6_get_tessmode(shader);
      }

      /* Stages after the TCS need it to store gl_PrimitiveID when they
       * consume it: the FS reads it as a varying input, later geometry
       * stages as a system value.
       */
      if (stage > MESA_SHADER_TESS_CTRL) {
         if (stage == MESA_SHADER_FRAGMENT) {
            key.tcs_store_primid = key.tcs_store_primid ||
               (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
         } else {
            key.tcs_store_primid = key.tcs_store_primid ||
               BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
         }
      }

      /* Keep track of the status of each shader's active descriptor sets,
       * which is set in tu_lower_io. */
      desc_sets |= shader->active_desc_sets;

      builder->shaders[stage] = shader;
   }
   pipeline->active_desc_sets = desc_sets;

   /* The last pre-rasterization geometry stage determines whether Layer
    * and ViewportIndex are written.
    */
   struct tu_shader *last_shader = builder->shaders[MESA_SHADER_GEOMETRY];
   if (!last_shader)
      last_shader = builder->shaders[MESA_SHADER_TESS_EVAL];
   if (!last_shader)
      last_shader = builder->shaders[MESA_SHADER_VERTEX];

   uint64_t outputs_written = last_shader->ir3_shader->nir->info.outputs_written;

   key.layer_zero = !(outputs_written & VARYING_BIT_LAYER);
   key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT);

   pipeline->tess.patch_type = key.tessellation;

   /* First compile pass: every stage with safe_constlen disabled. */
   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage < ARRAY_SIZE(builder->shaders); stage++) {
      if (!builder->shaders[stage])
         continue;

      bool created;
      builder->variants[stage] =
         ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
                                &key, false, executable_info, &created);
      if (!builder->variants[stage])
         return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   /* Second pass: recompile with safe_constlen just the stages that
    * ir3_trim_constlen flagged in its returned bitmask.
    */
   uint32_t safe_constlens = ir3_trim_constlen(builder->variants, compiler);

   key.safe_constlen = true;

   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage < ARRAY_SIZE(builder->shaders); stage++) {
      if (!builder->shaders[stage])
         continue;

      if (safe_constlens & (1 << stage)) {
         bool created;
         builder->variants[stage] =
            ir3_shader_get_variant(builder->shaders[stage]->ir3_shader,
                                   &key, false, executable_info, &created);
         if (!builder->variants[stage])
            return VK_ERROR_OUT_OF_HOST_MEMORY;
      }
   }

   /* Binning-pass VS: reuse the full VS when stream output is used or no
    * dedicated binning variant exists; otherwise compile one.
    */
   const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX];
   struct ir3_shader_variant *variant;

   if (vs->ir3_shader->stream_output.num_outputs ||
       !ir3_has_binning_vs(&key)) {
      variant = builder->variants[MESA_SHADER_VERTEX];
   } else {
      bool created;
      key.safe_constlen = !!(safe_constlens & (1 << MESA_SHADER_VERTEX));
      variant = ir3_shader_get_variant(vs->ir3_shader, &key,
                                       true, executable_info, &created);
      if (!variant)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   builder->binning_variant = variant;

   /* Record every variant (plus a distinct binning variant, if any) as a
    * pipeline executable.
    */
   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
         stage < ARRAY_SIZE(nir); stage++) {
      if (builder->variants[stage]) {
         tu_append_executable(pipeline, builder->variants[stage],
            nir_initial_disasm[stage]);
      }
   }

   if (builder->binning_variant != builder->variants[MESA_SHADER_VERTEX]) {
      tu_append_executable(pipeline, builder->binning_variant, NULL);
   }

   return VK_SUCCESS;
}
2485
/* Translate pDynamicState into pipeline->dynamic_state_mask, clearing the
 * bits of the per-register masks that must be supplied by dynamic state
 * at draw time instead of being baked into the pipeline.
 */
static void
tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder,
                                  struct tu_pipeline *pipeline)
{
   const VkPipelineDynamicStateCreateInfo *dynamic_info =
      builder->create_info->pDynamicState;

   /* Start fully static; bits are cleared below for each state the app
    * declares dynamic.
    */
   pipeline->gras_su_cntl_mask = ~0u;
   pipeline->rb_depth_cntl_mask = ~0u;
   pipeline->rb_stencil_cntl_mask = ~0u;
   pipeline->pc_raster_cntl_mask = ~0u;
   pipeline->vpc_unknown_9107_mask = ~0u;

   if (!dynamic_info)
      return;

   for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) {
      VkDynamicState state = dynamic_info->pDynamicStates[i];
      switch (state) {
      /* The core Vulkan 1.0 dynamic states map 1:1 onto mask bits. */
      case VK_DYNAMIC_STATE_VIEWPORT ... VK_DYNAMIC_STATE_STENCIL_REFERENCE:
         if (state == VK_DYNAMIC_STATE_LINE_WIDTH)
            pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
         pipeline->dynamic_state_mask |= BIT(state);
         break;
      case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS);
         break;
      case VK_DYNAMIC_STATE_CULL_MODE_EXT:
         pipeline->gras_su_cntl_mask &=
            ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
         break;
      case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
         pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
         break;
      case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY);
         break;
      case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE);
         break;
      /* The ..._WITH_COUNT variants reuse the plain viewport/scissor
       * draw-state slots.
       */
      case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
         pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
         break;
      case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
         pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
         break;
      case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
         pipeline->rb_depth_cntl_mask &=
            ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
         break;
      case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
         pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
         break;
      case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
         pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
         break;
      case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
         pipeline->rb_depth_cntl_mask &=
            ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
         break;
      case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
         pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
                                             A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
                                             A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
         break;
      case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
         pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
                                             A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
                                             A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
                                             A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK |
                                             A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
                                             A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
                                             A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
                                             A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
         break;
      case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
         pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
         break;
      case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE);
         break;
      case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
         pipeline->pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD;
         pipeline->vpc_unknown_9107_mask &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
         pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD);
         break;
      default:
         assert(!"unsupported dynamic state");
         break;
      }
   }
}
2587
2588static void
2589tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
2590                        struct tu_shader *shader,
2591                        struct ir3_shader_variant *v)
2592{
2593   link->const_state = *ir3_const_state(v);
2594   link->constlen = v->constlen;
2595   link->push_consts = shader->push_consts;
2596}
2597
/* Emit the program draw states (config, draw-pass program, binning-pass
 * program) and record per-stage linkage plus the set of active stages.
 */
static void
tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
                                        struct tu_pipeline *pipeline)
{
   struct tu_cs prog_cs;

   /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
    * else that could depend on that state (like push constants)
    *
    * Note also that this always uses the full VS even in binning pass.  The
    * binning pass variant has the same const layout as the full VS, and
    * the constlen for the VS will be the same or greater than the constlen
    * for the binning pass variant.  It is required that the constlen state
    * matches between binning and draw passes, as some parts of the push
    * consts are emitted in state groups that are shared between the binning
    * and draw passes.
    */
   tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
   tu6_emit_program_config(&prog_cs, builder);
   pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   /* Draw-pass program state. */
   tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
   tu6_emit_program(&prog_cs, builder, false, pipeline);
   pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   /* Binning-pass program state (binning_pass = true). */
   tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
   tu6_emit_program(&prog_cs, builder, true, pipeline);
   pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

   /* Accumulate the stage bits the app supplied in the create info. */
   VkShaderStageFlags stages = 0;
   for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
      stages |= builder->create_info->pStages[i].stage;
   }
   pipeline->active_stages = stages;

   /* Snapshot per-stage descriptor/const linkage for draw time. */
   for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders); i++) {
      if (!builder->shaders[i])
         continue;

      tu_pipeline_set_linkage(&pipeline->program.link[i],
                              builder->shaders[i],
                              builder->variants[i]);
   }
}
2642
2643static void
2644tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
2645                                       struct tu_pipeline *pipeline)
2646{
2647   const VkPipelineVertexInputStateCreateInfo *vi_info =
2648      builder->create_info->pVertexInputState;
2649   const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
2650   const struct ir3_shader_variant *bs = builder->binning_variant;
2651
2652   /* Bindings may contain holes */
2653   for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
2654      pipeline->num_vbs =
2655         MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1);
2656   }
2657
2658   struct tu_cs vi_cs;
2659   tu_cs_begin_sub_stream(&pipeline->cs,
2660                          MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
2661   tu6_emit_vertex_input(pipeline, &vi_cs, vs, vi_info);
2662   pipeline->vi.state = tu_cs_end_draw_state(&pipeline->cs, &vi_cs);
2663
2664   if (bs) {
2665      tu_cs_begin_sub_stream(&pipeline->cs,
2666                             MAX_VERTEX_ATTRIBS * 7 + 2, &vi_cs);
2667      tu6_emit_vertex_input(pipeline, &vi_cs, bs, vi_info);
2668      pipeline->vi.binning_state =
2669         tu_cs_end_draw_state(&pipeline->cs, &vi_cs);
2670   }
2671}
2672
2673static void
2674tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder,
2675                                         struct tu_pipeline *pipeline)
2676{
2677   const VkPipelineInputAssemblyStateCreateInfo *ia_info =
2678      builder->create_info->pInputAssemblyState;
2679
2680   pipeline->ia.primtype = tu6_primtype(ia_info->topology);
2681   pipeline->ia.primitive_restart = ia_info->primitiveRestartEnable;
2682}
2683
2684static bool
2685tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs,
2686                         uint32_t id, uint32_t size)
2687{
2688   assert(id < ARRAY_SIZE(pipeline->dynamic_state));
2689
2690   if (pipeline->dynamic_state_mask & BIT(id))
2691      return false;
2692
2693   pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size);
2694   return true;
2695}
2696
/* Fill in pipeline->tess from pTessellationState.  Only applies when both
 * tessellation stages are active.
 */
static void
tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
                                       struct tu_pipeline *pipeline)
{
   if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) ||
       !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT))
      return;

   const VkPipelineTessellationStateCreateInfo *tess_info =
      builder->create_info->pTessellationState;

   /* The patch control-point count is encoded into the primitive type by
    * offsetting from DI_PT_PATCHES0 (hence the <= 32 limit).
    */
   assert(pipeline->ia.primtype == DI_PT_PATCHES0);
   assert(tess_info->patchControlPoints <= 32);
   pipeline->ia.primtype += tess_info->patchControlPoints;
   /* Default domain origin is upper-left unless the pNext struct says
    * otherwise. */
   const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info =
         vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
   pipeline->tess.upper_left_domain_origin = !domain_info ||
         domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
   const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
   const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
   /* param_stride converts output_size (dwords) to bytes.  The BO regids
    * are taken from the ir3 const-state primitive_param offsets —
    * NOTE(review): the +1 presumably selects the second dword of the
    * primitive-param group; confirm against ir3 const layout.
    */
   pipeline->tess.param_stride = hs->output_size * 4;
   pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1;
   pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1;
}
2721
2722static void
2723tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
2724                                   struct tu_pipeline *pipeline)
2725{
2726   /* The spec says:
2727    *
2728    *    pViewportState is a pointer to an instance of the
2729    *    VkPipelineViewportStateCreateInfo structure, and is ignored if the
2730    *    pipeline has rasterization disabled."
2731    *
2732    * We leave the relevant registers stale in that case.
2733    */
2734   if (builder->rasterizer_discard)
2735      return;
2736
2737   const VkPipelineViewportStateCreateInfo *vp_info =
2738      builder->create_info->pViewportState;
2739
2740   struct tu_cs cs;
2741
2742   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount))
2743      tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount);
2744
2745   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
2746      tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
2747}
2748
/* Build the rasterization state: clip/clamp control, polygon mode, point
 * limits, raster discard, GRAS_SU_CNTL and depth bias.  The register
 * emission sequence below must match the dword count in cs_size.
 */
static void
tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder,
                                        struct tu_pipeline *pipeline)
{
   const VkPipelineRasterizationStateCreateInfo *rast_info =
      builder->create_info->pRasterizationState;

   enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode);

   /* Default: clip is disabled exactly when depth clamp is enabled;
    * VK_EXT_depth_clip_enable overrides this below.
    */
   bool depth_clip_disable = rast_info->depthClampEnable;

   const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
      vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
   if (depth_clip_state)
      depth_clip_disable = !depth_clip_state->depthClipEnable;

   pipeline->line_mode = RECTANGULAR;

   /* VK_EXT_line_rasterization: bresenham mode is only relevant for line
    * topologies. */
   if (tu6_primtype_line(pipeline->ia.primtype)) {
      const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state =
         vk_find_struct_const(rast_info->pNext,
                              PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);

      if (rast_line_state && rast_line_state->lineRasterizationMode ==
               VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
         pipeline->line_mode = BRESENHAM;
      }
   }

   /* cs_size must account for every optional emission below. */
   struct tu_cs cs;
   uint32_t cs_size = 9 +
      (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0) +
      (builder->emit_msaa_state ? 11 : 0);
   pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size);

   tu_cs_emit_regs(&cs,
                   A6XX_GRAS_CL_CNTL(
                     .znear_clip_disable = depth_clip_disable,
                     .zfar_clip_disable = depth_clip_disable,
                     /* TODO should this be depth_clip_disable instead? */
                     .unk5 = rast_info->depthClampEnable,
                     .zero_gb_scale_z = 1,
                     .vp_clip_code_ignore = 1));

   tu_cs_emit_regs(&cs,
                   A6XX_VPC_POLYGON_MODE(mode));

   tu_cs_emit_regs(&cs,
                   A6XX_PC_POLYGON_MODE(mode));

   /* move to hw ctx init? */
   tu_cs_emit_regs(&cs,
                   A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
                   A6XX_GRAS_SU_POINT_SIZE(1.0f));

   if (builder->device->physical_device->info->a6xx.has_shading_rate) {
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00());
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10());
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20());
      tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30());
   }

   /* If samples count couldn't be devised from the subpass, we should emit it here.
    * It happens when subpass doesn't use any color/depth attachment.
    */
   if (builder->emit_msaa_state)
      tu6_emit_msaa(&cs, builder->samples, pipeline->line_mode);

   /* VK_EXT_transform_feedback: rasterization stream selection. */
   const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
      vk_find_struct_const(rast_info->pNext,
                           PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
   unsigned stream = stream_info ? stream_info->rasterizationStream : 0;

   pipeline->pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream);
   pipeline->vpc_unknown_9107 = 0;
   if (rast_info->rasterizerDiscardEnable) {
      pipeline->pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
      pipeline->vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
   }

   /* Emit static versions only when the corresponding state is not
    * dynamic; the computed values above are kept for the dynamic path.
    */
   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) {
      tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->pc_raster_cntl));
      tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->vpc_unknown_9107));
   }

   pipeline->gras_su_cntl =
      tu6_gras_su_cntl(rast_info, pipeline->line_mode, builder->multiview_mask != 0);

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
      tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl));

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) {
      tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor,
                          rast_info->depthBiasClamp,
                          rast_info->depthBiasSlopeFactor);
   }

   /* VK_EXT_provoking_vertex. */
   const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state =
      vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
   pipeline->provoking_vertex_last = provoking_vtx_state &&
      provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
}
2851
/* Build RB_DEPTH_CNTL / RB_STENCIL_CONTROL plus the depth-bounds, stencil
 * mask/reference draw states, and derive the LRZ force-disable mask from
 * the fragment shader.
 *
 * The spec says:
 *
 *    pDepthStencilState is a pointer to an instance of the
 *    VkPipelineDepthStencilStateCreateInfo structure, and is ignored if
 *    the pipeline has rasterization disabled or if the subpass of the
 *    render pass the pipeline is created against does not use a
 *    depth/stencil attachment.
 */
static void
tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
                                        struct tu_pipeline *pipeline)
{
   const VkPipelineDepthStencilStateCreateInfo *ds_info =
      builder->create_info->pDepthStencilState;
   const VkPipelineRasterizationStateCreateInfo *rast_info =
      builder->create_info->pRasterizationState;
   uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
   struct tu_cs cs;

   /* Depth state only applies when there is a depth aspect (i.e. a depth
    * attachment that is not stencil-only S8_UINT).
    */
   if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
       builder->depth_attachment_format != VK_FORMAT_S8_UINT) {
      if (ds_info->depthTestEnable) {
         rb_depth_cntl |=
            A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
            A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
            A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */

         if (rast_info->depthClampEnable)
            rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE;

         if (ds_info->depthWriteEnable)
            rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
      }

      if (ds_info->depthBoundsTestEnable)
         rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;

      /* Bounds test without depth test needs a device-specific fixup. */
      if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
         tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
   } else {
      /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
       * to 0 when this pipeline is used, as enabling depth test when there
       * is no depth attachment is a problem (at least for the S8_UINT case)
       */
      if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL))
         pipeline->rb_depth_cntl_disable = true;
   }

   if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
      const VkStencilOpState *front = &ds_info->front;
      const VkStencilOpState *back = &ds_info->back;

      /* Front/back-face stencil ops are always programmed; the enable
       * bits below gate whether they take effect.
       */
      rb_stencil_cntl |=
         A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
         A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));

      if (ds_info->stencilTestEnable) {
         rb_stencil_cntl |=
            A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
            A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
            A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
      }
   }

   /* Emit static register values; the computed values are also stored on
    * the pipeline for the dynamic-state path.
    */
   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
      tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1);
      tu_cs_emit(&cs, rb_depth_cntl);
   }
   pipeline->rb_depth_cntl = rb_depth_cntl;

   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) {
      tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
      tu_cs_emit(&cs, rb_stencil_cntl);
   }
   pipeline->rb_stencil_cntl = rb_stencil_cntl;

   /* the remaining draw states arent used if there is no d/s, leave them empty */
   if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED)
      return;

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) {
      tu_cs_emit_regs(&cs,
                      A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds),
                      A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds));
   }

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) {
      tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff,
                                               .bfmask = ds_info->back.compareMask & 0xff));
   }

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) {
      update_stencil_mask(&pipeline->stencil_wrmask,  VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask);
      update_stencil_mask(&pipeline->stencil_wrmask,  VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask);
      tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->stencil_wrmask));
   }

   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) {
      tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff,
                                              .bfref = ds_info->back.reference & 0xff));
   }

   if (builder->shaders[MESA_SHADER_FRAGMENT]) {
      const struct ir3_shader_variant *fs = &builder->shaders[MESA_SHADER_FRAGMENT]->ir3_shader->variants[0];
      if (fs->has_kill || fs->no_earlyz || fs->writes_pos) {
         pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
      }
      if (fs->no_earlyz || fs->writes_pos) {
         /* NOTE(review): plain '=' here overwrites the WRITE bit OR'd in
          * just above (its condition is implied by this one); harmless
          * only if FORCE_DISABLE_LRZ supersedes FORCE_DISABLE_WRITE
          * everywhere the mask is consumed — confirm, else use '|='.
          */
         pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ;
      }
   }
}
2970
2971static void
2972tu_pipeline_builder_parse_multisample_and_color_blend(
2973   struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
2974{
2975   /* The spec says:
2976    *
2977    *    pMultisampleState is a pointer to an instance of the
2978    *    VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
2979    *    has rasterization disabled.
2980    *
2981    * Also,
2982    *
2983    *    pColorBlendState is a pointer to an instance of the
2984    *    VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
2985    *    pipeline has rasterization disabled or if the subpass of the render
2986    *    pass the pipeline is created against does not use any color
2987    *    attachments.
2988    *
2989    * We leave the relevant registers stale when rasterization is disabled.
2990    */
2991   if (builder->rasterizer_discard)
2992      return;
2993
2994   static const VkPipelineColorBlendStateCreateInfo dummy_blend_info;
2995   const VkPipelineMultisampleStateCreateInfo *msaa_info =
2996      builder->create_info->pMultisampleState;
2997   const VkPipelineColorBlendStateCreateInfo *blend_info =
2998      builder->use_color_attachments ? builder->create_info->pColorBlendState
2999                                     : &dummy_blend_info;
3000
3001   struct tu_cs cs;
3002   pipeline->blend_state =
3003      tu_cs_draw_state(&pipeline->cs, &cs, blend_info->attachmentCount * 3 + 4);
3004
3005   uint32_t blend_enable_mask;
3006   tu6_emit_rb_mrt_controls(&cs, blend_info,
3007                            builder->color_attachment_formats,
3008                            &blend_enable_mask);
3009
3010   tu6_emit_blend_control(&cs, blend_enable_mask,
3011                          builder->use_dual_src_blend, msaa_info);
3012
3013   assert(cs.cur == cs.end); /* validate draw state size */
3014
3015   if (blend_enable_mask) {
3016      for (int i = 0; i < blend_info->attachmentCount; i++) {
3017         VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i];
3018         /* Disable LRZ writes when blend is enabled, since the
3019          * resulting pixel value from the blend-draw
3020          * depends on an earlier draw, which LRZ in the draw pass
3021          * could early-reject if the previous blend-enabled draw wrote LRZ.
3022          *
3023          * From the PoV of LRZ, having masked color channels is
3024          * the same as having blend enabled, in that the draw will
3025          * care about the fragments from an earlier draw.
3026          *
3027          * TODO: We need to disable LRZ writes only for the binning pass.
3028          * Therefore, we need to emit it in a separate draw state. We keep
3029          * it disabled for sysmem path as well for the moment.
3030          */
3031         if (blendAttachment.blendEnable || blendAttachment.colorWriteMask != 0xf) {
3032            pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
3033         }
3034      }
3035   }
3036
3037   if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) {
3038      tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
3039      tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4);
3040   }
3041
3042   const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
3043      vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
3044   const VkSampleLocationsInfoEXT *samp_loc = NULL;
3045
3046   if (sample_locations && sample_locations->sampleLocationsEnable)
3047      samp_loc = &sample_locations->sampleLocationsInfo;
3048
3049    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3050                                 samp_loc ? 9 : 6)) {
3051      tu6_emit_sample_locations(&cs, samp_loc);
3052    }
3053}
3054
3055static void
3056tu_pipeline_finish(struct tu_pipeline *pipeline,
3057                   struct tu_device *dev,
3058                   const VkAllocationCallbacks *alloc)
3059{
3060   tu_cs_finish(&pipeline->cs);
3061
3062   if (pipeline->pvtmem_bo.size)
3063      tu_bo_finish(dev, &pipeline->pvtmem_bo);
3064
3065   ralloc_free(pipeline->executables_mem_ctx);
3066}
3067
3068static VkResult
3069tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
3070                          struct tu_pipeline **pipeline)
3071{
3072   VkResult result;
3073
3074   *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc,
3075                                sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE);
3076   if (!*pipeline)
3077      return VK_ERROR_OUT_OF_HOST_MEMORY;
3078
3079   (*pipeline)->layout = builder->layout;
3080   (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
3081   util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
3082
3083   /* compile and upload shaders */
3084   result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
3085   if (result != VK_SUCCESS) {
3086      vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3087      return result;
3088   }
3089
3090   result = tu_pipeline_allocate_cs(builder->device, *pipeline, builder, NULL);
3091   if (result != VK_SUCCESS) {
3092      vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3093      return result;
3094   }
3095
3096   for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++)
3097      builder->shader_iova[i] = tu_upload_variant(*pipeline, builder->variants[i]);
3098
3099   builder->binning_vs_iova =
3100      tu_upload_variant(*pipeline, builder->binning_variant);
3101
3102   /* Setup private memory. Note that because we're sharing the same private
3103    * memory for all stages, all stages must use the same config, or else
3104    * fibers from one stage might overwrite fibers in another.
3105    */
3106
3107   uint32_t pvtmem_size = 0;
3108   bool per_wave = true;
3109   for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
3110      if (builder->variants[i]) {
3111         pvtmem_size = MAX2(pvtmem_size, builder->variants[i]->pvtmem_size);
3112         if (!builder->variants[i]->pvtmem_per_wave)
3113            per_wave = false;
3114      }
3115   }
3116
3117   if (builder->binning_variant) {
3118      pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size);
3119      if (!builder->binning_variant->pvtmem_per_wave)
3120         per_wave = false;
3121   }
3122
3123   result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem,
3124                            pvtmem_size, per_wave);
3125   if (result != VK_SUCCESS) {
3126      vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3127      return result;
3128   }
3129
3130   tu_pipeline_builder_parse_dynamic(builder, *pipeline);
3131   tu_pipeline_builder_parse_shader_stages(builder, *pipeline);
3132   tu_pipeline_builder_parse_vertex_input(builder, *pipeline);
3133   tu_pipeline_builder_parse_input_assembly(builder, *pipeline);
3134   tu_pipeline_builder_parse_tessellation(builder, *pipeline);
3135   tu_pipeline_builder_parse_viewport(builder, *pipeline);
3136   tu_pipeline_builder_parse_rasterization(builder, *pipeline);
3137   tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
3138   tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
3139   tu6_emit_load_state(*pipeline, false);
3140
3141   /* we should have reserved enough space upfront such that the CS never
3142    * grows
3143    */
3144   assert((*pipeline)->cs.bo_count == 1);
3145
3146   return VK_SUCCESS;
3147}
3148
3149static void
3150tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
3151{
3152   for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders); i++) {
3153      if (!builder->shaders[i])
3154         continue;
3155      tu_shader_destroy(builder->device, builder->shaders[i], builder->alloc);
3156   }
3157}
3158
/* Initialize a pipeline builder from the graphics pipeline create info:
 * capture device/cache/allocator, then precompute per-subpass facts
 * (sample counts, attachment formats, render-component mask, dual-source
 * blending) that later parse stages consume.
 */
static void
tu_pipeline_builder_init_graphics(
   struct tu_pipeline_builder *builder,
   struct tu_device *dev,
   struct tu_pipeline_cache *cache,
   const VkGraphicsPipelineCreateInfo *create_info,
   const VkAllocationCallbacks *alloc)
{
   TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout);

   *builder = (struct tu_pipeline_builder) {
      .device = dev,
      .cache = cache,
      .create_info = create_info,
      .alloc = alloc,
      .layout = layout,
   };

   /* If rasterizer-discard is dynamic, the static enable bit must be
    * ignored; scan the dynamic-state list to find out.
    */
   bool rasterizer_discard_dynamic = false;
   if (create_info->pDynamicState) {
      for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
         if (create_info->pDynamicState->pDynamicStates[i] ==
               VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT) {
            rasterizer_discard_dynamic = true;
            break;
         }
      }
   }

   const struct tu_render_pass *pass =
      tu_render_pass_from_handle(create_info->renderPass);
   const struct tu_subpass *subpass =
      &pass->subpasses[create_info->subpass];

   builder->multiview_mask = subpass->multiview_mask;

   /* Only treat discard as statically enabled when it cannot be toggled
    * dynamically at draw time.
    */
   builder->rasterizer_discard =
      builder->create_info->pRasterizationState->rasterizerDiscardEnable &&
      !rasterizer_discard_dynamic;

   /* variableMultisampleRate support */
   builder->emit_msaa_state = (subpass->samples == 0) && !builder->rasterizer_discard;

   if (builder->rasterizer_discard) {
      builder->samples = VK_SAMPLE_COUNT_1_BIT;
   } else {
      /* pMultisampleState/pColorBlendState are only valid to read when
       * rasterization is enabled (see Vulkan spec).
       */
      builder->samples = create_info->pMultisampleState->rasterizationSamples;
      builder->alpha_to_coverage = create_info->pMultisampleState->alphaToCoverageEnable;

      const uint32_t a = subpass->depth_stencil_attachment.attachment;
      builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ?
         pass->attachments[a].format : VK_FORMAT_UNDEFINED;

      assert(subpass->color_count == 0 ||
             !create_info->pColorBlendState ||
             subpass->color_count == create_info->pColorBlendState->attachmentCount);
      builder->color_attachment_count = subpass->color_count;
      for (uint32_t i = 0; i < subpass->color_count; i++) {
         const uint32_t a = subpass->color_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         builder->color_attachment_formats[i] = pass->attachments[a].format;
         builder->use_color_attachments = true;
         /* Enable all four channels (RGBA) for this MRT slot. */
         builder->render_components |= 0xf << (i * 4);
      }

      if (tu_blend_state_is_dual_src(create_info->pColorBlendState)) {
         builder->color_attachment_count++;
         builder->use_dual_src_blend = true;
         /* dual source blending has an extra fs output in the 2nd slot */
         if (subpass->color_attachments[0].attachment != VK_ATTACHMENT_UNUSED)
            builder->render_components |= 0xf << 4;
      }
   }
}
3235
3236static VkResult
3237tu_graphics_pipeline_create(VkDevice device,
3238                            VkPipelineCache pipelineCache,
3239                            const VkGraphicsPipelineCreateInfo *pCreateInfo,
3240                            const VkAllocationCallbacks *pAllocator,
3241                            VkPipeline *pPipeline)
3242{
3243   TU_FROM_HANDLE(tu_device, dev, device);
3244   TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache);
3245
3246   struct tu_pipeline_builder builder;
3247   tu_pipeline_builder_init_graphics(&builder, dev, cache,
3248                                     pCreateInfo, pAllocator);
3249
3250   struct tu_pipeline *pipeline = NULL;
3251   VkResult result = tu_pipeline_builder_build(&builder, &pipeline);
3252   tu_pipeline_builder_finish(&builder);
3253
3254   if (result == VK_SUCCESS)
3255      *pPipeline = tu_pipeline_to_handle(pipeline);
3256   else
3257      *pPipeline = VK_NULL_HANDLE;
3258
3259   return result;
3260}
3261
3262VKAPI_ATTR VkResult VKAPI_CALL
3263tu_CreateGraphicsPipelines(VkDevice device,
3264                           VkPipelineCache pipelineCache,
3265                           uint32_t count,
3266                           const VkGraphicsPipelineCreateInfo *pCreateInfos,
3267                           const VkAllocationCallbacks *pAllocator,
3268                           VkPipeline *pPipelines)
3269{
3270   VkResult final_result = VK_SUCCESS;
3271
3272   for (uint32_t i = 0; i < count; i++) {
3273      VkResult result = tu_graphics_pipeline_create(device, pipelineCache,
3274                                                    &pCreateInfos[i], pAllocator,
3275                                                    &pPipelines[i]);
3276
3277      if (result != VK_SUCCESS)
3278         final_result = result;
3279   }
3280
3281   return final_result;
3282}
3283
3284static VkResult
3285tu_compute_pipeline_create(VkDevice device,
3286                           VkPipelineCache _cache,
3287                           const VkComputePipelineCreateInfo *pCreateInfo,
3288                           const VkAllocationCallbacks *pAllocator,
3289                           VkPipeline *pPipeline)
3290{
3291   TU_FROM_HANDLE(tu_device, dev, device);
3292   TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
3293   const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
3294   VkResult result;
3295
3296   struct tu_pipeline *pipeline;
3297
3298   *pPipeline = VK_NULL_HANDLE;
3299
3300   pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline),
3301                               VK_OBJECT_TYPE_PIPELINE);
3302   if (!pipeline)
3303      return VK_ERROR_OUT_OF_HOST_MEMORY;
3304
3305   pipeline->layout = layout;
3306
3307   pipeline->executables_mem_ctx = ralloc_context(NULL);
3308   util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx);
3309
3310   struct ir3_shader_key key = {};
3311
3312   nir_shader *nir = tu_spirv_to_nir(dev, stage_info, MESA_SHADER_COMPUTE);
3313
3314   const bool executable_info = pCreateInfo->flags &
3315      VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
3316
3317   char *nir_initial_disasm = executable_info ?
3318      nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL;
3319
3320   struct tu_shader *shader =
3321      tu_shader_create(dev, nir, 0, layout, pAllocator);
3322   if (!shader) {
3323      result = VK_ERROR_OUT_OF_HOST_MEMORY;
3324      goto fail;
3325   }
3326
3327   pipeline->active_desc_sets = shader->active_desc_sets;
3328
3329   bool created;
3330   struct ir3_shader_variant *v =
3331      ir3_shader_get_variant(shader->ir3_shader, &key, false, executable_info, &created);
3332   if (!v) {
3333      result = VK_ERROR_OUT_OF_HOST_MEMORY;
3334      goto fail;
3335   }
3336
3337   tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
3338                           shader, v);
3339
3340   result = tu_pipeline_allocate_cs(dev, pipeline, NULL, v);
3341   if (result != VK_SUCCESS)
3342      goto fail;
3343
3344   uint64_t shader_iova = tu_upload_variant(pipeline, v);
3345
3346   struct tu_pvtmem_config pvtmem;
3347   tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave);
3348
3349   for (int i = 0; i < 3; i++)
3350      pipeline->compute.local_size[i] = v->local_size[i];
3351
3352   pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
3353
3354   struct tu_cs prog_cs;
3355   uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v);
3356   tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs);
3357   tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova);
3358   pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
3359
3360   tu6_emit_load_state(pipeline, true);
3361
3362   tu_append_executable(pipeline, v, nir_initial_disasm);
3363
3364   tu_shader_destroy(dev, shader, pAllocator);
3365
3366   *pPipeline = tu_pipeline_to_handle(pipeline);
3367
3368   return VK_SUCCESS;
3369
3370fail:
3371   if (shader)
3372      tu_shader_destroy(dev, shader, pAllocator);
3373
3374   vk_object_free(&dev->vk, pAllocator, pipeline);
3375
3376   return result;
3377}
3378
3379VKAPI_ATTR VkResult VKAPI_CALL
3380tu_CreateComputePipelines(VkDevice device,
3381                          VkPipelineCache pipelineCache,
3382                          uint32_t count,
3383                          const VkComputePipelineCreateInfo *pCreateInfos,
3384                          const VkAllocationCallbacks *pAllocator,
3385                          VkPipeline *pPipelines)
3386{
3387   VkResult final_result = VK_SUCCESS;
3388
3389   for (uint32_t i = 0; i < count; i++) {
3390      VkResult result = tu_compute_pipeline_create(device, pipelineCache,
3391                                                   &pCreateInfos[i],
3392                                                   pAllocator, &pPipelines[i]);
3393      if (result != VK_SUCCESS)
3394         final_result = result;
3395   }
3396
3397   return final_result;
3398}
3399
3400VKAPI_ATTR void VKAPI_CALL
3401tu_DestroyPipeline(VkDevice _device,
3402                   VkPipeline _pipeline,
3403                   const VkAllocationCallbacks *pAllocator)
3404{
3405   TU_FROM_HANDLE(tu_device, dev, _device);
3406   TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
3407
3408   if (!_pipeline)
3409      return;
3410
3411   tu_pipeline_finish(pipeline, dev, pAllocator);
3412   vk_object_free(&dev->vk, pAllocator, pipeline);
3413}
3414
/* Fill a fixed-size char-array field with a formatted string. The field is
 * zeroed first so the result is always NUL-terminated; the assert fires if
 * the formatted text would be truncated (_i > 0 also makes the signed/
 * unsigned comparison against sizeof safe).
 */
#define WRITE_STR(field, ...) ({                                \
   memset(field, 0, sizeof(field));                             \
   UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
   assert(_i > 0 && _i < sizeof(field));                        \
})
3420
3421static const struct tu_pipeline_executable *
3422tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
3423{
3424   assert(index < util_dynarray_num_elements(&pipeline->executables,
3425                                             struct tu_pipeline_executable));
3426   return util_dynarray_element(
3427      &pipeline->executables, struct tu_pipeline_executable, index);
3428}
3429
/* vkGetPipelineExecutablePropertiesKHR: report name, stage and subgroup
 * size for every executable (shader variant) recorded in the pipeline.
 * The outarray handles the standard two-call size-query protocol.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_GetPipelineExecutablePropertiesKHR(
      VkDevice _device,
      const VkPipelineInfoKHR* pPipelineInfo,
      uint32_t* pExecutableCount,
      VkPipelineExecutablePropertiesKHR* pProperties)
{
   TU_FROM_HANDLE(tu_device, dev, _device);
   TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
   VK_OUTARRAY_MAKE(out, pProperties, pExecutableCount);

   util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
      vk_outarray_append(&out, props) {
         gl_shader_stage stage = exe->stage;
         props->stages = mesa_to_vk_shader_stage(stage);

         /* The binning variant is an extra VS executable; give it a
          * distinguishing name.
          */
         if (!exe->is_binning)
            WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
         else
            WRITE_STR(props->name, "Binning VS");

         WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));

         /* Subgroup size doubles when the variant runs double threadsize. */
         props->subgroupSize =
            dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
      }
   }

   return vk_outarray_status(&out);
}
3460
/* vkGetPipelineExecutableStatisticsKHR: expose IR3 compiler statistics
 * (instruction counts, register pressure, sync bits, etc.) for one
 * executable of the pipeline. Each vk_outarray_append block emits one
 * VkPipelineExecutableStatisticKHR entry; the outarray handles the
 * two-call size-query protocol.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_GetPipelineExecutableStatisticsKHR(
      VkDevice _device,
      const VkPipelineExecutableInfoKHR* pExecutableInfo,
      uint32_t* pStatisticCount,
      VkPipelineExecutableStatisticKHR* pStatistics)
{
   TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
   VK_OUTARRAY_MAKE(out, pStatistics, pStatisticCount);

   const struct tu_pipeline_executable *exe =
      tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "Max Waves Per Core");
      WRITE_STR(stat->description,
                "Maximum number of simultaneous waves per core.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.max_waves;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "Instruction Count");
      WRITE_STR(stat->description,
                "Total number of IR3 instructions in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.instrs_count;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "NOPs Count");
      WRITE_STR(stat->description,
                "Number of NOP instructions in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.nops_count;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "MOV Count");
      WRITE_STR(stat->description,
                "Number of MOV instructions in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.mov_count;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "COV Count");
      WRITE_STR(stat->description,
                "Number of COV instructions in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.cov_count;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "Registers used");
      WRITE_STR(stat->description,
                "Number of registers used in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      /* max_reg is the highest register index, hence +1 for the count. */
      stat->value.u64 = exe->stats.max_reg + 1;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "Half-registers used");
      WRITE_STR(stat->description,
                "Number of half-registers used in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      /* max_half_reg is the highest half-register index, hence +1. */
      stat->value.u64 = exe->stats.max_half_reg + 1;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "Instructions with SS sync bit");
      WRITE_STR(stat->description,
                "SS bit is set for instructions which depend on a result "
                "of \"long\" instructions to prevent RAW hazard.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.ss;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "Instructions with SY sync bit");
      WRITE_STR(stat->description,
                "SY bit is set for instructions which depend on a result "
                "of loads from global memory to prevent RAW hazard.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.sy;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "Estimated cycles stalled on SS");
      WRITE_STR(stat->description,
                "A better metric to estimate the impact of SS syncs.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.sstall;
   }

   /* One statistic per IR3 instruction category. */
   for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
      vk_outarray_append(&out, stat) {
         WRITE_STR(stat->name, "cat%d instructions", i);
         WRITE_STR(stat->description,
                  "Number of cat%d instructions.", i);
         stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
         stat->value.u64 = exe->stats.instrs_per_cat[i];
      }
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "STP Count");
      WRITE_STR(stat->description,
                "Number of STore Private instructions in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.stp_count;
   }

   vk_outarray_append(&out, stat) {
      WRITE_STR(stat->name, "LDP Count");
      WRITE_STR(stat->description,
                "Number of LoaD Private instructions in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.ldp_count;
   }

   return vk_outarray_status(&out);
}
3592
3593static bool
3594write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
3595              const char *data)
3596{
3597   ir->isText = VK_TRUE;
3598
3599   size_t data_len = strlen(data) + 1;
3600
3601   if (ir->pData == NULL) {
3602      ir->dataSize = data_len;
3603      return true;
3604   }
3605
3606   strncpy(ir->pData, data, ir->dataSize);
3607   if (ir->dataSize < data_len)
3608      return false;
3609
3610   ir->dataSize = data_len;
3611   return true;
3612}
3613
/* vkGetPipelineExecutableInternalRepresentationsKHR: expose up to three
 * IRs for one executable — initial NIR (only captured when the pipeline
 * was created with CAPTURE_INTERNAL_REPRESENTATIONS), final NIR, and the
 * IR3 disassembly. Returns VK_INCOMPLETE if any caller-provided buffer
 * was too small for its text.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_GetPipelineExecutableInternalRepresentationsKHR(
    VkDevice _device,
    const VkPipelineExecutableInfoKHR* pExecutableInfo,
    uint32_t* pInternalRepresentationCount,
    VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
{
   TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
   VK_OUTARRAY_MAKE(out, pInternalRepresentations, pInternalRepresentationCount);
   bool incomplete_text = false;

   const struct tu_pipeline_executable *exe =
      tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);

   if (exe->nir_from_spirv) {
      vk_outarray_append(&out, ir) {
         WRITE_STR(ir->name, "NIR from SPIRV");
         WRITE_STR(ir->description,
                   "Initial NIR before any optimizations");

         if (!write_ir_text(ir, exe->nir_from_spirv))
            incomplete_text = true;
      }
   }

   if (exe->nir_final) {
      vk_outarray_append(&out, ir) {
         WRITE_STR(ir->name, "Final NIR");
         WRITE_STR(ir->description,
                   "Final NIR before going into the back-end compiler");

         if (!write_ir_text(ir, exe->nir_final))
            incomplete_text = true;
      }
   }

   if (exe->disasm) {
      vk_outarray_append(&out, ir) {
         WRITE_STR(ir->name, "IR3 Assembly");
         WRITE_STR(ir->description,
                   "Final IR3 assembly for the generated shader binary");

         if (!write_ir_text(ir, exe->disasm))
            incomplete_text = true;
      }
   }

   /* Truncation takes precedence over the outarray's own status. */
   return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
}
3663