/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_nir.h"
#include "nir_builder.h"
#include "compiler/brw_nir.h"
#include "util/mesa-sha1.h"

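/* Size of a single struct member, without needing an instance of the
 * struct.
 */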
#define sizeof_field(type, field) sizeof(((type *)0)->field)

void
anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
                            bool robust_buffer_access,
                            nir_shader *nir,
                            struct brw_stage_prog_data *prog_data,
                            struct anv_pipeline_bind_map *map,
                            void *mem_ctx)
{
   const struct brw_compiler *compiler = pdevice->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;
   memset(map->push_ranges, 0, sizeof(map->push_ranges));

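   /* First pass: walk the shader to find (a) whether it contains any UBO
    * loads with constant block and offset (candidates for UBO pushing) and
    * (b) the [push_start, push_end) byte range of anv_push_constants that
    * is actually accessed.
    */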
   bool has_const_ubo = false;
   unsigned push_start = UINT_MAX, push_end = 0;
   nir_foreach_function(function, nir) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            switch (intrin->intrinsic) {
            case nir_intrinsic_load_ubo:
               if (nir_src_is_const(intrin->src[0]) &&
                   nir_src_is_const(intrin->src[1]))
                  has_const_ubo = true;
               break;

            case nir_intrinsic_load_push_constant: {
               unsigned base = nir_intrinsic_base(intrin);
               unsigned range = nir_intrinsic_range(intrin);
               push_start = MIN2(push_start, base);
               push_end = MAX2(push_end, base + range);
               break;
            }

            case nir_intrinsic_load_desc_set_address_intel:
               push_start = MIN2(push_start,
                  offsetof(struct anv_push_constants, desc_sets));
               push_end = MAX2(push_end, push_start +
                  sizeof_field(struct anv_push_constants, desc_sets));
               break;

            default:
               break;
            }
         }
      }
   }

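   /* If no push constant access was found, push_start is still UINT_MAX and
    * push_end is 0, so this comes out false.
    */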
   const bool has_push_intrinsic = push_start <= push_end;

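   /* UBO push is only available on Haswell (verx10 75) and later, and only
    * for render stages; compute and bindless stages get their constant data
    * through other mechanisms.
    */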
   const bool push_ubo_ranges =
      pdevice->info.verx10 >= 75 &&
      has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE &&
      !brw_shader_stage_is_bindless(nir->info.stage);

   if (push_ubo_ranges && robust_buffer_access) {
      /* We can't adjust our push ranges on the fly because doing so would
       * mess up the layout in the shader.  When robustBufferAccess is
       * enabled, we instead push a mask into the shader indicating which
       * pushed registers are valid, and zero out the invalid ones at the
       * top of the shader.
       */
      const uint32_t push_reg_mask_start =
         offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]);
      const uint32_t push_reg_mask_end = push_reg_mask_start + sizeof(uint64_t);
      push_start = MIN2(push_start, push_reg_mask_start);
      push_end = MAX2(push_end, push_reg_mask_end);
   }

   if (nir->info.stage == MESA_SHADER_COMPUTE && devinfo->verx10 < 125) {
      /* Compute shaders always need the subgroup ID, and the back-end
       * compiler will "helpfully" add it for us in the last push constant
       * slot.  This looks like an off-by-one, but it's deliberate: because
       * the back-end adds that slot itself, we claim one dword less than
       * the full amount including gl_SubgroupId.
       */
      assert(push_end <= offsetof(struct anv_push_constants, cs.subgroup_id));
      push_end = offsetof(struct anv_push_constants, cs.subgroup_id);
   }

   /* Clamp push_start to push_end (push_start == UINT_MAX indicates no push
    * constants) and align it down to a 32B boundary.
    */
   push_start = MIN2(push_start, push_end);
   push_start = align_down_u32(push_start, 32);

   /* For the vec4 back-end, the push data size must be aligned to a vec4;
    * for scalar, it only needs DWORD alignment.
    */
   const unsigned align = compiler->scalar_stage[nir->info.stage] ? 4 : 16;
   nir->num_uniforms = ALIGN(push_end - push_start, align);
   prog_data->nr_params = nir->num_uniforms / 4;
   prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params);

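   /* Describe the push constant data itself as a push range, in units of
    * 32B registers.
    */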
   struct anv_push_range push_constant_range = {
      .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS,
      .start = push_start / 32,
      .length = DIV_ROUND_UP(push_end - push_start, 32),
   };

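   /* Second pass: now that the final layout is known, rewrite the push
    * constant intrinsics into the corresponding uniform loads.
    */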
   if (has_push_intrinsic) {
      nir_foreach_function(function, nir) {
         if (!function->impl)
            continue;

         nir_builder build, *b = &build;
         nir_builder_init(b, function->impl);

         nir_foreach_block(block, function->impl) {
            nir_foreach_instr_safe(instr, block) {
               if (instr->type != nir_instr_type_intrinsic)
                  continue;

               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
               switch (intrin->intrinsic) {
               case nir_intrinsic_load_push_constant: {
                  /* With bindless shaders we load uniforms with SEND
                   * messages. All the push constants are located after the
                   * RT_DISPATCH_GLOBALS. We just need to add the offset to
                   * the address right after RT_DISPATCH_GLOBALS (see
                   * brw_nir_lower_rt_intrinsics.c).
                   */
                  unsigned base_offset =
                     brw_shader_stage_is_bindless(nir->info.stage) ? 0 : push_start;
                  intrin->intrinsic = nir_intrinsic_load_uniform;
                  nir_intrinsic_set_base(intrin,
                                         nir_intrinsic_base(intrin) -
                                         base_offset);
                  break;
               }

               case nir_intrinsic_load_desc_set_address_intel: {
                  b->cursor = nir_before_instr(&intrin->instr);
                  nir_ssa_def *pc_load = nir_load_uniform(b, 1, 64,
                     nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint64_t)),
                     .base = offsetof(struct anv_push_constants, desc_sets),
                     .range = sizeof_field(struct anv_push_constants, desc_sets),
                     .dest_type = nir_type_uint64);
                  nir_ssa_def_rewrite_uses(&intrin->dest.ssa, pc_load);
                  break;
               }

               default:
                  break;
               }
            }
         }
      }
   }

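   /* Ask the back-end which constant-offset UBO ranges are worth pushing;
    * it fills in prog_data->ubo_ranges (up to four of them).
    */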
   if (push_ubo_ranges) {
      brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);

      /* The vec4 back-end pushes at most 32 regs while the scalar back-end
       * pushes up to 64.  This is primarily because the scalar back-end has a
       * massively more competent register allocator and so the risk of
       * spilling due to UBO pushing isn't nearly as high.
       */
      const unsigned max_push_regs =
         compiler->scalar_stage[nir->info.stage] ? 64 : 32;

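      /* Shrink the UBO ranges in order so that, together with the push
       * constant range, the total never exceeds max_push_regs.
       */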
      unsigned total_push_regs = push_constant_range.length;
      for (unsigned i = 0; i < 4; i++) {
         if (total_push_regs + prog_data->ubo_ranges[i].length > max_push_regs)
            prog_data->ubo_ranges[i].length = max_push_regs - total_push_regs;
         total_push_regs += prog_data->ubo_ranges[i].length;
      }
      assert(total_push_regs <= max_push_regs);

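      /* Fill out the hardware push ranges, with the push constant data
       * first, followed by the surviving UBO ranges.
       */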
      int n = 0;

      if (push_constant_range.length > 0)
         map->push_ranges[n++] = push_constant_range;

      if (robust_buffer_access) {
         const uint32_t push_reg_mask_offset =
            offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]);
         assert(push_reg_mask_offset >= push_start);
         prog_data->push_reg_mask_param =
            (push_reg_mask_offset - push_start) / 4;
      }

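      /* Track where each UBO range starts within the overall push layout so
       * we can build the zero_push_reg mask for robustBufferAccess below.
       */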
      unsigned range_start_reg = push_constant_range.length;

      for (int i = 0; i < 4; i++) {
         struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
         if (ubo_range->length == 0)
            continue;

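         /* Only four hardware push ranges exist (one fewer is usable here
          * when constant buffer 0 is relative); drop any UBO range that
          * doesn't get a slot.
          */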
         if (n >= 4 || (n == 3 && compiler->constant_buffer_0_is_relative)) {
            memset(ubo_range, 0, sizeof(*ubo_range));
            continue;
         }

         const struct anv_pipeline_binding *binding =
            &map->surface_to_descriptor[ubo_range->block];

         map->push_ranges[n++] = (struct anv_push_range) {
            .set = binding->set,
            .index = binding->index,
            .dynamic_offset_index = binding->dynamic_offset_index,
            .start = ubo_range->start,
            .length = ubo_range->length,
         };

         /* We only bother to shader-zero pushed client UBOs */
         if (binding->set < MAX_SETS && robust_buffer_access) {
            prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg,
                                                         ubo_range->length);
         }

         range_start_reg += ubo_range->length;
      }
   } else {
      /* For Ivy Bridge, the push constant packets have a different rule
       * that would require us to iterate in the other direction and
       * possibly mess around with dynamic state base address.  Don't
       * bother; just emit regular push constants at n = 0.
       *
       * In the compute case, we don't have multiple push ranges, so it's
       * better to just provide one in push_ranges[0].
       */
      map->push_ranges[0] = push_constant_range;
   }

   /* Now that we're done computing the push constant portion of the
    * bind map, hash it.  This lets us quickly determine if the actual
    * mapping has changed and not just a no-op pipeline change.
    */
   _mesa_sha1_compute(map->push_ranges,
                      sizeof(map->push_ranges),
                      map->push_sha1);
}

void
anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data,
                             struct anv_pipeline_bind_map *map)
{
#ifndef NDEBUG
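   /* nr_params counts 32-bit uniforms; eight of them fill one 32B push
    * register, which is the unit push range lengths are measured in.
    */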
   unsigned prog_data_push_size = DIV_ROUND_UP(prog_data->nr_params, 8);
   for (unsigned i = 0; i < 4; i++)
      prog_data_push_size += prog_data->ubo_ranges[i].length;

   unsigned bind_map_push_size = 0;
   for (unsigned i = 0; i < 4; i++)
      bind_map_push_size += map->push_ranges[i].length;

   /* We could go through everything again but it should be enough to assert
    * that they push the same number of registers.  This should alert us if
    * the back-end compiler decides to re-arrange stuff or shrink a range.
    */
   assert(prog_data_push_size == bind_map_push_size);
#endif
}