1/* 2 * Copyright © 2019 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "anv_nir.h" 25#include "nir_builder.h" 26#include "compiler/brw_nir.h" 27#include "util/mesa-sha1.h" 28 29#define sizeof_field(type, field) sizeof(((type *)0)->field) 30 31void 32anv_nir_compute_push_layout(const struct anv_physical_device *pdevice, 33 bool robust_buffer_access, 34 nir_shader *nir, 35 struct brw_stage_prog_data *prog_data, 36 struct anv_pipeline_bind_map *map, 37 void *mem_ctx) 38{ 39 const struct brw_compiler *compiler = pdevice->compiler; 40 const struct intel_device_info *devinfo = compiler->devinfo; 41 memset(map->push_ranges, 0, sizeof(map->push_ranges)); 42 43 bool has_const_ubo = false; 44 unsigned push_start = UINT_MAX, push_end = 0; 45 nir_foreach_function(function, nir) { 46 if (!function->impl) 47 continue; 48 49 nir_foreach_block(block, function->impl) { 50 nir_foreach_instr(instr, block) { 51 if (instr->type != nir_instr_type_intrinsic) 52 continue; 53 54 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 55 switch (intrin->intrinsic) { 56 case nir_intrinsic_load_ubo: 57 if (nir_src_is_const(intrin->src[0]) && 58 nir_src_is_const(intrin->src[1])) 59 has_const_ubo = true; 60 break; 61 62 case nir_intrinsic_load_push_constant: { 63 unsigned base = nir_intrinsic_base(intrin); 64 unsigned range = nir_intrinsic_range(intrin); 65 push_start = MIN2(push_start, base); 66 push_end = MAX2(push_end, base + range); 67 break; 68 } 69 70 case nir_intrinsic_load_desc_set_address_intel: 71 push_start = MIN2(push_start, 72 offsetof(struct anv_push_constants, desc_sets)); 73 push_end = MAX2(push_end, push_start + 74 sizeof_field(struct anv_push_constants, desc_sets)); 75 break; 76 77 default: 78 break; 79 } 80 } 81 } 82 } 83 84 const bool has_push_intrinsic = push_start <= push_end; 85 86 const bool push_ubo_ranges = 87 pdevice->info.verx10 >= 75 && 88 has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE && 89 !brw_shader_stage_is_bindless(nir->info.stage); 90 91 if (push_ubo_ranges && robust_buffer_access) { 92 /* We can't on-the-fly adjust our push ranges because doing so would 93 * mess up the layout in the shader. When robustBufferAccess is 94 * enabled, we push a mask into the shader indicating which pushed 95 * registers are valid and we zero out the invalid ones at the top of 96 * the shader. 97 */ 98 const uint32_t push_reg_mask_start = 99 offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]); 100 const uint32_t push_reg_mask_end = push_reg_mask_start + sizeof(uint64_t); 101 push_start = MIN2(push_start, push_reg_mask_start); 102 push_end = MAX2(push_end, push_reg_mask_end); 103 } 104 105 if (nir->info.stage == MESA_SHADER_COMPUTE && devinfo->verx10 < 125) { 106 /* For compute shaders, we always have to have the subgroup ID. The 107 * back-end compiler will "helpfully" add it for us in the last push 108 * constant slot. Yes, there is an off-by-one error here but that's 109 * because the back-end will add it so we want to claim the number of 110 * push constants one dword less than the full amount including 111 * gl_SubgroupId. 112 */ 113 assert(push_end <= offsetof(struct anv_push_constants, cs.subgroup_id)); 114 push_end = offsetof(struct anv_push_constants, cs.subgroup_id); 115 } 116 117 /* Align push_start down to a 32B boundary and make it no larger than 118 * push_end (no push constants is indicated by push_start = UINT_MAX). 119 */ 120 push_start = MIN2(push_start, push_end); 121 push_start = align_down_u32(push_start, 32); 122 123 /* For vec4 our push data size needs to be aligned to a vec4 and for 124 * scalar, it needs to be aligned to a DWORD. 125 */ 126 const unsigned align = compiler->scalar_stage[nir->info.stage] ? 4 : 16; 127 nir->num_uniforms = ALIGN(push_end - push_start, align); 128 prog_data->nr_params = nir->num_uniforms / 4; 129 prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params); 130 131 struct anv_push_range push_constant_range = { 132 .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, 133 .start = push_start / 32, 134 .length = DIV_ROUND_UP(push_end - push_start, 32), 135 }; 136 137 if (has_push_intrinsic) { 138 nir_foreach_function(function, nir) { 139 if (!function->impl) 140 continue; 141 142 nir_builder build, *b = &build; 143 nir_builder_init(b, function->impl); 144 145 nir_foreach_block(block, function->impl) { 146 nir_foreach_instr_safe(instr, block) { 147 if (instr->type != nir_instr_type_intrinsic) 148 continue; 149 150 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 151 switch (intrin->intrinsic) { 152 case nir_intrinsic_load_push_constant: { 153 /* With bindless shaders we load uniforms with SEND 154 * messages. All the push constants are located after the 155 * RT_DISPATCH_GLOBALS. We just need to add the offset to 156 * the address right after RT_DISPATCH_GLOBALS (see 157 * brw_nir_lower_rt_intrinsics.c). 158 */ 159 unsigned base_offset = 160 brw_shader_stage_is_bindless(nir->info.stage) ? 0 : push_start; 161 intrin->intrinsic = nir_intrinsic_load_uniform; 162 nir_intrinsic_set_base(intrin, 163 nir_intrinsic_base(intrin) - 164 base_offset); 165 break; 166 } 167 168 case nir_intrinsic_load_desc_set_address_intel: { 169 b->cursor = nir_before_instr(&intrin->instr); 170 nir_ssa_def *pc_load = nir_load_uniform(b, 1, 64, 171 nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint64_t)), 172 .base = offsetof(struct anv_push_constants, desc_sets), 173 .range = sizeof_field(struct anv_push_constants, desc_sets), 174 .dest_type = nir_type_uint64); 175 nir_ssa_def_rewrite_uses(&intrin->dest.ssa, pc_load); 176 break; 177 } 178 179 default: 180 break; 181 } 182 } 183 } 184 } 185 } 186 187 if (push_ubo_ranges) { 188 brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); 189 190 /* The vec4 back-end pushes at most 32 regs while the scalar back-end 191 * pushes up to 64. This is primarily because the scalar back-end has a 192 * massively more competent register allocator and so the risk of 193 * spilling due to UBO pushing isn't nearly as high. 194 */ 195 const unsigned max_push_regs = 196 compiler->scalar_stage[nir->info.stage] ? 64 : 32; 197 198 unsigned total_push_regs = push_constant_range.length; 199 for (unsigned i = 0; i < 4; i++) { 200 if (total_push_regs + prog_data->ubo_ranges[i].length > max_push_regs) 201 prog_data->ubo_ranges[i].length = max_push_regs - total_push_regs; 202 total_push_regs += prog_data->ubo_ranges[i].length; 203 } 204 assert(total_push_regs <= max_push_regs); 205 206 int n = 0; 207 208 if (push_constant_range.length > 0) 209 map->push_ranges[n++] = push_constant_range; 210 211 if (robust_buffer_access) { 212 const uint32_t push_reg_mask_offset = 213 offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]); 214 assert(push_reg_mask_offset >= push_start); 215 prog_data->push_reg_mask_param = 216 (push_reg_mask_offset - push_start) / 4; 217 } 218 219 unsigned range_start_reg = push_constant_range.length; 220 221 for (int i = 0; i < 4; i++) { 222 struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; 223 if (ubo_range->length == 0) 224 continue; 225 226 if (n >= 4 || (n == 3 && compiler->constant_buffer_0_is_relative)) { 227 memset(ubo_range, 0, sizeof(*ubo_range)); 228 continue; 229 } 230 231 const struct anv_pipeline_binding *binding = 232 &map->surface_to_descriptor[ubo_range->block]; 233 234 map->push_ranges[n++] = (struct anv_push_range) { 235 .set = binding->set, 236 .index = binding->index, 237 .dynamic_offset_index = binding->dynamic_offset_index, 238 .start = ubo_range->start, 239 .length = ubo_range->length, 240 }; 241 242 /* We only bother to shader-zero pushed client UBOs */ 243 if (binding->set < MAX_SETS && robust_buffer_access) { 244 prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg, 245 ubo_range->length); 246 } 247 248 range_start_reg += ubo_range->length; 249 } 250 } else { 251 /* For Ivy Bridge, the push constants packets have a different 252 * rule that would require us to iterate in the other direction 253 * and possibly mess around with dynamic state base address. 254 * Don't bother; just emit regular push constants at n = 0. 255 * 256 * In the compute case, we don't have multiple push ranges so it's 257 * better to just provide one in push_ranges[0]. 258 */ 259 map->push_ranges[0] = push_constant_range; 260 } 261 262 /* Now that we're done computing the push constant portion of the 263 * bind map, hash it. This lets us quickly determine if the actual 264 * mapping has changed and not just a no-op pipeline change. 265 */ 266 _mesa_sha1_compute(map->push_ranges, 267 sizeof(map->push_ranges), 268 map->push_sha1); 269} 270 271void 272anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data, 273 struct anv_pipeline_bind_map *map) 274{ 275#ifndef NDEBUG 276 unsigned prog_data_push_size = DIV_ROUND_UP(prog_data->nr_params, 8); 277 for (unsigned i = 0; i < 4; i++) 278 prog_data_push_size += prog_data->ubo_ranges[i].length; 279 280 unsigned bind_map_push_size = 0; 281 for (unsigned i = 0; i < 4; i++) 282 bind_map_push_size += map->push_ranges[i].length; 283 284 /* We could go through everything again but it should be enough to assert 285 * that they push the same number of registers. This should alert us if 286 * the back-end compiler decides to re-arrange stuff or shrink a range. 287 */ 288 assert(prog_data_push_size == bind_map_push_size); 289#endif 290} 291