1/* 2 * Copyright © 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_nir.h" 25#include "compiler/nir/nir.h" 26#include "util/u_dynarray.h" 27 28/** 29 * \file brw_nir_analyze_ubo_ranges.c 30 * 31 * This pass decides which portions of UBOs to upload as push constants, 32 * so shaders can access them as part of the thread payload, rather than 33 * having to issue expensive memory reads to pull the data. 34 * 35 * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different 36 * buffers, in GRF (256-bit/32-byte) units. 37 * 38 * To do this, we examine NIR load_ubo intrinsics, recording the number of 39 * loads at each offset. We track offsets at a 32-byte granularity, so even 40 * fields with a bit of padding between them tend to fall into contiguous 41 * ranges. We build a list of these ranges, tracking their "cost" (number 42 * of registers required) and "benefit" (number of pull loads eliminated 43 * by pushing the range). We then sort the list to obtain the four best 44 * ranges (most benefit for the least cost). 45 */ 46 47struct ubo_range_entry 48{ 49 struct brw_ubo_range range; 50 int benefit; 51}; 52 53static int 54score(const struct ubo_range_entry *entry) 55{ 56 return 2 * entry->benefit - entry->range.length; 57} 58 59/** 60 * Compares score for two UBO range entries. 61 * 62 * For a descending qsort(). 63 */ 64static int 65cmp_ubo_range_entry(const void *va, const void *vb) 66{ 67 const struct ubo_range_entry *a = va; 68 const struct ubo_range_entry *b = vb; 69 70 /* Rank based on scores */ 71 int delta = score(b) - score(a); 72 73 /* Then use the UBO block index as a tie-breaker */ 74 if (delta == 0) 75 delta = b->range.block - a->range.block; 76 77 /* Finally use the UBO offset as a second tie-breaker */ 78 if (delta == 0) 79 delta = b->range.block - a->range.block; 80 81 return delta; 82} 83 84struct ubo_block_info 85{ 86 /* Each bit in the offsets bitfield represents a 32-byte section of data. 87 * If it's set to one, there is interesting UBO data at that offset. If 88 * not, there's a "hole" - padding between data - or just nothing at all. 89 */ 90 uint64_t offsets; 91 uint8_t uses[64]; 92}; 93 94struct ubo_analysis_state 95{ 96 struct hash_table *blocks; 97 bool uses_regular_uniforms; 98}; 99 100static struct ubo_block_info * 101get_block_info(struct ubo_analysis_state *state, int block) 102{ 103 uint32_t hash = block + 1; 104 void *key = (void *) (uintptr_t) hash; 105 106 struct hash_entry *entry = 107 _mesa_hash_table_search_pre_hashed(state->blocks, hash, key); 108 109 if (entry) 110 return (struct ubo_block_info *) entry->data; 111 112 struct ubo_block_info *info = 113 rzalloc(state->blocks, struct ubo_block_info); 114 _mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info); 115 116 return info; 117} 118 119static void 120analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block) 121{ 122 nir_foreach_instr(instr, block) { 123 if (instr->type != nir_instr_type_intrinsic) 124 continue; 125 126 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 127 switch (intrin->intrinsic) { 128 case nir_intrinsic_load_uniform: 129 case nir_intrinsic_image_deref_load: 130 case nir_intrinsic_image_deref_store: 131 case nir_intrinsic_image_deref_atomic_add: 132 case nir_intrinsic_image_deref_atomic_min: 133 case nir_intrinsic_image_deref_atomic_max: 134 case nir_intrinsic_image_deref_atomic_and: 135 case nir_intrinsic_image_deref_atomic_or: 136 case nir_intrinsic_image_deref_atomic_xor: 137 case nir_intrinsic_image_deref_atomic_exchange: 138 case nir_intrinsic_image_deref_atomic_comp_swap: 139 case nir_intrinsic_image_deref_size: 140 state->uses_regular_uniforms = true; 141 continue; 142 143 case nir_intrinsic_load_ubo: 144 break; /* Fall through to the analysis below */ 145 146 default: 147 continue; /* Not a uniform or UBO intrinsic */ 148 } 149 150 if (nir_src_is_const(intrin->src[0]) && 151 nir_src_is_const(intrin->src[1])) { 152 const int block = nir_src_as_uint(intrin->src[0]); 153 const unsigned byte_offset = nir_src_as_uint(intrin->src[1]); 154 const int offset = byte_offset / 32; 155 156 /* Avoid shifting by larger than the width of our bitfield, as this 157 * is undefined in C. Even if we require multiple bits to represent 158 * the entire value, it's OK to record a partial value - the backend 159 * is capable of falling back to pull loads for later components of 160 * vectors, as it has to shrink ranges for other reasons anyway. 161 */ 162 if (offset >= 64) 163 continue; 164 165 /* The value might span multiple 32-byte chunks. */ 166 const int bytes = nir_intrinsic_dest_components(intrin) * 167 (nir_dest_bit_size(intrin->dest) / 8); 168 const int start = ROUND_DOWN_TO(byte_offset, 32); 169 const int end = ALIGN(byte_offset + bytes, 32); 170 const int chunks = (end - start) / 32; 171 172 /* TODO: should we count uses in loops as higher benefit? */ 173 174 struct ubo_block_info *info = get_block_info(state, block); 175 info->offsets |= ((1ull << chunks) - 1) << offset; 176 info->uses[offset]++; 177 } 178 } 179} 180 181static void 182print_ubo_entry(FILE *file, 183 const struct ubo_range_entry *entry, 184 struct ubo_analysis_state *state) 185{ 186 struct ubo_block_info *info = get_block_info(state, entry->range.block); 187 188 fprintf(file, 189 "block %2d, start %2d, length %2d, bits = %"PRIx64", " 190 "benefit %2d, cost %2d, score = %2d\n", 191 entry->range.block, entry->range.start, entry->range.length, 192 info->offsets, entry->benefit, entry->range.length, score(entry)); 193} 194 195void 196brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler, 197 nir_shader *nir, 198 const struct brw_vs_prog_key *vs_key, 199 struct brw_ubo_range out_ranges[4]) 200{ 201 const struct gen_device_info *devinfo = compiler->devinfo; 202 203 if ((devinfo->gen <= 7 && !devinfo->is_haswell) || 204 !compiler->scalar_stage[nir->info.stage]) { 205 memset(out_ranges, 0, 4 * sizeof(struct brw_ubo_range)); 206 return; 207 } 208 209 void *mem_ctx = ralloc_context(NULL); 210 211 struct ubo_analysis_state state = { 212 .uses_regular_uniforms = false, 213 .blocks = 214 _mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal), 215 }; 216 217 switch (nir->info.stage) { 218 case MESA_SHADER_VERTEX: 219 if (vs_key && vs_key->nr_userclip_plane_consts > 0) 220 state.uses_regular_uniforms = true; 221 break; 222 223 case MESA_SHADER_COMPUTE: 224 /* Compute shaders use push constants to get the subgroup ID so it's 225 * best to just assume some system values are pushed. 226 */ 227 state.uses_regular_uniforms = true; 228 break; 229 230 default: 231 break; 232 } 233 234 /* Walk the IR, recording how many times each UBO block/offset is used. */ 235 nir_foreach_function(function, nir) { 236 if (function->impl) { 237 nir_foreach_block(block, function->impl) { 238 analyze_ubos_block(&state, block); 239 } 240 } 241 } 242 243 /* Find ranges: a block, starting 32-byte offset, and length. */ 244 struct util_dynarray ranges; 245 util_dynarray_init(&ranges, mem_ctx); 246 247 hash_table_foreach(state.blocks, entry) { 248 const int b = entry->hash - 1; 249 const struct ubo_block_info *info = entry->data; 250 uint64_t offsets = info->offsets; 251 252 /* Walk through the offsets bitfield, finding contiguous regions of 253 * set bits: 254 * 255 * 0000000001111111111111000000000000111111111111110000000011111100 256 * ^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^ 257 * 258 * Each of these will become a UBO range. 259 */ 260 while (offsets != 0) { 261 /* Find the first 1 in the offsets bitfield. This represents the 262 * start of a range of interesting UBO data. Make it zero-indexed. 263 */ 264 int first_bit = ffsll(offsets) - 1; 265 266 /* Find the first 0 bit in offsets beyond first_bit. To find the 267 * first zero bit, we find the first 1 bit in the complement. In 268 * order to ignore bits before first_bit, we mask off those bits. 269 */ 270 int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1; 271 272 if (first_hole == -1) { 273 /* If we didn't find a hole, then set it to the end of the 274 * bitfield. There are no more ranges to process. 275 */ 276 first_hole = 64; 277 offsets = 0; 278 } else { 279 /* We've processed all bits before first_hole. Mask them off. */ 280 offsets &= ~((1ull << first_hole) - 1); 281 } 282 283 struct ubo_range_entry *entry = 284 util_dynarray_grow(&ranges, sizeof(struct ubo_range_entry)); 285 286 entry->range.block = b; 287 entry->range.start = first_bit; 288 /* first_hole is one beyond the end, so we don't need to add 1 */ 289 entry->range.length = first_hole - first_bit; 290 entry->benefit = 0; 291 292 for (int i = 0; i < entry->range.length; i++) 293 entry->benefit += info->uses[first_bit + i]; 294 } 295 } 296 297 int nr_entries = ranges.size / sizeof(struct ubo_range_entry); 298 299 if (0) { 300 util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) { 301 print_ubo_entry(stderr, entry, &state); 302 } 303 } 304 305 /* TODO: Consider combining ranges. 306 * 307 * We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS. If there are 308 * more ranges, and two are close by with only a small hole, it may be 309 * worth combining them. The holes will waste register space, but the 310 * benefit of removing pulls may outweigh that cost. 311 */ 312 313 /* Sort the list so the most beneficial ranges are at the front. */ 314 qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry), 315 cmp_ubo_range_entry); 316 317 struct ubo_range_entry *entries = ranges.data; 318 319 /* Return the top 4 or so. We drop by one if regular uniforms are in 320 * use, assuming one push buffer will be dedicated to those. We may 321 * also only get 3 on Haswell if we can't write INSTPM. 322 * 323 * The backend may need to shrink these ranges to ensure that they 324 * don't exceed the maximum push constant limits. It can simply drop 325 * the tail of the list, as that's the least valuable portion. We 326 * unfortunately can't truncate it here, because we don't know what 327 * the backend is planning to do with regular uniforms. 328 */ 329 const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) - 330 state.uses_regular_uniforms; 331 nr_entries = MIN2(nr_entries, max_ubos); 332 333 for (int i = 0; i < nr_entries; i++) { 334 out_ranges[i] = entries[i].range; 335 } 336 for (int i = nr_entries; i < 4; i++) { 337 out_ranges[i].block = 0; 338 out_ranges[i].start = 0; 339 out_ranges[i].length = 0; 340 } 341 342 ralloc_free(ranges.mem_ctx); 343} 344