1/* 2 * Copyright (C) 2019 Collabora, Ltd. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors (Collabora): 24 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> 25 */ 26 27#include "compiler.h" 28#include "util/u_math.h" 29#include "util/u_memory.h" 30 31/* This pass promotes reads from UBOs to register-mapped uniforms. This saves 32 * both instructions and work register pressure, but it reduces the work 33 * registers available, requiring a balance. 34 * 35 * We use a heuristic to determine the ideal count, implemented by 36 * mir_work_heuristic, which returns the ideal number of work registers. 37 */ 38 39static bool 40mir_is_ubo(midgard_instruction *ins) 41{ 42 return (ins->type == TAG_LOAD_STORE_4) && 43 (OP_IS_UBO_READ(ins->op)); 44} 45 46static bool 47mir_is_direct_aligned_ubo(midgard_instruction *ins) 48{ 49 return mir_is_ubo(ins) && 50 !(ins->constants.u32[0] & 0xF) && 51 (ins->src[1] == ~0) && 52 (ins->src[2] == ~0); 53} 54 55/* Represents use data for a single UBO */ 56 57#define MAX_UBO_QWORDS (65536 / 16) 58 59struct mir_ubo_block { 60 BITSET_DECLARE(uses, MAX_UBO_QWORDS); 61 BITSET_DECLARE(pushed, MAX_UBO_QWORDS); 62}; 63 64struct mir_ubo_analysis { 65 /* Per block analysis */ 66 unsigned nr_blocks; 67 struct mir_ubo_block *blocks; 68}; 69 70static struct mir_ubo_analysis 71mir_analyze_ranges(compiler_context *ctx) 72{ 73 struct mir_ubo_analysis res = { 74 .nr_blocks = ctx->nir->info.num_ubos + 1, 75 }; 76 77 res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block)); 78 79 mir_foreach_instr_global(ctx, ins) { 80 if (!mir_is_direct_aligned_ubo(ins)) continue; 81 82 unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store); 83 unsigned offset = ins->constants.u32[0] / 16; 84 85 assert(ubo < res.nr_blocks); 86 87 if (offset < MAX_UBO_QWORDS) 88 BITSET_SET(res.blocks[ubo].uses, offset); 89 } 90 91 return res; 92} 93 94/* Select UBO words to push. A sophisticated implementation would consider the 95 * number of uses and perhaps the control flow to estimate benefit. This is not 96 * sophisticated. Select from the last UBO first to prioritize sysvals. */ 97 98static void 99mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, unsigned max_qwords) 100{ 101 unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4); 102 103 for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) { 104 struct mir_ubo_block *block = &analysis->blocks[ubo]; 105 106 unsigned vec4; 107 BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) { 108 /* Don't push more than possible */ 109 if (push->count > max_words - 4) 110 return; 111 112 for (unsigned offs = 0; offs < 4; ++offs) { 113 struct panfrost_ubo_word word = { 114 .ubo = ubo, 115 .offset = (vec4 * 16) + (offs * 4) 116 }; 117 118 push->words[push->count++] = word; 119 } 120 121 /* Mark it as pushed so we can rewrite */ 122 BITSET_SET(block->pushed, vec4); 123 } 124 } 125} 126 127#if 0 128static void 129mir_dump_ubo_analysis(struct mir_ubo_analysis *res) 130{ 131 printf("%u blocks\n", res->nr_blocks); 132 133 for (unsigned i = 0; i < res->nr_blocks; ++i) { 134 BITSET_WORD *uses = res->blocks[i].uses; 135 BITSET_WORD *push = res->blocks[i].pushed; 136 137 unsigned last = BITSET_LAST_BIT_SIZED(uses, BITSET_WORDS(MAX_UBO_QWORDS)); 138 139 printf("\t"); 140 141 for (unsigned j = 0; j < last; ++j) { 142 bool used = BITSET_TEST(uses, j); 143 bool pushed = BITSET_TEST(push, j); 144 assert(used || !pushed); 145 146 putchar(pushed ? '*' : used ? '-' : '_'); 147 } 148 149 printf("\n"); 150 } 151} 152#endif 153 154static unsigned 155mir_promoteable_uniform_count(struct mir_ubo_analysis *analysis) 156{ 157 unsigned count = 0; 158 159 for (unsigned i = 0; i < analysis->nr_blocks; ++i) { 160 BITSET_WORD *uses = analysis->blocks[i].uses; 161 162 for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w) 163 count += util_bitcount(uses[w]); 164 } 165 166 return count; 167} 168 169static unsigned 170mir_count_live(uint16_t *live, unsigned temp_count) 171{ 172 unsigned count = 0; 173 174 for (unsigned i = 0; i < temp_count; ++i) 175 count += util_bitcount(live[i]); 176 177 return count; 178} 179 180static unsigned 181mir_estimate_pressure(compiler_context *ctx) 182{ 183 mir_invalidate_liveness(ctx); 184 mir_compute_liveness(ctx); 185 186 unsigned max_live = 0; 187 188 mir_foreach_block(ctx, _block) { 189 midgard_block *block = (midgard_block *) _block; 190 uint16_t *live = mem_dup(block->base.live_out, ctx->temp_count * sizeof(uint16_t)); 191 192 mir_foreach_instr_in_block_rev(block, ins) { 193 unsigned count = mir_count_live(live, ctx->temp_count); 194 max_live = MAX2(max_live, count); 195 mir_liveness_ins_update(live, ins, ctx->temp_count); 196 } 197 198 free(live); 199 } 200 201 return DIV_ROUND_UP(max_live, 16); 202} 203 204static unsigned 205mir_work_heuristic(compiler_context *ctx, struct mir_ubo_analysis *analysis) 206{ 207 unsigned uniform_count = mir_promoteable_uniform_count(analysis); 208 209 /* If there are 8 or fewer uniforms, it doesn't matter what we do, so 210 * allow as many work registers as needed */ 211 212 if (uniform_count <= 8) 213 return 16; 214 215 /* Otherwise, estimate the register pressure */ 216 217 unsigned pressure = mir_estimate_pressure(ctx); 218 219 /* Prioritize not spilling above all else. The relation between the 220 * pressure estimate and the actual register pressure is a little 221 * murkier than we might like (due to scheduling, pipeline registers, 222 * failure to pack vector registers, load/store registers, texture 223 * registers...), hence why this is a heuristic parameter */ 224 225 if (pressure > 6) 226 return 16; 227 228 /* If there's no chance of spilling, prioritize UBOs and thread count */ 229 230 return 8; 231} 232 233/* Bitset of indices that will be used as a special register -- inputs to a 234 * non-ALU op. We precompute this set so that testing is efficient, otherwise 235 * we end up O(mn) behaviour for n instructions and m uniform reads */ 236 237static BITSET_WORD * 238mir_special_indices(compiler_context *ctx) 239{ 240 mir_compute_temp_count(ctx); 241 BITSET_WORD *bset = calloc(BITSET_WORDS(ctx->temp_count), sizeof(BITSET_WORD)); 242 243 mir_foreach_instr_global(ctx, ins) { 244 /* Look for special instructions */ 245 bool is_ldst = ins->type == TAG_LOAD_STORE_4; 246 bool is_tex = ins->type == TAG_TEXTURE_4; 247 bool is_writeout = ins->compact_branch && ins->writeout; 248 249 if (!(is_ldst || is_tex || is_writeout)) 250 continue; 251 252 /* Anything read by a special instruction is itself special */ 253 mir_foreach_src(ins, i) { 254 unsigned idx = ins->src[i]; 255 256 if (idx < ctx->temp_count) 257 BITSET_SET(bset, idx); 258 } 259 } 260 261 return bset; 262} 263 264void 265midgard_promote_uniforms(compiler_context *ctx) 266{ 267 if (ctx->inputs->no_ubo_to_push) { 268 /* If nothing is pushed, all UBOs need to be uploaded 269 * conventionally */ 270 ctx->ubo_mask = ~0; 271 return; 272 } 273 274 struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx); 275 276 unsigned work_count = mir_work_heuristic(ctx, &analysis); 277 unsigned promoted_count = 24 - work_count; 278 279 /* Ensure we are 16 byte aligned to avoid underallocations */ 280 mir_pick_ubo(&ctx->info->push, &analysis, promoted_count); 281 ctx->info->push.count = ALIGN_POT(ctx->info->push.count, 4); 282 283 /* First, figure out special indices a priori so we don't recompute a lot */ 284 BITSET_WORD *special = mir_special_indices(ctx); 285 286 ctx->ubo_mask = 0; 287 288 mir_foreach_instr_global_safe(ctx, ins) { 289 if (!mir_is_ubo(ins)) continue; 290 291 unsigned ubo = midgard_unpack_ubo_index_imm(ins->load_store); 292 unsigned qword = ins->constants.u32[0] / 16; 293 294 if (!mir_is_direct_aligned_ubo(ins)) { 295 if (ins->src[1] == ~0) 296 ctx->ubo_mask |= BITSET_BIT(ubo); 297 else 298 ctx->ubo_mask = ~0; 299 300 continue; 301 } 302 303 /* Check if we decided to push this */ 304 assert(ubo < analysis.nr_blocks); 305 if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) { 306 ctx->ubo_mask |= BITSET_BIT(ubo); 307 continue; 308 } 309 310 /* Find where we pushed to, TODO: unaligned pushes to pack */ 311 unsigned base = pan_lookup_pushed_ubo(&ctx->info->push, ubo, qword * 16); 312 assert((base & 0x3) == 0); 313 314 unsigned address = base / 4; 315 unsigned uniform_reg = 23 - address; 316 317 /* Should've taken into account when pushing */ 318 assert(address < promoted_count); 319 unsigned promoted = SSA_FIXED_REGISTER(uniform_reg); 320 321 /* We do need the move for safety for a non-SSA dest, or if 322 * we're being fed into a special class */ 323 324 bool needs_move = ins->dest & PAN_IS_REG || ins->dest == ctx->blend_src1; 325 326 if (ins->dest < ctx->temp_count) 327 needs_move |= BITSET_TEST(special, ins->dest); 328 329 if (needs_move) { 330 unsigned type_size = nir_alu_type_get_type_size(ins->dest_type); 331 midgard_instruction mov = v_mov(promoted, ins->dest); 332 mov.dest_type = nir_type_uint | type_size; 333 mov.src_types[1] = mov.dest_type; 334 335 uint16_t rounded = mir_round_bytemask_up(mir_bytemask(ins), type_size); 336 mir_set_bytemask(&mov, rounded); 337 mir_insert_instruction_before(ctx, ins, mov); 338 } else { 339 mir_rewrite_index_src(ctx, ins->dest, promoted); 340 } 341 342 mir_remove_instruction(ins); 343 } 344 345 free(special); 346 free(analysis.blocks); 347} 348