1/* 2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors: 24 * Jonathan Marek <jonathan@marek.ca> 25 */ 26 27#include "ir2_private.h" 28 29static bool scalar_possible(struct ir2_instr *instr) 30{ 31 if (instr->alu.scalar_opc == SCALAR_NONE) 32 return false; 33 34 return src_ncomp(instr) == 1; 35} 36 37static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b) 38{ 39 if (!a) 40 return true; 41 42 /* dont use same instruction twice */ 43 if (a == b) 44 return false; 45 46 /* PRED_SET must be alone */ 47 if (b->alu.scalar_opc >= PRED_SETEs && 48 b->alu.scalar_opc <= PRED_SET_RESTOREs) 49 return false; 50 51 /* must write to same export (issues otherwise?) */ 52 return a->alu.export == b->alu.export; 53} 54 55/* priority of vector instruction for scheduling (lower=higher prio) */ 56static unsigned alu_vector_prio(struct ir2_instr *instr) 57{ 58 if (instr->alu.vector_opc == VECTOR_NONE) 59 return ~0u; 60 61 if (is_export(instr)) 62 return 4; 63 64 /* TODO check src type and ncomps */ 65 if (instr->src_count == 3) 66 return 0; 67 68 if (!scalar_possible(instr)) 69 return 1; 70 71 return instr->src_count == 2 ? 2 : 3; 72} 73 74/* priority of scalar instruction for scheduling (lower=higher prio) */ 75static unsigned alu_scalar_prio(struct ir2_instr *instr) 76{ 77 if (!scalar_possible(instr)) 78 return ~0u; 79 80 /* this case is dealt with later */ 81 if (instr->src_count > 1) 82 return ~0u; 83 84 if (is_export(instr)) 85 return 4; 86 87 /* PRED to end of block */ 88 if (instr->alu.scalar_opc >= PRED_SETEs && 89 instr->alu.scalar_opc <= PRED_SET_RESTOREs) 90 return 5; 91 92 /* scalar only have highest priority */ 93 return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3; 94} 95 96/* this is a bit messy: 97 * we want to find a slot where we can insert a scalar MOV with 98 * a vector instruction that was already scheduled 99 */ 100static struct ir2_sched_instr* 101insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx, 102 struct ir2_src src1, unsigned *comp) 103{ 104 struct ir2_sched_instr *sched = NULL, *s; 105 unsigned i, mask = 0xf; 106 107 /* go first earliest point where the mov can be inserted */ 108 for (i = ctx->instr_sched_count-1; i > 0; i--) { 109 s = &ctx->instr_sched[i - 1]; 110 111 if (s->instr && s->instr->block_idx != block_idx) 112 break; 113 if (s->instr_s && s->instr_s->block_idx != block_idx) 114 break; 115 116 if (src1.type == IR2_SRC_SSA) { 117 if ((s->instr && s->instr->idx == src1.num) || 118 (s->instr_s && s->instr_s->idx == src1.num)) 119 break; 120 } 121 122 unsigned mr = ~(s->reg_state[reg_idx/8] >> reg_idx%8*4 & 0xf); 123 if ((mask & mr) == 0) 124 break; 125 126 mask &= mr; 127 if (s->instr_s || s->instr->src_count == 3) 128 continue; 129 130 if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0) 131 continue; 132 133 sched = s; 134 } 135 *comp = ffs(mask) - 1; 136 return sched; 137} 138 139/* case1: 140 * in this case, insert a mov to place the 2nd src into to same reg 141 * (scalar sources come from the same register) 142 * 143 * this is a common case which works when one of the srcs is input/const 144 * but for instrs which have 2 ssa/reg srcs, then its not ideal 145 */ 146static bool 147scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order) 148{ 149 struct ir2_src src0 = instr->src[ order]; 150 struct ir2_src src1 = instr->src[!order]; 151 struct ir2_sched_instr *sched; 152 struct ir2_instr *ins; 153 struct ir2_reg *reg; 154 unsigned idx, comp; 155 156 switch (src0.type) { 157 case IR2_SRC_CONST: 158 case IR2_SRC_INPUT: 159 return false; 160 default: 161 break; 162 } 163 164 /* TODO, insert needs logic for this */ 165 if (src1.type == IR2_SRC_REG) 166 return false; 167 168 /* we could do something if they match src1.. */ 169 if (src0.negate || src0.abs) 170 return false; 171 172 reg = get_reg_src(ctx, &src0); 173 174 /* result not used more since we will overwrite */ 175 for (int i = 0; i < 4; i++) 176 if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i)) 177 return false; 178 179 /* find a place to insert the mov */ 180 sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp); 181 if (!sched) 182 return false; 183 184 ins = &ctx->instr[idx = ctx->instr_count++]; 185 ins->idx = idx; 186 ins->type = IR2_ALU; 187 ins->src[0] = src1; 188 ins->src_count = 1; 189 ins->is_ssa = true; 190 ins->ssa.idx = reg->idx; 191 ins->ssa.ncomp = 1; 192 ins->ssa.comp[0].c = comp; 193 ins->alu.scalar_opc = MAXs; 194 ins->alu.export = -1; 195 ins->alu.write_mask = 1; 196 ins->pred = instr->pred; 197 ins->block_idx = instr->block_idx; 198 199 instr->src[0] = src0; 200 instr->alu.src1_swizzle = comp; 201 202 sched->instr_s = ins; 203 return true; 204} 205 206/* fill sched with next fetch or (vector and/or scalar) alu instruction */ 207static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched) 208{ 209 struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL; 210 unsigned avail_count = 0; 211 212 instr_alloc_type_t export = ~0u; 213 int block_idx = -1; 214 215 /* XXX merge this loop with the other one somehow? */ 216 ir2_foreach_instr(instr, ctx) { 217 if (!instr->need_emit) 218 continue; 219 if (is_export(instr)) 220 export = MIN2(export, export_buf(instr->alu.export)); 221 } 222 223 ir2_foreach_instr(instr, ctx) { 224 if (!instr->need_emit) 225 continue; 226 227 /* dont mix exports */ 228 if (is_export(instr) && export_buf(instr->alu.export) != export) 229 continue; 230 231 if (block_idx < 0) 232 block_idx = instr->block_idx; 233 else if (block_idx != instr->block_idx || /* must be same block */ 234 instr->type == IR2_CF || /* CF/MEM must be alone */ 235 (is_export(instr) && export == SQ_MEMORY)) 236 break; 237 /* it works because IR2_CF is always at end of block 238 * and somewhat same idea with MEM exports, which might not be alone 239 * but will end up in-order at least 240 */ 241 242 /* check if dependencies are satisfied */ 243 bool is_ok = true; 244 ir2_foreach_src(src, instr) { 245 if (src->type == IR2_SRC_REG) { 246 /* need to check if all previous instructions in the block 247 * which write the reg have been emitted 248 * slow.. 249 * XXX: check components instead of whole register 250 */ 251 struct ir2_reg *reg = get_reg_src(ctx, src); 252 ir2_foreach_instr(p, ctx) { 253 if (!p->is_ssa && p->reg == reg && p->idx < instr->idx) 254 is_ok &= !p->need_emit; 255 } 256 } else if (src->type == IR2_SRC_SSA) { 257 /* in this case its easy, just check need_emit */ 258 is_ok &= !ctx->instr[src->num].need_emit; 259 } 260 } 261 if (!is_ok) 262 continue; 263 264 avail[avail_count++] = instr; 265 } 266 267 if (!avail_count) { 268 assert(block_idx == -1); 269 return -1; 270 } 271 272 /* priority to FETCH instructions */ 273 ir2_foreach_avail(instr) { 274 if (instr->type == IR2_ALU) 275 continue; 276 277 ra_src_free(ctx, instr); 278 ra_reg(ctx, get_reg(instr), -1, false, 0); 279 280 instr->need_emit = false; 281 sched->instr = instr; 282 sched->instr_s = NULL; 283 return block_idx; 284 } 285 286 /* TODO precompute priorities */ 287 288 unsigned prio_v = ~0u, prio_s = ~0u, prio; 289 ir2_foreach_avail(instr) { 290 prio = alu_vector_prio(instr); 291 if (prio < prio_v) { 292 instr_v = instr; 293 prio_v = prio; 294 } 295 } 296 297 /* TODO can still insert scalar if src_count=3, if smart about it */ 298 if (!instr_v || instr_v->src_count < 3) { 299 ir2_foreach_avail(instr) { 300 bool compat = is_alu_compatible(instr_v, instr); 301 302 prio = alu_scalar_prio(instr); 303 if (prio >= prio_v && !compat) 304 continue; 305 306 if (prio < prio_s) { 307 instr_s = instr; 308 prio_s = prio; 309 if (!compat) 310 instr_v = NULL; 311 } 312 } 313 } 314 315 assert(instr_v || instr_s); 316 317 /* now, we try more complex insertion of vector instruction as scalar 318 * TODO: if we are smart we can still insert if instr_v->src_count==3 319 */ 320 if (!instr_s && instr_v->src_count < 3) { 321 ir2_foreach_avail(instr) { 322 if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr)) 323 continue; 324 325 /* at this point, src_count should always be 2 */ 326 assert(instr->src_count == 2); 327 328 if (scalarize_case1(ctx, instr, 0)) { 329 instr_s = instr; 330 break; 331 } 332 if (scalarize_case1(ctx, instr, 1)) { 333 instr_s = instr; 334 break; 335 } 336 } 337 } 338 339 /* free src registers */ 340 if (instr_v) { 341 instr_v->need_emit = false; 342 ra_src_free(ctx, instr_v); 343 } 344 345 if (instr_s) { 346 instr_s->need_emit = false; 347 ra_src_free(ctx, instr_s); 348 } 349 350 /* allocate dst registers */ 351 if (instr_v) 352 ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask); 353 354 if (instr_s) 355 ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask); 356 357 sched->instr = instr_v; 358 sched->instr_s = instr_s; 359 return block_idx; 360} 361 362/* scheduling: determine order of instructions */ 363static void schedule_instrs(struct ir2_context *ctx) 364{ 365 struct ir2_sched_instr *sched; 366 int block_idx; 367 368 /* allocate input registers */ 369 for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++) 370 if (ctx->input[idx].initialized) 371 ra_reg(ctx, &ctx->input[idx], idx, false, 0); 372 373 for (;;) { 374 sched = &ctx->instr_sched[ctx->instr_sched_count++]; 375 block_idx = sched_next(ctx, sched); 376 if (block_idx < 0) 377 break; 378 memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state)); 379 380 /* catch texture fetch after scheduling and insert the 381 * SET_TEX_LOD right before it if necessary 382 * TODO clean this up 383 */ 384 struct ir2_instr *instr = sched->instr, *tex_lod; 385 if (instr && instr->type == IR2_FETCH && 386 instr->fetch.opc == TEX_FETCH && instr->src_count == 2) { 387 /* generate the SET_LOD instruction */ 388 tex_lod = &ctx->instr[ctx->instr_count++]; 389 tex_lod->type = IR2_FETCH; 390 tex_lod->block_idx = instr->block_idx; 391 tex_lod->pred = instr->pred; 392 tex_lod->fetch.opc = TEX_SET_TEX_LOD; 393 tex_lod->src[0] = instr->src[1]; 394 tex_lod->src_count = 1; 395 396 sched[1] = sched[0]; 397 sched->instr = tex_lod; 398 ctx->instr_sched_count++; 399 } 400 401 bool free_block = true; 402 ir2_foreach_instr(instr, ctx) 403 free_block &= instr->block_idx != block_idx; 404 if (free_block) 405 ra_block_free(ctx, block_idx); 406 }; 407 ctx->instr_sched_count--; 408} 409 410void 411ir2_compile(struct fd2_shader_stateobj *so, unsigned variant, 412 struct fd2_shader_stateobj *fp) 413{ 414 struct ir2_context ctx = { }; 415 bool binning = !fp && so->type == MESA_SHADER_VERTEX; 416 417 if (fp) 418 so->variant[variant].f = fp->variant[0].f; 419 420 ctx.so = so; 421 ctx.info = &so->variant[variant].info; 422 ctx.f = &so->variant[variant].f; 423 ctx.info->max_reg = -1; 424 425 /* convert nir to internal representation */ 426 ir2_nir_compile(&ctx, binning); 427 428 /* copy propagate srcs */ 429 cp_src(&ctx); 430 431 /* get ref_counts and kill non-needed instructions */ 432 ra_count_refs(&ctx); 433 434 /* remove movs used to write outputs */ 435 cp_export(&ctx); 436 437 /* instruction order.. and vector->scalar conversions */ 438 schedule_instrs(&ctx); 439 440 /* finally, assemble to bitcode */ 441 assemble(&ctx, binning); 442} 443