1b8e80941Smrg/* 2b8e80941Smrg * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca> 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20b8e80941Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21b8e80941Smrg * SOFTWARE. 22b8e80941Smrg * 23b8e80941Smrg * Authors: 24b8e80941Smrg * Jonathan Marek <jonathan@marek.ca> 25b8e80941Smrg */ 26b8e80941Smrg 27b8e80941Smrg#include "ir2_private.h" 28b8e80941Smrg 29b8e80941Smrgstatic bool scalar_possible(struct ir2_instr *instr) 30b8e80941Smrg{ 31b8e80941Smrg if (instr->alu.scalar_opc == SCALAR_NONE) 32b8e80941Smrg return false; 33b8e80941Smrg 34b8e80941Smrg return src_ncomp(instr) == 1; 35b8e80941Smrg} 36b8e80941Smrg 37b8e80941Smrgstatic bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b) 38b8e80941Smrg{ 39b8e80941Smrg if (!a) 40b8e80941Smrg return true; 41b8e80941Smrg 42b8e80941Smrg /* dont use same instruction twice */ 43b8e80941Smrg if (a == b) 44b8e80941Smrg return false; 45b8e80941Smrg 46b8e80941Smrg /* PRED_SET must be alone */ 47b8e80941Smrg if (b->alu.scalar_opc >= PRED_SETEs && 48b8e80941Smrg b->alu.scalar_opc <= PRED_SET_RESTOREs) 49b8e80941Smrg return false; 50b8e80941Smrg 51b8e80941Smrg /* must write to same export (issues otherwise?) */ 52b8e80941Smrg return a->alu.export == b->alu.export; 53b8e80941Smrg} 54b8e80941Smrg 55b8e80941Smrg/* priority of vector instruction for scheduling (lower=higher prio) */ 56b8e80941Smrgstatic unsigned alu_vector_prio(struct ir2_instr *instr) 57b8e80941Smrg{ 58b8e80941Smrg if (instr->alu.vector_opc == VECTOR_NONE) 59b8e80941Smrg return ~0u; 60b8e80941Smrg 61b8e80941Smrg if (is_export(instr)) 62b8e80941Smrg return 4; 63b8e80941Smrg 64b8e80941Smrg /* TODO check src type and ncomps */ 65b8e80941Smrg if (instr->src_count == 3) 66b8e80941Smrg return 0; 67b8e80941Smrg 68b8e80941Smrg if (!scalar_possible(instr)) 69b8e80941Smrg return 1; 70b8e80941Smrg 71b8e80941Smrg return instr->src_count == 2 ? 2 : 3; 72b8e80941Smrg} 73b8e80941Smrg 74b8e80941Smrg/* priority of scalar instruction for scheduling (lower=higher prio) */ 75b8e80941Smrgstatic unsigned alu_scalar_prio(struct ir2_instr *instr) 76b8e80941Smrg{ 77b8e80941Smrg if (!scalar_possible(instr)) 78b8e80941Smrg return ~0u; 79b8e80941Smrg 80b8e80941Smrg /* this case is dealt with later */ 81b8e80941Smrg if (instr->src_count > 1) 82b8e80941Smrg return ~0u; 83b8e80941Smrg 84b8e80941Smrg if (is_export(instr)) 85b8e80941Smrg return 4; 86b8e80941Smrg 87b8e80941Smrg /* PRED to end of block */ 88b8e80941Smrg if (instr->alu.scalar_opc >= PRED_SETEs && 89b8e80941Smrg instr->alu.scalar_opc <= PRED_SET_RESTOREs) 90b8e80941Smrg return 5; 91b8e80941Smrg 92b8e80941Smrg /* scalar only have highest priority */ 93b8e80941Smrg return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3; 94b8e80941Smrg} 95b8e80941Smrg 96b8e80941Smrg/* this is a bit messy: 97b8e80941Smrg * we want to find a slot where we can insert a scalar MOV with 98b8e80941Smrg * a vector instruction that was already scheduled 99b8e80941Smrg */ 100b8e80941Smrgstatic struct ir2_sched_instr* 101b8e80941Smrginsert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx, 102b8e80941Smrg struct ir2_src src1, unsigned *comp) 103b8e80941Smrg{ 104b8e80941Smrg struct ir2_sched_instr *sched = NULL, *s; 105b8e80941Smrg unsigned i, mask = 0xf; 106b8e80941Smrg 107b8e80941Smrg /* go first earliest point where the mov can be inserted */ 108b8e80941Smrg for (i = ctx->instr_sched_count-1; i > 0; i--) { 109b8e80941Smrg s = &ctx->instr_sched[i - 1]; 110b8e80941Smrg 111b8e80941Smrg if (s->instr && s->instr->block_idx != block_idx) 112b8e80941Smrg break; 113b8e80941Smrg if (s->instr_s && s->instr_s->block_idx != block_idx) 114b8e80941Smrg break; 115b8e80941Smrg 116b8e80941Smrg if (src1.type == IR2_SRC_SSA) { 117b8e80941Smrg if ((s->instr && s->instr->idx == src1.num) || 118b8e80941Smrg (s->instr_s && s->instr_s->idx == src1.num)) 119b8e80941Smrg break; 120b8e80941Smrg } 121b8e80941Smrg 122b8e80941Smrg unsigned mr = ~(s->reg_state[reg_idx/8] >> reg_idx%8*4 & 0xf); 123b8e80941Smrg if ((mask & mr) == 0) 124b8e80941Smrg break; 125b8e80941Smrg 126b8e80941Smrg mask &= mr; 127b8e80941Smrg if (s->instr_s || s->instr->src_count == 3) 128b8e80941Smrg continue; 129b8e80941Smrg 130b8e80941Smrg if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0) 131b8e80941Smrg continue; 132b8e80941Smrg 133b8e80941Smrg sched = s; 134b8e80941Smrg } 135b8e80941Smrg *comp = ffs(mask) - 1; 136b8e80941Smrg return sched; 137b8e80941Smrg} 138b8e80941Smrg 139b8e80941Smrg/* case1: 140b8e80941Smrg * in this case, insert a mov to place the 2nd src into to same reg 141b8e80941Smrg * (scalar sources come from the same register) 142b8e80941Smrg * 143b8e80941Smrg * this is a common case which works when one of the srcs is input/const 144b8e80941Smrg * but for instrs which have 2 ssa/reg srcs, then its not ideal 145b8e80941Smrg */ 146b8e80941Smrgstatic bool 147b8e80941Smrgscalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order) 148b8e80941Smrg{ 149b8e80941Smrg struct ir2_src src0 = instr->src[ order]; 150b8e80941Smrg struct ir2_src src1 = instr->src[!order]; 151b8e80941Smrg struct ir2_sched_instr *sched; 152b8e80941Smrg struct ir2_instr *ins; 153b8e80941Smrg struct ir2_reg *reg; 154b8e80941Smrg unsigned idx, comp; 155b8e80941Smrg 156b8e80941Smrg switch (src0.type) { 157b8e80941Smrg case IR2_SRC_CONST: 158b8e80941Smrg case IR2_SRC_INPUT: 159b8e80941Smrg return false; 160b8e80941Smrg default: 161b8e80941Smrg break; 162b8e80941Smrg } 163b8e80941Smrg 164b8e80941Smrg /* TODO, insert needs logic for this */ 165b8e80941Smrg if (src1.type == IR2_SRC_REG) 166b8e80941Smrg return false; 167b8e80941Smrg 168b8e80941Smrg /* we could do something if they match src1.. */ 169b8e80941Smrg if (src0.negate || src0.abs) 170b8e80941Smrg return false; 171b8e80941Smrg 172b8e80941Smrg reg = get_reg_src(ctx, &src0); 173b8e80941Smrg 174b8e80941Smrg /* result not used more since we will overwrite */ 175b8e80941Smrg for (int i = 0; i < 4; i++) 176b8e80941Smrg if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i)) 177b8e80941Smrg return false; 178b8e80941Smrg 179b8e80941Smrg /* find a place to insert the mov */ 180b8e80941Smrg sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp); 181b8e80941Smrg if (!sched) 182b8e80941Smrg return false; 183b8e80941Smrg 184b8e80941Smrg ins = &ctx->instr[idx = ctx->instr_count++]; 185b8e80941Smrg ins->idx = idx; 186b8e80941Smrg ins->type = IR2_ALU; 187b8e80941Smrg ins->src[0] = src1; 188b8e80941Smrg ins->src_count = 1; 189b8e80941Smrg ins->is_ssa = true; 190b8e80941Smrg ins->ssa.idx = reg->idx; 191b8e80941Smrg ins->ssa.ncomp = 1; 192b8e80941Smrg ins->ssa.comp[0].c = comp; 193b8e80941Smrg ins->alu.scalar_opc = MAXs; 194b8e80941Smrg ins->alu.export = -1; 195b8e80941Smrg ins->alu.write_mask = 1; 196b8e80941Smrg ins->pred = instr->pred; 197b8e80941Smrg ins->block_idx = instr->block_idx; 198b8e80941Smrg 199b8e80941Smrg instr->src[0] = src0; 200b8e80941Smrg instr->alu.src1_swizzle = comp; 201b8e80941Smrg 202b8e80941Smrg sched->instr_s = ins; 203b8e80941Smrg return true; 204b8e80941Smrg} 205b8e80941Smrg 206b8e80941Smrg/* fill sched with next fetch or (vector and/or scalar) alu instruction */ 207b8e80941Smrgstatic int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched) 208b8e80941Smrg{ 209b8e80941Smrg struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL; 210b8e80941Smrg unsigned avail_count = 0; 211b8e80941Smrg 212b8e80941Smrg instr_alloc_type_t export = ~0u; 213b8e80941Smrg int block_idx = -1; 214b8e80941Smrg 215b8e80941Smrg /* XXX merge this loop with the other one somehow? */ 216b8e80941Smrg ir2_foreach_instr(instr, ctx) { 217b8e80941Smrg if (!instr->need_emit) 218b8e80941Smrg continue; 219b8e80941Smrg if (is_export(instr)) 220b8e80941Smrg export = MIN2(export, export_buf(instr->alu.export)); 221b8e80941Smrg } 222b8e80941Smrg 223b8e80941Smrg ir2_foreach_instr(instr, ctx) { 224b8e80941Smrg if (!instr->need_emit) 225b8e80941Smrg continue; 226b8e80941Smrg 227b8e80941Smrg /* dont mix exports */ 228b8e80941Smrg if (is_export(instr) && export_buf(instr->alu.export) != export) 229b8e80941Smrg continue; 230b8e80941Smrg 231b8e80941Smrg if (block_idx < 0) 232b8e80941Smrg block_idx = instr->block_idx; 233b8e80941Smrg else if (block_idx != instr->block_idx || /* must be same block */ 234b8e80941Smrg instr->type == IR2_CF || /* CF/MEM must be alone */ 235b8e80941Smrg (is_export(instr) && export == SQ_MEMORY)) 236b8e80941Smrg break; 237b8e80941Smrg /* it works because IR2_CF is always at end of block 238b8e80941Smrg * and somewhat same idea with MEM exports, which might not be alone 239b8e80941Smrg * but will end up in-order at least 240b8e80941Smrg */ 241b8e80941Smrg 242b8e80941Smrg /* check if dependencies are satisfied */ 243b8e80941Smrg bool is_ok = true; 244b8e80941Smrg ir2_foreach_src(src, instr) { 245b8e80941Smrg if (src->type == IR2_SRC_REG) { 246b8e80941Smrg /* need to check if all previous instructions in the block 247b8e80941Smrg * which write the reg have been emitted 248b8e80941Smrg * slow.. 249b8e80941Smrg * XXX: check components instead of whole register 250b8e80941Smrg */ 251b8e80941Smrg struct ir2_reg *reg = get_reg_src(ctx, src); 252b8e80941Smrg ir2_foreach_instr(p, ctx) { 253b8e80941Smrg if (!p->is_ssa && p->reg == reg && p->idx < instr->idx) 254b8e80941Smrg is_ok &= !p->need_emit; 255b8e80941Smrg } 256b8e80941Smrg } else if (src->type == IR2_SRC_SSA) { 257b8e80941Smrg /* in this case its easy, just check need_emit */ 258b8e80941Smrg is_ok &= !ctx->instr[src->num].need_emit; 259b8e80941Smrg } 260b8e80941Smrg } 261b8e80941Smrg if (!is_ok) 262b8e80941Smrg continue; 263b8e80941Smrg 264b8e80941Smrg avail[avail_count++] = instr; 265b8e80941Smrg } 266b8e80941Smrg 267b8e80941Smrg if (!avail_count) { 268b8e80941Smrg assert(block_idx == -1); 269b8e80941Smrg return -1; 270b8e80941Smrg } 271b8e80941Smrg 272b8e80941Smrg /* priority to FETCH instructions */ 273b8e80941Smrg ir2_foreach_avail(instr) { 274b8e80941Smrg if (instr->type == IR2_ALU) 275b8e80941Smrg continue; 276b8e80941Smrg 277b8e80941Smrg ra_src_free(ctx, instr); 278b8e80941Smrg ra_reg(ctx, get_reg(instr), -1, false, 0); 279b8e80941Smrg 280b8e80941Smrg instr->need_emit = false; 281b8e80941Smrg sched->instr = instr; 282b8e80941Smrg sched->instr_s = NULL; 283b8e80941Smrg return block_idx; 284b8e80941Smrg } 285b8e80941Smrg 286b8e80941Smrg /* TODO precompute priorities */ 287b8e80941Smrg 288b8e80941Smrg unsigned prio_v = ~0u, prio_s = ~0u, prio; 289b8e80941Smrg ir2_foreach_avail(instr) { 290b8e80941Smrg prio = alu_vector_prio(instr); 291b8e80941Smrg if (prio < prio_v) { 292b8e80941Smrg instr_v = instr; 293b8e80941Smrg prio_v = prio; 294b8e80941Smrg } 295b8e80941Smrg } 296b8e80941Smrg 297b8e80941Smrg /* TODO can still insert scalar if src_count=3, if smart about it */ 298b8e80941Smrg if (!instr_v || instr_v->src_count < 3) { 299b8e80941Smrg ir2_foreach_avail(instr) { 300b8e80941Smrg bool compat = is_alu_compatible(instr_v, instr); 301b8e80941Smrg 302b8e80941Smrg prio = alu_scalar_prio(instr); 303b8e80941Smrg if (prio >= prio_v && !compat) 304b8e80941Smrg continue; 305b8e80941Smrg 306b8e80941Smrg if (prio < prio_s) { 307b8e80941Smrg instr_s = instr; 308b8e80941Smrg prio_s = prio; 309b8e80941Smrg if (!compat) 310b8e80941Smrg instr_v = NULL; 311b8e80941Smrg } 312b8e80941Smrg } 313b8e80941Smrg } 314b8e80941Smrg 315b8e80941Smrg assert(instr_v || instr_s); 316b8e80941Smrg 317b8e80941Smrg /* now, we try more complex insertion of vector instruction as scalar 318b8e80941Smrg * TODO: if we are smart we can still insert if instr_v->src_count==3 319b8e80941Smrg */ 320b8e80941Smrg if (!instr_s && instr_v->src_count < 3) { 321b8e80941Smrg ir2_foreach_avail(instr) { 322b8e80941Smrg if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr)) 323b8e80941Smrg continue; 324b8e80941Smrg 325b8e80941Smrg /* at this point, src_count should always be 2 */ 326b8e80941Smrg assert(instr->src_count == 2); 327b8e80941Smrg 328b8e80941Smrg if (scalarize_case1(ctx, instr, 0)) { 329b8e80941Smrg instr_s = instr; 330b8e80941Smrg break; 331b8e80941Smrg } 332b8e80941Smrg if (scalarize_case1(ctx, instr, 1)) { 333b8e80941Smrg instr_s = instr; 334b8e80941Smrg break; 335b8e80941Smrg } 336b8e80941Smrg } 337b8e80941Smrg } 338b8e80941Smrg 339b8e80941Smrg /* free src registers */ 340b8e80941Smrg if (instr_v) { 341b8e80941Smrg instr_v->need_emit = false; 342b8e80941Smrg ra_src_free(ctx, instr_v); 343b8e80941Smrg } 344b8e80941Smrg 345b8e80941Smrg if (instr_s) { 346b8e80941Smrg instr_s->need_emit = false; 347b8e80941Smrg ra_src_free(ctx, instr_s); 348b8e80941Smrg } 349b8e80941Smrg 350b8e80941Smrg /* allocate dst registers */ 351b8e80941Smrg if (instr_v) 352b8e80941Smrg ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask); 353b8e80941Smrg 354b8e80941Smrg if (instr_s) 355b8e80941Smrg ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask); 356b8e80941Smrg 357b8e80941Smrg sched->instr = instr_v; 358b8e80941Smrg sched->instr_s = instr_s; 359b8e80941Smrg return block_idx; 360b8e80941Smrg} 361b8e80941Smrg 362b8e80941Smrg/* scheduling: determine order of instructions */ 363b8e80941Smrgstatic void schedule_instrs(struct ir2_context *ctx) 364b8e80941Smrg{ 365b8e80941Smrg struct ir2_sched_instr *sched; 366b8e80941Smrg int block_idx; 367b8e80941Smrg 368b8e80941Smrg /* allocate input registers */ 369b8e80941Smrg for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++) 370b8e80941Smrg if (ctx->input[idx].initialized) 371b8e80941Smrg ra_reg(ctx, &ctx->input[idx], idx, false, 0); 372b8e80941Smrg 373b8e80941Smrg for (;;) { 374b8e80941Smrg sched = &ctx->instr_sched[ctx->instr_sched_count++]; 375b8e80941Smrg block_idx = sched_next(ctx, sched); 376b8e80941Smrg if (block_idx < 0) 377b8e80941Smrg break; 378b8e80941Smrg memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state)); 379b8e80941Smrg 380b8e80941Smrg /* catch texture fetch after scheduling and insert the 381b8e80941Smrg * SET_TEX_LOD right before it if necessary 382b8e80941Smrg * TODO clean this up 383b8e80941Smrg */ 384b8e80941Smrg struct ir2_instr *instr = sched->instr, *tex_lod; 385b8e80941Smrg if (instr && instr->type == IR2_FETCH && 386b8e80941Smrg instr->fetch.opc == TEX_FETCH && instr->src_count == 2) { 387b8e80941Smrg /* generate the SET_LOD instruction */ 388b8e80941Smrg tex_lod = &ctx->instr[ctx->instr_count++]; 389b8e80941Smrg tex_lod->type = IR2_FETCH; 390b8e80941Smrg tex_lod->block_idx = instr->block_idx; 391b8e80941Smrg tex_lod->pred = instr->pred; 392b8e80941Smrg tex_lod->fetch.opc = TEX_SET_TEX_LOD; 393b8e80941Smrg tex_lod->src[0] = instr->src[1]; 394b8e80941Smrg tex_lod->src_count = 1; 395b8e80941Smrg 396b8e80941Smrg sched[1] = sched[0]; 397b8e80941Smrg sched->instr = tex_lod; 398b8e80941Smrg ctx->instr_sched_count++; 399b8e80941Smrg } 400b8e80941Smrg 401b8e80941Smrg bool free_block = true; 402b8e80941Smrg ir2_foreach_instr(instr, ctx) 403b8e80941Smrg free_block &= instr->block_idx != block_idx; 404b8e80941Smrg if (free_block) 405b8e80941Smrg ra_block_free(ctx, block_idx); 406b8e80941Smrg }; 407b8e80941Smrg ctx->instr_sched_count--; 408b8e80941Smrg} 409b8e80941Smrg 410b8e80941Smrgvoid 411b8e80941Smrgir2_compile(struct fd2_shader_stateobj *so, unsigned variant, 412b8e80941Smrg struct fd2_shader_stateobj *fp) 413b8e80941Smrg{ 414b8e80941Smrg struct ir2_context ctx = { }; 415b8e80941Smrg bool binning = !fp && so->type == MESA_SHADER_VERTEX; 416b8e80941Smrg 417b8e80941Smrg if (fp) 418b8e80941Smrg so->variant[variant].f = fp->variant[0].f; 419b8e80941Smrg 420b8e80941Smrg ctx.so = so; 421b8e80941Smrg ctx.info = &so->variant[variant].info; 422b8e80941Smrg ctx.f = &so->variant[variant].f; 423b8e80941Smrg ctx.info->max_reg = -1; 424b8e80941Smrg 425b8e80941Smrg /* convert nir to internal representation */ 426b8e80941Smrg ir2_nir_compile(&ctx, binning); 427b8e80941Smrg 428b8e80941Smrg /* copy propagate srcs */ 429b8e80941Smrg cp_src(&ctx); 430b8e80941Smrg 431b8e80941Smrg /* get ref_counts and kill non-needed instructions */ 432b8e80941Smrg ra_count_refs(&ctx); 433b8e80941Smrg 434b8e80941Smrg /* remove movs used to write outputs */ 435b8e80941Smrg cp_export(&ctx); 436b8e80941Smrg 437b8e80941Smrg /* instruction order.. and vector->scalar conversions */ 438b8e80941Smrg schedule_instrs(&ctx); 439b8e80941Smrg 440b8e80941Smrg /* finally, assemble to bitcode */ 441b8e80941Smrg assemble(&ctx, binning); 442b8e80941Smrg} 443