/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Depth:
 *
 * Calculates weighted instruction depth, ie. the sum of # of needed
 * instructions plus delay slots back to original input (ie INPUT or
 * CONST).  That is to say, an instruction's depth is:
 *
 *   depth(instr) {
 *     d = 0;
 *     // for each src register:
 *     foreach (src in instr->regs[1..n])
 *       d = max(d, delayslots(src->instr, n) + depth(src->instr));
 *     return d + 1;
 *   }
 *
 * After an instruction's depth is calculated, it is inserted into the
 * block's depth-sorted list, which is used by the scheduling pass.
 */

/* generally don't count false dependencies, since this can just be
 * something like a barrier, or SSBO store.  The exception is array
 * dependencies if the assigner is an array write and the consumer
 * reads the same array.
 *
 * Returns true if the (assigner -> consumer) dependency at src slot @n
 * should contribute no delay/depth.
 */
static bool
ignore_dep(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n)
{
	/* real (SSA) dependencies are never ignored: */
	if (!__is_false_dep(consumer, n))
		return false;

	if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
		struct ir3_register *dst = assigner->regs[0];
		struct ir3_register *src;

		debug_assert(dst->flags & IR3_REG_ARRAY);

		/* the false dep only matters if the consumer actually reads
		 * the same array the assigner writes:
		 */
		foreach_src(src, consumer) {
			if ((src->flags & IR3_REG_ARRAY) &&
					(dst->array.id == src->array.id)) {
				return false;
			}
		}
	}

	return true;
}

/* calculate required # of delay slots between the instruction that
 * assigns a value and the one that consumes
 *
 * @assigner: instruction producing the value
 * @consumer: instruction consuming the value
 * @n:        src slot of @consumer that reads the value (ie. regs[n]);
 *            used for the cat3 3rd-src special case below
 *
 * Returns 0 when no delay slots are needed (meta instrs, ignored false
 * deps, or producers whose latency is covered by hw sync flags).
 */
int ir3_delayslots(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n)
{
	if (ignore_dep(assigner, consumer, n))
		return 0;

	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
	 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
	 * handled with sync bits
	 */

	/* meta instructions don't map to real hw instructions: */
	if (is_meta(assigner) || is_meta(consumer))
		return 0;

	/* address-register writes need the full worst-case delay: */
	if (writes_addr(assigner))
		return 6;

	/* handled via sync flags: */
	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
		return 0;

	/* assigner must be alu: */
	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
			is_mem(consumer)) {
		return 6;
	} else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
			(n == 3)) {
		/* special case, 3rd src to cat3 not required on first cycle */
		return 1;
	} else {
		return 3;
	}
}

/* (Re)insert @instr into the depth-sorted instruction list used by the
 * scheduling pass.  The instruction is first unlinked from wherever it
 * currently is, then placed relative to the first entry with greater
 * depth (or at the tail if no deeper entry exists).
 */
void
ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
{
	/* remove from existing spot in list: */
	list_delinit(&instr->node);

	/* find where to re-insert instruction: */
	list_for_each_entry (struct ir3_instruction, pos, list, node) {
		if (pos->depth > instr->depth) {
			/* NOTE(review): list_add() places instr adjacent to pos;
			 * whether that is before or after depends on util/list.h
			 * semantics -- confirm against the scheduler's expected
			 * ordering.
			 */
			list_add(&instr->node, &pos->node);
			return;
		}
	}
	/* if we get here, we didn't find an insertion spot: */
	list_addtail(&instr->node, list);
}

/* Recursively compute the depth of @instr and all of its SSA sources,
 * inserting each visited instruction into its block's depth-sorted
 * list.  Also clears IR3_INSTR_UNUSED on reachable instructions, which
 * is how the caller's mark-and-sweep DCE works.
 *
 * @boost:    extra depth added per src dependency -- used by the caller
 *            to prioritize if-condition instructions
 * @falsedep: true when @instr is only reached through a false
 *            dependency; it is then processed (depth computed) but not
 *            marked as used
 */
static void
ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
{
	struct ir3_instruction *src;

	/* don't mark falsedep's as used, but otherwise process them normally: */
	if (!falsedep)
		instr->flags &= ~IR3_INSTR_UNUSED;

	/* already visited this pass (mark set by ir3_clear_mark() caller): */
	if (ir3_instr_check_mark(instr))
		return;

	instr->depth = 0;

	foreach_ssa_src_n(src, i, instr) {
		unsigned sd;

		/* visit child to compute its depth: */
		ir3_instr_depth(src, boost, __is_false_dep(instr, i));

		/* for array writes, no need to delay on previous write: */
		/* NOTE(review): this skips the delay contribution of src
		 * slot 0 unconditionally, not just for array writes --
		 * presumably slot 0 only carries the previous-write dep in
		 * that case; confirm against foreach_ssa_src_n semantics.
		 */
		if (i == 0)
			continue;

		sd = ir3_delayslots(src, instr, i) + src->depth;
		sd += boost;

		instr->depth = MAX2(instr->depth, sd);
	}

	/* meta instructions are free -- they don't add to the depth: */
	if (!is_meta(instr))
		instr->depth++;

	ir3_insert_by_depth(instr, &instr->block->instr_list);
}

/* Remove instructions still flagged IR3_INSTR_UNUSED from @block
 * (OPC_END is always kept).  For a dead OPC_META_FO, the component it
 * selected is pruned from the source's wrmask and stale right-neighbor
 * links are cleared.  Returns true if anything was removed.
 */
static bool
remove_unused_by_block(struct ir3_block *block)
{
	bool progress = false;
	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->opc == OPC_END)
			continue;
		if (instr->flags & IR3_INSTR_UNUSED) {
			if (instr->opc == OPC_META_FO) {
				struct ir3_instruction *src = ssa(instr->regs[1]);
				/* leave inputs alone.. we can't optimize out components of
				 * an input, since the hw is still going to be writing all
				 * of the components, and we could end up in a situation
				 * where multiple inputs overlap.
				 */
				if ((src->opc != OPC_META_INPUT) &&
						(src->regs[0]->wrmask > 1)) {
					src->regs[0]->wrmask &= ~(1 << instr->fo.off);

					/* prune no-longer needed right-neighbors.  We could
					 * probably do the same for left-neighbors (ie. tex
					 * fetch that only need .yw components), but that
					 * makes RA a bit more confusing than it already is
					 */
					struct ir3_instruction *n = instr;
					/* walk to the rightmost neighbor, then trim dead
					 * entries off the right end of the chain:
					 */
					while (n && n->cp.right)
						n = n->cp.right;
					while (n->flags & IR3_INSTR_UNUSED) {
						n = n->cp.left;
						if (!n)
							break;
						n->cp.right = NULL;
					}
				}
			}
			list_delinit(&instr->node);
			progress = true;
		}
	}
	return progress;
}

/* One mark-and-sweep iteration: flag every instruction as unused, walk
 * the depth computation from the shader outputs, per-block "keeps" and
 * if-conditions (clearing the flag on everything reachable), then sweep
 * whatever is still flagged.  Returns true if any instruction was
 * removed, so the caller can iterate to a fixed point.
 */
static bool
compute_depth_and_remove_unused(struct ir3 *ir)
{
	unsigned i;
	bool progress = false;

	ir3_clear_mark(ir);

	/* initially mark everything as unused, we'll clear the flag as we
	 * visit the instructions:
	 */
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			instr->flags |= IR3_INSTR_UNUSED;
		}
	}

	/* depth roots: the shader outputs... */
	for (i = 0; i < ir->noutputs; i++)
		if (ir->outputs[i])
			ir3_instr_depth(ir->outputs[i], 0, false);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		/* ...instructions with side effects that must be kept... */
		for (i = 0; i < block->keeps_count; i++)
			ir3_instr_depth(block->keeps[i], 0, false);

		/* We also need to account for if-condition: */
		if (block->condition)
			ir3_instr_depth(block->condition, 6, false);
	}

	/* mark un-used instructions: */
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		progress |= remove_unused_by_block(block);
	}

	/* note that we can end up with unused indirects, but we should
	 * not end up with unused predicates.
	 */
	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *instr = ir->indirects[i];
		if (instr && (instr->flags & IR3_INSTR_UNUSED))
			ir->indirects[i] = NULL;
	}

	/* cleanup unused inputs: */
	for (i = 0; i < ir->ninputs; i++) {
		struct ir3_instruction *in = ir->inputs[i];
		if (in && (in->flags & IR3_INSTR_UNUSED))
			ir->inputs[i] = NULL;
	}

	return progress;
}

/* Entry point: compute instruction depths and dead-code-eliminate,
 * iterating until no more instructions can be removed (removing an
 * instruction can render its sources unused on the next pass).
 */
void
ir3_depth(struct ir3 *ir)
{
	bool progress;
	do {
		progress = compute_depth_and_remove_unused(ir);
	} while (progress);
}