1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2012 Intel Corporation 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 22b8e80941Smrg */ 23b8e80941Smrg 24b8e80941Smrg#include "brw_fs.h" 25b8e80941Smrg#include "brw_cfg.h" 26b8e80941Smrg 27b8e80941Smrg/** @file brw_fs_cse.cpp 28b8e80941Smrg * 29b8e80941Smrg * Support for local common subexpression elimination. 30b8e80941Smrg * 31b8e80941Smrg * See Muchnick's Advanced Compiler Design and Implementation, section 32b8e80941Smrg * 13.1 (p378). 33b8e80941Smrg */ 34b8e80941Smrg 35b8e80941Smrgusing namespace brw; 36b8e80941Smrg 37b8e80941Smrgnamespace { 38b8e80941Smrgstruct aeb_entry : public exec_node { 39b8e80941Smrg /** The instruction that generates the expression value. */ 40b8e80941Smrg fs_inst *generator; 41b8e80941Smrg 42b8e80941Smrg /** The temporary where the value is stored. */ 43b8e80941Smrg fs_reg tmp; 44b8e80941Smrg}; 45b8e80941Smrg} 46b8e80941Smrg 47b8e80941Smrgstatic bool 48b8e80941Smrgis_expression(const fs_visitor *v, const fs_inst *const inst) 49b8e80941Smrg{ 50b8e80941Smrg switch (inst->opcode) { 51b8e80941Smrg case BRW_OPCODE_MOV: 52b8e80941Smrg case BRW_OPCODE_SEL: 53b8e80941Smrg case BRW_OPCODE_NOT: 54b8e80941Smrg case BRW_OPCODE_AND: 55b8e80941Smrg case BRW_OPCODE_OR: 56b8e80941Smrg case BRW_OPCODE_XOR: 57b8e80941Smrg case BRW_OPCODE_SHR: 58b8e80941Smrg case BRW_OPCODE_SHL: 59b8e80941Smrg case BRW_OPCODE_ASR: 60b8e80941Smrg case BRW_OPCODE_CMP: 61b8e80941Smrg case BRW_OPCODE_CMPN: 62b8e80941Smrg case BRW_OPCODE_ADD: 63b8e80941Smrg case BRW_OPCODE_MUL: 64b8e80941Smrg case SHADER_OPCODE_MULH: 65b8e80941Smrg case BRW_OPCODE_FRC: 66b8e80941Smrg case BRW_OPCODE_RNDU: 67b8e80941Smrg case BRW_OPCODE_RNDD: 68b8e80941Smrg case BRW_OPCODE_RNDE: 69b8e80941Smrg case BRW_OPCODE_RNDZ: 70b8e80941Smrg case BRW_OPCODE_LINE: 71b8e80941Smrg case BRW_OPCODE_PLN: 72b8e80941Smrg case BRW_OPCODE_MAD: 73b8e80941Smrg case BRW_OPCODE_LRP: 74b8e80941Smrg case FS_OPCODE_FB_READ_LOGICAL: 75b8e80941Smrg case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 76b8e80941Smrg case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: 77b8e80941Smrg case FS_OPCODE_LINTERP: 78b8e80941Smrg case SHADER_OPCODE_FIND_LIVE_CHANNEL: 79b8e80941Smrg case SHADER_OPCODE_BROADCAST: 80b8e80941Smrg case SHADER_OPCODE_MOV_INDIRECT: 81b8e80941Smrg case SHADER_OPCODE_TEX_LOGICAL: 82b8e80941Smrg case SHADER_OPCODE_TXD_LOGICAL: 83b8e80941Smrg case SHADER_OPCODE_TXF_LOGICAL: 84b8e80941Smrg case SHADER_OPCODE_TXL_LOGICAL: 85b8e80941Smrg case SHADER_OPCODE_TXS_LOGICAL: 86b8e80941Smrg case FS_OPCODE_TXB_LOGICAL: 87b8e80941Smrg case SHADER_OPCODE_TXF_CMS_LOGICAL: 88b8e80941Smrg case SHADER_OPCODE_TXF_CMS_W_LOGICAL: 89b8e80941Smrg case SHADER_OPCODE_TXF_UMS_LOGICAL: 90b8e80941Smrg case SHADER_OPCODE_TXF_MCS_LOGICAL: 91b8e80941Smrg case SHADER_OPCODE_LOD_LOGICAL: 92b8e80941Smrg case SHADER_OPCODE_TG4_LOGICAL: 93b8e80941Smrg case SHADER_OPCODE_TG4_OFFSET_LOGICAL: 94b8e80941Smrg case FS_OPCODE_PACK: 95b8e80941Smrg return true; 96b8e80941Smrg case SHADER_OPCODE_RCP: 97b8e80941Smrg case SHADER_OPCODE_RSQ: 98b8e80941Smrg case SHADER_OPCODE_SQRT: 99b8e80941Smrg case SHADER_OPCODE_EXP2: 100b8e80941Smrg case SHADER_OPCODE_LOG2: 101b8e80941Smrg case SHADER_OPCODE_POW: 102b8e80941Smrg case SHADER_OPCODE_INT_QUOTIENT: 103b8e80941Smrg case SHADER_OPCODE_INT_REMAINDER: 104b8e80941Smrg case SHADER_OPCODE_SIN: 105b8e80941Smrg case SHADER_OPCODE_COS: 106b8e80941Smrg return inst->mlen < 2; 107b8e80941Smrg case SHADER_OPCODE_LOAD_PAYLOAD: 108b8e80941Smrg return !inst->is_copy_payload(v->alloc); 109b8e80941Smrg default: 110b8e80941Smrg return inst->is_send_from_grf() && !inst->has_side_effects() && 111b8e80941Smrg !inst->is_volatile(); 112b8e80941Smrg } 113b8e80941Smrg} 114b8e80941Smrg 115b8e80941Smrgstatic bool 116b8e80941Smrgoperands_match(const fs_inst *a, const fs_inst *b, bool *negate) 117b8e80941Smrg{ 118b8e80941Smrg fs_reg *xs = a->src; 119b8e80941Smrg fs_reg *ys = b->src; 120b8e80941Smrg 121b8e80941Smrg if (a->opcode == BRW_OPCODE_MAD) { 122b8e80941Smrg return xs[0].equals(ys[0]) && 123b8e80941Smrg ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) || 124b8e80941Smrg (xs[2].equals(ys[1]) && xs[1].equals(ys[2]))); 125b8e80941Smrg } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) { 126b8e80941Smrg bool xs0_negate = xs[0].negate; 127b8e80941Smrg bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f 128b8e80941Smrg : xs[1].negate; 129b8e80941Smrg bool ys0_negate = ys[0].negate; 130b8e80941Smrg bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f 131b8e80941Smrg : ys[1].negate; 132b8e80941Smrg float xs1_imm = xs[1].f; 133b8e80941Smrg float ys1_imm = ys[1].f; 134b8e80941Smrg 135b8e80941Smrg xs[0].negate = false; 136b8e80941Smrg xs[1].negate = false; 137b8e80941Smrg ys[0].negate = false; 138b8e80941Smrg ys[1].negate = false; 139b8e80941Smrg xs[1].f = fabsf(xs[1].f); 140b8e80941Smrg ys[1].f = fabsf(ys[1].f); 141b8e80941Smrg 142b8e80941Smrg bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || 143b8e80941Smrg (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); 144b8e80941Smrg 145b8e80941Smrg xs[0].negate = xs0_negate; 146b8e80941Smrg xs[1].negate = xs[1].file == IMM ? false : xs1_negate; 147b8e80941Smrg ys[0].negate = ys0_negate; 148b8e80941Smrg ys[1].negate = ys[1].file == IMM ? false : ys1_negate; 149b8e80941Smrg xs[1].f = xs1_imm; 150b8e80941Smrg ys[1].f = ys1_imm; 151b8e80941Smrg 152b8e80941Smrg *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate); 153b8e80941Smrg if (*negate && (a->saturate || b->saturate)) 154b8e80941Smrg return false; 155b8e80941Smrg return ret; 156b8e80941Smrg } else if (!a->is_commutative()) { 157b8e80941Smrg bool match = true; 158b8e80941Smrg for (int i = 0; i < a->sources; i++) { 159b8e80941Smrg if (!xs[i].equals(ys[i])) { 160b8e80941Smrg match = false; 161b8e80941Smrg break; 162b8e80941Smrg } 163b8e80941Smrg } 164b8e80941Smrg return match; 165b8e80941Smrg } else { 166b8e80941Smrg return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || 167b8e80941Smrg (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); 168b8e80941Smrg } 169b8e80941Smrg} 170b8e80941Smrg 171b8e80941Smrgstatic bool 172b8e80941Smrginstructions_match(fs_inst *a, fs_inst *b, bool *negate) 173b8e80941Smrg{ 174b8e80941Smrg return a->opcode == b->opcode && 175b8e80941Smrg a->force_writemask_all == b->force_writemask_all && 176b8e80941Smrg a->exec_size == b->exec_size && 177b8e80941Smrg a->group == b->group && 178b8e80941Smrg a->saturate == b->saturate && 179b8e80941Smrg a->predicate == b->predicate && 180b8e80941Smrg a->predicate_inverse == b->predicate_inverse && 181b8e80941Smrg a->conditional_mod == b->conditional_mod && 182b8e80941Smrg a->flag_subreg == b->flag_subreg && 183b8e80941Smrg a->dst.type == b->dst.type && 184b8e80941Smrg a->offset == b->offset && 185b8e80941Smrg a->mlen == b->mlen && 186b8e80941Smrg a->ex_mlen == b->ex_mlen && 187b8e80941Smrg a->sfid == b->sfid && 188b8e80941Smrg a->desc == b->desc && 189b8e80941Smrg a->size_written == b->size_written && 190b8e80941Smrg a->base_mrf == b->base_mrf && 191b8e80941Smrg a->check_tdr == b->check_tdr && 192b8e80941Smrg a->send_has_side_effects == b->send_has_side_effects && 193b8e80941Smrg a->eot == b->eot && 194b8e80941Smrg a->header_size == b->header_size && 195b8e80941Smrg a->shadow_compare == b->shadow_compare && 196b8e80941Smrg a->pi_noperspective == b->pi_noperspective && 197b8e80941Smrg a->target == b->target && 198b8e80941Smrg a->sources == b->sources && 199b8e80941Smrg operands_match(a, b, negate); 200b8e80941Smrg} 201b8e80941Smrg 202b8e80941Smrgstatic void 203b8e80941Smrgcreate_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate) 204b8e80941Smrg{ 205b8e80941Smrg unsigned written = regs_written(inst); 206b8e80941Smrg unsigned dst_width = 207b8e80941Smrg DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE); 208b8e80941Smrg fs_inst *copy; 209b8e80941Smrg 210b8e80941Smrg if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { 211b8e80941Smrg assert(src.file == VGRF); 212b8e80941Smrg fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, 213b8e80941Smrg inst->sources); 214b8e80941Smrg for (int i = 0; i < inst->header_size; i++) { 215b8e80941Smrg payload[i] = src; 216b8e80941Smrg src.offset += REG_SIZE; 217b8e80941Smrg } 218b8e80941Smrg for (int i = inst->header_size; i < inst->sources; i++) { 219b8e80941Smrg src.type = inst->src[i].type; 220b8e80941Smrg payload[i] = src; 221b8e80941Smrg src = offset(src, bld, 1); 222b8e80941Smrg } 223b8e80941Smrg copy = bld.LOAD_PAYLOAD(inst->dst, payload, inst->sources, 224b8e80941Smrg inst->header_size); 225b8e80941Smrg } else if (written != dst_width) { 226b8e80941Smrg assert(src.file == VGRF); 227b8e80941Smrg assert(written % dst_width == 0); 228b8e80941Smrg const int sources = written / dst_width; 229b8e80941Smrg fs_reg *payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources); 230b8e80941Smrg for (int i = 0; i < sources; i++) { 231b8e80941Smrg payload[i] = src; 232b8e80941Smrg src = offset(src, bld, 1); 233b8e80941Smrg } 234b8e80941Smrg copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, 0); 235b8e80941Smrg } else { 236b8e80941Smrg copy = bld.MOV(inst->dst, src); 237b8e80941Smrg copy->group = inst->group; 238b8e80941Smrg copy->force_writemask_all = inst->force_writemask_all; 239b8e80941Smrg copy->src[0].negate = negate; 240b8e80941Smrg } 241b8e80941Smrg assert(regs_written(copy) == written); 242b8e80941Smrg} 243b8e80941Smrg 244b8e80941Smrgbool 245b8e80941Smrgfs_visitor::opt_cse_local(bblock_t *block) 246b8e80941Smrg{ 247b8e80941Smrg bool progress = false; 248b8e80941Smrg exec_list aeb; 249b8e80941Smrg 250b8e80941Smrg void *cse_ctx = ralloc_context(NULL); 251b8e80941Smrg 252b8e80941Smrg int ip = block->start_ip; 253b8e80941Smrg foreach_inst_in_block(fs_inst, inst, block) { 254b8e80941Smrg /* Skip some cases. */ 255b8e80941Smrg if (is_expression(this, inst) && !inst->is_partial_write() && 256b8e80941Smrg ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) || 257b8e80941Smrg inst->dst.is_null())) 258b8e80941Smrg { 259b8e80941Smrg bool found = false; 260b8e80941Smrg bool negate = false; 261b8e80941Smrg 262b8e80941Smrg foreach_in_list_use_after(aeb_entry, entry, &aeb) { 263b8e80941Smrg /* Match current instruction's expression against those in AEB. */ 264b8e80941Smrg if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) && 265b8e80941Smrg instructions_match(inst, entry->generator, &negate)) { 266b8e80941Smrg found = true; 267b8e80941Smrg progress = true; 268b8e80941Smrg break; 269b8e80941Smrg } 270b8e80941Smrg } 271b8e80941Smrg 272b8e80941Smrg if (!found) { 273b8e80941Smrg if (inst->opcode != BRW_OPCODE_MOV || 274b8e80941Smrg (inst->opcode == BRW_OPCODE_MOV && 275b8e80941Smrg inst->src[0].file == IMM && 276b8e80941Smrg inst->src[0].type == BRW_REGISTER_TYPE_VF)) { 277b8e80941Smrg /* Our first sighting of this expression. Create an entry. */ 278b8e80941Smrg aeb_entry *entry = ralloc(cse_ctx, aeb_entry); 279b8e80941Smrg entry->tmp = reg_undef; 280b8e80941Smrg entry->generator = inst; 281b8e80941Smrg aeb.push_tail(entry); 282b8e80941Smrg } 283b8e80941Smrg } else { 284b8e80941Smrg /* This is at least our second sighting of this expression. 285b8e80941Smrg * If we don't have a temporary already, make one. 286b8e80941Smrg */ 287b8e80941Smrg bool no_existing_temp = entry->tmp.file == BAD_FILE; 288b8e80941Smrg if (no_existing_temp && !entry->generator->dst.is_null()) { 289b8e80941Smrg const fs_builder ibld = fs_builder(this, block, entry->generator) 290b8e80941Smrg .at(block, entry->generator->next); 291b8e80941Smrg int written = regs_written(entry->generator); 292b8e80941Smrg 293b8e80941Smrg entry->tmp = fs_reg(VGRF, alloc.allocate(written), 294b8e80941Smrg entry->generator->dst.type); 295b8e80941Smrg 296b8e80941Smrg create_copy_instr(ibld, entry->generator, entry->tmp, false); 297b8e80941Smrg 298b8e80941Smrg entry->generator->dst = entry->tmp; 299b8e80941Smrg } 300b8e80941Smrg 301b8e80941Smrg /* dest <- temp */ 302b8e80941Smrg if (!inst->dst.is_null()) { 303b8e80941Smrg assert(inst->size_written == entry->generator->size_written); 304b8e80941Smrg assert(inst->dst.type == entry->tmp.type); 305b8e80941Smrg const fs_builder ibld(this, block, inst); 306b8e80941Smrg 307b8e80941Smrg create_copy_instr(ibld, inst, entry->tmp, negate); 308b8e80941Smrg } 309b8e80941Smrg 310b8e80941Smrg /* Set our iterator so that next time through the loop inst->next 311b8e80941Smrg * will get the instruction in the basic block after the one we've 312b8e80941Smrg * removed. 313b8e80941Smrg */ 314b8e80941Smrg fs_inst *prev = (fs_inst *)inst->prev; 315b8e80941Smrg 316b8e80941Smrg inst->remove(block); 317b8e80941Smrg inst = prev; 318b8e80941Smrg } 319b8e80941Smrg } 320b8e80941Smrg 321b8e80941Smrg foreach_in_list_safe(aeb_entry, entry, &aeb) { 322b8e80941Smrg /* Kill all AEB entries that write a different value to or read from 323b8e80941Smrg * the flag register if we just wrote it. 324b8e80941Smrg */ 325b8e80941Smrg if (inst->flags_written()) { 326b8e80941Smrg bool negate; /* dummy */ 327b8e80941Smrg if (entry->generator->flags_read(devinfo) || 328b8e80941Smrg (entry->generator->flags_written() && 329b8e80941Smrg !instructions_match(inst, entry->generator, &negate))) { 330b8e80941Smrg entry->remove(); 331b8e80941Smrg ralloc_free(entry); 332b8e80941Smrg continue; 333b8e80941Smrg } 334b8e80941Smrg } 335b8e80941Smrg 336b8e80941Smrg for (int i = 0; i < entry->generator->sources; i++) { 337b8e80941Smrg fs_reg *src_reg = &entry->generator->src[i]; 338b8e80941Smrg 339b8e80941Smrg /* Kill all AEB entries that use the destination we just 340b8e80941Smrg * overwrote. 341b8e80941Smrg */ 342b8e80941Smrg if (regions_overlap(inst->dst, inst->size_written, 343b8e80941Smrg entry->generator->src[i], 344b8e80941Smrg entry->generator->size_read(i))) { 345b8e80941Smrg entry->remove(); 346b8e80941Smrg ralloc_free(entry); 347b8e80941Smrg break; 348b8e80941Smrg } 349b8e80941Smrg 350b8e80941Smrg /* Kill any AEB entries using registers that don't get reused any 351b8e80941Smrg * more -- a sure sign they'll fail operands_match(). 352b8e80941Smrg */ 353b8e80941Smrg if (src_reg->file == VGRF && virtual_grf_end[src_reg->nr] < ip) { 354b8e80941Smrg entry->remove(); 355b8e80941Smrg ralloc_free(entry); 356b8e80941Smrg break; 357b8e80941Smrg } 358b8e80941Smrg } 359b8e80941Smrg } 360b8e80941Smrg 361b8e80941Smrg ip++; 362b8e80941Smrg } 363b8e80941Smrg 364b8e80941Smrg ralloc_free(cse_ctx); 365b8e80941Smrg 366b8e80941Smrg return progress; 367b8e80941Smrg} 368b8e80941Smrg 369b8e80941Smrgbool 370b8e80941Smrgfs_visitor::opt_cse() 371b8e80941Smrg{ 372b8e80941Smrg bool progress = false; 373b8e80941Smrg 374b8e80941Smrg calculate_live_intervals(); 375b8e80941Smrg 376b8e80941Smrg foreach_block (block, cfg) { 377b8e80941Smrg progress = opt_cse_local(block) || progress; 378b8e80941Smrg } 379b8e80941Smrg 380b8e80941Smrg if (progress) 381b8e80941Smrg invalidate_live_intervals(); 382b8e80941Smrg 383b8e80941Smrg return progress; 384b8e80941Smrg} 385