1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2012, 2013, 2014 Intel Corporation 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 22b8e80941Smrg */ 23b8e80941Smrg 24b8e80941Smrg#include "brw_vec4.h" 25b8e80941Smrg#include "brw_vec4_live_variables.h" 26b8e80941Smrg#include "brw_cfg.h" 27b8e80941Smrg 28b8e80941Smrgusing namespace brw; 29b8e80941Smrg 30b8e80941Smrg/** @file brw_vec4_cse.cpp 31b8e80941Smrg * 32b8e80941Smrg * Support for local common subexpression elimination. 33b8e80941Smrg * 34b8e80941Smrg * See Muchnick's Advanced Compiler Design and Implementation, section 35b8e80941Smrg * 13.1 (p378). 36b8e80941Smrg */ 37b8e80941Smrg 38b8e80941Smrgnamespace { 39b8e80941Smrgstruct aeb_entry : public exec_node { 40b8e80941Smrg /** The instruction that generates the expression value. */ 41b8e80941Smrg vec4_instruction *generator; 42b8e80941Smrg 43b8e80941Smrg /** The temporary where the value is stored. */ 44b8e80941Smrg src_reg tmp; 45b8e80941Smrg}; 46b8e80941Smrg} 47b8e80941Smrg 48b8e80941Smrgstatic bool 49b8e80941Smrgis_expression(const vec4_instruction *const inst) 50b8e80941Smrg{ 51b8e80941Smrg switch (inst->opcode) { 52b8e80941Smrg case BRW_OPCODE_MOV: 53b8e80941Smrg case BRW_OPCODE_SEL: 54b8e80941Smrg case BRW_OPCODE_NOT: 55b8e80941Smrg case BRW_OPCODE_AND: 56b8e80941Smrg case BRW_OPCODE_OR: 57b8e80941Smrg case BRW_OPCODE_XOR: 58b8e80941Smrg case BRW_OPCODE_SHR: 59b8e80941Smrg case BRW_OPCODE_SHL: 60b8e80941Smrg case BRW_OPCODE_ASR: 61b8e80941Smrg case BRW_OPCODE_CMP: 62b8e80941Smrg case BRW_OPCODE_CMPN: 63b8e80941Smrg case BRW_OPCODE_ADD: 64b8e80941Smrg case BRW_OPCODE_MUL: 65b8e80941Smrg case SHADER_OPCODE_MULH: 66b8e80941Smrg case BRW_OPCODE_FRC: 67b8e80941Smrg case BRW_OPCODE_RNDU: 68b8e80941Smrg case BRW_OPCODE_RNDD: 69b8e80941Smrg case BRW_OPCODE_RNDE: 70b8e80941Smrg case BRW_OPCODE_RNDZ: 71b8e80941Smrg case BRW_OPCODE_LINE: 72b8e80941Smrg case BRW_OPCODE_PLN: 73b8e80941Smrg case BRW_OPCODE_MAD: 74b8e80941Smrg case BRW_OPCODE_LRP: 75b8e80941Smrg case VEC4_OPCODE_UNPACK_UNIFORM: 76b8e80941Smrg case SHADER_OPCODE_FIND_LIVE_CHANNEL: 77b8e80941Smrg case SHADER_OPCODE_BROADCAST: 78b8e80941Smrg case TCS_OPCODE_SET_INPUT_URB_OFFSETS: 79b8e80941Smrg case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: 80b8e80941Smrg return true; 81b8e80941Smrg case SHADER_OPCODE_RCP: 82b8e80941Smrg case SHADER_OPCODE_RSQ: 83b8e80941Smrg case SHADER_OPCODE_SQRT: 84b8e80941Smrg case SHADER_OPCODE_EXP2: 85b8e80941Smrg case SHADER_OPCODE_LOG2: 86b8e80941Smrg case SHADER_OPCODE_POW: 87b8e80941Smrg case SHADER_OPCODE_INT_QUOTIENT: 88b8e80941Smrg case SHADER_OPCODE_INT_REMAINDER: 89b8e80941Smrg case SHADER_OPCODE_SIN: 90b8e80941Smrg case SHADER_OPCODE_COS: 91b8e80941Smrg return inst->mlen == 0; 92b8e80941Smrg default: 93b8e80941Smrg return false; 94b8e80941Smrg } 95b8e80941Smrg} 96b8e80941Smrg 97b8e80941Smrgstatic bool 98b8e80941Smrgoperands_match(const vec4_instruction *a, const vec4_instruction *b) 99b8e80941Smrg{ 100b8e80941Smrg const src_reg *xs = a->src; 101b8e80941Smrg const src_reg *ys = b->src; 102b8e80941Smrg 103b8e80941Smrg if (a->opcode == BRW_OPCODE_MAD) { 104b8e80941Smrg return xs[0].equals(ys[0]) && 105b8e80941Smrg ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) || 106b8e80941Smrg (xs[2].equals(ys[1]) && xs[1].equals(ys[2]))); 107b8e80941Smrg } else if (a->opcode == BRW_OPCODE_MOV && 108b8e80941Smrg xs[0].file == IMM && 109b8e80941Smrg xs[0].type == BRW_REGISTER_TYPE_VF) { 110b8e80941Smrg src_reg tmp_x = xs[0]; 111b8e80941Smrg src_reg tmp_y = ys[0]; 112b8e80941Smrg 113b8e80941Smrg /* Smash out the values that are not part of the writemask. Otherwise 114b8e80941Smrg * the equals operator will fail due to mismatches in unused components. 115b8e80941Smrg */ 116b8e80941Smrg const unsigned ab_writemask = a->dst.writemask & b->dst.writemask; 117b8e80941Smrg const uint32_t mask = ((ab_writemask & WRITEMASK_X) ? 0x000000ff : 0) | 118b8e80941Smrg ((ab_writemask & WRITEMASK_Y) ? 0x0000ff00 : 0) | 119b8e80941Smrg ((ab_writemask & WRITEMASK_Z) ? 0x00ff0000 : 0) | 120b8e80941Smrg ((ab_writemask & WRITEMASK_W) ? 0xff000000 : 0); 121b8e80941Smrg 122b8e80941Smrg tmp_x.ud &= mask; 123b8e80941Smrg tmp_y.ud &= mask; 124b8e80941Smrg 125b8e80941Smrg return tmp_x.equals(tmp_y); 126b8e80941Smrg } else if (!a->is_commutative()) { 127b8e80941Smrg return xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2]); 128b8e80941Smrg } else { 129b8e80941Smrg return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || 130b8e80941Smrg (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); 131b8e80941Smrg } 132b8e80941Smrg} 133b8e80941Smrg 134b8e80941Smrg/** 135b8e80941Smrg * Checks if instructions match, exactly for sources, but loosely for 136b8e80941Smrg * destination writemasks. 137b8e80941Smrg * 138b8e80941Smrg * \param 'a' is the generating expression from the AEB entry. 139b8e80941Smrg * \param 'b' is the second occurrence of the expression that we're 140b8e80941Smrg * considering eliminating. 141b8e80941Smrg */ 142b8e80941Smrgstatic bool 143b8e80941Smrginstructions_match(vec4_instruction *a, vec4_instruction *b) 144b8e80941Smrg{ 145b8e80941Smrg return a->opcode == b->opcode && 146b8e80941Smrg a->saturate == b->saturate && 147b8e80941Smrg a->predicate == b->predicate && 148b8e80941Smrg a->predicate_inverse == b->predicate_inverse && 149b8e80941Smrg a->conditional_mod == b->conditional_mod && 150b8e80941Smrg a->flag_subreg == b->flag_subreg && 151b8e80941Smrg a->dst.type == b->dst.type && 152b8e80941Smrg a->offset == b->offset && 153b8e80941Smrg a->mlen == b->mlen && 154b8e80941Smrg a->base_mrf == b->base_mrf && 155b8e80941Smrg a->header_size == b->header_size && 156b8e80941Smrg a->shadow_compare == b->shadow_compare && 157b8e80941Smrg ((a->dst.writemask & b->dst.writemask) == a->dst.writemask) && 158b8e80941Smrg a->force_writemask_all == b->force_writemask_all && 159b8e80941Smrg a->size_written == b->size_written && 160b8e80941Smrg a->exec_size == b->exec_size && 161b8e80941Smrg a->group == b->group && 162b8e80941Smrg operands_match(a, b); 163b8e80941Smrg} 164b8e80941Smrg 165b8e80941Smrgbool 166b8e80941Smrgvec4_visitor::opt_cse_local(bblock_t *block) 167b8e80941Smrg{ 168b8e80941Smrg bool progress = false; 169b8e80941Smrg exec_list aeb; 170b8e80941Smrg 171b8e80941Smrg void *cse_ctx = ralloc_context(NULL); 172b8e80941Smrg 173b8e80941Smrg int ip = block->start_ip; 174b8e80941Smrg foreach_inst_in_block (vec4_instruction, inst, block) { 175b8e80941Smrg /* Skip some cases. */ 176b8e80941Smrg if (is_expression(inst) && !inst->predicate && inst->mlen == 0 && 177b8e80941Smrg ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) || 178b8e80941Smrg inst->dst.is_null())) 179b8e80941Smrg { 180b8e80941Smrg bool found = false; 181b8e80941Smrg 182b8e80941Smrg foreach_in_list_use_after(aeb_entry, entry, &aeb) { 183b8e80941Smrg /* Match current instruction's expression against those in AEB. */ 184b8e80941Smrg if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) && 185b8e80941Smrg instructions_match(inst, entry->generator)) { 186b8e80941Smrg found = true; 187b8e80941Smrg progress = true; 188b8e80941Smrg break; 189b8e80941Smrg } 190b8e80941Smrg } 191b8e80941Smrg 192b8e80941Smrg if (!found) { 193b8e80941Smrg if (inst->opcode != BRW_OPCODE_MOV || 194b8e80941Smrg (inst->opcode == BRW_OPCODE_MOV && 195b8e80941Smrg inst->src[0].file == IMM && 196b8e80941Smrg inst->src[0].type == BRW_REGISTER_TYPE_VF)) { 197b8e80941Smrg /* Our first sighting of this expression. Create an entry. */ 198b8e80941Smrg aeb_entry *entry = ralloc(cse_ctx, aeb_entry); 199b8e80941Smrg entry->tmp = src_reg(); /* file will be BAD_FILE */ 200b8e80941Smrg entry->generator = inst; 201b8e80941Smrg aeb.push_tail(entry); 202b8e80941Smrg } 203b8e80941Smrg } else { 204b8e80941Smrg /* This is at least our second sighting of this expression. 205b8e80941Smrg * If we don't have a temporary already, make one. 206b8e80941Smrg */ 207b8e80941Smrg bool no_existing_temp = entry->tmp.file == BAD_FILE; 208b8e80941Smrg if (no_existing_temp && !entry->generator->dst.is_null()) { 209b8e80941Smrg entry->tmp = retype(src_reg(VGRF, alloc.allocate( 210b8e80941Smrg regs_written(entry->generator)), 211b8e80941Smrg NULL), inst->dst.type); 212b8e80941Smrg 213b8e80941Smrg const unsigned width = entry->generator->exec_size; 214b8e80941Smrg unsigned component_size = width * type_sz(entry->tmp.type); 215b8e80941Smrg unsigned num_copy_movs = 216b8e80941Smrg DIV_ROUND_UP(entry->generator->size_written, component_size); 217b8e80941Smrg for (unsigned i = 0; i < num_copy_movs; ++i) { 218b8e80941Smrg vec4_instruction *copy = 219b8e80941Smrg MOV(offset(entry->generator->dst, width, i), 220b8e80941Smrg offset(entry->tmp, width, i)); 221b8e80941Smrg copy->exec_size = width; 222b8e80941Smrg copy->group = entry->generator->group; 223b8e80941Smrg copy->force_writemask_all = 224b8e80941Smrg entry->generator->force_writemask_all; 225b8e80941Smrg entry->generator->insert_after(block, copy); 226b8e80941Smrg } 227b8e80941Smrg 228b8e80941Smrg entry->generator->dst = dst_reg(entry->tmp); 229b8e80941Smrg } 230b8e80941Smrg 231b8e80941Smrg /* dest <- temp */ 232b8e80941Smrg if (!inst->dst.is_null()) { 233b8e80941Smrg assert(inst->dst.type == entry->tmp.type); 234b8e80941Smrg const unsigned width = inst->exec_size; 235b8e80941Smrg unsigned component_size = width * type_sz(inst->dst.type); 236b8e80941Smrg unsigned num_copy_movs = 237b8e80941Smrg DIV_ROUND_UP(inst->size_written, component_size); 238b8e80941Smrg for (unsigned i = 0; i < num_copy_movs; ++i) { 239b8e80941Smrg vec4_instruction *copy = 240b8e80941Smrg MOV(offset(inst->dst, width, i), 241b8e80941Smrg offset(entry->tmp, width, i)); 242b8e80941Smrg copy->exec_size = inst->exec_size; 243b8e80941Smrg copy->group = inst->group; 244b8e80941Smrg copy->force_writemask_all = inst->force_writemask_all; 245b8e80941Smrg inst->insert_before(block, copy); 246b8e80941Smrg } 247b8e80941Smrg } 248b8e80941Smrg 249b8e80941Smrg /* Set our iterator so that next time through the loop inst->next 250b8e80941Smrg * will get the instruction in the basic block after the one we've 251b8e80941Smrg * removed. 252b8e80941Smrg */ 253b8e80941Smrg vec4_instruction *prev = (vec4_instruction *)inst->prev; 254b8e80941Smrg 255b8e80941Smrg inst->remove(block); 256b8e80941Smrg inst = prev; 257b8e80941Smrg } 258b8e80941Smrg } 259b8e80941Smrg 260b8e80941Smrg foreach_in_list_safe(aeb_entry, entry, &aeb) { 261b8e80941Smrg /* Kill all AEB entries that write a different value to or read from 262b8e80941Smrg * the flag register if we just wrote it. 263b8e80941Smrg */ 264b8e80941Smrg if (inst->writes_flag()) { 265b8e80941Smrg if (entry->generator->reads_flag() || 266b8e80941Smrg (entry->generator->writes_flag() && 267b8e80941Smrg !instructions_match(inst, entry->generator))) { 268b8e80941Smrg entry->remove(); 269b8e80941Smrg ralloc_free(entry); 270b8e80941Smrg continue; 271b8e80941Smrg } 272b8e80941Smrg } 273b8e80941Smrg 274b8e80941Smrg for (int i = 0; i < 3; i++) { 275b8e80941Smrg src_reg *src = &entry->generator->src[i]; 276b8e80941Smrg 277b8e80941Smrg /* Kill all AEB entries that use the destination we just 278b8e80941Smrg * overwrote. 279b8e80941Smrg */ 280b8e80941Smrg if (inst->dst.file == entry->generator->src[i].file && 281b8e80941Smrg inst->dst.nr == entry->generator->src[i].nr) { 282b8e80941Smrg entry->remove(); 283b8e80941Smrg ralloc_free(entry); 284b8e80941Smrg break; 285b8e80941Smrg } 286b8e80941Smrg 287b8e80941Smrg /* Kill any AEB entries using registers that don't get reused any 288b8e80941Smrg * more -- a sure sign they'll fail operands_match(). 289b8e80941Smrg */ 290b8e80941Smrg if (src->file == VGRF) { 291b8e80941Smrg if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) { 292b8e80941Smrg entry->remove(); 293b8e80941Smrg ralloc_free(entry); 294b8e80941Smrg break; 295b8e80941Smrg } 296b8e80941Smrg } 297b8e80941Smrg } 298b8e80941Smrg } 299b8e80941Smrg 300b8e80941Smrg ip++; 301b8e80941Smrg } 302b8e80941Smrg 303b8e80941Smrg ralloc_free(cse_ctx); 304b8e80941Smrg 305b8e80941Smrg return progress; 306b8e80941Smrg} 307b8e80941Smrg 308b8e80941Smrgbool 309b8e80941Smrgvec4_visitor::opt_cse() 310b8e80941Smrg{ 311b8e80941Smrg bool progress = false; 312b8e80941Smrg 313b8e80941Smrg calculate_live_intervals(); 314b8e80941Smrg 315b8e80941Smrg foreach_block (block, cfg) { 316b8e80941Smrg progress = opt_cse_local(block) || progress; 317b8e80941Smrg } 318b8e80941Smrg 319b8e80941Smrg if (progress) 320b8e80941Smrg invalidate_live_intervals(); 321b8e80941Smrg 322b8e80941Smrg return progress; 323b8e80941Smrg} 324