1b8e80941Smrg/*
2b8e80941Smrg * Copyright © 2012, 2013, 2014 Intel Corporation
3b8e80941Smrg *
4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
5b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
6b8e80941Smrg * to deal in the Software without restriction, including without limitation
7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the
9b8e80941Smrg * Software is furnished to do so, subject to the following conditions:
10b8e80941Smrg *
11b8e80941Smrg * The above copyright notice and this permission notice (including the next
12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the
13b8e80941Smrg * Software.
14b8e80941Smrg *
15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21b8e80941Smrg * IN THE SOFTWARE.
22b8e80941Smrg */
23b8e80941Smrg
24b8e80941Smrg#include "brw_vec4.h"
25b8e80941Smrg#include "brw_vec4_live_variables.h"
26b8e80941Smrg#include "brw_cfg.h"
27b8e80941Smrg
28b8e80941Smrgusing namespace brw;
29b8e80941Smrg
30b8e80941Smrg/** @file brw_vec4_cse.cpp
31b8e80941Smrg *
32b8e80941Smrg * Support for local common subexpression elimination.
33b8e80941Smrg *
34b8e80941Smrg * See Muchnick's Advanced Compiler Design and Implementation, section
35b8e80941Smrg * 13.1 (p378).
36b8e80941Smrg */
37b8e80941Smrg
38b8e80941Smrgnamespace {
39b8e80941Smrgstruct aeb_entry : public exec_node {
40b8e80941Smrg   /** The instruction that generates the expression value. */
41b8e80941Smrg   vec4_instruction *generator;
42b8e80941Smrg
43b8e80941Smrg   /** The temporary where the value is stored. */
44b8e80941Smrg   src_reg tmp;
45b8e80941Smrg};
46b8e80941Smrg}
47b8e80941Smrg
48b8e80941Smrgstatic bool
49b8e80941Smrgis_expression(const vec4_instruction *const inst)
50b8e80941Smrg{
51b8e80941Smrg   switch (inst->opcode) {
52b8e80941Smrg   case BRW_OPCODE_MOV:
53b8e80941Smrg   case BRW_OPCODE_SEL:
54b8e80941Smrg   case BRW_OPCODE_NOT:
55b8e80941Smrg   case BRW_OPCODE_AND:
56b8e80941Smrg   case BRW_OPCODE_OR:
57b8e80941Smrg   case BRW_OPCODE_XOR:
58b8e80941Smrg   case BRW_OPCODE_SHR:
59b8e80941Smrg   case BRW_OPCODE_SHL:
60b8e80941Smrg   case BRW_OPCODE_ASR:
61b8e80941Smrg   case BRW_OPCODE_CMP:
62b8e80941Smrg   case BRW_OPCODE_CMPN:
63b8e80941Smrg   case BRW_OPCODE_ADD:
64b8e80941Smrg   case BRW_OPCODE_MUL:
65b8e80941Smrg   case SHADER_OPCODE_MULH:
66b8e80941Smrg   case BRW_OPCODE_FRC:
67b8e80941Smrg   case BRW_OPCODE_RNDU:
68b8e80941Smrg   case BRW_OPCODE_RNDD:
69b8e80941Smrg   case BRW_OPCODE_RNDE:
70b8e80941Smrg   case BRW_OPCODE_RNDZ:
71b8e80941Smrg   case BRW_OPCODE_LINE:
72b8e80941Smrg   case BRW_OPCODE_PLN:
73b8e80941Smrg   case BRW_OPCODE_MAD:
74b8e80941Smrg   case BRW_OPCODE_LRP:
75b8e80941Smrg   case VEC4_OPCODE_UNPACK_UNIFORM:
76b8e80941Smrg   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
77b8e80941Smrg   case SHADER_OPCODE_BROADCAST:
78b8e80941Smrg   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
79b8e80941Smrg   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
80b8e80941Smrg      return true;
81b8e80941Smrg   case SHADER_OPCODE_RCP:
82b8e80941Smrg   case SHADER_OPCODE_RSQ:
83b8e80941Smrg   case SHADER_OPCODE_SQRT:
84b8e80941Smrg   case SHADER_OPCODE_EXP2:
85b8e80941Smrg   case SHADER_OPCODE_LOG2:
86b8e80941Smrg   case SHADER_OPCODE_POW:
87b8e80941Smrg   case SHADER_OPCODE_INT_QUOTIENT:
88b8e80941Smrg   case SHADER_OPCODE_INT_REMAINDER:
89b8e80941Smrg   case SHADER_OPCODE_SIN:
90b8e80941Smrg   case SHADER_OPCODE_COS:
91b8e80941Smrg      return inst->mlen == 0;
92b8e80941Smrg   default:
93b8e80941Smrg      return false;
94b8e80941Smrg   }
95b8e80941Smrg}
96b8e80941Smrg
97b8e80941Smrgstatic bool
98b8e80941Smrgoperands_match(const vec4_instruction *a, const vec4_instruction *b)
99b8e80941Smrg{
100b8e80941Smrg   const src_reg *xs = a->src;
101b8e80941Smrg   const src_reg *ys = b->src;
102b8e80941Smrg
103b8e80941Smrg   if (a->opcode == BRW_OPCODE_MAD) {
104b8e80941Smrg      return xs[0].equals(ys[0]) &&
105b8e80941Smrg             ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
106b8e80941Smrg              (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
107b8e80941Smrg   } else if (a->opcode == BRW_OPCODE_MOV &&
108b8e80941Smrg              xs[0].file == IMM &&
109b8e80941Smrg              xs[0].type == BRW_REGISTER_TYPE_VF) {
110b8e80941Smrg      src_reg tmp_x = xs[0];
111b8e80941Smrg      src_reg tmp_y = ys[0];
112b8e80941Smrg
113b8e80941Smrg      /* Smash out the values that are not part of the writemask.  Otherwise
114b8e80941Smrg       * the equals operator will fail due to mismatches in unused components.
115b8e80941Smrg       */
116b8e80941Smrg      const unsigned ab_writemask = a->dst.writemask & b->dst.writemask;
117b8e80941Smrg      const uint32_t mask = ((ab_writemask & WRITEMASK_X) ? 0x000000ff : 0) |
118b8e80941Smrg                            ((ab_writemask & WRITEMASK_Y) ? 0x0000ff00 : 0) |
119b8e80941Smrg                            ((ab_writemask & WRITEMASK_Z) ? 0x00ff0000 : 0) |
120b8e80941Smrg                            ((ab_writemask & WRITEMASK_W) ? 0xff000000 : 0);
121b8e80941Smrg
122b8e80941Smrg      tmp_x.ud &= mask;
123b8e80941Smrg      tmp_y.ud &= mask;
124b8e80941Smrg
125b8e80941Smrg      return tmp_x.equals(tmp_y);
126b8e80941Smrg   } else if (!a->is_commutative()) {
127b8e80941Smrg      return xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2]);
128b8e80941Smrg   } else {
129b8e80941Smrg      return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
130b8e80941Smrg             (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
131b8e80941Smrg   }
132b8e80941Smrg}
133b8e80941Smrg
134b8e80941Smrg/**
135b8e80941Smrg * Checks if instructions match, exactly for sources, but loosely for
136b8e80941Smrg * destination writemasks.
137b8e80941Smrg *
138b8e80941Smrg * \param 'a' is the generating expression from the AEB entry.
139b8e80941Smrg * \param 'b' is the second occurrence of the expression that we're
140b8e80941Smrg *        considering eliminating.
141b8e80941Smrg */
142b8e80941Smrgstatic bool
143b8e80941Smrginstructions_match(vec4_instruction *a, vec4_instruction *b)
144b8e80941Smrg{
145b8e80941Smrg   return a->opcode == b->opcode &&
146b8e80941Smrg          a->saturate == b->saturate &&
147b8e80941Smrg          a->predicate == b->predicate &&
148b8e80941Smrg          a->predicate_inverse == b->predicate_inverse &&
149b8e80941Smrg          a->conditional_mod == b->conditional_mod &&
150b8e80941Smrg          a->flag_subreg == b->flag_subreg &&
151b8e80941Smrg          a->dst.type == b->dst.type &&
152b8e80941Smrg          a->offset == b->offset &&
153b8e80941Smrg          a->mlen == b->mlen &&
154b8e80941Smrg          a->base_mrf == b->base_mrf &&
155b8e80941Smrg          a->header_size == b->header_size &&
156b8e80941Smrg          a->shadow_compare == b->shadow_compare &&
157b8e80941Smrg          ((a->dst.writemask & b->dst.writemask) == a->dst.writemask) &&
158b8e80941Smrg          a->force_writemask_all == b->force_writemask_all &&
159b8e80941Smrg          a->size_written == b->size_written &&
160b8e80941Smrg          a->exec_size == b->exec_size &&
161b8e80941Smrg          a->group == b->group &&
162b8e80941Smrg          operands_match(a, b);
163b8e80941Smrg}
164b8e80941Smrg
165b8e80941Smrgbool
166b8e80941Smrgvec4_visitor::opt_cse_local(bblock_t *block)
167b8e80941Smrg{
168b8e80941Smrg   bool progress = false;
169b8e80941Smrg   exec_list aeb;
170b8e80941Smrg
171b8e80941Smrg   void *cse_ctx = ralloc_context(NULL);
172b8e80941Smrg
173b8e80941Smrg   int ip = block->start_ip;
174b8e80941Smrg   foreach_inst_in_block (vec4_instruction, inst, block) {
175b8e80941Smrg      /* Skip some cases. */
176b8e80941Smrg      if (is_expression(inst) && !inst->predicate && inst->mlen == 0 &&
177b8e80941Smrg          ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
178b8e80941Smrg           inst->dst.is_null()))
179b8e80941Smrg      {
180b8e80941Smrg         bool found = false;
181b8e80941Smrg
182b8e80941Smrg         foreach_in_list_use_after(aeb_entry, entry, &aeb) {
183b8e80941Smrg            /* Match current instruction's expression against those in AEB. */
184b8e80941Smrg            if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
185b8e80941Smrg                instructions_match(inst, entry->generator)) {
186b8e80941Smrg               found = true;
187b8e80941Smrg               progress = true;
188b8e80941Smrg               break;
189b8e80941Smrg            }
190b8e80941Smrg         }
191b8e80941Smrg
192b8e80941Smrg         if (!found) {
193b8e80941Smrg            if (inst->opcode != BRW_OPCODE_MOV ||
194b8e80941Smrg                (inst->opcode == BRW_OPCODE_MOV &&
195b8e80941Smrg                 inst->src[0].file == IMM &&
196b8e80941Smrg                 inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
197b8e80941Smrg               /* Our first sighting of this expression.  Create an entry. */
198b8e80941Smrg               aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
199b8e80941Smrg               entry->tmp = src_reg(); /* file will be BAD_FILE */
200b8e80941Smrg               entry->generator = inst;
201b8e80941Smrg               aeb.push_tail(entry);
202b8e80941Smrg            }
203b8e80941Smrg         } else {
204b8e80941Smrg            /* This is at least our second sighting of this expression.
205b8e80941Smrg             * If we don't have a temporary already, make one.
206b8e80941Smrg             */
207b8e80941Smrg            bool no_existing_temp = entry->tmp.file == BAD_FILE;
208b8e80941Smrg            if (no_existing_temp && !entry->generator->dst.is_null()) {
209b8e80941Smrg               entry->tmp = retype(src_reg(VGRF, alloc.allocate(
210b8e80941Smrg                                              regs_written(entry->generator)),
211b8e80941Smrg                                           NULL), inst->dst.type);
212b8e80941Smrg
213b8e80941Smrg               const unsigned width = entry->generator->exec_size;
214b8e80941Smrg               unsigned component_size = width * type_sz(entry->tmp.type);
215b8e80941Smrg               unsigned num_copy_movs =
216b8e80941Smrg                  DIV_ROUND_UP(entry->generator->size_written, component_size);
217b8e80941Smrg               for (unsigned i = 0; i < num_copy_movs; ++i) {
218b8e80941Smrg                  vec4_instruction *copy =
219b8e80941Smrg                     MOV(offset(entry->generator->dst, width, i),
220b8e80941Smrg                         offset(entry->tmp, width, i));
221b8e80941Smrg                  copy->exec_size = width;
222b8e80941Smrg                  copy->group = entry->generator->group;
223b8e80941Smrg                  copy->force_writemask_all =
224b8e80941Smrg                     entry->generator->force_writemask_all;
225b8e80941Smrg                  entry->generator->insert_after(block, copy);
226b8e80941Smrg               }
227b8e80941Smrg
228b8e80941Smrg               entry->generator->dst = dst_reg(entry->tmp);
229b8e80941Smrg            }
230b8e80941Smrg
231b8e80941Smrg            /* dest <- temp */
232b8e80941Smrg            if (!inst->dst.is_null()) {
233b8e80941Smrg               assert(inst->dst.type == entry->tmp.type);
234b8e80941Smrg               const unsigned width = inst->exec_size;
235b8e80941Smrg               unsigned component_size = width * type_sz(inst->dst.type);
236b8e80941Smrg               unsigned num_copy_movs =
237b8e80941Smrg                  DIV_ROUND_UP(inst->size_written, component_size);
238b8e80941Smrg               for (unsigned i = 0; i < num_copy_movs; ++i) {
239b8e80941Smrg                  vec4_instruction *copy =
240b8e80941Smrg                     MOV(offset(inst->dst, width, i),
241b8e80941Smrg                         offset(entry->tmp, width, i));
242b8e80941Smrg                  copy->exec_size = inst->exec_size;
243b8e80941Smrg                  copy->group = inst->group;
244b8e80941Smrg                  copy->force_writemask_all = inst->force_writemask_all;
245b8e80941Smrg                  inst->insert_before(block, copy);
246b8e80941Smrg               }
247b8e80941Smrg            }
248b8e80941Smrg
249b8e80941Smrg            /* Set our iterator so that next time through the loop inst->next
250b8e80941Smrg             * will get the instruction in the basic block after the one we've
251b8e80941Smrg             * removed.
252b8e80941Smrg             */
253b8e80941Smrg            vec4_instruction *prev = (vec4_instruction *)inst->prev;
254b8e80941Smrg
255b8e80941Smrg            inst->remove(block);
256b8e80941Smrg            inst = prev;
257b8e80941Smrg         }
258b8e80941Smrg      }
259b8e80941Smrg
260b8e80941Smrg      foreach_in_list_safe(aeb_entry, entry, &aeb) {
261b8e80941Smrg         /* Kill all AEB entries that write a different value to or read from
262b8e80941Smrg          * the flag register if we just wrote it.
263b8e80941Smrg          */
264b8e80941Smrg         if (inst->writes_flag()) {
265b8e80941Smrg            if (entry->generator->reads_flag() ||
266b8e80941Smrg                (entry->generator->writes_flag() &&
267b8e80941Smrg                 !instructions_match(inst, entry->generator))) {
268b8e80941Smrg               entry->remove();
269b8e80941Smrg               ralloc_free(entry);
270b8e80941Smrg               continue;
271b8e80941Smrg            }
272b8e80941Smrg         }
273b8e80941Smrg
274b8e80941Smrg         for (int i = 0; i < 3; i++) {
275b8e80941Smrg            src_reg *src = &entry->generator->src[i];
276b8e80941Smrg
277b8e80941Smrg            /* Kill all AEB entries that use the destination we just
278b8e80941Smrg             * overwrote.
279b8e80941Smrg             */
280b8e80941Smrg            if (inst->dst.file == entry->generator->src[i].file &&
281b8e80941Smrg                inst->dst.nr == entry->generator->src[i].nr) {
282b8e80941Smrg               entry->remove();
283b8e80941Smrg               ralloc_free(entry);
284b8e80941Smrg               break;
285b8e80941Smrg            }
286b8e80941Smrg
287b8e80941Smrg            /* Kill any AEB entries using registers that don't get reused any
288b8e80941Smrg             * more -- a sure sign they'll fail operands_match().
289b8e80941Smrg             */
290b8e80941Smrg            if (src->file == VGRF) {
291b8e80941Smrg               if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) {
292b8e80941Smrg                  entry->remove();
293b8e80941Smrg                  ralloc_free(entry);
294b8e80941Smrg                  break;
295b8e80941Smrg               }
296b8e80941Smrg            }
297b8e80941Smrg         }
298b8e80941Smrg      }
299b8e80941Smrg
300b8e80941Smrg      ip++;
301b8e80941Smrg   }
302b8e80941Smrg
303b8e80941Smrg   ralloc_free(cse_ctx);
304b8e80941Smrg
305b8e80941Smrg   return progress;
306b8e80941Smrg}
307b8e80941Smrg
308b8e80941Smrgbool
309b8e80941Smrgvec4_visitor::opt_cse()
310b8e80941Smrg{
311b8e80941Smrg   bool progress = false;
312b8e80941Smrg
313b8e80941Smrg   calculate_live_intervals();
314b8e80941Smrg
315b8e80941Smrg   foreach_block (block, cfg) {
316b8e80941Smrg      progress = opt_cse_local(block) || progress;
317b8e80941Smrg   }
318b8e80941Smrg
319b8e80941Smrg   if (progress)
320b8e80941Smrg      invalidate_live_intervals();
321b8e80941Smrg
322b8e80941Smrg   return progress;
323b8e80941Smrg}
324