compiler/nir/nir_algebraic.py

b8e80941Smrg#
b8e80941Smrg# Copyright (C) 2014 Intel Corporation
b8e80941Smrg#
b8e80941Smrg# Permission is hereby granted, free of charge, to any person obtaining a
b8e80941Smrg# copy of this software and associated documentation files (the "Software"),
b8e80941Smrg# to deal in the Software without restriction, including without limitation
b8e80941Smrg# the rights to use, copy, modify, merge, publish, distribute, sublicense,
b8e80941Smrg# and/or sell copies of the Software, and to permit persons to whom the
b8e80941Smrg# Software is furnished to do so, subject to the following conditions:
b8e80941Smrg#
b8e80941Smrg# The above copyright notice and this permission notice (including the next
b8e80941Smrg# paragraph) shall be included in all copies or substantial portions of the
b8e80941Smrg# Software.
b8e80941Smrg#
b8e80941Smrg# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
b8e80941Smrg# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
b8e80941Smrg# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
b8e80941Smrg# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
b8e80941Smrg# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
b8e80941Smrg# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
b8e80941Smrg# IN THE SOFTWARE.
b8e80941Smrg#
b8e80941Smrg# Authors:
b8e80941Smrg#    Jason Ekstrand (jason@jlekstrand.net)
b8e80941Smrg
b8e80941Smrgfrom __future__ import print_function
b8e80941Smrgimport ast
b8e80941Smrgfrom collections import defaultdict
b8e80941Smrgimport itertools
b8e80941Smrgimport struct
b8e80941Smrgimport sys
b8e80941Smrgimport mako.template
b8e80941Smrgimport re
b8e80941Smrgimport traceback
b8e80941Smrg
b8e80941Smrgfrom nir_opcodes import opcodes, type_sizes
b8e80941Smrg
b8e80941Smrg# These opcodes are only employed by nir_search.  This provides a mapping from
b8e80941Smrg# opcode to destination type.
# Generic (unsized) conversion opcodes understood by nir_search, keyed by
# opcode name; the value is the base type of the conversion's destination.
conv_opcode_types = {
   # -> float
   'i2f': 'float',
   'u2f': 'float',
   'f2f': 'float',
   # float -> integer
   'f2u': 'uint',
   'f2i': 'int',
   # integer resize
   'u2u': 'uint',
   'i2i': 'int',
   # from bool
   'b2f': 'float',
   'b2i': 'int',
   # -> bool
   'i2b': 'bool',
   'f2b': 'bool',
}
b8e80941Smrg
def get_c_opcode(op):
   """Return the C enumerator name for the opcode string `op`.

   Opcodes listed in conv_opcode_types get the 'nir_search_op_' prefix;
   every other opcode gets the 'nir_op_' prefix.
   """
   prefix = 'nir_search_op_' if op in conv_opcode_types else 'nir_op_'
   return prefix + op
b8e80941Smrg
b8e80941Smrg
# Names for "any integer" and "text string" that work on both Python 2 and
# Python 3; the Python 2 branch uses the long/unicode builtins.
if sys.version_info >= (3, 0):
    integer_types = (int, )
    string_type = str

else:
    integer_types = (int, long)
    string_type = unicode
b8e80941Smrg
_type_re = re.compile(r"(?P<type>int|uint|bool|float)?(?P<bits>\d+)?")

def type_bits(type_str):
   """Return the bit size encoded in a type string such as 'float32'.

   A type string without a numeric suffix (e.g. 'int') yields 0.  The string
   must start with one of the base types int, uint, bool or float; otherwise
   the assertion below fails.
   """
   parsed = _type_re.match(type_str)
   assert parsed.group('type')

   bits = parsed.group('bits')
   return 0 if bits is None else int(bits)
b8e80941Smrg
b8e80941Smrg# Represents a set of variables, each with a unique id
class VarSet(object):
   """Maps variable names to unique integer ids, handed out in first-use order.

   Looking up an unknown name allocates the next id, unless the set has been
   locked, in which case the lookup fails an assertion.
   """

   def __init__(self):
      self.names = {}
      self.ids = itertools.count()
      self.immutable = False

   def __getitem__(self, name):
      """Return the id for `name`, allocating a new one on first use."""
      try:
         return self.names[name]
      except KeyError:
         pass

      assert not self.immutable, "Unknown replacement variable: " + name
      new_id = next(self.ids)
      self.names[name] = new_id
      return new_id

   def lock(self):
      """Forbid the allocation of ids for names not seen so far."""
      self.immutable = True
b8e80941Smrg
class Value(object):
   """Base class for the nodes of a search or replacement pattern.

   Subclasses (Constant, Variable, Expression) are expected to set
   self._bit_size in their constructor; it is either None, a physical bit
   size (int), or another Value whose bit-size class this value has been
   merged into.
   """

   @staticmethod
   def create(val, name_base, varset):
      """Build the Value subclass matching the Python type of `val`.

      - tuple                 -> Expression
      - Expression            -> returned unchanged
      - string (bytes/text)   -> Variable
      - bool / float / integer -> Constant

      `name_base` is the C identifier used for the generated struct and
      `varset` is the VarSet used to number variables.

      Raises TypeError for any other input instead of silently returning
      None, which would only surface later as an unrelated AttributeError.
      """
      if isinstance(val, bytes):
         val = val.decode('utf-8')

      if isinstance(val, tuple):
         return Expression(val, name_base, varset)
      elif isinstance(val, Expression):
         return val
      elif isinstance(val, string_type):
         return Variable(val, name_base, varset)
      elif isinstance(val, (bool, float) + integer_types):
         return Constant(val, name_base)

      raise TypeError(
         "Cannot create a nir_search value from {!r} (type {})".format(
            val, type(val).__name__))

   def __init__(self, val, name, type_str):
      """Record the textual form of `val`, the C identifier `name` and the
      nir_search value kind `type_str` ("constant", "variable" or
      "expression")."""
      self.in_val = str(val)
      self.name = name
      self.type_str = type_str

   def __str__(self):
      return self.in_val

   def get_bit_size(self):
      """Get the physical bit-size that has been chosen for this value, or if
      there is none, the canonical value which currently represents this
      bit-size class. Variables will be preferred, i.e. if there are any
      variables in the equivalence class, the canonical value will be a
      variable. We do this since we'll need to know which variable each value
      is equivalent to when constructing the replacement expression. This is
      the "find" part of the union-find algorithm.
      """
      bit_size = self

      # Follow the chain of _bit_size links until we reach either a physical
      # bit size (an int) or a Value that has not been merged into anything.
      while isinstance(bit_size, Value):
         if bit_size._bit_size is None:
            break
         bit_size = bit_size._bit_size

      # Path compression: point directly at the representative next time.
      if bit_size is not self:
         self._bit_size = bit_size
      return bit_size

   def set_bit_size(self, other):
      """Make self.get_bit_size() return what other.get_bit_size() returned
      before calling this, or just "other" if it's a concrete bit-size. This is
      the "union" part of the union-find algorithm.
      """

      self_bit_size = self.get_bit_size()
      other_bit_size = other if isinstance(other, int) else other.get_bit_size()

      if self_bit_size == other_bit_size:
         return

      self_bit_size._bit_size = other_bit_size

   @property
   def type_enum(self):
      """Name of the nir_search_value_* enumerator for this kind of value."""
      return "nir_search_value_" + self.type_str

   @property
   def c_type(self):
      """Name of the C struct type generated for this kind of value."""
      return "nir_search_" + self.type_str

   def __c_name(self, cache):
      """Return the C identifier to reference, honouring any remap that
      render() registered in `cache` for this value's name."""
      if cache is not None and self.name in cache:
         return cache[self.name]
      else:
         return self.name

   def c_value_ptr(self, cache):
      """C expression for a pointer to the `.value` member of the struct."""
      return "&{0}.value".format(self.__c_name(cache))

   def c_ptr(self, cache):
      """C expression for a pointer to the generated struct."""
      return "&{0}".format(self.__c_name(cache))

   @property
   def c_bit_size(self):
      """Bit size as encoded in the generated struct: a positive physical
      size, a negative number (-index - 1) referring to a variable, or 0."""
      bit_size = self.get_bit_size()
      if isinstance(bit_size, int):
         return bit_size
      elif isinstance(bit_size, Variable):
         return -bit_size.index - 1
      else:
         # If the bit-size class is neither a variable, nor an actual bit-size, then
         # - If it's in the search expression, we don't need to check anything
         # - If it's in the replace expression, either it's ambiguous (in which
         # case we'd reject it), or it equals the bit-size of the search value
         # We represent these cases with a 0 bit-size.
         return 0

   __template = mako.template.Template("""{
   { ${val.type_enum}, ${val.c_bit_size} },
% if isinstance(val, Constant):
   ${val.type()}, { ${val.hex()} /* ${val.value} */ },
% elif isinstance(val, Variable):
   ${val.index}, /* ${val.var_name} */
   ${'true' if val.is_constant else 'false'},
   ${val.type() or 'nir_type_invalid' },
   ${val.cond if val.cond else 'NULL'},
% elif isinstance(val, Expression):
   ${'true' if val.inexact else 'false'},
   ${val.comm_expr_idx}, ${val.comm_exprs},
   ${val.c_opcode()},
   { ${', '.join(src.c_value_ptr(cache) for src in val.sources)} },
   ${val.cond if val.cond else 'NULL'},
% endif
};""")

   def render(self, cache):
      """Return the C source text for this value.

      When `cache` is a dict, identical struct initializers are emitted only
      once: a repeated initializer registers a name remap in the cache and
      renders as a comment instead of a second definition.
      """
      struct_init = self.__template.render(val=self, cache=cache,
                                           Constant=Constant,
                                           Variable=Variable,
                                           Expression=Expression)
      if cache is not None and struct_init in cache:
         # If it's in the cache, register a name remap in the cache and render
         # only a comment saying it's been remapped
         cache[self.name] = cache[struct_init]
         return "/* {} -> {} in the cache */\n".format(self.name,
                                                       cache[struct_init])
      else:
         if cache is not None:
            cache[struct_init] = self.name
         return "static const {} {} = {}\n".format(self.c_type, self.name,
                                                   struct_init)
b8e80941Smrg
_constant_re = re.compile(r"(?P<value>[^@\(]+)(?:@(?P<bits>\d+))?")

class Constant(Value):
   """A literal constant appearing in a pattern.

   `val` is either a Python bool/int/float, or a str of the form
   "<literal>[@<bits>]" whose literal part is parsed with ast.literal_eval.
   Boolean constants always get a bit size of 1.
   """

   def __init__(self, val, name):
      Value.__init__(self, val, name, "constant")

      if not isinstance(val, str):
         self.value = val
         self._bit_size = None
      else:
         parsed = _constant_re.match(val)
         self.value = ast.literal_eval(parsed.group('value'))
         bits = parsed.group('bits')
         self._bit_size = int(bits) if bits else None

      if isinstance(self.value, bool):
         assert self._bit_size is None or self._bit_size == 1
         self._bit_size = 1

   def hex(self):
      """Return the C initializer text for the constant's value."""
      value = self.value

      # bool must be tested first since bool is a subclass of int.
      if isinstance(value, bool):
         return 'NIR_TRUE' if value else 'NIR_FALSE'

      if isinstance(value, integer_types):
         return hex(value)

      if isinstance(value, float):
         # Reinterpret the double's bytes as a 64-bit unsigned integer.
         raw = struct.unpack('Q', struct.pack('d', value))[0]
         text = hex(raw)

         # On Python 2 this 'L' suffix is automatically added, but not on
         # Python 3.  Adding it explicitly makes the generated file identical,
         # regardless of the Python version running this script.
         if not text.endswith('L') and raw > sys.maxsize:
            text += 'L'

         return text

      assert False

   def type(self):
      """Return the nir_type_* name for the constant, or None if the value is
      not a bool, integer or float."""
      kinds = (
         (bool, "nir_type_bool"),
         (integer_types, "nir_type_int"),
         (float, "nir_type_float"),
      )
      for python_types, nir_type in kinds:
         if isinstance(self.value, python_types):
            return nir_type
b8e80941Smrg
_var_name_re = re.compile(r"(?P<const>#)?(?P<name>\w+)"
                          r"(?:@(?P<type>int|uint|bool|float)?(?P<bits>\d+)?)?"
                          r"(?P<cond>\([^\)]+\))?")

class Variable(Value):
   """A named variable appearing in a pattern.

   The textual form is "[#]name[@[type][bits]][(cond)]":
   - a leading '#' sets is_constant,
   - 'type' (int, uint, bool or float) is stored as required_type,
   - 'bits' becomes the initial bit size,
   - the parenthesized 'cond' text (parentheses included) is stored as cond.

   The variable's index is obtained from `varset` by name.
   """

   def __init__(self, val, name, varset):
      Value.__init__(self, val, name, "variable")

      m = _var_name_re.match(val)
      assert m and m.group('name') is not None

      self.var_name = m.group('name')

      # Prevent common cases where someone puts quotes around a literal
      # constant.  If we want to support names that have numeric or
      # punctuation characters, we can make the first assertion more flexible.
      assert self.var_name.isalpha()
      # Compare by value, not identity: "is not" against a string literal
      # depends on interning, so it never rejected a name produced by the
      # regex match (and is a SyntaxWarning on Python 3.8+).
      assert self.var_name != 'True'
      assert self.var_name != 'False'

      self.is_constant = m.group('const') is not None
      self.cond = m.group('cond')
      self.required_type = m.group('type')
      self._bit_size = int(m.group('bits')) if m.group('bits') else None

      # A bool variable without an explicit size defaults to 1 bit; an
      # explicit size must be one that type_sizes() lists for the type.
      if self.required_type == 'bool':
         if self._bit_size is not None:
            assert self._bit_size in type_sizes(self.required_type)
         else:
            self._bit_size = 1

      if self.required_type is not None:
         assert self.required_type in ('float', 'bool', 'int', 'uint')

      self.index = varset[self.var_name]

   def type(self):
      """Return the nir_type_* name for required_type, or None when the
      variable has no required type."""
      if self.required_type == 'bool':
         return "nir_type_bool"
      elif self.required_type in ('int', 'uint'):
         return "nir_type_int"
      elif self.required_type == 'float':
         return "nir_type_float"
b8e80941Smrg
_opcode_re = re.compile(r"(?P<inexact>~)?(?P<opcode>\w+)(?:@(?P<bits>\d+))?"
                        r"(?P<cond>\([^\)]+\))?")

class Expression(Value):
   """An operation node: a tuple whose first element names the opcode and
   whose remaining elements are the sources.

   The opcode string has the form "[~]opcode[@bits][(cond)]"; a leading '~'
   sets inexact, 'bits' becomes the initial bit size and the parenthesized
   'cond' text is stored as cond.
   """

   def __init__(self, expr, name_base, varset):
      Value.__init__(self, expr, name_base, "expression")
      assert isinstance(expr, tuple)

      parsed = _opcode_re.match(expr[0])
      assert parsed and parsed.group('opcode') is not None

      bits = parsed.group('bits')
      self.opcode = parsed.group('opcode')
      self._bit_size = int(bits) if bits else None
      self.inexact = parsed.group('inexact') is not None
      self.cond = parsed.group('cond')

      # Each source is named after this expression plus its position.
      sources = []
      for position, src in enumerate(expr[1:]):
         src_name = "{0}_{1}".format(name_base, position)
         sources.append(Value.create(src, src_name, varset))
      self.sources = sources

      if self.opcode in conv_opcode_types:
         assert self._bit_size is None, \
                'Expression cannot use an unsized conversion opcode with ' \
                'an explicit size; that\'s silly.'

      self.__index_comm_exprs(0)

   def __index_comm_exprs(self, base_idx):
      """Recursively count and index commutative expressions.

      Sets comm_expr_idx (or -1 when this opcode is not commutative) and
      comm_exprs (the number of commutative expressions in this subtree), and
      returns the latter.
      """
      is_commutative = (
         self.opcode not in conv_opcode_types and
         "commutative" in opcodes[self.opcode].algebraic_properties)

      self.comm_expr_idx = base_idx if is_commutative else -1
      self.comm_exprs = 1 if is_commutative else 0

      for child in self.sources:
         if not isinstance(child, Expression):
            continue
         # Children are numbered after everything counted so far.
         child.__index_comm_exprs(base_idx + self.comm_exprs)
         self.comm_exprs += child.comm_exprs

      return self.comm_exprs

   def c_opcode(self):
      """Return the C enumerator name of this expression's opcode."""
      return get_c_opcode(self.opcode)

   def render(self, cache):
      """Render all sources first, then this expression's own struct."""
      rendered_sources = [src.render(cache) for src in self.sources]
      own_struct = super(Expression, self).render(cache)
      return "\n".join(rendered_sources) + own_struct
b8e80941Smrg
b8e80941Smrgclass BitSizeValidator(object):
b8e80941Smrg   """A class for validating bit sizes of expressions.
b8e80941Smrg
b8e80941Smrg   NIR supports multiple bit-sizes on expressions in order to handle things
b8e80941Smrg   such as fp64.  The source and destination of every ALU operation is
b8e80941Smrg   assigned a type and that type may or may not specify a bit size.  Sources
b8e80941Smrg   and destinations whose type does not specify a bit size are considered
b8e80941Smrg   "unsized" and automatically take on the bit size of the corresponding
b8e80941Smrg   register or SSA value.  NIR has two simple rules for bit sizes that are
b8e80941Smrg   validated by nir_validator:
b8e80941Smrg
b8e80941Smrg    1) A given SSA def or register has a single bit size that is respected by
b8e80941Smrg       everything that reads from it or writes to it.
b8e80941Smrg
b8e80941Smrg    2) The bit sizes of all unsized inputs/outputs on any given ALU
b8e80941Smrg       instruction must match.  They need not match the sized inputs or
b8e80941Smrg       outputs but they must match each other.
b8e80941Smrg
b8e80941Smrg   In order to keep nir_algebraic relatively simple and easy-to-use,
b8e80941Smrg   nir_search supports a type of bit-size inference based on the two rules
b8e80941Smrg   above.  This is similar to type inference in many common programming
b8e80941Smrg   languages.  If, for instance, you are constructing an add operation and you
b8e80941Smrg   know the second source is 16-bit, then you know that the other source and
b8e80941Smrg   the destination must also be 16-bit.  There are, however, cases where this
b8e80941Smrg   inference can be ambiguous or contradictory.  Consider, for instance, the
b8e80941Smrg   following transformation:
b8e80941Smrg
b8e80941Smrg   (('usub_borrow', a, b), ('b2i@32', ('ult', a, b)))
b8e80941Smrg
b8e80941Smrg   This transformation can potentially cause a problem because usub_borrow is
b8e80941Smrg   well-defined for any bit-size of integer.  However, b2i always generates a
b8e80941Smrg   32-bit result so it could end up replacing a 64-bit expression with one
b8e80941Smrg   that takes two 64-bit values and produces a 32-bit value.  As another
b8e80941Smrg   example, consider this expression:
b8e80941Smrg
b8e80941Smrg   (('bcsel', a, b, 0), ('iand', a, b))
b8e80941Smrg
b8e80941Smrg   In this case, in the search expression a must be 32-bit but b can
b8e80941Smrg   potentially have any bit size.  If we had a 64-bit b value, we would end up
   trying to AND a 32-bit value with a 64-bit value, which would be invalid.
b8e80941Smrg
b8e80941Smrg   This class solves that problem by providing a validation layer that proves
b8e80941Smrg   that a given search-and-replace operation is 100% well-defined before we
b8e80941Smrg   generate any code.  This ensures that bugs are caught at compile time
b8e80941Smrg   rather than at run time.
b8e80941Smrg
b8e80941Smrg   Each value maintains a "bit-size class", which is either an actual bit size
b8e80941Smrg   or an equivalence class with other values that must have the same bit size.
b8e80941Smrg   The validator works by combining bit-size classes with each other according
b8e80941Smrg   to the NIR rules outlined above, checking that there are no inconsistencies.
b8e80941Smrg   When doing this for the replacement expression, we make sure to never change
b8e80941Smrg   the equivalence class of any of the search values. We could make the example
b8e80941Smrg   transforms above work by doing some extra run-time checking of the search
b8e80941Smrg   expression, but we make the user specify those constraints themselves, to
b8e80941Smrg   avoid any surprises. Since the replacement bitsizes can only be connected to
b8e80941Smrg   the source bitsize via variables (variables must have the same bitsize in
   the source and replacement expressions) or the roots of the expression (the
b8e80941Smrg   replacement expression must produce the same bit size as the search
b8e80941Smrg   expression), we prevent merging a variable with anything when processing the
b8e80941Smrg   replacement expression, or specializing the search bitsize
b8e80941Smrg   with anything. The former prevents
b8e80941Smrg
b8e80941Smrg   (('bcsel', a, b, 0), ('iand', a, b))
b8e80941Smrg
b8e80941Smrg   from being allowed, since we'd have to merge the bitsizes for a and b due to
b8e80941Smrg   the 'iand', while the latter prevents
b8e80941Smrg
b8e80941Smrg   (('usub_borrow', a, b), ('b2i@32', ('ult', a, b)))
b8e80941Smrg
b8e80941Smrg   from being allowed, since the search expression has the bit size of a and b,
b8e80941Smrg   which can't be specialized to 32 which is the bitsize of the replace
b8e80941Smrg   expression. It also prevents something like:
b8e80941Smrg
b8e80941Smrg   (('b2i', ('i2b', a)), ('ineq', a, 0))
b8e80941Smrg
b8e80941Smrg   since the bitsize of 'b2i', which can be anything, can't be specialized to
b8e80941Smrg   the bitsize of a.
b8e80941Smrg
b8e80941Smrg   After doing all this, we check that every subexpression of the replacement
b8e80941Smrg   was assigned a constant bitsize, the bitsize of a variable, or the bitsize
   of the search expression, since those are the things that are known when
   constructing the replacement expression. Finally, we record the bitsize
b8e80941Smrg   needed in nir_search_value so that we know what to do when building the
b8e80941Smrg   replacement expression.
b8e80941Smrg   """
b8e80941Smrg
b8e80941Smrg   def __init__(self, varset):
b8e80941Smrg      self._var_classes = [None] * len(varset.names)
b8e80941Smrg
b8e80941Smrg   def compare_bitsizes(self, a, b):
b8e80941Smrg      """Determines which bitsize class is a specialization of the other, or
b8e80941Smrg      whether neither is. When we merge two different bitsizes, the
b8e80941Smrg      less-specialized bitsize always points to the more-specialized one, so
b8e80941Smrg      that calling get_bit_size() always gets you the most specialized bitsize.
b8e80941Smrg      The specialization partial order is given by:
b8e80941Smrg      - Physical bitsizes are always the most specialized, and a different
b8e80941Smrg        bitsize can never specialize another.
b8e80941Smrg      - In the search expression, variables can always be specialized to each
b8e80941Smrg        other and to physical bitsizes. In the replace expression, we disallow
b8e80941Smrg        this to avoid adding extra constraints to the search expression that
b8e80941Smrg        the user didn't specify.
b8e80941Smrg      - Expressions and constants without a bitsize can always be specialized to
b8e80941Smrg        each other and variables, but not the other way around.
b8e80941Smrg
b8e80941Smrg        We return -1 if a <= b (b can be specialized to a), 0 if a = b, 1 if a >= b,
b8e80941Smrg        and None if they are not comparable (neither a <= b nor b <= a).
b8e80941Smrg      """
b8e80941Smrg      if isinstance(a, int):
b8e80941Smrg         if isinstance(b, int):
b8e80941Smrg            return 0 if a == b else None
b8e80941Smrg         elif isinstance(b, Variable):
b8e80941Smrg            return -1 if self.is_search else None
b8e80941Smrg         else:
b8e80941Smrg            return -1
b8e80941Smrg      elif isinstance(a, Variable):
b8e80941Smrg         if isinstance(b, int):
b8e80941Smrg            return 1 if self.is_search else None
b8e80941Smrg         elif isinstance(b, Variable):
b8e80941Smrg            return 0 if self.is_search or a.index == b.index else None
b8e80941Smrg         else:
b8e80941Smrg            return -1
b8e80941Smrg      else:
b8e80941Smrg         if isinstance(b, int):
b8e80941Smrg            return 1
b8e80941Smrg         elif isinstance(b, Variable):
b8e80941Smrg            return 1
b8e80941Smrg         else:
b8e80941Smrg            return 0
b8e80941Smrg
b8e80941Smrg   def unify_bit_size(self, a, b, error_msg):
b8e80941Smrg      """Record that a must have the same bit-size as b. If both
b8e80941Smrg      have been assigned conflicting physical bit-sizes, call "error_msg" with
b8e80941Smrg      the bit-sizes of self and other to get a message and raise an error.
b8e80941Smrg      In the replace expression, disallow merging variables with other
b8e80941Smrg      variables and physical bit-sizes as well.
b8e80941Smrg      """
b8e80941Smrg      a_bit_size = a.get_bit_size()
b8e80941Smrg      b_bit_size = b if isinstance(b, int) else b.get_bit_size()
b8e80941Smrg
b8e80941Smrg      cmp_result = self.compare_bitsizes(a_bit_size, b_bit_size)
b8e80941Smrg
b8e80941Smrg      assert cmp_result is not None, \
b8e80941Smrg         error_msg(a_bit_size, b_bit_size)
b8e80941Smrg
b8e80941Smrg      if cmp_result < 0:
b8e80941Smrg         b_bit_size.set_bit_size(a)
b8e80941Smrg      elif not isinstance(a_bit_size, int):
b8e80941Smrg         a_bit_size.set_bit_size(b)
b8e80941Smrg
b8e80941Smrg   def merge_variables(self, val):
b8e80941Smrg      """Perform the first part of type inference by merging all the different
b8e80941Smrg      uses of the same variable. We always do this as if we're in the search
b8e80941Smrg      expression, even if we're actually not, since otherwise we'd get errors
b8e80941Smrg      if the search expression specified some constraint but the replace
b8e80941Smrg      expression didn't, because we'd be merging a variable and a constant.
b8e80941Smrg      """
b8e80941Smrg      if isinstance(val, Variable):
b8e80941Smrg         if self._var_classes[val.index] is None:
b8e80941Smrg            self._var_classes[val.index] = val
b8e80941Smrg         else:
b8e80941Smrg            other = self._var_classes[val.index]
b8e80941Smrg            self.unify_bit_size(other, val,
b8e80941Smrg                  lambda other_bit_size, bit_size:
b8e80941Smrg                     'Variable {} has conflicting bit size requirements: ' \
b8e80941Smrg                     'it must have bit size {} and {}'.format(
b8e80941Smrg                        val.var_name, other_bit_size, bit_size))
b8e80941Smrg      elif isinstance(val, Expression):
b8e80941Smrg         for src in val.sources:
b8e80941Smrg            self.merge_variables(src)
b8e80941Smrg
b8e80941Smrg   def validate_value(self, val):
b8e80941Smrg      """Validate the an expression by performing classic Hindley-Milner
b8e80941Smrg      type inference on bitsizes. This will detect if there are any conflicting
b8e80941Smrg      requirements, and unify variables so that we know which variables must
b8e80941Smrg      have the same bitsize. If we're operating on the replace expression, we
b8e80941Smrg      will refuse to merge different variables together or merge a variable
b8e80941Smrg      with a constant, in order to prevent surprises due to rules unexpectedly
b8e80941Smrg      not matching at runtime.
b8e80941Smrg      """
b8e80941Smrg      if not isinstance(val, Expression):
b8e80941Smrg         return
b8e80941Smrg
b8e80941Smrg      # Generic conversion ops are special in that they have a single unsized
b8e80941Smrg      # source and an unsized destination and the two don't have to match.
b8e80941Smrg      # This means there's no validation or unioning to do here besides the
b8e80941Smrg      # len(val.sources) check.
b8e80941Smrg      if val.opcode in conv_opcode_types:
b8e80941Smrg         assert len(val.sources) == 1, \
b8e80941Smrg            "Expression {} has {} sources, expected 1".format(
b8e80941Smrg               val, len(val.sources))
b8e80941Smrg         self.validate_value(val.sources[0])
b8e80941Smrg         return
b8e80941Smrg
b8e80941Smrg      nir_op = opcodes[val.opcode]
b8e80941Smrg      assert len(val.sources) == nir_op.num_inputs, \
b8e80941Smrg         "Expression {} has {} sources, expected {}".format(
b8e80941Smrg            val, len(val.sources), nir_op.num_inputs)
b8e80941Smrg
b8e80941Smrg      for src in val.sources:
b8e80941Smrg         self.validate_value(src)
b8e80941Smrg
b8e80941Smrg      dst_type_bits = type_bits(nir_op.output_type)
b8e80941Smrg
b8e80941Smrg      # First, unify all the sources. That way, an error coming up because two
b8e80941Smrg      # sources have an incompatible bit-size won't produce an error message
b8e80941Smrg      # involving the destination.
b8e80941Smrg      first_unsized_src = None
b8e80941Smrg      for src_type, src in zip(nir_op.input_types, val.sources):
b8e80941Smrg         src_type_bits = type_bits(src_type)
b8e80941Smrg         if src_type_bits == 0:
b8e80941Smrg            if first_unsized_src is None:
b8e80941Smrg               first_unsized_src = src
b8e80941Smrg               continue
b8e80941Smrg
b8e80941Smrg            if self.is_search:
b8e80941Smrg               self.unify_bit_size(first_unsized_src, src,
b8e80941Smrg                  lambda first_unsized_src_bit_size, src_bit_size:
b8e80941Smrg                     'Source {} of {} must have bit size {}, while source {} ' \
b8e80941Smrg                     'must have incompatible bit size {}'.format(
b8e80941Smrg                        first_unsized_src, val, first_unsized_src_bit_size,
b8e80941Smrg                        src, src_bit_size))
b8e80941Smrg            else:
b8e80941Smrg               self.unify_bit_size(first_unsized_src, src,
b8e80941Smrg                  lambda first_unsized_src_bit_size, src_bit_size:
b8e80941Smrg                     'Sources {} (bit size of {}) and {} (bit size of {}) ' \
b8e80941Smrg                     'of {} may not have the same bit size when building the ' \
b8e80941Smrg                     'replacement expression.'.format(
b8e80941Smrg                        first_unsized_src, first_unsized_src_bit_size, src,
b8e80941Smrg                        src_bit_size, val))
b8e80941Smrg         else:
b8e80941Smrg            if self.is_search:
b8e80941Smrg               self.unify_bit_size(src, src_type_bits,
b8e80941Smrg                  lambda src_bit_size, unused:
b8e80941Smrg                     '{} must have {} bits, but as a source of nir_op_{} '\
b8e80941Smrg                     'it must have {} bits'.format(
b8e80941Smrg                        src, src_bit_size, nir_op.name, src_type_bits))
b8e80941Smrg            else:
b8e80941Smrg               self.unify_bit_size(src, src_type_bits,
b8e80941Smrg                  lambda src_bit_size, unused:
b8e80941Smrg                     '{} has the bit size of {}, but as a source of ' \
b8e80941Smrg                     'nir_op_{} it must have {} bits, which may not be the ' \
b8e80941Smrg                     'same'.format(
b8e80941Smrg                        src, src_bit_size, nir_op.name, src_type_bits))
b8e80941Smrg
b8e80941Smrg      if dst_type_bits == 0:
b8e80941Smrg         if first_unsized_src is not None:
b8e80941Smrg            if self.is_search:
b8e80941Smrg               self.unify_bit_size(val, first_unsized_src,
b8e80941Smrg                  lambda val_bit_size, src_bit_size:
b8e80941Smrg                     '{} must have the bit size of {}, while its source {} ' \
b8e80941Smrg                     'must have incompatible bit size {}'.format(
b8e80941Smrg                        val, val_bit_size, first_unsized_src, src_bit_size))
b8e80941Smrg            else:
b8e80941Smrg               self.unify_bit_size(val, first_unsized_src,
b8e80941Smrg                  lambda val_bit_size, src_bit_size:
b8e80941Smrg                     '{} must have {} bits, but its source {} ' \
b8e80941Smrg                     '(bit size of {}) may not have that bit size ' \
b8e80941Smrg                     'when building the replacement.'.format(
b8e80941Smrg                        val, val_bit_size, first_unsized_src, src_bit_size))
b8e80941Smrg      else:
b8e80941Smrg         self.unify_bit_size(val, dst_type_bits,
b8e80941Smrg            lambda dst_bit_size, unused:
b8e80941Smrg               '{} must have {} bits, but as a destination of nir_op_{} ' \
b8e80941Smrg               'it must have {} bits'.format(
b8e80941Smrg                  val, dst_bit_size, nir_op.name, dst_type_bits))
b8e80941Smrg
b8e80941Smrg   def validate_replace(self, val, search):
b8e80941Smrg      bit_size = val.get_bit_size()
b8e80941Smrg      assert isinstance(bit_size, int) or isinstance(bit_size, Variable) or \
b8e80941Smrg            bit_size == search.get_bit_size(), \
b8e80941Smrg            'Ambiguous bit size for replacement value {}: ' \
b8e80941Smrg            'it cannot be deduced from a variable, a fixed bit size ' \
b8e80941Smrg            'somewhere, or the search expression.'.format(val)
b8e80941Smrg
b8e80941Smrg      if isinstance(val, Expression):
b8e80941Smrg         for src in val.sources:
b8e80941Smrg            self.validate_replace(src, search)
b8e80941Smrg
b8e80941Smrg   def validate(self, search, replace):
b8e80941Smrg      self.is_search = True
b8e80941Smrg      self.merge_variables(search)
b8e80941Smrg      self.merge_variables(replace)
b8e80941Smrg      self.validate_value(search)
b8e80941Smrg
b8e80941Smrg      self.is_search = False
b8e80941Smrg      self.validate_value(replace)
b8e80941Smrg
b8e80941Smrg      # Check that search is always more specialized than replace. Note that
b8e80941Smrg      # we're doing this in replace mode, disallowing merging variables.
b8e80941Smrg      search_bit_size = search.get_bit_size()
b8e80941Smrg      replace_bit_size = replace.get_bit_size()
b8e80941Smrg      cmp_result = self.compare_bitsizes(search_bit_size, replace_bit_size)
b8e80941Smrg
b8e80941Smrg      assert cmp_result is not None and cmp_result <= 0, \
b8e80941Smrg         'The search expression bit size {} and replace expression ' \
b8e80941Smrg         'bit size {} may not be the same'.format(
b8e80941Smrg               search_bit_size, replace_bit_size)
b8e80941Smrg
b8e80941Smrg      replace.set_bit_size(search)
b8e80941Smrg
b8e80941Smrg      self.validate_replace(replace, search)
b8e80941Smrg
b8e80941Smrg_optimization_ids = itertools.count()
b8e80941Smrg
b8e80941Smrgcondition_list = ['true']
b8e80941Smrg
class SearchAndReplace(object):
   """One rewrite rule: a search expression, a replacement and a condition.

   Attributes set by the constructor:
   id -- unique number taken from the module-level counter.
   condition -- the condition string ('true' when none was given).
   condition_index -- position of the condition in condition_list.
   search -- the search Expression.
   replace -- the replacement Value.
   """
   def __init__(self, transform):
      """Build the rule from a (search, replace[, condition]) sequence.

      search and replace may already be Expression / Value objects, in which
      case they are used as they are; otherwise they are parsed.  The pair is
      run through BitSizeValidator before the constructor returns.
      """
      self.id = next(_optimization_ids)

      search_spec, replace_spec = transform[0], transform[1]
      self.condition = transform[2] if len(transform) > 2 else 'true'

      # Register the condition in the shared list the first time it is seen
      # and remember where it lives.
      try:
         self.condition_index = condition_list.index(self.condition)
      except ValueError:
         condition_list.append(self.condition)
         self.condition_index = len(condition_list) - 1

      varset = VarSet()
      if not isinstance(search_spec, Expression):
         search_spec = Expression(search_spec,
                                  "search{0}".format(self.id), varset)
      self.search = search_spec

      # The variable set is locked once the search side has been built and
      # before the replacement is parsed.
      # NOTE(review): presumably this stops the replacement from introducing
      # variables the search does not bind -- confirm against VarSet.lock().
      varset.lock()

      if not isinstance(replace_spec, Value):
         replace_spec = Value.create(replace_spec,
                                     "replace{0}".format(self.id), varset)
      self.replace = replace_spec

      BitSizeValidator(varset).validate(self.search, self.replace)
b8e80941Smrg
class TreeAutomaton(object):
   """Bottom-up tree automaton used to quickly find candidate transforms.

   Tree automatons generalize classical NFA's and DFA's: the transition
   function determines the state of a parent node from the states of its
   children. A deterministic automaton is constructed to match the left-hand
   sides of the transforms, using an algorithm similar to the classical NFA
   to DFA construction. At the moment it only matches opcodes and constants
   (without checking the actual value), leaving more detailed checking to the
   search function which actually checks the leaves. The automaton acts as a
   quick filter for the search function, requiring only n + 1 table lookups
   for each n-source operation. The implementation is based on the theory
   described in "Tree Automatons: Two Taxonomies and a Toolkit." In the
   language of that reference, this is a frontier-to-root deterministic
   automaton using only symbol filtering. The filtering is crucial to reduce
   both the time taken to generate the tables and the size of the tables.
   """
   def __init__(self, transforms):
      """Build the items and the transition tables for the given transforms.

      transforms -- iterable of objects with a .search expression attribute.
      """
      self.patterns = [xform.search for xform in transforms]
      self._compute_items()
      self._build_table()

   class IndexMap(object):
      """An insertion-ordered collection of unique hashable objects.

      Objects can be looked up by index, and the index of an object can be
      found through a hash table. Compared to a list, index() takes constant
      time. Compared to a set, the iteration order is stable.
      """
      def __init__(self, iterable=()):
         self.objects = []
         self.map = {}
         for element in iterable:
            self.add(element)

      def __getitem__(self, i):
         return self.objects[i]

      def __contains__(self, obj):
         return obj in self.map

      def __len__(self):
         return len(self.objects)

      def __iter__(self):
         return iter(self.objects)

      def clear(self):
         """Forget every object."""
         self.objects = []
         self.map.clear()

      def index(self, obj):
         """Return the index of obj; raises KeyError if it is absent."""
         return self.map[obj]

      def add(self, obj):
         """Insert obj if it is new and return its index either way."""
         position = self.map.get(obj)
         if position is None:
            position = len(self.objects)
            self.objects.append(obj)
            self.map[obj] = position
         return position

      def __repr__(self):
         return 'IndexMap([{}])'.format(', '.join(map(repr, self.objects)))

   class Item(object):
      """An "item" in the language of "Tree Automatons."

      An item is a subtree of some pattern and stands for a potential partial
      match at runtime. Items are deduplicated, so identical subtrees of
      different patterns share one object, and they carry some extra
      information needed by the main algorithm.
      """
      def __init__(self, opcode, children):
         self.opcode = opcode
         self.children = children
         # Indices of the patterns for which this item is the root node.
         self.patterns = []
         # The set of opcodes of parents of this item. Used to speed up
         # filtering.
         self.parent_ops = set()

      def __str__(self):
         parts = [self.opcode]
         parts.extend(str(child) for child in self.children)
         return '(' + ', '.join(parts) + ')'

      def __repr__(self):
         return str(self)

   def _compute_items(self):
      """Build the set of all possible items, deduplicating them."""
      # Map from (opcode, sources) to item.
      self.items = {}

      # All opcodes used by the patterns. Used later to avoid building and
      # emitting tables for opcodes that aren't used.
      self.opcodes = self.IndexMap()

      def get_item(opcode, children, pattern=None):
         # Two-source commutative opcodes get registered under both source
         # orders so that either order finds the same item.
         commutative = len(children) == 2 \
               and "commutative" in opcodes[opcode].algebraic_properties
         key = (opcode, children)
         item = self.items.get(key)
         if item is None:
            item = self.items[key] = self.Item(opcode, children)
         if commutative:
            self.items[opcode, children[::-1]] = item
         if pattern is not None:
            item.patterns.append(pattern)
         return item

      self.wildcard = get_item("__wildcard", ())
      self.const = get_item("__const", ())

      def process_subpattern(src, pattern=None):
         if isinstance(src, Constant):
            # Note: the actual constant value is thrown away!
            return self.const

         if isinstance(src, Variable):
            # Note: which variable it is gets thrown away here! The wildcard
            # item is equivalent to nu in "Tree Automatons."
            return self.const if src.is_constant else self.wildcard

         assert isinstance(src, Expression)
         opcode = src.opcode
         stripped = opcode.rstrip('0123456789')
         if stripped in conv_opcode_types:
            # Matches that use conversion opcodes with a specific type, like
            # f2b1, are tricky.  Either the automaton matches specific NIR
            # opcodes like nir_op_f2b1, which needs separate items for each
            # possible NIR opcode for patterns with a generic opcode like
            # f2b, or it matches the search opcode, which needs f2b1 mapped
            # to f2b when constructing the automaton. The latter is done
            # here.
            opcode = stripped
         self.opcodes.add(opcode)
         children = tuple(process_subpattern(c) for c in src.sources)
         item = get_item(opcode, children, pattern)
         for child in children:
            child.parent_ops.add(opcode)
         return item

      for index, pattern in enumerate(self.patterns):
         process_subpattern(pattern, index)

   def _build_table(self):
      """Build the transition table; this is the core algorithm.

      It is based off of Algorithm 5.7.38 "Reachability-based tabulation of
      Cl . Comp_a and Filt_{a,i} using integers to identify match sets." It
      simultaneously builds up a list of all possible "match sets" or
      "states", where each match set represents the set of Item's that match
      a given instruction, and builds up the transition table between states.
      """
      # Map from opcode + filtered state indices to transitioned state.
      self.table = defaultdict(dict)
      # Bijection from state to index. q in the original algorithm is
      # len(self.states)
      self.states = self.IndexMap()
      # List of pattern matches for each state index.
      self.state_patterns = []
      # Map from state index to filtered state index for each opcode.
      self.filter = defaultdict(list)
      # Bijections from filtered state to filtered state index for each
      # opcode, called the "representor sets" in the original algorithm.
      # q_{a,j} in the original algorithm is len(self.rep[op]).
      self.rep = defaultdict(self.IndexMap)

      # Everything in self.states with an index of at least worklist_index is
      # part of the worklist of newly created states. There is also a
      # worklist of newly filtered states for each opcode, for which
      # worklist_indices serves a similar purpose. worklist_index corresponds
      # to p in the original algorithm, while worklist_indices is p_{a,j}
      # (although since we only filter by opcode/symbol, it's really just
      # p_a).
      self.worklist_index = 0
      worklist_indices = defaultdict(int)

      # The opcodes whose filtered worklist is non-empty. Used to avoid
      # scanning opcodes for which there is nothing to process when building
      # the transition table. It corresponds to new_a in the original
      # algorithm.
      new_opcodes = self.IndexMap()

      # Process states on the global worklist, filtering them for each
      # opcode, updating the filter tables, and updating the filtered
      # worklists if any new filtered states are found. Similar to
      # ComputeRepresenterSets() in the original algorithm, although that
      # only processes a single state.
      def process_new_states():
         while self.worklist_index < len(self.states):
            state = self.states[self.worklist_index]

            # Calculate pattern matches for this state. Each pattern is
            # assigned to a unique item, so there is nothing to deduplicate
            # here. They do have to be sorted so that they're visited at
            # runtime in the order they're specified in the source.
            matches = sorted(p for item in state for p in item.patterns)
            assert len(self.state_patterns) == self.worklist_index
            self.state_patterns.append(matches)

            # Calculate the filter table for this state, and update the
            # filtered worklists.
            for op in self.opcodes:
               filt = self.filter[op]
               rep = self.rep[op]
               filtered = frozenset(
                  item for item in state if op in item.parent_ops)
               if filtered not in rep:
                  new_opcodes.add(op)
               # add() returns the existing index for a known filtered state.
               rep_index = rep.add(filtered)
               assert len(filt) == self.worklist_index
               filt.append(rep_index)
            self.worklist_index += 1

      # There are two start states: one which can only match as a wildcard,
      # and one which can match as a wildcard or constant. These will be the
      # states of intrinsics/other instructions and load_const instructions,
      # respectively. The indices of these must match the definitions of
      # WILDCARD_STATE and CONST_STATE below, so that the runtime C code can
      # initialize things correctly.
      self.states.add(frozenset((self.wildcard,)))
      self.states.add(frozenset((self.const, self.wildcard)))
      process_new_states()

      while new_opcodes:
         for op in new_opcodes:
            rep = self.rep[op]
            table = self.table[op]
            first_new = worklist_indices[op]
            num_srcs = 1 if op in conv_opcode_types \
                  else opcodes[op].num_inputs

            # Iterate over all possible source combinations where at least
            # one is on the worklist.
            for src_indices in itertools.product(range(len(rep)),
                                                 repeat=num_srcs):
               if not any(idx >= first_new for idx in src_indices):
                  continue

               candidate_sets = tuple(rep[idx] for idx in src_indices)

               # We could always start matching something else with a
               # wildcard. This is Cl from the paper.
               parent = set([self.wildcard])

               # Try all possible pairings of source items and add the
               # corresponding parent items. This is Comp_a from the paper.
               for item_srcs in itertools.product(*candidate_sets):
                  item = self.items.get((op, item_srcs))
                  if item is not None:
                     parent.add(item)

               table[src_indices] = self.states.add(frozenset(parent))
            worklist_indices[op] = len(rep)
         new_opcodes.clear()
         process_new_states()
b8e80941Smrg
# Mako template for the C source of one algebraic pass.  It is rendered by
# AlgebraicPass.render() and references these variables: pass_name (prefix of
# every generated symbol), xforms (the SearchAndReplace list), automaton (the
# TreeAutomaton built from them), condition_list, get_c_opcode and itertools.
#
# The transition-table index in <pass>_pre_block() is computed in an
# "unsigned".  A table has num_filtered_states ** num_inputs entries, so a
# uint16_t accumulator could silently wrap for multi-source opcodes and pick
# the wrong state.
_algebraic_pass_template = mako.template.Template("""
#include "nir.h"
#include "nir_builder.h"
#include "nir_search.h"
#include "nir_search_helpers.h"

#ifndef NIR_OPT_ALGEBRAIC_STRUCT_DEFS
#define NIR_OPT_ALGEBRAIC_STRUCT_DEFS

struct transform {
   const nir_search_expression *search;
   const nir_search_value *replace;
   unsigned condition_offset;
};

struct per_op_table {
   const uint16_t *filter;
   unsigned num_filtered_states;
   const uint16_t *table;
};

/* Note: these must match the start states created in
 * TreeAutomaton._build_table()
 */

/* WILDCARD_STATE = 0 is set by zeroing the state array */
static const uint16_t CONST_STATE = 1;

#endif

<% cache = {} %>
% for xform in xforms:
   ${xform.search.render(cache)}
   ${xform.replace.render(cache)}
% endfor

% for state_id, state_xforms in enumerate(automaton.state_patterns):
% if state_xforms: # avoid emitting a 0-length array for MSVC
static const struct transform ${pass_name}_state${state_id}_xforms[] = {
% for i in state_xforms:
  { ${xforms[i].search.c_ptr(cache)}, ${xforms[i].replace.c_value_ptr(cache)}, ${xforms[i].condition_index} },
% endfor
};
% endif
% endfor

static const struct per_op_table ${pass_name}_table[nir_num_search_ops] = {
% for op in automaton.opcodes:
   [${get_c_opcode(op)}] = {
      .filter = (uint16_t []) {
      % for e in automaton.filter[op]:
         ${e},
      % endfor
      },
      <%
        num_filtered = len(automaton.rep[op])
      %>
      .num_filtered_states = ${num_filtered},
      .table = (uint16_t []) {
      <%
        num_srcs = len(next(iter(automaton.table[op])))
      %>
      % for indices in itertools.product(range(num_filtered), repeat=num_srcs):
         ${automaton.table[op][indices]},
      % endfor
      },
   },
% endfor
};

static void
${pass_name}_pre_block(nir_block *block, uint16_t *states)
{
   nir_foreach_instr(instr, block) {
      switch (instr->type) {
      case nir_instr_type_alu: {
         nir_alu_instr *alu = nir_instr_as_alu(instr);
         nir_op op = alu->op;
         uint16_t search_op = nir_search_op_for_nir_op(op);
         const struct per_op_table *tbl = &${pass_name}_table[search_op];
         if (tbl->num_filtered_states == 0)
            continue;

         /* Calculate the index into the transition table. Note the index
          * calculated must match the iteration order of Python's
          * itertools.product(), which was used to emit the transition
          * table.
          *
          * The index is accumulated in an unsigned rather than a uint16_t:
          * the table has num_filtered_states^num_inputs entries, which can
          * exceed UINT16_MAX even though every individual state fits in 16
          * bits.
          */
         unsigned index = 0;
         for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++) {
            index *= tbl->num_filtered_states;
            index += tbl->filter[states[alu->src[i].src.ssa->index]];
         }
         states[alu->dest.dest.ssa.index] = tbl->table[index];
         break;
      }

      case nir_instr_type_load_const: {
         nir_load_const_instr *load_const = nir_instr_as_load_const(instr);
         states[load_const->def.index] = CONST_STATE;
         break;
      }

      default:
         break;
      }
   }
}

static bool
${pass_name}_block(nir_builder *build, nir_block *block,
                   const uint16_t *states, const bool *condition_flags)
{
   bool progress = false;

   nir_foreach_instr_reverse_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *alu = nir_instr_as_alu(instr);
      if (!alu->dest.dest.is_ssa)
         continue;

      switch (states[alu->dest.dest.ssa.index]) {
% for i in range(len(automaton.state_patterns)):
      case ${i}:
         % if automaton.state_patterns[i]:
         for (unsigned i = 0; i < ARRAY_SIZE(${pass_name}_state${i}_xforms); i++) {
            const struct transform *xform = &${pass_name}_state${i}_xforms[i];
            if (condition_flags[xform->condition_offset] &&
                nir_replace_instr(build, alu, xform->search, xform->replace)) {
               progress = true;
               break;
            }
         }
         % endif
         break;
% endfor
      default: assert(0);
      }
   }

   return progress;
}

static bool
${pass_name}_impl(nir_function_impl *impl, const bool *condition_flags)
{
   bool progress = false;

   nir_builder build;
   nir_builder_init(&build, impl);

   /* Note: it's important here that we're allocating a zeroed array, since
    * state 0 is the default state, which means we don't have to visit
    * anything other than constants and ALU instructions.
    */
   uint16_t *states = calloc(impl->ssa_alloc, sizeof(*states));

   nir_foreach_block(block, impl) {
      ${pass_name}_pre_block(block, states);
   }

   nir_foreach_block_reverse(block, impl) {
      progress |= ${pass_name}_block(&build, block, states, condition_flags);
   }

   free(states);

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
    } else {
#ifndef NDEBUG
      impl->valid_metadata &= ~nir_metadata_not_properly_reset;
#endif
    }

   return progress;
}


bool
${pass_name}(nir_shader *shader)
{
   bool progress = false;
   bool condition_flags[${len(condition_list)}];
   const nir_shader_compiler_options *options = shader->options;
   const shader_info *info = &shader->info;
   (void) options;
   (void) info;

   % for index, condition in enumerate(condition_list):
   condition_flags[${index}] = ${condition};
   % endfor

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= ${pass_name}_impl(function->impl, condition_flags);
   }

   return progress;
}
""")
b8e80941Smrg
b8e80941Smrg
b8e80941Smrg
class AlgebraicPass(object):
   """A named algebraic optimization pass built from a list of transforms.

   Attributes set by the constructor:
   pass_name -- name used as the prefix of the generated C symbols.
   xforms -- the successfully parsed SearchAndReplace objects, in order.
   opcode_xforms -- map from search opcode to the transforms rooted at it.
                    Generic conversion opcodes are expanded to one entry per
                    destination bit size.
   automaton -- the TreeAutomaton built from xforms.
   """
   def __init__(self, pass_name, transforms):
      """Parse the transforms and build the matching automaton.

      pass_name -- name of the generated pass.
      transforms -- iterable of SearchAndReplace objects or of raw
                    (search, replace[, condition]) sequences.

      Every transform that fails to parse is reported on stderr together
      with its traceback.  If any failed, the process exits with status 1
      once all of them have been reported.
      """
      self.xforms = []
      self.opcode_xforms = defaultdict(list)
      self.pass_name = pass_name

      error = False

      for xform in transforms:
         if not isinstance(xform, SearchAndReplace):
            try:
               xform = SearchAndReplace(xform)
            except Exception:
               # Keep going so that every broken transform gets reported in
               # a single run.  Only Exception is caught: KeyboardInterrupt
               # and SystemExit must not be mistaken for parse failures.
               print("Failed to parse transformation:", file=sys.stderr)
               print("  " + str(xform), file=sys.stderr)
               traceback.print_exc(file=sys.stderr)
               print('', file=sys.stderr)
               error = True
               continue

         self.xforms.append(xform)
         if xform.search.opcode in conv_opcode_types:
            # A generic conversion opcode stands for one sized opcode per
            # bit size of its destination type.
            dst_type = conv_opcode_types[xform.search.opcode]
            for size in type_sizes(dst_type):
               sized_opcode = xform.search.opcode + str(size)
               self.opcode_xforms[sized_opcode].append(xform)
         else:
            self.opcode_xforms[xform.search.opcode].append(xform)

      # Bail out before building the automaton: there is no point computing
      # tables for an incomplete set of transforms.
      if error:
         sys.exit(1)

      self.automaton = TreeAutomaton(self.xforms)


   def render(self):
      """Return the generated C source of the pass as a string."""
      return _algebraic_pass_template.render(pass_name=self.pass_name,
                                             xforms=self.xforms,
                                             opcode_xforms=self.opcode_xforms,
                                             condition_list=condition_list,
                                             automaton=self.automaton,
                                             get_c_opcode=get_c_opcode,
                                             itertools=itertools)