nir_opt_algebraic.py revision 7ec681f3
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2014 Intel Corporation
4#
5# Permission is hereby granted, free of charge, to any person obtaining a
6# copy of this software and associated documentation files (the "Software"),
7# to deal in the Software without restriction, including without limitation
8# the rights to use, copy, modify, merge, publish, distribute, sublicense,
9# and/or sell copies of the Software, and to permit persons to whom the
10# Software is furnished to do so, subject to the following conditions:
11#
12# The above copyright notice and this permission notice (including the next
13# paragraph) shall be included in all copies or substantial portions of the
14# Software.
15#
16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22# IN THE SOFTWARE.
23#
24# Authors:
25#    Jason Ekstrand (jason@jlekstrand.net)
26
27from collections import OrderedDict
28import nir_algebraic
29from nir_opcodes import type_sizes
30import itertools
31import struct
32from math import pi
33
34# Convenience variables
35a = 'a'
36b = 'b'
37c = 'c'
38d = 'd'
39e = 'e'
40
41signed_zero_inf_nan_preserve_16 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 16)'
42signed_zero_inf_nan_preserve_32 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 32)'
43
44# Written in the form (<search>, <replace>) where <search> is an expression
45# and <replace> is either an expression or a value.  An expression is
46# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
47# where each source is either an expression or a value.  A value can be
48# either a numeric constant or a string representing a variable name.
49#
50# If the opcode in a search expression is prefixed by a '~' character, this
51# indicates that the operation is inexact.  Such operations will only get
52# applied to SSA values that do not have the exact bit set.  This should be
53# used by any optimizations that are not bit-for-bit exact.  It should not,
54# however, be used for backend-requested lowering operations as those need to
55# happen regardless of precision.
56#
57# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
58# "#" indicates that the given variable will only match constants,
59# type indicates that the given variable will only match values from ALU
60#    instructions with the given output type,
61# (cond) specifies an additional condition function (see nir_search_helpers.h),
62# swiz is a swizzle applied to the variable (only in the <replace> expression)
63#
64# For constants, you have to be careful to make sure that it is the right
65# type because python is unaware of the source and destination types of the
66# opcodes.
67#
68# All expression types can have a bit-size specified.  For opcodes, this
69# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
70# type and size.  In the search half of the expression this indicates that it
71# should only match that particular bit-size.  In the replace half of the
72# expression this indicates that the constructed value should have that
73# bit-size.
74#
75# If the opcode in a replacement expression is prefixed by a '!' character,
76# this indicates that the new expression will be marked exact.
77#
78# A special condition "many-comm-expr" can be used with expressions to note
79# that the expression and its subexpressions have more commutative expressions
80# than nir_replace_instr can handle.  If this special condition is needed with
81# another condition, the two can be separated by a comma (e.g.,
82# "(many-comm-expr,is_used_once)").
83
84# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
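# A sketch of the expression tree built below, where 'c' is a constant phase
# offset chosen by the caller:
#
#    x = 2*fract(a/(2*pi) + c) - 1      map the angle into [-1, 1)
#    x = 4*(x - x*|x|)                  parabolic approximation of sine
#    0.225*(x*|x| - x) + x              precision refinement from the link above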
85def lowered_sincos(c):
86    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
87    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
88    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)
89
90def intBitsToFloat(i):
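    # Reinterpret a 32-bit integer bit pattern as a float, e.g.
    # intBitsToFloat(0x3f800000) == 1.0.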
91    return struct.unpack('!f', struct.pack('!I', i))[0]
92
93optimizations = [
94
95   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
96   (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
97   (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
98   (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
99   (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
100   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
101   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),
102
103   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
104   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
105   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
106   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
107   (('udiv', a, 1), a),
108   (('idiv', a, 1), a),
109   (('umod', a, 1), 0),
110   (('imod', a, 1), 0),
111   (('imod', a, -1), 0),
112   (('irem', a, 1), 0),
113   (('irem', a, -1), 0),
114   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
115   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
116   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
117   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
118   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
119   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
120   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
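   # For example, with a = -5 and b = 4: a < 0, so (a + 3) & -4 = -2 & -4 = -4
   # and a - (-4) = -1, matching irem(-5, 4) = -1 (the remainder takes the sign
   # of the dividend).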
121   (('irem', a, '#b(is_pos_power_of_two)'),
122    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
123    '!options->lower_bitops'),
124   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),
125
126   (('~fneg', ('fneg', a)), a),
127   (('ineg', ('ineg', a)), a),
128   (('fabs', ('fneg', a)), ('fabs', a)),
129   (('fabs', ('u2f', a)), ('u2f', a)),
130   (('iabs', ('iabs', a)), ('iabs', a)),
131   (('iabs', ('ineg', a)), ('iabs', a)),
132   (('f2b', ('fneg', a)), ('f2b', a)),
133   (('i2b', ('ineg', a)), ('i2b', a)),
134   (('~fadd', a, 0.0), a),
135   # a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a
136   # floating point instruction, it should flush any input denormals and we
137   # can replace -0.0 with 0.0 if the float execution mode allows it.
138   (('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_inf_nan_preserve_16),
139   (('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_inf_nan_preserve_32),
140   (('iadd', a, 0), a),
141   (('usadd_4x8_vc4', a, 0), a),
142   (('usadd_4x8_vc4', a, ~0), ~0),
143   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
144   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
145   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
146   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
147   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
148   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
149   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
150   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
151   (('~fadd', ('fneg', a), a), 0.0),
152   (('iadd', ('ineg', a), a), 0),
153   (('iadd', ('ineg', a), ('iadd', a, b)), b),
154   (('iadd', a, ('iadd', ('ineg', a), b)), b),
155   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
156   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
157   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
158   (('~fmul', a, 0.0), 0.0),
159   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
160   (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),
161   (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),
162   (('imul', a, 0), 0),
163   (('umul_unorm_4x8_vc4', a, 0), 0),
164   (('umul_unorm_4x8_vc4', a, ~0), a),
165   (('~fmul', a, 1.0), a),
166   # The only effect a*1.0 can have is flushing denormals. If it's only used by
167   # a floating point instruction, it should flush any input denormals and
168   # this multiplication isn't needed.
169   (('fmul(is_only_used_as_float)', a, 1.0), a),
170   (('imul', a, 1), a),
171   (('fmul', a, -1.0), ('fneg', a)),
172   (('imul', a, -1), ('ineg', a)),
173   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
174   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
175   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
176   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
177   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
178   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
179   (('~ffma', 0.0, a, b), b),
180   (('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
181   (('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
182   (('~ffma', a, b, 0.0), ('fmul', a, b)),
183   (('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_16),
184   (('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
185   (('ffma', 1.0, a, b), ('fadd', a, b)),
186   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
187   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
188   (('~flrp', a, b, 0.0), a),
189   (('~flrp', a, b, 1.0), b),
190   (('~flrp', a, a, b), a),
191   (('~flrp', 0.0, a, b), ('fmul', a, b)),
192
193   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
194   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
195
196   (('sdot_4x8_iadd', a, 0, b), b),
197   (('udot_4x8_uadd', a, 0, b), b),
198   (('sdot_4x8_iadd_sat', a, 0, b), b),
199   (('udot_4x8_uadd_sat', a, 0, b), b),
200   (('sdot_2x16_iadd', a, 0, b), b),
201   (('udot_2x16_uadd', a, 0, b), b),
202   (('sdot_2x16_iadd_sat', a, 0, b), b),
203   (('udot_2x16_uadd_sat', a, 0, b), b),
204
205   # sudot_4x8_iadd is not commutative at all, so the patterns must be
206   # duplicated with zeros on each of the first positions.
207   (('sudot_4x8_iadd', a, 0, b), b),
208   (('sudot_4x8_iadd', 0, a, b), b),
209   (('sudot_4x8_iadd_sat', a, 0, b), b),
210   (('sudot_4x8_iadd_sat', 0, a, b), b),
211
212   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
213   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
214   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
215   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
216   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),
217
218   # Try to let constant folding eliminate the dot-product part.  These are
219   # safe because the dot product cannot overflow 32 bits.
220   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
221   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
222   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
223   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
224   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
225   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
226   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
227   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
228   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
229   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
230   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
231   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
232   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
233   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
234   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
235   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
236]
237
238# Shorthand for the expansion of just the dot product part of the [iu]dp4a
239# instructions.
240sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
241                                 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
242                        ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
243                                 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
244udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
245                                 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
246                        ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
247                                 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
248sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
249                                  ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
250                         ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
251                                  ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
252sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
253                         ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
254udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
255                         ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))
256
257optimizations.extend([
258   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_dot_4x8'),
259   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_dot_4x8'),
260   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
261   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
262   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),
263
264   # For the unsigned dot-product, the largest possible value is 4*(255*255) =
265   # 0x3f804, so we don't have to worry about that intermediate result
266   # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
267   # that is less than 0xfffc07fc, then the result cannot overflow ever.
268   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
269   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_dot_4x8'),
270
271   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
272   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
273   # don't have to worry about that intermediate result overflowing or
274   # underflowing.
275   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_dot_4x8'),
276
277   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
278
279   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
280   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
281])
282
283# Float sizes
284for s in [16, 32, 64]:
285    optimizations.extend([
286       (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
287
288       (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
289       (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
290       (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),
291
292       (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
293       # These are the same as the previous three rules, but it depends on
294       # 1-fsat(x) <=> fsat(1-x).  See below.
295       (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
296       (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
297
298       (('~fadd@{}'.format(s),    ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f',  c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
299       (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
300
301       (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
302       (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),
303
304       # These two aren't flrp lowerings, but do appear in some shaders.
305       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
306       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),
307
308       # 1 - ((1 - a) * (1 - b))
309       # 1 - (1 - a - b + a*b)
310       # 1 - 1 + a + b - a*b
311       # a + b - a*b
312       # a + b*(1 - a)
313       # b*(1 - a) + 1*a
314       # flrp(b, 1, a)
315       (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
316    ])
317
318optimizations.extend([
319   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),
320
321   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
322   (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
323   (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
324   (('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
325   (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
326   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
327   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
328   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
329   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
330   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
331   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
332   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
333   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
334
335   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
336    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
337
338   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),
339
340   (('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d), '!options->lower_fdph'),
341   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
342   (('fdot4', ('vec4', a, b,   0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
343   (('fdot4', ('vec4', a, b,   c,   0.0), d), ('fdot3', ('vec3', a, b, c), d)),
344
345   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
346   (('fdot3', ('vec3', a, b,   0.0), c), ('fdot2', ('vec2', a, b), c)),
347
348   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
349   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),
350
351   # Lower fdot to fsum when it is available
352   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
353   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
354   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
355   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),
356
357   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
358   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
359   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
360   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
361
362   # (a * #b + #c) << #d
363   # ((a * #b) << #d) + (#c << #d)
364   # (a * (#b << #d)) + (#c << #d)
365   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
366    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),
367
368   # (a * #b) << #c
369   # a * (#b << #c)
370   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
371])
372
373# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
374# bits of the second source.  These replacements must correctly handle the
375# case where (b % bitsize) + (c % bitsize) >= bitsize.
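# For example, with 32-bit values and b = c = 20, both original shifts are in
# range and (a << 20) << 20 is 0, but a naive a << (20 + 20) would use only the
# low five bits of the count and compute a << 8.  Hence the bcsel on in_bounds
# below.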
376for s in [8, 16, 32, 64]:
377   mask = (1 << s) - 1
378
379   ishl = "ishl@{}".format(s)
380   ishr = "ishr@{}".format(s)
381   ushr = "ushr@{}".format(s)
382
383   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
384
385   optimizations.extend([
386       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
387       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
388
389       # To get -1 for large shifts of negative values, ishr must instead
390       # clamp the shift count to the maximum value.
391       ((ishr, (ishr, a, '#b'), '#c'),
392        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
393   ])
394
395# Optimize a pattern of address calculation created by DXVK where the offset is
396# divided by 4 and then multiplied by 4. This can be turned into an iand and the
397# additions before can be reassociated to CSE the iand instruction.
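# For example, with 32-bit values and b = 2, (a >> 2) << 2 becomes
# a & 0xfffffffc, i.e. the offset rounded down to a multiple of 4.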
398
399for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
400    a_sz = 'a@{}'.format(size)
401
402    optimizations.extend([
403       # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
404       (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
405       (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
406
407       # This does not trivially work with ishr.
408       (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
409    ])
410
411for log2 in range(1, 7): # powers of two from 2 to 64
412   v = 1 << log2
413   mask = 0xffffffff & ~(v - 1)
414   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)
415
416   optimizations.extend([
417       # Reassociate for improved CSE
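       # For example, with v = 16: (a + 0x30) & 0xfffffff0 == (a & 0xfffffff0) + 0x30,
       # since 0x30 is a multiple of 16 and cannot change the low four bits of a.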
418       (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
419   ])
420
421# To save space in the state tables, reduce to the set that is known to help.
422# Previously, this was range(1, 32).  In addition, a couple rules inside the
423# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
424# resolution.
425for i in [1, 2, 16, 24]:
426    lo_mask = 0xffffffff >> i
427    hi_mask = (0xffffffff << i) & 0xffffffff
428
429    optimizations.extend([
430        # This pattern seems to only help in the soft-fp64 code.
431        (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
432#        (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
433#        (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),
434
435        (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
436        (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
437#        (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
438    ])
439
440optimizations.extend([
441   # This is common for address calculations.  Reassociating may enable the
442   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
443   # instruction or a constant offset field in load/store instructions.
444   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),
445
446   # (a + #b) * #c => (a * #c) + (#b * #c)
447   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),
448
449   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
450   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
451    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
452   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
453    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),
454
455   # Comparison simplifications
456   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
457   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
458   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
459   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
460   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
461   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
462   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
463   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
464   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
465   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),
466
467   (('iand', ('feq', a, b), ('fneu', a, b)), False),
468   (('iand', ('flt', a, b), ('flt', b, a)), False),
469   (('iand', ('ieq', a, b), ('ine', a, b)), False),
470   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
471   (('iand', ('ult', a, b), ('ult', b, a)), False),
472
473   # This helps some shaders because, after some optimizations, they end up
474   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
475   # matching would be handled by CSE.
476   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
477   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
478   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
479   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
480   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
481   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
482   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
483   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
484   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
485   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
486
487   # b < fsat(NaN) -> b < 0 -> false, and b < Nan -> false.
488   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
489
490   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
491   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
492
493   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
494   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
495
496   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
497   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
498
499   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
500   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
501
502   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
503   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),
504
505   # 0.0 >= b2f(a)
506   # b2f(a) <= 0.0
507   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
508   # inot(a)
509   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),
510
511   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),
512
513   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
514   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('ior', a, b)),
515   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('ior', a, b)),
516   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
517   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('iand', a, b)),
518   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
519   (('fneu',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ixor', a, b)),
520   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ixor', a, b)),
521   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
522   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('inot', ('ior', a, b))),
523   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('inot', ('ior', a, b))),
524   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
525   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('inot', ('iand', a, b))),
526   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
527   (('feq',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ieq', a, b)),
528   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ieq', a, b)),
529
530   # -(b2f(a) + b2f(b)) < 0
531   # 0 < b2f(a) + b2f(b)
532   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
533   # a || b
534   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
535   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),
536
537   # -(b2f(a) + b2f(b)) >= 0
538   # 0 >= b2f(a) + b2f(b)
539   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
540   # !(a || b)
541   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
542   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
543
544   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
545   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),
546
547   # Some optimizations (below) convert things like (a < b || c < b) into
548   # (min(a, c) < b).  However, this interferes with the previous optimizations
549   # that try to remove comparisons with negated sums of b2f.  This just
550   # breaks that apart.
551   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
552    ('ior', ('flt', c, 0.0), ('ior', a, b))),
553
554   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
555   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
556   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
557   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
558   (('~flt',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
559   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
560   (('~fge',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
561   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
562   (('~feq',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
563   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
564   (('~fneu',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
565   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),
566
567   # Cannot remove the addition from ilt or ige due to overflow.
568   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
569   (('ine', ('iadd', a, b), a), ('ine', b, 0)),
570
571   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
572   (('fneu', ('b2f', 'a@1'), 0.0), a),
573   (('ieq', ('b2i', 'a@1'), 0),   ('inot', a)),
574   (('ine', ('b2i', 'a@1'), 0),   a),
575
576   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
577   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
578   (('fge', ('u2f', a), 0.0), True),
579   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
580   (('flt', ('u2f', a), 0.0), False),
581   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
582   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
583   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
584   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
585   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
586   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
587   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),
588
589   # 0.0 < fabs(a)
590   # fabs(a) > 0.0
591   # fabs(a) != 0.0 because fabs(a) must be >= 0
592   # a != 0.0
593   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),
594
595   # -fabs(a) < 0.0
596   # fabs(a) > 0.0
597   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),
598
599   # 0.0 >= fabs(a)
600   # 0.0 == fabs(a)   because fabs(a) must be >= 0
601   # 0.0 == a
602   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),
603
604   # -fabs(a) >= 0.0
605   # 0.0 >= fabs(a)
606   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
607
608   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
609   #
610   # This should be NaN safe.
611   #
612   # NaN >= 0 && 1 >= NaN -> false && false -> false
613   #
614   # vs.
615   #
616   # NaN == fsat(NaN) -> NaN == 0 -> false
617   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),
618
619   # Note: fmin(-a, -b) == -fmax(a, b)
620   (('fmax',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('ior', a, b))),
621   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
622   (('fmin',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('iand', a, b))),
623   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
624
625   # fmin(b2f(a), b)
626   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
627   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
628   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
629   #
630   # Since b is a constant, constant folding will eliminate the fmin and the
631   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
632   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),
633
634   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),
635
636   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
637   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
638   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
639   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
640   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
641   (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)),
642   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
643   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
644   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
645   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
646   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
647   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
648   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
649   (('bcsel', a, True, b), ('ior', a, b)),
650   (('bcsel', a, a, b), ('ior', a, b)),
651   (('bcsel', a, b, False), ('iand', a, b)),
652   (('bcsel', a, b, a), ('iand', a, b)),
653   (('~fmin', a, a), a),
654   (('~fmax', a, a), a),
655   (('imin', a, a), a),
656   (('imax', a, a), a),
657   (('umin', a, a), a),
658   (('umin', a, 0), 0),
659   (('umin', a, -1), a),
660   (('umax', a, a), a),
661   (('umax', a, 0), a),
662   (('umax', a, -1), -1),
663   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
664   (('umax', ('umax', a, b), b), ('umax', a, b)),
665   (('imax', ('imax', a, b), b), ('imax', a, b)),
666   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
667   (('umin', ('umin', a, b), b), ('umin', a, b)),
668   (('imin', ('imin', a, b), b), ('imin', a, b)),
669   (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),
670   (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),
671   (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),
672   (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),
673   (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),
674   (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),
675])
676
677for N in [8, 16, 32, 64]:
678    b2iN = 'b2i{0}'.format(N)
679    optimizations.extend([
680        (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)),
681        (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)),
682    ])
683
684for N in [16, 32, 64]:
685    b2fN = 'b2f{0}'.format(N)
686    optimizations.extend([
687        (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),
688        (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),
689    ])
690
691# Integer sizes
692for s in [8, 16, 32, 64]:
693    optimizations.extend([
694       (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),
695
696       # Simplify logic to detect sign of an integer.
697       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ige', a, 0)),
698       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
699       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ilt', a, 0)),
700       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
701       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
702       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
703       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
704       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
705       (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
706       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
707       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
708       (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
709    ])
710
711optimizations.extend([
712   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
713   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
714   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
715   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
716   (('~fmin', a, ('fabs', a)), a),
717   (('imin', a, ('iabs', a)), a),
718   (('~fmax', a, ('fneg', ('fabs', a))), a),
719   (('imax', a, ('ineg', ('iabs', a))), a),
720   (('fmax', a, ('fabs', a)), ('fabs', a)),
721   (('imax', a, ('iabs', a)), ('iabs', a)),
722   (('fmax', a, ('fneg', a)), ('fabs', a)),
723   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
724   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
725   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
726   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
727   # fsat(a) returns 0.0.
728   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
729   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
730   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
731   (('~fmin', ('fmax', a, -1.0),  0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
732   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
733   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
734   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
735   # representing this in the optimizations other than the usual ~.
736   (('~fmax', ('fmin', a,  0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
737   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark
738   # the new comparison precise to prevent it being changed to 'a != 0'.
739   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
740   (('fsat', ('b2f', a)), ('b2f', a)),
741   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
742   (('fsat', ('fsat', a)), ('fsat', a)),
743   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
744   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
745   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
746   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
747   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
748   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
749   # Both the left and right patterns are "b" when isnan(a), so this is exact.
750   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
751   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
752   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
753   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
754
755   # max(-min(b, a), b) -> max(abs(b), -a)
756   # min(-max(b, a), b) -> min(-abs(b), -a)
757   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
758   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),
759
760   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
761   # fsat(b-a).
762   #
763   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
764   #
765   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
766   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
767    ('fsat', ('fadd', ('fneg',  a), b)), '!options->lower_fsat'),
768
769   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
770
771   # The ior versions are exact because fmin and fmax will always pick a
772   # non-NaN value, if one exists.  Therefore (a < NaN) || (a < c) == a <
773   # fmax(NaN, c) == a < c.  Mark the fmin or fmax in the replacement as exact
774   # to prevent other optimizations from ruining the "NaN cleansing" property
775   # of the fmin or fmax.
776   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
777   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
778   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
779   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
780   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
781   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
782   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
783   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
784   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
785   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
786   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
787   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
788   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
789   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
790   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
791   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),
792
793   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
794   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
795   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
796   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
797   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
798   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
799   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
800   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
801   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
802   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
803   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
804   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
805   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
806   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
807   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
808   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
809
810   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 || a.y
811   # < 0.0, || a.y > 1.0 || ...  These patterns rearrange and replace in a
812   # single step.  Doing just the replacement can lead to an infinite loop as
813   # the pattern is repeatedly applied to the result of the previous
814   # application of the pattern.
815   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
816   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
817   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
818   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
819
820   # This is how SpvOpFOrdNotEqual might be implemented.  If both values are
821   # numbers, then it can be replaced with fneu.
822   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),
823])
824
825# Float sizes
826for s in [16, 32, 64]:
827    optimizations.extend([
828       # These derive from the previous patterns with the application of b < 0 <=>
829       # 0 < -b.  The transformation should be applied if either comparison is
830       # used once as this ensures that the number of comparisons will not
831       # increase.  The sources to the ior and iand are not symmetric, so the
832       # rules have to be duplicated to get this behavior.
833       (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
834       (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
835       (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
836       (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
837       (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
838       (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
839       (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
840       (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
841
842       # The (i2f32, ...) part is an open-coded fsign.  When that is combined
843       # with the bcsel, it's basically copysign(1.0, a).  There are some
844       # behavior differences between this pattern and copysign w.r.t. ±0 and
845       # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
846       # to x, regardless of whether either or both values are NaN.
847       #
848       # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
849       #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
850       # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
851       #            int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
852       #
853       # For all other values of 'a', the original and replacement behave as
854       # copysign.
855       #
856       # Marking the replacement comparisons as precise prevents any future
857       # optimizations from replacing either of the comparisons with the
858       # logical-not of the other.
859       #
860       # Note: Use b2i32 in the replacement because some platforms that
861       # support fp16 don't support int16.
862       (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
863        ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),
864
865       (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),
866
867       # The C spec says, "If the value of the integral part cannot be represented
868       # by the integer type, the behavior is undefined."  "Undefined" can mean
869       # "the conversion doesn't happen at all."
870       (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),
871
872       # Ironically, mark these as imprecise because removing the conversions may
873       # preserve more precision than doing the conversions (e.g.,
874       # uint(float(0x81818181u)) == 0x81818200).
875       (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
876       (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
877       (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
878       (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
879
880       (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), '!options->lower_fsign'),
881       (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), '!options->lower_fsign'),
882
883       # float? -> float? -> floatS ==> float? -> floatS
884       (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),
885
886       # int? -> float? -> floatS ==> int? -> floatS
887       (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
888       (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),
889
890       # float? -> float? -> intS ==> float? -> intS
891       (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
892       (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),
893    ])
894
895    for B in [32, 64]:
896        if s < B:
897            optimizations.extend([
898               # S = smaller, B = bigger
899               # typeS -> typeB -> typeS ==> identity
900               (('f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),
901               (('i2i{}'.format(s), ('i2i{}'.format(B), 'a@{}'.format(s))), a),
902               (('u2u{}'.format(s), ('u2u{}'.format(B), 'a@{}'.format(s))), a),
903
904               # bool1 -> typeB -> typeS ==> bool1 -> typeS
905               (('f2f{}'.format(s), ('b2f{}'.format(B), 'a@1')), ('b2f{}'.format(s), a)),
906               (('i2i{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),
907               (('u2u{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),
908
909               # floatS -> floatB -> intB ==> floatS -> intB
910               (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
911               (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),
912
913               # int? -> floatB -> floatS ==> int? -> floatS
914               (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),
915               (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),
916
917               # intS -> intB -> floatB ==> intS -> floatB
918               (('u2f{}'.format(B), ('u2u{}'.format(B), 'a@{}'.format(s))), ('u2f{}'.format(B), a)),
919               (('i2f{}'.format(B), ('i2i{}'.format(B), 'a@{}'.format(s))), ('i2f{}'.format(B), a)),
920            ])
921
922# mediump variants of the above
923optimizations.extend([
924    # int32 -> float32 -> float16 ==> int32 -> float16
925    (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)),
926    (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)),
927
928    # float32 -> float16 -> int16 ==> float32 -> int16
929    (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)),
930    (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)),
931
932    # float32 -> int32 -> int16 ==> float32 -> int16
933    (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)),
934    (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)),
935
936    # int32 -> int16 -> float16 ==> int32 -> float16
937    (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)),
938    (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)),
939])
940
941# Clean up junk left from 8-bit integer to 16-bit integer lowering.
942optimizations.extend([
943    # The u2u16(u2u8(X)) just masks off the upper 8 bits of X.  This can be
944    # accomplished by masking the upper 8 bits of the immediate operand to the
945    # iand instruction.  Oftentimes, both patterns will end up being applied
946    # to the same original expression tree.
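    # For example, iand(u2u16(u2u8(a)), 0x1234) becomes iand(a, 0x34).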
947    (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'),               ('iand', a, ('iand', b, 0xff))),
948    (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))),
949])
950
951for op in ['iand', 'ior', 'ixor']:
952    optimizations.extend([
953        (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))),
954        (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))),
955
956        # Undistribute extract from a logic op
957        ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)),
958        ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)),
959        ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)),
960        ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)),
961
962        # Undistribute shifts from a logic op
963        ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)),
964        ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)),
965        ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)),
966    ])
967
968# Integer sizes
969for s in [8, 16, 32, 64]:
970    optimizations.extend([
971       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'),
972       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'),
973       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'),
974       (('ior',  ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'),
975       (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'),
976       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'),
977
978       # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
979       (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a),
980
981       # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits)
982       (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)),
983       (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)),
984       (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)),
985    ])
986
987optimizations.extend([
988   # Common pattern like 'if (i == 0 || i == 1 || ...)'
989   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
990   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
991   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
992
993   (('ior', a, ('ieq', a, False)), True),
994   (('ior', a, ('inot', a)), -1),
995
996   (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),
997   (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),
998
999   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
1000   # The first part of the iand comes from the !__feq64_nonnan.
1001   #
1002   # The second pattern is a reformulation of the first based on the relation
1003   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
1004   # happens to be y == 0.
1005   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0),  b), c)), ('ilt', a, 0)),
1006    ('iand', ('inot', ('iand',                         b , c)), ('ilt', a, 0))),
1007   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
1008    ('iand', ('inot', ('iand', ('ieq',             b , 0), c)), ('ilt', a, 0))),
1009
1010   # These patterns can result when (a < b || a < c) => (a < max(b, c))
1011   # transformations occur before constant propagation and loop-unrolling.
1012   #
1013   # The flt versions are exact.  If isnan(a), the original pattern is
1014   # trivially false, and the replacements are false too.  If isnan(b):
1015   #
1016   #    a < fmax(NaN, a) => a < a => false vs a < NaN => false
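   #
   # The fge variants below carry '~' because NaN breaks the equivalence in
   # the direction that returns True: for example, if a is NaN, the original
   # fge(a, ('fmin', b, a)) is false while the replacement True is not, so the
   # rewrite is only safe for inexact math.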
1017   (('flt', a, ('fmax', b, a)), ('flt', a, b)),
1018   (('flt', ('fmin', a, b), a), ('flt', b, a)),
1019   (('~fge', a, ('fmin', b, a)), True),
1020   (('~fge', ('fmax', a, b), a), True),
1021   (('flt', a, ('fmin', b, a)), False),
1022   (('flt', ('fmax', a, b), a), False),
1023   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
1024   (('~fge', ('fmin', a, b), a), ('fge', b, a)),
1025
1026   (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
1027   (('ilt', ('imin', a, b), a), ('ilt', b, a)),
1028   (('ige', a, ('imin', b, a)), True),
1029   (('ige', ('imax', a, b), a), True),
1030   (('ult', a, ('umax', b, a)), ('ult', a, b)),
1031   (('ult', ('umin', a, b), a), ('ult', b, a)),
1032   (('uge', a, ('umin', b, a)), True),
1033   (('uge', ('umax', a, b), a), True),
1034   (('ilt', a, ('imin', b, a)), False),
1035   (('ilt', ('imax', a, b), a), False),
1036   (('ige', a, ('imax', b, a)), ('ige', a, b)),
1037   (('ige', ('imin', a, b), a), ('ige', b, a)),
1038   (('ult', a, ('umin', b, a)), False),
1039   (('ult', ('umax', a, b), a), False),
1040   (('uge', a, ('umax', b, a)), ('uge', a, b)),
1041   (('uge', ('umin', a, b), a), ('uge', b, a)),
1042   (('ult', a, ('iand', b, a)), False),
1043   (('ult', ('ior', a, b), a), False),
1044   (('uge', a, ('iand', b, a)), True),
1045   (('uge', ('ior', a, b), a), True),
1046
1047   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
1048   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
1049   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
1050   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
1051   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
1052   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
1053   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
1054   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
1055   (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
1056   (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
1057   (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
1058   (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),
1059   (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),
1060   (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),
1061   (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
1062   (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),
1063
1064   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
1065   # negative.
1066   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
1067    ('iabs', ('ishr', a, b))),
1068   (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),
1069
1070   (('fabs', ('slt', a, b)), ('slt', a, b)),
1071   (('fabs', ('sge', a, b)), ('sge', a, b)),
1072   (('fabs', ('seq', a, b)), ('seq', a, b)),
1073   (('fabs', ('sne', a, b)), ('sne', a, b)),
1074   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
1075   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
1076   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
1077   (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'),
1078   (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
1079   (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
1080   (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
1081   (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
1082   (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
1083   (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
1084   (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
1085   (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
1086   (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
1087   (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
1088   (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
1089   (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
1090   (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
1091   (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
1092   (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
1093   (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
1094   (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1095   (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
1096   (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
1097   (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1098   (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1099   (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1100
1101   (('ball_iequal2', a, b), ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1102   (('ball_iequal3', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('ieq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1103   (('ball_iequal4', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1104
1105   (('bany_inequal2', a, b), ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1106   (('bany_inequal3', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ine', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1107   (('bany_inequal4', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ior', ('ine', 'a.z', 'b.z'), ('ine', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1108
1109   (('ball_fequal2', a, b), ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1110   (('ball_fequal3', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('feq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1111   (('ball_fequal4', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('iand', ('feq', 'a.z', 'b.z'), ('feq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1112
1113   (('bany_fnequal2', a, b), ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1114   (('bany_fnequal3', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('fneu', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1115   (('bany_fnequal4', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('ior', ('fneu', 'a.z', 'b.z'), ('fneu', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1116
1117   (('fneu', ('fneg', a), a), ('fneu', a, 0.0)),
1118   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
1119   # Emulating booleans
1120   (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1121   (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1122   (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),
1123   (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
1124   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
1125   (('iand', 'a@bool16', 1.0), ('b2f', a)),
1126   (('iand', 'a@bool32', 1.0), ('b2f', a)),
1127   (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
1128   # Comparison with the same args.  Note that the float versions are only
1129   # done when the source is known to be a number.  Generally, NaN cmp NaN
1130   # produces the opposite result of X cmp X.  flt is the outlier.  NaN < NaN
1131   # is false, and, for any number X, X < X is also false.
1132   (('ilt', a, a), False),
1133   (('ige', a, a), True),
1134   (('ieq', a, a), True),
1135   (('ine', a, a), False),
1136   (('ult', a, a), False),
1137   (('uge', a, a), True),
1138   (('flt', a, a), False),
1139   (('fge', 'a(is_a_number)', a), True),
1140   (('feq', 'a(is_a_number)', a), True),
1141   (('fneu', 'a(is_a_number)', a), False),
1142   # Logical and bit operations
1143   (('iand', a, a), a),
1144   (('iand', a, ~0), a),
1145   (('iand', a, 0), 0),
1146   (('ior', a, a), a),
1147   (('ior', a, 0), a),
1148   (('ior', a, True), True),
1149   (('ixor', a, a), 0),
1150   (('ixor', a, 0), a),
1151   (('inot', ('inot', a)), a),
1152   (('ior', ('iand', a, b), b), b),
1153   (('ior', ('ior', a, b), b), ('ior', a, b)),
1154   (('iand', ('ior', a, b), b), b),
1155   (('iand', ('iand', a, b), b), ('iand', a, b)),
1156   # DeMorgan's Laws
1157   (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
1158   (('ior',  ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
1159   # Shift optimizations
1160   (('ishl', 0, a), 0),
1161   (('ishl', a, 0), a),
1162   (('ishr', 0, a), 0),
1163   (('ishr', a, 0), a),
1164   (('ushr', 0, a), 0),
1165   (('ushr', a, 0), a),
1166   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
1167   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
1168   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
1169   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'),
1170   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
1171   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'),
1172   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
1173   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'),
1174   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'),
1175   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'),
1176   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'),
1177   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'),
1178   # Exponential/logarithmic identities
1179   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
1180   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
1181   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
1182   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
1183   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
1184    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
1185   (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
1186   (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
1187   (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1188   (('~fpow', a, 1.0), a),
1189   (('~fpow', a, 2.0), ('fmul', a, a)),
1190   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1191   (('~fpow', 2.0, a), ('fexp2', a)),
1192   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
1193   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
1194   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
1195   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
1196   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
1197   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
1198   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
1199   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
1200   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
1201   (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
1202   (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
1203   (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
1204   # Division and reciprocal
1205   (('~fdiv', 1.0, a), ('frcp', a)),
1206   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
1207   (('~frcp', ('frcp', a)), a),
1208   (('~frcp', ('fsqrt', a)), ('frsq', a)),
1209   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
1210   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
1211   # Trig
1212   (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
1213   (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
1214   # Boolean simplifications
1215   (('i2b16(is_used_by_if)', a), ('ine16', a, 0)),
1216   (('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
1217   (('i2b1(is_used_by_if)', a), ('ine', a, 0)),
1218   (('ieq', a, True), a),
1219   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
1220   (('ine', a, False), a),
1221   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
1222   (('bcsel', a, True, False), a),
1223   (('bcsel', a, False, True), ('inot', a)),
1224   (('bcsel', True, b, c), b),
1225   (('bcsel', False, b, c), c),
1226
1227   (('bcsel@16', a, 1.0, 0.0), ('b2f', a)),
1228   (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))),
1229   (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1230   (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1231   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
1232   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
1233   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1234   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1235   (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1236   (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1237   (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1238   (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1239
1240   (('bcsel', a, b, b), b),
1241   (('~fcsel', a, b, b), b),
1242
1243   # D3D Boolean emulation
1244   (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
1245   (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
1246   (('bcsel', a, 1, 0), ('b2i', 'a@1')),
1247   (('bcsel', a, 0, 1), ('b2i', ('inot', a))),
1248   (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1249    ('ineg', ('b2i', ('iand', a, b)))),
1250   (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
1251    ('ineg', ('b2i', ('ior', a, b)))),
1252   (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
1253   (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
1254   (('ine', ('ineg', ('b2i', 'a@1')), 0), a),
1255   (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
1256   (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
1257   (('ilt', ('ineg', ('b2i', 'a@1')), 0), a),
1258   (('ult', 0, ('ineg', ('b2i', 'a@1'))), a),
1259   (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
1260   (('iand', ('ineg', ('b2i', a)), 1),   ('b2i', a)),
1261
1262   # With D3D booleans, imax is AND and umax is OR
1263   (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1264    ('ineg', ('b2i', ('iand', a, b)))),
1265   (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1266    ('ineg', ('b2i', ('ior', a, b)))),
1267   (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1268    ('ineg', ('b2i', ('ior', a, b)))),
1269   (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1270    ('ineg', ('b2i', ('iand', a, b)))),
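   # As a quick illustration of the comment above (using the D3D encoding
   # False = 0, True = -1): imax(-1, 0) == 0 == (-1 & 0) and
   # imax(-1, -1) == -1 == (-1 & -1), while umax(-1, 0) == -1 == (-1 | 0)
   # and umin(-1, 0) == 0 == (-1 & 0).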
1271
1272   # Conversions
1273   (('i2b16', ('b2i', 'a@16')), a),
1274   (('i2b32', ('b2i', 'a@32')), a),
1275   (('f2i', ('ftrunc', a)), ('f2i', a)),
1276   (('f2u', ('ftrunc', a)), ('f2u', a)),
1277   (('i2b', ('ineg', a)), ('i2b', a)),
1278   (('i2b', ('iabs', a)), ('i2b', a)),
1279   (('inot', ('f2b1', a)), ('feq', a, 0.0)),
1280
1281   # Conversions from 16 bits to 32 bits and back can always be removed
1282   (('f2fmp', ('f2f32', 'a@16')), a),
1283   (('i2imp', ('i2i32', 'a@16')), a),
1284   (('i2imp', ('u2u32', 'a@16')), a),
1285
1286   (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)),
1287   (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)),
1288   (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)),
1289   (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)),
1290
1291   (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),
1292   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1294
1295   (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)),
1296   (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)),
1297   (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1298   (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1299
1300   # Conversions to 16 bits would be lossy so they should only be removed if
1301   # the instruction was generated by the precision lowering pass.
1302   (('f2f32', ('f2fmp', 'a@32')), a),
1303   (('i2i32', ('i2imp', 'a@32')), a),
1304   (('u2u32', ('i2imp', 'a@32')), a),
1305
1306   (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)),
1307   (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)),
1308   (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)),
1309   (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)),
1310
1311   # Conversions from float32 to float64 and back can be removed as long as
1312   # it doesn't need to be precise, since the conversion may e.g. flush denorms
1313   (('~f2f32', ('f2f64', 'a@32')), a),
1314
1315   (('ffloor', 'a(is_integral)'), a),
1316   (('fceil', 'a(is_integral)'), a),
1317   (('ftrunc', 'a(is_integral)'), a),
1318   # fract(x) = x - floor(x), so fract(NaN) = NaN
1319   (('~ffract', 'a(is_integral)'), 0.0),
1320   (('fabs', 'a(is_not_negative)'), a),
1321   (('iabs', 'a(is_not_negative)'), a),
1322   (('fsat', 'a(is_not_positive)'), 0.0),
1323
1324   (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
1325
1326   # The result of the multiply must be in [-1, 0], so the result of the ffma
1327   # must be in [0, 1].
1328   (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
1329   (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
1330   (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
1331   (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
1332
1333   (('fneu', 'a(is_not_zero)', 0.0), True),
1334   (('feq', 'a(is_not_zero)', 0.0), False),
1335
1336   # In this chart, + means value > 0 and - means value < 0.
1337   #
1338   # + >= + -> unknown  0 >= + -> false    - >= + -> false
1339   # + >= 0 -> true     0 >= 0 -> true     - >= 0 -> false
1340   # + >= - -> true     0 >= - -> true     - >= - -> unknown
1341   #
1342   # Using grouping conceptually similar to a Karnaugh map...
1343   #
1344   # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
1345   # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
1346   # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
1347   #
1348   # The flt / ilt cases just invert the expected result.
1349   #
1350   # The results expecting true must be marked imprecise.  The results
1351   # expecting false are fine because NaN compared >= or < anything is false.
1352
1353   (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True),
1354   (('fge', 'a(is_not_positive)',          'b(is_gt_zero)'),               False),
1355   (('fge', 'a(is_lt_zero)',               'b(is_not_negative)'),          False),
1356
1357   (('flt', 'a(is_not_negative)',          'b(is_not_positive)'),          False),
1358   (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'),      True),
1359   (('flt', 'a(is_a_number_lt_zero)',      'b(is_a_number_not_negative)'), True),
1360
1361   (('ine', 'a(is_not_zero)', 0), True),
1362   (('ieq', 'a(is_not_zero)', 0), False),
1363
1364   (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
1365   (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'),      False),
1366   (('ige', 'a(is_lt_zero)',      'b(is_not_negative)'), False),
1367
1368   (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
1369   (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
1370   (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
1371
1372   (('ult', 0, 'a(is_gt_zero)'), True),
1373   (('ult', a, 0), False),
1374
1375   # Packing and then unpacking does nothing
1376   (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
1377   (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
1378   (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)),
1379   (('unpack_64_2x32', ('pack_64_2x32', a)), a),
1380   (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a),
1381   (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1382                           ('unpack_64_2x32_split_y', a)), a),
1383   (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a),
1384                              ('unpack_64_2x32_split_y', a))), a),
1385   (('pack_64_2x32', ('unpack_64_2x32', a)), a),
1386   (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a),
1387
1388   # Comparing two halves of an unpack separately.  While this optimization
1389   # should be correct for non-constant values, it's less obvious that it's
1390   # useful in that case.  For constant values, the pack will fold and we're
1391   # guaranteed to reduce the whole tree to one instruction.
1392   (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),
1393             ('ieq', ('unpack_32_2x16_split_y', a), '#c')),
1394    ('ieq', a, ('pack_32_2x16_split', b, c))),
1395
1396   # Byte extraction
1397   (('ushr', 'a@16',  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1398   (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1399   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
1400   (('ishr', 'a@16',  8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1401   (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1402   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
1403   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1404
1405   # Common pattern in many Vulkan CTS tests that read 8-bit integers from a
1406   # storage buffer.
1407   (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'),
1408   (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'),
1409
1410   # Common pattern after lowering 8-bit integers to 16-bit.
1411   (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))),
1412   (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))),
1413
1414   (('ubfe', a,  0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1415   (('ubfe', a,  8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1416   (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1417   (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1418   (('ibfe', a,  0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'),
1419   (('ibfe', a,  8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1420   (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'),
1421   (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1422
1423   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
1424   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
1425
1426   # Word extraction
1427   (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1428   (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1429   (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1430   (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1431   (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
1432
1433   (('ubfe', a,  0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1434   (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1435   (('ibfe', a,  0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1436   (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1437
1438   # Packing a u8vec4 to write to an SSBO.
1439   (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))),
1440    ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'),
1441
1442   (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),
1443   (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),
1444
1445   # Lower pack/unpack
1446   (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'),
1447   (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split'),
1448   (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'),
1449   (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'),
1450   (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split'),
1451   (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split'),
1452
1453   # Useless masking before unpacking
1454   (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
1455   (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
1456   (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
1457   (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
1458   (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
1459   (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
1460
1461   (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)),
1462   (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)),
1463   (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)),
1464   (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)),
1465   (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)),
1466
1467   # Optimize half packing
1468   (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
1469   (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
1470
1471   (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1472    ('pack_half_2x16', ('vec2', a, b))),
1473   (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1474    ('pack_half_2x16', ('vec2', a, b))),
1475
1476   (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)),
1477   (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)),
1478   (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)),
1479
1480   (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1481   (('ior',  ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1482
1483   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)),
1484   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)),
1485   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),
1486   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)),
1487   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)),
1488   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)),
1489   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)),
1490   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)),
1491])
1492
1493# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
1494# patterns like those below.
1495for op in ('ushr', 'ishr'):
1496   optimizations.extend([(('extract_u8', (op, 'a@16',  8),     0), ('extract_u8', a, 1))])
1497   optimizations.extend([(('extract_u8', (op, 'a@32',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
1498   optimizations.extend([(('extract_u8', (op, 'a@64',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
1499
1500optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
1501
1502# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
1503# patterns like those below.
1504for op in ('extract_u8', 'extract_i8'):
1505   optimizations.extend([((op, ('ishl', 'a@16',      8),     1), (op, a, 0))])
1506   optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
1507   optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
1508
1509optimizations.extend([
1510   # Subtracts
1511   (('ussub_4x8_vc4', a, 0), a),
1512   (('ussub_4x8_vc4', a, ~0), 0),
1513   # Lower all Subtractions first - they can get recombined later
1514   (('fsub', a, b), ('fadd', a, ('fneg', b))),
1515   (('isub', a, b), ('iadd', a, ('ineg', b))),
1516   (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1517   # This is correct.  We don't need isub_sat because the result type is unsigned, so it cannot overflow.
1518   (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1519
1520   # Propagate negation up multiplication chains
1521   (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
1522   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
1523   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
1524
1525   # Propagate constants up multiplication chains
1526   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
1527   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
1528   (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)),
1529   # Prefer moving out a multiplication for more MAD/FMA-friendly code
1530   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),
1531   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
1532   (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)),
1533   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
1534
1535   # Reassociate constants in add/mul chains so they can be folded together.
1536   # For now, we mostly only handle cases where the constants are separated by
1537   # a single non-constant.  We could do better eventually.
1538   (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
1539   (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)),
1540   (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
1541   (('~fadd', '#a',          ('fadd', 'b(is_not_const)', '#c')),  ('fadd', ('fadd', a,          c),           b)),
1542   (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
1543   (('~fadd', '#a',          ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffma',          b,  c, ('fadd', a,          d))),
1544   (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
1545   (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
1546   (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
1547   (('ior',  '#a', ('ior',  'b(is_not_const)', '#c')), ('ior',  ('ior',  a, c), b)),
1548   (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
1549
1550   # Reassociate add chains for more MAD/FMA-friendly code
1551   (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)),
1552
1553   # Drop mul-div by the same value when there's no wrapping.
1554   (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
1555
1556   # By definition...
1557   (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
1558   (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1559   (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1560
1561   (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
1562   (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1563   (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1564
1565   (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
1566
1567   (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
1568   (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
1569   (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),
1570
1571   (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
1572   (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)),
1573
1574   # Misc. lowering
1575   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
1576   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
1577   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
1578   (('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
1579
1580   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1581    ('bcsel', ('ult', 31, 'bits'), 'insert',
1582              ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
1583    'options->lower_bitfield_insert'),
1584   (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1585   (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1586   (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1587   (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1588   (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1589   (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1590   (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1591   (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
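   # Illustrative check of the halving-add identity used above (values chosen
   # purely for illustration): a = 7, b = 5 gives
   # (a & b) + ((a ^ b) >> 1) == 5 + 1 == 6 == (7 + 5) / 2, computed without
   # forming the full a + b that could overflow.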
1592
1593   (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1594   (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'),
1595   (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_uadd_sat'),
1596   (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1597
1598   # int64_t sum = a + b;
1599   #
1600   # if (a < 0 && b < 0 && a < sum) {
1601   #    sum = INT64_MIN;
1602   # } else if (a >= 0 && b >= 0 && sum < a) {
1603   #    sum = INT64_MAX;
1604   # }
1605   #
1606   # A couple optimizations are applied.
1607   #
1608   # 1. a < sum => sum >= 0.  This replacement works because it is known that
1609   #    a < 0 and b < 0, so sum should also be < 0 unless there was
1610   #    underflow.
1611   #
1612   # 2. sum < a => sum < 0.  This replacement works because it is known that
1613   #    a >= 0 and b >= 0, so sum should also be >= 0 unless there was
1614   #    overflow.
1615   #
1616   # 3. Invert the second if-condition and swap the order of parameters for
1617   #    the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
1618   #    0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
1619   #
1620   # On Intel Gen11, this saves ~11 instructions.
1621   (('iadd_sat@64', a, b), ('bcsel',
1622                            ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1623                            0x8000000000000000,
1624                            ('bcsel',
1625                             ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1626                             ('iadd', a, b),
1627                             0x7fffffffffffffff)),
1628    '(options->lower_int64_options & nir_lower_iadd64) != 0'),
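   #
   # A worked example of the pattern above (operands chosen purely for
   # illustration): a = INT64_MAX, b = 1.  The wrapping sum is INT64_MIN, so
   # the first condition (a < 0 && b < 0 && sum >= 0) is false and we fall
   # through to the inner bcsel; its condition (a < 0 || b < 0 || sum >= 0)
   # is also false, so it selects INT64_MAX, the saturated result.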
1629
1630   # int64_t sum = a - b;
1631   #
1632   # if (a < 0 && b >= 0 && a < sum) {
1633   #    sum = INT64_MIN;
1634   # } else if (a >= 0 && b < 0 && a >= sum) {
1635   #    sum = INT64_MAX;
1636   # }
1637   #
1638   # Optimizations similar to the iadd_sat case are applied here.
1639   (('isub_sat@64', a, b), ('bcsel',
1640                            ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1641                            0x8000000000000000,
1642                            ('bcsel',
1643                             ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1644                             ('isub', a, b),
1645                             0x7fffffffffffffff)),
1646    '(options->lower_int64_options & nir_lower_iadd64) != 0'),
1647
1648   # These are done here instead of in the backend because the int64 lowering
1649   # pass will make a mess of the patterns.  The first patterns are
1650   # conditioned on nir_lower_minmax64 because it was not clear that it was
1651   # always an improvement on platforms that have real int64 support.  No
1652   # shaders in shader-db hit this, so it was hard to say one way or the
1653   # other.
1654   (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1655   (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1656   (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1657   (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1658   (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1659   (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1660
1661   (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1662   (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1663   # 0u < uint(a) <=> uint(a) != 0u
1664   (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1665
1666   # Alternative lowering that doesn't rely on bfi.
1667   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1668    ('bcsel', ('ult', 31, 'bits'),
1669     'insert',
1670    (('ior',
1671     ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
1672     ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),
1673    'options->lower_bitfield_insert_to_shifts'),
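   # For instance (operands chosen purely for illustration): with offset = 4
   # and bits = 8, ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset') is the
   # mask ((1 << 8) - 1) << 4 == 0x0ff0, so the replacement computes
   # (base & ~0x0ff0) | ((insert << 4) & 0x0ff0).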
1674
1675   # Alternative lowering that uses bitfield_select.
1676   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1677    ('bcsel', ('ult', 31, 'bits'), 'insert',
1678              ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
1679    'options->lower_bitfield_insert_to_bitfield_select'),
1680
1681   (('ibitfield_extract', 'value', 'offset', 'bits'),
1682    ('bcsel', ('ult', 31, 'bits'), 'value',
1683              ('ibfe', 'value', 'offset', 'bits')),
1684    'options->lower_bitfield_extract'),
1685
1686   (('ubitfield_extract', 'value', 'offset', 'bits'),
1687    ('bcsel', ('ult', 31, 'bits'), 'value',
1688              ('ubfe', 'value', 'offset', 'bits')),
1689    'options->lower_bitfield_extract'),
1690
1691   # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0.
1692   (('bitfield_select', a, b, 0), ('iand', a, b)),
1693   (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)),
1694
1695   # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
1696   (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
1697   (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
1698   (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
1699   (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
1700   (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
1701   (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
1702
1703   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
1704   #
1705   #    If bits is zero, the result will be zero.
1706   #
1707   # These patterns prevent other patterns from generating invalid results
1708   # when count is zero.
1709   (('ubfe', a, b, 0), 0),
1710   (('ibfe', a, b, 0), 0),
1711
1712   (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))),
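   # (Illustration: if the shift count is taken modulo 32, ineg(b) behaves as
   # 32 - b for a constant 0 < b < 32, so the ushr above produces a mask of
   # the low b bits; e.g. b = 5 gives 0xffffffff >> 27 == 0x1f.)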
1713
1714   (('b2i32', ('i2b', ('ubfe', a, b, 1))), ('ubfe', a, b, 1)),
1715   (('b2i32', ('i2b', ('ibfe', a, b, 1))), ('ubfe', a, b, 1)), # ubfe in the replacement is correct
1716   (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1717   (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1718   (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1719   (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1720
1721   (('ibitfield_extract', 'value', 'offset', 'bits'),
1722    ('bcsel', ('ieq', 0, 'bits'),
1723     0,
1724     ('ishr',
1725       ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
1726       ('isub', 32, 'bits'))),
1727    'options->lower_bitfield_extract_to_shifts'),
1728
1729   (('ubitfield_extract', 'value', 'offset', 'bits'),
1730    ('iand',
1731     ('ushr', 'value', 'offset'),
1732     ('bcsel', ('ieq', 'bits', 32),
1733      0xffffffff,
1734      ('isub', ('ishl', 1, 'bits'), 1))),
1735    'options->lower_bitfield_extract_to_shifts'),
1736
1737   (('ifind_msb', 'value'),
1738    ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
1739    'options->lower_ifind_msb'),
1740
1741   (('ifind_msb', 'value'),
1742    ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0),
1743     ('isub', 31, ('ifind_msb_rev', 'value')),
1744     ('ifind_msb_rev', 'value')),
1745    'options->lower_find_msb_to_reverse'),
1746
1747   (('ufind_msb', 'value'),
1748    ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0),
1749     ('isub', 31, ('ufind_msb_rev', 'value')),
1750     ('ufind_msb_rev', 'value')),
1751    'options->lower_find_msb_to_reverse'),
1752
1753   (('find_lsb', 'value'),
1754    ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
1755    'options->lower_find_lsb'),
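   # ('iand', 'value', ('ineg', 'value')) isolates the lowest set bit, so the
   # position of its most significant set bit equals find_lsb of the original
   # value.  Illustrative example: value = 0b01100 gives value & -value ==
   # 0b00100, and ufind_msb(0b00100) == 2 == find_lsb(0b01100).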
1756
1757   (('extract_i8', a, 'b@32'),
1758    ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
1759    'options->lower_extract_byte'),
1760
1761   (('extract_u8', a, 'b@32'),
1762    ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
1763    'options->lower_extract_byte'),
1764
1765   (('extract_i16', a, 'b@32'),
1766    ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
1767    'options->lower_extract_word'),
1768
1769   (('extract_u16', a, 'b@32'),
1770    ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
1771    'options->lower_extract_word'),
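   # Illustrative example of the shift-based extracts above (32-bit a, b = 2):
   # extract_i8(a, 2) becomes ishr(ishl(a, 8), 24), which moves byte 2 into
   # the top byte and then arithmetic-shifts it back down to sign-extend it;
   # the unsigned forms mask with iand after ushr instead.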
1772
1773    (('pack_unorm_2x16', 'v'),
1774     ('pack_uvec2_to_uint',
1775        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
1776     'options->lower_pack_unorm_2x16'),
1777
1778    (('pack_unorm_4x8', 'v'),
1779     ('pack_uvec4_to_uint',
1780        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
1781     'options->lower_pack_unorm_4x8'),
1782
1783    (('pack_snorm_2x16', 'v'),
1784     ('pack_uvec2_to_uint',
1785        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
1786     'options->lower_pack_snorm_2x16'),
1787
1788    (('pack_snorm_4x8', 'v'),
1789     ('pack_uvec4_to_uint',
1790        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
1791     'options->lower_pack_snorm_4x8'),
1792
1793    (('unpack_unorm_2x16', 'v'),
1794     ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
1795                                  ('extract_u16', 'v', 1))),
1796              65535.0),
1797     'options->lower_unpack_unorm_2x16'),
1798
1799    (('unpack_unorm_4x8', 'v'),
1800     ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
1801                                  ('extract_u8', 'v', 1),
1802                                  ('extract_u8', 'v', 2),
1803                                  ('extract_u8', 'v', 3))),
1804              255.0),
1805     'options->lower_unpack_unorm_4x8'),
1806
1807    (('unpack_snorm_2x16', 'v'),
1808     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
1809                                                            ('extract_i16', 'v', 1))),
1810                                           32767.0))),
1811     'options->lower_unpack_snorm_2x16'),
1812
1813    (('unpack_snorm_4x8', 'v'),
1814     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
1815                                                            ('extract_i8', 'v', 1),
1816                                                            ('extract_i8', 'v', 2),
1817                                                            ('extract_i8', 'v', 3))),
1818                                           127.0))),
1819     'options->lower_unpack_snorm_4x8'),
1820
1821   (('pack_half_2x16_split', 'a@32', 'b@32'),
1822    ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
1823    'options->lower_pack_split'),
1824
1825   (('unpack_half_2x16_split_x', 'a@32'),
1826    ('f2f32', ('u2u16', a)),
1827    'options->lower_pack_split'),
1828
1829   (('unpack_half_2x16_split_y', 'a@32'),
1830    ('f2f32', ('u2u16', ('ushr', a, 16))),
1831    'options->lower_pack_split'),
1832
1833   (('pack_32_2x16_split', 'a@16', 'b@16'),
1834    ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)),
1835    'options->lower_pack_split'),
1836
1837   (('unpack_32_2x16_split_x', 'a@32'),
1838    ('u2u16', a),
1839    'options->lower_pack_split'),
1840
1841   (('unpack_32_2x16_split_y', 'a@32'),
1842    ('u2u16', ('ushr', 'a', 16)),
1843    'options->lower_pack_split'),
1844
1845   (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
1846   (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
1847   (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
1848   # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0
1849   # Mark the new comparisons precise to prevent them being changed to 'a !=
1850   # 0' or 'a == 0'.
1851   (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'),
1852
1853   # Address/offset calculations:
1854   # Drivers supporting imul24 should use the nir_lower_amul() pass; this
1855   # rule converts everyone else to imul:
1856   (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
1857
1858   (('umul24', a, b),
1859    ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
1860    '!options->has_umul24'),
1861   (('umad24', a, b, c),
1862    ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
1863    '!options->has_umad24'),
1864
1865   # Relaxed 24bit ops
1866   (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'),
1867   (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'),
1868   (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'),
1869   (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),
1870   (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),
1871   (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),
1872
1873   (('imad24_ir3', a, b, 0), ('imul24', a, b)),
1874   (('imad24_ir3', a, 0, c), (c)),
1875   (('imad24_ir3', a, 1, c), ('iadd', a, c)),
1876
1877   # if first two srcs are const, crack apart the imad so constant folding
1878   # can clean up the imul:
1879   # TODO ffma should probably get a similar rule:
1880   (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
1881
1882   # These will turn 24b address/offset calc back into 32b shifts, but
1883   # it should be safe to get back some of the bits of precision that we
1884   # already decided were not necessary:
1885   (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
1886   (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
1887   (('imul24', a, 0), (0)),
1888
1889   (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
1890   (('fcsel', ('slt', a, 0), b, c), ('fcsel_ge', a, c, b), "options->has_fused_comp_and_csel"),
1891   (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
1892   (('fcsel', ('sge', 0, a), b, c), ('fcsel_gt', a, c, b), "options->has_fused_comp_and_csel"),
1893
1894   (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel"),
1895   (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel"),
1896   (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel"),
1897   (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel"),
1898
1899   (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
1900   (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, c, b), "options->has_fused_comp_and_csel"),
1901   (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
1902   (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, c, b), "options->has_fused_comp_and_csel"),
1903
1904])
1905
1906# bit_size dependent lowerings
1907for bit_size in [8, 16, 32, 64]:
1908   # convenience constants
1909   intmax = (1 << (bit_size - 1)) - 1
1910   intmin = 1 << (bit_size - 1)
1911
1912   optimizations += [
1913      (('iadd_sat@' + str(bit_size), a, b),
1914       ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
1915                                ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),
1916      (('isub_sat@' + str(bit_size), a, b),
1917       ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
1918                                ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),
1919   ]
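# For the 8-bit case above, for example, intmax == 127 and intmin == 128;
# 128 is the bit pattern 0x80, which is INT8_MIN once it is treated as a
# signed 8-bit constant.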
1920
1921invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])
1922
1923for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
1924   optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
1925                         ('iand', (invert[left], a, b), (invert[right], c, d))))
1926   optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
1927                         ('ior', (invert[left], a, b), (invert[right], c, d))))
1928
1929# Optimize x2bN(b2x(x)) -> x
1930for size in type_sizes('bool'):
1931    aN = 'a@' + str(size)
1932    f2bN = 'f2b' + str(size)
1933    i2bN = 'i2b' + str(size)
1934    optimizations.append(((f2bN, ('b2f', aN)), a))
1935    optimizations.append(((i2bN, ('b2i', aN)), a))
1936
1937# Optimize x2yN(b2x(x)) -> b2y
1938for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
1939   if x != 'f' and y != 'f' and x != y:
1940      continue
1941
1942   b2x = 'b2f' if x == 'f' else 'b2i'
1943   b2y = 'b2f' if y == 'f' else 'b2i'
1944   x2yN = '{}2{}'.format(x, y)
1945   optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
1946
1947# Optimize away x2xN(a@N)
1948for t in ['int', 'uint', 'float', 'bool']:
1949   for N in type_sizes(t):
1950      x2xN = '{0}2{0}{1}'.format(t[0], N)
1951      aN = 'a@{0}'.format(N)
1952      optimizations.append(((x2xN, aN), a))
1953
1954# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
1955# In particular, we can optimize away everything except upcast of downcast and
1956# upcasts where the type differs from the other cast
1957for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
1958   if N < M:
1959      # The outer cast is a down-cast.  It doesn't matter what the size of the
1960      # argument of the inner cast is because we'll never be in the upcast
1961      # of downcast case.  Regardless of types, we'll always end up with y2yN
1962      # in the end.
1963      for x, y in itertools.product(['i', 'u'], ['i', 'u']):
1964         x2xN = '{0}2{0}{1}'.format(x, N)
1965         y2yM = '{0}2{0}{1}'.format(y, M)
1966         y2yN = '{0}2{0}{1}'.format(y, N)
1967         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
1968   elif N > M:
1969      # If the outer cast is an up-cast, we have to be more careful about the
1970      # size of the argument of the inner cast and with types.  In this case,
1971      # the type is always the type of the up-cast, which is given by the
1972      # outer cast.
1973      for P in type_sizes('uint'):
1974         # We can't optimize away up-cast of down-cast.
1975         if M < P:
1976            continue
1977
1978         # Because we're doing up-cast of up-cast (or up-cast of a same-size
1979         # cast), the types always have to match between the two casts
1980         for x in ['i', 'u']:
1981            x2xN = '{0}2{0}{1}'.format(x, N)
1982            x2xM = '{0}2{0}{1}'.format(x, M)
1983            aP = 'a@{0}'.format(P)
1984            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
1985   else:
1986      # The N == M case is handled by other optimizations
1987      pass
1988
1989# Downcast operations should be able to see through pack
1990for t in ['i', 'u']:
1991    for N in [8, 16, 32]:
1992        x2xN = '{0}2{0}{1}'.format(t, N)
1993        optimizations += [
1994            ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
1996        ]
1997
1998# Optimize comparisons with up-casts
1999for t in ['int', 'uint', 'float']:
2000    for N, M in itertools.product(type_sizes(t), repeat=2):
2001        if N == 1 or N >= M:
2002            continue
2003
2004        cond = 'true'
2005        if N == 8:
2006            cond = 'options->support_8bit_alu'
2007        elif N == 16:
2008            cond = 'options->support_16bit_alu'
2009        x2xM = '{0}2{0}{1}'.format(t[0], M)
2010        x2xN = '{0}2{0}{1}'.format(t[0], N)
2011        aN = 'a@' + str(N)
2012        bN = 'b@' + str(N)
2013        xeq = 'feq' if t == 'float' else 'ieq'
2014        xne = 'fneu' if t == 'float' else 'ine'
2015        xge = '{0}ge'.format(t[0])
2016        xlt = '{0}lt'.format(t[0])
2017
2018        # Up-casts are lossless so for correctly signed comparisons of
2019        # up-casted values we can do the comparison at the largest of the two
2020        # original sizes and drop one or both of the casts.  (We have
2021        # optimizations to drop the no-op casts which this may generate.)
2022        for P in type_sizes(t):
2023            if P == 1 or P > N:
2024                continue
2025
2026            bP = 'b@' + str(P)
2027            optimizations += [
2028                ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
2029                ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
2030                ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
2031                ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
2032                ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
2033                ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
2034            ]
2035
2036        # The next bit doesn't work on floats because the range checks would
2037        # get way too complicated.
2038        if t in ['int', 'uint']:
2039            if t == 'int':
2040                xN_min = -(1 << (N - 1))
2041                xN_max = (1 << (N - 1)) - 1
2042            elif t == 'uint':
2043                xN_min = 0
2044                xN_max = (1 << N) - 1
2045            else:
2046                assert False
2047
2048            # If we're up-casting and comparing to a constant, we can unfold
2049            # the comparison into a comparison with the shrunk down constant
2050            # and a check that the constant fits in the smaller bit size.
2051            optimizations += [
2052                ((xeq, (x2xM, aN), '#b'),
2053                 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
2054                ((xne, (x2xM, aN), '#b'),
2055                 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
2056                ((xlt, (x2xM, aN), '#b'),
2057                 ('iand', (xlt, xN_min, b),
2058                          ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
2059                ((xlt, '#a', (x2xM, bN)),
2060                 ('iand', (xlt, a, xN_max),
2061                          ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
2062                ((xge, (x2xM, aN), '#b'),
2063                 ('iand', (xge, xN_max, b),
2064                          ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
2065                ((xge, '#a', (x2xM, bN)),
2066                 ('iand', (xge, a, xN_min),
2067                          ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
2068            ]
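            # For example, with t == 'uint', N == 8 and M == 32 the first rule
            # turns ieq(u2u32(a@8), #b) into
            # iand(ieq(a, u2u8(b)), ieq(u2u32(u2u8(b)), b)); for b == 300 the
            # second operand constant-folds to ieq(44, 300) == false, so the
            # whole comparison folds to false.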
2069
2070# Convert masking followed by signed downcast to just unsigned downcast
2071optimizations += [
2072    (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
2073    (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
2074    (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
2075    (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
2076    (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
2077    (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
2078]
2079
2080# Some operations such as iadd have the property that the bottom N bits of the
2081# output only depend on the bottom N bits of each of the inputs, so we can
2082# remove casts
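# For example, if only the low 8 bits of an iadd result are consumed,
# iadd(i2i32(u2u8(a@32)), b) can use 'a' directly and becomes iadd(a, b).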
2083for N in [16, 32]:
2084    for M in [8, 16]:
2085        if M >= N:
2086            continue
2087
2088        aN = 'a@' + str(N)
2089        u2uM = 'u2u{0}'.format(M)
2090        i2iM = 'i2i{0}'.format(M)
2091
2092        for x in ['u', 'i']:
2093            x2xN = '{0}2{0}{1}'.format(x, N)
2094            extract_xM = 'extract_{0}{1}'.format(x, M)
2095
2096            x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)
2097            extract_xM_M_bits = \
2098                '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)
2099            optimizations += [
2100                ((x2xN_M_bits, (u2uM, aN)), a),
2101                ((extract_xM_M_bits, aN, 0), a),
2102            ]
2103
2104            bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)
2105            optimizations += [
2106                ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),
2107                ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),
2108                ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),
2109            ]
2110
2111            for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
2112                op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)
2113                optimizations += [
2114                    ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),
2115                    ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),
2116                    ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),
2117                ]
2118
2119def fexp2i(exp, bits):
2120   # Generate an expression which constructs value 2.0^exp or 0.0.
2121   #
2122   # We assume that exp is already in a valid range:
2123   #
2124   #   * [-15, 15] for 16-bit float
2125   #   * [-127, 127] for 32-bit float
2126   #   * [-1023, 1023] for 64-bit float
2127   #
2128   # If exp is the lowest value in the valid range, a value of 0.0 is
2129   # constructed.  Otherwise, the value 2.0^exp is constructed.
2130   if bits == 16:
2131      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
2132   elif bits == 32:
2133      return ('ishl', ('iadd', exp, 127), 23)
2134   elif bits == 64:
2135      return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
2136   else:
2137      assert False
2138
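# Illustrative sanity check of the 32-bit case above (not part of the generated
# pass): for exponent 5 the constructed bit pattern, (5 + 127) << 23, is the
# IEEE-754 binary32 encoding of 2.0**5.
assert struct.unpack('<f', struct.pack('<I', (5 + 127) << 23))[0] == 32.0
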
2139def ldexp(f, exp, bits):
2140   # The maximum possible range for a normal exponent is [-126, 127] and,
2141   # throwing in denormals, you get a maximum range of [-149, 127].  This
2142   # means that we can potentially have a swing of +-276.  If you start with
2143   # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
2144   # all the way to zero.  The GLSL spec only requires that we handle a subset
2145   # of this range.  From version 4.60 of the spec:
2146   #
2147   #    "If exp is greater than +128 (single-precision) or +1024
2148   #    (double-precision), the value returned is undefined. If exp is less
2149   #    than -126 (single-precision) or -1022 (double-precision), the value
2150   #    returned may be flushed to zero. Additionally, splitting the value
2151   #    into a significand and exponent using frexp() and then reconstructing
2152   #    a floating-point value using ldexp() should yield the original input
2153   #    for zero and all finite non-denormalized values."
2154   #
2155   # The SPIR-V spec has similar language.
2156   #
2157   # In order to handle the maximum value +128 using the fexp2i() helper
2158   # above, we have to split the exponent in half and do two multiply
2159   # operations.
2160   #
2161   # First, we clamp exp to a reasonable range.  Specifically, we clamp to
2162   # twice the full range that is valid for the fexp2i() function above.  If
2163   # exp/2 is the bottom value of that range, the fexp2i() expression will
2164   # yield 0.0f which, when multiplied by f, will flush it to zero which is
2165   # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
2166   # value is clamped from above, then it must have been above the supported
2167   # range of the GLSL built-in and therefore any return value is acceptable.
2168   if bits == 16:
2169      exp = ('imin', ('imax', exp, -30), 30)
2170   elif bits == 32:
2171      exp = ('imin', ('imax', exp, -254), 254)
2172   elif bits == 64:
2173      exp = ('imin', ('imax', exp, -2046), 2046)
2174   else:
2175      assert False
2176
2177   # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
2178   # (We use ishr which isn't the same for -1, but the -1 case still works
2179   # since we use exp-exp/2 as the second exponent.)  While the spec
2180   # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
2181   # work with denormals and doesn't allow for the full swing in exponents
2182   # that you can get with normalized values.  Instead, we create two powers
2183   # of two and multiply by them each in turn.  That way the effective range
2184   # of our exponent is doubled.
2185   pow2_1 = fexp2i(('ishr', exp, 1), bits)
2186   pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
2187   return ('fmul', ('fmul', f, pow2_1), pow2_2)
2188
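# Worked example of the split above: for a 32-bit ldexp with exp = 200,
# exp >> 1 == 100 and exp - (exp >> 1) == 100, so the result is
# (f * 2.0**100) * 2.0**100.  Each factor is within fexp2i()'s valid range even
# though 2.0**200 itself is not representable as a 32-bit float.
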
2189optimizations += [
2190   (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
2191   (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
2192   (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
2193]
2194
2195# Unreal Engine 4 demo applications open-code bitfieldReverse()
2196def bitfield_reverse(u):
2197    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
2198    step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
2199    step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
2200    step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
2201    step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
2202
2203    return step5
2204
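# Illustrative check of the same swap network on plain Python integers; the
# helper below is only an example and is not used to build any patterns.
def _bitfield_reverse_example(u):
    u = ((u << 16) | (u >> 16)) & 0xffffffff
    u = (((u & 0x00ff00ff) << 8) | ((u & 0xff00ff00) >> 8)) & 0xffffffff
    u = (((u & 0x0f0f0f0f) << 4) | ((u & 0xf0f0f0f0) >> 4)) & 0xffffffff
    u = (((u & 0x33333333) << 2) | ((u & 0xcccccccc) >> 2)) & 0xffffffff
    u = (((u & 0x55555555) << 1) | ((u & 0xaaaaaaaa) >> 1)) & 0xffffffff
    return u

assert _bitfield_reverse_example(0x00000001) == 0x80000000
assert _bitfield_reverse_example(0x80000000) == 0x00000001
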
2205optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2206
2207# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)"
2208# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)"
2209for ncomp in [2, 3, 4, 8, 16]:
2210   optimizations += [
2211      (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)),
2212      (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)),
2213      (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)),
2214      (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)),
2215   ]
2216
2217# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
2218# then the "a == a" is redundant because it's equivalent to "a is not NaN"
2219# and, if a is a NaN then the second comparison will fail anyway.
2220for op in ['flt', 'fge', 'feq']:
2221   optimizations += [
2222      (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
2223      (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
2224   ]
2225
2226# Add optimizations to handle the case where the result of a ternary is
2227# compared to a constant.  This way we can take things like
2228#
2229# (a ? 0 : 1) > 0
2230#
2231# and turn it into
2232#
2233# a ? (0 > 0) : (1 > 0)
2234#
2235# which constant folding will eat for lunch.  The resulting ternary will
2236# further get cleaned up by the boolean reductions above and we will be
2237# left with just the original variable "a".
2238for op in ['feq', 'fneu', 'ieq', 'ine']:
2239   optimizations += [
2240      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
2241       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
2242   ]
2243
2244for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']:
2245   optimizations += [
2246      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
2247       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
2248      ((op, '#d', ('bcsel', a, '#b', '#c')),
2249       ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
2250   ]
2251
2252
2253# For example, this converts things like
2254#
2255#    1 + mix(0, a - 1, condition)
2256#
2257# into
2258#
2259#    mix(1, (a-1)+1, condition)
2260#
2261# Other optimizations will rearrange the constants.
2262for op in ['fadd', 'fmul', 'iadd', 'imul']:
2263   optimizations += [
2264      ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
2265   ]
2266
2267# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
2268# states:
2269#
2270#     If neither layout qualifier is specified, derivatives in compute shaders
2271#     return zero, which is consistent with the handling of built-in texture
2272#     functions like texture() in GLSL 4.50 compute shaders.
2273for op in ['fddx', 'fddx_fine', 'fddx_coarse',
2274           'fddy', 'fddy_fine', 'fddy_coarse']:
2275   optimizations += [
2276      ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
2277   ]
2278
2279# Some optimizations for ir3-specific instructions.
2280optimizations += [
2281   # 'al * bl': If either 'al' or 'bl' is zero, return zero.
2282   (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
2283   # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.
2284   (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
2285   (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
2286]
2287
2288# These kinds of sequences can occur after nir_opt_peephole_select.
2289#
2290# NOTE: fadd is not handled here because that gets in the way of ffma
2291# generation in the i965 driver.  Instead, fadd and ffma are handled in
2292# late_optimizations.
2293
2294for op in ['flrp']:
2295    optimizations += [
2296        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2297        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2298        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2299        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2300        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
2301        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
2302    ]
2303
2304for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
2305    optimizations += [
2306        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
2307        (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2308        (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2309        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
2310    ]
2311
2312for op in ['fpow']:
2313    optimizations += [
2314        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2315        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2316        (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
2317        (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
2318    ]
2319
2320for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fneg', 'fabs']:
2321    optimizations += [
2322        (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
2323    ]
2324
2325for op in ['ineg', 'iabs', 'inot', 'isign']:
2326    optimizations += [
2327        ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),
2328    ]
2329
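# The lowering below relies on the IEEE-754 binary32 layout: shifting left by
# one drops the sign bit and leaves the biased exponent in bits 24..31.  Adding
# 0x1000000 increments that exponent field, so the sum exceeds 0x1ffffff
# exactly when the original exponent was in [1, 254], i.e. when the value is a
# normal float (exponent 255 wraps around and fails the test).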
2330optimizations.extend([
2331    (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal')
2332    ])
2333
2334# This section contains optimizations to propagate downsizing conversions of
2335# constructed vectors into vectors of downsized components. Whether this is
2336# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
2337# this reduces the register pressure of the vector itself and often enables the
2338# conversions to be eliminated via other algebraic rules or constant folding.
2339# In the worst case on a SIMD architecture, the propagated conversions may be
2340# revectorized via nir_opt_vectorize so instruction count is minimally
2341# impacted.
2342#
2343# On a machine with SIMD-within-a-register only, this actually
2344# counterintuitively hurts instruction count. These are the same machines that
2345# require vectorize_vec2_16bit, so we predicate the optimizations on that flag
2346# not being set.
2347#
2348# Finally for scalar architectures, there should be no difference in generated
2349# code since it all ends up scalarized at the end, but it might minimally help
2350# compile-times.
2351
2352for i in range(2, 4 + 1):
2353   for T in ('f', 'u', 'i'):
2354      vec_inst = ('vec' + str(i),)
2355
2356      indices = ['a', 'b', 'c', 'd']
2357      suffix_in = tuple((indices[j] + '@32') for j in range(i))
2358
2359      to_16 = '{}2{}16'.format(T, T)
2360      to_mp = '{}2{}mp'.format(T, T)
2361
2362      out_16 = tuple((to_16, indices[j]) for j in range(i))
2363      out_mp = tuple((to_mp, indices[j]) for j in range(i))
2364
2365      optimizations  += [
2366         ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),
2367      ]
2368      # u2ump doesn't exist, because it's equal to i2imp
2369      if T in ['f', 'i']:
2370          optimizations  += [
2371             ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
2372          ]
2373
2374# This section contains "late" optimizations that should be run before
2375# creating ffmas and calling regular optimizations for the final time.
2376# Optimizations should go here if they help code generation and conflict
2377# with the regular optimizations.
2378before_ffma_optimizations = [
2379   # Propagate constants down multiplication chains
2380   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
2381   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
2382   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
2383   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),
2384
2385   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
2386   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
2387   (('~fadd', ('fneg', a), a), 0.0),
2388   (('iadd', ('ineg', a), a), 0),
2389   (('iadd', ('ineg', a), ('iadd', a, b)), b),
2390   (('iadd', a, ('iadd', ('ineg', a), b)), b),
2391   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
2392   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
2393
2394   (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
2395   (('~flrp', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
2396   (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
2397]
2398
2399# This section contains "late" optimizations that should be run after the
2400# regular optimizations have finished.  Optimizations should go here if
2401# they help code generation but do not necessarily produce code that is
2402# more easily optimizable.
2403late_optimizations = [
2404   # The rearrangements are fine w.r.t. NaN.  However, they produce incorrect
2405   # results if one operand is +Inf and the other is -Inf.
2406   #
2407   # 1. Inf + -Inf = NaN
2408   # 2. ∀x: x + NaN = NaN and x - NaN = NaN
2409   # 3. ∀x: x != NaN = true
2410   # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false
2411   #
2412   #               a=Inf, b=-Inf   a=-Inf, b=Inf    a=NaN    b=NaN
2413   #  (a+b) < 0        false            false       false    false
2414   #      a < -b       false            false       false    false
2415   # -(a+b) < 0        false            false       false    false
2416   #     -a < b        false            false       false    false
2417   #  (a+b) >= 0       false            false       false    false
2418   #      a >= -b      true             true        false    false
2419   # -(a+b) >= 0       false            false       false    false
2420   #     -a >= b       true             true        false    false
2421   #  (a+b) == 0       false            false       false    false
2422   #      a == -b      true             true        false    false
2423   #  (a+b) != 0       true             true        true     true
2424   #      a != -b      false            false       true     true
2425   (('flt',                        ('fadd(is_used_once)', a, b),  0.0), ('flt',          a, ('fneg', b))),
2426   (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a),         b)),
2427   (('flt', 0.0,                        ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a),         b)),
2428   (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt',          a, ('fneg', b))),
2429   (('~fge',                        ('fadd(is_used_once)', a, b),  0.0), ('fge',          a, ('fneg', b))),
2430   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a),         b)),
2431   (('~fge', 0.0,                        ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a),         b)),
2432   (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge',          a, ('fneg', b))),
2433   (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))),
2434   (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))),
2435
2436   # If either source must be finite, then the original (a+b) cannot produce
2437   # NaN due to Inf-Inf.  The patterns and the replacements produce the same
2438   # result if b is NaN. Therefore, the replacements are exact.
2439   (('fge',                        ('fadd(is_used_once)', 'a(is_finite)', b),  0.0), ('fge',          a, ('fneg', b))),
2440   (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a),         b)),
2441   (('fge', 0.0,                        ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a),         b)),
2442   (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge',          a, ('fneg', b))),
2443   (('feq',  ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq',  a, ('fneg', b))),
2444   (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))),
2445
2446   # This is how SpvOpFOrdNotEqual might be implemented.  Replace it with
2447   # SpvOpLessOrGreater.
2448   (('iand', ('fneu', a, b),   ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))),
2449   (('iand', ('fneu', a, 0.0),          ('feq', a, a)                ), ('!flt', 0.0, ('fabs', a))),
2450
2451   # This is how SpvOpFUnordEqual might be implemented.  Replace it with
2452   # !SpvOpLessOrGreater.
2453   (('ior', ('feq', a, b),   ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))),
2454   (('ior', ('feq', a, 0.0),         ('fneu', a, a),                ), ('inot', ('!flt', 0.0, ('fabs', a)))),
2455
2456   # nir_lower_to_source_mods will collapse this, but its existence during the
2457   # optimization loop can prevent other optimizations.
2458   (('fneg', ('fneg', a)), a),
2459
2460   # Subtractions get lowered during optimization, so we need to recombine them
2461   (('fadd', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
2462   (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'),
2463   (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'),
2464   (('ineg', a), ('isub', 0, a), 'options->lower_ineg'),
2465   (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
2466   (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
2467   (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
2468   (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
2469
2470   (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'),
2471   (('iadd', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), c), 'options->has_iadd3'),
2472   (('isub', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), ('ineg', c)), 'options->has_iadd3'),
2473
2474   # These are duplicated from the main optimizations table.  The late
2475   # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
2476   # new patterns like these.  The patterns that compare with zero are removed
2477# because they are unlikely to be created by anything in
2478   # late_optimizations.
2479   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
2480   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
2481   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
2482   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
2483
2484   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
2485
2486   (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
2487
2488   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
2489   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
2490   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
2491   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
2492   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
2493   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
2494   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
2495   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
2496   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
2497   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
2498
2499   (('ior', a, a), a),
2500   (('iand', a, a), a),
2501
2502   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
2503
2504   (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'),
2505   (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'),
2506   (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'),
2507   (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
2508
2509   (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
2510
2511   # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
2512   # particular operation is common for expanding values stored in a texture
2513   # from [0,1] to [-1,1].
2514   (('~ffma@32', a,  2.0, -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
2515   (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
2516   (('~ffma@32', a, -2.0,  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
2517   (('~ffma@32', a,  2.0,  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
2518   (('~fadd@32', ('fmul(is_used_once)',  2.0, a), -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
2519   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
2520   (('~fadd@32', ('fmul(is_used_once)', -2.0, a),  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
2521   (('~fadd@32', ('fmul(is_used_once)',  2.0, a),  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
2522
2523    # flrp(a, b, a)
2524    # a*(1-a) + b*a
2525    # a + -a*a + a*b    (1)
2526    # a + a*(b - a)
2527    # Option 1: ffma(a, (b-a), a)
2528    #
2529    # Alternately, after (1):
2530    # a*(1+b) + -a*a
2531    # a*((1+b) + -a)
2532    #
2533    # Let b=1
2534    #
2535    # Option 2: ffma(a, 2, -(a*a))
2536    # Option 3: ffma(a, 2, (-a)*a)
2537    # Option 4: ffma(a, -a, (2*a))
2538    # Option 5: a * (2 - a)
2539    #
2540    # There are a lot of other possible combinations.
2541   (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
2542   (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
2543   (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
2544   (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
2545   (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))),    ('flrp', a, 1.0, a), '!options->lower_flrp32'),
2546
2547   # we do these late so that we don't get in the way of creating ffmas
2548   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
2549   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
2550
2551   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
2552   # op(b, d)) => op(b, bcsel(a, c, d)) transformations.  I do not know why.
2553   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
2554    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),
2555
2556   # Things that look like DPH in the source shader may get expanded to
2557   # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
2558   # to NIR.  After FFMA is generated, this can look like:
2559   #
2560   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
2561   #
2562   # Reassociate the last addition into the first multiplication.
2563   #
2564   # Some shaders do not use 'invariant' in vertex and (possibly) geometry
2565   # shader stages on some outputs that are intended to be invariant.  For
2566   # various reasons, this optimization may not be fully applied in all
2567   # shaders used for different rendering passes of the same geometry.  This
2568   # can result in Z-fighting artifacts (at best).  For now, disable this
2569   # optimization in these stages.  See bugzilla #111490.  In tessellation
2570   # stages applications seem to use 'precise' when necessary, so allow the
2571   # optimization in those stages.
2572   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
2573    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
2574   (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
2575    ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
2576   (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
2577    ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
2578
2579   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
2580   #
2581   #    If bits is zero, the result will be zero.
2582   #
2583   # These prevent the next two lowerings generating incorrect results when
2584   # count is zero.
2585   (('ubfe', a, b, 0), 0),
2586   (('ibfe', a, b, 0), 0),
2587
2588   # On Intel GPUs, BFE is a 3-source instruction.  Like all 3-source
2589   # instructions on Intel GPUs, it cannot have immediate values as
2590   # sources.  There are also limitations on source register strides.  As a
2591   # result, it is very easy for a 3-source instruction combined with either
2592   # loads of immediate values or copies from weird register strides to be
2593   # more expensive than the primitive instructions it represents.
2594   (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'),
2595
2596   # b is the lowest order bit to be extracted and c is the number of bits to
2597   # extract.  The inner shift removes the bits above b + c by shifting left
2598   # 32 - (b + c).  ishl only sees the low 5 bits of the shift count, which is
2599   # -(b + c).  The outer shift moves the bit that was at b to bit zero.
2600   # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c.
2601   # This means that it must be shifted right by 32 - c or -c bits.
2602   (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'),
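   # For example, with b = 8 and c = 16 the ubfe lowering above produces
   # iand(ushr(0xffffffff, -16), ushr(a, 8)); the shift count is read modulo
   # 32, so the mask is 0x0000ffff.  The ibfe lowering produces
   # ishr(ishl(a, -(8 + 16)), -16), i.e. shift left by 8 then arithmetic shift
   # right by 16, which sign-extends the extracted 16-bit field.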
2603
2604   # Clean up no-op shifts that may result from the bfe lowerings.
2605   (('ishl', a, 0), a),
2606   (('ishl', a, -32), a),
2607   (('ishr', a, 0), a),
2608   (('ishr', a, -32), a),
2609   (('ushr', a, 0), a),
2610
2611   (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)),
2612   (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)),
2613   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
2614   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
2615]
2616
2617# A few more extract cases we'd rather leave late
2618for N in [16, 32]:
2619    aN = 'a@{0}'.format(N)
2622
2623    for x in ['u', 'i']:
2624        x2xN = '{0}2{0}{1}'.format(x, N)
2625        extract_x8 = 'extract_{0}8'.format(x)
2626        extract_x16 = 'extract_{0}16'.format(x)
2627
2628        late_optimizations.extend([
2629            ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
2630            ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
2631        ])
2632
2633        if N > 16:
2634            late_optimizations.extend([
2635                ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
2636                ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
2637            ])
2638
2639# Byte insertion
2640late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
2641late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
2642late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte'))
2643
2644late_optimizations += [
2645   # Word insertion
2646   (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'),
2647
2648   # Extract and then insert
2649   (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)),
2650   (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)),
2651]
2652
2653# Integer sizes
2654for s in [8, 16, 32, 64]:
2655    late_optimizations.extend([
2656        (('iand', ('ine(is_used_once)', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0)),
2657        (('ior',  ('ieq(is_used_once)', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0)),
2658    ])
2659
2660# Float sizes
2661for s in [16, 32, 64]:
2662    late_optimizations.extend([
2663       (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)),
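       # The rule above rewrites 1.0 + c*(b - 1.0) as (1.0 - c) + b*c, which is
       # the open-coded form of flrp(1.0, b, c).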
2664       (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))),
2665    ])
2666
2667for op in ['fadd']:
2668    late_optimizations += [
2669        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2670        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2671    ]
2672
2673for op in ['ffma']:
2674    late_optimizations += [
2675        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2676        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2677
2678        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2679        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2680    ]
2681
2682# mediump: If an opcode is surrounded by conversions, remove the conversions.
2683# The rationale is that type conversions + the low precision opcode are more
2684# expensive than the same arithmetic opcode at higher precision.
2685#
2686# This must be done in late optimizations, because we need normal optimizations to
2687# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
2688#
2689# Unary opcodes
2690for op in ['fabs', 'fceil', 'fcos', 'fddx', 'fddx_coarse', 'fddx_fine', 'fddy',
2691           'fddy_coarse', 'fddy_fine', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
2692           'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:
2693    late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]
2694
2695# Binary opcodes
2696for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:
2697    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]
2698
2699# Ternary opcodes
2700for op in ['ffma', 'flrp']:
2701    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]
2702
2703# Comparison opcodes
2704for op in ['feq', 'fge', 'flt', 'fneu']:
2705    late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]
2706
2707# Do this last, so that the f2fmp patterns above have effect.
2708late_optimizations += [
2709  # Convert *2*mp instructions to concrete *2*16 instructions. At this point
2710  # any conversions that could have been removed will have been removed in
2711  # nir_opt_algebraic so any remaining ones are required.
2712  (('f2fmp', a), ('f2f16', a)),
2713  (('f2imp', a), ('f2i16', a)),
2714  (('f2ump', a), ('f2u16', a)),
2715  (('i2imp', a), ('i2i16', a)),
2716  (('i2fmp', a), ('i2f16', a)),
2717  (('i2imp', a), ('u2u16', a)),
2718  (('u2fmp', a), ('u2f16', a)),
2719  (('fisfinite', a), ('flt', ('fabs', a), float("inf"))),
2720]
2721
2722distribute_src_mods = [
2723   # Try to remove some spurious negations rather than pushing them down.
2724   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
2725   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
2726   (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
2727   (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
2728   (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),
2729   (('fneg', ('fneg', a)), a),
2730
2731   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
2732   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),
2733
2734   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
2735   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
2736   (('fneg', ('fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
2737
2738   # Note that fmin <-> fmax.  I don't think there is a way to distribute
2739   # fabs() into fmin or fmax.
2740   (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
2741   (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
2742
2743   (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)),
2744   (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)),
2745   (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)),
2746
2747   # fdph works mostly like fdot, but to get the correct result, the negation
2748   # must be applied to the second source.
2749   (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
2750
2751   (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
2752   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
2753]
2754
2755print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
2756print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
2757                                  before_ffma_optimizations).render())
2758print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
2759                                  late_optimizations).render())
2760print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
2761                                  distribute_src_mods).render())
2762