nir_opt_algebraic.py revision 7ec681f3
# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Jason Ekstrand (jason@jlekstrand.net)

from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'

signed_zero_inf_nan_preserve_16 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 16)'
signed_zero_inf_nan_preserve_32 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 32)'

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified.  For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size.  In the search half of the expression this indicates that it
# should only match that particular bit-size.  In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
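#
# As an illustration of the syntax, the rule
#
#    (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c)))
#
# (which appears in the list below) matches an inexact a*b + a*c and rewrites
# it as a*(b + c).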
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle.  If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").

# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
   x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
   x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
   return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)

def intBitsToFloat(i):
   return struct.unpack('!f', struct.pack('!I', i))[0]

optimizations = [

   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
   (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
   (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
   (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),

   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('udiv', a, 1), a),
   (('idiv', a, 1), a),
   (('umod', a, 1), 0),
   (('imod', a, 1), 0),
   (('imod', a, -1), 0),
   (('irem', a, 1), 0),
   (('irem', a, -1), 0),
   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
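   # E.g. irem(-5, 4) -> -5 - ((-5 + 3) & -4) -> -5 - (-4) -> -1, matching
   # C-style truncated division, where the remainder takes the sign of 'a'.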
   (('irem', a, '#b(is_pos_power_of_two)'),
    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
    '!options->lower_bitops'),
   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),

   (('~fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('f2b', ('fneg', a)), ('f2b', a)),
   (('i2b', ('ineg', a)), ('i2b', a)),
   (('~fadd', a, 0.0), a),
   # a+0.0 is 'a' unless 'a' is denormal or -0.0.  If it's only used by a
   # floating point instruction, they should flush any input denormals and we
   # can replace -0.0 with 0.0 if the float execution mode allows it.
   (('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_inf_nan_preserve_16),
   (('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_inf_nan_preserve_32),
   (('iadd', a, 0), a),
   (('usadd_4x8_vc4', a, 0), a),
   (('usadd_4x8_vc4', a, ~0), ~0),
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
   (('~fmul', a, 0.0), 0.0),
   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
   (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),
   (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, ~0), a),
   (('~fmul', a, 1.0), a),
   # The only effect a*1.0 can have is flushing denormals. If it's only used by
   # a floating point instruction, they should flush any input denormals and
   # this multiplication isn't needed.
   (('fmul(is_only_used_as_float)', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   (('~ffma', 0.0, a, b), b),
   (('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
   (('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_16),
   (('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),

   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),

   (('sdot_4x8_iadd', a, 0, b), b),
   (('udot_4x8_uadd', a, 0, b), b),
   (('sdot_4x8_iadd_sat', a, 0, b), b),
   (('udot_4x8_uadd_sat', a, 0, b), b),
   (('sdot_2x16_iadd', a, 0, b), b),
   (('udot_2x16_uadd', a, 0, b), b),
   (('sdot_2x16_iadd_sat', a, 0, b), b),
   (('udot_2x16_uadd_sat', a, 0, b), b),

   # sudot_4x8_iadd is not commutative at all, so the patterns must be
   # duplicated with zeros in each of the first two source positions.
   (('sudot_4x8_iadd', a, 0, b), b),
   (('sudot_4x8_iadd', 0, a, b), b),
   (('sudot_4x8_iadd_sat', a, 0, b), b),
   (('sudot_4x8_iadd_sat', 0, a, b), b),

   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),

   # Try to let constant folding eliminate the dot-product part.  These are
   # safe because the dot product cannot overflow 32 bits.
   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
]

# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
                                 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
                        ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
                                 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
                                 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
                        ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
                                 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
                                  ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
                         ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
                                  ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
                         ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
                         ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))

optimizations.extend([
   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_dot_4x8'),
   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_dot_4x8'),
   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),

   # For the unsigned dot-product, the largest possible value is 4*(255*255) =
   # 0x3f804, so we don't have to worry about that intermediate result
   # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
   # that is less than 0xfffc07fc, then the result cannot overflow ever.
   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_dot_4x8'),

   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
   # don't have to worry about that intermediate result overflowing or
   # underflowing.
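   # Only the final accumulate can saturate, so expanding the dot product with
   # ordinary adds and using a single iadd_sat at the end preserves the result.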
   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_dot_4x8'),

   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),

   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
])

# Float sizes
for s in [16, 32, 64]:
   optimizations.extend([
      (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

      (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
      (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
      (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),

      (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
      # These are the same as the previous three rules, but it depends on
      # 1-fsat(x) <=> fsat(1-x).  See below.
      (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
      (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

      (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
      (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

      (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
      (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),

      # These two aren't flrp lowerings, but do appear in some shaders.
      (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
      (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),

      # 1 - ((1 - a) * (1 - b))
      # 1 - (1 - a - b + a*b)
      # 1 - 1 + a + b - a*b
      # a + b - a*b
      # a + b*(1 - a)
      # b*(1 - a) + 1*a
      # flrp(b, 1, a)
      (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
   ])

optimizations.extend([
   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
   (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),
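   # E.g. with #b = 6, #c = 20, #d = 2: (a*6 + 20) << 2 becomes a*24 + 80,
   # and both new constants fold at compile time.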

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])

# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
for s in [8, 16, 32, 64]:
   mask = (1 << s) - 1

   ishl = "ishl@{}".format(s)
   ishr = "ishr@{}".format(s)
   ushr = "ushr@{}".format(s)

   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

   optimizations.extend([
      ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
      ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

      # To get -1 for large shifts of negative values, ishr must instead
      # clamp the shift count to the maximum value.
      ((ishr, (ishr, a, '#b'), '#c'),
       (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
   ])

# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4.  This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.

for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
   a_sz = 'a@{}'.format(size)

   optimizations.extend([
      # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
      (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
      (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),

      # This does not trivially work with ishr.
      (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
   ])

for log2 in range(1, 7): # powers of two from 2 to 64
   v = 1 << log2
   mask = 0xffffffff & ~(v - 1)
   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

   optimizations.extend([
      # Reassociate for improved CSE
      (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
   ])

# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32).  In addition, a couple rules inside the
# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
   lo_mask = 0xffffffff >> i
   hi_mask = (0xffffffff << i) & 0xffffffff

   optimizations.extend([
      # This pattern seems to only help in the soft-fp64 code.
      (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
#      (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
#      (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

      (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
      (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
#      (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
   ])

optimizations.extend([
   # This is common for address calculations.  Reassociating may enable the
   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

   # (a + #b) * #c => (a * #c) + (#b * #c)
   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),

   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),

   # Comparison simplifications
   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),

   (('iand', ('feq', a, b), ('fneu', a, b)), False),
   (('iand', ('flt', a, b), ('flt', b, a)), False),
   (('iand', ('ieq', a, b), ('ine', a, b)), False),
   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
   (('iand', ('ult', a, b), ('ult', b, a)), False),

   # This helps some shaders because, after some optimizations, they end up
   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
   # matching would be handled by CSE.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

   # b < fsat(NaN) -> b < 0 -> false, and b < NaN -> false.
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),

   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),

   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),

   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
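   # These fsat comparison rules are written without '~' because, as the case
   # analyses above show, the original and the replacement also agree when 'a'
   # is NaN.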
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),
   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0), 0.0), ('iand', a, b)),
   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fneu', ('b2f', 'a@1'), ('b2f', 'b@1')), ('ixor', a, b)),
   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq', ('b2f', 'a@1'), ('b2f', 'b@1')), ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f.  This just
   # breaks that apart.
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
   (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
   (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
   (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
   (('~fneu', ('fadd(is_used_once)', a, '#b'), '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),

   # Cannot remove the addition from ilt or ige due to overflow.
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),

   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('fneu', ('b2f', 'a@1'), 0.0), a),
   (('ieq', ('b2i', 'a@1'), 0), ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0), a),

   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead?
   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a) because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
   #
   # This should be NaN safe.
   #
   # NaN >= 0 && 1 >= NaN -> false && false -> false
   #
   # vs.
   #
   # NaN == fsat(NaN) -> NaN == 0 -> false
   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),

   # Note: fmin(-a, -b) == -fmax(a, b)
   (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))),
   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
   (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),

   # fmin(b2f(a), b)
   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
   #
   # Since b is a constant, constant folding will eliminate the fmin and the
   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
   (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)),
   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, True, b), ('ior', a, b)),
   (('bcsel', a, a, b), ('ior', a, b)),
   (('bcsel', a, b, False), ('iand', a, b)),
   (('bcsel', a, b, a), ('iand', a, b)),
   (('~fmin', a, a), a),
   (('~fmax', a, a), a),
   (('imin', a, a), a),
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umin', a, 0), 0),
   (('umin', a, -1), a),
   (('umax', a, a), a),
   (('umax', a, 0), a),
   (('umax', a, -1), -1),
   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
   (('umax', ('umax', a, b), b), ('umax', a, b)),
   (('imax', ('imax', a, b), b), ('imax', a, b)),
   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
   (('umin', ('umin', a, b), b), ('umin', a, b)),
   (('imin', ('imin', a, b), b), ('imin', a, b)),
   (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),
   (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),
   (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),
   (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),
   (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),
   (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),
])

for N in [8, 16, 32, 64]:
   b2iN = 'b2i{0}'.format(N)
   optimizations.extend([
      (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)),
      (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)),
   ])

for N in [16, 32, 64]:
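   # b2f yields exactly 0.0 or 1.0, so comparing two b2f results for equality
   # is the same as comparing the booleans themselves.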
   b2fN = 'b2f{0}'.format(N)
   optimizations.extend([
      (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),
      (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),
   ])

# Integer sizes
for s in [8, 16, 32, 64]:
   optimizations.extend([
      (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),

      # Simplify logic to detect sign of an integer.
      (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ige', a, 0)),
      (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
      (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ilt', a, 0)),
      (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
      (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
      (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
      (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
      (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
      (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
      (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
      (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
      (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
   ])

optimizations.extend([
   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
   (('~fmin', a, ('fabs', a)), a),
   (('imin', a, ('iabs', a)), a),
   (('~fmax', a, ('fneg', ('fabs', a))), a),
   (('imax', a, ('ineg', ('iabs', a))), a),
   (('fmax', a, ('fabs', a)), ('fabs', a)),
   (('imax', a, ('iabs', a)), ('iabs', a)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
   # fsat(a) returns 0.0.
   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
   (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
   # representing this in the optimizations other than the usual ~.
   (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0.  Mark
   # the new comparison precise to prevent it being changed to 'a != 0'.
   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   # Both the left and right patterns are "b" when isnan(a), so this is exact.
   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

   # max(-min(b, a), b) -> max(abs(b), -a)
   # min(-max(b, a), b) -> min(-abs(b), -a)
   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),

   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
   # fsat(b-a).
   #
   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
   #
   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
    ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'),

   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),

   # The ior versions are exact because fmin and fmax will always pick a
   # non-NaN value, if one exists.  Therefore (a < NaN) || (a < c) == a <
   # fmax(NaN, c) == a < c.  Mark the fmin or fmax in the replacement as exact
   # to prevent other optimizations from ruining the "NaN cleansing" property
   # of the fmin or fmax.
   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),

   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),

   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 ||
   # a.y < 0.0 || a.y > 1.0 || ...  These patterns rearrange and replace in a
   # single step.  Doing just the replacement can lead to an infinite loop as
   # the pattern is repeatedly applied to the result of the previous
   # application of the pattern.
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),

   # This is how SpvOpFOrdNotEqual might be implemented.  If both values are
   # numbers, then it can be replaced with fneu.
   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),
])

# Float sizes
for s in [16, 32, 64]:
   optimizations.extend([
      # These derive from the previous patterns with the application of b < 0 <=>
      # 0 < -b.  The transformation should be applied if either comparison is
      # used once as this ensures that the number of comparisons will not
      # increase.  The sources to the ior and iand are not symmetric, so the
      # rules have to be duplicated to get this behavior.
      (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
      (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
      (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
      (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
      (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
      (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
      (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
      (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),

      # The (i2f32, ...) part is an open-coded fsign.  When that is combined
      # with the bcsel, it's basically copysign(1.0, a).  There are some
      # behavior differences between this pattern and copysign w.r.t. ±0 and
      # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
      # to x, regardless of whether either or both values are NaN.
      #
      # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
      #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
      # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
      #             int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
      #
      # For all other values of 'a', the original and replacement behave as
      # copysign.
      #
      # Marking the replacement comparisons as precise prevents any future
      # optimizations from replacing either of the comparisons with the
      # logical-not of the other.
      #
      # Note: Use b2i32 in the replacement because some platforms that
      # support fp16 don't support int16.
      (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
       ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),

      (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),

      # The C spec says, "If the value of the integral part cannot be represented
      # by the integer type, the behavior is undefined."  "Undefined" can mean
      # "the conversion doesn't happen at all."
      (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),

      # Ironically, mark these as imprecise because removing the conversions may
      # preserve more precision than doing the conversions (e.g.,
      # uint(float(0x81818181u)) == 0x81818200).
      (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
      (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
      (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
      (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),

      (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), '!options->lower_fsign'),
      (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), '!options->lower_fsign'),
   ])

   # float? -> float? -> floatS ==> float? -> floatS
   (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),

   # int? -> float? -> floatS ==> int? -> floatS
   (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
   (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),

   # float? -> float? -> intS ==> float? -> intS
   (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
   (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),

   for B in [32, 64]:
      if s < B:
         optimizations.extend([
            # S = smaller, B = bigger
            # typeS -> typeB -> typeS ==> identity
            (('f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),
            (('i2i{}'.format(s), ('i2i{}'.format(B), 'a@{}'.format(s))), a),
            (('u2u{}'.format(s), ('u2u{}'.format(B), 'a@{}'.format(s))), a),

            # bool1 -> typeB -> typeS ==> bool1 -> typeS
            (('f2f{}'.format(s), ('b2f{}'.format(B), 'a@1')), ('b2f{}'.format(s), a)),
            (('i2i{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),
            (('u2u{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),

            # floatS -> floatB -> intB ==> floatS -> intB
            (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
            (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),

            # int? -> floatB -> floatS ==> int? -> floatS
            (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),
            (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),

            # intS -> intB -> floatB ==> intS -> floatB
            (('u2f{}'.format(B), ('u2u{}'.format(B), 'a@{}'.format(s))), ('u2f{}'.format(B), a)),
            (('i2f{}'.format(B), ('i2i{}'.format(B), 'a@{}'.format(s))), ('i2f{}'.format(B), a)),
         ])

# mediump variants of the above
optimizations.extend([
   # int32 -> float32 -> float16 ==> int32 -> float16
   (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)),
   (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)),

   # float32 -> float16 -> int16 ==> float32 -> int16
   (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)),
   (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)),

   # float32 -> int32 -> int16 ==> float32 -> int16
   (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)),
   (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)),

   # int32 -> int16 -> float16 ==> int32 -> float16
   (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)),
   (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)),
])

# Clean up junk left from 8-bit integer to 16-bit integer lowering.
optimizations.extend([
   # The u2u16(u2u8(X)) just masks off the upper 8 bits of X.  This can be
   # accomplished by masking the upper 8 bits of the immediate operand to the
   # iand instruction.  Oftentimes, both patterns will end up being applied
   # to the same original expression tree.
   (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'), ('iand', a, ('iand', b, 0xff))),
   (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))),
])

for op in ['iand', 'ior', 'ixor']:
   optimizations.extend([
      (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))),
      (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))),

      # Undistribute extract from a logic op
      ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)),
      ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)),
      ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)),
      ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)),

      # Undistribute shifts from a logic op
      ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)),
      ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)),
      ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)),
   ])

# Integer sizes
for s in [8, 16, 32, 64]:
   optimizations.extend([
      (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'),
      (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'),
      (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'),
      (('ior', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'),
      (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'),
      (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'),

      # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
      (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a),

      # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits)
      (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)),
      (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)),
      (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)),
   ])

optimizations.extend([
   # Common pattern like 'if (i == 0 || i == 1 || ...)'
   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),

   (('ior', a, ('ieq', a, False)), True),
   (('ior', a, ('inot', a)), -1),

   (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),
   (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),

   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
   # The first part of the iand comes from the !__feq64_nonnan.
   #
   # The second pattern is a reformulation of the first based on the relation
   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
   # happens to be y == 0.
   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)),
    ('iand', ('inot', ('iand', b, c)), ('ilt', a, 0))),
   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
    ('iand', ('inot', ('iand', ('ieq', b, 0), c)), ('ilt', a, 0))),

   # These patterns can result when (a < b || a < c) => (a < min(b, c))
   # transformations occur before constant propagation and loop-unrolling.
   #
   # The flt versions are exact.  If isnan(a), the original pattern is
   # trivially false, and the replacements are false too.  If isnan(b):
If isnan(b): 1015 # 1016 # a < fmax(NaN, a) => a < a => false vs a < NaN => false 1017 (('flt', a, ('fmax', b, a)), ('flt', a, b)), 1018 (('flt', ('fmin', a, b), a), ('flt', b, a)), 1019 (('~fge', a, ('fmin', b, a)), True), 1020 (('~fge', ('fmax', a, b), a), True), 1021 (('flt', a, ('fmin', b, a)), False), 1022 (('flt', ('fmax', a, b), a), False), 1023 (('~fge', a, ('fmax', b, a)), ('fge', a, b)), 1024 (('~fge', ('fmin', a, b), a), ('fge', b, a)), 1025 1026 (('ilt', a, ('imax', b, a)), ('ilt', a, b)), 1027 (('ilt', ('imin', a, b), a), ('ilt', b, a)), 1028 (('ige', a, ('imin', b, a)), True), 1029 (('ige', ('imax', a, b), a), True), 1030 (('ult', a, ('umax', b, a)), ('ult', a, b)), 1031 (('ult', ('umin', a, b), a), ('ult', b, a)), 1032 (('uge', a, ('umin', b, a)), True), 1033 (('uge', ('umax', a, b), a), True), 1034 (('ilt', a, ('imin', b, a)), False), 1035 (('ilt', ('imax', a, b), a), False), 1036 (('ige', a, ('imax', b, a)), ('ige', a, b)), 1037 (('ige', ('imin', a, b), a), ('ige', b, a)), 1038 (('ult', a, ('umin', b, a)), False), 1039 (('ult', ('umax', a, b), a), False), 1040 (('uge', a, ('umax', b, a)), ('uge', a, b)), 1041 (('uge', ('umin', a, b), a), ('uge', b, a)), 1042 (('ult', a, ('iand', b, a)), False), 1043 (('ult', ('ior', a, b), a), False), 1044 (('uge', a, ('iand', b, a)), True), 1045 (('uge', ('ior', a, b), a), True), 1046 1047 (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))), 1048 (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))), 1049 (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))), 1050 (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))), 1051 (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))), 1052 (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))), 1053 (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))), 1054 (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))), 1055 (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))), 1056 (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))), 1057 (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))), 1058 (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))), 1059 (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))), 1060 (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))), 1061 (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))), 1062 (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))), 1063 1064 # Thanks to sign extension, the ishr(a, b) is negative if and only if a is 1065 # negative. 
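   # For instance (illustrative values): with a = -20 and b = 2, ishr(-20, 2)
   # is -5, the a < 0 case selects ineg(-5) = 5, and the replacement
   # iabs(ishr(-20, 2)) = iabs(-5) = 5 as well.  Selecting on a < 0 is the
   # same as selecting on ishr(a, b) < 0, which is exactly what iabs computes.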
1066 (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)), 1067 ('iabs', ('ishr', a, b))), 1068 (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)), 1069 1070 (('fabs', ('slt', a, b)), ('slt', a, b)), 1071 (('fabs', ('sge', a, b)), ('sge', a, b)), 1072 (('fabs', ('seq', a, b)), ('seq', a, b)), 1073 (('fabs', ('sne', a, b)), ('sne', a, b)), 1074 (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), 1075 (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), 1076 (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), 1077 (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'), 1078 (('seq', ('seq', a, b), 1.0), ('seq', a, b)), 1079 (('seq', ('sne', a, b), 1.0), ('sne', a, b)), 1080 (('seq', ('slt', a, b), 1.0), ('slt', a, b)), 1081 (('seq', ('sge', a, b), 1.0), ('sge', a, b)), 1082 (('sne', ('seq', a, b), 0.0), ('seq', a, b)), 1083 (('sne', ('sne', a, b), 0.0), ('sne', a, b)), 1084 (('sne', ('slt', a, b), 0.0), ('slt', a, b)), 1085 (('sne', ('sge', a, b), 0.0), ('sge', a, b)), 1086 (('seq', ('seq', a, b), 0.0), ('sne', a, b)), 1087 (('seq', ('sne', a, b), 0.0), ('seq', a, b)), 1088 (('seq', ('slt', a, b), 0.0), ('sge', a, b)), 1089 (('seq', ('sge', a, b), 0.0), ('slt', a, b)), 1090 (('sne', ('seq', a, b), 1.0), ('sne', a, b)), 1091 (('sne', ('sne', a, b), 1.0), ('seq', a, b)), 1092 (('sne', ('slt', a, b), 1.0), ('sge', a, b)), 1093 (('sne', ('sge', a, b), 1.0), ('slt', a, b)), 1094 (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1095 (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'), 1096 (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'), 1097 (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1098 (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1099 (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1100 1101 (('ball_iequal2', a, b), ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1102 (('ball_iequal3', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('ieq', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1103 (('ball_iequal4', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1104 1105 (('bany_inequal2', a, b), ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1106 (('bany_inequal3', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ine', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1107 (('bany_inequal4', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ior', ('ine', 'a.z', 'b.z'), ('ine', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1108 1109 (('ball_fequal2', a, b), ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1110 (('ball_fequal3', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('feq', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1111 (('ball_fequal4', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('iand', ('feq', 'a.z', 'b.z'), ('feq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1112 1113 (('bany_fnequal2', a, b), ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), 
'options->lower_vector_cmp'), 1114 (('bany_fnequal3', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('fneu', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1115 (('bany_fnequal4', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('ior', ('fneu', 'a.z', 'b.z'), ('fneu', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1116 1117 (('fneu', ('fneg', a), a), ('fneu', a, 0.0)), 1118 (('feq', ('fneg', a), a), ('feq', a, 0.0)), 1119 # Emulating booleans 1120 (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1121 (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1122 (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1123 (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 1124 (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))), 1125 (('iand', 'a@bool16', 1.0), ('b2f', a)), 1126 (('iand', 'a@bool32', 1.0), ('b2f', a)), 1127 (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. 1128 # Comparison with the same args. Note that these are only done for the 1129 # float versions when the source must be a number. Generally, NaN cmp NaN 1130 # produces the opposite result of X cmp X. flt is the outlier. NaN < NaN 1131 # is false, and, for any number X, X < X is also false. 1132 (('ilt', a, a), False), 1133 (('ige', a, a), True), 1134 (('ieq', a, a), True), 1135 (('ine', a, a), False), 1136 (('ult', a, a), False), 1137 (('uge', a, a), True), 1138 (('flt', a, a), False), 1139 (('fge', 'a(is_a_number)', a), True), 1140 (('feq', 'a(is_a_number)', a), True), 1141 (('fneu', 'a(is_a_number)', a), False), 1142 # Logical and bit operations 1143 (('iand', a, a), a), 1144 (('iand', a, ~0), a), 1145 (('iand', a, 0), 0), 1146 (('ior', a, a), a), 1147 (('ior', a, 0), a), 1148 (('ior', a, True), True), 1149 (('ixor', a, a), 0), 1150 (('ixor', a, 0), a), 1151 (('inot', ('inot', a)), a), 1152 (('ior', ('iand', a, b), b), b), 1153 (('ior', ('ior', a, b), b), ('ior', a, b)), 1154 (('iand', ('ior', a, b), b), b), 1155 (('iand', ('iand', a, b), b), ('iand', a, b)), 1156 # DeMorgan's Laws 1157 (('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, b))), 1158 (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))), 1159 # Shift optimizations 1160 (('ishl', 0, a), 0), 1161 (('ishl', a, 0), a), 1162 (('ishr', 0, a), 0), 1163 (('ishr', a, 0), a), 1164 (('ushr', 0, a), 0), 1165 (('ushr', a, 0), a), 1166 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'), 1167 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'), 1168 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'), 1169 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'), 1170 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'), 1171 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'), 1172 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'), 1173 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'), 1174 (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'), 1175 (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, 
('isub', 32, b))), 'options->lower_rotate'), 1176 (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'), 1177 (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'), 1178 # Exponential/logarithmic identities 1179 (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a 1180 (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a 1181 (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) 1182 (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b 1183 (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), 1184 ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d 1185 (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)), 1186 (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)), 1187 (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1188 (('~fpow', a, 1.0), a), 1189 (('~fpow', a, 2.0), ('fmul', a, a)), 1190 (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1191 (('~fpow', 2.0, a), ('fexp2', a)), 1192 (('~fpow', ('fpow', a, 2.2), 0.454545), a), 1193 (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), 1194 (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), 1195 (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), 1196 (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), 1197 (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), 1198 (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))), 1199 (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), 1200 (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), 1201 (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), 1202 (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))), 1203 (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)), 1204 # Division and reciprocal 1205 (('~fdiv', 1.0, a), ('frcp', a)), 1206 (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), 1207 (('~frcp', ('frcp', a)), a), 1208 (('~frcp', ('fsqrt', a)), ('frsq', a)), 1209 (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), 1210 (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), 1211 # Trig 1212 (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'), 1213 (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'), 1214 # Boolean simplifications 1215 (('i2b16(is_used_by_if)', a), ('ine16', a, 0)), 1216 (('i2b32(is_used_by_if)', a), ('ine32', a, 0)), 1217 (('i2b1(is_used_by_if)', a), ('ine', a, 0)), 1218 (('ieq', a, True), a), 1219 (('ine(is_not_used_by_if)', a, True), ('inot', a)), 1220 (('ine', a, False), a), 1221 (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')), 1222 (('bcsel', a, True, False), a), 1223 (('bcsel', a, False, True), ('inot', a)), 1224 (('bcsel', True, b, c), b), 1225 (('bcsel', False, b, c), c), 1226 1227 (('bcsel@16', a, 1.0, 0.0), ('b2f', a)), 1228 (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))), 1229 (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1230 (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1231 (('bcsel@32', a, 1.0, 0.0), ('b2f', a)), 1232 (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))), 1233 (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1234 (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1235 (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & 
nir_lower_fp64_full_software)'), 1236 (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1237 (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1238 (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1239 1240 (('bcsel', a, b, b), b), 1241 (('~fcsel', a, b, b), b), 1242 1243 # D3D Boolean emulation 1244 (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))), 1245 (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))), 1246 (('bcsel', a, 1, 0), ('b2i', 'a@1')), 1247 (('bcsel', a, 0, 1), ('b2i', ('inot', a))), 1248 (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1249 ('ineg', ('b2i', ('iand', a, b)))), 1250 (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))), 1251 ('ineg', ('b2i', ('ior', a, b)))), 1252 (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 1253 (('ieq', ('ineg', ('b2i', 'a@1')), -1), a), 1254 (('ine', ('ineg', ('b2i', 'a@1')), 0), a), 1255 (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), 1256 (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 1257 (('ilt', ('ineg', ('b2i', 'a@1')), 0), a), 1258 (('ult', 0, ('ineg', ('b2i', 'a@1'))), a), 1259 (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), 1260 (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), 1261 1262 # With D3D booleans, imax is AND and umax is OR 1263 (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1264 ('ineg', ('b2i', ('iand', a, b)))), 1265 (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1266 ('ineg', ('b2i', ('ior', a, b)))), 1267 (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1268 ('ineg', ('b2i', ('ior', a, b)))), 1269 (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1270 ('ineg', ('b2i', ('iand', a, b)))), 1271 1272 # Conversions 1273 (('i2b16', ('b2i', 'a@16')), a), 1274 (('i2b32', ('b2i', 'a@32')), a), 1275 (('f2i', ('ftrunc', a)), ('f2i', a)), 1276 (('f2u', ('ftrunc', a)), ('f2u', a)), 1277 (('i2b', ('ineg', a)), ('i2b', a)), 1278 (('i2b', ('iabs', a)), ('i2b', a)), 1279 (('inot', ('f2b1', a)), ('feq', a, 0.0)), 1280 1281 # Conversions from 16 bits to 32 bits and back can always be removed 1282 (('f2fmp', ('f2f32', 'a@16')), a), 1283 (('i2imp', ('i2i32', 'a@16')), a), 1284 (('i2imp', ('u2u32', 'a@16')), a), 1285 1286 (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)), 1287 (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)), 1288 (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)), 1289 (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)), 1290 1291 (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)), 1292 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1293 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1294 1295 (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)), 1296 (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)), 1297 (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1298 (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1299 1300 # Conversions to 16 bits would be lossy so they should only be removed if 1301 # the instruction was generated by the precision lowering pass. 
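   # For example, the float32 value 1.0000001 has no exact float16
   # representation, so a plain f2f32(f2f16(a)) round trip is not a no-op and
   # must be left alone.  The *mp opcodes below are only produced by the
   # precision lowering pass, which has already decided the lost bits are not
   # needed, so those round trips can be removed.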
1302 (('f2f32', ('f2fmp', 'a@32')), a), 1303 (('i2i32', ('i2imp', 'a@32')), a), 1304 (('u2u32', ('i2imp', 'a@32')), a), 1305 1306 (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)), 1307 (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)), 1308 (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)), 1309 (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)), 1310 1311 # Conversions from float32 to float64 and back can be removed as long as 1312 # it doesn't need to be precise, since the conversion may e.g. flush denorms 1313 (('~f2f32', ('f2f64', 'a@32')), a), 1314 1315 (('ffloor', 'a(is_integral)'), a), 1316 (('fceil', 'a(is_integral)'), a), 1317 (('ftrunc', 'a(is_integral)'), a), 1318 # fract(x) = x - floor(x), so fract(NaN) = NaN 1319 (('~ffract', 'a(is_integral)'), 0.0), 1320 (('fabs', 'a(is_not_negative)'), a), 1321 (('iabs', 'a(is_not_negative)'), a), 1322 (('fsat', 'a(is_not_positive)'), 0.0), 1323 1324 (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'), 1325 1326 # The result of the multiply must be in [-1, 0], so the result of the ffma 1327 # must be in [0, 1]. 1328 (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False), 1329 (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False), 1330 (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)), 1331 (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)), 1332 1333 (('fneu', 'a(is_not_zero)', 0.0), True), 1334 (('feq', 'a(is_not_zero)', 0.0), False), 1335 1336 # In this chart, + means value > 0 and - means value < 0. 1337 # 1338 # + >= + -> unknown 0 >= + -> false - >= + -> false 1339 # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false 1340 # + >= - -> true 0 >= - -> true - >= - -> unknown 1341 # 1342 # Using grouping conceptually similar to a Karnaugh map... 1343 # 1344 # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true 1345 # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false 1346 # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false 1347 # 1348 # The flt / ilt cases just invert the expected result. 1349 # 1350 # The results expecting true, must be marked imprecise. The results 1351 # expecting false are fine because NaN compared >= or < anything is false. 
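   # As a concrete instance of the first rule below: fge(0.0, -1.0) is true,
   # matching the (is_not_negative >= is_not_positive) row above, but
   # fge(NaN, -1.0) is false, which is why the patterns producing True also
   # require is_a_number.  The patterns producing False need no such guard
   # because NaN >= x and NaN < x are already false.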
1352 1353 (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True), 1354 (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1355 (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1356 1357 (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1358 (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'), True), 1359 (('flt', 'a(is_a_number_lt_zero)', 'b(is_a_number_not_negative)'), True), 1360 1361 (('ine', 'a(is_not_zero)', 0), True), 1362 (('ieq', 'a(is_not_zero)', 0), False), 1363 1364 (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True), 1365 (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1366 (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1367 1368 (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1369 (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), 1370 (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), 1371 1372 (('ult', 0, 'a(is_gt_zero)'), True), 1373 (('ult', a, 0), False), 1374 1375 # Packing and then unpacking does nothing 1376 (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a), 1377 (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b), 1378 (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)), 1379 (('unpack_64_2x32', ('pack_64_2x32', a)), a), 1380 (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a), 1381 (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1382 ('unpack_64_2x32_split_y', a)), a), 1383 (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a), 1384 ('unpack_64_2x32_split_y', a))), a), 1385 (('pack_64_2x32', ('unpack_64_2x32', a)), a), 1386 (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a), 1387 1388 # Comparing two halves of an unpack separately. While this optimization 1389 # should be correct for non-constant values, it's less obvious that it's 1390 # useful in that case. For constant values, the pack will fold and we're 1391 # guaranteed to reduce the whole tree to one instruction. 1392 (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'), 1393 ('ieq', ('unpack_32_2x16_split_y', a), '#c')), 1394 ('ieq', a, ('pack_32_2x16_split', b, c))), 1395 1396 # Byte extraction 1397 (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1398 (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1399 (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'), 1400 (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1401 (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1402 (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'), 1403 (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1404 1405 # Common pattern in many Vulkan CTS tests that read 8-bit integers from a 1406 # storage buffer. 1407 (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'), 1408 (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'), 1409 1410 # Common pattern after lowering 8-bit integers to 16-bit. 
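   # For example, if the selected byte is 0x80, both sides of the first rule
   # below produce 0xff80: u2u8(extract_u8(a, b)) yields the 8-bit value 0x80,
   # which i2i16 sign-extends, while extract_i8(a, b) sign-extends the byte
   # directly.  The unsigned rule likewise yields 0x0080 on both sides.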
1411 (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))), 1412 (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))), 1413 1414 (('ubfe', a, 0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1415 (('ubfe', a, 8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1416 (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1417 (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1418 (('ibfe', a, 0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'), 1419 (('ibfe', a, 8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1420 (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'), 1421 (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1422 1423 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 1424 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 1425 1426 # Word extraction 1427 (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1428 (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1429 (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1430 (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1431 (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), 1432 1433 (('ubfe', a, 0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1434 (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1435 (('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1436 (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1437 1438 # Packing a u8vec4 to write to an SSBO. 
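   # For instance, with a = 0x12, b = 0x34, c = 0x56 and d = 0x78 the shift/or
   # chain below computes 0x12345678.  The replacement packs the same bytes,
   # with component 0 (d) in bits [7:0] and component 3 (a) in bits [31:24],
   # which is why the vec4 operands appear in reverse order.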
1439 (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))), 1440 ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'), 1441 1442 (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)), 1443 (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)), 1444 1445 # Lower pack/unpack 1446 (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'), 1447 (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split'), 1448 (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'), 1449 (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'), 1450 (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split'), 1451 (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split'), 1452 1453 # Useless masking before unpacking 1454 (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)), 1455 (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)), 1456 (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)), 1457 (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)), 1458 (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)), 1459 (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)), 1460 1461 (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)), 1462 (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)), 1463 (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)), 1464 (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)), 1465 (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)), 1466 1467 # Optimize half packing 1468 (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))), 1469 (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))), 1470 1471 (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1472 ('pack_half_2x16', ('vec2', a, b))), 1473 (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1474 ('pack_half_2x16', ('vec2', a, b))), 1475 1476 (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)), 1477 (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)), 1478 (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)), 1479 1480 (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 1481 (('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 1482 1483 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)), 1484 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)), 1485 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)), 1486 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)), 1487 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)), 1488 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)), 1489 (('extract_u8', 
('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)), 1490 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)), 1491]) 1492 1493# After the ('extract_u8', a, 0) pattern, above, triggers, there will be 1494# patterns like those below. 1495for op in ('ushr', 'ishr'): 1496 optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))]) 1497 optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)]) 1498 optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)]) 1499 1500optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))]) 1501 1502# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be 1503# patterns like those below. 1504for op in ('extract_u8', 'extract_i8'): 1505 optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))]) 1506 optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)]) 1507 optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)]) 1508 1509optimizations.extend([ 1510 # Subtracts 1511 (('ussub_4x8_vc4', a, 0), a), 1512 (('ussub_4x8_vc4', a, ~0), 0), 1513 # Lower all Subtractions first - they can get recombined later 1514 (('fsub', a, b), ('fadd', a, ('fneg', b))), 1515 (('isub', a, b), ('iadd', a, ('ineg', b))), 1516 (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1517 # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow. 1518 (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1519 1520 # Propagate negation up multiplication chains 1521 (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))), 1522 (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 1523 (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), 1524 1525 # Propagate constants up multiplication chains 1526 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)), 1527 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)), 1528 (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)), 1529 # Prefer moving out a multiplication for more MAD/FMA-friendly code 1530 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)), 1531 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)), 1532 (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)), 1533 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)), 1534 1535 # Reassociate constants in add/mul chains so they can be folded together. 1536 # For now, we mostly only handle cases where the constants are separated by 1537 # a single non-constant. We could do better eventually. 
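   # For example, fmul(2.0, fmul(x, 4.0)) is rewritten by the first rule below
   # to fmul(fmul(2.0, 4.0), x), which constant folding reduces to
   # fmul(8.0, x).  The float variants are marked inexact ('~') because
   # floating-point reassociation can change the rounded result.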
1538 (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), 1539 (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)), 1540 (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), 1541 (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), 1542 (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), 1543 (('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))), 1544 (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 1545 (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), 1546 (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)), 1547 (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)), 1548 (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)), 1549 1550 # Reassociate add chains for more MAD/FMA-friendly code 1551 (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)), 1552 1553 # Drop mul-div by the same value when there's no wrapping. 1554 (('idiv', ('imul(no_signed_wrap)', a, b), b), a), 1555 1556 # By definition... 1557 (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), 1558 (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 1559 (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 1560 1561 (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)), 1562 (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 1563 (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 1564 1565 (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), 1566 1567 (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 1568 (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 1569 (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)), 1570 1571 (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)), 1572 (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)), 1573 1574 # Misc. 
lowering 1575 (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), 1576 (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), 1577 (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'), 1578 (('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'), 1579 1580 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1581 ('bcsel', ('ult', 31, 'bits'), 'insert', 1582 ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')), 1583 'options->lower_bitfield_insert'), 1584 (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1585 (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1586 (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1587 (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1588 (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1589 (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1590 (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1591 (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1592 1593 (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1594 (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'), 1595 (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_uadd_sat'), 1596 (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1597 1598 # int64_t sum = a + b; 1599 # 1600 # if (a < 0 && b < 0 && a < sum) 1601 # sum = INT64_MIN; 1602 # } else if (a >= 0 && b >= 0 && sum < a) 1603 # sum = INT64_MAX; 1604 # } 1605 # 1606 # A couple optimizations are applied. 1607 # 1608 # 1. a < sum => sum >= 0. This replacement works because it is known that 1609 # a < 0 and b < 0, so sum should also be < 0 unless there was 1610 # underflow. 1611 # 1612 # 2. sum < a => sum < 0. This replacement works because it is known that 1613 # a >= 0 and b >= 0, so sum should also be >= 0 unless there was 1614 # overflow. 1615 # 1616 # 3. Invert the second if-condition and swap the order of parameters for 1617 # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >= 1618 # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0) 1619 # 1620 # On Intel Gen11, this saves ~11 instructions. 
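   # For example, with a = b = INT64_MIN the wrapped sum is 0, so the
   # condition (a < 0 && b < 0 && sum >= 0) holds and the lowering below
   # selects 0x8000000000000000, the correctly saturated result.  For operands
   # that do not overflow, the outer condition is false and the inner bcsel
   # selects the plain iadd.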
1621 (('iadd_sat@64', a, b), ('bcsel', 1622 ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 1623 0x8000000000000000, 1624 ('bcsel', 1625 ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 1626 ('iadd', a, b), 1627 0x7fffffffffffffff)), 1628 '(options->lower_int64_options & nir_lower_iadd64) != 0'), 1629 1630 # int64_t sum = a - b; 1631 # 1632 # if (a < 0 && b >= 0 && a < sum) 1633 # sum = INT64_MIN; 1634 # } else if (a >= 0 && b < 0 && a >= sum) 1635 # sum = INT64_MAX; 1636 # } 1637 # 1638 # Optimizations similar to the iadd_sat case are applied here. 1639 (('isub_sat@64', a, b), ('bcsel', 1640 ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 1641 0x8000000000000000, 1642 ('bcsel', 1643 ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 1644 ('isub', a, b), 1645 0x7fffffffffffffff)), 1646 '(options->lower_int64_options & nir_lower_iadd64) != 0'), 1647 1648 # These are done here instead of in the backend because the int64 lowering 1649 # pass will make a mess of the patterns. The first patterns are 1650 # conditioned on nir_lower_minmax64 because it was not clear that it was 1651 # always an improvement on platforms that have real int64 support. No 1652 # shaders in shader-db hit this, so it was hard to say one way or the 1653 # other. 1654 (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1655 (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1656 (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1657 (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1658 (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1659 (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1660 1661 (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1662 (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1663 # 0u < uint(a) <=> uint(a) != 0u 1664 (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1665 1666 # Alternative lowering that doesn't rely on bfi. 1667 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1668 ('bcsel', ('ult', 31, 'bits'), 1669 'insert', 1670 (('ior', 1671 ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))), 1672 ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))), 1673 'options->lower_bitfield_insert_to_shifts'), 1674 1675 # Alternative lowering that uses bitfield_select. 
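   # bitfield_select(mask, insert, base) computes (mask & insert) | (~mask & base)
   # (see the comment on bitfield_select further below), and bfm(bits, offset)
   # builds a mask of 'bits' ones starting at bit 'offset', e.g. bfm(4, 8) is
   # 0x00000f00.  The bcsel on 31 < bits returns 'insert' unchanged when the
   # whole 32-bit value is replaced, matching the other bitfield_insert
   # lowerings above.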
1676 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1677 ('bcsel', ('ult', 31, 'bits'), 'insert', 1678 ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')), 1679 'options->lower_bitfield_insert_to_bitfield_select'), 1680 1681 (('ibitfield_extract', 'value', 'offset', 'bits'), 1682 ('bcsel', ('ult', 31, 'bits'), 'value', 1683 ('ibfe', 'value', 'offset', 'bits')), 1684 'options->lower_bitfield_extract'), 1685 1686 (('ubitfield_extract', 'value', 'offset', 'bits'), 1687 ('bcsel', ('ult', 31, 'bits'), 'value', 1688 ('ubfe', 'value', 'offset', 'bits')), 1689 'options->lower_bitfield_extract'), 1690 1691 # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0. 1692 (('bitfield_select', a, b, 0), ('iand', a, b)), 1693 (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)), 1694 1695 # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits' 1696 (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')), 1697 (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')), 1698 (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')), 1699 (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')), 1700 (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')), 1701 (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')), 1702 1703 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 1704 # 1705 # If bits is zero, the result will be zero. 1706 # 1707 # These patterns prevent other patterns from generating invalid results 1708 # when count is zero. 1709 (('ubfe', a, b, 0), 0), 1710 (('ibfe', a, b, 0), 0), 1711 1712 (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))), 1713 1714 (('b2i32', ('i2b', ('ubfe', a, b, 1))), ('ubfe', a, b, 1)), 1715 (('b2i32', ('i2b', ('ibfe', a, b, 1))), ('ubfe', a, b, 1)), # ubfe in the replacement is correct 1716 (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1717 (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1718 (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1719 (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1720 1721 (('ibitfield_extract', 'value', 'offset', 'bits'), 1722 ('bcsel', ('ieq', 0, 'bits'), 1723 0, 1724 ('ishr', 1725 ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')), 1726 ('isub', 32, 'bits'))), 1727 'options->lower_bitfield_extract_to_shifts'), 1728 1729 (('ubitfield_extract', 'value', 'offset', 'bits'), 1730 ('iand', 1731 ('ushr', 'value', 'offset'), 1732 ('bcsel', ('ieq', 'bits', 32), 1733 0xffffffff, 1734 ('isub', ('ishl', 1, 'bits'), 1))), 1735 'options->lower_bitfield_extract_to_shifts'), 1736 1737 (('ifind_msb', 'value'), 1738 ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')), 1739 'options->lower_ifind_msb'), 1740 1741 (('ifind_msb', 'value'), 1742 ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0), 1743 ('isub', 31, ('ifind_msb_rev', 'value')), 1744 ('ifind_msb_rev', 'value')), 1745 'options->lower_find_msb_to_reverse'), 1746 1747 (('ufind_msb', 'value'), 1748 ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0), 1749 ('isub', 31, 
('ufind_msb_rev', 'value')), 1750 ('ufind_msb_rev', 'value')), 1751 'options->lower_find_msb_to_reverse'), 1752 1753 (('find_lsb', 'value'), 1754 ('ufind_msb', ('iand', 'value', ('ineg', 'value'))), 1755 'options->lower_find_lsb'), 1756 1757 (('extract_i8', a, 'b@32'), 1758 ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24), 1759 'options->lower_extract_byte'), 1760 1761 (('extract_u8', a, 'b@32'), 1762 ('iand', ('ushr', a, ('imul', b, 8)), 0xff), 1763 'options->lower_extract_byte'), 1764 1765 (('extract_i16', a, 'b@32'), 1766 ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16), 1767 'options->lower_extract_word'), 1768 1769 (('extract_u16', a, 'b@32'), 1770 ('iand', ('ushr', a, ('imul', b, 16)), 0xffff), 1771 'options->lower_extract_word'), 1772 1773 (('pack_unorm_2x16', 'v'), 1774 ('pack_uvec2_to_uint', 1775 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))), 1776 'options->lower_pack_unorm_2x16'), 1777 1778 (('pack_unorm_4x8', 'v'), 1779 ('pack_uvec4_to_uint', 1780 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 1781 'options->lower_pack_unorm_4x8'), 1782 1783 (('pack_snorm_2x16', 'v'), 1784 ('pack_uvec2_to_uint', 1785 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))), 1786 'options->lower_pack_snorm_2x16'), 1787 1788 (('pack_snorm_4x8', 'v'), 1789 ('pack_uvec4_to_uint', 1790 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 1791 'options->lower_pack_snorm_4x8'), 1792 1793 (('unpack_unorm_2x16', 'v'), 1794 ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0), 1795 ('extract_u16', 'v', 1))), 1796 65535.0), 1797 'options->lower_unpack_unorm_2x16'), 1798 1799 (('unpack_unorm_4x8', 'v'), 1800 ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0), 1801 ('extract_u8', 'v', 1), 1802 ('extract_u8', 'v', 2), 1803 ('extract_u8', 'v', 3))), 1804 255.0), 1805 'options->lower_unpack_unorm_4x8'), 1806 1807 (('unpack_snorm_2x16', 'v'), 1808 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0), 1809 ('extract_i16', 'v', 1))), 1810 32767.0))), 1811 'options->lower_unpack_snorm_2x16'), 1812 1813 (('unpack_snorm_4x8', 'v'), 1814 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0), 1815 ('extract_i8', 'v', 1), 1816 ('extract_i8', 'v', 2), 1817 ('extract_i8', 'v', 3))), 1818 127.0))), 1819 'options->lower_unpack_snorm_4x8'), 1820 1821 (('pack_half_2x16_split', 'a@32', 'b@32'), 1822 ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))), 1823 'options->lower_pack_split'), 1824 1825 (('unpack_half_2x16_split_x', 'a@32'), 1826 ('f2f32', ('u2u16', a)), 1827 'options->lower_pack_split'), 1828 1829 (('unpack_half_2x16_split_y', 'a@32'), 1830 ('f2f32', ('u2u16', ('ushr', a, 16))), 1831 'options->lower_pack_split'), 1832 1833 (('pack_32_2x16_split', 'a@16', 'b@16'), 1834 ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)), 1835 'options->lower_pack_split'), 1836 1837 (('unpack_32_2x16_split_x', 'a@32'), 1838 ('u2u16', a), 1839 'options->lower_pack_split'), 1840 1841 (('unpack_32_2x16_split_y', 'a@32'), 1842 ('u2u16', ('ushr', 'a', 16)), 1843 'options->lower_pack_split'), 1844 1845 (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'), 1846 (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'), 1847 (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'), 1848 # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0 1849 # Mark the new comparisons precise to prevent them being changed to 'a != 
1850 # 0' or 'a == 0'. 1851 (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'), 1852 1853 # Address/offset calculations: 1854 # Drivers supporting imul24 should use the nir_lower_amul() pass, this 1855 # rule converts everyone else to imul: 1856 (('amul', a, b), ('imul', a, b), '!options->has_imul24'), 1857 1858 (('umul24', a, b), 1859 ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), 1860 '!options->has_umul24'), 1861 (('umad24', a, b, c), 1862 ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c), 1863 '!options->has_umad24'), 1864 1865 # Relaxed 24bit ops 1866 (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'), 1867 (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'), 1868 (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'), 1869 (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'), 1870 (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'), 1871 (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'), 1872 1873 (('imad24_ir3', a, b, 0), ('imul24', a, b)), 1874 (('imad24_ir3', a, 0, c), (c)), 1875 (('imad24_ir3', a, 1, c), ('iadd', a, c)), 1876 1877 # if first two srcs are const, crack apart the imad so constant folding 1878 # can clean up the imul: 1879 # TODO ffma should probably get a similar rule: 1880 (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)), 1881 1882 # These will turn 24b address/offset calc back into 32b shifts, but 1883 # it should be safe to get back some of the bits of precision that we 1884 # already decided were no necessary: 1885 (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), 1886 (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'), 1887 (('imul24', a, 0), (0)), 1888 1889 (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"), 1890 (('fcsel', ('slt', a, 0), b, c), ('fcsel_ge', a, c, b), "options->has_fused_comp_and_csel"), 1891 (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"), 1892 (('fcsel', ('sge', 0, a), b, c), ('fcsel_gt', a, c, b), "options->has_fused_comp_and_csel"), 1893 1894 (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel"), 1895 (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel"), 1896 (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel"), 1897 (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel"), 1898 1899 (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"), 1900 (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, c, b), "options->has_fused_comp_and_csel"), 1901 (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"), 1902 (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, c, b), "options->has_fused_comp_and_csel"), 1903 1904]) 1905 1906# bit_size dependent lowerings 1907for bit_size in [8, 16, 32, 64]: 1908 # convenience constants 1909 intmax = (1 << (bit_size - 1)) - 1 1910 intmin = 1 << (bit_size - 1) 1911 1912 optimizations += [ 1913 (('iadd_sat@' + str(bit_size), a, b), 1914 ('bcsel', ('ige', b, 
1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)), 1915 ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'), 1916 (('isub_sat@' + str(bit_size), a, b), 1917 ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)), 1918 ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'), 1919 ] 1920 1921invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')]) 1922 1923for left, right in itertools.combinations_with_replacement(invert.keys(), 2): 1924 optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))), 1925 ('iand', (invert[left], a, b), (invert[right], c, d)))) 1926 optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))), 1927 ('ior', (invert[left], a, b), (invert[right], c, d)))) 1928 1929# Optimize x2bN(b2x(x)) -> x 1930for size in type_sizes('bool'): 1931 aN = 'a@' + str(size) 1932 f2bN = 'f2b' + str(size) 1933 i2bN = 'i2b' + str(size) 1934 optimizations.append(((f2bN, ('b2f', aN)), a)) 1935 optimizations.append(((i2bN, ('b2i', aN)), a)) 1936 1937# Optimize x2yN(b2x(x)) -> b2y 1938for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']): 1939 if x != 'f' and y != 'f' and x != y: 1940 continue 1941 1942 b2x = 'b2f' if x == 'f' else 'b2i' 1943 b2y = 'b2f' if y == 'f' else 'b2i' 1944 x2yN = '{}2{}'.format(x, y) 1945 optimizations.append(((x2yN, (b2x, a)), (b2y, a))) 1946 1947# Optimize away x2xN(a@N) 1948for t in ['int', 'uint', 'float', 'bool']: 1949 for N in type_sizes(t): 1950 x2xN = '{0}2{0}{1}'.format(t[0], N) 1951 aN = 'a@{0}'.format(N) 1952 optimizations.append(((x2xN, aN), a)) 1953 1954# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers 1955# In particular, we can optimize away everything except upcast of downcast and 1956# upcasts where the type differs from the other cast 1957for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')): 1958 if N < M: 1959 # The outer cast is a down-cast. It doesn't matter what the size of the 1960 # argument of the inner cast is because we'll never be in the upcast 1961 # of downcast case. Regardless of types, we'll always end up with y2yN 1962 # in the end. 1963 for x, y in itertools.product(['i', 'u'], ['i', 'u']): 1964 x2xN = '{0}2{0}{1}'.format(x, N) 1965 y2yM = '{0}2{0}{1}'.format(y, M) 1966 y2yN = '{0}2{0}{1}'.format(y, N) 1967 optimizations.append(((x2xN, (y2yM, a)), (y2yN, a))) 1968 elif N > M: 1969 # If the outer cast is an up-cast, we have to be more careful about the 1970 # size of the argument of the inner cast and with types. In this case, 1971 # the type is always the type of the up-cast which is given by the 1972 # outer cast. 1973 for P in type_sizes('uint'): 1974 # We can't optimize away up-cast of down-cast.
1975 if M < P: 1976 continue 1977 1978 # Because we're doing down-cast of down-cast, the types always have 1979 # to match between the two casts 1980 for x in ['i', 'u']: 1981 x2xN = '{0}2{0}{1}'.format(x, N) 1982 x2xM = '{0}2{0}{1}'.format(x, M) 1983 aP = 'a@{0}'.format(P) 1984 optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a))) 1985 else: 1986 # The N == M case is handled by other optimizations 1987 pass 1988 1989# Downcast operations should be able to see through pack 1990for t in ['i', 'u']: 1991 for N in [8, 16, 32]: 1992 x2xN = '{0}2{0}{1}'.format(t, N) 1993 optimizations += [ 1994 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 1995 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 1996 ] 1997 1998# Optimize comparisons with up-casts 1999for t in ['int', 'uint', 'float']: 2000 for N, M in itertools.product(type_sizes(t), repeat=2): 2001 if N == 1 or N >= M: 2002 continue 2003 2004 cond = 'true' 2005 if N == 8: 2006 cond = 'options->support_8bit_alu' 2007 elif N == 16: 2008 cond = 'options->support_16bit_alu' 2009 x2xM = '{0}2{0}{1}'.format(t[0], M) 2010 x2xN = '{0}2{0}{1}'.format(t[0], N) 2011 aN = 'a@' + str(N) 2012 bN = 'b@' + str(N) 2013 xeq = 'feq' if t == 'float' else 'ieq' 2014 xne = 'fneu' if t == 'float' else 'ine' 2015 xge = '{0}ge'.format(t[0]) 2016 xlt = '{0}lt'.format(t[0]) 2017 2018 # Up-casts are lossless so for correctly signed comparisons of 2019 # up-casted values we can do the comparison at the largest of the two 2020 # original sizes and drop one or both of the casts. (We have 2021 # optimizations to drop the no-op casts which this may generate.) 2022 for P in type_sizes(t): 2023 if P == 1 or P > N: 2024 continue 2025 2026 bP = 'b@' + str(P) 2027 optimizations += [ 2028 ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond), 2029 ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond), 2030 ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond), 2031 ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond), 2032 ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond), 2033 ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond), 2034 ] 2035 2036 # The next bit doesn't work on floats because the range checks would 2037 # get way too complicated. 2038 if t in ['int', 'uint']: 2039 if t == 'int': 2040 xN_min = -(1 << (N - 1)) 2041 xN_max = (1 << (N - 1)) - 1 2042 elif t == 'uint': 2043 xN_min = 0 2044 xN_max = (1 << N) - 1 2045 else: 2046 assert False 2047 2048 # If we're up-casting and comparing to a constant, we can unfold 2049 # the comparison into a comparison with the shrunk down constant 2050 # and a check that the constant fits in the smaller bit size. 
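         # For example, with an 8-bit 'a' up-cast to 32 bits, ieq(u2u32(a), #b)
         # becomes iand(ieq(a, u2u8(b)), ieq(u2u32(u2u8(b)), b)).  For a
         # constant b = 0x1ff the second test folds to false and the whole
         # comparison goes away; for b = 0x42 it folds to true, leaving just
         # the 8-bit comparison.  These rules remain guarded by the
         # support_8bit_alu / support_16bit_alu conditions chosen above.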
2051 optimizations += [ 2052 ((xeq, (x2xM, aN), '#b'), 2053 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond), 2054 ((xne, (x2xM, aN), '#b'), 2055 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond), 2056 ((xlt, (x2xM, aN), '#b'), 2057 ('iand', (xlt, xN_min, b), 2058 ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond), 2059 ((xlt, '#a', (x2xM, bN)), 2060 ('iand', (xlt, a, xN_max), 2061 ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond), 2062 ((xge, (x2xM, aN), '#b'), 2063 ('iand', (xge, xN_max, b), 2064 ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond), 2065 ((xge, '#a', (x2xM, bN)), 2066 ('iand', (xge, a, xN_min), 2067 ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond), 2068 ] 2069 2070# Convert masking followed by signed downcast to just unsigned downcast 2071optimizations += [ 2072 (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)), 2073 (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)), 2074 (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)), 2075 (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)), 2076 (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)), 2077 (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)), 2078] 2079 2080# Some operations such as iadd have the property that the bottom N bits of the 2081# output only depend on the bottom N bits of each of the inputs so we can 2082# remove casts 2083for N in [16, 32]: 2084 for M in [8, 16]: 2085 if M >= N: 2086 continue 2087 2088 aN = 'a@' + str(N) 2089 u2uM = 'u2u{0}'.format(M) 2090 i2iM = 'i2i{0}'.format(M) 2091 2092 for x in ['u', 'i']: 2093 x2xN = '{0}2{0}{1}'.format(x, N) 2094 extract_xM = 'extract_{0}{1}'.format(x, M) 2095 2096 x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M) 2097 extract_xM_M_bits = \ 2098 '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M) 2099 optimizations += [ 2100 ((x2xN_M_bits, (u2uM, aN)), a), 2101 ((extract_xM_M_bits, aN, 0), a), 2102 ] 2103 2104 bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M) 2105 optimizations += [ 2106 ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)), 2107 ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)), 2108 ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)), 2109 ] 2110 2111 for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']: 2112 op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M) 2113 optimizations += [ 2114 ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)), 2115 ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)), 2116 ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)), 2117 ] 2118 2119def fexp2i(exp, bits): 2120 # Generate an expression which constructs value 2.0^exp or 0.0. 2121 # 2122 # We assume that exp is already in a valid range: 2123 # 2124 # * [-15, 15] for 16-bit float 2125 # * [-127, 127] for 32-bit float 2126 # * [-1023, 1023] for 64-bit float 2127 # 2128 # If exp is the lowest value in the valid range, a value of 0.0 is 2129 # constructed. Otherwise, the value 2.0^exp is constructed. 2130 if bits == 16: 2131 return ('i2i16', ('ishl', ('iadd', exp, 15), 10)) 2132 elif bits == 32: 2133 return ('ishl', ('iadd', exp, 127), 23) 2134 elif bits == 64: 2135 return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20)) 2136 else: 2137 assert False 2138 2139def ldexp(f, exp, bits): 2140 # The maximum possible range for a normal exponent is [-126, 127] and, 2141 # throwing in denormals, you get a maximum range of [-149, 127]. This 2142 # means that we can potentially have a swing of +-276.
If you start with 2143 # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush 2144 # all the way to zero. The GLSL spec only requires that we handle a subset 2145 # of this range. From version 4.60 of the spec: 2146 # 2147 # "If exp is greater than +128 (single-precision) or +1024 2148 # (double-precision), the value returned is undefined. If exp is less 2149 # than -126 (single-precision) or -1022 (double-precision), the value 2150 # returned may be flushed to zero. Additionally, splitting the value 2151 # into a significand and exponent using frexp() and then reconstructing 2152 # a floating-point value using ldexp() should yield the original input 2153 # for zero and all finite non-denormalized values." 2154 # 2155 # The SPIR-V spec has similar language. 2156 # 2157 # In order to handle the maximum value +128 using the fexp2i() helper 2158 # above, we have to split the exponent in half and do two multiply 2159 # operations. 2160 # 2161 # First, we clamp exp to a reasonable range. Specifically, we clamp to 2162 # twice the full range that is valid for the fexp2i() function above. If 2163 # exp/2 is the bottom value of that range, the fexp2i() expression will 2164 # yield 0.0f which, when multiplied by f, will flush it to zero which is 2165 # allowed by the GLSL and SPIR-V specs for low exponent values. If the 2166 # value is clamped from above, then it must have been above the supported 2167 # range of the GLSL built-in and therefore any return value is acceptable. 2168 if bits == 16: 2169 exp = ('imin', ('imax', exp, -30), 30) 2170 elif bits == 32: 2171 exp = ('imin', ('imax', exp, -254), 254) 2172 elif bits == 64: 2173 exp = ('imin', ('imax', exp, -2046), 2046) 2174 else: 2175 assert False 2176 2177 # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2. 2178 # (We use ishr which isn't the same for -1, but the -1 case still works 2179 # since we use exp-exp/2 as the second exponent.) While the spec 2180 # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't 2181 # work with denormals and doesn't allow for the full swing in exponents 2182 # that you can get with normalized values. Instead, we create two powers 2183 # of two and multiply by them each in turn. That way the effective range 2184 # of our exponent is doubled. 
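    # For instance (editor's illustration), with bits == 32 and exp == 200 the
    # clamp leaves exp untouched, exp/2 == 100 and exp - exp/2 == 100, so the
    # result is computed as f * 2.0^100 * 2.0^100 == f * 2.0^200 even though
    # 2.0^200 on its own is not representable as a 32-bit float.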
2185 pow2_1 = fexp2i(('ishr', exp, 1), bits) 2186 pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits) 2187 return ('fmul', ('fmul', f, pow2_1), pow2_2) 2188 2189optimizations += [ 2190 (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'), 2191 (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'), 2192 (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'), 2193] 2194 2195# Unreal Engine 4 demo applications open-codes bitfieldReverse() 2196def bitfield_reverse(u): 2197 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2198 step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8)) 2199 step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4)) 2200 step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2)) 2201 step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1)) 2202 2203 return step5 2204 2205optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2206 2207# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)" 2208# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)" 2209for ncomp in [2, 3, 4, 8, 16]: 2210 optimizations += [ 2211 (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)), 2212 (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)), 2213 (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)), 2214 (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)), 2215 ] 2216 2217# For any float comparison operation, "cmp", if you have "a == a && a cmp b" 2218# then the "a == a" is redundant because it's equivalent to "a is not NaN" 2219# and, if a is a NaN then the second comparison will fail anyway. 2220for op in ['flt', 'fge', 'feq']: 2221 optimizations += [ 2222 (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)), 2223 (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)), 2224 ] 2225 2226# Add optimizations to handle the case where the result of a ternary is 2227# compared to a constant. This way we can take things like 2228# 2229# (a ? 0 : 1) > 0 2230# 2231# and turn it into 2232# 2233# a ? (0 > 0) : (1 > 0) 2234# 2235# which constant folding will eat for lunch. The resulting ternary will 2236# further get cleaned up by the boolean reductions above and we will be 2237# left with just the original variable "a". 2238for op in ['feq', 'fneu', 'ieq', 'ine']: 2239 optimizations += [ 2240 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 2241 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 2242 ] 2243 2244for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']: 2245 optimizations += [ 2246 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 2247 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 2248 ((op, '#d', ('bcsel', a, '#b', '#c')), 2249 ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))), 2250 ] 2251 2252 2253# For example, this converts things like 2254# 2255# 1 + mix(0, a - 1, condition) 2256# 2257# into 2258# 2259# mix(1, (a-1)+1, condition) 2260# 2261# Other optimizations will rearrange the constants. 
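# For op == 'fadd' (editor's illustration), the rule generated by the loop
# below is
#
#    (('fadd', ('bcsel(is_used_once)', a, '#b', c), '#d'),
#     ('bcsel', a, ('fadd', b, d), ('fadd', c, d)))
#
# after which constant folding collapses the ('fadd', b, d) arm into a single
# constant, completing the rewrite sketched above.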
2262for op in ['fadd', 'fmul', 'iadd', 'imul']: 2263 optimizations += [ 2264 ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d))) 2265 ] 2266 2267# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives 2268# states: 2269# 2270# If neither layout qualifier is specified, derivatives in compute shaders 2271# return zero, which is consistent with the handling of built-in texture 2272# functions like texture() in GLSL 4.50 compute shaders. 2273for op in ['fddx', 'fddx_fine', 'fddx_coarse', 2274 'fddy', 'fddy_fine', 'fddy_coarse']: 2275 optimizations += [ 2276 ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE') 2277] 2278 2279# Some optimizations for ir3-specific instructions. 2280optimizations += [ 2281 # 'al * bl': If either 'al' or 'bl' is zero, return zero. 2282 (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)), 2283 # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'. 2284 (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')), 2285 (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')), 2286] 2287 2288# These kinds of sequences can occur after nir_opt_peephole_select. 2289# 2290# NOTE: fadd is not handled here because that gets in the way of ffma 2291# generation in the i965 driver. Instead, fadd and ffma are handled in 2292# late_optimizations. 2293 2294for op in ['flrp']: 2295 optimizations += [ 2296 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2297 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2298 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2299 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2300 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)), 2301 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)), 2302 ] 2303 2304for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']: 2305 optimizations += [ 2306 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 2307 (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2308 (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2309 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 2310 ] 2311 2312for op in ['fpow']: 2313 optimizations += [ 2314 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2315 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2316 (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)), 2317 (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)), 2318 ] 2319 2320for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fneg', 'fabs', 'fsign']: 2321 optimizations += [ 2322 (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))), 2323 ] 2324 2325for op in ['ineg', 'iabs', 'inot', 'isign']: 2326 optimizations += [ 2327 ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))), 2328 ] 2329 2330optimizations.extend([ 2331 
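    # How the fisnormal@32 lowering below works on the raw IEEE-754 bits
    # (editor's note): ('ishl', a, 1) shifts out the sign bit and leaves the
    # biased exponent in bits 24..31; adding 0x1000000 bumps that exponent
    # field by one, so the unsigned comparison against 0x1ffffff succeeds
    # exactly when the original biased exponent was in [1, 254], i.e. when the
    # value is normal.  An exponent of 255 (Inf/NaN) wraps around on the add
    # and fails the test, as do zero and denormals (exponent 0).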
(('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal') 2332 ]) 2333 2334# This section contains optimizations to propagate downsizing conversions of 2335# constructed vectors into vectors of downsized components. Whether this is 2336# useful depends on the SIMD semantics of the backend. On a true SIMD machine, 2337# this reduces the register pressure of the vector itself and often enables the 2338# conversions to be eliminated via other algebraic rules or constant folding. 2339# In the worst case on a SIMD architecture, the propagated conversions may be 2340# revectorized via nir_opt_vectorize so instruction count is minimally 2341# impacted. 2342# 2343# On a machine with SIMD-within-a-register only, this actually 2344# counterintuitively hurts instruction count. These machines are the same that 2345# require vectorize_vec2_16bit, so we predicate the optimizations on that flag 2346# not being set. 2347# 2348# Finally for scalar architectures, there should be no difference in generated 2349# code since it all ends up scalarized at the end, but it might minimally help 2350# compile-times. 2351 2352for i in range(2, 4 + 1): 2353 for T in ('f', 'u', 'i'): 2354 vec_inst = ('vec' + str(i),) 2355 2356 indices = ['a', 'b', 'c', 'd'] 2357 suffix_in = tuple((indices[j] + '@32') for j in range(i)) 2358 2359 to_16 = '{}2{}16'.format(T, T) 2360 to_mp = '{}2{}mp'.format(T, T) 2361 2362 out_16 = tuple((to_16, indices[j]) for j in range(i)) 2363 out_mp = tuple((to_mp, indices[j]) for j in range(i)) 2364 2365 optimizations += [ 2366 ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'), 2367 ] 2368 # u2ump doesn't exist, because it's equal to i2imp 2369 if T in ['f', 'i']: 2370 optimizations += [ 2371 ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit') 2372 ] 2373 2374# This section contains "late" optimizations that should be run before 2375# creating ffmas and calling regular optimizations for the final time. 2376# Optimizations should go here if they help code generation and conflict 2377# with the regular optimizations. 
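# For example (editor's illustration), the first pattern below turns
# (a * #b) * c into (a * c) * #b: the two non-constant factors are multiplied
# first and the constant is applied last, where later passes can fold it
# together with other constants instead of leaving it buried in the middle of
# the chain.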
2378before_ffma_optimizations = [ 2379 # Propagate constants down multiplication chains 2380 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)), 2381 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)), 2382 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)), 2383 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)), 2384 2385 (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), 2386 (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), 2387 (('~fadd', ('fneg', a), a), 0.0), 2388 (('iadd', ('ineg', a), a), 0), 2389 (('iadd', ('ineg', a), ('iadd', a, b)), b), 2390 (('iadd', a, ('iadd', ('ineg', a), b)), b), 2391 (('~fadd', ('fneg', a), ('fadd', a, b)), b), 2392 (('~fadd', a, ('fadd', ('fneg', a), b)), b), 2393 2394 (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)), 2395 (('~flrp', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)), 2396 (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))), 2397] 2398 2399# This section contains "late" optimizations that should be run after the 2400# regular optimizations have finished. Optimizations should go here if 2401# they help code generation but do not necessarily produce code that is 2402# more easily optimizable. 2403late_optimizations = [ 2404 # The rearrangements are fine w.r.t. NaN. However, they produce incorrect 2405 # results if one operand is +Inf and the other is -Inf. 2406 # 2407 # 1. Inf + -Inf = NaN 2408 # 2. ∀x: x + NaN = NaN and x - NaN = NaN 2409 # 3. ∀x: x != NaN = true 2410 # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false 2411 # 2412 # a=Inf, b=-Inf a=-Inf, b=Inf a=NaN b=NaN 2413 # (a+b) < 0 false false false false 2414 # a < -b false false false false 2415 # -(a+b) < 0 false false false false 2416 # -a < b false false false false 2417 # (a+b) >= 0 false false false false 2418 # a >= -b true true false false 2419 # -(a+b) >= 0 false false false false 2420 # -a >= b true true false false 2421 # (a+b) == 0 false false false false 2422 # a == -b true true false false 2423 # (a+b) != 0 true true true true 2424 # a != -b false false true true 2425 (('flt', ('fadd(is_used_once)', a, b), 0.0), ('flt', a, ('fneg', b))), 2426 (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a), b)), 2427 (('flt', 0.0, ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a), b)), 2428 (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt', a, ('fneg', b))), 2429 (('~fge', ('fadd(is_used_once)', a, b), 0.0), ('fge', a, ('fneg', b))), 2430 (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a), b)), 2431 (('~fge', 0.0, ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a), b)), 2432 (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge', a, ('fneg', b))), 2433 (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))), 2434 (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))), 2435 2436 # If either source must be finite, then the original (a+b) cannot produce 2437 # NaN due to Inf-Inf. 
The patterns and the replacements produce the same 2438 # result if b is NaN. Therefore, the replacements are exact. 2439 (('fge', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fge', a, ('fneg', b))), 2440 (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a), b)), 2441 (('fge', 0.0, ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a), b)), 2442 (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge', a, ('fneg', b))), 2443 (('feq', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq', a, ('fneg', b))), 2444 (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))), 2445 2446 # This is how SpvOpFOrdNotEqual might be implemented. Replace it with 2447 # SpvOpLessOrGreater. 2448 (('iand', ('fneu', a, b), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))), 2449 (('iand', ('fneu', a, 0.0), ('feq', a, a) ), ('!flt', 0.0, ('fabs', a))), 2450 2451 # This is how SpvOpFUnordEqual might be implemented. Replace it with 2452 # !SpvOpLessOrGreater. 2453 (('ior', ('feq', a, b), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))), 2454 (('ior', ('feq', a, 0.0), ('fneu', a, a), ), ('inot', ('!flt', 0.0, ('fabs', a)))), 2455 2456 # nir_lower_to_source_mods will collapse this, but its existence during the 2457 # optimization loop can prevent other optimizations. 2458 (('fneg', ('fneg', a)), a), 2459 2460 # Subtractions get lowered during optimization, so we need to recombine them 2461 (('fadd', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 2462 (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'), 2463 (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'), 2464 (('ineg', a), ('isub', 0, a), 'options->lower_ineg'), 2465 (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'), 2466 (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'), 2467 (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'), 2468 (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'), 2469 2470 (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'), 2471 (('iadd', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), c), 'options->has_iadd3'), 2472 (('isub', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), ('ineg', c)), 'options->has_iadd3'), 2473 2474 # These are duplicated from the main optimizations table. The late 2475 # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create 2476 # new patterns like these. The patterns that compare with zero are removed 2477 # because they are unlikely to be created in by anything in 2478 # late_optimizations. 
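    # These are safe because '#b' is known to be strictly between 0.0 and 1.0:
    # clamping 'a' to [0, 1] can never move it to the other side of such a
    # bound, so, for example, ('fge', ('fsat', a), '#b') gives the same result
    # as ('fge', a, '#b') (editor's note).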
2479 (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 2480 (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 2481 (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 2482 (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 2483 2484 (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 2485 2486 (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))), 2487 2488 (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 2489 (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 2490 (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 2491 (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 2492 (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)), 2493 (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)), 2494 (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)), 2495 (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)), 2496 (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)), 2497 (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)), 2498 2499 (('ior', a, a), a), 2500 (('iand', a, a), a), 2501 2502 (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))), 2503 2504 (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'), 2505 (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'), 2506 (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'), 2507 (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), 2508 2509 (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)), 2510 2511 # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this 2512 # particular operation is common for expanding values stored in a texture 2513 # from [0,1] to [-1,1]. 2514 (('~ffma@32', a, 2.0, -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 2515 (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 2516 (('~ffma@32', a, -2.0, 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 2517 (('~ffma@32', a, 2.0, 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 2518 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 2519 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 2520 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 2521 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 2522 2523 # flrp(a, b, a) 2524 # a*(1-a) + b*a 2525 # a + -a*a + a*b (1) 2526 # a + a*(b - a) 2527 # Option 1: ffma(a, (b-a), a) 2528 # 2529 # Alternately, after (1): 2530 # a*(1+b) + -a*a 2531 # a*((1+b) + -a) 2532 # 2533 # Let b=1 2534 # 2535 # Option 2: ffma(a, 2, -(a*a)) 2536 # Option 3: ffma(a, 2, (-a)*a) 2537 # Option 4: ffma(a, -a, (2*a) 2538 # Option 5: a * (2 - a) 2539 # 2540 # There are a lot of other possible combinations. 
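    # Quick numeric check of the derivation above (editor's illustration):
    # with a = 0.25 and b = 3.0, flrp(a, b, a) = a*(1 - a) + b*a
    # = 0.1875 + 0.75 = 0.9375, and Option 1, ffma(a, b - a, a)
    # = 0.25*2.75 + 0.25 = 0.9375.  With b = 1, flrp(a, 1, a) = 0.4375 and
    # Option 5, a*(2 - a) = 0.25*1.75 = 0.4375.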
2541 (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'), 2542 (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 2543 (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 2544 (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 2545 (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 2546 2547 # we do these late so that we don't get in the way of creating ffmas 2548 (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))), 2549 (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))), 2550 2551 # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c), 2552 # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why. 2553 (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)), 2554 ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))), 2555 2556 # Things that look like DPH in the source shader may get expanded to 2557 # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets 2558 # to NIR. After FFMA is generated, this can look like: 2559 # 2560 # fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w) 2561 # 2562 # Reassociate the last addition into the first multiplication. 2563 # 2564 # Some shaders do not use 'invariant' in vertex and (possibly) geometry 2565 # shader stages on some outputs that are intended to be invariant. For 2566 # various reasons, this optimization may not be fully applied in all 2567 # shaders used for different rendering passes of the same geometry. This 2568 # can result in Z-fighting artifacts (at best). For now, disable this 2569 # optimization in these stages. See bugzilla #111490. In tessellation 2570 # stages applications seem to use 'precise' when necessary, so allow the 2571 # optimization in those stages. 2572 (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 2573 ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 2574 (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 2575 ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 2576 (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 2577 ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 2578 2579 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 2580 # 2581 # If bits is zero, the result will be zero. 2582 # 2583 # These prevent the next two lowerings generating incorrect results when 2584 # count is zero. 2585 (('ubfe', a, b, 0), 0), 2586 (('ibfe', a, b, 0), 0), 2587 2588 # On Intel GPUs, BFE is a 3-source instruction. Like all 3-source 2589 # instructions on Intel GPUs, it cannot have an immediate values as 2590 # sources. 
There are also limitations on source register strides. As a 2591 # result, it is very easy for 3-source instruction combined with either 2592 # loads of immediate values or copies from weird register strides to be 2593 # more expensive than the primitive instructions it represents. 2594 (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'), 2595 2596 # b is the lowest order bit to be extracted and c is the number of bits to 2597 # extract. The inner shift removes the bits above b + c by shifting left 2598 # 32 - (b + c). ishl only sees the low 5 bits of the shift count, which is 2599 # -(b + c). The outer shift moves the bit that was at b to bit zero. 2600 # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c. 2601 # This means that it must be shifted right by 32 - c or -c bits. 2602 (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'), 2603 2604 # Clean up no-op shifts that may result from the bfe lowerings. 2605 (('ishl', a, 0), a), 2606 (('ishl', a, -32), a), 2607 (('ishr', a, 0), a), 2608 (('ishr', a, -32), a), 2609 (('ushr', a, 0), a), 2610 2611 (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)), 2612 (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)), 2613 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 2614 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 2615] 2616 2617# A few more extract cases we'd rather leave late 2618for N in [16, 32]: 2619 aN = 'a@{0}'.format(N) 2620 u2uM = 'u2u{0}'.format(M) 2621 i2iM = 'i2i{0}'.format(M) 2622 2623 for x in ['u', 'i']: 2624 x2xN = '{0}2{0}{1}'.format(x, N) 2625 extract_x8 = 'extract_{0}8'.format(x) 2626 extract_x16 = 'extract_{0}16'.format(x) 2627 2628 late_optimizations.extend([ 2629 ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 2630 ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 2631 ]) 2632 2633 if N > 16: 2634 late_optimizations.extend([ 2635 ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 2636 ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 2637 ]) 2638 2639# Byte insertion 2640late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 2641late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 2642late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte')) 2643 2644late_optimizations += [ 2645 # Word insertion 2646 (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'), 2647 2648 # Extract and then insert 2649 (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)), 2650 (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)), 2651] 2652 2653# Integer sizes 2654for s in [8, 16, 32, 64]: 2655 late_optimizations.extend([ 2656 (('iand', ('ine(is_used_once)', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0)), 2657 (('ior', ('ieq(is_used_once)', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0)), 2658 ]) 2659 2660# Float sizes 2661for s in [16, 32, 64]: 2662 late_optimizations.extend([ 2663 (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), 
('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)), 2664 (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))), 2665 ]) 2666 2667 for op in ['fadd']: 2668 late_optimizations += [ 2669 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2670 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2671 ] 2672 2673 for op in ['ffma']: 2674 late_optimizations += [ 2675 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2676 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2677 2678 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2679 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2680 ] 2681 2682 # mediump: If an opcode is surrounded by conversions, remove the conversions. 2683 # The rationale is that type conversions + the low precision opcode are more 2684 # expensive than the same arithmetic opcode at higher precision. 2685 # 2686 # This must be done in late optimizations, because we need normal optimizations to 2687 # first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))). 2688 # 2689 # Unary opcodes 2690 for op in ['fabs', 'fceil', 'fcos', 'fddx', 'fddx_coarse', 'fddx_fine', 'fddy', 2691 'fddy_coarse', 'fddy_fine', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg', 2692 'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']: 2693 late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))] 2694 2695 # Binary opcodes 2696 for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']: 2697 late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))] 2698 2699 # Ternary opcodes 2700 for op in ['ffma', 'flrp']: 2701 late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))] 2702 2703 # Comparison opcodes 2704 for op in ['feq', 'fge', 'flt', 'fneu']: 2705 late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))] 2706 2707 # Do this last, so that the f2fmp patterns above have effect. 2708 late_optimizations += [ 2709 # Convert *2*mp instructions to concrete *2*16 instructions. At this point 2710 # any conversions that could have been removed will have been removed in 2711 # nir_opt_algebraic so any remaining ones are required. 2712 (('f2fmp', a), ('f2f16', a)), 2713 (('f2imp', a), ('f2i16', a)), 2714 (('f2ump', a), ('f2u16', a)), 2715 (('i2imp', a), ('i2i16', a)), 2716 (('i2fmp', a), ('i2f16', a)), 2717 (('i2imp', a), ('u2u16', a)), 2718 (('u2fmp', a), ('u2f16', a)), 2719 (('fisfinite', a), ('flt', ('fabs', a), float("inf"))), 2720 ] 2721 2722 distribute_src_mods = [ 2723 # Try to remove some spurious negations rather than pushing them down.
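    # (Editor's note) Cancelling pairs like -a * -b up front is cheaper than
    # distributing both negations; the remaining rules in this table then push
    # a single fneg/fabs toward the sources, where backends with source
    # modifiers can usually absorb it for free.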
2724 (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)), 2725 (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 2726 (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)), 2727 (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)), 2728 (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)), 2729 (('fneg', ('fneg', a)), a), 2730 2731 (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)), 2732 (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))), 2733 2734 (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))), 2735 (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)), 2736 (('fneg', ('fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))), 2737 2738 # Note that fmin <-> fmax. I don't think there is a way to distribute 2739 # fabs() into fmin or fmax. 2740 (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))), 2741 (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))), 2742 2743 (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)), 2744 (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)), 2745 (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)), 2746 2747 # fdph works mostly like fdot, but to get the correct result, the negation 2748 # must be applied to the second source. 2749 (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))), 2750 2751 (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))), 2752 (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))), 2753] 2754 2755print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render()) 2756print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma", 2757 before_ffma_optimizations).render()) 2758print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late", 2759 late_optimizations).render()) 2760print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods", 2761 distribute_src_mods).render()) 2762
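# Editor's addition: an optional, self-contained sanity check that the
# mask-and-shift sequence used by bitfield_reverse() above really reverses the
# bits of a 32-bit value.  It is not part of any generated pass, runs after
# the passes have been rendered, and the helper name and test values below are
# arbitrary.
def _bitrev32_steps(u):
    u = ((u << 16) | (u >> 16)) & 0xffffffff
    u = (((u & 0x00ff00ff) << 8) | ((u & 0xff00ff00) >> 8)) & 0xffffffff
    u = (((u & 0x0f0f0f0f) << 4) | ((u & 0xf0f0f0f0) >> 4)) & 0xffffffff
    u = (((u & 0x33333333) << 2) | ((u & 0xcccccccc) >> 2)) & 0xffffffff
    u = (((u & 0x55555555) << 1) | ((u & 0xaaaaaaaa) >> 1)) & 0xffffffff
    return u

for _v in (0x00000001, 0x80000000, 0x12345678, 0xdeadbeef):
    assert _bitrev32_steps(_v) == int('{:032b}'.format(_v)[::-1], 2)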