#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Jason Ekstrand (jason@jlekstrand.net)

from __future__ import print_function

from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools

# Convenience variables: one-letter names that stand for pattern variables
# of the same name in the search/replace expressions below.
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)]" where "#" indicates
# that the given variable will only match constants and the type indicates that
# the given variable will only match values from ALU instructions with the
# given output type, and (cond) specifies an additional condition function
# (see nir_search_helpers.h).
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified.  For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size.  In the search half of the expression this indicates that it
# should only match that particular bit-size.  In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.

# The main algebraic-optimization table.  Each entry is
# (<search>, <replace>[, <condition>]) as described in the header comment;
# the optional third element is a C expression on the backend's
# nir_shader_compiler_options gating when the rule applies.
optimizations = [

   (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))),
   (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))),
   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('udiv', a, 1), a),
   (('idiv', a, 1), a),
   (('umod', a, 1), 0),
   (('imod', a, 1), 0),
   (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b))),
   (('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'),
   (('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'),
   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1))),

   (('fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fabs', a)), ('fabs', a)),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('f2b', ('fneg', a)), ('f2b', a)),
   (('i2b', ('ineg', a)), ('i2b', a)),
   (('~fadd', a, 0.0), a),
   (('iadd', a, 0), a),
   (('usadd_4x8', a, 0), a),
   (('usadd_4x8', a, ~0), ~0),
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('~fmul', a, 0.0), 0.0),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8', a, 0), 0),
   (('umul_unorm_4x8', a, ~0), a),
   (('fmul', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   (('~ffma', 0.0, a, b), b),
   (('~ffma', a, 0.0, b), b),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma', a, 1.0, b), ('fadd', a, b)),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),
   (('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'),
   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
   (('flrp@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp16'),
   (('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'),
   (('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'),
   (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
   (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp32'),
   (('~fadd@32', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp32'),
   (('~fadd@64', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp64'),
   (('~fadd', a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp32'),
   (('~fadd@32', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp32'),
   (('~fadd@64', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp64'),
   (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
   (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),

   # Comparison simplifications
   (('~inot', ('flt', a, b)), ('fge', a, b)),
   (('~inot', ('fge', a, b)), ('flt', a, b)),
   (('~inot', ('feq', a, b)), ('fne', a, b)),
   (('~inot', ('fne', a, b)), ('feq', a, b)),
   (('inot', ('ilt', a, b)), ('ige', a, b)),
   (('inot', ('ult', a, b)), ('uge', a, b)),
   (('inot', ('ige', a, b)), ('ilt', a, b)),
   (('inot', ('uge', a, b)), ('ult', a, b)),
   (('inot', ('ieq', a, b)), ('ine', a, b)),
   (('inot', ('ine', a, b)), ('ieq', a, b)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fne', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fne', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fne', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('ior', a, b)),
   (('fne', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),
   (('fne', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fne', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fne', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('iand', a, b)),
   (('fne', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fne', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ixor', a, b)),
   (('fne', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous
   # optimizations that try to remove comparisons with negated sums of b2f.
   # This just breaks that apart.
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fne', ('fadd', a, b), a), ('fne', b, 0.0)),

   # Cannot remove the addition from ilt or ige due to overflow.
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),

   # fmin(-b2f(a), b) >= 0.0
   # -b2f(a) >= 0.0 && b >= 0.0
   # -b2f(a) == 0.0 && b >= 0.0    -b2f can only be 0 or -1, never >0
   # b2f(a) == 0.0 && b >= 0.0
   # a == False && b >= 0.0
   # !a && b >= 0.0
   #
   # The fge in the second replacement is not a typo.  I leave the proof that
   # "fmin(-b2f(a), b) >= 0 <=> fmin(-b2f(a), b) == 0" as an exercise for the
   # reader.
   (('fge', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),
   (('feq', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),

   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('fne', ('b2f', 'a@1'), 0.0), a),
   (('ieq', ('b2i', 'a@1'), 0), ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0), a),

   (('fne', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead?
   (('fne', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fne', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a) because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))),
   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
   (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),

   # fmin(b2f(a), b)
   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
   #
   # Since b is a constant, constant folding will eliminate the fmin and the
   # fmax.  If b is > 1.0, the bcsel will be replaced with a b2f.
   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
   (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)),
   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, True, b), ('ior', a, b)),
   (('bcsel', a, a, b), ('ior', a, b)),
   (('bcsel', a, b, False), ('iand', a, b)),
   (('bcsel', a, b, a), ('iand', a, b)),
   (('fmin', a, a), a),
   (('fmax', a, a), a),
   (('imin', a, a), a),
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umax', a, a), a),
   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
   (('umax', ('umax', a, b), b), ('umax', a, b)),
   (('imax', ('imax', a, b), b), ('imax', a, b)),
   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
   (('umin', ('umin', a, b), b), ('umin', a, b)),
   (('imin', ('imin', a, b), b), ('imin', a, b)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a)),
   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
   (('fmin', a, ('fabs', a)), a),
   (('imin', a, ('iabs', a)), a),
   (('fmax', a, ('fneg', ('fabs', a))), a),
   (('imax', a, ('ineg', ('iabs', a))), a),
   (('fmax', a, ('fabs', a)), ('fabs', a)),
   (('imax', a, ('iabs', a)), ('iabs', a)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a)),
   (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   (('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   (('fmax', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   (('fmin', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
   (('~ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
   (('~ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
   (('~ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
   (('~ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
   (('~ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('fmax', b, c))),
   (('~ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('fmin', a, b), c)),
   (('~ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('fmin', b, c))),
   (('~ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('fmax', a, b), c)),
   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
   (('~iand', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('fmin', a, b), c)),

   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),

   # Common pattern like 'if (i == 0 || i == 1 || ...)'
   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),

   # The (i2f32, ...) part is an open-coded fsign.  When that is combined
   # with the bcsel, it's basically copysign(1.0, a).  There is no copysign
   # in NIR, so emit an open-coded version of that.
   (('bcsel@32', ('feq', a, 0.0), 1.0, ('i2f32', ('iadd', ('b2i32', ('flt', 0.0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0.0)))))),
    ('ior', 0x3f800000, ('iand', a, 0x80000000))),

   (('ior', a, ('ieq', a, False)), True),
   (('ior', a, ('inot', a)), -1),

   (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
   (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),

   (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),

   # These patterns can result when (a < b || a < c) => (a < min(b, c))
   # transformations occur before constant propagation and loop-unrolling.
   (('~flt', a, ('fmax', b, a)), ('flt', a, b)),
   (('~flt', ('fmin', a, b), a), ('flt', b, a)),
   (('~fge', a, ('fmin', b, a)), True),
   (('~fge', ('fmax', a, b), a), True),
   (('~flt', a, ('fmin', b, a)), False),
   (('~flt', ('fmax', a, b), a), False),
   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
   (('~fge', ('fmin', a, b), a), ('fge', b, a)),

   (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
   (('ilt', ('imin', a, b), a), ('ilt', b, a)),
   (('ige', a, ('imin', b, a)), True),
   (('ige', ('imax', a, b), a), True),
   (('ult', a, ('umax', b, a)), ('ult', a, b)),
   (('ult', ('umin', a, b), a), ('ult', b, a)),
   (('uge', a, ('umin', b, a)), True),
   (('uge', ('umax', a, b), a), True),
   (('ilt', a, ('imin', b, a)), False),
   (('ilt', ('imax', a, b), a), False),
   (('ige', a, ('imax', b, a)), ('ige', a, b)),
   (('ige', ('imin', a, b), a), ('ige', b, a)),
   (('ult', a, ('umin', b, a)), False),
   (('ult', ('umax', a, b), a), False),
   (('uge', a, ('umax', b, a)), ('uge', a, b)),
   (('uge', ('umin', a, b), a), ('uge', b, a)),

   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
   (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
   (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
   (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
   (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),
   (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),
   (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),
   (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
   (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),

   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
   # negative.
   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
    ('iabs', ('ishr', a, b))),
   (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),

   (('fabs', ('slt', a, b)), ('slt', a, b)),
   (('fabs', ('sge', a, b)), ('sge', a, b)),
   (('fabs', ('seq', a, b)), ('seq', a, b)),
   (('fabs', ('sne', a, b)), ('sne', a, b)),
   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
   (('sne', a, b), ('b2f', ('fne', a, b)), 'options->lower_scmp'),
   (('fne', ('fneg', a), a), ('fne', a, 0.0)),
   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
   # Emulating booleans
   (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
   (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
   (('iand', 'a@bool32', 1.0), ('b2f', a)),
   # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
   (('ineg', ('b2i32', 'a@32')), a),
   (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
   (('flt', ('fsub', 0.0, ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
   # Comparison with the same args.  Note that these are not done for
   # the float versions because NaN always returns false on float
   # inequalities.
   (('ilt', a, a), False),
   (('ige', a, a), True),
   (('ieq', a, a), True),
   (('ine', a, a), False),
   (('ult', a, a), False),
   (('uge', a, a), True),
   # Logical and bit operations
   (('fand', a, 0.0), 0.0),
   (('iand', a, a), a),
   (('iand', a, ~0), a),
   (('iand', a, 0), 0),
   (('ior', a, a), a),
   (('ior', a, 0), a),
   (('ior', a, True), True),
   (('fxor', a, a), 0.0),
   (('ixor', a, a), 0),
   (('ixor', a, 0), a),
   (('inot', ('inot', a)), a),
   (('ior', ('iand', a, b), b), b),
   (('ior', ('ior', a, b), b), ('ior', a, b)),
   (('iand', ('ior', a, b), b), b),
   (('iand', ('iand', a, b), b), ('iand', a, b)),
   # DeMorgan's Laws
   (('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, b))),
   (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
   # Shift optimizations
   (('ishl', 0, a), 0),
   (('ishl', a, 0), a),
   (('ishr', 0, a), 0),
   (('ishr', a, 0), a),
   (('ushr', 0, a), 0),
   (('ushr', a, 0), a),
   (('iand', 0xff, ('ushr@32', a, 24)), ('ushr', a, 24)),
   (('iand', 0xffff, ('ushr@32', a, 16)), ('ushr', a, 16)),
   # Exponential/logarithmic identities
   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a)*b + lg2(c)*d) = a^b * c^d
   (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
   (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
   (('~fpow', a, 1.0), a),
   (('~fpow', a, 2.0), ('fmul', a, a)),
   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
   (('~fpow', 2.0, a), ('fexp2', a)),
   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
   (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
   # Division and reciprocal
   (('~fdiv', 1.0, a), ('frcp', a)),
   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
   (('~frcp', ('frcp', a)), a),
   (('~frcp', ('fsqrt', a)), ('frsq', a)),
   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
   # Boolean simplifications
   (('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
   (('i2b1(is_used_by_if)', a), ('ine', a, 0)),
   (('ieq', a, True), a),
   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
   (('ine', a, False), a),
   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
   (('bcsel', a, True, False), a),
   (('bcsel', a, False, True), ('inot', a)),
   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
   (('bcsel', True, b, c), b),
   (('bcsel', False, b, c), c),
   (('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))),
   # The result of this should be hit by constant propagation and, in the
   # next round of opt_algebraic, get picked up by one of the above two.
   (('bcsel', '#a', b, c), ('bcsel', ('ine', 'a', 0), b, c)),

   (('bcsel', a, b, b), b),
   (('fcsel', a, b, b), b),

   # D3D Boolean emulation
   (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
   (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
   (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
    ('ineg', ('b2i', ('iand', a, b)))),
   (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
    ('ineg', ('b2i', ('ior', a, b)))),
   (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
   (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
   (('ine', ('ineg', ('b2i', 'a@1')), 0), a),
   (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
   (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),

   # SM5 32-bit shifts are defined to use the 5 least significant bits
   (('ishl', 'a@32', ('iand', 31, b)), ('ishl', a, b)),
   (('ishr', 'a@32', ('iand', 31, b)), ('ishr', a, b)),
   (('ushr', 'a@32', ('iand', 31, b)), ('ushr', a, b)),

   # Conversions
   (('i2b32', ('b2i', 'a@32')), a),
   (('f2i', ('ftrunc', a)), ('f2i', a)),
   (('f2u', ('ftrunc', a)), ('f2u', a)),
   (('i2b', ('ineg', a)), ('i2b', a)),
   (('i2b', ('iabs', a)), ('i2b', a)),
   (('fabs', ('b2f', a)), ('b2f', a)),
   (('iabs', ('b2i', a)), ('b2i', a)),
   (('inot', ('f2b1', a)), ('feq', a, 0.0)),

   # Ironically, mark these as imprecise because removing the conversions may
   # preserve more precision than doing the conversions (e.g.,
   # uint(float(0x81818181u)) == 0x81818200).
   (('~f2i32', ('i2f', 'a@32')), a),
   (('~f2i32', ('u2f', 'a@32')), a),
   (('~f2u32', ('i2f', 'a@32')), a),
   (('~f2u32', ('u2f', 'a@32')), a),

   # Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec
   # says:
   #
   #    It is undefined to convert a negative floating-point value to an
   #    uint.
   #
   # Assuming that (uint)some_float behaves like (uint)(int)some_float allows
   # some optimizations in the i965 backend to proceed.
   (('ige', ('f2u', a), b), ('ige', ('f2i', a), b)),
   (('ige', b, ('f2u', a)), ('ige', b, ('f2i', a))),
   (('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)),
   (('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))),

   # Packing and then unpacking does nothing
   (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
   (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
   (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
                           ('unpack_64_2x32_split_y', a)), a),

   # Byte extraction
   (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
   (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
   (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
   (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
]

# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
# patterns like those below.
for op in ('ushr', 'ishr'):
   optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))])
   optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
   optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])

optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])

# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
# patterns like those below.
# Fold a shift that moves a byte into place followed by an extract of byte 0
# into a direct extract of the original byte.
for op in ('extract_u8', 'extract_i8'):
   optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))])
   optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
   optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])

optimizations.extend([
   # Word extraction
   (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
   (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
   (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
   (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
   (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),

   # Subtracts
   (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
   (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
   (('ussub_4x8', a, 0), a),
   (('ussub_4x8', a, ~0), 0),
   (('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'),
   (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
   (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
   (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
   (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
   (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
   (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
   (('iabs', ('isub', 0, a)), ('iabs', a)),

   # Propagate negation up multiplication chains
   (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),

   # Propagate constants up multiplication chains
   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),

   # Reassociate constants in add/mul chains so they can be folded together.
   # For now, we mostly only handle cases where the constants are separated by
   # a single non-constant.  We could do better eventually.
   (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
   (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
   (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)),
   (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
   (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),

   # By definition...
   (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
   (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
   (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),

   (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
   (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
   (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),

   (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),

   # Misc. lowering
   (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod16'),
   (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod32'),
   (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod64'),
   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod32'),
   (('uadd_carry@32', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
   (('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),

   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
    ('bcsel', ('ilt', 31, 'bits'), 'insert',
              ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
    'options->lower_bitfield_insert'),
   (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
   (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
   (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
   (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
   (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'),
   (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'),

   # Alternative lowering that doesn't rely on bfi.
   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
    ('bcsel', ('ilt', 31, 'bits'),
     'insert',
     ('ior',
      ('iand', 'base', ('inot', ('bfm', 'bits', 'offset'))),
      ('iand', ('ishl', 'insert', 'offset'), ('bfm', 'bits', 'offset')))),
    'options->lower_bitfield_insert_to_shifts'),

   # bfm lowering -- note that the NIR opcode is undefined if either arg is 32.
   (('bfm', 'bits', 'offset'),
    ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'),
    'options->lower_bfm'),

   (('ibitfield_extract', 'value', 'offset', 'bits'),
    ('bcsel', ('ilt', 31, 'bits'), 'value',
              ('ibfe', 'value', 'offset', 'bits')),
    'options->lower_bitfield_extract'),

   (('ubitfield_extract', 'value', 'offset', 'bits'),
    ('bcsel', ('ult', 31, 'bits'), 'value',
              ('ubfe', 'value', 'offset', 'bits')),
    'options->lower_bitfield_extract'),

   (('ibitfield_extract', 'value', 'offset', 'bits'),
    ('bcsel', ('ieq', 0, 'bits'),
     0,
     ('ishr',
      ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
      ('isub', 32, 'bits'))),
    'options->lower_bitfield_extract_to_shifts'),

   (('ubitfield_extract', 'value', 'offset', 'bits'),
    ('iand',
     ('ushr', 'value', 'offset'),
     ('bcsel', ('ieq', 'bits', 32),
      0xffffffff,
      ('bfm', 'bits', 0))),
    'options->lower_bitfield_extract_to_shifts'),

   (('ifind_msb', 'value'),
    ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
    'options->lower_ifind_msb'),

   (('find_lsb', 'value'),
    ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
    'options->lower_find_lsb'),

   (('extract_i8', a, 'b@32'),
    ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
    'options->lower_extract_byte'),

   (('extract_u8', a, 'b@32'),
    ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
    'options->lower_extract_byte'),

   (('extract_i16', a, 'b@32'),
    ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
    'options->lower_extract_word'),

   (('extract_u16', a, 'b@32'),
    ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
    'options->lower_extract_word'),

   (('pack_unorm_2x16', 'v'),
    ('pack_uvec2_to_uint',
     ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
    'options->lower_pack_unorm_2x16'),

   (('pack_unorm_4x8', 'v'),
    ('pack_uvec4_to_uint',
     ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
    'options->lower_pack_unorm_4x8'),

   (('pack_snorm_2x16', 'v'),
    ('pack_uvec2_to_uint',
     ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
    'options->lower_pack_snorm_2x16'),

   (('pack_snorm_4x8', 'v'),
    ('pack_uvec4_to_uint',
     ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
    'options->lower_pack_snorm_4x8'),

   (('unpack_unorm_2x16', 'v'),
    ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
                                ('extract_u16', 'v', 1))),
             65535.0),
    'options->lower_unpack_unorm_2x16'),

   (('unpack_unorm_4x8', 'v'),
    ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
                                ('extract_u8', 'v', 1),
                                ('extract_u8', 'v', 2),
                                ('extract_u8', 'v', 3))),
             255.0),
    'options->lower_unpack_unorm_4x8'),

   (('unpack_snorm_2x16', 'v'),
    ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
                                                           ('extract_i16', 'v', 1))),
                                  32767.0))),
    'options->lower_unpack_snorm_2x16'),

   (('unpack_snorm_4x8', 'v'),
    ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
                                                           ('extract_i8', 'v', 1),
                                                           ('extract_i8', 'v', 2),
                                                           ('extract_i8', 'v', 3))),
                                  127.0))),
    'options->lower_unpack_snorm_4x8'),

   (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
   (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
])

# bit_size dependent lowerings
for bit_size in [8, 16, 32, 64]:
   # convenience constants
   intmax = (1 << (bit_size - 1)) - 1
   # intmin is written as the (positive) unsigned bit pattern of the most
   # negative bit_size-wide integer.
   intmin = 1 << (bit_size - 1)

   optimizations += [
      (('iadd_sat@' + str(bit_size), a, b),
       ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
                 ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_add_sat'),
      (('isub_sat@' + str(bit_size), a, b),
       ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
                 ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
   ]

# Each float comparison opcode mapped to its logical inverse.  Used below to
# push an inot through ior/iand of comparisons via De Morgan's law.
invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])

for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
   optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
                         ('iand', (invert[left], a, b), (invert[right], c, d))))
   optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
                         ('ior', (invert[left], a, b), (invert[right], c, d))))

# Optimize x2bN(b2x(x)) -> x
for size in type_sizes('bool'):
   aN = 'a@' + str(size)
   f2bN = 'f2b' + str(size)
   i2bN = 'i2b' + str(size)
   optimizations.append(((f2bN, ('b2f', aN)), a))
   optimizations.append(((i2bN, ('b2i', aN)), a))

# Optimize x2yN(b2x(x)) -> b2y
for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
   if x != 'f' and y != 'f' and x != y:
      continue

   b2x = 'b2f' if x == 'f' else 'b2i'
   b2y = 'b2f' if y == 'f' else 'b2i'
   x2yN = '{}2{}'.format(x, y)
   optimizations.append(((x2yN, (b2x, a)), (b2y, a)))

# Optimize away x2xN(a@N)
for t in ['int', 'uint', 'float']:
   for N in type_sizes(t):
      x2xN = '{0}2{0}{1}'.format(t[0], N)
      aN = 'a@{0}'.format(N)
      optimizations.append(((x2xN, aN), a))

# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
# In particular, we can optimize away everything except upcast of downcast and
# upcasts where the type differs from the other cast
for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
   if N < M:
      # The outer cast is a down-cast.  It doesn't matter what the size of the
      # argument of the inner cast is because we'll never be in the upcast
      # of downcast case.  Regardless of types, we'll always end up with y2yN
      # in the end.
      for x, y in itertools.product(['i', 'u'], ['i', 'u']):
         x2xN = '{0}2{0}{1}'.format(x, N)
         y2yM = '{0}2{0}{1}'.format(y, M)
         y2yN = '{0}2{0}{1}'.format(y, N)
         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
   elif N > M:
      # If the outer cast is an up-cast, we have to be more careful about the
      # size of the argument of the inner cast and with types.  In this case,
      # the type is always the type of the up-cast which is given by the
      # outer cast.
      for P in type_sizes('uint'):
         # We can't optimize away up-cast of down-cast.
         if M < P:
            continue

         # Because we're doing down-cast of down-cast, the types always have
         # to match between the two casts
         for x in ['i', 'u']:
            x2xN = '{0}2{0}{1}'.format(x, N)
            x2xM = '{0}2{0}{1}'.format(x, M)
            aP = 'a@{0}'.format(P)
            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
   else:
      # The N == M case is handled by other optimizations
      pass

def fexp2i(exp, bits):
   """Build a search expression computing 2.0**exp as a `bits`-wide float.

   The expression constructs the IEEE bit pattern directly by shifting the
   biased exponent into the exponent field (bias 15/127/1023 for 16/32/64
   bits).  We assume that exp is already in the right range.
   """
   if bits == 16:
      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
   elif bits == 32:
      return ('ishl', ('iadd', exp, 127), 23)
   elif bits == 64:
      # Only the high 32 bits carry the exponent field; the low word is 0.
      return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
   else:
      assert False

def ldexp(f, exp, bits):
   """Build a search expression computing f * 2.0**exp for `bits`-wide floats."""
   # First, we clamp exp to a reasonable range.  The maximum possible range
   # for a normal exponent is [-126, 127] and, throwing in denormals, you get
   # a maximum range of [-149, 127].  This means that we can potentially have
   # a swing of +-276.  If you start with FLT_MAX, you actually have to do
   # ldexp(FLT_MAX, -278) to get it to flush all the way to zero.  The GLSL
   # spec, on the other hand, only requires that we handle an exponent value
   # in the range [-126, 128].  This implementation is *mostly* correct; it
   # handles a range on exp of [-252, 254] which allows you to create any
   # value (including denorms if the hardware supports it) and to adjust the
   # exponent of any normal value to anything you want.
   if bits == 16:
      exp = ('imin', ('imax', exp, -28), 30)
   elif bits == 32:
      exp = ('imin', ('imax', exp, -252), 254)
   elif bits == 64:
      exp = ('imin', ('imax', exp, -2044), 2046)
   else:
      assert False

   # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
   # (We use ishr which isn't the same for -1, but the -1 case still works
   # since we use exp-exp/2 as the second exponent.)  While the spec
   # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
   # work with denormals and doesn't allow for the full swing in exponents
   # that you can get with normalized values.  Instead, we create two powers
   # of two and multiply by them each in turn.  That way the effective range
   # of our exponent is doubled.
   pow2_1 = fexp2i(('ishr', exp, 1), bits)
   pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
   return ('fmul', ('fmul', f, pow2_1), pow2_2)

optimizations += [
   (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
   (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
   (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
]

# Unreal Engine 4 demo applications open-codes bitfieldReverse()
def bitfield_reverse(u):
   """Build the open-coded 32-bit bit-reversal expression (swap halves, then
   bytes, nibbles, bit-pairs, and finally single bits)."""
   step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
   step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
   step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
   step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
   step5 = ('ior', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))

   return step5

optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]

# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
# then the "a == a" is redundant because it's equivalent to "a is not NaN"
# and, if a is a NaN then the second comparison will fail anyway.
for op in ['flt', 'fge', 'feq']:
   optimizations += [
      (('iand', ('feq', a, a), (op, a, b)), (op, a, b)),
      (('iand', ('feq', a, a), (op, b, a)), (op, b, a)),
   ]

# Add optimizations to handle the case where the result of a ternary is
# compared to a constant.  This way we can take things like
#
#    (a ? 0 : 1) > 0
#
# and turn it into
#
#    a ? (0 > 0) : (1 > 0)
#
# which constant folding will eat for lunch.  The resulting ternary will
# further get cleaned up by the boolean reductions above and we will be
# left with just the original variable "a".
# Push a comparison against constants into both (constant) arms of a bcsel so
# that constant folding can evaluate each arm independently.
for cmp_op in ('flt', 'fge', 'feq', 'fne',
               'ilt', 'ige', 'ieq', 'ine', 'ult', 'uge'):
   optimizations.extend([
      ((cmp_op, ('bcsel', 'a', '#b', '#c'), '#d'),
       ('bcsel', 'a', (cmp_op, 'b', 'd'), (cmp_op, 'c', 'd'))),
      ((cmp_op, '#d', ('bcsel', a, '#b', '#c')),
       ('bcsel', 'a', (cmp_op, 'd', 'b'), (cmp_op, 'd', 'c'))),
   ])


# Distribute an add/mul with a constant operand into a single-use bcsel that
# has a constant arm, e.g. 1 + mix(0, a - 1, condition) becomes
# mix(1, (a-1)+1, condition).  Other optimizations will rearrange the
# constants afterwards.
for arith_op in ('fadd', 'fmul', 'iadd', 'imul'):
   optimizations.append(
      ((arith_op, ('bcsel(is_used_once)', a, '#b', c), '#d'),
       ('bcsel', a, (arith_op, b, d), (arith_op, c, d))))

# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
# states:
#
#    If neither layout qualifier is specified, derivatives in compute shaders
#    return zero, which is consistent with the handling of built-in texture
#    functions like texture() in GLSL 4.50 compute shaders.
for deriv_op in ('fddx', 'fddx_fine', 'fddx_coarse',
                 'fddy', 'fddy_fine', 'fddy_coarse'):
   optimizations.append(
      ((deriv_op, 'a'), 0.0,
       'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE'))

# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
# with the regular optimizations.
before_ffma_optimizations = [
   # Propagate constants down multiplication chains
   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),

   # Factor out a common multiplicand: a*b + a*c -> a*(b + c)
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   # Cancel additions of a value and its negation ('~' marks the float
   # variants inexact, per the rules at the top of this file).
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
]

# This section contains "late" optimizations that should be run after the
# regular optimizations have finished.  Optimizations should go here if
# they help code generation but do not necessarily produce code that is
# more easily optimizable.
late_optimizations = [
   # Most of these optimizations aren't quite safe when you get infinity or
   # Nan involved but the first one should be fine.
   (('flt', ('fadd', a, b), 0.0), ('flt', a, ('fneg', b))),
   (('flt', ('fneg', ('fadd', a, b)), 0.0), ('flt', ('fneg', a), b)),
   (('~fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
   (('~fge', ('fneg', ('fadd', a, b)), 0.0), ('fge', ('fneg', a), b)),
   (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
   (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),

   # min(a+b, c+d) >= 0 becomes (a >= -b) && (c >= -d)
   (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),

   (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
   (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
   (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
   (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),

   # we do these late so that we don't get in the way of creating ffmas
   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),

   # bcsel(a, 0, b2f(!b)) -> b2f(!(a || b))
   (('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))),
]

# Emit the generated C source for the three algebraic passes on stdout.
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
                                  before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
                                  late_optimizations).render())