1b8e80941Smrg# 2b8e80941Smrg# Copyright (C) 2014 Intel Corporation 3b8e80941Smrg# 4b8e80941Smrg# Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg# copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg# to deal in the Software without restriction, including without limitation 7b8e80941Smrg# the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg# and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg# Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg# 11b8e80941Smrg# The above copyright notice and this permission notice (including the next 12b8e80941Smrg# paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg# Software. 14b8e80941Smrg# 15b8e80941Smrg# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg# IN THE SOFTWARE. 
22b8e80941Smrg# 23b8e80941Smrg# Authors: 24b8e80941Smrg# Jason Ekstrand (jason@jlekstrand.net) 25b8e80941Smrg 26b8e80941Smrgfrom __future__ import print_function 27b8e80941Smrg 28b8e80941Smrgfrom collections import OrderedDict 29b8e80941Smrgimport nir_algebraic 30b8e80941Smrgfrom nir_opcodes import type_sizes 31b8e80941Smrgimport itertools 32b8e80941Smrg 33b8e80941Smrg# Convenience variables: each of a-e is bound to the string of the same name, so patterns can name variables without quoting 34b8e80941Smrga = 'a' 35b8e80941Smrgb = 'b' 36b8e80941Smrgc = 'c' 37b8e80941Smrgd = 'd' 38b8e80941Smrge = 'e' 39b8e80941Smrg 40b8e80941Smrg# Written in the form (<search>, <replace>) where <search> is an expression 41b8e80941Smrg# and <replace> is either an expression or a value. An expression is 42b8e80941Smrg# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>) 43b8e80941Smrg# where each source is either an expression or a value. A value can be 44b8e80941Smrg# either a numeric constant or a string representing a variable name. 45b8e80941Smrg# 46b8e80941Smrg# If the opcode in a search expression is prefixed by a '~' character, this 47b8e80941Smrg# indicates that the operation is inexact. Such operations will only get 48b8e80941Smrg# applied to SSA values that do not have the exact bit set. This should be 49b8e80941Smrg# used by any optimizations that are not bit-for-bit exact. It should not, 50b8e80941Smrg# however, be used for backend-requested lowering operations as those need to 51b8e80941Smrg# happen regardless of precision. 52b8e80941Smrg# 53b8e80941Smrg# Variable names are specified as "[#]name[@type][(cond)]" where "#" indicates 54b8e80941Smrg# that the given variable will only match constants and the type indicates that 55b8e80941Smrg# the given variable will only match values from ALU instructions with the 56b8e80941Smrg# given output type, and (cond) specifies an additional condition function 57b8e80941Smrg# (see nir_search_helpers.h). 
58b8e80941Smrg# 59b8e80941Smrg# For constants, you have to be careful to make sure that it is the right 60b8e80941Smrg# type because python is unaware of the source and destination types of the 61b8e80941Smrg# opcodes. 62b8e80941Smrg# 63b8e80941Smrg# All expression types can have a bit-size specified. For opcodes, this 64b8e80941Smrg# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a 65b8e80941Smrg# type and size. In the search half of the expression this indicates that it 66b8e80941Smrg# should only match that particular bit-size. In the replace half of the 67b8e80941Smrg# expression this indicates that the constructed value should have that 68b8e80941Smrg# bit-size. 69b8e80941Smrg 70b8e80941Smrgoptimizations = [ 71b8e80941Smrg 72b8e80941Smrg (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))), 73b8e80941Smrg (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))), 74b8e80941Smrg (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), 75b8e80941Smrg (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), 76b8e80941Smrg (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'), 77b8e80941Smrg (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'), 78b8e80941Smrg (('udiv', a, 1), a), 79b8e80941Smrg (('idiv', a, 1), a), 80b8e80941Smrg (('umod', a, 1), 0), 81b8e80941Smrg (('imod', a, 1), 0), 82b8e80941Smrg (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b))), 83b8e80941Smrg (('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'), 84b8e80941Smrg (('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'), 85b8e80941Smrg (('umod', 
a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1))), 86b8e80941Smrg 87b8e80941Smrg (('fneg', ('fneg', a)), a), 88b8e80941Smrg (('ineg', ('ineg', a)), a), 89b8e80941Smrg (('fabs', ('fabs', a)), ('fabs', a)), 90b8e80941Smrg (('fabs', ('fneg', a)), ('fabs', a)), 91b8e80941Smrg (('fabs', ('u2f', a)), ('u2f', a)), 92b8e80941Smrg (('iabs', ('iabs', a)), ('iabs', a)), 93b8e80941Smrg (('iabs', ('ineg', a)), ('iabs', a)), 94b8e80941Smrg (('f2b', ('fneg', a)), ('f2b', a)), 95b8e80941Smrg (('i2b', ('ineg', a)), ('i2b', a)), 96b8e80941Smrg (('~fadd', a, 0.0), a), 97b8e80941Smrg (('iadd', a, 0), a), 98b8e80941Smrg (('usadd_4x8', a, 0), a), 99b8e80941Smrg (('usadd_4x8', a, ~0), ~0), 100b8e80941Smrg (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), 101b8e80941Smrg (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), 102b8e80941Smrg (('~fadd', ('fneg', a), a), 0.0), 103b8e80941Smrg (('iadd', ('ineg', a), a), 0), 104b8e80941Smrg (('iadd', ('ineg', a), ('iadd', a, b)), b), 105b8e80941Smrg (('iadd', a, ('iadd', ('ineg', a), b)), b), 106b8e80941Smrg (('~fadd', ('fneg', a), ('fadd', a, b)), b), 107b8e80941Smrg (('~fadd', a, ('fadd', ('fneg', a), b)), b), 108b8e80941Smrg (('~fmul', a, 0.0), 0.0), 109b8e80941Smrg (('imul', a, 0), 0), 110b8e80941Smrg (('umul_unorm_4x8', a, 0), 0), 111b8e80941Smrg (('umul_unorm_4x8', a, ~0), a), 112b8e80941Smrg (('fmul', a, 1.0), a), 113b8e80941Smrg (('imul', a, 1), a), 114b8e80941Smrg (('fmul', a, -1.0), ('fneg', a)), 115b8e80941Smrg (('imul', a, -1), ('ineg', a)), 116b8e80941Smrg # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a 117b8e80941Smrg # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a 118b8e80941Smrg # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0 119b8e80941Smrg (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)), 120b8e80941Smrg (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)), 121b8e80941Smrg (('~ffma', 0.0, a, b), b), 122b8e80941Smrg (('~ffma', a, 
0.0, b), b), 123b8e80941Smrg (('~ffma', a, b, 0.0), ('fmul', a, b)), 124b8e80941Smrg (('ffma', a, 1.0, b), ('fadd', a, b)), 125b8e80941Smrg (('ffma', 1.0, a, b), ('fadd', a, b)), 126b8e80941Smrg (('~flrp', a, b, 0.0), a), 127b8e80941Smrg (('~flrp', a, b, 1.0), b), 128b8e80941Smrg (('~flrp', a, a, b), a), 129b8e80941Smrg (('~flrp', 0.0, a, b), ('fmul', a, b)), 130b8e80941Smrg (('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'), 131b8e80941Smrg (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)), 132b8e80941Smrg (('flrp@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp16'), 133b8e80941Smrg (('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'), 134b8e80941Smrg (('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'), 135b8e80941Smrg (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'), 136b8e80941Smrg (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'), 137b8e80941Smrg (('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'), 138b8e80941Smrg (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'), 139b8e80941Smrg (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'), 140b8e80941Smrg (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp32'), 141b8e80941Smrg (('~fadd@32', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp32'), 142b8e80941Smrg (('~fadd@64', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp64'), 143b8e80941Smrg (('~fadd', a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp32'), 144b8e80941Smrg (('~fadd@32', a, ('fmul', c , ('fadd', b, ('fneg', a)))), 
('flrp', a, b, c), '!options->lower_flrp32'), 145b8e80941Smrg (('~fadd@64', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp64'), 146b8e80941Smrg (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'), 147b8e80941Smrg (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'), 148b8e80941Smrg 149b8e80941Smrg (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'), 150b8e80941Smrg ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))), 151b8e80941Smrg 152b8e80941Smrg (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)), 153b8e80941Smrg (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)), 154b8e80941Smrg (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)), 155b8e80941Smrg (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)), 156b8e80941Smrg 157b8e80941Smrg (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)), 158b8e80941Smrg (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)), 159b8e80941Smrg 160b8e80941Smrg # (a * #b + #c) << #d 161b8e80941Smrg # ((a * #b) << #d) + (#c << #d) 162b8e80941Smrg # (a * (#b << #d)) + (#c << #d) 163b8e80941Smrg (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'), 164b8e80941Smrg ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))), 165b8e80941Smrg 166b8e80941Smrg # (a * #b) << #c 167b8e80941Smrg # a * (#b << #c) 168b8e80941Smrg (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))), 169b8e80941Smrg 170b8e80941Smrg # Comparison simplifications 171b8e80941Smrg (('~inot', ('flt', a, b)), ('fge', a, b)), 172b8e80941Smrg (('~inot', ('fge', a, b)), ('flt', a, b)), 173b8e80941Smrg (('~inot', ('feq', a, b)), ('fne', a, b)), 174b8e80941Smrg (('~inot', ('fne', a, b)), ('feq', a, b)), 175b8e80941Smrg (('inot', ('ilt', a, b)), ('ige', a, b)), 176b8e80941Smrg (('inot', ('ult', a, b)), ('uge', a, b)), 177b8e80941Smrg (('inot', ('ige', a, b)), ('ilt', 
a, b)), 178b8e80941Smrg (('inot', ('uge', a, b)), ('ult', a, b)), 179b8e80941Smrg (('inot', ('ieq', a, b)), ('ine', a, b)), 180b8e80941Smrg (('inot', ('ine', a, b)), ('ieq', a, b)), 181b8e80941Smrg 182b8e80941Smrg # 0.0 >= b2f(a) 183b8e80941Smrg # b2f(a) <= 0.0 184b8e80941Smrg # b2f(a) == 0.0 because b2f(a) can only be 0 or 1 185b8e80941Smrg # inot(a) 186b8e80941Smrg (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)), 187b8e80941Smrg 188b8e80941Smrg (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)), 189b8e80941Smrg 190b8e80941Smrg (('fne', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)), 191b8e80941Smrg (('fne', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)), 192b8e80941Smrg (('fne', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('ior', a, b)), 193b8e80941Smrg (('fne', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)), 194b8e80941Smrg (('fne', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)), 195b8e80941Smrg (('fne', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)), 196b8e80941Smrg (('fne', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('iand', a, b)), 197b8e80941Smrg (('fne', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)), 198b8e80941Smrg (('fne', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ixor', a, b)), 199b8e80941Smrg (('fne', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)), 200b8e80941Smrg (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))), 201b8e80941Smrg (('feq', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))), 202b8e80941Smrg (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('inot', ('ior', a, b))), 203b8e80941Smrg (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))), 204b8e80941Smrg (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))), 205b8e80941Smrg (('feq', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))), 206b8e80941Smrg 
(('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('inot', ('iand', a, b))), 207b8e80941Smrg (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)), 208b8e80941Smrg (('feq', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ieq', a, b)), 209b8e80941Smrg (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)), 210b8e80941Smrg 211b8e80941Smrg # -(b2f(a) + b2f(b)) < 0 212b8e80941Smrg # 0 < b2f(a) + b2f(b) 213b8e80941Smrg # 0 != b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative 214b8e80941Smrg # a || b 215b8e80941Smrg (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)), 216b8e80941Smrg (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)), 217b8e80941Smrg 218b8e80941Smrg # -(b2f(a) + b2f(b)) >= 0 219b8e80941Smrg # 0 >= b2f(a) + b2f(b) 220b8e80941Smrg # 0 == b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative 221b8e80941Smrg # !(a || b) 222b8e80941Smrg (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))), 223b8e80941Smrg (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))), 224b8e80941Smrg 225b8e80941Smrg (('flt', a, ('fneg', a)), ('flt', a, 0.0)), 226b8e80941Smrg (('fge', a, ('fneg', a)), ('fge', a, 0.0)), 227b8e80941Smrg 228b8e80941Smrg # Some optimizations (below) convert things like (a < b || c < b) into 229b8e80941Smrg # (min(a, c) < b). However, this interfers with the previous optimizations 230b8e80941Smrg # that try to remove comparisons with negated sums of b2f. This just 231b8e80941Smrg # breaks that apart. 
232b8e80941Smrg (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0), 233b8e80941Smrg ('ior', ('flt', c, 0.0), ('ior', a, b))), 234b8e80941Smrg 235b8e80941Smrg (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)), 236b8e80941Smrg (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)), 237b8e80941Smrg (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)), 238b8e80941Smrg (('~fne', ('fadd', a, b), a), ('fne', b, 0.0)), 239b8e80941Smrg 240b8e80941Smrg # Cannot remove the addition from ilt or ige due to overflow. 241b8e80941Smrg (('ieq', ('iadd', a, b), a), ('ieq', b, 0)), 242b8e80941Smrg (('ine', ('iadd', a, b), a), ('ine', b, 0)), 243b8e80941Smrg 244b8e80941Smrg # fmin(-b2f(a), b) >= 0.0 245b8e80941Smrg # -b2f(a) >= 0.0 && b >= 0.0 246b8e80941Smrg # -b2f(a) == 0.0 && b >= 0.0 -b2f can only be 0 or -1, never >0 247b8e80941Smrg # b2f(a) == 0.0 && b >= 0.0 248b8e80941Smrg # a == False && b >= 0.0 249b8e80941Smrg # !a && b >= 0.0 250b8e80941Smrg # 251b8e80941Smrg # The fge in the second replacement is not a typo. I leave the proof that 252b8e80941Smrg # "fmin(-b2f(a), b) >= 0 <=> fmin(-b2f(a), b) == 0" as an exercise for the 253b8e80941Smrg # reader. 254b8e80941Smrg (('fge', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))), 255b8e80941Smrg (('feq', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))), 256b8e80941Smrg 257b8e80941Smrg (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)), 258b8e80941Smrg (('fne', ('b2f', 'a@1'), 0.0), a), 259b8e80941Smrg (('ieq', ('b2i', 'a@1'), 0), ('inot', a)), 260b8e80941Smrg (('ine', ('b2i', 'a@1'), 0), a), 261b8e80941Smrg 262b8e80941Smrg (('fne', ('u2f', a), 0.0), ('ine', a, 0)), 263b8e80941Smrg (('feq', ('u2f', a), 0.0), ('ieq', a, 0)), 264b8e80941Smrg (('fge', ('u2f', a), 0.0), True), 265b8e80941Smrg (('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead? 
266b8e80941Smrg (('flt', ('u2f', a), 0.0), False), 267b8e80941Smrg (('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead? 268b8e80941Smrg (('fne', ('i2f', a), 0.0), ('ine', a, 0)), 269b8e80941Smrg (('feq', ('i2f', a), 0.0), ('ieq', a, 0)), 270b8e80941Smrg (('fge', ('i2f', a), 0.0), ('ige', a, 0)), 271b8e80941Smrg (('fge', 0.0, ('i2f', a)), ('ige', 0, a)), 272b8e80941Smrg (('flt', ('i2f', a), 0.0), ('ilt', a, 0)), 273b8e80941Smrg (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)), 274b8e80941Smrg 275b8e80941Smrg # 0.0 < fabs(a) 276b8e80941Smrg # fabs(a) > 0.0 277b8e80941Smrg # fabs(a) != 0.0 because fabs(a) must be >= 0 278b8e80941Smrg # a != 0.0 279b8e80941Smrg (('~flt', 0.0, ('fabs', a)), ('fne', a, 0.0)), 280b8e80941Smrg 281b8e80941Smrg # -fabs(a) < 0.0 282b8e80941Smrg # fabs(a) > 0.0 283b8e80941Smrg (('~flt', ('fneg', ('fabs', a)), 0.0), ('fne', a, 0.0)), 284b8e80941Smrg 285b8e80941Smrg # 0.0 >= fabs(a) 286b8e80941Smrg # 0.0 == fabs(a) because fabs(a) must be >= 0 287b8e80941Smrg # 0.0 == a 288b8e80941Smrg (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)), 289b8e80941Smrg 290b8e80941Smrg # -fabs(a) >= 0.0 291b8e80941Smrg # 0.0 >= fabs(a) 292b8e80941Smrg (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), 293b8e80941Smrg 294b8e80941Smrg (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))), 295b8e80941Smrg (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))), 296b8e80941Smrg (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 297b8e80941Smrg (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))), 298b8e80941Smrg 299b8e80941Smrg # fmin(b2f(a), b) 300b8e80941Smrg # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b)) 301b8e80941Smrg # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b)) 302b8e80941Smrg # bcsel(a, fmin(1.0, b), fmin(0.0, b)) 303b8e80941Smrg # 304b8e80941Smrg # Since b is 
a constant, constant folding will eliminate the fmin and the 305b8e80941Smrg # fmax. If b is > 1.0, the bcsel will be replaced with a b2f. 306b8e80941Smrg (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))), 307b8e80941Smrg 308b8e80941Smrg (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)), 309b8e80941Smrg 310b8e80941Smrg (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), 311b8e80941Smrg (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)), 312b8e80941Smrg (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)), 313b8e80941Smrg (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)), 314b8e80941Smrg (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)), 315b8e80941Smrg (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)), 316b8e80941Smrg (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)), 317b8e80941Smrg (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)), 318b8e80941Smrg (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)), 319b8e80941Smrg (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 320b8e80941Smrg (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 321b8e80941Smrg (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 322b8e80941Smrg (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 323b8e80941Smrg (('bcsel', a, True, b), ('ior', a, b)), 324b8e80941Smrg (('bcsel', a, a, b), ('ior', a, b)), 325b8e80941Smrg (('bcsel', a, b, False), ('iand', a, b)), 326b8e80941Smrg (('bcsel', a, b, a), ('iand', a, b)), 327b8e80941Smrg (('fmin', a, a), a), 328b8e80941Smrg (('fmax', a, a), a), 329b8e80941Smrg (('imin', a, a), a), 330b8e80941Smrg (('imax', a, a), a), 331b8e80941Smrg (('umin', a, a), a), 332b8e80941Smrg (('umax', a, a), a), 333b8e80941Smrg (('fmax', ('fmax', a, b), b), 
('fmax', a, b)), 334b8e80941Smrg (('umax', ('umax', a, b), b), ('umax', a, b)), 335b8e80941Smrg (('imax', ('imax', a, b), b), ('imax', a, b)), 336b8e80941Smrg (('fmin', ('fmin', a, b), b), ('fmin', a, b)), 337b8e80941Smrg (('umin', ('umin', a, b), b), ('umin', a, b)), 338b8e80941Smrg (('imin', ('imin', a, b), b), ('imin', a, b)), 339b8e80941Smrg (('fmax', a, ('fneg', a)), ('fabs', a)), 340b8e80941Smrg (('imax', a, ('ineg', a)), ('iabs', a)), 341b8e80941Smrg (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))), 342b8e80941Smrg (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))), 343b8e80941Smrg (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))), 344b8e80941Smrg (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))), 345b8e80941Smrg (('fmin', a, ('fabs', a)), a), 346b8e80941Smrg (('imin', a, ('iabs', a)), a), 347b8e80941Smrg (('fmax', a, ('fneg', ('fabs', a))), a), 348b8e80941Smrg (('imax', a, ('ineg', ('iabs', a))), a), 349b8e80941Smrg (('fmax', a, ('fabs', a)), ('fabs', a)), 350b8e80941Smrg (('imax', a, ('iabs', a)), ('iabs', a)), 351b8e80941Smrg (('fmax', a, ('fneg', a)), ('fabs', a)), 352b8e80941Smrg (('imax', a, ('ineg', a)), ('iabs', a)), 353b8e80941Smrg (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'), 354b8e80941Smrg (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), 355b8e80941Smrg (('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))), 356b8e80941Smrg (('fsat', ('b2f', a)), ('b2f', a)), 357b8e80941Smrg (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'), 358b8e80941Smrg (('fsat', ('fsat', a)), ('fsat', a)), 359b8e80941Smrg (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)), 360b8e80941Smrg (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)), 361b8e80941Smrg (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)), 362b8e80941Smrg (('fmax', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmax', 
a, b))), 363b8e80941Smrg (('fmin', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmin', a, b))), 364b8e80941Smrg (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)), 365b8e80941Smrg (('~ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))), 366b8e80941Smrg (('~ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)), 367b8e80941Smrg (('~ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))), 368b8e80941Smrg (('~ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)), 369b8e80941Smrg (('~ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('fmax', b, c))), 370b8e80941Smrg (('~ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('fmin', a, b), c)), 371b8e80941Smrg (('~ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('fmin', b, c))), 372b8e80941Smrg (('~ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('fmax', a, b), c)), 373b8e80941Smrg (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))), 374b8e80941Smrg (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)), 375b8e80941Smrg (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))), 376b8e80941Smrg (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)), 377b8e80941Smrg (('~iand', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('fmin', b, c))), 378b8e80941Smrg (('~iand', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('fmax', a, b), c)), 379b8e80941Smrg (('~iand', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('fmax', b, c))), 380b8e80941Smrg (('~iand', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('fmin', a, b), c)), 381b8e80941Smrg 382b8e80941Smrg (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))), 383b8e80941Smrg (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)), 384b8e80941Smrg (('ior', 
('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))), 385b8e80941Smrg (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)), 386b8e80941Smrg (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))), 387b8e80941Smrg (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)), 388b8e80941Smrg (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))), 389b8e80941Smrg (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)), 390b8e80941Smrg (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))), 391b8e80941Smrg (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)), 392b8e80941Smrg (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))), 393b8e80941Smrg (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)), 394b8e80941Smrg (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))), 395b8e80941Smrg (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)), 396b8e80941Smrg (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))), 397b8e80941Smrg (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)), 398b8e80941Smrg 399b8e80941Smrg # Common pattern like 'if (i == 0 || i == 1 || ...)' 400b8e80941Smrg (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)), 401b8e80941Smrg (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)), 402b8e80941Smrg (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)), 403b8e80941Smrg 404b8e80941Smrg # The (i2f32, ...) part is an open-coded fsign. When that is combined with 405b8e80941Smrg # the bcsel, it's basically copysign(1.0, a). There is no copysign in NIR, 406b8e80941Smrg # so emit an open-coded version of that. 
407b8e80941Smrg (('bcsel@32', ('feq', a, 0.0), 1.0, ('i2f32', ('iadd', ('b2i32', ('flt', 0.0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0.0)))))), 408b8e80941Smrg ('ior', 0x3f800000, ('iand', a, 0x80000000))), 409b8e80941Smrg 410b8e80941Smrg (('ior', a, ('ieq', a, False)), True), 411b8e80941Smrg (('ior', a, ('inot', a)), -1), 412b8e80941Smrg 413b8e80941Smrg (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)), 414b8e80941Smrg (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))), 415b8e80941Smrg 416b8e80941Smrg (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)), 417b8e80941Smrg 418b8e80941Smrg # These patterns can result when (a < b || a < c) => (a < min(b, c)) 419b8e80941Smrg # transformations occur before constant propagation and loop-unrolling. 420b8e80941Smrg (('~flt', a, ('fmax', b, a)), ('flt', a, b)), 421b8e80941Smrg (('~flt', ('fmin', a, b), a), ('flt', b, a)), 422b8e80941Smrg (('~fge', a, ('fmin', b, a)), True), 423b8e80941Smrg (('~fge', ('fmax', a, b), a), True), 424b8e80941Smrg (('~flt', a, ('fmin', b, a)), False), 425b8e80941Smrg (('~flt', ('fmax', a, b), a), False), 426b8e80941Smrg (('~fge', a, ('fmax', b, a)), ('fge', a, b)), 427b8e80941Smrg (('~fge', ('fmin', a, b), a), ('fge', b, a)), 428b8e80941Smrg 429b8e80941Smrg (('ilt', a, ('imax', b, a)), ('ilt', a, b)), 430b8e80941Smrg (('ilt', ('imin', a, b), a), ('ilt', b, a)), 431b8e80941Smrg (('ige', a, ('imin', b, a)), True), 432b8e80941Smrg (('ige', ('imax', a, b), a), True), 433b8e80941Smrg (('ult', a, ('umax', b, a)), ('ult', a, b)), 434b8e80941Smrg (('ult', ('umin', a, b), a), ('ult', b, a)), 435b8e80941Smrg (('uge', a, ('umin', b, a)), True), 436b8e80941Smrg (('uge', ('umax', a, b), a), True), 437b8e80941Smrg (('ilt', a, ('imin', b, a)), False), 438b8e80941Smrg (('ilt', ('imax', a, b), a), False), 439b8e80941Smrg (('ige', a, ('imax', b, a)), ('ige', a, b)), 440b8e80941Smrg (('ige', ('imin', a, b), a), ('ige', b, a)), 
441b8e80941Smrg (('ult', a, ('umin', b, a)), False), 442b8e80941Smrg (('ult', ('umax', a, b), a), False), 443b8e80941Smrg (('uge', a, ('umax', b, a)), ('uge', a, b)), 444b8e80941Smrg (('uge', ('umin', a, b), a), ('uge', b, a)), 445b8e80941Smrg 446b8e80941Smrg (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))), 447b8e80941Smrg (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))), 448b8e80941Smrg (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))), 449b8e80941Smrg (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))), 450b8e80941Smrg (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))), 451b8e80941Smrg (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))), 452b8e80941Smrg (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))), 453b8e80941Smrg (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))), 454b8e80941Smrg (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))), 455b8e80941Smrg (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))), 456b8e80941Smrg (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))), 457b8e80941Smrg (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))), 458b8e80941Smrg (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))), 459b8e80941Smrg (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))), 460b8e80941Smrg (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))), 461b8e80941Smrg (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))), 462b8e80941Smrg 463b8e80941Smrg # Thanks to sign extension, the ishr(a, b) is negative if and only if a is 464b8e80941Smrg # negative. 
465b8e80941Smrg (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)), 466b8e80941Smrg ('iabs', ('ishr', a, b))), 467b8e80941Smrg (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)), 468b8e80941Smrg 469b8e80941Smrg (('fabs', ('slt', a, b)), ('slt', a, b)), 470b8e80941Smrg (('fabs', ('sge', a, b)), ('sge', a, b)), 471b8e80941Smrg (('fabs', ('seq', a, b)), ('seq', a, b)), 472b8e80941Smrg (('fabs', ('sne', a, b)), ('sne', a, b)), 473b8e80941Smrg (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), 474b8e80941Smrg (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), 475b8e80941Smrg (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), 476b8e80941Smrg (('sne', a, b), ('b2f', ('fne', a, b)), 'options->lower_scmp'), 477b8e80941Smrg (('fne', ('fneg', a), a), ('fne', a, 0.0)), 478b8e80941Smrg (('feq', ('fneg', a), a), ('feq', a, 0.0)), 479b8e80941Smrg # Emulating booleans 480b8e80941Smrg (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 481b8e80941Smrg (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 482b8e80941Smrg (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))), 483b8e80941Smrg (('iand', 'a@bool32', 1.0), ('b2f', a)), 484b8e80941Smrg # True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True). 485b8e80941Smrg (('ineg', ('b2i32', 'a@32')), a), 486b8e80941Smrg (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. 487b8e80941Smrg (('flt', ('fsub', 0.0, ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. 488b8e80941Smrg # Comparison with the same args. Note that these are not done for 489b8e80941Smrg # the float versions because NaN always returns false on float 490b8e80941Smrg # inequalities. 
491b8e80941Smrg (('ilt', a, a), False), 492b8e80941Smrg (('ige', a, a), True), 493b8e80941Smrg (('ieq', a, a), True), 494b8e80941Smrg (('ine', a, a), False), 495b8e80941Smrg (('ult', a, a), False), 496b8e80941Smrg (('uge', a, a), True), 497b8e80941Smrg # Logical and bit operations 498b8e80941Smrg (('fand', a, 0.0), 0.0), 499b8e80941Smrg (('iand', a, a), a), 500b8e80941Smrg (('iand', a, ~0), a), 501b8e80941Smrg (('iand', a, 0), 0), 502b8e80941Smrg (('ior', a, a), a), 503b8e80941Smrg (('ior', a, 0), a), 504b8e80941Smrg (('ior', a, True), True), 505b8e80941Smrg (('fxor', a, a), 0.0), 506b8e80941Smrg (('ixor', a, a), 0), 507b8e80941Smrg (('ixor', a, 0), a), 508b8e80941Smrg (('inot', ('inot', a)), a), 509b8e80941Smrg (('ior', ('iand', a, b), b), b), 510b8e80941Smrg (('ior', ('ior', a, b), b), ('ior', a, b)), 511b8e80941Smrg (('iand', ('ior', a, b), b), b), 512b8e80941Smrg (('iand', ('iand', a, b), b), ('iand', a, b)), 513b8e80941Smrg # DeMorgan's Laws 514b8e80941Smrg (('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, b))), 515b8e80941Smrg (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))), 516b8e80941Smrg # Shift optimizations 517b8e80941Smrg (('ishl', 0, a), 0), 518b8e80941Smrg (('ishl', a, 0), a), 519b8e80941Smrg (('ishr', 0, a), 0), 520b8e80941Smrg (('ishr', a, 0), a), 521b8e80941Smrg (('ushr', 0, a), 0), 522b8e80941Smrg (('ushr', a, 0), a), 523b8e80941Smrg (('iand', 0xff, ('ushr@32', a, 24)), ('ushr', a, 24)), 524b8e80941Smrg (('iand', 0xffff, ('ushr@32', a, 16)), ('ushr', a, 16)), 525b8e80941Smrg # Exponential/logarithmic identities 526b8e80941Smrg (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a 527b8e80941Smrg (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a 528b8e80941Smrg (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) 529b8e80941Smrg (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b 530b8e80941Smrg (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), 
('fmul', ('flog2', c), d))), 531b8e80941Smrg ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d 532b8e80941Smrg (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)), 533b8e80941Smrg (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))), 534b8e80941Smrg (('~fpow', a, 1.0), a), 535b8e80941Smrg (('~fpow', a, 2.0), ('fmul', a, a)), 536b8e80941Smrg (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), 537b8e80941Smrg (('~fpow', 2.0, a), ('fexp2', a)), 538b8e80941Smrg (('~fpow', ('fpow', a, 2.2), 0.454545), a), 539b8e80941Smrg (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), 540b8e80941Smrg (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), 541b8e80941Smrg (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), 542b8e80941Smrg (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), 543b8e80941Smrg (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), 544b8e80941Smrg (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))), 545b8e80941Smrg (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), 546b8e80941Smrg (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), 547b8e80941Smrg (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), 548b8e80941Smrg # Division and reciprocal 549b8e80941Smrg (('~fdiv', 1.0, a), ('frcp', a)), 550b8e80941Smrg (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), 551b8e80941Smrg (('~frcp', ('frcp', a)), a), 552b8e80941Smrg (('~frcp', ('fsqrt', a)), ('frsq', a)), 553b8e80941Smrg (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), 554b8e80941Smrg (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), 555b8e80941Smrg # Boolean simplifications 556b8e80941Smrg (('i2b32(is_used_by_if)', a), ('ine32', a, 0)), 557b8e80941Smrg (('i2b1(is_used_by_if)', a), ('ine', a, 0)), 558b8e80941Smrg (('ieq', a, True), a), 559b8e80941Smrg (('ine(is_not_used_by_if)', a, 
True), ('inot', a)), 560b8e80941Smrg (('ine', a, False), a), 561b8e80941Smrg (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')), 562b8e80941Smrg (('bcsel', a, True, False), a), 563b8e80941Smrg (('bcsel', a, False, True), ('inot', a)), 564b8e80941Smrg (('bcsel@32', a, 1.0, 0.0), ('b2f', a)), 565b8e80941Smrg (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))), 566b8e80941Smrg (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))), 567b8e80941Smrg (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 568b8e80941Smrg (('bcsel', True, b, c), b), 569b8e80941Smrg (('bcsel', False, b, c), c), 570b8e80941Smrg (('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))), 571b8e80941Smrg # The result of this should be hit by constant propagation and, in the 572b8e80941Smrg # next round of opt_algebraic, get picked up by one of the above two. 573b8e80941Smrg (('bcsel', '#a', b, c), ('bcsel', ('ine', 'a', 0), b, c)), 574b8e80941Smrg 575b8e80941Smrg (('bcsel', a, b, b), b), 576b8e80941Smrg (('fcsel', a, b, b), b), 577b8e80941Smrg 578b8e80941Smrg # D3D Boolean emulation 579b8e80941Smrg (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))), 580b8e80941Smrg (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))), 581b8e80941Smrg (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 582b8e80941Smrg ('ineg', ('b2i', ('iand', a, b)))), 583b8e80941Smrg (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))), 584b8e80941Smrg ('ineg', ('b2i', ('ior', a, b)))), 585b8e80941Smrg (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 586b8e80941Smrg (('ieq', ('ineg', ('b2i', 'a@1')), -1), a), 587b8e80941Smrg (('ine', ('ineg', ('b2i', 'a@1')), 0), a), 588b8e80941Smrg (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), 589b8e80941Smrg (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), 590b8e80941Smrg 591b8e80941Smrg # SM5 32-bit shifts are defined to use the 5 least significant bits 592b8e80941Smrg (('ishl', 'a@32', ('iand', 31, b)), ('ishl', a, b)), 
593b8e80941Smrg (('ishr', 'a@32', ('iand', 31, b)), ('ishr', a, b)), 594b8e80941Smrg (('ushr', 'a@32', ('iand', 31, b)), ('ushr', a, b)), 595b8e80941Smrg 596b8e80941Smrg # Conversions 597b8e80941Smrg (('i2b32', ('b2i', 'a@32')), a), 598b8e80941Smrg (('f2i', ('ftrunc', a)), ('f2i', a)), 599b8e80941Smrg (('f2u', ('ftrunc', a)), ('f2u', a)), 600b8e80941Smrg (('i2b', ('ineg', a)), ('i2b', a)), 601b8e80941Smrg (('i2b', ('iabs', a)), ('i2b', a)), 602b8e80941Smrg (('fabs', ('b2f', a)), ('b2f', a)), 603b8e80941Smrg (('iabs', ('b2i', a)), ('b2i', a)), 604b8e80941Smrg (('inot', ('f2b1', a)), ('feq', a, 0.0)), 605b8e80941Smrg 606b8e80941Smrg # Ironically, mark these as imprecise because removing the conversions may 607b8e80941Smrg # preserve more precision than doing the conversions (e.g., 608b8e80941Smrg # uint(float(0x81818181u)) == 0x81818200). 609b8e80941Smrg (('~f2i32', ('i2f', 'a@32')), a), 610b8e80941Smrg (('~f2i32', ('u2f', 'a@32')), a), 611b8e80941Smrg (('~f2u32', ('i2f', 'a@32')), a), 612b8e80941Smrg (('~f2u32', ('u2f', 'a@32')), a), 613b8e80941Smrg 614b8e80941Smrg # Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec 615b8e80941Smrg # says: 616b8e80941Smrg # 617b8e80941Smrg # It is undefined to convert a negative floating-point value to an 618b8e80941Smrg # uint. 619b8e80941Smrg # 620b8e80941Smrg # Assuming that (uint)some_float behaves like (uint)(int)some_float allows 621b8e80941Smrg # some optimizations in the i965 backend to proceed. 
622b8e80941Smrg (('ige', ('f2u', a), b), ('ige', ('f2i', a), b)), 623b8e80941Smrg (('ige', b, ('f2u', a)), ('ige', b, ('f2i', a))), 624b8e80941Smrg (('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)), 625b8e80941Smrg (('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))), 626b8e80941Smrg 627b8e80941Smrg # Packing and then unpacking does nothing 628b8e80941Smrg (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a), 629b8e80941Smrg (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b), 630b8e80941Smrg (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 631b8e80941Smrg ('unpack_64_2x32_split_y', a)), a), 632b8e80941Smrg 633b8e80941Smrg # Byte extraction 634b8e80941Smrg (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 635b8e80941Smrg (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), 636b8e80941Smrg (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'), 637b8e80941Smrg (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 638b8e80941Smrg (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'), 639b8e80941Smrg (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'), 640b8e80941Smrg (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte') 641b8e80941Smrg] 642b8e80941Smrg 643b8e80941Smrg# After the ('extract_u8', a, 0) pattern, above, triggers, there will be 644b8e80941Smrg# patterns like those below. 
for op in ('ushr', 'ishr'):
    # Shifting right by i whole bytes and then taking byte 0 is the same as
    # taking byte i of the unshifted value.  Sizes are visited smallest
    # first and byte indices in ascending order.
    for width in (16, 32, 64):
        for i in range(1, width // 8):
            optimizations.append(
                (('extract_u8', (op, 'a@%d' % width, 8 * i), 0),
                 ('extract_u8', a, i)))

optimizations.append(
    (('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2)))

# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
# patterns like those below.
for op in ('extract_u8', 'extract_i8'):
    for width in (16, 32, 64):
        # Index of the most significant byte for this bit size.
        top = width // 8 - 1
        for i in range(top - 1, -1, -1):
            optimizations.append(
                ((op, ('ishl', 'a@%d' % width, 8 * (top - i)), top),
                 (op, a, i)))

# Word extraction
optimizations += [
    (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0),
     '!options->lower_extract_word'),
    (('ushr', 'a@32', 16), ('extract_u16', a, 1),
     '!options->lower_extract_word'),
    (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0),
     '!options->lower_extract_word'),
    (('ishr', 'a@32', 16), ('extract_i16', a, 1),
     '!options->lower_extract_word'),
    (('iand', 0xffff, a), ('extract_u16', a, 0),
     '!options->lower_extract_word'),
]

# Subtracts
optimizations += [
    (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
    (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
    (('ussub_4x8', a, 0), a),
    (('ussub_4x8', a, ~0), 0),
    (('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'),
    (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
    (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
    (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
    (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
    (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
    (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
    (('iabs', ('isub', 0, a)), ('iabs', a)),
]

# Propagate negation up multiplication chains
optimizations += [
    (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
    (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
]

# Propagate constants up multiplication chains
optimizations += [
    (('~fmul(is_used_once)',
      ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'),
     ('fmul', ('fmul', a, c), b)),
    (('imul(is_used_once)',
      ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'),
     ('imul', ('imul', a, c), b)),
    (('~fadd(is_used_once)',
      ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'),
     ('fadd', ('fadd', a, c), b)),
    (('iadd(is_used_once)',
      ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'),
     ('iadd', ('iadd', a, c), b)),
]

# Reassociate constants in add/mul chains so they can be folded together.
# For now, we mostly only handle cases where the constants are separated by
# a single non-constant.  We could do better eventually.
optimizations += [
    (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')),
     ('fmul', ('fmul', a, c), b)),
    (('imul', '#a', ('imul', 'b(is_not_const)', '#c')),
     ('imul', ('imul', a, c), b)),
    (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')),
     ('fadd', ('fadd', a, c), b)),
    (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))),
     ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
    (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')),
     ('iadd', ('iadd', a, c), b)),
]

# By definition...
optimizations += [
    (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1),
     ('find_lsb', a)),
    (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1),
     ('ifind_msb', a)),
    (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1),
     ('ufind_msb', a)),

    (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
    (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
    (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),

    (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
]

# Misc. lowering
optimizations += [
    (('fmod@%d' % bits, a, b),
     ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))),
     'options->lower_fmod%d' % bits)
    for bits in (16, 32, 64)
]
optimizations += [
    (('frem', a, b),
     ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))),
     'options->lower_fmod32'),
    (('uadd_carry@32', a, b),
     ('b2i', ('ult', ('iadd', a, b), a)),
     'options->lower_uadd_carry'),
    (('usub_borrow@32', a, b),
     ('b2i', ('ult', a, b)),
     'options->lower_usub_borrow'),

    (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
     ('bcsel', ('ilt', 31, 'bits'),
      'insert',
      ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
     'options->lower_bitfield_insert'),
    (('ihadd', a, b),
     ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)),
     'options->lower_hadd'),
    (('uhadd', a, b),
     ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)),
     'options->lower_hadd'),
    (('irhadd', a, b),
     ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)),
     'options->lower_hadd'),
    (('urhadd', a, b),
     ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)),
     'options->lower_hadd'),
    (('uadd_sat', a, b),
     ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)),
     'options->lower_add_sat'),
    (('usub_sat', a, b),
     ('bcsel', ('ult', a, b), 0, ('isub', a, b)),
     'options->lower_add_sat'),

    # Alternative lowering that doesn't rely on bfi.
    (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
     ('bcsel', ('ilt', 31, 'bits'),
      'insert',
      ('ior',
       ('iand', 'base', ('inot', ('bfm', 'bits', 'offset'))),
       ('iand', ('ishl', 'insert', 'offset'), ('bfm', 'bits', 'offset')))),
     'options->lower_bitfield_insert_to_shifts'),

    # bfm lowering -- note that the NIR opcode is undefined if either arg
    # is 32.
    (('bfm', 'bits', 'offset'),
     ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'),
     'options->lower_bfm'),

    (('ibitfield_extract', 'value', 'offset', 'bits'),
     ('bcsel', ('ilt', 31, 'bits'),
      'value',
      ('ibfe', 'value', 'offset', 'bits')),
     'options->lower_bitfield_extract'),

    (('ubitfield_extract', 'value', 'offset', 'bits'),
     ('bcsel', ('ult', 31, 'bits'),
      'value',
      ('ubfe', 'value', 'offset', 'bits')),
     'options->lower_bitfield_extract'),

    (('ibitfield_extract', 'value', 'offset', 'bits'),
     ('bcsel', ('ieq', 0, 'bits'),
      0,
      ('ishr',
       ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
       ('isub', 32, 'bits'))),
     'options->lower_bitfield_extract_to_shifts'),

    (('ubitfield_extract', 'value', 'offset', 'bits'),
     ('iand',
      ('ushr', 'value', 'offset'),
      ('bcsel', ('ieq', 'bits', 32),
       0xffffffff,
       ('bfm', 'bits', 0))),
     'options->lower_bitfield_extract_to_shifts'),

    (('ifind_msb', 'value'),
     ('ufind_msb',
      ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
     'options->lower_ifind_msb'),

    (('find_lsb', 'value'),
     ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
     'options->lower_find_lsb'),

    (('extract_i8', a, 'b@32'),
     ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
     'options->lower_extract_byte'),

    (('extract_u8', a, 'b@32'),
     ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
     'options->lower_extract_byte'),

    (('extract_i16', a, 'b@32'),
     ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
     'options->lower_extract_word'),

    (('extract_u16', a, 'b@32'),
     ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
     'options->lower_extract_word'),

    (('pack_unorm_2x16', 'v'),
     ('pack_uvec2_to_uint',
      ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
     'options->lower_pack_unorm_2x16'),

    (('pack_unorm_4x8', 'v'),
     ('pack_uvec4_to_uint',
      ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
     'options->lower_pack_unorm_4x8'),

    (('pack_snorm_2x16', 'v'),
     ('pack_uvec2_to_uint',
      ('f2i32',
       ('fround_even',
        ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
     'options->lower_pack_snorm_2x16'),

    (('pack_snorm_4x8', 'v'),
     ('pack_uvec4_to_uint',
      ('f2i32',
       ('fround_even',
        ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
     'options->lower_pack_snorm_4x8'),

    (('unpack_unorm_2x16', 'v'),
     ('fdiv',
      ('u2f32', ('vec2',
                 ('extract_u16', 'v', 0),
                 ('extract_u16', 'v', 1))),
      65535.0),
     'options->lower_unpack_unorm_2x16'),

    (('unpack_unorm_4x8', 'v'),
     ('fdiv',
      ('u2f32', ('vec4',
                 ('extract_u8', 'v', 0),
                 ('extract_u8', 'v', 1),
                 ('extract_u8', 'v', 2),
                 ('extract_u8', 'v', 3))),
      255.0),
     'options->lower_unpack_unorm_4x8'),

    (('unpack_snorm_2x16', 'v'),
     ('fmin', 1.0,
      ('fmax', -1.0,
       ('fdiv',
        ('i2f', ('vec2',
                 ('extract_i16', 'v', 0),
                 ('extract_i16', 'v', 1))),
        32767.0))),
     'options->lower_unpack_snorm_2x16'),

    (('unpack_snorm_4x8', 'v'),
     ('fmin', 1.0,
      ('fmax', -1.0,
       ('fdiv',
        ('i2f', ('vec4',
                 ('extract_i8', 'v', 0),
                 ('extract_i8', 'v', 1),
                 ('extract_i8', 'v', 2),
                 ('extract_i8', 'v', 3))),
        127.0))),
     'options->lower_unpack_snorm_4x8'),

    (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
    (('fsign', a),
     ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))),
     'options->lower_fsign'),
]

# bit_size dependent lowerings
for bit_size in (8, 16, 32, 64):
    # convenience constants
    intmax = (1 << (bit_size - 1)) - 1
    intmin = 1 << (bit_size - 1)

    # The wrapped sum/difference appears several times in each replacement.
    wrapped_sum = ('iadd', a, b)
    wrapped_diff = ('isub', a, b)

    optimizations.append(
        (('iadd_sat@%d' % bit_size, a, b),
         ('bcsel', ('ige', b, 1),
          ('bcsel', ('ilt', wrapped_sum, a), intmax, wrapped_sum),
          ('bcsel', ('ilt', a, wrapped_sum), intmin, wrapped_sum)),
         'options->lower_add_sat'))
    optimizations.append(
        (('isub_sat@%d' % bit_size, a, b),
         ('bcsel', ('ilt', b, 0),
          ('bcsel', ('ilt', wrapped_diff, a), intmax, wrapped_diff),
          ('bcsel', ('ilt', a, wrapped_diff), intmin, wrapped_diff)),
         'options->lower_add_sat'))

# Each float comparison mapped to the comparison used as its inverse below.
invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])

# Push an inot through an ior/iand of two float comparisons by swapping the
# logic op and inverting both comparisons.
for left, right in itertools.combinations_with_replacement(invert, 2):
    for search_op, replace_op in (('ior(is_used_once)', 'iand'),
                                  ('iand(is_used_once)', 'ior')):
        optimizations.append(
            (('inot', (search_op, (left, a, b), (right, c, d))),
             (replace_op, (invert[left], a, b), (invert[right], c, d))))

# Optimize x2bN(b2x(x)) -> x
for size in type_sizes('bool'):
    for to_bool, from_bool in (('f2b', 'b2f'), ('i2b', 'b2i')):
        optimizations.append(
            ((to_bool + str(size), (from_bool, 'a@' + str(size))), a))

# Optimize x2yN(b2x(x)) -> b2y
for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
    # Only same-type conversions and conversions with a float on at least
    # one side are generated; the i<->u combinations are skipped.
    if x == 'f' or y == 'f' or x == y:
        b2x = 'b2i' if x != 'f' else 'b2f'
        b2y = 'b2i' if y != 'f' else 'b2f'
        x2yN = x + '2' + y
        optimizations.append(((x2yN, (b2x, a)), (b2y, a)))

# Optimize away x2xN(a@N)
for t in ('int', 'uint', 'float'):
    for N in type_sizes(t):
        x2xN = '{prefix}2{prefix}{bits}'.format(prefix=t[0], bits=N)
        aN = 'a@' + str(N)
        optimizations.append(((x2xN, aN), a))

# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
# In particular, we can optimize away everything except upcast of downcast and
# upcasts where the type differs from the other cast
for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
    if N < M:
        # The outer cast is a down-cast.  It doesn't matter what the size of
        # the argument of the inner cast is because we'll never be in the
        # upcast of downcast case.  Regardless of types, we'll always end up
        # with y2yN in the end.
        for x, y in itertools.product(['i', 'u'], ['i', 'u']):
            x2xN = '{0}2{0}{1}'.format(x, N)
            y2yM = '{0}2{0}{1}'.format(y, M)
            y2yN = '{0}2{0}{1}'.format(y, N)
            optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
    elif N > M:
        # If the outer cast is an up-cast, we have to be more careful about
        # the size of the argument of the inner cast and with types.  In
        # this case, the type is always the type of the up-cast which is
        # given by the outer cast.
        for P in type_sizes('uint'):
            # We can't optimize away up-cast of down-cast.
            if M < P:
                continue

            # Here P <= M < N, so neither cast narrows the value.  The two
            # casts must use the same signedness for the inner one to be
            # redundant.
            for x in ['i', 'u']:
                x2xN = '{0}2{0}{1}'.format(x, N)
                x2xM = '{0}2{0}{1}'.format(x, M)
                aP = 'a@{0}'.format(P)
                optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
    # The N == M case is handled by other optimizations.


def fexp2i(exp, bits):
    """Return an expression that builds a float power of two from ``exp``.

    ``exp`` is an integer-valued expression (or variable name) and ``bits``
    is the float bit size: 16, 32 or 64.  The result adds a per-size constant
    (15, 127 or 1023) to ``exp`` and shifts the sum left (by 10, 23 or 20);
    the 64-bit variant places that word in the second source of
    ``pack_64_2x32_split`` with a zero first source.
    NOTE(review): the constants look like the IEEE 754 exponent bias and
    exponent-field position -- confirm.

    Raises ValueError for any other ``bits`` value.  An ``assert`` is not
    used because it is stripped under ``python -O``, which would make this
    function silently return None.
    """
    # We assume that exp is already in the right range.
    if bits == 16:
        return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
    if bits == 32:
        return ('ishl', ('iadd', exp, 127), 23)
    if bits == 64:
        return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
    raise ValueError('fexp2i: unsupported bit size: {0}'.format(bits))


def ldexp(f, exp, bits):
    """Return the lowered expression for ``ldexp(f, exp)`` at ``bits`` bits.

    ``f`` and ``exp`` are expressions or variable names; ``bits`` is 16, 32
    or 64.  The result is ``f`` multiplied by two powers of two built with
    fexp2i().

    Raises ValueError for any other ``bits`` value.
    """
    # First, we clamp exp to a reasonable range.  The maximum possible range
    # for a normal exponent is [-126, 127] and, throwing in denormals, you get
    # a maximum range of [-149, 127].  This means that we can potentially have
    # a swing of +-276.  If you start with FLT_MAX, you actually have to do
    # ldexp(FLT_MAX, -278) to get it to flush all the way to zero.  The GLSL
    # spec, on the other hand, only requires that we handle an exponent value
    # in the range [-126, 128].  This implementation is *mostly* correct; it
    # handles a range on exp of [-252, 254] which allows you to create any
    # value (including denorms if the hardware supports it) and to adjust the
    # exponent of any normal value to anything you want.
    clamp_range = {
        16: (-28, 30),
        32: (-252, 254),
        64: (-2044, 2046),
    }
    if bits not in clamp_range:
        raise ValueError('ldexp: unsupported bit size: {0}'.format(bits))
    exp_min, exp_max = clamp_range[bits]
    exp = ('imin', ('imax', exp, exp_min), exp_max)

    # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
    # (We use ishr which isn't the same for -1, but the -1 case still works
    # since we use exp-exp/2 as the second exponent.)  While the spec
    # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
    # work with denormals and doesn't allow for the full swing in exponents
    # that you can get with normalized values.  Instead, we create two powers
    # of two and multiply by them each in turn.  That way the effective range
    # of our exponent is doubled.
    pow2_1 = fexp2i(('ishr', exp, 1), bits)
    pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
    return ('fmul', ('fmul', f, pow2_1), pow2_2)


optimizations += [
    (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
    (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
    (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
]


# Unreal Engine 4 demo applications open-code bitfieldReverse()
def bitfield_reverse(u):
    """Return the search expression for an open-coded 32-bit bit reversal.

    ``u`` is the expression or variable name being reversed.  The pattern
    swaps the two 16-bit halves, then swaps progressively smaller groups
    (8, 4, 2 and 1 bits wide) using a mask-and-shift pair per step.
    """
    result = ('ior', ('ishl', u, 16), ('ushr', u, 16))
    # (mask for the groups shifted left, mask for the groups shifted right,
    # shift amount) for each of the remaining four steps.
    for low_mask, high_mask, shift in ((0x00ff00ff, 0xff00ff00, 8),
                                       (0x0f0f0f0f, 0xf0f0f0f0, 4),
                                       (0x33333333, 0xcccccccc, 2),
                                       (0x55555555, 0xaaaaaaaa, 1)):
        result = ('ior',
                  ('ishl', ('iand', result, low_mask), shift),
                  ('ushr', ('iand', result, high_mask), shift))

    return result


optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]

# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
# then the "a == a" is redundant because it's equivalent to "a is not NaN"
# and, if a is a NaN then the second comparison will fail anyway.
for op in ('flt', 'fge', 'feq'):
    # Cover "a" on either side of the comparison.
    for compare in ((op, a, b), (op, b, a)):
        optimizations.append((('iand', ('feq', a, a), compare), compare))

# Distribute a comparison against a constant over a ternary whose two
# results are constants.  This way we can take things like
#
# (a ? 0 : 1) > 0
#
# and turn it into
#
# a ? (0 > 0) : (1 > 0)
#
# which constant folding will eat for lunch.  The resulting ternary will
# further get cleaned up by the boolean reductions above and we will be
# left with just the original variable "a".
for op in ('flt', 'fge', 'feq', 'fne',
           'ilt', 'ige', 'ieq', 'ine', 'ult', 'uge'):
    select = ('bcsel', 'a', '#b', '#c')
    # Ternary as the first source of the comparison ...
    optimizations.append(
        ((op, select, '#d'),
         ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))))
    # ... and as the second source.
    optimizations.append(
        ((op, '#d', select),
         ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))))


# For example, this converts things like
#
#    1 + mix(0, a - 1, condition)
#
# into
#
#    mix(1, (a-1)+1, condition)
#
# Other optimizations will rearrange the constants.
for op in ('fadd', 'fmul', 'iadd', 'imul'):
    # Move the constant operand inside both arms of a single-use bcsel.
    optimizations.append(
        ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'),
         ('bcsel', a, (op, b, d), (op, c, d))))

# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
# states:
#
#     If neither layout qualifier is specified, derivatives in compute shaders
#     return zero, which is consistent with the handling of built-in texture
#     functions like texture() in GLSL 4.50 compute shaders.
for op in ('fddx', 'fddx_fine', 'fddx_coarse',
           'fddy', 'fddy_fine', 'fddy_coarse'):
    optimizations.append(
        ((op, 'a'),
         0.0,
         'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE'))

# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
# with the regular optimizations.
before_ffma_optimizations = [
    # Propagate constants down multiplication chains
    (('~fmul(is_used_once)',
      ('fmul(is_used_once)', 'a(is_not_const)', '#b'),
      'c(is_not_const)'),
     ('fmul', ('fmul', a, c), b)),
    (('imul(is_used_once)',
      ('imul(is_used_once)', 'a(is_not_const)', '#b'),
      'c(is_not_const)'),
     ('imul', ('imul', a, c), b)),
    (('~fadd(is_used_once)',
      ('fadd(is_used_once)', 'a(is_not_const)', '#b'),
      'c(is_not_const)'),
     ('fadd', ('fadd', a, c), b)),
    (('iadd(is_used_once)',
      ('iadd(is_used_once)', 'a(is_not_const)', '#b'),
      'c(is_not_const)'),
     ('iadd', ('iadd', a, c), b)),

    # Factor out a common multiplicand: a*b + a*c -> a*(b + c)
    (('~fadd', ('fmul', a, b), ('fmul', a, c)),
     ('fmul', a, ('fadd', b, c))),
    (('iadd', ('imul', a, b), ('imul', a, c)),
     ('imul', a, ('iadd', b, c))),

    # Cancel a value against its own negation
    (('~fadd', ('fneg', a), a), 0.0),
    (('iadd', ('ineg', a), a), 0),
    (('iadd', ('ineg', a), ('iadd', a, b)), b),
    (('iadd', a, ('iadd', ('ineg', a), b)), b),
    (('~fadd', ('fneg', a), ('fadd', a, b)), b),
    (('~fadd', a, ('fadd', ('fneg', a), b)), b),
]

# This section contains "late" optimizations that should be run after the
# regular optimizations have finished.  Optimizations should go here if
# they help code generation but do not necessarily produce code that is
# more easily optimizable.
late_optimizations = [
    # Most of these optimizations aren't quite safe when you get infinity or
    # NaN involved but the first one should be fine.
    (('flt', ('fadd', a, b), 0.0),
     ('flt', a, ('fneg', b))),
    (('flt', ('fneg', ('fadd', a, b)), 0.0),
     ('flt', ('fneg', a), b)),
    (('~fge', ('fadd', a, b), 0.0),
     ('fge', a, ('fneg', b))),
    (('~fge', ('fneg', ('fadd', a, b)), 0.0),
     ('fge', ('fneg', a), b)),
    (('~feq', ('fadd', a, b), 0.0),
     ('feq', a, ('fneg', b))),
    (('~fne', ('fadd', a, b), 0.0),
     ('fne', a, ('fneg', b))),

    (('~fge',
      ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)),
      0.0),
     ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),

    # Only enabled when the backend sets options->fdot_replicates
    (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
    (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
    (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
    (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),

    # we do these late so that we don't get in the way of creating ffmas
    (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)),
     ('fadd', c, ('fmin', a, b))),
    (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)),
     ('fadd', c, ('fmax', a, b))),

    (('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))),
     ('b2f32', ('inot', ('ior', a, b)))),
]

# Render and print each pass in turn: the main pass first, then the
# before-ffma pass, then the late pass.
for pass_name, pass_rules in (
        ("nir_opt_algebraic", optimizations),
        ("nir_opt_algebraic_before_ffma", before_ffma_optimizations),
        ("nir_opt_algebraic_late", late_optimizations)):
    print(nir_algebraic.AlgebraicPass(pass_name, pass_rules).render())