nir_range_analysis.c revision 7ec681f3
/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <math.h>
#include <float.h>
#include "nir.h"
#include "nir_range_analysis.h"
#include "util/hash_table.h"

/**
 * Analyzes a sequence of operations to determine some aspects of the range of
 * the result.
 */

static bool
is_not_negative(enum ssa_ranges r)
{
   return r == gt_zero || r == ge_zero || r == eq_zero;
}

static bool
is_not_zero(enum ssa_ranges r)
{
   return r == gt_zero || r == lt_zero || r == ne_zero;
}

static void *
pack_data(const struct ssa_result_range r)
{
   return (void *)(uintptr_t)(r.range | r.is_integral << 8 | r.is_finite << 9 |
                              r.is_a_number << 10);
}

static struct ssa_result_range
unpack_data(const void *p)
{
   const uintptr_t v = (uintptr_t) p;

   return (struct ssa_result_range){
      .range       = v & 0xff,
      .is_integral = (v & 0x00100) != 0,
      .is_finite   = (v & 0x00200) != 0,
      .is_a_number = (v & 0x00400) != 0
   };
}
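
/* Note on the cache encoding used by pack_data()/unpack_data(): the whole
 * ssa_result_range is packed into a single pointer-sized hash table value.
 * Bits [7:0] hold the enum ssa_ranges value, and bits 8, 9, and 10 hold
 * is_integral, is_finite, and is_a_number, respectively.  For example, a
 * result whose only set flag is is_integral round-trips as
 *
 *    pack_data((struct ssa_result_range){ge_zero, true, false, false})
 *       == (void *)(uintptr_t)(ge_zero | 1 << 8)
 */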

static void *
pack_key(const struct nir_alu_instr *instr, nir_alu_type type)
{
   uintptr_t type_encoding;
   uintptr_t ptr = (uintptr_t) instr;

   /* The low 2 bits have to be zero or this whole scheme falls apart. */
   assert((ptr & 0x3) == 0);

   /* NIR is typeless in the sense that sequences of bits have whatever
    * meaning is attached to them by the instruction that consumes them.
    * However, the number of bits must match between producer and consumer.
    * As a result, the number of bits does not need to be encoded here.
    */
   switch (nir_alu_type_get_base_type(type)) {
   case nir_type_int:   type_encoding = 0; break;
   case nir_type_uint:  type_encoding = 1; break;
   case nir_type_bool:  type_encoding = 2; break;
   case nir_type_float: type_encoding = 3; break;
   default: unreachable("Invalid base type.");
   }

   return (void *)(ptr | type_encoding);
}

static nir_alu_type
nir_alu_src_type(const nir_alu_instr *instr, unsigned src)
{
   return nir_alu_type_get_base_type(nir_op_infos[instr->op].input_types[src]) |
          nir_src_bit_size(instr->src[src].src);
}
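
/**
 * Analyze a constant ALU source, interpreted as use_type.
 *
 * Illustration of the float path below: the constant vec2(1.0, 2.5) yields
 * { range = gt_zero, is_integral = false, is_a_number = true,
 *   is_finite = true }: every component is positive, finite, and a number,
 * but 2.5 is not an integer.
 */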
static struct ssa_result_range
analyze_constant(const struct nir_alu_instr *instr, unsigned src,
                 nir_alu_type use_type)
{
   uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = { 0, 1, 2, 3,
                                               4, 5, 6, 7,
                                               8, 9, 10, 11,
                                               12, 13, 14, 15 };

   /* If the source is an explicitly sized source, then we need to reset
    * both the number of components and the swizzle.
    */
   const unsigned num_components = nir_ssa_alu_instr_src_components(instr, src);

   for (unsigned i = 0; i < num_components; ++i)
      swizzle[i] = instr->src[src].swizzle[i];

   const nir_load_const_instr *const load =
      nir_instr_as_load_const(instr->src[src].src.ssa->parent_instr);

   struct ssa_result_range r = { unknown, false, false, false };

   switch (nir_alu_type_get_base_type(use_type)) {
   case nir_type_float: {
      double min_value = DBL_MAX;
      double max_value = -DBL_MAX;
      bool any_zero = false;
      bool all_zero = true;

      r.is_integral = true;
      r.is_a_number = true;
      r.is_finite = true;

      for (unsigned i = 0; i < num_components; ++i) {
         const double v = nir_const_value_as_float(load->value[swizzle[i]],
                                                   load->def.bit_size);

         if (floor(v) != v)
            r.is_integral = false;

         if (isnan(v))
            r.is_a_number = false;

         if (!isfinite(v))
            r.is_finite = false;

         any_zero = any_zero || (v == 0.0);
         all_zero = all_zero && (v == 0.0);
         min_value = MIN2(min_value, v);
         max_value = MAX2(max_value, v);
      }

      assert(any_zero >= all_zero);
      assert(isnan(max_value) || max_value >= min_value);

      if (all_zero)
         r.range = eq_zero;
      else if (min_value > 0.0)
         r.range = gt_zero;
      else if (min_value == 0.0)
         r.range = ge_zero;
      else if (max_value < 0.0)
         r.range = lt_zero;
      else if (max_value == 0.0)
         r.range = le_zero;
      else if (!any_zero)
         r.range = ne_zero;
      else
         r.range = unknown;

      return r;
   }

   case nir_type_int:
   case nir_type_bool: {
      int64_t min_value = INT_MAX;
      int64_t max_value = INT_MIN;
      bool any_zero = false;
      bool all_zero = true;

      for (unsigned i = 0; i < num_components; ++i) {
         const int64_t v = nir_const_value_as_int(load->value[swizzle[i]],
                                                  load->def.bit_size);

         any_zero = any_zero || (v == 0);
         all_zero = all_zero && (v == 0);
         min_value = MIN2(min_value, v);
         max_value = MAX2(max_value, v);
      }

      assert(any_zero >= all_zero);
      assert(max_value >= min_value);

      if (all_zero)
         r.range = eq_zero;
      else if (min_value > 0)
         r.range = gt_zero;
      else if (min_value == 0)
         r.range = ge_zero;
      else if (max_value < 0)
         r.range = lt_zero;
      else if (max_value == 0)
         r.range = le_zero;
      else if (!any_zero)
         r.range = ne_zero;
      else
         r.range = unknown;

      return r;
   }

   case nir_type_uint: {
      bool any_zero = false;
      bool all_zero = true;

      for (unsigned i = 0; i < num_components; ++i) {
         const uint64_t v = nir_const_value_as_uint(load->value[swizzle[i]],
                                                    load->def.bit_size);

         any_zero = any_zero || (v == 0);
         all_zero = all_zero && (v == 0);
      }

      assert(any_zero >= all_zero);

      if (all_zero)
         r.range = eq_zero;
      else if (any_zero)
         r.range = ge_zero;
      else
         r.range = gt_zero;

      return r;
   }

   default:
      unreachable("Invalid alu source type");
   }
}

/**
 * Short-hand name for use in the tables in analyze_expression.  If this name
 * becomes a problem on some compiler, we can change it to _.
 */
#define _______ unknown

#if defined(__clang__)
   /* clang wants _Pragma("unroll X") */
   #define pragma_unroll_5 _Pragma("unroll 5")
   #define pragma_unroll_7 _Pragma("unroll 7")
/* gcc wants _Pragma("GCC unroll X") */
#elif defined(__GNUC__)
   #if __GNUC__ >= 8
      #define pragma_unroll_5 _Pragma("GCC unroll 5")
      #define pragma_unroll_7 _Pragma("GCC unroll 7")
   #else
      #pragma GCC optimize ("unroll-loops")
      #define pragma_unroll_5
      #define pragma_unroll_7
   #endif
#else
   /* MSVC doesn't have C99's _Pragma() */
   #define pragma_unroll_5
   #define pragma_unroll_7
#endif

#ifndef NDEBUG
#define ASSERT_TABLE_IS_COMMUTATIVE(t)                        \
   do {                                                       \
      static bool first = true;                               \
      if (first) {                                            \
         first = false;                                       \
         pragma_unroll_7                                      \
         for (unsigned r = 0; r < ARRAY_SIZE(t); r++) {       \
            pragma_unroll_7                                   \
            for (unsigned c = 0; c < ARRAY_SIZE(t[0]); c++)   \
               assert(t[r][c] == t[c][r]);                    \
         }                                                    \
      }                                                       \
   } while (false)

#define ASSERT_TABLE_IS_DIAGONAL(t)                           \
   do {                                                       \
      static bool first = true;                               \
      if (first) {                                            \
         first = false;                                       \
         pragma_unroll_7                                      \
         for (unsigned r = 0; r < ARRAY_SIZE(t); r++)         \
            assert(t[r][r] == r);                             \
      }                                                       \
   } while (false)

#else
#define ASSERT_TABLE_IS_COMMUTATIVE(t)
#define ASSERT_TABLE_IS_DIAGONAL(t)
#endif /* !defined(NDEBUG) */

static enum ssa_ranges
union_ranges(enum ssa_ranges a, enum ssa_ranges b)
{
   static const enum ssa_ranges union_table[last_range + 1][last_range + 1] = {
      /* left\right   unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
      /* unknown */ { _______, _______, _______, _______, _______, _______, _______ },
      /* lt_zero */ { _______, lt_zero, le_zero, ne_zero, _______, ne_zero, le_zero },
      /* le_zero */ { _______, le_zero, le_zero, _______, _______, _______, le_zero },
      /* gt_zero */ { _______, ne_zero, _______, gt_zero, ge_zero, ne_zero, ge_zero },
      /* ge_zero */ { _______, _______, _______, ge_zero, ge_zero, _______, ge_zero },
      /* ne_zero */ { _______, ne_zero, _______, ne_zero, _______, ne_zero, _______ },
      /* eq_zero */ { _______, le_zero, le_zero, ge_zero, ge_zero, _______, eq_zero },
   };

   ASSERT_TABLE_IS_COMMUTATIVE(union_table);
   ASSERT_TABLE_IS_DIAGONAL(union_table);

   return union_table[a][b];
}
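
/* Reading the union_table above: union_ranges(lt_zero, eq_zero) == le_zero,
 * since a value that is either strictly negative or zero is at most zero.
 * Likewise, union_ranges(gt_zero, lt_zero) == ne_zero: the union may lie on
 * either side of zero, but it cannot contain zero itself.
 */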

#ifndef NDEBUG
/* Verify that the 'unknown' entry in each row (or column) of the table is the
 * union of all the other values in the row (or column).
 */
#define ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(t)              \
   do {                                                                 \
      static bool first = true;                                         \
      if (first) {                                                      \
         first = false;                                                 \
         pragma_unroll_7                                                \
         for (unsigned i = 0; i < last_range; i++) {                    \
            enum ssa_ranges col_range = t[i][unknown + 1];              \
            enum ssa_ranges row_range = t[unknown + 1][i];              \
                                                                        \
            pragma_unroll_5                                             \
            for (unsigned j = unknown + 2; j < last_range; j++) {       \
               col_range = union_ranges(col_range, t[i][j]);            \
               row_range = union_ranges(row_range, t[j][i]);            \
            }                                                           \
                                                                        \
            assert(col_range == t[i][unknown]);                         \
            assert(row_range == t[unknown][i]);                         \
         }                                                              \
      }                                                                 \
   } while (false)

/* For most operations, the union of ranges for a strict inequality and
 * equality should be the range of the non-strict inequality (e.g.,
 * union_ranges(range(op(lt_zero)), range(op(eq_zero))) == range(op(le_zero)).
 *
 * Does not apply to selection-like opcodes (bcsel, fmin, fmax, etc.).
 */
#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(t) \
   do {                                                                  \
      assert(union_ranges(t[lt_zero], t[eq_zero]) == t[le_zero]);        \
      assert(union_ranges(t[gt_zero], t[eq_zero]) == t[ge_zero]);        \
   } while (false)

#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(t) \
   do {                                                                  \
      static bool first = true;                                          \
      if (first) {                                                       \
         first = false;                                                  \
         pragma_unroll_7                                                 \
         for (unsigned i = 0; i < last_range; i++) {                     \
            assert(union_ranges(t[i][lt_zero], t[i][eq_zero]) == t[i][le_zero]); \
            assert(union_ranges(t[i][gt_zero], t[i][eq_zero]) == t[i][ge_zero]); \
            assert(union_ranges(t[lt_zero][i], t[eq_zero][i]) == t[le_zero][i]); \
            assert(union_ranges(t[gt_zero][i], t[eq_zero][i]) == t[ge_zero][i]); \
         }                                                               \
      }                                                                  \
   } while (false)

/* Several other unordered tuples span the range of "everything."  Each should
 * have the same value as unknown: (lt_zero, ge_zero), (le_zero, gt_zero), and
 * (eq_zero, ne_zero).  union_ranges is already commutative, so only one
 * ordering needs to be checked.
 *
 * Does not apply to selection-like opcodes (bcsel, fmin, fmax, etc.).
 *
 * In cases where this can be used, it is unnecessary to also use
 * ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_*_SOURCE.  For any range X,
 * union_ranges(X, X) == X.  The disjoint ranges cover all of the non-unknown
 * possibilities, so the union of all the unions of disjoint ranges is
 * equivalent to the union of "others."
 */
#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(t)            \
   do {                                                                 \
      assert(union_ranges(t[lt_zero], t[ge_zero]) == t[unknown]);       \
      assert(union_ranges(t[le_zero], t[gt_zero]) == t[unknown]);       \
      assert(union_ranges(t[eq_zero], t[ne_zero]) == t[unknown]);       \
   } while (false)

#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(t)            \
   do {                                                                 \
      static bool first = true;                                         \
      if (first) {                                                      \
         first = false;                                                 \
         pragma_unroll_7                                                \
         for (unsigned i = 0; i < last_range; i++) {                    \
            assert(union_ranges(t[i][lt_zero], t[i][ge_zero]) ==        \
                   t[i][unknown]);                                      \
            assert(union_ranges(t[i][le_zero], t[i][gt_zero]) ==        \
                   t[i][unknown]);                                      \
            assert(union_ranges(t[i][eq_zero], t[i][ne_zero]) ==        \
                   t[i][unknown]);                                      \
                                                                        \
            assert(union_ranges(t[lt_zero][i], t[ge_zero][i]) ==        \
                   t[unknown][i]);                                      \
            assert(union_ranges(t[le_zero][i], t[gt_zero][i]) ==        \
                   t[unknown][i]);                                      \
            assert(union_ranges(t[eq_zero][i], t[ne_zero][i]) ==        \
                   t[unknown][i]);                                      \
         }                                                              \
      }                                                                 \
   } while (false)

#else
#define ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(t)
#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(t)
#define ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(t)
#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(t)
#define ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(t)
#endif /* !defined(NDEBUG) */

/**
 * Analyze an expression to determine the range of its result
 *
 * The end result of this analysis is a token that communicates something
 * about the range of values.  There's an implicit grammar that produces
 * tokens from sequences of literal values, other tokens, and operations.
 * This function implements this grammar as a recursive-descent parser.  Some
 * (but not all) of the grammar is listed in-line in the function.
 */
static struct ssa_result_range
analyze_expression(const nir_alu_instr *instr, unsigned src,
                   struct hash_table *ht, nir_alu_type use_type)
{
   /* Ensure that the _Pragma("GCC unroll 7") annotations above are correct. */
   STATIC_ASSERT(last_range + 1 == 7);

   if (!instr->src[src].src.is_ssa)
      return (struct ssa_result_range){unknown, false, false, false};

   if (nir_src_is_const(instr->src[src].src))
      return analyze_constant(instr, src, use_type);

   if (instr->src[src].src.ssa->parent_instr->type != nir_instr_type_alu)
      return (struct ssa_result_range){unknown, false, false, false};

   const struct nir_alu_instr *const alu =
      nir_instr_as_alu(instr->src[src].src.ssa->parent_instr);

   /* Bail if the type of the instruction generating the value does not match
    * the type the value will be interpreted as.  int/uint/bool can be
    * reinterpreted trivially.  The most important cases are between float and
    * non-float.
    */
   if (alu->op != nir_op_mov && alu->op != nir_op_bcsel) {
      const nir_alu_type use_base_type =
         nir_alu_type_get_base_type(use_type);
      const nir_alu_type src_base_type =
         nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type);

      if (use_base_type != src_base_type &&
          (use_base_type == nir_type_float ||
           src_base_type == nir_type_float)) {
         return (struct ssa_result_range){unknown, false, false, false};
      }
   }

   struct hash_entry *he = _mesa_hash_table_search(ht, pack_key(alu, use_type));
   if (he != NULL)
      return unpack_data(he->data);

   struct ssa_result_range r = {unknown, false, false, false};

   /* ge_zero: ge_zero + ge_zero
    *
    * gt_zero: gt_zero + eq_zero
    *        | gt_zero + ge_zero
    *        | eq_zero + gt_zero   # Addition is commutative
    *        | ge_zero + gt_zero   # Addition is commutative
    *        | gt_zero + gt_zero
    *        ;
    *
    * le_zero: le_zero + le_zero
    *
    * lt_zero: lt_zero + eq_zero
    *        | lt_zero + le_zero
    *        | eq_zero + lt_zero   # Addition is commutative
    *        | le_zero + lt_zero   # Addition is commutative
    *        | lt_zero + lt_zero
    *        ;
    *
    * ne_zero: eq_zero + ne_zero
    *        | ne_zero + eq_zero   # Addition is commutative
    *        ;
    *
    * eq_zero: eq_zero + eq_zero
    *        ;
    *
    * All other cases are 'unknown'.  The seemingly odd entry is (ne_zero,
    * ne_zero), but that could be (-5, +5) which is not ne_zero.
    */
   static const enum ssa_ranges fadd_table[last_range + 1][last_range + 1] = {
      /* left\right   unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
      /* unknown */ { _______, _______, _______, _______, _______, _______, _______ },
      /* lt_zero */ { _______, lt_zero, lt_zero, _______, _______, _______, lt_zero },
      /* le_zero */ { _______, lt_zero, le_zero, _______, _______, _______, le_zero },
      /* gt_zero */ { _______, _______, _______, gt_zero, gt_zero, _______, gt_zero },
      /* ge_zero */ { _______, _______, _______, gt_zero, ge_zero, _______, ge_zero },
      /* ne_zero */ { _______, _______, _______, _______, _______, _______, ne_zero },
      /* eq_zero */ { _______, lt_zero, le_zero, gt_zero, ge_zero, ne_zero, eq_zero },
   };

   ASSERT_TABLE_IS_COMMUTATIVE(fadd_table);
   ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(fadd_table);
   ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(fadd_table);
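
   /* Example lookups in fadd_table above: fadd_table[lt_zero][le_zero] ==
    * lt_zero, because adding a non-positive value to a strictly negative one
    * stays strictly negative, and fadd_table[gt_zero][ge_zero] == gt_zero
    * for the symmetric reason.
    */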

   /* Due to flush-to-zero semantics of floating-point numbers with very
    * small magnitudes, we can never really be sure a result will be
    * non-zero.
    *
    * ge_zero: ge_zero * ge_zero
    *        | ge_zero * gt_zero
    *        | ge_zero * eq_zero
    *        | le_zero * lt_zero
    *        | lt_zero * le_zero   # Multiplication is commutative
    *        | le_zero * le_zero
    *        | gt_zero * ge_zero   # Multiplication is commutative
    *        | eq_zero * ge_zero   # Multiplication is commutative
    *        | a * a               # Left source == right source
    *        | gt_zero * gt_zero
    *        | lt_zero * lt_zero
    *        ;
    *
    * le_zero: ge_zero * le_zero
    *        | ge_zero * lt_zero
    *        | lt_zero * ge_zero   # Multiplication is commutative
    *        | le_zero * ge_zero   # Multiplication is commutative
    *        | le_zero * gt_zero
    *        | lt_zero * gt_zero
    *        | gt_zero * lt_zero   # Multiplication is commutative
    *        ;
    *
    * eq_zero: eq_zero * <any>
    *          <any> * eq_zero     # Multiplication is commutative
    *
    * All other cases are 'unknown'.
    */
   static const enum ssa_ranges fmul_table[last_range + 1][last_range + 1] = {
      /* left\right   unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
      /* unknown */ { _______, _______, _______, _______, _______, _______, eq_zero },
      /* lt_zero */ { _______, ge_zero, ge_zero, le_zero, le_zero, _______, eq_zero },
      /* le_zero */ { _______, ge_zero, ge_zero, le_zero, le_zero, _______, eq_zero },
      /* gt_zero */ { _______, le_zero, le_zero, ge_zero, ge_zero, _______, eq_zero },
      /* ge_zero */ { _______, le_zero, le_zero, ge_zero, ge_zero, _______, eq_zero },
      /* ne_zero */ { _______, _______, _______, _______, _______, _______, eq_zero },
      /* eq_zero */ { eq_zero, eq_zero, eq_zero, eq_zero, eq_zero, eq_zero, eq_zero }
   };

   ASSERT_TABLE_IS_COMMUTATIVE(fmul_table);
   ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(fmul_table);
   ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(fmul_table);

   static const enum ssa_ranges fneg_table[last_range + 1] = {
      /* unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
         _______, gt_zero, ge_zero, lt_zero, le_zero, ne_zero, eq_zero
   };

   ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(fneg_table);
   ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(fneg_table);
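
   /* These tables compose across nested expressions.  For example, for
    * ffma(a, a, 1.0) the analysis below computes ge_zero for a * a (a value
    * times itself cannot be negative), gt_zero for the constant 1.0, and
    * fadd_table[ge_zero][gt_zero] == gt_zero for the sum, so the whole
    * expression is reported strictly positive.
    */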

   switch (alu->op) {
   case nir_op_b2f32:
   case nir_op_b2i32:
      /* b2f32 will generate either 0.0 or 1.0.  This case is trivial.
       *
       * b2i32 will generate either 0x00000000 or 0x00000001.  When those bit
       * patterns are interpreted as floating point, they are 0.0 and
       * 1.401298464324817e-45.  The latter is subnormal, but it is finite and
       * a number.
       */
      r = (struct ssa_result_range){ge_zero, alu->op == nir_op_b2f32, true, true};
      break;

   case nir_op_bcsel: {
      const struct ssa_result_range left =
         analyze_expression(alu, 1, ht, use_type);
      const struct ssa_result_range right =
         analyze_expression(alu, 2, ht, use_type);

      r.is_integral = left.is_integral && right.is_integral;

      /* This could be better, but it would require a lot of work.  For
       * example, the result of the following is a number:
       *
       *    bcsel(a > 0.0, a, 38.6)
       *
       * If the result of 'a > 0.0' is true, then the use of 'a' in the true
       * part of the bcsel must be a number.
       *
       * Other cases are even more challenging.
       *
       *    bcsel(a > 0.5, a - 0.5, 0.0)
       */
      r.is_a_number = left.is_a_number && right.is_a_number;
      r.is_finite = left.is_finite && right.is_finite;

      r.range = union_ranges(left.range, right.range);
      break;
   }

   case nir_op_i2f32:
   case nir_op_u2f32:
      r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      r.is_integral = true;
      r.is_a_number = true;
      r.is_finite = true;

      if (r.range == unknown && alu->op == nir_op_u2f32)
         r.range = ge_zero;

      break;

   case nir_op_fabs:
      r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      switch (r.range) {
      case unknown:
      case le_zero:
      case ge_zero:
         r.range = ge_zero;
         break;

      case lt_zero:
      case gt_zero:
      case ne_zero:
         r.range = gt_zero;
         break;

      case eq_zero:
         break;
      }

      break;

   case nir_op_fadd: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
      const struct ssa_result_range right =
         analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1));

      r.is_integral = left.is_integral && right.is_integral;
      r.range = fadd_table[left.range][right.range];

      /* X + Y is NaN if either operand is NaN or if one operand is +Inf and
       * the other is -Inf.  If neither operand is NaN and at least one of the
       * operands is finite, then the result cannot be NaN.
       */
      r.is_a_number = left.is_a_number && right.is_a_number &&
                      (left.is_finite || right.is_finite);
      break;
   }
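
   /* A concrete instance of the flush-to-zero caveat in the fexp2 case
    * below: mathematically exp2(x) > 0 for every finite x, but, e.g.,
    * exp2(-200.0f) underflows to 0.0 in 32-bit floats, so a lt_zero input
    * only yields ge_zero rather than gt_zero.
    */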
   case nir_op_fexp2: {
      /* If the parameter might be less than zero, the mathematical result
       * will be on (0, 1).  For sufficiently large magnitude negative
       * parameters, the result will flush to zero.
       */
      static const enum ssa_ranges table[last_range + 1] = {
         /* unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
            ge_zero, ge_zero, ge_zero, gt_zero, gt_zero, ge_zero, gt_zero
      };

      r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_1_SOURCE(table);
      ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_1_SOURCE(table);

      r.is_integral = r.is_integral && is_not_negative(r.range);
      r.range = table[r.range];

      /* Various cases can result in NaN, so assume the worst. */
      r.is_finite = false;
      r.is_a_number = false;
      break;
   }

   case nir_op_fmax: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
      const struct ssa_result_range right =
         analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1));

      r.is_integral = left.is_integral && right.is_integral;

      /* This is conservative.  It may be possible to determine that the
       * result must be finite in more cases, but it would take some effort to
       * work out all the corners.  For example, fmax({lt_zero, finite},
       * {lt_zero}) should result in {lt_zero, finite}.
       */
      r.is_finite = left.is_finite && right.is_finite;

      /* If one source is NaN, fmax always picks the other source. */
      r.is_a_number = left.is_a_number || right.is_a_number;

      /* gt_zero: fmax(gt_zero, *)
       *        | fmax(*, gt_zero)        # Treat fmax as commutative
       *        ;
       *
       * ge_zero: fmax(ge_zero, ne_zero)
       *        | fmax(ge_zero, lt_zero)
       *        | fmax(ge_zero, le_zero)
       *        | fmax(ge_zero, eq_zero)
       *        | fmax(ne_zero, ge_zero)  # Treat fmax as commutative
       *        | fmax(lt_zero, ge_zero)  # Treat fmax as commutative
       *        | fmax(le_zero, ge_zero)  # Treat fmax as commutative
       *        | fmax(eq_zero, ge_zero)  # Treat fmax as commutative
       *        | fmax(ge_zero, ge_zero)
       *        ;
       *
       * le_zero: fmax(le_zero, lt_zero)
       *        | fmax(lt_zero, le_zero)  # Treat fmax as commutative
       *        | fmax(le_zero, le_zero)
       *        ;
       *
       * lt_zero: fmax(lt_zero, lt_zero)
       *        ;
       *
       * ne_zero: fmax(ne_zero, lt_zero)
       *        | fmax(lt_zero, ne_zero)  # Treat fmax as commutative
       *        | fmax(ne_zero, ne_zero)
       *        ;
       *
       * eq_zero: fmax(eq_zero, le_zero)
       *        | fmax(eq_zero, lt_zero)
       *        | fmax(le_zero, eq_zero)  # Treat fmax as commutative
       *        | fmax(lt_zero, eq_zero)  # Treat fmax as commutative
       *        | fmax(eq_zero, eq_zero)
       *        ;
       *
       * All other cases are 'unknown'.
       */
      static const enum ssa_ranges table[last_range + 1][last_range + 1] = {
         /* left\right   unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
         /* unknown */ { _______, _______, _______, gt_zero, ge_zero, _______, _______ },
         /* lt_zero */ { _______, lt_zero, le_zero, gt_zero, ge_zero, ne_zero, eq_zero },
         /* le_zero */ { _______, le_zero, le_zero, gt_zero, ge_zero, _______, eq_zero },
         /* gt_zero */ { gt_zero, gt_zero, gt_zero, gt_zero, gt_zero, gt_zero, gt_zero },
         /* ge_zero */ { ge_zero, ge_zero, ge_zero, gt_zero, ge_zero, ge_zero, ge_zero },
         /* ne_zero */ { _______, ne_zero, _______, gt_zero, ge_zero, ne_zero, _______ },
         /* eq_zero */ { _______, eq_zero, eq_zero, gt_zero, ge_zero, _______, eq_zero }
      };

      /* Treat fmax as commutative. */
      ASSERT_TABLE_IS_COMMUTATIVE(table);
      ASSERT_TABLE_IS_DIAGONAL(table);
      ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(table);

      r.range = table[left.range][right.range];

      /* Recall that when either value is NaN, fmax will pick the other value.
       * This means the result range of the fmax will either be the "ideal"
       * result range (calculated above) or the range of the non-NaN value.
       */
      if (!left.is_a_number)
         r.range = union_ranges(r.range, right.range);

      if (!right.is_a_number)
         r.range = union_ranges(r.range, left.range);

      break;
   }
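
   /* A worked instance of the NaN fallback in the fmax case above: if the
    * left source is gt_zero but possibly NaN and the right source is
    * lt_zero, the "ideal" table entry is gt_zero, yet a NaN left operand
    * makes fmax return the right operand, so the result is
    * union_ranges(gt_zero, lt_zero) == ne_zero.
    */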
   case nir_op_fmin: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
      const struct ssa_result_range right =
         analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1));

      r.is_integral = left.is_integral && right.is_integral;

      /* This is conservative.  It may be possible to determine that the
       * result must be finite in more cases, but it would take some effort to
       * work out all the corners.  For example, fmin({gt_zero, finite},
       * {gt_zero}) should result in {gt_zero, finite}.
       */
      r.is_finite = left.is_finite && right.is_finite;

      /* If one source is NaN, fmin always picks the other source. */
      r.is_a_number = left.is_a_number || right.is_a_number;

      /* lt_zero: fmin(lt_zero, *)
       *        | fmin(*, lt_zero)        # Treat fmin as commutative
       *        ;
       *
       * le_zero: fmin(le_zero, ne_zero)
       *        | fmin(le_zero, gt_zero)
       *        | fmin(le_zero, ge_zero)
       *        | fmin(le_zero, eq_zero)
       *        | fmin(ne_zero, le_zero)  # Treat fmin as commutative
       *        | fmin(gt_zero, le_zero)  # Treat fmin as commutative
       *        | fmin(ge_zero, le_zero)  # Treat fmin as commutative
       *        | fmin(eq_zero, le_zero)  # Treat fmin as commutative
       *        | fmin(le_zero, le_zero)
       *        ;
       *
       * ge_zero: fmin(ge_zero, gt_zero)
       *        | fmin(gt_zero, ge_zero)  # Treat fmin as commutative
       *        | fmin(ge_zero, ge_zero)
       *        ;
       *
       * gt_zero: fmin(gt_zero, gt_zero)
       *        ;
       *
       * ne_zero: fmin(ne_zero, gt_zero)
       *        | fmin(gt_zero, ne_zero)  # Treat fmin as commutative
       *        | fmin(ne_zero, ne_zero)
       *        ;
       *
       * eq_zero: fmin(eq_zero, ge_zero)
       *        | fmin(eq_zero, gt_zero)
       *        | fmin(ge_zero, eq_zero)  # Treat fmin as commutative
       *        | fmin(gt_zero, eq_zero)  # Treat fmin as commutative
       *        | fmin(eq_zero, eq_zero)
       *        ;
       *
       * All other cases are 'unknown'.
       */
      static const enum ssa_ranges table[last_range + 1][last_range + 1] = {
         /* left\right   unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
         /* unknown */ { _______, lt_zero, le_zero, _______, _______, _______, _______ },
         /* lt_zero */ { lt_zero, lt_zero, lt_zero, lt_zero, lt_zero, lt_zero, lt_zero },
         /* le_zero */ { le_zero, lt_zero, le_zero, le_zero, le_zero, le_zero, le_zero },
         /* gt_zero */ { _______, lt_zero, le_zero, gt_zero, ge_zero, ne_zero, eq_zero },
         /* ge_zero */ { _______, lt_zero, le_zero, ge_zero, ge_zero, _______, eq_zero },
         /* ne_zero */ { _______, lt_zero, le_zero, ne_zero, _______, ne_zero, _______ },
         /* eq_zero */ { _______, lt_zero, le_zero, eq_zero, eq_zero, _______, eq_zero }
      };

      /* Treat fmin as commutative. */
      ASSERT_TABLE_IS_COMMUTATIVE(table);
      ASSERT_TABLE_IS_DIAGONAL(table);
      ASSERT_UNION_OF_OTHERS_MATCHES_UNKNOWN_2_SOURCE(table);

      r.range = table[left.range][right.range];

      /* Recall that when either value is NaN, fmin will pick the other value.
       * This means the result range of the fmin will either be the "ideal"
       * result range (calculated above) or the range of the non-NaN value.
       */
      if (!left.is_a_number)
         r.range = union_ranges(r.range, right.range);

      if (!right.is_a_number)
         r.range = union_ranges(r.range, left.range);

      break;
   }

   case nir_op_fmul: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
      const struct ssa_result_range right =
         analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1));

      r.is_integral = left.is_integral && right.is_integral;

      /* x * x => ge_zero */
      if (left.range != eq_zero && nir_alu_srcs_equal(alu, alu, 0, 1)) {
         /* Even if x > 0, the result of x*x can be zero when x is, for
          * example, a subnormal number.
          */
         r.range = ge_zero;
      } else if (left.range != eq_zero && nir_alu_srcs_negative_equal(alu, alu, 0, 1)) {
         /* -x * x => le_zero. */
         r.range = le_zero;
      } else
         r.range = fmul_table[left.range][right.range];

      /* Multiplication produces NaN for X * NaN and for 0 * ±Inf.  If both
       * operands are numbers and either both are finite or one operand is
       * finite and the other cannot be zero, then the result must be a
       * number.
       */
      r.is_a_number = (left.is_a_number && right.is_a_number) &&
                      ((left.is_finite && right.is_finite) ||
                       (!is_not_zero(left.range) && right.is_finite) ||
                       (left.is_finite && !is_not_zero(right.range)));

      break;
   }

   case nir_op_frcp:
      r = (struct ssa_result_range){
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)).range,
         false,
         false, /* Various cases can result in NaN, so assume the worst. */
         false  /*      "      "    "     "     "   "    "    "    "    */
      };
      break;

   case nir_op_mov:
      r = analyze_expression(alu, 0, ht, use_type);
      break;

   case nir_op_fneg:
      r = analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      r.range = fneg_table[r.range];
      break;

   case nir_op_fsat: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      /* fsat(NaN) = 0. */
      r.is_a_number = true;
      r.is_finite = true;

      switch (left.range) {
      case le_zero:
      case lt_zero:
      case eq_zero:
         r.range = eq_zero;
         r.is_integral = true;
         break;

      case gt_zero:
         /* fsat is equivalent to fmin(fmax(X, 0.0), 1.0), so if X is not a
          * number, the result will be 0.
          */
         r.range = left.is_a_number ? gt_zero : ge_zero;
         r.is_integral = left.is_integral;
         break;

      case ge_zero:
      case ne_zero:
      case unknown:
         /* Since the result must be in [0, 1], the value must be >= 0. */
         r.range = ge_zero;
         r.is_integral = left.is_integral;
         break;
      }
      break;
   }

   case nir_op_fsign:
      r = (struct ssa_result_range){
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0)).range,
         true,
         true, /* fsign is -1, 0, or 1, even for NaN, so it must be a number. */
         true  /* fsign is -1, 0, or 1, even for NaN, so it must be finite. */
      };
      break;

   case nir_op_fsqrt:
   case nir_op_frsq:
      r = (struct ssa_result_range){ge_zero, false, false, false};
      break;

   case nir_op_ffloor: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      r.is_integral = true;

      /* In IEEE 754, floor(NaN) is NaN, and floor(±Inf) is ±Inf.  See
       * https://pubs.opengroup.org/onlinepubs/9699919799.2016edition/functions/floor.html
       */
      r.is_a_number = left.is_a_number;
      r.is_finite = left.is_finite;

      if (left.is_integral || left.range == le_zero || left.range == lt_zero)
         r.range = left.range;
      else if (left.range == ge_zero || left.range == gt_zero)
         r.range = ge_zero;
      else if (left.range == ne_zero)
         r.range = unknown;

      break;
   }

   case nir_op_fceil: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      r.is_integral = true;

      /* In IEEE 754, ceil(NaN) is NaN, and ceil(±Inf) is ±Inf.  See
       * https://pubs.opengroup.org/onlinepubs/9699919799.2016edition/functions/ceil.html
       */
      r.is_a_number = left.is_a_number;
      r.is_finite = left.is_finite;

      if (left.is_integral || left.range == ge_zero || left.range == gt_zero)
         r.range = left.range;
      else if (left.range == le_zero || left.range == lt_zero)
         r.range = le_zero;
      else if (left.range == ne_zero)
         r.range = unknown;

      break;
   }

   case nir_op_ftrunc: {
      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));

      r.is_integral = true;

      /* In IEEE 754, trunc(NaN) is NaN, and trunc(±Inf) is ±Inf.  See
       * https://pubs.opengroup.org/onlinepubs/9699919799.2016edition/functions/trunc.html
       */
      r.is_a_number = left.is_a_number;
      r.is_finite = left.is_finite;

      if (left.is_integral)
         r.range = left.range;
      else if (left.range == ge_zero || left.range == gt_zero)
         r.range = ge_zero;
      else if (left.range == le_zero || left.range == lt_zero)
         r.range = le_zero;
      else if (left.range == ne_zero)
         r.range = unknown;

      break;
   }

   case nir_op_flt:
   case nir_op_fge:
   case nir_op_feq:
   case nir_op_fneu:
   case nir_op_ilt:
   case nir_op_ige:
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_uge:
      /* Boolean results are 0 or -1. */
      r = (struct ssa_result_range){le_zero, false, true, false};
      break;

   case nir_op_fpow: {
      /* Due to flush-to-zero semantics of floating-point numbers with very
       * small magnitudes, we can never really be sure a result will be
       * non-zero.
       *
       * NIR uses pow() and powf() to constant evaluate nir_op_fpow.  The man
       * page for that function says:
       *
       *    If y is 0, the result is 1.0 (even if x is a NaN).
       *
       * gt_zero: pow(*, eq_zero)
       *        | pow(eq_zero, lt_zero)   # 0^-y = +inf
       *        | pow(eq_zero, le_zero)   # 0^-y = +inf or 0^0 = 1.0
       *        ;
       *
       * eq_zero: pow(eq_zero, gt_zero)
       *        ;
       *
       * ge_zero: pow(gt_zero, gt_zero)
       *        | pow(gt_zero, ge_zero)
       *        | pow(gt_zero, lt_zero)
       *        | pow(gt_zero, le_zero)
       *        | pow(gt_zero, ne_zero)
       *        | pow(gt_zero, unknown)
       *        | pow(ge_zero, gt_zero)
       *        | pow(ge_zero, ge_zero)
       *        | pow(ge_zero, lt_zero)
       *        | pow(ge_zero, le_zero)
       *        | pow(ge_zero, ne_zero)
       *        | pow(ge_zero, unknown)
       *        | pow(eq_zero, ge_zero)   # 0^0 = 1.0 or 0^+y = 0.0
       *        | pow(eq_zero, ne_zero)   # 0^-y = +inf or 0^+y = 0.0
       *        | pow(eq_zero, unknown)   # union of all other y cases
       *        ;
       *
       * All other cases are unknown.
       *
       * We could do better if the right operand is a constant, integral
       * value.
       */
      static const enum ssa_ranges table[last_range + 1][last_range + 1] = {
         /* left\right   unknown  lt_zero  le_zero  gt_zero  ge_zero  ne_zero  eq_zero */
         /* unknown */ { _______, _______, _______, _______, _______, _______, gt_zero },
         /* lt_zero */ { _______, _______, _______, _______, _______, _______, gt_zero },
         /* le_zero */ { _______, _______, _______, _______, _______, _______, gt_zero },
         /* gt_zero */ { ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, gt_zero },
         /* ge_zero */ { ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, ge_zero, gt_zero },
         /* ne_zero */ { _______, _______, _______, _______, _______, _______, gt_zero },
         /* eq_zero */ { ge_zero, gt_zero, gt_zero, eq_zero, ge_zero, ge_zero, gt_zero },
      };

      const struct ssa_result_range left =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
      const struct ssa_result_range right =
         analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1));

      ASSERT_UNION_OF_DISJOINT_MATCHES_UNKNOWN_2_SOURCE(table);
      ASSERT_UNION_OF_EQ_AND_STRICT_INEQ_MATCHES_NONSTRICT_2_SOURCE(table);

      r.is_integral = left.is_integral && right.is_integral &&
                      is_not_negative(right.range);
      r.range = table[left.range][right.range];

      /* Various cases can result in NaN, so assume the worst. */
      r.is_a_number = false;

      break;
   }

   case nir_op_ffma: {
      const struct ssa_result_range first =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
      const struct ssa_result_range second =
         analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1));
      const struct ssa_result_range third =
         analyze_expression(alu, 2, ht, nir_alu_src_type(alu, 2));

      r.is_integral = first.is_integral && second.is_integral &&
                      third.is_integral;

      /* Various cases can result in NaN, so assume the worst. */
      r.is_a_number = false;

      enum ssa_ranges fmul_range;

      if (first.range != eq_zero && nir_alu_srcs_equal(alu, alu, 0, 1)) {
         /* See handling of nir_op_fmul for explanation of why ge_zero is the
          * range.
          */
         fmul_range = ge_zero;
      } else if (first.range != eq_zero && nir_alu_srcs_negative_equal(alu, alu, 0, 1)) {
         /* -x * x => le_zero */
         fmul_range = le_zero;
      } else
         fmul_range = fmul_table[first.range][second.range];

      r.range = fadd_table[fmul_range][third.range];
      break;
   }

   case nir_op_flrp: {
      const struct ssa_result_range first =
         analyze_expression(alu, 0, ht, nir_alu_src_type(alu, 0));
      const struct ssa_result_range second =
         analyze_expression(alu, 1, ht, nir_alu_src_type(alu, 1));
      const struct ssa_result_range third =
         analyze_expression(alu, 2, ht, nir_alu_src_type(alu, 2));

      r.is_integral = first.is_integral && second.is_integral &&
                      third.is_integral;

      /* Various cases can result in NaN, so assume the worst. */
      r.is_a_number = false;

      /* Decompose the flrp to first + third * (second + -first) */
      const enum ssa_ranges inner_fadd_range =
         fadd_table[second.range][fneg_table[first.range]];

      const enum ssa_ranges fmul_range =
         fmul_table[third.range][inner_fadd_range];

      r.range = fadd_table[first.range][fmul_range];
      break;
   }

   default:
      r = (struct ssa_result_range){unknown, false, false, false};
      break;
   }

   if (r.range == eq_zero)
      r.is_integral = true;

   /* Just like isfinite(), the is_finite flag implies the value is a number. */
   assert((int) r.is_finite <= (int) r.is_a_number);

   _mesa_hash_table_insert(ht, pack_key(alu, use_type), pack_data(r));
   return r;
}

#undef _______

struct ssa_result_range
nir_analyze_range(struct hash_table *range_ht,
                  const nir_alu_instr *instr, unsigned src)
{
   return analyze_expression(instr, src, range_ht,
                             nir_alu_src_type(instr, src));
}
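
/* A minimal usage sketch for nir_analyze_range() (hypothetical caller): a
 * pass interested in source 0 of some nir_alu_instr *alu might do
 *
 *    struct hash_table *ht = _mesa_pointer_hash_table_create(NULL);
 *    struct ssa_result_range r = nir_analyze_range(ht, alu, 0);
 *    if (r.range == gt_zero || r.range == ge_zero) {
 *       // source 0 is known non-negative
 *    }
 *    _mesa_hash_table_destroy(ht, NULL);
 *
 * Reusing the same table across many queries lets the memoization in
 * analyze_expression() amortize the analysis cost over the whole shader.
 */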

static uint32_t bitmask(uint32_t size) {
   return size >= 32 ? 0xffffffffu : ((uint32_t)1 << size) - 1u;
}

static uint64_t mul_clamp(uint32_t a, uint32_t b)
{
   if (a != 0 && (a * b) / a != b)
      return (uint64_t)UINT32_MAX + 1;
   else
      return a * b;
}

/* recursively gather at most "buf_size" phi/bcsel sources */
static unsigned
search_phi_bcsel(nir_ssa_scalar scalar, nir_ssa_scalar *buf, unsigned buf_size, struct set *visited)
{
   if (_mesa_set_search(visited, scalar.def))
      return 0;
   _mesa_set_add(visited, scalar.def);

   if (scalar.def->parent_instr->type == nir_instr_type_phi) {
      nir_phi_instr *phi = nir_instr_as_phi(scalar.def->parent_instr);
      unsigned num_sources_left = exec_list_length(&phi->srcs);
      if (buf_size >= num_sources_left) {
         unsigned total_added = 0;
         nir_foreach_phi_src(src, phi) {
            num_sources_left--;
            unsigned added = search_phi_bcsel(
               (nir_ssa_scalar){src->src.ssa, 0}, buf + total_added, buf_size - num_sources_left, visited);
            assert(added <= buf_size);
            buf_size -= added;
            total_added += added;
         }
         return total_added;
      }
   }

   if (nir_ssa_scalar_is_alu(scalar)) {
      nir_op op = nir_ssa_scalar_alu_op(scalar);

      if ((op == nir_op_bcsel || op == nir_op_b32csel) && buf_size >= 2) {
         nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
         nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);

         unsigned added = search_phi_bcsel(src0, buf, buf_size - 1, visited);
         buf_size -= added;
         added += search_phi_bcsel(src1, buf + added, buf_size, visited);
         return added;
      }
   }

   buf[0] = scalar;
   return 1;
}

static nir_variable *
lookup_input(nir_shader *shader, unsigned driver_location)
{
   return nir_find_variable_with_driver_location(shader, nir_var_shader_in,
                                                 driver_location);
}

/* The config here should be generic enough to be correct on any HW. */
static const nir_unsigned_upper_bound_config default_ub_config = {
   .min_subgroup_size = 1u,
   .max_subgroup_size = UINT16_MAX,
   .max_workgroup_invocations = UINT16_MAX,
   .max_workgroup_count = {UINT16_MAX, UINT16_MAX, UINT16_MAX},
   .max_workgroup_size = {UINT16_MAX, UINT16_MAX, UINT16_MAX},
   .vertex_attrib_max = {
      UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX,
      UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX,
      UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX,
      UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX,
   },
};

uint32_t
nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht,
                         nir_ssa_scalar scalar,
                         const nir_unsigned_upper_bound_config *config)
{
   assert(scalar.def->bit_size <= 32);

   if (!config)
      config = &default_ub_config;
   if (nir_ssa_scalar_is_const(scalar))
      return nir_ssa_scalar_as_uint(scalar);

   /* keys can't be 0, so we have to add 1 to the index */
   void *key = (void*)(((uintptr_t)(scalar.def->index + 1) << 4) | scalar.comp);
   struct hash_entry *he = _mesa_hash_table_search(range_ht, key);
   if (he != NULL)
      return (uintptr_t)he->data;

   uint32_t max = bitmask(scalar.def->bit_size);

   if (scalar.def->parent_instr->type == nir_instr_type_intrinsic) {
      uint32_t res = max;
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(scalar.def->parent_instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_local_invocation_index:
         /* The local invocation index is used under the hood by RADV for
          * some non-compute-like shaders (e.g. LS and NGG).  These technically
          * run in workgroups on the HW, even though this fact is not exposed
          * by the API.
          * They can safely use the same code path here as variable sized
          * compute-like shader stages.
          */
         if (!gl_shader_stage_uses_workgroup(shader->info.stage) ||
             shader->info.workgroup_size_variable) {
            res = config->max_workgroup_invocations - 1;
         } else {
            res = (shader->info.workgroup_size[0] *
                   shader->info.workgroup_size[1] *
                   shader->info.workgroup_size[2]) - 1u;
         }
         break;
      case nir_intrinsic_load_local_invocation_id:
         if (shader->info.workgroup_size_variable)
            res = config->max_workgroup_size[scalar.comp] - 1u;
         else
            res = shader->info.workgroup_size[scalar.comp] - 1u;
         break;
      case nir_intrinsic_load_workgroup_id:
         res = config->max_workgroup_count[scalar.comp] - 1u;
         break;
      case nir_intrinsic_load_num_workgroups:
         res = config->max_workgroup_count[scalar.comp];
         break;
      case nir_intrinsic_load_global_invocation_id:
         if (shader->info.workgroup_size_variable) {
            res = mul_clamp(config->max_workgroup_size[scalar.comp],
                            config->max_workgroup_count[scalar.comp]) - 1u;
         } else {
            res = (shader->info.workgroup_size[scalar.comp] *
                   config->max_workgroup_count[scalar.comp]) - 1u;
         }
         break;
      case nir_intrinsic_load_invocation_id:
         if (shader->info.stage == MESA_SHADER_TESS_CTRL)
            res = shader->info.tess.tcs_vertices_out
                     ? (shader->info.tess.tcs_vertices_out - 1)
                     : 511; /* Generous maximum output patch size of 512 */
         break;
      case nir_intrinsic_load_subgroup_invocation:
      case nir_intrinsic_first_invocation:
         res = config->max_subgroup_size - 1;
         break;
      case nir_intrinsic_mbcnt_amd: {
         uint32_t src0 = config->max_subgroup_size - 1;
         uint32_t src1 = nir_unsigned_upper_bound(shader, range_ht, (nir_ssa_scalar){intrin->src[1].ssa, 0}, config);

         if (src0 + src1 < src0)
            res = max; /* overflow */
         else
            res = src0 + src1;
         break;
      }
      case nir_intrinsic_load_subgroup_size:
         res = config->max_subgroup_size;
         break;
      case nir_intrinsic_load_subgroup_id:
      case nir_intrinsic_load_num_subgroups: {
         uint32_t workgroup_size = config->max_workgroup_invocations;
         if (gl_shader_stage_uses_workgroup(shader->info.stage) &&
             !shader->info.workgroup_size_variable) {
            workgroup_size = shader->info.workgroup_size[0] *
                             shader->info.workgroup_size[1] *
                             shader->info.workgroup_size[2];
         }
         res = DIV_ROUND_UP(workgroup_size, config->min_subgroup_size);
         if (intrin->intrinsic == nir_intrinsic_load_subgroup_id)
            res--;
         break;
      }
      case nir_intrinsic_load_input: {
         if (shader->info.stage == MESA_SHADER_VERTEX && nir_src_is_const(intrin->src[0])) {
            nir_variable *var = lookup_input(shader, nir_intrinsic_base(intrin));
            if (var) {
               int loc = var->data.location - VERT_ATTRIB_GENERIC0;
               if (loc >= 0)
                  res = config->vertex_attrib_max[loc];
            }
         }
         break;
      }
      case nir_intrinsic_reduce:
      case nir_intrinsic_inclusive_scan:
      case nir_intrinsic_exclusive_scan: {
         nir_op op = nir_intrinsic_reduction_op(intrin);
         if (op == nir_op_umin || op == nir_op_umax || op == nir_op_imin || op == nir_op_imax)
            res = nir_unsigned_upper_bound(shader, range_ht, (nir_ssa_scalar){intrin->src[0].ssa, 0}, config);
         break;
      }
      case nir_intrinsic_read_first_invocation:
      case nir_intrinsic_read_invocation:
      case nir_intrinsic_shuffle:
      case nir_intrinsic_shuffle_xor:
      case nir_intrinsic_shuffle_up:
      case nir_intrinsic_shuffle_down:
      case nir_intrinsic_quad_broadcast:
      case nir_intrinsic_quad_swap_horizontal:
      case nir_intrinsic_quad_swap_vertical:
      case nir_intrinsic_quad_swap_diagonal:
      case nir_intrinsic_quad_swizzle_amd:
      case nir_intrinsic_masked_swizzle_amd:
         res = nir_unsigned_upper_bound(shader, range_ht, (nir_ssa_scalar){intrin->src[0].ssa, 0}, config);
         break;
      case nir_intrinsic_write_invocation_amd: {
         uint32_t src0 = nir_unsigned_upper_bound(shader, range_ht, (nir_ssa_scalar){intrin->src[0].ssa, 0}, config);
         uint32_t src1 = nir_unsigned_upper_bound(shader, range_ht, (nir_ssa_scalar){intrin->src[1].ssa, 0}, config);
         res = MAX2(src0, src1);
         break;
      }
      case nir_intrinsic_load_tess_rel_patch_id_amd:
      case nir_intrinsic_load_tcs_num_patches_amd:
         /* Very generous maximum: TCS/TES executed by largest possible workgroup */
         res = config->max_workgroup_invocations / MAX2(shader->info.tess.tcs_vertices_out, 1u);
         break;
      default:
         break;
      }
      if (res != max)
         _mesa_hash_table_insert(range_ht, key, (void*)(uintptr_t)res);
      return res;
   }
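
   /* Handle phis carefully with respect to cycles.  A phi whose block does
    * not follow an if or a loop (!prev, or the previous CF node is a block)
    * is effectively a loop-header phi and can be reached along a back edge,
    * so naively recursing through its sources might never terminate.
    * Seeding the cache with `max` before walking the sources breaks the
    * cycle; a phi that merges an if or a loop can simply take the maximum
    * over its sources.
    */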
   if (scalar.def->parent_instr->type == nir_instr_type_phi) {
      nir_cf_node *prev = nir_cf_node_prev(&scalar.def->parent_instr->block->cf_node);

      uint32_t res = 0;
      if (!prev || prev->type == nir_cf_node_block) {
         _mesa_hash_table_insert(range_ht, key, (void*)(uintptr_t)max);

         struct set *visited = _mesa_pointer_set_create(NULL);
         nir_ssa_scalar defs[64];
         unsigned def_count = search_phi_bcsel(scalar, defs, 64, visited);
         _mesa_set_destroy(visited, NULL);

         for (unsigned i = 0; i < def_count; i++)
            res = MAX2(res, nir_unsigned_upper_bound(shader, range_ht, defs[i], config));
      } else {
         nir_foreach_phi_src(src, nir_instr_as_phi(scalar.def->parent_instr)) {
            res = MAX2(res, nir_unsigned_upper_bound(
               shader, range_ht, (nir_ssa_scalar){src->src.ssa, 0}, config));
         }
      }

      _mesa_hash_table_insert(range_ht, key, (void*)(uintptr_t)res);
      return res;
   }

   if (nir_ssa_scalar_is_alu(scalar)) {
      nir_op op = nir_ssa_scalar_alu_op(scalar);

      switch (op) {
      case nir_op_umin:
      case nir_op_imin:
      case nir_op_imax:
      case nir_op_umax:
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_ixor:
      case nir_op_ishl:
      case nir_op_imul:
      case nir_op_ushr:
      case nir_op_ishr:
      case nir_op_iadd:
      case nir_op_umod:
      case nir_op_udiv:
      case nir_op_bcsel:
      case nir_op_b32csel:
      case nir_op_ubfe:
      case nir_op_bfm:
      case nir_op_fmul:
      case nir_op_extract_u8:
      case nir_op_extract_i8:
      case nir_op_extract_u16:
      case nir_op_extract_i16:
         break;
      case nir_op_u2u1:
      case nir_op_u2u8:
      case nir_op_u2u16:
      case nir_op_u2u32:
      case nir_op_f2u32:
         if (nir_ssa_scalar_chase_alu_src(scalar, 0).def->bit_size > 32) {
            /* If src is >32 bits, return max */
            return max;
         }
         break;
      default:
         return max;
      }

      uint32_t src0 = nir_unsigned_upper_bound(shader, range_ht, nir_ssa_scalar_chase_alu_src(scalar, 0), config);
      uint32_t src1 = max, src2 = max;
      if (nir_op_infos[op].num_inputs > 1)
         src1 = nir_unsigned_upper_bound(shader, range_ht, nir_ssa_scalar_chase_alu_src(scalar, 1), config);
      if (nir_op_infos[op].num_inputs > 2)
         src2 = nir_unsigned_upper_bound(shader, range_ht, nir_ssa_scalar_chase_alu_src(scalar, 2), config);

      uint32_t res = max;
      switch (op) {
      case nir_op_umin:
         res = src0 < src1 ? src0 : src1;
         break;
      case nir_op_imin:
      case nir_op_imax:
      case nir_op_umax:
         res = src0 > src1 ? src0 : src1;
         break;
      case nir_op_iand:
         res = bitmask(util_last_bit64(src0)) & bitmask(util_last_bit64(src1));
         break;
      case nir_op_ior:
      case nir_op_ixor:
         res = bitmask(util_last_bit64(src0)) | bitmask(util_last_bit64(src1));
         break;
      case nir_op_ishl:
         if (util_last_bit64(src0) + src1 > scalar.def->bit_size)
            res = max; /* overflow */
         else
            res = src0 << MIN2(src1, scalar.def->bit_size - 1u);
         break;
      case nir_op_imul:
         if (src0 != 0 && (src0 * src1) / src0 != src1)
            res = max;
         else
            res = src0 * src1;
         break;
      case nir_op_ushr: {
         nir_ssa_scalar src1_scalar = nir_ssa_scalar_chase_alu_src(scalar, 1);
         if (nir_ssa_scalar_is_const(src1_scalar))
            res = src0 >> nir_ssa_scalar_as_uint(src1_scalar);
         else
            res = src0;
         break;
      }
      case nir_op_ishr: {
         nir_ssa_scalar src1_scalar = nir_ssa_scalar_chase_alu_src(scalar, 1);
         if (src0 <= 2147483647 && nir_ssa_scalar_is_const(src1_scalar))
            res = src0 >> nir_ssa_scalar_as_uint(src1_scalar);
         else
            res = src0;
         break;
      }
      case nir_op_iadd:
         if (src0 + src1 < src0)
            res = max; /* overflow */
         else
            res = src0 + src1;
         break;
      case nir_op_umod:
         res = src1 ? src1 - 1 : 0;
         break;
      case nir_op_udiv: {
         nir_ssa_scalar src1_scalar = nir_ssa_scalar_chase_alu_src(scalar, 1);
         if (nir_ssa_scalar_is_const(src1_scalar))
            res = nir_ssa_scalar_as_uint(src1_scalar) ? src0 / nir_ssa_scalar_as_uint(src1_scalar) : 0;
         else
            res = src0;
         break;
      }
      case nir_op_bcsel:
      case nir_op_b32csel:
         res = src1 > src2 ? src1 : src2;
         break;
      case nir_op_ubfe:
         res = bitmask(MIN2(src2, scalar.def->bit_size));
         break;
      case nir_op_bfm: {
         nir_ssa_scalar src1_scalar = nir_ssa_scalar_chase_alu_src(scalar, 1);
         if (nir_ssa_scalar_is_const(src1_scalar)) {
            src0 = MIN2(src0, 31);
            src1 = nir_ssa_scalar_as_uint(src1_scalar) & 0x1fu;
            res = bitmask(src0) << src1;
         } else {
            src0 = MIN2(src0, 31);
            src1 = MIN2(src1, 31);
            res = bitmask(MIN2(src0 + src1, 32));
         }
         break;
      }
      /* limited floating-point support for f2u32(fmul(load_input(), <constant>)) */
      case nir_op_f2u32:
         /* infinity/NaN starts at 0x7f800000u, negative numbers at 0x80000000 */
         if (src0 < 0x7f800000u) {
            float val;
            memcpy(&val, &src0, 4);
            res = (uint32_t)val;
         }
         break;
      case nir_op_fmul:
         /* infinity/NaN starts at 0x7f800000u, negative numbers at 0x80000000 */
         if (src0 < 0x7f800000u && src1 < 0x7f800000u) {
            float src0_f, src1_f;
            memcpy(&src0_f, &src0, 4);
            memcpy(&src1_f, &src1, 4);
            /* not a proper rounding-up multiplication, but should be good enough */
            float max_f = ceilf(src0_f) * ceilf(src1_f);
            memcpy(&res, &max_f, 4);
         }
         break;
      case nir_op_u2u1:
      case nir_op_u2u8:
      case nir_op_u2u16:
      case nir_op_u2u32:
         res = MIN2(src0, max);
         break;
      case nir_op_sad_u8x4:
         res = src2 + 4 * 255;
         break;
      case nir_op_extract_u8:
         res = MIN2(src0, UINT8_MAX);
         break;
      case nir_op_extract_i8:
         res = (src0 >= 0x80) ? max : MIN2(src0, INT8_MAX);
         break;
      case nir_op_extract_u16:
         res = MIN2(src0, UINT16_MAX);
         break;
      case nir_op_extract_i16:
         res = (src0 >= 0x8000) ? max : MIN2(src0, INT16_MAX);
         break;
      default:
         res = max;
         break;
      }
      _mesa_hash_table_insert(range_ht, key, (void*)(uintptr_t)res);
      return res;
   }

   return max;
}
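
/* A minimal usage sketch (hypothetical caller): to bound component 0 of some
 * nir_ssa_def *def using the generic configuration above, one might write
 *
 *    struct hash_table *range_ht = _mesa_pointer_hash_table_create(NULL);
 *    uint32_t ub = nir_unsigned_upper_bound(shader, range_ht,
 *                                           (nir_ssa_scalar){def, 0}, NULL);
 *    _mesa_hash_table_destroy(range_ht, NULL);
 *
 * Passing NULL for the config selects default_ub_config.
 */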

bool
nir_addition_might_overflow(nir_shader *shader, struct hash_table *range_ht,
                            nir_ssa_scalar ssa, unsigned const_val,
                            const nir_unsigned_upper_bound_config *config)
{
   if (nir_ssa_scalar_is_alu(ssa)) {
      nir_op alu_op = nir_ssa_scalar_alu_op(ssa);

      /* iadd(imul(a, #b), #c) */
      if (alu_op == nir_op_imul || alu_op == nir_op_ishl) {
         nir_ssa_scalar mul_src0 = nir_ssa_scalar_chase_alu_src(ssa, 0);
         nir_ssa_scalar mul_src1 = nir_ssa_scalar_chase_alu_src(ssa, 1);
         uint32_t stride = 1;
         if (nir_ssa_scalar_is_const(mul_src0))
            stride = nir_ssa_scalar_as_uint(mul_src0);
         else if (nir_ssa_scalar_is_const(mul_src1))
            stride = nir_ssa_scalar_as_uint(mul_src1);

         if (alu_op == nir_op_ishl)
            stride = 1u << (stride % 32u);

         if (!stride || const_val <= UINT32_MAX - (UINT32_MAX / stride * stride))
            return false;
      }

      /* iadd(iand(a, #b), #c) */
      if (alu_op == nir_op_iand) {
         nir_ssa_scalar and_src0 = nir_ssa_scalar_chase_alu_src(ssa, 0);
         nir_ssa_scalar and_src1 = nir_ssa_scalar_chase_alu_src(ssa, 1);
         uint32_t mask = 0xffffffff;
         if (nir_ssa_scalar_is_const(and_src0))
            mask = nir_ssa_scalar_as_uint(and_src0);
         else if (nir_ssa_scalar_is_const(and_src1))
            mask = nir_ssa_scalar_as_uint(and_src1);
         if (mask == 0 || const_val < (1u << (ffs(mask) - 1)))
            return false;
      }
   }

   uint32_t ub = nir_unsigned_upper_bound(shader, range_ht, ssa, config);
   return const_val + ub < const_val;
}
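
/* Worked example for the imul/ishl path above: in iadd(ishl(a, 2), 3) the
 * shifted operand is always a multiple of 4, so its largest 32-bit value is
 * 0xfffffffc.  With stride == 4, UINT32_MAX - (UINT32_MAX / 4 * 4) == 3, so
 * any const_val <= 3 can be proven not to wrap without ever computing an
 * upper bound for `a`.
 */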

static uint64_t
ssa_def_bits_used(nir_ssa_def *def, int recur)
{
   uint64_t bits_used = 0;
   uint64_t all_bits = BITFIELD64_MASK(def->bit_size);

   /* Querying the bits used from a vector is too hard a question to answer.
    * Return the conservative answer that all bits are used.  To handle this,
    * the function would need to be extended to be a query of a single
    * component of the vector.  That would also be necessary to fully handle
    * the 'num_components > 1' inside the loop below.
    *
    * FINISHME: This restriction will eventually need to be lifted to be
    * useful for hardware that uses u16vec2 as the native 16-bit integer type.
    */
   if (def->num_components > 1)
      return all_bits;

   /* Limit recursion */
   if (recur-- <= 0)
      return all_bits;

   nir_foreach_use(src, def) {
      switch (src->parent_instr->type) {
      case nir_instr_type_alu: {
         nir_alu_instr *use_alu = nir_instr_as_alu(src->parent_instr);
         unsigned src_idx = container_of(src, nir_alu_src, src) - use_alu->src;

         /* If a user of the value produces a vector result, return the
          * conservative answer that all bits are used.  It is possible to
          * answer this query by looping over the components used.  For example,
          *
          *    vec4 32 ssa_5 = load_const(0x0000f000, 0x00000f00, 0x000000f0, 0x0000000f)
          *    ...
          *    vec4 32 ssa_8 = iand ssa_7.xxxx, ssa_5
          *
          * could conceivably return 0x0000ffff when querying the bits used of
          * ssa_7.  This is unlikely to be worth the effort because the
          * question can eventually be answered after the shader has been
          * scalarized.
          */
         if (use_alu->dest.dest.ssa.num_components > 1)
            return all_bits;

         switch (use_alu->op) {
         case nir_op_u2u8:
         case nir_op_i2i8:
            bits_used |= 0xff;
            break;

         case nir_op_u2u16:
         case nir_op_i2i16:
            bits_used |= all_bits & 0xffff;
            break;

         case nir_op_u2u32:
         case nir_op_i2i32:
            bits_used |= all_bits & 0xffffffff;
            break;

         case nir_op_extract_u8:
         case nir_op_extract_i8:
            if (src_idx == 0 && nir_src_is_const(use_alu->src[1].src)) {
               unsigned chunk = nir_src_comp_as_uint(use_alu->src[1].src,
                                                     use_alu->src[1].swizzle[0]);
               bits_used |= 0xffull << (chunk * 8);
               break;
            } else {
               return all_bits;
            }

         case nir_op_extract_u16:
         case nir_op_extract_i16:
            if (src_idx == 0 && nir_src_is_const(use_alu->src[1].src)) {
               unsigned chunk = nir_src_comp_as_uint(use_alu->src[1].src,
                                                     use_alu->src[1].swizzle[0]);
               bits_used |= 0xffffull << (chunk * 16);
               break;
            } else {
               return all_bits;
            }

         case nir_op_ishl:
         case nir_op_ishr:
         case nir_op_ushr:
            if (src_idx == 1) {
               bits_used |= (nir_src_bit_size(use_alu->src[0].src) - 1);
               break;
            } else {
               return all_bits;
            }

         case nir_op_iand:
            assert(src_idx < 2);
            if (nir_src_is_const(use_alu->src[1 - src_idx].src)) {
               uint64_t u64 = nir_src_comp_as_uint(use_alu->src[1 - src_idx].src,
                                                   use_alu->src[1 - src_idx].swizzle[0]);
               bits_used |= u64;
               break;
            } else {
               return all_bits;
            }

         case nir_op_ior:
            assert(src_idx < 2);
            if (nir_src_is_const(use_alu->src[1 - src_idx].src)) {
               uint64_t u64 = nir_src_comp_as_uint(use_alu->src[1 - src_idx].src,
                                                   use_alu->src[1 - src_idx].swizzle[0]);
               bits_used |= all_bits & ~u64;
               break;
            } else {
               return all_bits;
            }

         default:
            /* We don't know what this op does */
            return all_bits;
         }
         break;
      }

      case nir_instr_type_intrinsic: {
         nir_intrinsic_instr *use_intrin =
            nir_instr_as_intrinsic(src->parent_instr);
         unsigned src_idx = src - use_intrin->src;

         switch (use_intrin->intrinsic) {
         case nir_intrinsic_read_invocation:
         case nir_intrinsic_shuffle:
         case nir_intrinsic_shuffle_up:
         case nir_intrinsic_shuffle_down:
         case nir_intrinsic_shuffle_xor:
         case nir_intrinsic_quad_broadcast:
         case nir_intrinsic_quad_swap_horizontal:
         case nir_intrinsic_quad_swap_vertical:
         case nir_intrinsic_quad_swap_diagonal:
            if (src_idx == 0) {
               assert(use_intrin->dest.is_ssa);
               bits_used |= ssa_def_bits_used(&use_intrin->dest.ssa, recur);
            } else {
               if (use_intrin->intrinsic == nir_intrinsic_quad_broadcast) {
                  bits_used |= 3;
               } else {
                  /* Subgroups larger than 128 are not a thing */
                  bits_used |= 127;
               }
            }
            break;

         case nir_intrinsic_reduce:
         case nir_intrinsic_inclusive_scan:
         case nir_intrinsic_exclusive_scan:
            assert(src_idx == 0);
            switch (nir_intrinsic_reduction_op(use_intrin)) {
            case nir_op_iadd:
            case nir_op_imul:
            case nir_op_ior:
            case nir_op_iand:
            case nir_op_ixor:
               bits_used |= ssa_def_bits_used(&use_intrin->dest.ssa, recur);
               break;

            default:
               return all_bits;
            }
            break;

         default:
            /* We don't know what this op does */
            return all_bits;
         }
         break;
      }

      case nir_instr_type_phi: {
         nir_phi_instr *use_phi = nir_instr_as_phi(src->parent_instr);
         bits_used |= ssa_def_bits_used(&use_phi->dest.ssa, recur);
         break;
      }

      default:
         return all_bits;
      }

      /* If we've somehow shown that all our bits are used, we're done */
      assert((bits_used & ~all_bits) == 0);
      if (bits_used == all_bits)
         return all_bits;
   }

   return bits_used;
}

uint64_t
nir_ssa_def_bits_used(nir_ssa_def *def)
{
   return ssa_def_bits_used(def, 2);
}
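
/* Illustrative note: for a scalar 32-bit def whose only use is a scalar
 * iand with the constant 0xff, nir_ssa_def_bits_used() returns 0xff -- only
 * the low byte is consumed, so a pass could narrow the producer to 8 bits.
 */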