17ec681f3Smrg/* 27ec681f3Smrg * Copyright © 2018 Intel Corporation 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 217ec681f3Smrg * IN THE SOFTWARE. 227ec681f3Smrg */ 237ec681f3Smrg#include <math.h> 247ec681f3Smrg#include "nir.h" 257ec681f3Smrg#include "nir_builder.h" 267ec681f3Smrg#include "util/u_vector.h" 277ec681f3Smrg 287ec681f3Smrg/** 297ec681f3Smrg * Lower flrp instructions. 307ec681f3Smrg * 317ec681f3Smrg * Unlike the lowerings that are possible in nir_opt_algrbraic, this pass can 327ec681f3Smrg * examine more global information to determine a possibly more efficient 337ec681f3Smrg * lowering for each flrp. 
 */

/**
 * Record \p alu on the list of flrp instructions to be deleted later.
 *
 * The lowering helpers below must not delete the flrp they replace right
 * away (see the comment at each call site), so they stash it here instead.
 */
static void
append_flrp_to_dead_list(struct u_vector *dead_flrp, struct nir_alu_instr *alu)
{
   /* u_vector_add returns a pointer to the freshly appended slot. */
   struct nir_alu_instr **tail = u_vector_add(dead_flrp);
   *tail = alu;
}

/**
 * Replace flrp(a, b, c) with ffma(b, c, ffma(-a, c, a)).
 */
static void
replace_with_strict_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
                         struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   /* Each instruction built here inherits the exact flag of the original
    * flrp so that a precise flrp stays precise after lowering.
    */
   nir_ssa_def *const neg_a = nir_fneg(bld, a);
   nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact;

   /* inner = -a*c + a == a*(1 - c) */
   nir_ssa_def *const inner_ffma = nir_ffma(bld, neg_a, c, a);
   nir_instr_as_alu(inner_ffma->parent_instr)->exact = alu->exact;

   /* outer = b*c + inner */
   nir_ssa_def *const outer_ffma = nir_ffma(bld, b, c, inner_ffma);
   nir_instr_as_alu(outer_ffma->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, outer_ffma);

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}

/**
 * Replace flrp(a, b, c) with ffma(a, (1 - c), bc)
 */
static void
replace_with_single_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
                         struct nir_alu_instr *alu)
{
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);

   /* Each new instruction inherits the exact flag of the original flrp. */
   nir_ssa_def *const neg_c = nir_fneg(bld, c);
   nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;

   /* 1 - c, built as 1 + (-c). */
   nir_ssa_def *const one_minus_c =
      nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c);
   nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact;

   nir_ssa_def *const b_times_c = nir_fmul(bld, b, c);
   nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact;

   /* a*(1 - c) + b*c */
   nir_ssa_def *const final_ffma = nir_ffma(bld, a, one_minus_c, b_times_c);
   nir_instr_as_alu(final_ffma->parent_instr)->exact = alu->exact;

   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, final_ffma);

   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
    * based on other uses of the sources.  Removing the flrp may cause the
    * last flrp in a sequence to make a different, incorrect choice.
    */
   append_flrp_to_dead_list(dead_flrp, alu);
}

/**
 * Replace flrp(a, b, c) with a(1-c) + bc.
1077ec681f3Smrg */ 1087ec681f3Smrgstatic void 1097ec681f3Smrgreplace_with_strict(struct nir_builder *bld, struct u_vector *dead_flrp, 1107ec681f3Smrg struct nir_alu_instr *alu) 1117ec681f3Smrg{ 1127ec681f3Smrg nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0); 1137ec681f3Smrg nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1); 1147ec681f3Smrg nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2); 1157ec681f3Smrg 1167ec681f3Smrg nir_ssa_def *const neg_c = nir_fneg(bld, c); 1177ec681f3Smrg nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact; 1187ec681f3Smrg 1197ec681f3Smrg nir_ssa_def *const one_minus_c = 1207ec681f3Smrg nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c); 1217ec681f3Smrg nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact; 1227ec681f3Smrg 1237ec681f3Smrg nir_ssa_def *const first_product = nir_fmul(bld, a, one_minus_c); 1247ec681f3Smrg nir_instr_as_alu(first_product->parent_instr)->exact = alu->exact; 1257ec681f3Smrg 1267ec681f3Smrg nir_ssa_def *const second_product = nir_fmul(bld, b, c); 1277ec681f3Smrg nir_instr_as_alu(second_product->parent_instr)->exact = alu->exact; 1287ec681f3Smrg 1297ec681f3Smrg nir_ssa_def *const sum = nir_fadd(bld, first_product, second_product); 1307ec681f3Smrg nir_instr_as_alu(sum->parent_instr)->exact = alu->exact; 1317ec681f3Smrg 1327ec681f3Smrg nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum); 1337ec681f3Smrg 1347ec681f3Smrg /* DO NOT REMOVE the original flrp yet. Many of the lowering choices are 1357ec681f3Smrg * based on other uses of the sources. Removing the flrp may cause the 1367ec681f3Smrg * last flrp in a sequence to make a different, incorrect choice. 1377ec681f3Smrg */ 1387ec681f3Smrg append_flrp_to_dead_list(dead_flrp, alu); 1397ec681f3Smrg} 1407ec681f3Smrg 1417ec681f3Smrg/** 1427ec681f3Smrg * Replace flrp(a, b, c) with a + c(b-a). 
1437ec681f3Smrg */ 1447ec681f3Smrgstatic void 1457ec681f3Smrgreplace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp, 1467ec681f3Smrg struct nir_alu_instr *alu) 1477ec681f3Smrg{ 1487ec681f3Smrg nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0); 1497ec681f3Smrg nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1); 1507ec681f3Smrg nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2); 1517ec681f3Smrg 1527ec681f3Smrg nir_ssa_def *const neg_a = nir_fneg(bld, a); 1537ec681f3Smrg nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact; 1547ec681f3Smrg 1557ec681f3Smrg nir_ssa_def *const b_minus_a = nir_fadd(bld, b, neg_a); 1567ec681f3Smrg nir_instr_as_alu(b_minus_a->parent_instr)->exact = alu->exact; 1577ec681f3Smrg 1587ec681f3Smrg nir_ssa_def *const product = nir_fmul(bld, c, b_minus_a); 1597ec681f3Smrg nir_instr_as_alu(product->parent_instr)->exact = alu->exact; 1607ec681f3Smrg 1617ec681f3Smrg nir_ssa_def *const sum = nir_fadd(bld, a, product); 1627ec681f3Smrg nir_instr_as_alu(sum->parent_instr)->exact = alu->exact; 1637ec681f3Smrg 1647ec681f3Smrg nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum); 1657ec681f3Smrg 1667ec681f3Smrg /* DO NOT REMOVE the original flrp yet. Many of the lowering choices are 1677ec681f3Smrg * based on other uses of the sources. Removing the flrp may cause the 1687ec681f3Smrg * last flrp in a sequence to make a different, incorrect choice. 1697ec681f3Smrg */ 1707ec681f3Smrg append_flrp_to_dead_list(dead_flrp, alu); 1717ec681f3Smrg} 1727ec681f3Smrg 1737ec681f3Smrg/** 1747ec681f3Smrg * Replace flrp(a, b, c) with (b*c ± c) + a => b*c + (a ± c) 1757ec681f3Smrg * 1767ec681f3Smrg * \note: This only works if a = ±1. 
1777ec681f3Smrg */ 1787ec681f3Smrgstatic void 1797ec681f3Smrgreplace_with_expanded_ffma_and_add(struct nir_builder *bld, 1807ec681f3Smrg struct u_vector *dead_flrp, 1817ec681f3Smrg struct nir_alu_instr *alu, bool subtract_c) 1827ec681f3Smrg{ 1837ec681f3Smrg nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0); 1847ec681f3Smrg nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1); 1857ec681f3Smrg nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2); 1867ec681f3Smrg 1877ec681f3Smrg nir_ssa_def *const b_times_c = nir_fmul(bld, b, c); 1887ec681f3Smrg nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact; 1897ec681f3Smrg 1907ec681f3Smrg nir_ssa_def *inner_sum; 1917ec681f3Smrg 1927ec681f3Smrg if (subtract_c) { 1937ec681f3Smrg nir_ssa_def *const neg_c = nir_fneg(bld, c); 1947ec681f3Smrg nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact; 1957ec681f3Smrg 1967ec681f3Smrg inner_sum = nir_fadd(bld, a, neg_c); 1977ec681f3Smrg } else { 1987ec681f3Smrg inner_sum = nir_fadd(bld, a, c); 1997ec681f3Smrg } 2007ec681f3Smrg 2017ec681f3Smrg nir_instr_as_alu(inner_sum->parent_instr)->exact = alu->exact; 2027ec681f3Smrg 2037ec681f3Smrg nir_ssa_def *const outer_sum = nir_fadd(bld, inner_sum, b_times_c); 2047ec681f3Smrg nir_instr_as_alu(outer_sum->parent_instr)->exact = alu->exact; 2057ec681f3Smrg 2067ec681f3Smrg nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, outer_sum); 2077ec681f3Smrg 2087ec681f3Smrg /* DO NOT REMOVE the original flrp yet. Many of the lowering choices are 2097ec681f3Smrg * based on other uses of the sources. Removing the flrp may cause the 2107ec681f3Smrg * last flrp in a sequence to make a different, incorrect choice. 2117ec681f3Smrg */ 2127ec681f3Smrg append_flrp_to_dead_list(dead_flrp, alu); 2137ec681f3Smrg} 2147ec681f3Smrg 2157ec681f3Smrg/** 2167ec681f3Smrg * Determines whether a swizzled source is constant w/ all components the same. 2177ec681f3Smrg * 2187ec681f3Smrg * The value of the constant is stored in \c result. 
2197ec681f3Smrg * 2207ec681f3Smrg * \return 2217ec681f3Smrg * True if all components of the swizzled source are the same constant. 2227ec681f3Smrg * Otherwise false is returned. 2237ec681f3Smrg */ 2247ec681f3Smrgstatic bool 2257ec681f3Smrgall_same_constant(const nir_alu_instr *instr, unsigned src, double *result) 2267ec681f3Smrg{ 2277ec681f3Smrg nir_const_value *val = nir_src_as_const_value(instr->src[src].src); 2287ec681f3Smrg 2297ec681f3Smrg if (!val) 2307ec681f3Smrg return false; 2317ec681f3Smrg 2327ec681f3Smrg const uint8_t *const swizzle = instr->src[src].swizzle; 2337ec681f3Smrg const unsigned num_components = nir_dest_num_components(instr->dest.dest); 2347ec681f3Smrg 2357ec681f3Smrg if (instr->dest.dest.ssa.bit_size == 32) { 2367ec681f3Smrg const float first = val[swizzle[0]].f32; 2377ec681f3Smrg 2387ec681f3Smrg for (unsigned i = 1; i < num_components; i++) { 2397ec681f3Smrg if (val[swizzle[i]].f32 != first) 2407ec681f3Smrg return false; 2417ec681f3Smrg } 2427ec681f3Smrg 2437ec681f3Smrg *result = first; 2447ec681f3Smrg } else { 2457ec681f3Smrg const double first = val[swizzle[0]].f64; 2467ec681f3Smrg 2477ec681f3Smrg for (unsigned i = 1; i < num_components; i++) { 2487ec681f3Smrg if (val[swizzle[i]].f64 != first) 2497ec681f3Smrg return false; 2507ec681f3Smrg } 2517ec681f3Smrg 2527ec681f3Smrg *result = first; 2537ec681f3Smrg } 2547ec681f3Smrg 2557ec681f3Smrg return true; 2567ec681f3Smrg} 2577ec681f3Smrg 2587ec681f3Smrgstatic bool 2597ec681f3Smrgsources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr) 2607ec681f3Smrg{ 2617ec681f3Smrg nir_const_value *val0 = nir_src_as_const_value(instr->src[0].src); 2627ec681f3Smrg nir_const_value *val1 = nir_src_as_const_value(instr->src[1].src); 2637ec681f3Smrg 2647ec681f3Smrg if (val0 == NULL || val1 == NULL) 2657ec681f3Smrg return false; 2667ec681f3Smrg 2677ec681f3Smrg const uint8_t *const swizzle0 = instr->src[0].swizzle; 2687ec681f3Smrg const uint8_t *const swizzle1 = instr->src[1].swizzle; 
2697ec681f3Smrg const unsigned num_components = nir_dest_num_components(instr->dest.dest); 2707ec681f3Smrg 2717ec681f3Smrg if (instr->dest.dest.ssa.bit_size == 32) { 2727ec681f3Smrg for (unsigned i = 0; i < num_components; i++) { 2737ec681f3Smrg int exp0; 2747ec681f3Smrg int exp1; 2757ec681f3Smrg 2767ec681f3Smrg frexpf(val0[swizzle0[i]].f32, &exp0); 2777ec681f3Smrg frexpf(val1[swizzle1[i]].f32, &exp1); 2787ec681f3Smrg 2797ec681f3Smrg /* If the difference between exponents is >= 24, then A+B will always 2807ec681f3Smrg * have the value whichever between A and B has the largest absolute 2817ec681f3Smrg * value. So, [0, 23] is the valid range. The smaller the limit 2827ec681f3Smrg * value, the more precision will be maintained at a potential 2837ec681f3Smrg * performance cost. Somewhat arbitrarilly split the range in half. 2847ec681f3Smrg */ 2857ec681f3Smrg if (abs(exp0 - exp1) > (23 / 2)) 2867ec681f3Smrg return false; 2877ec681f3Smrg } 2887ec681f3Smrg } else { 2897ec681f3Smrg for (unsigned i = 0; i < num_components; i++) { 2907ec681f3Smrg int exp0; 2917ec681f3Smrg int exp1; 2927ec681f3Smrg 2937ec681f3Smrg frexp(val0[swizzle0[i]].f64, &exp0); 2947ec681f3Smrg frexp(val1[swizzle1[i]].f64, &exp1); 2957ec681f3Smrg 2967ec681f3Smrg /* If the difference between exponents is >= 53, then A+B will always 2977ec681f3Smrg * have the value whichever between A and B has the largest absolute 2987ec681f3Smrg * value. So, [0, 52] is the valid range. The smaller the limit 2997ec681f3Smrg * value, the more precision will be maintained at a potential 3007ec681f3Smrg * performance cost. Somewhat arbitrarilly split the range in half. 
3017ec681f3Smrg */ 3027ec681f3Smrg if (abs(exp0 - exp1) > (52 / 2)) 3037ec681f3Smrg return false; 3047ec681f3Smrg } 3057ec681f3Smrg } 3067ec681f3Smrg 3077ec681f3Smrg return true; 3087ec681f3Smrg} 3097ec681f3Smrg 3107ec681f3Smrg/** 3117ec681f3Smrg * Counts of similar types of nir_op_flrp instructions 3127ec681f3Smrg * 3137ec681f3Smrg * If a similar instruction fits into more than one category, it will only be 3147ec681f3Smrg * counted once. The assumption is that no other instruction will have all 3157ec681f3Smrg * sources the same, or CSE would have removed one of the instructions. 3167ec681f3Smrg */ 3177ec681f3Smrgstruct similar_flrp_stats { 3187ec681f3Smrg unsigned src2; 3197ec681f3Smrg unsigned src0_and_src2; 3207ec681f3Smrg unsigned src1_and_src2; 3217ec681f3Smrg}; 3227ec681f3Smrg 3237ec681f3Smrg/** 3247ec681f3Smrg * Collection counts of similar FLRP instructions. 3257ec681f3Smrg * 3267ec681f3Smrg * This function only cares about similar instructions that have src2 in 3277ec681f3Smrg * common. 3287ec681f3Smrg */ 3297ec681f3Smrgstatic void 3307ec681f3Smrgget_similar_flrp_stats(nir_alu_instr *alu, struct similar_flrp_stats *st) 3317ec681f3Smrg{ 3327ec681f3Smrg memset(st, 0, sizeof(*st)); 3337ec681f3Smrg 3347ec681f3Smrg nir_foreach_use(other_use, alu->src[2].src.ssa) { 3357ec681f3Smrg /* Is the use also a flrp? */ 3367ec681f3Smrg nir_instr *const other_instr = other_use->parent_instr; 3377ec681f3Smrg if (other_instr->type != nir_instr_type_alu) 3387ec681f3Smrg continue; 3397ec681f3Smrg 3407ec681f3Smrg /* Eh-hem... don't match the instruction with itself. */ 3417ec681f3Smrg if (other_instr == &alu->instr) 3427ec681f3Smrg continue; 3437ec681f3Smrg 3447ec681f3Smrg nir_alu_instr *const other_alu = nir_instr_as_alu(other_instr); 3457ec681f3Smrg if (other_alu->op != nir_op_flrp) 3467ec681f3Smrg continue; 3477ec681f3Smrg 3487ec681f3Smrg /* Does the other flrp use source 2 from the first flrp as its source 2 3497ec681f3Smrg * as well? 
       */
      if (!nir_alu_srcs_equal(alu, other_alu, 2, 2))
         continue;

      /* Classify the match: also sharing src0 or src1 opens up additional
       * sharing opportunities during lowering.
       */
      if (nir_alu_srcs_equal(alu, other_alu, 0, 0))
         st->src0_and_src2++;
      else if (nir_alu_srcs_equal(alu, other_alu, 1, 1))
         st->src1_and_src2++;
      else
         st->src2++;
   }
}

/**
 * Pick a lowering for one flrp instruction and emit it.
 *
 * The replaced flrp is appended to \p dead_flrp rather than removed so that
 * later lowering decisions can still inspect its source uses.
 */
static void
convert_flrp_instruction(nir_builder *bld,
                         struct u_vector *dead_flrp,
                         nir_alu_instr *alu,
                         bool always_precise)
{
   bool have_ffma = false;
   unsigned bit_size = nir_dest_bit_size(alu->dest.dest);

   /* FFMA availability comes from the per-bit-size lowering options. */
   if (bit_size == 16)
      have_ffma = !bld->shader->options->lower_ffma16;
   else if (bit_size == 32)
      have_ffma = !bld->shader->options->lower_ffma32;
   else if (bit_size == 64)
      have_ffma = !bld->shader->options->lower_ffma64;
   else
      unreachable("invalid bit_size");

   bld->cursor = nir_before_instr(&alu->instr);

   /* There are two methods to implement flrp(x, y, t).  The strictly correct
    * implementation according to the GLSL spec is:
    *
    *    x(1 - t) + yt
    *
    * This can also be implemented using two chained FMAs
    *
    *    fma(y, t, fma(-x, t, x))
    *
    * This method, using either formulation, has better precision when the
    * difference between x and y is very large.  It guarantees that flrp(x,
    * y, 1) = y.  For example, flrp(1e38, 1.0, 1.0) is 1.0.  This is correct.
    *
    * The other possible implementation is:
    *
    *    x + t(y - x)
    *
    * This can also be formulated as an FMA:
    *
    *    fma(y - x, t, x)
    *
    * For this implementation, flrp(1e38, 1.0, 1.0) is 0.0.  Since 1.0 was
    * expected, that's a pretty significant error.
    *
    * The choice made for lowering depends on a number of factors.
    *
    * - If the flrp is marked precise and FMA is supported:
    *
    *        fma(y, t, fma(-x, t, x))
    *
    *   This is strictly correct (maybe?), and the cost is two FMA
    *   instructions.  It at least maintains the flrp(x, y, 1.0) == y
    *   condition.
    *
    * - If the flrp is marked precise and FMA is not supported:
    *
    *        x(1 - t) + yt
    *
    *   This is strictly correct, and the cost is 4 instructions.  If FMA is
    *   supported, this may or may not be reduced to 3 instructions (a
    *   subtract, a multiply, and an FMA)... but in that case the other
    *   formulation should have been used.
    */
   if (alu->exact) {
      if (have_ffma)
         replace_with_strict_ffma(bld, dead_flrp, alu);
      else
         replace_with_strict(bld, dead_flrp, alu);

      return;
   }

   /*
    * - If x and y are both immediates and the relative magnitude of the
    *   values is similar (such that y-x does not lose too much precision):
    *
    *        x + t(y - x)
    *
    *   We rely on constant folding to eliminate y-x, and we rely on
    *   nir_opt_algebraic to possibly generate an FMA.  The cost is either
    *   one FMA or two instructions.
    */
   if (sources_are_constants_with_similar_magnitudes(alu)) {
      replace_with_fast(bld, dead_flrp, alu);
      return;
   }

   /*
    * - If x = 1:
    *
    *        (yt + -t) + 1
    *
    * - If x = -1:
    *
    *        (yt + t) - 1
    *
    *   In both cases, x is used in place of ±1 for simplicity.  Both forms
    *   lend to ffma generation on platforms that support ffma.
    */
   double src0_as_constant;
   if (all_same_constant(alu, 0, &src0_as_constant)) {
      if (src0_as_constant == 1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            true /* subtract t */);
         return;
      } else if (src0_as_constant == -1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            false /* add t */);
         return;
      }
   }

   /*
    * - If y = ±1:
    *
    *        x(1 - t) + yt
    *
    *   In this case the multiply in yt will be eliminated by
    *   nir_opt_algebraic.  If FMA is supported, this results in fma(x, (1 -
    *   t), ±t) for two instructions.  If FMA is not supported, then the cost
    *   is 3 instructions.  We rely on nir_opt_algebraic to generate the FMA
    *   instructions as well.
    *
    *   Another possible replacement is
    *
    *        -xt + x ± t
    *
    *   Some groupings of this may be better on some platforms in some
    *   circumstances, but it is probably dependent on scheduling.  Further
    *   investigation may be required.
    */
   double src1_as_constant;
   if ((all_same_constant(alu, 1, &src1_as_constant) &&
        (src1_as_constant == -1.0 || src1_as_constant == 1.0))) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   if (have_ffma) {
      if (always_precise) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and another flrp(x, _, t) exists:
       *
       *        fma(y, t, fma(-x, t, x))
       *
       *   The hope is that the inner FMA calculation will be shared with the
       *   other lowered flrp.  This results in two FMA instructions for the
       *   first flrp and one FMA instruction for each additional flrp.  It
       *   also means that the live range for x might be complete after the
       *   inner ffma instead of after the last flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and another flrp(_, y, t) exists:
       *
       *        fma(x, (1 - t), yt)
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 3 instructions for the first
       *   flrp and 1 for each additional flrp.
       */
      if (st.src1_and_src2 > 0) {
         replace_with_single_ffma(bld, dead_flrp, alu);
         return;
      }
   } else {
      if (always_precise) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is not supported and another flrp(x, _, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the x(1 - t) will be shared with the other
       *   lowered flrp.  This results in 4 instructions for the first flrp
       *   and 2 for each additional flrp.
       *
       * - If FMA is not supported and another flrp(_, y, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 4 instructions for the first
       *   flrp and 2 for each additional flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0 || st.src1_and_src2 > 0) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }
   }

   /*
    * - If t is constant:
    *
    *        x(1 - t) + yt
    *
    *   The cost is three instructions without FMA or two instructions with
    *   FMA.  This is the same cost as the imprecise lowering, but it gives
    *   the instruction scheduler a little more freedom.
    *
    *   There is no need to handle t = 0.5 specially.  nir_opt_algebraic
    *   already has optimizations to convert 0.5x + 0.5y to 0.5(x + y).
    */
   if (alu->src[2].src.ssa->parent_instr->type == nir_instr_type_load_const) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   /*
    * - Otherwise
    *
    *        x + t(y - x)
    */
   replace_with_fast(bld, dead_flrp, alu);
}

/**
 * Lower every flrp in \p impl whose bit size is selected by \p lowering_mask.
 *
 * Replaced instructions are appended to \p dead_flrp for later removal by
 * the caller.
 */
static void
lower_flrp_impl(nir_function_impl *impl,
                struct u_vector *dead_flrp,
                unsigned lowering_mask,
                bool always_precise)
{
   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_alu) {
            nir_alu_instr *const alu = nir_instr_as_alu(instr);

            if (alu->op == nir_op_flrp &&
                (alu->dest.dest.ssa.bit_size & lowering_mask)) {
               convert_flrp_instruction(&b, dead_flrp, alu, always_precise);
            }
         }
      }
   }

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);
}

/**
 * Lower flrp instructions in a shader.
 *
 * \param lowering_mask - Bitwise-or of the bit sizes that need to be lowered
 *                        (e.g., 16 | 64 if only 16-bit and 64-bit flrp need
 *                        lowering).
 * \param always_precise - Always require precise lowering for flrp.  When
 *                         set, one of the precise lowerings is used even for
 *                         flrp instructions that are not marked exact.
 *
 * \note FFMA availability is derived from the shader's lower_ffma16/32/64
 * compiler options rather than passed as a parameter, so the choice can vary
 * per bit size.
6307ec681f3Smrg */ 6317ec681f3Smrgbool 6327ec681f3Smrgnir_lower_flrp(nir_shader *shader, 6337ec681f3Smrg unsigned lowering_mask, 6347ec681f3Smrg bool always_precise) 6357ec681f3Smrg{ 6367ec681f3Smrg struct u_vector dead_flrp; 6377ec681f3Smrg 6387ec681f3Smrg if (!u_vector_init_pow2(&dead_flrp, 8, sizeof(struct nir_alu_instr *))) 6397ec681f3Smrg return false; 6407ec681f3Smrg 6417ec681f3Smrg nir_foreach_function(function, shader) { 6427ec681f3Smrg if (function->impl) { 6437ec681f3Smrg lower_flrp_impl(function->impl, &dead_flrp, lowering_mask, 6447ec681f3Smrg always_precise); 6457ec681f3Smrg } 6467ec681f3Smrg } 6477ec681f3Smrg 6487ec681f3Smrg /* Progress was made if the dead list is not empty. Remove all the 6497ec681f3Smrg * instructions from the dead list. 6507ec681f3Smrg */ 6517ec681f3Smrg const bool progress = u_vector_length(&dead_flrp) != 0; 6527ec681f3Smrg 6537ec681f3Smrg struct nir_alu_instr **instr; 6547ec681f3Smrg u_vector_foreach(instr, &dead_flrp) 6557ec681f3Smrg nir_instr_remove(&(*instr)->instr); 6567ec681f3Smrg 6577ec681f3Smrg u_vector_finish(&dead_flrp); 6587ec681f3Smrg 6597ec681f3Smrg return progress; 6607ec681f3Smrg} 661