17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2018 Intel Corporation
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
217ec681f3Smrg * IN THE SOFTWARE.
227ec681f3Smrg */
237ec681f3Smrg#include <math.h>
247ec681f3Smrg#include "nir.h"
257ec681f3Smrg#include "nir_builder.h"
267ec681f3Smrg#include "util/u_vector.h"
277ec681f3Smrg
287ec681f3Smrg/**
297ec681f3Smrg * Lower flrp instructions.
307ec681f3Smrg *
 * Unlike the lowerings that are possible in nir_opt_algebraic, this pass can
327ec681f3Smrg * examine more global information to determine a possibly more efficient
337ec681f3Smrg * lowering for each flrp.
347ec681f3Smrg */
357ec681f3Smrg
/* Record a lowered flrp so the pass can delete it after all lowering
 * decisions have been made.
 */
static void
append_flrp_to_dead_list(struct u_vector *dead_flrp, struct nir_alu_instr *alu)
{
   *(struct nir_alu_instr **) u_vector_add(dead_flrp) = alu;
}
427ec681f3Smrg
437ec681f3Smrg/**
447ec681f3Smrg * Replace flrp(a, b, c) with ffma(b, c, ffma(-a, c, a)).
457ec681f3Smrg */
467ec681f3Smrgstatic void
477ec681f3Smrgreplace_with_strict_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
487ec681f3Smrg                         struct nir_alu_instr *alu)
497ec681f3Smrg{
507ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
517ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
527ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
537ec681f3Smrg
547ec681f3Smrg   nir_ssa_def *const neg_a = nir_fneg(bld, a);
557ec681f3Smrg   nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact;
567ec681f3Smrg
577ec681f3Smrg   nir_ssa_def *const inner_ffma = nir_ffma(bld, neg_a, c, a);
587ec681f3Smrg   nir_instr_as_alu(inner_ffma->parent_instr)->exact = alu->exact;
597ec681f3Smrg
607ec681f3Smrg   nir_ssa_def *const outer_ffma = nir_ffma(bld, b, c, inner_ffma);
617ec681f3Smrg   nir_instr_as_alu(outer_ffma->parent_instr)->exact = alu->exact;
627ec681f3Smrg
637ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, outer_ffma);
647ec681f3Smrg
657ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
667ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
677ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
687ec681f3Smrg    */
697ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
707ec681f3Smrg}
717ec681f3Smrg
727ec681f3Smrg/**
737ec681f3Smrg * Replace flrp(a, b, c) with ffma(a, (1 - c), bc)
747ec681f3Smrg */
757ec681f3Smrgstatic void
767ec681f3Smrgreplace_with_single_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
777ec681f3Smrg                         struct nir_alu_instr *alu)
787ec681f3Smrg{
797ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
807ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
817ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
827ec681f3Smrg
837ec681f3Smrg   nir_ssa_def *const neg_c = nir_fneg(bld, c);
847ec681f3Smrg   nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;
857ec681f3Smrg
867ec681f3Smrg   nir_ssa_def *const one_minus_c =
877ec681f3Smrg      nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c);
887ec681f3Smrg   nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact;
897ec681f3Smrg
907ec681f3Smrg   nir_ssa_def *const b_times_c = nir_fmul(bld, b, c);
917ec681f3Smrg   nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact;
927ec681f3Smrg
937ec681f3Smrg   nir_ssa_def *const final_ffma = nir_ffma(bld, a, one_minus_c, b_times_c);
947ec681f3Smrg   nir_instr_as_alu(final_ffma->parent_instr)->exact = alu->exact;
957ec681f3Smrg
967ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, final_ffma);
977ec681f3Smrg
987ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
997ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
1007ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
1017ec681f3Smrg    */
1027ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
1037ec681f3Smrg}
1047ec681f3Smrg
1057ec681f3Smrg/**
1067ec681f3Smrg * Replace flrp(a, b, c) with a(1-c) + bc.
1077ec681f3Smrg */
1087ec681f3Smrgstatic void
1097ec681f3Smrgreplace_with_strict(struct nir_builder *bld, struct u_vector *dead_flrp,
1107ec681f3Smrg                    struct nir_alu_instr *alu)
1117ec681f3Smrg{
1127ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
1137ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
1147ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
1157ec681f3Smrg
1167ec681f3Smrg   nir_ssa_def *const neg_c = nir_fneg(bld, c);
1177ec681f3Smrg   nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;
1187ec681f3Smrg
1197ec681f3Smrg   nir_ssa_def *const one_minus_c =
1207ec681f3Smrg      nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c);
1217ec681f3Smrg   nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact;
1227ec681f3Smrg
1237ec681f3Smrg   nir_ssa_def *const first_product = nir_fmul(bld, a, one_minus_c);
1247ec681f3Smrg   nir_instr_as_alu(first_product->parent_instr)->exact = alu->exact;
1257ec681f3Smrg
1267ec681f3Smrg   nir_ssa_def *const second_product = nir_fmul(bld, b, c);
1277ec681f3Smrg   nir_instr_as_alu(second_product->parent_instr)->exact = alu->exact;
1287ec681f3Smrg
1297ec681f3Smrg   nir_ssa_def *const sum = nir_fadd(bld, first_product, second_product);
1307ec681f3Smrg   nir_instr_as_alu(sum->parent_instr)->exact = alu->exact;
1317ec681f3Smrg
1327ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);
1337ec681f3Smrg
1347ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
1357ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
1367ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
1377ec681f3Smrg    */
1387ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
1397ec681f3Smrg}
1407ec681f3Smrg
1417ec681f3Smrg/**
1427ec681f3Smrg * Replace flrp(a, b, c) with a + c(b-a).
1437ec681f3Smrg */
1447ec681f3Smrgstatic void
1457ec681f3Smrgreplace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp,
1467ec681f3Smrg                  struct nir_alu_instr *alu)
1477ec681f3Smrg{
1487ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
1497ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
1507ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
1517ec681f3Smrg
1527ec681f3Smrg   nir_ssa_def *const neg_a = nir_fneg(bld, a);
1537ec681f3Smrg   nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact;
1547ec681f3Smrg
1557ec681f3Smrg   nir_ssa_def *const b_minus_a = nir_fadd(bld, b, neg_a);
1567ec681f3Smrg   nir_instr_as_alu(b_minus_a->parent_instr)->exact = alu->exact;
1577ec681f3Smrg
1587ec681f3Smrg   nir_ssa_def *const product = nir_fmul(bld, c, b_minus_a);
1597ec681f3Smrg   nir_instr_as_alu(product->parent_instr)->exact = alu->exact;
1607ec681f3Smrg
1617ec681f3Smrg   nir_ssa_def *const sum = nir_fadd(bld, a, product);
1627ec681f3Smrg   nir_instr_as_alu(sum->parent_instr)->exact = alu->exact;
1637ec681f3Smrg
1647ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);
1657ec681f3Smrg
1667ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
1677ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
1687ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
1697ec681f3Smrg    */
1707ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
1717ec681f3Smrg}
1727ec681f3Smrg
1737ec681f3Smrg/**
1747ec681f3Smrg * Replace flrp(a, b, c) with (b*c ± c) + a => b*c + (a ± c)
1757ec681f3Smrg *
1767ec681f3Smrg * \note: This only works if a = ±1.
1777ec681f3Smrg */
1787ec681f3Smrgstatic void
1797ec681f3Smrgreplace_with_expanded_ffma_and_add(struct nir_builder *bld,
1807ec681f3Smrg                                   struct u_vector *dead_flrp,
1817ec681f3Smrg                                   struct nir_alu_instr *alu, bool subtract_c)
1827ec681f3Smrg{
1837ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
1847ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
1857ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
1867ec681f3Smrg
1877ec681f3Smrg   nir_ssa_def *const b_times_c = nir_fmul(bld, b, c);
1887ec681f3Smrg   nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact;
1897ec681f3Smrg
1907ec681f3Smrg   nir_ssa_def *inner_sum;
1917ec681f3Smrg
1927ec681f3Smrg   if (subtract_c) {
1937ec681f3Smrg      nir_ssa_def *const neg_c = nir_fneg(bld, c);
1947ec681f3Smrg      nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;
1957ec681f3Smrg
1967ec681f3Smrg      inner_sum = nir_fadd(bld, a, neg_c);
1977ec681f3Smrg   } else {
1987ec681f3Smrg      inner_sum = nir_fadd(bld, a, c);
1997ec681f3Smrg   }
2007ec681f3Smrg
2017ec681f3Smrg   nir_instr_as_alu(inner_sum->parent_instr)->exact = alu->exact;
2027ec681f3Smrg
2037ec681f3Smrg   nir_ssa_def *const outer_sum = nir_fadd(bld, inner_sum, b_times_c);
2047ec681f3Smrg   nir_instr_as_alu(outer_sum->parent_instr)->exact = alu->exact;
2057ec681f3Smrg
2067ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, outer_sum);
2077ec681f3Smrg
2087ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
2097ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
2107ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
2117ec681f3Smrg    */
2127ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
2137ec681f3Smrg}
2147ec681f3Smrg
2157ec681f3Smrg/**
2167ec681f3Smrg * Determines whether a swizzled source is constant w/ all components the same.
2177ec681f3Smrg *
2187ec681f3Smrg * The value of the constant is stored in \c result.
2197ec681f3Smrg *
2207ec681f3Smrg * \return
2217ec681f3Smrg * True if all components of the swizzled source are the same constant.
2227ec681f3Smrg * Otherwise false is returned.
2237ec681f3Smrg */
2247ec681f3Smrgstatic bool
2257ec681f3Smrgall_same_constant(const nir_alu_instr *instr, unsigned src, double *result)
2267ec681f3Smrg{
2277ec681f3Smrg   nir_const_value *val = nir_src_as_const_value(instr->src[src].src);
2287ec681f3Smrg
2297ec681f3Smrg   if (!val)
2307ec681f3Smrg      return false;
2317ec681f3Smrg
2327ec681f3Smrg   const uint8_t *const swizzle = instr->src[src].swizzle;
2337ec681f3Smrg   const unsigned num_components = nir_dest_num_components(instr->dest.dest);
2347ec681f3Smrg
2357ec681f3Smrg   if (instr->dest.dest.ssa.bit_size == 32) {
2367ec681f3Smrg      const float first = val[swizzle[0]].f32;
2377ec681f3Smrg
2387ec681f3Smrg      for (unsigned i = 1; i < num_components; i++) {
2397ec681f3Smrg         if (val[swizzle[i]].f32 != first)
2407ec681f3Smrg            return false;
2417ec681f3Smrg      }
2427ec681f3Smrg
2437ec681f3Smrg      *result = first;
2447ec681f3Smrg   } else {
2457ec681f3Smrg      const double first = val[swizzle[0]].f64;
2467ec681f3Smrg
2477ec681f3Smrg      for (unsigned i = 1; i < num_components; i++) {
2487ec681f3Smrg         if (val[swizzle[i]].f64 != first)
2497ec681f3Smrg            return false;
2507ec681f3Smrg      }
2517ec681f3Smrg
2527ec681f3Smrg      *result = first;
2537ec681f3Smrg   }
2547ec681f3Smrg
2557ec681f3Smrg   return true;
2567ec681f3Smrg}
2577ec681f3Smrg
2587ec681f3Smrgstatic bool
2597ec681f3Smrgsources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr)
2607ec681f3Smrg{
2617ec681f3Smrg   nir_const_value *val0 = nir_src_as_const_value(instr->src[0].src);
2627ec681f3Smrg   nir_const_value *val1 = nir_src_as_const_value(instr->src[1].src);
2637ec681f3Smrg
2647ec681f3Smrg   if (val0 == NULL || val1 == NULL)
2657ec681f3Smrg      return false;
2667ec681f3Smrg
2677ec681f3Smrg   const uint8_t *const swizzle0 = instr->src[0].swizzle;
2687ec681f3Smrg   const uint8_t *const swizzle1 = instr->src[1].swizzle;
2697ec681f3Smrg   const unsigned num_components = nir_dest_num_components(instr->dest.dest);
2707ec681f3Smrg
2717ec681f3Smrg   if (instr->dest.dest.ssa.bit_size == 32) {
2727ec681f3Smrg      for (unsigned i = 0; i < num_components; i++) {
2737ec681f3Smrg         int exp0;
2747ec681f3Smrg         int exp1;
2757ec681f3Smrg
2767ec681f3Smrg         frexpf(val0[swizzle0[i]].f32, &exp0);
2777ec681f3Smrg         frexpf(val1[swizzle1[i]].f32, &exp1);
2787ec681f3Smrg
2797ec681f3Smrg         /* If the difference between exponents is >= 24, then A+B will always
2807ec681f3Smrg          * have the value whichever between A and B has the largest absolute
2817ec681f3Smrg          * value.  So, [0, 23] is the valid range.  The smaller the limit
2827ec681f3Smrg          * value, the more precision will be maintained at a potential
2837ec681f3Smrg          * performance cost.  Somewhat arbitrarilly split the range in half.
2847ec681f3Smrg          */
2857ec681f3Smrg         if (abs(exp0 - exp1) > (23 / 2))
2867ec681f3Smrg            return false;
2877ec681f3Smrg      }
2887ec681f3Smrg   } else {
2897ec681f3Smrg      for (unsigned i = 0; i < num_components; i++) {
2907ec681f3Smrg         int exp0;
2917ec681f3Smrg         int exp1;
2927ec681f3Smrg
2937ec681f3Smrg         frexp(val0[swizzle0[i]].f64, &exp0);
2947ec681f3Smrg         frexp(val1[swizzle1[i]].f64, &exp1);
2957ec681f3Smrg
2967ec681f3Smrg         /* If the difference between exponents is >= 53, then A+B will always
2977ec681f3Smrg          * have the value whichever between A and B has the largest absolute
2987ec681f3Smrg          * value.  So, [0, 52] is the valid range.  The smaller the limit
2997ec681f3Smrg          * value, the more precision will be maintained at a potential
3007ec681f3Smrg          * performance cost.  Somewhat arbitrarilly split the range in half.
3017ec681f3Smrg          */
3027ec681f3Smrg         if (abs(exp0 - exp1) > (52 / 2))
3037ec681f3Smrg            return false;
3047ec681f3Smrg      }
3057ec681f3Smrg   }
3067ec681f3Smrg
3077ec681f3Smrg   return true;
3087ec681f3Smrg}
3097ec681f3Smrg
/**
 * Counts of similar types of nir_op_flrp instructions
 *
 * If a similar instruction fits into more than one category, it will only be
 * counted once.  The assumption is that no other instruction will have all
 * sources the same, or CSE would have removed one of the instructions.
 */
struct similar_flrp_stats {
   unsigned src2;          /**< Other flrps that share only src2. */
   unsigned src0_and_src2; /**< Other flrps that share both src0 and src2. */
   unsigned src1_and_src2; /**< Other flrps that share both src1 and src2. */
};
3227ec681f3Smrg
3237ec681f3Smrg/**
3247ec681f3Smrg * Collection counts of similar FLRP instructions.
3257ec681f3Smrg *
3267ec681f3Smrg * This function only cares about similar instructions that have src2 in
3277ec681f3Smrg * common.
3287ec681f3Smrg */
3297ec681f3Smrgstatic void
3307ec681f3Smrgget_similar_flrp_stats(nir_alu_instr *alu, struct similar_flrp_stats *st)
3317ec681f3Smrg{
3327ec681f3Smrg   memset(st, 0, sizeof(*st));
3337ec681f3Smrg
3347ec681f3Smrg   nir_foreach_use(other_use, alu->src[2].src.ssa) {
3357ec681f3Smrg      /* Is the use also a flrp? */
3367ec681f3Smrg      nir_instr *const other_instr = other_use->parent_instr;
3377ec681f3Smrg      if (other_instr->type != nir_instr_type_alu)
3387ec681f3Smrg         continue;
3397ec681f3Smrg
3407ec681f3Smrg      /* Eh-hem... don't match the instruction with itself. */
3417ec681f3Smrg      if (other_instr == &alu->instr)
3427ec681f3Smrg         continue;
3437ec681f3Smrg
3447ec681f3Smrg      nir_alu_instr *const other_alu = nir_instr_as_alu(other_instr);
3457ec681f3Smrg      if (other_alu->op != nir_op_flrp)
3467ec681f3Smrg         continue;
3477ec681f3Smrg
3487ec681f3Smrg      /* Does the other flrp use source 2 from the first flrp as its source 2
3497ec681f3Smrg       * as well?
3507ec681f3Smrg       */
3517ec681f3Smrg      if (!nir_alu_srcs_equal(alu, other_alu, 2, 2))
3527ec681f3Smrg         continue;
3537ec681f3Smrg
3547ec681f3Smrg      if (nir_alu_srcs_equal(alu, other_alu, 0, 0))
3557ec681f3Smrg         st->src0_and_src2++;
3567ec681f3Smrg      else if (nir_alu_srcs_equal(alu, other_alu, 1, 1))
3577ec681f3Smrg         st->src1_and_src2++;
3587ec681f3Smrg      else
3597ec681f3Smrg         st->src2++;
3607ec681f3Smrg   }
3617ec681f3Smrg}
3627ec681f3Smrg
/**
 * Lower a single flrp, choosing a replacement based on how the sources are
 * used by other instructions and whether FFMA is available for this size.
 */
static void
convert_flrp_instruction(nir_builder *bld,
                         struct u_vector *dead_flrp,
                         nir_alu_instr *alu,
                         bool always_precise)
{
   bool have_ffma = false;
   unsigned bit_size = nir_dest_bit_size(alu->dest.dest);

   if (bit_size == 16)
      have_ffma = !bld->shader->options->lower_ffma16;
   else if (bit_size == 32)
      have_ffma = !bld->shader->options->lower_ffma32;
   else if (bit_size == 64)
      have_ffma = !bld->shader->options->lower_ffma64;
   else
      unreachable("invalid bit_size");

   bld->cursor = nir_before_instr(&alu->instr);

   /* There are two methods to implement flrp(x, y, t).  The strictly correct
    * implementation according to the GLSL spec is:
    *
    *    x(1 - t) + yt
    *
    * This can also be implemented using two chained FMAs
    *
    *    fma(y, t, fma(-x, t, x))
    *
    * This method, using either formulation, has better precision when the
    * difference between x and y is very large.  It guarantees that flrp(x, y,
    * 1) = y.  For example, flrp(1e38, 1.0, 1.0) is 1.0.  This is correct.
    *
    * The other possible implementation is:
    *
    *    x + t(y - x)
    *
    * This can also be formulated as an FMA:
    *
    *    fma(y - x, t, x)
    *
    * For this implementation, flrp(1e38, 1.0, 1.0) is 0.0.  Since 1.0 was
    * expected, that's a pretty significant error.
    *
    * The choice made for lowering depends on a number of factors.
    *
    * - If the flrp is marked precise and FMA is supported:
    *
    *        fma(y, t, fma(-x, t, x))
    *
    *   This is strictly correct (maybe?), and the cost is two FMA
    *   instructions.  It at least maintains the flrp(x, y, 1.0) == y
    *   condition.
    *
    * - If the flrp is marked precise and FMA is not supported:
    *
    *        x(1 - t) + yt
    *
    *   This is strictly correct, and the cost is 4 instructions.  If FMA is
    *   supported, this may or may not be reduced to 3 instructions (a
    *   subtract, a multiply, and an FMA)... but in that case the other
    *   formulation should have been used.
    */
   if (alu->exact) {
      if (have_ffma)
         replace_with_strict_ffma(bld, dead_flrp, alu);
      else
         replace_with_strict(bld, dead_flrp, alu);

      return;
   }

   /*
    * - If x and y are both immediates and the relative magnitude of the
    *   values is similar (such that y-x does not lose too much precision):
    *
    *        x + t(y - x)
    *
    *   We rely on constant folding to eliminate y-x, and we rely on
    *   nir_opt_algebraic to possibly generate an FMA.  The cost is either one
    *   FMA or two instructions.
    */
   if (sources_are_constants_with_similar_magnitudes(alu)) {
      replace_with_fast(bld, dead_flrp, alu);
      return;
   }

   /*
    * - If x = 1:
    *
    *        (yt + -t) + 1
    *
    * - If x = -1:
    *
    *        (yt + t) - 1
    *
    *   In both cases, x is used in place of ±1 for simplicity.  Both forms
    *   lend to ffma generation on platforms that support ffma.
    */
   double src0_as_constant;
   if (all_same_constant(alu, 0, &src0_as_constant)) {
      if (src0_as_constant == 1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            true /* subtract t */);
         return;
      } else if (src0_as_constant == -1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            false /* add t */);
         return;
      }
   }

   /*
    * - If y = ±1:
    *
    *        x(1 - t) + yt
    *
    *   In this case the multiply in yt will be eliminated by
    *   nir_opt_algebraic.  If FMA is supported, this results in fma(x, (1 -
    *   t), ±t) for two instructions.  If FMA is not supported, then the cost
    *   is 3 instructions.  We rely on nir_opt_algebraic to generate the FMA
    *   instructions as well.
    *
    *   Another possible replacement is
    *
    *        -xt + x ± t
    *
    *   Some groupings of this may be better on some platforms in some
    *   circumstances, but it is probably dependent on scheduling.  Further
    *   investigation may be required.
    */
   double src1_as_constant;
   if ((all_same_constant(alu, 1, &src1_as_constant) &&
        (src1_as_constant == -1.0 || src1_as_constant == 1.0))) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   if (have_ffma) {
      if (always_precise) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and other flrp(x, _, t) exists:
       *
       *        fma(y, t, fma(-x, t, x))
       *
       *   The hope is that the inner FMA calculation will be shared with the
       *   other lowered flrp.  This results in two FMA instructions for the
       *   first flrp and one FMA instruction for each additional flrp.  It
       *   also means that the live range for x might be complete after the
       *   inner ffma instead of after the last flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and another flrp(_, y, t) exists:
       *
       *        fma(x, (1 - t), yt)
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 3 instructions for the first
       *   flrp and 1 for each additional flrp.
       */
      if (st.src1_and_src2 > 0) {
         replace_with_single_ffma(bld, dead_flrp, alu);
         return;
      }
   } else {
      if (always_precise) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is not supported and another flrp(x, _, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the x(1 - t) will be shared with the other lowered
       *   flrp.  This results in 4 instructions for the first flrp and 2 for
       *   each additional flrp.
       *
       * - If FMA is not supported and another flrp(_, y, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 4 instructions for the first
       *   flrp and 2 for each additional flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0 || st.src1_and_src2 > 0) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }
   }

   /*
    * - If t is constant:
    *
    *        x(1 - t) + yt
    *
    *   The cost is three instructions without FMA or two instructions with
    *   FMA.  This is the same cost as the imprecise lowering, but it gives
    *   the instruction scheduler a little more freedom.
    *
    *   There is no need to handle t = 0.5 specially.  nir_opt_algebraic
    *   already has optimizations to convert 0.5x + 0.5y to 0.5(x + y).
    */
   if (alu->src[2].src.ssa->parent_instr->type == nir_instr_type_load_const) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   /*
    * - Otherwise
    *
    *        x + t(y - x)
    */
   replace_with_fast(bld, dead_flrp, alu);
}
5957ec681f3Smrg
5967ec681f3Smrgstatic void
5977ec681f3Smrglower_flrp_impl(nir_function_impl *impl,
5987ec681f3Smrg                struct u_vector *dead_flrp,
5997ec681f3Smrg                unsigned lowering_mask,
6007ec681f3Smrg                bool always_precise)
6017ec681f3Smrg{
6027ec681f3Smrg   nir_builder b;
6037ec681f3Smrg   nir_builder_init(&b, impl);
6047ec681f3Smrg
6057ec681f3Smrg   nir_foreach_block(block, impl) {
6067ec681f3Smrg      nir_foreach_instr_safe(instr, block) {
6077ec681f3Smrg         if (instr->type == nir_instr_type_alu) {
6087ec681f3Smrg            nir_alu_instr *const alu = nir_instr_as_alu(instr);
6097ec681f3Smrg
6107ec681f3Smrg            if (alu->op == nir_op_flrp &&
6117ec681f3Smrg                (alu->dest.dest.ssa.bit_size & lowering_mask)) {
6127ec681f3Smrg               convert_flrp_instruction(&b, dead_flrp, alu, always_precise);
6137ec681f3Smrg            }
6147ec681f3Smrg         }
6157ec681f3Smrg      }
6167ec681f3Smrg   }
6177ec681f3Smrg
6187ec681f3Smrg   nir_metadata_preserve(impl, nir_metadata_block_index |
6197ec681f3Smrg                               nir_metadata_dominance);
6207ec681f3Smrg}
6217ec681f3Smrg
6227ec681f3Smrg/**
6237ec681f3Smrg * \param lowering_mask - Bitwise-or of the bit sizes that need to be lowered
6247ec681f3Smrg *                        (e.g., 16 | 64 if only 16-bit and 64-bit flrp need
6257ec681f3Smrg *                        lowering).
6267ec681f3Smrg * \param always_precise - Always require precise lowering for flrp.  This
6277ec681f3Smrg *                        will always lower flrp to (a * (1 - c)) + (b * c).
6287ec681f3Smrg * \param have_ffma - Set to true if the GPU has an FFMA instruction that
6297ec681f3Smrg *                    should be used.
6307ec681f3Smrg */
6317ec681f3Smrgbool
6327ec681f3Smrgnir_lower_flrp(nir_shader *shader,
6337ec681f3Smrg               unsigned lowering_mask,
6347ec681f3Smrg               bool always_precise)
6357ec681f3Smrg{
6367ec681f3Smrg   struct u_vector dead_flrp;
6377ec681f3Smrg
6387ec681f3Smrg   if (!u_vector_init_pow2(&dead_flrp, 8, sizeof(struct nir_alu_instr *)))
6397ec681f3Smrg      return false;
6407ec681f3Smrg
6417ec681f3Smrg   nir_foreach_function(function, shader) {
6427ec681f3Smrg      if (function->impl) {
6437ec681f3Smrg         lower_flrp_impl(function->impl, &dead_flrp, lowering_mask,
6447ec681f3Smrg                         always_precise);
6457ec681f3Smrg      }
6467ec681f3Smrg   }
6477ec681f3Smrg
6487ec681f3Smrg   /* Progress was made if the dead list is not empty.  Remove all the
6497ec681f3Smrg    * instructions from the dead list.
6507ec681f3Smrg    */
6517ec681f3Smrg   const bool progress = u_vector_length(&dead_flrp) != 0;
6527ec681f3Smrg
6537ec681f3Smrg   struct nir_alu_instr **instr;
6547ec681f3Smrg   u_vector_foreach(instr, &dead_flrp)
6557ec681f3Smrg      nir_instr_remove(&(*instr)->instr);
6567ec681f3Smrg
6577ec681f3Smrg   u_vector_finish(&dead_flrp);
6587ec681f3Smrg
6597ec681f3Smrg   return progress;
6607ec681f3Smrg}
661