17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2018 Intel Corporation
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
217ec681f3Smrg * IN THE SOFTWARE.
227ec681f3Smrg */
237ec681f3Smrg#include <math.h>
247ec681f3Smrg#include "nir.h"
257ec681f3Smrg#include "nir_builder.h"
267ec681f3Smrg#include "util/u_vector.h"
277ec681f3Smrg
287ec681f3Smrg/**
297ec681f3Smrg * Lower flrp instructions.
307ec681f3Smrg *
 * Unlike the lowerings that are possible in nir_opt_algebraic, this pass can
327ec681f3Smrg * examine more global information to determine a possibly more efficient
337ec681f3Smrg * lowering for each flrp.
347ec681f3Smrg */
357ec681f3Smrg
/* Record a lowered flrp so the pass can delete it after all lowering
 * decisions have been made.
 */
static void
append_flrp_to_dead_list(struct u_vector *dead_flrp, struct nir_alu_instr *alu)
{
   *(struct nir_alu_instr **) u_vector_add(dead_flrp) = alu;
}
427ec681f3Smrg
437ec681f3Smrg/**
447ec681f3Smrg * Replace flrp(a, b, c) with ffma(b, c, ffma(-a, c, a)).
457ec681f3Smrg */
467ec681f3Smrgstatic void
477ec681f3Smrgreplace_with_strict_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
487ec681f3Smrg                         struct nir_alu_instr *alu)
497ec681f3Smrg{
507ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
517ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
527ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
537ec681f3Smrg
547ec681f3Smrg   nir_ssa_def *const neg_a = nir_fneg(bld, a);
557ec681f3Smrg   nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact;
567ec681f3Smrg
577ec681f3Smrg   nir_ssa_def *const inner_ffma = nir_ffma(bld, neg_a, c, a);
587ec681f3Smrg   nir_instr_as_alu(inner_ffma->parent_instr)->exact = alu->exact;
597ec681f3Smrg
607ec681f3Smrg   nir_ssa_def *const outer_ffma = nir_ffma(bld, b, c, inner_ffma);
617ec681f3Smrg   nir_instr_as_alu(outer_ffma->parent_instr)->exact = alu->exact;
627ec681f3Smrg
637ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, outer_ffma);
647ec681f3Smrg
657ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
667ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
677ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
687ec681f3Smrg    */
697ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
707ec681f3Smrg}
717ec681f3Smrg
727ec681f3Smrg/**
737ec681f3Smrg * Replace flrp(a, b, c) with ffma(a, (1 - c), bc)
747ec681f3Smrg */
757ec681f3Smrgstatic void
767ec681f3Smrgreplace_with_single_ffma(struct nir_builder *bld, struct u_vector *dead_flrp,
777ec681f3Smrg                         struct nir_alu_instr *alu)
787ec681f3Smrg{
797ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
807ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
817ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
827ec681f3Smrg
837ec681f3Smrg   nir_ssa_def *const neg_c = nir_fneg(bld, c);
847ec681f3Smrg   nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;
857ec681f3Smrg
867ec681f3Smrg   nir_ssa_def *const one_minus_c =
877ec681f3Smrg      nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c);
887ec681f3Smrg   nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact;
897ec681f3Smrg
907ec681f3Smrg   nir_ssa_def *const b_times_c = nir_fmul(bld, b, c);
917ec681f3Smrg   nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact;
927ec681f3Smrg
937ec681f3Smrg   nir_ssa_def *const final_ffma = nir_ffma(bld, a, one_minus_c, b_times_c);
947ec681f3Smrg   nir_instr_as_alu(final_ffma->parent_instr)->exact = alu->exact;
957ec681f3Smrg
967ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, final_ffma);
977ec681f3Smrg
987ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
997ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
1007ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
1017ec681f3Smrg    */
1027ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
1037ec681f3Smrg}
1047ec681f3Smrg
1057ec681f3Smrg/**
1067ec681f3Smrg * Replace flrp(a, b, c) with a(1-c) + bc.
1077ec681f3Smrg */
1087ec681f3Smrgstatic void
1097ec681f3Smrgreplace_with_strict(struct nir_builder *bld, struct u_vector *dead_flrp,
1107ec681f3Smrg                    struct nir_alu_instr *alu)
1117ec681f3Smrg{
1127ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
1137ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
1147ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
1157ec681f3Smrg
1167ec681f3Smrg   nir_ssa_def *const neg_c = nir_fneg(bld, c);
1177ec681f3Smrg   nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;
1187ec681f3Smrg
1197ec681f3Smrg   nir_ssa_def *const one_minus_c =
1207ec681f3Smrg      nir_fadd(bld, nir_imm_floatN_t(bld, 1.0f, c->bit_size), neg_c);
1217ec681f3Smrg   nir_instr_as_alu(one_minus_c->parent_instr)->exact = alu->exact;
1227ec681f3Smrg
1237ec681f3Smrg   nir_ssa_def *const first_product = nir_fmul(bld, a, one_minus_c);
1247ec681f3Smrg   nir_instr_as_alu(first_product->parent_instr)->exact = alu->exact;
1257ec681f3Smrg
1267ec681f3Smrg   nir_ssa_def *const second_product = nir_fmul(bld, b, c);
1277ec681f3Smrg   nir_instr_as_alu(second_product->parent_instr)->exact = alu->exact;
1287ec681f3Smrg
1297ec681f3Smrg   nir_ssa_def *const sum = nir_fadd(bld, first_product, second_product);
1307ec681f3Smrg   nir_instr_as_alu(sum->parent_instr)->exact = alu->exact;
1317ec681f3Smrg
1327ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);
1337ec681f3Smrg
1347ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
1357ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
1367ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
1377ec681f3Smrg    */
1387ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
1397ec681f3Smrg}
1407ec681f3Smrg
1417ec681f3Smrg/**
1427ec681f3Smrg * Replace flrp(a, b, c) with a + c(b-a).
1437ec681f3Smrg */
1447ec681f3Smrgstatic void
1457ec681f3Smrgreplace_with_fast(struct nir_builder *bld, struct u_vector *dead_flrp,
1467ec681f3Smrg                  struct nir_alu_instr *alu)
1477ec681f3Smrg{
1487ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
1497ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
1507ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
1517ec681f3Smrg
1527ec681f3Smrg   nir_ssa_def *const neg_a = nir_fneg(bld, a);
1537ec681f3Smrg   nir_instr_as_alu(neg_a->parent_instr)->exact = alu->exact;
1547ec681f3Smrg
1557ec681f3Smrg   nir_ssa_def *const b_minus_a = nir_fadd(bld, b, neg_a);
1567ec681f3Smrg   nir_instr_as_alu(b_minus_a->parent_instr)->exact = alu->exact;
1577ec681f3Smrg
1587ec681f3Smrg   nir_ssa_def *const product = nir_fmul(bld, c, b_minus_a);
1597ec681f3Smrg   nir_instr_as_alu(product->parent_instr)->exact = alu->exact;
1607ec681f3Smrg
1617ec681f3Smrg   nir_ssa_def *const sum = nir_fadd(bld, a, product);
1627ec681f3Smrg   nir_instr_as_alu(sum->parent_instr)->exact = alu->exact;
1637ec681f3Smrg
1647ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum);
1657ec681f3Smrg
1667ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
1677ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
1687ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
1697ec681f3Smrg    */
1707ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
1717ec681f3Smrg}
1727ec681f3Smrg
1737ec681f3Smrg/**
1747ec681f3Smrg * Replace flrp(a, b, c) with (b*c ± c) + a => b*c + (a ± c)
1757ec681f3Smrg *
1767ec681f3Smrg * \note: This only works if a = ±1.
1777ec681f3Smrg */
1787ec681f3Smrgstatic void
1797ec681f3Smrgreplace_with_expanded_ffma_and_add(struct nir_builder *bld,
1807ec681f3Smrg                                   struct u_vector *dead_flrp,
1817ec681f3Smrg                                   struct nir_alu_instr *alu, bool subtract_c)
1827ec681f3Smrg{
1837ec681f3Smrg   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, alu, 0);
1847ec681f3Smrg   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, alu, 1);
1857ec681f3Smrg   nir_ssa_def *const c = nir_ssa_for_alu_src(bld, alu, 2);
1867ec681f3Smrg
1877ec681f3Smrg   nir_ssa_def *const b_times_c = nir_fmul(bld, b, c);
1887ec681f3Smrg   nir_instr_as_alu(b_times_c->parent_instr)->exact = alu->exact;
1897ec681f3Smrg
1907ec681f3Smrg   nir_ssa_def *inner_sum;
1917ec681f3Smrg
1927ec681f3Smrg   if (subtract_c) {
1937ec681f3Smrg      nir_ssa_def *const neg_c = nir_fneg(bld, c);
1947ec681f3Smrg      nir_instr_as_alu(neg_c->parent_instr)->exact = alu->exact;
1957ec681f3Smrg
1967ec681f3Smrg      inner_sum = nir_fadd(bld, a, neg_c);
1977ec681f3Smrg   } else {
1987ec681f3Smrg      inner_sum = nir_fadd(bld, a, c);
1997ec681f3Smrg   }
2007ec681f3Smrg
2017ec681f3Smrg   nir_instr_as_alu(inner_sum->parent_instr)->exact = alu->exact;
2027ec681f3Smrg
2037ec681f3Smrg   nir_ssa_def *const outer_sum = nir_fadd(bld, inner_sum, b_times_c);
2047ec681f3Smrg   nir_instr_as_alu(outer_sum->parent_instr)->exact = alu->exact;
2057ec681f3Smrg
2067ec681f3Smrg   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, outer_sum);
2077ec681f3Smrg
2087ec681f3Smrg   /* DO NOT REMOVE the original flrp yet.  Many of the lowering choices are
2097ec681f3Smrg    * based on other uses of the sources.  Removing the flrp may cause the
2107ec681f3Smrg    * last flrp in a sequence to make a different, incorrect choice.
2117ec681f3Smrg    */
2127ec681f3Smrg   append_flrp_to_dead_list(dead_flrp, alu);
2137ec681f3Smrg}
2147ec681f3Smrg
2157ec681f3Smrg/**
2167ec681f3Smrg * Determines whether a swizzled source is constant w/ all components the same.
2177ec681f3Smrg *
2187ec681f3Smrg * The value of the constant is stored in \c result.
2197ec681f3Smrg *
2207ec681f3Smrg * \return
2217ec681f3Smrg * True if all components of the swizzled source are the same constant.
2227ec681f3Smrg * Otherwise false is returned.
2237ec681f3Smrg */
2247ec681f3Smrgstatic bool
2257ec681f3Smrgall_same_constant(const nir_alu_instr *instr, unsigned src, double *result)
2267ec681f3Smrg{
2277ec681f3Smrg   nir_const_value *val = nir_src_as_const_value(instr->src[src].src);
2287ec681f3Smrg
2297ec681f3Smrg   if (!val)
2307ec681f3Smrg      return false;
2317ec681f3Smrg
2327ec681f3Smrg   const uint8_t *const swizzle = instr->src[src].swizzle;
2337ec681f3Smrg   const unsigned num_components = nir_dest_num_components(instr->dest.dest);
2347ec681f3Smrg
2357ec681f3Smrg   if (instr->dest.dest.ssa.bit_size == 32) {
2367ec681f3Smrg      const float first = val[swizzle[0]].f32;
2377ec681f3Smrg
2387ec681f3Smrg      for (unsigned i = 1; i < num_components; i++) {
2397ec681f3Smrg         if (val[swizzle[i]].f32 != first)
2407ec681f3Smrg            return false;
2417ec681f3Smrg      }
2427ec681f3Smrg
2437ec681f3Smrg      *result = first;
2447ec681f3Smrg   } else {
2457ec681f3Smrg      const double first = val[swizzle[0]].f64;
2467ec681f3Smrg
2477ec681f3Smrg      for (unsigned i = 1; i < num_components; i++) {
2487ec681f3Smrg         if (val[swizzle[i]].f64 != first)
2497ec681f3Smrg            return false;
2507ec681f3Smrg      }
2517ec681f3Smrg
2527ec681f3Smrg      *result = first;
2537ec681f3Smrg   }
2547ec681f3Smrg
2557ec681f3Smrg   return true;
2567ec681f3Smrg}
2577ec681f3Smrg
2587ec681f3Smrgstatic bool
2597ec681f3Smrgsources_are_constants_with_similar_magnitudes(const nir_alu_instr *instr)
2607ec681f3Smrg{
2617ec681f3Smrg   nir_const_value *val0 = nir_src_as_const_value(instr->src[0].src);
2627ec681f3Smrg   nir_const_value *val1 = nir_src_as_const_value(instr->src[1].src);
2637ec681f3Smrg
2647ec681f3Smrg   if (val0 == NULL || val1 == NULL)
2657ec681f3Smrg      return false;
2667ec681f3Smrg
2677ec681f3Smrg   const uint8_t *const swizzle0 = instr->src[0].swizzle;
2687ec681f3Smrg   const uint8_t *const swizzle1 = instr->src[1].swizzle;
2697ec681f3Smrg   const unsigned num_components = nir_dest_num_components(instr->dest.dest);
2707ec681f3Smrg
2717ec681f3Smrg   if (instr->dest.dest.ssa.bit_size == 32) {
2727ec681f3Smrg      for (unsigned i = 0; i < num_components; i++) {
2737ec681f3Smrg         int exp0;
2747ec681f3Smrg         int exp1;
2757ec681f3Smrg
2767ec681f3Smrg         frexpf(val0[swizzle0[i]].f32, &exp0);
2777ec681f3Smrg         frexpf(val1[swizzle1[i]].f32, &exp1);
2787ec681f3Smrg
2797ec681f3Smrg         /* If the difference between exponents is >= 24, then A+B will always
2807ec681f3Smrg          * have the value whichever between A and B has the largest absolute
2817ec681f3Smrg          * value.  So, [0, 23] is the valid range.  The smaller the limit
2827ec681f3Smrg          * value, the more precision will be maintained at a potential
2837ec681f3Smrg          * performance cost.  Somewhat arbitrarilly split the range in half.
2847ec681f3Smrg          */
2857ec681f3Smrg         if (abs(exp0 - exp1) > (23 / 2))
2867ec681f3Smrg            return false;
2877ec681f3Smrg      }
2887ec681f3Smrg   } else {
2897ec681f3Smrg      for (unsigned i = 0; i < num_components; i++) {
2907ec681f3Smrg         int exp0;
2917ec681f3Smrg         int exp1;
2927ec681f3Smrg
2937ec681f3Smrg         frexp(val0[swizzle0[i]].f64, &exp0);
2947ec681f3Smrg         frexp(val1[swizzle1[i]].f64, &exp1);
2957ec681f3Smrg
2967ec681f3Smrg         /* If the difference between exponents is >= 53, then A+B will always
2977ec681f3Smrg          * have the value whichever between A and B has the largest absolute
2987ec681f3Smrg          * value.  So, [0, 52] is the valid range.  The smaller the limit
2997ec681f3Smrg          * value, the more precision will be maintained at a potential
3007ec681f3Smrg          * performance cost.  Somewhat arbitrarilly split the range in half.
3017ec681f3Smrg          */
3027ec681f3Smrg         if (abs(exp0 - exp1) > (52 / 2))
3037ec681f3Smrg            return false;
3047ec681f3Smrg      }
3057ec681f3Smrg   }
3067ec681f3Smrg
3077ec681f3Smrg   return true;
3087ec681f3Smrg}
3097ec681f3Smrg
/**
 * Counts of similar types of nir_op_flrp instructions
 *
 * If a similar instruction fits into more than one category, it will only be
 * counted once.  The assumption is that no other instruction will have all
 * sources the same, or CSE would have removed one of the instructions.
 */
struct similar_flrp_stats {
   unsigned src2;          /**< Other flrps that share only src2. */
   unsigned src0_and_src2; /**< Other flrps that share both src0 and src2. */
   unsigned src1_and_src2; /**< Other flrps that share both src1 and src2. */
};
3227ec681f3Smrg
3237ec681f3Smrg/**
3247ec681f3Smrg * Collection counts of similar FLRP instructions.
3257ec681f3Smrg *
3267ec681f3Smrg * This function only cares about similar instructions that have src2 in
3277ec681f3Smrg * common.
3287ec681f3Smrg */
3297ec681f3Smrgstatic void
3307ec681f3Smrgget_similar_flrp_stats(nir_alu_instr *alu, struct similar_flrp_stats *st)
3317ec681f3Smrg{
3327ec681f3Smrg   memset(st, 0, sizeof(*st));
3337ec681f3Smrg
3347ec681f3Smrg   nir_foreach_use(other_use, alu->src[2].src.ssa) {
3357ec681f3Smrg      /* Is the use also a flrp? */
3367ec681f3Smrg      nir_instr *const other_instr = other_use->parent_instr;
3377ec681f3Smrg      if (other_instr->type != nir_instr_type_alu)
3387ec681f3Smrg         continue;
3397ec681f3Smrg
3407ec681f3Smrg      /* Eh-hem... don't match the instruction with itself. */
3417ec681f3Smrg      if (other_instr == &alu->instr)
3427ec681f3Smrg         continue;
3437ec681f3Smrg
3447ec681f3Smrg      nir_alu_instr *const other_alu = nir_instr_as_alu(other_instr);
3457ec681f3Smrg      if (other_alu->op != nir_op_flrp)
3467ec681f3Smrg         continue;
3477ec681f3Smrg
3487ec681f3Smrg      /* Does the other flrp use source 2 from the first flrp as its source 2
3497ec681f3Smrg       * as well?
3507ec681f3Smrg       */
3517ec681f3Smrg      if (!nir_alu_srcs_equal(alu, other_alu, 2, 2))
3527ec681f3Smrg         continue;
3537ec681f3Smrg
3547ec681f3Smrg      if (nir_alu_srcs_equal(alu, other_alu, 0, 0))
3557ec681f3Smrg         st->src0_and_src2++;
3567ec681f3Smrg      else if (nir_alu_srcs_equal(alu, other_alu, 1, 1))
3577ec681f3Smrg         st->src1_and_src2++;
3587ec681f3Smrg      else
3597ec681f3Smrg         st->src2++;
3607ec681f3Smrg   }
3617ec681f3Smrg}
3627ec681f3Smrg
/**
 * Lower a single flrp, choosing a replacement based on how the sources are
 * used by other instructions and whether FFMA is available for this size.
 */
static void
convert_flrp_instruction(nir_builder *bld,
                         struct u_vector *dead_flrp,
                         nir_alu_instr *alu,
                         bool always_precise)
{
   bool have_ffma = false;
   unsigned bit_size = nir_dest_bit_size(alu->dest.dest);

   if (bit_size == 16)
      have_ffma = !bld->shader->options->lower_ffma16;
   else if (bit_size == 32)
      have_ffma = !bld->shader->options->lower_ffma32;
   else if (bit_size == 64)
      have_ffma = !bld->shader->options->lower_ffma64;
   else
      unreachable("invalid bit_size");

   bld->cursor = nir_before_instr(&alu->instr);

   /* There are two methods to implement flrp(x, y, t).  The strictly correct
    * implementation according to the GLSL spec is:
    *
    *    x(1 - t) + yt
    *
    * This can also be implemented using two chained FMAs
    *
    *    fma(y, t, fma(-x, t, x))
    *
    * This method, using either formulation, has better precision when the
    * difference between x and y is very large.  It guarantees that flrp(x, y,
    * 1) = y.  For example, flrp(1e38, 1.0, 1.0) is 1.0.  This is correct.
    *
    * The other possible implementation is:
    *
    *    x + t(y - x)
    *
    * This can also be formulated as an FMA:
    *
    *    fma(y - x, t, x)
    *
    * For this implementation, flrp(1e38, 1.0, 1.0) is 0.0.  Since 1.0 was
    * expected, that's a pretty significant error.
    *
    * The choice made for lowering depends on a number of factors.
    *
    * - If the flrp is marked precise and FMA is supported:
    *
    *        fma(y, t, fma(-x, t, x))
    *
    *   This is strictly correct (maybe?), and the cost is two FMA
    *   instructions.  It at least maintains the flrp(x, y, 1.0) == y
    *   condition.
    *
    * - If the flrp is marked precise and FMA is not supported:
    *
    *        x(1 - t) + yt
    *
    *   This is strictly correct, and the cost is 4 instructions.  If FMA is
    *   supported, this may or may not be reduced to 3 instructions (a
    *   subtract, a multiply, and an FMA)... but in that case the other
    *   formulation should have been used.
    */
   if (alu->exact) {
      if (have_ffma)
         replace_with_strict_ffma(bld, dead_flrp, alu);
      else
         replace_with_strict(bld, dead_flrp, alu);

      return;
   }

   /*
    * - If x and y are both immediates and the relative magnitude of the
    *   values is similar (such that y-x does not lose too much precision):
    *
    *        x + t(y - x)
    *
    *   We rely on constant folding to eliminate y-x, and we rely on
    *   nir_opt_algebraic to possibly generate an FMA.  The cost is either one
    *   FMA or two instructions.
    */
   if (sources_are_constants_with_similar_magnitudes(alu)) {
      replace_with_fast(bld, dead_flrp, alu);
      return;
   }

   /*
    * - If x = 1:
    *
    *        (yt + -t) + 1
    *
    * - If x = -1:
    *
    *        (yt + t) - 1
    *
    *   In both cases, x is used in place of ±1 for simplicity.  Both forms
    *   lend to ffma generation on platforms that support ffma.
    */
   double src0_as_constant;
   if (all_same_constant(alu, 0, &src0_as_constant)) {
      if (src0_as_constant == 1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            true /* subtract t */);
         return;
      } else if (src0_as_constant == -1.0) {
         replace_with_expanded_ffma_and_add(bld, dead_flrp, alu,
                                            false /* add t */);
         return;
      }
   }

   /*
    * - If y = ±1:
    *
    *        x(1 - t) + yt
    *
    *   In this case the multiply in yt will be eliminated by
    *   nir_opt_algebraic.  If FMA is supported, this results in fma(x, (1 -
    *   t), ±t) for two instructions.  If FMA is not supported, then the cost
    *   is 3 instructions.  We rely on nir_opt_algebraic to generate the FMA
    *   instructions as well.
    *
    *   Another possible replacement is
    *
    *        -xt + x ± t
    *
    *   Some groupings of this may be better on some platforms in some
    *   circumstances, but it is probably dependent on scheduling.  Further
    *   investigation may be required.
    */
   double src1_as_constant;
   if ((all_same_constant(alu, 1, &src1_as_constant) &&
        (src1_as_constant == -1.0 || src1_as_constant == 1.0))) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   if (have_ffma) {
      if (always_precise) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and other flrp(x, _, t) exists:
       *
       *        fma(y, t, fma(-x, t, x))
       *
       *   The hope is that the inner FMA calculation will be shared with the
       *   other lowered flrp.  This results in two FMA instructions for the
       *   first flrp and one FMA instruction for each additional flrp.  It
       *   also means that the live range for x might be complete after the
       *   inner ffma instead of after the last flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0) {
         replace_with_strict_ffma(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is supported and another flrp(_, y, t) exists:
       *
       *        fma(x, (1 - t), yt)
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 3 instructions for the first
       *   flrp and 1 for each additional flrp.
       */
      if (st.src1_and_src2 > 0) {
         replace_with_single_ffma(bld, dead_flrp, alu);
         return;
      }
   } else {
      if (always_precise) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }

      /*
       * - If FMA is not supported and another flrp(x, _, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the x(1 - t) will be shared with the other lowered
       *   flrp.  This results in 4 instructions for the first flrp and 2 for
       *   each additional flrp.
       *
       * - If FMA is not supported and another flrp(_, y, t) exists:
       *
       *        x(1 - t) + yt
       *
       *   The hope is that the (1 - t) and the yt will be shared with the
       *   other lowered flrp.  This results in 4 instructions for the first
       *   flrp and 2 for each additional flrp.
       */
      struct similar_flrp_stats st;

      get_similar_flrp_stats(alu, &st);
      if (st.src0_and_src2 > 0 || st.src1_and_src2 > 0) {
         replace_with_strict(bld, dead_flrp, alu);
         return;
      }
   }

   /*
    * - If t is constant:
    *
    *        x(1 - t) + yt
    *
    *   The cost is three instructions without FMA or two instructions with
    *   FMA.  This is the same cost as the imprecise lowering, but it gives
    *   the instruction scheduler a little more freedom.
    *
    *   There is no need to handle t = 0.5 specially.  nir_opt_algebraic
    *   already has optimizations to convert 0.5x + 0.5y to 0.5(x + y).
    */
   if (alu->src[2].src.ssa->parent_instr->type == nir_instr_type_load_const) {
      replace_with_strict(bld, dead_flrp, alu);
      return;
   }

   /*
    * - Otherwise
    *
    *        x + t(y - x)
    */
   replace_with_fast(bld, dead_flrp, alu);
}
5957ec681f3Smrg
5967ec681f3Smrgstatic void
5977ec681f3Smrglower_flrp_impl(nir_function_impl *impl,
5987ec681f3Smrg                struct u_vector *dead_flrp,
5997ec681f3Smrg                unsigned lowering_mask,
6007ec681f3Smrg                bool always_precise)
6017ec681f3Smrg{
6027ec681f3Smrg   nir_builder b;
6037ec681f3Smrg   nir_builder_init(&b, impl);
6047ec681f3Smrg
6057ec681f3Smrg   nir_foreach_block(block, impl) {
6067ec681f3Smrg      nir_foreach_instr_safe(instr, block) {
6077ec681f3Smrg         if (instr->type == nir_instr_type_alu) {
6087ec681f3Smrg            nir_alu_instr *const alu = nir_instr_as_alu(instr);
6097ec681f3Smrg
6107ec681f3Smrg            if (alu->op == nir_op_flrp &&
6117ec681f3Smrg                (alu->dest.dest.ssa.bit_size & lowering_mask)) {
6127ec681f3Smrg               convert_flrp_instruction(&b, dead_flrp, alu, always_precise);
6137ec681f3Smrg            }
6147ec681f3Smrg         }
6157ec681f3Smrg      }
6167ec681f3Smrg   }
6177ec681f3Smrg
6187ec681f3Smrg   nir_metadata_preserve(impl, nir_metadata_block_index |
6197ec681f3Smrg                               nir_metadata_dominance);
6207ec681f3Smrg}
6217ec681f3Smrg
6227ec681f3Smrg/**
6237ec681f3Smrg * \param lowering_mask - Bitwise-or of the bit sizes that need to be lowered
6247ec681f3Smrg *                        (e.g., 16 | 64 if only 16-bit and 64-bit flrp need
6257ec681f3Smrg *                        lowering).
6267ec681f3Smrg * \param always_precise - Always require precise lowering for flrp.  This
6277ec681f3Smrg *                        will always lower flrp to (a * (1 - c)) + (b * c).
6287ec681f3Smrg * \param have_ffma - Set to true if the GPU has an FFMA instruction that
6297ec681f3Smrg *                    should be used.
6307ec681f3Smrg */
6317ec681f3Smrgbool
6327ec681f3Smrgnir_lower_flrp(nir_shader *shader,
6337ec681f3Smrg               unsigned lowering_mask,
6347ec681f3Smrg               bool always_precise)
6357ec681f3Smrg{
6367ec681f3Smrg   struct u_vector dead_flrp;
6377ec681f3Smrg
6387ec681f3Smrg   if (!u_vector_init_pow2(&dead_flrp, 8, sizeof(struct nir_alu_instr *)))
6397ec681f3Smrg      return false;
6407ec681f3Smrg
6417ec681f3Smrg   nir_foreach_function(function, shader) {
6427ec681f3Smrg      if (function->impl) {
6437ec681f3Smrg         lower_flrp_impl(function->impl, &dead_flrp, lowering_mask,
6447ec681f3Smrg                         always_precise);
6457ec681f3Smrg      }
6467ec681f3Smrg   }
6477ec681f3Smrg
6487ec681f3Smrg   /* Progress was made if the dead list is not empty.  Remove all the
6497ec681f3Smrg    * instructions from the dead list.
6507ec681f3Smrg    */
6517ec681f3Smrg   const bool progress = u_vector_length(&dead_flrp) != 0;
6527ec681f3Smrg
6537ec681f3Smrg   struct nir_alu_instr **instr;
6547ec681f3Smrg   u_vector_foreach(instr, &dead_flrp)
6557ec681f3Smrg      nir_instr_remove(&(*instr)->instr);
6567ec681f3Smrg
6577ec681f3Smrg   u_vector_finish(&dead_flrp);
6587ec681f3Smrg
6597ec681f3Smrg   return progress;
6607ec681f3Smrg}
661