17ec681f3Smrg/*
27ec681f3Smrg * License for Berkeley SoftFloat Release 3e
37ec681f3Smrg *
47ec681f3Smrg * John R. Hauser
57ec681f3Smrg * 2018 January 20
67ec681f3Smrg *
77ec681f3Smrg * The following applies to the whole of SoftFloat Release 3e as well as to
87ec681f3Smrg * each source file individually.
97ec681f3Smrg *
107ec681f3Smrg * Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018 The Regents of the
117ec681f3Smrg * University of California.  All rights reserved.
127ec681f3Smrg *
137ec681f3Smrg * Redistribution and use in source and binary forms, with or without
147ec681f3Smrg * modification, are permitted provided that the following conditions are met:
157ec681f3Smrg *
167ec681f3Smrg *  1. Redistributions of source code must retain the above copyright notice,
177ec681f3Smrg *     this list of conditions, and the following disclaimer.
187ec681f3Smrg *
197ec681f3Smrg *  2. Redistributions in binary form must reproduce the above copyright
207ec681f3Smrg *     notice, this list of conditions, and the following disclaimer in the
217ec681f3Smrg *     documentation and/or other materials provided with the distribution.
227ec681f3Smrg *
237ec681f3Smrg *  3. Neither the name of the University nor the names of its contributors
247ec681f3Smrg *     may be used to endorse or promote products derived from this software
257ec681f3Smrg *     without specific prior written permission.
267ec681f3Smrg *
277ec681f3Smrg * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
287ec681f3Smrg * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
297ec681f3Smrg * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
307ec681f3Smrg * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
317ec681f3Smrg * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
327ec681f3Smrg * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
337ec681f3Smrg * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
347ec681f3Smrg * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
357ec681f3Smrg * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
367ec681f3Smrg * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
377ec681f3Smrg *
387ec681f3Smrg *
397ec681f3Smrg * The functions listed in this file are modified versions of the ones
407ec681f3Smrg * from the Berkeley SoftFloat 3e Library.
417ec681f3Smrg *
427ec681f3Smrg * Their implementation correctness has been checked with the Berkeley
437ec681f3Smrg * TestFloat Release 3e tool for x86_64.
447ec681f3Smrg */
457ec681f3Smrg
467ec681f3Smrg#include "rounding.h"
477ec681f3Smrg#include "bitscan.h"
487ec681f3Smrg#include "softfloat.h"
497ec681f3Smrg
507ec681f3Smrg#if defined(BIG_ENDIAN)
517ec681f3Smrg#define word_incr -1
527ec681f3Smrg#define index_word(total, n) ((total) - 1 - (n))
537ec681f3Smrg#define index_word_hi(total) 0
547ec681f3Smrg#define index_word_lo(total) ((total) - 1)
557ec681f3Smrg#define index_multiword_hi(total, n) 0
567ec681f3Smrg#define index_multiword_lo(total, n) ((total) - (n))
577ec681f3Smrg#define index_multiword_hi_but(total, n) 0
587ec681f3Smrg#define index_multiword_lo_but(total, n) (n)
597ec681f3Smrg#else
607ec681f3Smrg#define word_incr 1
617ec681f3Smrg#define index_word(total, n) (n)
627ec681f3Smrg#define index_word_hi(total) ((total) - 1)
637ec681f3Smrg#define index_word_lo(total) 0
647ec681f3Smrg#define index_multiword_hi(total, n) ((total) - (n))
657ec681f3Smrg#define index_multiword_lo(total, n) 0
667ec681f3Smrg#define index_multiword_hi_but(total, n) (n)
677ec681f3Smrg#define index_multiword_lo_but(total, n) 0
687ec681f3Smrg#endif
697ec681f3Smrg
707ec681f3Smrgtypedef union { double f; int64_t i; uint64_t u; } di_type;
717ec681f3Smrgtypedef union { float f; int32_t i; uint32_t u; } fi_type;
727ec681f3Smrg
737ec681f3Smrgconst uint8_t count_leading_zeros8[256] = {
747ec681f3Smrg    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
757ec681f3Smrg    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
767ec681f3Smrg    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
777ec681f3Smrg    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
787ec681f3Smrg    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
797ec681f3Smrg    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
807ec681f3Smrg    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
817ec681f3Smrg    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
827ec681f3Smrg    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
837ec681f3Smrg    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
847ec681f3Smrg    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
857ec681f3Smrg    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
867ec681f3Smrg    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
877ec681f3Smrg    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
887ec681f3Smrg    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
897ec681f3Smrg    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
907ec681f3Smrg};
917ec681f3Smrg
927ec681f3Smrg/**
937ec681f3Smrg * \brief Shifts 'a' right by the number of bits given in 'dist', which must be in
947ec681f3Smrg * the range 1 to 63.  If any nonzero bits are shifted off, they are "jammed"
957ec681f3Smrg * into the least-significant bit of the shifted value by setting the
967ec681f3Smrg * least-significant bit to 1.  This shifted-and-jammed value is returned.
977ec681f3Smrg *
987ec681f3Smrg * From softfloat_shortShiftRightJam64()
997ec681f3Smrg */
1007ec681f3Smrgstatic inline
1017ec681f3Smrguint64_t _mesa_short_shift_right_jam64(uint64_t a, uint8_t dist)
1027ec681f3Smrg{
1037ec681f3Smrg    return a >> dist | ((a & (((uint64_t) 1 << dist) - 1)) != 0);
1047ec681f3Smrg}
1057ec681f3Smrg
1067ec681f3Smrg/**
1077ec681f3Smrg * \brief Shifts 'a' right by the number of bits given in 'dist', which must not
1087ec681f3Smrg * be zero.  If any nonzero bits are shifted off, they are "jammed" into the
1097ec681f3Smrg * least-significant bit of the shifted value by setting the least-significant
1107ec681f3Smrg * bit to 1.  This shifted-and-jammed value is returned.
1117ec681f3Smrg * The value of 'dist' can be arbitrarily large.  In particular, if 'dist' is
1127ec681f3Smrg * greater than 64, the result will be either 0 or 1, depending on whether 'a'
1137ec681f3Smrg * is zero or nonzero.
1147ec681f3Smrg *
1157ec681f3Smrg * From softfloat_shiftRightJam64()
1167ec681f3Smrg */
1177ec681f3Smrgstatic inline
1187ec681f3Smrguint64_t _mesa_shift_right_jam64(uint64_t a, uint32_t dist)
1197ec681f3Smrg{
1207ec681f3Smrg    return
1217ec681f3Smrg        (dist < 63) ? a >> dist | ((uint64_t) (a << (-dist & 63)) != 0) : (a != 0);
1227ec681f3Smrg}
1237ec681f3Smrg
1247ec681f3Smrg/**
1257ec681f3Smrg * \brief Shifts 'a' right by the number of bits given in 'dist', which must not be
1267ec681f3Smrg * zero.  If any nonzero bits are shifted off, they are "jammed" into the
1277ec681f3Smrg * least-significant bit of the shifted value by setting the least-significant
1287ec681f3Smrg * bit to 1.  This shifted-and-jammed value is returned.
1297ec681f3Smrg * The value of 'dist' can be arbitrarily large.  In particular, if 'dist' is
1307ec681f3Smrg * greater than 32, the result will be either 0 or 1, depending on whether 'a'
1317ec681f3Smrg * is zero or nonzero.
1327ec681f3Smrg *
1337ec681f3Smrg * From softfloat_shiftRightJam32()
1347ec681f3Smrg */
1357ec681f3Smrgstatic inline
1367ec681f3Smrguint32_t _mesa_shift_right_jam32(uint32_t a, uint16_t dist)
1377ec681f3Smrg{
1387ec681f3Smrg    return
1397ec681f3Smrg        (dist < 31) ? a >> dist | ((uint32_t) (a << (-dist & 31)) != 0) : (a != 0);
1407ec681f3Smrg}
1417ec681f3Smrg
1427ec681f3Smrg/**
1437ec681f3Smrg * \brief Extracted from softfloat_roundPackToF64()
1447ec681f3Smrg */
1457ec681f3Smrgstatic inline
1467ec681f3Smrgdouble _mesa_roundtozero_f64(int64_t s, int64_t e, int64_t m)
1477ec681f3Smrg{
1487ec681f3Smrg    di_type result;
1497ec681f3Smrg
1507ec681f3Smrg    if ((uint64_t) e >= 0x7fd) {
1517ec681f3Smrg        if (e < 0) {
1527ec681f3Smrg            m = _mesa_shift_right_jam64(m, -e);
1537ec681f3Smrg            e = 0;
1547ec681f3Smrg        } else if ((e > 0x7fd) || (0x8000000000000000 <= m)) {
1557ec681f3Smrg            e = 0x7ff;
1567ec681f3Smrg            m = 0;
1577ec681f3Smrg            result.u = (s << 63) + (e << 52) + m;
1587ec681f3Smrg            result.u -= 1;
1597ec681f3Smrg            return result.f;
1607ec681f3Smrg        }
1617ec681f3Smrg    }
1627ec681f3Smrg
1637ec681f3Smrg    m >>= 10;
1647ec681f3Smrg    if (m == 0)
1657ec681f3Smrg        e = 0;
1667ec681f3Smrg
1677ec681f3Smrg    result.u = (s << 63) + (e << 52) + m;
1687ec681f3Smrg    return result.f;
1697ec681f3Smrg}
1707ec681f3Smrg
1717ec681f3Smrg/**
1727ec681f3Smrg * \brief Extracted from softfloat_roundPackToF32()
1737ec681f3Smrg */
1747ec681f3Smrgstatic inline
1757ec681f3Smrgfloat _mesa_round_f32(int32_t s, int32_t e, int32_t m, bool rtz)
1767ec681f3Smrg{
1777ec681f3Smrg    fi_type result;
1787ec681f3Smrg    uint8_t round_increment = rtz ? 0 : 0x40;
1797ec681f3Smrg
1807ec681f3Smrg    if ((uint32_t) e >= 0xfd) {
1817ec681f3Smrg        if (e < 0) {
1827ec681f3Smrg            m = _mesa_shift_right_jam32(m, -e);
1837ec681f3Smrg            e = 0;
1847ec681f3Smrg        } else if ((e > 0xfd) || (0x80000000 <= m + round_increment)) {
1857ec681f3Smrg            e = 0xff;
1867ec681f3Smrg            m = 0;
1877ec681f3Smrg            result.u = (s << 31) + (e << 23) + m;
1887ec681f3Smrg            result.u -= !round_increment;
1897ec681f3Smrg            return result.f;
1907ec681f3Smrg        }
1917ec681f3Smrg    }
1927ec681f3Smrg
1937ec681f3Smrg    uint8_t round_bits;
1947ec681f3Smrg    round_bits = m & 0x7f;
1957ec681f3Smrg    m = ((uint32_t) m + round_increment) >> 7;
1967ec681f3Smrg    m &= ~(uint32_t) (! (round_bits ^ 0x40) & !rtz);
1977ec681f3Smrg    if (m == 0)
1987ec681f3Smrg        e = 0;
1997ec681f3Smrg
2007ec681f3Smrg    result.u = (s << 31) + (e << 23) + m;
2017ec681f3Smrg    return result.f;
2027ec681f3Smrg}
2037ec681f3Smrg
2047ec681f3Smrg/**
2057ec681f3Smrg * \brief Extracted from softfloat_roundPackToF16()
2067ec681f3Smrg */
2077ec681f3Smrgstatic inline
2087ec681f3Smrguint16_t _mesa_roundtozero_f16(int16_t s, int16_t e, int16_t m)
2097ec681f3Smrg{
2107ec681f3Smrg    if ((uint16_t) e >= 0x1d) {
2117ec681f3Smrg        if (e < 0) {
2127ec681f3Smrg            m = _mesa_shift_right_jam32(m, -e);
2137ec681f3Smrg            e = 0;
2147ec681f3Smrg        } else if (e > 0x1d) {
2157ec681f3Smrg            e = 0x1f;
2167ec681f3Smrg            m = 0;
2177ec681f3Smrg            return (s << 15) + (e << 10) + m - 1;
2187ec681f3Smrg        }
2197ec681f3Smrg    }
2207ec681f3Smrg
2217ec681f3Smrg    m >>= 4;
2227ec681f3Smrg    if (m == 0)
2237ec681f3Smrg        e = 0;
2247ec681f3Smrg
2257ec681f3Smrg    return (s << 15) + (e << 10) + m;
2267ec681f3Smrg}
2277ec681f3Smrg
2287ec681f3Smrg/**
2297ec681f3Smrg * \brief Shifts the N-bit unsigned integer pointed to by 'a' left by the number of
2307ec681f3Smrg * bits given in 'dist', where N = 'size_words' * 32.  The value of 'dist'
2317ec681f3Smrg * must be in the range 1 to 31.  Any nonzero bits shifted off are lost.  The
2327ec681f3Smrg * shifted N-bit result is stored at the location pointed to by 'm_out'.  Each
2337ec681f3Smrg * of 'a' and 'm_out' points to a 'size_words'-long array of 32-bit elements
2347ec681f3Smrg * that concatenate in the platform's normal endian order to form an N-bit
2357ec681f3Smrg * integer.
2367ec681f3Smrg *
2377ec681f3Smrg * From softfloat_shortShiftLeftM()
2387ec681f3Smrg */
2397ec681f3Smrgstatic inline void
2407ec681f3Smrg_mesa_short_shift_left_m(uint8_t size_words, const uint32_t *a, uint8_t dist, uint32_t *m_out)
2417ec681f3Smrg{
2427ec681f3Smrg    uint8_t neg_dist;
2437ec681f3Smrg    unsigned index, last_index;
2447ec681f3Smrg    uint32_t part_word, a_word;
2457ec681f3Smrg
2467ec681f3Smrg    neg_dist = -dist;
2477ec681f3Smrg    index = index_word_hi(size_words);
2487ec681f3Smrg    last_index = index_word_lo(size_words);
2497ec681f3Smrg    part_word = a[index] << dist;
2507ec681f3Smrg    while (index != last_index) {
2517ec681f3Smrg        a_word = a[index - word_incr];
2527ec681f3Smrg        m_out[index] = part_word | a_word >> (neg_dist & 31);
2537ec681f3Smrg        index -= word_incr;
2547ec681f3Smrg        part_word = a_word << dist;
2557ec681f3Smrg    }
2567ec681f3Smrg    m_out[index] = part_word;
2577ec681f3Smrg}
2587ec681f3Smrg
2597ec681f3Smrg/**
2607ec681f3Smrg * \brief Shifts the N-bit unsigned integer pointed to by 'a' left by the number of
2617ec681f3Smrg * bits given in 'dist', where N = 'size_words' * 32.  The value of 'dist'
2627ec681f3Smrg * must not be zero.  Any nonzero bits shifted off are lost.  The shifted
2637ec681f3Smrg * N-bit result is stored at the location pointed to by 'm_out'.  Each of 'a'
2647ec681f3Smrg * and 'm_out' points to a 'size_words'-long array of 32-bit elements that
2657ec681f3Smrg * concatenate in the platform's normal endian order to form an N-bit
2667ec681f3Smrg * integer. The value of 'dist' can be arbitrarily large.  In particular, if
2677ec681f3Smrg * 'dist' is greater than N, the stored result will be 0.
2687ec681f3Smrg *
2697ec681f3Smrg * From softfloat_shiftLeftM()
2707ec681f3Smrg */
2717ec681f3Smrgstatic inline void
2727ec681f3Smrg_mesa_shift_left_m(uint8_t size_words, const uint32_t *a, uint32_t dist, uint32_t *m_out)
2737ec681f3Smrg{
2747ec681f3Smrg    uint32_t word_dist;
2757ec681f3Smrg    uint8_t inner_dist;
2767ec681f3Smrg    uint8_t i;
2777ec681f3Smrg
2787ec681f3Smrg    word_dist = dist >> 5;
2797ec681f3Smrg    if (word_dist < size_words) {
2807ec681f3Smrg        a += index_multiword_lo_but(size_words, word_dist);
2817ec681f3Smrg        inner_dist = dist & 31;
2827ec681f3Smrg        if (inner_dist) {
2837ec681f3Smrg            _mesa_short_shift_left_m(size_words - word_dist, a, inner_dist,
2847ec681f3Smrg                                     m_out + index_multiword_hi_but(size_words, word_dist));
2857ec681f3Smrg            if (!word_dist)
2867ec681f3Smrg                return;
2877ec681f3Smrg        } else {
2887ec681f3Smrg            uint32_t *dest = m_out + index_word_hi(size_words);
2897ec681f3Smrg            a += index_word_hi(size_words - word_dist);
2907ec681f3Smrg            for (i = size_words - word_dist; i; --i) {
2917ec681f3Smrg                *dest = *a;
2927ec681f3Smrg                a -= word_incr;
2937ec681f3Smrg                dest -= word_incr;
2947ec681f3Smrg            }
2957ec681f3Smrg        }
2967ec681f3Smrg        m_out += index_multiword_lo(size_words, word_dist);
2977ec681f3Smrg    } else {
2987ec681f3Smrg        word_dist = size_words;
2997ec681f3Smrg    }
3007ec681f3Smrg    do {
3017ec681f3Smrg        *m_out++ = 0;
3027ec681f3Smrg        --word_dist;
3037ec681f3Smrg    } while (word_dist);
3047ec681f3Smrg}
3057ec681f3Smrg
3067ec681f3Smrg/**
3077ec681f3Smrg * \brief Shifts the N-bit unsigned integer pointed to by 'a' right by the number of
3087ec681f3Smrg * bits given in 'dist', where N = 'size_words' * 32.  The value of 'dist'
3097ec681f3Smrg * must be in the range 1 to 31.  Any nonzero bits shifted off are lost.  The
3107ec681f3Smrg * shifted N-bit result is stored at the location pointed to by 'm_out'.  Each
3117ec681f3Smrg * of 'a' and 'm_out' points to a 'size_words'-long array of 32-bit elements
3127ec681f3Smrg * that concatenate in the platform's normal endian order to form an N-bit
3137ec681f3Smrg * integer.
3147ec681f3Smrg *
3157ec681f3Smrg * From softfloat_shortShiftRightM()
3167ec681f3Smrg */
3177ec681f3Smrgstatic inline void
3187ec681f3Smrg_mesa_short_shift_right_m(uint8_t size_words, const uint32_t *a, uint8_t dist, uint32_t *m_out)
3197ec681f3Smrg{
3207ec681f3Smrg    uint8_t neg_dist;
3217ec681f3Smrg    unsigned index, last_index;
3227ec681f3Smrg    uint32_t part_word, a_word;
3237ec681f3Smrg
3247ec681f3Smrg    neg_dist = -dist;
3257ec681f3Smrg    index = index_word_lo(size_words);
3267ec681f3Smrg    last_index = index_word_hi(size_words);
3277ec681f3Smrg    part_word = a[index] >> dist;
3287ec681f3Smrg    while (index != last_index) {
3297ec681f3Smrg        a_word = a[index + word_incr];
3307ec681f3Smrg        m_out[index] = a_word << (neg_dist & 31) | part_word;
3317ec681f3Smrg        index += word_incr;
3327ec681f3Smrg        part_word = a_word >> dist;
3337ec681f3Smrg    }
3347ec681f3Smrg    m_out[index] = part_word;
3357ec681f3Smrg}
3367ec681f3Smrg
3377ec681f3Smrg/**
3387ec681f3Smrg * \brief Shifts the N-bit unsigned integer pointed to by 'a' right by the number of
3397ec681f3Smrg * bits given in 'dist', where N = 'size_words' * 32.  The value of 'dist'
3407ec681f3Smrg * must be in the range 1 to 31.  If any nonzero bits are shifted off, they
3417ec681f3Smrg * are "jammed" into the least-significant bit of the shifted value by setting
3427ec681f3Smrg * the least-significant bit to 1.  This shifted-and-jammed N-bit result is
3437ec681f3Smrg * stored at the location pointed to by 'm_out'.  Each of 'a' and 'm_out'
3447ec681f3Smrg * points to a 'size_words'-long array of 32-bit elements that concatenate in
3457ec681f3Smrg * the platform's normal endian order to form an N-bit integer.
3467ec681f3Smrg *
3477ec681f3Smrg *
3487ec681f3Smrg * From softfloat_shortShiftRightJamM()
3497ec681f3Smrg */
3507ec681f3Smrgstatic inline void
3517ec681f3Smrg_mesa_short_shift_right_jam_m(uint8_t size_words, const uint32_t *a, uint8_t dist, uint32_t *m_out)
3527ec681f3Smrg{
3537ec681f3Smrg    uint8_t neg_dist;
3547ec681f3Smrg    unsigned index, last_index;
3557ec681f3Smrg    uint64_t part_word, a_word;
3567ec681f3Smrg
3577ec681f3Smrg    neg_dist = -dist;
3587ec681f3Smrg    index = index_word_lo(size_words);
3597ec681f3Smrg    last_index = index_word_hi(size_words);
3607ec681f3Smrg    a_word = a[index];
3617ec681f3Smrg    part_word = a_word >> dist;
3627ec681f3Smrg    if (part_word << dist != a_word )
3637ec681f3Smrg        part_word |= 1;
3647ec681f3Smrg    while (index != last_index) {
3657ec681f3Smrg        a_word = a[index + word_incr];
3667ec681f3Smrg        m_out[index] = a_word << (neg_dist & 31) | part_word;
3677ec681f3Smrg        index += word_incr;
3687ec681f3Smrg        part_word = a_word >> dist;
3697ec681f3Smrg    }
3707ec681f3Smrg    m_out[index] = part_word;
3717ec681f3Smrg}
3727ec681f3Smrg
3737ec681f3Smrg/**
3747ec681f3Smrg * \brief Shifts the N-bit unsigned integer pointed to by 'a' right by the number of
3757ec681f3Smrg * bits given in 'dist', where N = 'size_words' * 32.  The value of 'dist'
3767ec681f3Smrg * must not be zero.  If any nonzero bits are shifted off, they are "jammed"
3777ec681f3Smrg * into the least-significant bit of the shifted value by setting the
3787ec681f3Smrg * least-significant bit to 1.  This shifted-and-jammed N-bit result is stored
3797ec681f3Smrg * at the location pointed to by 'm_out'.  Each of 'a' and 'm_out' points to a
3807ec681f3Smrg * 'size_words'-long array of 32-bit elements that concatenate in the
3817ec681f3Smrg * platform's normal endian order to form an N-bit integer.  The value of
3827ec681f3Smrg * 'dist' can be arbitrarily large.  In particular, if 'dist' is greater than
3837ec681f3Smrg * N, the stored result will be either 0 or 1, depending on whether the
3847ec681f3Smrg * original N bits are all zeros.
3857ec681f3Smrg *
3867ec681f3Smrg * From softfloat_shiftRightJamM()
3877ec681f3Smrg */
3887ec681f3Smrgstatic inline void
3897ec681f3Smrg_mesa_shift_right_jam_m(uint8_t size_words, const uint32_t *a, uint32_t dist, uint32_t *m_out)
3907ec681f3Smrg{
3917ec681f3Smrg    uint32_t word_jam, word_dist, *tmp;
3927ec681f3Smrg    uint8_t i, inner_dist;
3937ec681f3Smrg
3947ec681f3Smrg    word_jam = 0;
3957ec681f3Smrg    word_dist = dist >> 5;
3967ec681f3Smrg    tmp = NULL;
3977ec681f3Smrg    if (word_dist) {
3987ec681f3Smrg        if (size_words < word_dist)
3997ec681f3Smrg            word_dist = size_words;
4007ec681f3Smrg        tmp = (uint32_t *) (a + index_multiword_lo(size_words, word_dist));
4017ec681f3Smrg        i = word_dist;
4027ec681f3Smrg        do {
4037ec681f3Smrg            word_jam = *tmp++;
4047ec681f3Smrg            if (word_jam)
4057ec681f3Smrg                break;
4067ec681f3Smrg            --i;
4077ec681f3Smrg        } while (i);
4087ec681f3Smrg        tmp = m_out;
4097ec681f3Smrg    }
4107ec681f3Smrg    if (word_dist < size_words) {
4117ec681f3Smrg        a += index_multiword_hi_but(size_words, word_dist);
4127ec681f3Smrg        inner_dist = dist & 31;
4137ec681f3Smrg        if (inner_dist) {
4147ec681f3Smrg            _mesa_short_shift_right_jam_m(size_words - word_dist, a, inner_dist,
4157ec681f3Smrg                                          m_out + index_multiword_lo_but(size_words, word_dist));
4167ec681f3Smrg            if (!word_dist) {
4177ec681f3Smrg                if (word_jam)
4187ec681f3Smrg                    m_out[index_word_lo(size_words)] |= 1;
4197ec681f3Smrg                return;
4207ec681f3Smrg            }
4217ec681f3Smrg        } else {
4227ec681f3Smrg            a += index_word_lo(size_words - word_dist);
4237ec681f3Smrg            tmp = m_out + index_word_lo(size_words);
4247ec681f3Smrg            for (i = size_words - word_dist; i; --i) {
4257ec681f3Smrg                *tmp = *a;
4267ec681f3Smrg                a += word_incr;
4277ec681f3Smrg                tmp += word_incr;
4287ec681f3Smrg            }
4297ec681f3Smrg        }
4307ec681f3Smrg        tmp = m_out + index_multiword_hi(size_words, word_dist);
4317ec681f3Smrg    }
4327ec681f3Smrg    if (tmp) {
4337ec681f3Smrg       do {
4347ec681f3Smrg           *tmp++ = 0;
4357ec681f3Smrg           --word_dist;
4367ec681f3Smrg       } while (word_dist);
4377ec681f3Smrg    }
4387ec681f3Smrg    if (word_jam)
4397ec681f3Smrg        m_out[index_word_lo(size_words)] |= 1;
4407ec681f3Smrg}
4417ec681f3Smrg
4427ec681f3Smrg/**
4437ec681f3Smrg * \brief Calculate a + b but rounding to zero.
4447ec681f3Smrg *
4457ec681f3Smrg * Notice that this mainly differs from the original Berkeley SoftFloat 3e
4467ec681f3Smrg * implementation in that we don't really treat NaNs, Zeroes nor the
4477ec681f3Smrg * signalling flags. Any NaN is good for us and the sign of the Zero is not
4487ec681f3Smrg * important.
4497ec681f3Smrg *
4507ec681f3Smrg * From f64_add()
4517ec681f3Smrg */
4527ec681f3Smrgdouble
4537ec681f3Smrg_mesa_double_add_rtz(double a, double b)
4547ec681f3Smrg{
4557ec681f3Smrg    const di_type a_di = {a};
4567ec681f3Smrg    uint64_t a_flt_m = a_di.u & 0x0fffffffffffff;
4577ec681f3Smrg    uint64_t a_flt_e = (a_di.u >> 52) & 0x7ff;
4587ec681f3Smrg    uint64_t a_flt_s = (a_di.u >> 63) & 0x1;
4597ec681f3Smrg    const di_type b_di = {b};
4607ec681f3Smrg    uint64_t b_flt_m = b_di.u & 0x0fffffffffffff;
4617ec681f3Smrg    uint64_t b_flt_e = (b_di.u >> 52) & 0x7ff;
4627ec681f3Smrg    uint64_t b_flt_s = (b_di.u >> 63) & 0x1;
4637ec681f3Smrg    int64_t s, e, m = 0;
4647ec681f3Smrg
4657ec681f3Smrg    s = a_flt_s;
4667ec681f3Smrg
4677ec681f3Smrg    const int64_t exp_diff = a_flt_e - b_flt_e;
4687ec681f3Smrg
4697ec681f3Smrg    /* Handle special cases */
4707ec681f3Smrg
4717ec681f3Smrg    if (a_flt_s != b_flt_s) {
4727ec681f3Smrg        return _mesa_double_sub_rtz(a, -b);
4737ec681f3Smrg    } else if ((a_flt_e == 0) && (a_flt_m == 0)) {
4747ec681f3Smrg        /* 'a' is zero, return 'b' */
4757ec681f3Smrg        return b;
4767ec681f3Smrg    } else if ((b_flt_e == 0) && (b_flt_m == 0)) {
4777ec681f3Smrg        /* 'b' is zero, return 'a' */
4787ec681f3Smrg        return a;
4797ec681f3Smrg    } else if (a_flt_e == 0x7ff && a_flt_m != 0) {
4807ec681f3Smrg        /* 'a' is a NaN, return NaN */
4817ec681f3Smrg        return a;
4827ec681f3Smrg    } else if (b_flt_e == 0x7ff && b_flt_m != 0) {
4837ec681f3Smrg        /* 'b' is a NaN, return NaN */
4847ec681f3Smrg        return b;
4857ec681f3Smrg    } else if (a_flt_e == 0x7ff && a_flt_m == 0) {
4867ec681f3Smrg        /* Inf + x = Inf */
4877ec681f3Smrg        return a;
4887ec681f3Smrg    } else if (b_flt_e == 0x7ff && b_flt_m == 0) {
4897ec681f3Smrg        /* x + Inf = Inf */
4907ec681f3Smrg        return b;
4917ec681f3Smrg    } else if (exp_diff == 0 && a_flt_e == 0) {
4927ec681f3Smrg        di_type result_di;
4937ec681f3Smrg        result_di.u = a_di.u + b_flt_m;
4947ec681f3Smrg        return result_di.f;
4957ec681f3Smrg    } else if (exp_diff == 0) {
4967ec681f3Smrg        e = a_flt_e;
4977ec681f3Smrg        m = 0x0020000000000000 + a_flt_m + b_flt_m;
4987ec681f3Smrg        m <<= 9;
4997ec681f3Smrg    } else if (exp_diff < 0) {
5007ec681f3Smrg        a_flt_m <<= 9;
5017ec681f3Smrg        b_flt_m <<= 9;
5027ec681f3Smrg        e = b_flt_e;
5037ec681f3Smrg
5047ec681f3Smrg        if (a_flt_e != 0)
5057ec681f3Smrg            a_flt_m += 0x2000000000000000;
5067ec681f3Smrg        else
5077ec681f3Smrg            a_flt_m <<= 1;
5087ec681f3Smrg
5097ec681f3Smrg        a_flt_m = _mesa_shift_right_jam64(a_flt_m, -exp_diff);
5107ec681f3Smrg        m = 0x2000000000000000 + a_flt_m + b_flt_m;
5117ec681f3Smrg        if (m < 0x4000000000000000) {
5127ec681f3Smrg            --e;
5137ec681f3Smrg            m <<= 1;
5147ec681f3Smrg        }
5157ec681f3Smrg    } else {
5167ec681f3Smrg        a_flt_m <<= 9;
5177ec681f3Smrg        b_flt_m <<= 9;
5187ec681f3Smrg        e = a_flt_e;
5197ec681f3Smrg
5207ec681f3Smrg        if (b_flt_e != 0)
5217ec681f3Smrg            b_flt_m += 0x2000000000000000;
5227ec681f3Smrg        else
5237ec681f3Smrg            b_flt_m <<= 1;
5247ec681f3Smrg
5257ec681f3Smrg        b_flt_m = _mesa_shift_right_jam64(b_flt_m, exp_diff);
5267ec681f3Smrg        m = 0x2000000000000000 + a_flt_m + b_flt_m;
5277ec681f3Smrg        if (m < 0x4000000000000000) {
5287ec681f3Smrg            --e;
5297ec681f3Smrg            m <<= 1;
5307ec681f3Smrg        }
5317ec681f3Smrg    }
5327ec681f3Smrg
5337ec681f3Smrg    return _mesa_roundtozero_f64(s, e, m);
5347ec681f3Smrg}
5357ec681f3Smrg
5367ec681f3Smrg/**
5377ec681f3Smrg * \brief Returns the number of leading 0 bits before the most-significant 1 bit of
5387ec681f3Smrg * 'a'.  If 'a' is zero, 64 is returned.
5397ec681f3Smrg */
5407ec681f3Smrgstatic inline unsigned
5417ec681f3Smrg_mesa_count_leading_zeros64(uint64_t a)
5427ec681f3Smrg{
5437ec681f3Smrg    return 64 - util_last_bit64(a);
5447ec681f3Smrg}
5457ec681f3Smrg
5467ec681f3Smrg/**
5477ec681f3Smrg * \brief Returns the number of leading 0 bits before the most-significant 1 bit of
5487ec681f3Smrg * 'a'.  If 'a' is zero, 32 is returned.
5497ec681f3Smrg */
5507ec681f3Smrgstatic inline unsigned
5517ec681f3Smrg_mesa_count_leading_zeros32(uint32_t a)
5527ec681f3Smrg{
5537ec681f3Smrg    return 32 - util_last_bit(a);
5547ec681f3Smrg}
5557ec681f3Smrg
5567ec681f3Smrgstatic inline double
5577ec681f3Smrg_mesa_norm_round_pack_f64(int64_t s, int64_t e, int64_t m)
5587ec681f3Smrg{
5597ec681f3Smrg    int8_t shift_dist;
5607ec681f3Smrg
5617ec681f3Smrg    shift_dist = _mesa_count_leading_zeros64(m) - 1;
5627ec681f3Smrg    e -= shift_dist;
5637ec681f3Smrg    if ((10 <= shift_dist) && ((unsigned) e < 0x7fd)) {
5647ec681f3Smrg        di_type result;
5657ec681f3Smrg        result.u = (s << 63) + ((m ? e : 0) << 52) + (m << (shift_dist - 10));
5667ec681f3Smrg        return result.f;
5677ec681f3Smrg    } else {
5687ec681f3Smrg        return _mesa_roundtozero_f64(s, e, m << shift_dist);
5697ec681f3Smrg    }
5707ec681f3Smrg}
5717ec681f3Smrg
5727ec681f3Smrg/**
5737ec681f3Smrg * \brief Replaces the N-bit unsigned integer pointed to by 'm_out' by the
5747ec681f3Smrg * 2s-complement of itself, where N = 'size_words' * 32.  Argument 'm_out'
5757ec681f3Smrg * points to a 'size_words'-long array of 32-bit elements that concatenate in
5767ec681f3Smrg * the platform's normal endian order to form an N-bit integer.
5777ec681f3Smrg *
5787ec681f3Smrg * From softfloat_negXM()
5797ec681f3Smrg */
5807ec681f3Smrgstatic inline void
5817ec681f3Smrg_mesa_neg_x_m(uint8_t size_words, uint32_t *m_out)
5827ec681f3Smrg{
5837ec681f3Smrg    unsigned index, last_index;
5847ec681f3Smrg    uint8_t carry;
5857ec681f3Smrg    uint32_t word;
5867ec681f3Smrg
5877ec681f3Smrg    index = index_word_lo(size_words);
5887ec681f3Smrg    last_index = index_word_hi(size_words);
5897ec681f3Smrg    carry = 1;
5907ec681f3Smrg    for (;;) {
5917ec681f3Smrg        word = ~m_out[index] + carry;
5927ec681f3Smrg        m_out[index] = word;
5937ec681f3Smrg        if (index == last_index)
5947ec681f3Smrg            break;
5957ec681f3Smrg        index += word_incr;
5967ec681f3Smrg        if (word)
5977ec681f3Smrg            carry = 0;
5987ec681f3Smrg    }
5997ec681f3Smrg}
6007ec681f3Smrg
6017ec681f3Smrg/**
6027ec681f3Smrg * \brief Adds the two N-bit integers pointed to by 'a' and 'b', where N =
6037ec681f3Smrg * 'size_words' * 32.  The addition is modulo 2^N, so any carry out is
6047ec681f3Smrg * lost. The N-bit sum is stored at the location pointed to by 'm_out'.  Each
6057ec681f3Smrg * of 'a', 'b', and 'm_out' points to a 'size_words'-long array of 32-bit
6067ec681f3Smrg * elements that concatenate in the platform's normal endian order to form an
6077ec681f3Smrg * N-bit integer.
6087ec681f3Smrg *
6097ec681f3Smrg * From softfloat_addM()
6107ec681f3Smrg */
6117ec681f3Smrgstatic inline void
6127ec681f3Smrg_mesa_add_m(uint8_t size_words, const uint32_t *a, const uint32_t *b, uint32_t *m_out)
6137ec681f3Smrg{
6147ec681f3Smrg    unsigned index, last_index;
6157ec681f3Smrg    uint8_t carry;
6167ec681f3Smrg    uint32_t a_word, word;
6177ec681f3Smrg
6187ec681f3Smrg    index = index_word_lo(size_words);
6197ec681f3Smrg    last_index = index_word_hi(size_words);
6207ec681f3Smrg    carry = 0;
6217ec681f3Smrg    for (;;) {
6227ec681f3Smrg        a_word = a[index];
6237ec681f3Smrg        word = a_word + b[index] + carry;
6247ec681f3Smrg        m_out[index] = word;
6257ec681f3Smrg        if (index == last_index)
6267ec681f3Smrg            break;
6277ec681f3Smrg        if (word != a_word)
6287ec681f3Smrg            carry = (word < a_word);
6297ec681f3Smrg        index += word_incr;
6307ec681f3Smrg    }
6317ec681f3Smrg}
6327ec681f3Smrg
6337ec681f3Smrg/**
6347ec681f3Smrg * \brief Subtracts the two N-bit integers pointed to by 'a' and 'b', where N =
6357ec681f3Smrg * 'size_words' * 32.  The subtraction is modulo 2^N, so any borrow out (carry
6367ec681f3Smrg * out) is lost.  The N-bit difference is stored at the location pointed to by
6377ec681f3Smrg * 'm_out'.  Each of 'a', 'b', and 'm_out' points to a 'size_words'-long array
6387ec681f3Smrg * of 32-bit elements that concatenate in the platform's normal endian order
6397ec681f3Smrg * to form an N-bit integer.
6407ec681f3Smrg *
6417ec681f3Smrg * From softfloat_subM()
6427ec681f3Smrg */
6437ec681f3Smrgstatic inline void
6447ec681f3Smrg_mesa_sub_m(uint8_t size_words, const uint32_t *a, const uint32_t *b, uint32_t *m_out)
6457ec681f3Smrg{
6467ec681f3Smrg    unsigned index, last_index;
6477ec681f3Smrg    uint8_t borrow;
6487ec681f3Smrg    uint32_t a_word, b_word;
6497ec681f3Smrg
6507ec681f3Smrg    index = index_word_lo(size_words);
6517ec681f3Smrg    last_index = index_word_hi(size_words);
6527ec681f3Smrg    borrow = 0;
6537ec681f3Smrg    for (;;) {
6547ec681f3Smrg        a_word = a[index];
6557ec681f3Smrg        b_word = b[index];
6567ec681f3Smrg        m_out[index] = a_word - b_word - borrow;
6577ec681f3Smrg        if (index == last_index)
6587ec681f3Smrg            break;
6597ec681f3Smrg        borrow = borrow ? (a_word <= b_word) : (a_word < b_word);
6607ec681f3Smrg        index += word_incr;
6617ec681f3Smrg    }
6627ec681f3Smrg}
6637ec681f3Smrg
6647ec681f3Smrg/* Calculate a - b but rounding to zero.
6657ec681f3Smrg *
6667ec681f3Smrg * Notice that this mainly differs from the original Berkeley SoftFloat 3e
6677ec681f3Smrg * implementation in that we don't really treat NaNs, Zeroes nor the
6687ec681f3Smrg * signalling flags. Any NaN is good for us and the sign of the Zero is not
6697ec681f3Smrg * important.
6707ec681f3Smrg *
6717ec681f3Smrg * From f64_sub()
6727ec681f3Smrg */
6737ec681f3Smrgdouble
6747ec681f3Smrg_mesa_double_sub_rtz(double a, double b)
6757ec681f3Smrg{
6767ec681f3Smrg    const di_type a_di = {a};
6777ec681f3Smrg    uint64_t a_flt_m = a_di.u & 0x0fffffffffffff;
6787ec681f3Smrg    uint64_t a_flt_e = (a_di.u >> 52) & 0x7ff;
6797ec681f3Smrg    uint64_t a_flt_s = (a_di.u >> 63) & 0x1;
6807ec681f3Smrg    const di_type b_di = {b};
6817ec681f3Smrg    uint64_t b_flt_m = b_di.u & 0x0fffffffffffff;
6827ec681f3Smrg    uint64_t b_flt_e = (b_di.u >> 52) & 0x7ff;
6837ec681f3Smrg    uint64_t b_flt_s = (b_di.u >> 63) & 0x1;
6847ec681f3Smrg    int64_t s, e, m = 0;
6857ec681f3Smrg    int64_t m_diff = 0;
6867ec681f3Smrg    unsigned shift_dist = 0;
6877ec681f3Smrg
6887ec681f3Smrg    s = a_flt_s;
6897ec681f3Smrg
6907ec681f3Smrg    const int64_t exp_diff = a_flt_e - b_flt_e;
6917ec681f3Smrg
6927ec681f3Smrg    /* Handle special cases */
6937ec681f3Smrg
6947ec681f3Smrg    if (a_flt_s != b_flt_s) {
6957ec681f3Smrg        return _mesa_double_add_rtz(a, -b);
6967ec681f3Smrg    } else if ((a_flt_e == 0) && (a_flt_m == 0)) {
6977ec681f3Smrg        /* 'a' is zero, return '-b' */
6987ec681f3Smrg        return -b;
6997ec681f3Smrg    } else if ((b_flt_e == 0) && (b_flt_m == 0)) {
7007ec681f3Smrg        /* 'b' is zero, return 'a' */
7017ec681f3Smrg        return a;
7027ec681f3Smrg    } else if (a_flt_e == 0x7ff && a_flt_m != 0) {
7037ec681f3Smrg        /* 'a' is a NaN, return NaN */
7047ec681f3Smrg        return a;
7057ec681f3Smrg    } else if (b_flt_e == 0x7ff && b_flt_m != 0) {
7067ec681f3Smrg        /* 'b' is a NaN, return NaN */
7077ec681f3Smrg        return b;
7087ec681f3Smrg    } else if (a_flt_e == 0x7ff && a_flt_m == 0) {
7097ec681f3Smrg        if (b_flt_e == 0x7ff && b_flt_m == 0) {
7107ec681f3Smrg            /* Inf - Inf =  NaN */
7117ec681f3Smrg            di_type result;
7127ec681f3Smrg            e = 0x7ff;
7137ec681f3Smrg            result.u = (s << 63) + (e << 52) + 0x1;
7147ec681f3Smrg            return result.f;
7157ec681f3Smrg        }
7167ec681f3Smrg        /* Inf - x = Inf */
7177ec681f3Smrg        return a;
7187ec681f3Smrg    } else if (b_flt_e == 0x7ff && b_flt_m == 0) {
7197ec681f3Smrg        /* x - Inf = -Inf */
7207ec681f3Smrg        return -b;
7217ec681f3Smrg    } else if (exp_diff == 0) {
7227ec681f3Smrg        m_diff = a_flt_m - b_flt_m;
7237ec681f3Smrg
7247ec681f3Smrg        if (m_diff == 0)
7257ec681f3Smrg            return 0;
7267ec681f3Smrg        if (a_flt_e)
7277ec681f3Smrg            --a_flt_e;
7287ec681f3Smrg        if (m_diff < 0) {
7297ec681f3Smrg            s = !s;
7307ec681f3Smrg            m_diff = -m_diff;
7317ec681f3Smrg        }
7327ec681f3Smrg
7337ec681f3Smrg        shift_dist = _mesa_count_leading_zeros64(m_diff) - 11;
7347ec681f3Smrg        e = a_flt_e - shift_dist;
7357ec681f3Smrg        if (e < 0) {
7367ec681f3Smrg            shift_dist = a_flt_e;
7377ec681f3Smrg            e = 0;
7387ec681f3Smrg        }
7397ec681f3Smrg
7407ec681f3Smrg        di_type result;
7417ec681f3Smrg        result.u = (s << 63) + (e << 52) + (m_diff << shift_dist);
7427ec681f3Smrg        return result.f;
7437ec681f3Smrg    } else if (exp_diff < 0) {
7447ec681f3Smrg        a_flt_m <<= 10;
7457ec681f3Smrg        b_flt_m <<= 10;
7467ec681f3Smrg        s = !s;
7477ec681f3Smrg
7487ec681f3Smrg        a_flt_m += (a_flt_e) ? 0x4000000000000000 : a_flt_m;
7497ec681f3Smrg        a_flt_m = _mesa_shift_right_jam64(a_flt_m, -exp_diff);
7507ec681f3Smrg        b_flt_m |= 0x4000000000000000;
7517ec681f3Smrg        e = b_flt_e;
7527ec681f3Smrg        m = b_flt_m - a_flt_m;
7537ec681f3Smrg    } else {
7547ec681f3Smrg        a_flt_m <<= 10;
7557ec681f3Smrg        b_flt_m <<= 10;
7567ec681f3Smrg
7577ec681f3Smrg        b_flt_m += (b_flt_e) ? 0x4000000000000000 : b_flt_m;
7587ec681f3Smrg        b_flt_m = _mesa_shift_right_jam64(b_flt_m, exp_diff);
7597ec681f3Smrg        a_flt_m |= 0x4000000000000000;
7607ec681f3Smrg        e = a_flt_e;
7617ec681f3Smrg        m = a_flt_m - b_flt_m;
7627ec681f3Smrg    }
7637ec681f3Smrg
7647ec681f3Smrg    return _mesa_norm_round_pack_f64(s, e - 1, m);
7657ec681f3Smrg}
7667ec681f3Smrg
7677ec681f3Smrgstatic inline void
7687ec681f3Smrg_mesa_norm_subnormal_mantissa_f64(uint64_t m, uint64_t *exp, uint64_t *m_out)
7697ec681f3Smrg{
7707ec681f3Smrg    int shift_dist;
7717ec681f3Smrg
7727ec681f3Smrg    shift_dist = _mesa_count_leading_zeros64(m) - 11;
7737ec681f3Smrg    *exp = 1 - shift_dist;
7747ec681f3Smrg    *m_out = m << shift_dist;
7757ec681f3Smrg}
7767ec681f3Smrg
7777ec681f3Smrgstatic inline void
7787ec681f3Smrg_mesa_norm_subnormal_mantissa_f32(uint32_t m, uint32_t *exp, uint32_t *m_out)
7797ec681f3Smrg{
7807ec681f3Smrg    int shift_dist;
7817ec681f3Smrg
7827ec681f3Smrg    shift_dist = _mesa_count_leading_zeros32(m) - 8;
7837ec681f3Smrg    *exp = 1 - shift_dist;
7847ec681f3Smrg    *m_out = m << shift_dist;
7857ec681f3Smrg}
7867ec681f3Smrg
7877ec681f3Smrg/**
7887ec681f3Smrg * \brief Multiplies 'a' and 'b' and stores the 128-bit product at the location
7897ec681f3Smrg * pointed to by 'zPtr'.  Argument 'zPtr' points to an array of four 32-bit
7907ec681f3Smrg * elements that concatenate in the platform's normal endian order to form a
7917ec681f3Smrg * 128-bit integer.
7927ec681f3Smrg *
7937ec681f3Smrg * From softfloat_mul64To128M()
7947ec681f3Smrg */
7957ec681f3Smrgstatic inline void
7967ec681f3Smrg_mesa_softfloat_mul_f64_to_f128_m(uint64_t a, uint64_t b, uint32_t *m_out)
7977ec681f3Smrg{
7987ec681f3Smrg    uint32_t a32, a0, b32, b0;
7997ec681f3Smrg    uint64_t z0, mid1, z64, mid;
8007ec681f3Smrg
8017ec681f3Smrg    a32 = a >> 32;
8027ec681f3Smrg    a0 = a;
8037ec681f3Smrg    b32 = b >> 32;
8047ec681f3Smrg    b0 = b;
8057ec681f3Smrg    z0 = (uint64_t) a0 * b0;
8067ec681f3Smrg    mid1 = (uint64_t) a32 * b0;
8077ec681f3Smrg    mid = mid1 + (uint64_t) a0 * b32;
8087ec681f3Smrg    z64 = (uint64_t) a32 * b32;
8097ec681f3Smrg    z64 += (uint64_t) (mid < mid1) << 32 | mid >> 32;
8107ec681f3Smrg    mid <<= 32;
8117ec681f3Smrg    z0 += mid;
8127ec681f3Smrg    m_out[index_word(4, 1)] = z0 >> 32;
8137ec681f3Smrg    m_out[index_word(4, 0)] = z0;
8147ec681f3Smrg    z64 += (z0 < mid);
8157ec681f3Smrg    m_out[index_word(4, 3)] = z64 >> 32;
8167ec681f3Smrg    m_out[index_word(4, 2)] = z64;
8177ec681f3Smrg}
8187ec681f3Smrg
8197ec681f3Smrg/* Calculate a * b but rounding to zero.
8207ec681f3Smrg *
8217ec681f3Smrg * Notice that this mainly differs from the original Berkeley SoftFloat 3e
8227ec681f3Smrg * implementation in that we don't really treat NaNs, Zeroes nor the
8237ec681f3Smrg * signalling flags. Any NaN is good for us and the sign of the Zero is not
8247ec681f3Smrg * important.
8257ec681f3Smrg *
8267ec681f3Smrg * From f64_mul()
8277ec681f3Smrg */
8287ec681f3Smrgdouble
8297ec681f3Smrg_mesa_double_mul_rtz(double a, double b)
8307ec681f3Smrg{
8317ec681f3Smrg    const di_type a_di = {a};
8327ec681f3Smrg    uint64_t a_flt_m = a_di.u & 0x0fffffffffffff;
8337ec681f3Smrg    uint64_t a_flt_e = (a_di.u >> 52) & 0x7ff;
8347ec681f3Smrg    uint64_t a_flt_s = (a_di.u >> 63) & 0x1;
8357ec681f3Smrg    const di_type b_di = {b};
8367ec681f3Smrg    uint64_t b_flt_m = b_di.u & 0x0fffffffffffff;
8377ec681f3Smrg    uint64_t b_flt_e = (b_di.u >> 52) & 0x7ff;
8387ec681f3Smrg    uint64_t b_flt_s = (b_di.u >> 63) & 0x1;
8397ec681f3Smrg    int64_t s, e, m = 0;
8407ec681f3Smrg
8417ec681f3Smrg    s = a_flt_s ^ b_flt_s;
8427ec681f3Smrg
8437ec681f3Smrg    if (a_flt_e == 0x7ff) {
8447ec681f3Smrg        if (a_flt_m != 0) {
8457ec681f3Smrg            /* 'a' is a NaN, return NaN */
8467ec681f3Smrg            return a;
8477ec681f3Smrg        } else if (b_flt_e == 0x7ff && b_flt_m != 0) {
8487ec681f3Smrg            /* 'b' is a NaN, return NaN */
8497ec681f3Smrg            return b;
8507ec681f3Smrg        }
8517ec681f3Smrg
8527ec681f3Smrg        if (!(b_flt_e | b_flt_m)) {
8537ec681f3Smrg            /* Inf * 0 = NaN */
8547ec681f3Smrg            di_type result;
8557ec681f3Smrg            e = 0x7ff;
8567ec681f3Smrg            result.u = (s << 63) + (e << 52) + 0x1;
8577ec681f3Smrg            return result.f;
8587ec681f3Smrg        }
8597ec681f3Smrg        /* Inf * x = Inf */
8607ec681f3Smrg        di_type result;
8617ec681f3Smrg        e = 0x7ff;
8627ec681f3Smrg        result.u = (s << 63) + (e << 52) + 0;
8637ec681f3Smrg        return result.f;
8647ec681f3Smrg    }
8657ec681f3Smrg
8667ec681f3Smrg    if (b_flt_e == 0x7ff) {
8677ec681f3Smrg        if (b_flt_m != 0) {
8687ec681f3Smrg            /* 'b' is a NaN, return NaN */
8697ec681f3Smrg            return b;
8707ec681f3Smrg        }
8717ec681f3Smrg        if (!(a_flt_e | a_flt_m)) {
8727ec681f3Smrg            /* 0 * Inf = NaN */
8737ec681f3Smrg            di_type result;
8747ec681f3Smrg            e = 0x7ff;
8757ec681f3Smrg            result.u = (s << 63) + (e << 52) + 0x1;
8767ec681f3Smrg            return result.f;
8777ec681f3Smrg        }
8787ec681f3Smrg        /* x * Inf = Inf */
8797ec681f3Smrg        di_type result;
8807ec681f3Smrg        e = 0x7ff;
8817ec681f3Smrg        result.u = (s << 63) + (e << 52) + 0;
8827ec681f3Smrg        return result.f;
8837ec681f3Smrg    }
8847ec681f3Smrg
8857ec681f3Smrg    if (a_flt_e == 0) {
8867ec681f3Smrg        if (a_flt_m == 0) {
8877ec681f3Smrg            /* 'a' is zero. Return zero */
8887ec681f3Smrg            di_type result;
8897ec681f3Smrg            result.u = (s << 63) + 0;
8907ec681f3Smrg            return result.f;
8917ec681f3Smrg        }
8927ec681f3Smrg        _mesa_norm_subnormal_mantissa_f64(a_flt_m , &a_flt_e, &a_flt_m);
8937ec681f3Smrg    }
8947ec681f3Smrg    if (b_flt_e == 0) {
8957ec681f3Smrg        if (b_flt_m == 0) {
8967ec681f3Smrg            /* 'b' is zero. Return zero */
8977ec681f3Smrg            di_type result;
8987ec681f3Smrg            result.u = (s << 63) + 0;
8997ec681f3Smrg            return result.f;
9007ec681f3Smrg        }
9017ec681f3Smrg        _mesa_norm_subnormal_mantissa_f64(b_flt_m , &b_flt_e, &b_flt_m);
9027ec681f3Smrg    }
9037ec681f3Smrg
9047ec681f3Smrg    e = a_flt_e + b_flt_e - 0x3ff;
9057ec681f3Smrg    a_flt_m = (a_flt_m | 0x0010000000000000) << 10;
9067ec681f3Smrg    b_flt_m = (b_flt_m | 0x0010000000000000) << 11;
9077ec681f3Smrg
9087ec681f3Smrg    uint32_t m_128[4];
9097ec681f3Smrg    _mesa_softfloat_mul_f64_to_f128_m(a_flt_m, b_flt_m, m_128);
9107ec681f3Smrg
9117ec681f3Smrg    m = (uint64_t) m_128[index_word(4, 3)] << 32 | m_128[index_word(4, 2)];
9127ec681f3Smrg    if (m_128[index_word(4, 1)] || m_128[index_word(4, 0)])
9137ec681f3Smrg        m |= 1;
9147ec681f3Smrg
9157ec681f3Smrg    if (m < 0x4000000000000000) {
9167ec681f3Smrg        --e;
9177ec681f3Smrg        m <<= 1;
9187ec681f3Smrg    }
9197ec681f3Smrg
9207ec681f3Smrg    return _mesa_roundtozero_f64(s, e, m);
9217ec681f3Smrg}
9227ec681f3Smrg
9237ec681f3Smrg
9247ec681f3Smrg/**
9257ec681f3Smrg * \brief Calculate a * b + c but rounding to zero.
9267ec681f3Smrg *
9277ec681f3Smrg * Notice that this mainly differs from the original Berkeley SoftFloat 3e
9287ec681f3Smrg * implementation in that we don't really treat NaNs, Zeroes nor the
9297ec681f3Smrg * signalling flags. Any NaN is good for us and the sign of the Zero is not
9307ec681f3Smrg * important.
9317ec681f3Smrg *
9327ec681f3Smrg * From f64_mulAdd()
9337ec681f3Smrg */
9347ec681f3Smrgdouble
9357ec681f3Smrg_mesa_double_fma_rtz(double a, double b, double c)
9367ec681f3Smrg{
9377ec681f3Smrg    const di_type a_di = {a};
9387ec681f3Smrg    uint64_t a_flt_m = a_di.u & 0x0fffffffffffff;
9397ec681f3Smrg    uint64_t a_flt_e = (a_di.u >> 52) & 0x7ff;
9407ec681f3Smrg    uint64_t a_flt_s = (a_di.u >> 63) & 0x1;
9417ec681f3Smrg    const di_type b_di = {b};
9427ec681f3Smrg    uint64_t b_flt_m = b_di.u & 0x0fffffffffffff;
9437ec681f3Smrg    uint64_t b_flt_e = (b_di.u >> 52) & 0x7ff;
9447ec681f3Smrg    uint64_t b_flt_s = (b_di.u >> 63) & 0x1;
9457ec681f3Smrg    const di_type c_di = {c};
9467ec681f3Smrg    uint64_t c_flt_m = c_di.u & 0x0fffffffffffff;
9477ec681f3Smrg    uint64_t c_flt_e = (c_di.u >> 52) & 0x7ff;
9487ec681f3Smrg    uint64_t c_flt_s = (c_di.u >> 63) & 0x1;
9497ec681f3Smrg    int64_t s, e, m = 0;
9507ec681f3Smrg
9517ec681f3Smrg    c_flt_s ^= 0;
9527ec681f3Smrg    s = a_flt_s ^ b_flt_s ^ 0;
9537ec681f3Smrg
9547ec681f3Smrg    if (a_flt_e == 0x7ff) {
9557ec681f3Smrg        if (a_flt_m != 0) {
9567ec681f3Smrg            /* 'a' is a NaN, return NaN */
9577ec681f3Smrg            return a;
9587ec681f3Smrg        } else if (b_flt_e == 0x7ff && b_flt_m != 0) {
9597ec681f3Smrg            /* 'b' is a NaN, return NaN */
9607ec681f3Smrg            return b;
9617ec681f3Smrg        } else if (c_flt_e == 0x7ff && c_flt_m != 0) {
9627ec681f3Smrg            /* 'c' is a NaN, return NaN */
9637ec681f3Smrg            return c;
9647ec681f3Smrg        }
9657ec681f3Smrg
9667ec681f3Smrg        if (!(b_flt_e | b_flt_m)) {
9677ec681f3Smrg            /* Inf * 0 + y = NaN */
9687ec681f3Smrg            di_type result;
9697ec681f3Smrg            e = 0x7ff;
9707ec681f3Smrg            result.u = (s << 63) + (e << 52) + 0x1;
9717ec681f3Smrg            return result.f;
9727ec681f3Smrg        }
9737ec681f3Smrg
9747ec681f3Smrg        if ((c_flt_e == 0x7ff && c_flt_m == 0) && (s != c_flt_s)) {
9757ec681f3Smrg            /* Inf * x - Inf = NaN */
9767ec681f3Smrg            di_type result;
9777ec681f3Smrg            e = 0x7ff;
9787ec681f3Smrg            result.u = (s << 63) + (e << 52) + 0x1;
9797ec681f3Smrg            return result.f;
9807ec681f3Smrg        }
9817ec681f3Smrg
9827ec681f3Smrg        /* Inf * x + y = Inf */
9837ec681f3Smrg        di_type result;
9847ec681f3Smrg        e = 0x7ff;
9857ec681f3Smrg        result.u = (s << 63) + (e << 52) + 0;
9867ec681f3Smrg        return result.f;
9877ec681f3Smrg    }
9887ec681f3Smrg
9897ec681f3Smrg    if (b_flt_e == 0x7ff) {
9907ec681f3Smrg        if (b_flt_m != 0) {
9917ec681f3Smrg            /* 'b' is a NaN, return NaN */
9927ec681f3Smrg            return b;
9937ec681f3Smrg        } else if (c_flt_e == 0x7ff && c_flt_m != 0) {
9947ec681f3Smrg            /* 'c' is a NaN, return NaN */
9957ec681f3Smrg            return c;
9967ec681f3Smrg        }
9977ec681f3Smrg
9987ec681f3Smrg        if (!(a_flt_e | a_flt_m)) {
9997ec681f3Smrg            /* 0 * Inf + y = NaN */
10007ec681f3Smrg            di_type result;
10017ec681f3Smrg            e = 0x7ff;
10027ec681f3Smrg            result.u = (s << 63) + (e << 52) + 0x1;
10037ec681f3Smrg            return result.f;
10047ec681f3Smrg        }
10057ec681f3Smrg
10067ec681f3Smrg        if ((c_flt_e == 0x7ff && c_flt_m == 0) && (s != c_flt_s)) {
10077ec681f3Smrg            /* x * Inf - Inf = NaN */
10087ec681f3Smrg            di_type result;
10097ec681f3Smrg            e = 0x7ff;
10107ec681f3Smrg            result.u = (s << 63) + (e << 52) + 0x1;
10117ec681f3Smrg            return result.f;
10127ec681f3Smrg        }
10137ec681f3Smrg
10147ec681f3Smrg        /* x * Inf + y = Inf */
10157ec681f3Smrg        di_type result;
10167ec681f3Smrg        e = 0x7ff;
10177ec681f3Smrg        result.u = (s << 63) + (e << 52) + 0;
10187ec681f3Smrg        return result.f;
10197ec681f3Smrg    }
10207ec681f3Smrg
10217ec681f3Smrg    if (c_flt_e == 0x7ff) {
10227ec681f3Smrg        if (c_flt_m != 0) {
10237ec681f3Smrg            /* 'c' is a NaN, return NaN */
10247ec681f3Smrg            return c;
10257ec681f3Smrg        }
10267ec681f3Smrg
10277ec681f3Smrg        /* x * y + Inf = Inf */
10287ec681f3Smrg        return c;
10297ec681f3Smrg    }
10307ec681f3Smrg
10317ec681f3Smrg    if (a_flt_e == 0) {
10327ec681f3Smrg        if (a_flt_m == 0) {
10337ec681f3Smrg            /* 'a' is zero, return 'c' */
10347ec681f3Smrg            return c;
10357ec681f3Smrg        }
10367ec681f3Smrg        _mesa_norm_subnormal_mantissa_f64(a_flt_m , &a_flt_e, &a_flt_m);
10377ec681f3Smrg    }
10387ec681f3Smrg
10397ec681f3Smrg    if (b_flt_e == 0) {
10407ec681f3Smrg        if (b_flt_m == 0) {
10417ec681f3Smrg            /* 'b' is zero, return 'c' */
10427ec681f3Smrg            return c;
10437ec681f3Smrg        }
10447ec681f3Smrg        _mesa_norm_subnormal_mantissa_f64(b_flt_m , &b_flt_e, &b_flt_m);
10457ec681f3Smrg    }
10467ec681f3Smrg
10477ec681f3Smrg    e = a_flt_e + b_flt_e - 0x3fe;
10487ec681f3Smrg    a_flt_m = (a_flt_m | 0x0010000000000000) << 10;
10497ec681f3Smrg    b_flt_m = (b_flt_m | 0x0010000000000000) << 11;
10507ec681f3Smrg
10517ec681f3Smrg    uint32_t m_128[4];
10527ec681f3Smrg    _mesa_softfloat_mul_f64_to_f128_m(a_flt_m, b_flt_m, m_128);
10537ec681f3Smrg
10547ec681f3Smrg    m = (uint64_t) m_128[index_word(4, 3)] << 32 | m_128[index_word(4, 2)];
10557ec681f3Smrg
10567ec681f3Smrg    int64_t shift_dist = 0;
10577ec681f3Smrg    if (!(m & 0x4000000000000000)) {
10587ec681f3Smrg        --e;
10597ec681f3Smrg        shift_dist = -1;
10607ec681f3Smrg    }
10617ec681f3Smrg
10627ec681f3Smrg    if (c_flt_e == 0) {
10637ec681f3Smrg        if (c_flt_m == 0) {
10647ec681f3Smrg            /* 'c' is zero, return 'a * b' */
10657ec681f3Smrg            if (shift_dist)
10667ec681f3Smrg                m <<= 1;
10677ec681f3Smrg
10687ec681f3Smrg            if (m_128[index_word(4, 1)] || m_128[index_word(4, 0)])
10697ec681f3Smrg                m |= 1;
10707ec681f3Smrg            return _mesa_roundtozero_f64(s, e - 1, m);
10717ec681f3Smrg        }
10727ec681f3Smrg        _mesa_norm_subnormal_mantissa_f64(c_flt_m , &c_flt_e, &c_flt_m);
10737ec681f3Smrg    }
10747ec681f3Smrg    c_flt_m = (c_flt_m | 0x0010000000000000) << 10;
10757ec681f3Smrg
10767ec681f3Smrg    uint32_t c_flt_m_128[4];
10777ec681f3Smrg    int64_t exp_diff = e - c_flt_e;
10787ec681f3Smrg    if (exp_diff < 0) {
10797ec681f3Smrg        e = c_flt_e;
10807ec681f3Smrg        if ((s == c_flt_s) || (exp_diff < -1)) {
10817ec681f3Smrg            shift_dist -= exp_diff;
10827ec681f3Smrg            if (shift_dist) {
10837ec681f3Smrg                m = _mesa_shift_right_jam64(m, shift_dist);
10847ec681f3Smrg            }
10857ec681f3Smrg        } else {
10867ec681f3Smrg            if (!shift_dist) {
10877ec681f3Smrg                _mesa_short_shift_right_m(4, m_128, 1, m_128);
10887ec681f3Smrg            }
10897ec681f3Smrg        }
10907ec681f3Smrg    } else {
10917ec681f3Smrg        if (shift_dist)
10927ec681f3Smrg            _mesa_add_m(4, m_128, m_128, m_128);
10937ec681f3Smrg        if (!exp_diff) {
10947ec681f3Smrg            m = (uint64_t) m_128[index_word(4, 3)] << 32
10957ec681f3Smrg                | m_128[index_word(4, 2)];
10967ec681f3Smrg        } else {
10977ec681f3Smrg            c_flt_m_128[index_word(4, 3)] = c_flt_m >> 32;
10987ec681f3Smrg            c_flt_m_128[index_word(4, 2)] = c_flt_m;
10997ec681f3Smrg            c_flt_m_128[index_word(4, 1)] = 0;
11007ec681f3Smrg            c_flt_m_128[index_word(4, 0)] = 0;
11017ec681f3Smrg            _mesa_shift_right_jam_m(4, c_flt_m_128, exp_diff, c_flt_m_128);
11027ec681f3Smrg        }
11037ec681f3Smrg    }
11047ec681f3Smrg
11057ec681f3Smrg    if (s == c_flt_s) {
11067ec681f3Smrg        if (exp_diff <= 0) {
11077ec681f3Smrg            m += c_flt_m;
11087ec681f3Smrg        } else {
11097ec681f3Smrg            _mesa_add_m(4, m_128, c_flt_m_128, m_128);
11107ec681f3Smrg            m = (uint64_t) m_128[index_word(4, 3)] << 32
11117ec681f3Smrg                | m_128[index_word(4, 2)];
11127ec681f3Smrg        }
11137ec681f3Smrg        if (m & 0x8000000000000000) {
11147ec681f3Smrg            e++;
11157ec681f3Smrg            m = _mesa_short_shift_right_jam64(m, 1);
11167ec681f3Smrg        }
11177ec681f3Smrg    } else {
11187ec681f3Smrg        if (exp_diff < 0) {
11197ec681f3Smrg            s = c_flt_s;
11207ec681f3Smrg            if (exp_diff < -1) {
11217ec681f3Smrg                m = c_flt_m - m;
11227ec681f3Smrg                if (m_128[index_word(4, 1)] || m_128[index_word(4, 0)]) {
11237ec681f3Smrg                    m = (m - 1) | 1;
11247ec681f3Smrg                }
11257ec681f3Smrg                if (!(m & 0x4000000000000000)) {
11267ec681f3Smrg                    --e;
11277ec681f3Smrg                    m <<= 1;
11287ec681f3Smrg                }
11297ec681f3Smrg                return _mesa_roundtozero_f64(s, e - 1, m);
11307ec681f3Smrg            } else {
11317ec681f3Smrg                c_flt_m_128[index_word(4, 3)] = c_flt_m >> 32;
11327ec681f3Smrg                c_flt_m_128[index_word(4, 2)] = c_flt_m;
11337ec681f3Smrg                c_flt_m_128[index_word(4, 1)] = 0;
11347ec681f3Smrg                c_flt_m_128[index_word(4, 0)] = 0;
11357ec681f3Smrg                _mesa_sub_m(4, c_flt_m_128, m_128, m_128);
11367ec681f3Smrg            }
11377ec681f3Smrg        } else if (!exp_diff) {
11387ec681f3Smrg            m -= c_flt_m;
11397ec681f3Smrg            if (!m && !m_128[index_word(4, 1)] && !m_128[index_word(4, 0)]) {
11407ec681f3Smrg                /* Return zero */
11417ec681f3Smrg                di_type result;
11427ec681f3Smrg                result.u = (s << 63) + 0;
11437ec681f3Smrg                return result.f;
11447ec681f3Smrg            }
11457ec681f3Smrg            m_128[index_word(4, 3)] = m >> 32;
11467ec681f3Smrg            m_128[index_word(4, 2)] = m;
11477ec681f3Smrg            if (m & 0x8000000000000000) {
11487ec681f3Smrg                s = !s;
11497ec681f3Smrg                _mesa_neg_x_m(4, m_128);
11507ec681f3Smrg            }
11517ec681f3Smrg        } else {
11527ec681f3Smrg            _mesa_sub_m(4, m_128, c_flt_m_128, m_128);
11537ec681f3Smrg            if (1 < exp_diff) {
11547ec681f3Smrg                m = (uint64_t) m_128[index_word(4, 3)] << 32
11557ec681f3Smrg                    | m_128[index_word(4, 2)];
11567ec681f3Smrg                if (!(m & 0x4000000000000000)) {
11577ec681f3Smrg                    --e;
11587ec681f3Smrg                    m <<= 1;
11597ec681f3Smrg                }
11607ec681f3Smrg                if (m_128[index_word(4, 1)] || m_128[index_word(4, 0)])
11617ec681f3Smrg                    m |= 1;
11627ec681f3Smrg                return _mesa_roundtozero_f64(s, e - 1, m);
11637ec681f3Smrg            }
11647ec681f3Smrg        }
11657ec681f3Smrg
11667ec681f3Smrg        shift_dist = 0;
11677ec681f3Smrg        m = (uint64_t) m_128[index_word(4, 3)] << 32
11687ec681f3Smrg            | m_128[index_word(4, 2)];
11697ec681f3Smrg        if (!m) {
11707ec681f3Smrg            shift_dist = 64;
11717ec681f3Smrg            m = (uint64_t) m_128[index_word(4, 1)] << 32
11727ec681f3Smrg                | m_128[index_word(4, 0)];
11737ec681f3Smrg        }
11747ec681f3Smrg        shift_dist += _mesa_count_leading_zeros64(m) - 1;
11757ec681f3Smrg        if (shift_dist) {
11767ec681f3Smrg            e -= shift_dist;
11777ec681f3Smrg            _mesa_shift_left_m(4, m_128, shift_dist, m_128);
11787ec681f3Smrg            m = (uint64_t) m_128[index_word(4, 3)] << 32
11797ec681f3Smrg                | m_128[index_word(4, 2)];
11807ec681f3Smrg        }
11817ec681f3Smrg    }
11827ec681f3Smrg
11837ec681f3Smrg    if (m_128[index_word(4, 1)] || m_128[index_word(4, 0)])
11847ec681f3Smrg        m |= 1;
11857ec681f3Smrg    return _mesa_roundtozero_f64(s, e - 1, m);
11867ec681f3Smrg}
11877ec681f3Smrg
11887ec681f3Smrg
11897ec681f3Smrg/**
11907ec681f3Smrg * \brief Calculate a * b + c but rounding to zero.
11917ec681f3Smrg *
11927ec681f3Smrg * Notice that this mainly differs from the original Berkeley SoftFloat 3e
11937ec681f3Smrg * implementation in that we don't really treat NaNs, Zeroes nor the
11947ec681f3Smrg * signalling flags. Any NaN is good for us and the sign of the Zero is not
11957ec681f3Smrg * important.
11967ec681f3Smrg *
11977ec681f3Smrg * From f32_mulAdd()
11987ec681f3Smrg */
11997ec681f3Smrgfloat
12007ec681f3Smrg_mesa_float_fma_rtz(float a, float b, float c)
12017ec681f3Smrg{
12027ec681f3Smrg    const fi_type a_fi = {a};
12037ec681f3Smrg    uint32_t a_flt_m = a_fi.u & 0x07fffff;
12047ec681f3Smrg    uint32_t a_flt_e = (a_fi.u >> 23) & 0xff;
12057ec681f3Smrg    uint32_t a_flt_s = (a_fi.u >> 31) & 0x1;
12067ec681f3Smrg    const fi_type b_fi = {b};
12077ec681f3Smrg    uint32_t b_flt_m = b_fi.u & 0x07fffff;
12087ec681f3Smrg    uint32_t b_flt_e = (b_fi.u >> 23) & 0xff;
12097ec681f3Smrg    uint32_t b_flt_s = (b_fi.u >> 31) & 0x1;
12107ec681f3Smrg    const fi_type c_fi = {c};
12117ec681f3Smrg    uint32_t c_flt_m = c_fi.u & 0x07fffff;
12127ec681f3Smrg    uint32_t c_flt_e = (c_fi.u >> 23) & 0xff;
12137ec681f3Smrg    uint32_t c_flt_s = (c_fi.u >> 31) & 0x1;
12147ec681f3Smrg    int32_t s, e, m = 0;
12157ec681f3Smrg
12167ec681f3Smrg    c_flt_s ^= 0;
12177ec681f3Smrg    s = a_flt_s ^ b_flt_s ^ 0;
12187ec681f3Smrg
12197ec681f3Smrg    if (a_flt_e == 0xff) {
12207ec681f3Smrg        if (a_flt_m != 0) {
12217ec681f3Smrg            /* 'a' is a NaN, return NaN */
12227ec681f3Smrg            return a;
12237ec681f3Smrg        } else if (b_flt_e == 0xff && b_flt_m != 0) {
12247ec681f3Smrg            /* 'b' is a NaN, return NaN */
12257ec681f3Smrg            return b;
12267ec681f3Smrg        } else if (c_flt_e == 0xff && c_flt_m != 0) {
12277ec681f3Smrg            /* 'c' is a NaN, return NaN */
12287ec681f3Smrg            return c;
12297ec681f3Smrg        }
12307ec681f3Smrg
12317ec681f3Smrg        if (!(b_flt_e | b_flt_m)) {
12327ec681f3Smrg            /* Inf * 0 + y = NaN */
12337ec681f3Smrg            fi_type result;
12347ec681f3Smrg            e = 0xff;
12357ec681f3Smrg            result.u = (s << 31) + (e << 23) + 0x1;
12367ec681f3Smrg            return result.f;
12377ec681f3Smrg        }
12387ec681f3Smrg
12397ec681f3Smrg        if ((c_flt_e == 0xff && c_flt_m == 0) && (s != c_flt_s)) {
12407ec681f3Smrg            /* Inf * x - Inf = NaN */
12417ec681f3Smrg            fi_type result;
12427ec681f3Smrg            e = 0xff;
12437ec681f3Smrg            result.u = (s << 31) + (e << 23) + 0x1;
12447ec681f3Smrg            return result.f;
12457ec681f3Smrg        }
12467ec681f3Smrg
12477ec681f3Smrg        /* Inf * x + y = Inf */
12487ec681f3Smrg        fi_type result;
12497ec681f3Smrg        e = 0xff;
12507ec681f3Smrg        result.u = (s << 31) + (e << 23) + 0;
12517ec681f3Smrg        return result.f;
12527ec681f3Smrg    }
12537ec681f3Smrg
12547ec681f3Smrg    if (b_flt_e == 0xff) {
12557ec681f3Smrg        if (b_flt_m != 0) {
12567ec681f3Smrg            /* 'b' is a NaN, return NaN */
12577ec681f3Smrg            return b;
12587ec681f3Smrg        } else if (c_flt_e == 0xff && c_flt_m != 0) {
12597ec681f3Smrg            /* 'c' is a NaN, return NaN */
12607ec681f3Smrg            return c;
12617ec681f3Smrg        }
12627ec681f3Smrg
12637ec681f3Smrg        if (!(a_flt_e | a_flt_m)) {
12647ec681f3Smrg            /* 0 * Inf + y = NaN */
12657ec681f3Smrg            fi_type result;
12667ec681f3Smrg            e = 0xff;
12677ec681f3Smrg            result.u = (s << 31) + (e << 23) + 0x1;
12687ec681f3Smrg            return result.f;
12697ec681f3Smrg        }
12707ec681f3Smrg
12717ec681f3Smrg        if ((c_flt_e == 0xff && c_flt_m == 0) && (s != c_flt_s)) {
12727ec681f3Smrg            /* x * Inf - Inf = NaN */
12737ec681f3Smrg            fi_type result;
12747ec681f3Smrg            e = 0xff;
12757ec681f3Smrg            result.u = (s << 31) + (e << 23) + 0x1;
12767ec681f3Smrg            return result.f;
12777ec681f3Smrg        }
12787ec681f3Smrg
12797ec681f3Smrg        /* x * Inf + y = Inf */
12807ec681f3Smrg        fi_type result;
12817ec681f3Smrg        e = 0xff;
12827ec681f3Smrg        result.u = (s << 31) + (e << 23) + 0;
12837ec681f3Smrg        return result.f;
12847ec681f3Smrg    }
12857ec681f3Smrg
12867ec681f3Smrg    if (c_flt_e == 0xff) {
12877ec681f3Smrg        if (c_flt_m != 0) {
12887ec681f3Smrg            /* 'c' is a NaN, return NaN */
12897ec681f3Smrg            return c;
12907ec681f3Smrg        }
12917ec681f3Smrg
12927ec681f3Smrg        /* x * y + Inf = Inf */
12937ec681f3Smrg        return c;
12947ec681f3Smrg    }
12957ec681f3Smrg
12967ec681f3Smrg    if (a_flt_e == 0) {
12977ec681f3Smrg        if (a_flt_m == 0) {
12987ec681f3Smrg            /* 'a' is zero, return 'c' */
12997ec681f3Smrg            return c;
13007ec681f3Smrg        }
13017ec681f3Smrg        _mesa_norm_subnormal_mantissa_f32(a_flt_m , &a_flt_e, &a_flt_m);
13027ec681f3Smrg    }
13037ec681f3Smrg
13047ec681f3Smrg    if (b_flt_e == 0) {
13057ec681f3Smrg        if (b_flt_m == 0) {
13067ec681f3Smrg            /* 'b' is zero, return 'c' */
13077ec681f3Smrg            return c;
13087ec681f3Smrg        }
13097ec681f3Smrg        _mesa_norm_subnormal_mantissa_f32(b_flt_m , &b_flt_e, &b_flt_m);
13107ec681f3Smrg    }
13117ec681f3Smrg
13127ec681f3Smrg    e = a_flt_e + b_flt_e - 0x7e;
13137ec681f3Smrg    a_flt_m = (a_flt_m | 0x00800000) << 7;
13147ec681f3Smrg    b_flt_m = (b_flt_m | 0x00800000) << 7;
13157ec681f3Smrg
13167ec681f3Smrg    uint64_t m_64 = (uint64_t) a_flt_m * b_flt_m;
13177ec681f3Smrg    if (m_64 < 0x2000000000000000) {
13187ec681f3Smrg        --e;
13197ec681f3Smrg        m_64 <<= 1;
13207ec681f3Smrg    }
13217ec681f3Smrg
13227ec681f3Smrg    if (c_flt_e == 0) {
13237ec681f3Smrg        if (c_flt_m == 0) {
13247ec681f3Smrg            /* 'c' is zero, return 'a * b' */
13257ec681f3Smrg            m = _mesa_short_shift_right_jam64(m_64, 31);
13267ec681f3Smrg            return _mesa_round_f32(s, e - 1, m, true);
13277ec681f3Smrg        }
13287ec681f3Smrg        _mesa_norm_subnormal_mantissa_f32(c_flt_m , &c_flt_e, &c_flt_m);
13297ec681f3Smrg    }
13307ec681f3Smrg    c_flt_m = (c_flt_m | 0x00800000) << 6;
13317ec681f3Smrg
13327ec681f3Smrg    int16_t exp_diff = e - c_flt_e;
13337ec681f3Smrg    if (s == c_flt_s) {
13347ec681f3Smrg        if (exp_diff <= 0) {
13357ec681f3Smrg            e = c_flt_e;
13367ec681f3Smrg            m = c_flt_m + _mesa_shift_right_jam64(m_64, 32 - exp_diff);
13377ec681f3Smrg        } else {
13387ec681f3Smrg            m_64 += _mesa_shift_right_jam64((uint64_t) c_flt_m << 32, exp_diff);
13397ec681f3Smrg            m = _mesa_short_shift_right_jam64(m_64, 32);
13407ec681f3Smrg        }
13417ec681f3Smrg        if (m < 0x40000000) {
13427ec681f3Smrg            --e;
13437ec681f3Smrg            m <<= 1;
13447ec681f3Smrg        }
13457ec681f3Smrg    } else {
13467ec681f3Smrg        uint64_t c_flt_m_64 = (uint64_t) c_flt_m << 32;
13477ec681f3Smrg        if (exp_diff < 0) {
13487ec681f3Smrg            s = c_flt_s;
13497ec681f3Smrg            e = c_flt_e;
13507ec681f3Smrg            m_64 = c_flt_m_64 - _mesa_shift_right_jam64(m_64, -exp_diff);
13517ec681f3Smrg        } else if (!exp_diff) {
13527ec681f3Smrg            m_64 -= c_flt_m_64;
13537ec681f3Smrg            if (!m_64) {
13547ec681f3Smrg                /* Return zero */
13557ec681f3Smrg                fi_type result;
13567ec681f3Smrg                result.u = (s << 31) + 0;
13577ec681f3Smrg                return result.f;
13587ec681f3Smrg            }
13597ec681f3Smrg            if (m_64 & 0x8000000000000000) {
13607ec681f3Smrg                s = !s;
13617ec681f3Smrg                m_64 = -m_64;
13627ec681f3Smrg            }
13637ec681f3Smrg        } else {
13647ec681f3Smrg            m_64 -= _mesa_shift_right_jam64(c_flt_m_64, exp_diff);
13657ec681f3Smrg        }
13667ec681f3Smrg        int8_t shift_dist = _mesa_count_leading_zeros64(m_64) - 1;
13677ec681f3Smrg        e -= shift_dist;
13687ec681f3Smrg        shift_dist -= 32;
13697ec681f3Smrg        if (shift_dist < 0) {
13707ec681f3Smrg            m = _mesa_short_shift_right_jam64(m_64, -shift_dist);
13717ec681f3Smrg        } else {
13727ec681f3Smrg            m = (uint32_t) m_64 << shift_dist;
13737ec681f3Smrg        }
13747ec681f3Smrg    }
13757ec681f3Smrg
13767ec681f3Smrg    return _mesa_round_f32(s, e, m, true);
13777ec681f3Smrg}
13787ec681f3Smrg
13797ec681f3Smrg
13807ec681f3Smrg/**
13817ec681f3Smrg * \brief Converts from 64bits to 32bits float and rounds according to
13827ec681f3Smrg * instructed.
13837ec681f3Smrg *
13847ec681f3Smrg * From f64_to_f32()
13857ec681f3Smrg */
13867ec681f3Smrgfloat
13877ec681f3Smrg_mesa_double_to_f32(double val, bool rtz)
13887ec681f3Smrg{
13897ec681f3Smrg    const di_type di = {val};
13907ec681f3Smrg    uint64_t flt_m = di.u & 0x0fffffffffffff;
13917ec681f3Smrg    uint64_t flt_e = (di.u >> 52) & 0x7ff;
13927ec681f3Smrg    uint64_t flt_s = (di.u >> 63) & 0x1;
13937ec681f3Smrg    int32_t s, e, m = 0;
13947ec681f3Smrg
13957ec681f3Smrg    s = flt_s;
13967ec681f3Smrg
13977ec681f3Smrg    if (flt_e == 0x7ff) {
13987ec681f3Smrg        if (flt_m != 0) {
13997ec681f3Smrg            /* 'val' is a NaN, return NaN */
14007ec681f3Smrg            fi_type result;
14017ec681f3Smrg            e = 0xff;
14027ec681f3Smrg            m = 0x1;
14037ec681f3Smrg            result.u = (s << 31) + (e << 23) + m;
14047ec681f3Smrg            return result.f;
14057ec681f3Smrg        }
14067ec681f3Smrg
14077ec681f3Smrg        /* 'val' is Inf, return Inf */
14087ec681f3Smrg        fi_type result;
14097ec681f3Smrg        e = 0xff;
14107ec681f3Smrg        result.u = (s << 31) + (e << 23) + m;
14117ec681f3Smrg        return result.f;
14127ec681f3Smrg    }
14137ec681f3Smrg
14147ec681f3Smrg    if (!(flt_e | flt_m)) {
14157ec681f3Smrg        /* 'val' is zero, return zero */
14167ec681f3Smrg        fi_type result;
14177ec681f3Smrg        e = 0;
14187ec681f3Smrg        result.u = (s << 31) + (e << 23) + m;
14197ec681f3Smrg        return result.f;
14207ec681f3Smrg    }
14217ec681f3Smrg
14227ec681f3Smrg    m = _mesa_short_shift_right_jam64(flt_m, 22);
14237ec681f3Smrg    if ( ! (flt_e | m) ) {
14247ec681f3Smrg        /* 'val' is denorm, return zero */
14257ec681f3Smrg        fi_type result;
14267ec681f3Smrg        e = 0;
14277ec681f3Smrg        result.u = (s << 31) + (e << 23) + m;
14287ec681f3Smrg        return result.f;
14297ec681f3Smrg    }
14307ec681f3Smrg
14317ec681f3Smrg    return _mesa_round_f32(s, flt_e - 0x381, m | 0x40000000, rtz);
14327ec681f3Smrg}
14337ec681f3Smrg
14347ec681f3Smrg
14357ec681f3Smrg/**
14367ec681f3Smrg * \brief Converts from 32bits to 16bits float and rounds the result to zero.
14377ec681f3Smrg *
14387ec681f3Smrg * From f32_to_f16()
14397ec681f3Smrg */
14407ec681f3Smrguint16_t
14417ec681f3Smrg_mesa_float_to_half_rtz_slow(float val)
14427ec681f3Smrg{
14437ec681f3Smrg    const fi_type fi = {val};
14447ec681f3Smrg    const uint32_t flt_m = fi.u & 0x7fffff;
14457ec681f3Smrg    const uint32_t flt_e = (fi.u >> 23) & 0xff;
14467ec681f3Smrg    const uint32_t flt_s = (fi.u >> 31) & 0x1;
14477ec681f3Smrg    int16_t s, e, m = 0;
14487ec681f3Smrg
14497ec681f3Smrg    s = flt_s;
14507ec681f3Smrg
14517ec681f3Smrg    if (flt_e == 0xff) {
14527ec681f3Smrg        if (flt_m != 0) {
14537ec681f3Smrg            /* 'val' is a NaN, return NaN */
14547ec681f3Smrg            e = 0x1f;
14557ec681f3Smrg            m = 0x1;
14567ec681f3Smrg            return (s << 15) + (e << 10) + m;
14577ec681f3Smrg        }
14587ec681f3Smrg
14597ec681f3Smrg        /* 'val' is Inf, return Inf */
14607ec681f3Smrg        e = 0x1f;
14617ec681f3Smrg        return (s << 15) + (e << 10) + m;
14627ec681f3Smrg    }
14637ec681f3Smrg
14647ec681f3Smrg    if (!(flt_e | flt_m)) {
14657ec681f3Smrg        /* 'val' is zero, return zero */
14667ec681f3Smrg        e = 0;
14677ec681f3Smrg        return (s << 15) + (e << 10) + m;
14687ec681f3Smrg    }
14697ec681f3Smrg
14707ec681f3Smrg    m = flt_m >> 9 | ((flt_m & 0x1ff) != 0);
14717ec681f3Smrg    if ( ! (flt_e | m) ) {
14727ec681f3Smrg        /* 'val' is denorm, return zero */
14737ec681f3Smrg        e = 0;
14747ec681f3Smrg        return (s << 15) + (e << 10) + m;
14757ec681f3Smrg    }
14767ec681f3Smrg
14777ec681f3Smrg    return _mesa_roundtozero_f16(s, flt_e - 0x71, m | 0x4000);
14787ec681f3Smrg}
1479