1 1.1.1.11 mrg /* Copyright (C) 2011-2024 Free Software Foundation, Inc. 2 1.1 mrg Contributed by Embecosm on behalf of Adapteva, Inc. 3 1.1 mrg 4 1.1 mrg This file is part of GCC. 5 1.1 mrg 6 1.1 mrg GCC is free software; you can redistribute it and/or modify it under 7 1.1 mrg the terms of the GNU General Public License as published by the Free 8 1.1 mrg Software Foundation; either version 3, or (at your option) any later 9 1.1 mrg version. 10 1.1 mrg 11 1.1 mrg GCC is distributed in the hope that it will be useful, but WITHOUT ANY 12 1.1 mrg WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 1.1 mrg FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 1.1 mrg for more details. 15 1.1 mrg 16 1.1 mrg Under Section 7 of GPL version 3, you are granted additional 17 1.1 mrg permissions described in the GCC Runtime Library Exception, version 18 1.1 mrg 3.1, as published by the Free Software Foundation. 19 1.1 mrg 20 1.1 mrg You should have received a copy of the GNU General Public License and 21 1.1 mrg a copy of the GCC Runtime Library Exception along with this program; 22 1.1 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 1.1 mrg <http://www.gnu.org/licenses/>. */ 24 1.1 mrg 25 1.1 mrg #include "../epiphany-asm.h" 26 1.1 mrg 27 1.1 mrg .section _fast_div_text,"a",@progbits; 28 1.1 mrg .balign 8; 29 1.1 mrg _fast_div_table: 30 1.1 mrg .word 0x007fffff// mantissa mask 31 1.1 mrg .word 0x40257ebb// hold constant a = 2.58586 32 1.1 mrg 33 1.1 mrg .word 0x3f000000// hold constant 126 shifted to bits [30:23] 34 1.1 mrg .word 0xc0ba2e88// hold constant b = -5.81818 35 1.1 mrg 36 1.1 mrg .word 0x4087c1e8// hold constant c = 4.24242 37 1.1 mrg .word 0x40000000// to hold constant 2 for Newton-Raphson iterations 38 1.1 mrg 39 1.1 mrg .global SYM(__fast_recipsf2) 40 1.1 mrg FUNC(__fast_recipsf2) 41 1.1 mrg SYM(__fast_recipsf2): 42 1.1 mrg 43 1.1 mrg //################### 44 1.1 mrg //# input operands: 45 1.1 mrg //################### 46 1.1 mrg // Divisor 47 1.1 mrg //R0 48 1.1 mrg // Function address (used with negative offsets to read _fast_div_table) 49 1.1 mrg //R1 50 1.1 mrg /* Scratch registers: two single (TMP0/TMP5) and two pairs. */ 51 1.1 mrg #define P0L TMP1 52 1.1 mrg #define P0H TMP2 53 1.1 mrg #define P1L TMP3 54 1.1 mrg #define P1H TMP4 55 1.1 mrg 56 1.1 mrg //######################################### 57 1.1 mrg //# Constants to be used in the algorithm 58 1.1 mrg //######################################### 59 1.1 mrg ldrd P0L , [ R1 , -3 ] 60 1.1 mrg 61 1.1 mrg ldrd P1L , [ R1 , -2 ] 62 1.1 mrg 63 1.1 mrg 64 1.1 mrg 65 1.1 mrg //############################################################################# 66 1.1 mrg //# The Algorithm 67 1.1 mrg //# 68 1.1 mrg //# Operation: C=A/B 69 1.1 mrg //# stage 1 - find the reciprocal 1/B according to the following scheme: 70 1.1 mrg //# B = (2^E)*m (1<m<2, E=e-127) 71 1.1 mrg //# 1/B = 1/((2^E)*m) = 1/((2^(E+1))*m1) (0.5<m1<1) 72 1.1 mrg //# = (2^-(E+1))*(1/m1) = (2^E1)*(1/m1) 73 1.1 mrg //# 74 1.1 mrg //# Now we can find the new exponent: 75 1.1 mrg //# e1 = E1+127 = -E-1+127 = -e+127-1+127 = 253-e ** 76 1.1 mrg //# 1/m1 alreadt has the exponent 127, so we have to add 126-e. 77 1.1 mrg //# the exponent might underflow, which we can detect as a sign change. 78 1.1 mrg //# Since the architeture uses flush-to-zero for subnormals, we can 79 1.1 mrg //# give the result 0. then. 80 1.1 mrg //# 81 1.1 mrg //# The 1/m1 term with 0.5<m1<1 is approximated with the Chebyshev polynomial 82 1.1 mrg //# 1/m1 = 2.58586*(m1^2) - 5.81818*m1 + 4.24242 83 1.1 mrg //# 84 1.1 mrg //# Next step is to use two iterations of Newton-Raphson algorithm to complete 85 1.1 mrg //# the reciprocal calculation. 86 1.1 mrg //# 87 1.1 mrg //# Final result is achieved by multiplying A with 1/B 88 1.1 mrg //############################################################################# 89 1.1 mrg 90 1.1 mrg 91 1.1 mrg 92 1.1 mrg // R0 exponent and sign "replacement" into TMP0 93 1.1 mrg AND TMP0,R0,P0L ; 94 1.1 mrg ORR TMP0,TMP0,P1L 95 1.1 mrg SUB TMP5,R0,TMP0 // R0 sign/exponent extraction into TMP5 96 1.1 mrg // Calculate new mantissa 97 1.1 mrg FMADD P1H,TMP0,P0H ; 98 1.1 mrg // Calculate new exponent offset 126 - "old exponent" 99 1.1 mrg SUB P1L,P1L,TMP5 100 1.1 mrg ldrd P0L , [ R1 , -1 ] 101 1.1 mrg FMADD P0L,TMP0,P1H ; 102 1.1 mrg eor P1H,r0,P1L // check for overflow (N-BIT). 103 1.1 mrg blt .Lret_0 104 1.1 mrg // P0L exponent and sign "replacement" 105 1.1 mrg sub P0L,P0L,TMP5 106 1.1 mrg 107 1.1 mrg // Newton-Raphson iteration #1 108 1.1 mrg MOV TMP0,P0H ; 109 1.1 mrg FMSUB P0H,R0,P0L ; 110 1.1 mrg FMUL P0L,P0H,P0L ; 111 1.1 mrg // Newton-Raphson iteration #2 112 1.1 mrg FMSUB TMP0,R0,P0L ; 113 1.1 mrg FMUL R0,TMP0,P0L ; 114 1.1 mrg jr lr 115 1.1 mrg .Lret_0:ldrd P0L , [ R1 , -3 ] 116 1.1 mrg lsr TMP0,r0,31 ; extract sign 117 1.1 mrg lsl TMP0,TMP0,31 118 1.1 mrg add P0L,P0L,r0 ; check for NaN input 119 1.1 mrg eor P0L,P0L,r0 120 1.1 mrg movgte r0,TMP0 121 1.1 mrg jr lr 122 1.1 mrg // Quotient calculation is expected by the caller: FMUL quotient,divident,R0 123 1.1 mrg ; 124 1.1 mrg ENDFUNC(__fast_recipsf2) 125