Home | History | Annotate | Line # | Download | only in ieee-754
      1  1.1.1.11  mrg /* Copyright (C) 2011-2024 Free Software Foundation, Inc.
      2       1.1  mrg    Contributed by Embecosm on behalf of Adapteva, Inc.
      3       1.1  mrg 
      4       1.1  mrg This file is part of GCC.
      5       1.1  mrg 
      6       1.1  mrg GCC is free software; you can redistribute it and/or modify it under
      7       1.1  mrg the terms of the GNU General Public License as published by the Free
      8       1.1  mrg Software Foundation; either version 3, or (at your option) any later
      9       1.1  mrg version.
     10       1.1  mrg 
     11       1.1  mrg GCC is distributed in the hope that it will be useful, but WITHOUT ANY
     12       1.1  mrg WARRANTY; without even the implied warranty of MERCHANTABILITY or
     13       1.1  mrg FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     14       1.1  mrg for more details.
     15       1.1  mrg 
     16       1.1  mrg Under Section 7 of GPL version 3, you are granted additional
     17       1.1  mrg permissions described in the GCC Runtime Library Exception, version
     18       1.1  mrg 3.1, as published by the Free Software Foundation.
     19       1.1  mrg 
     20       1.1  mrg You should have received a copy of the GNU General Public License and
     21       1.1  mrg a copy of the GCC Runtime Library Exception along with this program;
     22       1.1  mrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     23       1.1  mrg <http://www.gnu.org/licenses/>.  */
     24       1.1  mrg 
     25       1.1  mrg #include "../epiphany-asm.h"
     26       1.1  mrg 
     27       1.1  mrg .section _fast_div_text,"a",@progbits;
     28       1.1  mrg   .balign 8;
     29       1.1  mrg _fast_div_table:
     30       1.1  mrg .word 0x007fffff//   mantissa mask
     31       1.1  mrg .word 0x40257ebb//   hold constant a = 2.58586
     32       1.1  mrg 
     33       1.1  mrg .word 0x3f000000//   hold constant 126 shifted to bits [30:23]
     34       1.1  mrg .word 0xc0ba2e88//   hold constant b = -5.81818
     35       1.1  mrg 
     36       1.1  mrg .word 0x4087c1e8//   hold constant c = 4.24242
     37       1.1  mrg .word 0x40000000//  to hold constant 2 for Newton-Raphson iterations
     38       1.1  mrg 
     39       1.1  mrg  .global SYM(__fast_recipsf2)
     40       1.1  mrg  FUNC(__fast_recipsf2)
     41       1.1  mrg SYM(__fast_recipsf2):
     42       1.1  mrg 
     43       1.1  mrg //###################
     44       1.1  mrg //# input operands:
     45       1.1  mrg //###################
     46       1.1  mrg // Divisor
     47       1.1  mrg //R0
     48       1.1  mrg // Function address (used with negative offsets to read _fast_div_table)
     49       1.1  mrg //R1
     50       1.1  mrg /* Scratch registers:  two single (TMP0/TMP5) and two pairs.  */
     51       1.1  mrg #define P0L TMP1
     52       1.1  mrg #define P0H TMP2
     53       1.1  mrg #define P1L TMP3
     54       1.1  mrg #define P1H TMP4
     55       1.1  mrg 
     56       1.1  mrg //#########################################
     57       1.1  mrg //# Constants to be used in the algorithm
     58       1.1  mrg //#########################################
     59       1.1  mrg ldrd P0L , [ R1 , -3 ]
     60       1.1  mrg 
     61       1.1  mrg ldrd P1L , [ R1 , -2 ]
     62       1.1  mrg 
     63       1.1  mrg 
     64       1.1  mrg 
     65       1.1  mrg //#############################################################################
     66       1.1  mrg //#                       The Algorithm
     67       1.1  mrg //#
     68       1.1  mrg //# Operation: C=A/B
     69       1.1  mrg //# stage 1 - find the reciprocal 1/B according to the following scheme:
     70       1.1  mrg //#  B = (2^E)*m                                (1<m<2, E=e-127)
     71       1.1  mrg //#  1/B = 1/((2^E)*m) = 1/((2^(E+1))*m1)          (0.5<m1<1)
     72       1.1  mrg //#      = (2^-(E+1))*(1/m1) = (2^E1)*(1/m1)
     73       1.1  mrg //#
     74       1.1  mrg //# Now we can find the new exponent:
     75       1.1  mrg //# e1 = E1+127 = -E-1+127 = -e+127-1+127 = 253-e **
     76       1.1  mrg //# 1/m1 alreadt has the exponent 127, so we have to add 126-e.
     77       1.1  mrg //# the exponent might underflow, which we can detect as a sign change.
     78       1.1  mrg //# Since the architeture uses flush-to-zero for subnormals, we can
     79       1.1  mrg //# give the result 0. then.
     80       1.1  mrg //#
     81       1.1  mrg //# The 1/m1 term with 0.5<m1<1 is approximated with the Chebyshev polynomial
     82       1.1  mrg //# 1/m1 = 2.58586*(m1^2) - 5.81818*m1 + 4.24242
     83       1.1  mrg //#
     84       1.1  mrg //# Next step is to use two iterations of Newton-Raphson algorithm to complete
     85       1.1  mrg //# the reciprocal calculation.
     86       1.1  mrg //#
     87       1.1  mrg //# Final result is achieved by multiplying A with 1/B
     88       1.1  mrg //#############################################################################
     89       1.1  mrg 
     90       1.1  mrg 
     91       1.1  mrg 
     92       1.1  mrg // R0 exponent and sign "replacement" into TMP0
     93       1.1  mrg AND TMP0,R0,P0L		 ;
     94       1.1  mrg ORR TMP0,TMP0,P1L
     95       1.1  mrg SUB TMP5,R0,TMP0 // R0 sign/exponent extraction into TMP5
     96       1.1  mrg // Calculate new mantissa
     97       1.1  mrg FMADD P1H,TMP0,P0H	         ;
     98       1.1  mrg 		// Calculate new exponent offset 126 - "old exponent"
     99       1.1  mrg 		SUB P1L,P1L,TMP5
    100       1.1  mrg 	ldrd P0L , [ R1 , -1 ]
    101       1.1  mrg FMADD P0L,TMP0,P1H	         ;
    102       1.1  mrg 		eor P1H,r0,P1L // check for overflow (N-BIT).
    103       1.1  mrg 		blt .Lret_0
    104       1.1  mrg // P0L exponent and sign "replacement"
    105       1.1  mrg sub P0L,P0L,TMP5
    106       1.1  mrg 
    107       1.1  mrg // Newton-Raphson iteration #1
    108       1.1  mrg MOV TMP0,P0H	         ;
    109       1.1  mrg FMSUB P0H,R0,P0L	 ;
    110       1.1  mrg FMUL  P0L,P0H,P0L	 ;
    111       1.1  mrg // Newton-Raphson iteration #2
    112       1.1  mrg FMSUB TMP0,R0,P0L	;
    113       1.1  mrg FMUL  R0,TMP0,P0L	         ;
    114       1.1  mrg jr lr
    115       1.1  mrg .Lret_0:ldrd P0L , [ R1 , -3 ]
    116       1.1  mrg 	lsr TMP0,r0,31 ; extract sign
    117       1.1  mrg 	lsl TMP0,TMP0,31
    118       1.1  mrg 	add P0L,P0L,r0 ; check for NaN input
    119       1.1  mrg 	eor P0L,P0L,r0
    120       1.1  mrg 	movgte r0,TMP0
    121       1.1  mrg 	jr lr
    122       1.1  mrg // Quotient calculation is expected by the caller: FMUL quotient,divident,R0
    123       1.1  mrg         ;
    124       1.1  mrg 	ENDFUNC(__fast_recipsf2)
    125