epiphany/ieee-754/fast_div.S

1.1.1.11  mrg /* Copyright (C) 2011-2024 Free Software Foundation, Inc.
     1.1  mrg    Contributed by Embecosm on behalf of Adapteva, Inc.
     1.1  mrg
     1.1  mrg This file is part of GCC.
     1.1  mrg
     1.1  mrg GCC is free software; you can redistribute it and/or modify it under
     1.1  mrg the terms of the GNU General Public License as published by the Free
     1.1  mrg Software Foundation; either version 3, or (at your option) any later
     1.1  mrg version.
     1.1  mrg
     1.1  mrg GCC is distributed in the hope that it will be useful, but WITHOUT ANY
     1.1  mrg WARRANTY; without even the implied warranty of MERCHANTABILITY or
     1.1  mrg FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     1.1  mrg for more details.
     1.1  mrg
     1.1  mrg Under Section 7 of GPL version 3, you are granted additional
     1.1  mrg permissions described in the GCC Runtime Library Exception, version
     1.1  mrg 3.1, as published by the Free Software Foundation.
     1.1  mrg
     1.1  mrg You should have received a copy of the GNU General Public License and
     1.1  mrg a copy of the GCC Runtime Library Exception along with this program;
     1.1  mrg see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     1.1  mrg <http://www.gnu.org/licenses/>.  */
     1.1  mrg
     1.1  mrg #include "../epiphany-asm.h"
     1.1  mrg
     1.1  mrg .section _fast_div_text,"a",@progbits;
     1.1  mrg   .balign 8;
     1.1  mrg _fast_div_table:
     1.1  mrg .word 0x007fffff//   mantissa mask
     1.1  mrg .word 0x40257ebb//   hold constant a = 2.58586
     1.1  mrg
     1.1  mrg .word 0x3f000000//   hold constant 126 shifted to bits [30:23]
     1.1  mrg .word 0xc0ba2e88//   hold constant b = -5.81818
     1.1  mrg
     1.1  mrg .word 0x4087c1e8//   hold constant c = 4.24242
     1.1  mrg .word 0x40000000//  to hold constant 2 for Newton-Raphson iterations
     1.1  mrg
     1.1  mrg  .global SYM(__fast_recipsf2)
     1.1  mrg  FUNC(__fast_recipsf2)
     1.1  mrg SYM(__fast_recipsf2):
     1.1  mrg
     1.1  mrg //###################
     1.1  mrg //# input operands:
     1.1  mrg //###################
     1.1  mrg // Divisor
     1.1  mrg //R0
     1.1  mrg // Function address (used with negative offsets to read _fast_div_table)
     1.1  mrg //R1
     1.1  mrg /* Scratch registers:  two single (TMP0/TMP5) and two pairs.  */
     1.1  mrg #define P0L TMP1
     1.1  mrg #define P0H TMP2
     1.1  mrg #define P1L TMP3
     1.1  mrg #define P1H TMP4
     1.1  mrg
     1.1  mrg //#########################################
     1.1  mrg //# Constants to be used in the algorithm
     1.1  mrg //#########################################
     1.1  mrg ldrd P0L , [ R1 , -3 ]
     1.1  mrg
     1.1  mrg ldrd P1L , [ R1 , -2 ]
     1.1  mrg
     1.1  mrg
     1.1  mrg
     1.1  mrg //#############################################################################
     1.1  mrg //#                       The Algorithm
     1.1  mrg //#
     1.1  mrg //# Operation: C=A/B
     1.1  mrg //# stage 1 - find the reciprocal 1/B according to the following scheme:
     1.1  mrg //#  B = (2^E)*m                                (1<m<2, E=e-127)
     1.1  mrg //#  1/B = 1/((2^E)*m) = 1/((2^(E+1))*m1)          (0.5<m1<1)
     1.1  mrg //#      = (2^-(E+1))*(1/m1) = (2^E1)*(1/m1)
     1.1  mrg //#
     1.1  mrg //# Now we can find the new exponent:
     1.1  mrg //# e1 = E1+127 = -E-1+127 = -e+127-1+127 = 253-e **
     1.1  mrg //# 1/m1 alreadt has the exponent 127, so we have to add 126-e.
     1.1  mrg //# the exponent might underflow, which we can detect as a sign change.
     1.1  mrg //# Since the architeture uses flush-to-zero for subnormals, we can
     1.1  mrg //# give the result 0. then.
     1.1  mrg //#
     1.1  mrg //# The 1/m1 term with 0.5<m1<1 is approximated with the Chebyshev polynomial
     1.1  mrg //# 1/m1 = 2.58586*(m1^2) - 5.81818*m1 + 4.24242
     1.1  mrg //#
     1.1  mrg //# Next step is to use two iterations of Newton-Raphson algorithm to complete
     1.1  mrg //# the reciprocal calculation.
     1.1  mrg //#
     1.1  mrg //# Final result is achieved by multiplying A with 1/B
     1.1  mrg //#############################################################################
     1.1  mrg
     1.1  mrg
     1.1  mrg
     1.1  mrg // R0 exponent and sign "replacement" into TMP0
     1.1  mrg AND TMP0,R0,P0L		 ;
     1.1  mrg ORR TMP0,TMP0,P1L
     1.1  mrg SUB TMP5,R0,TMP0 // R0 sign/exponent extraction into TMP5
     1.1  mrg // Calculate new mantissa
     1.1  mrg FMADD P1H,TMP0,P0H	         ;
     1.1  mrg 		// Calculate new exponent offset 126 - "old exponent"
     1.1  mrg 		SUB P1L,P1L,TMP5
     1.1  mrg 	ldrd P0L , [ R1 , -1 ]
     1.1  mrg FMADD P0L,TMP0,P1H	         ;
     1.1  mrg 		eor P1H,r0,P1L // check for overflow (N-BIT).
     1.1  mrg 		blt .Lret_0
     1.1  mrg // P0L exponent and sign "replacement"
     1.1  mrg sub P0L,P0L,TMP5
     1.1  mrg
     1.1  mrg // Newton-Raphson iteration #1
     1.1  mrg MOV TMP0,P0H	         ;
     1.1  mrg FMSUB P0H,R0,P0L	 ;
     1.1  mrg FMUL  P0L,P0H,P0L	 ;
     1.1  mrg // Newton-Raphson iteration #2
     1.1  mrg FMSUB TMP0,R0,P0L	;
     1.1  mrg FMUL  R0,TMP0,P0L	         ;
     1.1  mrg jr lr
     1.1  mrg .Lret_0:ldrd P0L , [ R1 , -3 ]
     1.1  mrg 	lsr TMP0,r0,31 ; extract sign
     1.1  mrg 	lsl TMP0,TMP0,31
     1.1  mrg 	add P0L,P0L,r0 ; check for NaN input
     1.1  mrg 	eor P0L,P0L,r0
     1.1  mrg 	movgte r0,TMP0
     1.1  mrg 	jr lr
     1.1  mrg // Quotient calculation is expected by the caller: FMUL quotient,divident,R0
     1.1  mrg         ;
     1.1  mrg 	ENDFUNC(__fast_recipsf2)