Home | History | Annotate | Line # | Download | only in i386
floatundisf.S revision 1.1.1.3.4.2
      1 // This file is dual licensed under the MIT and the University of Illinois Open
      2 // Source Licenses. See LICENSE.TXT for details.
      3 
      4 #include "../assembly.h"
      5 
      6 // float __floatundisf(du_int a);
      7 
      8 // Note that there is a hardware instruction, fildll, that does most of what
      9 // this function needs to do.  However, because of our ia32 ABI, it will take
     10 // a write-small read-large stall, so the software implementation here is
     11 // actually several cycles faster.
     12 
     13 // This is a branch-free implementation.  A branchy implementation might be
     14 // faster for the common case if you know something a priori about the input
     15 // distribution.
     16 
     17 /* Disabled (commented-out) branch-free x87 implementation - one cycle slower than the non-x87 version that follows this comment.
     18 
     19 #ifdef __i386__
     20 
     21 .const
     22 .balign 8
     23 
     24 		.quad	0x43f0000000000000
     25 twop64:	.quad	0x0000000000000000
     26 
     27 #define			TWOp64			twop64-0b(%ecx,%eax,8)
     28 
     29 .text
     30 .balign 4
     31 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
     32 	movl		8(%esp),		%eax
     33 	movd		8(%esp),		%xmm1
     34 	movd		4(%esp),		%xmm0
     35 	punpckldq	%xmm1,			%xmm0
     36 	calll		0f
     37 0:	popl		%ecx
     38 	sarl		$31,			%eax
     39 	movq		%xmm0,			4(%esp)
     40 	fildll		4(%esp)
     41 	faddl		TWOp64
     42 	fstps		4(%esp)
     43 	flds		4(%esp)
     44 	ret
     45 END_COMPILERRT_FUNCTION(__floatundisf)
     46 
     47 #endif // __i386__
     48 
     49 */
     50 
     51 /* branch-free implementation that converts with SSE2 instead of x87 (x87 is used only by the final flds that returns the result) - faster at the expense of code size */
     52 
     53 #ifdef __i386__
     54 
     55 #if defined(__APPLE__)
     56 	.const
     57 #elif defined(__ELF__)
     58 	.section .rodata
     59 #else
     60 	.section .rdata,"rd"
     61 #endif
     62 
        // Read-only constants used by __floatundisf.
        //
        // Layout matters: STICKY is addressed with %eax = 0 or -1 and a scale
        // of 8, so it reads either the quad at sticky or the quad at sticky-8.
        // The quad at sticky-8 is the 0xfff mask that ends the twop52 block.
        // twop52 starts 16-byte aligned and is exactly 16 bytes long, so the
        // .balign 16 in front of sticky inserts no padding between them.
     63 	.balign 16
     64 twop52:
     65 	.quad 0x4330000000000000	// bit pattern of the double 0x1.0p52
     66 	.quad 0x0000000000000fff	// low-12-bit mask; loaded via STICKY when %eax == -1
     67 
     68 	.balign 16
     69 sticky:
     70 	.quad 0x0000000000000000	// zero mask; loaded via STICKY when %eax == 0
     71 	.long 0x00000012	// NOTE(review): not referenced by any code visible in this file
     72 
     73 	.balign 16
     74 twelve:
     75 	.long 0x00000000	// NOTE(review): label not referenced by any code visible in this file
     76 
        // Both operands are relative to local label 0 in the function body,
        // whose run-time address is held in %ecx (calll 0f / popl %ecx).
     77 #define			TWOp52			twop52-0b(%ecx)
     78 #define			STICKY			sticky-0b(%ecx,%eax,8)
     79 
     80 .text
     81 .balign 4
        // float __floatundisf(du_int a)
        //
        // Converts the unsigned 64-bit integer argument to float, branch-free.
        //
        // In:    4(%esp) = low 32 bits of a, 8(%esp) = high 32 bits of a
        // Out:   result loaded onto the x87 stack by the final flds
        // Uses:  %eax, %ecx, %edx, %xmm0-%xmm3; the argument slot at 4(%esp)
        //        is overwritten with the float result before it is loaded.
        //
        // Method: "big input" below means a >= 2^52 (the high word is at
        // least 0x00100000). A small input is OR-ed into the mantissa of
        // 0x1.0p52 and 0x1.0p52 is subtracted, giving the input as a double.
        // A big input is first shifted right by 12, with its low 12 bits
        // OR-ed back into the low bits of the shifted value so that they
        // still take part in the rounding done by cvtsd2ss; afterwards 12 is
        // added to the exponent field of the float to undo the shift.
     82 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
     83 	movl		8(%esp),		%eax	// high 32 bits of input
     84 	movd		8(%esp),		%xmm1	// high 32 bits of input
     85 	movd		4(%esp),		%xmm0	// low 32 bits of input
     86 	punpckldq	%xmm1,			%xmm0	// xmm0 = 64-bit input (high:low)
     87 
     88 	calll		0f	// pushes the address of label 0
     89 0:	popl		%ecx	// ecx = address of label 0, base for TWOp52 / STICKY
     90 	shrl		%eax					// high 31 bits of input as sint32
     91 	addl		$0x7ff80000,	%eax	// sign bit becomes set iff input >= 2^52
     92 	sarl		$31,			%eax	// (big input) ? -1 : 0
     93 	movsd		STICKY,			%xmm1	// (big input) ? 0xfff : 0
     94 	movl		$12,			%edx
     95 	andl		%eax,			%edx	// (big input) ? 12 : 0
     96 	movd		%edx,			%xmm3	// xmm3 = shift count
     97 	andpd		%xmm0,			%xmm1	// (big input) ? input & 0xfff : 0
     98 	movsd		TWOp52,			%xmm2	// 0x1.0p52
     99 	psrlq		%xmm3,			%xmm0	// (big input) ? input >> 12 : input
    100 	orpd		%xmm2,			%xmm1	// 0x1.0p52 + ((big input) ? input & 0xfff : 0)
    101 	orpd		%xmm1,			%xmm0	// 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
    102 	subsd		%xmm2,			%xmm0	// (double)((big input) ? (input >> 12 | input & 0xfff) : input)
    103 	cvtsd2ss	%xmm0,			%xmm0	// (float)((big input) ? (input >> 12 | input & 0xfff) : input)
    104 	pslld		$23,			%xmm3	// move the shift count (12 or 0) into the float exponent field
    105 	paddd		%xmm3,			%xmm0	// (float)input
    106 	movd		%xmm0,			4(%esp)	// spill the float bits so they can be loaded by flds
    107 	flds		4(%esp)	// push the result onto the x87 stack for the caller
    108 	ret
    109 END_COMPILERRT_FUNCTION(__floatundisf)
    110 
    111 #endif // __i386__
    112