Home | History | Annotate | Line # | Download | only in mmx
      1      1.1  mrg dnl  AMD K7 mpn_lshift -- mpn left shift.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Copyright 1999-2002 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21      1.1  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C K7: 1.21 cycles/limb (at 16 limbs/loop).
     35      1.1  mrg 
     36      1.1  mrg 
     37      1.1  mrg 
     38      1.1  mrg dnl  K7: UNROLL_COUNT cycles/limb
     39      1.1  mrg dnl           4           1.51
     40      1.1  mrg dnl           8           1.26
     41      1.1  mrg dnl          16           1.21
     42      1.1  mrg dnl          32           1.2
     43      1.1  mrg dnl  Maximum possible with the current code is 64.
     44      1.1  mrg 
     45      1.1  mrg deflit(UNROLL_COUNT, 16)
     46      1.1  mrg C Limbs per pass of the unrolled loop; its forloop emits UNROLL_COUNT/CHUNK_COUNT chunks.
     47      1.1  mrg 
     48      1.1  mrg C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
     49      1.1  mrg C                       unsigned shift);
     50      1.1  mrg C
     51      1.1  mrg C Shift src,size left by shift many bits and store the result in dst,size.
     52      1.1  mrg C Zeros are shifted in at the right.  The bits shifted out at the left are
     53      1.1  mrg C the return value.
     54      1.1  mrg C
     55      1.1  mrg C The comments in mpn_rshift apply here too.
     56      1.1  mrg C Sizes of at least UNROLL_THRESHOLD limbs use the unrolled code.  Both settings are currently the same.
     57      1.1  mrg ifdef(`PIC',`
     58      1.1  mrg deflit(UNROLL_THRESHOLD, 10)
     59      1.1  mrg ',`
     60      1.1  mrg deflit(UNROLL_THRESHOLD, 10)
     61      1.1  mrg ')
     62      1.1  mrg C Frame offsets of the four arguments in the prototype above.
     63      1.1  mrg defframe(PARAM_SHIFT,16)
     64      1.1  mrg defframe(PARAM_SIZE, 12)
     65      1.1  mrg defframe(PARAM_SRC,  8)
     66      1.1  mrg defframe(PARAM_DST,  4)
     67      1.1  mrg C Save slots for edi, esi and ebx within the SAVE_SIZE bytes taken from esp at entry.
     68      1.1  mrg defframe(SAVE_EDI, -4)
     69      1.1  mrg defframe(SAVE_ESI, -8)
     70      1.1  mrg defframe(SAVE_EBX, -12)
     71      1.1  mrg deflit(SAVE_SIZE, 12)
     72      1.1  mrg 
     73      1.1  mrg 	TEXT
     74      1.1  mrg 	ALIGN(32)
     75      1.1  mrg 
     76      1.1  mrg PROLOGUE(mpn_lshift)
     77      1.1  mrg deflit(`FRAME',0)
     78      1.1  mrg 	C Load the arguments and make room for the register save slots.
     79      1.1  mrg 	movl	PARAM_SIZE, %eax	C eax = size
     80      1.1  mrg 	movl	PARAM_SRC, %edx		C edx = src
     81      1.1  mrg 	subl	$SAVE_SIZE, %esp
     82      1.1  mrg deflit(`FRAME',SAVE_SIZE)
     83      1.1  mrg 
     84      1.1  mrg 	movl	PARAM_SHIFT, %ecx	C ecx = shift
     85      1.1  mrg 	movl	%edi, SAVE_EDI
     86      1.1  mrg 
     87      1.1  mrg 	movl	PARAM_DST, %edi		C edi = dst
     88      1.1  mrg 	decl	%eax			C eax = size-1
     89      1.1  mrg 	jnz	L(more_than_one_limb)
     90      1.1  mrg 	C One limb only: done with integer shifts, the MMX registers are not touched.
     91      1.1  mrg 	movl	(%edx), %edx		C edx = src[0]
     92      1.1  mrg 
     93      1.1  mrg 	shldl(	%cl, %edx, %eax)	C eax was decremented to zero, gets the return value
     94      1.1  mrg 
     95      1.1  mrg 	shll	%cl, %edx
     96      1.1  mrg 
     97      1.1  mrg 	movl	%edx, (%edi)		C dst[0]
     98      1.1  mrg 	movl	SAVE_EDI, %edi
     99      1.1  mrg 	addl	$SAVE_SIZE, %esp
    100      1.1  mrg 
    101      1.1  mrg 	ret
    102      1.1  mrg 
    103      1.1  mrg 
    104      1.1  mrg C -----------------------------------------------------------------------------
    105      1.1  mrg L(more_than_one_limb):
    106      1.1  mrg 	C eax	size-1
    107      1.1  mrg 	C ebx
    108      1.1  mrg 	C ecx	shift
    109      1.1  mrg 	C edx	src
    110      1.1  mrg 	C esi
    111      1.1  mrg 	C edi	dst
    112      1.1  mrg 	C ebp
    113      1.1  mrg 	C mm6 = shift count and mm5 = src[size-1] for both multi-limb paths.
    114      1.1  mrg 	movd	PARAM_SHIFT, %mm6
    115      1.1  mrg 	movd	(%edx,%eax,4), %mm5	C src high limb
    116      1.1  mrg 	cmp	$UNROLL_THRESHOLD-1, %eax	C size >= UNROLL_THRESHOLD ?
    117      1.1  mrg 
    118      1.1  mrg 	jae	L(unroll)
    119      1.1  mrg 	negl	%ecx
    120      1.1  mrg 	movd	(%edx), %mm4		C src low limb
    121      1.1  mrg 
    122      1.1  mrg 	addl	$32, %ecx		C ecx = 32-shift
    123      1.1  mrg 
    124      1.1  mrg 	movd	%ecx, %mm7
    125      1.1  mrg 	C Each pass forms one dst limb from a qword of two adjacent src limbs, from the high end down to dst[1].
    126      1.1  mrg L(simple_top):
    127      1.1  mrg 	C eax	loop counter, limbs
    128      1.1  mrg 	C ebx
    129      1.1  mrg 	C ecx	32-shift (copied to mm7)
    130      1.1  mrg 	C edx	src
    131      1.1  mrg 	C esi
    132      1.1  mrg 	C edi	dst
    133      1.1  mrg 	C ebp
    134      1.1  mrg 	C
    135      1.1  mrg 	C mm0	scratch
    136      1.1  mrg 	C mm4	src low limb
    137      1.1  mrg 	C mm5	src high limb
    138      1.1  mrg 	C mm6	shift
    139      1.1  mrg 	C mm7	32-shift
    140      1.1  mrg 
    141      1.1  mrg 	movq	-4(%edx,%eax,4), %mm0	C src[eax-1] low, src[eax] high
    142      1.1  mrg 	decl	%eax			C sets the flags for the jnz below
    143      1.1  mrg 
    144      1.1  mrg 	psrlq	%mm7, %mm0		C low dword is now the dst limb
    145      1.1  mrg 
    146      1.1  mrg 	movd	%mm0, 4(%edi,%eax,4)
    147      1.1  mrg 	jnz	L(simple_top)
    148      1.1  mrg 
    149      1.1  mrg 	C Finish with dst[0] from the low limb and the return value from the high limb.
    150      1.1  mrg 	psllq	%mm6, %mm5
    151      1.1  mrg 	psllq	%mm6, %mm4
    152      1.1  mrg 
    153      1.1  mrg 	psrlq	$32, %mm5		C bits shifted out of the high limb
    154      1.1  mrg 	movd	%mm4, (%edi)		C dst low limb
    155      1.1  mrg 
    156      1.1  mrg 	movd	%mm5, %eax		C return value
    157      1.1  mrg 
    158      1.1  mrg 	movl	SAVE_EDI, %edi
    159      1.1  mrg 	addl	$SAVE_SIZE, %esp
    160      1.1  mrg 	emms
    161      1.1  mrg 
    162      1.1  mrg 	ret
    163      1.1  mrg 
    164      1.1  mrg 
    165      1.1  mrg C -----------------------------------------------------------------------------
    166      1.1  mrg 	ALIGN(16)
    167      1.1  mrg L(unroll):
    168      1.1  mrg 	C eax	size-1
    169      1.1  mrg 	C ebx	(saved below)
    170      1.1  mrg 	C ecx	shift
    171      1.1  mrg 	C edx	src
    172      1.1  mrg 	C esi	(saved below)
    173      1.1  mrg 	C edi	dst
    174      1.1  mrg 	C ebp
    175      1.1  mrg 	C
    176      1.1  mrg 	C mm5	src high limb, for return value
    177      1.1  mrg 	C mm6	lshift
    178      1.1  mrg 	C Save esi and ebx, then point edx at the top two src limbs.
    179      1.1  mrg 	movl	%esi, SAVE_ESI
    180      1.1  mrg 	movl	%ebx, SAVE_EBX
    181      1.1  mrg 	leal	-4(%edx,%eax,4), %edx   C &src[size-2]
    182      1.1  mrg 
    183      1.1  mrg 	testb	$4, %dl			C bit 2 set means edx is 4mod8
    184      1.1  mrg 	movq	(%edx), %mm1		C src high qword
    185      1.1  mrg 
    186      1.1  mrg 	jz	L(start_src_aligned)
    187      1.1  mrg 
    188      1.1  mrg 
    189      1.1  mrg 	C src isn't aligned, process high limb (marked xxx) separately to
    190      1.1  mrg 	C make it so
    191      1.1  mrg 	C
    192      1.1  mrg 	C  source   -4(%edx,%eax,4)
    193      1.1  mrg 	C                  |
    194      1.1  mrg 	C  +-------+-------+-------+--
    195      1.1  mrg 	C  |  xxx          |
    196      1.1  mrg 	C  +-------+-------+-------+--
    197      1.1  mrg 	C        0mod8   4mod8   0mod8
    198      1.1  mrg 	C
    199      1.1  mrg 	C  dest     -4(%edi,%eax,4)
    200      1.1  mrg 	C                  |
    201      1.1  mrg 	C  +-------+-------+--
    202      1.1  mrg 	C  |  xxx  |       |
    203      1.1  mrg 	C  +-------+-------+--
    204      1.1  mrg 
    205      1.1  mrg 	psllq	%mm6, %mm1
    206      1.1  mrg 	subl	$4, %edx
    207      1.1  mrg 	movl	%eax, PARAM_SIZE	C size-1, the value reloaded for use at the end
    208      1.1  mrg 
    209      1.1  mrg 	psrlq	$32, %mm1
    210      1.1  mrg 	decl	%eax			C size-2 is new size-1
    211      1.1  mrg 
    212      1.1  mrg 	movd	%mm1, 4(%edi,%eax,4)	C store the xxx limb
    213      1.1  mrg 	movq	(%edx), %mm1		C new src high qword
    214      1.1  mrg L(start_src_aligned):
    215      1.1  mrg 
    216      1.1  mrg 	C Point edi at the top two dst limbs and turn mm5 into the return value.
    217      1.1  mrg 	leal	-4(%edi,%eax,4), %edi   C &dst[size-2]
    218      1.1  mrg 	psllq	%mm6, %mm5
    219      1.1  mrg 
    220      1.1  mrg 	testl	$4, %edi
    221      1.1  mrg 	psrlq	$32, %mm5		C return value
    222      1.1  mrg 
    223      1.1  mrg 	jz	L(start_dst_aligned)
    224      1.1  mrg 
    225      1.1  mrg 
    226      1.1  mrg 	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
    227      1.1  mrg 	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
    228      1.1  mrg 	C here separately.
    229      1.1  mrg 	C
    230      1.1  mrg 	C  source       %edx
    231      1.1  mrg 	C  +-------+-------+--
    232      1.1  mrg 	C  |      mm1      |
    233      1.1  mrg 	C  +-------+-------+--
    234      1.1  mrg 	C                0mod8   4mod8
    235      1.1  mrg 	C
    236      1.1  mrg 	C  dest         %edi
    237      1.1  mrg 	C  +-------+-------+-------+--
    238      1.1  mrg 	C  |  xxx  |
    239      1.1  mrg 	C  +-------+-------+-------+--
    240      1.1  mrg 	C        0mod8   4mod8   0mod8
    241      1.1  mrg 
    242      1.1  mrg 	movq	%mm1, %mm0		C keep the unshifted qword
    243      1.1  mrg 	psllq	%mm6, %mm1
    244      1.1  mrg 	addl	$32, %ecx		C shift+32
    245      1.1  mrg 
    246      1.1  mrg 	psrlq	$32, %mm1
    247      1.1  mrg 
    248      1.1  mrg 	movd	%mm1, 4(%edi)		C store the xxx limb
    249      1.1  mrg 	movq	%mm0, %mm1		C unshifted qword back in mm1
    250      1.1  mrg 	subl	$4, %edi
    251      1.1  mrg 
    252      1.1  mrg 	movd	%ecx, %mm6		C new lshift
    253      1.1  mrg L(start_dst_aligned):
    254      1.1  mrg 	C Work out the entry point and pointer offsets for the computed jump into the unrolled loop.
    255      1.1  mrg 	decl	%eax			C size-2, two last limbs handled at end
    256      1.1  mrg 	movq	%mm1, %mm2		C copy of src high qword
    257      1.1  mrg 	negl	%ecx
    258      1.1  mrg 
    259      1.1  mrg 	andl	$-2, %eax		C round size down to even
    260      1.1  mrg 	addl	$64, %ecx
    261      1.1  mrg 
    262      1.1  mrg 	movl	%eax, %ebx
    263      1.1  mrg 	negl	%eax
    264      1.1  mrg 
    265      1.1  mrg 	andl	$UNROLL_MASK, %eax
    266      1.1  mrg 	decl	%ebx
    267      1.1  mrg 
    268      1.1  mrg 	shll	%eax			C doubled so the leal scalings give 10 bytes of code and 4 of data per limb
    269      1.1  mrg 
    270      1.1  mrg 	movd	%ecx, %mm7		C rshift = 64-lshift
    271      1.1  mrg 
    272      1.1  mrg ifdef(`PIC',`
    273      1.1  mrg 	call	L(pic_calc)
    274      1.1  mrg L(here):
    275      1.1  mrg ',`
    276      1.1  mrg 	leal	L(entry) (%eax,%eax,4), %esi
    277      1.1  mrg ')
    278      1.1  mrg 	shrl	$UNROLL_LOG2, %ebx	C loop counter
    279      1.1  mrg 
    280      1.1  mrg 	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
    281      1.1  mrg 	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
    282      1.1  mrg 	movl	PARAM_SIZE, %eax	C for use at end
    283      1.1  mrg 	jmp	*%esi
    284      1.1  mrg 
    285      1.1  mrg 
    286      1.1  mrg ifdef(`PIC',`
    287      1.1  mrg L(pic_calc):
    288      1.1  mrg 	C See mpn/x86/README about old gas bugs
    289      1.1  mrg 	leal	(%eax,%eax,4), %esi
    290      1.1  mrg 	addl	$L(entry)-L(here), %esi
    291      1.1  mrg 	addl	(%esp), %esi
    292      1.1  mrg 
    293      1.1  mrg 	ret_internal
    294      1.1  mrg ')
    295      1.1  mrg 
    296      1.1  mrg 
    297      1.1  mrg C -----------------------------------------------------------------------------
    298      1.1  mrg 	ALIGN(32)
    299      1.1  mrg L(top):
    300      1.1  mrg 	C eax	size (for use at end)
    301      1.1  mrg 	C ebx	loop counter
    302      1.1  mrg 	C ecx	rshift
    303      1.1  mrg 	C edx	src
    304      1.1  mrg 	C esi	computed jump
    305      1.1  mrg 	C edi	dst
    306      1.1  mrg 	C ebp
    307      1.1  mrg 	C
    308      1.1  mrg 	C mm0	scratch
    309      1.1  mrg 	C mm1	\ carry (alternating, mm2 first)
    310      1.1  mrg 	C mm2	/
    311      1.1  mrg 	C mm6	lshift
    312      1.1  mrg 	C mm7	rshift
    313      1.1  mrg 	C
    314      1.1  mrg 	C 10 code bytes/limb
    315      1.1  mrg 	C
    316      1.1  mrg 	C The two chunks differ in whether mm1 or mm2 holds the carry.
    317      1.1  mrg 	C The computed jump puts the initial carry in both mm1 and mm2.
    318      1.1  mrg 	C L(entry) is the base address used by the computed jump; each forloop iteration below emits one chunk.
    319      1.1  mrg L(entry):
    320      1.1  mrg deflit(CHUNK_COUNT, 4)
    321      1.1  mrg forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
    322      1.1  mrg 	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
    323      1.1  mrg 	deflit(`disp1', eval(disp0 - 8))
    324      1.1  mrg 
    325      1.1  mrg Zdisp(	movq,	disp0,(%edx), %mm0)
    326      1.1  mrg 	psllq	%mm6, %mm2
    327      1.1  mrg 
    328      1.1  mrg 	movq	%mm0, %mm1
    329      1.1  mrg 	psrlq	%mm7, %mm0
    330      1.1  mrg 
    331      1.1  mrg 	por	%mm2, %mm0
    332      1.1  mrg Zdisp(	movq,	%mm0, disp0,(%edi))
    333      1.1  mrg 
    334      1.1  mrg 
    335      1.1  mrg Zdisp(	movq,	disp1,(%edx), %mm0)
    336      1.1  mrg 	psllq	%mm6, %mm1
    337      1.1  mrg 
    338      1.1  mrg 	movq	%mm0, %mm2
    339      1.1  mrg 	psrlq	%mm7, %mm0
    340      1.1  mrg 
    341      1.1  mrg 	por	%mm1, %mm0
    342      1.1  mrg Zdisp(	movq,	%mm0, disp1,(%edi))
    343      1.1  mrg ')
    344      1.1  mrg 
    345      1.1  mrg 	subl	$UNROLL_BYTES, %edx	C step src down for the next pass
    346      1.1  mrg 	subl	$UNROLL_BYTES, %edi	C and dst likewise
    347      1.1  mrg 	decl	%ebx
    348      1.1  mrg 
    349      1.1  mrg 	jns	L(top)			C until the loop counter goes negative
    350      1.1  mrg 
    351      1.1  mrg 
    352      1.1  mrg C disp(n) is n with the -128 bias applied when UNROLL_BYTES is 256, as for disp0 in the loop; presumably empty when that comes to zero.
    353      1.1  mrg define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
    354      1.1  mrg 
    355      1.1  mrg L(end):
    356      1.1  mrg 	testb	$1, %al		C low bit of the size reloaded from PARAM_SIZE
    357      1.1  mrg 	movl	SAVE_EBX, %ebx
    358      1.1  mrg 	psllq	%mm6, %mm2	C wanted left shifted in all cases below
    359      1.1  mrg 
    360      1.1  mrg 	movd	%mm5, %eax	C return value
    361      1.1  mrg 
    362      1.1  mrg 	movl	SAVE_ESI, %esi
    363      1.1  mrg 	jz	L(end_even)	C flags are still those of the testb above
    364      1.1  mrg 
    365      1.1  mrg 
    366      1.1  mrg L(end_odd):
    367      1.1  mrg 
    368      1.1  mrg 	C Size odd, destination was aligned.
    369      1.1  mrg 	C
    370      1.1  mrg 	C                 source        edx+8   edx+4
    371      1.1  mrg 	C                 --+---------------+-------+
    372      1.1  mrg 	C                   |      mm2      |       |
    373      1.1  mrg 	C                 --+---------------+-------+
    374      1.1  mrg 	C
    375      1.1  mrg 	C dest                            edi
    376      1.1  mrg 	C --+---------------+---------------+-------+
    377      1.1  mrg 	C   |   written     |               |       |
    378      1.1  mrg 	C --+---------------+---------------+-------+
    379      1.1  mrg 	C
    380      1.1  mrg 	C mm6 = shift
    381      1.1  mrg 	C mm7 = ecx = 64-shift
    382      1.1  mrg 
    383      1.1  mrg 
    384      1.1  mrg 	C Size odd, destination was unaligned.
    385      1.1  mrg 	C
    386      1.1  mrg 	C                 source        edx+8   edx+4
    387      1.1  mrg 	C                 --+---------------+-------+
    388      1.1  mrg 	C                   |      mm2      |       |
    389      1.1  mrg 	C                 --+---------------+-------+
    390      1.1  mrg 	C
    391      1.1  mrg 	C         dest                            edi
    392      1.1  mrg 	C         --+---------------+---------------+
    393      1.1  mrg 	C           |   written     |               |
    394      1.1  mrg 	C         --+---------------+---------------+
    395      1.1  mrg 	C
    396      1.1  mrg 	C mm6 = shift+32
    397      1.1  mrg 	C mm7 = ecx = 64-(shift+32)
    398      1.1  mrg 
    399      1.1  mrg 
    400      1.1  mrg 	C In both cases there's one extra limb of src to fetch and combine
    401      1.1  mrg 	C with mm2 to make a qword at (%edi), and in the aligned case
    402      1.1  mrg 	C there's an extra limb of dst to be formed from that extra src limb
    403      1.1  mrg 	C left shifted.
    404      1.1  mrg 
    405      1.1  mrg 	movd	disp(4) (%edx), %mm0	C the extra src limb
    406      1.1  mrg 	testb	$32, %cl		C bit 5 of ecx: set if aligned, clear if unaligned, assuming shift is 1 to 31
    407      1.1  mrg 
    408      1.1  mrg 	movq	%mm0, %mm1
    409      1.1  mrg 	psllq	$32, %mm0
    410      1.1  mrg 
    411      1.1  mrg 	psrlq	%mm7, %mm0
    412      1.1  mrg 	psllq	%mm6, %mm1
    413      1.1  mrg 
    414      1.1  mrg 	por	%mm2, %mm0
    415      1.1  mrg 
    416      1.1  mrg 	movq	%mm0, disp(0) (%edi)
    417      1.1  mrg 	jz	L(end_odd_unaligned)
    418      1.1  mrg 	movd	%mm1, disp(-4) (%edi)	C aligned case only, the extra dst limb
    419      1.1  mrg L(end_odd_unaligned):
    420      1.1  mrg 
    421      1.1  mrg 	movl	SAVE_EDI, %edi
    422      1.1  mrg 	addl	$SAVE_SIZE, %esp
    423      1.1  mrg 	emms
    424      1.1  mrg 
    425      1.1  mrg 	ret
    426      1.1  mrg 
    427      1.1  mrg 
    428      1.1  mrg L(end_even):
    429      1.1  mrg 
    430      1.1  mrg 	C Size even, destination was aligned.
    431      1.1  mrg 	C
    432      1.1  mrg 	C                 source        edx+8
    433      1.1  mrg 	C                 --+---------------+
    434      1.1  mrg 	C                   |      mm2      |
    435      1.1  mrg 	C                 --+---------------+
    436      1.1  mrg 	C
    437      1.1  mrg 	C dest                            edi
    438      1.1  mrg 	C --+---------------+---------------+
    439      1.1  mrg 	C   |   written     |               |
    440      1.1  mrg 	C --+---------------+---------------+
    441      1.1  mrg 	C
    442      1.1  mrg 	C mm6 = shift
    443      1.1  mrg 	C mm7 = ecx = 64-shift
    444      1.1  mrg 
    445      1.1  mrg 
    446      1.1  mrg 	C Size even, destination was unaligned.
    447      1.1  mrg 	C
    448      1.1  mrg 	C               source          edx+8
    449      1.1  mrg 	C                 --+---------------+
    450      1.1  mrg 	C                   |      mm2      |
    451      1.1  mrg 	C                 --+---------------+
    452      1.1  mrg 	C
    453      1.1  mrg 	C         dest                  edi+4
    454      1.1  mrg 	C         --+---------------+-------+
    455      1.1  mrg 	C           |    written    |       |
    456      1.1  mrg 	C         --+---------------+-------+
    457      1.1  mrg 	C
    458      1.1  mrg 	C mm6 = shift+32
    459      1.1  mrg 	C mm7 = ecx = 64-(shift+32)
    460      1.1  mrg 
    461      1.1  mrg 
    462      1.1  mrg 	C The movq for the aligned case overwrites the movd for the
    463      1.1  mrg 	C unaligned case.
    464      1.1  mrg 
    465      1.1  mrg 	movq	%mm2, %mm0
    466      1.1  mrg 	psrlq	$32, %mm2
    467      1.1  mrg 
    468      1.1  mrg 	testb	$32, %cl		C bit 5 of ecx: set if aligned, clear if unaligned, assuming shift is 1 to 31
    469      1.1  mrg 	movd	%mm2, disp(4) (%edi)
    470      1.1  mrg 
    471      1.1  mrg 	jz	L(end_even_unaligned)
    472      1.1  mrg 	movq	%mm0, disp(0) (%edi)	C aligned case only
    473      1.1  mrg L(end_even_unaligned):
    474      1.1  mrg 
    475      1.1  mrg 	movl	SAVE_EDI, %edi
    476      1.1  mrg 	addl	$SAVE_SIZE, %esp
    477      1.1  mrg 	emms
    478      1.1  mrg 
    479      1.1  mrg 	ret
    480      1.1  mrg 
    481      1.1  mrg EPILOGUE()
    482