Home | History | Annotate | Line # | Download | only in pentium
      1 dnl  Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
      2 
      3 dnl  Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C P5: 14.2 cycles/crossproduct (approx)
     35 
     36 
     37 C void mpn_mul_basecase (mp_ptr wp,
     38 C                        mp_srcptr xp, mp_size_t xsize,
     39 C                        mp_srcptr yp, mp_size_t ysize);
     40 
     41 defframe(PARAM_YSIZE, 20)	C ysize, fifth argument of the prototype
     42 defframe(PARAM_YP,    16)	C yp, fourth argument
     43 defframe(PARAM_XSIZE, 12)	C xsize, third argument
     44 defframe(PARAM_XP,    8)	C xp, second argument
     45 defframe(PARAM_WP,    4)	C wp, first argument
     46 
        C Counter of outer-loop rows still to do, kept in memory.  The function
        C reserves one stack slot for it with a dummy pushl at entry.
        C NOTE(review): defframe is defined outside this file; the offsets look
        C like byte offsets from the entry value of %esp, with the return
        C address at 0 and the dummy-push slot at -4 - confirm.
     47 defframe(VAR_COUNTER, -4)
     48 
     49 	TEXT
     50 	ALIGN(8)
        C mpn_mul_basecase: schoolbook multiplication of the xsize-limb number
        C at xp by the ysize-limb number at yp.  The code below writes
        C xsize+ysize limbs starting at wp.
        C
        C Structure:
        C   1. wp[0] = low limb of xp[0]*yp[0]; if xsize is 1 the high limb goes
        C      to wp[1] and the routine returns via the done label.
        C   2. oop1 finishes the first row, xp[1..xsize-1] * yp[0], storing
        C      (not adding) into wp[1..xsize].
        C   3. outer/oop2 runs ysize-1 more times, each pass adding
        C      xp[0..xsize-1] * yp[i] into wp[i..i+xsize-1] and storing the
        C      final carry limb at wp[i+xsize].
        C
        C Register use on the general path:
        C   esi  one limb past the end of xp
        C   edi  wp + xsize limbs, advanced one limb per completed row
        C   ebp  address of the y limb in use
        C   ecx  negative limb index, counted up to zero
        C   ebx  carry limb handed from one mull to the next
        C   eax, edx  low and high halves of each mull result
        C
        C The carry flag produced by the last addl in each loop body is
        C consumed by the adcl at the top of the next iteration, or by the adcl
        C after the loop; the movl, incl and jnz in between do not change it.
        C
        C NOTE(review): the xsize = 1 shortcut assumes xsize >= ysize >= 1; that
        C is stated only by the existing comment, nothing here checks it.
        C NOTE(review): FRAME and FRAME_pushl are defined outside this file;
        C they presumably keep the PARAM_ and VAR_ offsets in step with the
        C pushes - confirm in the m4 definitions.
     51 PROLOGUE(mpn_mul_basecase)
     52 
     53 	pushl	%eax			C dummy push for allocating stack slot
     54 	pushl	%esi			C save registers used as pointers below
     55 	pushl	%ebp
     56 	pushl	%edi
        C four pushes so far, 16 bytes
     57 deflit(`FRAME',16)
     58 
     59 	movl	PARAM_XP,%esi
     60 	movl	PARAM_WP,%edi
     61 	movl	PARAM_YP,%ebp
     62 
     63 	movl	(%esi),%eax		C load xp[0]
     64 	mull	(%ebp)			C multiply by yp[0]
     65 	movl	%eax,(%edi)		C store to wp[0]
     66 	movl	PARAM_XSIZE,%ecx	C xsize
     67 	decl	%ecx			C If xsize = 1, ysize = 1 too
     68 	jz	L(done)			C single limb product, edx still holds the high half
     69 
     70 	movl	PARAM_XSIZE,%eax	C eax = xsize, scales the two leal below
     71 	pushl	%ebx			C save ebx, used below as the carry limb
     72 FRAME_pushl()
     73 	movl	%edx,%ebx		C carry limb = high half of xp[0]*yp[0]
     74 	leal	(%esi,%eax,4),%esi	C make xp point at end
     75 	leal	(%edi,%eax,4),%edi	C offset wp by xsize
     76 	negl	%ecx			C negate j size/index for inner loop: ecx = 1-xsize
     77 	xorl	%eax,%eax		C clear carry flag for the first adcl
     78 
        C First row: wp[j] = low limb of xp[j]*yp[0] plus carry, j = 1..xsize-1
     79 	ALIGN(8)
     80 L(oop1):	adcl	$0,%ebx		C fold carry flag from previous addl into carry limb
     81 	movl	(%esi,%ecx,4),%eax	C load next limb at xp[j]
     82 	mull	(%ebp)			C edx:eax = xp[j] * yp[0]
     83 	addl	%ebx,%eax		C add carry limb, sets carry flag
     84 	movl	%eax,(%edi,%ecx,4)	C store to wp[j]
     85 	incl	%ecx			C next j, carry flag is left untouched
     86 	movl	%edx,%ebx		C high half becomes the next carry limb
     87 	jnz	L(oop1)			C loop until the index reaches zero
     88 
     89 	adcl	$0,%ebx			C fold in the carry flag from the last addl
     90 	movl	PARAM_YSIZE,%eax
     91 	movl	%ebx,(%edi)		C most significant limb of product
     92 	addl	$4,%edi			C increment wp
     93 	decl	%eax			C eax = ysize-1, rows still to do
     94 	jz	L(skip)			C ysize = 1, product is complete
     95 	movl	%eax,VAR_COUNTER	C set counter to ysize-1, the rows still to do
     96 
        C One pass per remaining y limb: add xp[] * yp[i] into wp at offset i
     97 L(outer):
     98 	addl	$4,%ebp			C make ebp point to next y limb
     99 	movl	PARAM_XSIZE,%ecx
    100 	negl	%ecx			C ecx = -xsize, so the first load is xp[0]
    101 	xorl	%ebx,%ebx		C zero the carry limb and clear the carry flag
    102 
    103 	C code at 0x61 here, close enough to aligned
    104 L(oop2):
    105 	adcl	$0,%ebx			C fold carry flag from previous addl into carry limb
    106 	movl	(%esi,%ecx,4),%eax	C load xp[j]
    107 	mull	(%ebp)			C edx:eax = xp[j] * yp[i]
    108 	addl	%ebx,%eax		C add carry limb to low half
    109 	movl	(%edi,%ecx,4),%ebx	C load the limb already at wp[i+j]
    110 	adcl	$0,%edx			C propagate that carry into the high half
    111 	addl	%eax,%ebx		C accumulate, carry flag is picked up by the next adcl
    112 	movl	%ebx,(%edi,%ecx,4)	C store the updated wp[i+j]
    113 	incl	%ecx			C next j, carry flag is left untouched
    114 	movl	%edx,%ebx		C high half becomes the next carry limb
    115 	jnz	L(oop2)			C loop until the index reaches zero
    116 
    117 	adcl	$0,%ebx			C fold in the carry flag from the last addl
    118 
    119 	movl	%ebx,(%edi)		C store the top limb of this row at wp[i+xsize]
    120 	addl	$4,%edi			C advance the wp base by one limb for the next row
    121 	movl	VAR_COUNTER,%eax
    122 	decl	%eax			C one row fewer to do
    123 	movl	%eax,VAR_COUNTER
    124 	jnz	L(outer)		C flags still come from the decl, movl does not change them
    125 
        C General exit: undo the five pushes in reverse order
    126 L(skip):
    127 	popl	%ebx
    128 	popl	%edi
    129 	popl	%ebp
    130 	popl	%esi
    131 	addl	$4,%esp			C drop the VAR_COUNTER slot
    132 	ret
    133 
        C xsize = 1 exit: ebx was never pushed on this path, so only the four
        C entry pushes are undone.  edi still equals wp here.
    134 L(done):
    135 	movl	%edx,4(%edi)	C store to wp[1]
    136 	popl	%edi
    137 	popl	%ebp
    138 	popl	%esi
    139 	popl	%eax		C dummy pop for deallocating stack slot
    140 	ret
    141 
    142 EPILOGUE()
    143