dnl  Home | History | Annotate | Line # | Download | only in vmx
      1 dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
      2 
      3 dnl  Copyright 2006 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C                16-byte coaligned      unaligned
     34 C                   cycles/limb        cycles/limb
     35 C 7400,7410 (G4):       0.5                0.64
     36 C 744x,745x (G4+):      0.75               0.82
     37 C 970 (G5):             0.78               1.02		(64-bit limbs)
     38 
     39 C STATUS
     40 C  * Works for all sizes and alignments.
     41 
     42 C TODO
     43 C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
     44 C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
     45 C    c/l for 970.
     46 C  * Consider using VMX instructions also for head and tail, by using some
     47 C    read-modify-write tricks.
     48 C  * The VMX code is used from the smallest sizes it handles, but measurements
     49 C    show a large speed bump at the cutoff points.  Small copying (perhaps
     50 C    using some read-modify-write technique) should be optimized.
     51 C  * Make an mpn_com based on this code.
     52 
C Derived size constants: bytes per limb, and limbs held by one or two
C 16-byte vector registers.  (LIMBS_PER_VR/LIMBS_PER_2VR are defined for
C completeness; only GMP_LIMB_BYTES is referenced in the code below.)
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))


C Conditional-assembly helpers: LIMB32(x) expands to x only when limbs are
C 32 bits wide, LIMB64(x) only when they are 64 bits wide, so a single
C source file serves both the 32-bit and 64-bit PowerPC configurations.
ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')
     65 
C INPUT PARAMETERS
define(`rp',	`r3')		C destination pointer (mp_ptr)
define(`up',	`r4')		C source pointer (mp_srcptr)
define(`n',	`r5')		C number of limbs to copy (mp_size_t)

define(`us',	`v4')		C permute control from lvsl, used on the unaligned path
     72 
     73 
C void mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C
C Copy n limbs from up to rp, proceeding from the highest limb downwards
C (hence "copyd"), which makes overlapping operands with rp > up safe.
C Small n is copied limb-by-limb; at or above the cutoff we use 16-byte
C VMX loads/stores: a scalar head loop first 16-byte-aligns rp, then
C either a plain vector loop (up coaligned with rp) or an lvsl/vperm
C merging loop (up misaligned) runs, followed by a scalar tail.
ASM_START()
PROLOGUE(mpn_copyd)

C r0 = n in bytes; the dotted shift also sets cr0 so n == 0 can be
C tested after the cutoff comparison below.
LIMB32(`slwi.	r0, n, 2	')
LIMB64(`sldi.	r0, n, 3	')
	add	rp, rp, r0		C point rp just past the destination
	add	up, up, r0		C point up just past the source

C Below the cutoff (11 limbs for 32-bit, 5 for 64-bit) scalar copying wins.
LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)

	beqlr	cr0			C n == 0: nothing to copy, return

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, -4(up)	')
LIMB64(`ld	r0, -8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`stw	r0, -4(rp)	')
LIMB64(`std	r0, -8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	addi	rp, rp, -16		C rp/up now address the final 16-byte block
	addi	up, up, -16
	mfspr	r12, 256		C save caller's VRSAVE (SPR 256)
	oris	r0, r12, 0xf800		C Set VRSAVE bit 0-4
	mtspr	256, r0

C Scalar head loop: copy limbs one at a time until rp is 16-byte aligned.
LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subf	n, r7, n		C head limbs no longer count towards n
L(top0):
LIMB32(`lwz	r0, 12(up)	')
LIMB64(`ld	r0, 8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 12(rp)	')
LIMB64(`std	r0, 8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne	L(top0)		')

L(rp_aligned):

C cr0: is up now 16-byte coaligned with rp?
LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

	li	r10, -16		C index reg addressing the second vector

	beq	L(up_aligned)

C Unaligned path: lvsl produces the permute control that vperm uses to
C splice each pair of aligned 16-byte loads into one destination vector.
	lvsl	us, 0, up

	addi	up, up, 16
LIMB32(`andi.	r0, n, 0x4	')	C an odd vector left over (n mod 8 >= 4)?
LIMB64(`andi.	r0, n, 0x2	')	C an odd vector left over (n mod 4 >= 2)?
	beq	L(1)
	lvx	v0, 0, up		C copy one vector to even out the count
	lvx	v2, r10, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	addi	up, up, -32
	addi	rp, rp, -16
	b	L(lpu)
L(1):	lvx	v2, 0, up		C prime v2 for the software-pipelined loop
	addi	up, up, -16
	b	L(lpu)

	ALIGN(32)
C Main unaligned loop: 32 bytes per iteration; v2 is carried across
C iterations so each aligned 16-byte source block is loaded only once.
L(lpu):	lvx	v0, 0, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, -32
	vperm	v3, v2, v0, us
	stvx	v3, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpu)

	b	L(tail)

L(up_aligned):

LIMB32(`andi.	r0, n, 0x4	')	C an odd vector left over?
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0,   up		C copy one vector to even out the count
	stvx	v0, 0,   rp
	addi	up, up, -16
	addi	rp, rp, -16
	b	L(lpa)

	ALIGN(32)
C Main coaligned loop: straight 32-byte vector copies per iteration.
L(lpa):	lvx	v0, 0,   up
	lvx	v1, r10, up
	addi	up, up, -32
	nop				C presumably dispatch-group padding -- TODO confirm
	stvx	v0, 0,   rp
	stvx	v1, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpa)

C Scalar tail: copy the remaining limbs that did not fill a whole vector.
L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 12		')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 8(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 8(rp)	')
LIMB32(`addi	r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

L(ret):	mtspr	256, r12		C restore caller's VRSAVE
	blr
EPILOGUE()
    204