Home | History | Annotate | Line # | Download | only in vmx
      1      1.1  mrg dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
      2      1.1  mrg 
      3      1.1  mrg dnl  Copyright 2006 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6  1.1.1.2  mrg dnl
      7      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26  1.1.1.2  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg C                16-byte coaligned      unaligned
     34      1.1  mrg C                   cycles/limb        cycles/limb
     35      1.1  mrg C 7400,7410 (G4):       0.5                0.64
     36      1.1  mrg C 744x,745x (G4+):      0.75               0.82
     37      1.1  mrg C 970 (G5):             0.78               1.02		(64-bit limbs)
     38      1.1  mrg 
     39      1.1  mrg C STATUS
     40      1.1  mrg C  * Works for all sizes and alignments.
     41      1.1  mrg 
     42      1.1  mrg C TODO
     43      1.1  mrg C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
     44      1.1  mrg C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
     45      1.1  mrg C    c/l for 970.
     46      1.1  mrg C  * Consider using VMX instructions also for head and tail, by using some
     47      1.1  mrg C    read-modify-write tricks.
     48      1.1  mrg C  * The VMX code is used from the smallest sizes it handles, but measurements
     49      1.1  mrg C    show a large speed bump at the cutoff points.  Small copying (perhaps
     50      1.1  mrg C    using some read-modify-write technique) should be optimized.
     51  1.1.1.2  mrg C  * Make an mpn_com based on this code.
     52      1.1  mrg 
C Derived constants: bytes per limb, and how many limbs fit in one (16-byte)
C and two (32-byte) VMX vector registers.  These drive the loop counts and
C pointer strides below.
     53      1.1  mrg define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
     54      1.1  mrg define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
     55      1.1  mrg define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
     56      1.1  mrg 
     57      1.1  mrg 
C LIMB32(x) expands to x only when limbs are 32 bits; LIMB64(x) only when
C they are 64 bits.  This lets each limb-size-specific instruction be
C written on its own line and selected at m4 time.
     58      1.1  mrg ifelse(GMP_LIMB_BITS,32,`
     59      1.1  mrg 	define(`LIMB32',`	$1')
     60      1.1  mrg 	define(`LIMB64',`')
     61      1.1  mrg ',`
     62      1.1  mrg 	define(`LIMB32',`')
     63      1.1  mrg 	define(`LIMB64',`	$1')
     64      1.1  mrg ')
     65      1.1  mrg 
C Symbolic register names for the three mpn_copyd arguments (destination,
C source, limb count), per the PowerPC calling convention (r3-r5).
     66      1.1  mrg C INPUT PARAMETERS
     67      1.1  mrg define(`rp',	`r3')
     68      1.1  mrg define(`up',	`r4')
     69      1.1  mrg define(`n',	`r5')
     70      1.1  mrg 
C v4 holds the lvsl-generated permute selector used on the unaligned path.
     71      1.1  mrg define(`us',	`v4')
     72      1.1  mrg 
     73      1.1  mrg 
C void mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C
C Decrementing (high-to-low) limb copy, safe when rp >= up overlaps.
C Small n: plain scalar loop.  Large n: align rp with a scalar head,
C then copy two vectors (LIMBS_PER_2VR limbs) per VMX loop iteration,
C using lvsl/vperm when up is not co-aligned, and finish with a scalar
C tail for the remaining n mod LIMBS_PER_VR limbs.
     74      1.1  mrg ASM_START()
     75      1.1  mrg PROLOGUE(mpn_copyd)
     76      1.1  mrg 
C r0 = n * GMP_LIMB_BYTES.  The record form also sets cr0, which is
C consumed by the beqlr below to return early when n == 0.
LIMB32(`slwi.	r0, n, 2	')
LIMB64(`sldi.	r0, n, 3	')
C Copying runs downward, so point both rp and up just past their ends.
     79      1.1  mrg 	add	rp, rp, r0
     80      1.1  mrg 	add	up, up, r0
     81      1.1  mrg 
C Take the VMX path for n >= 11 (32-bit limbs) resp. n >= 5 (64-bit limbs).
LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
     84      1.1  mrg 	bge	cr7, L(big)
     85      1.1  mrg 
C Return now if n == 0 (cr0 still holds the result of the shift above).
     86      1.1  mrg 	beqlr	cr0
     87      1.1  mrg 
     88      1.1  mrg C Handle small cases with plain operations
C One limb per iteration, walking both pointers downward; ctr = n.
     89      1.1  mrg 	mtctr	n
     90      1.1  mrg L(topS):
LIMB32(`lwz	r0, -4(up)	')
LIMB64(`ld	r0, -8(up)	')
     93      1.1  mrg 	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`stw	r0, -4(rp)	')
LIMB64(`std	r0, -8(rp)	')
     96      1.1  mrg 	addi	rp, rp, -GMP_LIMB_BYTES
     97      1.1  mrg 	bdnz	L(topS)
     98      1.1  mrg 	blr
     99      1.1  mrg 
    100      1.1  mrg C Handle large cases with VMX operations
    101      1.1  mrg L(big):
C Bias the pointers so that offset 0 addresses the highest 16-byte vector.
    102      1.1  mrg 	addi	rp, rp, -16
    103      1.1  mrg 	addi	up, up, -16
C Save VRSAVE (SPR 256) in r12 and mark v0-v4 as live for the kernel.
    104      1.1  mrg 	mfspr	r12, 256
    105      1.1  mrg 	oris	r0, r12, 0xf800		C Set VRSAVE bit 0-4
    106      1.1  mrg 	mtspr	256, r0
    107      1.1  mrg 
C Scalar head: copy limbs until rp is 16-byte aligned.  r7 = number of
C head limbs (0..LIMBS_PER_VR-1); the record form sets cr0 for the beq.
LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
    110      1.1  mrg 	beq	L(rp_aligned)
    111      1.1  mrg 
C n -= head limbs copied here.
    112      1.1  mrg 	subf	n, r7, n
    113      1.1  mrg L(top0):
LIMB32(`lwz	r0, 12(up)	')
LIMB64(`ld	r0, 8(up)	')
    116      1.1  mrg 	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 12(rp)	')
LIMB64(`std	r0, 8(rp)	')
    120      1.1  mrg 	addi	rp, rp, -GMP_LIMB_BYTES
C For 64-bit limbs r7 can only be 1 here, so no loop-back branch is needed.
LIMB32(`bne	L(top0)		')
    122      1.1  mrg 
    123      1.1  mrg L(rp_aligned):
    124      1.1  mrg 
C rp is now 16-byte aligned; test whether up is co-aligned (cr0 for beq).
LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2
    127      1.1  mrg 
C ctr = n / LIMBS_PER_2VR, i.e. the number of two-vector (32-byte) loop
C iterations.  The entry cutoffs above keep this >= 1 even after the head.
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
    130      1.1  mrg 	mtctr	r7			C copy n to count register
    131      1.1  mrg 
C r10 = byte offset of the lower vector of each 32-byte pair.
    132      1.1  mrg 	li	r10, -16
    133      1.1  mrg 
    134      1.1  mrg 	beq	L(up_aligned)
    135      1.1  mrg 
C Unaligned up: build the vperm selector from up's low address bits, then
C keep a one-vector lookahead in v2 so each vperm can merge two loads.
    136      1.1  mrg 	lvsl	us, 0, up
    137      1.1  mrg 
    138      1.1  mrg 	addi	up, up, 16
C If n has an odd vector (LIMBS_PER_VR limbs) beyond the 2-vector pairs,
C merge and store that single vector before entering the main loop.
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
    141      1.1  mrg 	beq	L(1)
    142      1.1  mrg 	lvx	v0, 0, up
    143      1.1  mrg 	lvx	v2, r10, up
    144      1.1  mrg 	vperm	v3, v2, v0, us
    145      1.1  mrg 	stvx	v3, 0, rp
    146      1.1  mrg 	addi	up, up, -32
    147      1.1  mrg 	addi	rp, rp, -16
    148      1.1  mrg 	b	L(lpu)
C No odd vector: just prime the v2 lookahead.
    149      1.1  mrg L(1):	lvx	v2, 0, up
    150      1.1  mrg 	addi	up, up, -16
    151      1.1  mrg 	b	L(lpu)
    152      1.1  mrg 
    153      1.1  mrg 	ALIGN(32)
C Main unaligned loop: two aligned loads + two vperm-merged stores per
C iteration, moving 32 bytes downward; v2 carries over between iterations.
    154      1.1  mrg L(lpu):	lvx	v0, 0, up
    155      1.1  mrg 	vperm	v3, v0, v2, us
    156      1.1  mrg 	stvx	v3, 0, rp
    157      1.1  mrg 	lvx	v2, r10, up
    158      1.1  mrg 	addi	up, up, -32
    159      1.1  mrg 	vperm	v3, v2, v0, us
    160      1.1  mrg 	stvx	v3, r10, rp
    161      1.1  mrg 	addi	rp, rp, -32
    162      1.1  mrg 	bdnz	L(lpu)
    163      1.1  mrg 
    164      1.1  mrg 	b	L(tail)
    165      1.1  mrg 
    166      1.1  mrg L(up_aligned):
    167      1.1  mrg 
C Co-aligned case: handle a possible odd vector, then copy two aligned
C vectors per iteration with plain lvx/stvx.
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
    169      1.1  mrg 	beq	L(lpa)
    170      1.1  mrg 	lvx	v0, 0,   up
    171      1.1  mrg 	stvx	v0, 0,   rp
    172      1.1  mrg 	addi	up, up, -16
    173      1.1  mrg 	addi	rp, rp, -16
    174      1.1  mrg 	b	L(lpa)
    175      1.1  mrg 
    176      1.1  mrg 	ALIGN(32)
    177      1.1  mrg L(lpa):	lvx	v0, 0,   up
    178      1.1  mrg 	lvx	v1, r10, up
    179      1.1  mrg 	addi	up, up, -32
C nop keeps the dispatch grouping of this loop; presumably tuned for the
C cores in the cycle table above.
    180      1.1  mrg 	nop
    181      1.1  mrg 	stvx	v0, 0,   rp
    182      1.1  mrg 	stvx	v1, r10, rp
    183      1.1  mrg 	addi	rp, rp, -32
    184      1.1  mrg 	bdnz	L(lpa)
    185      1.1  mrg 
C Scalar tail: copy the remaining n mod LIMBS_PER_VR limbs.  For 32-bit
C limbs r10 indexes 12, 8, 4 downward within the last vector slot; for
C 64-bit limbs at most one limb remains, so no loop-back is needed.
    186      1.1  mrg L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
    189      1.1  mrg 	beq	L(ret)
LIMB32(`li	r10, 12		')
    191      1.1  mrg L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 8(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 8(rp)	')
LIMB32(`addi	r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')
    199      1.1  mrg 
C Restore the caller's VRSAVE before returning.
    200      1.1  mrg L(ret):	mtspr	256, r12
    201      1.1  mrg 	blr
    202      1.1  mrg EPILOGUE()
    204