dnl  Home | History | Annotate | Line # | Download | only in vmx
      1 dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
      2 dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
      3 dnl  logical operations.
      4 
      5 dnl  Copyright 2006 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 
     36 C               and,ior,andn,nior,xor    iorn,xnor         nand
     37 C                   cycles/limb         cycles/limb    cycles/limb
     38 C 7400,7410 (G4):       1.39                 ?              ?
     39 C 744x,745x (G4+):      1.14                1.39           1.39
     40 C 970:                  1.7                 2.0            2.0
     41 
     42 C STATUS
     43 C  * Works for all sizes and alignment for 32-bit limbs.
     44 C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
     45 C  * Current performance makes this pointless for 970
     46 
     47 C TODO
     48 C  * Might want to make variants when just one of the source operands needs
     49 C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
     50 C  * Idea: If the source operands are equally aligned, we could do the logops
     51 C    first, then vperm before storing!  That means we never need more than one
     52 C    vperm, ever!
     53 C  * Perhaps align `rp' after initial alignment loop?
     54 C  * Instead of having scalar code in the beginning and end, consider using
     55 C    read-modify-write vector code.
     56 C  * Software pipeline?  Hopefully not too important, this is hairy enough
     57 C    already.
     58 C  * At least be more clever about operand loading, i.e., load v operands before
     59 C    u operands, since v operands are sometimes negated.
     60 
dnl  Limb-size abstraction: bytes per limb, and how many limbs fit in
dnl  one (16-byte) and two vector registers.
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

dnl  Negation hooks used by the per-operation definitions below:
dnl  vnegb complements the v operand BEFORE the logical op, vnega
dnl  complements the result AFTER it.  Both default to empty.
define(`vnegb', `')		C default neg-before to null
define(`vnega', `')		C default neg-after to null
     67 
dnl  Per-operation definitions, selected by an OPERATION_* symbol from
dnl  the build system:
dnl    func   - exported mpn function name
dnl    logopS - scalar (integer unit) form of the operation
dnl    logop  - vector (VMX) form of the operation
dnl  Operations with no direct VMX instruction are composed from one
dnl  that exists plus a complement: nand = not(and) via vnega (negate
dnl  result); iorn and xnor negate the v operand first via vnegb.
ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')
    103 
dnl  Width selectors: LIMB32(x) expands to x only when limbs are 32-bit,
dnl  LIMB64(x) only when they are 64-bit.  Used below to interleave the
dnl  word and doubleword variants of the scalar code.
ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')
    111 
C INPUT PARAMETERS
define(`rp',	`r3')		C result (destination) pointer
define(`up',	`r4')		C first source pointer
define(`vp',	`r5')		C second source pointer
define(`n',	`r6')		C limb count

C Permute-control vectors (set up with lvsl) used by vperm to realign
C the up and vp data streams.
define(`us',	`v8')
define(`vs',	`v9')

C One source file implements all eight entry points; the build system
C assembles it once per OPERATION_* symbol.
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
    122 
ASM_START()
PROLOGUE(func)

C Operands too small to be worth VMX setup (n < 8 limbs on 32-bit,
C n < 4 on 64-bit) take a pure scalar path.
LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
	bge	L(big)

	mtctr	n			C ctr = remaining limb count

C 32-bit: peel the first limb so the loop below can use update-form
C lwzu/stwu addressing.
LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)		')

L(topS):
C Scalar loop, one limb per iteration.
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
	logopS(	r0, r8, r9)
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
	bdnz	L(topS)
L(endS):
	blr

C Large-operand (VMX) path.  Mark the vector registers we use in
C VRSAVE (SPR 256) so context switches preserve them; the caller's
C value is kept in r12 and restored at L(ret).
L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc		C Set VRSAVE bit 0-13 FIXME
	mtspr	256, r0

C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
C times for 64-bit machines, and 0 to 3 times for 32-bit machines.

LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(aligned)

	subfic	r7, r0, LIMBS_PER_VR	C r7 = limbs to process here
LIMB32(`li	r10, 0		')
	subf	n, r7, n		C n -= limbs processed here
C On 64-bit this "loop" runs exactly once (r7 can only be 1, and there
C is no branch back); on 32-bit it counts r7 down via addic./bne.
L(top0):
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)		')

	addi	rp, rp, 16		C update rp, but preserve its alignment

L(aligned):
LIMB64(`srdi	r7, n, 1	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

C Build lvsl permute-control vectors for the (possibly unaligned)
C sources, and load the first 16-byte block of each stream.
	li	r10, 16
	lvsl	us, 0, up
	lvsl	vs, 0, vp

	lvx	v2, 0, up
	lvx	v3, 0, vp
	bdnz	L(gt1)
C Exactly one vector's worth: fetch the second aligned block of each
C stream and splice with vperm.
C NOTE(review): these lvx at up+16/vp+16 can touch 16 bytes beyond the
C operands when up/vp happen to be 16-byte aligned -- the post-loop
C code guards the analogous loads, this path does not; verify it cannot
C fault at a page boundary.
	lvx	v0, r10, up
	lvx	v1, r10, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 4		C rounded up to rp+16 at L(tail)
	b	L(tail)

L(gt1):	addi	up, up, 16
	addi	vp, vp, 16

C Main loop: two vectors (32 bytes) per iteration.  The loads alternate
C between v0/v1 and v2/v3, so each 16-byte block feeds two vperm
C splices; ctr counts vectors and is tested mid-iteration (bdz) to
C handle an odd vector count.
L(top):	lvx	v0, 0, up
	lvx	v1, 0, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	bdz	L(end)
	lvx	v2, r10, up
	lvx	v3, r10, vp
	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp
	addi	up, up, 32
	addi	vp, vp, 32
	addi	rp, rp, 32
	bdnz	L(top)

C Fell out at bdnz: even vector count.  Load the block that completes
C the last vector, but only from a pointer that is actually unaligned;
C for an aligned pointer vperm takes nothing from the extra block, so
C substitute zero rather than read past the end of the operand.
	andi.	r0, up, 15
	vxor	v0, v0, v0
	beq	1f
	lvx	v0, 0, up
1:	andi.	r0, vp, 15
	vxor	v1, v1, v1
	beq	1f
	lvx	v1, 0, vp
1:	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	rp, rp, 4		C rounded up to rp+16 at L(tail)
	b	L(tail)

C bdz taken mid-loop: odd vector count.  Same final-vector fixup as
C above, with the roles of v0/v1 and v2/v3 swapped.
L(end):	andi.	r0, up, 15
	vxor	v2, v2, v2
	beq	1f
	lvx	v2, r10, up
1:	andi.	r0, vp, 15
	vxor	v3, v3, v3
	beq	1f
	lvx	v3, r10, vp
1:	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp

	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 20		C rounded up to rp+32 at L(tail)

C Scalar tail: the n mod LIMBS_PER_VR leftover limbs.
L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
C Round rp up to the next 16-byte boundary, where the tail limbs go
C (rp is 16-byte aligned plus the 4/20 bias added above).
	addi	rp, rp, 15
LIMB32(`rlwinm	rp, rp, 0,0,27	')
LIMB64(`rldicr	rp, rp, 0,59	')
	li	r10, 0
C At most 3 iterations on 32-bit; on 64-bit at most 1 limb remains, so
C no branch back is needed.
L(top2):
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

L(ret):	mtspr	256, r12		C restore caller's VRSAVE
	blr
EPILOGUE()
    292 
    293 C This works for 64-bit PowerPC, since a limb ptr can only be aligned
    294 C in 2 relevant ways, which means we can always find a pair of aligned
    295 C pointers of rp, up, and vp.
    296 C process words until rp is 16-byte aligned
    297 C if (((up | vp) & 15) == 0)
    298 C   process with VMX without any vperm
    299 C else if ((up & 15) != 0 && (vp & 15) != 0)
    300 C   process with VMX using vperm on store data
    301 C else if ((up & 15) != 0)
    302 C   process with VMX using vperm on up data
    303 C else
    304 C   process with VMX using vperm on vp data
    305 C
C	rlwinm.	r0, up, 0,28,31
    307 C	rlwinm	r0, vp, 0,28,31
    308 C	cmpwi	cr7, r0, 0
    309 C	cror	cr6, cr0, cr7
    310 C	crand	cr0, cr0, cr7
    311