dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   16-byte coaligned      unaligned
C                      cycles/limb        cycles/limb
C 7400,7410 (G4):          0.5                0.64
C 744x,745x (G4+):         0.75               0.82
C 970 (G5):                0.78               1.02        (64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

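C The C sketch below is illustrative only and not part of the build; it shows
C the operation this routine implements: copy n limbs from {up,n} to {rp,n}
C in decreasing address order, so the copy remains correct when the operands
C overlap and the destination lies above the source.  The names mpn_copyd,
C mp_limb_t and mp_size_t are the usual GMP ones.
C
C	void
C	mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = n - 1; i >= 0; i--)	/* high limb first */
C	    rp[i] = up[i];
C	}
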
define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')


ASM_START()
PROLOGUE(mpn_copyd)

LIMB32(`slwi.	r0, n, 2	')
LIMB64(`sldi.	r0, n, 3	')
	add	rp, rp, r0
	add	up, up, r0

LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)

	beqlr	cr0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, -4(up)	')
LIMB64(`ld	r0, -8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`stw	r0, -4(rp)	')
LIMB64(`std	r0, -8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	addi	rp, rp, -16
	addi	up, up, -16
	mfspr	r12, 256		C Save VRSAVE
	oris	r0, r12, 0xf800		C Set VRSAVE bits 0-4
	mtspr	256, r0

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subf	n, r7, n
L(top0):
LIMB32(`lwz	r0, 12(up)	')
LIMB64(`ld	r0, 8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 12(rp)	')
LIMB64(`std	r0, 8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne	L(top0)	')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
	mtctr	r7			C copy loop count to count register

	li	r10, -16

	beq	L(up_aligned)

	lvsl	us, 0, up		C permute control for unaligned up

	addi	up, up, 16
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	addi	up, up, -32
	addi	rp, rp, -16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, -16
	b	L(lpu)

	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, -32
	vperm	v3, v2, v0, us
	stvx	v3, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpu)

	b	L(tail)

L(up_aligned):

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0, up
	stvx	v0, 0, rp
	addi	up, up, -16
	addi	rp, rp, -16
	b	L(lpa)

	ALIGN(32)
L(lpa):	lvx	v0, 0, up
	lvx	v1, r10, up
	addi	up, up, -32
	nop
	stvx	v0, 0, rp
	stvx	v1, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpa)

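C Copy the remaining n mod LIMBS_PER_VR limbs (if any) with scalar
C loads and stores.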
L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 12	')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 8(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 8(rp)	')
LIMB32(`addi	r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()