dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.

dnl  Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                           cycles/limb
C                       norm    unorm   frac
C POWER3/PPC630         16-34   16-34   ~11   outdated figures
C POWER4/PPC970          28      28      19
C POWER5                 29      29     ~19
C POWER6                 49      59     ~42
C POWER7                 24.5    23     ~14

C INPUT PARAMETERS
C qp  = r3
C fn  = r4
C up  = r5
C un  = r6
C d   = r7

C We use a not very predictable branch in the frac code, therefore the cycle
C count wobbles somewhat.  With the alternative branch-free code, things run
C considerably slower on POWER4/PPC970 and POWER5.

C Add preinv entry point.

C REGISTER USE AFTER THE ENTRY SEQUENCE
C r25  fn, used as the trip count of the frac loop
C r26  up
C r27  shift count applied to d (cntlzd result, or 0 on the normalized path)
C r28  count of integer source limbs not yet consumed
C r29  quotient store pointer, starts at qp + 8*(un+fn) - 8 and steps down by 8
C r30  d, shifted left by r27 on the unnormalized path
C r31  running remainder, kept shifted left by r27
C r3   value returned by mpn_invert_limb while inside the loops, and the
C      function result on return
C
C CODE LAYOUT
C L(71)      unnormalized divisor: shift d, call mpn_invert_limb
C L(uloop)   integer limbs, unnormalized divisor (source limbs shifted on the fly)
C L(154)     last integer limb, unnormalized divisor
C L(110)     entry to the frac part, shared by both paths
C L(ufloop)  frac limbs (no source limb is loaded)
C L(156)     shift the remainder back down, then the common exit L(1)
C L(162)     normalized divisor (top bit of d set): L(8), L(nloop)
C L(164), L(165), L(167)  out-of-line fix-ups for the r >= d case
C
C In the comments below ql and qh name the low and high halves of the
C quotient estimate, nl the current low numerator limb and r the remainder
C candidate, following the naming already used in this file.

ASM_START()

EXTERN_FUNC(mpn_invert_limb)

PROLOGUE(mpn_divrem_1,toc)

C Entry sequence: r25-r31 are stored below the incoming stack pointer, LR and
C CR are stored at 16(r1) and 8(r1), and the arguments are copied into the
C registers listed above.  The stores are interleaved with the copies.
	mfcr	r12			C r12 = CR, stored at 8(r1) below
	add.	r10, r6, r4		C r10 = un + fn, cr0 records whether it is zero
	std	r25, -56(r1)
	mr	r25, r4			C r25 = fn
	mflr	r0			C r0 = LR, stored at 16(r1) below
	std	r26, -48(r1)
	mr	r26, r5			C r26 = up
	std	r28, -32(r1)
	mr	r28, r6			C r28 = un
	std	r29, -24(r1)
	mr	r29, r3			C r29 = qp
	li	r3, 0			C result for the un + fn = 0 exit
	std	r30, -16(r1)
	mr	r30, r7			C r30 = d
	std	r31, -8(r1)
	li	r31, 0			C remainder starts at zero
	std	r27, -40(r1)
	std	r0, 16(r1)
	stw	r12, 8(r1)
	stdu	r1, -176(r1)		C allocate a 176 byte frame
	beq-	cr0, L(1)		C un + fn = 0, return 0
	cmpdi	cr7, r7, 0		C signed test of d, negative means top bit set
	sldi	r0, r10, 3
	add	r11, r0, r29		C r11 = qp + 8*(un+fn)
	addi	r29, r11, -8		C r29 = address of the most significant quotient limb
	blt-	cr7, L(162)		C top bit of d set, take the normalized path
	cmpdi	cr4, r6, 0		C cr4 = (un == 0), tested again after the call
	beq+	cr4, L(71)
C Unnormalized d and un > 0.  If the high source limb is below d, the top
C quotient limb is zero and that limb becomes the initial remainder.
L(163):
	sldi	r9, r6, 3
	add	r9, r9, r5
	ld	r7, -8(r9)		C r7 = up[un-1]
	cmpld	cr7, r7, r30
	bge-	cr7, L(71)		C high limb >= d, no shortcut
	cmpdi	cr7, r10, 1		C cr7 = (un + fn == 1)
	li	r0, 0
	mr	r31, r7			C remainder = high limb
	std	r0, -8(r11)		C top quotient limb = 0
	addi	r29, r29, -8
	mr	r3, r7			C result if this was the only quotient limb
	beq-	cr7, L(1)
	addi	r28, r6, -1		C one integer limb consumed
	cmpdi	cr4, r28, 0		C refresh cr4 = (no integer limbs left)
L(71):
	cntlzd	r27, r30		C r27 = number of leading zero bits in d
	sld	r30, r30, r27		C shift d left so its top bit is set
	sld	r31, r31, r27		C shift the remainder by the same amount
	mr	r3, r30
C mpn_invert_limb is defined elsewhere; its result in r3 is the multiplier
C used by every loop below.
C NOTE(review): the code relies on cr4 and r25-r31 surviving this call.
	CALL(	mpn_invert_limb)
	beq-	cr4, L(110)		C no integer limbs left, go to the frac part
	sldi	r9, r28, 3
	addic.	r6, r28, -2		C r6 = limbs left - 2, cr0 set from the result
	add	r9, r9, r26
	subfic	r5, r27, 64		C r5 = 64 - shift count
	ld	r8, -8(r9)		C r8 = most significant remaining source limb
	srd	r0, r8, r5
	or	r31, r31, r0		C move its top bits into the remainder
	sld	r7, r8, r27		C r7 = its other bits, left aligned
	blt-	cr0, L(154)		C only one limb left, skip the loop
	addi	r28, r28, -1
	mtctr	r28			C loop count = limbs left - 1
	sldi	r6, r6, 3		C r6 = byte offset of the next lower limb
	ALIGN(16)
C Integer part, unnormalized divisor.  Each pass builds the next shifted
C numerator limb in r9, forms a quotient estimate from r31 and r3, and then
C adjusts the quotient limb and remainder.
L(uloop):
	ldx	r8, r26, r6		C r8 = next lower source limb
	nop
	mulld	r0, r31, r3		C ql = low half of r31 * r3
	mulhdu	r10, r31, r3		C qh = high half of r31 * r3
	addi	r11, r31, 1
	srd	r9, r8, r5
	addi	r6, r6, -8
	or	r9, r7, r9		C nl = next 64 bits of the shifted numerator
	addc	r0, r0, r9		C ql += nl
	adde	r10, r10, r11		C qh += r31 + 1 + carry
	mulld	r31, r10, r30
	subf	r31, r31, r9		C r = nl - qh * d
	subfc	r0, r31, r0		C CA = (r <= ql)
	subfe	r0, r0, r0		C r0 = -(r > ql)
	and	r9, r30, r0
	add	r31, r31, r9		C r += d if r > ql
	add	r10, r0, r10		C qh -= (r > ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(164)		C r >= d, fix up out of line
L(123):
	std	r10, 0(r29)		C store the quotient limb
	addi	r29, r29, -8
	sld	r7, r8, r27		C keep the low bits of this limb for the next pass
	bdnz	L(uloop)
C Last integer limb, unnormalized divisor: nl is r7 alone, no lower source
C limb is shifted in.
L(154):
	addi	r11, r31, 1
	nop
	mulld	r0, r31, r3
	mulhdu	r8, r31, r3
	addc	r0, r0, r7
	adde	r8, r8, r11
	mulld	r31, r8, r30
	subf	r31, r31, r7		C r = nl - qh * d
	subfc	r0, r0, r31		C CA = (r >= ql)
	subfe	r0, r0, r0		C r0 = -(r < ql)
	not	r7, r0			C r7 = -(r >= ql)
	add	r8, r7, r8		C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0		C r += d if r >= ql
	cmpld	cr7, r31, r30
	bge-	cr7, L(165)		C r >= d, fix up out of line
L(134):
	std	r8, 0(r29)		C store the quotient limb
	addi	r29, r29, -8
C Frac part, reached from both the unnormalized and the normalized path.
L(110):
	addic.	r0, r25, -1
	blt-	cr0, L(156)		C fn - 1 < 0, no frac limbs to produce
	mtctr	r25			C loop count = fn
	neg	r9, r30			C r9 = -d
	ALIGN(16)
C No source limb is loaded in this loop, so the remainder candidate is just
C -(qh * d).  The ifelse below compares 0 with 1, so m4 emits its second
C argument (the branching variant); the first argument is the branch-free
C alternative mentioned in the notes at the top of the file.
L(ufloop):
	addi	r11, r31, 1
	nop
	mulld	r0, r3, r31		C ql = low half of r3 * r31
	mulhdu	r10, r3, r31		C qh = high half of r3 * r31
	add	r10, r10, r11		C qh += r31 + 1
	mulld	r31, r9, r10		C r = -(qh * d)
ifelse(0,1,`
	subfc	r0, r0, r31
	subfe	r0, r0, r0		C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10		C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
	cmpld	cr7, r31, r0
	blt	cr7, L(29)
	add	r31, r30, r31
	addi	r10, r10, -1
L(29):
')
	std	r10, 0(r29)		C store the quotient limb
	addi	r29, r29, -8
	bdnz	L(ufloop)
L(156):
	srd	r3, r31, r27		C result = remainder shifted back down by r27
C Common exit: release the frame and reload everything the entry sequence
C stored.
L(1):
	addi	r1, r1, 176
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12			C reload CR field 4 only, the field used above
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
C Normalized divisor (top bit of d set).  With un > 0 the top quotient limb
C is 1 when the high source limb is >= d and 0 otherwise, and the remainder
C starts as that limb reduced by d when needed.
L(162):
	cmpdi	cr7, r6, 0
	beq-	cr7, L(8)		C un = 0, nothing to load
	sldi	r9, r6, 3
	addi	r29, r29, -8
	add	r9, r9, r5
	addi	r28, r6, -1		C one integer limb consumed
	ld	r31, -8(r9)		C r31 = up[un-1]
	subfc	r9, r7, r31		C CA = (up[un-1] >= d), the difference is discarded
	li	r9, 0
	adde	r9, r9, r9		C r9 = CA
	neg	r0, r9			C r0 = all ones if CA, else zero
	std	r9, -8(r11)		C top quotient limb = CA
	and	r0, r0, r7
	subf	r31, r0, r31		C subtract d from the remainder if CA
L(8):
	mr	r3, r30
C Same helper call as on the unnormalized path, here with d unshifted.
	CALL(	mpn_invert_limb)
	li	r27, 0			C shift count 0, so L(156) leaves the remainder as is
	addic.	r6, r28, -1		C r6 = limbs left - 1, cr0 set from the result
	blt-	cr0, L(110)		C no integer limbs left, go to the frac part
	mtctr	r28			C loop count = limbs left
	sldi	r6, r6, 3		C r6 = byte offset of the next source limb
	ALIGN(16)
C Integer part, normalized divisor: same step as L(uloop) but the source limb
C is used directly as nl.
L(nloop):
	addi	r11, r31, 1
	ldx	r8, r26, r6		C nl = next source limb
	mulld	r0, r31, r3		C ql = low half of r31 * r3
	mulhdu	r10, r31, r3		C qh = high half of r31 * r3
	addi	r6, r6, -8
	addc	r0, r0, r8		C ql += nl
	adde	r10, r10, r11		C qh += r31 + 1 + carry
	mulld	r31, r10, r30
	subf	r31, r31, r8		C r = nl - qh * d
	subfc	r0, r31, r0		C CA = (r <= ql)
	subfe	r0, r0, r0		C r0 = -(r > ql)
	and	r9, r30, r0
	add	r31, r31, r9		C r += d if r > ql
	add	r10, r0, r10		C qh -= (r > ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(167)		C r >= d, fix up out of line
L(51):
	std	r10, 0(r29)		C store the quotient limb
	addi	r29, r29, -8
	bdnz	L(nloop)
	b	L(110)

C Out-of-line fix-ups for r >= d: subtract d from the remainder, add one to
C the quotient limb, and rejoin the loop that branched here.
L(164):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(123)
L(167):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(51)
L(165):
	subf	r31, r30, r31
	addi	r8, r8, 1
	b	L(134)
EPILOGUE()