1 dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor, 2 dnl returning quotient only. 3 4 dnl Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software 5 dnl Foundation, Inc. 6 7 dnl This file is part of the GNU MP Library. 8 dnl 9 dnl The GNU MP Library is free software; you can redistribute it and/or modify 10 dnl it under the terms of either: 11 dnl 12 dnl * the GNU Lesser General Public License as published by the Free 13 dnl Software Foundation; either version 3 of the License, or (at your 14 dnl option) any later version. 15 dnl 16 dnl or 17 dnl 18 dnl * the GNU General Public License as published by the Free Software 19 dnl Foundation; either version 2 of the License, or (at your option) any 20 dnl later version. 21 dnl 22 dnl or both in parallel, as here. 23 dnl 24 dnl The GNU MP Library is distributed in the hope that it will be useful, but 25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27 dnl for more details. 28 dnl 29 dnl You should have received copies of the GNU General Public License and the 30 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31 dnl see https://www.gnu.org/licenses/. 
include(`../config.m4')

C	    cycles/limb
C	    norm/unorm
C AMD K8,K9	10	+
C AMD K10	10	+
C AMD bull	13.7	-
C AMD pile	13.7	+
C AMD steam
C AMD excavator
C AMD bobcat	15	-
C AMD jaguar	16	-
C Intel P4	33	=
C Intel core2	13.25	=
C Intel NHM	14	=
C Intel SBR	 8.5	-
C Intel IBR	 8.5	-
C Intel HWL	 8	=
C Intel BWL	 8	=
C Intel SKL	 8	=
C Intel atom	42	--
C Intel SLM	20.4	--
C VIA nano

C INPUT PARAMETERS
C The names below are m4 aliases for the registers that hold the arguments
C once FUNC_ENTRY has run.  rp, up, n and d are common to both entry points.
define(`rp',		`%rdi')
define(`up',		`%rsi')
define(`n',		`%rdx')
define(`d',		`%rcx')
define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1
define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ---------------------------------------------------------------------------
C mpn_bdiv_q_1 -- Hensel division of the n-limb operand at up by the single
C limb d, storing quotient limbs at rp.
C
C This entry point only prepares the division.  It removes the trailing
C zero bits of d (counting them in rcx), computes a 64-bit inverse of the
C remaining odd divisor by table lookup plus three Newton steps, and then
C jumps to L(pi1) inside mpn_pi1_bdiv_q_1, which does the actual work.
C
C Register state handed over to L(pi1):
C   rcx  ncnt, the number of low zero bits stripped from d (0 if d is odd)
C   r8   inverse of the odd divisor (same register as di)
C   r10  n
C   r11  odd divisor
C   rbx  pushed here, popped by the exit paths after L(pi1)
C
C NOTE(review): d is presumably required to be nonzero -- the bsf at L(evn)
C runs without a zero check.  Confirm against the callers.
C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined outside this file;
C presumably they adapt the DOS64 calling convention -- confirm.
C ---------------------------------------------------------------------------
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
	FUNC_ENTRY(4)
	push	%rbx			C rbx is used as scratch below and as the carry bit in the loop

	mov	%rcx, %rax		C rax = d, frees rcx for the shift count
	xor	R32(%rcx), R32(%rcx)	C ncnt count, stays 0 unless L(evn) runs
	mov	%rdx, %r10		C r10 = n, frees rdx for the table pointer and mul

	bt	$0, R32(%rax)		C CF = low bit of d
	jnc	L(evn)			C skip bsf unless divisor is even

L(odd):	mov	%rax, %rbx		C rbx = odd divisor, kept for the Newton steps
	shr	R32(%rax)
	and	$127, R32(%rax)		C d/2, 7 bits

	LEA( binvert_limb_table, %rdx)

	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits

	mov	%rbx, %r11		C d without twos

	C Each step below computes inv = 2*inv - inv*inv*d, which doubles the
	C number of correct low bits of the inverse.  The first two steps use
	C 32-bit arithmetic, the last one the full 64-bit registers.
	lea	(%rax,%rax), R32(%rdx)	C 2*inv
	imul	R32(%rax), R32(%rax)	C inv*inv
	imul	R32(%rbx), R32(%rax)	C inv*inv*d
	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv
	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits

	lea	(%rax,%rax), %r8	C 2*inv
	imul	%rax, %rax		C inv*inv
	imul	%rbx, %rax		C inv*inv*d
	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits

	jmp	L(pi1)			C continue in mpn_pi1_bdiv_q_1 with rbx already pushed

L(evn):	bsf	%rax, %rcx		C rcx = index of the lowest set bit of d
	shr	R8(%rcx), %rax		C drop the low zero bits, rax is now odd
	jmp	L(odd)
EPILOGUE()

C ---------------------------------------------------------------------------
C mpn_pi1_bdiv_q_1 -- the same division with the inverse (di) and the shift
C count (ncnt) supplied by the caller instead of being computed here.
C
C The operand is read shifted right by ncnt bits.  Each quotient limb is
C the current (shifted, carry-corrected) operand limb multiplied by di.
C The high limb of q*d is carried into the next position together with a
C one-bit borrow.
C
C NOTE(review): d is used exactly as passed in, so the caller presumably
C supplies it with its low zero bits already removed -- confirm.
C NOTE(review): n is presumably at least 1; n = 0 is not checked and would
C fall into the multi-limb path.
C NOTE(review): the two IFDOS loads presumably fetch the fifth and sixth
C arguments from the stack under the DOS64 ABI; the offsets depend on what
C FUNC_ENTRY pushes -- confirm.
C ---------------------------------------------------------------------------
PROLOGUE(mpn_pi1_bdiv_q_1)
	FUNC_ENTRY(4)
IFDOS(` mov 56(%rsp), %r8 ')
IFDOS(` mov 64(%rsp), %r9 ')
	push	%rbx			C matches the push in mpn_bdiv_q_1, so both entries share the exits

	mov	%rcx, %r11		C d
	mov	%rdx, %r10		C n
	mov	%r9, %rcx		C ncnt, must be in cl for the variable shifts

C Shared entry, also reached by the jmp at the end of mpn_bdiv_q_1.
L(pi1):	mov	(up), %rax		C up[0]

	dec	%r10			C r10 = n-1
	jz	L(one)			C single limb operand, no carry handling needed

	mov	8(up), %rdx		C up[1]
	lea	(up,%r10,8), up		C up end
	lea	(rp,%r10,8), rp		C rp end
	neg	%r10			C -n

	shrd	R8(%rcx), %rdx, %rax	C rax = up[0] shifted right by ncnt, top bits from up[1]

	xor	R32(%rbx), R32(%rbx)	C no carry bit for the first limb
	jmp	L(ent)			C first limb has no carry limb either, skip the mul

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	ncnt
	C rdx	scratch, carry limb after the mul
	C r8	inverse of d
	C r9	scratch, the limb above the current one
	C r10	counter, limbs, negative
	C r11	d

	mul	%r11			C carry limb in rdx
	mov	(up,%r10,8), %rax	C current operand limb
	mov	8(up,%r10,8), %r9	C next limb, supplies the bits shifted in from above
	shrd	R8(%rcx), %r9, %rax	C rax = current limb of the operand shifted right by ncnt
	nop				C NOTE(review): no architectural effect, presumably alignment or scheduling padding
	sub	%rbx, %rax		C apply carry bit
	setc	R8(%rbx)		C rbx = borrow from the carry bit subtraction
	sub	%rdx, %rax		C apply carry limb
	adc	$0, R32(%rbx)		C add the borrow from the carry limb subtraction
L(ent):	imul	%r8, %rax		C q = limb * inverse, low 64 bits
	mov	%rax, (rp,%r10,8)	C store quotient limb
	inc	%r10
	jnz	L(top)

	C Last limb: there is no higher limb to shift in, so a plain shr is used.
	mul	%r11			C carry limb in rdx
	mov	(up), %rax		C up high limb
	shr	R8(%rcx), %rax
	sub	%rbx, %rax		C apply carry bit
	sub	%rdx, %rax		C apply carry limb
	imul	%r8, %rax
	mov	%rax, (rp)		C rp points at the last quotient limb here
	pop	%rbx
	FUNC_EXIT()
	ret

C n = 1: the quotient is just the shifted limb times the inverse.
L(one):	shr	R8(%rcx), %rax
	imul	%r8, %rax
	mov	%rax, (rp)
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()