1 1.1 mrg dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division. 2 1.1 mrg 3 1.1 mrg dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc. 4 1.1.1.2 mrg 5 1.1 mrg dnl This file is part of the GNU MP Library. 6 1.1 mrg dnl 7 1.1.1.2 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 1.1.1.2 mrg dnl it under the terms of either: 9 1.1.1.2 mrg dnl 10 1.1.1.2 mrg dnl * the GNU Lesser General Public License as published by the Free 11 1.1.1.2 mrg dnl Software Foundation; either version 3 of the License, or (at your 12 1.1.1.2 mrg dnl option) any later version. 13 1.1.1.2 mrg dnl 14 1.1.1.2 mrg dnl or 15 1.1.1.2 mrg dnl 16 1.1.1.2 mrg dnl * the GNU General Public License as published by the Free Software 17 1.1.1.2 mrg dnl Foundation; either version 2 of the License, or (at your option) any 18 1.1.1.2 mrg dnl later version. 19 1.1.1.2 mrg dnl 20 1.1.1.2 mrg dnl or both in parallel, as here. 21 1.1.1.2 mrg dnl 22 1.1.1.2 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 1.1.1.2 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 1.1.1.2 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 1.1.1.2 mrg dnl for more details. 26 1.1 mrg dnl 27 1.1.1.2 mrg dnl You should have received copies of the GNU General Public License and the 28 1.1.1.2 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 1.1.1.2 mrg dnl see https://www.gnu.org/licenses/. 30 1.1 mrg 31 1.1 mrg include(`../config.m4') 32 1.1 mrg 33 1.1 mrg 34 1.1 mrg C cycles/limb 35 1.1 mrg C Athlon: 11.0 36 1.1 mrg C Hammer: 9.0 37 1.1 mrg 38 1.1 mrg 39 1.1 mrg C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 40 1.1 mrg C mp_limb_t divisor); 41 1.1 mrg C 42 1.1 mrg C The dependent chain is mul+imul+sub for 11 cycles and that speed is 43 1.1 mrg C achieved with no special effort. The load and shrld latencies are hidden 44 1.1 mrg C by out of order execution. 45 1.1 mrg C 46 1.1 mrg C It's a touch faster on size==1 to use the mul-by-inverse than divl. 47 1.1 mrg 48 1.1 mrg defframe(PARAM_DIVISOR,16) 49 1.1 mrg defframe(PARAM_SIZE, 12) 50 1.1 mrg defframe(PARAM_SRC, 8) 51 1.1 mrg defframe(PARAM_DST, 4) 52 1.1 mrg 53 1.1 mrg defframe(SAVE_EBX, -4) 54 1.1 mrg defframe(SAVE_ESI, -8) 55 1.1 mrg defframe(SAVE_EDI, -12) 56 1.1 mrg defframe(SAVE_EBP, -16) 57 1.1 mrg defframe(VAR_INVERSE, -20) 58 1.1 mrg defframe(VAR_DST_END, -24) 59 1.1 mrg 60 1.1 mrg deflit(STACK_SPACE, 24) 61 1.1 mrg 62 1.1 mrg TEXT 63 1.1 mrg 64 1.1 mrg ALIGN(16) 65 1.1 mrg PROLOGUE(mpn_divexact_1) 66 1.1 mrg deflit(`FRAME',0) 67 1.1 mrg 68 1.1 mrg movl PARAM_DIVISOR, %eax 69 1.1 mrg subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) 70 1.1 mrg movl $-1, %ecx C shift count 71 1.1 mrg 72 1.1 mrg movl %ebp, SAVE_EBP 73 1.1 mrg movl PARAM_SIZE, %ebp 74 1.1 mrg 75 1.1 mrg movl %esi, SAVE_ESI 76 1.1 mrg movl %edi, SAVE_EDI 77 1.1 mrg 78 1.1 mrg C If there's usually only one or two trailing zero bits then this 79 1.1 mrg C should be faster than bsfl. 80 1.1 mrg L(strip_twos): 81 1.1 mrg incl %ecx 82 1.1 mrg shrl %eax 83 1.1 mrg jnc L(strip_twos) 84 1.1 mrg 85 1.1 mrg movl %ebx, SAVE_EBX 86 1.1 mrg leal 1(%eax,%eax), %ebx C d without twos 87 1.1 mrg andl $127, %eax C d/2, 7 bits 88 1.1 mrg 89 1.1 mrg ifdef(`PIC',` 90 1.1 mrg LEA( binvert_limb_table, %edx) 91 1.1 mrg movzbl (%eax,%edx), %eax C inv 8 bits 92 1.1 mrg ',` 93 1.1 mrg movzbl binvert_limb_table(%eax), %eax C inv 8 bits 94 1.1 mrg ') 95 1.1 mrg 96 1.1 mrg leal (%eax,%eax), %edx C 2*inv 97 1.1 mrg movl %ebx, PARAM_DIVISOR C d without twos 98 1.1 mrg 99 1.1 mrg imull %eax, %eax C inv*inv 100 1.1 mrg 101 1.1 mrg movl PARAM_SRC, %esi 102 1.1 mrg movl PARAM_DST, %edi 103 1.1 mrg 104 1.1 mrg imull %ebx, %eax C inv*inv*d 105 1.1 mrg 106 1.1 mrg subl %eax, %edx C inv = 2*inv - inv*inv*d 107 1.1 mrg leal (%edx,%edx), %eax C 2*inv 108 1.1 mrg 109 1.1 mrg imull %edx, %edx C inv*inv 110 1.1 mrg 111 1.1 mrg leal (%esi,%ebp,4), %esi C src end 112 1.1 mrg leal (%edi,%ebp,4), %edi C dst end 113 1.1 mrg negl %ebp C -size 114 1.1 mrg 115 1.1 mrg imull %ebx, %edx C inv*inv*d 116 1.1 mrg 117 1.1 mrg subl %edx, %eax C inv = 2*inv - inv*inv*d 118 1.1 mrg 119 1.1 mrg ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS 120 1.1 mrg pushl %eax FRAME_pushl() 121 1.1 mrg imull PARAM_DIVISOR, %eax 122 1.1 mrg cmpl $1, %eax 123 1.1 mrg popl %eax FRAME_popl()') 124 1.1 mrg 125 1.1 mrg movl %eax, VAR_INVERSE 126 1.1 mrg movl (%esi,%ebp,4), %eax C src[0] 127 1.1 mrg 128 1.1 mrg incl %ebp 129 1.1 mrg jz L(one) 130 1.1 mrg 131 1.1 mrg movl (%esi,%ebp,4), %edx C src[1] 132 1.1 mrg 133 1.1 mrg shrdl( %cl, %edx, %eax) 134 1.1 mrg 135 1.1 mrg movl %edi, VAR_DST_END 136 1.1 mrg xorl %ebx, %ebx 137 1.1 mrg jmp L(entry) 138 1.1 mrg 139 1.1 mrg ALIGN(8) 140 1.1 mrg L(top): 141 1.1 mrg C eax q 142 1.1 mrg C ebx carry bit, 0 or 1 143 1.1 mrg C ecx shift 144 1.1 mrg C edx 145 1.1 mrg C esi src end 146 1.1 mrg C edi dst end 147 1.1 mrg C ebp counter, limbs, negative 148 1.1 mrg 149 1.1 mrg mull PARAM_DIVISOR C carry limb in edx 150 1.1 mrg 151 1.1 mrg movl -4(%esi,%ebp,4), %eax 152 1.1 mrg movl (%esi,%ebp,4), %edi 153 1.1 mrg 154 1.1 mrg shrdl( %cl, %edi, %eax) 155 1.1 mrg 156 1.1 mrg subl %ebx, %eax C apply carry bit 157 1.1 mrg setc %bl 158 1.1 mrg movl VAR_DST_END, %edi 159 1.1 mrg 160 1.1 mrg subl %edx, %eax C apply carry limb 161 1.1 mrg adcl $0, %ebx 162 1.1 mrg 163 1.1 mrg L(entry): 164 1.1 mrg imull VAR_INVERSE, %eax 165 1.1 mrg 166 1.1 mrg movl %eax, -4(%edi,%ebp,4) 167 1.1 mrg incl %ebp 168 1.1 mrg jnz L(top) 169 1.1 mrg 170 1.1 mrg 171 1.1 mrg mull PARAM_DIVISOR C carry limb in edx 172 1.1 mrg 173 1.1 mrg movl -4(%esi), %eax C src high limb 174 1.1 mrg shrl %cl, %eax 175 1.1 mrg movl SAVE_ESI, %esi 176 1.1 mrg 177 1.1 mrg subl %ebx, %eax C apply carry bit 178 1.1 mrg movl SAVE_EBX, %ebx 179 1.1 mrg movl SAVE_EBP, %ebp 180 1.1 mrg 181 1.1 mrg subl %edx, %eax C apply carry limb 182 1.1 mrg 183 1.1 mrg imull VAR_INVERSE, %eax 184 1.1 mrg 185 1.1 mrg movl %eax, -4(%edi) 186 1.1 mrg movl SAVE_EDI, %edi 187 1.1 mrg addl $STACK_SPACE, %esp 188 1.1 mrg 189 1.1 mrg ret 190 1.1 mrg 191 1.1 mrg 192 1.1 mrg L(one): 193 1.1 mrg shrl %cl, %eax 194 1.1 mrg movl SAVE_ESI, %esi 195 1.1 mrg movl SAVE_EBX, %ebx 196 1.1 mrg 197 1.1 mrg imull VAR_INVERSE, %eax 198 1.1 mrg 199 1.1 mrg movl SAVE_EBP, %ebp 200 1.1 mrg movl %eax, -4(%edi) 201 1.1 mrg 202 1.1 mrg movl SAVE_EDI, %edi 203 1.1 mrg addl $STACK_SPACE, %esp 204 1.1 mrg 205 1.1 mrg ret 206 1.1 mrg 207 1.1 mrg EPILOGUE() 208 1.1.1.2 mrg ASM_END() 209