dnl  x86 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C	cycles/limb
C P54	    30.0
C P55	    29.0
C P6	    13.0 odd divisor, 12.0 even (strangely)
C K6	    14.0
C K7	    12.0
C P4	    42.0


C mp_limb_t mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                           mp_limb_t divisor);
C
C NOTE(review): the prototype above says mp_limb_t, but nothing in this
C routine deliberately sets up a return value; at ret, eax simply holds the
C last quotient limb that was stored.  Confirm against the C declaration
C and correct this comment if the function is really void.
C
C Syntax: AT&T operand order (op src, dst), 32-bit x86, run through m4.
C
C In:     four 32-bit stack arguments at 4..16(%esp) on entry, see the
C         defframe lines below.
C Out:    size limbs written at dst.
C Saved:  ebp, edi, ebx, esi are pushed on entry and popped before ret.
C Clobb:  eax, ecx, edx, flags.  The stack slots of the src and divisor
C         arguments are overwritten (see VAR_INVERSE and the store to
C         PARAM_DIVISOR).
C
C Preconditions visible in the code:
C   divisor != 0  The strip_twos loop exits only when a set bit is shifted
C                 out, so a zero divisor never leaves it.
C   size >= 1     src[0] is read unconditionally and the limb counter
C                 counts up from -size+1 to zero.
C As the title says this is an exact division: the limbs produced are the
C quotient only when divisor divides the source operand with no remainder.
C
C Method:
C   1. Count the trailing zero bits of divisor into ecx and keep the odd
C      part d = divisor >> ecx.
C   2. Compute inv with d*inv == 1 mod 2^32: an 8-bit seed from
C      binvert_limb_table, then two steps of inv = 2*inv - inv*inv*d.
C   3. Walk the source from the low limb upwards.  Each source limb is
C      taken after a right shift by ecx across limb boundaries, the
C      pending borrow and carry limb are subtracted, and the result is
C      multiplied by inv to give a quotient limb q.  The high half of q*d
C      becomes the carry limb for the next position.
C
C The blank lines between instruction groups are kept from the original
C layout; the cycle table above suggests the order is hand scheduled, so
C instructions should not be reordered casually.

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space
dnl  (the src pointer is copied to esi before this slot is overwritten)
define(VAR_INVERSE,`PARAM_SRC')

	TEXT

	ALIGN(16)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	pushl	%ebp	FRAME_pushl()

	movl	PARAM_SIZE, %ebp
	pushl	%edi	FRAME_pushl()

	pushl	%ebx	FRAME_pushl()
	movl	$-1, %ecx		C shift count

	pushl	%esi	FRAME_pushl()

	C Shift the divisor right until a 1 bit drops into the carry flag.
	C On exit ecx = number of trailing zero bits, eax = odd part >> 1.
L(strip_twos):
	incl	%ecx

	shrl	%eax
	jnc	L(strip_twos)

	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

	C Table lookup indexed by bits 1..7 of the odd divisor.
ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C Two refinement steps, interleaved with pointer setup.
	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos

	imull	%eax, %eax		C inv*inv

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	imull	%ebx, %eax		C inv*inv*d

	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
	leal	(%edx,%edx), %eax	C 2*inv

	imull	%edx, %edx		C inv*inv

	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	imull	%ebx, %edx		C inv*inv*d

	subl	%edx, %eax		C inv = 2*inv - inv*inv*d

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C src[0]

	xorl	%ebx, %ebx		C no borrow yet
	xorl	%edx, %edx		C no carry limb yet

	incl	%ebp
	jz	L(one)			C size == 1, only the high limb to do

	movl	(%esi,%ebp,4), %edx	C src[1]

	shrdl(	%cl, %edx, %eax)	C low limb of src >> shift

	movl	VAR_INVERSE, %edx
	jmp	L(entry)


	ALIGN(8)
	nop	C k6 code alignment
	nop
L(top):
	C eax	q
	C ebx	carry bit, 0 or -1
	C ecx	shift
	C edx	carry limb
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative

	movl	-4(%esi,%ebp,4), %eax	C current source limb
	subl	%ebx, %edx		C accumulate carry bit

	movl	(%esi,%ebp,4), %ebx	C next higher source limb

	shrdl(	%cl, %ebx, %eax)	C shift in bits from the next limb

	subl	%edx, %eax		C apply carry limb
	movl	VAR_INVERSE, %edx	C movl leaves the borrow flag intact

	sbbl	%ebx, %ebx		C ebx = 0 or -1 from that borrow

L(entry):
	imull	%edx, %eax		C q = limb * inverse

	movl	%eax, -4(%edi,%ebp,4)	C store quotient limb
	movl	PARAM_DIVISOR, %edx

	mull	%edx			C edx = high half of q*d, next carry limb

	incl	%ebp
	jnz	L(top)


	movl	-4(%esi), %eax		C src high limb
L(one):
	C Top limb: nothing above it to shift in, so a plain shrl is used.
	shrl	%cl, %eax
	popl	%esi	FRAME_popl()

	addl	%ebx, %eax		C apply carry bit
	popl	%ebx	FRAME_popl()

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C store high quotient limb

	popl	%edi
	popl	%ebp

	ret

EPILOGUE()
ASM_END()