dnl  x86 mpn_bdiv_q_1 -- mpn by limb exact division.

dnl  Rearranged from mpn/x86/dive_1.asm by Marco Bodrato.

dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C     cycles/limb
C P54    30.0
C P55    29.0
C P6     13.0 odd divisor, 12.0 even (strangely)
C K6     14.0
C K7     12.0
C P4     42.0

C Syntax: AT&T operand order (source first, destination last), 32-bit x86.
C Both entry points read their arguments from the stack through the PARAM_*
C frame definitions below and preserve ebp, ebx, edi and esi by pushing them
C on entry and popping them before ret.

MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)

C Argument slots.  PARAM_SHIFT and PARAM_INVERSE are only passed to
C mpn_pi1_bdiv_q_1; mpn_bdiv_q_1 computes both values itself and keeps
C them in registers.
C NOTE(review): the numbers look like byte offsets from the return address,
C adjusted by FRAME as registers are pushed -- confirm against the defframe
C definition, which is not part of this file.
defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space
dnl  (the src pointer is copied to esi before this slot is overwritten)
define(VAR_INVERSE,`PARAM_SRC')

	TEXT

C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
C
C Stores size quotient limbs at dst.  For each limb, working from the low
C end of src upwards:
C
C	s = src limb shifted right by shift bits, with the low bits of the
C	    following limb shifted in at the top (the last limb gets zeros)
C	q = (s - carry) * inverse	(low 32 bits)
C	carry = high 32 bits of q * divisor, plus 1 if s - carry borrowed
C
C The first limb is processed with carry = 0.
C
C size must be at least 1: src[0] is loaded before size is tested.
C mpn_bdiv_q_1 below enters at L(common) with an odd divisor and an inverse
C satisfying divisor*inverse == 1 mod 2^32; direct callers are presumably
C expected to pass the same kind of values -- TODO confirm.
C NOTE(review): the prototype says mp_limb_t, and at ret eax still holds the
C last quotient limb that was stored; confirm whether any caller relies on
C the return value.

	ALIGN(16)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_SHIFT, %ecx
	pushl	%ebp	FRAME_pushl()

	movl	PARAM_INVERSE, %eax
	movl	PARAM_SIZE, %ebp
	pushl	%ebx	FRAME_pushl()

	C mpn_bdiv_q_1 jumps here with the same state as the fall-through
	C path: eax = inverse, ecx = shift, ebp = size, ebp and ebx already
	C pushed, PARAM_DIVISOR = divisor to multiply by.
L(common):
	pushl	%edi	FRAME_pushl()
	pushl	%esi	FRAME_pushl()

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	C Point both pointers one past the last limb and count a negative
	C index up towards zero, so the loop needs no separate compare.
	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	movl	%eax, VAR_INVERSE	C park inverse in the old src slot
	movl	(%esi,%ebp,4), %eax	C src[0]

	xorl	%ebx, %ebx		C carry bit = 0
	xorl	%edx, %edx		C carry limb = 0

	incl	%ebp
	jz	L(one)			C size == 1, only the final step

	movl	(%esi,%ebp,4), %edx	C src[1]

	shrdl(	%cl, %edx, %eax)	C low limb of src >> shift

	movl	VAR_INVERSE, %edx
	jmp	L(entry)		C first limb has no carry to apply


	ALIGN(8)
	nop	C k6 code alignment
	nop
L(top):
	C eax	q
	C ebx	carry bit, 0 or -1
	C ecx	shift
	C edx	carry limb
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative

	movl	-4(%esi,%ebp,4), %eax	C current src limb
	subl	%ebx, %edx		C accumulate carry bit

	movl	(%esi,%ebp,4), %ebx	C next src limb, supplies high bits

	shrdl(	%cl, %ebx, %eax)

	subl	%edx, %eax		C apply carry limb
	movl	VAR_INVERSE, %edx	C movl leaves the borrow flag intact

	sbbl	%ebx, %ebx		C ebx = -borrow, the new carry bit

L(entry):
	imull	%edx, %eax		C q = limb * inverse, low 32 bits

	movl	%eax, -4(%edi,%ebp,4)	C store quotient limb
	movl	PARAM_DIVISOR, %edx

	mull	%edx			C edx = high limb of q * divisor

	incl	%ebp
	jnz	L(top)


	C Final limb: there is no higher limb to shift in, so a plain
	C shift is used in place of shrdl.
	movl	-4(%esi), %eax		C src high limb
L(one):
	shrl	%cl, %eax
	popl	%esi	FRAME_popl()

	addl	%ebx, %eax		C apply carry bit

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C store the top quotient limb

	popl	%edi
	popl	%ebx
	popl	%ebp

	ret

EPILOGUE()

C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                           mp_limb_t divisor);
C
C Front end for the loop above.  It counts the trailing zero bits of divisor
C into ecx (the shift), writes the remaining odd part d back to
C PARAM_DIVISOR, computes inv with d*inv == 1 mod 2^32 in eax, and jumps to
C L(common) in mpn_pi1_bdiv_q_1.
C
C divisor must be non-zero: the strip loop only exits when a 1 bit is
C shifted out, which never happens for zero.

	ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	pushl	%ebp	FRAME_pushl()

	movl	$-1, %ecx		C shift count
	movl	PARAM_SIZE, %ebp

	pushl	%ebx	FRAME_pushl()

	C Shift right until a 1 bit falls out into the carry flag.  ecx then
	C holds the number of zero bits that were below it, and eax holds the
	C odd part of the divisor with its lowest bit dropped.
L(strip_twos):
	incl	%ecx

	shrl	%eax
	jnc	L(strip_twos)

	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C Two rounds of inv = 2*inv - inv*inv*d refine the 8-bit table value;
	C the result is expected to satisfy d*inv == 1 mod 2^32, which the
	C ASSERT below checks.  The second round swaps the roles of eax and
	C edx so that the final inverse lands in eax, where L(common)
	C expects it.
	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos
	imull	%eax, %eax		C inv*inv
	imull	%ebx, %eax		C inv*inv*d
	subl	%eax, %edx		C inv = 2*inv - inv*inv*d

	leal	(%edx,%edx), %eax	C 2*inv
	imull	%edx, %edx		C inv*inv
	imull	%ebx, %edx		C inv*inv*d
	subl	%edx, %eax		C inv = 2*inv - inv*inv*d

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	jmp	L(common)
EPILOGUE()
ASM_END()