dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.

dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C			    cycles/limb
C P5
C P6 model 0-8,10-12		 6.44
C P6 model 9  (Banias)		 6.15
C P6 model 13 (Dothan)		 6.11
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C AMD K6
C AMD K7
C AMD K8


dnl  P6 UNROLL_COUNT cycles/limb
dnl          8           6.7
dnl         16           6.35
dnl         32           6.3
dnl         64           6.3
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


dnl  Select the instruction, the two entry point names and the wording used
dnl  in the comments below, according to which operation is being built.
dnl  Building with neither OPERATION_ symbol defined is reported as an error.

ifdef(`OPERATION_addmul_1', `
	define(M4_inst,        addl)
	define(M4_function_1,  mpn_addmul_1)
	define(M4_function_1c, mpn_addmul_1c)
	define(M4_description, add it to)
	define(M4_desc_retval, carry)
',`ifdef(`OPERATION_submul_1', `
	define(M4_inst,        subl)
	define(M4_function_1,  mpn_submul_1)
	define(M4_function_1c, mpn_submul_1c)
	define(M4_description, subtract it from)
	define(M4_desc_retval, borrow)
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
')')')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)


C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                            mp_limb_t mult);
C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                             mp_limb_t mult, mp_limb_t carry);
C
C Calculate src,size multiplied by mult and M4_description dst,size.
C Return the M4_desc_retval limb from the top of the result.
C
C Arguments are read from the stack through the PARAM_ names set up by the
C defframe lines below.  The code pushes ebx, esi, edi and ebp on entry and
C pops all four before returning; eax, ecx and edx are used as scratch.  The
C return value is left in eax.
C
C NOTE(review): the simple loop handles one limb before it tests the
C counter, so size is presumably required to be at least 1 -- confirm
C against the callers.
C
C This code is pretty much the same as the K6 code.  The unrolled loop is
C the same, but there's just a few scheduling tweaks in the setups and the
C simple loop.
C
C A number of variations have been tried for the unrolled loop, with one or
C two carries, and with loads scheduled earlier, but nothing faster than 6
C cycles/limb has been found.

dnl  Sizes at or above UNROLL_THRESHOLD use the unrolled loop, smaller sizes
dnl  use the simple loop.  Both branches currently define the same value.

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

	TEXT
	ALIGN(32)

C Entry point taking an explicit initial carry: load it into ebx and join
C the common code at L(start_nc).

PROLOGUE(M4_function_1c)
	pushl	%ebx
deflit(`FRAME',4)
	movl	PARAM_CARRY, %ebx
	jmp	L(start_nc)
EPILOGUE()

C Entry point without a carry argument: start from a zero carry in ebx.
C Each deflit of FRAME records how many bytes have been pushed so far, so
C the PARAM_ names keep referring to the right stack slots.

PROLOGUE(M4_function_1)
	pushl	%ebx		C pushl, matching every other push in this file
deflit(`FRAME',4)
	xorl	%ebx, %ebx	C initial carry

L(start_nc):
	movl	PARAM_SIZE, %ecx
	pushl	%esi
deflit(`FRAME',8)

	movl	PARAM_SRC, %esi
	pushl	%edi
deflit(`FRAME',12)

	movl	PARAM_DST, %edi
	pushl	%ebp
deflit(`FRAME',16)
	cmpl	$UNROLL_THRESHOLD, %ecx

	C movl leaves the flags alone, so jae still acts on the cmpl above
	movl	PARAM_MULTIPLIER, %ebp
	jae	L(unroll)


	C simple loop
	C this is offset 0x22, so close enough to aligned
L(simple):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier
	C
	C One limb per pass: edx:eax = src limb * multiplier, plus the carry
	C limb in ebx.  The low half is applied to dst with M4_inst, and the
	C high half plus the carry or borrow from M4_inst becomes the next ebx.

	movl	(%esi), %eax
	addl	$4, %edi

	mull	%ebp

	addl	%ebx, %eax
	adcl	$0, %edx

	M4_inst	%eax, -4(%edi)
	movl	%edx, %ebx

	adcl	$0, %ebx
	decl	%ecx

	C leal advances src without touching the flags, so jnz tests the
	C result of the decl
	leal	4(%esi), %esi
	jnz	L(simple)


	popl	%ebp
	popl	%edi

	popl	%esi
	movl	%ebx, %eax	C return the final carry limb

	popl	%ebx
	ret



C------------------------------------------------------------------------------
C VAR_JUMP holds the computed jump temporarily because there's not enough
C registers when doing the mul for the initial two carry limbs.
C
C The add/adc for the initial carry in %ebx is necessary only for the
C mpn_add/submul_1c entry points.  Duplicating the startup code to
C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
C idea.

dnl  overlapping with parameters already fetched
define(VAR_COUNTER,`PARAM_SIZE')
define(VAR_JUMP,   `PARAM_DST')

	C this is offset 0x43, so close enough to aligned
L(unroll):
	C eax
	C ebx	initial carry
	C ecx	size
	C edx
	C esi	src
	C edi	dst
	C ebp
	C
	C VAR_COUNTER = (size-2) >> UNROLL_LOG2 is the count consumed by the
	C decl/jns at the bottom of the unrolled loop.
	C ecx = (-(size-1)) & UNROLL_MASK, called n below, is the number of
	C limb slots skipped on the first pass through the unrolled code.

	movl	%ecx, %edx
	decl	%ecx

	subl	$2, %edx
	negl	%ecx

	shrl	$UNROLL_LOG2, %edx
	andl	$UNROLL_MASK, %ecx

	movl	%edx, VAR_COUNTER
	movl	%ecx, %edx

	C 15 code bytes per limb
	C
	C Either branch leaves edx = L(entry) + 16*n - n and ecx = -n.
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	shll	$4, %edx
	negl	%ecx

	leal	L(entry) (%edx,%ecx,1), %edx
')
	movl	(%esi), %eax		C src low limb

	C edx is about to be overwritten by mull, so park the jump target.
	C src moves to 4 - 4*n bytes from its start, with an extra 128 when
	C UNROLL_BYTES is 256; disp0 in the loop subtracts the same 128.
	movl	%edx, VAR_JUMP
	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi

	mull	%ebp

	addl	%ebx, %eax	C initial carry (from _1c)
	adcl	$0, %edx

	movl	%edx, %ebx	C high carry
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi

	C movl leaves the flags alone, so both cmovnz act on the testl.
	C When n is odd the entry point lands on the second limb of a
	C chunk, which expects the low carry in ebx and the high in ecx.
	movl	VAR_JUMP, %edx
	testl	$1, %ecx
	movl	%eax, %ecx	C low carry

	cmovnz(	%ebx, %ecx)	C high,low carry other way around
	cmovnz(	%eax, %ebx)

	jmp	*%edx


C PIC variant of the jump calculation.  The call from L(unroll) leaves the
C address of L(here) on the stack; adding it to the link-time constant
C L(entry)-L(here) gives the run-time address of L(entry).

ifdef(`PIC',`
L(pic_calc):
	shll	$4, %edx
	negl	%ecx

	C See mpn/x86/README about old gas bugs
	leal	(%edx,%ecx,1), %edx
	addl	$L(entry)-L(here), %edx

	addl	(%esp), %edx

	ret_internal
')


C -----------------------------------------------------------
	ALIGN(32)
L(top):
deflit(`FRAME',16)
	C eax	scratch
	C ebx	carry hi
	C ecx	carry lo
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier
	C
	C VAR_COUNTER	loop counter
	C
	C 15 code bytes per limb

	addl	$UNROLL_BYTES, %edi

L(entry):
deflit(CHUNK_COUNT,2)

C Each chunk handles two limbs and swaps the roles of ecx and ebx between
C them: the pending low carry is applied to dst with M4_inst, then the carry
C or borrow from that is folded into the new product with the adcl pair.
C Zdisp is used on the disp0 accesses; presumably this keeps their encoding
C the same length when disp0 is zero -- see the 15 bytes per limb note above.

forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	mull	%ebp
Zdisp(	M4_inst,%ecx, disp0,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx

	movl	disp1(%esi), %eax
	mull	%ebp
	M4_inst	%ebx, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
')

	C leal advances src without touching the flags, so jns tests the
	C sign of the decremented VAR_COUNTER
	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%esi), %esi

	jns	L(top)


deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))

	C Apply the last pending low carry to dst.  The movl and popl
	C instructions below do not change the flags, so the carry or borrow
	C from this M4_inst is still there for the final adcl.
	M4_inst	%ecx, disp0(%edi)
	movl	%ebx, %eax

	popl	%ebp
	popl	%edi

	popl	%esi
	popl	%ebx
	adcl	$0, %eax

	ret

EPILOGUE()