1 1.1 mrg dnl Intel P5 mpn_sqr_basecase -- square an mpn number. 2 1.1 mrg 3 1.1.1.2 mrg dnl Copyright 1999-2002 Free Software Foundation, Inc. 4 1.1.1.2 mrg 5 1.1 mrg dnl This file is part of the GNU MP Library. 6 1.1 mrg dnl 7 1.1.1.2 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 1.1.1.2 mrg dnl it under the terms of either: 9 1.1.1.2 mrg dnl 10 1.1.1.2 mrg dnl * the GNU Lesser General Public License as published by the Free 11 1.1.1.2 mrg dnl Software Foundation; either version 3 of the License, or (at your 12 1.1.1.2 mrg dnl option) any later version. 13 1.1.1.2 mrg dnl 14 1.1.1.2 mrg dnl or 15 1.1.1.2 mrg dnl 16 1.1.1.2 mrg dnl * the GNU General Public License as published by the Free Software 17 1.1.1.2 mrg dnl Foundation; either version 2 of the License, or (at your option) any 18 1.1.1.2 mrg dnl later version. 19 1.1.1.2 mrg dnl 20 1.1.1.2 mrg dnl or both in parallel, as here. 21 1.1 mrg dnl 22 1.1.1.2 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 1.1.1.2 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 1.1.1.2 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 1.1.1.2 mrg dnl for more details. 26 1.1 mrg dnl 27 1.1.1.2 mrg dnl You should have received copies of the GNU General Public License and the 28 1.1.1.2 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 1.1.1.2 mrg dnl see https://www.gnu.org/licenses/. 30 1.1 mrg 31 1.1 mrg include(`../config.m4') 32 1.1 mrg 33 1.1 mrg 34 1.1 mrg C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular 35 1.1 mrg C product at around 20x20 limbs. 36 1.1 mrg 37 1.1 mrg 38 1.1 mrg C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); 39 1.1 mrg C 40 1.1 mrg C Calculate src,size squared, storing the result in dst,2*size. 41 1.1 mrg C 42 1.1 mrg C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a 43 1.1 mrg C lot of function call overheads are avoided, especially when the size is 44 1.1 mrg C small. 45 1.1 mrg 46 1.1 mrg defframe(PARAM_SIZE,12) 47 1.1 mrg defframe(PARAM_SRC, 8) 48 1.1 mrg defframe(PARAM_DST, 4) 49 1.1 mrg 50 1.1 mrg TEXT 51 1.1 mrg ALIGN(8) 52 1.1 mrg PROLOGUE(mpn_sqr_basecase) 53 1.1 mrg deflit(`FRAME',0) 54 1.1 mrg 55 1.1 mrg movl PARAM_SIZE, %edx 56 1.1 mrg movl PARAM_SRC, %eax 57 1.1 mrg 58 1.1 mrg cmpl $2, %edx 59 1.1 mrg movl PARAM_DST, %ecx 60 1.1 mrg 61 1.1 mrg je L(two_limbs) 62 1.1 mrg 63 1.1 mrg movl (%eax), %eax 64 1.1 mrg ja L(three_or_more) 65 1.1 mrg 66 1.1 mrg C ----------------------------------------------------------------------------- 67 1.1 mrg C one limb only 68 1.1 mrg C eax src 69 1.1 mrg C ebx 70 1.1 mrg C ecx dst 71 1.1 mrg C edx 72 1.1 mrg 73 1.1 mrg mull %eax 74 1.1 mrg 75 1.1 mrg movl %eax, (%ecx) 76 1.1 mrg movl %edx, 4(%ecx) 77 1.1 mrg 78 1.1 mrg ret 79 1.1 mrg 80 1.1 mrg C ----------------------------------------------------------------------------- 81 1.1 mrg ALIGN(8) 82 1.1 mrg L(two_limbs): 83 1.1 mrg C eax src 84 1.1 mrg C ebx 85 1.1 mrg C ecx dst 86 1.1 mrg C edx size 87 1.1 mrg 88 1.1 mrg pushl %ebp 89 1.1 mrg pushl %edi 90 1.1 mrg 91 1.1 mrg pushl %esi 92 1.1 mrg pushl %ebx 93 1.1 mrg 94 1.1 mrg movl %eax, %ebx 95 1.1 mrg movl (%eax), %eax 96 1.1 mrg 97 1.1 mrg mull %eax C src[0]^2 98 1.1 mrg 99 1.1 mrg movl %eax, (%ecx) C dst[0] 100 1.1 mrg movl %edx, %esi C dst[1] 101 1.1 mrg 102 1.1 mrg movl 4(%ebx), %eax 103 1.1 mrg 104 1.1 mrg mull %eax C src[1]^2 105 1.1 mrg 106 1.1 mrg movl %eax, %edi C dst[2] 107 1.1 mrg movl %edx, %ebp C dst[3] 108 1.1 mrg 109 1.1 mrg movl (%ebx), %eax 110 1.1 mrg 111 1.1 mrg mull 4(%ebx) C src[0]*src[1] 112 1.1 mrg 113 1.1 mrg addl %eax, %esi 114 1.1 mrg popl %ebx 115 1.1 mrg 116 1.1 mrg adcl %edx, %edi 117 1.1 mrg 118 1.1 mrg adcl $0, %ebp 119 1.1 mrg addl %esi, %eax 120 1.1 mrg 121 1.1 mrg adcl %edi, %edx 122 1.1 mrg movl %eax, 4(%ecx) 123 1.1 mrg 124 1.1 mrg adcl $0, %ebp 125 1.1 mrg popl %esi 126 1.1 mrg 127 1.1 mrg movl %edx, 8(%ecx) 128 1.1 mrg movl %ebp, 12(%ecx) 129 1.1 mrg 130 1.1 mrg popl %edi 131 1.1 mrg popl %ebp 132 1.1 mrg 133 1.1 mrg ret 134 1.1 mrg 135 1.1 mrg 136 1.1 mrg C ----------------------------------------------------------------------------- 137 1.1 mrg ALIGN(8) 138 1.1 mrg L(three_or_more): 139 1.1 mrg C eax src low limb 140 1.1 mrg C ebx 141 1.1 mrg C ecx dst 142 1.1 mrg C edx size 143 1.1 mrg 144 1.1 mrg cmpl $4, %edx 145 1.1 mrg pushl %ebx 146 1.1 mrg deflit(`FRAME',4) 147 1.1 mrg 148 1.1 mrg movl PARAM_SRC, %ebx 149 1.1 mrg jae L(four_or_more) 150 1.1 mrg 151 1.1 mrg 152 1.1 mrg C ----------------------------------------------------------------------------- 153 1.1 mrg C three limbs 154 1.1 mrg C eax src low limb 155 1.1 mrg C ebx src 156 1.1 mrg C ecx dst 157 1.1 mrg C edx size 158 1.1 mrg 159 1.1 mrg pushl %ebp 160 1.1 mrg pushl %edi 161 1.1 mrg 162 1.1 mrg mull %eax C src[0] ^ 2 163 1.1 mrg 164 1.1 mrg movl %eax, (%ecx) 165 1.1 mrg movl %edx, 4(%ecx) 166 1.1 mrg 167 1.1 mrg movl 4(%ebx), %eax 168 1.1 mrg xorl %ebp, %ebp 169 1.1 mrg 170 1.1 mrg mull %eax C src[1] ^ 2 171 1.1 mrg 172 1.1 mrg movl %eax, 8(%ecx) 173 1.1 mrg movl %edx, 12(%ecx) 174 1.1 mrg 175 1.1 mrg movl 8(%ebx), %eax 176 1.1 mrg pushl %esi C risk of cache bank clash 177 1.1 mrg 178 1.1 mrg mull %eax C src[2] ^ 2 179 1.1 mrg 180 1.1 mrg movl %eax, 16(%ecx) 181 1.1 mrg movl %edx, 20(%ecx) 182 1.1 mrg 183 1.1 mrg movl (%ebx), %eax 184 1.1 mrg 185 1.1 mrg mull 4(%ebx) C src[0] * src[1] 186 1.1 mrg 187 1.1 mrg movl %eax, %esi 188 1.1 mrg movl %edx, %edi 189 1.1 mrg 190 1.1 mrg movl (%ebx), %eax 191 1.1 mrg 192 1.1 mrg mull 8(%ebx) C src[0] * src[2] 193 1.1 mrg 194 1.1 mrg addl %eax, %edi 195 1.1 mrg movl %edx, %ebp 196 1.1 mrg 197 1.1 mrg adcl $0, %ebp 198 1.1 mrg movl 4(%ebx), %eax 199 1.1 mrg 200 1.1 mrg mull 8(%ebx) C src[1] * src[2] 201 1.1 mrg 202 1.1 mrg xorl %ebx, %ebx 203 1.1 mrg addl %eax, %ebp 204 1.1 mrg 205 1.1 mrg C eax 206 1.1 mrg C ebx zero, will be dst[5] 207 1.1 mrg C ecx dst 208 1.1 mrg C edx dst[4] 209 1.1 mrg C esi dst[1] 210 1.1 mrg C edi dst[2] 211 1.1 mrg C ebp dst[3] 212 1.1 mrg 213 1.1 mrg adcl $0, %edx 214 1.1 mrg addl %esi, %esi 215 1.1 mrg 216 1.1 mrg adcl %edi, %edi 217 1.1 mrg 218 1.1 mrg adcl %ebp, %ebp 219 1.1 mrg 220 1.1 mrg adcl %edx, %edx 221 1.1 mrg movl 4(%ecx), %eax 222 1.1 mrg 223 1.1 mrg adcl $0, %ebx 224 1.1 mrg addl %esi, %eax 225 1.1 mrg 226 1.1 mrg movl %eax, 4(%ecx) 227 1.1 mrg movl 8(%ecx), %eax 228 1.1 mrg 229 1.1 mrg adcl %edi, %eax 230 1.1 mrg movl 12(%ecx), %esi 231 1.1 mrg 232 1.1 mrg adcl %ebp, %esi 233 1.1 mrg movl 16(%ecx), %edi 234 1.1 mrg 235 1.1 mrg movl %eax, 8(%ecx) 236 1.1 mrg movl %esi, 12(%ecx) 237 1.1 mrg 238 1.1 mrg adcl %edx, %edi 239 1.1 mrg popl %esi 240 1.1 mrg 241 1.1 mrg movl 20(%ecx), %eax 242 1.1 mrg movl %edi, 16(%ecx) 243 1.1 mrg 244 1.1 mrg popl %edi 245 1.1 mrg popl %ebp 246 1.1 mrg 247 1.1 mrg adcl %ebx, %eax C no carry out of this 248 1.1 mrg popl %ebx 249 1.1 mrg 250 1.1 mrg movl %eax, 20(%ecx) 251 1.1 mrg 252 1.1 mrg ret 253 1.1 mrg 254 1.1 mrg 255 1.1 mrg C ----------------------------------------------------------------------------- 256 1.1 mrg ALIGN(8) 257 1.1 mrg L(four_or_more): 258 1.1 mrg C eax src low limb 259 1.1 mrg C ebx src 260 1.1 mrg C ecx dst 261 1.1 mrg C edx size 262 1.1 mrg C esi 263 1.1 mrg C edi 264 1.1 mrg C ebp 265 1.1 mrg C 266 1.1 mrg C First multiply src[0]*src[1..size-1] and store at dst[1..size]. 267 1.1 mrg 268 1.1 mrg deflit(`FRAME',4) 269 1.1 mrg 270 1.1 mrg pushl %edi 271 1.1 mrg FRAME_pushl() 272 1.1 mrg pushl %esi 273 1.1 mrg FRAME_pushl() 274 1.1 mrg 275 1.1 mrg pushl %ebp 276 1.1 mrg FRAME_pushl() 277 1.1 mrg leal (%ecx,%edx,4), %edi C dst end of this mul1 278 1.1 mrg 279 1.1 mrg leal (%ebx,%edx,4), %esi C src end 280 1.1 mrg movl %ebx, %ebp C src 281 1.1 mrg 282 1.1 mrg negl %edx C -size 283 1.1 mrg xorl %ebx, %ebx C clear carry limb and carry flag 284 1.1 mrg 285 1.1 mrg leal 1(%edx), %ecx C -(size-1) 286 1.1 mrg 287 1.1 mrg L(mul1): 288 1.1 mrg C eax scratch 289 1.1 mrg C ebx carry 290 1.1 mrg C ecx counter, negative 291 1.1 mrg C edx scratch 292 1.1 mrg C esi &src[size] 293 1.1 mrg C edi &dst[size] 294 1.1 mrg C ebp src 295 1.1 mrg 296 1.1 mrg adcl $0, %ebx 297 1.1 mrg movl (%esi,%ecx,4), %eax 298 1.1 mrg 299 1.1 mrg mull (%ebp) 300 1.1 mrg 301 1.1 mrg addl %eax, %ebx 302 1.1 mrg 303 1.1 mrg movl %ebx, (%edi,%ecx,4) 304 1.1 mrg incl %ecx 305 1.1 mrg 306 1.1 mrg movl %edx, %ebx 307 1.1 mrg jnz L(mul1) 308 1.1 mrg 309 1.1 mrg 310 1.1 mrg C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for 311 1.1 mrg C n=1..size-2. 312 1.1 mrg C 313 1.1 mrg C The last two products, which are the end corner of the product 314 1.1 mrg C triangle, are handled separately to save looping overhead. These 315 1.1 mrg C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. 316 1.1 mrg C If size is 4 then it's only these that need to be done. 317 1.1 mrg C 318 1.1 mrg C In the outer loop %esi is a constant, and %edi just advances by 1 319 1.1 mrg C limb each time. The size of the operation decreases by 1 limb 320 1.1 mrg C each time. 321 1.1 mrg 322 1.1 mrg C eax 323 1.1 mrg C ebx carry (needing carry flag added) 324 1.1 mrg C ecx 325 1.1 mrg C edx 326 1.1 mrg C esi &src[size] 327 1.1 mrg C edi &dst[size] 328 1.1 mrg C ebp 329 1.1 mrg 330 1.1 mrg adcl $0, %ebx 331 1.1 mrg movl PARAM_SIZE, %edx 332 1.1 mrg 333 1.1 mrg movl %ebx, (%edi) 334 1.1 mrg subl $4, %edx 335 1.1 mrg 336 1.1 mrg negl %edx 337 1.1 mrg jz L(corner) 338 1.1 mrg 339 1.1 mrg 340 1.1 mrg L(outer): 341 1.1 mrg C ebx previous carry limb to store 342 1.1 mrg C edx outer loop counter (negative) 343 1.1 mrg C esi &src[size] 344 1.1 mrg C edi dst, pointing at stored carry limb of previous loop 345 1.1 mrg 346 1.1 mrg pushl %edx C new outer loop counter 347 1.1 mrg leal -2(%edx), %ecx 348 1.1 mrg 349 1.1 mrg movl %ebx, (%edi) 350 1.1 mrg addl $4, %edi 351 1.1 mrg 352 1.1 mrg addl $4, %ebp 353 1.1 mrg xorl %ebx, %ebx C initial carry limb, clear carry flag 354 1.1 mrg 355 1.1 mrg L(inner): 356 1.1 mrg C eax scratch 357 1.1 mrg C ebx carry (needing carry flag added) 358 1.1 mrg C ecx counter, negative 359 1.1 mrg C edx scratch 360 1.1 mrg C esi &src[size] 361 1.1 mrg C edi dst end of this addmul 362 1.1 mrg C ebp &src[j] 363 1.1 mrg 364 1.1 mrg adcl $0, %ebx 365 1.1 mrg movl (%esi,%ecx,4), %eax 366 1.1 mrg 367 1.1 mrg mull (%ebp) 368 1.1 mrg 369 1.1 mrg addl %ebx, %eax 370 1.1 mrg movl (%edi,%ecx,4), %ebx 371 1.1 mrg 372 1.1 mrg adcl $0, %edx 373 1.1 mrg addl %eax, %ebx 374 1.1 mrg 375 1.1 mrg movl %ebx, (%edi,%ecx,4) 376 1.1 mrg incl %ecx 377 1.1 mrg 378 1.1 mrg movl %edx, %ebx 379 1.1 mrg jnz L(inner) 380 1.1 mrg 381 1.1 mrg 382 1.1 mrg adcl $0, %ebx 383 1.1 mrg popl %edx C outer loop counter 384 1.1 mrg 385 1.1 mrg incl %edx 386 1.1 mrg jnz L(outer) 387 1.1 mrg 388 1.1 mrg 389 1.1 mrg movl %ebx, (%edi) 390 1.1 mrg 391 1.1 mrg L(corner): 392 1.1 mrg C esi &src[size] 393 1.1 mrg C edi &dst[2*size-4] 394 1.1 mrg 395 1.1 mrg movl -8(%esi), %eax 396 1.1 mrg movl -4(%edi), %ebx C risk of data cache bank clash here 397 1.1 mrg 398 1.1 mrg mull -12(%esi) C src[size-2]*src[size-3] 399 1.1 mrg 400 1.1 mrg addl %eax, %ebx 401 1.1 mrg movl %edx, %ecx 402 1.1 mrg 403 1.1 mrg adcl $0, %ecx 404 1.1 mrg movl -4(%esi), %eax 405 1.1 mrg 406 1.1 mrg mull -12(%esi) C src[size-1]*src[size-3] 407 1.1 mrg 408 1.1 mrg addl %ecx, %eax 409 1.1 mrg movl (%edi), %ecx 410 1.1 mrg 411 1.1 mrg adcl $0, %edx 412 1.1 mrg movl %ebx, -4(%edi) 413 1.1 mrg 414 1.1 mrg addl %eax, %ecx 415 1.1 mrg movl %edx, %ebx 416 1.1 mrg 417 1.1 mrg adcl $0, %ebx 418 1.1 mrg movl -4(%esi), %eax 419 1.1 mrg 420 1.1 mrg mull -8(%esi) C src[size-1]*src[size-2] 421 1.1 mrg 422 1.1 mrg movl %ecx, (%edi) 423 1.1 mrg addl %eax, %ebx 424 1.1 mrg 425 1.1 mrg adcl $0, %edx 426 1.1 mrg movl PARAM_SIZE, %eax 427 1.1 mrg 428 1.1 mrg negl %eax 429 1.1 mrg movl %ebx, 4(%edi) 430 1.1 mrg 431 1.1 mrg addl $1, %eax C -(size-1) and clear carry 432 1.1 mrg movl %edx, 8(%edi) 433 1.1 mrg 434 1.1 mrg 435 1.1 mrg C ----------------------------------------------------------------------------- 436 1.1 mrg C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 437 1.1 mrg 438 1.1 mrg L(lshift): 439 1.1 mrg C eax counter, negative 440 1.1 mrg C ebx next limb 441 1.1 mrg C ecx 442 1.1 mrg C edx 443 1.1 mrg C esi 444 1.1 mrg C edi &dst[2*size-4] 445 1.1 mrg C ebp 446 1.1 mrg 447 1.1 mrg movl 12(%edi,%eax,8), %ebx 448 1.1 mrg 449 1.1 mrg rcll %ebx 450 1.1 mrg movl 16(%edi,%eax,8), %ecx 451 1.1 mrg 452 1.1 mrg rcll %ecx 453 1.1 mrg movl %ebx, 12(%edi,%eax,8) 454 1.1 mrg 455 1.1 mrg movl %ecx, 16(%edi,%eax,8) 456 1.1 mrg incl %eax 457 1.1 mrg 458 1.1 mrg jnz L(lshift) 459 1.1 mrg 460 1.1 mrg 461 1.1 mrg adcl %eax, %eax C high bit out 462 1.1 mrg movl PARAM_SRC, %esi 463 1.1 mrg 464 1.1 mrg movl PARAM_SIZE, %ecx C risk of cache bank clash 465 1.1 mrg movl %eax, 12(%edi) C dst most significant limb 466 1.1 mrg 467 1.1 mrg 468 1.1 mrg C ----------------------------------------------------------------------------- 469 1.1 mrg C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., 470 1.1 mrg C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the 471 1.1 mrg C low limb of src[0]^2. 472 1.1 mrg 473 1.1 mrg movl (%esi), %eax C src[0] 474 1.1 mrg leal (%esi,%ecx,4), %esi C src end 475 1.1 mrg 476 1.1 mrg negl %ecx 477 1.1 mrg 478 1.1 mrg mull %eax 479 1.1 mrg 480 1.1 mrg movl %eax, 16(%edi,%ecx,8) C dst[0] 481 1.1 mrg movl %edx, %ebx 482 1.1 mrg 483 1.1 mrg addl $1, %ecx C size-1 and clear carry 484 1.1 mrg 485 1.1 mrg L(diag): 486 1.1 mrg C eax scratch (low product) 487 1.1 mrg C ebx carry limb 488 1.1 mrg C ecx counter, negative 489 1.1 mrg C edx scratch (high product) 490 1.1 mrg C esi &src[size] 491 1.1 mrg C edi &dst[2*size-4] 492 1.1 mrg C ebp scratch (fetched dst limbs) 493 1.1 mrg 494 1.1 mrg movl (%esi,%ecx,4), %eax 495 1.1 mrg adcl $0, %ebx 496 1.1 mrg 497 1.1 mrg mull %eax 498 1.1 mrg 499 1.1 mrg movl 16-4(%edi,%ecx,8), %ebp 500 1.1 mrg 501 1.1 mrg addl %ebp, %ebx 502 1.1 mrg movl 16(%edi,%ecx,8), %ebp 503 1.1 mrg 504 1.1 mrg adcl %eax, %ebp 505 1.1 mrg movl %ebx, 16-4(%edi,%ecx,8) 506 1.1 mrg 507 1.1 mrg movl %ebp, 16(%edi,%ecx,8) 508 1.1 mrg incl %ecx 509 1.1 mrg 510 1.1 mrg movl %edx, %ebx 511 1.1 mrg jnz L(diag) 512 1.1 mrg 513 1.1 mrg 514 1.1 mrg adcl $0, %edx 515 1.1 mrg movl 16-4(%edi), %eax C dst most significant limb 516 1.1 mrg 517 1.1 mrg addl %eax, %edx 518 1.1 mrg popl %ebp 519 1.1 mrg 520 1.1 mrg movl %edx, 16-4(%edi) 521 1.1 mrg popl %esi C risk of cache bank clash 522 1.1 mrg 523 1.1 mrg popl %edi 524 1.1 mrg popl %ebx 525 1.1 mrg 526 1.1 mrg ret 527 1.1 mrg 528 1.1 mrg EPILOGUE() 529