dnl  AMD K7 mpn_lshift -- mpn left shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C      K7: 1.21 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.
deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  The bits shifted out at the left are
C the return value.
C
C The comments in mpn_rshift apply here too.

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	movl	(%edx), %edx

	shldl(	%cl, %edx, %eax)	C eax was decremented to zero

	shll	%cl, %edx

	movl	%edx, (%edi)
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
	C eax	size-1
	C ebx
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp

	movd	PARAM_SHIFT, %mm6
	movd	(%edx,%eax,4), %mm5	C src high limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	negl	%ecx
	movd	(%edx), %mm4		C src low limb

	addl	$32, %ecx

	movd	%ecx, %mm7

L(simple_top):
	C eax	loop counter, limbs
	C ebx
	C ecx
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm4	src low limb
	C mm5	src high limb
	C mm6	shift
	C mm7	32-shift

	movq	-4(%edx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%edi,%eax,4)
	jnz	L(simple_top)


	psllq	%mm6, %mm5
	psllq	%mm6, %mm4

	psrlq	$32, %mm5
	movd	%mm4, (%edi)		C dst low limb

	movd	%mm5, %eax		C return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx	(saved)
	C ecx	shift
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C mm5	src high limb, for return value
	C mm6	lshift

	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX
	leal	-4(%edx,%eax,4), %edx   C &src[size-2]

	testb	$4, %dl
	movq	(%edx), %mm1		C src high qword

	jz	L(start_src_aligned)


	C src isn't aligned, process high limb (marked xxx) separately to
	C make it so
	C
	C  source     -4(edx,%eax,4)
	C                   |
	C  +-------+-------+-------+--
	C  |  xxx          |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest       -4(edi,%eax,4)
	C                   |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--

	psllq	%mm6, %mm1
	subl	$4, %edx
	movl	%eax, PARAM_SIZE	C size-1

	psrlq	$32, %mm1
	decl	%eax			C size-2 is new size-1

	movd	%mm1, 4(%edi,%eax,4)
	movq	(%edx), %mm1		C new src high qword
L(start_src_aligned):


	leal	-4(%edi,%eax,4), %edi   C &dst[size-2]
	psllq	%mm6, %mm5

	testl	$4, %edi
	psrlq	$32, %mm5		C return value

	jz	L(start_dst_aligned)


	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
	C here separately.
	C
	C  source        %edx
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C        0mod8   4mod8
	C
	C  dest          %edi
	C  +-------+-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	psllq	%mm6, %mm1
	addl	$32, %ecx		C shift+32

	psrlq	$32, %mm1

	movd	%mm1, 4(%edi)
	movq	%mm0, %mm1
	subl	$4, %edi

	movd	%ecx, %mm6		C new lshift
L(start_dst_aligned):

	decl	%eax			C size-2, two last limbs handled at end
	movq	%mm1, %mm2		C copy of src high qword
	negl	%ecx

	andl	$-2, %eax		C round size down to even
	addl	$64, %ecx

	movl	%eax, %ebx
	negl	%eax

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax

	movd	%ecx, %mm7		C rshift = 64-lshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
')
	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end
	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi

	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	size (for use at end)
	C ebx	loop counter
	C ecx	rshift
	C edx	src
	C esi	computed jump
	C edi	dst
	C ebp
	C
	C mm0	scratch
	C mm1	\ carry (alternating, mm2 first)
	C mm2	/
	C mm6	lshift
	C mm7	rshift
	C
	C 10 code bytes/limb
	C
	C The two chunks differ in whether mm1 or mm2 hold the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 - 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
Zdisp(	movq,	%mm0, disp0,(%edi))


Zdisp(	movq,	disp1,(%edx), %mm0)
	psllq	%mm6, %mm1

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm1, %mm0
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	subl	$UNROLL_BYTES, %edx
	subl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)



define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

L(end):
	testb	$1, %al
	movl	SAVE_EBX, %ebx
	psllq	%mm6, %mm2	C wanted left shifted in all cases below

	movd	%mm5, %eax

	movl	SAVE_ESI, %esi
	jz	L(end_even)


L(end_odd):

	C Size odd, destination was aligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C dest            edi
	C --+---------------+---------------+-------+
	C   |   written     |               |       |
	C --+---------------+---------------+-------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C                 source        edx+8   edx+4
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C dest            edi
	C --+---------------+---------------+
	C   |   written     |               |
	C --+---------------+---------------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at (%edi), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.

	movd	disp(4) (%edx), %mm0
	testb	$32, %cl

	movq	%mm0, %mm1
	psllq	$32, %mm0

	psrlq	%mm7, %mm0
	psllq	%mm6, %mm1

	por	%mm2, %mm0

	movq	%mm0, disp(0) (%edi)
	jz	L(end_odd_unaligned)
	movd	%mm1, disp(-4) (%edi)
L(end_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	C Size even, destination was aligned.
	C
	C source          edx+8
	C --+---------------+
	C   |      mm2      |
	C --+---------------+
	C
	C dest            edi
	C --+---------------+---------------+
	C   |   written     |               |
	C --+---------------+---------------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C source          edx+8
	C --+---------------+
	C   |      mm2      |
	C --+---------------+
	C
	C dest            edi+4
	C --+---------------+-------+
	C   |    written    |       |
	C --+---------------+-------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C The movq for the aligned case overwrites the movd for the
	C unaligned case.

	movq	%mm2, %mm0
	psrlq	$32, %mm2

	testb	$32, %cl
	movd	%mm2, disp(4) (%edi)

	jz	L(end_even_unaligned)
	movq	%mm0, disp(0) (%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()