dnl  AMD Athlon mpn_com -- mpn bitwise one's complement.

dnl  Copyright 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.0 cycles/limb


C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Store the bitwise complement of the size-limb operand at src to dst.
C Limbs are handled from the high end downwards: an optional single high
C limb to align dst, then two limbs per iteration with MMX movq, then an
C optional single low limb.
C
C The loop form below is necessary for the claimed speed.  It needs to be
C aligned to a 16 byte boundary and only 16 bytes long.  Maybe that's so it
C fits in a BTB entry.  The adjustments to %eax and %edx avoid offsets on
C the movq's and achieve the necessary size.
C
C If both src and dst are 4mod8, the loop runs at 1.5 c/l.  So long as one
C of the two is 0mod8, it runs at 1.0 c/l.  On that basis dst is checked
C (offset by the size, as per the loop addressing) and one high limb
C processed separately to get alignment.
C
C The padding for the nails case is unattractive, but shouldn't cost any
C cycles.  Explicit .byte's guarantee the desired instructions, at a point
C where we're probably stalled waiting for loads anyway.
C
C Enhancements:
C
C The combination load/pxor/store might be able to be unrolled to approach
C 0.5 c/l if desired.
C
C NOTE(review): the entry code reads the high limb before testing the count,
C so size is presumably required to be at least 1 -- confirm against callers.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(16)

PROLOGUE(mpn_com)
deflit(`FRAME',0)

	movl	PARAM_DST, %edx
	movl	PARAM_SIZE, %ecx
	pcmpeqd	%mm7, %mm7		C mm7 = all ones, the xor mask

	C Test bit 2 of dst+4*size.  The flags set here are consumed by the
	C "jz L(aligned)" further down; nothing in between (movl, movd, psrld
	C and the lea padding) modifies the flags, and %eax itself is free to
	C be reloaded with src.
	leal	(%edx,%ecx,4), %eax
	andl	$4, %eax
ifelse(GMP_NAIL_BITS,0,,
`	psrld	$GMP_NAIL_BITS, %mm7')		C GMP_NUMB_MASK

	movl	PARAM_SRC, %eax
	movd	-4(%eax,%ecx,4), %mm0		C src high limb

ifelse(GMP_NAIL_BITS,0,,
`	C padding for alignment below
	.byte	0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00	C lea 0(%esi),%esi
	.byte	0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00	C lea 0(%edi),%edi
')

	jz	L(aligned)

	C dst+4*size is 4mod8: complement the high limb on its own so the
	C remaining limbs end on an 8 byte boundary in dst.
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx,%ecx,4)		C dst high limb
	decl	%ecx
	jz	L(done)
L(aligned):

	C Bias both pointers up by one limb and the counter down by one, so
	C the loop can address limb pairs as (ptr,%ecx,4) with no displacement
	C and L(one) can reach the low limb at -4(ptr).
	addl	$4, %eax
	addl	$4, %edx
	decl	%ecx
	jz	L(one)

	C offset 0x30 for no nails, or 0x40 for nails
	ALIGN(16)
L(top):
	C eax	src
	C ebx
	C ecx	counter
	C edx	dst

	C The flags from subl drive both jg and jnz below; movq and pxor leave
	C them untouched.
	subl	$2, %ecx
	movq	(%eax,%ecx,4), %mm0
	pxor	%mm7, %mm0
	movq	%mm0, (%edx,%ecx,4)
	jg	L(top)

	C counter is now 0 (one low limb still to do) or -1 (nothing left)
	jnz	L(done)		C if size even

L(one):
	movd	-4(%eax), %mm0		C src low limb
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx)		C dst low limb

L(done):
	emms				C leave MMX state before returning

	ret

EPILOGUE()