dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]

dnl  Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2
C AMD K10	 2
C AMD bd1	 ?
C AMD bobcat	 ?
C Intel P4	13
C Intel core2	 3.45
C Intel NHM	 ?
C Intel SBR	 ?
C Intel atom	 ?
C VIA nano	 ?


C Sometimes speed degenerates, supposedly because some operand alignments
C cause cache conflicts.

C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
C in the loop (not counting the nop), which at 3 instructions per cycle
C corresponds to 22/3/4 = 1.83 c/l, or ceil(22/3)/4 = 2 c/l when rounded up
C to whole cycles per iteration.

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n',	`%rcx')

ifdef(`OPERATION_addlsh1_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func,	      mpn_addlsh1_n)')
ifdef(`OPERATION_rsblsh1_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func,	      mpn_rsblsh1_n)')

MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C One source, two functions, selected by the OPERATION_* symbol above:
C   mpn_addlsh1_n:  rp[] = up[] + (vp[] << 1)
C   mpn_rsblsh1_n:  rp[] = (vp[] << 1) - up[]
C
C Register usage once the setup code has run:
C   rp, up, vp	point just past the end of each operand (base + 8*n)
C   n		negated limb count, used as an index rising towards zero
C   %r8-%r11	limbs being processed
C   %rax	n mod 4 during dispatch, afterwards the saved shift carry
C		("scy") held as 0 or -1
C   %rbp	saved add/subtract carry ("acy") held as 0 or -1; pushed on
C		entry and popped before returning
C
C The shift is done as x+x with adc, so two independent carry chains are
C live.  Only one can sit in CF at a time; the other is parked in a register
C with "sbb r,r" (r = -CF) and brought back with "add r,r" (CF = 1 iff r = -1).
C
C Return value in %rax:
C   mpn_addlsh1_n:  shift carry-out plus addition carry-out (0, 1 or 2)
C   mpn_rsblsh1_n:  shift carry-out minus subtraction borrow (-1, 0 or 1),
C		    sign-extended to 64 bits
C
C n must be at least 1: vp[0] is loaded unconditionally, and when n is a
C multiple of 4 the 4-limb loop body runs once before n is tested.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%rbp

	mov	(vp), %r8		C r8 = vp[0], fetched before vp is advanced
	mov	R32(n), R32(%rax)
	lea	(rp,n,8), rp		C move all three pointers to the operand ends
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	neg	n			C index now runs from -n up towards 0
	xor	R32(%rbp), R32(%rbp)	C acy = 0
	and	$3, R32(%rax)		C n mod 4; leaves CF = 0 for the b00 entry
	je	L(b00)			C n mod 4 = 0: enter the loop body directly
	cmp	$2, R32(%rax)
	jc	L(b01)			C n mod 4 = 1
	je	L(b10)			C n mod 4 = 2
					C n mod 4 = 3 falls through

C Handle three leading limbs, then join the loop.
L(b11):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	sbb	R32(%rax), R32(%rax)	C save scy
	ADDSUB	(up,n,8), %r8
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	ADCSBB	16(up,n,8), %r10
	mov	%r10, 16(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$3, n			C sets SF for the jns at L(ent)
	jmp	L(ent)

C Handle two leading limbs, then join the loop.
L(b10):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	sbb	R32(%rax), R32(%rax)	C save scy
	ADDSUB	(up,n,8), %r8
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$2, n			C sets SF for the jns at L(ent)
	jmp	L(ent)

C Handle one leading limb, then fall into the loop entry test.
L(b01):	add	%r8, %r8
	sbb	R32(%rax), R32(%rax)	C save scy
	ADDSUB	(up,n,8), %r8
	mov	%r8, (rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	inc	n			C sets SF for the jns below
L(ent):	jns	L(end)			C index reached 0: no 4-limb groups remain

C Main loop, four limbs per iteration.
	ALIGN(16)
L(top):	add	R32(%rax), R32(%rax)	C restore scy

	mov	(vp,n,8), %r8
L(b00):	adc	%r8, %r8		C n mod 4 = 0 enters here with r8 = vp[0], CF = 0
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	mov	24(vp,n,8), %r11
	adc	%r11, %r11

	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy

	ADCSBB	(up,n,8), %r8
	nop				C Hammer speedup!
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	ADCSBB	16(up,n,8), %r10
	ADCSBB	24(up,n,8), %r11
	mov	%r10, 16(rp,n,8)
	mov	%r11, 24(rp,n,8)

	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$4, n
	js	L(top)			C loop while the index is still negative

C Combine the two saved carries, each 0 or -1, into the return value.
L(end):
ifdef(`OPERATION_addlsh1_n',`
	add	R32(%rbp), R32(%rax)
	neg	R32(%rax)')
ifdef(`OPERATION_rsblsh1_n',`
	sub	R32(%rax), R32(%rbp)
	movslq	R32(%rbp), %rax')

	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()