dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.

dnl  Copyright 1999-2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.64 cycles/limb (at 16 limbs/loop).



dnl  K7: UNROLL_COUNT cycles/limb
dnl           8           1.9
dnl          16           1.64
dnl          32           1.7
dnl          64           2.0
dnl  Maximum possible with the current code is 64.

dnl  Number of limbs handled by one pass of the unrolled loop.
deflit(UNROLL_COUNT, 16)


dnl  Exactly one of OPERATION_add_n or OPERATION_sub_n must be defined by the
dnl  build.  It selects the carry-propagating instruction and the names of
dnl  the two entry points generated from this file.
ifdef(`OPERATION_add_n', `
	define(M4_inst,        adcl)
	define(M4_function_n,  mpn_add_n)
	define(M4_function_nc, mpn_add_nc)
	define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
	define(M4_inst,        sbbl)
	define(M4_function_n,  mpn_sub_n)
	define(M4_function_nc, mpn_sub_nc)
	define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)


C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                           mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size.  The return value is the carry bit from the top of the result (1
C or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation.  Note values other than 1 or 0 here will lead to garbage
C results.  (The code moves the carry into the CPU carry flag with a one bit
C right shift, so only bit 0 is consumed and the remaining bits are left in
C a register the unrolled loop documents as zero.)
C
C NOTE(review): the loops below process a limb before testing the counter,
C so size is presumably required to be at least 1 -- confirm against callers.
C
C This code runs at 1.64 cycles/limb, which might be the best possible with
C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
C which can be done each cycle, leading to 1.5 c/l.

dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

C Stack argument slots.  defframe comes from config.m4; the offsets are
C presumably relative to %esp at entry and adjusted by the current FRAME
C value -- confirm in the macro definitions.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST,  4)

C Save area for the registers this routine restores before returning.
C STACK_SPACE bytes are reserved for it at L(start).
defframe(SAVE_EBP, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
defframe(SAVE_EDI, -16)
deflit(STACK_SPACE, 16)

	TEXT
	ALIGN(32)
deflit(`FRAME',0)

C M4_function_nc: load the caller supplied carry (expected to be 0 or 1)
C and join the common code.
PROLOGUE(M4_function_nc)
	movl	PARAM_CARRY, %eax
	jmp	L(start)
EPILOGUE()

C M4_function_n: same as above with an initial carry of zero.
C
C Sizes below UNROLL_THRESHOLD use the one limb per pass loop at L(simple).
C Larger sizes go to L(unroll), which handles an even number of limbs with a
C computed jump into an unrolled loop and then at most one odd limb.
C
C The carry lives in the CPU carry flag from the shrl of eax until the
C final setc.  Every instruction in between is chosen so that it does not
C write CF: movl, leal, incl, decl, jmp.  Do not swap them for add, sub
C or xor forms.
PROLOGUE(M4_function_n)

	xorl	%eax, %eax	C carry
L(start):
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	movl	%ebx, SAVE_EBX
	cmpl	$UNROLL_THRESHOLD, %ecx

	movl	PARAM_SRC2, %edx	C movl keeps the cmpl flags for jae
	movl	PARAM_SRC1, %ebx
	jae	L(unroll)

	C Point all three registers at the end of their operands and run
	C ecx from -size up to zero.
	movl	PARAM_DST, %edi
	leal	(%ebx,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %edx

	leal	(%edi,%ecx,4), %edi
	negl	%ecx
	shrl	%eax			C bit 0 of the carry goes into CF

	C This loop is in a single 16 byte code block already, so no
	C alignment necessary.
L(simple):
	C eax	scratch
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp

	movl	(%ebx,%ecx,4), %eax
	M4_inst	(%edx,%ecx,4), %eax
	movl	%eax, (%edi,%ecx,4)
	incl	%ecx			C incl sets ZF but leaves CF alone
	jnz	L(simple)

	movl	$0, %eax		C movl, not xorl, so CF survives
	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	setc	%al			C return value is the final carry
	addl	$STACK_SPACE, %esp

	ret


C -----------------------------------------------------------------------------
	C This is at 0x55, close enough to aligned.
L(unroll):
deflit(`FRAME',STACK_SPACE)
	C On entry here: eax carry, ebx src1, ecx size, edx src2.
	C UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are not defined in this
	C file; they are presumably derived from UNROLL_COUNT by config.m4.
	movl	%ebp, SAVE_EBP
	andl	$-2, %ecx		C size low bit masked out
	andl	$1, PARAM_SIZE		C size low bit kept (in the argument slot)

	movl	%ecx, %edi
	decl	%ecx
	movl	PARAM_DST, %ebp

	shrl	$UNROLL_LOG2, %ecx	C (even size - 1) >> log2, loop ends on sign
	negl	%edi
	movl	%esi, SAVE_ESI

	andl	$UNROLL_MASK, %edi	C limbs to skip on the first pass

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per
')
	negl	%edi
	shrl	%eax			C bit 0 of the carry goes into CF

	C Back the pointers up by the skipped limbs so the displacements in
	C the loop line up.  When UNROLL_BYTES is 256 they are also biased by
	C 128, matched by the -128 in the loop displacements.  leal and the
	C jmp do not disturb CF.
	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi

	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	C Same target as the non-PIC leal, built from the return address on
	C the stack, which is the address of the label after the call.
	leal	(%edi,%edi,8), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	zero
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi	scratch (was computed jump)
	C edi	dst
	C ebp	scratch

	C src2 is stepped here, ahead of L(entry), so the computed jump on
	C the first pass skips this step.  src1 and dst are stepped at the
	C bottom.  After the loop src2 is therefore one pass behind them.
	leal	UNROLL_BYTES(%edx), %edx

L(entry):
deflit(CHUNK_COUNT, 2)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

	C Two limbs per chunk.  The computed jump assumes 9 code bytes per
	C limb, so each instruction here must keep its size.  Zdisp is
	C defined outside this file and presumably forces a displacement
	C byte when disp0 is zero -- confirm before editing.
Zdisp(	movl,	disp0,(%ebx), %esi)
	movl	disp1(%ebx), %ebp
Zdisp(	M4_inst,disp0,(%edx), %esi)
Zdisp(	movl,	%esi, disp0,(%edi))
	M4_inst	disp1(%edx), %ebp
	movl	%ebp, disp1(%edi)
')

	decl	%ecx			C decl sets SF but leaves CF alone
	leal	UNROLL_BYTES(%ebx), %ebx
	leal	UNROLL_BYTES(%edi), %edi
	jns	L(top)


	movl	PARAM_SIZE, %esi	C 1 if size was odd, else 0
	movl	SAVE_EBP, %ebp
	movl	$0, %eax		C movl, not xorl, so CF survives

	decl	%esi
	js	L(even)

	C One odd limb left.  src1 and dst have been stepped past the last
	C pass and src2 is one pass behind.  All three still carry the 128
	C byte bias when UNROLL_BYTES is 256, so it is removed here in the
	C same way the loop displacements do.  For other UNROLL_BYTES values
	C these expand to a plain zero offset and UNROLL_BYTES.
	movl	ifelse(UNROLL_BYTES,256,-128) (%ebx), %ecx
	M4_inst	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
	movl	%ecx, ifelse(UNROLL_BYTES,256,-128) (%edi)
L(even):

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	setc	%al			C return value is the final carry

	movl	SAVE_ESI, %esi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()