1 1.1 mrg dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder. 2 1.1 mrg 3 1.1.1.2 mrg dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc. 4 1.1.1.2 mrg 5 1.1 mrg dnl This file is part of the GNU MP Library. 6 1.1 mrg dnl 7 1.1.1.2 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 1.1.1.2 mrg dnl it under the terms of either: 9 1.1.1.2 mrg dnl 10 1.1.1.2 mrg dnl * the GNU Lesser General Public License as published by the Free 11 1.1.1.2 mrg dnl Software Foundation; either version 3 of the License, or (at your 12 1.1.1.2 mrg dnl option) any later version. 13 1.1.1.2 mrg dnl 14 1.1.1.2 mrg dnl or 15 1.1.1.2 mrg dnl 16 1.1.1.2 mrg dnl * the GNU General Public License as published by the Free Software 17 1.1.1.2 mrg dnl Foundation; either version 2 of the License, or (at your option) any 18 1.1.1.2 mrg dnl later version. 19 1.1.1.2 mrg dnl 20 1.1.1.2 mrg dnl or both in parallel, as here. 21 1.1.1.2 mrg dnl 22 1.1.1.2 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 1.1.1.2 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 1.1.1.2 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 1.1.1.2 mrg dnl for more details. 26 1.1 mrg dnl 27 1.1.1.2 mrg dnl You should have received copies of the GNU General Public License and the 28 1.1.1.2 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 1.1.1.2 mrg dnl see https://www.gnu.org/licenses/. 30 1.1 mrg 31 1.1 mrg include(`../config.m4') 32 1.1 mrg 33 1.1 mrg 34 1.1 mrg C cycles/limb 35 1.1 mrg C Athlon: 11.0 36 1.1 mrg C Hammer: 7.0 37 1.1 mrg 38 1.1 mrg 39 1.1 mrg C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 40 1.1 mrg C mp_limb_t divisor); 41 1.1 mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 42 1.1 mrg C mp_limb_t divisor, mp_limb_t carry); 43 1.1 mrg C 44 1.1 mrg C With the loop running at just 11 cycles it doesn't seem worth bothering to 45 1.1 mrg C check for high<divisor to save one step. 46 1.1 mrg C 47 1.1 mrg C Using a divl for size==1 measures slower than the modexact method, which 48 1.1 mrg C is not too surprising since for the latter it's only about 24 cycles to 49 1.1 mrg C calculate the modular inverse. 50 1.1 mrg 51 1.1 mrg defframe(PARAM_CARRY, 16) 52 1.1 mrg defframe(PARAM_DIVISOR,12) 53 1.1 mrg defframe(PARAM_SIZE, 8) 54 1.1 mrg defframe(PARAM_SRC, 4) 55 1.1 mrg 56 1.1 mrg defframe(SAVE_EBX, -4) 57 1.1 mrg defframe(SAVE_ESI, -8) 58 1.1 mrg defframe(SAVE_EDI, -12) 59 1.1 mrg defframe(SAVE_EBP, -16) 60 1.1 mrg 61 1.1 mrg deflit(STACK_SPACE, 16) 62 1.1 mrg 63 1.1 mrg TEXT 64 1.1 mrg 65 1.1 mrg ALIGN(16) 66 1.1 mrg PROLOGUE(mpn_modexact_1c_odd) 67 1.1 mrg deflit(`FRAME',0) 68 1.1 mrg 69 1.1 mrg movl PARAM_CARRY, %ecx 70 1.1 mrg jmp L(start_1c) 71 1.1 mrg 72 1.1 mrg EPILOGUE() 73 1.1 mrg 74 1.1 mrg 75 1.1 mrg ALIGN(16) 76 1.1 mrg PROLOGUE(mpn_modexact_1_odd) 77 1.1 mrg deflit(`FRAME',0) 78 1.1 mrg 79 1.1 mrg xorl %ecx, %ecx 80 1.1 mrg L(start_1c): 81 1.1 mrg movl PARAM_DIVISOR, %eax 82 1.1 mrg subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) 83 1.1 mrg 84 1.1 mrg movl %esi, SAVE_ESI 85 1.1 mrg movl PARAM_DIVISOR, %esi 86 1.1 mrg 87 1.1 mrg movl %edi, SAVE_EDI 88 1.1 mrg 89 1.1 mrg shrl %eax C d/2 90 1.1 mrg 91 1.1 mrg andl $127, %eax 92 1.1 mrg 93 1.1 mrg ifdef(`PIC',` 94 1.1 mrg LEA( binvert_limb_table, %edi) 95 1.1 mrg movzbl (%eax,%edi), %edi C inv 8 bits 96 1.1 mrg ',` 97 1.1 mrg movzbl binvert_limb_table(%eax), %edi C inv 8 bits 98 1.1 mrg ') 99 1.1 mrg 100 1.1 mrg xorl %edx, %edx C initial extra carry 101 1.1 mrg leal (%edi,%edi), %eax C 2*inv 102 1.1 mrg 103 1.1 mrg imull %edi, %edi C inv*inv 104 1.1 mrg 105 1.1 mrg movl %ebp, SAVE_EBP 106 1.1 mrg movl PARAM_SIZE, %ebp 107 1.1 mrg 108 1.1 mrg movl %ebx, SAVE_EBX 109 1.1 mrg movl PARAM_SRC, %ebx 110 1.1 mrg 111 1.1 mrg imull %esi, %edi C inv*inv*d 112 1.1 mrg 113 1.1 mrg subl %edi, %eax C inv = 2*inv - inv*inv*d 114 1.1 mrg leal (%eax,%eax), %edi C 2*inv 115 1.1 mrg 116 1.1 mrg imull %eax, %eax C inv*inv 117 1.1 mrg 118 1.1 mrg imull %esi, %eax C inv*inv*d 119 1.1 mrg 120 1.1 mrg leal (%ebx,%ebp,4), %ebx C src end 121 1.1 mrg negl %ebp C -size 122 1.1 mrg 123 1.1 mrg subl %eax, %edi C inv = 2*inv - inv*inv*d 124 1.1 mrg 125 1.1 mrg ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 126 1.1 mrg movl %esi, %eax 127 1.1 mrg imull %edi, %eax 128 1.1 mrg cmpl $1, %eax') 129 1.1 mrg 130 1.1 mrg 131 1.1 mrg C The dependent chain here is 132 1.1 mrg C 133 1.1 mrg C cycles 134 1.1 mrg C subl %edx, %eax 1 135 1.1 mrg C imull %edi, %eax 4 136 1.1 mrg C mull %esi 6 (high limb) 137 1.1 mrg C ---- 138 1.1 mrg C total 11 139 1.1 mrg C 140 1.1 mrg C Out of order execution hides the load latency for the source data, so no 141 1.1 mrg C special scheduling is required. 142 1.1 mrg 143 1.1 mrg L(top): 144 1.1 mrg C eax src limb 145 1.1 mrg C ebx src end ptr 146 1.1 mrg C ecx next carry bit, 0 or 1 (or initial carry param) 147 1.1 mrg C edx carry limb, high of last product 148 1.1 mrg C esi divisor 149 1.1 mrg C edi inverse 150 1.1 mrg C ebp counter, limbs, negative 151 1.1 mrg 152 1.1 mrg movl (%ebx,%ebp,4), %eax 153 1.1 mrg 154 1.1 mrg subl %ecx, %eax C apply carry bit 155 1.1 mrg movl $0, %ecx 156 1.1 mrg 157 1.1 mrg setc %cl C new carry bit 158 1.1 mrg 159 1.1 mrg subl %edx, %eax C apply carry limb 160 1.1 mrg adcl $0, %ecx 161 1.1 mrg 162 1.1 mrg imull %edi, %eax 163 1.1 mrg 164 1.1 mrg mull %esi 165 1.1 mrg 166 1.1 mrg incl %ebp 167 1.1 mrg jnz L(top) 168 1.1 mrg 169 1.1 mrg 170 1.1 mrg movl SAVE_ESI, %esi 171 1.1 mrg movl SAVE_EDI, %edi 172 1.1 mrg leal (%ecx,%edx), %eax 173 1.1 mrg 174 1.1 mrg movl SAVE_EBX, %ebx 175 1.1 mrg movl SAVE_EBP, %ebp 176 1.1 mrg addl $STACK_SPACE, %esp 177 1.1 mrg 178 1.1 mrg ret 179 1.1 mrg 180 1.1 mrg EPILOGUE() 181 1.1.1.2 mrg ASM_END() 182