1 1.1 mrg dnl AMD K6 mpn_modexact_1_odd -- exact division style remainder. 2 1.1 mrg 3 1.1.1.2 mrg dnl Copyright 2000-2003, 2007 Free Software Foundation, Inc. 4 1.1.1.2 mrg 5 1.1 mrg dnl This file is part of the GNU MP Library. 6 1.1 mrg dnl 7 1.1.1.2 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 1.1.1.2 mrg dnl it under the terms of either: 9 1.1.1.2 mrg dnl 10 1.1.1.2 mrg dnl * the GNU Lesser General Public License as published by the Free 11 1.1.1.2 mrg dnl Software Foundation; either version 3 of the License, or (at your 12 1.1.1.2 mrg dnl option) any later version. 13 1.1.1.2 mrg dnl 14 1.1.1.2 mrg dnl or 15 1.1.1.2 mrg dnl 16 1.1.1.2 mrg dnl * the GNU General Public License as published by the Free Software 17 1.1.1.2 mrg dnl Foundation; either version 2 of the License, or (at your option) any 18 1.1.1.2 mrg dnl later version. 19 1.1.1.2 mrg dnl 20 1.1.1.2 mrg dnl or both in parallel, as here. 21 1.1.1.2 mrg dnl 22 1.1.1.2 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 1.1.1.2 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 1.1.1.2 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 1.1.1.2 mrg dnl for more details. 26 1.1 mrg dnl 27 1.1.1.2 mrg dnl You should have received copies of the GNU General Public License and the 28 1.1.1.2 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 1.1.1.2 mrg dnl see https://www.gnu.org/licenses/. 30 1.1 mrg 31 1.1 mrg include(`../config.m4') 32 1.1 mrg 33 1.1 mrg 34 1.1 mrg C K6: 10.0 cycles/limb 35 1.1 mrg 36 1.1 mrg 37 1.1 mrg C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 38 1.1 mrg C mp_limb_t divisor); 39 1.1 mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 40 1.1 mrg C mp_limb_t divisor, mp_limb_t carry); 41 1.1 mrg C 42 1.1 mrg C A special case for high<divisor at the end measured only about 4 cycles 43 1.1 mrg C faster, and so isn't used. 44 1.1 mrg C 45 1.1 mrg C A special case for size==1 using a divl rather than the inverse measured 46 1.1 mrg C only about 5 cycles faster, and so isn't used. When size==1 and 47 1.1 mrg C high<divisor it can skip a division and be a full 24 cycles faster, but 48 1.1 mrg C this isn't an important case. 49 1.1 mrg 50 1.1 mrg defframe(PARAM_CARRY, 16) 51 1.1 mrg defframe(PARAM_DIVISOR,12) 52 1.1 mrg defframe(PARAM_SIZE, 8) 53 1.1 mrg defframe(PARAM_SRC, 4) 54 1.1 mrg 55 1.1 mrg TEXT 56 1.1 mrg 57 1.1 mrg ALIGN(32) 58 1.1 mrg PROLOGUE(mpn_modexact_1c_odd) 59 1.1 mrg deflit(`FRAME',0) 60 1.1 mrg 61 1.1 mrg movl PARAM_DIVISOR, %ecx 62 1.1 mrg pushl %esi FRAME_pushl() 63 1.1 mrg 64 1.1 mrg movl PARAM_CARRY, %edx 65 1.1 mrg jmp L(start_1c) 66 1.1 mrg 67 1.1 mrg EPILOGUE() 68 1.1 mrg 69 1.1 mrg 70 1.1 mrg ALIGN(16) 71 1.1 mrg PROLOGUE(mpn_modexact_1_odd) 72 1.1 mrg deflit(`FRAME',0) 73 1.1 mrg 74 1.1 mrg movl PARAM_DIVISOR, %ecx 75 1.1 mrg pushl %esi FRAME_pushl() 76 1.1 mrg 77 1.1 mrg xorl %edx, %edx 78 1.1 mrg L(start_1c): 79 1.1 mrg pushl %edi FRAME_pushl() 80 1.1 mrg 81 1.1 mrg shrl %ecx C d/2 82 1.1 mrg movl PARAM_DIVISOR, %esi 83 1.1 mrg 84 1.1 mrg andl $127, %ecx C d/2, 7 bits 85 1.1 mrg pushl %ebp FRAME_pushl() 86 1.1 mrg 87 1.1 mrg ifdef(`PIC',` 88 1.1 mrg LEA( binvert_limb_table, %edi) 89 1.1 mrg Zdisp( movzbl, 0,(%ecx,%edi), %edi) C inv 8 bits 90 1.1 mrg ',` 91 1.1 mrg movzbl binvert_limb_table(%ecx), %edi C inv 8 bits 92 1.1 mrg ') 93 1.1 mrg leal (%edi,%edi), %ecx C 2*inv 94 1.1 mrg 95 1.1 mrg imull %edi, %edi C inv*inv 96 1.1 mrg 97 1.1 mrg movl PARAM_SRC, %eax 98 1.1 mrg movl PARAM_SIZE, %ebp 99 1.1 mrg 100 1.1 mrg imull %esi, %edi C inv*inv*d 101 1.1 mrg 102 1.1 mrg pushl %ebx FRAME_pushl() 103 1.1 mrg leal (%eax,%ebp,4), %ebx C src end 104 1.1 mrg 105 1.1 mrg subl %edi, %ecx C inv = 2*inv - inv*inv*d 106 1.1 mrg leal (%ecx,%ecx), %edi C 2*inv 107 1.1 mrg 108 1.1 mrg imull %ecx, %ecx C inv*inv 109 1.1 mrg 110 1.1 mrg movl (%eax), %eax C src low limb 111 1.1 mrg negl %ebp C -size 112 1.1 mrg 113 1.1 mrg imull %esi, %ecx C inv*inv*d 114 1.1 mrg 115 1.1 mrg subl %ecx, %edi C inv = 2*inv - inv*inv*d 116 1.1 mrg 117 1.1 mrg ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 118 1.1 mrg pushl %eax 119 1.1 mrg movl %esi, %eax 120 1.1 mrg imull %edi, %eax 121 1.1 mrg cmpl $1, %eax 122 1.1 mrg popl %eax') 123 1.1 mrg 124 1.1 mrg jmp L(entry) 125 1.1 mrg 126 1.1 mrg 127 1.1 mrg C Rotating the mul to the top of the loop saves 1 cycle, presumably by 128 1.1 mrg C hiding the loop control under the imul latency. 129 1.1 mrg C 130 1.1 mrg C The run time is 10 cycles, but decoding is only 9 (and the dependent chain 131 1.1 mrg C only 8). It's not clear how to get down to 9 cycles. 132 1.1 mrg C 133 1.1 mrg C The xor and rcl to handle the carry bit could be an sbb instead, with the 134 1.1 mrg C the carry bit add becoming a sub, but that doesn't save anything. 135 1.1 mrg 136 1.1 mrg L(top): 137 1.1 mrg C eax (low product) 138 1.1 mrg C ebx src end 139 1.1 mrg C ecx carry bit, 0 or 1 140 1.1 mrg C edx (high product, being carry limb) 141 1.1 mrg C esi divisor 142 1.1 mrg C edi inverse 143 1.1 mrg C ebp counter, limbs, negative 144 1.1 mrg 145 1.1 mrg mull %esi 146 1.1 mrg 147 1.1 mrg movl (%ebx,%ebp,4), %eax 148 1.1 mrg addl %ecx, %edx C apply carry bit to carry limb 149 1.1 mrg 150 1.1 mrg L(entry): 151 1.1 mrg xorl %ecx, %ecx 152 1.1 mrg subl %edx, %eax C apply carry limb 153 1.1 mrg 154 1.1 mrg rcll %ecx 155 1.1 mrg 156 1.1 mrg imull %edi, %eax 157 1.1 mrg 158 1.1 mrg incl %ebp 159 1.1 mrg jnz L(top) 160 1.1 mrg 161 1.1 mrg 162 1.1 mrg 163 1.1 mrg popl %ebx 164 1.1 mrg popl %ebp 165 1.1 mrg 166 1.1 mrg mull %esi 167 1.1 mrg 168 1.1 mrg popl %edi 169 1.1 mrg popl %esi 170 1.1 mrg 171 1.1 mrg leal (%ecx,%edx), %eax 172 1.1 mrg 173 1.1 mrg ret 174 1.1 mrg 175 1.1 mrg EPILOGUE() 176 1.1.1.2 mrg ASM_END() 177