1 1.1 mrg dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder. 2 1.1 mrg 3 1.1 mrg dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. 4 1.1.1.3 mrg 5 1.1 mrg dnl This file is part of the GNU MP Library. 6 1.1 mrg dnl 7 1.1.1.3 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 1.1.1.3 mrg dnl it under the terms of either: 9 1.1.1.3 mrg dnl 10 1.1.1.3 mrg dnl * the GNU Lesser General Public License as published by the Free 11 1.1.1.3 mrg dnl Software Foundation; either version 3 of the License, or (at your 12 1.1.1.3 mrg dnl option) any later version. 13 1.1.1.3 mrg dnl 14 1.1.1.3 mrg dnl or 15 1.1.1.3 mrg dnl 16 1.1.1.3 mrg dnl * the GNU General Public License as published by the Free Software 17 1.1.1.3 mrg dnl Foundation; either version 2 of the License, or (at your option) any 18 1.1.1.3 mrg dnl later version. 19 1.1.1.3 mrg dnl 20 1.1.1.3 mrg dnl or both in parallel, as here. 21 1.1.1.3 mrg dnl 22 1.1.1.3 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 1.1.1.3 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 1.1.1.3 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 1.1.1.3 mrg dnl for more details. 26 1.1 mrg dnl 27 1.1.1.3 mrg dnl You should have received copies of the GNU General Public License and the 28 1.1.1.3 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 1.1.1.3 mrg dnl see https://www.gnu.org/licenses/. 30 1.1 mrg 31 1.1 mrg include(`../config.m4') 32 1.1 mrg 33 1.1 mrg 34 1.1 mrg C P4: 19.0 cycles/limb 35 1.1 mrg 36 1.1 mrg 37 1.1 mrg C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 38 1.1 mrg C mp_limb_t divisor); 39 1.1 mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 40 1.1 mrg C mp_limb_t divisor, mp_limb_t carry); 41 1.1 mrg C 42 1.1 mrg 43 1.1 mrg defframe(PARAM_CARRY, 16) 44 1.1 mrg defframe(PARAM_DIVISOR,12) 45 1.1 mrg defframe(PARAM_SIZE, 8) 46 1.1 mrg defframe(PARAM_SRC, 4) 47 1.1 mrg 48 1.1 mrg TEXT 49 1.1 mrg 50 1.1 mrg ALIGN(16) 51 1.1 mrg PROLOGUE(mpn_modexact_1c_odd) 52 1.1 mrg deflit(`FRAME',0) 53 1.1 mrg 54 1.1 mrg movd PARAM_CARRY, %mm1 55 1.1 mrg jmp L(start_1c) 56 1.1 mrg 57 1.1 mrg EPILOGUE() 58 1.1 mrg 59 1.1 mrg 60 1.1 mrg ALIGN(16) 61 1.1 mrg PROLOGUE(mpn_modexact_1_odd) 62 1.1 mrg deflit(`FRAME',0) 63 1.1 mrg 64 1.1 mrg pxor %mm1, %mm1 C carry limb 65 1.1 mrg L(start_1c): 66 1.1 mrg movl PARAM_DIVISOR, %eax 67 1.1 mrg 68 1.1 mrg movd PARAM_DIVISOR, %mm7 69 1.1 mrg 70 1.1 mrg shrl %eax 71 1.1 mrg 72 1.1 mrg andl $127, %eax C d/2, 7 bits 73 1.1 mrg 74 1.1 mrg ifdef(`PIC',` 75 1.1 mrg LEA( binvert_limb_table, %edx) 76 1.1 mrg movzbl (%eax,%edx), %eax C inv 8 bits 77 1.1 mrg ',` 78 1.1 mrg movzbl binvert_limb_table(%eax), %eax C inv 8 bits 79 1.1 mrg ') 80 1.1 mrg 81 1.1 mrg C 82 1.1 mrg 83 1.1 mrg movd %eax, %mm6 C inv 84 1.1 mrg 85 1.1 mrg movd %eax, %mm0 C inv 86 1.1 mrg 87 1.1 mrg pmuludq %mm6, %mm6 C inv*inv 88 1.1 mrg 89 1.1 mrg C 90 1.1 mrg 91 1.1 mrg pmuludq %mm7, %mm6 C inv*inv*d 92 1.1 mrg paddd %mm0, %mm0 C 2*inv 93 1.1 mrg 94 1.1 mrg C 95 1.1 mrg 96 1.1 mrg psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d 97 1.1 mrg pxor %mm6, %mm6 98 1.1 mrg 99 1.1 mrg paddd %mm0, %mm6 100 1.1 mrg pmuludq %mm0, %mm0 C inv*inv 101 1.1 mrg 102 1.1 mrg C 103 1.1 mrg 104 1.1 mrg pmuludq %mm7, %mm0 C inv*inv*d 105 1.1 mrg paddd %mm6, %mm6 C 2*inv 106 1.1 mrg 107 1.1 mrg 108 1.1 mrg movl PARAM_SRC, %eax 109 1.1 mrg movl PARAM_SIZE, %ecx 110 1.1 mrg 111 1.1 mrg C 112 1.1 mrg 113 1.1 mrg psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d 114 1.1 mrg 115 1.1 mrg ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS 116 1.1 mrg pushl %eax FRAME_pushl() 117 1.1 mrg movd %mm6, %eax 118 1.1 mrg imul PARAM_DIVISOR, %eax 119 1.1 mrg cmpl $1, %eax 120 1.1 mrg popl %eax FRAME_popl()') 121 1.1 mrg 122 1.1 mrg pxor %mm0, %mm0 C carry bit 123 1.1 mrg 124 1.1 mrg 125 1.1 mrg C The dependent chain here is as follows. 126 1.1 mrg C 127 1.1.1.2 mrg C latency 128 1.1.1.2 mrg C psubq s = (src-cbit) - climb 2 129 1.1.1.2 mrg C pmuludq q = s*inverse 8 130 1.1.1.2 mrg C pmuludq prod = q*divisor 8 131 1.1.1.2 mrg C psrlq climb = high(prod) 2 132 1.1.1.2 mrg C -- 133 1.1.1.2 mrg C 20 134 1.1 mrg C 135 1.1 mrg C Yet the loop measures 19.0 c/l, so obviously there's something gained 136 1.1 mrg C there over a straight reading of the chip documentation. 137 1.1 mrg 138 1.1 mrg L(top): 139 1.1 mrg C eax src, incrementing 140 1.1 mrg C ebx 141 1.1 mrg C ecx counter, limbs 142 1.1 mrg C edx 143 1.1 mrg C 144 1.1 mrg C mm0 carry bit 145 1.1 mrg C mm1 carry limb 146 1.1 mrg C mm6 inverse 147 1.1 mrg C mm7 divisor 148 1.1 mrg 149 1.1 mrg movd (%eax), %mm2 150 1.1 mrg addl $4, %eax 151 1.1 mrg 152 1.1 mrg psubq %mm0, %mm2 C src - cbit 153 1.1 mrg 154 1.1 mrg psubq %mm1, %mm2 C src - cbit - climb 155 1.1 mrg movq %mm2, %mm0 156 1.1 mrg psrlq $63, %mm0 C new cbit 157 1.1 mrg 158 1.1 mrg pmuludq %mm6, %mm2 C s*inverse 159 1.1 mrg 160 1.1 mrg movq %mm7, %mm1 161 1.1 mrg pmuludq %mm2, %mm1 C q*divisor 162 1.1 mrg psrlq $32, %mm1 C new climb 163 1.1 mrg 164 1.1 mrg subl $1, %ecx 165 1.1 mrg jnz L(top) 166 1.1 mrg 167 1.1 mrg 168 1.1 mrg L(done): 169 1.1 mrg paddq %mm1, %mm0 170 1.1 mrg movd %mm0, %eax 171 1.1 mrg emms 172 1.1 mrg ret 173 1.1 mrg 174 1.1 mrg EPILOGUE() 175 1.1.1.3 mrg ASM_END() 176