mode1o.asm revision 1.1.1.3 1 dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
2
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
30
31 include(`../config.m4')
32
33
34 C P4: 19.0 cycles/limb
35
36
37 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
38 C mp_limb_t divisor);
39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
40 C mp_limb_t divisor, mp_limb_t carry);
41 C
42
43 defframe(PARAM_CARRY, 16)
44 defframe(PARAM_DIVISOR,12)
45 defframe(PARAM_SIZE, 8)
46 defframe(PARAM_SRC, 4)
47
48 TEXT
49
50 ALIGN(16)
51 PROLOGUE(mpn_modexact_1c_odd)
52 deflit(`FRAME',0)
53
54 movd PARAM_CARRY, %mm1
55 jmp L(start_1c)
56
57 EPILOGUE()
58
59
60 ALIGN(16)
61 PROLOGUE(mpn_modexact_1_odd)
62 deflit(`FRAME',0)
63
64 pxor %mm1, %mm1 C carry limb
65 L(start_1c):
66 movl PARAM_DIVISOR, %eax
67
68 movd PARAM_DIVISOR, %mm7
69
70 shrl %eax
71
72 andl $127, %eax C d/2, 7 bits
73
74 ifdef(`PIC',`
75 LEA( binvert_limb_table, %edx)
76 movzbl (%eax,%edx), %eax C inv 8 bits
77 ',`
78 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
79 ')
80
81 C
82
83 movd %eax, %mm6 C inv
84
85 movd %eax, %mm0 C inv
86
87 pmuludq %mm6, %mm6 C inv*inv
88
89 C
90
91 pmuludq %mm7, %mm6 C inv*inv*d
92 paddd %mm0, %mm0 C 2*inv
93
94 C
95
96 psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d
97 pxor %mm6, %mm6
98
99 paddd %mm0, %mm6
100 pmuludq %mm0, %mm0 C inv*inv
101
102 C
103
104 pmuludq %mm7, %mm0 C inv*inv*d
105 paddd %mm6, %mm6 C 2*inv
106
107
108 movl PARAM_SRC, %eax
109 movl PARAM_SIZE, %ecx
110
111 C
112
113 psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d
114
115 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
116 pushl %eax FRAME_pushl()
117 movd %mm6, %eax
118 imul PARAM_DIVISOR, %eax
119 cmpl $1, %eax
120 popl %eax FRAME_popl()')
121
122 pxor %mm0, %mm0 C carry bit
123
124
125 C The dependent chain here is as follows.
126 C
127 C latency
128 C psubq s = (src-cbit) - climb 2
129 C pmuludq q = s*inverse 8
130 C pmuludq prod = q*divisor 8
131 C psrlq climb = high(prod) 2
132 C --
133 C 20
134 C
135 C Yet the loop measures 19.0 c/l, so obviously there's something gained
136 C there over a straight reading of the chip documentation.
137
138 L(top):
139 C eax src, incrementing
140 C ebx
141 C ecx counter, limbs
142 C edx
143 C
144 C mm0 carry bit
145 C mm1 carry limb
146 C mm6 inverse
147 C mm7 divisor
148
149 movd (%eax), %mm2
150 addl $4, %eax
151
152 psubq %mm0, %mm2 C src - cbit
153
154 psubq %mm1, %mm2 C src - cbit - climb
155 movq %mm2, %mm0
156 psrlq $63, %mm0 C new cbit
157
158 pmuludq %mm6, %mm2 C s*inverse
159
160 movq %mm7, %mm1
161 pmuludq %mm2, %mm1 C q*divisor
162 psrlq $32, %mm1 C new climb
163
164 subl $1, %ecx
165 jnz L(top)
166
167
168 L(done):
169 paddq %mm1, %mm0
170 movd %mm0, %eax
171 emms
172 ret
173
174 EPILOGUE()
175 ASM_END()
176