mode1o.asm revision 1.1.1.1.8.1 1 dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
2
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4 dnl
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
11 dnl
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
16 dnl
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C P4: 19.0 cycles/limb
24
25
26 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
27 C mp_limb_t divisor);
28 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
29 C mp_limb_t divisor, mp_limb_t carry);
30 C
31
32 defframe(PARAM_CARRY, 16)
33 defframe(PARAM_DIVISOR,12)
34 defframe(PARAM_SIZE, 8)
35 defframe(PARAM_SRC, 4)
36
37 TEXT
38
39 ALIGN(16)
40 PROLOGUE(mpn_modexact_1c_odd)
41 deflit(`FRAME',0)
42
43 movd PARAM_CARRY, %mm1
44 jmp L(start_1c)
45
46 EPILOGUE()
47
48
49 ALIGN(16)
50 PROLOGUE(mpn_modexact_1_odd)
51 deflit(`FRAME',0)
52
53 pxor %mm1, %mm1 C carry limb
54 L(start_1c):
55 movl PARAM_DIVISOR, %eax
56
57 movd PARAM_DIVISOR, %mm7
58
59 shrl %eax
60
61 andl $127, %eax C d/2, 7 bits
62
63 ifdef(`PIC',`
64 LEA( binvert_limb_table, %edx)
65 movzbl (%eax,%edx), %eax C inv 8 bits
66 ',`
67 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
68 ')
69
70 C
71
72 movd %eax, %mm6 C inv
73
74 movd %eax, %mm0 C inv
75
76 pmuludq %mm6, %mm6 C inv*inv
77
78 C
79
80 pmuludq %mm7, %mm6 C inv*inv*d
81 paddd %mm0, %mm0 C 2*inv
82
83 C
84
85 psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d
86 pxor %mm6, %mm6
87
88 paddd %mm0, %mm6
89 pmuludq %mm0, %mm0 C inv*inv
90
91 C
92
93 pmuludq %mm7, %mm0 C inv*inv*d
94 paddd %mm6, %mm6 C 2*inv
95
96
97 movl PARAM_SRC, %eax
98 movl PARAM_SIZE, %ecx
99
100 C
101
102 psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d
103
104 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
105 pushl %eax FRAME_pushl()
106 movd %mm6, %eax
107 imul PARAM_DIVISOR, %eax
108 cmpl $1, %eax
109 popl %eax FRAME_popl()')
110
111 pxor %mm0, %mm0 C carry bit
112
113
114 C The dependent chain here is as follows.
115 C
116 C latency
117 C psubq s = (src-cbit) - climb 2
118 C pmuludq q = s*inverse 8
119 C pmuludq prod = q*divisor 8
120 C psrlq climb = high(prod) 2
121 C --
122 C 20
123 C
124 C Yet the loop measures 19.0 c/l, so obviously there's something gained
125 C there over a straight reading of the chip documentation.
126
127 L(top):
128 C eax src, incrementing
129 C ebx
130 C ecx counter, limbs
131 C edx
132 C
133 C mm0 carry bit
134 C mm1 carry limb
135 C mm6 inverse
136 C mm7 divisor
137
138 movd (%eax), %mm2
139 addl $4, %eax
140
141 psubq %mm0, %mm2 C src - cbit
142
143 psubq %mm1, %mm2 C src - cbit - climb
144 movq %mm2, %mm0
145 psrlq $63, %mm0 C new cbit
146
147 pmuludq %mm6, %mm2 C s*inverse
148
149 movq %mm7, %mm1
150 pmuludq %mm2, %mm1 C q*divisor
151 psrlq $32, %mm1 C new climb
152
153 subl $1, %ecx
154 jnz L(top)
155
156
157 L(done):
158 paddq %mm1, %mm0
159 movd %mm0, %eax
160 emms
161 ret
162
163 EPILOGUE()
164