mode1o.asm revision 1.1.1.2 1 1.1 mrg dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder.
2 1.1 mrg
3 1.1.1.2 mrg dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
4 1.1.1.2 mrg
5 1.1 mrg dnl This file is part of the GNU MP Library.
6 1.1 mrg dnl
7 1.1.1.2 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 1.1.1.2 mrg dnl it under the terms of either:
9 1.1.1.2 mrg dnl
10 1.1.1.2 mrg dnl * the GNU Lesser General Public License as published by the Free
11 1.1.1.2 mrg dnl Software Foundation; either version 3 of the License, or (at your
12 1.1.1.2 mrg dnl option) any later version.
13 1.1.1.2 mrg dnl
14 1.1.1.2 mrg dnl or
15 1.1.1.2 mrg dnl
16 1.1.1.2 mrg dnl * the GNU General Public License as published by the Free Software
17 1.1.1.2 mrg dnl Foundation; either version 2 of the License, or (at your option) any
18 1.1.1.2 mrg dnl later version.
19 1.1.1.2 mrg dnl
20 1.1.1.2 mrg dnl or both in parallel, as here.
21 1.1.1.2 mrg dnl
22 1.1.1.2 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 1.1.1.2 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 1.1.1.2 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 1.1.1.2 mrg dnl for more details.
26 1.1 mrg dnl
27 1.1.1.2 mrg dnl You should have received copies of the GNU General Public License and the
28 1.1.1.2 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 1.1.1.2 mrg dnl see https://www.gnu.org/licenses/.
30 1.1 mrg
31 1.1 mrg include(`../config.m4')
32 1.1 mrg
33 1.1 mrg
34 1.1 mrg C cycles/limb
35 1.1 mrg C Athlon: 11.0
36 1.1 mrg C Hammer: 7.0
37 1.1 mrg
38 1.1 mrg
39 1.1 mrg C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
40 1.1 mrg C mp_limb_t divisor);
41 1.1 mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
42 1.1 mrg C mp_limb_t divisor, mp_limb_t carry);
43 1.1 mrg C
44 1.1 mrg C With the loop running at just 11 cycles it doesn't seem worth bothering to
45 1.1 mrg C check for high<divisor to save one step.
46 1.1 mrg C
47 1.1 mrg C Using a divl for size==1 measures slower than the modexact method, which
48 1.1 mrg C is not too surprising since for the latter it's only about 24 cycles to
49 1.1 mrg C calculate the modular inverse.
50 1.1 mrg
51 1.1 mrg defframe(PARAM_CARRY, 16)
52 1.1 mrg defframe(PARAM_DIVISOR,12)
53 1.1 mrg defframe(PARAM_SIZE, 8)
54 1.1 mrg defframe(PARAM_SRC, 4)
55 1.1 mrg
56 1.1 mrg defframe(SAVE_EBX, -4)
57 1.1 mrg defframe(SAVE_ESI, -8)
58 1.1 mrg defframe(SAVE_EDI, -12)
59 1.1 mrg defframe(SAVE_EBP, -16)
60 1.1 mrg
61 1.1 mrg deflit(STACK_SPACE, 16)
62 1.1 mrg
63 1.1 mrg TEXT
64 1.1 mrg
65 1.1 mrg ALIGN(16)
66 1.1 mrg PROLOGUE(mpn_modexact_1c_odd)
67 1.1 mrg deflit(`FRAME',0)
68 1.1 mrg
69 1.1 mrg movl PARAM_CARRY, %ecx
70 1.1 mrg jmp L(start_1c)
71 1.1 mrg
72 1.1 mrg EPILOGUE()
73 1.1 mrg
74 1.1 mrg
75 1.1 mrg ALIGN(16)
76 1.1 mrg PROLOGUE(mpn_modexact_1_odd)
77 1.1 mrg deflit(`FRAME',0)
78 1.1 mrg
79 1.1 mrg xorl %ecx, %ecx
80 1.1 mrg L(start_1c):
81 1.1 mrg movl PARAM_DIVISOR, %eax
82 1.1 mrg subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
83 1.1 mrg
84 1.1 mrg movl %esi, SAVE_ESI
85 1.1 mrg movl PARAM_DIVISOR, %esi
86 1.1 mrg
87 1.1 mrg movl %edi, SAVE_EDI
88 1.1 mrg
89 1.1 mrg shrl %eax C d/2
90 1.1 mrg
91 1.1 mrg andl $127, %eax
92 1.1 mrg
93 1.1 mrg ifdef(`PIC',`
94 1.1 mrg LEA( binvert_limb_table, %edi)
95 1.1 mrg movzbl (%eax,%edi), %edi C inv 8 bits
96 1.1 mrg ',`
97 1.1 mrg movzbl binvert_limb_table(%eax), %edi C inv 8 bits
98 1.1 mrg ')
99 1.1 mrg
100 1.1 mrg xorl %edx, %edx C initial extra carry
101 1.1 mrg leal (%edi,%edi), %eax C 2*inv
102 1.1 mrg
103 1.1 mrg imull %edi, %edi C inv*inv
104 1.1 mrg
105 1.1 mrg movl %ebp, SAVE_EBP
106 1.1 mrg movl PARAM_SIZE, %ebp
107 1.1 mrg
108 1.1 mrg movl %ebx, SAVE_EBX
109 1.1 mrg movl PARAM_SRC, %ebx
110 1.1 mrg
111 1.1 mrg imull %esi, %edi C inv*inv*d
112 1.1 mrg
113 1.1 mrg subl %edi, %eax C inv = 2*inv - inv*inv*d
114 1.1 mrg leal (%eax,%eax), %edi C 2*inv
115 1.1 mrg
116 1.1 mrg imull %eax, %eax C inv*inv
117 1.1 mrg
118 1.1 mrg imull %esi, %eax C inv*inv*d
119 1.1 mrg
120 1.1 mrg leal (%ebx,%ebp,4), %ebx C src end
121 1.1 mrg negl %ebp C -size
122 1.1 mrg
123 1.1 mrg subl %eax, %edi C inv = 2*inv - inv*inv*d
124 1.1 mrg
125 1.1 mrg ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
126 1.1 mrg movl %esi, %eax
127 1.1 mrg imull %edi, %eax
128 1.1 mrg cmpl $1, %eax')
129 1.1 mrg
130 1.1 mrg
131 1.1 mrg C The dependent chain here is
132 1.1 mrg C
133 1.1 mrg C cycles
134 1.1 mrg C subl %edx, %eax 1
135 1.1 mrg C imull %edi, %eax 4
136 1.1 mrg C mull %esi 6 (high limb)
137 1.1 mrg C ----
138 1.1 mrg C total 11
139 1.1 mrg C
140 1.1 mrg C Out of order execution hides the load latency for the source data, so no
141 1.1 mrg C special scheduling is required.
142 1.1 mrg
143 1.1 mrg L(top):
144 1.1 mrg C eax src limb
145 1.1 mrg C ebx src end ptr
146 1.1 mrg C ecx next carry bit, 0 or 1 (or initial carry param)
147 1.1 mrg C edx carry limb, high of last product
148 1.1 mrg C esi divisor
149 1.1 mrg C edi inverse
150 1.1 mrg C ebp counter, limbs, negative
151 1.1 mrg
152 1.1 mrg movl (%ebx,%ebp,4), %eax
153 1.1 mrg
154 1.1 mrg subl %ecx, %eax C apply carry bit
155 1.1 mrg movl $0, %ecx
156 1.1 mrg
157 1.1 mrg setc %cl C new carry bit
158 1.1 mrg
159 1.1 mrg subl %edx, %eax C apply carry limb
160 1.1 mrg adcl $0, %ecx
161 1.1 mrg
162 1.1 mrg imull %edi, %eax
163 1.1 mrg
164 1.1 mrg mull %esi
165 1.1 mrg
166 1.1 mrg incl %ebp
167 1.1 mrg jnz L(top)
168 1.1 mrg
169 1.1 mrg
170 1.1 mrg movl SAVE_ESI, %esi
171 1.1 mrg movl SAVE_EDI, %edi
172 1.1 mrg leal (%ecx,%edx), %eax
173 1.1 mrg
174 1.1 mrg movl SAVE_EBX, %ebx
175 1.1 mrg movl SAVE_EBP, %ebp
176 1.1 mrg addl $STACK_SPACE, %esp
177 1.1 mrg
178 1.1 mrg ret
179 1.1 mrg
180 1.1 mrg EPILOGUE()
181 1.1.1.2 mrg ASM_END()
182