1 dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder.
2
3 dnl Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
4 dnl
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
11 dnl
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
16 dnl
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22
23 C cycles/limb
24 C Athlon: 11.0
25 C Hammer: 7.0
26
27
28 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
29 C mp_limb_t divisor);
30 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
31 C mp_limb_t divisor, mp_limb_t carry);
32 C
33 C With the loop running at just 11 cycles it doesn't seem worth bothering to
34 C check for high<divisor to save one step.
35 C
36 C Using a divl for size==1 measures slower than the modexact method, which
37 C is not too surprising since for the latter it's only about 24 cycles to
38 C calculate the modular inverse.
39
C Incoming stack arguments.  The offsets follow the argument order of the C
C prototypes shown earlier in this file: src, size, divisor, carry.
C PARAM_CARRY is read only by the mpn_modexact_1c_odd entry point.
C NOTE(review): defframe is defined outside this file; the offsets are
C presumably relative to esp at function entry and adjusted through FRAME
C -- confirm against the m4 definitions.
40 defframe(PARAM_CARRY, 16)
41 defframe(PARAM_DIVISOR,12)
42 defframe(PARAM_SIZE, 8)
43 defframe(PARAM_SRC, 4)
44
C Save slots for the four registers the routine stores on entry and reloads
C before returning: ebx, esi, edi, ebp.  The slots are 4 bytes apart.
45 defframe(SAVE_EBX, -4)
46 defframe(SAVE_ESI, -8)
47 defframe(SAVE_EDI, -12)
48 defframe(SAVE_EBP, -16)
49
C Bytes subtracted from esp after entry and added back before ret; this
C matches the four 4-byte SAVE_ slots above.
50 deflit(STACK_SPACE, 16)
51
52 TEXT
53
C mpn_modexact_1c_odd -- entry point that takes an explicit initial carry.
C
C Loads the carry argument into ecx and jumps to L(start_1c), a label that
C sits inside mpn_modexact_1_odd and holds the shared body.  Nothing is
C pushed and esp is not changed before the jump, and FRAME is set to 0 at
C both entry points, so the PARAM_ offsets used by the shared code apply
C equally to a call made through this entry.
54 ALIGN(16)
55 PROLOGUE(mpn_modexact_1c_odd)
56 deflit(`FRAME',0)
57
58 movl PARAM_CARRY, %ecx C ecx = caller supplied initial carry
59 jmp L(start_1c) C continue in the shared body
60
61 EPILOGUE()
62
63
C mpn_modexact_1_odd -- entry point with no carry-in (ecx starts at 0).
C mpn_modexact_1c_odd jumps to L(start_1c) below with ecx already loaded.
C
C Steps:
C   1. Reserve STACK_SPACE bytes and save esi, edi, ebp, ebx in the SAVE_
C      slots; they are reloaded just before ret.
C   2. Compute inv with d*inv == 1 mod 2^GMP_LIMB_BITS.  An 8-bit starting
C      value is read from binvert_limb_table at index (d>>1)&127, then
C      inv = 2*inv - inv*inv*d is applied twice.  Each application doubles
C      the number of correct low bits (8 -> 16 -> 32).
C   3. Walk the limbs from src[0] upward.  For each limb: subtract the
C      carry bit and the carry limb, multiply by inv, then multiply by d
C      and keep the high half of that product as the next carry limb.
C   4. Leave carry bit + carry limb in eax as the result.
C
C NOTE(review): the loop tests its counter only after the first iteration,
C so size is presumably required to be at least 1 -- confirm with callers.
C NOTE(review): the table index drops the low bit of d, which fits the
C divisor being odd as the function name says; the table itself is defined
C outside this file -- confirm its size and contents there.
64 ALIGN(16)
65 PROLOGUE(mpn_modexact_1_odd)
66 deflit(`FRAME',0)
67
68 xorl %ecx, %ecx C carry-in = 0 for this entry point
69 L(start_1c):
C The divisor is loaded twice: eax is consumed as scratch for the table
C index, esi keeps the copy that the inverse computation and the loop use.
C NOTE(review): FRAME_subl_esp presumably records the esp adjustment so
C that PARAM_ and SAVE_ references still resolve -- confirm in the m4 macros.
70 movl PARAM_DIVISOR, %eax
71 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
72
73 movl %esi, SAVE_ESI
74 movl PARAM_DIVISOR, %esi
75
76 movl %edi, SAVE_EDI
77
78 shrl %eax C d/2
79
80 andl $127, %eax
81
82 ifdef(`PIC',`
83 LEA( binvert_limb_table, %edi)
84 movzbl (%eax,%edi), %edi C inv 8 bits
85 ',`
86 movzbl binvert_limb_table(%eax), %edi C inv 8 bits
87 ')
88
C First refinement step.  The remaining register saves and parameter loads
C are interleaved with the multiplies (presumably to overlap with imull
C latency -- the instructions are independent of the multiply chain).
89 xorl %edx, %edx C initial extra carry
90 leal (%edi,%edi), %eax C 2*inv
91
92 imull %edi, %edi C inv*inv
93
94 movl %ebp, SAVE_EBP
95 movl PARAM_SIZE, %ebp
96
97 movl %ebx, SAVE_EBX
98 movl PARAM_SRC, %ebx
99
100 imull %esi, %edi C inv*inv*d
101
C Second refinement step: eax holds the improved inverse, edi receives the
C final one.
102 subl %edi, %eax C inv = 2*inv - inv*inv*d
103 leal (%eax,%eax), %edi C 2*inv
104
105 imull %eax, %eax C inv*inv
106
107 imull %esi, %eax C inv*inv*d
108
C ebx is moved to the end of the source and ebp becomes -size, so that
C (%ebx,%ebp,4) addresses src[0] first and the loop ends when ebp reaches 0.
109 leal (%ebx,%ebp,4), %ebx C src end
110 negl %ebp C -size
111
112 subl %eax, %edi C inv = 2*inv - inv*inv*d
113
114 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
115 movl %esi, %eax
116 imull %edi, %eax
117 cmpl $1, %eax')
118
119
120 C The dependent chain here is
121 C
122 C cycles
123 C subl %edx, %eax 1
124 C imull %edi, %eax 4
125 C mull %esi 6 (high limb)
126 C ----
127 C total 11
128 C
129 C Out of order execution hides the load latency for the source data, so no
130 C special scheduling is required.
131
132 L(top):
133 C eax src limb
134 C ebx src end ptr
135 C ecx next carry bit, 0 or 1 (or initial carry param)
136 C edx carry limb, high of last product
137 C esi divisor
138 C edi inverse
139 C ebp counter, limbs, negative
140
141 movl (%ebx,%ebp,4), %eax
142
143 subl %ecx, %eax C apply carry bit
144 movl $0, %ecx C mov leaves CF intact for the setc below (xorl would clear it)
145
146 setc %cl C new carry bit
147
148 subl %edx, %eax C apply carry limb
149 adcl $0, %ecx C add the borrow from the carry limb subtraction
150
151 imull %edi, %eax C q = limb * inv, low 32 bits
152
153 mull %esi C edx:eax = q * d; edx is the next carry limb
154
155 incl %ebp C step to the next limb; sets ZF once the counter reaches 0
156 jnz L(top)
157
158
C Restore the saved registers and form the result: carry bit + carry limb.
159 movl SAVE_ESI, %esi
160 movl SAVE_EDI, %edi
161 leal (%ecx,%edx), %eax
162
163 movl SAVE_EBX, %ebx
164 movl SAVE_EBP, %ebp
165 addl $STACK_SPACE, %esp
166
167 ret
168
169 EPILOGUE()
170