mode1o.asm revision 1.1 1 1.1 mrg dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder.
2 1.1 mrg
3 1.1 mrg dnl Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
4 1.1 mrg dnl
5 1.1 mrg dnl This file is part of the GNU MP Library.
6 1.1 mrg dnl
7 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or
8 1.1 mrg dnl modify it under the terms of the GNU Lesser General Public License as
9 1.1 mrg dnl published by the Free Software Foundation; either version 3 of the
10 1.1 mrg dnl License, or (at your option) any later version.
11 1.1 mrg dnl
12 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful,
13 1.1 mrg dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 1.1 mrg dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 1.1 mrg dnl Lesser General Public License for more details.
16 1.1 mrg dnl
17 1.1 mrg dnl You should have received a copy of the GNU Lesser General Public License
18 1.1 mrg dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19 1.1 mrg
20 1.1 mrg include(`../config.m4')
21 1.1 mrg
22 1.1 mrg
23 1.1 mrg C cycles/limb
24 1.1 mrg C Athlon: 11.0
25 1.1 mrg C Hammer: 7.0
26 1.1 mrg
27 1.1 mrg
28 1.1 mrg C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
29 1.1 mrg C mp_limb_t divisor);
30 1.1 mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
31 1.1 mrg C mp_limb_t divisor, mp_limb_t carry);
32 1.1 mrg C
33 1.1 mrg C With the loop running at just 11 cycles it doesn't seem worth bothering to
34 1.1 mrg C check for high<divisor to save one step.
35 1.1 mrg C
36 1.1 mrg C Using a divl for size==1 measures slower than the modexact method, which
37 1.1 mrg C is not too surprising since for the latter it's only about 24 cycles to
38 1.1 mrg C calculate the modular inverse.
39 1.1 mrg
40 1.1 mrg defframe(PARAM_CARRY, 16)
41 1.1 mrg defframe(PARAM_DIVISOR,12)
42 1.1 mrg defframe(PARAM_SIZE, 8)
43 1.1 mrg defframe(PARAM_SRC, 4)
44 1.1 mrg
45 1.1 mrg defframe(SAVE_EBX, -4)
46 1.1 mrg defframe(SAVE_ESI, -8)
47 1.1 mrg defframe(SAVE_EDI, -12)
48 1.1 mrg defframe(SAVE_EBP, -16)
49 1.1 mrg
50 1.1 mrg deflit(STACK_SPACE, 16)
51 1.1 mrg
52 1.1 mrg TEXT
53 1.1 mrg
54 1.1 mrg ALIGN(16)
55 1.1 mrg PROLOGUE(mpn_modexact_1c_odd)
56 1.1 mrg deflit(`FRAME',0)
57 1.1 mrg
58 1.1 mrg movl PARAM_CARRY, %ecx
59 1.1 mrg jmp L(start_1c)
60 1.1 mrg
61 1.1 mrg EPILOGUE()
62 1.1 mrg
63 1.1 mrg
64 1.1 mrg ALIGN(16)
65 1.1 mrg PROLOGUE(mpn_modexact_1_odd)
66 1.1 mrg deflit(`FRAME',0)
67 1.1 mrg
68 1.1 mrg xorl %ecx, %ecx
69 1.1 mrg L(start_1c):
70 1.1 mrg movl PARAM_DIVISOR, %eax
71 1.1 mrg subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
72 1.1 mrg
73 1.1 mrg movl %esi, SAVE_ESI
74 1.1 mrg movl PARAM_DIVISOR, %esi
75 1.1 mrg
76 1.1 mrg movl %edi, SAVE_EDI
77 1.1 mrg
78 1.1 mrg shrl %eax C d/2
79 1.1 mrg
80 1.1 mrg andl $127, %eax
81 1.1 mrg
82 1.1 mrg ifdef(`PIC',`
83 1.1 mrg LEA( binvert_limb_table, %edi)
84 1.1 mrg movzbl (%eax,%edi), %edi C inv 8 bits
85 1.1 mrg ',`
86 1.1 mrg movzbl binvert_limb_table(%eax), %edi C inv 8 bits
87 1.1 mrg ')
88 1.1 mrg
89 1.1 mrg xorl %edx, %edx C initial extra carry
90 1.1 mrg leal (%edi,%edi), %eax C 2*inv
91 1.1 mrg
92 1.1 mrg imull %edi, %edi C inv*inv
93 1.1 mrg
94 1.1 mrg movl %ebp, SAVE_EBP
95 1.1 mrg movl PARAM_SIZE, %ebp
96 1.1 mrg
97 1.1 mrg movl %ebx, SAVE_EBX
98 1.1 mrg movl PARAM_SRC, %ebx
99 1.1 mrg
100 1.1 mrg imull %esi, %edi C inv*inv*d
101 1.1 mrg
102 1.1 mrg subl %edi, %eax C inv = 2*inv - inv*inv*d
103 1.1 mrg leal (%eax,%eax), %edi C 2*inv
104 1.1 mrg
105 1.1 mrg imull %eax, %eax C inv*inv
106 1.1 mrg
107 1.1 mrg imull %esi, %eax C inv*inv*d
108 1.1 mrg
109 1.1 mrg leal (%ebx,%ebp,4), %ebx C src end
110 1.1 mrg negl %ebp C -size
111 1.1 mrg
112 1.1 mrg subl %eax, %edi C inv = 2*inv - inv*inv*d
113 1.1 mrg
114 1.1 mrg ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
115 1.1 mrg movl %esi, %eax
116 1.1 mrg imull %edi, %eax
117 1.1 mrg cmpl $1, %eax')
118 1.1 mrg
119 1.1 mrg
120 1.1 mrg C The dependent chain here is
121 1.1 mrg C
122 1.1 mrg C cycles
123 1.1 mrg C subl %edx, %eax 1
124 1.1 mrg C imull %edi, %eax 4
125 1.1 mrg C mull %esi 6 (high limb)
126 1.1 mrg C ----
127 1.1 mrg C total 11
128 1.1 mrg C
129 1.1 mrg C Out of order execution hides the load latency for the source data, so no
130 1.1 mrg C special scheduling is required.
131 1.1 mrg
132 1.1 mrg L(top):
133 1.1 mrg C eax src limb
134 1.1 mrg C ebx src end ptr
135 1.1 mrg C ecx next carry bit, 0 or 1 (or initial carry param)
136 1.1 mrg C edx carry limb, high of last product
137 1.1 mrg C esi divisor
138 1.1 mrg C edi inverse
139 1.1 mrg C ebp counter, limbs, negative
140 1.1 mrg
141 1.1 mrg movl (%ebx,%ebp,4), %eax
142 1.1 mrg
143 1.1 mrg subl %ecx, %eax C apply carry bit
144 1.1 mrg movl $0, %ecx
145 1.1 mrg
146 1.1 mrg setc %cl C new carry bit
147 1.1 mrg
148 1.1 mrg subl %edx, %eax C apply carry limb
149 1.1 mrg adcl $0, %ecx
150 1.1 mrg
151 1.1 mrg imull %edi, %eax
152 1.1 mrg
153 1.1 mrg mull %esi
154 1.1 mrg
155 1.1 mrg incl %ebp
156 1.1 mrg jnz L(top)
157 1.1 mrg
158 1.1 mrg
159 1.1 mrg movl SAVE_ESI, %esi
160 1.1 mrg movl SAVE_EDI, %edi
161 1.1 mrg leal (%ecx,%edx), %eax
162 1.1 mrg
163 1.1 mrg movl SAVE_EBX, %ebx
164 1.1 mrg movl SAVE_EBP, %ebp
165 1.1 mrg addl $STACK_SPACE, %esp
166 1.1 mrg
167 1.1 mrg ret
168 1.1 mrg
169 1.1 mrg EPILOGUE()
170