alpha-mont.S revision 1.1.4.1 1 #ifdef __linux__
2 #include <asm/regdef.h>
3 #else
4 #include <asm.h>
5 #include <regdef.h>
6 #endif
7
8 .text
9
10 .set noat
11 .set noreorder
12
#------------------------------------------------------------------------
# int bn_mul_mont(BN_ULONG *rp,        # a0: result, num words
#                 const BN_ULONG *ap,  # a1: first multiplicand
#                 const BN_ULONG *bp,  # a2: second multiplicand
#                 const BN_ULONG *np,  # a3: modulus
#                 const BN_ULONG *n0,  # a4: pointer to Montgomery n0 word
#                 int num)             # a5: vector length in 64-bit words
#
# Word-serial Montgomery multiplication for Alpha.  Returns v0=1 on
# success; returns v0=0 without computing anything when num<4, so the
# caller can fall back to a generic code path.
#
# Layout: a 48-byte register-save frame (ra,s3,s4,s5,fp) is anchored by
# fp; below it a temporary vector tp[num+2] is carved from the stack and
# sp is rounded down to a 4096-byte boundary.
#
# Register roles in the main loops:
#   s3 = outer index i     s4 = inner index j    s5 = m1 = tp[0]*n0
#   t5 = bp[i]             t7 = &tp[j]
#   t0/t1 = lo/hi running ap[j]*bp[i] chain, t2/t3 = lo/hi np[j]*m1 chain
#   t8/t9, t10/t11 = next iteration's mulq/umulh results (software
#   pipelining: products for j+1 are started while j is being summed)
#
# The #U0/#U1/#L0/#L1 comments in .Linner mark the intended issue slots
# (upper/lower pipes) of the hand-scheduled .set noreorder code; do not
# reorder those instructions.
#------------------------------------------------------------------------
13 .globl bn_mul_mont
14 .align 5
15 .ent bn_mul_mont
16 bn_mul_mont:
17 lda sp,-48(sp)	# allocate register-save frame
18 stq ra,0(sp)
19 stq s3,8(sp)
20 stq s4,16(sp)
21 stq s5,24(sp)
22 stq fp,32(sp)
23 mov sp,fp		# fp anchors the frame; sp moves again below
24 .mask 0x0400f000,-48
25 .frame fp,48,ra
26 .prologue 0
27
28 .align 4
29 .set reorder
30 sextl a5,a5		# num arrives as a 32-bit int
31 mov 0,v0		# prepare "unsupported" return value
32 cmplt a5,4,AT
33 bne AT,.Lexit	# num<4: bail out with v0=0
34
# --- allocate tp[num+2] on the stack and set up the first outer
# --- iteration (i=0): m1 = (ap[0]*bp[0]) * n0 mod 2^64
35 ldq t1,0(a1) # ap[0]
36 s8addq a5,16,AT	# AT = num*8+16 bytes for tp
37 ldq t4,8(a1)		# ap[1]
38 subq sp,AT,sp	# carve tp out of the stack
39 ldq t5,0(a2) # bp[0]
40 lda AT,-4096(zero) # mov -4096,AT
41 ldq a4,0(a4)		# a4 = n0 word itself from now on
42 and sp,AT,sp		# round sp down to 4096-byte boundary
43
44 mulq t1,t5,t0	# t0 = lo(ap[0]*bp[0])
45 ldq t3,0(a3) # np[0]
46 umulh t1,t5,t1	# t1 = hi(ap[0]*bp[0])
47 ldq t6,8(a3)		# np[1]
48
49 mulq t0,a4,s5	# s5 = m1 = t0*n0 mod 2^64
50
51 mulq t3,s5,t2	# t2 = lo(np[0]*m1)
52 umulh t3,s5,t3	# t3 = hi(np[0]*m1)
53
54 addq t2,t0,t2	# t2+t0 is 0 mod 2^64 by choice of m1...
55 cmpult t2,t0,AT	# ...so only its carry matters
56 addq t3,AT,t3
57
# --- prime the software pipeline with the j=1 products
58 mulq t4,t5,t8	# ap[1]*bp[0]
59 mov 2,s4		# j = 2
60 umulh t4,t5,t9
61 mov sp,t7		# t7 = &tp[0]
62
63 mulq t6,s5,t10	# np[1]*m1
64 s8addq s4,a1,t4	# t4 = &ap[j]
65 umulh t6,s5,t11
66 s8addq s4,a3,t6	# t6 = &np[j]
67 .align 4
# --- first pass (i=0): tp[j-1] = lo(ap[j-1]*bp[0] + np[j-1]*m1 + carries),
# --- while the j-th products are already in flight
68 .L1st:
69 .set noreorder
70 ldq t4,0(t4)		# ap[j]
71 addl s4,1,s4		# j++
72 ldq t6,0(t6)		# np[j]
73 lda t7,8(t7)		# advance tp pointer
74
75 addq t8,t1,t0	# fold ap-chain carry into lo(ap[j-1]*bp[0])
76 mulq t4,t5,t8	# start ap[j]*bp[0]
77 cmpult t0,t1,AT
78 addq t10,t3,t2	# fold np-chain carry into lo(np[j-1]*m1)
79
80 mulq t6,s5,t10	# start np[j]*m1
81 addq t9,AT,t1	# new ap-chain carry
82 cmpult t2,t3,v0
83 cmplt s4,a5,t12	# loop condition j<num (t12 as scratch)
84
85 umulh t4,t5,t9
86 addq t11,v0,t3	# new np-chain carry
87 addq t2,t0,t2	# combine the two chains
88 s8addq s4,a1,t4	# t4 = &ap[j]
89
90 umulh t6,s5,t11
91 cmpult t2,t0,v0
92 addq t3,v0,t3
93 s8addq s4,a3,t6	# t6 = &np[j]
94
95 stq t2,-8(t7)	# tp[j-2] = t2
96 nop
97 unop
98 bne t12,.L1st
99 .set reorder
100
# --- first-pass tail: last word plus the two carry words
101 addq t8,t1,t0
102 addq t10,t3,t2
103 cmpult t0,t1,AT
104 cmpult t2,t3,v0
105 addq t9,AT,t1
106 addq t11,v0,t3
107
108 addq t2,t0,t2
109 cmpult t2,t0,v0
110 addq t3,v0,t3
111
112 stq t2,0(t7)	# tp[num-2]
113
114 addq t3,t1,t3	# sum of the two chains' high carries
115 cmpult t3,t1,AT
116 stq t3,8(t7)	# tp[num-1]
117 stq AT,16(t7)	# tp[num] = upmost carry bit
118
119 mov 1,s3		# outer index i = 1
120 .align 4
# --- outer loop: for i=1..num-1, tp = (tp + ap*bp[i] + np*m1) / 2^64
121 .Louter:
122 s8addq s3,a2,t5	# t5 = &bp[i]
123 ldq t1,0(a1)		# ap[0]
124 ldq t4,8(a1)		# ap[1]
125 ldq t5,0(t5)		# t5 = bp[i]
126 ldq t3,0(a3)		# np[0]
127 ldq t6,8(a3)		# np[1]
128 ldq t12,0(sp)	# tp[0]
129
130 mulq t1,t5,t0	# ap[0]*bp[i]
131 umulh t1,t5,t1
132
133 addq t0,t12,t0	# + tp[0]
134 cmpult t0,t12,AT
135 addq t1,AT,t1
136
137 mulq t0,a4,s5	# m1 for this outer iteration
138
139 mulq t3,s5,t2	# np[0]*m1
140 umulh t3,s5,t3
141
142 addq t2,t0,t2	# low word cancels; keep carry only
143 cmpult t2,t0,AT
144 mov 2,s4		# j = 2
145 addq t3,AT,t3
146
147 mulq t4,t5,t8	# prime pipeline: ap[1]*bp[i]
148 mov sp,t7		# t7 = &tp[0]
149 umulh t4,t5,t9
150
151 mulq t6,s5,t10	# np[1]*m1
152 s8addq s4,a1,t4	# t4 = &ap[j]
153 umulh t6,s5,t11
154 .align 4
# --- inner loop, dual-issue scheduled; slot tags #U0/#U1/#L0/#L1 below
155 .Linner:
156 .set noreorder
157 ldq t12,8(t7) #L0	# tp[j-1]
158 nop #U1
159 ldq t4,0(t4) #L1	# ap[j]
160 s8addq s4,a3,t6 #U0
161
162 ldq t6,0(t6) #L0	# np[j]
163 nop #U1
164 addq t8,t1,t0 #L1
165 lda t7,8(t7)
166
167 mulq t4,t5,t8 #U1	# start ap[j]*bp[i]
168 cmpult t0,t1,AT #L0
169 addq t10,t3,t2 #L1
170 addl s4,1,s4		# j++
171
172 mulq t6,s5,t10 #U1	# start np[j]*m1
173 addq t9,AT,t1 #L0
174 addq t0,t12,t0 #L1	# + tp[j-1]
175 cmpult t2,t3,v0 #U0
176
177 umulh t4,t5,t9 #U1
178 cmpult t0,t12,AT #L0
179 addq t2,t0,t2 #L1
180 addq t11,v0,t3 #U0
181
182 umulh t6,s5,t11 #U1
183 s8addq s4,a1,t4 #L0
184 cmpult t2,t0,v0 #L1
185 cmplt s4,a5,t12 #U0 # borrow t12
186
187 addq t1,AT,t1 #L0
188 addq t3,v0,t3 #U1
189 stq t2,-8(t7) #L1	# tp[j-2]
190 bne t12,.Linner #U0
191 .set reorder
192
# --- inner-loop tail: last word, add carried-in tp words, store the
# --- two high words back at tp[num-1] / tp[num]
193 ldq t12,8(t7)		# tp[num-1]
194 addq t8,t1,t0
195 addq t10,t3,t2
196 cmpult t0,t1,AT
197 cmpult t2,t3,v0
198 addq t9,AT,t1
199 addq t11,v0,t3
200
201 addq t0,t12,t0
202 cmpult t0,t12,AT
203 addq t1,AT,t1
204
205 ldq t12,16(t7)	# tp[num] (previous upmost carry)
206 addq t2,t0,s4		# s4 reused as scratch here; reset to 2 at .Louter
207 cmpult s4,t0,v0
208 addq t3,v0,t3
209
210 addq t3,t1,t2
211 stq s4,0(t7)		# tp[num-2]
212 cmpult t2,t1,t3
213 addq t2,t12,t2
214 cmpult t2,t12,AT
215 addl s3,1,s3		# i++
216 addq t3,AT,t3
217 stq t2,8(t7)		# tp[num-1]
218 cmplt s3,a5,t12 # borrow t12
219 stq t3,16(t7)	# tp[num] = new upmost carry
220 bne t12,.Louter
221
# --- final reduction: rp = tp - np with borrow propagation
222 s8addq a5,sp,t12 # &tp[num]
224 mov a0,a2 # put rp aside
225 mov sp,t7		# t7 walks tp
226 mov sp,a1		# a1 no longer needed as ap
227 mov 0,t1 # clear borrow bit
228
229 .align 4
230 .Lsub: ldq t0,0(t7)
231 ldq t2,0(a3)
232 lda t7,8(t7)
233 lda a3,8(a3)
234 subq t0,t2,t2 # tp[i]-np[i]
235 cmpult t0,t2,AT	# borrow out of the subtraction
236 subq t2,t1,t0	# propagate incoming borrow
237 cmpult t2,t0,t1
238 or t1,AT,t1		# combined borrow for next word
239 stq t0,0(a0)		# rp[i] = tentative difference
240 cmpult t7,t12,v0	# t7 < &tp[num]?
241 lda a0,8(a0)
242 bne v0,.Lsub
243
244 subq t3,t1,t1 # handle upmost overflow bit
245 mov sp,t7
246 mov a2,a0 # restore rp
247
# --- select tp (t1!=0, i.e. subtraction borrowed) or tp-np (t1==0)
# --- into rp, zeroing tp as we go
248 .align 4
249 .Lcopy: ldq t4,0(t7) # conditional copy
250 ldq t6,0(a0)		# rp[i] already holds tp[i]-np[i]
251 lda t7,8(t7)
252 lda a0,8(a0)
253 cmoveq t1,t6,t4	# if no net borrow, keep the subtracted value
254 stq zero,-8(t7) # zap tp
255 cmpult t7,t12,AT
256 stq t4,-8(a0)
257 bne AT,.Lcopy
258 mov 1,v0		# success
259
260 .Lexit:
261 .set noreorder
262 mov fp,sp		# discard tp area, back to save frame
263 /*ldq ra,0(sp)*/
264 ldq s3,8(sp)
265 ldq s4,16(sp)
266 ldq s5,24(sp)
267 ldq fp,32(sp)
268 lda sp,48(sp)
269 ret (ra)
270 .end bn_mul_mont
271 .ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro (at) openssl.org>"
272 .align 2
273