#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

/*
 * int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 *                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
 *
 * Montgomery multiplication: rp[] = ap[] * bp[] * R^-1 mod np[],
 * where R = 2^(64*num).  NOTE(review): contract inferred from the
 * standard OpenSSL bn_mul_mont interface — confirm against bn.h.
 *
 * In:   a0 = rp   result vector, num limbs
 *       a1 = ap   multiplicand
 *       a2 = bp   multiplier
 *       a3 = np   modulus
 *       a4 = n0p  pointer to n0 = -np[0]^-1 mod 2^64 (dereferenced below)
 *       a5 = num  limb count (32-bit int, sign-extended on entry)
 * Out:  v0 = 1 on success, 0 when num < 4 (caller falls back to C path)
 *
 * Register roles inside the procedure (Alpha regdef.h names):
 *       t0/t1 = lo/hi of ap[j]*bp[i] partial product
 *       t2/t3 = lo/hi of np[j]*m1 reduction product
 *       t4/t6 = next ap[j] / np[j] (pointer, then loaded value)
 *       t5    = bp[i], current multiplier word
 *       t7    = tp, cursor into the temporary vector on the stack
 *       t8..t11 = pipelined mulq/umulh results for the next j
 *       s3    = i (outer index), s4 = j (inner index), s5 = m1
 *       t12   = scratch (loop predicate / tp[j] / &tp[num])
 */
.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)	# 48-byte register-save frame
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp		# fp anchors the frame; sp moves below for tp[]
	.mask	0x0400f000,-48	# saved: ra(26), s3..s5(12-14), fp(15)
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	a5,a5		# num arrives as int; sign-extend to 64 bits
	mov	0,v0		# prepare "unsupported" return value
	cmplt	a5,4,AT
	bne	AT,.Lexit	# num<4: bail out, v0=0

	/* Allocate tp[num+2] below the frame: sp -= 8*num+16, then round
	 * sp down to a 4096-byte boundary. */
	ldq	t1,0(a1)	# ap[0]
	s8addq	a5,16,AT	# AT = 8*num+16
	ldq	t4,8(a1)	# ap[1]
	subq	sp,AT,sp
	ldq	t5,0(a2)	# bp[0]
	lda	AT,-4096(zero)	# mov -4096,AT
	ldq	a4,0(a4)	# a4 = n0 (dereference n0p once, up front)
	and	sp,AT,sp	# align tp[] area

	/* ---- 1st pass: tp[] = ap[]*bp[0] + reduction, j = 0 set-up ---- */
	mulq	t1,t5,t0	# lo(ap[0]*bp[0])
	ldq	t3,0(a3)	# np[0]
	umulh	t1,t5,t1	# hi(ap[0]*bp[0])
	ldq	t6,8(a3)	# np[1]

	mulq	t0,a4,s5	# m1 = lo0*n0 — Montgomery multiplier for this pass

	mulq	t3,s5,t2	# lo(np[0]*m1)
	umulh	t3,s5,t3	# hi(np[0]*m1)

	addq	t2,t0,t2	# lo1 += lo0; by construction low word becomes 0
	cmpult	t2,t0,AT	# carry out of the add
	addq	t3,AT,t3

	/* Prime the software pipeline with the j=1 products. */
	mulq	t4,t5,t8	# lo(ap[1]*bp[0])
	mov	2,s4		# j = 2
	umulh	t4,t5,t9	# hi(ap[1]*bp[0])
	mov	sp,t7		# tp cursor

	mulq	t6,s5,t10	# lo(np[1]*m1)
	s8addq	s4,a1,t4	# &ap[2]
	umulh	t6,s5,t11	# hi(np[1]*m1)
	s8addq	s4,a3,t6	# &np[2]
	.align	4
.L1st:
	/* Hand-scheduled: loads and next-j multiplies overlap the carry
	 * chain of the current j.  Do not let the assembler reorder. */
	.set	noreorder
	ldq	t4,0(t4)	# ap[j]
	addl	s4,1,s4		# j++
	ldq	t6,0(t6)	# np[j]
	lda	t7,8(t7)	# tp++

	addq	t8,t1,t0	# lo0 = lo(ap[j-1]*bp[0]) + previous hi0
	mulq	t4,t5,t8	# start lo(ap[j]*bp[0])
	cmpult	t0,t1,AT
	addq	t10,t3,t2	# lo1 = lo(np[j-1]*m1) + previous hi1

	mulq	t6,s5,t10	# start lo(np[j]*m1)
	addq	t9,AT,t1	# hi0 += carry
	cmpult	t2,t3,v0
	cmplt	s4,a5,t12	# loop predicate: j < num

	umulh	t4,t5,t9	# start hi(ap[j]*bp[0])
	addq	t11,v0,t3	# hi1 += carry
	addq	t2,t0,t2	# lo1 += lo0
	s8addq	s4,a1,t4	# &ap[j]

	umulh	t6,s5,t11	# start hi(np[j]*m1)
	cmpult	t2,t0,v0
	addq	t3,v0,t3	# hi1 += carry
	s8addq	s4,a3,t6	# &np[j]

	stq	t2,-8(t7)	# tp[j-1] = lo1
	nop
	unop
	bne	t12,.L1st
	.set	reorder

	/* Drain the pipeline for the last j of the 1st pass. */
	addq	t8,t1,t0
	addq	t10,t3,t2
	cmpult	t0,t1,AT
	cmpult	t2,t3,v0
	addq	t9,AT,t1
	addq	t11,v0,t3

	addq	t2,t0,t2
	cmpult	t2,t0,v0
	addq	t3,v0,t3

	stq	t2,0(t7)	# tp[num-2]

	addq	t3,t1,t3	# accumulate the two high words
	cmpult	t3,t1,AT
	stq	t3,8(t7)	# tp[num-1]
	stq	AT,16(t7)	# tp[num] = top carry

	mov	1,s3		# i = 1
	.align	4
.Louter:
	/* ---- outer loop: tp[] = (tp[] + ap[]*bp[i]) Montgomery-reduced ---- */
	s8addq	s3,a2,t5	# &bp[i]
	ldq	t1,0(a1)	# ap[0]
	ldq	t4,8(a1)	# ap[1]
	ldq	t5,0(t5)	# bi = bp[i]
	ldq	t3,0(a3)	# np[0]
	ldq	t6,8(a3)	# np[1]
	ldq	t12,0(sp)	# tp[0]

	mulq	t1,t5,t0	# lo(ap[0]*bi)
	umulh	t1,t5,t1	# hi(ap[0]*bi)

	addq	t0,t12,t0	# lo0 += tp[0]
	cmpult	t0,t12,AT
	addq	t1,AT,t1

	mulq	t0,a4,s5	# m1 = lo0*n0

	mulq	t3,s5,t2	# lo(np[0]*m1)
	umulh	t3,s5,t3	# hi(np[0]*m1)

	addq	t2,t0,t2	# lo1 += lo0; low word annihilated
	cmpult	t2,t0,AT
	mov	2,s4		# j = 2
	addq	t3,AT,t3

	mulq	t4,t5,t8	# prime pipeline: lo(ap[1]*bi)
	mov	sp,t7		# tp cursor
	umulh	t4,t5,t9	# hi(ap[1]*bi)

	mulq	t6,s5,t10	# lo(np[1]*m1)
	s8addq	s4,a1,t4	# &ap[2]
	umulh	t6,s5,t11	# hi(np[1]*m1)
	.align	4
.Linner:
	/* Inner loop scheduled for EV5/EV6 dual issue; the #L0/#U1/#L1/#U0
	 * tags record the intended lower/upper pipe slot per cycle. */
	.set	noreorder
	ldq	t12,8(t7)	#L0	tp[j]
	nop			#U1
	ldq	t4,0(t4)	#L1	ap[j]
	s8addq	s4,a3,t6	#U0	&np[j]

	ldq	t6,0(t6)	#L0	np[j]
	nop			#U1
	addq	t8,t1,t0	#L1	lo0 += previous hi0
	lda	t7,8(t7)	# tp++

	mulq	t4,t5,t8	#U1	start lo(ap[j]*bi)
	cmpult	t0,t1,AT	#L0
	addq	t10,t3,t2	#L1	lo1 += previous hi1
	addl	s4,1,s4		# j++

	mulq	t6,s5,t10	#U1	start lo(np[j]*m1)
	addq	t9,AT,t1	#L0	hi0 += carry
	addq	t0,t12,t0	#L1	lo0 += tp[j-1]
	cmpult	t2,t3,v0	#U0

	umulh	t4,t5,t9	#U1	start hi(ap[j]*bi)
	cmpult	t0,t12,AT	#L0
	addq	t2,t0,t2	#L1	lo1 += lo0
	addq	t11,v0,t3	#U0	hi1 += carry

	umulh	t6,s5,t11	#U1	start hi(np[j]*m1)
	s8addq	s4,a1,t4	#L0	&ap[j]
	cmpult	t2,t0,v0	#L1
	cmplt	s4,a5,t12	#U0	# borrow t12

	addq	t1,AT,t1	#L0	hi0 += carry
	addq	t3,v0,t3	#U1	hi1 += carry
	stq	t2,-8(t7)	#L1	tp[j-2] = lo1
	bne	t12,.Linner	#U0
	.set	reorder

	/* Drain the pipeline for the last j, folding in tp[num-1]/tp[num]. */
	ldq	t12,8(t7)	# tp[num-1]
	addq	t8,t1,t0
	addq	t10,t3,t2
	cmpult	t0,t1,AT
	cmpult	t2,t3,v0
	addq	t9,AT,t1
	addq	t11,v0,t3

	addq	t0,t12,t0
	cmpult	t0,t12,AT
	addq	t1,AT,t1

	ldq	t12,16(t7)	# tp[num] (previous top carry)
	addq	t2,t0,s4
	cmpult	s4,t0,v0
	addq	t3,v0,t3

	addq	t3,t1,t2
	stq	s4,0(t7)	# tp[num-2]
	cmpult	t2,t1,t3
	addq	t2,t12,t2
	cmpult	t2,t12,AT
	addl	s3,1,s3		# i++
	addq	t3,AT,t3
	stq	t2,8(t7)	# tp[num-1]
	cmplt	s3,a5,t12	# borrow t12
	stq	t3,16(t7)	# tp[num] = new top carry
	bne	t12,.Louter

	/* ---- final reduction: rp[] = tp[] - np[] with borrow ---- */
	s8addq	a5,sp,t12	# &tp[num]
	mov	a0,a2	# put rp aside
	mov	sp,t7
	mov	sp,a1
	mov	0,t1	# clear borrow bit

	.align	4
.Lsub:	ldq	t0,0(t7)
	ldq	t2,0(a3)
	lda	t7,8(t7)
	lda	a3,8(a3)
	subq	t0,t2,t2	# tp[i]-np[i]
	cmpult	t0,t2,AT	# borrow out of the subtract
	subq	t2,t1,t0	# propagate incoming borrow
	cmpult	t2,t0,t1
	or	t1,AT,t1	# combined borrow for next limb
	stq	t0,0(a0)	# rp[i] = tp[i]-np[i] (provisionally)
	cmpult	t7,t12,v0	# t7 < &tp[num]?
	lda	a0,8(a0)
	bne	v0,.Lsub

	subq	t3,t1,t1	# handle upmost overflow bit
	mov	sp,t7
	mov	a2,a0	# restore rp

	/* If t1 != 0 the subtraction underflowed (tp < np): overwrite the
	 * provisional rp[] with tp[].  Either way, zero tp[] as we go so no
	 * secret intermediates are left on the stack. */
	.align	4
.Lcopy:	ldq	t4,0(t7)	# conditional copy
	ldq	t6,0(a0)
	lda	t7,8(t7)
	lda	a0,8(a0)
	cmoveq	t1,t6,t4	# t1==0: keep rp[i] (= tp-np); else take tp[i]
	stq	zero,-8(t7)	# zap tp
	cmpult	t7,t12,AT
	stq	t4,-8(a0)
	bne	AT,.Lcopy
	mov	1,v0		# success

.Lexit:
	.set	noreorder
	mov	fp,sp		# discard tp[] area, restore save frame
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro (at) openssl.org>"
.align	2