1 # $NetBSD: bn_asm_vax.S,v 1.1.1.2 2023/04/18 14:19:11 christos Exp $ 2 # 3 # w.j.m. 15-jan-1999 4 # 5 # it's magic ... 6 # 7 # ULONG bn_mul_add_words(ULONG r[],ULONG a[],int n,ULONG w) { 8 # ULONG c = 0; 9 # int i; 10 # for(i = 0; i < n; i++) <c,r[i]> := r[i] + c + a[i] * w ; 11 # return c; 12 # } 13 14 .globl bn_mul_add_words 15 .type bn_mul_add_words@function 16 17 bn_mul_add_words: 18 .word 0x40 19 20 movl 4(%ap),%r2 # *r 21 movl 8(%ap),%r3 # *a 22 movl 12(%ap),%r4 # n 23 movl 16(%ap),%r5 # w 24 clrl %r6 # return value ("carry") 25 26 0: emul %r5,(%r3),(%r2),%r0 # w * a[0] + r[0] -> r0 27 28 # fixup for "negative" r[] 29 tstl (%r2) 30 bgeq 1f 31 incl %r1 # add 1 to highword 32 33 1: # add saved carry to result 34 addl2 %r6,%r0 35 adwc $0,%r1 36 37 # combined fixup for "negative" w, a[] 38 tstl %r5 # if w is negative... 39 bgeq 1f 40 addl2 (%r3),%r1 # ...add a[0] again to highword 41 1: tstl (%r3) # if a[0] is negative... 42 bgeq 1f 43 addl2 %r5,%r1 # ...add w again to highword 44 1: 45 movl %r0,(%r2)+ # save low word in dest & advance *r 46 addl2 $4,%r3 # advance *a 47 movl %r1,%r6 # high word in r6 for return value 48 49 sobgtr %r4,0b # loop? 50 51 movl %r6,%r0 52 ret 53 .size bn_mul_add_words, .-bn_mul_add_words 54 55 # .title vax_bn_mul_words unsigned multiply & add, 32*32+32=>64 56 #; 57 #; w.j.m. 15-jan-1999 58 #; 59 #; it's magic ... 60 #; 61 #; ULONG bn_mul_words(ULONG r[],ULONG a[],int n,ULONG w) { 62 #; ULONG c = 0; 63 #; int i; 64 #; for(i = 0; i < num; i++) <c,r[i]> := a[i] * w + c ; 65 #; return(c); 66 #; } 67 # 68 .globl bn_mul_words 69 .type bn_mul_words@function 70 bn_mul_words: 71 .word 0x40 72 73 movl 4(%ap),%r2 # *r 74 movl 8(%ap),%r3 # *a 75 movl 12(%ap),%r4 # n 76 movl 16(%ap),%r5 # w 77 clrl %r6 # carry 78 79 0: emul %r5,(%r3),%r6,%r0 # w * a[0] + carry -> r0 80 81 # fixup for "negative" carry 82 tstl %r6 83 bgeq 1f 84 incl %r1 85 86 1: # combined fixup for "negative" w, a[] 87 tstl %r5 88 bgeq 1f 89 addl2 (%r3),%r1 90 1: tstl (%r3) 91 bgeq 1f 92 addl2 %r5,%r1 93 94 1: movl %r0,(%r2)+ 95 addl2 $4,%r3 96 movl %r1,%r6 97 98 sobgtr %r4,0b 99 100 movl %r6,%r0 101 ret 102 .size bn_mul_words, .-bn_mul_words 103 104 105 106 # .title vax_bn_sqr_words unsigned square, 32*32=>64 107 #; 108 #; w.j.m. 15-jan-1999 109 #; 110 #; it's magic ... 111 #; 112 #; void bn_sqr_words(ULONG r[],ULONG a[],int n) { 113 #; int i; 114 #; for(i = 0; i < n; i++) <r[2*i+1],r[2*i]> := a[i] * a[i] ; 115 #; } 116 # 117 .globl bn_sqr_words 118 .type bn_sqr_words@function 119 bn_sqr_words: 120 .word 0 121 122 movl 4(%ap),%r2 # r 123 movl 8(%ap),%r3 # a 124 movl 12(%ap),%r4 # n 125 126 0: movl (%r3)+,%r5 # r5 = a[] & advance 127 128 emul %r5,%r5,$0,%r0 # a[0] * a[0] + 0 -> r0 129 130 # fixup for "negative" a[] 131 tstl %r5 132 bgeq 1f 133 addl2 %r5,%r1 134 addl2 %r5,%r1 135 136 1: movq %r0,(%r2)+ # store 64-bit result 137 138 sobgtr %r4,0b # loop 139 140 ret 141 .size bn_sqr_words, .-bn_sqr_words 142 143 144 # .title vax_bn_div_words unsigned divide 145 #; 146 #; Richard Levitte 20-Nov-2000 147 #; 148 #; ULONG bn_div_words(ULONG h, ULONG l, ULONG d) 149 #; { 150 #; return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d); 151 #; } 152 #; 153 #; Using EDIV would be very easy, if it didn't do signed calculations. 154 #; Any time any of the input numbers are signed, there are problems, 155 #; usually with integer overflow, at which point it returns useless 156 #; data (the quotient gets the value of l, and the remainder becomes 0). 157 #; 158 #; If it was just for the dividend, it would be very easy, just divide 159 #; it by 2 (unsigned), do the division, multiply the resulting quotient 160 #; and remainder by 2, add the bit that was dropped when dividing by 2 161 #; to the remainder, and do some adjustment so the remainder doesn't 162 #; end up larger than the divisor. For some cases when the divisor is 163 #; negative (from EDIV's point of view, i.e. when the highest bit is set), 164 #; dividing the dividend by 2 isn't enough, and since some operations 165 #; might generate integer overflows even when the dividend is divided by 166 #; 4 (when the high part of the shifted down dividend ends up being exactly 167 #; half of the divisor, the result is the quotient 0x80000000, which is 168 #; negative...) it needs to be divided by 8. Furthermore, the divisor needs 169 #; to be divided by 2 (unsigned) as well, to avoid more problems with the sign. 170 #; In this case, a little extra fiddling with the remainder is required. 171 #; 172 #; So, the simplest way to handle this is always to divide the dividend 173 #; by 8, and to divide the divisor by 2 if it's highest bit is set. 174 #; After EDIV has been used, the quotient gets multiplied by 8 if the 175 #; original divisor was positive, otherwise 4. The remainder, oddly 176 #; enough, is *always* multiplied by 8. 177 #; NOTE: in the case mentioned above, where the high part of the shifted 178 #; down dividend ends up being exactly half the shifted down divisor, we 179 #; end up with a 33 bit quotient. That's no problem however, it usually 180 #; means we have ended up with a too large remainder as well, and the 181 #; problem is fixed by the last part of the algorithm (next paragraph). 182 #; 183 #; The routine ends with comparing the resulting remainder with the 184 #; original divisor and if the remainder is larger, subtract the 185 #; original divisor from it, and increase the quotient by 1. This is 186 #; done until the remainder is smaller than the divisor. 187 #; 188 #; The complete algorithm looks like this: 189 #; 190 #; d' = d 191 #; l' = l & 7 192 #; [h,l] = [h,l] >> 3 193 #; [q,r] = floor([h,l] / d) # This is the EDIV operation 194 #; if (q < 0) q = -q # I doubt this is necessary any more 195 #; 196 #; r' = r >> 29 197 #; if (d' >= 0) 198 #; q' = q >> 29 199 #; q = q << 3 200 #; else 201 #; q' = q >> 30 202 #; q = q << 2 203 #; r = (r << 3) + l' 204 #; 205 #; if (d' < 0) 206 #; { 207 #; [r',r] = [r',r] - q 208 #; while ([r',r] < 0) 209 #; { 210 #; [r',r] = [r',r] + d 211 #; [q',q] = [q',q] - 1 212 #; } 213 #; } 214 #; 215 #; while ([r',r] >= d') 216 #; { 217 #; [r',r] = [r',r] - d' 218 #; [q',q] = [q',q] + 1 219 #; } 220 #; 221 #; return q 222 # 223 #;r2 = l, q 224 #;r3 = h, r 225 #;r4 = d 226 #;r5 = l' 227 #;r6 = r' 228 #;r7 = d' 229 #;r8 = q' 230 # 231 .globl bn_div_words 232 .type bn_div_words@function 233 bn_div_words: 234 .word 0x1c0 235 236 movl 4(%ap),%r3 # h 237 movl 8(%ap),%r2 # l 238 movl 12(%ap),%r4 # d 239 240 bicl3 $-8,%r2,%r5 # l' = l & 7 241 bicl3 $7,%r2,%r2 242 243 bicl3 $-8,%r3,%r6 244 bicl3 $7,%r3,%r3 245 246 addl2 %r6,%r2 247 248 rotl $-3,%r2,%r2 # l = l >> 3 249 rotl $-3,%r3,%r3 # h = h >> 3 250 251 movl %r4,%r7 # d' = d 252 253 clrl %r6 # r' = 0 254 clrl %r8 # q' = 0 255 256 tstl %r4 257 beql 0f # Uh-oh, the divisor is 0... 258 bgtr 1f 259 rotl $-1,%r4,%r4 # If d is negative, shift it right. 260 bicl2 $0x80000000,%r4 # Since d is then a large number, the 261 # lowest bit is insignificant 262 # (contradict that, and I'll fix the problem!) 263 1: 264 ediv %r4,%r2,%r2,%r3 # Do the actual division 265 266 tstl %r2 267 bgeq 1f 268 mnegl %r2,%r2 # if q < 0, negate it 269 1: 270 tstl %r7 271 blss 1f 272 rotl $3,%r2,%r2 # q = q << 3 273 bicl3 $-8,%r2,%r8 # q' gets the high bits from q 274 bicl3 $7,%r2,%r2 275 brb 2f 276 277 1: # else 278 rotl $2,%r2,%r2 # q = q << 2 279 bicl3 $-4,%r2,%r8 # q' gets the high bits from q 280 bicl3 $3,%r2,%r2 281 2: 282 rotl $3,%r3,%r3 # r = r << 3 283 bicl3 $-8,%r3,%r6 # r' gets the high bits from r 284 bicl3 $7,%r3,%r3 285 addl2 %r5,%r3 # r = r + l' 286 287 tstl %r7 288 bgeq 5f 289 bitl $1,%r7 290 beql 5f # if d' < 0 && d' & 1 291 subl2 %r2,%r3 # [r',r] = [r',r] - [q',q] 292 sbwc %r8,%r6 293 3: 294 bgeq 5f # while r < 0 295 decl %r2 # [q',q] = [q',q] - 1 296 sbwc $0,%r8 297 addl2 %r7,%r3 # [r',r] = [r',r] + d' 298 adwc $0,%r6 299 brb 3b 300 301 # The return points are placed in the middle to keep a short distance from 302 # all the branch points 303 1: 304 # movl %r3,%r1 305 movl %r2,%r0 306 ret 307 0: 308 movl $-1,%r0 309 ret 310 5: 311 tstl %r6 312 bneq 6f 313 cmpl %r3,%r7 314 blssu 1b # while [r',r] >= d' 315 6: 316 subl2 %r7,%r3 # [r',r] = [r',r] - d' 317 sbwc $0,%r6 318 incl %r2 # [q',q] = [q',q] + 1 319 adwc $0,%r8 320 brb 5b 321 .size bn_div_words, .-bn_div_words 322 323 324 325 # .title vax_bn_add_words unsigned add of two arrays 326 #; 327 #; Richard Levitte 20-Nov-2000 328 #; 329 #; ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n) { 330 #; ULONG c = 0; 331 #; int i; 332 #; for (i = 0; i < n; i++) <c,r[i]> = a[i] + b[i] + c; 333 #; return(c); 334 #; } 335 # 336 337 .globl bn_add_words 338 .type bn_add_words@function 339 bn_add_words: 340 .word 0 341 342 movl 4(%ap),%r2 # r 343 movl 8(%ap),%r3 # a 344 movl 12(%ap),%r4 # b 345 movl 16(%ap),%r5 # n 346 clrl %r0 347 348 tstl %r5 349 bleq 1f 350 351 0: movl (%r3)+,%r1 # carry untouched 352 adwc (%r4)+,%r1 # carry used and touched 353 movl %r1,(%r2)+ # carry untouched 354 sobgtr %r5,0b # carry untouched 355 356 adwc $0,%r0 357 1: ret 358 .size bn_add_words, .-bn_add_words 359 360 #; 361 #; Richard Levitte 20-Nov-2000 362 #; 363 #; ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n) { 364 #; ULONG c = 0; 365 #; int i; 366 #; for (i = 0; i < n; i++) <c,r[i]> = a[i] - b[i] - c; 367 #; return(c); 368 #; } 369 # 370 .globl bn_sub_words 371 .type bn_sub_words@function 372 bn_sub_words: 373 .word 0x40 374 375 movl 4(%ap),%r2 # r 376 movl 8(%ap),%r3 # a 377 movl 12(%ap),%r4 # b 378 movl 16(%ap),%r5 # n 379 clrl %r0 380 381 tstl %r5 382 bleq 1f 383 384 0: movl (%r3)+,%r6 # carry untouched 385 sbwc (%r4)+,%r6 # carry used and touched 386 movl %r6,(%r2)+ # carry untouched 387 sobgtr %r5,0b # carry untouched 388 389 1: adwc $0,%r0 390 ret 391 .size bn_sub_words, .-bn_sub_words 392 393 # 394 # Ragge 20-Sep-2003 395 # 396 # Multiply a vector of 4/8 longword by another. 397 # Uses two loops and 16/64 emuls. 398 # 399 .globl bn_mul_comba4 400 .type bn_mul_comba4@function 401 bn_mul_comba4: 402 .word 0x3c0 403 movl $4,%r9 # 4*4 404 brb 6f 405 406 .globl bn_mul_comba8 407 .type bn_mul_comba8@function 408 bn_mul_comba8: 409 .word 0x3c0 410 movl $8,%r9 # 8*8 411 412 6: movl 8(%ap),%r3 # a[] 413 movl 12(%ap),%r7 # b[] 414 brb 5f 415 416 .globl bn_sqr_comba4 417 .type bn_sqr_comba4@function 418 bn_sqr_comba4: 419 .word 0x3c0 420 movl $4,%r9 # 4*4 421 brb 0f 422 423 .globl bn_sqr_comba8 424 .type bn_sqr_comba8@function 425 bn_sqr_comba8: 426 .word 0x3c0 427 movl $8,%r9 # 8*8 428 429 0: 430 movl 8(%ap),%r3 # a[] 431 movl %r3,%r7 # a[] 432 433 5: movl 4(%ap),%r5 # r[] 434 movl %r9,%r8 435 436 clrq (%r5) # clear destinatino, for add. 437 clrq 8(%r5) 438 clrq 16(%r5) # these only needed for comba8 439 clrq 24(%r5) 440 441 2: clrl %r4 # carry 442 movl %r9,%r6 # inner loop count 443 movl (%r7)+,%r2 # value to multiply with 444 445 1: emul %r2,(%r3),%r4,%r0 446 tstl %r4 447 bgeq 3f 448 incl %r1 449 3: tstl %r2 450 bgeq 3f 451 addl2 (%r3),%r1 452 3: tstl (%r3) 453 bgeq 3f 454 addl2 %r2,%r1 455 456 3: addl2 %r0,(%r5)+ # add to destination 457 adwc $0,%r1 # remember carry 458 movl %r1,%r4 # add carry in next emul 459 addl2 $4,%r3 460 sobgtr %r6,1b 461 462 movl %r4,(%r5) # save highest add result 463 464 ashl $2,%r9,%r4 465 subl2 %r4,%r3 466 subl2 $4,%r4 467 subl2 %r4,%r5 468 469 sobgtr %r8,2b 470 471 ret 472 .size bn_mul_comba4, .-bn_mul_comba4 473