Home | History | Annotate | Line # | Download | only in vax
      1 #	$NetBSD: bn_asm_vax.S,v 1.1.1.2 2023/04/18 14:19:11 christos Exp $
      2 #
      3 # w.j.m. 15-jan-1999
      4 #
      5 # it's magic ...
      6 #
      7 # ULONG bn_mul_add_words(ULONG r[],ULONG a[],int n,ULONG w) {
      8 #	ULONG c = 0;
      9 #	int i;
     10 #	for(i = 0; i < n; i++) <c,r[i]> := r[i] + c + a[i] * w ;
     11 #	return c;
     12 # }
     13 
     14 	.globl	bn_mul_add_words
     15 	.type   bn_mul_add_words@function
     16 
     17 bn_mul_add_words:
     18 	.word	0x40
     19 
     20 	movl	4(%ap),%r2		# *r
     21 	movl	8(%ap),%r3		# *a
     22 	movl	12(%ap),%r4		# n
     23 	movl	16(%ap),%r5		# w
     24 	clrl	%r6			# return value ("carry")
     25 
     26 0:	emul	%r5,(%r3),(%r2),%r0	# w * a[0] + r[0] -> r0
     27 
     28 	# fixup for "negative" r[]
     29 	tstl	(%r2)
     30 	bgeq	1f
     31 	incl	%r1			# add 1 to highword
     32 
     33 1:	# add saved carry to result
     34 	addl2	%r6,%r0
     35 	adwc	$0,%r1
     36 
     37 	# combined fixup for "negative" w, a[]
     38 	tstl	%r5		# if w is negative...
     39 	bgeq	1f
     40 	addl2	(%r3),%r1	# ...add a[0] again to highword
     41 1:	tstl	(%r3)		# if a[0] is negative...
     42 	bgeq	1f
     43 	addl2	%r5,%r1		# ...add w again to highword
     44 1:
     45 	movl	%r0,(%r2)+	# save low word in dest & advance *r
     46 	addl2	$4,%r3		# advance *a
     47 	movl	%r1,%r6		# high word in r6 for return value
     48 
     49 	sobgtr	%r4,0b		# loop?
     50 
     51 	movl	%r6,%r0
     52 	ret
     53 	.size  bn_mul_add_words, .-bn_mul_add_words
     54 
     55 #	.title	vax_bn_mul_words  unsigned multiply & add, 32*32+32=>64
     56 #;
     57 #; w.j.m. 15-jan-1999
     58 #;
     59 #; it's magic ...
     60 #;
     61 #; ULONG bn_mul_words(ULONG r[],ULONG a[],int n,ULONG w) {
     62 #;	ULONG c = 0;
     63 #;	int i;
     64 #;	for(i = 0; i < num; i++) <c,r[i]> := a[i] * w + c ;
     65 #;	return(c);
     66 #; }
     67 #
     68 	.globl	bn_mul_words
     69 	.type   bn_mul_words@function
     70 bn_mul_words:
     71 	.word	0x40
     72 
     73 	movl	4(%ap),%r2		# *r
     74 	movl	8(%ap),%r3		# *a
     75 	movl	12(%ap),%r4		# n
     76 	movl	16(%ap),%r5		# w
     77 	clrl	%r6			# carry
     78 
     79 0:	emul	%r5,(%r3),%r6,%r0	# w * a[0] + carry -> r0
     80 
     81 	# fixup for "negative" carry
     82 	tstl	%r6
     83 	bgeq	1f
     84 	incl	%r1
     85 
     86 1:	# combined fixup for "negative" w, a[]
     87 	tstl	%r5
     88 	bgeq	1f
     89 	addl2	(%r3),%r1
     90 1:	tstl	(%r3)
     91 	bgeq	1f
     92 	addl2	%r5,%r1
     93 
     94 1:	movl	%r0,(%r2)+
     95 	addl2	$4,%r3
     96 	movl	%r1,%r6
     97 
     98 	sobgtr	%r4,0b
     99 
    100 	movl	%r6,%r0
    101 	ret
    102 	.size  bn_mul_words, .-bn_mul_words
    103 
    104 
    105 
    106 #	.title	vax_bn_sqr_words  unsigned square, 32*32=>64
    107 #;
    108 #; w.j.m. 15-jan-1999
    109 #;
    110 #; it's magic ...
    111 #;
    112 #; void bn_sqr_words(ULONG r[],ULONG a[],int n) {
    113 #;	int i;
    114 #;	for(i = 0; i < n; i++) <r[2*i+1],r[2*i]> := a[i] * a[i] ;
    115 #; }
    116 #
    117 	.globl	bn_sqr_words
    118 	.type   bn_sqr_words@function
    119 bn_sqr_words:
    120 	.word	0
    121 
    122 	movl	4(%ap),%r2		# r
    123 	movl	8(%ap),%r3		# a
    124 	movl	12(%ap),%r4		# n
    125 
    126 0:	movl	(%r3)+,%r5		# r5 = a[] & advance
    127 
    128 	emul	%r5,%r5,$0,%r0		# a[0] * a[0] + 0 -> r0
    129 
    130 	# fixup for "negative" a[]
    131 	tstl	%r5
    132 	bgeq	1f
    133 	addl2	%r5,%r1
    134 	addl2	%r5,%r1
    135 
    136 1:	movq	%r0,(%r2)+		# store 64-bit result
    137 
    138 	sobgtr	%r4,0b			# loop
    139 
    140 	ret
    141 	.size  bn_sqr_words, .-bn_sqr_words
    142 
    143 
    144 #	.title	vax_bn_div_words  unsigned divide
    145 #;
    146 #; Richard Levitte 20-Nov-2000
    147 #;
    148 #; ULONG bn_div_words(ULONG h, ULONG l, ULONG d)
    149 #; {
    150 #;	return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d);
    151 #; }
    152 #;
    153 #; Using EDIV would be very easy, if it didn't do signed calculations.
    154 #; Any time any of the input numbers are signed, there are problems,
    155 #; usually with integer overflow, at which point it returns useless
    156 #; data (the quotient gets the value of l, and the remainder becomes 0).
    157 #;
    158 #; If it was just for the dividend, it would be very easy, just divide
    159 #; it by 2 (unsigned), do the division, multiply the resulting quotient
    160 #; and remainder by 2, add the bit that was dropped when dividing by 2
    161 #; to the remainder, and do some adjustment so the remainder doesn't
    162 #; end up larger than the divisor.  For some cases when the divisor is
    163 #; negative (from EDIV's point of view, i.e. when the highest bit is set),
    164 #; dividing the dividend by 2 isn't enough, and since some operations
    165 #; might generate integer overflows even when the dividend is divided by
    166 #; 4 (when the high part of the shifted down dividend ends up being exactly
    167 #; half of the divisor, the result is the quotient 0x80000000, which is
    168 #; negative...) it needs to be divided by 8.  Furthermore, the divisor needs
    169 #; to be divided by 2 (unsigned) as well, to avoid more problems with the sign.
    170 #; In this case, a little extra fiddling with the remainder is required.
    171 #;
    172 #; So, the simplest way to handle this is always to divide the dividend
    173 #; by 8, and to divide the divisor by 2 if it's highest bit is set.
    174 #; After EDIV has been used, the quotient gets multiplied by 8 if the
    175 #; original divisor was positive, otherwise 4.  The remainder, oddly
    176 #; enough, is *always* multiplied by 8.
    177 #; NOTE: in the case mentioned above, where the high part of the shifted
    178 #; down dividend ends up being exactly half the shifted down divisor, we
    179 #; end up with a 33 bit quotient.  That's no problem however, it usually
    180 #; means we have ended up with a too large remainder as well, and the
    181 #; problem is fixed by the last part of the algorithm (next paragraph).
    182 #;
    183 #; The routine ends with comparing the resulting remainder with the
    184 #; original divisor and if the remainder is larger, subtract the
    185 #; original divisor from it, and increase the quotient by 1.  This is
    186 #; done until the remainder is smaller than the divisor.
    187 #;
    188 #; The complete algorithm looks like this:
    189 #;
    190 #; d'    = d
    191 #; l'    = l & 7
    192 #; [h,l] = [h,l] >> 3
    193 #; [q,r] = floor([h,l] / d)	# This is the EDIV operation
    194 #; if (q < 0) q = -q		# I doubt this is necessary any more
    195 #;
    196 #; r'    = r >> 29
    197 #; if (d' >= 0)
    198 #;   q'  = q >> 29
    199 #;   q   = q << 3
    200 #; else
    201 #;   q'  = q >> 30
    202 #;   q   = q << 2
    203 #; r     = (r << 3) + l'
    204 #;
    205 #; if (d' < 0)
    206 #;   {
    207 #;     [r',r] = [r',r] - q
    208 #;     while ([r',r] < 0)
    209 #;       {
    210 #;         [r',r] = [r',r] + d
    211 #;         [q',q] = [q',q] - 1
    212 #;       }
    213 #;   }
    214 #;
    215 #; while ([r',r] >= d')
    216 #;   {
    217 #;     [r',r] = [r',r] - d'
    218 #;     [q',q] = [q',q] + 1
    219 #;   }
    220 #;
    221 #; return q
    222 #
    223 #;r2 = l, q
    224 #;r3 = h, r
    225 #;r4 = d
    226 #;r5 = l'
    227 #;r6 = r'
    228 #;r7 = d'
    229 #;r8 = q'
    230 #
    231 	.globl	bn_div_words
    232 	.type   bn_div_words@function
    233 bn_div_words:
    234 	.word	0x1c0
    235 
    236 	movl	4(%ap),%r3		# h
    237 	movl	8(%ap),%r2		# l
    238 	movl	12(%ap),%r4		# d
    239 
    240 	bicl3	$-8,%r2,%r5		# l' = l & 7
    241 	bicl3	$7,%r2,%r2
    242 
    243 	bicl3	$-8,%r3,%r6
    244 	bicl3	$7,%r3,%r3
    245 
    246 	addl2	%r6,%r2
    247 
    248 	rotl	$-3,%r2,%r2		# l = l >> 3
    249 	rotl	$-3,%r3,%r3		# h = h >> 3
    250 
    251 	movl	%r4,%r7			# d' = d
    252 
    253 	clrl	%r6			# r' = 0
    254 	clrl	%r8			# q' = 0
    255 
    256 	tstl	%r4
    257 	beql	0f			# Uh-oh, the divisor is 0...
    258 	bgtr	1f
    259 	rotl	$-1,%r4,%r4	# If d is negative, shift it right.
    260 	bicl2	$0x80000000,%r4	# Since d is then a large number, the
    261 				# lowest bit is insignificant
    262 				# (contradict that, and I'll fix the problem!)
    263 1:
    264 	ediv	%r4,%r2,%r2,%r3		# Do the actual division
    265 
    266 	tstl	%r2
    267 	bgeq	1f
    268 	mnegl	%r2,%r2		# if q < 0, negate it
    269 1:
    270 	tstl	%r7
    271 	blss	1f
    272 	rotl	$3,%r2,%r2	#   q = q << 3
    273 	bicl3	$-8,%r2,%r8	#   q' gets the high bits from q
    274 	bicl3	$7,%r2,%r2
    275 	brb	2f
    276 
    277 1:				# else
    278 	rotl	$2,%r2,%r2	#   q = q << 2
    279 	bicl3	$-4,%r2,%r8	#   q' gets the high bits from q
    280 	bicl3	$3,%r2,%r2
    281 2:
    282 	rotl	$3,%r3,%r3	# r = r << 3
    283 	bicl3	$-8,%r3,%r6	# r' gets the high bits from r
    284 	bicl3	$7,%r3,%r3
    285 	addl2	%r5,%r3		# r = r + l'
    286 
    287 	tstl	%r7
    288 	bgeq	5f
    289 	bitl	$1,%r7
    290 	beql	5f		# if d' < 0 && d' & 1
    291 	subl2	%r2,%r3		#   [r',r] = [r',r] - [q',q]
    292 	sbwc	%r8,%r6
    293 3:
    294 	bgeq	5f		#   while r < 0
    295 	decl	%r2		#     [q',q] = [q',q] - 1
    296 	sbwc	$0,%r8
    297 	addl2	%r7,%r3		#     [r',r] = [r',r] + d'
    298 	adwc	$0,%r6
    299 	brb	3b
    300 
    301 # The return points are placed in the middle to keep a short distance from
    302 # all the branch points
    303 1:
    304 #	movl	%r3,%r1
    305 	movl	%r2,%r0
    306 	ret
    307 0:
    308 	movl	$-1,%r0
    309 	ret
    310 5:
    311 	tstl	%r6
    312 	bneq	6f
    313 	cmpl	%r3,%r7
    314 	blssu	1b		# while [r',r] >= d'
    315 6:
    316 	subl2	%r7,%r3		#   [r',r] = [r',r] - d'
    317 	sbwc	$0,%r6
    318 	incl	%r2		#   [q',q] = [q',q] + 1
    319 	adwc	$0,%r8
    320 	brb	5b
    321 	.size  bn_div_words, .-bn_div_words
    322 
    323 
    324 
    325 #	.title	vax_bn_add_words  unsigned add of two arrays
    326 #;
    327 #; Richard Levitte 20-Nov-2000
    328 #;
    329 #; ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n) {
    330 #;	ULONG c = 0;
    331 #;	int i;
    332 #;	for (i = 0; i < n; i++) <c,r[i]> = a[i] + b[i] + c;
    333 #;	return(c);
    334 #; }
    335 #
    336 
    337 	.globl	bn_add_words
    338 	.type   bn_add_words@function
    339 bn_add_words:
    340 	.word	0
    341 
    342 	movl	4(%ap),%r2	# r
    343 	movl	8(%ap),%r3	# a
    344 	movl	12(%ap),%r4	# b
    345 	movl	16(%ap),%r5	# n
    346 	clrl	%r0
    347 
    348 	tstl	%r5
    349 	bleq	1f
    350 
    351 0:	movl	(%r3)+,%r1	# carry untouched
    352 	adwc	(%r4)+,%r1	# carry used and touched
    353 	movl	%r1,(%r2)+	# carry untouched
    354 	sobgtr	%r5,0b		# carry untouched
    355 
    356 	adwc	$0,%r0
    357 1:	ret
    358 	.size  bn_add_words, .-bn_add_words
    359 
    360 #;
    361 #; Richard Levitte 20-Nov-2000
    362 #;
    363 #; ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n) {
    364 #;	ULONG c = 0;
    365 #;	int i;
    366 #;	for (i = 0; i < n; i++) <c,r[i]> = a[i] - b[i] - c;
    367 #;	return(c);
    368 #; }
    369 #
    370 	.globl	bn_sub_words
    371 	.type   bn_sub_words@function
    372 bn_sub_words:
    373 	.word	0x40
    374 
    375 	movl	4(%ap),%r2	# r
    376 	movl	8(%ap),%r3	# a
    377 	movl	12(%ap),%r4	# b
    378 	movl	16(%ap),%r5	# n
    379 	clrl	%r0
    380 
    381 	tstl	%r5
    382 	bleq	1f
    383 
    384 0:	movl	(%r3)+,%r6	# carry untouched
    385 	sbwc	(%r4)+,%r6	# carry used and touched
    386 	movl	%r6,(%r2)+	# carry untouched
    387 	sobgtr	%r5,0b		# carry untouched
    388 
    389 1:	adwc	$0,%r0
    390 	ret
    391 	.size  bn_sub_words, .-bn_sub_words
    392 
    393 #
    394 #	Ragge 20-Sep-2003
    395 #
    396 #	Multiply a vector of 4/8 longword by another.
    397 #	Uses two loops and 16/64 emuls.
    398 #
    399 	.globl	bn_mul_comba4
    400 	.type   bn_mul_comba4@function
    401 bn_mul_comba4:
    402 	.word	0x3c0
    403 	movl	$4,%r9		# 4*4
    404 	brb	6f
    405 
    406 	.globl	bn_mul_comba8
    407 	.type   bn_mul_comba8@function
    408 bn_mul_comba8:
    409 	.word	0x3c0
    410 	movl	$8,%r9		# 8*8
    411 
    412 6:	movl	8(%ap),%r3	# a[]
    413 	movl	12(%ap),%r7	# b[]
    414 	brb	5f
    415 
    416 	.globl	bn_sqr_comba4
    417 	.type   bn_sqr_comba4@function
    418 bn_sqr_comba4:
    419 	.word	0x3c0
    420 	movl	$4,%r9		# 4*4
    421 	brb 0f
    422 
    423 	.globl	bn_sqr_comba8
    424 	.type   bn_sqr_comba8@function
    425 bn_sqr_comba8:
    426 	.word	0x3c0
    427 	movl	$8,%r9		# 8*8
    428 
    429 0:
    430 	movl	8(%ap),%r3	# a[]
    431 	movl	%r3,%r7		# a[]
    432 
    433 5:	movl	4(%ap),%r5	# r[]
    434 	movl	%r9,%r8
    435 
    436 	clrq	(%r5)		# clear destinatino, for add.
    437 	clrq	8(%r5)
    438 	clrq	16(%r5)		# these only needed for comba8
    439 	clrq	24(%r5)
    440 
    441 2:	clrl	%r4		# carry
    442 	movl	%r9,%r6		# inner loop count
    443 	movl	(%r7)+,%r2	# value to multiply with
    444 
    445 1:	emul	%r2,(%r3),%r4,%r0
    446 	tstl	%r4
    447 	bgeq	3f
    448 	incl	%r1
    449 3:	tstl	%r2
    450 	bgeq	3f
    451 	addl2	(%r3),%r1
    452 3:	tstl	(%r3)
    453 	bgeq	3f
    454 	addl2	%r2,%r1
    455 
    456 3:	addl2	%r0,(%r5)+	# add to destination
    457 	adwc	$0,%r1		# remember carry
    458 	movl	%r1,%r4		# add carry in next emul
    459 	addl2	$4,%r3
    460 	sobgtr	%r6,1b
    461 
    462 	movl	%r4,(%r5)	# save highest add result
    463 
    464 	ashl	$2,%r9,%r4
    465 	subl2	%r4,%r3
    466 	subl2	$4,%r4
    467 	subl2	%r4,%r5
    468 
    469 	sobgtr	%r8,2b
    470 
    471 	ret
    472 	.size  bn_mul_comba4, .-bn_mul_comba4
    473