# (code-browser navigation header removed; source continues below)
      1 #! /usr/bin/env perl
      2 # Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the Apache License 2.0 (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 #
      9 # ====================================================================
     10 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
     11 # project. The module is, however, dual licensed under OpenSSL and
     12 # CRYPTOGAMS licenses depending on where you obtain it. For further
     13 # details see http://www.openssl.org/~appro/cryptogams/.
     14 # ====================================================================
     15 #
     16 # X25519 lower-level primitives for PPC64.
     17 #
     18 # July 2018.
     19 #
     20 # Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
     21 # faster on PPC970/G5. POWER8 on the other hand seems to trip on own
     22 # shoelaces when handling longer carry chains. As base 2^51 has just
     23 # single-carry pairs, it's 25% faster than base 2^64. Since PPC970 is
     24 # pretty old, base 2^64 implementation is not engaged. Comparison to
     25 # compiler-generated code is complicated by the fact that not all
     26 # compilers support 128-bit integers. When compiler doesn't, like xlc,
     27 # this module delivers more than 2x improvement, and when it does,
     28 # from 12% to 30% improvement was measured...
     29 
# Command-line convention shared by the perlasm scripts: the final
# argument is taken as the output file when it has an extension, and
# the first argument is taken as the $flavour when it does not look
# like a file name.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# Locate the PPC perlasm translator relative to this script, trying
# the in-tree layout first and the release-tarball layout second.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
($xlate) = grep { -f $_ }
           ("${dir}ppc-xlate.pl", "${dir}../../perlasm/ppc-xlate.pl");
$xlate or die "can't locate ppc-xlate.pl";

# Route everything we print through the translator so the generic
# assembly below comes out in the requested flavour's syntax.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $sp = "r1";				# PPC stack pointer
my ($rp,$ap,$bp) = map("r$_",3..5);	# result, a and b argument registers
     46 
####################################################### base 2^64
# The entire base 2^64 implementation below is deliberately compiled
# out ("if (0)"): it is kept for reference, but the base 2^51 variant
# measures faster on POWER8, as explained in the header commentary.
# It implements 4x4-limb schoolbook multiplication/squaring with
# reduction modulo 2^255-19 by folding the upper 256 bits back in
# scaled by 38 (since 2^256 == 38 mod p).
if (0) {
# Register map: $bi = current b[] word, $a0..$a3 = a[] limbs,
# $t0..$t3 = scratch, $acc0..$acc7 = the 512-bit intermediate product.
# r0 doubles as a constant-zero register.
my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
    map("r$_",(6..12,22..31));
my $zero = "r0";
my $FRAME = 16*8;

# void x25519_fe64_mul(uint64_t r[4], const uint64_t a[4],
#                      const uint64_t b[4]);
# Prologue saves the non-volatile r22-r31, then the first
# partial-product row a[0..3]*b[0] is formed.
$code.=<<___;
.text

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function
.align	5
x25519_fe64_mul:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi
	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi
	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi
	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
___
# Rows 1..3: accumulate a[0..3]*b[$i].  @acc is rotated one position
# each pass so @acc[0] always names the lowest still-live limb.
for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
    my $i=1; $i<4; shift(@acc), $i++) {
# On the first pass the fifth accumulator has no prior value yet, so
# the zero register stands in for it on the "adde" that seeds it.
my $acc4 = $i==1? $zero : @acc[4];

$code.=<<___;
	ld	$bi,`8*$i`($bp)
	addc	@acc[1],@acc[1],$t0	# accumulate high parts
	mulld	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulld	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulld	$t2,$a2,$bi
	adde	@acc[4],$acc4,$t3
	mulld	$t3,$a3,$bi
	addc	@acc[1],@acc[1],$t0	# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulhdu	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulhdu	$t2,$a2,$bi
	adde	@acc[4],@acc[4],$t3
	mulhdu	$t3,$a3,$bi
	adde	@acc[5],$zero,$zero
___
}
# Tail of fe64_mul: fold acc4..acc7 back in multiplied by 38, absorb
# the final carry with one more single-word *38 pass, and perform the
# last conditional +38 branch-free via the subfe/andc mask trick.
# The same (interpolation-only) heredoc also carries the complete
# x25519_fe64_sqr, x25519_fe64_mul121666, x25519_fe64_add and
# x25519_fe64_sub bodies plus the head of x25519_fe64_tobytes, none of
# which needs loop-generated code.
$code.=<<___;
	li	$bi,38
	addc	$acc4,$acc4,$t0
	mulld	$t0,$acc4,$bi
	adde	$acc5,$acc5,$t1
	mulld	$t1,$acc5,$bi
	adde	$acc6,$acc6,$t2
	mulld	$t2,$acc6,$bi
	adde	$acc7,$acc7,$t3
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	adde	$acc4,$zero,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function
.align	5
x25519_fe64_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	#  "can't overflow" below mark carrying into high part of
	#  multiplication result, which can't overflow, because it
	#  can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	 mulld	$t0,$a2,$a1		# a[2]*a[1]
	 mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	 mulld	$t2,$a3,$a1		# a[3]*a[1]
	 mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	 mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	 mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	 mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	 mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	 mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	 mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	 mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	 mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zero

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	 li	$bi,38
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3

	mulld	$t0,$acc4,$bi
	mulld	$t1,$acc5,$bi
	mulld	$t2,$acc6,$bi
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	addze	$acc4,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function
.align	5
x25519_fe64_mul121666:
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`

	ld	$t0,0($ap)
	ld	$t1,8($ap)
	ld	$bp,16($ap)
	ld	$ap,24($ap)

	mulld	$a0,$t0,$bi
	mulhdu	$t0,$t0,$bi
	mulld	$a1,$t1,$bi
	mulhdu	$t1,$t1,$bi
	mulld	$a2,$bp,$bi
	mulhdu	$bp,$bp,$bi
	mulld	$a3,$ap,$bi
	mulhdu	$ap,$ap,$bi

	addc	$a1,$a1,$t0
	adde	$a2,$a2,$t1
	adde	$a3,$a3,$bp
	addze	$ap,    $ap

	mulli	$ap,$ap,38

	addc	$a0,$a0,$ap
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function
.align	5
x25519_fe64_add:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	addc	$a0,$a0,$t0
	adde	$a1,$a1,$t1
	adde	$a2,$a2,$bi
	adde	$a3,$a3,$bp

	li	$t0,38
	subfe	$t1,$t1,$t1		# carry -> ~mask
	andc	$t1,$t0,$t1

	addc	$a0,$a0,$t1
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function
.align	5
x25519_fe64_sub:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	subfc	$a0,$t0,$a0
	subfe	$a1,$t1,$a1
	subfe	$a2,$bi,$a2
	subfe	$a3,$bp,$a3

	li	$t0,38
	subfe	$t1,$t1,$t1		# borrow -> mask
	xor	$zero,$zero,$zero
	and	$t1,$t0,$t1

	subfc	$a0,$t1,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

	subfe	$t1,$t1,$t1		# borrow -> mask
	std	$a1,8($rp)
	and	$t0,$t0,$t1
	std	$a2,16($rp)
	subf	$a0,$t0,$a0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function
.align	5
x25519_fe64_tobytes:
	ld	$a3,24($ap)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)

	sradi	$t0,$a3,63		# most significant bit -> mask
	li	$t1,19
	and	$t0,$t0,$t1
	sldi	$a3,$a3,1
	add	$t0,$t0,$t1		# compare to modulus in the same go
	srdi	$a3,$a3,1		# most significant bit cleared

	addc	$a0,$a0,$t0
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	xor	$zero,$zero,$zero
	sradi	$t0,$a3,63		# most significant bit -> mask
	sldi	$a3,$a3,1
	andc	$t0,$t1,$t0
	srdi	$a3,$a3,1		# most significant bit cleared

	subi	$rp,$rp,1
	subfc	$a0,$t0,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

___
# Serialize the four limbs little-endian one byte at a time; $rp was
# pre-decremented above so that stbu's update-form addressing works.
for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
$code.=<<___;
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	stbu	$t0,1($rp)
___
}
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
}
####################################################### base 2^51
# Base 2^51 implementation -- the variant actually emitted.  Field
# elements are five 51-bit limbs; every partial product is accumulated
# in a 128-bit (lo,hi) register pair, which keeps carry chains down to
# single-carry pairs and suits POWER8 (see header commentary).
{
# $bi: current b[] word; $a0..$a4: a[] limbs; $h?lo/$h?hi: the five
# 128-bit product accumulators h0..h4.  r0 serves as the 51-bit mask.
my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
    map("r$_",(6..12,21..31));
my $mask = "r0";
my $FRAME = 18*8;

# void x25519_fe51_mul(uint64_t r[5], const uint64_t a[5],
#                      const uint64_t b[5]);
# First row a[0..4]*b[0].  b[1] is prefetched into $ap and a[4] is
# pre-scaled by 19 for the wrap-around products of the next row.
$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function
.align	5
x25519_fe51_mul:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*b[0]
	mulhdu	$h0hi,$a0,$bi

	mulld	$h1lo,$a1,$bi		# a[1]*b[0]
	mulhdu	$h1hi,$a1,$bi

	 mulld	$h4lo,$a4,$bi		# a[4]*b[0]
	 mulhdu	$h4hi,$a4,$bi
	 ld	$ap,8($bp)
	 mulli	$a4,$a4,19

	mulld	$h2lo,$a2,$bi		# a[2]*b[0]
	mulhdu	$h2hi,$a2,$bi

	mulld	$h3lo,$a3,$bi		# a[3]*b[0]
	mulhdu	$h3hi,$a3,$bi
___
# Rows for b[1..3].  $ap and $bi ping-pong between "current b word"
# and "prefetched next b word"; @a is rotated after every pass so that
# @a[3] always names the limb due for *19 pre-scaling, and wrap-around
# products (which land below the diagonal) go into the lower h's.
for(my @a=($a0,$a1,$a2,$a3,$a4),
    my $i=1; $i<4; $i++) {
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,@a[4],$bi
	mulhdu	$t1,@a[4],$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,@a[0],$bi
	mulhdu	$t1,@a[0],$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	 mulld	$t0,@a[3],$bi
	 mulhdu	$t1,@a[3],$bi
	 ld	$ap,`8*($i+1)`($bp)
	 mulli	@a[3],@a[3],19
	 addc	$h4lo,$h4lo,$t0
	 adde	$h4hi,$h4hi,$t1

	mulld	$t0,@a[1],$bi
	mulhdu	$t1,@a[1],$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,@a[2],$bi
	mulhdu	$t1,@a[2],$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
___
	unshift(@a,pop(@a));	# rotate limb names to track the *19 scaling
}
	($ap,$bi) = ($bi,$ap);
# Final row (b[4]); by now a[1..4] have all been replaced by a[i]*19,
# so the plain limb order below is already wrap-adjusted.  The heredoc
# continues into .Lfe51_reduce, the carry-propagation tail shared by
# all three fe51 entry points, and the epilogue.
$code.=<<___;
	mulld	$t0,$a1,$bi
	mulhdu	$t1,$a1,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulld	$t0,$a0,$bi
	mulhdu	$t1,$a0,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

.Lfe51_reduce:
	li	$mask,-1
	srdi	$mask,$mask,13		# 0x7ffffffffffff

	srdi	$t0,$h2lo,51
	and	$a2,$h2lo,$mask
	insrdi	$t0,$h2hi,51,0		# h2>>51
	 srdi	$t1,$h0lo,51
	 and	$a0,$h0lo,$mask
	 insrdi	$t1,$h0hi,51,0		# h0>>51
	addc	$h3lo,$h3lo,$t0
	addze	$h3hi,$h3hi
	 addc	$h1lo,$h1lo,$t1
	 addze	$h1hi,$h1hi

	srdi	$t0,$h3lo,51
	and	$a3,$h3lo,$mask
	insrdi	$t0,$h3hi,51,0		# h3>>51
	 srdi	$t1,$h1lo,51
	 and	$a1,$h1lo,$mask
	 insrdi	$t1,$h1hi,51,0		# h1>>51
	addc	$h4lo,$h4lo,$t0
	addze	$h4hi,$h4hi
	 add	$a2,$a2,$t1

	srdi	$t0,$h4lo,51
	and	$a4,$h4lo,$mask
	insrdi	$t0,$h4hi,51,0
	mulli	$t0,$t0,19		# (h4 >> 51) * 19

	add	$a0,$a0,$t0

	srdi	$t1,$a2,51
	and	$a2,$a2,$mask
	add	$a3,$a3,$t1

	srdi	$t0,$a0,51
	and	$a0,$a0,$mask
	add	$a1,$a1,$t0

	std	$a2,16($rp)
	std	$a3,24($rp)
	std	$a4,32($rp)
	std	$a0,0($rp)
	std	$a1,8($rp)

	ld	r21,`$FRAME-8*11`($sp)
	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,11,3,0
	.long	0
.size	x25519_fe51_mul,.-x25519_fe51_mul
___
{
# x25519_fe51_sqr reuses the outer register names unchanged; the
# self-assignment below merely re-scopes them so the ($a4,$t1)/
# ($a3,$t1) renames further down stay local to this block.
my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
$code.=<<___;
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function
.align	5
x25519_fe51_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	add	$bi,$a0,$a0		# a[0]*2
	mulli	$t1,$a4,19		# a[4]*19

	mulld	$h0lo,$a0,$a0
	mulhdu	$h0hi,$a0,$a0
	mulld	$h1lo,$a1,$bi
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi
	mulhdu	$h4hi,$a4,$bi
	add	$bi,$a1,$a1		# a[1]*2
___
	# After the swap, $a4 names the register holding a[4]*19 and $t1
	# the original a[4], so the next product is a[4]*(a[4]*19).
	($a4,$t1) = ($t1,$a4);
$code.=<<___;
	mulld	$t0,$t1,$a4
	mulhdu	$t1,$t1,$a4
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulli	$bp,$a3,19		# a[3]*19

	mulld	$t0,$a1,$a1
	mulhdu	$t1,$a1,$a1
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	add	$bi,$a3,$a3		# a[3]*2
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
___
	# Likewise: $a3 now names a[3]*19 (computed into $bp above) and
	# $t1 the original a[3].
	($a3,$t1) = ($bp,$a3);
$code.=<<___;
	mulld	$t0,$t1,$a3
	mulhdu	$t1,$t1,$a3
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	mulld	$t0,$bi,$a4
	mulhdu	$t1,$bi,$a4
	add	$bi,$a2,$a2		# a[2]*2
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a2,$a2
	mulhdu	$t1,$a2,$a2
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
___
}
# void x25519_fe51_mul121666(uint64_t r[5], const uint64_t a[5]);
# Scale by the curve constant 121666 (built with lis/ori since it
# exceeds 16 bits) and fall into the shared .Lfe51_reduce tail, which
# also restores the registers saved here.
$code.=<<___;
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function
.align	5
x25519_fe51_mul121666:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*121666
	mulhdu	$h0hi,$a0,$bi
	mulld	$h1lo,$a1,$bi		# a[1]*121666
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi		# a[2]*121666
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi		# a[3]*121666
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi		# a[4]*121666
	mulhdu	$h4hi,$a4,$bi

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
}
    824 
# Post-process and emit: expand every backticked `...` expression
# (frame offsets, the 121666 immediates, ...) at generation time, then
# print the accumulated assembly into the xlate pipe.  A failed close
# must be fatal, or a truncated .s file could go unnoticed by the build.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
    828