Home | History | Annotate | Line # | Download | only in asm
      1 ;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
      2 ;;
      3 ;; Licensed under the Apache License 2.0 (the "License").  You may not use
      4 ;; this file except in compliance with the License.  You can obtain a copy
      5 ;; in the file LICENSE in the source distribution or at
      6 ;; https://www.openssl.org/source/license.html
      7 ;;
      8 ;;====================================================================
      9 ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     10 ;; project.
     11 ;;
     12 ;; Rights for redistribution and usage in source and binary forms are
     13 ;; granted according to the License. Warranty of any kind is disclaimed.
     14 ;;====================================================================
     15 ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
     16 ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
     17 ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
     18 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
     19 ;;====================================================================
     20 	.text
     21 
     22 	.if	.ASSEMBLER_VERSION<7000000
        ;; Assemblers older than 7.0.0: define __TI_EABI__ as 0 so that the
        ;; test below is well-formed (presumably such assemblers do not
        ;; predefine the symbol -- TODO confirm against the TI tools docs).
     23 	.asg	0,__TI_EABI__
     24 	.endif
     25 	.if	__TI_EABI__
        ;; When __TI_EABI__ is non-zero, every underscore-prefixed name used
        ;; in this file is substituted by its unprefixed form, so the labels
        ;; and .global directives below export bn_* instead of _bn_*.
     26 	.asg	bn_mul_add_words,_bn_mul_add_words
     27 	.asg	bn_mul_words,_bn_mul_words
     28 	.asg	bn_sqr_words,_bn_sqr_words
     29 	.asg	bn_add_words,_bn_add_words
     30 	.asg	bn_sub_words,_bn_sub_words
     31 	.asg	bn_div_words,_bn_div_words
     32 	.asg	bn_sqr_comba8,_bn_sqr_comba8
     33 	.asg	bn_mul_comba8,_bn_mul_comba8
     34 	.asg	bn_sqr_comba4,_bn_sqr_comba4
     35 	.asg	bn_mul_comba4,_bn_mul_comba4
     36 	.endif
     37 
     38 	.asg	B3,RA
        ;; Symbolic register names used by every routine in this file:
        ;;   RA          - return address (target of the final BNOP RA)
        ;;   ARG0..ARG5  - incoming arguments, alternating A/B register files
        ;;   RET         - return value; note it is the same register as ARG0
        ;;   FP, DP, SP  - defined here but not referenced by the code below
     39 	.asg	A4,ARG0
     40 	.asg	B4,ARG1
     41 	.asg	A6,ARG2
     42 	.asg	B6,ARG3
     43 	.asg	A8,ARG4
     44 	.asg	B8,ARG5
     45 	.asg	A4,RET
     46 	.asg	A15,FP
     47 	.asg	B14,DP
     48 	.asg	B15,SP
     49 
     50 	.global	_bn_mul_add_words
        ;;--------------------------------------------------------------------
        ;; bn_mul_add_words: rp[i] += ap[i]*w for i = 0..num-1, with the
        ;; carry chained from word to word.
        ;; In:   ARG0(A4) = rp, ARG1(B4) = ap, ARG2(A6) = num (word count),
        ;;       ARG3(B6) = w
        ;; Out:  RET(A4) = high word left in the accumulator after the last
        ;;       word (carry out); 0 when num is zero.
        ;; Uses: B0, B7, A2, A3, A7, A16-A21, A23, ILC; ARG0/ARG1 advance.
        ;;--------------------------------------------------------------------
     51 _bn_mul_add_words:
     52 	.asmfunc
     53 	MV	ARG2,B0
        ;; num == 0: return 0 at once. The conditional set-up instructions
        ;; and NOP 3 below occupy the branch delay slots, so the SPLOOP is
        ;; never entered on that path.
     54   [!B0]	BNOP	RA
     55 ||[!B0]	MVK	0,RET
     56    [B0]	MVC	B0,ILC
        ;; A2 = separate store pointer (ARG0 keeps serving the rp loads),
        ;; A3 = w copied into the A file for the multiplier.
     57    [B0]	ZERO	A19		; high part of accumulator
     58 || [B0]	MV	ARG0,A2
     59 || [B0]	MV	ARG3,A3
     60 	NOP	3
     61 
     62 	SPLOOP	2		; 2*n+10
     63 ;;====================================================================
     64 	LDW	*ARG1++,B7	; ap[i]
     65 	NOP	3
     66 	LDW	*ARG0++,A7	; rp[i]
     67 	MPY32U	B7,A3,A17:A16
     68 	NOP	3		; [2,0] in epilogue
        ;; A21:A20 = lo(ap[i]*w) + rp[i]; adding the previous high word A19
        ;; gives A18 = result word and A19 = carry out of the low half.
     69 	ADDU	A16,A7,A21:A20
     70 	ADDU	A19,A21:A20,A19:A18
     71 ||	MV.S	A17,A23
     72 	SPKERNEL 2,1		; leave slot for "return value"
     73 ||	STW	A18,*A2++	; rp[i]
     74 ||	ADD	A19,A23,A19
     75 ;;====================================================================
     76 	BNOP	RA,4
     77 	MV	A19,RET		; return value
     78 	.endasmfunc
     79 
     80 	.global	_bn_mul_words
        ;;--------------------------------------------------------------------
        ;; bn_mul_words: rp[i] = low word of (ap[i]*w + carry) for
        ;; i = 0..num-1; the high word becomes the carry into the next word.
        ;; In:   ARG0(A4) = rp, ARG1(B4) = ap, ARG2(A6) = num (word count),
        ;;       ARG3(B6) = w
        ;; Out:  RET(A4) = final carry word; 0 when num is zero.
        ;; Uses: B0, A7, A16-A19, A21, ILC; ARG0/ARG1 advance.
        ;;--------------------------------------------------------------------
     81 _bn_mul_words:
     82 	.asmfunc
     83 	MV	ARG2,B0
        ;; num == 0: return 0; the conditional set-up and NOP 3 fill the
        ;; branch delay slots.
     84   [!B0]	BNOP	RA
     85 ||[!B0]	MVK	0,RET
     86    [B0]	MVC	B0,ILC
     87    [B0]	ZERO	A19		; high part of accumulator
     88 	NOP	3
     89 
     90 	SPLOOP	2		; 2*n+10
     91 ;;====================================================================
     92 	LDW	*ARG1++,A7	; ap[i]
     93 	NOP	4
     94 	MPY32U	A7,ARG3,A17:A16
     95 	NOP	4		; [2,0] in epilogue
        ;; A18 = lo(product) + previous carry A19; A19 then becomes the carry
        ;; out of that addition plus hi(product) (held in A21).
     96 	ADDU	A19,A16,A19:A18
     97 ||	MV.S	A17,A21
     98 	SPKERNEL 2,1		; leave slot for "return value"
     99 ||	STW	A18,*ARG0++	; rp[i]
    100 ||	ADD.L	A19,A21,A19
    101 ;;====================================================================
    102 	BNOP	RA,4
    103 	MV	A19,RET		; return value
    104 	.endasmfunc
    105 
    106 	.global	_bn_sqr_words
        ;;--------------------------------------------------------------------
        ;; bn_sqr_words: rp[2*i] / rp[2*i+1] = low / high word of ap[i]^2
        ;; for i = 0..num-1 (no carries between words).
        ;; In:   ARG0(A4) = rp, ARG1(B4) = ap, ARG2(A6) = num (word count)
        ;; Out:  RET is written (with 0) only on the num == 0 path; otherwise
        ;;       A4 is left holding the advanced odd-word store pointer.
        ;; Uses: B0, B1, B2, B7, A1, ILC; ARG0/ARG1 advance.
        ;;--------------------------------------------------------------------
    107 _bn_sqr_words:
    108 	.asmfunc
    109 	MV	ARG2,B0
    110   [!B0]	BNOP	RA
    111 ||[!B0]	MVK	0,RET
    112    [B0]	MVC	B0,ILC
        ;; Two store pointers, each stepping by 8 bytes: B2 -> even words
        ;; (low halves), ARG0 = rp+4 -> odd words (high halves).
    113    [B0]	MV	ARG0,B2
    114 || [B0]	ADD	4,ARG0,ARG0
    115 	NOP	3
    116 
    117 	SPLOOP	2		; 2*n+10
    118 ;;====================================================================
    119 	LDW	*ARG1++,B7	; ap[i]
    120 	NOP	4
        ;; B0 is reused for the product's low word; the count it held has
        ;; already been moved to ILC.
    121 	MPY32U	B7,B7,B1:B0
    122 	NOP	3		; [2,0] in epilogue
    123 	STW	B0,*B2++(8)	; rp[2*i]
    124 	MV	B1,A1
    125 	SPKERNEL 2,0		; fully overlap BNOP RA,5
    126 ||	STW	A1,*ARG0++(8)	; rp[2*i+1]
    127 ;;====================================================================
    128 	BNOP	RA,5
    129 	.endasmfunc
    130 
    131 	.global	_bn_add_words
        ;;--------------------------------------------------------------------
        ;; bn_add_words: rp[i] = ap[i] + bp[i] + carry for i = 0..num-1.
        ;; In:   ARG0(A4) = rp, ARG1(B4) = ap, ARG2(A6) = bp,
        ;;       ARG3(B6) = num (word count)
        ;; Out:  RET(A4) = carry out of the last word; 0 when num is zero.
        ;; Uses: B0, B7, A0, A1, A3, A7, A8, A9, ILC; ARG1/ARG2 advance.
        ;;--------------------------------------------------------------------
    132 _bn_add_words:
    133 	.asmfunc
    134 	MV	ARG3,B0
    135   [!B0]	BNOP	RA
    136 ||[!B0]	MVK	0,RET
    137    [B0]	MVC	B0,ILC
        ;; rp is copied to A3 because ARG0 and RET are the same register and
        ;; RET is overwritten with the carry on every iteration.
    138    [B0]	ZERO	A1		; carry flag
    139 || [B0]	MV	ARG0,A3
    140 	NOP	3
    141 
    142 	SPLOOP	2		; 2*n+6
    143 ;;====================================================================
    144 	LDW	*ARG2++,A7	; bp[i]
    145 ||	LDW	*ARG1++,B7	; ap[i]
    146 	NOP	4
        ;; A9:A8 = ap[i]+bp[i]; adding the incoming carry A1 yields
        ;; A0 = result word and A1 = carry for the next word.
    147 	ADDU	A7,B7,A9:A8
    148 	ADDU	A1,A9:A8,A1:A0
    149 	SPKERNEL 0,0		; fully overlap BNOP RA,5
    150 ||	STW	A0,*A3++	; write result
    151 ||	MV	A1,RET		; keep carry flag in RET
    152 ;;====================================================================
    153 	BNOP	RA,5
    154 	.endasmfunc
    155 
    156 	.global	_bn_sub_words
        ;;--------------------------------------------------------------------
        ;; bn_sub_words: rp[i] = ap[i] - bp[i] - borrow for i = 0..num-1.
        ;; In:   ARG0(A4) = rp, ARG1(B4) = ap, ARG2(A6) = bp,
        ;;       ARG3(B6) = num (word count)
        ;; Out:  RET(A4) = borrow out of the last word (0 or 1); 0 when num
        ;;       is zero.
        ;; Uses: B0, B7, A0, A1, A2, A3, A7, ILC; ARG1/ARG2 advance.
        ;;--------------------------------------------------------------------
    157 _bn_sub_words:
    158 	.asmfunc
    159 	MV	ARG3,B0
    160   [!B0]	BNOP	RA
    161 ||[!B0]	MVK	0,RET
    162    [B0]	MVC	B0,ILC
        ;; A3 = store pointer (ARG0 aliases RET, which receives the borrow).
    163    [B0]	ZERO	A2		; borrow flag
    164 || [B0]	MV	ARG0,A3
    165 	NOP	3
    166 
    167 	SPLOOP	2		; 2*n+6
    168 ;;====================================================================
    169 	LDW	*ARG2++,A7	; bp[i]
    170 ||	LDW	*ARG1++,B7	; ap[i]
    171 	NOP	4
        ;; A1:A0 = ap[i]-bp[i] as a long value, less 1 when a borrow is
        ;; pending in A2; bit 0 of the high part A1 is the new borrow.
    172 	SUBU	B7,A7,A1:A0
    173   [A2]	SUB	A1:A0,1,A1:A0
    174 	SPKERNEL 0,1		; leave slot for "return borrow flag"
    175 ||	STW	A0,*A3++	; write result
    176 ||	AND	1,A1,A2		; pass on borrow flag
    177 ;;====================================================================
    178 	BNOP	RA,4
    179 	AND	1,A1,RET	; return borrow flag
    180 	.endasmfunc
    181 
    182 	.global	_bn_div_words
        ;;--------------------------------------------------------------------
        ;; bn_div_words: bit-serial (shift/compare/subtract) division of the
        ;; two-word value hi:lo by dv.
        ;; In:   A4 = hi, B4 = lo, A6 = dv
        ;; Out:  A4 = quotient, or -1 ("overflow") when hi has fewer leading
        ;;       zero bits than dv.
        ;; Uses: A0-A6, B0, ILC.
        ;; NOTE(review): the remainder does not appear to be returned.
        ;;--------------------------------------------------------------------
    183 _bn_div_words:
    184 	.asmfunc
    185 	LMBD	1,A6,A0		; leading zero bits in dv
    186 	LMBD	1,A4,A1		; leading zero bits in hi
    187 ||	MVK	32,B0
        ;; A2 = overflow predicate; B0 = 32 + (leading zeros of dv) becomes
        ;; the SPLOOP trip count.
    188 	CMPLTU	A1,A0,A2
    189 ||	ADD	A0,B0,B0
        ;; Overflow: return -1. The next five packets sit in the branch delay
        ;; slots, which is why the set-up and the peeled first step below are
        ;; predicated on !A2.
    190   [ A2]	BNOP	RA
    191 ||[ A2]	MVK	-1,A4		; return overflow
    192 ||[!A2]	MV	A4,A3		; reassign hi
    193   [!A2]	MV	B4,A4		; reassign lo, will be quotient
    194 ||[!A2]	MVC	B0,ILC
        ;; A1 = 1 keeps the [!A1] SUB/OR pair below from firing on the
        ;; overflow path; otherwise the CMPLTU that follows overwrites it.
    195   [!A2]	SHL	A6,A0,A6	; normalize dv
    196 ||	MVK	1,A1
    197 
        ;; First division step, peeled out of the loop.
    198   [!A2]	CMPLTU	A3,A6,A1	; hi<dv?
    199 ||[!A2]	SHL	A4,1,A5:A4	; lo<<1
    200   [!A1]	SUB	A3,A6,A3	; hi-=dv
    201 ||[!A1]	OR	1,A4,A4
    202   [!A2]	SHRU	A3,31,A1	; upper bit
    203 ||[!A2]	ADDAH	A5,A3,A3	; hi<<1|lo>>31
    204 
        ;; Per step: if the bit shifted out of hi (A1) was set, hi certainly
        ;; exceeds dv, so A1 is cleared to force the subtract; otherwise A1
        ;; is set when hi<dv. A quotient bit of 1 is OR-ed into A4 whenever
        ;; the subtract happens.
    205 	SPLOOP	3
    206   [!A1]	CMPLTU	A3,A6,A1	; hi<dv?
    207 ||[ A1]	ZERO	A1
    208 ||	SHL	A4,1,A5:A4	; lo<<1
    209   [!A1]	SUB	A3,A6,A3	; hi-=dv
    210 ||[!A1]	OR	1,A4,A4		; quotient
    211 	SHRU	A3,31,A1	; upper bit
    212 ||	ADDAH	A5,A3,A3	; hi<<1|lo>>31
    213 	SPKERNEL
    214 
    215 	BNOP	RA,5
    216 	.endasmfunc
    217 
    218 ;;====================================================================
    219 ;; Not really Comba algorithm, just straightforward NxM... Dedicated
    220 ;; fully unrolled real Comba implementations are asymptotically 2x
    221 ;; faster, but naturally larger undertaking. Purpose of this exercise
    222 ;; was rather to learn to master nested SPLOOPs...
    223 ;;====================================================================
    224 	.global	_bn_sqr_comba8
    225 	.global	_bn_mul_comba8
        ;;--------------------------------------------------------------------
        ;; bn_mul_comba8: rp[] = ap[] * bp[], with N = M = 8 words per
        ;; operand, computed row by row (one ap word times all of bp per
        ;; pass).
        ;; bn_sqr_comba8: same code with bp set to ap.
        ;; In:   ARG0(A4) = rp, ARG1(B4) = ap, ARG2(A6) = bp (mul entry only)
        ;; Register roles:
        ;;   A0 = outer passes remaining, A3 = M, B0 = N, B2 = N-1
        ;;   A5 = ap read pointer, A9 = pre-fetched next ap word, B6 = the
        ;;        ap word being multiplied in the current pass
        ;;   B4 = rp store pointer, B5 = rp load pointer
        ;;   B19 = high part of the accumulator (carry between words)
        ;;   A1 = 0 during the first pass (rp is not read, B7 is zero),
        ;;        1 afterwards (rp[i] is loaded and accumulated)
        ;;--------------------------------------------------------------------
    226 _bn_sqr_comba8:
        ;; Squaring entry: make bp alias ap, then fall through.
    227 	MV	ARG1,ARG2
    228 _bn_mul_comba8:
    229 	.asmfunc
    230 	MVK	8,B0		; N, RILC
    231 ||	MVK	8,A0		; M, outer loop counter
    232 ||	MV	ARG1,A5		; copy ap
    233 ||	MV	ARG0,B4		; copy rp
    234 ||	ZERO	B19		; high part of accumulator
    235 	MVC	B0,RILC
    236 ||	SUB	B0,2,B1		; N-2, initial ILC
    237 ||	SUB	B0,1,B2		; const B2=N-1
    238 ||	LDW	*A5++,B6	; ap[0]
    239 ||	MV	A0,A3		; const A3=M
        ;; Shared entry point: the disabled (.if 0) variant of bn_mul_comba4
        ;; branches here after doing the same set-up with N = M = 4.
    240 sploopNxM?:			; for best performance arrange M<=N
    241    [A0]	SPLOOPD	2		; 2*n+10
    242 ||	MVC	B1,ILC
    243 ||	ADDAW	B4,B0,B5
    244 ||	ZERO	B7
    245 ||	LDW	*A5++,A9	; pre-fetch ap[1]
    246 ||	ZERO	A1
    247 ||	SUB	A0,1,A0
    248 ;;====================================================================
    249 ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
    250 ;; This is because of Advisory 15 from TI publication SPRZ247I.
    251 	LDW	*ARG2++,A7	; bp[i]
    252 	NOP	3
    253    [A1]	LDW	*B5++,B7	; rp[i]
    254 	MPY32U	A7,B6,B17:B16
    255 	NOP	3
    256 	ADDU	B16,B7,B21:B20
    257 	ADDU	B19,B21:B20,B19:B18
    258 ||	MV.S	B17,B23
    259 	SPKERNEL
    260 ||	STW	B18,*B4++	; rp[i]
    261 ||	ADD.S	B19,B23,B19
    262 ;;====================================================================
        ;; Between passes: rewind bp, reload the loop with SPMASKR, bring in
        ;; the next ap word, and rewind both rp pointers to one word past
        ;; where the previous pass started.
    263 outer?:				; m*2*(n+1)+10
    264 	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
    265 	SPMASKR
    266 ||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
    267 	MVD	A9,B6		; move through .M unit(*)
    268    [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
    269 	SUBAW	B5,B2,B5	; rewind rp to rp[1]
    270 	MVK	1,A1
    271    [A0]	BNOP.S1	outer?,4
    272 || [A0]	SUB.L	A0,1,A0
        ;; Store the pass's top word through the store pointer, then step
        ;; that pointer back by N-1 words; this packet is the branch's last
        ;; delay slot.
    273 	STW	B19,*B4--[B2]	; rewind rp to rp[1]
    274 ||	ZERO.S	B19		; high part of accumulator
    275 ;; end of outer?
    276 	BNOP	RA,5		; return
    277 	.endasmfunc
    278 ;; (*)	It should be noted that B6 is used as input to MPY32U in
    279 ;;	chronologically next cycle in *preceding* SPLOOP iteration.
    280 ;;	Normally such arrangement would require DINT, but at this
    281 ;;	point SPLOOP is draining and interrupts are disabled
    282 ;;	implicitly.
    283 
    284 	.global	_bn_sqr_comba4
    285 	.global	_bn_mul_comba4
        ;;--------------------------------------------------------------------
        ;; bn_mul_comba4: r[0..7] = a[0..3] * b[0..3], fully unrolled and
        ;; accumulated column by column (all a[i]*b[j] with i+j = k feed
        ;; r[k]).
        ;; bn_sqr_comba4: same code with b set to a.
        ;; In:   ARG0(A4) = r, ARG1(B4) = a, ARG2(A6) = b (mul entry only)
        ;; Scheme of the live (.else) path:
        ;;   - B16..B19 hold a[0..3], A16..A19 hold b[0..3].
        ;;   - The A side sums the low product halves of a column, plus
        ;;     what was carried in, as a long value; its low word is
        ;;     stored as r[k].
        ;;   - The B side sums the high product halves of the same column
        ;;     (B1:B0 and B9:B8 alternate); that sum is folded into the
        ;;     next column with ADDU B0/B8 on the A side.
        ;;   - Product registers A20..A31 are recycled once read.
        ;;--------------------------------------------------------------------
    286 _bn_sqr_comba4:
        ;; Squaring entry: make b alias a, then fall through.
    287 	MV	ARG1,ARG2
    288 _bn_mul_comba4:
    289 	.asmfunc
    290 	.if	0
    291 	BNOP	sploopNxM?,3
    292 	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
    293 	;; because of low-counter effect, when prologue phase finishes
    294 	;; before SPKERNEL instruction is reached. As result it's 25%
    295 	;; slower than expected...
    296 	MVK	4,B0		; N, RILC
    297 ||	MVK	4,A0		; M, outer loop counter
    298 ||	MV	ARG1,A5		; copy ap
    299 ||	MV	ARG0,B4		; copy rp
    300 ||	ZERO	B19		; high part of accumulator
    301 	MVC	B0,RILC
    302 ||	SUB	B0,2,B1		; first ILC
    303 ||	SUB	B0,1,B2		; const B2=N-1
    304 ||	LDW	*A5++,B6	; ap[0]
    305 ||	MV	A0,A3		; const A3=M
    306 	.else
    307 	;; This alternative is an exercise in fully unrolled Comba
    308 	;; algorithm implementation that operates at n*(n+1)+12, or
    309 	;; as little as 32 cycles...
    310 	LDW	*ARG1[0],B16	; a[0]
    311 ||	LDW	*ARG2[0],A16	; b[0]
    312 	LDW	*ARG1[1],B17	; a[1]
    313 ||	LDW	*ARG2[1],A17	; b[1]
    314 	LDW	*ARG1[2],B18	; a[2]
    315 ||	LDW	*ARG2[2],A18	; b[2]
    316 	LDW	*ARG1[3],B19	; a[3]
    317 ||	LDW	*ARG2[3],A19	; b[3]
    318 	NOP
    319 	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
    320 	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
    321 	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
    322 	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
        ;; r[0] = lo(a[0]*b[0]); column 1 starts from hi(a[0]*b[0]) in A1.
    323 	STW	A0,*ARG0[0]
    324 ||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
    325 	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
    326 ||	ADDU	A22,A1,A1:A0
    327 	MV	A23,B0
    328 ||	MPY32U	A19,B16,A21:A20	; a[0]*b[3]
    329 ||	ADDU	A24,A1:A0,A1:A0
        ;; r[1] is stored in this packet; column 2 accumulates in A9:A8.
    330 	ADDU	A25,B0,B1:B0
    331 ||	STW	A0,*ARG0[1]
    332 ||	MPY32U	A18,B17,A23:A22	; a[1]*b[2]
    333 ||	ADDU	A26,A1,A9:A8
    334 	ADDU	A27,B1,B9:B8
    335 ||	MPY32U	A17,B18,A25:A24	; a[2]*b[1]
    336 ||	ADDU	A28,A9:A8,A9:A8
    337 	ADDU	A29,B9:B8,B9:B8
    338 ||	MPY32U	A16,B19,A27:A26	; a[3]*b[0]
    339 ||	ADDU	A30,A9:A8,A9:A8
    340 	ADDU	A31,B9:B8,B9:B8
    341 ||	ADDU	B0,A9:A8,A9:A8
        ;; r[2] stored; column 3 accumulates in A1:A0.
    342 	STW	A8,*ARG0[2]
    343 ||	ADDU	A20,A9,A1:A0
    344 	ADDU	A21,B9,B1:B0
    345 ||	MPY32U	A19,B17,A21:A20	; a[1]*b[3]
    346 ||	ADDU	A22,A1:A0,A1:A0
    347 	ADDU	A23,B1:B0,B1:B0
    348 ||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
    349 ||	ADDU	A24,A1:A0,A1:A0
    350 	ADDU	A25,B1:B0,B1:B0
    351 ||	MPY32U	A17,B19,A25:A24	; a[3]*b[1]
    352 ||	ADDU	A26,A1:A0,A1:A0
    353 	ADDU	A27,B1:B0,B1:B0
    354 ||	ADDU	B8,A1:A0,A1:A0
        ;; r[3] stored; column 4 accumulates in A9:A8.
    355 	STW	A0,*ARG0[3]
    356 ||	MPY32U	A19,B18,A27:A26	; a[2]*b[3]
    357 ||	ADDU	A20,A1,A9:A8
    358 	ADDU	A21,B1,B9:B8
    359 ||	MPY32U	A18,B19,A29:A28	; a[3]*b[2]
    360 ||	ADDU	A22,A9:A8,A9:A8
    361 	ADDU	A23,B9:B8,B9:B8
    362 ||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
    363 ||	ADDU	A24,A9:A8,A9:A8
    364 	ADDU	A25,B9:B8,B9:B8
    365 ||	ADDU	B0,A9:A8,A9:A8
        ;; r[4] stored; column 5 accumulates in A1:A0.
    366 	STW	A8,*ARG0[4]
    367 ||	ADDU	A26,A9,A1:A0
    368 	ADDU	A27,B9,B1:B0
    369 ||	ADDU	A28,A1:A0,A1:A0
        ;; The return branch is issued here; the five packets that follow
        ;; (through the final STW) execute in its delay slots.
    370 	ADDU	A29,B1:B0,B1:B0
    371 ||	BNOP	RA
    372 ||	ADDU	B8,A1:A0,A1:A0
        ;; r[5] stored; column 6 accumulates in A9:A8, and r[7] is formed
        ;; with plain ADDs in B8 and A9.
    373 	STW	A0,*ARG0[5]
    374 ||	ADDU	A30,A1,A9:A8
    375 	ADD	A31,B1,B8
    376 	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
    377 	ADD	B8,A9,A9
    378 ||	STW	A8,*ARG0[6]
    379 	STW	A9,*ARG0[7]
    380 	.endif
    381 	.endasmfunc
    382