// GHASH for ARMv8 (AArch64) with Crypto Extensions — CRYPTOGAMS module
// (authorship string is in the trailing .byte directive at end of file).
      1 #include "arm_arch.h"
      2 
      3 #if __ARM_MAX_ARCH__>=7
      4 .arch	armv8-a+crypto
      5 .text
      6 .globl	gcm_init_v8
      7 .type	gcm_init_v8,%function
      8 .align	4
      9 gcm_init_v8:
     10 	ld1	{v17.2d},[x1]		//load input H
     11 	movi	v19.16b,#0xe1
     12 	shl	v19.2d,v19.2d,#57		//0xc2.0
     13 	ext	v3.16b,v17.16b,v17.16b,#8
     14 	ushr	v18.2d,v19.2d,#63
     15 	dup	v17.4s,v17.s[1]
     16 	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
     17 	ushr	v18.2d,v3.2d,#63
     18 	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
     19 	and	v18.16b,v18.16b,v16.16b
     20 	shl	v3.2d,v3.2d,#1
     21 	ext	v18.16b,v18.16b,v18.16b,#8
     22 	and	v16.16b,v16.16b,v17.16b
     23 	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
     24 	eor	v20.16b,v3.16b,v16.16b		//twisted H
     25 	st1	{v20.2d},[x0],#16		//store Htable[0]
     26 
     27 	//calculate H^2
     28 	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
     29 	pmull	v0.1q,v20.1d,v20.1d
     30 	eor	v16.16b,v16.16b,v20.16b
     31 	pmull2	v2.1q,v20.2d,v20.2d
     32 	pmull	v1.1q,v16.1d,v16.1d
     33 
     34 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
     35 	eor	v18.16b,v0.16b,v2.16b
     36 	eor	v1.16b,v1.16b,v17.16b
     37 	eor	v1.16b,v1.16b,v18.16b
     38 	pmull	v18.1q,v0.1d,v19.1d		//1st phase
     39 
     40 	ins	v2.d[0],v1.d[1]
     41 	ins	v1.d[1],v0.d[0]
     42 	eor	v0.16b,v1.16b,v18.16b
     43 
     44 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
     45 	pmull	v0.1q,v0.1d,v19.1d
     46 	eor	v18.16b,v18.16b,v2.16b
     47 	eor	v22.16b,v0.16b,v18.16b
     48 
     49 	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
     50 	eor	v17.16b,v17.16b,v22.16b
     51 	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
     52 	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
     53 	//calculate H^3 and H^4
     54 	pmull	v0.1q,v20.1d, v22.1d
     55 	pmull	v5.1q,v22.1d,v22.1d
     56 	pmull2	v2.1q,v20.2d, v22.2d
     57 	pmull2	v7.1q,v22.2d,v22.2d
     58 	pmull	v1.1q,v16.1d,v17.1d
     59 	pmull	v6.1q,v17.1d,v17.1d
     60 
     61 	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
     62 	ext	v17.16b,v5.16b,v7.16b,#8
     63 	eor	v18.16b,v0.16b,v2.16b
     64 	eor	v1.16b,v1.16b,v16.16b
     65 	eor	v4.16b,v5.16b,v7.16b
     66 	eor	v6.16b,v6.16b,v17.16b
     67 	eor	v1.16b,v1.16b,v18.16b
     68 	pmull	v18.1q,v0.1d,v19.1d		//1st phase
     69 	eor	v6.16b,v6.16b,v4.16b
     70 	pmull	v4.1q,v5.1d,v19.1d
     71 
     72 	ins	v2.d[0],v1.d[1]
     73 	ins	v7.d[0],v6.d[1]
     74 	ins	v1.d[1],v0.d[0]
     75 	ins	v6.d[1],v5.d[0]
     76 	eor	v0.16b,v1.16b,v18.16b
     77 	eor	v5.16b,v6.16b,v4.16b
     78 
     79 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
     80 	ext	v4.16b,v5.16b,v5.16b,#8
     81 	pmull	v0.1q,v0.1d,v19.1d
     82 	pmull	v5.1q,v5.1d,v19.1d
     83 	eor	v18.16b,v18.16b,v2.16b
     84 	eor	v4.16b,v4.16b,v7.16b
     85 	eor	v20.16b, v0.16b,v18.16b		//H^3
     86 	eor	v22.16b,v5.16b,v4.16b		//H^4
     87 
     88 	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
     89 	ext	v17.16b,v22.16b,v22.16b,#8
     90 	eor	v16.16b,v16.16b,v20.16b
     91 	eor	v17.16b,v17.16b,v22.16b
     92 	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
     93 	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
     94 	ret
     95 .size	gcm_init_v8,.-gcm_init_v8
     96 .globl	gcm_gmult_v8
     97 .type	gcm_gmult_v8,%function
     98 .align	4
     99 gcm_gmult_v8:
    100 	ld1	{v17.2d},[x0]		//load Xi
    101 	movi	v19.16b,#0xe1
    102 	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
    103 	shl	v19.2d,v19.2d,#57
    104 #ifndef __AARCH64EB__
    105 	rev64	v17.16b,v17.16b
    106 #endif
    107 	ext	v3.16b,v17.16b,v17.16b,#8
    108 
    109 	pmull	v0.1q,v20.1d,v3.1d		//H.loXi.lo
    110 	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
    111 	pmull2	v2.1q,v20.2d,v3.2d		//H.hiXi.hi
    112 	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)(Xi.lo+Xi.hi)
    113 
    114 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    115 	eor	v18.16b,v0.16b,v2.16b
    116 	eor	v1.16b,v1.16b,v17.16b
    117 	eor	v1.16b,v1.16b,v18.16b
    118 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    119 
    120 	ins	v2.d[0],v1.d[1]
    121 	ins	v1.d[1],v0.d[0]
    122 	eor	v0.16b,v1.16b,v18.16b
    123 
    124 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    125 	pmull	v0.1q,v0.1d,v19.1d
    126 	eor	v18.16b,v18.16b,v2.16b
    127 	eor	v0.16b,v0.16b,v18.16b
    128 
    129 #ifndef __AARCH64EB__
    130 	rev64	v0.16b,v0.16b
    131 #endif
    132 	ext	v0.16b,v0.16b,v0.16b,#8
    133 	st1	{v0.2d},[x0]		//write out Xi
    134 
    135 	ret
    136 .size	gcm_gmult_v8,.-gcm_gmult_v8
    137 .globl	gcm_ghash_v8
    138 .type	gcm_ghash_v8,%function
    139 .align	4
    140 gcm_ghash_v8:
    141 	cmp	x3,#64
    142 	b.hs	.Lgcm_ghash_v8_4x
    143 	ld1	{v0.2d},[x0]		//load [rotated] Xi
    144 						//"[rotated]" means that
    145 						//loaded value would have
    146 						//to be rotated in order to
    147 						//make it appear as in
    148 						//algorithm specification
    149 	subs	x3,x3,#32		//see if x3 is 32 or larger
    150 	mov	x12,#16		//x12 is used as post-
    151 						//increment for input pointer;
    152 						//as loop is modulo-scheduled
    153 						//x12 is zeroed just in time
    154 						//to preclude overstepping
    155 						//inp[len], which means that
    156 						//last block[s] are actually
    157 						//loaded twice, but last
    158 						//copy is not processed
    159 	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
    160 	movi	v19.16b,#0xe1
    161 	ld1	{v22.2d},[x1]
    162 	csel	x12,xzr,x12,eq			//is it time to zero x12?
    163 	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
    164 	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
    165 	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
    166 #ifndef __AARCH64EB__
    167 	rev64	v16.16b,v16.16b
    168 	rev64	v0.16b,v0.16b
    169 #endif
    170 	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
    171 	b.lo	.Lodd_tail_v8		//x3 was less than 32
    172 	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
    173 #ifndef __AARCH64EB__
    174 	rev64	v17.16b,v17.16b
    175 #endif
    176 	ext	v7.16b,v17.16b,v17.16b,#8
    177 	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
    178 	pmull	v4.1q,v20.1d,v7.1d		//HIi+1
    179 	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
    180 	pmull2	v6.1q,v20.2d,v7.2d
    181 	b	.Loop_mod2x_v8
    182 
    183 .align	4
    184 .Loop_mod2x_v8:
    185 	ext	v18.16b,v3.16b,v3.16b,#8
    186 	subs	x3,x3,#32		//is there more data?
    187 	pmull	v0.1q,v22.1d,v3.1d		//H^2.loXi.lo
    188 	csel	x12,xzr,x12,lo			//is it time to zero x12?
    189 
    190 	pmull	v5.1q,v21.1d,v17.1d
    191 	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
    192 	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hiXi.hi
    193 	eor	v0.16b,v0.16b,v4.16b		//accumulate
    194 	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)(Xi.lo+Xi.hi)
    195 	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
    196 
    197 	eor	v2.16b,v2.16b,v6.16b
    198 	csel	x12,xzr,x12,eq			//is it time to zero x12?
    199 	eor	v1.16b,v1.16b,v5.16b
    200 
    201 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    202 	eor	v18.16b,v0.16b,v2.16b
    203 	eor	v1.16b,v1.16b,v17.16b
    204 	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
    205 #ifndef __AARCH64EB__
    206 	rev64	v16.16b,v16.16b
    207 #endif
    208 	eor	v1.16b,v1.16b,v18.16b
    209 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    210 
    211 #ifndef __AARCH64EB__
    212 	rev64	v17.16b,v17.16b
    213 #endif
    214 	ins	v2.d[0],v1.d[1]
    215 	ins	v1.d[1],v0.d[0]
    216 	ext	v7.16b,v17.16b,v17.16b,#8
    217 	ext	v3.16b,v16.16b,v16.16b,#8
    218 	eor	v0.16b,v1.16b,v18.16b
    219 	pmull	v4.1q,v20.1d,v7.1d		//HIi+1
    220 	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
    221 
    222 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    223 	pmull	v0.1q,v0.1d,v19.1d
    224 	eor	v3.16b,v3.16b,v18.16b
    225 	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
    226 	eor	v3.16b,v3.16b,v0.16b
    227 	pmull2	v6.1q,v20.2d,v7.2d
    228 	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
    229 
    230 	eor	v2.16b,v2.16b,v18.16b
    231 	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
    232 	adds	x3,x3,#32		//re-construct x3
    233 	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
    234 	b.eq	.Ldone_v8		//is x3 zero?
    235 .Lodd_tail_v8:
    236 	ext	v18.16b,v0.16b,v0.16b,#8
    237 	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
    238 	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
    239 
    240 	pmull	v0.1q,v20.1d,v3.1d		//H.loXi.lo
    241 	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
    242 	pmull2	v2.1q,v20.2d,v3.2d		//H.hiXi.hi
    243 	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)(Xi.lo+Xi.hi)
    244 
    245 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    246 	eor	v18.16b,v0.16b,v2.16b
    247 	eor	v1.16b,v1.16b,v17.16b
    248 	eor	v1.16b,v1.16b,v18.16b
    249 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    250 
    251 	ins	v2.d[0],v1.d[1]
    252 	ins	v1.d[1],v0.d[0]
    253 	eor	v0.16b,v1.16b,v18.16b
    254 
    255 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    256 	pmull	v0.1q,v0.1d,v19.1d
    257 	eor	v18.16b,v18.16b,v2.16b
    258 	eor	v0.16b,v0.16b,v18.16b
    259 
    260 .Ldone_v8:
    261 #ifndef __AARCH64EB__
    262 	rev64	v0.16b,v0.16b
    263 #endif
    264 	ext	v0.16b,v0.16b,v0.16b,#8
    265 	st1	{v0.2d},[x0]		//write out Xi
    266 
    267 	ret
    268 .size	gcm_ghash_v8,.-gcm_ghash_v8
    269 .type	gcm_ghash_v8_4x,%function
    270 .align	4
    271 gcm_ghash_v8_4x:
    272 .Lgcm_ghash_v8_4x:
    273 	ld1	{v0.2d},[x0]		//load [rotated] Xi
    274 	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
    275 	movi	v19.16b,#0xe1
    276 	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
    277 	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
    278 
    279 	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
    280 #ifndef __AARCH64EB__
    281 	rev64	v0.16b,v0.16b
    282 	rev64	v5.16b,v5.16b
    283 	rev64	v6.16b,v6.16b
    284 	rev64	v7.16b,v7.16b
    285 	rev64	v4.16b,v4.16b
    286 #endif
    287 	ext	v25.16b,v7.16b,v7.16b,#8
    288 	ext	v24.16b,v6.16b,v6.16b,#8
    289 	ext	v23.16b,v5.16b,v5.16b,#8
    290 
    291 	pmull	v29.1q,v20.1d,v25.1d		//HIi+3
    292 	eor	v7.16b,v7.16b,v25.16b
    293 	pmull2	v31.1q,v20.2d,v25.2d
    294 	pmull	v30.1q,v21.1d,v7.1d
    295 
    296 	pmull	v16.1q,v22.1d,v24.1d		//H^2Ii+2
    297 	eor	v6.16b,v6.16b,v24.16b
    298 	pmull2	v24.1q,v22.2d,v24.2d
    299 	pmull2	v6.1q,v21.2d,v6.2d
    300 
    301 	eor	v29.16b,v29.16b,v16.16b
    302 	eor	v31.16b,v31.16b,v24.16b
    303 	eor	v30.16b,v30.16b,v6.16b
    304 
    305 	pmull	v7.1q,v26.1d,v23.1d		//H^3Ii+1
    306 	eor	v5.16b,v5.16b,v23.16b
    307 	pmull2	v23.1q,v26.2d,v23.2d
    308 	pmull	v5.1q,v27.1d,v5.1d
    309 
    310 	eor	v29.16b,v29.16b,v7.16b
    311 	eor	v31.16b,v31.16b,v23.16b
    312 	eor	v30.16b,v30.16b,v5.16b
    313 
    314 	subs	x3,x3,#128
    315 	b.lo	.Ltail4x
    316 
    317 	b	.Loop4x
    318 
    319 .align	4
    320 .Loop4x:
    321 	eor	v16.16b,v4.16b,v0.16b
    322 	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
    323 	ext	v3.16b,v16.16b,v16.16b,#8
    324 #ifndef __AARCH64EB__
    325 	rev64	v5.16b,v5.16b
    326 	rev64	v6.16b,v6.16b
    327 	rev64	v7.16b,v7.16b
    328 	rev64	v4.16b,v4.16b
    329 #endif
    330 
    331 	pmull	v0.1q,v28.1d,v3.1d		//H^4(Xi+Ii)
    332 	eor	v16.16b,v16.16b,v3.16b
    333 	pmull2	v2.1q,v28.2d,v3.2d
    334 	ext	v25.16b,v7.16b,v7.16b,#8
    335 	pmull2	v1.1q,v27.2d,v16.2d
    336 
    337 	eor	v0.16b,v0.16b,v29.16b
    338 	eor	v2.16b,v2.16b,v31.16b
    339 	ext	v24.16b,v6.16b,v6.16b,#8
    340 	eor	v1.16b,v1.16b,v30.16b
    341 	ext	v23.16b,v5.16b,v5.16b,#8
    342 
    343 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    344 	eor	v18.16b,v0.16b,v2.16b
    345 	pmull	v29.1q,v20.1d,v25.1d		//HIi+3
    346 	eor	v7.16b,v7.16b,v25.16b
    347 	eor	v1.16b,v1.16b,v17.16b
    348 	pmull2	v31.1q,v20.2d,v25.2d
    349 	eor	v1.16b,v1.16b,v18.16b
    350 	pmull	v30.1q,v21.1d,v7.1d
    351 
    352 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    353 	ins	v2.d[0],v1.d[1]
    354 	ins	v1.d[1],v0.d[0]
    355 	pmull	v16.1q,v22.1d,v24.1d		//H^2Ii+2
    356 	eor	v6.16b,v6.16b,v24.16b
    357 	pmull2	v24.1q,v22.2d,v24.2d
    358 	eor	v0.16b,v1.16b,v18.16b
    359 	pmull2	v6.1q,v21.2d,v6.2d
    360 
    361 	eor	v29.16b,v29.16b,v16.16b
    362 	eor	v31.16b,v31.16b,v24.16b
    363 	eor	v30.16b,v30.16b,v6.16b
    364 
    365 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    366 	pmull	v0.1q,v0.1d,v19.1d
    367 	pmull	v7.1q,v26.1d,v23.1d		//H^3Ii+1
    368 	eor	v5.16b,v5.16b,v23.16b
    369 	eor	v18.16b,v18.16b,v2.16b
    370 	pmull2	v23.1q,v26.2d,v23.2d
    371 	pmull	v5.1q,v27.1d,v5.1d
    372 
    373 	eor	v0.16b,v0.16b,v18.16b
    374 	eor	v29.16b,v29.16b,v7.16b
    375 	eor	v31.16b,v31.16b,v23.16b
    376 	ext	v0.16b,v0.16b,v0.16b,#8
    377 	eor	v30.16b,v30.16b,v5.16b
    378 
    379 	subs	x3,x3,#64
    380 	b.hs	.Loop4x
    381 
    382 .Ltail4x:
    383 	eor	v16.16b,v4.16b,v0.16b
    384 	ext	v3.16b,v16.16b,v16.16b,#8
    385 
    386 	pmull	v0.1q,v28.1d,v3.1d		//H^4(Xi+Ii)
    387 	eor	v16.16b,v16.16b,v3.16b
    388 	pmull2	v2.1q,v28.2d,v3.2d
    389 	pmull2	v1.1q,v27.2d,v16.2d
    390 
    391 	eor	v0.16b,v0.16b,v29.16b
    392 	eor	v2.16b,v2.16b,v31.16b
    393 	eor	v1.16b,v1.16b,v30.16b
    394 
    395 	adds	x3,x3,#64
    396 	b.eq	.Ldone4x
    397 
    398 	cmp	x3,#32
    399 	b.lo	.Lone
    400 	b.eq	.Ltwo
    401 .Lthree:
    402 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    403 	eor	v18.16b,v0.16b,v2.16b
    404 	eor	v1.16b,v1.16b,v17.16b
    405 	ld1	{v4.2d,v5.2d,v6.2d},[x2]
    406 	eor	v1.16b,v1.16b,v18.16b
    407 #ifndef	__AARCH64EB__
    408 	rev64	v5.16b,v5.16b
    409 	rev64	v6.16b,v6.16b
    410 	rev64	v4.16b,v4.16b
    411 #endif
    412 
    413 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    414 	ins	v2.d[0],v1.d[1]
    415 	ins	v1.d[1],v0.d[0]
    416 	ext	v24.16b,v6.16b,v6.16b,#8
    417 	ext	v23.16b,v5.16b,v5.16b,#8
    418 	eor	v0.16b,v1.16b,v18.16b
    419 
    420 	pmull	v29.1q,v20.1d,v24.1d		//HIi+2
    421 	eor	v6.16b,v6.16b,v24.16b
    422 
    423 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    424 	pmull	v0.1q,v0.1d,v19.1d
    425 	eor	v18.16b,v18.16b,v2.16b
    426 	pmull2	v31.1q,v20.2d,v24.2d
    427 	pmull	v30.1q,v21.1d,v6.1d
    428 	eor	v0.16b,v0.16b,v18.16b
    429 	pmull	v7.1q,v22.1d,v23.1d		//H^2Ii+1
    430 	eor	v5.16b,v5.16b,v23.16b
    431 	ext	v0.16b,v0.16b,v0.16b,#8
    432 
    433 	pmull2	v23.1q,v22.2d,v23.2d
    434 	eor	v16.16b,v4.16b,v0.16b
    435 	pmull2	v5.1q,v21.2d,v5.2d
    436 	ext	v3.16b,v16.16b,v16.16b,#8
    437 
    438 	eor	v29.16b,v29.16b,v7.16b
    439 	eor	v31.16b,v31.16b,v23.16b
    440 	eor	v30.16b,v30.16b,v5.16b
    441 
    442 	pmull	v0.1q,v26.1d,v3.1d		//H^3(Xi+Ii)
    443 	eor	v16.16b,v16.16b,v3.16b
    444 	pmull2	v2.1q,v26.2d,v3.2d
    445 	pmull	v1.1q,v27.1d,v16.1d
    446 
    447 	eor	v0.16b,v0.16b,v29.16b
    448 	eor	v2.16b,v2.16b,v31.16b
    449 	eor	v1.16b,v1.16b,v30.16b
    450 	b	.Ldone4x
    451 
    452 .align	4
    453 .Ltwo:
    454 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    455 	eor	v18.16b,v0.16b,v2.16b
    456 	eor	v1.16b,v1.16b,v17.16b
    457 	ld1	{v4.2d,v5.2d},[x2]
    458 	eor	v1.16b,v1.16b,v18.16b
    459 #ifndef	__AARCH64EB__
    460 	rev64	v5.16b,v5.16b
    461 	rev64	v4.16b,v4.16b
    462 #endif
    463 
    464 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    465 	ins	v2.d[0],v1.d[1]
    466 	ins	v1.d[1],v0.d[0]
    467 	ext	v23.16b,v5.16b,v5.16b,#8
    468 	eor	v0.16b,v1.16b,v18.16b
    469 
    470 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    471 	pmull	v0.1q,v0.1d,v19.1d
    472 	eor	v18.16b,v18.16b,v2.16b
    473 	eor	v0.16b,v0.16b,v18.16b
    474 	ext	v0.16b,v0.16b,v0.16b,#8
    475 
    476 	pmull	v29.1q,v20.1d,v23.1d		//HIi+1
    477 	eor	v5.16b,v5.16b,v23.16b
    478 
    479 	eor	v16.16b,v4.16b,v0.16b
    480 	ext	v3.16b,v16.16b,v16.16b,#8
    481 
    482 	pmull2	v31.1q,v20.2d,v23.2d
    483 	pmull	v30.1q,v21.1d,v5.1d
    484 
    485 	pmull	v0.1q,v22.1d,v3.1d		//H^2(Xi+Ii)
    486 	eor	v16.16b,v16.16b,v3.16b
    487 	pmull2	v2.1q,v22.2d,v3.2d
    488 	pmull2	v1.1q,v21.2d,v16.2d
    489 
    490 	eor	v0.16b,v0.16b,v29.16b
    491 	eor	v2.16b,v2.16b,v31.16b
    492 	eor	v1.16b,v1.16b,v30.16b
    493 	b	.Ldone4x
    494 
    495 .align	4
    496 .Lone:
    497 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    498 	eor	v18.16b,v0.16b,v2.16b
    499 	eor	v1.16b,v1.16b,v17.16b
    500 	ld1	{v4.2d},[x2]
    501 	eor	v1.16b,v1.16b,v18.16b
    502 #ifndef	__AARCH64EB__
    503 	rev64	v4.16b,v4.16b
    504 #endif
    505 
    506 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    507 	ins	v2.d[0],v1.d[1]
    508 	ins	v1.d[1],v0.d[0]
    509 	eor	v0.16b,v1.16b,v18.16b
    510 
    511 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    512 	pmull	v0.1q,v0.1d,v19.1d
    513 	eor	v18.16b,v18.16b,v2.16b
    514 	eor	v0.16b,v0.16b,v18.16b
    515 	ext	v0.16b,v0.16b,v0.16b,#8
    516 
    517 	eor	v16.16b,v4.16b,v0.16b
    518 	ext	v3.16b,v16.16b,v16.16b,#8
    519 
    520 	pmull	v0.1q,v20.1d,v3.1d
    521 	eor	v16.16b,v16.16b,v3.16b
    522 	pmull2	v2.1q,v20.2d,v3.2d
    523 	pmull	v1.1q,v21.1d,v16.1d
    524 
    525 .Ldone4x:
    526 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    527 	eor	v18.16b,v0.16b,v2.16b
    528 	eor	v1.16b,v1.16b,v17.16b
    529 	eor	v1.16b,v1.16b,v18.16b
    530 
    531 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    532 	ins	v2.d[0],v1.d[1]
    533 	ins	v1.d[1],v0.d[0]
    534 	eor	v0.16b,v1.16b,v18.16b
    535 
    536 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    537 	pmull	v0.1q,v0.1d,v19.1d
    538 	eor	v18.16b,v18.16b,v2.16b
    539 	eor	v0.16b,v0.16b,v18.16b
    540 	ext	v0.16b,v0.16b,v0.16b,#8
    541 
    542 #ifndef __AARCH64EB__
    543 	rev64	v0.16b,v0.16b
    544 #endif
    545 	st1	{v0.2d},[x0]		//write out Xi
    546 
    547 	ret
    548 .size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
    549 .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
    550 .align	2
    551 .align	2
    552 #endif
    553