Home | History | Annotate | Line # | Download | only in aarch64
      1 #include "arm_arch.h"
      2 
      3 #if __ARM_MAX_ARCH__>=7
        // Everything down to the closing #endif at the end of the file is
        // assembled only when __ARM_MAX_ARCH__ is 7 or higher.
        // NOTE(review): __ARM_MAX_ARCH__ is presumably provided by arm_arch.h - confirm.
        // The crypto extension is requested because the code below relies on
        // pmull/pmull2 with a .1q destination (64x64 -> 128-bit polynomial multiply).
      4 .arch	armv8-a+crypto
      5 .text
      6 .globl	gcm_init_v8
      7 .type	gcm_init_v8,%function
      8 .align	4
        // gcm_init_v8: precompute the table of hash-key powers read by the
        // GHASH routines in this file.
        //   x0 = Htable (output). Twelve consecutive 16-byte entries are written;
        //        x0 itself is advanced by the post-incrementing stores.
        //   x1 = H (input). 16 bytes are read.
        // Entries written, in order:
        //   [0] twisted H   [1] packed Karatsuba terms of H and H^2     [2] H^2
        //   [3] H^3         [4] packed Karatsuba terms of H^3 and H^4   [5] H^4
        //   [6] H^5         [7] packed Karatsuba terms of H^5 and H^6   [8] H^6
        //   [9] H^7         [10] packed Karatsuba terms of H^7 and H^8  [11] H^8
        // A "Karatsuba term" is lo+hi (xor of the two 64-bit halves) of a power.
        // Only v0-v7 and v16-v31 are used; the stack is not touched.
        // NOTE(review): presumably called from C as gcm_init_v8(Htable, H) -
        // confirm the prototype against the caller.
      9 gcm_init_v8:
     10 	AARCH64_VALID_CALL_TARGET
     11 	ld1	{v17.2d},[x1]		//load input H
     12 	movi	v19.16b,#0xe1
     13 	shl	v19.2d,v19.2d,#57		//0xc2.0: each 64-bit lane = 0xc200000000000000
     14 	ext	v3.16b,v17.16b,v17.16b,#8
     15 	ushr	v18.2d,v19.2d,#63
     16 	dup	v17.4s,v17.s[1]
     17 	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
     18 	ushr	v18.2d,v3.2d,#63
     19 	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
     20 	and	v18.16b,v18.16b,v16.16b
     21 	shl	v3.2d,v3.2d,#1
     22 	ext	v18.16b,v18.16b,v18.16b,#8
     23 	and	v16.16b,v16.16b,v17.16b
     24 	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
     25 	eor	v20.16b,v3.16b,v16.16b		//twisted H
     26 	st1	{v20.2d},[x0],#16		//store Htable[0]
     27 
     28 	//calculate H^2
     29 	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
     30 	pmull	v0.1q,v20.1d,v20.1d
     31 	eor	v16.16b,v16.16b,v20.16b
     32 	pmull2	v2.1q,v20.2d,v20.2d
     33 	pmull	v1.1q,v16.1d,v16.1d
     34 
     35 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
     36 	eor	v18.16b,v0.16b,v2.16b
     37 	eor	v1.16b,v1.16b,v17.16b
     38 	eor	v1.16b,v1.16b,v18.16b
     39 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
     40 
     41 	ins	v2.d[0],v1.d[1]
     42 	ins	v1.d[1],v0.d[0]
     43 	eor	v0.16b,v1.16b,v18.16b
     44 
     45 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
     46 	pmull	v0.1q,v0.1d,v19.1d
     47 	eor	v18.16b,v18.16b,v2.16b
     48 	eor	v22.16b,v0.16b,v18.16b
     49 
     50 	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
     51 	eor	v17.16b,v17.16b,v22.16b
     52 	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
     53 	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
     54 	//calculate H^3 and H^4
     55 	pmull	v0.1q,v20.1d, v22.1d		//H*H^2
     56 	pmull	v5.1q,v22.1d,v22.1d		//H^2*H^2
     57 	pmull2	v2.1q,v20.2d, v22.2d
     58 	pmull2	v7.1q,v22.2d,v22.2d
     59 	pmull	v1.1q,v16.1d,v17.1d
     60 	pmull	v6.1q,v17.1d,v17.1d
     61 
     62 	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
     63 	ext	v17.16b,v5.16b,v7.16b,#8
     64 	eor	v18.16b,v0.16b,v2.16b
     65 	eor	v1.16b,v1.16b,v16.16b
     66 	eor	v4.16b,v5.16b,v7.16b
     67 	eor	v6.16b,v6.16b,v17.16b
     68 	eor	v1.16b,v1.16b,v18.16b
     69 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
     70 	eor	v6.16b,v6.16b,v4.16b
     71 	pmull	v4.1q,v5.1d,v19.1d
     72 
     73 	ins	v2.d[0],v1.d[1]
     74 	ins	v7.d[0],v6.d[1]
     75 	ins	v1.d[1],v0.d[0]
     76 	ins	v6.d[1],v5.d[0]
     77 	eor	v0.16b,v1.16b,v18.16b
     78 	eor	v5.16b,v6.16b,v4.16b
     79 
     80 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
     81 	ext	v4.16b,v5.16b,v5.16b,#8
     82 	pmull	v0.1q,v0.1d,v19.1d
     83 	pmull	v5.1q,v5.1d,v19.1d
     84 	eor	v18.16b,v18.16b,v2.16b
     85 	eor	v4.16b,v4.16b,v7.16b
     86 	eor	v23.16b, v0.16b,v18.16b		//H^3
     87 	eor	v25.16b,v5.16b,v4.16b		//H^4
     88 
     89 	ext	v16.16b,v23.16b, v23.16b,#8		//Karatsuba pre-processing
     90 	ext	v17.16b,v25.16b,v25.16b,#8
     91 	ext	v18.16b,v22.16b,v22.16b,#8
     92 	eor	v16.16b,v16.16b,v23.16b
     93 	eor	v17.16b,v17.16b,v25.16b
     94 	eor	v18.16b,v18.16b,v22.16b		//v18 = H^2.lo+H^2.hi in both halves
     95 	ext	v24.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
     96 	st1	{v23.2d,v24.2d,v25.2d},[x0],#48		//store Htable[3..5]
     97 
     98 	//calculate H^5 and H^6
     99 	pmull	v0.1q,v22.1d, v23.1d		//H^2*H^3
    100 	pmull	v5.1q,v23.1d,v23.1d		//H^3*H^3
    101 	pmull2	v2.1q,v22.2d, v23.2d
    102 	pmull2	v7.1q,v23.2d,v23.2d
    103 	pmull	v1.1q,v16.1d,v18.1d
    104 	pmull	v6.1q,v16.1d,v16.1d
    105 
    106 	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    107 	ext	v17.16b,v5.16b,v7.16b,#8
    108 	eor	v18.16b,v0.16b,v2.16b
    109 	eor	v1.16b,v1.16b,v16.16b
    110 	eor	v4.16b,v5.16b,v7.16b
    111 	eor	v6.16b,v6.16b,v17.16b
    112 	eor	v1.16b,v1.16b,v18.16b
    113 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    114 	eor	v6.16b,v6.16b,v4.16b
    115 	pmull	v4.1q,v5.1d,v19.1d
    116 
    117 	ins	v2.d[0],v1.d[1]
    118 	ins	v7.d[0],v6.d[1]
    119 	ins	v1.d[1],v0.d[0]
    120 	ins	v6.d[1],v5.d[0]
    121 	eor	v0.16b,v1.16b,v18.16b
    122 	eor	v5.16b,v6.16b,v4.16b
    123 
    124 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    125 	ext	v4.16b,v5.16b,v5.16b,#8
    126 	pmull	v0.1q,v0.1d,v19.1d
    127 	pmull	v5.1q,v5.1d,v19.1d
    128 	eor	v18.16b,v18.16b,v2.16b
    129 	eor	v4.16b,v4.16b,v7.16b
    130 	eor	v26.16b,v0.16b,v18.16b		//H^5
    131 	eor	v28.16b,v5.16b,v4.16b		//H^6
    132 
    133 	ext	v16.16b,v26.16b, v26.16b,#8		//Karatsuba pre-processing
    134 	ext	v17.16b,v28.16b,v28.16b,#8
    135 	ext	v18.16b,v22.16b,v22.16b,#8
    136 	eor	v16.16b,v16.16b,v26.16b
    137 	eor	v17.16b,v17.16b,v28.16b
    138 	eor	v18.16b,v18.16b,v22.16b		//v18 = H^2.lo+H^2.hi in both halves
    139 	ext	v27.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
    140 	st1	{v26.2d,v27.2d,v28.2d},[x0],#48		//store Htable[6..8]
    141 
    142 	//calculate H^7 and H^8
    143 	pmull	v0.1q,v22.1d,v26.1d		//H^2*H^5
    144 	pmull	v5.1q,v22.1d,v28.1d		//H^2*H^6
    145 	pmull2	v2.1q,v22.2d,v26.2d
    146 	pmull2	v7.1q,v22.2d,v28.2d
    147 	pmull	v1.1q,v16.1d,v18.1d
    148 	pmull	v6.1q,v17.1d,v18.1d
    149 
    150 	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    151 	ext	v17.16b,v5.16b,v7.16b,#8
    152 	eor	v18.16b,v0.16b,v2.16b
    153 	eor	v1.16b,v1.16b,v16.16b
    154 	eor	v4.16b,v5.16b,v7.16b
    155 	eor	v6.16b,v6.16b,v17.16b
    156 	eor	v1.16b,v1.16b,v18.16b
    157 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    158 	eor	v6.16b,v6.16b,v4.16b
    159 	pmull	v4.1q,v5.1d,v19.1d
    160 
    161 	ins	v2.d[0],v1.d[1]
    162 	ins	v7.d[0],v6.d[1]
    163 	ins	v1.d[1],v0.d[0]
    164 	ins	v6.d[1],v5.d[0]
    165 	eor	v0.16b,v1.16b,v18.16b
    166 	eor	v5.16b,v6.16b,v4.16b
    167 
    168 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    169 	ext	v4.16b,v5.16b,v5.16b,#8
    170 	pmull	v0.1q,v0.1d,v19.1d
    171 	pmull	v5.1q,v5.1d,v19.1d
    172 	eor	v18.16b,v18.16b,v2.16b
    173 	eor	v4.16b,v4.16b,v7.16b
    174 	eor	v29.16b,v0.16b,v18.16b		//H^7
    175 	eor	v31.16b,v5.16b,v4.16b		//H^8
    176 
    177 	ext	v16.16b,v29.16b,v29.16b,#8		//Karatsuba pre-processing
    178 	ext	v17.16b,v31.16b,v31.16b,#8
    179 	eor	v16.16b,v16.16b,v29.16b
    180 	eor	v17.16b,v17.16b,v31.16b
    181 	ext	v30.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
    182 	st1	{v29.2d,v30.2d,v31.2d},[x0]		//store Htable[9..11]
    183 	ret
    184 .size	gcm_init_v8,.-gcm_init_v8
    185 .globl	gcm_gmult_v8
    186 .type	gcm_gmult_v8,%function
    187 .align	4
        // gcm_gmult_v8: multiply the 16-byte value Xi by H, reduce, and store the
        // result back over Xi.
        //   x0 = Xi. 16 bytes are read and then overwritten with the result.
        //   x1 = key table. The first two 16-byte entries are read: twisted H and
        //        the entry whose low half is used as H.lo+H.hi.
        // On builds without __AARCH64EB__ the value is passed through rev64 after
        // loading and again before storing.
        // Uses v0-v3 and v17-v21. x0 and x1 are not modified.
    188 gcm_gmult_v8:
    189 	AARCH64_VALID_CALL_TARGET
    190 	ld1	{v17.2d},[x0]		//load Xi
    191 	movi	v19.16b,#0xe1
    192 	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
    193 	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
    194 #ifndef __AARCH64EB__
    195 	rev64	v17.16b,v17.16b
    196 #endif
    197 	ext	v3.16b,v17.16b,v17.16b,#8
    198 
    199 	pmull	v0.1q,v20.1d,v3.1d		//H.lo*Xi.lo
    200 	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
    201 	pmull2	v2.1q,v20.2d,v3.2d		//H.hi*Xi.hi
    202 	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)*(Xi.lo+Xi.hi)
    203 
    204 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    205 	eor	v18.16b,v0.16b,v2.16b
    206 	eor	v1.16b,v1.16b,v17.16b
    207 	eor	v1.16b,v1.16b,v18.16b
    208 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    209 
    210 	ins	v2.d[0],v1.d[1]
    211 	ins	v1.d[1],v0.d[0]
    212 	eor	v0.16b,v1.16b,v18.16b
    213 
    214 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    215 	pmull	v0.1q,v0.1d,v19.1d
    216 	eor	v18.16b,v18.16b,v2.16b
    217 	eor	v0.16b,v0.16b,v18.16b
    218 
    219 #ifndef __AARCH64EB__
    220 	rev64	v0.16b,v0.16b
    221 #endif
    222 	ext	v0.16b,v0.16b,v0.16b,#8
    223 	st1	{v0.2d},[x0]		//write out Xi
    224 
    225 	ret
    226 .size	gcm_gmult_v8,.-gcm_gmult_v8
    227 .globl	gcm_ghash_v8
    228 .type	gcm_ghash_v8,%function
    229 .align	4
        // gcm_ghash_v8: fold a run of 16-byte input blocks into the hash value Xi.
        //   x0 = Xi (16 bytes, read at entry and written back at .Ldone_v8)
        //   x1 = key table (twisted H, packed Karatsuba terms, H^2 are read here)
        //   x2 = input pointer
        //   x3 = input length in bytes
        // Lengths of 64 or more branch to .Lgcm_ghash_v8_4x, which follows this
        // function. Shorter inputs are handled here two blocks per iteration in
        // .Loop_mod2x_v8, with a single remaining block handled at .Lodd_tail_v8.
        // NOTE(review): the code appears to assume x3 is a non-zero multiple of
        // 16 - confirm against the callers.
        // On this path x1, x2, x3, x12, the flags, v0-v7 and v16-v22 are modified.
    230 gcm_ghash_v8:
    231 	AARCH64_VALID_CALL_TARGET
    232 	cmp	x3,#64
    233 	b.hs	.Lgcm_ghash_v8_4x
    234 	ld1	{v0.2d},[x0]		//load [rotated] Xi
    235 						//"[rotated]" means that
    236 						//loaded value would have
    237 						//to be rotated in order to
    238 						//make it appear as in
    239 						//algorithm specification
    240 	subs	x3,x3,#32		//see if x3 is 32 or larger
    241 	mov	x12,#16		//x12 is used as post-
    242 						//increment for input pointer;
    243 						//as loop is modulo-scheduled
    244 						//x12 is zeroed just in time
    245 						//to preclude overstepping
    246 						//inp[len], which means that
    247 						//last block[s] are actually
    248 						//loaded twice, but last
    249 						//copy is not processed
    250 	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
    251 	movi	v19.16b,#0xe1
    252 	ld1	{v22.2d},[x1]
    253 	csel	x12,xzr,x12,eq			//is it time to zero x12?
    254 	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
    255 	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
    256 	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
    257 #ifndef __AARCH64EB__
    258 	rev64	v16.16b,v16.16b
    259 	rev64	v0.16b,v0.16b
    260 #endif
    261 	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
    262 	b.lo	.Lodd_tail_v8		//x3 was less than 32
    263 	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
    264 #ifndef __AARCH64EB__
    265 	rev64	v17.16b,v17.16b
    266 #endif
    267 	ext	v7.16b,v17.16b,v17.16b,#8
    268 	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
    269 	pmull	v4.1q,v20.1d,v7.1d		//H*I[i+1]
    270 	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
    271 	pmull2	v6.1q,v20.2d,v7.2d
    272 	b	.Loop_mod2x_v8
    273 
    274 .align	4
    275 .Loop_mod2x_v8:
    276 	ext	v18.16b,v3.16b,v3.16b,#8
    277 	subs	x3,x3,#32		//is there more data?
    278 	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo*Xi.lo
    279 	csel	x12,xzr,x12,lo			//is it time to zero x12?
    280 
    281 	pmull	v5.1q,v21.1d,v17.1d
    282 	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
    283 	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi*Xi.hi
    284 	eor	v0.16b,v0.16b,v4.16b		//accumulate
    285 	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)*(Xi.lo+Xi.hi)
    286 	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
    287 
    288 	eor	v2.16b,v2.16b,v6.16b
    289 	csel	x12,xzr,x12,eq			//is it time to zero x12?
    290 	eor	v1.16b,v1.16b,v5.16b
    291 
    292 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    293 	eor	v18.16b,v0.16b,v2.16b
    294 	eor	v1.16b,v1.16b,v17.16b
    295 	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
    296 #ifndef __AARCH64EB__
    297 	rev64	v16.16b,v16.16b
    298 #endif
    299 	eor	v1.16b,v1.16b,v18.16b
    300 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    301 
    302 #ifndef __AARCH64EB__
    303 	rev64	v17.16b,v17.16b
    304 #endif
    305 	ins	v2.d[0],v1.d[1]
    306 	ins	v1.d[1],v0.d[0]
    307 	ext	v7.16b,v17.16b,v17.16b,#8
    308 	ext	v3.16b,v16.16b,v16.16b,#8
    309 	eor	v0.16b,v1.16b,v18.16b
    310 	pmull	v4.1q,v20.1d,v7.1d		//H*I[i+1]
    311 	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
    312 
    313 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    314 	pmull	v0.1q,v0.1d,v19.1d
    315 	eor	v3.16b,v3.16b,v18.16b
    316 	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
    317 	eor	v3.16b,v3.16b,v0.16b
    318 	pmull2	v6.1q,v20.2d,v7.2d
    319 	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
    320 
    321 	eor	v2.16b,v2.16b,v18.16b
    322 	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
    323 	adds	x3,x3,#32		//re-construct x3
    324 	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
    325 	b.eq	.Ldone_v8		//is x3 zero?
    326 .Lodd_tail_v8:
    327 	ext	v18.16b,v0.16b,v0.16b,#8
    328 	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
    329 	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
    330 
    331 	pmull	v0.1q,v20.1d,v3.1d		//H.lo*Xi.lo
    332 	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
    333 	pmull2	v2.1q,v20.2d,v3.2d		//H.hi*Xi.hi
    334 	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)*(Xi.lo+Xi.hi)
    335 
    336 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    337 	eor	v18.16b,v0.16b,v2.16b
    338 	eor	v1.16b,v1.16b,v17.16b
    339 	eor	v1.16b,v1.16b,v18.16b
    340 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    341 
    342 	ins	v2.d[0],v1.d[1]
    343 	ins	v1.d[1],v0.d[0]
    344 	eor	v0.16b,v1.16b,v18.16b
    345 
    346 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    347 	pmull	v0.1q,v0.1d,v19.1d
    348 	eor	v18.16b,v18.16b,v2.16b
    349 	eor	v0.16b,v0.16b,v18.16b
    350 
    351 .Ldone_v8:
    352 #ifndef __AARCH64EB__
    353 	rev64	v0.16b,v0.16b
    354 #endif
    355 	ext	v0.16b,v0.16b,v0.16b,#8
    356 	st1	{v0.2d},[x0]		//write out Xi
    357 
    358 	ret
    359 .size	gcm_ghash_v8,.-gcm_ghash_v8
    360 .type	gcm_ghash_v8_4x,%function
    361 .align	4
        // gcm_ghash_v8_4x: four-blocks-per-iteration GHASH path.
        // The symbol has no .globl and no AARCH64_VALID_CALL_TARGET; in the visible
        // source it is entered only through the b.hs at the top of gcm_ghash_v8
        // (taken when x3 >= 64), so x0..x3 still hold that function's arguments:
        //   x0 = Xi, x1 = key table, x2 = input pointer, x3 = length in bytes.
        // The first six key-table entries are read (H, H^2, H^3, H^4 and the two
        // packed Karatsuba entries between them).
        // .Loop4x consumes 64 input bytes per pass. .Ltail4x finishes the last
        // full group and then dispatches on the remaining byte count: zero goes
        // straight to .Ldone4x, below 32 to .Lone, exactly 32 to .Ltwo, anything
        // larger to .Lthree.
        // Modifies x1, x2, x3, the flags, v0-v7 and v16-v31.
    362 gcm_ghash_v8_4x:
    363 .Lgcm_ghash_v8_4x:
    364 	ld1	{v0.2d},[x0]		//load [rotated] Xi
    365 	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
    366 	movi	v19.16b,#0xe1
    367 	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
    368 	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
    369 
    370 	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
    371 #ifndef __AARCH64EB__
    372 	rev64	v0.16b,v0.16b
    373 	rev64	v5.16b,v5.16b
    374 	rev64	v6.16b,v6.16b
    375 	rev64	v7.16b,v7.16b
    376 	rev64	v4.16b,v4.16b
    377 #endif
    378 	ext	v25.16b,v7.16b,v7.16b,#8
    379 	ext	v24.16b,v6.16b,v6.16b,#8
    380 	ext	v23.16b,v5.16b,v5.16b,#8
    381 
    382 	pmull	v29.1q,v20.1d,v25.1d		//H*I[i+3]
    383 	eor	v7.16b,v7.16b,v25.16b
    384 	pmull2	v31.1q,v20.2d,v25.2d
    385 	pmull	v30.1q,v21.1d,v7.1d
    386 
    387 	pmull	v16.1q,v22.1d,v24.1d		//H^2*I[i+2]
    388 	eor	v6.16b,v6.16b,v24.16b
    389 	pmull2	v24.1q,v22.2d,v24.2d
    390 	pmull2	v6.1q,v21.2d,v6.2d
    391 
    392 	eor	v29.16b,v29.16b,v16.16b
    393 	eor	v31.16b,v31.16b,v24.16b
    394 	eor	v30.16b,v30.16b,v6.16b
    395 
    396 	pmull	v7.1q,v26.1d,v23.1d		//H^3*I[i+1]
    397 	eor	v5.16b,v5.16b,v23.16b
    398 	pmull2	v23.1q,v26.2d,v23.2d
    399 	pmull	v5.1q,v27.1d,v5.1d
    400 
    401 	eor	v29.16b,v29.16b,v7.16b
    402 	eor	v31.16b,v31.16b,v23.16b
    403 	eor	v30.16b,v30.16b,v5.16b
    404 
    405 	subs	x3,x3,#128
    406 	b.lo	.Ltail4x
    407 
    408 	b	.Loop4x
    409 
    410 .align	4
    411 .Loop4x:
    412 	eor	v16.16b,v4.16b,v0.16b
    413 	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
    414 	ext	v3.16b,v16.16b,v16.16b,#8
    415 #ifndef __AARCH64EB__
    416 	rev64	v5.16b,v5.16b
    417 	rev64	v6.16b,v6.16b
    418 	rev64	v7.16b,v7.16b
    419 	rev64	v4.16b,v4.16b
    420 #endif
    421 
    422 	pmull	v0.1q,v28.1d,v3.1d		//H^4*(Xi+Ii)
    423 	eor	v16.16b,v16.16b,v3.16b
    424 	pmull2	v2.1q,v28.2d,v3.2d
    425 	ext	v25.16b,v7.16b,v7.16b,#8
    426 	pmull2	v1.1q,v27.2d,v16.2d
    427 
    428 	eor	v0.16b,v0.16b,v29.16b
    429 	eor	v2.16b,v2.16b,v31.16b
    430 	ext	v24.16b,v6.16b,v6.16b,#8
    431 	eor	v1.16b,v1.16b,v30.16b
    432 	ext	v23.16b,v5.16b,v5.16b,#8
    433 
    434 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    435 	eor	v18.16b,v0.16b,v2.16b
    436 	pmull	v29.1q,v20.1d,v25.1d		//H*I[i+3]
    437 	eor	v7.16b,v7.16b,v25.16b
    438 	eor	v1.16b,v1.16b,v17.16b
    439 	pmull2	v31.1q,v20.2d,v25.2d
    440 	eor	v1.16b,v1.16b,v18.16b
    441 	pmull	v30.1q,v21.1d,v7.1d
    442 
    443 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    444 	ins	v2.d[0],v1.d[1]
    445 	ins	v1.d[1],v0.d[0]
    446 	pmull	v16.1q,v22.1d,v24.1d		//H^2*I[i+2]
    447 	eor	v6.16b,v6.16b,v24.16b
    448 	pmull2	v24.1q,v22.2d,v24.2d
    449 	eor	v0.16b,v1.16b,v18.16b
    450 	pmull2	v6.1q,v21.2d,v6.2d
    451 
    452 	eor	v29.16b,v29.16b,v16.16b
    453 	eor	v31.16b,v31.16b,v24.16b
    454 	eor	v30.16b,v30.16b,v6.16b
    455 
    456 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    457 	pmull	v0.1q,v0.1d,v19.1d
    458 	pmull	v7.1q,v26.1d,v23.1d		//H^3*I[i+1]
    459 	eor	v5.16b,v5.16b,v23.16b
    460 	eor	v18.16b,v18.16b,v2.16b
    461 	pmull2	v23.1q,v26.2d,v23.2d
    462 	pmull	v5.1q,v27.1d,v5.1d
    463 
    464 	eor	v0.16b,v0.16b,v18.16b
    465 	eor	v29.16b,v29.16b,v7.16b
    466 	eor	v31.16b,v31.16b,v23.16b
    467 	ext	v0.16b,v0.16b,v0.16b,#8
    468 	eor	v30.16b,v30.16b,v5.16b
    469 
    470 	subs	x3,x3,#64
    471 	b.hs	.Loop4x
    472 
    473 .Ltail4x:
    474 	eor	v16.16b,v4.16b,v0.16b
    475 	ext	v3.16b,v16.16b,v16.16b,#8
    476 
    477 	pmull	v0.1q,v28.1d,v3.1d		//H^4*(Xi+Ii)
    478 	eor	v16.16b,v16.16b,v3.16b
    479 	pmull2	v2.1q,v28.2d,v3.2d
    480 	pmull2	v1.1q,v27.2d,v16.2d
    481 
    482 	eor	v0.16b,v0.16b,v29.16b
    483 	eor	v2.16b,v2.16b,v31.16b
    484 	eor	v1.16b,v1.16b,v30.16b
    485 
    486 	adds	x3,x3,#64
    487 	b.eq	.Ldone4x
    488 
    489 	cmp	x3,#32
    490 	b.lo	.Lone
    491 	b.eq	.Ltwo
        // Fall-through: x3 is above 32. Three more blocks are loaded and
        // multiplied by H^3, H^2 and H respectively.
    492 .Lthree:
    493 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    494 	eor	v18.16b,v0.16b,v2.16b
    495 	eor	v1.16b,v1.16b,v17.16b
    496 	ld1	{v4.2d,v5.2d,v6.2d},[x2]
    497 	eor	v1.16b,v1.16b,v18.16b
    498 #ifndef	__AARCH64EB__
    499 	rev64	v5.16b,v5.16b
    500 	rev64	v6.16b,v6.16b
    501 	rev64	v4.16b,v4.16b
    502 #endif
    503 
    504 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    505 	ins	v2.d[0],v1.d[1]
    506 	ins	v1.d[1],v0.d[0]
    507 	ext	v24.16b,v6.16b,v6.16b,#8
    508 	ext	v23.16b,v5.16b,v5.16b,#8
    509 	eor	v0.16b,v1.16b,v18.16b
    510 
    511 	pmull	v29.1q,v20.1d,v24.1d		//H*I[i+2]
    512 	eor	v6.16b,v6.16b,v24.16b
    513 
    514 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    515 	pmull	v0.1q,v0.1d,v19.1d
    516 	eor	v18.16b,v18.16b,v2.16b
    517 	pmull2	v31.1q,v20.2d,v24.2d
    518 	pmull	v30.1q,v21.1d,v6.1d
    519 	eor	v0.16b,v0.16b,v18.16b
    520 	pmull	v7.1q,v22.1d,v23.1d		//H^2*I[i+1]
    521 	eor	v5.16b,v5.16b,v23.16b
    522 	ext	v0.16b,v0.16b,v0.16b,#8
    523 
    524 	pmull2	v23.1q,v22.2d,v23.2d
    525 	eor	v16.16b,v4.16b,v0.16b
    526 	pmull2	v5.1q,v21.2d,v5.2d
    527 	ext	v3.16b,v16.16b,v16.16b,#8
    528 
    529 	eor	v29.16b,v29.16b,v7.16b
    530 	eor	v31.16b,v31.16b,v23.16b
    531 	eor	v30.16b,v30.16b,v5.16b
    532 
    533 	pmull	v0.1q,v26.1d,v3.1d		//H^3*(Xi+Ii)
    534 	eor	v16.16b,v16.16b,v3.16b
    535 	pmull2	v2.1q,v26.2d,v3.2d
    536 	pmull	v1.1q,v27.1d,v16.1d
    537 
    538 	eor	v0.16b,v0.16b,v29.16b
    539 	eor	v2.16b,v2.16b,v31.16b
    540 	eor	v1.16b,v1.16b,v30.16b
    541 	b	.Ldone4x
    542 
    543 .align	4
        // x3 is exactly 32: two more blocks are loaded and multiplied by H^2 and H.
    544 .Ltwo:
    545 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    546 	eor	v18.16b,v0.16b,v2.16b
    547 	eor	v1.16b,v1.16b,v17.16b
    548 	ld1	{v4.2d,v5.2d},[x2]
    549 	eor	v1.16b,v1.16b,v18.16b
    550 #ifndef	__AARCH64EB__
    551 	rev64	v5.16b,v5.16b
    552 	rev64	v4.16b,v4.16b
    553 #endif
    554 
    555 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    556 	ins	v2.d[0],v1.d[1]
    557 	ins	v1.d[1],v0.d[0]
    558 	ext	v23.16b,v5.16b,v5.16b,#8
    559 	eor	v0.16b,v1.16b,v18.16b
    560 
    561 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    562 	pmull	v0.1q,v0.1d,v19.1d
    563 	eor	v18.16b,v18.16b,v2.16b
    564 	eor	v0.16b,v0.16b,v18.16b
    565 	ext	v0.16b,v0.16b,v0.16b,#8
    566 
    567 	pmull	v29.1q,v20.1d,v23.1d		//H*I[i+1]
    568 	eor	v5.16b,v5.16b,v23.16b
    569 
    570 	eor	v16.16b,v4.16b,v0.16b
    571 	ext	v3.16b,v16.16b,v16.16b,#8
    572 
    573 	pmull2	v31.1q,v20.2d,v23.2d
    574 	pmull	v30.1q,v21.1d,v5.1d
    575 
    576 	pmull	v0.1q,v22.1d,v3.1d		//H^2*(Xi+Ii)
    577 	eor	v16.16b,v16.16b,v3.16b
    578 	pmull2	v2.1q,v22.2d,v3.2d
    579 	pmull2	v1.1q,v21.2d,v16.2d
    580 
    581 	eor	v0.16b,v0.16b,v29.16b
    582 	eor	v2.16b,v2.16b,v31.16b
    583 	eor	v1.16b,v1.16b,v30.16b
    584 	b	.Ldone4x
    585 
    586 .align	4
        // x3 is below 32: one more block is loaded and multiplied by H.
    587 .Lone:
    588 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    589 	eor	v18.16b,v0.16b,v2.16b
    590 	eor	v1.16b,v1.16b,v17.16b
    591 	ld1	{v4.2d},[x2]
    592 	eor	v1.16b,v1.16b,v18.16b
    593 #ifndef	__AARCH64EB__
    594 	rev64	v4.16b,v4.16b
    595 #endif
    596 
    597 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    598 	ins	v2.d[0],v1.d[1]
    599 	ins	v1.d[1],v0.d[0]
    600 	eor	v0.16b,v1.16b,v18.16b
    601 
    602 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    603 	pmull	v0.1q,v0.1d,v19.1d
    604 	eor	v18.16b,v18.16b,v2.16b
    605 	eor	v0.16b,v0.16b,v18.16b
    606 	ext	v0.16b,v0.16b,v0.16b,#8
    607 
    608 	eor	v16.16b,v4.16b,v0.16b
    609 	ext	v3.16b,v16.16b,v16.16b,#8
    610 
    611 	pmull	v0.1q,v20.1d,v3.1d		//H*(Xi+Ii)
    612 	eor	v16.16b,v16.16b,v3.16b
    613 	pmull2	v2.1q,v20.2d,v3.2d
    614 	pmull	v1.1q,v21.1d,v16.1d
    615 
        // Common exit: final Karatsuba recombination and reduction of the
        // products left in v0/v1/v2, then Xi is stored back through x0.
    616 .Ldone4x:
    617 	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
    618 	eor	v18.16b,v0.16b,v2.16b
    619 	eor	v1.16b,v1.16b,v17.16b
    620 	eor	v1.16b,v1.16b,v18.16b
    621 
    622 	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
    623 	ins	v2.d[0],v1.d[1]
    624 	ins	v1.d[1],v0.d[0]
    625 	eor	v0.16b,v1.16b,v18.16b
    626 
    627 	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
    628 	pmull	v0.1q,v0.1d,v19.1d
    629 	eor	v18.16b,v18.16b,v2.16b
    630 	eor	v0.16b,v0.16b,v18.16b
    631 	ext	v0.16b,v0.16b,v0.16b,#8
    632 
    633 #ifndef __AARCH64EB__
    634 	rev64	v0.16b,v0.16b
    635 #endif
    636 	st1	{v0.2d},[x0]		//write out Xi
    637 
    638 	ret
    639 .size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
    640 .section	.rodata
        // NUL-terminated ASCII banner spelled out as decimal byte values:
        // "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
        // No label is attached, so nothing in this file refers to it.
    641 .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
        // The second .align below repeats the first and has no further effect.
    642 .align	2
    643 .align	2
    644 #endif
    645