#include "arm_arch.h"

// GHASH (the universal hash used by GCM) for AArch64, built on the
// PMULL/PMULL2 64x64->128 carry-less multiply instructions.
//
// Entry points in this file:
//   gcm_init_v8      - expand the hash key H into a table of twisted powers
//   gcm_gmult_v8     - Xi = Xi * H                  (single block)
//   gcm_ghash_v8     - Xi = (Xi ^ inp[i]) * H ...   (bulk, 2 blocks/iteration)
//   gcm_ghash_v8_4x  - local continuation of gcm_ghash_v8 for len >= 64
//                      (4 blocks/iteration)
//
// Table layout written by gcm_init_v8 (16-byte entries, 12 entries = 192 bytes):
//   [0] twisted H     [1] packed (H.lo^H.hi | H^2.lo^H^2.hi)     [2] H^2
//   [3] H^3           [4] packed Karatsuba terms for H^3 / H^4   [5] H^4
//   [6] H^5           [7] packed Karatsuba terms for H^5 / H^6   [8] H^6
//   [9] H^7           [10] packed Karatsuba terms for H^7 / H^8  [11] H^8
//
// Register usage common to all routines: only v0-v7 and v16-v31 are touched,
// so the callee-saved v8-v15 are never modified and nothing is spilled to the
// stack. v19 always holds the reduction constant (0xc2 in the top byte of
// each 64-bit lane).
//
// Every multiplication follows the same pattern:
//   Karatsuba: lo = a.lo*b.lo, hi = a.hi*b.hi, mid = (a.lo^a.hi)*(b.lo^b.hi)
//   post-processing folds lo/hi into mid, then a two-phase reduction with
//   the 0xc2 constant brings the 256-bit product back to 128 bits.
//
// NOTE(review): AARCH64_VALID_CALL_TARGET comes from arm_arch.h, which is not
// visible here - presumably a BTI landing pad; confirm.

#if __ARM_MAX_ARCH__>=7
.arch   armv8-a+crypto
.text

//-----------------------------------------------------------------------
// gcm_init_v8
// In:    x0 = destination table (192 bytes are written, see layout above)
//        x1 = pointer to the 16-byte hash key H
// Out:   table filled; x0 is advanced by 144 as a side effect
// Clobb: v0-v7, v16-v31
// NOTE(review): presumably void gcm_init_v8(u128 Htable[16], const u64 H[2])
//               - prototype is not visible in this file, confirm.
//-----------------------------------------------------------------------
.globl  gcm_init_v8
.type   gcm_init_v8,%function
.align  4
gcm_init_v8:
        AARCH64_VALID_CALL_TARGET
        ld1     {v17.2d},[x1]                   //load input H
        movi    v19.16b,#0xe1
        shl     v19.2d,v19.2d,#57               //0xc2.0: each lane = 0xc200000000000000
        ext     v3.16b,v17.16b,v17.16b,#8       //swap the 64-bit halves of H
        ushr    v18.2d,v19.2d,#63               //each lane = 1
        dup     v17.4s,v17.s[1]                 //replicate the word holding the top bit
        ext     v16.16b,v18.16b,v19.16b,#8      //t0=0xc2....01
        ushr    v18.2d,v3.2d,#63                //bit shifted out of each 64-bit half
        sshr    v17.4s,v17.4s,#31               //broadcast carry bit (all-ones or zero)
        and     v18.16b,v18.16b,v16.16b
        shl     v3.2d,v3.2d,#1
        ext     v18.16b,v18.16b,v18.16b,#8      //move cross-lane carry into position
        and     v16.16b,v16.16b,v17.16b         //constant is applied only if carry was set
        orr     v3.16b,v3.16b,v18.16b           //H<<<=1
        eor     v20.16b,v3.16b,v16.16b          //twisted H
        st1     {v20.2d},[x0],#16               //store Htable[0]

        //calculate H^2
        ext     v16.16b,v20.16b,v20.16b,#8      //Karatsuba pre-processing
        pmull   v0.1q,v20.1d,v20.1d             //lo = H.lo*H.lo
        eor     v16.16b,v16.16b,v20.16b         //v16 = H.lo^H.hi (in both halves)
        pmull2  v2.1q,v20.2d,v20.2d             //hi = H.hi*H.hi
        pmull   v1.1q,v16.1d,v16.1d             //mid = (H.lo^H.hi)^2

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase

        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v22.16b,v0.16b,v18.16b          //v22 = H^2

        ext     v17.16b,v22.16b,v22.16b,#8      //Karatsuba pre-processing
        eor     v17.16b,v17.16b,v22.16b         //v17 = H^2.lo^H^2.hi
        ext     v21.16b,v16.16b,v17.16b,#8      //pack Karatsuba pre-processed
        st1     {v21.2d,v22.2d},[x0],#32        //store Htable[1..2]

        //calculate H^3 and H^4
        //Two independent multiplications are interleaved from here on:
        //the "X" chain in v0/v1/v2 and the "Y" chain in v5/v6/v7.
        pmull   v0.1q,v20.1d, v22.1d            //X = H * H^2
        pmull   v5.1q,v22.1d,v22.1d             //Y = H^2 * H^2
        pmull2  v2.1q,v20.2d, v22.2d
        pmull2  v7.1q,v22.2d,v22.2d
        pmull   v1.1q,v16.1d,v17.1d
        pmull   v6.1q,v17.1d,v17.1d

        ext     v16.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        ext     v17.16b,v5.16b,v7.16b,#8
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v16.16b
        eor     v4.16b,v5.16b,v7.16b
        eor     v6.16b,v6.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase
        eor     v6.16b,v6.16b,v4.16b
        pmull   v4.1q,v5.1d,v19.1d

        ins     v2.d[0],v1.d[1]
        ins     v7.d[0],v6.d[1]
        ins     v1.d[1],v0.d[0]
        ins     v6.d[1],v5.d[0]
        eor     v0.16b,v1.16b,v18.16b
        eor     v5.16b,v6.16b,v4.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase
        ext     v4.16b,v5.16b,v5.16b,#8
        pmull   v0.1q,v0.1d,v19.1d
        pmull   v5.1q,v5.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v4.16b,v4.16b,v7.16b
        eor     v23.16b, v0.16b,v18.16b         //H^3
        eor     v25.16b,v5.16b,v4.16b           //H^4

        ext     v16.16b,v23.16b, v23.16b,#8     //Karatsuba pre-processing
        ext     v17.16b,v25.16b,v25.16b,#8
        ext     v18.16b,v22.16b,v22.16b,#8
        eor     v16.16b,v16.16b,v23.16b         //v16 = H^3.lo^H^3.hi
        eor     v17.16b,v17.16b,v25.16b         //v17 = H^4.lo^H^4.hi
        eor     v18.16b,v18.16b,v22.16b         //v18 = H^2.lo^H^2.hi (reused below)
        ext     v24.16b,v16.16b,v17.16b,#8      //pack Karatsuba pre-processed
        st1     {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5]

        //calculate H^5 and H^6
        pmull   v0.1q,v22.1d, v23.1d            //X = H^2 * H^3
        pmull   v5.1q,v23.1d,v23.1d             //Y = H^3 * H^3
        pmull2  v2.1q,v22.2d, v23.2d
        pmull2  v7.1q,v23.2d,v23.2d
        pmull   v1.1q,v16.1d,v18.1d
        pmull   v6.1q,v16.1d,v16.1d

        ext     v16.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        ext     v17.16b,v5.16b,v7.16b,#8
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v16.16b
        eor     v4.16b,v5.16b,v7.16b
        eor     v6.16b,v6.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase
        eor     v6.16b,v6.16b,v4.16b
        pmull   v4.1q,v5.1d,v19.1d

        ins     v2.d[0],v1.d[1]
        ins     v7.d[0],v6.d[1]
        ins     v1.d[1],v0.d[0]
        ins     v6.d[1],v5.d[0]
        eor     v0.16b,v1.16b,v18.16b
        eor     v5.16b,v6.16b,v4.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase
        ext     v4.16b,v5.16b,v5.16b,#8
        pmull   v0.1q,v0.1d,v19.1d
        pmull   v5.1q,v5.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v4.16b,v4.16b,v7.16b
        eor     v26.16b,v0.16b,v18.16b          //H^5
        eor     v28.16b,v5.16b,v4.16b           //H^6

        ext     v16.16b,v26.16b, v26.16b,#8     //Karatsuba pre-processing
        ext     v17.16b,v28.16b,v28.16b,#8
        ext     v18.16b,v22.16b,v22.16b,#8
        eor     v16.16b,v16.16b,v26.16b         //v16 = H^5.lo^H^5.hi
        eor     v17.16b,v17.16b,v28.16b         //v17 = H^6.lo^H^6.hi
        eor     v18.16b,v18.16b,v22.16b         //v18 = H^2.lo^H^2.hi (reused below)
        ext     v27.16b,v16.16b,v17.16b,#8      //pack Karatsuba pre-processed
        st1     {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8]

        //calculate H^7 and H^8
        pmull   v0.1q,v22.1d,v26.1d             //X = H^2 * H^5
        pmull   v5.1q,v22.1d,v28.1d             //Y = H^2 * H^6
        pmull2  v2.1q,v22.2d,v26.2d
        pmull2  v7.1q,v22.2d,v28.2d
        pmull   v1.1q,v16.1d,v18.1d
        pmull   v6.1q,v17.1d,v18.1d

        ext     v16.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        ext     v17.16b,v5.16b,v7.16b,#8
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v16.16b
        eor     v4.16b,v5.16b,v7.16b
        eor     v6.16b,v6.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase
        eor     v6.16b,v6.16b,v4.16b
        pmull   v4.1q,v5.1d,v19.1d

        ins     v2.d[0],v1.d[1]
        ins     v7.d[0],v6.d[1]
        ins     v1.d[1],v0.d[0]
        ins     v6.d[1],v5.d[0]
        eor     v0.16b,v1.16b,v18.16b
        eor     v5.16b,v6.16b,v4.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase
        ext     v4.16b,v5.16b,v5.16b,#8
        pmull   v0.1q,v0.1d,v19.1d
        pmull   v5.1q,v5.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v4.16b,v4.16b,v7.16b
        eor     v29.16b,v0.16b,v18.16b          //H^7
        eor     v31.16b,v5.16b,v4.16b           //H^8

        ext     v16.16b,v29.16b,v29.16b,#8      //Karatsuba pre-processing
        ext     v17.16b,v31.16b,v31.16b,#8
        eor     v16.16b,v16.16b,v29.16b
        eor     v17.16b,v17.16b,v31.16b
        ext     v30.16b,v16.16b,v17.16b,#8      //pack Karatsuba pre-processed
        st1     {v29.2d,v30.2d,v31.2d},[x0]     //store Htable[9..11]
        ret
.size   gcm_init_v8,.-gcm_init_v8

//-----------------------------------------------------------------------
// gcm_gmult_v8 - multiply the 16-byte accumulator by H, in place
// In:    x0 = pointer to Xi (16 bytes, read and written back)
//        x1 = pointer to the table from gcm_init_v8 (entries [0..1] are read)
// Out:   Xi updated in memory
// Clobb: v0-v3, v17-v21
// NOTE(review): presumably void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16])
//               - prototype is not visible in this file, confirm.
//-----------------------------------------------------------------------
.globl  gcm_gmult_v8
.type   gcm_gmult_v8,%function
.align  4
gcm_gmult_v8:
        AARCH64_VALID_CALL_TARGET
        ld1     {v17.2d},[x0]                   //load Xi
        movi    v19.16b,#0xe1
        ld1     {v20.2d,v21.2d},[x1]            //load twisted H, ...
        shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
#ifndef __AARCH64EB__
        rev64   v17.16b,v17.16b                 //little-endian build: byte-swap each 64-bit lane
#endif
        ext     v3.16b,v17.16b,v17.16b,#8       //swap 64-bit halves

        pmull   v0.1q,v20.1d,v3.1d              //H.lo*Xi.lo
        eor     v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v20.2d,v3.2d              //H.hi*Xi.hi
        pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)*(Xi.lo+Xi.hi)

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction

        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v0.16b,v0.16b,v18.16b

#ifndef __AARCH64EB__
        rev64   v0.16b,v0.16b                   //undo the byte swap done on load
#endif
        ext     v0.16b,v0.16b,v0.16b,#8
        st1     {v0.2d},[x0]                    //write out Xi

        ret
.size   gcm_gmult_v8,.-gcm_gmult_v8

//-----------------------------------------------------------------------
// gcm_ghash_v8 - fold a run of 16-byte blocks into the accumulator Xi
// In:    x0 = pointer to Xi (16 bytes, read and written back)
//        x1 = pointer to the table from gcm_init_v8
//        x2 = input pointer
//        x3 = input length in bytes
// Out:   Xi updated in memory
// Clobb: x1, x2, x3, x12, flags, v0-v7, v16-v22 (more in the 4x path)
// Inputs of 64 bytes or more are handed to the 4x code below; shorter
// inputs are processed here two blocks per iteration with an odd tail.
// NOTE(review): the first block is loaded unconditionally and the length is
//               only ever stepped by multiples of 16, so x3 is presumably a
//               non-zero multiple of 16 - confirm against callers.
//-----------------------------------------------------------------------
.globl  gcm_ghash_v8
.type   gcm_ghash_v8,%function
.align  4
gcm_ghash_v8:
        AARCH64_VALID_CALL_TARGET
        cmp     x3,#64
        b.hs    .Lgcm_ghash_v8_4x               //unsigned len >= 64: use 4x path
        ld1     {v0.2d},[x0]                    //load [rotated] Xi
                                                //"[rotated]" means that
                                                //loaded value would have
                                                //to be rotated in order to
                                                //make it appear as in
                                                //algorithm specification
        subs    x3,x3,#32                       //see if x3 is 32 or larger
        mov     x12,#16                         //x12 is used as post-
                                                //increment for input pointer;
                                                //as loop is modulo-scheduled
                                                //x12 is zeroed just in time
                                                //to preclude overstepping
                                                //inp[len], which means that
                                                //last block[s] are actually
                                                //loaded twice, but last
                                                //copy is not processed
        ld1     {v20.2d,v21.2d},[x1],#32        //load twisted H, ..., H^2
        movi    v19.16b,#0xe1
        ld1     {v22.2d},[x1]
        csel    x12,xzr,x12,eq                  //is it time to zero x12?
        ext     v0.16b,v0.16b,v0.16b,#8         //rotate Xi
        ld1     {v16.2d},[x2],#16               //load [rotated] I[0]
        shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
#ifndef __AARCH64EB__
        rev64   v16.16b,v16.16b
        rev64   v0.16b,v0.16b
#endif
        ext     v3.16b,v16.16b,v16.16b,#8       //rotate I[0]
        b.lo    .Lodd_tail_v8                   //x3 was less than 32
        ld1     {v17.2d},[x2],x12               //load [rotated] I[1]
#ifndef __AARCH64EB__
        rev64   v17.16b,v17.16b
#endif
        ext     v7.16b,v17.16b,v17.16b,#8
        eor     v3.16b,v3.16b,v0.16b            //I[i]^=Xi
        pmull   v4.1q,v20.1d,v7.1d              //H*I[i+1]
        eor     v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
        pmull2  v6.1q,v20.2d,v7.2d
        b       .Loop_mod2x_v8

.align  4
.Loop_mod2x_v8:
        //On entry: v3 = Xi^I[i], v4/v6 = lo/hi of H*I[i+1],
        //v17 = Karatsuba sum of I[i+1], flags/x3 from the last subs.
        ext     v18.16b,v3.16b,v3.16b,#8
        subs    x3,x3,#32                       //is there more data?
        pmull   v0.1q,v22.1d,v3.1d              //H^2.lo*Xi.lo
        csel    x12,xzr,x12,lo                  //is it time to zero x12?

        pmull   v5.1q,v21.1d,v17.1d
        eor     v18.16b,v18.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v22.2d,v3.2d              //H^2.hi*Xi.hi
        eor     v0.16b,v0.16b,v4.16b            //accumulate
        pmull2  v1.1q,v21.2d,v18.2d             //(H^2.lo+H^2.hi)*(Xi.lo+Xi.hi)
        ld1     {v16.2d},[x2],x12               //load [rotated] I[i+2]

        eor     v2.16b,v2.16b,v6.16b
        csel    x12,xzr,x12,eq                  //is it time to zero x12?
        eor     v1.16b,v1.16b,v5.16b

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        ld1     {v17.2d},[x2],x12               //load [rotated] I[i+3]
#ifndef __AARCH64EB__
        rev64   v16.16b,v16.16b
#endif
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction

#ifndef __AARCH64EB__
        rev64   v17.16b,v17.16b
#endif
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        ext     v7.16b,v17.16b,v17.16b,#8
        ext     v3.16b,v16.16b,v16.16b,#8
        eor     v0.16b,v1.16b,v18.16b
        pmull   v4.1q,v20.1d,v7.1d              //H*I[i+1]
        eor     v3.16b,v3.16b,v2.16b            //accumulate v3.16b early

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v3.16b,v3.16b,v18.16b
        eor     v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
        eor     v3.16b,v3.16b,v0.16b
        pmull2  v6.1q,v20.2d,v7.2d
        b.hs    .Loop_mod2x_v8                  //there was at least 32 more bytes

        //Loop exit: undo the work done ahead of time for the next iteration.
        eor     v2.16b,v2.16b,v18.16b
        ext     v3.16b,v16.16b,v16.16b,#8       //re-construct v3.16b
        adds    x3,x3,#32                       //re-construct x3
        eor     v0.16b,v0.16b,v2.16b            //re-construct v0.16b
        b.eq    .Ldone_v8                       //is x3 zero?
.Lodd_tail_v8:
        //One block left: Xi = (Xi ^ I) * H
        ext     v18.16b,v0.16b,v0.16b,#8
        eor     v3.16b,v3.16b,v0.16b            //inp^=Xi
        eor     v17.16b,v16.16b,v18.16b         //v17.16b is rotated inp^Xi

        pmull   v0.1q,v20.1d,v3.1d              //H.lo*Xi.lo
        eor     v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v20.2d,v3.2d              //H.hi*Xi.hi
        pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)*(Xi.lo+Xi.hi)

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction

        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __AARCH64EB__
        rev64   v0.16b,v0.16b
#endif
        ext     v0.16b,v0.16b,v0.16b,#8
        st1     {v0.2d},[x0]                    //write out Xi

        ret
.size   gcm_ghash_v8,.-gcm_ghash_v8

//-----------------------------------------------------------------------
// gcm_ghash_v8_4x - four-blocks-per-iteration body of gcm_ghash_v8
// Not exported and has no call-target marker; within this file it is only
// reached by the direct branch at the top of gcm_ghash_v8 (x3 >= 64).
// In:    x0 = Xi, x1 = table, x2 = input, x3 = length (as gcm_ghash_v8)
// Clobb: x1, x2, x3, flags, v0-v7, v16-v31
// Per group of four blocks the result is
//   (Xi^I[i])*H^4 ^ I[i+1]*H^3 ^ I[i+2]*H^2 ^ I[i+3]*H
// with the three products that do not depend on Xi (accumulated in
// v29/v30/v31 as lo/mid/hi) computed one iteration ahead.
//-----------------------------------------------------------------------
.type   gcm_ghash_v8_4x,%function
.align  4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
        ld1     {v0.2d},[x0]                    //load [rotated] Xi
        ld1     {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
        movi    v19.16b,#0xe1
        ld1     {v26.2d,v27.2d,v28.2d},[x1]     //load twisted H^3, ..., H^4
        shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant

        ld1     {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64      //first four input blocks
#ifndef __AARCH64EB__
        rev64   v0.16b,v0.16b
        rev64   v5.16b,v5.16b
        rev64   v6.16b,v6.16b
        rev64   v7.16b,v7.16b
        rev64   v4.16b,v4.16b
#endif
        ext     v25.16b,v7.16b,v7.16b,#8
        ext     v24.16b,v6.16b,v6.16b,#8
        ext     v23.16b,v5.16b,v5.16b,#8

        pmull   v29.1q,v20.1d,v25.1d            //H*I[i+3]
        eor     v7.16b,v7.16b,v25.16b
        pmull2  v31.1q,v20.2d,v25.2d
        pmull   v30.1q,v21.1d,v7.1d

        pmull   v16.1q,v22.1d,v24.1d            //H^2*I[i+2]
        eor     v6.16b,v6.16b,v24.16b
        pmull2  v24.1q,v22.2d,v24.2d
        pmull2  v6.1q,v21.2d,v6.2d

        eor     v29.16b,v29.16b,v16.16b
        eor     v31.16b,v31.16b,v24.16b
        eor     v30.16b,v30.16b,v6.16b

        pmull   v7.1q,v26.1d,v23.1d             //H^3*I[i+1]
        eor     v5.16b,v5.16b,v23.16b
        pmull2  v23.1q,v26.2d,v23.2d
        pmull   v5.1q,v27.1d,v5.1d

        eor     v29.16b,v29.16b,v7.16b
        eor     v31.16b,v31.16b,v23.16b
        eor     v30.16b,v30.16b,v5.16b

        subs    x3,x3,#128                      //64 already loaded; is there a full 64 more?
        b.lo    .Ltail4x

        b       .Loop4x

.align  4
.Loop4x:
        eor     v16.16b,v4.16b,v0.16b           //Xi ^ I[i]
        ld1     {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64      //next four blocks
        ext     v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
        rev64   v5.16b,v5.16b
        rev64   v6.16b,v6.16b
        rev64   v7.16b,v7.16b
        rev64   v4.16b,v4.16b
#endif

        pmull   v0.1q,v28.1d,v3.1d              //H^4*(Xi+I[i])
        eor     v16.16b,v16.16b,v3.16b
        pmull2  v2.1q,v28.2d,v3.2d
        ext     v25.16b,v7.16b,v7.16b,#8
        pmull2  v1.1q,v27.2d,v16.2d

        eor     v0.16b,v0.16b,v29.16b           //add the three precomputed products
        eor     v2.16b,v2.16b,v31.16b
        ext     v24.16b,v6.16b,v6.16b,#8
        eor     v1.16b,v1.16b,v30.16b
        ext     v23.16b,v5.16b,v5.16b,#8

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        pmull   v29.1q,v20.1d,v25.1d            //H*I[i+3]
        eor     v7.16b,v7.16b,v25.16b
        eor     v1.16b,v1.16b,v17.16b
        pmull2  v31.1q,v20.2d,v25.2d
        eor     v1.16b,v1.16b,v18.16b
        pmull   v30.1q,v21.1d,v7.1d

        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        pmull   v16.1q,v22.1d,v24.1d            //H^2*I[i+2]
        eor     v6.16b,v6.16b,v24.16b
        pmull2  v24.1q,v22.2d,v24.2d
        eor     v0.16b,v1.16b,v18.16b
        pmull2  v6.1q,v21.2d,v6.2d

        eor     v29.16b,v29.16b,v16.16b
        eor     v31.16b,v31.16b,v24.16b
        eor     v30.16b,v30.16b,v6.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        pmull   v7.1q,v26.1d,v23.1d             //H^3*I[i+1]
        eor     v5.16b,v5.16b,v23.16b
        eor     v18.16b,v18.16b,v2.16b
        pmull2  v23.1q,v26.2d,v23.2d
        pmull   v5.1q,v27.1d,v5.1d

        eor     v0.16b,v0.16b,v18.16b
        eor     v29.16b,v29.16b,v7.16b
        eor     v31.16b,v31.16b,v23.16b
        ext     v0.16b,v0.16b,v0.16b,#8
        eor     v30.16b,v30.16b,v5.16b

        subs    x3,x3,#64
        b.hs    .Loop4x                         //at least 64 more bytes to load

.Ltail4x:
        //Finish the group of four blocks already in registers
        //(product left unreduced in v0/v1/v2).
        eor     v16.16b,v4.16b,v0.16b
        ext     v3.16b,v16.16b,v16.16b,#8

        pmull   v0.1q,v28.1d,v3.1d              //H^4*(Xi+I[i])
        eor     v16.16b,v16.16b,v3.16b
        pmull2  v2.1q,v28.2d,v3.2d
        pmull2  v1.1q,v27.2d,v16.2d

        eor     v0.16b,v0.16b,v29.16b
        eor     v2.16b,v2.16b,v31.16b
        eor     v1.16b,v1.16b,v30.16b

        adds    x3,x3,#64                       //x3 = bytes still unread
        b.eq    .Ldone4x

        cmp     x3,#32                          //dispatch on the remaining length
        b.lo    .Lone                           //below 32
        b.eq    .Ltwo                           //exactly 32
.Lthree:                                        //above 32: three blocks are loaded
        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        ld1     {v4.2d,v5.2d,v6.2d},[x2]
        eor     v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
        rev64   v5.16b,v5.16b
        rev64   v6.16b,v6.16b
        rev64   v4.16b,v4.16b
#endif

        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        ext     v24.16b,v6.16b,v6.16b,#8
        ext     v23.16b,v5.16b,v5.16b,#8
        eor     v0.16b,v1.16b,v18.16b

        pmull   v29.1q,v20.1d,v24.1d            //H*I[i+2]
        eor     v6.16b,v6.16b,v24.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        pmull2  v31.1q,v20.2d,v24.2d
        pmull   v30.1q,v21.1d,v6.1d
        eor     v0.16b,v0.16b,v18.16b
        pmull   v7.1q,v22.1d,v23.1d             //H^2*I[i+1]
        eor     v5.16b,v5.16b,v23.16b
        ext     v0.16b,v0.16b,v0.16b,#8

        pmull2  v23.1q,v22.2d,v23.2d
        eor     v16.16b,v4.16b,v0.16b
        pmull2  v5.1q,v21.2d,v5.2d
        ext     v3.16b,v16.16b,v16.16b,#8

        eor     v29.16b,v29.16b,v7.16b
        eor     v31.16b,v31.16b,v23.16b
        eor     v30.16b,v30.16b,v5.16b

        pmull   v0.1q,v26.1d,v3.1d              //H^3*(Xi+I[i])
        eor     v16.16b,v16.16b,v3.16b
        pmull2  v2.1q,v26.2d,v3.2d
        pmull   v1.1q,v27.1d,v16.1d

        eor     v0.16b,v0.16b,v29.16b
        eor     v2.16b,v2.16b,v31.16b
        eor     v1.16b,v1.16b,v30.16b
        b       .Ldone4x

.align  4
.Ltwo:                                          //two blocks remain
        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        ld1     {v4.2d,v5.2d},[x2]
        eor     v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
        rev64   v5.16b,v5.16b
        rev64   v4.16b,v4.16b
#endif

        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        ext     v23.16b,v5.16b,v5.16b,#8
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v0.16b,v0.16b,v18.16b
        ext     v0.16b,v0.16b,v0.16b,#8

        pmull   v29.1q,v20.1d,v23.1d            //H*I[i+1]
        eor     v5.16b,v5.16b,v23.16b

        eor     v16.16b,v4.16b,v0.16b
        ext     v3.16b,v16.16b,v16.16b,#8

        pmull2  v31.1q,v20.2d,v23.2d
        pmull   v30.1q,v21.1d,v5.1d

        pmull   v0.1q,v22.1d,v3.1d              //H^2*(Xi+I[i])
        eor     v16.16b,v16.16b,v3.16b
        pmull2  v2.1q,v22.2d,v3.2d
        pmull2  v1.1q,v21.2d,v16.2d

        eor     v0.16b,v0.16b,v29.16b
        eor     v2.16b,v2.16b,v31.16b
        eor     v1.16b,v1.16b,v30.16b
        b       .Ldone4x

.align  4
.Lone:                                          //one block remains
        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        ld1     {v4.2d},[x2]
        eor     v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
        rev64   v4.16b,v4.16b
#endif

        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v0.16b,v0.16b,v18.16b
        ext     v0.16b,v0.16b,v0.16b,#8

        eor     v16.16b,v4.16b,v0.16b
        ext     v3.16b,v16.16b,v16.16b,#8

        pmull   v0.1q,v20.1d,v3.1d              //H*(Xi+I[i])
        eor     v16.16b,v16.16b,v3.16b
        pmull2  v2.1q,v20.2d,v3.2d
        pmull   v1.1q,v21.1d,v16.1d

.Ldone4x:
        //Common exit: reduce the pending product in v0/v1/v2 and store Xi.
        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b

        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v0.16b,v0.16b,v18.16b
        ext     v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
        rev64   v0.16b,v0.16b
#endif
        st1     {v0.2d},[x0]                    //write out Xi

        ret
.size   gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
.section        .rodata
//NUL-terminated ASCII: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte   71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  2
.align  2
#endif