#include "arm_asm.h"
// Copyright 2022-2026 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD and AESE on AARCH64
//
// Dec 2022
//

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
.arch	armv8-a+crypto
.text

// Constant pool shared by every routine in this module.  It is addressed
// PC-relatively with adrp/:lo12: pairs (see the ldr q26..q31 sequences
// below), so it must remain in the same load segment as the code.
.type	_vpsm4_ex_consts,%object
.align	7
_vpsm4_ex_consts:
// SM4 key-schedule round constants CK[0..31], four per row
// (CK[i] byte j = 7*(4i+j) mod 256: 00,07,0E,15,...).
.Lck:
.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
// SM4 system parameter FK (a3b1bac6, 56aa3350, 677d9197, b27022dc),
// XORed into the user key before the schedule (see _vpsm4_ex_set_key).
.Lfk:
.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
// tbl byte-permutation pattern used between key-schedule rounds to
// rotate the 4-word key state by one word.
.Lshuffles:
.quad 0x0B0A090807060504,0x030201000F0E0D0C
// XTS tweak-feedback constant (0x87 reduction byte); halves swapped for
// big-endian so the loaded lane layout is identical on both endiannesses.
.Lxts_magic:
#ifndef __AARCH64EB__
.quad 0x0101010101010187,0x0101010101010101
#else
.quad 0x0101010101010101,0x0101010101010187
#endif
// Six 16-byte masks/LUTs (loaded into v26..v31) that build the SM4 sbox
// out of the AES sbox: nibble-wise tbl lookups before and after a single
// zero-key aese.  Quad order is swapped under __AARCH64EB__ so a plain
// ldr q yields the same register contents.
.Lsbox_magic:
#ifndef __AARCH64EB__
.quad 0x0b0e0104070a0d00,0x0306090c0f020508
.quad 0x62185a2042387a00,0x22581a6002783a40
.quad 0x15df62a89e54e923,0xc10bb67c4a803df7
.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
#else
.quad 0x0306090c0f020508,0x0b0e0104070a0d00
.quad 0x22581a6002783a40,0x62185a2042387a00
.quad 0xc10bb67c4a803df7,0x15df62a89e54e923
.quad 0x1407c6d56c7fbead,0xb9aa6b78c1d21300
.quad 0xe383c1a1fe9edcbc,0x6404462679195b3b
#endif
57 .quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f 58 59 .size _vpsm4_ex_consts,.-_vpsm4_ex_consts 60 .type _vpsm4_ex_set_key,%function 61 .align 4 62 _vpsm4_ex_set_key: 63 AARCH64_VALID_CALL_TARGET 64 ld1 {v5.4s},[x0] 65 adrp x9, .Lsbox_magic 66 ldr q26, [x9, #:lo12:.Lsbox_magic] 67 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 68 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 69 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 70 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 71 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 72 #ifndef __AARCH64EB__ 73 rev32 v5.16b,v5.16b 74 #endif 75 adrp x5,.Lshuffles 76 add x5,x5,#:lo12:.Lshuffles 77 ld1 {v7.2d},[x5] 78 adrp x5,.Lfk 79 add x5,x5,#:lo12:.Lfk 80 ld1 {v6.2d},[x5] 81 eor v5.16b,v5.16b,v6.16b 82 mov x6,#32 83 adrp x5,.Lck 84 add x5,x5,#:lo12:.Lck 85 movi v0.16b,#64 86 cbnz w2,1f 87 add x1,x1,124 88 1: 89 mov w7,v5.s[1] 90 ldr w8,[x5],#4 91 eor w8,w8,w7 92 mov w7,v5.s[2] 93 eor w8,w8,w7 94 mov w7,v5.s[3] 95 eor w8,w8,w7 96 // optimize sbox using AESE instruction 97 mov v4.s[0],w8 98 tbl v0.16b, {v4.16b}, v26.16b 99 ushr v2.16b, v0.16b, 4 100 and v0.16b, v0.16b, v31.16b 101 tbl v0.16b, {v28.16b}, v0.16b 102 tbl v2.16b, {v27.16b}, v2.16b 103 eor v0.16b, v0.16b, v2.16b 104 eor v1.16b, v1.16b, v1.16b 105 aese v0.16b,v1.16b 106 ushr v2.16b, v0.16b, 4 107 and v0.16b, v0.16b, v31.16b 108 tbl v0.16b, {v30.16b}, v0.16b 109 tbl v2.16b, {v29.16b}, v2.16b 110 eor v0.16b, v0.16b, v2.16b 111 mov w7,v0.s[0] 112 eor w8,w7,w7,ror #19 113 eor w8,w8,w7,ror #9 114 mov w7,v5.s[0] 115 eor w8,w8,w7 116 mov v5.s[0],w8 117 cbz w2,2f 118 str w8,[x1],#4 119 b 3f 120 2: 121 str w8,[x1],#-4 122 3: 123 tbl v5.16b,{v5.16b},v7.16b 124 subs x6,x6,#1 125 b.ne 1b 126 ret 127 .size _vpsm4_ex_set_key,.-_vpsm4_ex_set_key 128 .type _vpsm4_ex_enc_4blks,%function 129 .align 4 130 _vpsm4_ex_enc_4blks: 131 AARCH64_VALID_CALL_TARGET 132 mov x10,x3 133 mov w11,#8 134 10: 135 ldp w7,w8,[x10],8 136 dup v12.4s,w7 137 dup v13.4s,w8 138 139 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 140 eor v14.16b,v6.16b,v7.16b 141 eor 
v12.16b,v5.16b,v12.16b 142 eor v12.16b,v14.16b,v12.16b 143 // optimize sbox using AESE instruction 144 tbl v0.16b, {v12.16b}, v26.16b 145 ushr v24.16b, v0.16b, 4 146 and v0.16b, v0.16b, v31.16b 147 tbl v0.16b, {v28.16b}, v0.16b 148 tbl v24.16b, {v27.16b}, v24.16b 149 eor v0.16b, v0.16b, v24.16b 150 eor v1.16b, v1.16b, v1.16b 151 aese v0.16b,v1.16b 152 ushr v24.16b, v0.16b, 4 153 and v0.16b, v0.16b, v31.16b 154 tbl v0.16b, {v30.16b}, v0.16b 155 tbl v24.16b, {v29.16b}, v24.16b 156 eor v0.16b, v0.16b, v24.16b 157 mov v12.16b,v0.16b 158 159 // linear transformation 160 ushr v0.4s,v12.4s,32-2 161 ushr v1.4s,v12.4s,32-10 162 ushr v2.4s,v12.4s,32-18 163 ushr v3.4s,v12.4s,32-24 164 sli v0.4s,v12.4s,2 165 sli v1.4s,v12.4s,10 166 sli v2.4s,v12.4s,18 167 sli v3.4s,v12.4s,24 168 eor v24.16b,v0.16b,v12.16b 169 eor v24.16b,v24.16b,v1.16b 170 eor v12.16b,v2.16b,v3.16b 171 eor v12.16b,v12.16b,v24.16b 172 eor v4.16b,v4.16b,v12.16b 173 174 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 175 eor v14.16b,v14.16b,v4.16b 176 eor v13.16b,v14.16b,v13.16b 177 // optimize sbox using AESE instruction 178 tbl v0.16b, {v13.16b}, v26.16b 179 ushr v24.16b, v0.16b, 4 180 and v0.16b, v0.16b, v31.16b 181 tbl v0.16b, {v28.16b}, v0.16b 182 tbl v24.16b, {v27.16b}, v24.16b 183 eor v0.16b, v0.16b, v24.16b 184 eor v1.16b, v1.16b, v1.16b 185 aese v0.16b,v1.16b 186 ushr v24.16b, v0.16b, 4 187 and v0.16b, v0.16b, v31.16b 188 tbl v0.16b, {v30.16b}, v0.16b 189 tbl v24.16b, {v29.16b}, v24.16b 190 eor v0.16b, v0.16b, v24.16b 191 mov v13.16b,v0.16b 192 193 // linear transformation 194 ushr v0.4s,v13.4s,32-2 195 ushr v1.4s,v13.4s,32-10 196 ushr v2.4s,v13.4s,32-18 197 ushr v3.4s,v13.4s,32-24 198 sli v0.4s,v13.4s,2 199 sli v1.4s,v13.4s,10 200 sli v2.4s,v13.4s,18 201 sli v3.4s,v13.4s,24 202 eor v24.16b,v0.16b,v13.16b 203 eor v24.16b,v24.16b,v1.16b 204 eor v13.16b,v2.16b,v3.16b 205 eor v13.16b,v13.16b,v24.16b 206 ldp w7,w8,[x10],8 207 eor v5.16b,v5.16b,v13.16b 208 209 dup v12.4s,w7 210 dup v13.4s,w8 211 212 // B2 ^= SBOX(B0 ^ B1 ^ 
B3 ^ RK2) 213 eor v14.16b,v4.16b,v5.16b 214 eor v12.16b,v7.16b,v12.16b 215 eor v12.16b,v14.16b,v12.16b 216 // optimize sbox using AESE instruction 217 tbl v0.16b, {v12.16b}, v26.16b 218 ushr v24.16b, v0.16b, 4 219 and v0.16b, v0.16b, v31.16b 220 tbl v0.16b, {v28.16b}, v0.16b 221 tbl v24.16b, {v27.16b}, v24.16b 222 eor v0.16b, v0.16b, v24.16b 223 eor v1.16b, v1.16b, v1.16b 224 aese v0.16b,v1.16b 225 ushr v24.16b, v0.16b, 4 226 and v0.16b, v0.16b, v31.16b 227 tbl v0.16b, {v30.16b}, v0.16b 228 tbl v24.16b, {v29.16b}, v24.16b 229 eor v0.16b, v0.16b, v24.16b 230 mov v12.16b,v0.16b 231 232 // linear transformation 233 ushr v0.4s,v12.4s,32-2 234 ushr v1.4s,v12.4s,32-10 235 ushr v2.4s,v12.4s,32-18 236 ushr v3.4s,v12.4s,32-24 237 sli v0.4s,v12.4s,2 238 sli v1.4s,v12.4s,10 239 sli v2.4s,v12.4s,18 240 sli v3.4s,v12.4s,24 241 eor v24.16b,v0.16b,v12.16b 242 eor v24.16b,v24.16b,v1.16b 243 eor v12.16b,v2.16b,v3.16b 244 eor v12.16b,v12.16b,v24.16b 245 eor v6.16b,v6.16b,v12.16b 246 247 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 248 eor v14.16b,v14.16b,v6.16b 249 eor v13.16b,v14.16b,v13.16b 250 // optimize sbox using AESE instruction 251 tbl v0.16b, {v13.16b}, v26.16b 252 ushr v24.16b, v0.16b, 4 253 and v0.16b, v0.16b, v31.16b 254 tbl v0.16b, {v28.16b}, v0.16b 255 tbl v24.16b, {v27.16b}, v24.16b 256 eor v0.16b, v0.16b, v24.16b 257 eor v1.16b, v1.16b, v1.16b 258 aese v0.16b,v1.16b 259 ushr v24.16b, v0.16b, 4 260 and v0.16b, v0.16b, v31.16b 261 tbl v0.16b, {v30.16b}, v0.16b 262 tbl v24.16b, {v29.16b}, v24.16b 263 eor v0.16b, v0.16b, v24.16b 264 mov v13.16b,v0.16b 265 266 // linear transformation 267 ushr v0.4s,v13.4s,32-2 268 ushr v1.4s,v13.4s,32-10 269 ushr v2.4s,v13.4s,32-18 270 ushr v3.4s,v13.4s,32-24 271 sli v0.4s,v13.4s,2 272 sli v1.4s,v13.4s,10 273 sli v2.4s,v13.4s,18 274 sli v3.4s,v13.4s,24 275 eor v24.16b,v0.16b,v13.16b 276 eor v24.16b,v24.16b,v1.16b 277 eor v13.16b,v2.16b,v3.16b 278 eor v13.16b,v13.16b,v24.16b 279 eor v7.16b,v7.16b,v13.16b 280 subs w11,w11,#1 281 b.ne 10b 282 #ifndef 
__AARCH64EB__ 283 rev32 v3.16b,v4.16b 284 #else 285 mov v3.16b,v4.16b 286 #endif 287 #ifndef __AARCH64EB__ 288 rev32 v2.16b,v5.16b 289 #else 290 mov v2.16b,v5.16b 291 #endif 292 #ifndef __AARCH64EB__ 293 rev32 v1.16b,v6.16b 294 #else 295 mov v1.16b,v6.16b 296 #endif 297 #ifndef __AARCH64EB__ 298 rev32 v0.16b,v7.16b 299 #else 300 mov v0.16b,v7.16b 301 #endif 302 ret 303 .size _vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks 304 .type _vpsm4_ex_enc_8blks,%function 305 .align 4 306 _vpsm4_ex_enc_8blks: 307 AARCH64_VALID_CALL_TARGET 308 mov x10,x3 309 mov w11,#8 310 10: 311 ldp w7,w8,[x10],8 312 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 313 dup v12.4s,w7 314 eor v14.16b,v6.16b,v7.16b 315 eor v15.16b,v10.16b,v11.16b 316 eor v0.16b,v5.16b,v12.16b 317 eor v1.16b,v9.16b,v12.16b 318 eor v12.16b,v14.16b,v0.16b 319 eor v13.16b,v15.16b,v1.16b 320 // optimize sbox using AESE instruction 321 tbl v0.16b, {v12.16b}, v26.16b 322 tbl v1.16b, {v13.16b}, v26.16b 323 ushr v24.16b, v0.16b, 4 324 and v0.16b, v0.16b, v31.16b 325 tbl v0.16b, {v28.16b}, v0.16b 326 tbl v24.16b, {v27.16b}, v24.16b 327 eor v0.16b, v0.16b, v24.16b 328 ushr v24.16b, v1.16b, 4 329 and v1.16b, v1.16b, v31.16b 330 tbl v1.16b, {v28.16b}, v1.16b 331 tbl v24.16b, {v27.16b}, v24.16b 332 eor v1.16b, v1.16b, v24.16b 333 eor v25.16b, v25.16b, v25.16b 334 aese v0.16b,v25.16b 335 aese v1.16b,v25.16b 336 ushr v24.16b, v0.16b, 4 337 and v0.16b, v0.16b, v31.16b 338 tbl v0.16b, {v30.16b}, v0.16b 339 tbl v24.16b, {v29.16b}, v24.16b 340 eor v0.16b, v0.16b, v24.16b 341 ushr v24.16b, v1.16b, 4 342 and v1.16b, v1.16b, v31.16b 343 tbl v1.16b, {v30.16b}, v1.16b 344 tbl v24.16b, {v29.16b}, v24.16b 345 eor v1.16b, v1.16b, v24.16b 346 mov v12.16b,v0.16b 347 mov v13.16b,v1.16b 348 349 // linear transformation 350 ushr v0.4s,v12.4s,32-2 351 ushr v25.4s,v13.4s,32-2 352 ushr v1.4s,v12.4s,32-10 353 ushr v2.4s,v12.4s,32-18 354 ushr v3.4s,v12.4s,32-24 355 sli v0.4s,v12.4s,2 356 sli v25.4s,v13.4s,2 357 sli v1.4s,v12.4s,10 358 sli v2.4s,v12.4s,18 359 sli 
v3.4s,v12.4s,24 360 eor v24.16b,v0.16b,v12.16b 361 eor v24.16b,v24.16b,v1.16b 362 eor v12.16b,v2.16b,v3.16b 363 eor v12.16b,v12.16b,v24.16b 364 ushr v1.4s,v13.4s,32-10 365 ushr v2.4s,v13.4s,32-18 366 ushr v3.4s,v13.4s,32-24 367 sli v1.4s,v13.4s,10 368 sli v2.4s,v13.4s,18 369 sli v3.4s,v13.4s,24 370 eor v24.16b,v25.16b,v13.16b 371 eor v24.16b,v24.16b,v1.16b 372 eor v13.16b,v2.16b,v3.16b 373 eor v13.16b,v13.16b,v24.16b 374 eor v4.16b,v4.16b,v12.16b 375 eor v8.16b,v8.16b,v13.16b 376 377 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 378 dup v13.4s,w8 379 eor v14.16b,v14.16b,v4.16b 380 eor v15.16b,v15.16b,v8.16b 381 eor v12.16b,v14.16b,v13.16b 382 eor v13.16b,v15.16b,v13.16b 383 // optimize sbox using AESE instruction 384 tbl v0.16b, {v12.16b}, v26.16b 385 tbl v1.16b, {v13.16b}, v26.16b 386 ushr v24.16b, v0.16b, 4 387 and v0.16b, v0.16b, v31.16b 388 tbl v0.16b, {v28.16b}, v0.16b 389 tbl v24.16b, {v27.16b}, v24.16b 390 eor v0.16b, v0.16b, v24.16b 391 ushr v24.16b, v1.16b, 4 392 and v1.16b, v1.16b, v31.16b 393 tbl v1.16b, {v28.16b}, v1.16b 394 tbl v24.16b, {v27.16b}, v24.16b 395 eor v1.16b, v1.16b, v24.16b 396 eor v25.16b, v25.16b, v25.16b 397 aese v0.16b,v25.16b 398 aese v1.16b,v25.16b 399 ushr v24.16b, v0.16b, 4 400 and v0.16b, v0.16b, v31.16b 401 tbl v0.16b, {v30.16b}, v0.16b 402 tbl v24.16b, {v29.16b}, v24.16b 403 eor v0.16b, v0.16b, v24.16b 404 ushr v24.16b, v1.16b, 4 405 and v1.16b, v1.16b, v31.16b 406 tbl v1.16b, {v30.16b}, v1.16b 407 tbl v24.16b, {v29.16b}, v24.16b 408 eor v1.16b, v1.16b, v24.16b 409 mov v12.16b,v0.16b 410 mov v13.16b,v1.16b 411 412 // linear transformation 413 ushr v0.4s,v12.4s,32-2 414 ushr v25.4s,v13.4s,32-2 415 ushr v1.4s,v12.4s,32-10 416 ushr v2.4s,v12.4s,32-18 417 ushr v3.4s,v12.4s,32-24 418 sli v0.4s,v12.4s,2 419 sli v25.4s,v13.4s,2 420 sli v1.4s,v12.4s,10 421 sli v2.4s,v12.4s,18 422 sli v3.4s,v12.4s,24 423 eor v24.16b,v0.16b,v12.16b 424 eor v24.16b,v24.16b,v1.16b 425 eor v12.16b,v2.16b,v3.16b 426 eor v12.16b,v12.16b,v24.16b 427 ushr v1.4s,v13.4s,32-10 
428 ushr v2.4s,v13.4s,32-18 429 ushr v3.4s,v13.4s,32-24 430 sli v1.4s,v13.4s,10 431 sli v2.4s,v13.4s,18 432 sli v3.4s,v13.4s,24 433 eor v24.16b,v25.16b,v13.16b 434 eor v24.16b,v24.16b,v1.16b 435 eor v13.16b,v2.16b,v3.16b 436 eor v13.16b,v13.16b,v24.16b 437 ldp w7,w8,[x10],8 438 eor v5.16b,v5.16b,v12.16b 439 eor v9.16b,v9.16b,v13.16b 440 441 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 442 dup v12.4s,w7 443 eor v14.16b,v4.16b,v5.16b 444 eor v15.16b,v8.16b,v9.16b 445 eor v0.16b,v7.16b,v12.16b 446 eor v1.16b,v11.16b,v12.16b 447 eor v12.16b,v14.16b,v0.16b 448 eor v13.16b,v15.16b,v1.16b 449 // optimize sbox using AESE instruction 450 tbl v0.16b, {v12.16b}, v26.16b 451 tbl v1.16b, {v13.16b}, v26.16b 452 ushr v24.16b, v0.16b, 4 453 and v0.16b, v0.16b, v31.16b 454 tbl v0.16b, {v28.16b}, v0.16b 455 tbl v24.16b, {v27.16b}, v24.16b 456 eor v0.16b, v0.16b, v24.16b 457 ushr v24.16b, v1.16b, 4 458 and v1.16b, v1.16b, v31.16b 459 tbl v1.16b, {v28.16b}, v1.16b 460 tbl v24.16b, {v27.16b}, v24.16b 461 eor v1.16b, v1.16b, v24.16b 462 eor v25.16b, v25.16b, v25.16b 463 aese v0.16b,v25.16b 464 aese v1.16b,v25.16b 465 ushr v24.16b, v0.16b, 4 466 and v0.16b, v0.16b, v31.16b 467 tbl v0.16b, {v30.16b}, v0.16b 468 tbl v24.16b, {v29.16b}, v24.16b 469 eor v0.16b, v0.16b, v24.16b 470 ushr v24.16b, v1.16b, 4 471 and v1.16b, v1.16b, v31.16b 472 tbl v1.16b, {v30.16b}, v1.16b 473 tbl v24.16b, {v29.16b}, v24.16b 474 eor v1.16b, v1.16b, v24.16b 475 mov v12.16b,v0.16b 476 mov v13.16b,v1.16b 477 478 // linear transformation 479 ushr v0.4s,v12.4s,32-2 480 ushr v25.4s,v13.4s,32-2 481 ushr v1.4s,v12.4s,32-10 482 ushr v2.4s,v12.4s,32-18 483 ushr v3.4s,v12.4s,32-24 484 sli v0.4s,v12.4s,2 485 sli v25.4s,v13.4s,2 486 sli v1.4s,v12.4s,10 487 sli v2.4s,v12.4s,18 488 sli v3.4s,v12.4s,24 489 eor v24.16b,v0.16b,v12.16b 490 eor v24.16b,v24.16b,v1.16b 491 eor v12.16b,v2.16b,v3.16b 492 eor v12.16b,v12.16b,v24.16b 493 ushr v1.4s,v13.4s,32-10 494 ushr v2.4s,v13.4s,32-18 495 ushr v3.4s,v13.4s,32-24 496 sli v1.4s,v13.4s,10 497 sli 
v2.4s,v13.4s,18 498 sli v3.4s,v13.4s,24 499 eor v24.16b,v25.16b,v13.16b 500 eor v24.16b,v24.16b,v1.16b 501 eor v13.16b,v2.16b,v3.16b 502 eor v13.16b,v13.16b,v24.16b 503 eor v6.16b,v6.16b,v12.16b 504 eor v10.16b,v10.16b,v13.16b 505 506 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 507 dup v13.4s,w8 508 eor v14.16b,v14.16b,v6.16b 509 eor v15.16b,v15.16b,v10.16b 510 eor v12.16b,v14.16b,v13.16b 511 eor v13.16b,v15.16b,v13.16b 512 // optimize sbox using AESE instruction 513 tbl v0.16b, {v12.16b}, v26.16b 514 tbl v1.16b, {v13.16b}, v26.16b 515 ushr v24.16b, v0.16b, 4 516 and v0.16b, v0.16b, v31.16b 517 tbl v0.16b, {v28.16b}, v0.16b 518 tbl v24.16b, {v27.16b}, v24.16b 519 eor v0.16b, v0.16b, v24.16b 520 ushr v24.16b, v1.16b, 4 521 and v1.16b, v1.16b, v31.16b 522 tbl v1.16b, {v28.16b}, v1.16b 523 tbl v24.16b, {v27.16b}, v24.16b 524 eor v1.16b, v1.16b, v24.16b 525 eor v25.16b, v25.16b, v25.16b 526 aese v0.16b,v25.16b 527 aese v1.16b,v25.16b 528 ushr v24.16b, v0.16b, 4 529 and v0.16b, v0.16b, v31.16b 530 tbl v0.16b, {v30.16b}, v0.16b 531 tbl v24.16b, {v29.16b}, v24.16b 532 eor v0.16b, v0.16b, v24.16b 533 ushr v24.16b, v1.16b, 4 534 and v1.16b, v1.16b, v31.16b 535 tbl v1.16b, {v30.16b}, v1.16b 536 tbl v24.16b, {v29.16b}, v24.16b 537 eor v1.16b, v1.16b, v24.16b 538 mov v12.16b,v0.16b 539 mov v13.16b,v1.16b 540 541 // linear transformation 542 ushr v0.4s,v12.4s,32-2 543 ushr v25.4s,v13.4s,32-2 544 ushr v1.4s,v12.4s,32-10 545 ushr v2.4s,v12.4s,32-18 546 ushr v3.4s,v12.4s,32-24 547 sli v0.4s,v12.4s,2 548 sli v25.4s,v13.4s,2 549 sli v1.4s,v12.4s,10 550 sli v2.4s,v12.4s,18 551 sli v3.4s,v12.4s,24 552 eor v24.16b,v0.16b,v12.16b 553 eor v24.16b,v24.16b,v1.16b 554 eor v12.16b,v2.16b,v3.16b 555 eor v12.16b,v12.16b,v24.16b 556 ushr v1.4s,v13.4s,32-10 557 ushr v2.4s,v13.4s,32-18 558 ushr v3.4s,v13.4s,32-24 559 sli v1.4s,v13.4s,10 560 sli v2.4s,v13.4s,18 561 sli v3.4s,v13.4s,24 562 eor v24.16b,v25.16b,v13.16b 563 eor v24.16b,v24.16b,v1.16b 564 eor v13.16b,v2.16b,v3.16b 565 eor v13.16b,v13.16b,v24.16b 
566 eor v7.16b,v7.16b,v12.16b 567 eor v11.16b,v11.16b,v13.16b 568 subs w11,w11,#1 569 b.ne 10b 570 #ifndef __AARCH64EB__ 571 rev32 v3.16b,v4.16b 572 #else 573 mov v3.16b,v4.16b 574 #endif 575 #ifndef __AARCH64EB__ 576 rev32 v2.16b,v5.16b 577 #else 578 mov v2.16b,v5.16b 579 #endif 580 #ifndef __AARCH64EB__ 581 rev32 v1.16b,v6.16b 582 #else 583 mov v1.16b,v6.16b 584 #endif 585 #ifndef __AARCH64EB__ 586 rev32 v0.16b,v7.16b 587 #else 588 mov v0.16b,v7.16b 589 #endif 590 #ifndef __AARCH64EB__ 591 rev32 v7.16b,v8.16b 592 #else 593 mov v7.16b,v8.16b 594 #endif 595 #ifndef __AARCH64EB__ 596 rev32 v6.16b,v9.16b 597 #else 598 mov v6.16b,v9.16b 599 #endif 600 #ifndef __AARCH64EB__ 601 rev32 v5.16b,v10.16b 602 #else 603 mov v5.16b,v10.16b 604 #endif 605 #ifndef __AARCH64EB__ 606 rev32 v4.16b,v11.16b 607 #else 608 mov v4.16b,v11.16b 609 #endif 610 ret 611 .size _vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks 612 .globl vpsm4_ex_set_encrypt_key 613 .type vpsm4_ex_set_encrypt_key,%function 614 .align 5 615 vpsm4_ex_set_encrypt_key: 616 AARCH64_SIGN_LINK_REGISTER 617 stp x29,x30,[sp,#-16]! 618 mov w2,1 619 bl _vpsm4_ex_set_key 620 ldp x29,x30,[sp],#16 621 AARCH64_VALIDATE_LINK_REGISTER 622 ret 623 .size vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key 624 .globl vpsm4_ex_set_decrypt_key 625 .type vpsm4_ex_set_decrypt_key,%function 626 .align 5 627 vpsm4_ex_set_decrypt_key: 628 AARCH64_SIGN_LINK_REGISTER 629 stp x29,x30,[sp,#-16]! 
630 mov w2,0 631 bl _vpsm4_ex_set_key 632 ldp x29,x30,[sp],#16 633 AARCH64_VALIDATE_LINK_REGISTER 634 ret 635 .size vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key 636 .globl vpsm4_ex_encrypt 637 .type vpsm4_ex_encrypt,%function 638 .align 5 639 vpsm4_ex_encrypt: 640 AARCH64_VALID_CALL_TARGET 641 ld1 {v4.4s},[x0] 642 adrp x9, .Lsbox_magic 643 ldr q26, [x9, #:lo12:.Lsbox_magic] 644 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 645 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 646 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 647 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 648 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 649 #ifndef __AARCH64EB__ 650 rev32 v4.16b,v4.16b 651 #endif 652 mov x3,x2 653 mov x10,x3 654 mov w11,#8 655 mov w12,v4.s[0] 656 mov w13,v4.s[1] 657 mov w14,v4.s[2] 658 mov w15,v4.s[3] 659 10: 660 ldp w7,w8,[x10],8 661 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 662 eor w6,w14,w15 663 eor w9,w7,w13 664 eor w6,w6,w9 665 mov v3.s[0],w6 666 // optimize sbox using AESE instruction 667 tbl v0.16b, {v3.16b}, v26.16b 668 ushr v2.16b, v0.16b, 4 669 and v0.16b, v0.16b, v31.16b 670 tbl v0.16b, {v28.16b}, v0.16b 671 tbl v2.16b, {v27.16b}, v2.16b 672 eor v0.16b, v0.16b, v2.16b 673 eor v1.16b, v1.16b, v1.16b 674 aese v0.16b,v1.16b 675 ushr v2.16b, v0.16b, 4 676 and v0.16b, v0.16b, v31.16b 677 tbl v0.16b, {v30.16b}, v0.16b 678 tbl v2.16b, {v29.16b}, v2.16b 679 eor v0.16b, v0.16b, v2.16b 680 681 mov w7,v0.s[0] 682 eor w6,w7,w7,ror #32-2 683 eor w6,w6,w7,ror #32-10 684 eor w6,w6,w7,ror #32-18 685 eor w6,w6,w7,ror #32-24 686 eor w12,w12,w6 687 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 688 eor w6,w14,w15 689 eor w9,w12,w8 690 eor w6,w6,w9 691 mov v3.s[0],w6 692 // optimize sbox using AESE instruction 693 tbl v0.16b, {v3.16b}, v26.16b 694 ushr v2.16b, v0.16b, 4 695 and v0.16b, v0.16b, v31.16b 696 tbl v0.16b, {v28.16b}, v0.16b 697 tbl v2.16b, {v27.16b}, v2.16b 698 eor v0.16b, v0.16b, v2.16b 699 eor v1.16b, v1.16b, v1.16b 700 aese v0.16b,v1.16b 701 ushr v2.16b, v0.16b, 4 702 and v0.16b, v0.16b, v31.16b 703 tbl v0.16b, 
{v30.16b}, v0.16b 704 tbl v2.16b, {v29.16b}, v2.16b 705 eor v0.16b, v0.16b, v2.16b 706 707 mov w7,v0.s[0] 708 eor w6,w7,w7,ror #32-2 709 eor w6,w6,w7,ror #32-10 710 eor w6,w6,w7,ror #32-18 711 eor w6,w6,w7,ror #32-24 712 ldp w7,w8,[x10],8 713 eor w13,w13,w6 714 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 715 eor w6,w12,w13 716 eor w9,w7,w15 717 eor w6,w6,w9 718 mov v3.s[0],w6 719 // optimize sbox using AESE instruction 720 tbl v0.16b, {v3.16b}, v26.16b 721 ushr v2.16b, v0.16b, 4 722 and v0.16b, v0.16b, v31.16b 723 tbl v0.16b, {v28.16b}, v0.16b 724 tbl v2.16b, {v27.16b}, v2.16b 725 eor v0.16b, v0.16b, v2.16b 726 eor v1.16b, v1.16b, v1.16b 727 aese v0.16b,v1.16b 728 ushr v2.16b, v0.16b, 4 729 and v0.16b, v0.16b, v31.16b 730 tbl v0.16b, {v30.16b}, v0.16b 731 tbl v2.16b, {v29.16b}, v2.16b 732 eor v0.16b, v0.16b, v2.16b 733 734 mov w7,v0.s[0] 735 eor w6,w7,w7,ror #32-2 736 eor w6,w6,w7,ror #32-10 737 eor w6,w6,w7,ror #32-18 738 eor w6,w6,w7,ror #32-24 739 eor w14,w14,w6 740 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 741 eor w6,w12,w13 742 eor w9,w14,w8 743 eor w6,w6,w9 744 mov v3.s[0],w6 745 // optimize sbox using AESE instruction 746 tbl v0.16b, {v3.16b}, v26.16b 747 ushr v2.16b, v0.16b, 4 748 and v0.16b, v0.16b, v31.16b 749 tbl v0.16b, {v28.16b}, v0.16b 750 tbl v2.16b, {v27.16b}, v2.16b 751 eor v0.16b, v0.16b, v2.16b 752 eor v1.16b, v1.16b, v1.16b 753 aese v0.16b,v1.16b 754 ushr v2.16b, v0.16b, 4 755 and v0.16b, v0.16b, v31.16b 756 tbl v0.16b, {v30.16b}, v0.16b 757 tbl v2.16b, {v29.16b}, v2.16b 758 eor v0.16b, v0.16b, v2.16b 759 760 mov w7,v0.s[0] 761 eor w6,w7,w7,ror #32-2 762 eor w6,w6,w7,ror #32-10 763 eor w6,w6,w7,ror #32-18 764 eor w6,w6,w7,ror #32-24 765 eor w15,w15,w6 766 subs w11,w11,#1 767 b.ne 10b 768 mov v4.s[0],w15 769 mov v4.s[1],w14 770 mov v4.s[2],w13 771 mov v4.s[3],w12 772 #ifndef __AARCH64EB__ 773 rev32 v4.16b,v4.16b 774 #endif 775 st1 {v4.4s},[x1] 776 ret 777 .size vpsm4_ex_encrypt,.-vpsm4_ex_encrypt 778 .globl vpsm4_ex_decrypt 779 .type vpsm4_ex_decrypt,%function 780 
.align 5 781 vpsm4_ex_decrypt: 782 AARCH64_VALID_CALL_TARGET 783 ld1 {v4.4s},[x0] 784 adrp x9, .Lsbox_magic 785 ldr q26, [x9, #:lo12:.Lsbox_magic] 786 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 787 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 788 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 789 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 790 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 791 #ifndef __AARCH64EB__ 792 rev32 v4.16b,v4.16b 793 #endif 794 mov x3,x2 795 mov x10,x3 796 mov w11,#8 797 mov w12,v4.s[0] 798 mov w13,v4.s[1] 799 mov w14,v4.s[2] 800 mov w15,v4.s[3] 801 10: 802 ldp w7,w8,[x10],8 803 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 804 eor w6,w14,w15 805 eor w9,w7,w13 806 eor w6,w6,w9 807 mov v3.s[0],w6 808 // optimize sbox using AESE instruction 809 tbl v0.16b, {v3.16b}, v26.16b 810 ushr v2.16b, v0.16b, 4 811 and v0.16b, v0.16b, v31.16b 812 tbl v0.16b, {v28.16b}, v0.16b 813 tbl v2.16b, {v27.16b}, v2.16b 814 eor v0.16b, v0.16b, v2.16b 815 eor v1.16b, v1.16b, v1.16b 816 aese v0.16b,v1.16b 817 ushr v2.16b, v0.16b, 4 818 and v0.16b, v0.16b, v31.16b 819 tbl v0.16b, {v30.16b}, v0.16b 820 tbl v2.16b, {v29.16b}, v2.16b 821 eor v0.16b, v0.16b, v2.16b 822 823 mov w7,v0.s[0] 824 eor w6,w7,w7,ror #32-2 825 eor w6,w6,w7,ror #32-10 826 eor w6,w6,w7,ror #32-18 827 eor w6,w6,w7,ror #32-24 828 eor w12,w12,w6 829 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 830 eor w6,w14,w15 831 eor w9,w12,w8 832 eor w6,w6,w9 833 mov v3.s[0],w6 834 // optimize sbox using AESE instruction 835 tbl v0.16b, {v3.16b}, v26.16b 836 ushr v2.16b, v0.16b, 4 837 and v0.16b, v0.16b, v31.16b 838 tbl v0.16b, {v28.16b}, v0.16b 839 tbl v2.16b, {v27.16b}, v2.16b 840 eor v0.16b, v0.16b, v2.16b 841 eor v1.16b, v1.16b, v1.16b 842 aese v0.16b,v1.16b 843 ushr v2.16b, v0.16b, 4 844 and v0.16b, v0.16b, v31.16b 845 tbl v0.16b, {v30.16b}, v0.16b 846 tbl v2.16b, {v29.16b}, v2.16b 847 eor v0.16b, v0.16b, v2.16b 848 849 mov w7,v0.s[0] 850 eor w6,w7,w7,ror #32-2 851 eor w6,w6,w7,ror #32-10 852 eor w6,w6,w7,ror #32-18 853 eor w6,w6,w7,ror #32-24 854 ldp w7,w8,[x10],8 
// --- tail of the vpsm4_ex_decrypt round loop ---
// Scalar state: w12..w15 = block words B0..B3; w7/w8 = round keys just
// loaded pairwise from x10; w11 = remaining iterations (4 rounds each).
eor	w13,w13,w6
// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
eor	w6,w12,w13
eor	w9,w7,w15
eor	w6,w6,w9
mov	v3.s[0],w6
// optimize sbox using AESE instruction
// SM4 sbox built from the AES sbox: pre-permute (v26), split into
// nibbles (v31 = 0x0f mask), map through the input affine LUTs
// (v27/v28), one aese with an all-zero round key (v1) — i.e. just
// SubBytes(+ShiftRows, irrelevant for lane 0) — then the output
// affine LUTs (v29/v30).
tbl	v0.16b, {v3.16b}, v26.16b
ushr	v2.16b, v0.16b, 4
and	v0.16b, v0.16b, v31.16b
tbl	v0.16b, {v28.16b}, v0.16b
tbl	v2.16b, {v27.16b}, v2.16b
eor	v0.16b, v0.16b, v2.16b
eor	v1.16b, v1.16b, v1.16b
aese	v0.16b,v1.16b
ushr	v2.16b, v0.16b, 4
and	v0.16b, v0.16b, v31.16b
tbl	v0.16b, {v30.16b}, v0.16b
tbl	v2.16b, {v29.16b}, v2.16b
eor	v0.16b, v0.16b, v2.16b

// Linear transform L: t ^ rol32(t,2) ^ rol32(t,10) ^ rol32(t,18)
// ^ rol32(t,24)  (ror #32-n == rotate-left by n).
mov	w7,v0.s[0]
eor	w6,w7,w7,ror #32-2
eor	w6,w6,w7,ror #32-10
eor	w6,w6,w7,ror #32-18
eor	w6,w6,w7,ror #32-24
eor	w14,w14,w6
// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
eor	w6,w12,w13
eor	w9,w14,w8
eor	w6,w6,w9
mov	v3.s[0],w6
// optimize sbox using AESE instruction
tbl	v0.16b, {v3.16b}, v26.16b
ushr	v2.16b, v0.16b, 4
and	v0.16b, v0.16b, v31.16b
tbl	v0.16b, {v28.16b}, v0.16b
tbl	v2.16b, {v27.16b}, v2.16b
eor	v0.16b, v0.16b, v2.16b
eor	v1.16b, v1.16b, v1.16b
aese	v0.16b,v1.16b
ushr	v2.16b, v0.16b, 4
and	v0.16b, v0.16b, v31.16b
tbl	v0.16b, {v30.16b}, v0.16b
tbl	v2.16b, {v29.16b}, v2.16b
eor	v0.16b, v0.16b, v2.16b

mov	w7,v0.s[0]
eor	w6,w7,w7,ror #32-2
eor	w6,w6,w7,ror #32-10
eor	w6,w6,w7,ror #32-18
eor	w6,w6,w7,ror #32-24
eor	w15,w15,w6
subs	w11,w11,#1
b.ne	10b
// Repack the state in reversed word order (SM4's final reverse R) and
// store the plaintext block to x1.
mov	v4.s[0],w15
mov	v4.s[1],w14
mov	v4.s[2],w13
mov	v4.s[3],w12
#ifndef __AARCH64EB__
rev32	v4.16b,v4.16b
#endif
st1	{v4.4s},[x1]
ret
.size	vpsm4_ex_decrypt,.-vpsm4_ex_decrypt

// void vpsm4_ex_ecb_encrypt(const u8 *in (x0), u8 *out (x1),
//                           size_t len (x2), const SM4_KEY *key (x3))
// ECB mode; direction is decided by which round-key schedule x3 holds.
.globl	vpsm4_ex_ecb_encrypt
.type	vpsm4_ex_ecb_encrypt,%function
.align	5
vpsm4_ex_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	// 80-byte frame: d8-d15 (callee-saved low halves of v8-v15 per
	// AAPCS64) plus x29/x30, restored at label 100 below.
	stp	d8,d9,[sp,#-80]!
	// Finish saving callee-saved FP regs (AAPCS64: v8-v15 low 64 bits)
	// and the frame/link registers started at the stp above.
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	// Load the six AESE-sbox helper tables into v26..v31 (kept live
	// across the _vpsm4_ex_enc_{4,8}blks calls).
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
// Main loop: take 8 blocks at a time while w2 (block count) allows.
// ld4 de-interleaves so v4..v7 hold word 0..3 of four blocks each.
.Lecb_8_blocks_process:
	cmp	w2,#8
	b.lt	.Lecb_4_blocks_process
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	// Byte-swap each 32-bit word on little-endian builds (SM4 words
	// are big-endian on the wire).
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	bl	_vpsm4_ex_enc_8blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
// 4..7 blocks left: one 4-block pass, remainder falls through.
.Lecb_4_blocks_process:
	cmp	w2,#4
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_ex_enc_4blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	w2,w2,#4
1:
	// process last block
	// 0 blocks -> done (100f); exactly 1 -> scalar path below;
	// 2 or 3 -> the partial-ld4 paths past this block.
	cmp	w2,#1
	b.lt	100f
	b.gt	1f
	// Single remaining block: run the scalar 32-round loop inline.
	// w12..w15 = state words B0..B3, x10 walks the round keys (two
	// per ldp), w11 counts 8 iterations of 4 rounds.
	ld1	{v4.4s},[x0]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
eor w9,w7,w13 1013 eor w6,w6,w9 1014 mov v3.s[0],w6 1015 // optimize sbox using AESE instruction 1016 tbl v0.16b, {v3.16b}, v26.16b 1017 ushr v2.16b, v0.16b, 4 1018 and v0.16b, v0.16b, v31.16b 1019 tbl v0.16b, {v28.16b}, v0.16b 1020 tbl v2.16b, {v27.16b}, v2.16b 1021 eor v0.16b, v0.16b, v2.16b 1022 eor v1.16b, v1.16b, v1.16b 1023 aese v0.16b,v1.16b 1024 ushr v2.16b, v0.16b, 4 1025 and v0.16b, v0.16b, v31.16b 1026 tbl v0.16b, {v30.16b}, v0.16b 1027 tbl v2.16b, {v29.16b}, v2.16b 1028 eor v0.16b, v0.16b, v2.16b 1029 1030 mov w7,v0.s[0] 1031 eor w6,w7,w7,ror #32-2 1032 eor w6,w6,w7,ror #32-10 1033 eor w6,w6,w7,ror #32-18 1034 eor w6,w6,w7,ror #32-24 1035 eor w12,w12,w6 1036 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1037 eor w6,w14,w15 1038 eor w9,w12,w8 1039 eor w6,w6,w9 1040 mov v3.s[0],w6 1041 // optimize sbox using AESE instruction 1042 tbl v0.16b, {v3.16b}, v26.16b 1043 ushr v2.16b, v0.16b, 4 1044 and v0.16b, v0.16b, v31.16b 1045 tbl v0.16b, {v28.16b}, v0.16b 1046 tbl v2.16b, {v27.16b}, v2.16b 1047 eor v0.16b, v0.16b, v2.16b 1048 eor v1.16b, v1.16b, v1.16b 1049 aese v0.16b,v1.16b 1050 ushr v2.16b, v0.16b, 4 1051 and v0.16b, v0.16b, v31.16b 1052 tbl v0.16b, {v30.16b}, v0.16b 1053 tbl v2.16b, {v29.16b}, v2.16b 1054 eor v0.16b, v0.16b, v2.16b 1055 1056 mov w7,v0.s[0] 1057 eor w6,w7,w7,ror #32-2 1058 eor w6,w6,w7,ror #32-10 1059 eor w6,w6,w7,ror #32-18 1060 eor w6,w6,w7,ror #32-24 1061 ldp w7,w8,[x10],8 1062 eor w13,w13,w6 1063 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1064 eor w6,w12,w13 1065 eor w9,w7,w15 1066 eor w6,w6,w9 1067 mov v3.s[0],w6 1068 // optimize sbox using AESE instruction 1069 tbl v0.16b, {v3.16b}, v26.16b 1070 ushr v2.16b, v0.16b, 4 1071 and v0.16b, v0.16b, v31.16b 1072 tbl v0.16b, {v28.16b}, v0.16b 1073 tbl v2.16b, {v27.16b}, v2.16b 1074 eor v0.16b, v0.16b, v2.16b 1075 eor v1.16b, v1.16b, v1.16b 1076 aese v0.16b,v1.16b 1077 ushr v2.16b, v0.16b, 4 1078 and v0.16b, v0.16b, v31.16b 1079 tbl v0.16b, {v30.16b}, v0.16b 1080 tbl v2.16b, {v29.16b}, v2.16b 1081 eor v0.16b, 
v0.16b, v2.16b 1082 1083 mov w7,v0.s[0] 1084 eor w6,w7,w7,ror #32-2 1085 eor w6,w6,w7,ror #32-10 1086 eor w6,w6,w7,ror #32-18 1087 eor w6,w6,w7,ror #32-24 1088 eor w14,w14,w6 1089 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1090 eor w6,w12,w13 1091 eor w9,w14,w8 1092 eor w6,w6,w9 1093 mov v3.s[0],w6 1094 // optimize sbox using AESE instruction 1095 tbl v0.16b, {v3.16b}, v26.16b 1096 ushr v2.16b, v0.16b, 4 1097 and v0.16b, v0.16b, v31.16b 1098 tbl v0.16b, {v28.16b}, v0.16b 1099 tbl v2.16b, {v27.16b}, v2.16b 1100 eor v0.16b, v0.16b, v2.16b 1101 eor v1.16b, v1.16b, v1.16b 1102 aese v0.16b,v1.16b 1103 ushr v2.16b, v0.16b, 4 1104 and v0.16b, v0.16b, v31.16b 1105 tbl v0.16b, {v30.16b}, v0.16b 1106 tbl v2.16b, {v29.16b}, v2.16b 1107 eor v0.16b, v0.16b, v2.16b 1108 1109 mov w7,v0.s[0] 1110 eor w6,w7,w7,ror #32-2 1111 eor w6,w6,w7,ror #32-10 1112 eor w6,w6,w7,ror #32-18 1113 eor w6,w6,w7,ror #32-24 1114 eor w15,w15,w6 1115 subs w11,w11,#1 1116 b.ne 10b 1117 mov v4.s[0],w15 1118 mov v4.s[1],w14 1119 mov v4.s[2],w13 1120 mov v4.s[3],w12 1121 #ifndef __AARCH64EB__ 1122 rev32 v4.16b,v4.16b 1123 #endif 1124 st1 {v4.4s},[x1] 1125 b 100f 1126 1: // process last 2 blocks 1127 ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16 1128 ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16 1129 cmp w2,#2 1130 b.gt 1f 1131 #ifndef __AARCH64EB__ 1132 rev32 v4.16b,v4.16b 1133 #endif 1134 #ifndef __AARCH64EB__ 1135 rev32 v5.16b,v5.16b 1136 #endif 1137 #ifndef __AARCH64EB__ 1138 rev32 v6.16b,v6.16b 1139 #endif 1140 #ifndef __AARCH64EB__ 1141 rev32 v7.16b,v7.16b 1142 #endif 1143 bl _vpsm4_ex_enc_4blks 1144 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 1145 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1] 1146 b 100f 1147 1: // process last 3 blocks 1148 ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16 1149 #ifndef __AARCH64EB__ 1150 rev32 v4.16b,v4.16b 1151 #endif 1152 #ifndef __AARCH64EB__ 1153 rev32 v5.16b,v5.16b 1154 #endif 1155 #ifndef __AARCH64EB__ 1156 rev32 v6.16b,v6.16b 1157 #endif 1158 #ifndef __AARCH64EB__ 1159 rev32 v7.16b,v7.16b 1160 #endif 1161 bl 
_vpsm4_ex_enc_4blks 1162 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 1163 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 1164 st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1] 1165 100: 1166 ldp d10,d11,[sp,#16] 1167 ldp d12,d13,[sp,#32] 1168 ldp d14,d15,[sp,#48] 1169 ldp x29,x30,[sp,#64] 1170 ldp d8,d9,[sp],#80 1171 AARCH64_VALIDATE_LINK_REGISTER 1172 ret 1173 .size vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt 1174 .globl vpsm4_ex_cbc_encrypt 1175 .type vpsm4_ex_cbc_encrypt,%function 1176 .align 5 1177 vpsm4_ex_cbc_encrypt: 1178 AARCH64_VALID_CALL_TARGET 1179 lsr x2,x2,4 1180 adrp x9, .Lsbox_magic 1181 ldr q26, [x9, #:lo12:.Lsbox_magic] 1182 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 1183 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 1184 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 1185 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 1186 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 1187 cbz w5,.Ldec 1188 ld1 {v3.4s},[x4] 1189 .Lcbc_4_blocks_enc: 1190 cmp w2,#4 1191 b.lt 1f 1192 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 1193 eor v4.16b,v4.16b,v3.16b 1194 #ifndef __AARCH64EB__ 1195 rev32 v5.16b,v5.16b 1196 #endif 1197 #ifndef __AARCH64EB__ 1198 rev32 v4.16b,v4.16b 1199 #endif 1200 #ifndef __AARCH64EB__ 1201 rev32 v6.16b,v6.16b 1202 #endif 1203 #ifndef __AARCH64EB__ 1204 rev32 v7.16b,v7.16b 1205 #endif 1206 mov x10,x3 1207 mov w11,#8 1208 mov w12,v4.s[0] 1209 mov w13,v4.s[1] 1210 mov w14,v4.s[2] 1211 mov w15,v4.s[3] 1212 10: 1213 ldp w7,w8,[x10],8 1214 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1215 eor w6,w14,w15 1216 eor w9,w7,w13 1217 eor w6,w6,w9 1218 mov v3.s[0],w6 1219 // optimize sbox using AESE instruction 1220 tbl v0.16b, {v3.16b}, v26.16b 1221 ushr v2.16b, v0.16b, 4 1222 and v0.16b, v0.16b, v31.16b 1223 tbl v0.16b, {v28.16b}, v0.16b 1224 tbl v2.16b, {v27.16b}, v2.16b 1225 eor v0.16b, v0.16b, v2.16b 1226 eor v1.16b, v1.16b, v1.16b 1227 aese v0.16b,v1.16b 1228 ushr v2.16b, v0.16b, 4 1229 and v0.16b, v0.16b, v31.16b 1230 tbl v0.16b, {v30.16b}, v0.16b 1231 tbl v2.16b, {v29.16b}, v2.16b 1232 eor v0.16b, v0.16b, v2.16b 1233 1234 mov 
w7,v0.s[0] 1235 eor w6,w7,w7,ror #32-2 1236 eor w6,w6,w7,ror #32-10 1237 eor w6,w6,w7,ror #32-18 1238 eor w6,w6,w7,ror #32-24 1239 eor w12,w12,w6 1240 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1241 eor w6,w14,w15 1242 eor w9,w12,w8 1243 eor w6,w6,w9 1244 mov v3.s[0],w6 1245 // optimize sbox using AESE instruction 1246 tbl v0.16b, {v3.16b}, v26.16b 1247 ushr v2.16b, v0.16b, 4 1248 and v0.16b, v0.16b, v31.16b 1249 tbl v0.16b, {v28.16b}, v0.16b 1250 tbl v2.16b, {v27.16b}, v2.16b 1251 eor v0.16b, v0.16b, v2.16b 1252 eor v1.16b, v1.16b, v1.16b 1253 aese v0.16b,v1.16b 1254 ushr v2.16b, v0.16b, 4 1255 and v0.16b, v0.16b, v31.16b 1256 tbl v0.16b, {v30.16b}, v0.16b 1257 tbl v2.16b, {v29.16b}, v2.16b 1258 eor v0.16b, v0.16b, v2.16b 1259 1260 mov w7,v0.s[0] 1261 eor w6,w7,w7,ror #32-2 1262 eor w6,w6,w7,ror #32-10 1263 eor w6,w6,w7,ror #32-18 1264 eor w6,w6,w7,ror #32-24 1265 ldp w7,w8,[x10],8 1266 eor w13,w13,w6 1267 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1268 eor w6,w12,w13 1269 eor w9,w7,w15 1270 eor w6,w6,w9 1271 mov v3.s[0],w6 1272 // optimize sbox using AESE instruction 1273 tbl v0.16b, {v3.16b}, v26.16b 1274 ushr v2.16b, v0.16b, 4 1275 and v0.16b, v0.16b, v31.16b 1276 tbl v0.16b, {v28.16b}, v0.16b 1277 tbl v2.16b, {v27.16b}, v2.16b 1278 eor v0.16b, v0.16b, v2.16b 1279 eor v1.16b, v1.16b, v1.16b 1280 aese v0.16b,v1.16b 1281 ushr v2.16b, v0.16b, 4 1282 and v0.16b, v0.16b, v31.16b 1283 tbl v0.16b, {v30.16b}, v0.16b 1284 tbl v2.16b, {v29.16b}, v2.16b 1285 eor v0.16b, v0.16b, v2.16b 1286 1287 mov w7,v0.s[0] 1288 eor w6,w7,w7,ror #32-2 1289 eor w6,w6,w7,ror #32-10 1290 eor w6,w6,w7,ror #32-18 1291 eor w6,w6,w7,ror #32-24 1292 eor w14,w14,w6 1293 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1294 eor w6,w12,w13 1295 eor w9,w14,w8 1296 eor w6,w6,w9 1297 mov v3.s[0],w6 1298 // optimize sbox using AESE instruction 1299 tbl v0.16b, {v3.16b}, v26.16b 1300 ushr v2.16b, v0.16b, 4 1301 and v0.16b, v0.16b, v31.16b 1302 tbl v0.16b, {v28.16b}, v0.16b 1303 tbl v2.16b, {v27.16b}, v2.16b 1304 eor v0.16b, v0.16b, 
v2.16b 1305 eor v1.16b, v1.16b, v1.16b 1306 aese v0.16b,v1.16b 1307 ushr v2.16b, v0.16b, 4 1308 and v0.16b, v0.16b, v31.16b 1309 tbl v0.16b, {v30.16b}, v0.16b 1310 tbl v2.16b, {v29.16b}, v2.16b 1311 eor v0.16b, v0.16b, v2.16b 1312 1313 mov w7,v0.s[0] 1314 eor w6,w7,w7,ror #32-2 1315 eor w6,w6,w7,ror #32-10 1316 eor w6,w6,w7,ror #32-18 1317 eor w6,w6,w7,ror #32-24 1318 eor w15,w15,w6 1319 subs w11,w11,#1 1320 b.ne 10b 1321 mov v4.s[0],w15 1322 mov v4.s[1],w14 1323 mov v4.s[2],w13 1324 mov v4.s[3],w12 1325 eor v5.16b,v5.16b,v4.16b 1326 mov x10,x3 1327 mov w11,#8 1328 mov w12,v5.s[0] 1329 mov w13,v5.s[1] 1330 mov w14,v5.s[2] 1331 mov w15,v5.s[3] 1332 10: 1333 ldp w7,w8,[x10],8 1334 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1335 eor w6,w14,w15 1336 eor w9,w7,w13 1337 eor w6,w6,w9 1338 mov v3.s[0],w6 1339 // optimize sbox using AESE instruction 1340 tbl v0.16b, {v3.16b}, v26.16b 1341 ushr v2.16b, v0.16b, 4 1342 and v0.16b, v0.16b, v31.16b 1343 tbl v0.16b, {v28.16b}, v0.16b 1344 tbl v2.16b, {v27.16b}, v2.16b 1345 eor v0.16b, v0.16b, v2.16b 1346 eor v1.16b, v1.16b, v1.16b 1347 aese v0.16b,v1.16b 1348 ushr v2.16b, v0.16b, 4 1349 and v0.16b, v0.16b, v31.16b 1350 tbl v0.16b, {v30.16b}, v0.16b 1351 tbl v2.16b, {v29.16b}, v2.16b 1352 eor v0.16b, v0.16b, v2.16b 1353 1354 mov w7,v0.s[0] 1355 eor w6,w7,w7,ror #32-2 1356 eor w6,w6,w7,ror #32-10 1357 eor w6,w6,w7,ror #32-18 1358 eor w6,w6,w7,ror #32-24 1359 eor w12,w12,w6 1360 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1361 eor w6,w14,w15 1362 eor w9,w12,w8 1363 eor w6,w6,w9 1364 mov v3.s[0],w6 1365 // optimize sbox using AESE instruction 1366 tbl v0.16b, {v3.16b}, v26.16b 1367 ushr v2.16b, v0.16b, 4 1368 and v0.16b, v0.16b, v31.16b 1369 tbl v0.16b, {v28.16b}, v0.16b 1370 tbl v2.16b, {v27.16b}, v2.16b 1371 eor v0.16b, v0.16b, v2.16b 1372 eor v1.16b, v1.16b, v1.16b 1373 aese v0.16b,v1.16b 1374 ushr v2.16b, v0.16b, 4 1375 and v0.16b, v0.16b, v31.16b 1376 tbl v0.16b, {v30.16b}, v0.16b 1377 tbl v2.16b, {v29.16b}, v2.16b 1378 eor v0.16b, v0.16b, v2.16b 
1379 1380 mov w7,v0.s[0] 1381 eor w6,w7,w7,ror #32-2 1382 eor w6,w6,w7,ror #32-10 1383 eor w6,w6,w7,ror #32-18 1384 eor w6,w6,w7,ror #32-24 1385 ldp w7,w8,[x10],8 1386 eor w13,w13,w6 1387 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1388 eor w6,w12,w13 1389 eor w9,w7,w15 1390 eor w6,w6,w9 1391 mov v3.s[0],w6 1392 // optimize sbox using AESE instruction 1393 tbl v0.16b, {v3.16b}, v26.16b 1394 ushr v2.16b, v0.16b, 4 1395 and v0.16b, v0.16b, v31.16b 1396 tbl v0.16b, {v28.16b}, v0.16b 1397 tbl v2.16b, {v27.16b}, v2.16b 1398 eor v0.16b, v0.16b, v2.16b 1399 eor v1.16b, v1.16b, v1.16b 1400 aese v0.16b,v1.16b 1401 ushr v2.16b, v0.16b, 4 1402 and v0.16b, v0.16b, v31.16b 1403 tbl v0.16b, {v30.16b}, v0.16b 1404 tbl v2.16b, {v29.16b}, v2.16b 1405 eor v0.16b, v0.16b, v2.16b 1406 1407 mov w7,v0.s[0] 1408 eor w6,w7,w7,ror #32-2 1409 eor w6,w6,w7,ror #32-10 1410 eor w6,w6,w7,ror #32-18 1411 eor w6,w6,w7,ror #32-24 1412 eor w14,w14,w6 1413 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1414 eor w6,w12,w13 1415 eor w9,w14,w8 1416 eor w6,w6,w9 1417 mov v3.s[0],w6 1418 // optimize sbox using AESE instruction 1419 tbl v0.16b, {v3.16b}, v26.16b 1420 ushr v2.16b, v0.16b, 4 1421 and v0.16b, v0.16b, v31.16b 1422 tbl v0.16b, {v28.16b}, v0.16b 1423 tbl v2.16b, {v27.16b}, v2.16b 1424 eor v0.16b, v0.16b, v2.16b 1425 eor v1.16b, v1.16b, v1.16b 1426 aese v0.16b,v1.16b 1427 ushr v2.16b, v0.16b, 4 1428 and v0.16b, v0.16b, v31.16b 1429 tbl v0.16b, {v30.16b}, v0.16b 1430 tbl v2.16b, {v29.16b}, v2.16b 1431 eor v0.16b, v0.16b, v2.16b 1432 1433 mov w7,v0.s[0] 1434 eor w6,w7,w7,ror #32-2 1435 eor w6,w6,w7,ror #32-10 1436 eor w6,w6,w7,ror #32-18 1437 eor w6,w6,w7,ror #32-24 1438 eor w15,w15,w6 1439 subs w11,w11,#1 1440 b.ne 10b 1441 mov v5.s[0],w15 1442 mov v5.s[1],w14 1443 mov v5.s[2],w13 1444 mov v5.s[3],w12 1445 #ifndef __AARCH64EB__ 1446 rev32 v4.16b,v4.16b 1447 #endif 1448 eor v6.16b,v6.16b,v5.16b 1449 mov x10,x3 1450 mov w11,#8 1451 mov w12,v6.s[0] 1452 mov w13,v6.s[1] 1453 mov w14,v6.s[2] 1454 mov w15,v6.s[3] 1455 10: 
1456 ldp w7,w8,[x10],8 1457 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1458 eor w6,w14,w15 1459 eor w9,w7,w13 1460 eor w6,w6,w9 1461 mov v3.s[0],w6 1462 // optimize sbox using AESE instruction 1463 tbl v0.16b, {v3.16b}, v26.16b 1464 ushr v2.16b, v0.16b, 4 1465 and v0.16b, v0.16b, v31.16b 1466 tbl v0.16b, {v28.16b}, v0.16b 1467 tbl v2.16b, {v27.16b}, v2.16b 1468 eor v0.16b, v0.16b, v2.16b 1469 eor v1.16b, v1.16b, v1.16b 1470 aese v0.16b,v1.16b 1471 ushr v2.16b, v0.16b, 4 1472 and v0.16b, v0.16b, v31.16b 1473 tbl v0.16b, {v30.16b}, v0.16b 1474 tbl v2.16b, {v29.16b}, v2.16b 1475 eor v0.16b, v0.16b, v2.16b 1476 1477 mov w7,v0.s[0] 1478 eor w6,w7,w7,ror #32-2 1479 eor w6,w6,w7,ror #32-10 1480 eor w6,w6,w7,ror #32-18 1481 eor w6,w6,w7,ror #32-24 1482 eor w12,w12,w6 1483 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1484 eor w6,w14,w15 1485 eor w9,w12,w8 1486 eor w6,w6,w9 1487 mov v3.s[0],w6 1488 // optimize sbox using AESE instruction 1489 tbl v0.16b, {v3.16b}, v26.16b 1490 ushr v2.16b, v0.16b, 4 1491 and v0.16b, v0.16b, v31.16b 1492 tbl v0.16b, {v28.16b}, v0.16b 1493 tbl v2.16b, {v27.16b}, v2.16b 1494 eor v0.16b, v0.16b, v2.16b 1495 eor v1.16b, v1.16b, v1.16b 1496 aese v0.16b,v1.16b 1497 ushr v2.16b, v0.16b, 4 1498 and v0.16b, v0.16b, v31.16b 1499 tbl v0.16b, {v30.16b}, v0.16b 1500 tbl v2.16b, {v29.16b}, v2.16b 1501 eor v0.16b, v0.16b, v2.16b 1502 1503 mov w7,v0.s[0] 1504 eor w6,w7,w7,ror #32-2 1505 eor w6,w6,w7,ror #32-10 1506 eor w6,w6,w7,ror #32-18 1507 eor w6,w6,w7,ror #32-24 1508 ldp w7,w8,[x10],8 1509 eor w13,w13,w6 1510 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1511 eor w6,w12,w13 1512 eor w9,w7,w15 1513 eor w6,w6,w9 1514 mov v3.s[0],w6 1515 // optimize sbox using AESE instruction 1516 tbl v0.16b, {v3.16b}, v26.16b 1517 ushr v2.16b, v0.16b, 4 1518 and v0.16b, v0.16b, v31.16b 1519 tbl v0.16b, {v28.16b}, v0.16b 1520 tbl v2.16b, {v27.16b}, v2.16b 1521 eor v0.16b, v0.16b, v2.16b 1522 eor v1.16b, v1.16b, v1.16b 1523 aese v0.16b,v1.16b 1524 ushr v2.16b, v0.16b, 4 1525 and v0.16b, v0.16b, v31.16b 
1526 tbl v0.16b, {v30.16b}, v0.16b 1527 tbl v2.16b, {v29.16b}, v2.16b 1528 eor v0.16b, v0.16b, v2.16b 1529 1530 mov w7,v0.s[0] 1531 eor w6,w7,w7,ror #32-2 1532 eor w6,w6,w7,ror #32-10 1533 eor w6,w6,w7,ror #32-18 1534 eor w6,w6,w7,ror #32-24 1535 eor w14,w14,w6 1536 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1537 eor w6,w12,w13 1538 eor w9,w14,w8 1539 eor w6,w6,w9 1540 mov v3.s[0],w6 1541 // optimize sbox using AESE instruction 1542 tbl v0.16b, {v3.16b}, v26.16b 1543 ushr v2.16b, v0.16b, 4 1544 and v0.16b, v0.16b, v31.16b 1545 tbl v0.16b, {v28.16b}, v0.16b 1546 tbl v2.16b, {v27.16b}, v2.16b 1547 eor v0.16b, v0.16b, v2.16b 1548 eor v1.16b, v1.16b, v1.16b 1549 aese v0.16b,v1.16b 1550 ushr v2.16b, v0.16b, 4 1551 and v0.16b, v0.16b, v31.16b 1552 tbl v0.16b, {v30.16b}, v0.16b 1553 tbl v2.16b, {v29.16b}, v2.16b 1554 eor v0.16b, v0.16b, v2.16b 1555 1556 mov w7,v0.s[0] 1557 eor w6,w7,w7,ror #32-2 1558 eor w6,w6,w7,ror #32-10 1559 eor w6,w6,w7,ror #32-18 1560 eor w6,w6,w7,ror #32-24 1561 eor w15,w15,w6 1562 subs w11,w11,#1 1563 b.ne 10b 1564 mov v6.s[0],w15 1565 mov v6.s[1],w14 1566 mov v6.s[2],w13 1567 mov v6.s[3],w12 1568 #ifndef __AARCH64EB__ 1569 rev32 v5.16b,v5.16b 1570 #endif 1571 eor v7.16b,v7.16b,v6.16b 1572 mov x10,x3 1573 mov w11,#8 1574 mov w12,v7.s[0] 1575 mov w13,v7.s[1] 1576 mov w14,v7.s[2] 1577 mov w15,v7.s[3] 1578 10: 1579 ldp w7,w8,[x10],8 1580 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1581 eor w6,w14,w15 1582 eor w9,w7,w13 1583 eor w6,w6,w9 1584 mov v3.s[0],w6 1585 // optimize sbox using AESE instruction 1586 tbl v0.16b, {v3.16b}, v26.16b 1587 ushr v2.16b, v0.16b, 4 1588 and v0.16b, v0.16b, v31.16b 1589 tbl v0.16b, {v28.16b}, v0.16b 1590 tbl v2.16b, {v27.16b}, v2.16b 1591 eor v0.16b, v0.16b, v2.16b 1592 eor v1.16b, v1.16b, v1.16b 1593 aese v0.16b,v1.16b 1594 ushr v2.16b, v0.16b, 4 1595 and v0.16b, v0.16b, v31.16b 1596 tbl v0.16b, {v30.16b}, v0.16b 1597 tbl v2.16b, {v29.16b}, v2.16b 1598 eor v0.16b, v0.16b, v2.16b 1599 1600 mov w7,v0.s[0] 1601 eor w6,w7,w7,ror #32-2 1602 
eor w6,w6,w7,ror #32-10 1603 eor w6,w6,w7,ror #32-18 1604 eor w6,w6,w7,ror #32-24 1605 eor w12,w12,w6 1606 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1607 eor w6,w14,w15 1608 eor w9,w12,w8 1609 eor w6,w6,w9 1610 mov v3.s[0],w6 1611 // optimize sbox using AESE instruction 1612 tbl v0.16b, {v3.16b}, v26.16b 1613 ushr v2.16b, v0.16b, 4 1614 and v0.16b, v0.16b, v31.16b 1615 tbl v0.16b, {v28.16b}, v0.16b 1616 tbl v2.16b, {v27.16b}, v2.16b 1617 eor v0.16b, v0.16b, v2.16b 1618 eor v1.16b, v1.16b, v1.16b 1619 aese v0.16b,v1.16b 1620 ushr v2.16b, v0.16b, 4 1621 and v0.16b, v0.16b, v31.16b 1622 tbl v0.16b, {v30.16b}, v0.16b 1623 tbl v2.16b, {v29.16b}, v2.16b 1624 eor v0.16b, v0.16b, v2.16b 1625 1626 mov w7,v0.s[0] 1627 eor w6,w7,w7,ror #32-2 1628 eor w6,w6,w7,ror #32-10 1629 eor w6,w6,w7,ror #32-18 1630 eor w6,w6,w7,ror #32-24 1631 ldp w7,w8,[x10],8 1632 eor w13,w13,w6 1633 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1634 eor w6,w12,w13 1635 eor w9,w7,w15 1636 eor w6,w6,w9 1637 mov v3.s[0],w6 1638 // optimize sbox using AESE instruction 1639 tbl v0.16b, {v3.16b}, v26.16b 1640 ushr v2.16b, v0.16b, 4 1641 and v0.16b, v0.16b, v31.16b 1642 tbl v0.16b, {v28.16b}, v0.16b 1643 tbl v2.16b, {v27.16b}, v2.16b 1644 eor v0.16b, v0.16b, v2.16b 1645 eor v1.16b, v1.16b, v1.16b 1646 aese v0.16b,v1.16b 1647 ushr v2.16b, v0.16b, 4 1648 and v0.16b, v0.16b, v31.16b 1649 tbl v0.16b, {v30.16b}, v0.16b 1650 tbl v2.16b, {v29.16b}, v2.16b 1651 eor v0.16b, v0.16b, v2.16b 1652 1653 mov w7,v0.s[0] 1654 eor w6,w7,w7,ror #32-2 1655 eor w6,w6,w7,ror #32-10 1656 eor w6,w6,w7,ror #32-18 1657 eor w6,w6,w7,ror #32-24 1658 eor w14,w14,w6 1659 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1660 eor w6,w12,w13 1661 eor w9,w14,w8 1662 eor w6,w6,w9 1663 mov v3.s[0],w6 1664 // optimize sbox using AESE instruction 1665 tbl v0.16b, {v3.16b}, v26.16b 1666 ushr v2.16b, v0.16b, 4 1667 and v0.16b, v0.16b, v31.16b 1668 tbl v0.16b, {v28.16b}, v0.16b 1669 tbl v2.16b, {v27.16b}, v2.16b 1670 eor v0.16b, v0.16b, v2.16b 1671 eor v1.16b, v1.16b, v1.16b 1672 
aese v0.16b,v1.16b 1673 ushr v2.16b, v0.16b, 4 1674 and v0.16b, v0.16b, v31.16b 1675 tbl v0.16b, {v30.16b}, v0.16b 1676 tbl v2.16b, {v29.16b}, v2.16b 1677 eor v0.16b, v0.16b, v2.16b 1678 1679 mov w7,v0.s[0] 1680 eor w6,w7,w7,ror #32-2 1681 eor w6,w6,w7,ror #32-10 1682 eor w6,w6,w7,ror #32-18 1683 eor w6,w6,w7,ror #32-24 1684 eor w15,w15,w6 1685 subs w11,w11,#1 1686 b.ne 10b 1687 mov v7.s[0],w15 1688 mov v7.s[1],w14 1689 mov v7.s[2],w13 1690 mov v7.s[3],w12 1691 #ifndef __AARCH64EB__ 1692 rev32 v6.16b,v6.16b 1693 #endif 1694 #ifndef __AARCH64EB__ 1695 rev32 v7.16b,v7.16b 1696 #endif 1697 orr v3.16b,v7.16b,v7.16b 1698 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 1699 subs w2,w2,#4 1700 b.ne .Lcbc_4_blocks_enc 1701 b 2f 1702 1: 1703 subs w2,w2,#1 1704 b.lt 2f 1705 ld1 {v4.4s},[x0],#16 1706 eor v3.16b,v3.16b,v4.16b 1707 #ifndef __AARCH64EB__ 1708 rev32 v3.16b,v3.16b 1709 #endif 1710 mov x10,x3 1711 mov w11,#8 1712 mov w12,v3.s[0] 1713 mov w13,v3.s[1] 1714 mov w14,v3.s[2] 1715 mov w15,v3.s[3] 1716 10: 1717 ldp w7,w8,[x10],8 1718 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1719 eor w6,w14,w15 1720 eor w9,w7,w13 1721 eor w6,w6,w9 1722 mov v3.s[0],w6 1723 // optimize sbox using AESE instruction 1724 tbl v0.16b, {v3.16b}, v26.16b 1725 ushr v2.16b, v0.16b, 4 1726 and v0.16b, v0.16b, v31.16b 1727 tbl v0.16b, {v28.16b}, v0.16b 1728 tbl v2.16b, {v27.16b}, v2.16b 1729 eor v0.16b, v0.16b, v2.16b 1730 eor v1.16b, v1.16b, v1.16b 1731 aese v0.16b,v1.16b 1732 ushr v2.16b, v0.16b, 4 1733 and v0.16b, v0.16b, v31.16b 1734 tbl v0.16b, {v30.16b}, v0.16b 1735 tbl v2.16b, {v29.16b}, v2.16b 1736 eor v0.16b, v0.16b, v2.16b 1737 1738 mov w7,v0.s[0] 1739 eor w6,w7,w7,ror #32-2 1740 eor w6,w6,w7,ror #32-10 1741 eor w6,w6,w7,ror #32-18 1742 eor w6,w6,w7,ror #32-24 1743 eor w12,w12,w6 1744 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1745 eor w6,w14,w15 1746 eor w9,w12,w8 1747 eor w6,w6,w9 1748 mov v3.s[0],w6 1749 // optimize sbox using AESE instruction 1750 tbl v0.16b, {v3.16b}, v26.16b 1751 ushr v2.16b, v0.16b, 4 1752 and 
v0.16b, v0.16b, v31.16b 1753 tbl v0.16b, {v28.16b}, v0.16b 1754 tbl v2.16b, {v27.16b}, v2.16b 1755 eor v0.16b, v0.16b, v2.16b 1756 eor v1.16b, v1.16b, v1.16b 1757 aese v0.16b,v1.16b 1758 ushr v2.16b, v0.16b, 4 1759 and v0.16b, v0.16b, v31.16b 1760 tbl v0.16b, {v30.16b}, v0.16b 1761 tbl v2.16b, {v29.16b}, v2.16b 1762 eor v0.16b, v0.16b, v2.16b 1763 1764 mov w7,v0.s[0] 1765 eor w6,w7,w7,ror #32-2 1766 eor w6,w6,w7,ror #32-10 1767 eor w6,w6,w7,ror #32-18 1768 eor w6,w6,w7,ror #32-24 1769 ldp w7,w8,[x10],8 1770 eor w13,w13,w6 1771 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1772 eor w6,w12,w13 1773 eor w9,w7,w15 1774 eor w6,w6,w9 1775 mov v3.s[0],w6 1776 // optimize sbox using AESE instruction 1777 tbl v0.16b, {v3.16b}, v26.16b 1778 ushr v2.16b, v0.16b, 4 1779 and v0.16b, v0.16b, v31.16b 1780 tbl v0.16b, {v28.16b}, v0.16b 1781 tbl v2.16b, {v27.16b}, v2.16b 1782 eor v0.16b, v0.16b, v2.16b 1783 eor v1.16b, v1.16b, v1.16b 1784 aese v0.16b,v1.16b 1785 ushr v2.16b, v0.16b, 4 1786 and v0.16b, v0.16b, v31.16b 1787 tbl v0.16b, {v30.16b}, v0.16b 1788 tbl v2.16b, {v29.16b}, v2.16b 1789 eor v0.16b, v0.16b, v2.16b 1790 1791 mov w7,v0.s[0] 1792 eor w6,w7,w7,ror #32-2 1793 eor w6,w6,w7,ror #32-10 1794 eor w6,w6,w7,ror #32-18 1795 eor w6,w6,w7,ror #32-24 1796 eor w14,w14,w6 1797 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1798 eor w6,w12,w13 1799 eor w9,w14,w8 1800 eor w6,w6,w9 1801 mov v3.s[0],w6 1802 // optimize sbox using AESE instruction 1803 tbl v0.16b, {v3.16b}, v26.16b 1804 ushr v2.16b, v0.16b, 4 1805 and v0.16b, v0.16b, v31.16b 1806 tbl v0.16b, {v28.16b}, v0.16b 1807 tbl v2.16b, {v27.16b}, v2.16b 1808 eor v0.16b, v0.16b, v2.16b 1809 eor v1.16b, v1.16b, v1.16b 1810 aese v0.16b,v1.16b 1811 ushr v2.16b, v0.16b, 4 1812 and v0.16b, v0.16b, v31.16b 1813 tbl v0.16b, {v30.16b}, v0.16b 1814 tbl v2.16b, {v29.16b}, v2.16b 1815 eor v0.16b, v0.16b, v2.16b 1816 1817 mov w7,v0.s[0] 1818 eor w6,w7,w7,ror #32-2 1819 eor w6,w6,w7,ror #32-10 1820 eor w6,w6,w7,ror #32-18 1821 eor w6,w6,w7,ror #32-24 1822 eor 
w15,w15,w6 1823 subs w11,w11,#1 1824 b.ne 10b 1825 mov v3.s[0],w15 1826 mov v3.s[1],w14 1827 mov v3.s[2],w13 1828 mov v3.s[3],w12 1829 #ifndef __AARCH64EB__ 1830 rev32 v3.16b,v3.16b 1831 #endif 1832 st1 {v3.4s},[x1],#16 1833 b 1b 1834 2: 1835 // save back IV 1836 st1 {v3.4s},[x4] 1837 ret 1838 1839 .Ldec: 1840 // decryption mode starts 1841 AARCH64_SIGN_LINK_REGISTER 1842 stp d8,d9,[sp,#-80]! 1843 stp d10,d11,[sp,#16] 1844 stp d12,d13,[sp,#32] 1845 stp d14,d15,[sp,#48] 1846 stp x29,x30,[sp,#64] 1847 .Lcbc_8_blocks_dec: 1848 cmp w2,#8 1849 b.lt 1f 1850 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] 1851 add x10,x0,#64 1852 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10] 1853 #ifndef __AARCH64EB__ 1854 rev32 v4.16b,v4.16b 1855 #endif 1856 #ifndef __AARCH64EB__ 1857 rev32 v5.16b,v5.16b 1858 #endif 1859 #ifndef __AARCH64EB__ 1860 rev32 v6.16b,v6.16b 1861 #endif 1862 #ifndef __AARCH64EB__ 1863 rev32 v7.16b,v7.16b 1864 #endif 1865 #ifndef __AARCH64EB__ 1866 rev32 v8.16b,v8.16b 1867 #endif 1868 #ifndef __AARCH64EB__ 1869 rev32 v9.16b,v9.16b 1870 #endif 1871 #ifndef __AARCH64EB__ 1872 rev32 v10.16b,v10.16b 1873 #endif 1874 #ifndef __AARCH64EB__ 1875 rev32 v11.16b,v11.16b 1876 #endif 1877 bl _vpsm4_ex_enc_8blks 1878 zip1 v8.4s,v0.4s,v1.4s 1879 zip2 v9.4s,v0.4s,v1.4s 1880 zip1 v10.4s,v2.4s,v3.4s 1881 zip2 v11.4s,v2.4s,v3.4s 1882 zip1 v0.2d,v8.2d,v10.2d 1883 zip2 v1.2d,v8.2d,v10.2d 1884 zip1 v2.2d,v9.2d,v11.2d 1885 zip2 v3.2d,v9.2d,v11.2d 1886 zip1 v8.4s,v4.4s,v5.4s 1887 zip2 v9.4s,v4.4s,v5.4s 1888 zip1 v10.4s,v6.4s,v7.4s 1889 zip2 v11.4s,v6.4s,v7.4s 1890 zip1 v4.2d,v8.2d,v10.2d 1891 zip2 v5.2d,v8.2d,v10.2d 1892 zip1 v6.2d,v9.2d,v11.2d 1893 zip2 v7.2d,v9.2d,v11.2d 1894 ld1 {v15.4s},[x4] 1895 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 1896 // note ivec1 and vtmpx[3] are reusing the same register 1897 // care needs to be taken to avoid conflict 1898 eor v0.16b,v0.16b,v15.16b 1899 ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 1900 eor v1.16b,v1.16b,v8.16b 1901 eor v2.16b,v2.16b,v9.16b 1902 eor 
v3.16b,v3.16b,v10.16b 1903 // save back IV 1904 st1 {v15.4s}, [x4] 1905 eor v4.16b,v4.16b,v11.16b 1906 eor v5.16b,v5.16b,v12.16b 1907 eor v6.16b,v6.16b,v13.16b 1908 eor v7.16b,v7.16b,v14.16b 1909 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 1910 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 1911 subs w2,w2,#8 1912 b.gt .Lcbc_8_blocks_dec 1913 b.eq 100f 1914 1: 1915 ld1 {v15.4s},[x4] 1916 .Lcbc_4_blocks_dec: 1917 cmp w2,#4 1918 b.lt 1f 1919 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] 1920 #ifndef __AARCH64EB__ 1921 rev32 v4.16b,v4.16b 1922 #endif 1923 #ifndef __AARCH64EB__ 1924 rev32 v5.16b,v5.16b 1925 #endif 1926 #ifndef __AARCH64EB__ 1927 rev32 v6.16b,v6.16b 1928 #endif 1929 #ifndef __AARCH64EB__ 1930 rev32 v7.16b,v7.16b 1931 #endif 1932 bl _vpsm4_ex_enc_4blks 1933 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 1934 zip1 v8.4s,v0.4s,v1.4s 1935 zip2 v9.4s,v0.4s,v1.4s 1936 zip1 v10.4s,v2.4s,v3.4s 1937 zip2 v11.4s,v2.4s,v3.4s 1938 zip1 v0.2d,v8.2d,v10.2d 1939 zip2 v1.2d,v8.2d,v10.2d 1940 zip1 v2.2d,v9.2d,v11.2d 1941 zip2 v3.2d,v9.2d,v11.2d 1942 eor v0.16b,v0.16b,v15.16b 1943 eor v1.16b,v1.16b,v4.16b 1944 orr v15.16b,v7.16b,v7.16b 1945 eor v2.16b,v2.16b,v5.16b 1946 eor v3.16b,v3.16b,v6.16b 1947 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 1948 subs w2,w2,#4 1949 b.gt .Lcbc_4_blocks_dec 1950 // save back IV 1951 st1 {v7.4s}, [x4] 1952 b 100f 1953 1: // last block 1954 subs w2,w2,#1 1955 b.lt 100f 1956 b.gt 1f 1957 ld1 {v4.4s},[x0],#16 1958 // save back IV 1959 st1 {v4.4s}, [x4] 1960 #ifndef __AARCH64EB__ 1961 rev32 v8.16b,v4.16b 1962 #else 1963 mov v8.16b,v4.16b 1964 #endif 1965 mov x10,x3 1966 mov w11,#8 1967 mov w12,v8.s[0] 1968 mov w13,v8.s[1] 1969 mov w14,v8.s[2] 1970 mov w15,v8.s[3] 1971 10: 1972 ldp w7,w8,[x10],8 1973 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1974 eor w6,w14,w15 1975 eor w9,w7,w13 1976 eor w6,w6,w9 1977 mov v3.s[0],w6 1978 // optimize sbox using AESE instruction 1979 tbl v0.16b, {v3.16b}, v26.16b 1980 ushr v2.16b, v0.16b, 4 1981 and v0.16b, v0.16b, v31.16b 1982 tbl v0.16b, {v28.16b}, 
v0.16b 1983 tbl v2.16b, {v27.16b}, v2.16b 1984 eor v0.16b, v0.16b, v2.16b 1985 eor v1.16b, v1.16b, v1.16b 1986 aese v0.16b,v1.16b 1987 ushr v2.16b, v0.16b, 4 1988 and v0.16b, v0.16b, v31.16b 1989 tbl v0.16b, {v30.16b}, v0.16b 1990 tbl v2.16b, {v29.16b}, v2.16b 1991 eor v0.16b, v0.16b, v2.16b 1992 1993 mov w7,v0.s[0] 1994 eor w6,w7,w7,ror #32-2 1995 eor w6,w6,w7,ror #32-10 1996 eor w6,w6,w7,ror #32-18 1997 eor w6,w6,w7,ror #32-24 1998 eor w12,w12,w6 1999 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2000 eor w6,w14,w15 2001 eor w9,w12,w8 2002 eor w6,w6,w9 2003 mov v3.s[0],w6 2004 // optimize sbox using AESE instruction 2005 tbl v0.16b, {v3.16b}, v26.16b 2006 ushr v2.16b, v0.16b, 4 2007 and v0.16b, v0.16b, v31.16b 2008 tbl v0.16b, {v28.16b}, v0.16b 2009 tbl v2.16b, {v27.16b}, v2.16b 2010 eor v0.16b, v0.16b, v2.16b 2011 eor v1.16b, v1.16b, v1.16b 2012 aese v0.16b,v1.16b 2013 ushr v2.16b, v0.16b, 4 2014 and v0.16b, v0.16b, v31.16b 2015 tbl v0.16b, {v30.16b}, v0.16b 2016 tbl v2.16b, {v29.16b}, v2.16b 2017 eor v0.16b, v0.16b, v2.16b 2018 2019 mov w7,v0.s[0] 2020 eor w6,w7,w7,ror #32-2 2021 eor w6,w6,w7,ror #32-10 2022 eor w6,w6,w7,ror #32-18 2023 eor w6,w6,w7,ror #32-24 2024 ldp w7,w8,[x10],8 2025 eor w13,w13,w6 2026 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2027 eor w6,w12,w13 2028 eor w9,w7,w15 2029 eor w6,w6,w9 2030 mov v3.s[0],w6 2031 // optimize sbox using AESE instruction 2032 tbl v0.16b, {v3.16b}, v26.16b 2033 ushr v2.16b, v0.16b, 4 2034 and v0.16b, v0.16b, v31.16b 2035 tbl v0.16b, {v28.16b}, v0.16b 2036 tbl v2.16b, {v27.16b}, v2.16b 2037 eor v0.16b, v0.16b, v2.16b 2038 eor v1.16b, v1.16b, v1.16b 2039 aese v0.16b,v1.16b 2040 ushr v2.16b, v0.16b, 4 2041 and v0.16b, v0.16b, v31.16b 2042 tbl v0.16b, {v30.16b}, v0.16b 2043 tbl v2.16b, {v29.16b}, v2.16b 2044 eor v0.16b, v0.16b, v2.16b 2045 2046 mov w7,v0.s[0] 2047 eor w6,w7,w7,ror #32-2 2048 eor w6,w6,w7,ror #32-10 2049 eor w6,w6,w7,ror #32-18 2050 eor w6,w6,w7,ror #32-24 2051 eor w14,w14,w6 2052 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2053 eor 
w6,w12,w13 2054 eor w9,w14,w8 2055 eor w6,w6,w9 2056 mov v3.s[0],w6 2057 // optimize sbox using AESE instruction 2058 tbl v0.16b, {v3.16b}, v26.16b 2059 ushr v2.16b, v0.16b, 4 2060 and v0.16b, v0.16b, v31.16b 2061 tbl v0.16b, {v28.16b}, v0.16b 2062 tbl v2.16b, {v27.16b}, v2.16b 2063 eor v0.16b, v0.16b, v2.16b 2064 eor v1.16b, v1.16b, v1.16b 2065 aese v0.16b,v1.16b 2066 ushr v2.16b, v0.16b, 4 2067 and v0.16b, v0.16b, v31.16b 2068 tbl v0.16b, {v30.16b}, v0.16b 2069 tbl v2.16b, {v29.16b}, v2.16b 2070 eor v0.16b, v0.16b, v2.16b 2071 2072 mov w7,v0.s[0] 2073 eor w6,w7,w7,ror #32-2 2074 eor w6,w6,w7,ror #32-10 2075 eor w6,w6,w7,ror #32-18 2076 eor w6,w6,w7,ror #32-24 2077 eor w15,w15,w6 2078 subs w11,w11,#1 2079 b.ne 10b 2080 mov v8.s[0],w15 2081 mov v8.s[1],w14 2082 mov v8.s[2],w13 2083 mov v8.s[3],w12 2084 #ifndef __AARCH64EB__ 2085 rev32 v8.16b,v8.16b 2086 #endif 2087 eor v8.16b,v8.16b,v15.16b 2088 st1 {v8.4s},[x1],#16 2089 b 100f 2090 1: // last two blocks 2091 ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0] 2092 add x10,x0,#16 2093 ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16 2094 subs w2,w2,1 2095 b.gt 1f 2096 #ifndef __AARCH64EB__ 2097 rev32 v4.16b,v4.16b 2098 #endif 2099 #ifndef __AARCH64EB__ 2100 rev32 v5.16b,v5.16b 2101 #endif 2102 #ifndef __AARCH64EB__ 2103 rev32 v6.16b,v6.16b 2104 #endif 2105 #ifndef __AARCH64EB__ 2106 rev32 v7.16b,v7.16b 2107 #endif 2108 bl _vpsm4_ex_enc_4blks 2109 ld1 {v4.4s,v5.4s},[x0],#32 2110 zip1 v8.4s,v0.4s,v1.4s 2111 zip2 v9.4s,v0.4s,v1.4s 2112 zip1 v10.4s,v2.4s,v3.4s 2113 zip2 v11.4s,v2.4s,v3.4s 2114 zip1 v0.2d,v8.2d,v10.2d 2115 zip2 v1.2d,v8.2d,v10.2d 2116 zip1 v2.2d,v9.2d,v11.2d 2117 zip2 v3.2d,v9.2d,v11.2d 2118 eor v0.16b,v0.16b,v15.16b 2119 eor v1.16b,v1.16b,v4.16b 2120 st1 {v0.4s,v1.4s},[x1],#32 2121 // save back IV 2122 st1 {v5.4s}, [x4] 2123 b 100f 2124 1: // last 3 blocks 2125 ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10] 2126 #ifndef __AARCH64EB__ 2127 rev32 v4.16b,v4.16b 2128 #endif 2129 #ifndef __AARCH64EB__ 2130 rev32 v5.16b,v5.16b 2131 #endif 2132 
#ifndef __AARCH64EB__ 2133 rev32 v6.16b,v6.16b 2134 #endif 2135 #ifndef __AARCH64EB__ 2136 rev32 v7.16b,v7.16b 2137 #endif 2138 bl _vpsm4_ex_enc_4blks 2139 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 2140 zip1 v8.4s,v0.4s,v1.4s 2141 zip2 v9.4s,v0.4s,v1.4s 2142 zip1 v10.4s,v2.4s,v3.4s 2143 zip2 v11.4s,v2.4s,v3.4s 2144 zip1 v0.2d,v8.2d,v10.2d 2145 zip2 v1.2d,v8.2d,v10.2d 2146 zip1 v2.2d,v9.2d,v11.2d 2147 zip2 v3.2d,v9.2d,v11.2d 2148 eor v0.16b,v0.16b,v15.16b 2149 eor v1.16b,v1.16b,v4.16b 2150 eor v2.16b,v2.16b,v5.16b 2151 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 2152 // save back IV 2153 st1 {v6.4s}, [x4] 2154 100: 2155 ldp d10,d11,[sp,#16] 2156 ldp d12,d13,[sp,#32] 2157 ldp d14,d15,[sp,#48] 2158 ldp x29,x30,[sp,#64] 2159 ldp d8,d9,[sp],#80 2160 AARCH64_VALIDATE_LINK_REGISTER 2161 ret 2162 .size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt 2163 .globl vpsm4_ex_ctr32_encrypt_blocks 2164 .type vpsm4_ex_ctr32_encrypt_blocks,%function 2165 .align 5 2166 vpsm4_ex_ctr32_encrypt_blocks: 2167 AARCH64_VALID_CALL_TARGET 2168 ld1 {v3.4s},[x4] 2169 #ifndef __AARCH64EB__ 2170 rev32 v3.16b,v3.16b 2171 #endif 2172 adrp x9, .Lsbox_magic 2173 ldr q26, [x9, #:lo12:.Lsbox_magic] 2174 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 2175 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 2176 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 2177 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 2178 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 2179 cmp w2,#1 2180 b.ne 1f 2181 // fast processing for one single block without 2182 // context saving overhead 2183 mov x10,x3 2184 mov w11,#8 2185 mov w12,v3.s[0] 2186 mov w13,v3.s[1] 2187 mov w14,v3.s[2] 2188 mov w15,v3.s[3] 2189 10: 2190 ldp w7,w8,[x10],8 2191 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2192 eor w6,w14,w15 2193 eor w9,w7,w13 2194 eor w6,w6,w9 2195 mov v3.s[0],w6 2196 // optimize sbox using AESE instruction 2197 tbl v0.16b, {v3.16b}, v26.16b 2198 ushr v2.16b, v0.16b, 4 2199 and v0.16b, v0.16b, v31.16b 2200 tbl v0.16b, {v28.16b}, v0.16b 2201 tbl v2.16b, {v27.16b}, v2.16b 2202 eor v0.16b, v0.16b, v2.16b 2203 
eor v1.16b, v1.16b, v1.16b 2204 aese v0.16b,v1.16b 2205 ushr v2.16b, v0.16b, 4 2206 and v0.16b, v0.16b, v31.16b 2207 tbl v0.16b, {v30.16b}, v0.16b 2208 tbl v2.16b, {v29.16b}, v2.16b 2209 eor v0.16b, v0.16b, v2.16b 2210 2211 mov w7,v0.s[0] 2212 eor w6,w7,w7,ror #32-2 2213 eor w6,w6,w7,ror #32-10 2214 eor w6,w6,w7,ror #32-18 2215 eor w6,w6,w7,ror #32-24 2216 eor w12,w12,w6 2217 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2218 eor w6,w14,w15 2219 eor w9,w12,w8 2220 eor w6,w6,w9 2221 mov v3.s[0],w6 2222 // optimize sbox using AESE instruction 2223 tbl v0.16b, {v3.16b}, v26.16b 2224 ushr v2.16b, v0.16b, 4 2225 and v0.16b, v0.16b, v31.16b 2226 tbl v0.16b, {v28.16b}, v0.16b 2227 tbl v2.16b, {v27.16b}, v2.16b 2228 eor v0.16b, v0.16b, v2.16b 2229 eor v1.16b, v1.16b, v1.16b 2230 aese v0.16b,v1.16b 2231 ushr v2.16b, v0.16b, 4 2232 and v0.16b, v0.16b, v31.16b 2233 tbl v0.16b, {v30.16b}, v0.16b 2234 tbl v2.16b, {v29.16b}, v2.16b 2235 eor v0.16b, v0.16b, v2.16b 2236 2237 mov w7,v0.s[0] 2238 eor w6,w7,w7,ror #32-2 2239 eor w6,w6,w7,ror #32-10 2240 eor w6,w6,w7,ror #32-18 2241 eor w6,w6,w7,ror #32-24 2242 ldp w7,w8,[x10],8 2243 eor w13,w13,w6 2244 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2245 eor w6,w12,w13 2246 eor w9,w7,w15 2247 eor w6,w6,w9 2248 mov v3.s[0],w6 2249 // optimize sbox using AESE instruction 2250 tbl v0.16b, {v3.16b}, v26.16b 2251 ushr v2.16b, v0.16b, 4 2252 and v0.16b, v0.16b, v31.16b 2253 tbl v0.16b, {v28.16b}, v0.16b 2254 tbl v2.16b, {v27.16b}, v2.16b 2255 eor v0.16b, v0.16b, v2.16b 2256 eor v1.16b, v1.16b, v1.16b 2257 aese v0.16b,v1.16b 2258 ushr v2.16b, v0.16b, 4 2259 and v0.16b, v0.16b, v31.16b 2260 tbl v0.16b, {v30.16b}, v0.16b 2261 tbl v2.16b, {v29.16b}, v2.16b 2262 eor v0.16b, v0.16b, v2.16b 2263 2264 mov w7,v0.s[0] 2265 eor w6,w7,w7,ror #32-2 2266 eor w6,w6,w7,ror #32-10 2267 eor w6,w6,w7,ror #32-18 2268 eor w6,w6,w7,ror #32-24 2269 eor w14,w14,w6 2270 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2271 eor w6,w12,w13 2272 eor w9,w14,w8 2273 eor w6,w6,w9 2274 mov v3.s[0],w6 2275 // 
optimize sbox using AESE instruction 2276 tbl v0.16b, {v3.16b}, v26.16b 2277 ushr v2.16b, v0.16b, 4 2278 and v0.16b, v0.16b, v31.16b 2279 tbl v0.16b, {v28.16b}, v0.16b 2280 tbl v2.16b, {v27.16b}, v2.16b 2281 eor v0.16b, v0.16b, v2.16b 2282 eor v1.16b, v1.16b, v1.16b 2283 aese v0.16b,v1.16b 2284 ushr v2.16b, v0.16b, 4 2285 and v0.16b, v0.16b, v31.16b 2286 tbl v0.16b, {v30.16b}, v0.16b 2287 tbl v2.16b, {v29.16b}, v2.16b 2288 eor v0.16b, v0.16b, v2.16b 2289 2290 mov w7,v0.s[0] 2291 eor w6,w7,w7,ror #32-2 2292 eor w6,w6,w7,ror #32-10 2293 eor w6,w6,w7,ror #32-18 2294 eor w6,w6,w7,ror #32-24 2295 eor w15,w15,w6 2296 subs w11,w11,#1 2297 b.ne 10b 2298 mov v3.s[0],w15 2299 mov v3.s[1],w14 2300 mov v3.s[2],w13 2301 mov v3.s[3],w12 2302 #ifndef __AARCH64EB__ 2303 rev32 v3.16b,v3.16b 2304 #endif 2305 ld1 {v4.4s},[x0] 2306 eor v4.16b,v4.16b,v3.16b 2307 st1 {v4.4s},[x1] 2308 ret 2309 1: 2310 AARCH64_SIGN_LINK_REGISTER 2311 stp d8,d9,[sp,#-80]! 2312 stp d10,d11,[sp,#16] 2313 stp d12,d13,[sp,#32] 2314 stp d14,d15,[sp,#48] 2315 stp x29,x30,[sp,#64] 2316 mov w12,v3.s[0] 2317 mov w13,v3.s[1] 2318 mov w14,v3.s[2] 2319 mov w5,v3.s[3] 2320 .Lctr32_4_blocks_process: 2321 cmp w2,#4 2322 b.lt 1f 2323 dup v4.4s,w12 2324 dup v5.4s,w13 2325 dup v6.4s,w14 2326 mov v7.s[0],w5 2327 add w5,w5,#1 2328 mov v7.s[1],w5 2329 add w5,w5,#1 2330 mov v7.s[2],w5 2331 add w5,w5,#1 2332 mov v7.s[3],w5 2333 add w5,w5,#1 2334 cmp w2,#8 2335 b.ge .Lctr32_8_blocks_process 2336 bl _vpsm4_ex_enc_4blks 2337 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2338 eor v0.16b,v0.16b,v12.16b 2339 eor v1.16b,v1.16b,v13.16b 2340 eor v2.16b,v2.16b,v14.16b 2341 eor v3.16b,v3.16b,v15.16b 2342 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2343 subs w2,w2,#4 2344 b.ne .Lctr32_4_blocks_process 2345 b 100f 2346 .Lctr32_8_blocks_process: 2347 dup v8.4s,w12 2348 dup v9.4s,w13 2349 dup v10.4s,w14 2350 mov v11.s[0],w5 2351 add w5,w5,#1 2352 mov v11.s[1],w5 2353 add w5,w5,#1 2354 mov v11.s[2],w5 2355 add w5,w5,#1 2356 mov v11.s[3],w5 2357 add 
w5,w5,#1 2358 bl _vpsm4_ex_enc_8blks 2359 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2360 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 2361 eor v0.16b,v0.16b,v12.16b 2362 eor v1.16b,v1.16b,v13.16b 2363 eor v2.16b,v2.16b,v14.16b 2364 eor v3.16b,v3.16b,v15.16b 2365 eor v4.16b,v4.16b,v8.16b 2366 eor v5.16b,v5.16b,v9.16b 2367 eor v6.16b,v6.16b,v10.16b 2368 eor v7.16b,v7.16b,v11.16b 2369 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2370 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 2371 subs w2,w2,#8 2372 b.ne .Lctr32_4_blocks_process 2373 b 100f 2374 1: // last block processing 2375 subs w2,w2,#1 2376 b.lt 100f 2377 b.gt 1f 2378 mov v3.s[0],w12 2379 mov v3.s[1],w13 2380 mov v3.s[2],w14 2381 mov v3.s[3],w5 2382 mov x10,x3 2383 mov w11,#8 2384 mov w12,v3.s[0] 2385 mov w13,v3.s[1] 2386 mov w14,v3.s[2] 2387 mov w15,v3.s[3] 2388 10: 2389 ldp w7,w8,[x10],8 2390 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2391 eor w6,w14,w15 2392 eor w9,w7,w13 2393 eor w6,w6,w9 2394 mov v3.s[0],w6 2395 // optimize sbox using AESE instruction 2396 tbl v0.16b, {v3.16b}, v26.16b 2397 ushr v2.16b, v0.16b, 4 2398 and v0.16b, v0.16b, v31.16b 2399 tbl v0.16b, {v28.16b}, v0.16b 2400 tbl v2.16b, {v27.16b}, v2.16b 2401 eor v0.16b, v0.16b, v2.16b 2402 eor v1.16b, v1.16b, v1.16b 2403 aese v0.16b,v1.16b 2404 ushr v2.16b, v0.16b, 4 2405 and v0.16b, v0.16b, v31.16b 2406 tbl v0.16b, {v30.16b}, v0.16b 2407 tbl v2.16b, {v29.16b}, v2.16b 2408 eor v0.16b, v0.16b, v2.16b 2409 2410 mov w7,v0.s[0] 2411 eor w6,w7,w7,ror #32-2 2412 eor w6,w6,w7,ror #32-10 2413 eor w6,w6,w7,ror #32-18 2414 eor w6,w6,w7,ror #32-24 2415 eor w12,w12,w6 2416 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2417 eor w6,w14,w15 2418 eor w9,w12,w8 2419 eor w6,w6,w9 2420 mov v3.s[0],w6 2421 // optimize sbox using AESE instruction 2422 tbl v0.16b, {v3.16b}, v26.16b 2423 ushr v2.16b, v0.16b, 4 2424 and v0.16b, v0.16b, v31.16b 2425 tbl v0.16b, {v28.16b}, v0.16b 2426 tbl v2.16b, {v27.16b}, v2.16b 2427 eor v0.16b, v0.16b, v2.16b 2428 eor v1.16b, v1.16b, v1.16b 2429 aese v0.16b,v1.16b 2430 
ushr v2.16b, v0.16b, 4 2431 and v0.16b, v0.16b, v31.16b 2432 tbl v0.16b, {v30.16b}, v0.16b 2433 tbl v2.16b, {v29.16b}, v2.16b 2434 eor v0.16b, v0.16b, v2.16b 2435 2436 mov w7,v0.s[0] 2437 eor w6,w7,w7,ror #32-2 2438 eor w6,w6,w7,ror #32-10 2439 eor w6,w6,w7,ror #32-18 2440 eor w6,w6,w7,ror #32-24 2441 ldp w7,w8,[x10],8 2442 eor w13,w13,w6 2443 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2444 eor w6,w12,w13 2445 eor w9,w7,w15 2446 eor w6,w6,w9 2447 mov v3.s[0],w6 2448 // optimize sbox using AESE instruction 2449 tbl v0.16b, {v3.16b}, v26.16b 2450 ushr v2.16b, v0.16b, 4 2451 and v0.16b, v0.16b, v31.16b 2452 tbl v0.16b, {v28.16b}, v0.16b 2453 tbl v2.16b, {v27.16b}, v2.16b 2454 eor v0.16b, v0.16b, v2.16b 2455 eor v1.16b, v1.16b, v1.16b 2456 aese v0.16b,v1.16b 2457 ushr v2.16b, v0.16b, 4 2458 and v0.16b, v0.16b, v31.16b 2459 tbl v0.16b, {v30.16b}, v0.16b 2460 tbl v2.16b, {v29.16b}, v2.16b 2461 eor v0.16b, v0.16b, v2.16b 2462 2463 mov w7,v0.s[0] 2464 eor w6,w7,w7,ror #32-2 2465 eor w6,w6,w7,ror #32-10 2466 eor w6,w6,w7,ror #32-18 2467 eor w6,w6,w7,ror #32-24 2468 eor w14,w14,w6 2469 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2470 eor w6,w12,w13 2471 eor w9,w14,w8 2472 eor w6,w6,w9 2473 mov v3.s[0],w6 2474 // optimize sbox using AESE instruction 2475 tbl v0.16b, {v3.16b}, v26.16b 2476 ushr v2.16b, v0.16b, 4 2477 and v0.16b, v0.16b, v31.16b 2478 tbl v0.16b, {v28.16b}, v0.16b 2479 tbl v2.16b, {v27.16b}, v2.16b 2480 eor v0.16b, v0.16b, v2.16b 2481 eor v1.16b, v1.16b, v1.16b 2482 aese v0.16b,v1.16b 2483 ushr v2.16b, v0.16b, 4 2484 and v0.16b, v0.16b, v31.16b 2485 tbl v0.16b, {v30.16b}, v0.16b 2486 tbl v2.16b, {v29.16b}, v2.16b 2487 eor v0.16b, v0.16b, v2.16b 2488 2489 mov w7,v0.s[0] 2490 eor w6,w7,w7,ror #32-2 2491 eor w6,w6,w7,ror #32-10 2492 eor w6,w6,w7,ror #32-18 2493 eor w6,w6,w7,ror #32-24 2494 eor w15,w15,w6 2495 subs w11,w11,#1 2496 b.ne 10b 2497 mov v3.s[0],w15 2498 mov v3.s[1],w14 2499 mov v3.s[2],w13 2500 mov v3.s[3],w12 2501 #ifndef __AARCH64EB__ 2502 rev32 v3.16b,v3.16b 2503 #endif 
	// (tail of vpsm4_ex_ctr32_encrypt_blocks: flush the remaining 1/2/3
	// counter blocks, then restore callee-saved regs and return)
	ld1	{v4.4s},[x0]
	eor	v4.16b,v4.16b,v3.16b
	st1	{v4.4s},[x1]
	b	100f
1:	// last 2 blocks processing
	dup	v4.4s,w12
	dup	v5.4s,w13
	dup	v6.4s,w14
	mov	v7.s[0],w5
	add	w5,w5,#1
	mov	v7.s[1],w5
	subs	w2,w2,#1
	b.ne	1f
	bl	_vpsm4_ex_enc_4blks
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	b	100f
1:	// last 3 blocks processing
	add	w5,w5,#1
	mov	v7.s[2],w5
	bl	_vpsm4_ex_enc_4blks
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks

//------------------------------------------------------------------------
// vpsm4_ex_xts_encrypt_gb — SM4-XTS, "GB" tweak variant.
// Register usage, as read from the code below:
//   x0 = input pointer, x1 = output pointer, x2 = length in bytes
//   x3 = round-key schedule used for the data blocks (saved in x26)
//   x4 = round-key schedule used to encrypt the tweak (saved in x27)
//   x5 = pointer to the 16-byte initial tweak/IV (loaded into v16)
//   w6 = direction flag (saved in w28; compared against 1 at .check_dec_gb,
//        where the code comments say "encryption:1 decryption:0")
// Unlike the plain XTS routine later in the file, every GF(2^128) tweak
// multiplication here is performed on bit-reversed data (rbit before and
// after each doubling) — the GB/T-style XTS variant; TODO confirm against
// the generating perlasm script.
// Clobbers v0-v25 working vectors; v26-v31 hold the .Lsbox_magic affine
// constants for the AESE-based SM4 S-box.
//------------------------------------------------------------------------
.globl	vpsm4_ex_xts_encrypt_gb
.type	vpsm4_ex_xts_encrypt_gb,%function
.align	5
vpsm4_ex_xts_encrypt_gb:
	AARCH64_SIGN_LINK_REGISTER
	// Save all callee-saved GPR pairs and the low halves of v8-v15;
	// the tweak pipeline below occupies x12-x29.
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	mov	x26,x3
	mov	x27,x4
	mov	w28,w6
	ld1	{v16.4s}, [x5]
	// Select the tweak key schedule (x4) for the initial tweak encryption.
	mov	x3,x27
	// Load the six S-box affine-transform constants into v26-v31.
	adrp	x9, .Lsbox_magic
	ldr	q26, [x9, #:lo12:.Lsbox_magic]
	ldr	q27, [x9, #:lo12:.Lsbox_magic+16]
	ldr	q28, [x9, #:lo12:.Lsbox_magic+32]
	ldr	q29, [x9, #:lo12:.Lsbox_magic+48]
	ldr	q30, [x9, #:lo12:.Lsbox_magic+64]
	ldr	q31, [x9, #:lo12:.Lsbox_magic+80]
#ifndef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	// Encrypt the initial tweak in v16 with one full SM4 pass:
	// 8 loop iterations x 4 rounds, state words B0..B3 in w12..w15,
	// x10 walking the round keys.
	mov	x10,x3
	mov	w11,#8
	mov	w12,v16.s[0]
	mov	w13,v16.s[1]
	mov	w14,v16.s[2]
	mov	w15,v16.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	// Linear transform L: x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24)
	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// Write the encrypted tweak back to v16 (output words reversed,
	// per SM4's final word swap).
	mov	v16.s[0],w15
	mov	v16.s[1],w14
	mov	v16.s[2],w13
	mov	v16.s[3],w12
#ifndef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	// Switch back to the data key schedule; x29 = residual bytes (< 16).
	mov	x3,x26
	and	x29,x2,#0x0F
	// convert length into blocks
	lsr	x2,x2,4
	cmp	x2,#1
	b.lt	.return_gb

	cmp	x29,0
	// If the encryption/decryption Length is N times of 16,
	// the all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
	b.eq	.xts_encrypt_blocks_gb

	// If the encryption/decryption length is not N times of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb
	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
	subs	x2,x2,#1
	b.eq	.only_2blks_tweak_gb
.xts_encrypt_blocks_gb:
	// GB variant: bit-reverse the tweak before doubling in GF(2^128).
	rbit	v16.16b,v16.16b
#ifdef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	// Precompute 8 consecutive tweaks in GPR pairs:
	// x12:x13, x14:x15, ..., x26:x27. Each 5-instruction group computes
	// tweak*x mod the XTS polynomial (0x87), on 64-bit halves.
	mov	x12,v16.d[0]
	mov	x13,v16.d[1]
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
.Lxts_8_blocks_process_gb:
	// Move the 8 precomputed tweaks into v16-v23 while, interleaved,
	// computing the NEXT 8 tweaks back into x12-x27.
	cmp	x2,#8
	mov	v16.d[0],x12
	mov	v16.d[1],x13
#ifdef __AARCH64EB__
	rev32	v16.16b,v16.16b
#endif
	mov	w7,0x87
	extr	x9,x27,x27,#32
	extr	x13,x27,x26,#63
	and	w8,w7,w9,asr#31
	eor	x12,x8,x26,lsl#1
	mov	v17.d[0],x14
	mov	v17.d[1],x15
#ifdef __AARCH64EB__
	rev32	v17.16b,v17.16b
#endif
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	v18.d[0],x16
	mov	v18.d[1],x17
#ifdef __AARCH64EB__
	rev32	v18.16b,v18.16b
#endif
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	v19.d[0],x18
	mov	v19.d[1],x19
#ifdef __AARCH64EB__
	rev32	v19.16b,v19.16b
#endif
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	v20.d[0],x20
	mov	v20.d[1],x21
#ifdef __AARCH64EB__
	rev32	v20.16b,v20.16b
#endif
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	v21.d[0],x22
	mov	v21.d[1],x23
#ifdef __AARCH64EB__
	rev32	v21.16b,v21.16b
#endif
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	v22.d[0],x24
	mov	v22.d[1],x25
#ifdef __AARCH64EB__
	rev32	v22.16b,v22.16b
#endif
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	v23.d[0],x26
	mov	v23.d[1],x27
#ifdef __AARCH64EB__
	rev32	v23.16b,v23.16b
#endif
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
	b.lt	.Lxts_4_blocks_process_gb
	// Main path: 8 blocks per iteration. Un-bit-reverse the tweaks
	// (rbit) before XORing into the plaintext.
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	rbit	v19.16b,v19.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
	eor	v7.16b, v7.16b, v19.16b
	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	rbit	v20.16b,v20.16b
	rbit	v21.16b,v21.16b
	rbit	v22.16b,v22.16b
	rbit	v23.16b,v23.16b
	eor	v8.16b, v8.16b, v20.16b
	eor	v9.16b, v9.16b, v21.16b
	eor	v10.16b, v10.16b, v22.16b
	eor	v11.16b, v11.16b, v23.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	// 4x4 word transpose into the lane-parallel layout expected by
	// _vpsm4_ex_enc_8blks.
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	zip1	v0.4s,v8.4s,v9.4s
	zip2	v1.4s,v8.4s,v9.4s
	zip1	v2.4s,v10.4s,v11.4s
	zip2	v3.4s,v10.4s,v11.4s
	zip1	v8.2d,v0.2d,v2.2d
	zip2	v9.2d,v0.2d,v2.2d
	zip1	v10.2d,v1.2d,v3.2d
	zip2	v11.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_8blks
	// Transpose back to per-block layout.
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	zip1	v8.4s,v4.4s,v5.4s
	zip2	v9.4s,v4.4s,v5.4s
	zip1	v10.4s,v6.4s,v7.4s
	zip2	v11.4s,v6.4s,v7.4s
	zip1	v4.2d,v8.2d,v10.2d
	zip2	v5.2d,v8.2d,v10.2d
	zip1	v6.2d,v9.2d,v11.2d
	zip2	v7.2d,v9.2d,v11.2d
	// Second tweak XOR (XTS: C = E(P ^ T) ^ T).
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v6.16b, v6.16b, v22.16b
	eor	v7.16b, v7.16b, v23.16b

	// save the last tweak
	mov	v25.16b,v23.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	x2,x2,#8
	b.gt	.Lxts_8_blocks_process_gb
	b	100f
.Lxts_4_blocks_process_gb:
	// Tail paths: 4 blocks at once, then 1/2/3 leftover blocks.
	cmp	x2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	rbit	v19.16b,v19.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
	eor	v7.16b, v7.16b, v19.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	x2,x2,#4
	// Shift the unused tweaks v20-v22 down for the <=3 remaining blocks.
	mov	v16.16b,v20.16b
	mov	v17.16b,v21.16b
	mov	v18.16b,v22.16b
	// save the last tweak
	mov	v25.16b,v19.16b
1:
	// process last block
	cmp	x2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0],#16
	rbit	v16.16b,v16.16b
	eor	v4.16b, v4.16b, v16.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	// Single block: inline scalar SM4 (8 x 4 rounds) on v4.
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v16.16b
	st1	{v4.4s},[x1],#16
	// save the last tweak
	mov	v25.16b,v16.16b
	b	100f
1:	// process last 2 blocks
	cmp	x2,#2
	b.gt	1f
	ld1	{v4.4s,v5.4s},[x0],#32
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save the last tweak
	mov	v25.16b,v17.16b
	b	100f
1:	// process last 3 blocks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_ex_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save the last tweak
	mov	v25.16b,v18.16b
100:
	// Done with whole blocks; x29 != 0 means ciphertext stealing follows.
	cmp	x29,0
	b.eq	.return_gb

	// This branch calculates the last two tweaks,
	// while the encryption/decryption length is larger than 32
.last_2blks_tweak_gb:
#ifdef __AARCH64EB__
	rev32	v25.16b,v25.16b
#endif
	// v17 = v25 * x, v18 = v17 * x (doublings done on bit-reversed data,
	// using the .Lxts_magic constant in v0).
	rbit	v2.16b,v25.16b
	adrp	x9, .Lxts_magic
	ldr	q0, [x9, #:lo12:.Lxts_magic]
	shl	v17.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	rbit	v17.16b,v17.16b
	rbit	v2.16b,v17.16b
	adrp	x9, .Lxts_magic
	ldr	q0, [x9, #:lo12:.Lxts_magic]
	shl	v18.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v18.16b, v18.16b, v1.16b
	rbit	v18.16b,v18.16b
	b	.check_dec_gb


	// This branch calculates the last two tweaks,
	// while the encryption/decryption length is equal to 32, who only need two tweaks
.only_2blks_tweak_gb:
	mov	v17.16b,v16.16b
#ifdef __AARCH64EB__
	rev32	v17.16b,v17.16b
#endif
	rbit	v2.16b,v17.16b
	adrp	x9, .Lxts_magic
	ldr	q0, [x9, #:lo12:.Lxts_magic]
	shl	v18.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v18.16b, v18.16b, v1.16b
	rbit	v18.16b,v18.16b
	b	.check_dec_gb


	// Determine whether encryption or decryption is required.
	// The last two tweaks need to be swapped for decryption.
.check_dec_gb:
	// encryption:1 decryption:0
	cmp	w28,1
	b.eq	.process_last_2blks_gb
	mov	v0.16B,v17.16b
	mov	v17.16B,v18.16b
	mov	v18.16B,v0.16b

.process_last_2blks_gb:
#ifdef __AARCH64EB__
	rev32	v17.16b,v17.16b
#endif
#ifdef __AARCH64EB__
	rev32	v18.16b,v18.16b
#endif
	// Penultimate block: full block cipher under tweak v17 (inline SM4).
	ld1	{v4.4s},[x0],#16
	eor	v4.16b, v4.16b, v17.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v17.16b
	st1	{v4.4s},[x1],#16

	// Ciphertext stealing: x26 points at the just-written block; swap the
	// x29 trailing input bytes with that block's leading bytes, byte by byte.
	sub	x26,x1,16
.loop_gb:
	subs	x29,x29,1
	ldrb	w7,[x26,x29]
	ldrb	w8,[x0,x29]
	strb	w8,[x26,x29]
	strb	w7,[x1,x29]
	b.gt	.loop_gb
	// Encrypt the reassembled (stolen) block under tweak v18, writing it
	// back over the penultimate output block.
	ld1	{v4.4s}, [x26]
	eor	v4.16b, v4.16b, v18.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v18.16b
	st1	{v4.4s}, [x26]
.return_gb:
	// Restore saved registers in reverse order of the prologue.
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
	ldp	x17, x18, [sp], #0x10
	ldp	x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb

// (head of vpsm4_ex_xts_encrypt, the standard-XTS variant; body continues
// beyond this chunk)
.globl	vpsm4_ex_xts_encrypt
.type	vpsm4_ex_xts_encrypt,%function
.align	5
vpsm4_ex_xts_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
3571 mov x26,x3 3572 mov x27,x4 3573 mov w28,w6 3574 ld1 {v16.4s}, [x5] 3575 mov x3,x27 3576 adrp x9, .Lsbox_magic 3577 ldr q26, [x9, #:lo12:.Lsbox_magic] 3578 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 3579 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 3580 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 3581 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 3582 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 3583 #ifndef __AARCH64EB__ 3584 rev32 v16.16b,v16.16b 3585 #endif 3586 mov x10,x3 3587 mov w11,#8 3588 mov w12,v16.s[0] 3589 mov w13,v16.s[1] 3590 mov w14,v16.s[2] 3591 mov w15,v16.s[3] 3592 10: 3593 ldp w7,w8,[x10],8 3594 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3595 eor w6,w14,w15 3596 eor w9,w7,w13 3597 eor w6,w6,w9 3598 mov v3.s[0],w6 3599 // optimize sbox using AESE instruction 3600 tbl v0.16b, {v3.16b}, v26.16b 3601 ushr v2.16b, v0.16b, 4 3602 and v0.16b, v0.16b, v31.16b 3603 tbl v0.16b, {v28.16b}, v0.16b 3604 tbl v2.16b, {v27.16b}, v2.16b 3605 eor v0.16b, v0.16b, v2.16b 3606 eor v1.16b, v1.16b, v1.16b 3607 aese v0.16b,v1.16b 3608 ushr v2.16b, v0.16b, 4 3609 and v0.16b, v0.16b, v31.16b 3610 tbl v0.16b, {v30.16b}, v0.16b 3611 tbl v2.16b, {v29.16b}, v2.16b 3612 eor v0.16b, v0.16b, v2.16b 3613 3614 mov w7,v0.s[0] 3615 eor w6,w7,w7,ror #32-2 3616 eor w6,w6,w7,ror #32-10 3617 eor w6,w6,w7,ror #32-18 3618 eor w6,w6,w7,ror #32-24 3619 eor w12,w12,w6 3620 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3621 eor w6,w14,w15 3622 eor w9,w12,w8 3623 eor w6,w6,w9 3624 mov v3.s[0],w6 3625 // optimize sbox using AESE instruction 3626 tbl v0.16b, {v3.16b}, v26.16b 3627 ushr v2.16b, v0.16b, 4 3628 and v0.16b, v0.16b, v31.16b 3629 tbl v0.16b, {v28.16b}, v0.16b 3630 tbl v2.16b, {v27.16b}, v2.16b 3631 eor v0.16b, v0.16b, v2.16b 3632 eor v1.16b, v1.16b, v1.16b 3633 aese v0.16b,v1.16b 3634 ushr v2.16b, v0.16b, 4 3635 and v0.16b, v0.16b, v31.16b 3636 tbl v0.16b, {v30.16b}, v0.16b 3637 tbl v2.16b, {v29.16b}, v2.16b 3638 eor v0.16b, v0.16b, v2.16b 3639 3640 mov w7,v0.s[0] 3641 eor w6,w7,w7,ror #32-2 3642 eor w6,w6,w7,ror #32-10 3643 eor 
w6,w6,w7,ror #32-18 3644 eor w6,w6,w7,ror #32-24 3645 ldp w7,w8,[x10],8 3646 eor w13,w13,w6 3647 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3648 eor w6,w12,w13 3649 eor w9,w7,w15 3650 eor w6,w6,w9 3651 mov v3.s[0],w6 3652 // optimize sbox using AESE instruction 3653 tbl v0.16b, {v3.16b}, v26.16b 3654 ushr v2.16b, v0.16b, 4 3655 and v0.16b, v0.16b, v31.16b 3656 tbl v0.16b, {v28.16b}, v0.16b 3657 tbl v2.16b, {v27.16b}, v2.16b 3658 eor v0.16b, v0.16b, v2.16b 3659 eor v1.16b, v1.16b, v1.16b 3660 aese v0.16b,v1.16b 3661 ushr v2.16b, v0.16b, 4 3662 and v0.16b, v0.16b, v31.16b 3663 tbl v0.16b, {v30.16b}, v0.16b 3664 tbl v2.16b, {v29.16b}, v2.16b 3665 eor v0.16b, v0.16b, v2.16b 3666 3667 mov w7,v0.s[0] 3668 eor w6,w7,w7,ror #32-2 3669 eor w6,w6,w7,ror #32-10 3670 eor w6,w6,w7,ror #32-18 3671 eor w6,w6,w7,ror #32-24 3672 eor w14,w14,w6 3673 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 3674 eor w6,w12,w13 3675 eor w9,w14,w8 3676 eor w6,w6,w9 3677 mov v3.s[0],w6 3678 // optimize sbox using AESE instruction 3679 tbl v0.16b, {v3.16b}, v26.16b 3680 ushr v2.16b, v0.16b, 4 3681 and v0.16b, v0.16b, v31.16b 3682 tbl v0.16b, {v28.16b}, v0.16b 3683 tbl v2.16b, {v27.16b}, v2.16b 3684 eor v0.16b, v0.16b, v2.16b 3685 eor v1.16b, v1.16b, v1.16b 3686 aese v0.16b,v1.16b 3687 ushr v2.16b, v0.16b, 4 3688 and v0.16b, v0.16b, v31.16b 3689 tbl v0.16b, {v30.16b}, v0.16b 3690 tbl v2.16b, {v29.16b}, v2.16b 3691 eor v0.16b, v0.16b, v2.16b 3692 3693 mov w7,v0.s[0] 3694 eor w6,w7,w7,ror #32-2 3695 eor w6,w6,w7,ror #32-10 3696 eor w6,w6,w7,ror #32-18 3697 eor w6,w6,w7,ror #32-24 3698 eor w15,w15,w6 3699 subs w11,w11,#1 3700 b.ne 10b 3701 mov v16.s[0],w15 3702 mov v16.s[1],w14 3703 mov v16.s[2],w13 3704 mov v16.s[3],w12 3705 #ifndef __AARCH64EB__ 3706 rev32 v16.16b,v16.16b 3707 #endif 3708 mov x3,x26 3709 and x29,x2,#0x0F 3710 // convert length into blocks 3711 lsr x2,x2,4 3712 cmp x2,#1 3713 b.lt .return 3714 3715 cmp x29,0 3716 // If the encryption/decryption Length is N times of 16, 3717 // the all blocks are 
encrypted/decrypted in .xts_encrypt_blocks 3718 b.eq .xts_encrypt_blocks 3719 3720 // If the encryption/decryption length is not N times of 16, 3721 // the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak 3722 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks 3723 subs x2,x2,#1 3724 b.eq .only_2blks_tweak 3725 .xts_encrypt_blocks: 3726 #ifdef __AARCH64EB__ 3727 rev32 v16.16b,v16.16b 3728 #endif 3729 mov x12,v16.d[0] 3730 mov x13,v16.d[1] 3731 mov w7,0x87 3732 extr x9,x13,x13,#32 3733 extr x15,x13,x12,#63 3734 and w8,w7,w9,asr#31 3735 eor x14,x8,x12,lsl#1 3736 mov w7,0x87 3737 extr x9,x15,x15,#32 3738 extr x17,x15,x14,#63 3739 and w8,w7,w9,asr#31 3740 eor x16,x8,x14,lsl#1 3741 mov w7,0x87 3742 extr x9,x17,x17,#32 3743 extr x19,x17,x16,#63 3744 and w8,w7,w9,asr#31 3745 eor x18,x8,x16,lsl#1 3746 mov w7,0x87 3747 extr x9,x19,x19,#32 3748 extr x21,x19,x18,#63 3749 and w8,w7,w9,asr#31 3750 eor x20,x8,x18,lsl#1 3751 mov w7,0x87 3752 extr x9,x21,x21,#32 3753 extr x23,x21,x20,#63 3754 and w8,w7,w9,asr#31 3755 eor x22,x8,x20,lsl#1 3756 mov w7,0x87 3757 extr x9,x23,x23,#32 3758 extr x25,x23,x22,#63 3759 and w8,w7,w9,asr#31 3760 eor x24,x8,x22,lsl#1 3761 mov w7,0x87 3762 extr x9,x25,x25,#32 3763 extr x27,x25,x24,#63 3764 and w8,w7,w9,asr#31 3765 eor x26,x8,x24,lsl#1 3766 .Lxts_8_blocks_process: 3767 cmp x2,#8 3768 mov v16.d[0],x12 3769 mov v16.d[1],x13 3770 #ifdef __AARCH64EB__ 3771 rev32 v16.16b,v16.16b 3772 #endif 3773 mov w7,0x87 3774 extr x9,x27,x27,#32 3775 extr x13,x27,x26,#63 3776 and w8,w7,w9,asr#31 3777 eor x12,x8,x26,lsl#1 3778 mov v17.d[0],x14 3779 mov v17.d[1],x15 3780 #ifdef __AARCH64EB__ 3781 rev32 v17.16b,v17.16b 3782 #endif 3783 mov w7,0x87 3784 extr x9,x13,x13,#32 3785 extr x15,x13,x12,#63 3786 and w8,w7,w9,asr#31 3787 eor x14,x8,x12,lsl#1 3788 mov v18.d[0],x16 3789 mov v18.d[1],x17 3790 #ifdef __AARCH64EB__ 3791 rev32 v18.16b,v18.16b 3792 #endif 3793 mov w7,0x87 3794 extr x9,x15,x15,#32 3795 extr 
x17,x15,x14,#63 3796 and w8,w7,w9,asr#31 3797 eor x16,x8,x14,lsl#1 3798 mov v19.d[0],x18 3799 mov v19.d[1],x19 3800 #ifdef __AARCH64EB__ 3801 rev32 v19.16b,v19.16b 3802 #endif 3803 mov w7,0x87 3804 extr x9,x17,x17,#32 3805 extr x19,x17,x16,#63 3806 and w8,w7,w9,asr#31 3807 eor x18,x8,x16,lsl#1 3808 mov v20.d[0],x20 3809 mov v20.d[1],x21 3810 #ifdef __AARCH64EB__ 3811 rev32 v20.16b,v20.16b 3812 #endif 3813 mov w7,0x87 3814 extr x9,x19,x19,#32 3815 extr x21,x19,x18,#63 3816 and w8,w7,w9,asr#31 3817 eor x20,x8,x18,lsl#1 3818 mov v21.d[0],x22 3819 mov v21.d[1],x23 3820 #ifdef __AARCH64EB__ 3821 rev32 v21.16b,v21.16b 3822 #endif 3823 mov w7,0x87 3824 extr x9,x21,x21,#32 3825 extr x23,x21,x20,#63 3826 and w8,w7,w9,asr#31 3827 eor x22,x8,x20,lsl#1 3828 mov v22.d[0],x24 3829 mov v22.d[1],x25 3830 #ifdef __AARCH64EB__ 3831 rev32 v22.16b,v22.16b 3832 #endif 3833 mov w7,0x87 3834 extr x9,x23,x23,#32 3835 extr x25,x23,x22,#63 3836 and w8,w7,w9,asr#31 3837 eor x24,x8,x22,lsl#1 3838 mov v23.d[0],x26 3839 mov v23.d[1],x27 3840 #ifdef __AARCH64EB__ 3841 rev32 v23.16b,v23.16b 3842 #endif 3843 mov w7,0x87 3844 extr x9,x25,x25,#32 3845 extr x27,x25,x24,#63 3846 and w8,w7,w9,asr#31 3847 eor x26,x8,x24,lsl#1 3848 b.lt .Lxts_4_blocks_process 3849 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 3850 eor v4.16b, v4.16b, v16.16b 3851 eor v5.16b, v5.16b, v17.16b 3852 eor v6.16b, v6.16b, v18.16b 3853 eor v7.16b, v7.16b, v19.16b 3854 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 3855 eor v8.16b, v8.16b, v20.16b 3856 eor v9.16b, v9.16b, v21.16b 3857 eor v10.16b, v10.16b, v22.16b 3858 eor v11.16b, v11.16b, v23.16b 3859 #ifndef __AARCH64EB__ 3860 rev32 v4.16b,v4.16b 3861 #endif 3862 #ifndef __AARCH64EB__ 3863 rev32 v5.16b,v5.16b 3864 #endif 3865 #ifndef __AARCH64EB__ 3866 rev32 v6.16b,v6.16b 3867 #endif 3868 #ifndef __AARCH64EB__ 3869 rev32 v7.16b,v7.16b 3870 #endif 3871 #ifndef __AARCH64EB__ 3872 rev32 v8.16b,v8.16b 3873 #endif 3874 #ifndef __AARCH64EB__ 3875 rev32 v9.16b,v9.16b 3876 #endif 3877 #ifndef 
__AARCH64EB__ 3878 rev32 v10.16b,v10.16b 3879 #endif 3880 #ifndef __AARCH64EB__ 3881 rev32 v11.16b,v11.16b 3882 #endif 3883 zip1 v0.4s,v4.4s,v5.4s 3884 zip2 v1.4s,v4.4s,v5.4s 3885 zip1 v2.4s,v6.4s,v7.4s 3886 zip2 v3.4s,v6.4s,v7.4s 3887 zip1 v4.2d,v0.2d,v2.2d 3888 zip2 v5.2d,v0.2d,v2.2d 3889 zip1 v6.2d,v1.2d,v3.2d 3890 zip2 v7.2d,v1.2d,v3.2d 3891 zip1 v0.4s,v8.4s,v9.4s 3892 zip2 v1.4s,v8.4s,v9.4s 3893 zip1 v2.4s,v10.4s,v11.4s 3894 zip2 v3.4s,v10.4s,v11.4s 3895 zip1 v8.2d,v0.2d,v2.2d 3896 zip2 v9.2d,v0.2d,v2.2d 3897 zip1 v10.2d,v1.2d,v3.2d 3898 zip2 v11.2d,v1.2d,v3.2d 3899 bl _vpsm4_ex_enc_8blks 3900 zip1 v8.4s,v0.4s,v1.4s 3901 zip2 v9.4s,v0.4s,v1.4s 3902 zip1 v10.4s,v2.4s,v3.4s 3903 zip2 v11.4s,v2.4s,v3.4s 3904 zip1 v0.2d,v8.2d,v10.2d 3905 zip2 v1.2d,v8.2d,v10.2d 3906 zip1 v2.2d,v9.2d,v11.2d 3907 zip2 v3.2d,v9.2d,v11.2d 3908 zip1 v8.4s,v4.4s,v5.4s 3909 zip2 v9.4s,v4.4s,v5.4s 3910 zip1 v10.4s,v6.4s,v7.4s 3911 zip2 v11.4s,v6.4s,v7.4s 3912 zip1 v4.2d,v8.2d,v10.2d 3913 zip2 v5.2d,v8.2d,v10.2d 3914 zip1 v6.2d,v9.2d,v11.2d 3915 zip2 v7.2d,v9.2d,v11.2d 3916 eor v0.16b, v0.16b, v16.16b 3917 eor v1.16b, v1.16b, v17.16b 3918 eor v2.16b, v2.16b, v18.16b 3919 eor v3.16b, v3.16b, v19.16b 3920 eor v4.16b, v4.16b, v20.16b 3921 eor v5.16b, v5.16b, v21.16b 3922 eor v6.16b, v6.16b, v22.16b 3923 eor v7.16b, v7.16b, v23.16b 3924 3925 // save the last tweak 3926 mov v25.16b,v23.16b 3927 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 3928 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 3929 subs x2,x2,#8 3930 b.gt .Lxts_8_blocks_process 3931 b 100f 3932 .Lxts_4_blocks_process: 3933 cmp x2,#4 3934 b.lt 1f 3935 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 3936 eor v4.16b, v4.16b, v16.16b 3937 eor v5.16b, v5.16b, v17.16b 3938 eor v6.16b, v6.16b, v18.16b 3939 eor v7.16b, v7.16b, v19.16b 3940 #ifndef __AARCH64EB__ 3941 rev32 v4.16b,v4.16b 3942 #endif 3943 #ifndef __AARCH64EB__ 3944 rev32 v5.16b,v5.16b 3945 #endif 3946 #ifndef __AARCH64EB__ 3947 rev32 v6.16b,v6.16b 3948 #endif 3949 #ifndef __AARCH64EB__ 3950 rev32 
v7.16b,v7.16b 3951 #endif 3952 zip1 v0.4s,v4.4s,v5.4s 3953 zip2 v1.4s,v4.4s,v5.4s 3954 zip1 v2.4s,v6.4s,v7.4s 3955 zip2 v3.4s,v6.4s,v7.4s 3956 zip1 v4.2d,v0.2d,v2.2d 3957 zip2 v5.2d,v0.2d,v2.2d 3958 zip1 v6.2d,v1.2d,v3.2d 3959 zip2 v7.2d,v1.2d,v3.2d 3960 bl _vpsm4_ex_enc_4blks 3961 zip1 v4.4s,v0.4s,v1.4s 3962 zip2 v5.4s,v0.4s,v1.4s 3963 zip1 v6.4s,v2.4s,v3.4s 3964 zip2 v7.4s,v2.4s,v3.4s 3965 zip1 v0.2d,v4.2d,v6.2d 3966 zip2 v1.2d,v4.2d,v6.2d 3967 zip1 v2.2d,v5.2d,v7.2d 3968 zip2 v3.2d,v5.2d,v7.2d 3969 eor v0.16b, v0.16b, v16.16b 3970 eor v1.16b, v1.16b, v17.16b 3971 eor v2.16b, v2.16b, v18.16b 3972 eor v3.16b, v3.16b, v19.16b 3973 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 3974 sub x2,x2,#4 3975 mov v16.16b,v20.16b 3976 mov v17.16b,v21.16b 3977 mov v18.16b,v22.16b 3978 // save the last tweak 3979 mov v25.16b,v19.16b 3980 1: 3981 // process last block 3982 cmp x2,#1 3983 b.lt 100f 3984 b.gt 1f 3985 ld1 {v4.4s},[x0],#16 3986 eor v4.16b, v4.16b, v16.16b 3987 #ifndef __AARCH64EB__ 3988 rev32 v4.16b,v4.16b 3989 #endif 3990 mov x10,x3 3991 mov w11,#8 3992 mov w12,v4.s[0] 3993 mov w13,v4.s[1] 3994 mov w14,v4.s[2] 3995 mov w15,v4.s[3] 3996 10: 3997 ldp w7,w8,[x10],8 3998 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3999 eor w6,w14,w15 4000 eor w9,w7,w13 4001 eor w6,w6,w9 4002 mov v3.s[0],w6 4003 // optimize sbox using AESE instruction 4004 tbl v0.16b, {v3.16b}, v26.16b 4005 ushr v2.16b, v0.16b, 4 4006 and v0.16b, v0.16b, v31.16b 4007 tbl v0.16b, {v28.16b}, v0.16b 4008 tbl v2.16b, {v27.16b}, v2.16b 4009 eor v0.16b, v0.16b, v2.16b 4010 eor v1.16b, v1.16b, v1.16b 4011 aese v0.16b,v1.16b 4012 ushr v2.16b, v0.16b, 4 4013 and v0.16b, v0.16b, v31.16b 4014 tbl v0.16b, {v30.16b}, v0.16b 4015 tbl v2.16b, {v29.16b}, v2.16b 4016 eor v0.16b, v0.16b, v2.16b 4017 4018 mov w7,v0.s[0] 4019 eor w6,w7,w7,ror #32-2 4020 eor w6,w6,w7,ror #32-10 4021 eor w6,w6,w7,ror #32-18 4022 eor w6,w6,w7,ror #32-24 4023 eor w12,w12,w6 4024 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4025 eor w6,w14,w15 4026 eor w9,w12,w8 4027 eor 
w6,w6,w9 4028 mov v3.s[0],w6 4029 // optimize sbox using AESE instruction 4030 tbl v0.16b, {v3.16b}, v26.16b 4031 ushr v2.16b, v0.16b, 4 4032 and v0.16b, v0.16b, v31.16b 4033 tbl v0.16b, {v28.16b}, v0.16b 4034 tbl v2.16b, {v27.16b}, v2.16b 4035 eor v0.16b, v0.16b, v2.16b 4036 eor v1.16b, v1.16b, v1.16b 4037 aese v0.16b,v1.16b 4038 ushr v2.16b, v0.16b, 4 4039 and v0.16b, v0.16b, v31.16b 4040 tbl v0.16b, {v30.16b}, v0.16b 4041 tbl v2.16b, {v29.16b}, v2.16b 4042 eor v0.16b, v0.16b, v2.16b 4043 4044 mov w7,v0.s[0] 4045 eor w6,w7,w7,ror #32-2 4046 eor w6,w6,w7,ror #32-10 4047 eor w6,w6,w7,ror #32-18 4048 eor w6,w6,w7,ror #32-24 4049 ldp w7,w8,[x10],8 4050 eor w13,w13,w6 4051 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4052 eor w6,w12,w13 4053 eor w9,w7,w15 4054 eor w6,w6,w9 4055 mov v3.s[0],w6 4056 // optimize sbox using AESE instruction 4057 tbl v0.16b, {v3.16b}, v26.16b 4058 ushr v2.16b, v0.16b, 4 4059 and v0.16b, v0.16b, v31.16b 4060 tbl v0.16b, {v28.16b}, v0.16b 4061 tbl v2.16b, {v27.16b}, v2.16b 4062 eor v0.16b, v0.16b, v2.16b 4063 eor v1.16b, v1.16b, v1.16b 4064 aese v0.16b,v1.16b 4065 ushr v2.16b, v0.16b, 4 4066 and v0.16b, v0.16b, v31.16b 4067 tbl v0.16b, {v30.16b}, v0.16b 4068 tbl v2.16b, {v29.16b}, v2.16b 4069 eor v0.16b, v0.16b, v2.16b 4070 4071 mov w7,v0.s[0] 4072 eor w6,w7,w7,ror #32-2 4073 eor w6,w6,w7,ror #32-10 4074 eor w6,w6,w7,ror #32-18 4075 eor w6,w6,w7,ror #32-24 4076 eor w14,w14,w6 4077 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4078 eor w6,w12,w13 4079 eor w9,w14,w8 4080 eor w6,w6,w9 4081 mov v3.s[0],w6 4082 // optimize sbox using AESE instruction 4083 tbl v0.16b, {v3.16b}, v26.16b 4084 ushr v2.16b, v0.16b, 4 4085 and v0.16b, v0.16b, v31.16b 4086 tbl v0.16b, {v28.16b}, v0.16b 4087 tbl v2.16b, {v27.16b}, v2.16b 4088 eor v0.16b, v0.16b, v2.16b 4089 eor v1.16b, v1.16b, v1.16b 4090 aese v0.16b,v1.16b 4091 ushr v2.16b, v0.16b, 4 4092 and v0.16b, v0.16b, v31.16b 4093 tbl v0.16b, {v30.16b}, v0.16b 4094 tbl v2.16b, {v29.16b}, v2.16b 4095 eor v0.16b, v0.16b, v2.16b 4096 4097 
mov w7,v0.s[0] 4098 eor w6,w7,w7,ror #32-2 4099 eor w6,w6,w7,ror #32-10 4100 eor w6,w6,w7,ror #32-18 4101 eor w6,w6,w7,ror #32-24 4102 eor w15,w15,w6 4103 subs w11,w11,#1 4104 b.ne 10b 4105 mov v4.s[0],w15 4106 mov v4.s[1],w14 4107 mov v4.s[2],w13 4108 mov v4.s[3],w12 4109 #ifndef __AARCH64EB__ 4110 rev32 v4.16b,v4.16b 4111 #endif 4112 eor v4.16b, v4.16b, v16.16b 4113 st1 {v4.4s},[x1],#16 4114 // save the last tweak 4115 mov v25.16b,v16.16b 4116 b 100f 4117 1: // process last 2 blocks 4118 cmp x2,#2 4119 b.gt 1f 4120 ld1 {v4.4s,v5.4s},[x0],#32 4121 eor v4.16b, v4.16b, v16.16b 4122 eor v5.16b, v5.16b, v17.16b 4123 #ifndef __AARCH64EB__ 4124 rev32 v4.16b,v4.16b 4125 #endif 4126 #ifndef __AARCH64EB__ 4127 rev32 v5.16b,v5.16b 4128 #endif 4129 zip1 v0.4s,v4.4s,v5.4s 4130 zip2 v1.4s,v4.4s,v5.4s 4131 zip1 v2.4s,v6.4s,v7.4s 4132 zip2 v3.4s,v6.4s,v7.4s 4133 zip1 v4.2d,v0.2d,v2.2d 4134 zip2 v5.2d,v0.2d,v2.2d 4135 zip1 v6.2d,v1.2d,v3.2d 4136 zip2 v7.2d,v1.2d,v3.2d 4137 bl _vpsm4_ex_enc_4blks 4138 zip1 v4.4s,v0.4s,v1.4s 4139 zip2 v5.4s,v0.4s,v1.4s 4140 zip1 v6.4s,v2.4s,v3.4s 4141 zip2 v7.4s,v2.4s,v3.4s 4142 zip1 v0.2d,v4.2d,v6.2d 4143 zip2 v1.2d,v4.2d,v6.2d 4144 zip1 v2.2d,v5.2d,v7.2d 4145 zip2 v3.2d,v5.2d,v7.2d 4146 eor v0.16b, v0.16b, v16.16b 4147 eor v1.16b, v1.16b, v17.16b 4148 st1 {v0.4s,v1.4s},[x1],#32 4149 // save the last tweak 4150 mov v25.16b,v17.16b 4151 b 100f 4152 1: // process last 3 blocks 4153 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 4154 eor v4.16b, v4.16b, v16.16b 4155 eor v5.16b, v5.16b, v17.16b 4156 eor v6.16b, v6.16b, v18.16b 4157 #ifndef __AARCH64EB__ 4158 rev32 v4.16b,v4.16b 4159 #endif 4160 #ifndef __AARCH64EB__ 4161 rev32 v5.16b,v5.16b 4162 #endif 4163 #ifndef __AARCH64EB__ 4164 rev32 v6.16b,v6.16b 4165 #endif 4166 zip1 v0.4s,v4.4s,v5.4s 4167 zip2 v1.4s,v4.4s,v5.4s 4168 zip1 v2.4s,v6.4s,v7.4s 4169 zip2 v3.4s,v6.4s,v7.4s 4170 zip1 v4.2d,v0.2d,v2.2d 4171 zip2 v5.2d,v0.2d,v2.2d 4172 zip1 v6.2d,v1.2d,v3.2d 4173 zip2 v7.2d,v1.2d,v3.2d 4174 bl _vpsm4_ex_enc_4blks 
4175 zip1 v4.4s,v0.4s,v1.4s 4176 zip2 v5.4s,v0.4s,v1.4s 4177 zip1 v6.4s,v2.4s,v3.4s 4178 zip2 v7.4s,v2.4s,v3.4s 4179 zip1 v0.2d,v4.2d,v6.2d 4180 zip2 v1.2d,v4.2d,v6.2d 4181 zip1 v2.2d,v5.2d,v7.2d 4182 zip2 v3.2d,v5.2d,v7.2d 4183 eor v0.16b, v0.16b, v16.16b 4184 eor v1.16b, v1.16b, v17.16b 4185 eor v2.16b, v2.16b, v18.16b 4186 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 4187 // save the last tweak 4188 mov v25.16b,v18.16b 4189 100: 4190 cmp x29,0 4191 b.eq .return 4192 4193 // This branch calculates the last two tweaks, 4194 // while the encryption/decryption length is larger than 32 4195 .last_2blks_tweak: 4196 #ifdef __AARCH64EB__ 4197 rev32 v25.16b,v25.16b 4198 #endif 4199 mov v2.16b,v25.16b 4200 adrp x9, .Lxts_magic 4201 ldr q0, [x9, #:lo12:.Lxts_magic] 4202 shl v17.16b, v2.16b, #1 4203 ext v1.16b, v2.16b, v2.16b,#15 4204 ushr v1.16b, v1.16b, #7 4205 mul v1.16b, v1.16b, v0.16b 4206 eor v17.16b, v17.16b, v1.16b 4207 mov v2.16b,v17.16b 4208 adrp x9, .Lxts_magic 4209 ldr q0, [x9, #:lo12:.Lxts_magic] 4210 shl v18.16b, v2.16b, #1 4211 ext v1.16b, v2.16b, v2.16b,#15 4212 ushr v1.16b, v1.16b, #7 4213 mul v1.16b, v1.16b, v0.16b 4214 eor v18.16b, v18.16b, v1.16b 4215 b .check_dec 4216 4217 4218 // This branch calculates the last two tweaks, 4219 // while the encryption/decryption length is equal to 32, who only need two tweaks 4220 .only_2blks_tweak: 4221 mov v17.16b,v16.16b 4222 #ifdef __AARCH64EB__ 4223 rev32 v17.16b,v17.16b 4224 #endif 4225 mov v2.16b,v17.16b 4226 adrp x9, .Lxts_magic 4227 ldr q0, [x9, #:lo12:.Lxts_magic] 4228 shl v18.16b, v2.16b, #1 4229 ext v1.16b, v2.16b, v2.16b,#15 4230 ushr v1.16b, v1.16b, #7 4231 mul v1.16b, v1.16b, v0.16b 4232 eor v18.16b, v18.16b, v1.16b 4233 b .check_dec 4234 4235 4236 // Determine whether encryption or decryption is required. 4237 // The last two tweaks need to be swapped for decryption. 
4238 .check_dec: 4239 // encryption:1 decryption:0 4240 cmp w28,1 4241 b.eq .process_last_2blks 4242 mov v0.16B,v17.16b 4243 mov v17.16B,v18.16b 4244 mov v18.16B,v0.16b 4245 4246 .process_last_2blks: 4247 #ifdef __AARCH64EB__ 4248 rev32 v17.16b,v17.16b 4249 #endif 4250 #ifdef __AARCH64EB__ 4251 rev32 v18.16b,v18.16b 4252 #endif 4253 ld1 {v4.4s},[x0],#16 4254 eor v4.16b, v4.16b, v17.16b 4255 #ifndef __AARCH64EB__ 4256 rev32 v4.16b,v4.16b 4257 #endif 4258 mov x10,x3 4259 mov w11,#8 4260 mov w12,v4.s[0] 4261 mov w13,v4.s[1] 4262 mov w14,v4.s[2] 4263 mov w15,v4.s[3] 4264 10: 4265 ldp w7,w8,[x10],8 4266 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 4267 eor w6,w14,w15 4268 eor w9,w7,w13 4269 eor w6,w6,w9 4270 mov v3.s[0],w6 4271 // optimize sbox using AESE instruction 4272 tbl v0.16b, {v3.16b}, v26.16b 4273 ushr v2.16b, v0.16b, 4 4274 and v0.16b, v0.16b, v31.16b 4275 tbl v0.16b, {v28.16b}, v0.16b 4276 tbl v2.16b, {v27.16b}, v2.16b 4277 eor v0.16b, v0.16b, v2.16b 4278 eor v1.16b, v1.16b, v1.16b 4279 aese v0.16b,v1.16b 4280 ushr v2.16b, v0.16b, 4 4281 and v0.16b, v0.16b, v31.16b 4282 tbl v0.16b, {v30.16b}, v0.16b 4283 tbl v2.16b, {v29.16b}, v2.16b 4284 eor v0.16b, v0.16b, v2.16b 4285 4286 mov w7,v0.s[0] 4287 eor w6,w7,w7,ror #32-2 4288 eor w6,w6,w7,ror #32-10 4289 eor w6,w6,w7,ror #32-18 4290 eor w6,w6,w7,ror #32-24 4291 eor w12,w12,w6 4292 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4293 eor w6,w14,w15 4294 eor w9,w12,w8 4295 eor w6,w6,w9 4296 mov v3.s[0],w6 4297 // optimize sbox using AESE instruction 4298 tbl v0.16b, {v3.16b}, v26.16b 4299 ushr v2.16b, v0.16b, 4 4300 and v0.16b, v0.16b, v31.16b 4301 tbl v0.16b, {v28.16b}, v0.16b 4302 tbl v2.16b, {v27.16b}, v2.16b 4303 eor v0.16b, v0.16b, v2.16b 4304 eor v1.16b, v1.16b, v1.16b 4305 aese v0.16b,v1.16b 4306 ushr v2.16b, v0.16b, 4 4307 and v0.16b, v0.16b, v31.16b 4308 tbl v0.16b, {v30.16b}, v0.16b 4309 tbl v2.16b, {v29.16b}, v2.16b 4310 eor v0.16b, v0.16b, v2.16b 4311 4312 mov w7,v0.s[0] 4313 eor w6,w7,w7,ror #32-2 4314 eor w6,w6,w7,ror #32-10 
4315 eor w6,w6,w7,ror #32-18 4316 eor w6,w6,w7,ror #32-24 4317 ldp w7,w8,[x10],8 4318 eor w13,w13,w6 4319 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4320 eor w6,w12,w13 4321 eor w9,w7,w15 4322 eor w6,w6,w9 4323 mov v3.s[0],w6 4324 // optimize sbox using AESE instruction 4325 tbl v0.16b, {v3.16b}, v26.16b 4326 ushr v2.16b, v0.16b, 4 4327 and v0.16b, v0.16b, v31.16b 4328 tbl v0.16b, {v28.16b}, v0.16b 4329 tbl v2.16b, {v27.16b}, v2.16b 4330 eor v0.16b, v0.16b, v2.16b 4331 eor v1.16b, v1.16b, v1.16b 4332 aese v0.16b,v1.16b 4333 ushr v2.16b, v0.16b, 4 4334 and v0.16b, v0.16b, v31.16b 4335 tbl v0.16b, {v30.16b}, v0.16b 4336 tbl v2.16b, {v29.16b}, v2.16b 4337 eor v0.16b, v0.16b, v2.16b 4338 4339 mov w7,v0.s[0] 4340 eor w6,w7,w7,ror #32-2 4341 eor w6,w6,w7,ror #32-10 4342 eor w6,w6,w7,ror #32-18 4343 eor w6,w6,w7,ror #32-24 4344 eor w14,w14,w6 4345 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4346 eor w6,w12,w13 4347 eor w9,w14,w8 4348 eor w6,w6,w9 4349 mov v3.s[0],w6 4350 // optimize sbox using AESE instruction 4351 tbl v0.16b, {v3.16b}, v26.16b 4352 ushr v2.16b, v0.16b, 4 4353 and v0.16b, v0.16b, v31.16b 4354 tbl v0.16b, {v28.16b}, v0.16b 4355 tbl v2.16b, {v27.16b}, v2.16b 4356 eor v0.16b, v0.16b, v2.16b 4357 eor v1.16b, v1.16b, v1.16b 4358 aese v0.16b,v1.16b 4359 ushr v2.16b, v0.16b, 4 4360 and v0.16b, v0.16b, v31.16b 4361 tbl v0.16b, {v30.16b}, v0.16b 4362 tbl v2.16b, {v29.16b}, v2.16b 4363 eor v0.16b, v0.16b, v2.16b 4364 4365 mov w7,v0.s[0] 4366 eor w6,w7,w7,ror #32-2 4367 eor w6,w6,w7,ror #32-10 4368 eor w6,w6,w7,ror #32-18 4369 eor w6,w6,w7,ror #32-24 4370 eor w15,w15,w6 4371 subs w11,w11,#1 4372 b.ne 10b 4373 mov v4.s[0],w15 4374 mov v4.s[1],w14 4375 mov v4.s[2],w13 4376 mov v4.s[3],w12 4377 #ifndef __AARCH64EB__ 4378 rev32 v4.16b,v4.16b 4379 #endif 4380 eor v4.16b, v4.16b, v17.16b 4381 st1 {v4.4s},[x1],#16 4382 4383 sub x26,x1,16 4384 .loop: 4385 subs x29,x29,1 4386 ldrb w7,[x26,x29] 4387 ldrb w8,[x0,x29] 4388 strb w8,[x26,x29] 4389 strb w7,[x1,x29] 4390 b.gt .loop 4391 ld1 {v4.4s}, 
[x26] 4392 eor v4.16b, v4.16b, v18.16b 4393 #ifndef __AARCH64EB__ 4394 rev32 v4.16b,v4.16b 4395 #endif 4396 mov x10,x3 4397 mov w11,#8 4398 mov w12,v4.s[0] 4399 mov w13,v4.s[1] 4400 mov w14,v4.s[2] 4401 mov w15,v4.s[3] 4402 10: 4403 ldp w7,w8,[x10],8 4404 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 4405 eor w6,w14,w15 4406 eor w9,w7,w13 4407 eor w6,w6,w9 4408 mov v3.s[0],w6 4409 // optimize sbox using AESE instruction 4410 tbl v0.16b, {v3.16b}, v26.16b 4411 ushr v2.16b, v0.16b, 4 4412 and v0.16b, v0.16b, v31.16b 4413 tbl v0.16b, {v28.16b}, v0.16b 4414 tbl v2.16b, {v27.16b}, v2.16b 4415 eor v0.16b, v0.16b, v2.16b 4416 eor v1.16b, v1.16b, v1.16b 4417 aese v0.16b,v1.16b 4418 ushr v2.16b, v0.16b, 4 4419 and v0.16b, v0.16b, v31.16b 4420 tbl v0.16b, {v30.16b}, v0.16b 4421 tbl v2.16b, {v29.16b}, v2.16b 4422 eor v0.16b, v0.16b, v2.16b 4423 4424 mov w7,v0.s[0] 4425 eor w6,w7,w7,ror #32-2 4426 eor w6,w6,w7,ror #32-10 4427 eor w6,w6,w7,ror #32-18 4428 eor w6,w6,w7,ror #32-24 4429 eor w12,w12,w6 4430 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4431 eor w6,w14,w15 4432 eor w9,w12,w8 4433 eor w6,w6,w9 4434 mov v3.s[0],w6 4435 // optimize sbox using AESE instruction 4436 tbl v0.16b, {v3.16b}, v26.16b 4437 ushr v2.16b, v0.16b, 4 4438 and v0.16b, v0.16b, v31.16b 4439 tbl v0.16b, {v28.16b}, v0.16b 4440 tbl v2.16b, {v27.16b}, v2.16b 4441 eor v0.16b, v0.16b, v2.16b 4442 eor v1.16b, v1.16b, v1.16b 4443 aese v0.16b,v1.16b 4444 ushr v2.16b, v0.16b, 4 4445 and v0.16b, v0.16b, v31.16b 4446 tbl v0.16b, {v30.16b}, v0.16b 4447 tbl v2.16b, {v29.16b}, v2.16b 4448 eor v0.16b, v0.16b, v2.16b 4449 4450 mov w7,v0.s[0] 4451 eor w6,w7,w7,ror #32-2 4452 eor w6,w6,w7,ror #32-10 4453 eor w6,w6,w7,ror #32-18 4454 eor w6,w6,w7,ror #32-24 4455 ldp w7,w8,[x10],8 4456 eor w13,w13,w6 4457 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4458 eor w6,w12,w13 4459 eor w9,w7,w15 4460 eor w6,w6,w9 4461 mov v3.s[0],w6 4462 // optimize sbox using AESE instruction 4463 tbl v0.16b, {v3.16b}, v26.16b 4464 ushr v2.16b, v0.16b, 4 4465 and v0.16b, v0.16b, 
v31.16b 4466 tbl v0.16b, {v28.16b}, v0.16b 4467 tbl v2.16b, {v27.16b}, v2.16b 4468 eor v0.16b, v0.16b, v2.16b 4469 eor v1.16b, v1.16b, v1.16b 4470 aese v0.16b,v1.16b 4471 ushr v2.16b, v0.16b, 4 4472 and v0.16b, v0.16b, v31.16b 4473 tbl v0.16b, {v30.16b}, v0.16b 4474 tbl v2.16b, {v29.16b}, v2.16b 4475 eor v0.16b, v0.16b, v2.16b 4476 4477 mov w7,v0.s[0] 4478 eor w6,w7,w7,ror #32-2 4479 eor w6,w6,w7,ror #32-10 4480 eor w6,w6,w7,ror #32-18 4481 eor w6,w6,w7,ror #32-24 4482 eor w14,w14,w6 4483 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4484 eor w6,w12,w13 4485 eor w9,w14,w8 4486 eor w6,w6,w9 4487 mov v3.s[0],w6 4488 // optimize sbox using AESE instruction 4489 tbl v0.16b, {v3.16b}, v26.16b 4490 ushr v2.16b, v0.16b, 4 4491 and v0.16b, v0.16b, v31.16b 4492 tbl v0.16b, {v28.16b}, v0.16b 4493 tbl v2.16b, {v27.16b}, v2.16b 4494 eor v0.16b, v0.16b, v2.16b 4495 eor v1.16b, v1.16b, v1.16b 4496 aese v0.16b,v1.16b 4497 ushr v2.16b, v0.16b, 4 4498 and v0.16b, v0.16b, v31.16b 4499 tbl v0.16b, {v30.16b}, v0.16b 4500 tbl v2.16b, {v29.16b}, v2.16b 4501 eor v0.16b, v0.16b, v2.16b 4502 4503 mov w7,v0.s[0] 4504 eor w6,w7,w7,ror #32-2 4505 eor w6,w6,w7,ror #32-10 4506 eor w6,w6,w7,ror #32-18 4507 eor w6,w6,w7,ror #32-24 4508 eor w15,w15,w6 4509 subs w11,w11,#1 4510 b.ne 10b 4511 mov v4.s[0],w15 4512 mov v4.s[1],w14 4513 mov v4.s[2],w13 4514 mov v4.s[3],w12 4515 #ifndef __AARCH64EB__ 4516 rev32 v4.16b,v4.16b 4517 #endif 4518 eor v4.16b, v4.16b, v18.16b 4519 st1 {v4.4s}, [x26] 4520 .return: 4521 ldp d14, d15, [sp], #0x10 4522 ldp d12, d13, [sp], #0x10 4523 ldp d10, d11, [sp], #0x10 4524 ldp d8, d9, [sp], #0x10 4525 ldp x29, x30, [sp], #0x10 4526 ldp x27, x28, [sp], #0x10 4527 ldp x25, x26, [sp], #0x10 4528 ldp x23, x24, [sp], #0x10 4529 ldp x21, x22, [sp], #0x10 4530 ldp x19, x20, [sp], #0x10 4531 ldp x17, x18, [sp], #0x10 4532 ldp x15, x16, [sp], #0x10 4533 AARCH64_VALIDATE_LINK_REGISTER 4534 ret 4535 .size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt 4536