1 // Copyright 2022-2026 The OpenSSL Project Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License 2.0 (the "License"). You may not use 4 // this file except in compliance with the License. You can obtain a copy 5 // in the file LICENSE in the source distribution or at 6 // https://www.openssl.org/source/license.html 7 8 // 9 // This module implements SM4 with ASIMD and AESE on AARCH64 10 // 11 // Dec 2022 12 // 13 14 // $output is the last argument if it looks like a file (it has an extension) 15 // $flavour is the first argument if it doesn't look like a file 16 #include "arm_arch.h" 17 .arch armv8-a+crypto 18 .text 19 20 .type _vpsm4_ex_consts,%object 21 .align 7 22 _vpsm4_ex_consts: 23 .Lck: 24 .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 25 .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 26 .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 27 .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 28 .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 29 .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 30 .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 31 .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 32 .Lfk: 33 .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197 34 .Lshuffles: 35 .quad 0x0B0A090807060504,0x030201000F0E0D0C 36 .Lxts_magic: 37 #ifndef __AARCH64EB__ 38 .quad 0x0101010101010187,0x0101010101010101 39 #else 40 .quad 0x0101010101010101,0x0101010101010187 41 #endif 42 .Lsbox_magic: 43 #ifndef __AARCH64EB__ 44 .quad 0x0b0e0104070a0d00,0x0306090c0f020508 45 .quad 0x62185a2042387a00,0x22581a6002783a40 46 .quad 0x15df62a89e54e923,0xc10bb67c4a803df7 47 .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead 48 .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc 49 #else 50 .quad 0x0306090c0f020508,0x0b0e0104070a0d00 51 .quad 0x22581a6002783a40,0x62185a2042387a00 52 .quad 0xc10bb67c4a803df7,0x15df62a89e54e923 53 .quad 0x1407c6d56c7fbead,0xb9aa6b78c1d21300 54 .quad 0xe383c1a1fe9edcbc,0x6404462679195b3b 55 #endif 56 .quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f 57 58 .size _vpsm4_ex_consts,.-_vpsm4_ex_consts 59 .type _vpsm4_ex_set_key,%function 60 .align 4 61 _vpsm4_ex_set_key: 62 AARCH64_VALID_CALL_TARGET 63 ld1 {v5.4s},[x0] 64 adrp x9, .Lsbox_magic 65 ldr q26, [x9, #:lo12:.Lsbox_magic] 66 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 67 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 68 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 69 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 70 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 71 #ifndef __AARCH64EB__ 72 rev32 v5.16b,v5.16b 73 #endif 74 adrp x5,.Lshuffles 75 add x5,x5,#:lo12:.Lshuffles 76 ld1 {v7.2d},[x5] 77 adrp x5,.Lfk 78 add x5,x5,#:lo12:.Lfk 79 ld1 {v6.2d},[x5] 80 eor v5.16b,v5.16b,v6.16b 81 mov x6,#32 82 adrp x5,.Lck 83 add x5,x5,#:lo12:.Lck 84 movi v0.16b,#64 85 cbnz w2,1f 86 add x1,x1,124 87 1: 88 mov w7,v5.s[1] 89 ldr w8,[x5],#4 90 eor w8,w8,w7 91 mov w7,v5.s[2] 92 eor w8,w8,w7 93 mov w7,v5.s[3] 94 eor w8,w8,w7 95 // optimize sbox using AESE instruction 96 mov v4.s[0],w8 97 tbl v0.16b, {v4.16b}, v26.16b 98 ushr v2.16b, v0.16b, 4 99 and v0.16b, v0.16b, v31.16b 100 tbl v0.16b, {v28.16b}, v0.16b 101 tbl v2.16b, {v27.16b}, v2.16b 102 eor v0.16b, v0.16b, v2.16b 103 eor v1.16b, v1.16b, v1.16b 104 aese v0.16b,v1.16b 105 ushr v2.16b, v0.16b, 4 106 and v0.16b, v0.16b, v31.16b 107 tbl v0.16b, {v30.16b}, v0.16b 108 tbl v2.16b, {v29.16b}, v2.16b 109 eor v0.16b, v0.16b, v2.16b 110 mov w7,v0.s[0] 111 eor w8,w7,w7,ror #19 112 eor w8,w8,w7,ror #9 113 mov w7,v5.s[0] 114 eor w8,w8,w7 115 mov v5.s[0],w8 116 cbz w2,2f 117 str w8,[x1],#4 118 b 3f 119 2: 120 str w8,[x1],#-4 121 3: 122 tbl v5.16b,{v5.16b},v7.16b 123 subs x6,x6,#1 124 b.ne 1b 125 ret 126 .size _vpsm4_ex_set_key,.-_vpsm4_ex_set_key 127 .type _vpsm4_ex_enc_4blks,%function 128 .align 4 129 _vpsm4_ex_enc_4blks: 130 AARCH64_VALID_CALL_TARGET 131 mov x10,x3 132 mov w11,#8 133 10: 134 ldp w7,w8,[x10],8 135 dup v12.4s,w7 136 dup v13.4s,w8 137 138 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 139 eor v14.16b,v6.16b,v7.16b 140 eor v12.16b,v5.16b,v12.16b 141 eor v12.16b,v14.16b,v12.16b 142 // optimize sbox using AESE instruction 143 tbl v0.16b, {v12.16b}, v26.16b 144 ushr v24.16b, v0.16b, 4 145 and v0.16b, v0.16b, v31.16b 146 tbl v0.16b, {v28.16b}, v0.16b 147 tbl v24.16b, {v27.16b}, v24.16b 148 eor v0.16b, v0.16b, v24.16b 149 eor v1.16b, v1.16b, v1.16b 150 aese v0.16b,v1.16b 151 ushr v24.16b, v0.16b, 4 152 and v0.16b, v0.16b, v31.16b 153 tbl v0.16b, {v30.16b}, v0.16b 154 tbl v24.16b, {v29.16b}, v24.16b 155 eor v0.16b, v0.16b, v24.16b 156 mov v12.16b,v0.16b 157 158 // linear transformation 159 ushr v0.4s,v12.4s,32-2 160 ushr v1.4s,v12.4s,32-10 161 ushr v2.4s,v12.4s,32-18 162 ushr v3.4s,v12.4s,32-24 163 sli v0.4s,v12.4s,2 164 sli v1.4s,v12.4s,10 165 sli v2.4s,v12.4s,18 166 sli v3.4s,v12.4s,24 167 eor v24.16b,v0.16b,v12.16b 168 eor v24.16b,v24.16b,v1.16b 169 eor v12.16b,v2.16b,v3.16b 170 eor v12.16b,v12.16b,v24.16b 171 eor v4.16b,v4.16b,v12.16b 172 173 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 174 eor v14.16b,v14.16b,v4.16b 175 eor v13.16b,v14.16b,v13.16b 176 // optimize sbox using AESE instruction 177 tbl v0.16b, {v13.16b}, v26.16b 178 ushr v24.16b, v0.16b, 4 179 and v0.16b, v0.16b, v31.16b 180 tbl v0.16b, {v28.16b}, v0.16b 181 tbl v24.16b, {v27.16b}, v24.16b 182 eor v0.16b, v0.16b, v24.16b 183 eor v1.16b, v1.16b, v1.16b 184 aese v0.16b,v1.16b 185 ushr v24.16b, v0.16b, 4 186 and v0.16b, v0.16b, v31.16b 187 tbl v0.16b, {v30.16b}, v0.16b 188 tbl v24.16b, {v29.16b}, v24.16b 189 eor v0.16b, v0.16b, v24.16b 190 mov v13.16b,v0.16b 191 192 // linear transformation 193 ushr v0.4s,v13.4s,32-2 194 ushr v1.4s,v13.4s,32-10 195 ushr v2.4s,v13.4s,32-18 196 ushr v3.4s,v13.4s,32-24 197 sli v0.4s,v13.4s,2 198 sli v1.4s,v13.4s,10 199 sli v2.4s,v13.4s,18 200 sli v3.4s,v13.4s,24 201 eor v24.16b,v0.16b,v13.16b 202 eor v24.16b,v24.16b,v1.16b 203 eor v13.16b,v2.16b,v3.16b 204 eor v13.16b,v13.16b,v24.16b 205 ldp w7,w8,[x10],8 206 eor v5.16b,v5.16b,v13.16b 207 208 dup v12.4s,w7 209 dup v13.4s,w8 210 211 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 212 eor v14.16b,v4.16b,v5.16b 213 eor v12.16b,v7.16b,v12.16b 214 eor v12.16b,v14.16b,v12.16b 215 // optimize sbox using AESE instruction 216 tbl v0.16b, {v12.16b}, v26.16b 217 ushr v24.16b, v0.16b, 4 218 and v0.16b, v0.16b, v31.16b 219 tbl v0.16b, {v28.16b}, v0.16b 220 tbl v24.16b, {v27.16b}, v24.16b 221 eor v0.16b, v0.16b, v24.16b 222 eor v1.16b, v1.16b, v1.16b 223 aese v0.16b,v1.16b 224 ushr v24.16b, v0.16b, 4 225 and v0.16b, v0.16b, v31.16b 226 tbl v0.16b, {v30.16b}, v0.16b 227 tbl v24.16b, {v29.16b}, v24.16b 228 eor v0.16b, v0.16b, v24.16b 229 mov v12.16b,v0.16b 230 231 // linear transformation 232 ushr v0.4s,v12.4s,32-2 233 ushr v1.4s,v12.4s,32-10 234 ushr v2.4s,v12.4s,32-18 235 ushr v3.4s,v12.4s,32-24 236 sli v0.4s,v12.4s,2 237 sli v1.4s,v12.4s,10 238 sli v2.4s,v12.4s,18 239 sli v3.4s,v12.4s,24 240 eor v24.16b,v0.16b,v12.16b 241 eor v24.16b,v24.16b,v1.16b 242 eor v12.16b,v2.16b,v3.16b 243 eor v12.16b,v12.16b,v24.16b 244 eor v6.16b,v6.16b,v12.16b 245 246 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 247 eor v14.16b,v14.16b,v6.16b 248 eor v13.16b,v14.16b,v13.16b 249 // optimize sbox using AESE instruction 250 tbl v0.16b, {v13.16b}, v26.16b 251 ushr v24.16b, v0.16b, 4 252 and v0.16b, v0.16b, v31.16b 253 tbl v0.16b, {v28.16b}, v0.16b 254 tbl v24.16b, {v27.16b}, v24.16b 255 eor v0.16b, v0.16b, v24.16b 256 eor v1.16b, v1.16b, v1.16b 257 aese v0.16b,v1.16b 258 ushr v24.16b, v0.16b, 4 259 and v0.16b, v0.16b, v31.16b 260 tbl v0.16b, {v30.16b}, v0.16b 261 tbl v24.16b, {v29.16b}, v24.16b 262 eor v0.16b, v0.16b, v24.16b 263 mov v13.16b,v0.16b 264 265 // linear transformation 266 ushr v0.4s,v13.4s,32-2 267 ushr v1.4s,v13.4s,32-10 268 ushr v2.4s,v13.4s,32-18 269 ushr v3.4s,v13.4s,32-24 270 sli v0.4s,v13.4s,2 271 sli v1.4s,v13.4s,10 272 sli v2.4s,v13.4s,18 273 sli v3.4s,v13.4s,24 274 eor v24.16b,v0.16b,v13.16b 275 eor v24.16b,v24.16b,v1.16b 276 eor v13.16b,v2.16b,v3.16b 277 eor v13.16b,v13.16b,v24.16b 278 eor v7.16b,v7.16b,v13.16b 279 subs w11,w11,#1 280 b.ne 10b 281 #ifndef __AARCH64EB__ 282 rev32 v3.16b,v4.16b 283 #else 284 mov v3.16b,v4.16b 285 #endif 286 #ifndef __AARCH64EB__ 287 rev32 v2.16b,v5.16b 288 #else 289 mov v2.16b,v5.16b 290 #endif 291 #ifndef __AARCH64EB__ 292 rev32 v1.16b,v6.16b 293 #else 294 mov v1.16b,v6.16b 295 #endif 296 #ifndef __AARCH64EB__ 297 rev32 v0.16b,v7.16b 298 #else 299 mov v0.16b,v7.16b 300 #endif 301 ret 302 .size _vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks 303 .type _vpsm4_ex_enc_8blks,%function 304 .align 4 305 _vpsm4_ex_enc_8blks: 306 AARCH64_VALID_CALL_TARGET 307 mov x10,x3 308 mov w11,#8 309 10: 310 ldp w7,w8,[x10],8 311 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 312 dup v12.4s,w7 313 eor v14.16b,v6.16b,v7.16b 314 eor v15.16b,v10.16b,v11.16b 315 eor v0.16b,v5.16b,v12.16b 316 eor v1.16b,v9.16b,v12.16b 317 eor v12.16b,v14.16b,v0.16b 318 eor v13.16b,v15.16b,v1.16b 319 // optimize sbox using AESE instruction 320 tbl v0.16b, {v12.16b}, v26.16b 321 tbl v1.16b, {v13.16b}, v26.16b 322 ushr v24.16b, v0.16b, 4 323 and v0.16b, v0.16b, v31.16b 324 tbl v0.16b, {v28.16b}, v0.16b 325 tbl v24.16b, {v27.16b}, v24.16b 326 eor v0.16b, v0.16b, v24.16b 327 ushr v24.16b, v1.16b, 4 328 and v1.16b, v1.16b, v31.16b 329 tbl v1.16b, {v28.16b}, v1.16b 330 tbl v24.16b, {v27.16b}, v24.16b 331 eor v1.16b, v1.16b, v24.16b 332 eor v25.16b, v25.16b, v25.16b 333 aese v0.16b,v25.16b 334 aese v1.16b,v25.16b 335 ushr v24.16b, v0.16b, 4 336 and v0.16b, v0.16b, v31.16b 337 tbl v0.16b, {v30.16b}, v0.16b 338 tbl v24.16b, {v29.16b}, v24.16b 339 eor v0.16b, v0.16b, v24.16b 340 ushr v24.16b, v1.16b, 4 341 and v1.16b, v1.16b, v31.16b 342 tbl v1.16b, {v30.16b}, v1.16b 343 tbl v24.16b, {v29.16b}, v24.16b 344 eor v1.16b, v1.16b, v24.16b 345 mov v12.16b,v0.16b 346 mov v13.16b,v1.16b 347 348 // linear transformation 349 ushr v0.4s,v12.4s,32-2 350 ushr v25.4s,v13.4s,32-2 351 ushr v1.4s,v12.4s,32-10 352 ushr v2.4s,v12.4s,32-18 353 ushr v3.4s,v12.4s,32-24 354 sli v0.4s,v12.4s,2 355 sli v25.4s,v13.4s,2 356 sli v1.4s,v12.4s,10 357 sli v2.4s,v12.4s,18 358 sli v3.4s,v12.4s,24 359 eor v24.16b,v0.16b,v12.16b 360 eor v24.16b,v24.16b,v1.16b 361 eor v12.16b,v2.16b,v3.16b 362 eor v12.16b,v12.16b,v24.16b 363 ushr v1.4s,v13.4s,32-10 364 ushr v2.4s,v13.4s,32-18 365 ushr v3.4s,v13.4s,32-24 366 sli v1.4s,v13.4s,10 367 sli v2.4s,v13.4s,18 368 sli v3.4s,v13.4s,24 369 eor v24.16b,v25.16b,v13.16b 370 eor v24.16b,v24.16b,v1.16b 371 eor v13.16b,v2.16b,v3.16b 372 eor v13.16b,v13.16b,v24.16b 373 eor v4.16b,v4.16b,v12.16b 374 eor v8.16b,v8.16b,v13.16b 375 376 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 377 dup v13.4s,w8 378 eor v14.16b,v14.16b,v4.16b 379 eor v15.16b,v15.16b,v8.16b 380 eor v12.16b,v14.16b,v13.16b 381 eor v13.16b,v15.16b,v13.16b 382 // optimize sbox using AESE instruction 383 tbl v0.16b, {v12.16b}, v26.16b 384 tbl v1.16b, {v13.16b}, v26.16b 385 ushr v24.16b, v0.16b, 4 386 and v0.16b, v0.16b, v31.16b 387 tbl v0.16b, {v28.16b}, v0.16b 388 tbl v24.16b, {v27.16b}, v24.16b 389 eor v0.16b, v0.16b, v24.16b 390 ushr v24.16b, v1.16b, 4 391 and v1.16b, v1.16b, v31.16b 392 tbl v1.16b, {v28.16b}, v1.16b 393 tbl v24.16b, {v27.16b}, v24.16b 394 eor v1.16b, v1.16b, v24.16b 395 eor v25.16b, v25.16b, v25.16b 396 aese v0.16b,v25.16b 397 aese v1.16b,v25.16b 398 ushr v24.16b, v0.16b, 4 399 and v0.16b, v0.16b, v31.16b 400 tbl v0.16b, {v30.16b}, v0.16b 401 tbl v24.16b, {v29.16b}, v24.16b 402 eor v0.16b, v0.16b, v24.16b 403 ushr v24.16b, v1.16b, 4 404 and v1.16b, v1.16b, v31.16b 405 tbl v1.16b, {v30.16b}, v1.16b 406 tbl v24.16b, {v29.16b}, v24.16b 407 eor v1.16b, v1.16b, v24.16b 408 mov v12.16b,v0.16b 409 mov v13.16b,v1.16b 410 411 // linear transformation 412 ushr v0.4s,v12.4s,32-2 413 ushr v25.4s,v13.4s,32-2 414 ushr v1.4s,v12.4s,32-10 415 ushr v2.4s,v12.4s,32-18 416 ushr v3.4s,v12.4s,32-24 417 sli v0.4s,v12.4s,2 418 sli v25.4s,v13.4s,2 419 sli v1.4s,v12.4s,10 420 sli v2.4s,v12.4s,18 421 sli v3.4s,v12.4s,24 422 eor v24.16b,v0.16b,v12.16b 423 eor v24.16b,v24.16b,v1.16b 424 eor v12.16b,v2.16b,v3.16b 425 eor v12.16b,v12.16b,v24.16b 426 ushr v1.4s,v13.4s,32-10 427 ushr v2.4s,v13.4s,32-18 428 ushr v3.4s,v13.4s,32-24 429 sli v1.4s,v13.4s,10 430 sli v2.4s,v13.4s,18 431 sli v3.4s,v13.4s,24 432 eor v24.16b,v25.16b,v13.16b 433 eor v24.16b,v24.16b,v1.16b 434 eor v13.16b,v2.16b,v3.16b 435 eor v13.16b,v13.16b,v24.16b 436 ldp w7,w8,[x10],8 437 eor v5.16b,v5.16b,v12.16b 438 eor v9.16b,v9.16b,v13.16b 439 440 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 441 dup v12.4s,w7 442 eor v14.16b,v4.16b,v5.16b 443 eor v15.16b,v8.16b,v9.16b 444 eor v0.16b,v7.16b,v12.16b 445 eor v1.16b,v11.16b,v12.16b 446 eor v12.16b,v14.16b,v0.16b 447 eor v13.16b,v15.16b,v1.16b 448 // optimize sbox using AESE instruction 449 tbl v0.16b, {v12.16b}, v26.16b 450 tbl v1.16b, {v13.16b}, v26.16b 451 ushr v24.16b, v0.16b, 4 452 and v0.16b, v0.16b, v31.16b 453 tbl v0.16b, {v28.16b}, v0.16b 454 tbl v24.16b, {v27.16b}, v24.16b 455 eor v0.16b, v0.16b, v24.16b 456 ushr v24.16b, v1.16b, 4 457 and v1.16b, v1.16b, v31.16b 458 tbl v1.16b, {v28.16b}, v1.16b 459 tbl v24.16b, {v27.16b}, v24.16b 460 eor v1.16b, v1.16b, v24.16b 461 eor v25.16b, v25.16b, v25.16b 462 aese v0.16b,v25.16b 463 aese v1.16b,v25.16b 464 ushr v24.16b, v0.16b, 4 465 and v0.16b, v0.16b, v31.16b 466 tbl v0.16b, {v30.16b}, v0.16b 467 tbl v24.16b, {v29.16b}, v24.16b 468 eor v0.16b, v0.16b, v24.16b 469 ushr v24.16b, v1.16b, 4 470 and v1.16b, v1.16b, v31.16b 471 tbl v1.16b, {v30.16b}, v1.16b 472 tbl v24.16b, {v29.16b}, v24.16b 473 eor v1.16b, v1.16b, v24.16b 474 mov v12.16b,v0.16b 475 mov v13.16b,v1.16b 476 477 // linear transformation 478 ushr v0.4s,v12.4s,32-2 479 ushr v25.4s,v13.4s,32-2 480 ushr v1.4s,v12.4s,32-10 481 ushr v2.4s,v12.4s,32-18 482 ushr v3.4s,v12.4s,32-24 483 sli v0.4s,v12.4s,2 484 sli v25.4s,v13.4s,2 485 sli v1.4s,v12.4s,10 486 sli v2.4s,v12.4s,18 487 sli v3.4s,v12.4s,24 488 eor v24.16b,v0.16b,v12.16b 489 eor v24.16b,v24.16b,v1.16b 490 eor v12.16b,v2.16b,v3.16b 491 eor v12.16b,v12.16b,v24.16b 492 ushr v1.4s,v13.4s,32-10 493 ushr v2.4s,v13.4s,32-18 494 ushr v3.4s,v13.4s,32-24 495 sli v1.4s,v13.4s,10 496 sli v2.4s,v13.4s,18 497 sli v3.4s,v13.4s,24 498 eor v24.16b,v25.16b,v13.16b 499 eor v24.16b,v24.16b,v1.16b 500 eor v13.16b,v2.16b,v3.16b 501 eor v13.16b,v13.16b,v24.16b 502 eor v6.16b,v6.16b,v12.16b 503 eor v10.16b,v10.16b,v13.16b 504 505 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 506 dup v13.4s,w8 507 eor v14.16b,v14.16b,v6.16b 508 eor v15.16b,v15.16b,v10.16b 509 eor v12.16b,v14.16b,v13.16b 510 eor v13.16b,v15.16b,v13.16b 511 // optimize sbox using AESE instruction 512 tbl v0.16b, {v12.16b}, v26.16b 513 tbl v1.16b, {v13.16b}, v26.16b 514 ushr v24.16b, v0.16b, 4 515 and v0.16b, v0.16b, v31.16b 516 tbl v0.16b, {v28.16b}, v0.16b 517 tbl v24.16b, {v27.16b}, v24.16b 518 eor v0.16b, v0.16b, v24.16b 519 ushr v24.16b, v1.16b, 4 520 and v1.16b, v1.16b, v31.16b 521 tbl v1.16b, {v28.16b}, v1.16b 522 tbl v24.16b, {v27.16b}, v24.16b 523 eor v1.16b, v1.16b, v24.16b 524 eor v25.16b, v25.16b, v25.16b 525 aese v0.16b,v25.16b 526 aese v1.16b,v25.16b 527 ushr v24.16b, v0.16b, 4 528 and v0.16b, v0.16b, v31.16b 529 tbl v0.16b, {v30.16b}, v0.16b 530 tbl v24.16b, {v29.16b}, v24.16b 531 eor v0.16b, v0.16b, v24.16b 532 ushr v24.16b, v1.16b, 4 533 and v1.16b, v1.16b, v31.16b 534 tbl v1.16b, {v30.16b}, v1.16b 535 tbl v24.16b, {v29.16b}, v24.16b 536 eor v1.16b, v1.16b, v24.16b 537 mov v12.16b,v0.16b 538 mov v13.16b,v1.16b 539 540 // linear transformation 541 ushr v0.4s,v12.4s,32-2 542 ushr v25.4s,v13.4s,32-2 543 ushr v1.4s,v12.4s,32-10 544 ushr v2.4s,v12.4s,32-18 545 ushr v3.4s,v12.4s,32-24 546 sli v0.4s,v12.4s,2 547 sli v25.4s,v13.4s,2 548 sli v1.4s,v12.4s,10 549 sli v2.4s,v12.4s,18 550 sli v3.4s,v12.4s,24 551 eor v24.16b,v0.16b,v12.16b 552 eor v24.16b,v24.16b,v1.16b 553 eor v12.16b,v2.16b,v3.16b 554 eor v12.16b,v12.16b,v24.16b 555 ushr v1.4s,v13.4s,32-10 556 ushr v2.4s,v13.4s,32-18 557 ushr v3.4s,v13.4s,32-24 558 sli v1.4s,v13.4s,10 559 sli v2.4s,v13.4s,18 560 sli v3.4s,v13.4s,24 561 eor v24.16b,v25.16b,v13.16b 562 eor v24.16b,v24.16b,v1.16b 563 eor v13.16b,v2.16b,v3.16b 564 eor v13.16b,v13.16b,v24.16b 565 eor v7.16b,v7.16b,v12.16b 566 eor v11.16b,v11.16b,v13.16b 567 subs w11,w11,#1 568 b.ne 10b 569 #ifndef __AARCH64EB__ 570 rev32 v3.16b,v4.16b 571 #else 572 mov v3.16b,v4.16b 573 #endif 574 #ifndef __AARCH64EB__ 575 rev32 v2.16b,v5.16b 576 #else 577 mov v2.16b,v5.16b 578 #endif 579 #ifndef __AARCH64EB__ 580 rev32 v1.16b,v6.16b 581 #else 582 mov v1.16b,v6.16b 583 #endif 584 #ifndef __AARCH64EB__ 585 rev32 v0.16b,v7.16b 586 #else 587 mov v0.16b,v7.16b 588 #endif 589 #ifndef __AARCH64EB__ 590 rev32 v7.16b,v8.16b 591 #else 592 mov v7.16b,v8.16b 593 #endif 594 #ifndef __AARCH64EB__ 595 rev32 v6.16b,v9.16b 596 #else 597 mov v6.16b,v9.16b 598 #endif 599 #ifndef __AARCH64EB__ 600 rev32 v5.16b,v10.16b 601 #else 602 mov v5.16b,v10.16b 603 #endif 604 #ifndef __AARCH64EB__ 605 rev32 v4.16b,v11.16b 606 #else 607 mov v4.16b,v11.16b 608 #endif 609 ret 610 .size _vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks 611 .globl vpsm4_ex_set_encrypt_key 612 .type vpsm4_ex_set_encrypt_key,%function 613 .align 5 614 vpsm4_ex_set_encrypt_key: 615 AARCH64_SIGN_LINK_REGISTER 616 stp x29,x30,[sp,#-16]! 617 mov w2,1 618 bl _vpsm4_ex_set_key 619 ldp x29,x30,[sp],#16 620 AARCH64_VALIDATE_LINK_REGISTER 621 ret 622 .size vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key 623 .globl vpsm4_ex_set_decrypt_key 624 .type vpsm4_ex_set_decrypt_key,%function 625 .align 5 626 vpsm4_ex_set_decrypt_key: 627 AARCH64_SIGN_LINK_REGISTER 628 stp x29,x30,[sp,#-16]! 629 mov w2,0 630 bl _vpsm4_ex_set_key 631 ldp x29,x30,[sp],#16 632 AARCH64_VALIDATE_LINK_REGISTER 633 ret 634 .size vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key 635 .globl vpsm4_ex_encrypt 636 .type vpsm4_ex_encrypt,%function 637 .align 5 638 vpsm4_ex_encrypt: 639 AARCH64_VALID_CALL_TARGET 640 ld1 {v4.4s},[x0] 641 adrp x9, .Lsbox_magic 642 ldr q26, [x9, #:lo12:.Lsbox_magic] 643 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 644 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 645 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 646 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 647 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 648 #ifndef __AARCH64EB__ 649 rev32 v4.16b,v4.16b 650 #endif 651 mov x3,x2 652 mov x10,x3 653 mov w11,#8 654 mov w12,v4.s[0] 655 mov w13,v4.s[1] 656 mov w14,v4.s[2] 657 mov w15,v4.s[3] 658 10: 659 ldp w7,w8,[x10],8 660 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 661 eor w6,w14,w15 662 eor w9,w7,w13 663 eor w6,w6,w9 664 mov v3.s[0],w6 665 // optimize sbox using AESE instruction 666 tbl v0.16b, {v3.16b}, v26.16b 667 ushr v2.16b, v0.16b, 4 668 and v0.16b, v0.16b, v31.16b 669 tbl v0.16b, {v28.16b}, v0.16b 670 tbl v2.16b, {v27.16b}, v2.16b 671 eor v0.16b, v0.16b, v2.16b 672 eor v1.16b, v1.16b, v1.16b 673 aese v0.16b,v1.16b 674 ushr v2.16b, v0.16b, 4 675 and v0.16b, v0.16b, v31.16b 676 tbl v0.16b, {v30.16b}, v0.16b 677 tbl v2.16b, {v29.16b}, v2.16b 678 eor v0.16b, v0.16b, v2.16b 679 680 mov w7,v0.s[0] 681 eor w6,w7,w7,ror #32-2 682 eor w6,w6,w7,ror #32-10 683 eor w6,w6,w7,ror #32-18 684 eor w6,w6,w7,ror #32-24 685 eor w12,w12,w6 686 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 687 eor w6,w14,w15 688 eor w9,w12,w8 689 eor w6,w6,w9 690 mov v3.s[0],w6 691 // optimize sbox using AESE instruction 692 tbl v0.16b, {v3.16b}, v26.16b 693 ushr v2.16b, v0.16b, 4 694 and v0.16b, v0.16b, v31.16b 695 tbl v0.16b, {v28.16b}, v0.16b 696 tbl v2.16b, {v27.16b}, v2.16b 697 eor v0.16b, v0.16b, v2.16b 698 eor v1.16b, v1.16b, v1.16b 699 aese v0.16b,v1.16b 700 ushr v2.16b, v0.16b, 4 701 and v0.16b, v0.16b, v31.16b 702 tbl v0.16b, {v30.16b}, v0.16b 703 tbl v2.16b, {v29.16b}, v2.16b 704 eor v0.16b, v0.16b, v2.16b 705 706 mov w7,v0.s[0] 707 eor w6,w7,w7,ror #32-2 708 eor w6,w6,w7,ror #32-10 709 eor w6,w6,w7,ror #32-18 710 eor w6,w6,w7,ror #32-24 711 ldp w7,w8,[x10],8 712 eor w13,w13,w6 713 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 714 eor w6,w12,w13 715 eor w9,w7,w15 716 eor w6,w6,w9 717 mov v3.s[0],w6 718 // optimize sbox using AESE instruction 719 tbl v0.16b, {v3.16b}, v26.16b 720 ushr v2.16b, v0.16b, 4 721 and v0.16b, v0.16b, v31.16b 722 tbl v0.16b, {v28.16b}, v0.16b 723 tbl v2.16b, {v27.16b}, v2.16b 724 eor v0.16b, v0.16b, v2.16b 725 eor v1.16b, v1.16b, v1.16b 726 aese v0.16b,v1.16b 727 ushr v2.16b, v0.16b, 4 728 and v0.16b, v0.16b, v31.16b 729 tbl v0.16b, {v30.16b}, v0.16b 730 tbl v2.16b, {v29.16b}, v2.16b 731 eor v0.16b, v0.16b, v2.16b 732 733 mov w7,v0.s[0] 734 eor w6,w7,w7,ror #32-2 735 eor w6,w6,w7,ror #32-10 736 eor w6,w6,w7,ror #32-18 737 eor w6,w6,w7,ror #32-24 738 eor w14,w14,w6 739 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 740 eor w6,w12,w13 741 eor w9,w14,w8 742 eor w6,w6,w9 743 mov v3.s[0],w6 744 // optimize sbox using AESE instruction 745 tbl v0.16b, {v3.16b}, v26.16b 746 ushr v2.16b, v0.16b, 4 747 and v0.16b, v0.16b, v31.16b 748 tbl v0.16b, {v28.16b}, v0.16b 749 tbl v2.16b, {v27.16b}, v2.16b 750 eor v0.16b, v0.16b, v2.16b 751 eor v1.16b, v1.16b, v1.16b 752 aese v0.16b,v1.16b 753 ushr v2.16b, v0.16b, 4 754 and v0.16b, v0.16b, v31.16b 755 tbl v0.16b, {v30.16b}, v0.16b 756 tbl v2.16b, {v29.16b}, v2.16b 757 eor v0.16b, v0.16b, v2.16b 758 759 mov w7,v0.s[0] 760 eor w6,w7,w7,ror #32-2 761 eor w6,w6,w7,ror #32-10 762 eor w6,w6,w7,ror #32-18 763 eor w6,w6,w7,ror #32-24 764 eor w15,w15,w6 765 subs w11,w11,#1 766 b.ne 10b 767 mov v4.s[0],w15 768 mov v4.s[1],w14 769 mov v4.s[2],w13 770 mov v4.s[3],w12 771 #ifndef __AARCH64EB__ 772 rev32 v4.16b,v4.16b 773 #endif 774 st1 {v4.4s},[x1] 775 ret 776 .size vpsm4_ex_encrypt,.-vpsm4_ex_encrypt 777 .globl vpsm4_ex_decrypt 778 .type vpsm4_ex_decrypt,%function 779 .align 5 780 vpsm4_ex_decrypt: 781 AARCH64_VALID_CALL_TARGET 782 ld1 {v4.4s},[x0] 783 adrp x9, .Lsbox_magic 784 ldr q26, [x9, #:lo12:.Lsbox_magic] 785 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 786 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 787 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 788 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 789 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 790 #ifndef __AARCH64EB__ 791 rev32 v4.16b,v4.16b 792 #endif 793 mov x3,x2 794 mov x10,x3 795 mov w11,#8 796 mov w12,v4.s[0] 797 mov w13,v4.s[1] 798 mov w14,v4.s[2] 799 mov w15,v4.s[3] 800 10: 801 ldp w7,w8,[x10],8 802 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 803 eor w6,w14,w15 804 eor w9,w7,w13 805 eor w6,w6,w9 806 mov v3.s[0],w6 807 // optimize sbox using AESE instruction 808 tbl v0.16b, {v3.16b}, v26.16b 809 ushr v2.16b, v0.16b, 4 810 and v0.16b, v0.16b, v31.16b 811 tbl v0.16b, {v28.16b}, v0.16b 812 tbl v2.16b, {v27.16b}, v2.16b 813 eor v0.16b, v0.16b, v2.16b 814 eor v1.16b, v1.16b, v1.16b 815 aese v0.16b,v1.16b 816 ushr v2.16b, v0.16b, 4 817 and v0.16b, v0.16b, v31.16b 818 tbl v0.16b, {v30.16b}, v0.16b 819 tbl v2.16b, {v29.16b}, v2.16b 820 eor v0.16b, v0.16b, v2.16b 821 822 mov w7,v0.s[0] 823 eor w6,w7,w7,ror #32-2 824 eor w6,w6,w7,ror #32-10 825 eor w6,w6,w7,ror #32-18 826 eor w6,w6,w7,ror #32-24 827 eor w12,w12,w6 828 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 829 eor w6,w14,w15 830 eor w9,w12,w8 831 eor w6,w6,w9 832 mov v3.s[0],w6 833 // optimize sbox using AESE instruction 834 tbl v0.16b, {v3.16b}, v26.16b 835 ushr v2.16b, v0.16b, 4 836 and v0.16b, v0.16b, v31.16b 837 tbl v0.16b, {v28.16b}, v0.16b 838 tbl v2.16b, {v27.16b}, v2.16b 839 eor v0.16b, v0.16b, v2.16b 840 eor v1.16b, v1.16b, v1.16b 841 aese v0.16b,v1.16b 842 ushr v2.16b, v0.16b, 4 843 and v0.16b, v0.16b, v31.16b 844 tbl v0.16b, {v30.16b}, v0.16b 845 tbl v2.16b, {v29.16b}, v2.16b 846 eor v0.16b, v0.16b, v2.16b 847 848 mov w7,v0.s[0] 849 eor w6,w7,w7,ror #32-2 850 eor w6,w6,w7,ror #32-10 851 eor w6,w6,w7,ror #32-18 852 eor w6,w6,w7,ror #32-24 853 ldp w7,w8,[x10],8 854 eor w13,w13,w6 855 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 856 eor w6,w12,w13 857 eor w9,w7,w15 858 eor w6,w6,w9 859 mov v3.s[0],w6 860 // optimize sbox using AESE instruction 861 tbl v0.16b, {v3.16b}, v26.16b 862 ushr v2.16b, v0.16b, 4 863 and v0.16b, v0.16b, v31.16b 864 tbl v0.16b, {v28.16b}, v0.16b 865 tbl v2.16b, {v27.16b}, v2.16b 866 eor v0.16b, v0.16b, v2.16b 867 eor v1.16b, v1.16b, v1.16b 868 aese v0.16b,v1.16b 869 ushr v2.16b, v0.16b, 4 870 and v0.16b, v0.16b, v31.16b 871 tbl v0.16b, {v30.16b}, v0.16b 872 tbl v2.16b, {v29.16b}, v2.16b 873 eor v0.16b, v0.16b, v2.16b 874 875 mov w7,v0.s[0] 876 eor w6,w7,w7,ror #32-2 877 eor w6,w6,w7,ror #32-10 878 eor w6,w6,w7,ror #32-18 879 eor w6,w6,w7,ror #32-24 880 eor w14,w14,w6 881 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 882 eor w6,w12,w13 883 eor w9,w14,w8 884 eor w6,w6,w9 885 mov v3.s[0],w6 886 // optimize sbox using AESE instruction 887 tbl v0.16b, {v3.16b}, v26.16b 888 ushr v2.16b, v0.16b, 4 889 and v0.16b, v0.16b, v31.16b 890 tbl v0.16b, {v28.16b}, v0.16b 891 tbl v2.16b, {v27.16b}, v2.16b 892 eor v0.16b, v0.16b, v2.16b 893 eor v1.16b, v1.16b, v1.16b 894 aese v0.16b,v1.16b 895 ushr v2.16b, v0.16b, 4 896 and v0.16b, v0.16b, v31.16b 897 tbl v0.16b, {v30.16b}, v0.16b 898 tbl v2.16b, {v29.16b}, v2.16b 899 eor v0.16b, v0.16b, v2.16b 900 901 mov w7,v0.s[0] 902 eor w6,w7,w7,ror #32-2 903 eor w6,w6,w7,ror #32-10 904 eor w6,w6,w7,ror #32-18 905 eor w6,w6,w7,ror #32-24 906 eor w15,w15,w6 907 subs w11,w11,#1 908 b.ne 10b 909 mov v4.s[0],w15 910 mov v4.s[1],w14 911 mov v4.s[2],w13 912 mov v4.s[3],w12 913 #ifndef __AARCH64EB__ 914 rev32 v4.16b,v4.16b 915 #endif 916 st1 {v4.4s},[x1] 917 ret 918 .size vpsm4_ex_decrypt,.-vpsm4_ex_decrypt 919 .globl vpsm4_ex_ecb_encrypt 920 .type vpsm4_ex_ecb_encrypt,%function 921 .align 5 922 vpsm4_ex_ecb_encrypt: 923 AARCH64_SIGN_LINK_REGISTER 924 // convert length into blocks 925 lsr x2,x2,4 926 stp d8,d9,[sp,#-80]! 927 stp d10,d11,[sp,#16] 928 stp d12,d13,[sp,#32] 929 stp d14,d15,[sp,#48] 930 stp x29,x30,[sp,#64] 931 adrp x9, .Lsbox_magic 932 ldr q26, [x9, #:lo12:.Lsbox_magic] 933 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 934 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 935 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 936 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 937 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 938 .Lecb_8_blocks_process: 939 cmp w2,#8 940 b.lt .Lecb_4_blocks_process 941 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 942 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 943 #ifndef __AARCH64EB__ 944 rev32 v4.16b,v4.16b 945 #endif 946 #ifndef __AARCH64EB__ 947 rev32 v5.16b,v5.16b 948 #endif 949 #ifndef __AARCH64EB__ 950 rev32 v6.16b,v6.16b 951 #endif 952 #ifndef __AARCH64EB__ 953 rev32 v7.16b,v7.16b 954 #endif 955 #ifndef __AARCH64EB__ 956 rev32 v8.16b,v8.16b 957 #endif 958 #ifndef __AARCH64EB__ 959 rev32 v9.16b,v9.16b 960 #endif 961 #ifndef __AARCH64EB__ 962 rev32 v10.16b,v10.16b 963 #endif 964 #ifndef __AARCH64EB__ 965 rev32 v11.16b,v11.16b 966 #endif 967 bl _vpsm4_ex_enc_8blks 968 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 969 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 970 subs w2,w2,#8 971 b.gt .Lecb_8_blocks_process 972 b 100f 973 .Lecb_4_blocks_process: 974 cmp w2,#4 975 b.lt 1f 976 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 977 #ifndef __AARCH64EB__ 978 rev32 v4.16b,v4.16b 979 #endif 980 #ifndef __AARCH64EB__ 981 rev32 v5.16b,v5.16b 982 #endif 983 #ifndef __AARCH64EB__ 984 rev32 v6.16b,v6.16b 985 #endif 986 #ifndef __AARCH64EB__ 987 rev32 v7.16b,v7.16b 988 #endif 989 bl _vpsm4_ex_enc_4blks 990 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 991 sub w2,w2,#4 992 1: 993 // process last block 994 cmp w2,#1 995 b.lt 100f 996 b.gt 1f 997 ld1 {v4.4s},[x0] 998 #ifndef __AARCH64EB__ 999 rev32 v4.16b,v4.16b 1000 #endif 1001 mov x10,x3 1002 mov w11,#8 1003 mov w12,v4.s[0] 1004 mov w13,v4.s[1] 1005 mov w14,v4.s[2] 1006 mov w15,v4.s[3] 1007 10: 1008 ldp w7,w8,[x10],8 1009 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1010 eor w6,w14,w15 1011 eor w9,w7,w13 1012 eor w6,w6,w9 1013 mov v3.s[0],w6 1014 // optimize sbox using AESE instruction 1015 tbl v0.16b, {v3.16b}, v26.16b 1016 ushr v2.16b, v0.16b, 4 1017 and v0.16b, v0.16b, v31.16b 1018 tbl v0.16b, {v28.16b}, v0.16b 1019 tbl v2.16b, {v27.16b}, v2.16b 1020 eor v0.16b, v0.16b, v2.16b 1021 eor v1.16b, v1.16b, v1.16b 1022 aese v0.16b,v1.16b 1023 ushr v2.16b, v0.16b, 4 1024 and v0.16b, v0.16b, v31.16b 1025 tbl v0.16b, {v30.16b}, v0.16b 1026 tbl v2.16b, {v29.16b}, v2.16b 1027 eor v0.16b, v0.16b, v2.16b 1028 1029 mov w7,v0.s[0] 1030 eor w6,w7,w7,ror #32-2 1031 eor w6,w6,w7,ror #32-10 1032 eor w6,w6,w7,ror #32-18 1033 eor w6,w6,w7,ror #32-24 1034 eor w12,w12,w6 1035 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1036 eor w6,w14,w15 1037 eor w9,w12,w8 1038 eor w6,w6,w9 1039 mov v3.s[0],w6 1040 // optimize sbox using AESE instruction 1041 tbl v0.16b, {v3.16b}, v26.16b 1042 ushr v2.16b, v0.16b, 4 1043 and v0.16b, v0.16b, v31.16b 1044 tbl v0.16b, {v28.16b}, v0.16b 1045 tbl v2.16b, {v27.16b}, v2.16b 1046 eor v0.16b, v0.16b, v2.16b 1047 eor v1.16b, v1.16b, v1.16b 1048 aese v0.16b,v1.16b 1049 ushr v2.16b, v0.16b, 4 1050 and v0.16b, v0.16b, v31.16b 1051 tbl v0.16b, {v30.16b}, v0.16b 1052 tbl v2.16b, {v29.16b}, v2.16b 1053 eor v0.16b, v0.16b, v2.16b 1054 1055 mov w7,v0.s[0] 1056 eor w6,w7,w7,ror #32-2 1057 eor w6,w6,w7,ror #32-10 1058 eor w6,w6,w7,ror #32-18 1059 eor w6,w6,w7,ror #32-24 1060 ldp w7,w8,[x10],8 1061 eor w13,w13,w6 1062 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1063 eor w6,w12,w13 1064 eor w9,w7,w15 1065 eor w6,w6,w9 1066 mov v3.s[0],w6 1067 // optimize sbox using AESE instruction 1068 tbl v0.16b, {v3.16b}, v26.16b 1069 ushr v2.16b, v0.16b, 4 1070 and v0.16b, v0.16b, v31.16b 1071 tbl v0.16b, {v28.16b}, v0.16b 1072 tbl v2.16b, {v27.16b}, v2.16b 1073 eor v0.16b, v0.16b, v2.16b 1074 eor v1.16b, v1.16b, v1.16b 1075 aese v0.16b,v1.16b 1076 ushr v2.16b, v0.16b, 4 1077 and v0.16b, v0.16b, v31.16b 1078 tbl v0.16b, {v30.16b}, v0.16b 1079 tbl v2.16b, {v29.16b}, v2.16b 1080 eor v0.16b, v0.16b, v2.16b 1081 1082 mov w7,v0.s[0] 1083 eor w6,w7,w7,ror #32-2 1084 eor w6,w6,w7,ror #32-10 1085 eor w6,w6,w7,ror #32-18 1086 eor w6,w6,w7,ror #32-24 1087 eor w14,w14,w6 1088 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1089 eor w6,w12,w13 1090 eor w9,w14,w8 1091 eor w6,w6,w9 1092 mov v3.s[0],w6 1093 // optimize sbox using AESE instruction 1094 tbl v0.16b, {v3.16b}, v26.16b 1095 ushr v2.16b, v0.16b, 4 1096 and v0.16b, v0.16b, v31.16b 1097 tbl v0.16b, {v28.16b}, v0.16b 1098 tbl v2.16b, {v27.16b}, v2.16b 1099 eor v0.16b, v0.16b, v2.16b 1100 eor v1.16b, v1.16b, v1.16b 1101 aese v0.16b,v1.16b 1102 ushr v2.16b, v0.16b, 4 1103 and v0.16b, v0.16b, v31.16b 1104 tbl v0.16b, {v30.16b}, v0.16b 1105 tbl v2.16b, {v29.16b}, v2.16b 1106 eor v0.16b, v0.16b, v2.16b 1107 1108 mov w7,v0.s[0] 1109 eor w6,w7,w7,ror #32-2 1110 eor w6,w6,w7,ror #32-10 1111 eor w6,w6,w7,ror #32-18 1112 eor w6,w6,w7,ror #32-24 1113 eor w15,w15,w6 1114 subs w11,w11,#1 1115 b.ne 10b 1116 mov v4.s[0],w15 1117 mov v4.s[1],w14 1118 mov v4.s[2],w13 1119 mov v4.s[3],w12 1120 #ifndef __AARCH64EB__ 1121 rev32 v4.16b,v4.16b 1122 #endif 1123 st1 {v4.4s},[x1] 1124 b 100f 1125 1: // process last 2 blocks 1126 ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16 1127 ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16 1128 cmp w2,#2 1129 b.gt 1f 1130 #ifndef __AARCH64EB__ 1131 rev32 v4.16b,v4.16b 1132 #endif 1133 #ifndef __AARCH64EB__ 1134 rev32 v5.16b,v5.16b 1135 #endif 1136 #ifndef __AARCH64EB__ 1137 rev32 v6.16b,v6.16b 1138 #endif 1139 #ifndef __AARCH64EB__ 1140 rev32 v7.16b,v7.16b 1141 #endif 1142 bl _vpsm4_ex_enc_4blks 1143 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 1144 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1] 1145 b 100f 1146 1: // process last 3 blocks 1147 ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16 1148 #ifndef __AARCH64EB__ 1149 rev32 v4.16b,v4.16b 1150 #endif 1151 #ifndef __AARCH64EB__ 1152 rev32 v5.16b,v5.16b 1153 #endif 1154 #ifndef __AARCH64EB__ 1155 rev32 v6.16b,v6.16b 1156 #endif 1157 #ifndef __AARCH64EB__ 1158 rev32 v7.16b,v7.16b 1159 #endif 1160 bl _vpsm4_ex_enc_4blks 1161 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 1162 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 1163 st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1] 1164 100: 1165 ldp d10,d11,[sp,#16] 1166 ldp d12,d13,[sp,#32] 1167 ldp d14,d15,[sp,#48] 1168 ldp x29,x30,[sp,#64] 1169 ldp d8,d9,[sp],#80 1170 AARCH64_VALIDATE_LINK_REGISTER 1171 ret 1172 .size vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt 1173 .globl vpsm4_ex_cbc_encrypt 1174 .type vpsm4_ex_cbc_encrypt,%function 1175 .align 5 1176 vpsm4_ex_cbc_encrypt: 1177 AARCH64_VALID_CALL_TARGET 1178 lsr x2,x2,4 1179 adrp x9, .Lsbox_magic 1180 ldr q26, [x9, #:lo12:.Lsbox_magic] 1181 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 1182 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 1183 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 1184 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 1185 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 1186 cbz w5,.Ldec 1187 ld1 {v3.4s},[x4] 1188 .Lcbc_4_blocks_enc: 1189 cmp w2,#4 1190 b.lt 1f 1191 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 1192 eor v4.16b,v4.16b,v3.16b 1193 #ifndef __AARCH64EB__ 1194 rev32 v5.16b,v5.16b 1195 #endif 1196 #ifndef __AARCH64EB__ 1197 rev32 v4.16b,v4.16b 1198 #endif 1199 #ifndef __AARCH64EB__ 1200 rev32 v6.16b,v6.16b 1201 #endif 1202 #ifndef __AARCH64EB__ 1203 rev32 v7.16b,v7.16b 1204 #endif 1205 mov x10,x3 1206 mov w11,#8 1207 mov w12,v4.s[0] 1208 mov w13,v4.s[1] 1209 mov w14,v4.s[2] 1210 mov w15,v4.s[3] 1211 10: 1212 ldp w7,w8,[x10],8 1213 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1214 eor w6,w14,w15 1215 eor w9,w7,w13 1216 eor w6,w6,w9 1217 mov v3.s[0],w6 1218 // optimize sbox using AESE instruction 1219 tbl v0.16b, {v3.16b}, v26.16b 1220 ushr v2.16b, v0.16b, 4 1221 and v0.16b, v0.16b, v31.16b 1222 tbl v0.16b, {v28.16b}, v0.16b 1223 tbl v2.16b, {v27.16b}, v2.16b 1224 eor v0.16b, v0.16b, v2.16b 1225 eor v1.16b, v1.16b, v1.16b 1226 aese v0.16b,v1.16b 1227 ushr v2.16b, v0.16b, 4 1228 and v0.16b, v0.16b, v31.16b 1229 tbl v0.16b, {v30.16b}, v0.16b 1230 tbl v2.16b, {v29.16b}, v2.16b 1231 eor v0.16b, v0.16b, v2.16b 1232 1233 mov w7,v0.s[0] 1234 eor w6,w7,w7,ror #32-2 1235 eor w6,w6,w7,ror #32-10 1236 eor w6,w6,w7,ror #32-18 1237 eor w6,w6,w7,ror #32-24 1238 eor w12,w12,w6 1239 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1240 eor w6,w14,w15 1241 eor w9,w12,w8 1242 eor w6,w6,w9 1243 mov v3.s[0],w6 1244 // optimize sbox using AESE instruction 1245 tbl v0.16b, {v3.16b}, v26.16b 1246 ushr v2.16b, v0.16b, 4 1247 and v0.16b, v0.16b, v31.16b 1248 tbl v0.16b, {v28.16b}, v0.16b 1249 tbl v2.16b, {v27.16b}, v2.16b 1250 eor v0.16b, v0.16b, v2.16b 1251 eor v1.16b, v1.16b, v1.16b 1252 aese v0.16b,v1.16b 1253 ushr v2.16b, v0.16b, 4 1254 and v0.16b, v0.16b, v31.16b 1255 tbl v0.16b, {v30.16b}, v0.16b 1256 tbl v2.16b, {v29.16b}, v2.16b 1257 eor v0.16b, v0.16b, v2.16b 1258 1259 mov w7,v0.s[0] 1260 eor w6,w7,w7,ror #32-2 1261 eor w6,w6,w7,ror #32-10 1262 eor w6,w6,w7,ror #32-18 1263 eor w6,w6,w7,ror #32-24 1264 ldp w7,w8,[x10],8 1265 eor w13,w13,w6 1266 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1267 eor w6,w12,w13 1268 eor w9,w7,w15 1269 eor w6,w6,w9 1270 mov v3.s[0],w6 1271 // optimize sbox using AESE instruction 1272 tbl v0.16b, {v3.16b}, v26.16b 1273 ushr v2.16b, v0.16b, 4 1274 and v0.16b, v0.16b, v31.16b 1275 tbl v0.16b, {v28.16b}, v0.16b 1276 tbl v2.16b, {v27.16b}, v2.16b 1277 eor v0.16b, v0.16b, v2.16b 1278 eor v1.16b, v1.16b, v1.16b 1279 aese v0.16b,v1.16b 1280 ushr v2.16b, v0.16b, 4 1281 and v0.16b, v0.16b, v31.16b 1282 tbl v0.16b, {v30.16b}, v0.16b 1283 tbl v2.16b, {v29.16b}, v2.16b 1284 eor v0.16b, v0.16b, v2.16b 1285 1286 mov w7,v0.s[0] 1287 eor w6,w7,w7,ror #32-2 1288 eor w6,w6,w7,ror #32-10 1289 eor w6,w6,w7,ror #32-18 1290 eor w6,w6,w7,ror #32-24 1291 eor w14,w14,w6 1292 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1293 eor w6,w12,w13 1294 eor w9,w14,w8 1295 eor w6,w6,w9 1296 mov v3.s[0],w6 1297 // optimize sbox using AESE instruction 1298 tbl v0.16b, {v3.16b}, v26.16b 1299 ushr v2.16b, v0.16b, 4 1300 and v0.16b, v0.16b, v31.16b 1301 tbl v0.16b, {v28.16b}, v0.16b 1302 tbl v2.16b, {v27.16b}, v2.16b 1303 eor v0.16b, v0.16b, v2.16b 1304 eor v1.16b, v1.16b, v1.16b 1305 aese v0.16b,v1.16b 1306 ushr v2.16b, v0.16b, 4 1307 and v0.16b, v0.16b, v31.16b 1308 tbl v0.16b, {v30.16b}, v0.16b 1309 tbl v2.16b, {v29.16b}, v2.16b 1310 eor v0.16b, v0.16b, v2.16b 1311 1312 mov w7,v0.s[0] 1313 eor w6,w7,w7,ror #32-2 1314 eor w6,w6,w7,ror #32-10 1315 eor w6,w6,w7,ror #32-18 1316 eor w6,w6,w7,ror #32-24 1317 eor w15,w15,w6 1318 subs w11,w11,#1 1319 b.ne 10b 1320 mov v4.s[0],w15 1321 mov v4.s[1],w14 1322 mov v4.s[2],w13 1323 mov v4.s[3],w12 1324 eor v5.16b,v5.16b,v4.16b 1325 mov x10,x3 1326 mov w11,#8 1327 mov w12,v5.s[0] 1328 mov w13,v5.s[1] 1329 mov w14,v5.s[2] 1330 mov w15,v5.s[3] 1331 10: 1332 ldp w7,w8,[x10],8 1333 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1334 eor w6,w14,w15 1335 eor w9,w7,w13 1336 eor w6,w6,w9 1337 mov v3.s[0],w6 1338 // optimize sbox using AESE instruction 1339 tbl v0.16b, {v3.16b}, v26.16b 1340 ushr v2.16b, v0.16b, 4 1341 and v0.16b, v0.16b, v31.16b 1342 tbl v0.16b, {v28.16b}, v0.16b 1343 tbl v2.16b, {v27.16b}, v2.16b 1344 eor v0.16b, v0.16b, v2.16b 1345 eor v1.16b, v1.16b, v1.16b 1346 aese v0.16b,v1.16b 1347 ushr v2.16b, v0.16b, 4 1348 and v0.16b, v0.16b, v31.16b 1349 tbl v0.16b, {v30.16b}, v0.16b 1350 tbl v2.16b, {v29.16b}, v2.16b 1351 eor v0.16b, v0.16b, v2.16b 1352 1353 mov w7,v0.s[0] 1354 eor w6,w7,w7,ror #32-2 1355 eor w6,w6,w7,ror #32-10 1356 eor w6,w6,w7,ror #32-18 1357 eor w6,w6,w7,ror #32-24 1358 eor w12,w12,w6 1359 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1360 eor w6,w14,w15 1361 eor w9,w12,w8 1362 eor w6,w6,w9 1363 mov v3.s[0],w6 1364 // optimize sbox using AESE instruction 1365 tbl v0.16b, {v3.16b}, v26.16b 1366 ushr v2.16b, v0.16b, 4 1367 and v0.16b, v0.16b, v31.16b 1368 tbl v0.16b, {v28.16b}, v0.16b 1369 tbl v2.16b, {v27.16b}, v2.16b 1370 eor v0.16b, v0.16b, v2.16b 1371 eor v1.16b, v1.16b, v1.16b 1372 aese v0.16b,v1.16b 1373 ushr v2.16b, v0.16b, 4 1374 and v0.16b, v0.16b, v31.16b 1375 tbl v0.16b, {v30.16b}, v0.16b 1376 tbl v2.16b, {v29.16b}, v2.16b 1377 eor v0.16b, v0.16b, v2.16b 1378 1379 mov w7,v0.s[0] 1380 eor w6,w7,w7,ror #32-2 1381 eor w6,w6,w7,ror #32-10 1382 eor w6,w6,w7,ror #32-18 1383 eor w6,w6,w7,ror #32-24 1384 ldp w7,w8,[x10],8 1385 eor w13,w13,w6 1386 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1387 eor w6,w12,w13 1388 eor w9,w7,w15 1389 eor w6,w6,w9 1390 mov v3.s[0],w6 1391 // optimize sbox using AESE instruction 1392 tbl v0.16b, {v3.16b}, v26.16b 1393 ushr v2.16b, v0.16b, 4 1394 and v0.16b, v0.16b, v31.16b 1395 tbl v0.16b, {v28.16b}, v0.16b 1396 tbl v2.16b, {v27.16b}, v2.16b 1397 eor v0.16b, v0.16b, v2.16b 1398 eor v1.16b, v1.16b, v1.16b 1399 aese v0.16b,v1.16b 1400 ushr v2.16b, v0.16b, 4 1401 and v0.16b, v0.16b, v31.16b 1402 tbl v0.16b, {v30.16b}, v0.16b 1403 tbl v2.16b, {v29.16b}, v2.16b 1404 eor v0.16b, v0.16b, v2.16b 1405 1406 mov w7,v0.s[0] 1407 eor w6,w7,w7,ror #32-2 1408 eor w6,w6,w7,ror #32-10 1409 eor w6,w6,w7,ror #32-18 1410 eor w6,w6,w7,ror #32-24 1411 eor w14,w14,w6 1412 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1413 eor w6,w12,w13 1414 eor w9,w14,w8 1415 eor w6,w6,w9 1416 mov v3.s[0],w6 1417 // optimize sbox using AESE instruction 1418 tbl v0.16b, {v3.16b}, v26.16b 1419 ushr v2.16b, v0.16b, 4 1420 and v0.16b, v0.16b, v31.16b 1421 tbl v0.16b, {v28.16b}, v0.16b 1422 tbl v2.16b, {v27.16b}, v2.16b 1423 eor v0.16b, v0.16b, v2.16b 1424 eor v1.16b, v1.16b, v1.16b 1425 aese v0.16b,v1.16b 1426 ushr v2.16b, v0.16b, 4 1427 and v0.16b, v0.16b, v31.16b 1428 tbl v0.16b, {v30.16b}, v0.16b 1429 tbl v2.16b, {v29.16b}, v2.16b 1430 eor v0.16b, v0.16b, v2.16b 1431 1432 mov w7,v0.s[0] 1433 eor w6,w7,w7,ror #32-2 1434 eor w6,w6,w7,ror #32-10 1435 eor w6,w6,w7,ror #32-18 1436 eor w6,w6,w7,ror #32-24 1437 eor w15,w15,w6 1438 subs w11,w11,#1 1439 b.ne 10b 1440 mov v5.s[0],w15 1441 mov v5.s[1],w14 1442 mov v5.s[2],w13 1443 mov v5.s[3],w12 1444 #ifndef __AARCH64EB__ 1445 rev32 v4.16b,v4.16b 1446 #endif 1447 eor v6.16b,v6.16b,v5.16b 1448 mov x10,x3 1449 mov w11,#8 1450 mov w12,v6.s[0] 1451 mov w13,v6.s[1] 1452 mov w14,v6.s[2] 1453 mov w15,v6.s[3] 1454 10: 1455 ldp w7,w8,[x10],8 1456 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1457 eor w6,w14,w15 1458 eor w9,w7,w13 1459 eor w6,w6,w9 1460 mov v3.s[0],w6 1461 // optimize sbox using AESE instruction 1462 tbl v0.16b, {v3.16b}, v26.16b 1463 ushr v2.16b, v0.16b, 4 1464 and v0.16b, v0.16b, v31.16b 1465 tbl v0.16b, {v28.16b}, v0.16b 1466 tbl v2.16b, {v27.16b}, v2.16b 1467 eor v0.16b, v0.16b, v2.16b 1468 eor v1.16b, v1.16b, v1.16b 1469 aese v0.16b,v1.16b 1470 ushr v2.16b, v0.16b, 4 1471 and v0.16b, v0.16b, v31.16b 1472 tbl v0.16b, {v30.16b}, v0.16b 1473 tbl v2.16b, {v29.16b}, v2.16b 1474 eor v0.16b, v0.16b, v2.16b 1475 1476 mov w7,v0.s[0] 1477 eor w6,w7,w7,ror #32-2 1478 eor w6,w6,w7,ror #32-10 1479 eor w6,w6,w7,ror #32-18 1480 eor w6,w6,w7,ror #32-24 1481 eor w12,w12,w6 1482 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1483 eor w6,w14,w15 1484 eor w9,w12,w8 1485 eor w6,w6,w9 1486 mov v3.s[0],w6 1487 // optimize sbox using AESE instruction 1488 tbl v0.16b, {v3.16b}, v26.16b 1489 ushr v2.16b, v0.16b, 4 1490 and v0.16b, v0.16b, v31.16b 1491 tbl v0.16b, {v28.16b}, v0.16b 1492 tbl v2.16b, {v27.16b}, v2.16b 1493 eor v0.16b, v0.16b, v2.16b 1494 eor v1.16b, v1.16b, v1.16b 1495 aese v0.16b,v1.16b 1496 ushr v2.16b, v0.16b, 4 1497 and v0.16b, v0.16b, v31.16b 1498 tbl v0.16b, {v30.16b}, v0.16b 1499 tbl v2.16b, {v29.16b}, v2.16b 1500 eor v0.16b, v0.16b, v2.16b 1501 1502 mov w7,v0.s[0] 1503 eor w6,w7,w7,ror #32-2 1504 eor w6,w6,w7,ror #32-10 1505 eor w6,w6,w7,ror #32-18 1506 eor w6,w6,w7,ror #32-24 1507 ldp w7,w8,[x10],8 1508 eor w13,w13,w6 1509 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1510 eor w6,w12,w13 1511 eor w9,w7,w15 1512 eor w6,w6,w9 1513 mov v3.s[0],w6 1514 // optimize sbox using AESE instruction 1515 tbl v0.16b, {v3.16b}, v26.16b 1516 ushr v2.16b, v0.16b, 4 1517 and v0.16b, v0.16b, v31.16b 1518 tbl v0.16b, {v28.16b}, v0.16b 1519 tbl v2.16b, {v27.16b}, v2.16b 1520 eor v0.16b, v0.16b, v2.16b 1521 eor v1.16b, v1.16b, v1.16b 1522 aese v0.16b,v1.16b 1523 ushr v2.16b, v0.16b, 4 1524 and v0.16b, v0.16b, v31.16b 1525 tbl v0.16b, {v30.16b}, v0.16b 1526 tbl v2.16b, {v29.16b}, v2.16b 1527 eor v0.16b, v0.16b, v2.16b 1528 1529 mov w7,v0.s[0] 1530 eor w6,w7,w7,ror #32-2 1531 eor w6,w6,w7,ror #32-10 1532 eor w6,w6,w7,ror #32-18 1533 eor w6,w6,w7,ror #32-24 1534 eor w14,w14,w6 1535 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1536 eor w6,w12,w13 1537 eor w9,w14,w8 1538 eor w6,w6,w9 1539 mov v3.s[0],w6 1540 // optimize sbox using AESE instruction 1541 tbl v0.16b, {v3.16b}, v26.16b 1542 ushr v2.16b, v0.16b, 4 1543 and v0.16b, v0.16b, v31.16b 1544 tbl v0.16b, {v28.16b}, v0.16b 1545 tbl v2.16b, {v27.16b}, v2.16b 1546 eor v0.16b, v0.16b, v2.16b 1547 eor v1.16b, v1.16b, v1.16b 1548 aese v0.16b,v1.16b 1549 ushr v2.16b, v0.16b, 4 1550 and v0.16b, v0.16b, v31.16b 1551 tbl v0.16b, {v30.16b}, v0.16b 1552 tbl v2.16b, {v29.16b}, v2.16b 1553 eor v0.16b, v0.16b, v2.16b 1554 1555 mov w7,v0.s[0] 1556 eor w6,w7,w7,ror #32-2 1557 eor w6,w6,w7,ror #32-10 1558 eor w6,w6,w7,ror #32-18 1559 eor w6,w6,w7,ror #32-24 1560 eor w15,w15,w6 1561 subs w11,w11,#1 1562 b.ne 10b 1563 mov v6.s[0],w15 1564 mov v6.s[1],w14 1565 mov v6.s[2],w13 1566 mov v6.s[3],w12 1567 #ifndef __AARCH64EB__ 1568 rev32 v5.16b,v5.16b 1569 #endif 1570 eor v7.16b,v7.16b,v6.16b 1571 mov x10,x3 1572 mov w11,#8 1573 mov w12,v7.s[0] 1574 mov w13,v7.s[1] 1575 mov w14,v7.s[2] 1576 mov w15,v7.s[3] 1577 10: 1578 ldp w7,w8,[x10],8 1579 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1580 eor w6,w14,w15 1581 eor w9,w7,w13 1582 eor w6,w6,w9 1583 mov v3.s[0],w6 1584 // optimize sbox using AESE instruction 1585 tbl v0.16b, {v3.16b}, v26.16b 1586 ushr v2.16b, v0.16b, 4 1587 and v0.16b, v0.16b, v31.16b 1588 tbl v0.16b, {v28.16b}, v0.16b 1589 tbl v2.16b, {v27.16b}, v2.16b 1590 eor v0.16b, v0.16b, v2.16b 1591 eor v1.16b, v1.16b, v1.16b 1592 aese v0.16b,v1.16b 1593 ushr v2.16b, v0.16b, 4 1594 and v0.16b, v0.16b, v31.16b 1595 tbl v0.16b, {v30.16b}, v0.16b 1596 tbl v2.16b, {v29.16b}, v2.16b 1597 eor v0.16b, v0.16b, v2.16b 1598 1599 mov w7,v0.s[0] 1600 eor w6,w7,w7,ror #32-2 1601 eor w6,w6,w7,ror #32-10 1602 eor w6,w6,w7,ror #32-18 1603 eor w6,w6,w7,ror #32-24 1604 eor w12,w12,w6 1605 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1606 eor w6,w14,w15 1607 eor w9,w12,w8 1608 eor w6,w6,w9 1609 mov v3.s[0],w6 1610 // optimize sbox using AESE instruction 1611 tbl v0.16b, {v3.16b}, v26.16b 1612 ushr v2.16b, v0.16b, 4 1613 and v0.16b, v0.16b, v31.16b 1614 tbl v0.16b, {v28.16b}, v0.16b 1615 tbl v2.16b, {v27.16b}, v2.16b 1616 eor v0.16b, v0.16b, v2.16b 1617 eor v1.16b, v1.16b, v1.16b 1618 aese v0.16b,v1.16b 1619 ushr v2.16b, v0.16b, 4 1620 and v0.16b, v0.16b, v31.16b 1621 tbl v0.16b, {v30.16b}, v0.16b 1622 tbl v2.16b, {v29.16b}, v2.16b 1623 eor v0.16b, v0.16b, v2.16b 1624 1625 mov w7,v0.s[0] 1626 eor w6,w7,w7,ror #32-2 1627 eor w6,w6,w7,ror #32-10 1628 eor w6,w6,w7,ror #32-18 1629 eor w6,w6,w7,ror #32-24 1630 ldp w7,w8,[x10],8 1631 eor w13,w13,w6 1632 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1633 eor w6,w12,w13 1634 eor w9,w7,w15 1635 eor w6,w6,w9 1636 mov v3.s[0],w6 1637 // optimize sbox using AESE instruction 1638 tbl v0.16b, {v3.16b}, v26.16b 1639 ushr v2.16b, v0.16b, 4 1640 and v0.16b, v0.16b, v31.16b 1641 tbl v0.16b, {v28.16b}, v0.16b 1642 tbl v2.16b, {v27.16b}, v2.16b 1643 eor v0.16b, v0.16b, v2.16b 1644 eor v1.16b, v1.16b, v1.16b 1645 aese v0.16b,v1.16b 1646 ushr v2.16b, v0.16b, 4 1647 and v0.16b, v0.16b, v31.16b 1648 tbl v0.16b, {v30.16b}, v0.16b 1649 tbl v2.16b, {v29.16b}, v2.16b 1650 eor v0.16b, v0.16b, v2.16b 1651 1652 mov w7,v0.s[0] 1653 eor w6,w7,w7,ror #32-2 1654 eor w6,w6,w7,ror #32-10 1655 eor w6,w6,w7,ror #32-18 1656 eor w6,w6,w7,ror #32-24 1657 eor w14,w14,w6 1658 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1659 eor w6,w12,w13 1660 eor w9,w14,w8 1661 eor w6,w6,w9 1662 mov v3.s[0],w6 1663 // optimize sbox using AESE instruction 1664 tbl v0.16b, {v3.16b}, v26.16b 1665 ushr v2.16b, v0.16b, 4 1666 and v0.16b, v0.16b, v31.16b 1667 tbl v0.16b, {v28.16b}, v0.16b 1668 tbl v2.16b, {v27.16b}, v2.16b 1669 eor v0.16b, v0.16b, v2.16b 1670 eor v1.16b, v1.16b, v1.16b 1671 aese v0.16b,v1.16b 1672 ushr v2.16b, v0.16b, 4 1673 and v0.16b, v0.16b, v31.16b 1674 tbl v0.16b, {v30.16b}, v0.16b 1675 tbl v2.16b, {v29.16b}, v2.16b 1676 eor v0.16b, v0.16b, v2.16b 1677 1678 mov w7,v0.s[0] 1679 eor w6,w7,w7,ror #32-2 1680 eor w6,w6,w7,ror #32-10 1681 eor w6,w6,w7,ror #32-18 1682 eor w6,w6,w7,ror #32-24 1683 eor w15,w15,w6 1684 subs w11,w11,#1 1685 b.ne 10b 1686 mov v7.s[0],w15 1687 mov v7.s[1],w14 1688 mov v7.s[2],w13 1689 mov v7.s[3],w12 1690 #ifndef __AARCH64EB__ 1691 rev32 v6.16b,v6.16b 1692 #endif 1693 #ifndef __AARCH64EB__ 1694 rev32 v7.16b,v7.16b 1695 #endif 1696 orr v3.16b,v7.16b,v7.16b 1697 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 1698 subs w2,w2,#4 1699 b.ne .Lcbc_4_blocks_enc 1700 b 2f 1701 1: 1702 subs w2,w2,#1 1703 b.lt 2f 1704 ld1 {v4.4s},[x0],#16 1705 eor v3.16b,v3.16b,v4.16b 1706 #ifndef __AARCH64EB__ 1707 rev32 v3.16b,v3.16b 1708 #endif 1709 mov x10,x3 1710 mov w11,#8 1711 mov w12,v3.s[0] 1712 mov w13,v3.s[1] 1713 mov w14,v3.s[2] 1714 mov w15,v3.s[3] 1715 10: 1716 ldp w7,w8,[x10],8 1717 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1718 eor w6,w14,w15 1719 eor w9,w7,w13 1720 eor w6,w6,w9 1721 mov v3.s[0],w6 1722 // optimize sbox using AESE instruction 1723 tbl v0.16b, {v3.16b}, v26.16b 1724 ushr v2.16b, v0.16b, 4 1725 and v0.16b, v0.16b, v31.16b 1726 tbl v0.16b, {v28.16b}, v0.16b 1727 tbl v2.16b, {v27.16b}, v2.16b 1728 eor v0.16b, v0.16b, v2.16b 1729 eor v1.16b, v1.16b, v1.16b 1730 aese v0.16b,v1.16b 1731 ushr v2.16b, v0.16b, 4 1732 and v0.16b, v0.16b, v31.16b 1733 tbl v0.16b, {v30.16b}, v0.16b 1734 tbl v2.16b, {v29.16b}, v2.16b 1735 eor v0.16b, v0.16b, v2.16b 1736 1737 mov w7,v0.s[0] 1738 eor w6,w7,w7,ror #32-2 1739 eor w6,w6,w7,ror #32-10 1740 eor w6,w6,w7,ror #32-18 1741 eor w6,w6,w7,ror #32-24 1742 eor w12,w12,w6 1743 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1744 eor w6,w14,w15 1745 eor w9,w12,w8 1746 eor w6,w6,w9 1747 mov v3.s[0],w6 1748 // optimize sbox using AESE instruction 1749 tbl v0.16b, {v3.16b}, v26.16b 1750 ushr v2.16b, v0.16b, 4 1751 and v0.16b, v0.16b, v31.16b 1752 tbl v0.16b, {v28.16b}, v0.16b 1753 tbl v2.16b, {v27.16b}, v2.16b 1754 eor v0.16b, v0.16b, v2.16b 1755 eor v1.16b, v1.16b, v1.16b 1756 aese v0.16b,v1.16b 1757 ushr v2.16b, v0.16b, 4 1758 and v0.16b, v0.16b, v31.16b 1759 tbl v0.16b, {v30.16b}, v0.16b 1760 tbl v2.16b, {v29.16b}, v2.16b 1761 eor v0.16b, v0.16b, v2.16b 1762 1763 mov w7,v0.s[0] 1764 eor w6,w7,w7,ror #32-2 1765 eor w6,w6,w7,ror #32-10 1766 eor w6,w6,w7,ror #32-18 1767 eor w6,w6,w7,ror #32-24 1768 ldp w7,w8,[x10],8 1769 eor w13,w13,w6 1770 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1771 eor w6,w12,w13 1772 eor w9,w7,w15 1773 eor w6,w6,w9 1774 mov v3.s[0],w6 1775 // optimize sbox using AESE instruction 1776 tbl v0.16b, {v3.16b}, v26.16b 1777 ushr v2.16b, v0.16b, 4 1778 and v0.16b, v0.16b, v31.16b 1779 tbl v0.16b, {v28.16b}, v0.16b 1780 tbl v2.16b, {v27.16b}, v2.16b 1781 eor v0.16b, v0.16b, v2.16b 1782 eor v1.16b, v1.16b, v1.16b 1783 aese v0.16b,v1.16b 1784 ushr v2.16b, v0.16b, 4 1785 and v0.16b, v0.16b, v31.16b 1786 tbl v0.16b, {v30.16b}, v0.16b 1787 tbl v2.16b, {v29.16b}, v2.16b 1788 eor v0.16b, v0.16b, v2.16b 1789 1790 mov w7,v0.s[0] 1791 eor w6,w7,w7,ror #32-2 1792 eor w6,w6,w7,ror #32-10 1793 eor w6,w6,w7,ror #32-18 1794 eor w6,w6,w7,ror #32-24 1795 eor w14,w14,w6 1796 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1797 eor w6,w12,w13 1798 eor w9,w14,w8 1799 eor w6,w6,w9 1800 mov v3.s[0],w6 1801 // optimize sbox using AESE instruction 1802 tbl v0.16b, {v3.16b}, v26.16b 1803 ushr v2.16b, v0.16b, 4 1804 and v0.16b, v0.16b, v31.16b 1805 tbl v0.16b, {v28.16b}, v0.16b 1806 tbl v2.16b, {v27.16b}, v2.16b 1807 eor v0.16b, v0.16b, v2.16b 1808 eor v1.16b, v1.16b, v1.16b 1809 aese v0.16b,v1.16b 1810 ushr v2.16b, v0.16b, 4 1811 and v0.16b, v0.16b, v31.16b 1812 tbl v0.16b, {v30.16b}, v0.16b 1813 tbl v2.16b, {v29.16b}, v2.16b 1814 eor v0.16b, v0.16b, v2.16b 1815 1816 mov w7,v0.s[0] 1817 eor w6,w7,w7,ror #32-2 1818 eor w6,w6,w7,ror #32-10 1819 eor w6,w6,w7,ror #32-18 1820 eor w6,w6,w7,ror #32-24 1821 eor w15,w15,w6 1822 subs w11,w11,#1 1823 b.ne 10b 1824 mov v3.s[0],w15 1825 mov v3.s[1],w14 1826 mov v3.s[2],w13 1827 mov v3.s[3],w12 1828 #ifndef __AARCH64EB__ 1829 rev32 v3.16b,v3.16b 1830 #endif 1831 st1 {v3.4s},[x1],#16 1832 b 1b 1833 2: 1834 // save back IV 1835 st1 {v3.4s},[x4] 1836 ret 1837 1838 .Ldec: 1839 // decryption mode starts 1840 AARCH64_SIGN_LINK_REGISTER 1841 stp d8,d9,[sp,#-80]! 1842 stp d10,d11,[sp,#16] 1843 stp d12,d13,[sp,#32] 1844 stp d14,d15,[sp,#48] 1845 stp x29,x30,[sp,#64] 1846 .Lcbc_8_blocks_dec: 1847 cmp w2,#8 1848 b.lt 1f 1849 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] 1850 add x10,x0,#64 1851 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10] 1852 #ifndef __AARCH64EB__ 1853 rev32 v4.16b,v4.16b 1854 #endif 1855 #ifndef __AARCH64EB__ 1856 rev32 v5.16b,v5.16b 1857 #endif 1858 #ifndef __AARCH64EB__ 1859 rev32 v6.16b,v6.16b 1860 #endif 1861 #ifndef __AARCH64EB__ 1862 rev32 v7.16b,v7.16b 1863 #endif 1864 #ifndef __AARCH64EB__ 1865 rev32 v8.16b,v8.16b 1866 #endif 1867 #ifndef __AARCH64EB__ 1868 rev32 v9.16b,v9.16b 1869 #endif 1870 #ifndef __AARCH64EB__ 1871 rev32 v10.16b,v10.16b 1872 #endif 1873 #ifndef __AARCH64EB__ 1874 rev32 v11.16b,v11.16b 1875 #endif 1876 bl _vpsm4_ex_enc_8blks 1877 zip1 v8.4s,v0.4s,v1.4s 1878 zip2 v9.4s,v0.4s,v1.4s 1879 zip1 v10.4s,v2.4s,v3.4s 1880 zip2 v11.4s,v2.4s,v3.4s 1881 zip1 v0.2d,v8.2d,v10.2d 1882 zip2 v1.2d,v8.2d,v10.2d 1883 zip1 v2.2d,v9.2d,v11.2d 1884 zip2 v3.2d,v9.2d,v11.2d 1885 zip1 v8.4s,v4.4s,v5.4s 1886 zip2 v9.4s,v4.4s,v5.4s 1887 zip1 v10.4s,v6.4s,v7.4s 1888 zip2 v11.4s,v6.4s,v7.4s 1889 zip1 v4.2d,v8.2d,v10.2d 1890 zip2 v5.2d,v8.2d,v10.2d 1891 zip1 v6.2d,v9.2d,v11.2d 1892 zip2 v7.2d,v9.2d,v11.2d 1893 ld1 {v15.4s},[x4] 1894 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 1895 // note ivec1 and vtmpx[3] are reusing the same register 1896 // care needs to be taken to avoid conflict 1897 eor v0.16b,v0.16b,v15.16b 1898 ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 1899 eor v1.16b,v1.16b,v8.16b 1900 eor v2.16b,v2.16b,v9.16b 1901 eor v3.16b,v3.16b,v10.16b 1902 // save back IV 1903 st1 {v15.4s}, [x4] 1904 eor v4.16b,v4.16b,v11.16b 1905 eor v5.16b,v5.16b,v12.16b 1906 eor v6.16b,v6.16b,v13.16b 1907 eor v7.16b,v7.16b,v14.16b 1908 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 1909 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 1910 subs w2,w2,#8 1911 b.gt .Lcbc_8_blocks_dec 1912 b.eq 100f 1913 1: 1914 ld1 {v15.4s},[x4] 1915 .Lcbc_4_blocks_dec: 1916 cmp w2,#4 1917 b.lt 1f 1918 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] 1919 #ifndef __AARCH64EB__ 1920 rev32 v4.16b,v4.16b 1921 #endif 1922 #ifndef __AARCH64EB__ 1923 rev32 v5.16b,v5.16b 1924 #endif 1925 #ifndef __AARCH64EB__ 1926 rev32 v6.16b,v6.16b 1927 #endif 1928 #ifndef __AARCH64EB__ 1929 rev32 v7.16b,v7.16b 1930 #endif 1931 bl _vpsm4_ex_enc_4blks 1932 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 1933 zip1 v8.4s,v0.4s,v1.4s 1934 zip2 v9.4s,v0.4s,v1.4s 1935 zip1 v10.4s,v2.4s,v3.4s 1936 zip2 v11.4s,v2.4s,v3.4s 1937 zip1 v0.2d,v8.2d,v10.2d 1938 zip2 v1.2d,v8.2d,v10.2d 1939 zip1 v2.2d,v9.2d,v11.2d 1940 zip2 v3.2d,v9.2d,v11.2d 1941 eor v0.16b,v0.16b,v15.16b 1942 eor v1.16b,v1.16b,v4.16b 1943 orr v15.16b,v7.16b,v7.16b 1944 eor v2.16b,v2.16b,v5.16b 1945 eor v3.16b,v3.16b,v6.16b 1946 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 1947 subs w2,w2,#4 1948 b.gt .Lcbc_4_blocks_dec 1949 // save back IV 1950 st1 {v7.4s}, [x4] 1951 b 100f 1952 1: // last block 1953 subs w2,w2,#1 1954 b.lt 100f 1955 b.gt 1f 1956 ld1 {v4.4s},[x0],#16 1957 // save back IV 1958 st1 {v4.4s}, [x4] 1959 #ifndef __AARCH64EB__ 1960 rev32 v8.16b,v4.16b 1961 #else 1962 mov v8.16b,v4.16b 1963 #endif 1964 mov x10,x3 1965 mov w11,#8 1966 mov w12,v8.s[0] 1967 mov w13,v8.s[1] 1968 mov w14,v8.s[2] 1969 mov w15,v8.s[3] 1970 10: 1971 ldp w7,w8,[x10],8 1972 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1973 eor w6,w14,w15 1974 eor w9,w7,w13 1975 eor w6,w6,w9 1976 mov v3.s[0],w6 1977 // optimize sbox using AESE instruction 1978 tbl v0.16b, {v3.16b}, v26.16b 1979 ushr v2.16b, v0.16b, 4 1980 and v0.16b, v0.16b, v31.16b 1981 tbl v0.16b, {v28.16b}, v0.16b 1982 tbl v2.16b, {v27.16b}, v2.16b 1983 eor v0.16b, v0.16b, v2.16b 1984 eor v1.16b, v1.16b, v1.16b 1985 aese v0.16b,v1.16b 1986 ushr v2.16b, v0.16b, 4 1987 and v0.16b, v0.16b, v31.16b 1988 tbl v0.16b, {v30.16b}, v0.16b 1989 tbl v2.16b, {v29.16b}, v2.16b 1990 eor v0.16b, v0.16b, v2.16b 1991 1992 mov w7,v0.s[0] 1993 eor w6,w7,w7,ror #32-2 1994 eor w6,w6,w7,ror #32-10 1995 eor w6,w6,w7,ror #32-18 1996 eor w6,w6,w7,ror #32-24 1997 eor w12,w12,w6 1998 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1999 eor w6,w14,w15 2000 eor w9,w12,w8 2001 eor w6,w6,w9 2002 mov v3.s[0],w6 2003 // optimize sbox using AESE instruction 2004 tbl v0.16b, {v3.16b}, v26.16b 2005 ushr v2.16b, v0.16b, 4 2006 and v0.16b, v0.16b, v31.16b 2007 tbl v0.16b, {v28.16b}, v0.16b 2008 tbl v2.16b, {v27.16b}, v2.16b 2009 eor v0.16b, v0.16b, v2.16b 2010 eor v1.16b, v1.16b, v1.16b 2011 aese v0.16b,v1.16b 2012 ushr v2.16b, v0.16b, 4 2013 and v0.16b, v0.16b, v31.16b 2014 tbl v0.16b, {v30.16b}, v0.16b 2015 tbl v2.16b, {v29.16b}, v2.16b 2016 eor v0.16b, v0.16b, v2.16b 2017 2018 mov w7,v0.s[0] 2019 eor w6,w7,w7,ror #32-2 2020 eor w6,w6,w7,ror #32-10 2021 eor w6,w6,w7,ror #32-18 2022 eor w6,w6,w7,ror #32-24 2023 ldp w7,w8,[x10],8 2024 eor w13,w13,w6 2025 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2026 eor w6,w12,w13 2027 eor w9,w7,w15 2028 eor w6,w6,w9 2029 mov v3.s[0],w6 2030 // optimize sbox using AESE instruction 2031 tbl v0.16b, {v3.16b}, v26.16b 2032 ushr v2.16b, v0.16b, 4 2033 and v0.16b, v0.16b, v31.16b 2034 tbl v0.16b, {v28.16b}, v0.16b 2035 tbl v2.16b, {v27.16b}, v2.16b 2036 eor v0.16b, v0.16b, v2.16b 2037 eor v1.16b, v1.16b, v1.16b 2038 aese v0.16b,v1.16b 2039 ushr v2.16b, v0.16b, 4 2040 and v0.16b, v0.16b, v31.16b 2041 tbl v0.16b, {v30.16b}, v0.16b 2042 tbl v2.16b, {v29.16b}, v2.16b 2043 eor v0.16b, v0.16b, v2.16b 2044 2045 mov w7,v0.s[0] 2046 eor w6,w7,w7,ror #32-2 2047 eor w6,w6,w7,ror #32-10 2048 eor w6,w6,w7,ror #32-18 2049 eor w6,w6,w7,ror #32-24 2050 eor w14,w14,w6 2051 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2052 eor w6,w12,w13 2053 eor w9,w14,w8 2054 eor w6,w6,w9 2055 mov v3.s[0],w6 2056 // optimize sbox using AESE instruction 2057 tbl v0.16b, {v3.16b}, v26.16b 2058 ushr v2.16b, v0.16b, 4 2059 and v0.16b, v0.16b, v31.16b 2060 tbl v0.16b, {v28.16b}, v0.16b 2061 tbl v2.16b, {v27.16b}, v2.16b 2062 eor v0.16b, v0.16b, v2.16b 2063 eor v1.16b, v1.16b, v1.16b 2064 aese v0.16b,v1.16b 2065 ushr v2.16b, v0.16b, 4 2066 and v0.16b, v0.16b, v31.16b 2067 tbl v0.16b, {v30.16b}, v0.16b 2068 tbl v2.16b, {v29.16b}, v2.16b 2069 eor v0.16b, v0.16b, v2.16b 2070 2071 mov w7,v0.s[0] 2072 eor w6,w7,w7,ror #32-2 2073 eor w6,w6,w7,ror #32-10 2074 eor w6,w6,w7,ror #32-18 2075 eor w6,w6,w7,ror #32-24 2076 eor w15,w15,w6 2077 subs w11,w11,#1 2078 b.ne 10b 2079 mov v8.s[0],w15 2080 mov v8.s[1],w14 2081 mov v8.s[2],w13 2082 mov v8.s[3],w12 2083 #ifndef __AARCH64EB__ 2084 rev32 v8.16b,v8.16b 2085 #endif 2086 eor v8.16b,v8.16b,v15.16b 2087 st1 {v8.4s},[x1],#16 2088 b 100f 2089 1: // last two blocks 2090 ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0] 2091 add x10,x0,#16 2092 ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16 2093 subs w2,w2,1 2094 b.gt 1f 2095 #ifndef __AARCH64EB__ 2096 rev32 v4.16b,v4.16b 2097 #endif 2098 #ifndef __AARCH64EB__ 2099 rev32 v5.16b,v5.16b 2100 #endif 2101 #ifndef __AARCH64EB__ 2102 rev32 v6.16b,v6.16b 2103 #endif 2104 #ifndef __AARCH64EB__ 2105 rev32 v7.16b,v7.16b 2106 #endif 2107 bl _vpsm4_ex_enc_4blks 2108 ld1 {v4.4s,v5.4s},[x0],#32 2109 zip1 v8.4s,v0.4s,v1.4s 2110 zip2 v9.4s,v0.4s,v1.4s 2111 zip1 v10.4s,v2.4s,v3.4s 2112 zip2 v11.4s,v2.4s,v3.4s 2113 zip1 v0.2d,v8.2d,v10.2d 2114 zip2 v1.2d,v8.2d,v10.2d 2115 zip1 v2.2d,v9.2d,v11.2d 2116 zip2 v3.2d,v9.2d,v11.2d 2117 eor v0.16b,v0.16b,v15.16b 2118 eor v1.16b,v1.16b,v4.16b 2119 st1 {v0.4s,v1.4s},[x1],#32 2120 // save back IV 2121 st1 {v5.4s}, [x4] 2122 b 100f 2123 1: // last 3 blocks 2124 ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10] 2125 #ifndef __AARCH64EB__ 2126 rev32 v4.16b,v4.16b 2127 #endif 2128 #ifndef __AARCH64EB__ 2129 rev32 v5.16b,v5.16b 2130 #endif 2131 #ifndef __AARCH64EB__ 2132 rev32 v6.16b,v6.16b 2133 #endif 2134 #ifndef __AARCH64EB__ 2135 rev32 v7.16b,v7.16b 2136 #endif 2137 bl _vpsm4_ex_enc_4blks 2138 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 2139 zip1 v8.4s,v0.4s,v1.4s 2140 zip2 v9.4s,v0.4s,v1.4s 2141 zip1 v10.4s,v2.4s,v3.4s 2142 zip2 v11.4s,v2.4s,v3.4s 2143 zip1 v0.2d,v8.2d,v10.2d 2144 zip2 v1.2d,v8.2d,v10.2d 2145 zip1 v2.2d,v9.2d,v11.2d 2146 zip2 v3.2d,v9.2d,v11.2d 2147 eor v0.16b,v0.16b,v15.16b 2148 eor v1.16b,v1.16b,v4.16b 2149 eor v2.16b,v2.16b,v5.16b 2150 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 2151 // save back IV 2152 st1 {v6.4s}, [x4] 2153 100: 2154 ldp d10,d11,[sp,#16] 2155 ldp d12,d13,[sp,#32] 2156 ldp d14,d15,[sp,#48] 2157 ldp x29,x30,[sp,#64] 2158 ldp d8,d9,[sp],#80 2159 AARCH64_VALIDATE_LINK_REGISTER 2160 ret 2161 .size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt 2162 .globl vpsm4_ex_ctr32_encrypt_blocks 2163 .type vpsm4_ex_ctr32_encrypt_blocks,%function 2164 .align 5 2165 vpsm4_ex_ctr32_encrypt_blocks: 2166 AARCH64_VALID_CALL_TARGET 2167 ld1 {v3.4s},[x4] 2168 #ifndef __AARCH64EB__ 2169 rev32 v3.16b,v3.16b 2170 #endif 2171 adrp x9, .Lsbox_magic 2172 ldr q26, [x9, #:lo12:.Lsbox_magic] 2173 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 2174 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 2175 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 2176 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 2177 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 2178 cmp w2,#1 2179 b.ne 1f 2180 // fast processing for one single block without 2181 // context saving overhead 2182 mov x10,x3 2183 mov w11,#8 2184 mov w12,v3.s[0] 2185 mov w13,v3.s[1] 2186 mov w14,v3.s[2] 2187 mov w15,v3.s[3] 2188 10: 2189 ldp w7,w8,[x10],8 2190 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2191 eor w6,w14,w15 2192 eor w9,w7,w13 2193 eor w6,w6,w9 2194 mov v3.s[0],w6 2195 // optimize sbox using AESE instruction 2196 tbl v0.16b, {v3.16b}, v26.16b 2197 ushr v2.16b, v0.16b, 4 2198 and v0.16b, v0.16b, v31.16b 2199 tbl v0.16b, {v28.16b}, v0.16b 2200 tbl v2.16b, {v27.16b}, v2.16b 2201 eor v0.16b, v0.16b, v2.16b 2202 eor v1.16b, v1.16b, v1.16b 2203 aese v0.16b,v1.16b 2204 ushr v2.16b, v0.16b, 4 2205 and v0.16b, v0.16b, v31.16b 2206 tbl v0.16b, {v30.16b}, v0.16b 2207 tbl v2.16b, {v29.16b}, v2.16b 2208 eor v0.16b, v0.16b, v2.16b 2209 2210 mov w7,v0.s[0] 2211 eor w6,w7,w7,ror #32-2 2212 eor w6,w6,w7,ror #32-10 2213 eor w6,w6,w7,ror #32-18 2214 eor w6,w6,w7,ror #32-24 2215 eor w12,w12,w6 2216 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2217 eor w6,w14,w15 2218 eor w9,w12,w8 2219 eor w6,w6,w9 2220 mov v3.s[0],w6 2221 // optimize sbox using AESE instruction 2222 tbl v0.16b, {v3.16b}, v26.16b 2223 ushr v2.16b, v0.16b, 4 2224 and v0.16b, v0.16b, v31.16b 2225 tbl v0.16b, {v28.16b}, v0.16b 2226 tbl v2.16b, {v27.16b}, v2.16b 2227 eor v0.16b, v0.16b, v2.16b 2228 eor v1.16b, v1.16b, v1.16b 2229 aese v0.16b,v1.16b 2230 ushr v2.16b, v0.16b, 4 2231 and v0.16b, v0.16b, v31.16b 2232 tbl v0.16b, {v30.16b}, v0.16b 2233 tbl v2.16b, {v29.16b}, v2.16b 2234 eor v0.16b, v0.16b, v2.16b 2235 2236 mov w7,v0.s[0] 2237 eor w6,w7,w7,ror #32-2 2238 eor w6,w6,w7,ror #32-10 2239 eor w6,w6,w7,ror #32-18 2240 eor w6,w6,w7,ror #32-24 2241 ldp w7,w8,[x10],8 2242 eor w13,w13,w6 2243 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2244 eor w6,w12,w13 2245 eor w9,w7,w15 2246 eor w6,w6,w9 2247 mov v3.s[0],w6 2248 // optimize sbox using AESE instruction 2249 tbl v0.16b, {v3.16b}, v26.16b 2250 ushr v2.16b, v0.16b, 4 2251 and v0.16b, v0.16b, v31.16b 2252 tbl v0.16b, {v28.16b}, v0.16b 2253 tbl v2.16b, {v27.16b}, v2.16b 2254 eor v0.16b, v0.16b, v2.16b 2255 eor v1.16b, v1.16b, v1.16b 2256 aese v0.16b,v1.16b 2257 ushr v2.16b, v0.16b, 4 2258 and v0.16b, v0.16b, v31.16b 2259 tbl v0.16b, {v30.16b}, v0.16b 2260 tbl v2.16b, {v29.16b}, v2.16b 2261 eor v0.16b, v0.16b, v2.16b 2262 2263 mov w7,v0.s[0] 2264 eor w6,w7,w7,ror #32-2 2265 eor w6,w6,w7,ror #32-10 2266 eor w6,w6,w7,ror #32-18 2267 eor w6,w6,w7,ror #32-24 2268 eor w14,w14,w6 2269 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2270 eor w6,w12,w13 2271 eor w9,w14,w8 2272 eor w6,w6,w9 2273 mov v3.s[0],w6 2274 // optimize sbox using AESE instruction 2275 tbl v0.16b, {v3.16b}, v26.16b 2276 ushr v2.16b, v0.16b, 4 2277 and v0.16b, v0.16b, v31.16b 2278 tbl v0.16b, {v28.16b}, v0.16b 2279 tbl v2.16b, {v27.16b}, v2.16b 2280 eor v0.16b, v0.16b, v2.16b 2281 eor v1.16b, v1.16b, v1.16b 2282 aese v0.16b,v1.16b 2283 ushr v2.16b, v0.16b, 4 2284 and v0.16b, v0.16b, v31.16b 2285 tbl v0.16b, {v30.16b}, v0.16b 2286 tbl v2.16b, {v29.16b}, v2.16b 2287 eor v0.16b, v0.16b, v2.16b 2288 2289 mov w7,v0.s[0] 2290 eor w6,w7,w7,ror #32-2 2291 eor w6,w6,w7,ror #32-10 2292 eor w6,w6,w7,ror #32-18 2293 eor w6,w6,w7,ror #32-24 2294 eor w15,w15,w6 2295 subs w11,w11,#1 2296 b.ne 10b 2297 mov v3.s[0],w15 2298 mov v3.s[1],w14 2299 mov v3.s[2],w13 2300 mov v3.s[3],w12 2301 #ifndef __AARCH64EB__ 2302 rev32 v3.16b,v3.16b 2303 #endif 2304 ld1 {v4.4s},[x0] 2305 eor v4.16b,v4.16b,v3.16b 2306 st1 {v4.4s},[x1] 2307 ret 2308 1: 2309 AARCH64_SIGN_LINK_REGISTER 2310 stp d8,d9,[sp,#-80]! 2311 stp d10,d11,[sp,#16] 2312 stp d12,d13,[sp,#32] 2313 stp d14,d15,[sp,#48] 2314 stp x29,x30,[sp,#64] 2315 mov w12,v3.s[0] 2316 mov w13,v3.s[1] 2317 mov w14,v3.s[2] 2318 mov w5,v3.s[3] 2319 .Lctr32_4_blocks_process: 2320 cmp w2,#4 2321 b.lt 1f 2322 dup v4.4s,w12 2323 dup v5.4s,w13 2324 dup v6.4s,w14 2325 mov v7.s[0],w5 2326 add w5,w5,#1 2327 mov v7.s[1],w5 2328 add w5,w5,#1 2329 mov v7.s[2],w5 2330 add w5,w5,#1 2331 mov v7.s[3],w5 2332 add w5,w5,#1 2333 cmp w2,#8 2334 b.ge .Lctr32_8_blocks_process 2335 bl _vpsm4_ex_enc_4blks 2336 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2337 eor v0.16b,v0.16b,v12.16b 2338 eor v1.16b,v1.16b,v13.16b 2339 eor v2.16b,v2.16b,v14.16b 2340 eor v3.16b,v3.16b,v15.16b 2341 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2342 subs w2,w2,#4 2343 b.ne .Lctr32_4_blocks_process 2344 b 100f 2345 .Lctr32_8_blocks_process: 2346 dup v8.4s,w12 2347 dup v9.4s,w13 2348 dup v10.4s,w14 2349 mov v11.s[0],w5 2350 add w5,w5,#1 2351 mov v11.s[1],w5 2352 add w5,w5,#1 2353 mov v11.s[2],w5 2354 add w5,w5,#1 2355 mov v11.s[3],w5 2356 add w5,w5,#1 2357 bl _vpsm4_ex_enc_8blks 2358 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2359 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 2360 eor v0.16b,v0.16b,v12.16b 2361 eor v1.16b,v1.16b,v13.16b 2362 eor v2.16b,v2.16b,v14.16b 2363 eor v3.16b,v3.16b,v15.16b 2364 eor v4.16b,v4.16b,v8.16b 2365 eor v5.16b,v5.16b,v9.16b 2366 eor v6.16b,v6.16b,v10.16b 2367 eor v7.16b,v7.16b,v11.16b 2368 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2369 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 2370 subs w2,w2,#8 2371 b.ne .Lctr32_4_blocks_process 2372 b 100f 2373 1: // last block processing 2374 subs w2,w2,#1 2375 b.lt 100f 2376 b.gt 1f 2377 mov v3.s[0],w12 2378 mov v3.s[1],w13 2379 mov v3.s[2],w14 2380 mov v3.s[3],w5 2381 mov x10,x3 2382 mov w11,#8 2383 mov w12,v3.s[0] 2384 mov w13,v3.s[1] 2385 mov w14,v3.s[2] 2386 mov w15,v3.s[3] 2387 10: 2388 ldp w7,w8,[x10],8 2389 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2390 eor w6,w14,w15 2391 eor w9,w7,w13 2392 eor w6,w6,w9 2393 mov v3.s[0],w6 2394 // optimize sbox using AESE instruction 2395 tbl v0.16b, {v3.16b}, v26.16b 2396 ushr v2.16b, v0.16b, 4 2397 and v0.16b, v0.16b, v31.16b 2398 tbl v0.16b, {v28.16b}, v0.16b 2399 tbl v2.16b, {v27.16b}, v2.16b 2400 eor v0.16b, v0.16b, v2.16b 2401 eor v1.16b, v1.16b, v1.16b 2402 aese v0.16b,v1.16b 2403 ushr v2.16b, v0.16b, 4 2404 and v0.16b, v0.16b, v31.16b 2405 tbl v0.16b, {v30.16b}, v0.16b 2406 tbl v2.16b, {v29.16b}, v2.16b 2407 eor v0.16b, v0.16b, v2.16b 2408 2409 mov w7,v0.s[0] 2410 eor w6,w7,w7,ror #32-2 2411 eor w6,w6,w7,ror #32-10 2412 eor w6,w6,w7,ror #32-18 2413 eor w6,w6,w7,ror #32-24 2414 eor w12,w12,w6 2415 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2416 eor w6,w14,w15 2417 eor w9,w12,w8 2418 eor w6,w6,w9 2419 mov v3.s[0],w6 2420 // optimize sbox using AESE instruction 2421 tbl v0.16b, {v3.16b}, v26.16b 2422 ushr v2.16b, v0.16b, 4 2423 and v0.16b, v0.16b, v31.16b 2424 tbl v0.16b, {v28.16b}, v0.16b 2425 tbl v2.16b, {v27.16b}, v2.16b 2426 eor v0.16b, v0.16b, v2.16b 2427 eor v1.16b, v1.16b, v1.16b 2428 aese v0.16b,v1.16b 2429 ushr v2.16b, v0.16b, 4 2430 and v0.16b, v0.16b, v31.16b 2431 tbl v0.16b, {v30.16b}, v0.16b 2432 tbl v2.16b, {v29.16b}, v2.16b 2433 eor v0.16b, v0.16b, v2.16b 2434 2435 mov w7,v0.s[0] 2436 eor w6,w7,w7,ror #32-2 2437 eor w6,w6,w7,ror #32-10 2438 eor w6,w6,w7,ror #32-18 2439 eor w6,w6,w7,ror #32-24 2440 ldp w7,w8,[x10],8 2441 eor w13,w13,w6 2442 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2443 eor w6,w12,w13 2444 eor w9,w7,w15 2445 eor w6,w6,w9 2446 mov v3.s[0],w6 2447 // optimize sbox using AESE instruction 2448 tbl v0.16b, {v3.16b}, v26.16b 2449 ushr v2.16b, v0.16b, 4 2450 and v0.16b, v0.16b, v31.16b 2451 tbl v0.16b, {v28.16b}, v0.16b 2452 tbl v2.16b, {v27.16b}, v2.16b 2453 eor v0.16b, v0.16b, v2.16b 2454 eor v1.16b, v1.16b, v1.16b 2455 aese v0.16b,v1.16b 2456 ushr v2.16b, v0.16b, 4 2457 and v0.16b, v0.16b, v31.16b 2458 tbl v0.16b, {v30.16b}, v0.16b 2459 tbl v2.16b, {v29.16b}, v2.16b 2460 eor v0.16b, v0.16b, v2.16b 2461 2462 mov w7,v0.s[0] 2463 eor w6,w7,w7,ror #32-2 2464 eor w6,w6,w7,ror #32-10 2465 eor w6,w6,w7,ror #32-18 2466 eor w6,w6,w7,ror #32-24 2467 eor w14,w14,w6 2468 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2469 eor w6,w12,w13 2470 eor w9,w14,w8 2471 eor w6,w6,w9 2472 mov v3.s[0],w6 2473 // optimize sbox using AESE instruction 2474 tbl v0.16b, {v3.16b}, v26.16b 2475 ushr v2.16b, v0.16b, 4 2476 and v0.16b, v0.16b, v31.16b 2477 tbl v0.16b, {v28.16b}, v0.16b 2478 tbl v2.16b, {v27.16b}, v2.16b 2479 eor v0.16b, v0.16b, v2.16b 2480 eor v1.16b, v1.16b, v1.16b 2481 aese v0.16b,v1.16b 2482 ushr v2.16b, v0.16b, 4 2483 and v0.16b, v0.16b, v31.16b 2484 tbl v0.16b, {v30.16b}, v0.16b 2485 tbl v2.16b, {v29.16b}, v2.16b 2486 eor v0.16b, v0.16b, v2.16b 2487 2488 mov w7,v0.s[0] 2489 eor w6,w7,w7,ror #32-2 2490 eor w6,w6,w7,ror #32-10 2491 eor w6,w6,w7,ror #32-18 2492 eor w6,w6,w7,ror #32-24 2493 eor w15,w15,w6 2494 subs w11,w11,#1 2495 b.ne 10b 2496 mov v3.s[0],w15 2497 mov v3.s[1],w14 2498 mov v3.s[2],w13 2499 mov v3.s[3],w12 2500 #ifndef __AARCH64EB__ 2501 rev32 v3.16b,v3.16b 2502 #endif 2503 ld1 {v4.4s},[x0] 2504 eor v4.16b,v4.16b,v3.16b 2505 st1 {v4.4s},[x1] 2506 b 100f 2507 1: // last 2 blocks processing 2508 dup v4.4s,w12 2509 dup v5.4s,w13 2510 dup v6.4s,w14 2511 mov v7.s[0],w5 2512 add w5,w5,#1 2513 mov v7.s[1],w5 2514 subs w2,w2,#1 2515 b.ne 1f 2516 bl _vpsm4_ex_enc_4blks 2517 ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 2518 ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 2519 eor v0.16b,v0.16b,v12.16b 2520 eor v1.16b,v1.16b,v13.16b 2521 eor v2.16b,v2.16b,v14.16b 2522 eor v3.16b,v3.16b,v15.16b 2523 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 2524 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 2525 b 100f 2526 1: // last 3 blocks processing 2527 add w5,w5,#1 2528 mov v7.s[2],w5 2529 bl _vpsm4_ex_enc_4blks 2530 ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 2531 ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 2532 ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16 2533 eor v0.16b,v0.16b,v12.16b 2534 eor v1.16b,v1.16b,v13.16b 2535 eor v2.16b,v2.16b,v14.16b 2536 eor v3.16b,v3.16b,v15.16b 2537 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 2538 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 2539 st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16 2540 100: 2541 ldp d10,d11,[sp,#16] 2542 ldp d12,d13,[sp,#32] 2543 ldp d14,d15,[sp,#48] 2544 ldp x29,x30,[sp,#64] 2545 ldp d8,d9,[sp],#80 2546 AARCH64_VALIDATE_LINK_REGISTER 2547 ret 2548 .size vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks 2549 .globl vpsm4_ex_xts_encrypt_gb 2550 .type vpsm4_ex_xts_encrypt_gb,%function 2551 .align 5 2552 vpsm4_ex_xts_encrypt_gb: 2553 AARCH64_SIGN_LINK_REGISTER 2554 stp x15, x16, [sp, #-0x10]! 2555 stp x17, x18, [sp, #-0x10]! 2556 stp x19, x20, [sp, #-0x10]! 2557 stp x21, x22, [sp, #-0x10]! 2558 stp x23, x24, [sp, #-0x10]! 2559 stp x25, x26, [sp, #-0x10]! 2560 stp x27, x28, [sp, #-0x10]! 2561 stp x29, x30, [sp, #-0x10]! 2562 stp d8, d9, [sp, #-0x10]! 2563 stp d10, d11, [sp, #-0x10]! 2564 stp d12, d13, [sp, #-0x10]! 2565 stp d14, d15, [sp, #-0x10]! 2566 mov x26,x3 2567 mov x27,x4 2568 mov w28,w6 2569 ld1 {v16.4s}, [x5] 2570 mov x3,x27 2571 adrp x9, .Lsbox_magic 2572 ldr q26, [x9, #:lo12:.Lsbox_magic] 2573 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 2574 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 2575 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 2576 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 2577 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 2578 #ifndef __AARCH64EB__ 2579 rev32 v16.16b,v16.16b 2580 #endif 2581 mov x10,x3 2582 mov w11,#8 2583 mov w12,v16.s[0] 2584 mov w13,v16.s[1] 2585 mov w14,v16.s[2] 2586 mov w15,v16.s[3] 2587 10: 2588 ldp w7,w8,[x10],8 2589 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2590 eor w6,w14,w15 2591 eor w9,w7,w13 2592 eor w6,w6,w9 2593 mov v3.s[0],w6 2594 // optimize sbox using AESE instruction 2595 tbl v0.16b, {v3.16b}, v26.16b 2596 ushr v2.16b, v0.16b, 4 2597 and v0.16b, v0.16b, v31.16b 2598 tbl v0.16b, {v28.16b}, v0.16b 2599 tbl v2.16b, {v27.16b}, v2.16b 2600 eor v0.16b, v0.16b, v2.16b 2601 eor v1.16b, v1.16b, v1.16b 2602 aese v0.16b,v1.16b 2603 ushr v2.16b, v0.16b, 4 2604 and v0.16b, v0.16b, v31.16b 2605 tbl v0.16b, {v30.16b}, v0.16b 2606 tbl v2.16b, {v29.16b}, v2.16b 2607 eor v0.16b, v0.16b, v2.16b 2608 2609 mov w7,v0.s[0] 2610 eor w6,w7,w7,ror #32-2 2611 eor w6,w6,w7,ror #32-10 2612 eor w6,w6,w7,ror #32-18 2613 eor w6,w6,w7,ror #32-24 2614 eor w12,w12,w6 2615 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2616 eor w6,w14,w15 2617 eor w9,w12,w8 2618 eor w6,w6,w9 2619 mov v3.s[0],w6 2620 // optimize sbox using AESE instruction 2621 tbl v0.16b, {v3.16b}, v26.16b 2622 ushr v2.16b, v0.16b, 4 2623 and v0.16b, v0.16b, v31.16b 2624 tbl v0.16b, {v28.16b}, v0.16b 2625 tbl v2.16b, {v27.16b}, v2.16b 2626 eor v0.16b, v0.16b, v2.16b 2627 eor v1.16b, v1.16b, v1.16b 2628 aese v0.16b,v1.16b 2629 ushr v2.16b, v0.16b, 4 2630 and v0.16b, v0.16b, v31.16b 2631 tbl v0.16b, {v30.16b}, v0.16b 2632 tbl v2.16b, {v29.16b}, v2.16b 2633 eor v0.16b, v0.16b, v2.16b 2634 2635 mov w7,v0.s[0] 2636 eor w6,w7,w7,ror #32-2 2637 eor w6,w6,w7,ror #32-10 2638 eor w6,w6,w7,ror #32-18 2639 eor w6,w6,w7,ror #32-24 2640 ldp w7,w8,[x10],8 2641 eor w13,w13,w6 2642 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2643 eor w6,w12,w13 2644 eor w9,w7,w15 2645 eor w6,w6,w9 2646 mov v3.s[0],w6 2647 // optimize sbox using AESE instruction 2648 tbl v0.16b, {v3.16b}, v26.16b 2649 ushr v2.16b, v0.16b, 4 2650 and v0.16b, v0.16b, v31.16b 2651 tbl v0.16b, {v28.16b}, v0.16b 2652 tbl v2.16b, {v27.16b}, v2.16b 2653 eor v0.16b, v0.16b, v2.16b 2654 eor v1.16b, v1.16b, v1.16b 2655 aese v0.16b,v1.16b 2656 ushr v2.16b, v0.16b, 4 2657 and v0.16b, v0.16b, v31.16b 2658 tbl v0.16b, {v30.16b}, v0.16b 2659 tbl v2.16b, {v29.16b}, v2.16b 2660 eor v0.16b, v0.16b, v2.16b 2661 2662 mov w7,v0.s[0] 2663 eor w6,w7,w7,ror #32-2 2664 eor w6,w6,w7,ror #32-10 2665 eor w6,w6,w7,ror #32-18 2666 eor w6,w6,w7,ror #32-24 2667 eor w14,w14,w6 2668 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2669 eor w6,w12,w13 2670 eor w9,w14,w8 2671 eor w6,w6,w9 2672 mov v3.s[0],w6 2673 // optimize sbox using AESE instruction 2674 tbl v0.16b, {v3.16b}, v26.16b 2675 ushr v2.16b, v0.16b, 4 2676 and v0.16b, v0.16b, v31.16b 2677 tbl v0.16b, {v28.16b}, v0.16b 2678 tbl v2.16b, {v27.16b}, v2.16b 2679 eor v0.16b, v0.16b, v2.16b 2680 eor v1.16b, v1.16b, v1.16b 2681 aese v0.16b,v1.16b 2682 ushr v2.16b, v0.16b, 4 2683 and v0.16b, v0.16b, v31.16b 2684 tbl v0.16b, {v30.16b}, v0.16b 2685 tbl v2.16b, {v29.16b}, v2.16b 2686 eor v0.16b, v0.16b, v2.16b 2687 2688 mov w7,v0.s[0] 2689 eor w6,w7,w7,ror #32-2 2690 eor w6,w6,w7,ror #32-10 2691 eor w6,w6,w7,ror #32-18 2692 eor w6,w6,w7,ror #32-24 2693 eor w15,w15,w6 2694 subs w11,w11,#1 2695 b.ne 10b 2696 mov v16.s[0],w15 2697 mov v16.s[1],w14 2698 mov v16.s[2],w13 2699 mov v16.s[3],w12 2700 #ifndef __AARCH64EB__ 2701 rev32 v16.16b,v16.16b 2702 #endif 2703 mov x3,x26 2704 and x29,x2,#0x0F 2705 // convert length into blocks 2706 lsr x2,x2,4 2707 cmp x2,#1 2708 b.lt .return_gb 2709 2710 cmp x29,0 2711 // If the encryption/decryption Length is N times of 16, 2712 // the all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb 2713 b.eq .xts_encrypt_blocks_gb 2714 2715 // If the encryption/decryption length is not N times of 16, 2716 // the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb 2717 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb 2718 subs x2,x2,#1 2719 b.eq .only_2blks_tweak_gb 2720 .xts_encrypt_blocks_gb: 2721 rbit v16.16b,v16.16b 2722 #ifdef __AARCH64EB__ 2723 rev32 v16.16b,v16.16b 2724 #endif 2725 mov x12,v16.d[0] 2726 mov x13,v16.d[1] 2727 mov w7,0x87 2728 extr x9,x13,x13,#32 2729 extr x15,x13,x12,#63 2730 and w8,w7,w9,asr#31 2731 eor x14,x8,x12,lsl#1 2732 mov w7,0x87 2733 extr x9,x15,x15,#32 2734 extr x17,x15,x14,#63 2735 and w8,w7,w9,asr#31 2736 eor x16,x8,x14,lsl#1 2737 mov w7,0x87 2738 extr x9,x17,x17,#32 2739 extr x19,x17,x16,#63 2740 and w8,w7,w9,asr#31 2741 eor x18,x8,x16,lsl#1 2742 mov w7,0x87 2743 extr x9,x19,x19,#32 2744 extr x21,x19,x18,#63 2745 and w8,w7,w9,asr#31 2746 eor x20,x8,x18,lsl#1 2747 mov w7,0x87 2748 extr x9,x21,x21,#32 2749 extr x23,x21,x20,#63 2750 and w8,w7,w9,asr#31 2751 eor x22,x8,x20,lsl#1 2752 mov w7,0x87 2753 extr x9,x23,x23,#32 2754 extr x25,x23,x22,#63 2755 and w8,w7,w9,asr#31 2756 eor x24,x8,x22,lsl#1 2757 mov w7,0x87 2758 extr x9,x25,x25,#32 2759 extr x27,x25,x24,#63 2760 and w8,w7,w9,asr#31 2761 eor x26,x8,x24,lsl#1 2762 .Lxts_8_blocks_process_gb: 2763 cmp x2,#8 2764 mov v16.d[0],x12 2765 mov v16.d[1],x13 2766 #ifdef __AARCH64EB__ 2767 rev32 v16.16b,v16.16b 2768 #endif 2769 mov w7,0x87 2770 extr x9,x27,x27,#32 2771 extr x13,x27,x26,#63 2772 and w8,w7,w9,asr#31 2773 eor x12,x8,x26,lsl#1 2774 mov v17.d[0],x14 2775 mov v17.d[1],x15 2776 #ifdef __AARCH64EB__ 2777 rev32 v17.16b,v17.16b 2778 #endif 2779 mov w7,0x87 2780 extr x9,x13,x13,#32 2781 extr x15,x13,x12,#63 2782 and w8,w7,w9,asr#31 2783 eor x14,x8,x12,lsl#1 2784 mov v18.d[0],x16 2785 mov v18.d[1],x17 2786 #ifdef __AARCH64EB__ 2787 rev32 v18.16b,v18.16b 2788 #endif 2789 mov w7,0x87 2790 extr x9,x15,x15,#32 2791 extr x17,x15,x14,#63 2792 and w8,w7,w9,asr#31 2793 eor x16,x8,x14,lsl#1 2794 mov v19.d[0],x18 2795 mov v19.d[1],x19 2796 #ifdef __AARCH64EB__ 2797 rev32 v19.16b,v19.16b 2798 #endif 2799 mov w7,0x87 2800 extr x9,x17,x17,#32 2801 extr x19,x17,x16,#63 2802 and w8,w7,w9,asr#31 2803 eor x18,x8,x16,lsl#1 2804 mov v20.d[0],x20 2805 mov v20.d[1],x21 2806 #ifdef __AARCH64EB__ 2807 rev32 v20.16b,v20.16b 2808 #endif 2809 mov w7,0x87 2810 extr x9,x19,x19,#32 2811 extr x21,x19,x18,#63 2812 and w8,w7,w9,asr#31 2813 eor x20,x8,x18,lsl#1 2814 mov v21.d[0],x22 2815 mov v21.d[1],x23 2816 #ifdef __AARCH64EB__ 2817 rev32 v21.16b,v21.16b 2818 #endif 2819 mov w7,0x87 2820 extr x9,x21,x21,#32 2821 extr x23,x21,x20,#63 2822 and w8,w7,w9,asr#31 2823 eor x22,x8,x20,lsl#1 2824 mov v22.d[0],x24 2825 mov v22.d[1],x25 2826 #ifdef __AARCH64EB__ 2827 rev32 v22.16b,v22.16b 2828 #endif 2829 mov w7,0x87 2830 extr x9,x23,x23,#32 2831 extr x25,x23,x22,#63 2832 and w8,w7,w9,asr#31 2833 eor x24,x8,x22,lsl#1 2834 mov v23.d[0],x26 2835 mov v23.d[1],x27 2836 #ifdef __AARCH64EB__ 2837 rev32 v23.16b,v23.16b 2838 #endif 2839 mov w7,0x87 2840 extr x9,x25,x25,#32 2841 extr x27,x25,x24,#63 2842 and w8,w7,w9,asr#31 2843 eor x26,x8,x24,lsl#1 2844 b.lt .Lxts_4_blocks_process_gb 2845 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 2846 rbit v16.16b,v16.16b 2847 rbit v17.16b,v17.16b 2848 rbit v18.16b,v18.16b 2849 rbit v19.16b,v19.16b 2850 eor v4.16b, v4.16b, v16.16b 2851 eor v5.16b, v5.16b, v17.16b 2852 eor v6.16b, v6.16b, v18.16b 2853 eor v7.16b, v7.16b, v19.16b 2854 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 2855 rbit v20.16b,v20.16b 2856 rbit v21.16b,v21.16b 2857 rbit v22.16b,v22.16b 2858 rbit v23.16b,v23.16b 2859 eor v8.16b, v8.16b, v20.16b 2860 eor v9.16b, v9.16b, v21.16b 2861 eor v10.16b, v10.16b, v22.16b 2862 eor v11.16b, v11.16b, v23.16b 2863 #ifndef __AARCH64EB__ 2864 rev32 v4.16b,v4.16b 2865 #endif 2866 #ifndef __AARCH64EB__ 2867 rev32 v5.16b,v5.16b 2868 #endif 2869 #ifndef __AARCH64EB__ 2870 rev32 v6.16b,v6.16b 2871 #endif 2872 #ifndef __AARCH64EB__ 2873 rev32 v7.16b,v7.16b 2874 #endif 2875 #ifndef __AARCH64EB__ 2876 rev32 v8.16b,v8.16b 2877 #endif 2878 #ifndef __AARCH64EB__ 2879 rev32 v9.16b,v9.16b 2880 #endif 2881 #ifndef __AARCH64EB__ 2882 rev32 v10.16b,v10.16b 2883 #endif 2884 #ifndef __AARCH64EB__ 2885 rev32 v11.16b,v11.16b 2886 #endif 2887 zip1 v0.4s,v4.4s,v5.4s 2888 zip2 v1.4s,v4.4s,v5.4s 2889 zip1 v2.4s,v6.4s,v7.4s 2890 zip2 v3.4s,v6.4s,v7.4s 2891 zip1 v4.2d,v0.2d,v2.2d 2892 zip2 v5.2d,v0.2d,v2.2d 2893 zip1 v6.2d,v1.2d,v3.2d 2894 zip2 v7.2d,v1.2d,v3.2d 2895 zip1 v0.4s,v8.4s,v9.4s 2896 zip2 v1.4s,v8.4s,v9.4s 2897 zip1 v2.4s,v10.4s,v11.4s 2898 zip2 v3.4s,v10.4s,v11.4s 2899 zip1 v8.2d,v0.2d,v2.2d 2900 zip2 v9.2d,v0.2d,v2.2d 2901 zip1 v10.2d,v1.2d,v3.2d 2902 zip2 v11.2d,v1.2d,v3.2d 2903 bl _vpsm4_ex_enc_8blks 2904 zip1 v8.4s,v0.4s,v1.4s 2905 zip2 v9.4s,v0.4s,v1.4s 2906 zip1 v10.4s,v2.4s,v3.4s 2907 zip2 v11.4s,v2.4s,v3.4s 2908 zip1 v0.2d,v8.2d,v10.2d 2909 zip2 v1.2d,v8.2d,v10.2d 2910 zip1 v2.2d,v9.2d,v11.2d 2911 zip2 v3.2d,v9.2d,v11.2d 2912 zip1 v8.4s,v4.4s,v5.4s 2913 zip2 v9.4s,v4.4s,v5.4s 2914 zip1 v10.4s,v6.4s,v7.4s 2915 zip2 v11.4s,v6.4s,v7.4s 2916 zip1 v4.2d,v8.2d,v10.2d 2917 zip2 v5.2d,v8.2d,v10.2d 2918 zip1 v6.2d,v9.2d,v11.2d 2919 zip2 v7.2d,v9.2d,v11.2d 2920 eor v0.16b, v0.16b, v16.16b 2921 eor v1.16b, v1.16b, v17.16b 2922 eor v2.16b, v2.16b, v18.16b 2923 eor v3.16b, v3.16b, v19.16b 2924 eor v4.16b, v4.16b, v20.16b 2925 eor v5.16b, v5.16b, v21.16b 2926 eor v6.16b, v6.16b, v22.16b 2927 eor v7.16b, v7.16b, v23.16b 2928 2929 // save the last tweak 2930 mov v25.16b,v23.16b 2931 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2932 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 2933 subs x2,x2,#8 2934 b.gt .Lxts_8_blocks_process_gb 2935 b 100f 2936 .Lxts_4_blocks_process_gb: 2937 cmp x2,#4 2938 b.lt 1f 2939 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 2940 rbit v16.16b,v16.16b 2941 rbit v17.16b,v17.16b 2942 rbit v18.16b,v18.16b 2943 rbit v19.16b,v19.16b 2944 eor v4.16b, v4.16b, v16.16b 2945 eor v5.16b, v5.16b, v17.16b 2946 eor v6.16b, v6.16b, v18.16b 2947 eor v7.16b, v7.16b, v19.16b 2948 #ifndef __AARCH64EB__ 2949 rev32 v4.16b,v4.16b 2950 #endif 2951 #ifndef __AARCH64EB__ 2952 rev32 v5.16b,v5.16b 2953 #endif 2954 #ifndef __AARCH64EB__ 2955 rev32 v6.16b,v6.16b 2956 #endif 2957 #ifndef __AARCH64EB__ 2958 rev32 v7.16b,v7.16b 2959 #endif 2960 zip1 v0.4s,v4.4s,v5.4s 2961 zip2 v1.4s,v4.4s,v5.4s 2962 zip1 v2.4s,v6.4s,v7.4s 2963 zip2 v3.4s,v6.4s,v7.4s 2964 zip1 v4.2d,v0.2d,v2.2d 2965 zip2 v5.2d,v0.2d,v2.2d 2966 zip1 v6.2d,v1.2d,v3.2d 2967 zip2 v7.2d,v1.2d,v3.2d 2968 bl _vpsm4_ex_enc_4blks 2969 zip1 v4.4s,v0.4s,v1.4s 2970 zip2 v5.4s,v0.4s,v1.4s 2971 zip1 v6.4s,v2.4s,v3.4s 2972 zip2 v7.4s,v2.4s,v3.4s 2973 zip1 v0.2d,v4.2d,v6.2d 2974 zip2 v1.2d,v4.2d,v6.2d 2975 zip1 v2.2d,v5.2d,v7.2d 2976 zip2 v3.2d,v5.2d,v7.2d 2977 eor v0.16b, v0.16b, v16.16b 2978 eor v1.16b, v1.16b, v17.16b 2979 eor v2.16b, v2.16b, v18.16b 2980 eor v3.16b, v3.16b, v19.16b 2981 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2982 sub x2,x2,#4 2983 mov v16.16b,v20.16b 2984 mov v17.16b,v21.16b 2985 mov v18.16b,v22.16b 2986 // save the last tweak 2987 mov v25.16b,v19.16b 2988 1: 2989 // process last block 2990 cmp x2,#1 2991 b.lt 100f 2992 b.gt 1f 2993 ld1 {v4.4s},[x0],#16 2994 rbit v16.16b,v16.16b 2995 eor v4.16b, v4.16b, v16.16b 2996 #ifndef __AARCH64EB__ 2997 rev32 v4.16b,v4.16b 2998 #endif 2999 mov x10,x3 3000 mov w11,#8 3001 mov w12,v4.s[0] 3002 mov w13,v4.s[1] 3003 mov w14,v4.s[2] 3004 mov w15,v4.s[3] 3005 10: 3006 ldp w7,w8,[x10],8 3007 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3008 eor w6,w14,w15 3009 eor w9,w7,w13 3010 eor w6,w6,w9 3011 mov v3.s[0],w6 3012 // optimize sbox using AESE instruction 3013 tbl v0.16b, {v3.16b}, v26.16b 3014 ushr v2.16b, v0.16b, 4 3015 and v0.16b, v0.16b, v31.16b 3016 tbl v0.16b, {v28.16b}, v0.16b 3017 tbl v2.16b, {v27.16b}, v2.16b 3018 eor v0.16b, v0.16b, v2.16b 3019 eor v1.16b, v1.16b, v1.16b 3020 aese v0.16b,v1.16b 3021 ushr v2.16b, v0.16b, 4 3022 and v0.16b, v0.16b, v31.16b 3023 tbl v0.16b, {v30.16b}, v0.16b 3024 tbl v2.16b, {v29.16b}, v2.16b 3025 eor v0.16b, v0.16b, v2.16b 3026 3027 mov w7,v0.s[0] 3028 eor w6,w7,w7,ror #32-2 3029 eor w6,w6,w7,ror #32-10 3030 eor w6,w6,w7,ror #32-18 3031 eor w6,w6,w7,ror #32-24 3032 eor w12,w12,w6 3033 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3034 eor w6,w14,w15 3035 eor w9,w12,w8 3036 eor w6,w6,w9 3037 mov v3.s[0],w6 3038 // optimize sbox using AESE instruction 3039 tbl v0.16b, {v3.16b}, v26.16b 3040 ushr v2.16b, v0.16b, 4 3041 and v0.16b, v0.16b, v31.16b 3042 tbl v0.16b, {v28.16b}, v0.16b 3043 tbl v2.16b, {v27.16b}, v2.16b 3044 eor v0.16b, v0.16b, v2.16b 3045 eor v1.16b, v1.16b, v1.16b 3046 aese v0.16b,v1.16b 3047 ushr v2.16b, v0.16b, 4 3048 and v0.16b, v0.16b, v31.16b 3049 tbl v0.16b, {v30.16b}, v0.16b 3050 tbl v2.16b, {v29.16b}, v2.16b 3051 eor v0.16b, v0.16b, v2.16b 3052 3053 mov w7,v0.s[0] 3054 eor w6,w7,w7,ror #32-2 3055 eor w6,w6,w7,ror #32-10 3056 eor w6,w6,w7,ror #32-18 3057 eor w6,w6,w7,ror #32-24 3058 ldp w7,w8,[x10],8 3059 eor w13,w13,w6 3060 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3061 eor w6,w12,w13 3062 eor w9,w7,w15 3063 eor w6,w6,w9 3064 mov v3.s[0],w6 3065 // optimize sbox using AESE instruction 3066 tbl v0.16b, {v3.16b}, v26.16b 3067 ushr v2.16b, v0.16b, 4 3068 and v0.16b, v0.16b, v31.16b 3069 tbl v0.16b, {v28.16b}, v0.16b 3070 tbl v2.16b, {v27.16b}, v2.16b 3071 eor v0.16b, v0.16b, v2.16b 3072 eor v1.16b, v1.16b, v1.16b 3073 aese v0.16b,v1.16b 3074 ushr v2.16b, v0.16b, 4 3075 and v0.16b, v0.16b, v31.16b 3076 tbl v0.16b, {v30.16b}, v0.16b 3077 tbl v2.16b, {v29.16b}, v2.16b 3078 eor v0.16b, v0.16b, v2.16b 3079 3080 mov w7,v0.s[0] 3081 eor w6,w7,w7,ror #32-2 3082 eor w6,w6,w7,ror #32-10 3083 eor w6,w6,w7,ror #32-18 3084 eor w6,w6,w7,ror #32-24 3085 eor w14,w14,w6 3086 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 3087 eor w6,w12,w13 3088 eor w9,w14,w8 3089 eor w6,w6,w9 3090 mov v3.s[0],w6 3091 // optimize sbox using AESE instruction 3092 tbl v0.16b, {v3.16b}, v26.16b 3093 ushr v2.16b, v0.16b, 4 3094 and v0.16b, v0.16b, v31.16b 3095 tbl v0.16b, {v28.16b}, v0.16b 3096 tbl v2.16b, {v27.16b}, v2.16b 3097 eor v0.16b, v0.16b, v2.16b 3098 eor v1.16b, v1.16b, v1.16b 3099 aese v0.16b,v1.16b 3100 ushr v2.16b, v0.16b, 4 3101 and v0.16b, v0.16b, v31.16b 3102 tbl v0.16b, {v30.16b}, v0.16b 3103 tbl v2.16b, {v29.16b}, v2.16b 3104 eor v0.16b, v0.16b, v2.16b 3105 3106 mov w7,v0.s[0] 3107 eor w6,w7,w7,ror #32-2 3108 eor w6,w6,w7,ror #32-10 3109 eor w6,w6,w7,ror #32-18 3110 eor w6,w6,w7,ror #32-24 3111 eor w15,w15,w6 3112 subs w11,w11,#1 3113 b.ne 10b 3114 mov v4.s[0],w15 3115 mov v4.s[1],w14 3116 mov v4.s[2],w13 3117 mov v4.s[3],w12 3118 #ifndef __AARCH64EB__ 3119 rev32 v4.16b,v4.16b 3120 #endif 3121 eor v4.16b, v4.16b, v16.16b 3122 st1 {v4.4s},[x1],#16 3123 // save the last tweak 3124 mov v25.16b,v16.16b 3125 b 100f 3126 1: // process last 2 blocks 3127 cmp x2,#2 3128 b.gt 1f 3129 ld1 {v4.4s,v5.4s},[x0],#32 3130 rbit v16.16b,v16.16b 3131 rbit v17.16b,v17.16b 3132 eor v4.16b, v4.16b, v16.16b 3133 eor v5.16b, v5.16b, v17.16b 3134 #ifndef __AARCH64EB__ 3135 rev32 v4.16b,v4.16b 3136 #endif 3137 #ifndef __AARCH64EB__ 3138 rev32 v5.16b,v5.16b 3139 #endif 3140 zip1 v0.4s,v4.4s,v5.4s 3141 zip2 v1.4s,v4.4s,v5.4s 3142 zip1 v2.4s,v6.4s,v7.4s 3143 zip2 v3.4s,v6.4s,v7.4s 3144 zip1 v4.2d,v0.2d,v2.2d 3145 zip2 v5.2d,v0.2d,v2.2d 3146 zip1 v6.2d,v1.2d,v3.2d 3147 zip2 v7.2d,v1.2d,v3.2d 3148 bl _vpsm4_ex_enc_4blks 3149 zip1 v4.4s,v0.4s,v1.4s 3150 zip2 v5.4s,v0.4s,v1.4s 3151 zip1 v6.4s,v2.4s,v3.4s 3152 zip2 v7.4s,v2.4s,v3.4s 3153 zip1 v0.2d,v4.2d,v6.2d 3154 zip2 v1.2d,v4.2d,v6.2d 3155 zip1 v2.2d,v5.2d,v7.2d 3156 zip2 v3.2d,v5.2d,v7.2d 3157 eor v0.16b, v0.16b, v16.16b 3158 eor v1.16b, v1.16b, v17.16b 3159 st1 {v0.4s,v1.4s},[x1],#32 3160 // save the last tweak 3161 mov v25.16b,v17.16b 3162 b 100f 3163 1: // process last 3 blocks 3164 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 3165 rbit v16.16b,v16.16b 3166 rbit v17.16b,v17.16b 3167 rbit v18.16b,v18.16b 3168 eor v4.16b, v4.16b, v16.16b 3169 eor v5.16b, v5.16b, v17.16b 3170 eor v6.16b, v6.16b, v18.16b 3171 #ifndef __AARCH64EB__ 3172 rev32 v4.16b,v4.16b 3173 #endif 3174 #ifndef __AARCH64EB__ 3175 rev32 v5.16b,v5.16b 3176 #endif 3177 #ifndef __AARCH64EB__ 3178 rev32 v6.16b,v6.16b 3179 #endif 3180 zip1 v0.4s,v4.4s,v5.4s 3181 zip2 v1.4s,v4.4s,v5.4s 3182 zip1 v2.4s,v6.4s,v7.4s 3183 zip2 v3.4s,v6.4s,v7.4s 3184 zip1 v4.2d,v0.2d,v2.2d 3185 zip2 v5.2d,v0.2d,v2.2d 3186 zip1 v6.2d,v1.2d,v3.2d 3187 zip2 v7.2d,v1.2d,v3.2d 3188 bl _vpsm4_ex_enc_4blks 3189 zip1 v4.4s,v0.4s,v1.4s 3190 zip2 v5.4s,v0.4s,v1.4s 3191 zip1 v6.4s,v2.4s,v3.4s 3192 zip2 v7.4s,v2.4s,v3.4s 3193 zip1 v0.2d,v4.2d,v6.2d 3194 zip2 v1.2d,v4.2d,v6.2d 3195 zip1 v2.2d,v5.2d,v7.2d 3196 zip2 v3.2d,v5.2d,v7.2d 3197 eor v0.16b, v0.16b, v16.16b 3198 eor v1.16b, v1.16b, v17.16b 3199 eor v2.16b, v2.16b, v18.16b 3200 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 3201 // save the last tweak 3202 mov v25.16b,v18.16b 3203 100: 3204 cmp x29,0 3205 b.eq .return_gb 3206 3207 // This branch calculates the last two tweaks, 3208 // while the encryption/decryption length is larger than 32 3209 .last_2blks_tweak_gb: 3210 #ifdef __AARCH64EB__ 3211 rev32 v25.16b,v25.16b 3212 #endif 3213 rbit v2.16b,v25.16b 3214 adrp x9, .Lxts_magic 3215 ldr q0, [x9, #:lo12:.Lxts_magic] 3216 shl v17.16b, v2.16b, #1 3217 ext v1.16b, v2.16b, v2.16b,#15 3218 ushr v1.16b, v1.16b, #7 3219 mul v1.16b, v1.16b, v0.16b 3220 eor v17.16b, v17.16b, v1.16b 3221 rbit v17.16b,v17.16b 3222 rbit v2.16b,v17.16b 3223 adrp x9, .Lxts_magic 3224 ldr q0, [x9, #:lo12:.Lxts_magic] 3225 shl v18.16b, v2.16b, #1 3226 ext v1.16b, v2.16b, v2.16b,#15 3227 ushr v1.16b, v1.16b, #7 3228 mul v1.16b, v1.16b, v0.16b 3229 eor v18.16b, v18.16b, v1.16b 3230 rbit v18.16b,v18.16b 3231 b .check_dec_gb 3232 3233 3234 // This branch calculates the last two tweaks, 3235 // while the encryption/decryption length is equal to 32, who only need two tweaks 3236 .only_2blks_tweak_gb: 3237 mov v17.16b,v16.16b 3238 #ifdef __AARCH64EB__ 3239 rev32 v17.16b,v17.16b 3240 #endif 3241 rbit v2.16b,v17.16b 3242 adrp x9, .Lxts_magic 3243 ldr q0, [x9, #:lo12:.Lxts_magic] 3244 shl v18.16b, v2.16b, #1 3245 ext v1.16b, v2.16b, v2.16b,#15 3246 ushr v1.16b, v1.16b, #7 3247 mul v1.16b, v1.16b, v0.16b 3248 eor v18.16b, v18.16b, v1.16b 3249 rbit v18.16b,v18.16b 3250 b .check_dec_gb 3251 3252 3253 // Determine whether encryption or decryption is required. 3254 // The last two tweaks need to be swapped for decryption. 3255 .check_dec_gb: 3256 // encryption:1 decryption:0 3257 cmp w28,1 3258 b.eq .process_last_2blks_gb 3259 mov v0.16B,v17.16b 3260 mov v17.16B,v18.16b 3261 mov v18.16B,v0.16b 3262 3263 .process_last_2blks_gb: 3264 #ifdef __AARCH64EB__ 3265 rev32 v17.16b,v17.16b 3266 #endif 3267 #ifdef __AARCH64EB__ 3268 rev32 v18.16b,v18.16b 3269 #endif 3270 ld1 {v4.4s},[x0],#16 3271 eor v4.16b, v4.16b, v17.16b 3272 #ifndef __AARCH64EB__ 3273 rev32 v4.16b,v4.16b 3274 #endif 3275 mov x10,x3 3276 mov w11,#8 3277 mov w12,v4.s[0] 3278 mov w13,v4.s[1] 3279 mov w14,v4.s[2] 3280 mov w15,v4.s[3] 3281 10: 3282 ldp w7,w8,[x10],8 3283 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3284 eor w6,w14,w15 3285 eor w9,w7,w13 3286 eor w6,w6,w9 3287 mov v3.s[0],w6 3288 // optimize sbox using AESE instruction 3289 tbl v0.16b, {v3.16b}, v26.16b 3290 ushr v2.16b, v0.16b, 4 3291 and v0.16b, v0.16b, v31.16b 3292 tbl v0.16b, {v28.16b}, v0.16b 3293 tbl v2.16b, {v27.16b}, v2.16b 3294 eor v0.16b, v0.16b, v2.16b 3295 eor v1.16b, v1.16b, v1.16b 3296 aese v0.16b,v1.16b 3297 ushr v2.16b, v0.16b, 4 3298 and v0.16b, v0.16b, v31.16b 3299 tbl v0.16b, {v30.16b}, v0.16b 3300 tbl v2.16b, {v29.16b}, v2.16b 3301 eor v0.16b, v0.16b, v2.16b 3302 3303 mov w7,v0.s[0] 3304 eor w6,w7,w7,ror #32-2 3305 eor w6,w6,w7,ror #32-10 3306 eor w6,w6,w7,ror #32-18 3307 eor w6,w6,w7,ror #32-24 3308 eor w12,w12,w6 3309 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3310 eor w6,w14,w15 3311 eor w9,w12,w8 3312 eor w6,w6,w9 3313 mov v3.s[0],w6 3314 // optimize sbox using AESE instruction 3315 tbl v0.16b, {v3.16b}, v26.16b 3316 ushr v2.16b, v0.16b, 4 3317 and v0.16b, v0.16b, v31.16b 3318 tbl v0.16b, {v28.16b}, v0.16b 3319 tbl v2.16b, {v27.16b}, v2.16b 3320 eor v0.16b, v0.16b, v2.16b 3321 eor v1.16b, v1.16b, v1.16b 3322 aese v0.16b,v1.16b 3323 ushr v2.16b, v0.16b, 4 3324 and v0.16b, v0.16b, v31.16b 3325 tbl v0.16b, {v30.16b}, v0.16b 3326 tbl v2.16b, {v29.16b}, v2.16b 3327 eor v0.16b, v0.16b, v2.16b 3328 3329 mov w7,v0.s[0] 3330 eor w6,w7,w7,ror #32-2 3331 eor w6,w6,w7,ror #32-10 3332 eor w6,w6,w7,ror #32-18 3333 eor w6,w6,w7,ror #32-24 3334 ldp w7,w8,[x10],8 3335 eor w13,w13,w6 3336 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3337 eor w6,w12,w13 3338 eor w9,w7,w15 3339 eor w6,w6,w9 3340 mov v3.s[0],w6 3341 // optimize sbox using AESE instruction 3342 tbl v0.16b, {v3.16b}, v26.16b 3343 ushr v2.16b, v0.16b, 4 3344 and v0.16b, v0.16b, v31.16b 3345 tbl v0.16b, {v28.16b}, v0.16b 3346 tbl v2.16b, {v27.16b}, v2.16b 3347 eor v0.16b, v0.16b, v2.16b 3348 eor v1.16b, v1.16b, v1.16b 3349 aese v0.16b,v1.16b 3350 ushr v2.16b, v0.16b, 4 3351 and v0.16b, v0.16b, v31.16b 3352 tbl v0.16b, {v30.16b}, v0.16b 3353 tbl v2.16b, {v29.16b}, v2.16b 3354 eor v0.16b, v0.16b, v2.16b 3355 3356 mov w7,v0.s[0] 3357 eor w6,w7,w7,ror #32-2 3358 eor w6,w6,w7,ror #32-10 3359 eor w6,w6,w7,ror #32-18 3360 eor w6,w6,w7,ror #32-24 3361 eor w14,w14,w6 3362 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 3363 eor w6,w12,w13 3364 eor w9,w14,w8 3365 eor w6,w6,w9 3366 mov v3.s[0],w6 3367 // optimize sbox using AESE instruction 3368 tbl v0.16b, {v3.16b}, v26.16b 3369 ushr v2.16b, v0.16b, 4 3370 and v0.16b, v0.16b, v31.16b 3371 tbl v0.16b, {v28.16b}, v0.16b 3372 tbl v2.16b, {v27.16b}, v2.16b 3373 eor v0.16b, v0.16b, v2.16b 3374 eor v1.16b, v1.16b, v1.16b 3375 aese v0.16b,v1.16b 3376 ushr v2.16b, v0.16b, 4 3377 and v0.16b, v0.16b, v31.16b 3378 tbl v0.16b, {v30.16b}, v0.16b 3379 tbl v2.16b, {v29.16b}, v2.16b 3380 eor v0.16b, v0.16b, v2.16b 3381 3382 mov w7,v0.s[0] 3383 eor w6,w7,w7,ror #32-2 3384 eor w6,w6,w7,ror #32-10 3385 eor w6,w6,w7,ror #32-18 3386 eor w6,w6,w7,ror #32-24 3387 eor w15,w15,w6 3388 subs w11,w11,#1 3389 b.ne 10b 3390 mov v4.s[0],w15 3391 mov v4.s[1],w14 3392 mov v4.s[2],w13 3393 mov v4.s[3],w12 3394 #ifndef __AARCH64EB__ 3395 rev32 v4.16b,v4.16b 3396 #endif 3397 eor v4.16b, v4.16b, v17.16b 3398 st1 {v4.4s},[x1],#16 3399 3400 sub x26,x1,16 3401 .loop_gb: 3402 subs x29,x29,1 3403 ldrb w7,[x26,x29] 3404 ldrb w8,[x0,x29] 3405 strb w8,[x26,x29] 3406 strb w7,[x1,x29] 3407 b.gt .loop_gb 3408 ld1 {v4.4s}, [x26] 3409 eor v4.16b, v4.16b, v18.16b 3410 #ifndef __AARCH64EB__ 3411 rev32 v4.16b,v4.16b 3412 #endif 3413 mov x10,x3 3414 mov w11,#8 3415 mov w12,v4.s[0] 3416 mov w13,v4.s[1] 3417 mov w14,v4.s[2] 3418 mov w15,v4.s[3] 3419 10: 3420 ldp w7,w8,[x10],8 3421 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3422 eor w6,w14,w15 3423 eor w9,w7,w13 3424 eor w6,w6,w9 3425 mov v3.s[0],w6 3426 // optimize sbox using AESE instruction 3427 tbl v0.16b, {v3.16b}, v26.16b 3428 ushr v2.16b, v0.16b, 4 3429 and v0.16b, v0.16b, v31.16b 3430 tbl v0.16b, {v28.16b}, v0.16b 3431 tbl v2.16b, {v27.16b}, v2.16b 3432 eor v0.16b, v0.16b, v2.16b 3433 eor v1.16b, v1.16b, v1.16b 3434 aese v0.16b,v1.16b 3435 ushr v2.16b, v0.16b, 4 3436 and v0.16b, v0.16b, v31.16b 3437 tbl v0.16b, {v30.16b}, v0.16b 3438 tbl v2.16b, {v29.16b}, v2.16b 3439 eor v0.16b, v0.16b, v2.16b 3440 3441 mov w7,v0.s[0] 3442 eor w6,w7,w7,ror #32-2 3443 eor w6,w6,w7,ror #32-10 3444 eor w6,w6,w7,ror #32-18 3445 eor w6,w6,w7,ror #32-24 3446 eor w12,w12,w6 3447 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3448 eor w6,w14,w15 3449 eor w9,w12,w8 3450 eor w6,w6,w9 3451 mov v3.s[0],w6 3452 // optimize sbox using AESE instruction 3453 tbl v0.16b, {v3.16b}, v26.16b 3454 ushr v2.16b, v0.16b, 4 3455 and v0.16b, v0.16b, v31.16b 3456 tbl v0.16b, {v28.16b}, v0.16b 3457 tbl v2.16b, {v27.16b}, v2.16b 3458 eor v0.16b, v0.16b, v2.16b 3459 eor v1.16b, v1.16b, v1.16b 3460 aese v0.16b,v1.16b 3461 ushr v2.16b, v0.16b, 4 3462 and v0.16b, v0.16b, v31.16b 3463 tbl v0.16b, {v30.16b}, v0.16b 3464 tbl v2.16b, {v29.16b}, v2.16b 3465 eor v0.16b, v0.16b, v2.16b 3466 3467 mov w7,v0.s[0] 3468 eor w6,w7,w7,ror #32-2 3469 eor w6,w6,w7,ror #32-10 3470 eor w6,w6,w7,ror #32-18 3471 eor w6,w6,w7,ror #32-24 3472 ldp w7,w8,[x10],8 3473 eor w13,w13,w6 3474 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3475 eor w6,w12,w13 3476 eor w9,w7,w15 3477 eor w6,w6,w9 3478 mov v3.s[0],w6 3479 // optimize sbox using AESE instruction 3480 tbl v0.16b, {v3.16b}, v26.16b 3481 ushr v2.16b, v0.16b, 4 3482 and v0.16b, v0.16b, v31.16b 3483 tbl v0.16b, {v28.16b}, v0.16b 3484 tbl v2.16b, {v27.16b}, v2.16b 3485 eor v0.16b, v0.16b, v2.16b 3486 eor v1.16b, v1.16b, v1.16b 3487 aese v0.16b,v1.16b 3488 ushr v2.16b, v0.16b, 4 3489 and v0.16b, v0.16b, v31.16b 3490 tbl v0.16b, {v30.16b}, v0.16b 3491 tbl v2.16b, {v29.16b}, v2.16b 3492 eor v0.16b, v0.16b, v2.16b 3493 3494 mov w7,v0.s[0] 3495 eor w6,w7,w7,ror #32-2 3496 eor w6,w6,w7,ror #32-10 3497 eor w6,w6,w7,ror #32-18 3498 eor w6,w6,w7,ror #32-24 3499 eor w14,w14,w6 3500 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 3501 eor w6,w12,w13 3502 eor w9,w14,w8 3503 eor w6,w6,w9 3504 mov v3.s[0],w6 3505 // optimize sbox using AESE instruction 3506 tbl v0.16b, {v3.16b}, v26.16b 3507 ushr v2.16b, v0.16b, 4 3508 and v0.16b, v0.16b, v31.16b 3509 tbl v0.16b, {v28.16b}, v0.16b 3510 tbl v2.16b, {v27.16b}, v2.16b 3511 eor v0.16b, v0.16b, v2.16b 3512 eor v1.16b, v1.16b, v1.16b 3513 aese v0.16b,v1.16b 3514 ushr v2.16b, v0.16b, 4 3515 and v0.16b, v0.16b, v31.16b 3516 tbl v0.16b, {v30.16b}, v0.16b 3517 tbl v2.16b, {v29.16b}, v2.16b 3518 eor v0.16b, v0.16b, v2.16b 3519 3520 mov w7,v0.s[0] 3521 eor w6,w7,w7,ror #32-2 3522 eor w6,w6,w7,ror #32-10 3523 eor w6,w6,w7,ror #32-18 3524 eor w6,w6,w7,ror #32-24 3525 eor w15,w15,w6 3526 subs w11,w11,#1 3527 b.ne 10b 3528 mov v4.s[0],w15 3529 mov v4.s[1],w14 3530 mov v4.s[2],w13 3531 mov v4.s[3],w12 3532 #ifndef __AARCH64EB__ 3533 rev32 v4.16b,v4.16b 3534 #endif 3535 eor v4.16b, v4.16b, v18.16b 3536 st1 {v4.4s}, [x26] 3537 .return_gb: 3538 ldp d14, d15, [sp], #0x10 3539 ldp d12, d13, [sp], #0x10 3540 ldp d10, d11, [sp], #0x10 3541 ldp d8, d9, [sp], #0x10 3542 ldp x29, x30, [sp], #0x10 3543 ldp x27, x28, [sp], #0x10 3544 ldp x25, x26, [sp], #0x10 3545 ldp x23, x24, [sp], #0x10 3546 ldp x21, x22, [sp], #0x10 3547 ldp x19, x20, [sp], #0x10 3548 ldp x17, x18, [sp], #0x10 3549 ldp x15, x16, [sp], #0x10 3550 AARCH64_VALIDATE_LINK_REGISTER 3551 ret 3552 .size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb 3553 .globl vpsm4_ex_xts_encrypt 3554 .type vpsm4_ex_xts_encrypt,%function 3555 .align 5 3556 vpsm4_ex_xts_encrypt: 3557 AARCH64_SIGN_LINK_REGISTER 3558 stp x15, x16, [sp, #-0x10]! 3559 stp x17, x18, [sp, #-0x10]! 3560 stp x19, x20, [sp, #-0x10]! 3561 stp x21, x22, [sp, #-0x10]! 3562 stp x23, x24, [sp, #-0x10]! 3563 stp x25, x26, [sp, #-0x10]! 3564 stp x27, x28, [sp, #-0x10]! 3565 stp x29, x30, [sp, #-0x10]! 3566 stp d8, d9, [sp, #-0x10]! 3567 stp d10, d11, [sp, #-0x10]! 3568 stp d12, d13, [sp, #-0x10]! 3569 stp d14, d15, [sp, #-0x10]! 3570 mov x26,x3 3571 mov x27,x4 3572 mov w28,w6 3573 ld1 {v16.4s}, [x5] 3574 mov x3,x27 3575 adrp x9, .Lsbox_magic 3576 ldr q26, [x9, #:lo12:.Lsbox_magic] 3577 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 3578 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 3579 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 3580 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 3581 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 3582 #ifndef __AARCH64EB__ 3583 rev32 v16.16b,v16.16b 3584 #endif 3585 mov x10,x3 3586 mov w11,#8 3587 mov w12,v16.s[0] 3588 mov w13,v16.s[1] 3589 mov w14,v16.s[2] 3590 mov w15,v16.s[3] 3591 10: 3592 ldp w7,w8,[x10],8 3593 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3594 eor w6,w14,w15 3595 eor w9,w7,w13 3596 eor w6,w6,w9 3597 mov v3.s[0],w6 3598 // optimize sbox using AESE instruction 3599 tbl v0.16b, {v3.16b}, v26.16b 3600 ushr v2.16b, v0.16b, 4 3601 and v0.16b, v0.16b, v31.16b 3602 tbl v0.16b, {v28.16b}, v0.16b 3603 tbl v2.16b, {v27.16b}, v2.16b 3604 eor v0.16b, v0.16b, v2.16b 3605 eor v1.16b, v1.16b, v1.16b 3606 aese v0.16b,v1.16b 3607 ushr v2.16b, v0.16b, 4 3608 and v0.16b, v0.16b, v31.16b 3609 tbl v0.16b, {v30.16b}, v0.16b 3610 tbl v2.16b, {v29.16b}, v2.16b 3611 eor v0.16b, v0.16b, v2.16b 3612 3613 mov w7,v0.s[0] 3614 eor w6,w7,w7,ror #32-2 3615 eor w6,w6,w7,ror #32-10 3616 eor w6,w6,w7,ror #32-18 3617 eor w6,w6,w7,ror #32-24 3618 eor w12,w12,w6 3619 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3620 eor w6,w14,w15 3621 eor w9,w12,w8 3622 eor w6,w6,w9 3623 mov v3.s[0],w6 3624 // optimize sbox using AESE instruction 3625 tbl v0.16b, {v3.16b}, v26.16b 3626 ushr v2.16b, v0.16b, 4 3627 and v0.16b, v0.16b, v31.16b 3628 tbl v0.16b, {v28.16b}, v0.16b 3629 tbl v2.16b, {v27.16b}, v2.16b 3630 eor v0.16b, v0.16b, v2.16b 3631 eor v1.16b, v1.16b, v1.16b 3632 aese v0.16b,v1.16b 3633 ushr v2.16b, v0.16b, 4 3634 and v0.16b, v0.16b, v31.16b 3635 tbl v0.16b, {v30.16b}, v0.16b 3636 tbl v2.16b, {v29.16b}, v2.16b 3637 eor v0.16b, v0.16b, v2.16b 3638 3639 mov w7,v0.s[0] 3640 eor w6,w7,w7,ror #32-2 3641 eor w6,w6,w7,ror #32-10 3642 eor w6,w6,w7,ror #32-18 3643 eor w6,w6,w7,ror #32-24 3644 ldp w7,w8,[x10],8 3645 eor w13,w13,w6 3646 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3647 eor w6,w12,w13 3648 eor w9,w7,w15 3649 eor w6,w6,w9 3650 mov v3.s[0],w6 3651 // optimize sbox using AESE instruction 3652 tbl v0.16b, {v3.16b}, v26.16b 3653 ushr v2.16b, v0.16b, 4 3654 and v0.16b, v0.16b, v31.16b 3655 tbl v0.16b, {v28.16b}, v0.16b 3656 tbl v2.16b, {v27.16b}, v2.16b 3657 eor v0.16b, v0.16b, v2.16b 3658 eor v1.16b, v1.16b, v1.16b 3659 aese v0.16b,v1.16b 3660 ushr v2.16b, v0.16b, 4 3661 and v0.16b, v0.16b, v31.16b 3662 tbl v0.16b, {v30.16b}, v0.16b 3663 tbl v2.16b, {v29.16b}, v2.16b 3664 eor v0.16b, v0.16b, v2.16b 3665 3666 mov w7,v0.s[0] 3667 eor w6,w7,w7,ror #32-2 3668 eor w6,w6,w7,ror #32-10 3669 eor w6,w6,w7,ror #32-18 3670 eor w6,w6,w7,ror #32-24 3671 eor w14,w14,w6 3672 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 3673 eor w6,w12,w13 3674 eor w9,w14,w8 3675 eor w6,w6,w9 3676 mov v3.s[0],w6 3677 // optimize sbox using AESE instruction 3678 tbl v0.16b, {v3.16b}, v26.16b 3679 ushr v2.16b, v0.16b, 4 3680 and v0.16b, v0.16b, v31.16b 3681 tbl v0.16b, {v28.16b}, v0.16b 3682 tbl v2.16b, {v27.16b}, v2.16b 3683 eor v0.16b, v0.16b, v2.16b 3684 eor v1.16b, v1.16b, v1.16b 3685 aese v0.16b,v1.16b 3686 ushr v2.16b, v0.16b, 4 3687 and v0.16b, v0.16b, v31.16b 3688 tbl v0.16b, {v30.16b}, v0.16b 3689 tbl v2.16b, {v29.16b}, v2.16b 3690 eor v0.16b, v0.16b, v2.16b 3691 3692 mov w7,v0.s[0] 3693 eor w6,w7,w7,ror #32-2 3694 eor w6,w6,w7,ror #32-10 3695 eor w6,w6,w7,ror #32-18 3696 eor w6,w6,w7,ror #32-24 3697 eor w15,w15,w6 3698 subs w11,w11,#1 3699 b.ne 10b 3700 mov v16.s[0],w15 3701 mov v16.s[1],w14 3702 mov v16.s[2],w13 3703 mov v16.s[3],w12 3704 #ifndef __AARCH64EB__ 3705 rev32 v16.16b,v16.16b 3706 #endif 3707 mov x3,x26 3708 and x29,x2,#0x0F 3709 // convert length into blocks 3710 lsr x2,x2,4 3711 cmp x2,#1 3712 b.lt .return 3713 3714 cmp x29,0 3715 // If the encryption/decryption Length is N times of 16, 3716 // the all blocks are encrypted/decrypted in .xts_encrypt_blocks 3717 b.eq .xts_encrypt_blocks 3718 3719 // If the encryption/decryption length is not N times of 16, 3720 // the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak 3721 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks 3722 subs x2,x2,#1 3723 b.eq .only_2blks_tweak 3724 .xts_encrypt_blocks: 3725 #ifdef __AARCH64EB__ 3726 rev32 v16.16b,v16.16b 3727 #endif 3728 mov x12,v16.d[0] 3729 mov x13,v16.d[1] 3730 mov w7,0x87 3731 extr x9,x13,x13,#32 3732 extr x15,x13,x12,#63 3733 and w8,w7,w9,asr#31 3734 eor x14,x8,x12,lsl#1 3735 mov w7,0x87 3736 extr x9,x15,x15,#32 3737 extr x17,x15,x14,#63 3738 and w8,w7,w9,asr#31 3739 eor x16,x8,x14,lsl#1 3740 mov w7,0x87 3741 extr x9,x17,x17,#32 3742 extr x19,x17,x16,#63 3743 and w8,w7,w9,asr#31 3744 eor x18,x8,x16,lsl#1 3745 mov w7,0x87 3746 extr x9,x19,x19,#32 3747 extr x21,x19,x18,#63 3748 and w8,w7,w9,asr#31 3749 eor x20,x8,x18,lsl#1 3750 mov w7,0x87 3751 extr x9,x21,x21,#32 3752 extr x23,x21,x20,#63 3753 and w8,w7,w9,asr#31 3754 eor x22,x8,x20,lsl#1 3755 mov w7,0x87 3756 extr x9,x23,x23,#32 3757 extr x25,x23,x22,#63 3758 and w8,w7,w9,asr#31 3759 eor x24,x8,x22,lsl#1 3760 mov w7,0x87 3761 extr x9,x25,x25,#32 3762 extr x27,x25,x24,#63 3763 and w8,w7,w9,asr#31 3764 eor x26,x8,x24,lsl#1 3765 .Lxts_8_blocks_process: 3766 cmp x2,#8 3767 mov v16.d[0],x12 3768 mov v16.d[1],x13 3769 #ifdef __AARCH64EB__ 3770 rev32 v16.16b,v16.16b 3771 #endif 3772 mov w7,0x87 3773 extr x9,x27,x27,#32 3774 extr x13,x27,x26,#63 3775 and w8,w7,w9,asr#31 3776 eor x12,x8,x26,lsl#1 3777 mov v17.d[0],x14 3778 mov v17.d[1],x15 3779 #ifdef __AARCH64EB__ 3780 rev32 v17.16b,v17.16b 3781 #endif 3782 mov w7,0x87 3783 extr x9,x13,x13,#32 3784 extr x15,x13,x12,#63 3785 and w8,w7,w9,asr#31 3786 eor x14,x8,x12,lsl#1 3787 mov v18.d[0],x16 3788 mov v18.d[1],x17 3789 #ifdef __AARCH64EB__ 3790 rev32 v18.16b,v18.16b 3791 #endif 3792 mov w7,0x87 3793 extr x9,x15,x15,#32 3794 extr x17,x15,x14,#63 3795 and w8,w7,w9,asr#31 3796 eor x16,x8,x14,lsl#1 3797 mov v19.d[0],x18 3798 mov v19.d[1],x19 3799 #ifdef __AARCH64EB__ 3800 rev32 v19.16b,v19.16b 3801 #endif 3802 mov w7,0x87 3803 extr x9,x17,x17,#32 3804 extr x19,x17,x16,#63 3805 and w8,w7,w9,asr#31 3806 eor x18,x8,x16,lsl#1 3807 mov v20.d[0],x20 3808 mov v20.d[1],x21 3809 #ifdef __AARCH64EB__ 3810 rev32 v20.16b,v20.16b 3811 #endif 3812 mov w7,0x87 3813 extr x9,x19,x19,#32 3814 extr x21,x19,x18,#63 3815 and w8,w7,w9,asr#31 3816 eor x20,x8,x18,lsl#1 3817 mov v21.d[0],x22 3818 mov v21.d[1],x23 3819 #ifdef __AARCH64EB__ 3820 rev32 v21.16b,v21.16b 3821 #endif 3822 mov w7,0x87 3823 extr x9,x21,x21,#32 3824 extr x23,x21,x20,#63 3825 and w8,w7,w9,asr#31 3826 eor x22,x8,x20,lsl#1 3827 mov v22.d[0],x24 3828 mov v22.d[1],x25 3829 #ifdef __AARCH64EB__ 3830 rev32 v22.16b,v22.16b 3831 #endif 3832 mov w7,0x87 3833 extr x9,x23,x23,#32 3834 extr x25,x23,x22,#63 3835 and w8,w7,w9,asr#31 3836 eor x24,x8,x22,lsl#1 3837 mov v23.d[0],x26 3838 mov v23.d[1],x27 3839 #ifdef __AARCH64EB__ 3840 rev32 v23.16b,v23.16b 3841 #endif 3842 mov w7,0x87 3843 extr x9,x25,x25,#32 3844 extr x27,x25,x24,#63 3845 and w8,w7,w9,asr#31 3846 eor x26,x8,x24,lsl#1 3847 b.lt .Lxts_4_blocks_process 3848 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 3849 eor v4.16b, v4.16b, v16.16b 3850 eor v5.16b, v5.16b, v17.16b 3851 eor v6.16b, v6.16b, v18.16b 3852 eor v7.16b, v7.16b, v19.16b 3853 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 3854 eor v8.16b, v8.16b, v20.16b 3855 eor v9.16b, v9.16b, v21.16b 3856 eor v10.16b, v10.16b, v22.16b 3857 eor v11.16b, v11.16b, v23.16b 3858 #ifndef __AARCH64EB__ 3859 rev32 v4.16b,v4.16b 3860 #endif 3861 #ifndef __AARCH64EB__ 3862 rev32 v5.16b,v5.16b 3863 #endif 3864 #ifndef __AARCH64EB__ 3865 rev32 v6.16b,v6.16b 3866 #endif 3867 #ifndef __AARCH64EB__ 3868 rev32 v7.16b,v7.16b 3869 #endif 3870 #ifndef __AARCH64EB__ 3871 rev32 v8.16b,v8.16b 3872 #endif 3873 #ifndef __AARCH64EB__ 3874 rev32 v9.16b,v9.16b 3875 #endif 3876 #ifndef __AARCH64EB__ 3877 rev32 v10.16b,v10.16b 3878 #endif 3879 #ifndef __AARCH64EB__ 3880 rev32 v11.16b,v11.16b 3881 #endif 3882 zip1 v0.4s,v4.4s,v5.4s 3883 zip2 v1.4s,v4.4s,v5.4s 3884 zip1 v2.4s,v6.4s,v7.4s 3885 zip2 v3.4s,v6.4s,v7.4s 3886 zip1 v4.2d,v0.2d,v2.2d 3887 zip2 v5.2d,v0.2d,v2.2d 3888 zip1 v6.2d,v1.2d,v3.2d 3889 zip2 v7.2d,v1.2d,v3.2d 3890 zip1 v0.4s,v8.4s,v9.4s 3891 zip2 v1.4s,v8.4s,v9.4s 3892 zip1 v2.4s,v10.4s,v11.4s 3893 zip2 v3.4s,v10.4s,v11.4s 3894 zip1 v8.2d,v0.2d,v2.2d 3895 zip2 v9.2d,v0.2d,v2.2d 3896 zip1 v10.2d,v1.2d,v3.2d 3897 zip2 v11.2d,v1.2d,v3.2d 3898 bl _vpsm4_ex_enc_8blks 3899 zip1 v8.4s,v0.4s,v1.4s 3900 zip2 v9.4s,v0.4s,v1.4s 3901 zip1 v10.4s,v2.4s,v3.4s 3902 zip2 v11.4s,v2.4s,v3.4s 3903 zip1 v0.2d,v8.2d,v10.2d 3904 zip2 v1.2d,v8.2d,v10.2d 3905 zip1 v2.2d,v9.2d,v11.2d 3906 zip2 v3.2d,v9.2d,v11.2d 3907 zip1 v8.4s,v4.4s,v5.4s 3908 zip2 v9.4s,v4.4s,v5.4s 3909 zip1 v10.4s,v6.4s,v7.4s 3910 zip2 v11.4s,v6.4s,v7.4s 3911 zip1 v4.2d,v8.2d,v10.2d 3912 zip2 v5.2d,v8.2d,v10.2d 3913 zip1 v6.2d,v9.2d,v11.2d 3914 zip2 v7.2d,v9.2d,v11.2d 3915 eor v0.16b, v0.16b, v16.16b 3916 eor v1.16b, v1.16b, v17.16b 3917 eor v2.16b, v2.16b, v18.16b 3918 eor v3.16b, v3.16b, v19.16b 3919 eor v4.16b, v4.16b, v20.16b 3920 eor v5.16b, v5.16b, v21.16b 3921 eor v6.16b, v6.16b, v22.16b 3922 eor v7.16b, v7.16b, v23.16b 3923 3924 // save the last tweak 3925 mov v25.16b,v23.16b 3926 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 3927 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 3928 subs x2,x2,#8 3929 b.gt .Lxts_8_blocks_process 3930 b 100f 3931 .Lxts_4_blocks_process: 3932 cmp x2,#4 3933 b.lt 1f 3934 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 3935 eor v4.16b, v4.16b, v16.16b 3936 eor v5.16b, v5.16b, v17.16b 3937 eor v6.16b, v6.16b, v18.16b 3938 eor v7.16b, v7.16b, v19.16b 3939 #ifndef __AARCH64EB__ 3940 rev32 v4.16b,v4.16b 3941 #endif 3942 #ifndef __AARCH64EB__ 3943 rev32 v5.16b,v5.16b 3944 #endif 3945 #ifndef __AARCH64EB__ 3946 rev32 v6.16b,v6.16b 3947 #endif 3948 #ifndef __AARCH64EB__ 3949 rev32 v7.16b,v7.16b 3950 #endif 3951 zip1 v0.4s,v4.4s,v5.4s 3952 zip2 v1.4s,v4.4s,v5.4s 3953 zip1 v2.4s,v6.4s,v7.4s 3954 zip2 v3.4s,v6.4s,v7.4s 3955 zip1 v4.2d,v0.2d,v2.2d 3956 zip2 v5.2d,v0.2d,v2.2d 3957 zip1 v6.2d,v1.2d,v3.2d 3958 zip2 v7.2d,v1.2d,v3.2d 3959 bl _vpsm4_ex_enc_4blks 3960 zip1 v4.4s,v0.4s,v1.4s 3961 zip2 v5.4s,v0.4s,v1.4s 3962 zip1 v6.4s,v2.4s,v3.4s 3963 zip2 v7.4s,v2.4s,v3.4s 3964 zip1 v0.2d,v4.2d,v6.2d 3965 zip2 v1.2d,v4.2d,v6.2d 3966 zip1 v2.2d,v5.2d,v7.2d 3967 zip2 v3.2d,v5.2d,v7.2d 3968 eor v0.16b, v0.16b, v16.16b 3969 eor v1.16b, v1.16b, v17.16b 3970 eor v2.16b, v2.16b, v18.16b 3971 eor v3.16b, v3.16b, v19.16b 3972 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 3973 sub x2,x2,#4 3974 mov v16.16b,v20.16b 3975 mov v17.16b,v21.16b 3976 mov v18.16b,v22.16b 3977 // save the last tweak 3978 mov v25.16b,v19.16b 3979 1: 3980 // process last block 3981 cmp x2,#1 3982 b.lt 100f 3983 b.gt 1f 3984 ld1 {v4.4s},[x0],#16 3985 eor v4.16b, v4.16b, v16.16b 3986 #ifndef __AARCH64EB__ 3987 rev32 v4.16b,v4.16b 3988 #endif 3989 mov x10,x3 3990 mov w11,#8 3991 mov w12,v4.s[0] 3992 mov w13,v4.s[1] 3993 mov w14,v4.s[2] 3994 mov w15,v4.s[3] 3995 10: 3996 ldp w7,w8,[x10],8 3997 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3998 eor w6,w14,w15 3999 eor w9,w7,w13 4000 eor w6,w6,w9 4001 mov v3.s[0],w6 4002 // optimize sbox using AESE instruction 4003 tbl v0.16b, {v3.16b}, v26.16b 4004 ushr v2.16b, v0.16b, 4 4005 and v0.16b, v0.16b, v31.16b 4006 tbl v0.16b, {v28.16b}, v0.16b 4007 tbl v2.16b, {v27.16b}, v2.16b 4008 eor v0.16b, v0.16b, v2.16b 4009 eor v1.16b, v1.16b, v1.16b 4010 aese v0.16b,v1.16b 4011 ushr v2.16b, v0.16b, 4 4012 and v0.16b, v0.16b, v31.16b 4013 tbl v0.16b, {v30.16b}, v0.16b 4014 tbl v2.16b, {v29.16b}, v2.16b 4015 eor v0.16b, v0.16b, v2.16b 4016 4017 mov w7,v0.s[0] 4018 eor w6,w7,w7,ror #32-2 4019 eor w6,w6,w7,ror #32-10 4020 eor w6,w6,w7,ror #32-18 4021 eor w6,w6,w7,ror #32-24 4022 eor w12,w12,w6 4023 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4024 eor w6,w14,w15 4025 eor w9,w12,w8 4026 eor w6,w6,w9 4027 mov v3.s[0],w6 4028 // optimize sbox using AESE instruction 4029 tbl v0.16b, {v3.16b}, v26.16b 4030 ushr v2.16b, v0.16b, 4 4031 and v0.16b, v0.16b, v31.16b 4032 tbl v0.16b, {v28.16b}, v0.16b 4033 tbl v2.16b, {v27.16b}, v2.16b 4034 eor v0.16b, v0.16b, v2.16b 4035 eor v1.16b, v1.16b, v1.16b 4036 aese v0.16b,v1.16b 4037 ushr v2.16b, v0.16b, 4 4038 and v0.16b, v0.16b, v31.16b 4039 tbl v0.16b, {v30.16b}, v0.16b 4040 tbl v2.16b, {v29.16b}, v2.16b 4041 eor v0.16b, v0.16b, v2.16b 4042 4043 mov w7,v0.s[0] 4044 eor w6,w7,w7,ror #32-2 4045 eor w6,w6,w7,ror #32-10 4046 eor w6,w6,w7,ror #32-18 4047 eor w6,w6,w7,ror #32-24 4048 ldp w7,w8,[x10],8 4049 eor w13,w13,w6 4050 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4051 eor w6,w12,w13 4052 eor w9,w7,w15 4053 eor w6,w6,w9 4054 mov v3.s[0],w6 4055 // optimize sbox using AESE instruction 4056 tbl v0.16b, {v3.16b}, v26.16b 4057 ushr v2.16b, v0.16b, 4 4058 and v0.16b, v0.16b, v31.16b 4059 tbl v0.16b, {v28.16b}, v0.16b 4060 tbl v2.16b, {v27.16b}, v2.16b 4061 eor v0.16b, v0.16b, v2.16b 4062 eor v1.16b, v1.16b, v1.16b 4063 aese v0.16b,v1.16b 4064 ushr v2.16b, v0.16b, 4 4065 and v0.16b, v0.16b, v31.16b 4066 tbl v0.16b, {v30.16b}, v0.16b 4067 tbl v2.16b, {v29.16b}, v2.16b 4068 eor v0.16b, v0.16b, v2.16b 4069 4070 mov w7,v0.s[0] 4071 eor w6,w7,w7,ror #32-2 4072 eor w6,w6,w7,ror #32-10 4073 eor w6,w6,w7,ror #32-18 4074 eor w6,w6,w7,ror #32-24 4075 eor w14,w14,w6 4076 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4077 eor w6,w12,w13 4078 eor w9,w14,w8 4079 eor w6,w6,w9 4080 mov v3.s[0],w6 4081 // optimize sbox using AESE instruction 4082 tbl v0.16b, {v3.16b}, v26.16b 4083 ushr v2.16b, v0.16b, 4 4084 and v0.16b, v0.16b, v31.16b 4085 tbl v0.16b, {v28.16b}, v0.16b 4086 tbl v2.16b, {v27.16b}, v2.16b 4087 eor v0.16b, v0.16b, v2.16b 4088 eor v1.16b, v1.16b, v1.16b 4089 aese v0.16b,v1.16b 4090 ushr v2.16b, v0.16b, 4 4091 and v0.16b, v0.16b, v31.16b 4092 tbl v0.16b, {v30.16b}, v0.16b 4093 tbl v2.16b, {v29.16b}, v2.16b 4094 eor v0.16b, v0.16b, v2.16b 4095 4096 mov w7,v0.s[0] 4097 eor w6,w7,w7,ror #32-2 4098 eor w6,w6,w7,ror #32-10 4099 eor w6,w6,w7,ror #32-18 4100 eor w6,w6,w7,ror #32-24 4101 eor w15,w15,w6 4102 subs w11,w11,#1 4103 b.ne 10b 4104 mov v4.s[0],w15 4105 mov v4.s[1],w14 4106 mov v4.s[2],w13 4107 mov v4.s[3],w12 4108 #ifndef __AARCH64EB__ 4109 rev32 v4.16b,v4.16b 4110 #endif 4111 eor v4.16b, v4.16b, v16.16b 4112 st1 {v4.4s},[x1],#16 4113 // save the last tweak 4114 mov v25.16b,v16.16b 4115 b 100f 4116 1: // process last 2 blocks 4117 cmp x2,#2 4118 b.gt 1f 4119 ld1 {v4.4s,v5.4s},[x0],#32 4120 eor v4.16b, v4.16b, v16.16b 4121 eor v5.16b, v5.16b, v17.16b 4122 #ifndef __AARCH64EB__ 4123 rev32 v4.16b,v4.16b 4124 #endif 4125 #ifndef __AARCH64EB__ 4126 rev32 v5.16b,v5.16b 4127 #endif 4128 zip1 v0.4s,v4.4s,v5.4s 4129 zip2 v1.4s,v4.4s,v5.4s 4130 zip1 v2.4s,v6.4s,v7.4s 4131 zip2 v3.4s,v6.4s,v7.4s 4132 zip1 v4.2d,v0.2d,v2.2d 4133 zip2 v5.2d,v0.2d,v2.2d 4134 zip1 v6.2d,v1.2d,v3.2d 4135 zip2 v7.2d,v1.2d,v3.2d 4136 bl _vpsm4_ex_enc_4blks 4137 zip1 v4.4s,v0.4s,v1.4s 4138 zip2 v5.4s,v0.4s,v1.4s 4139 zip1 v6.4s,v2.4s,v3.4s 4140 zip2 v7.4s,v2.4s,v3.4s 4141 zip1 v0.2d,v4.2d,v6.2d 4142 zip2 v1.2d,v4.2d,v6.2d 4143 zip1 v2.2d,v5.2d,v7.2d 4144 zip2 v3.2d,v5.2d,v7.2d 4145 eor v0.16b, v0.16b, v16.16b 4146 eor v1.16b, v1.16b, v17.16b 4147 st1 {v0.4s,v1.4s},[x1],#32 4148 // save the last tweak 4149 mov v25.16b,v17.16b 4150 b 100f 4151 1: // process last 3 blocks 4152 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 4153 eor v4.16b, v4.16b, v16.16b 4154 eor v5.16b, v5.16b, v17.16b 4155 eor v6.16b, v6.16b, v18.16b 4156 #ifndef __AARCH64EB__ 4157 rev32 v4.16b,v4.16b 4158 #endif 4159 #ifndef __AARCH64EB__ 4160 rev32 v5.16b,v5.16b 4161 #endif 4162 #ifndef __AARCH64EB__ 4163 rev32 v6.16b,v6.16b 4164 #endif 4165 zip1 v0.4s,v4.4s,v5.4s 4166 zip2 v1.4s,v4.4s,v5.4s 4167 zip1 v2.4s,v6.4s,v7.4s 4168 zip2 v3.4s,v6.4s,v7.4s 4169 zip1 v4.2d,v0.2d,v2.2d 4170 zip2 v5.2d,v0.2d,v2.2d 4171 zip1 v6.2d,v1.2d,v3.2d 4172 zip2 v7.2d,v1.2d,v3.2d 4173 bl _vpsm4_ex_enc_4blks 4174 zip1 v4.4s,v0.4s,v1.4s 4175 zip2 v5.4s,v0.4s,v1.4s 4176 zip1 v6.4s,v2.4s,v3.4s 4177 zip2 v7.4s,v2.4s,v3.4s 4178 zip1 v0.2d,v4.2d,v6.2d 4179 zip2 v1.2d,v4.2d,v6.2d 4180 zip1 v2.2d,v5.2d,v7.2d 4181 zip2 v3.2d,v5.2d,v7.2d 4182 eor v0.16b, v0.16b, v16.16b 4183 eor v1.16b, v1.16b, v17.16b 4184 eor v2.16b, v2.16b, v18.16b 4185 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 4186 // save the last tweak 4187 mov v25.16b,v18.16b 4188 100: 4189 cmp x29,0 4190 b.eq .return 4191 4192 // This branch calculates the last two tweaks, 4193 // while the encryption/decryption length is larger than 32 4194 .last_2blks_tweak: 4195 #ifdef __AARCH64EB__ 4196 rev32 v25.16b,v25.16b 4197 #endif 4198 mov v2.16b,v25.16b 4199 adrp x9, .Lxts_magic 4200 ldr q0, [x9, #:lo12:.Lxts_magic] 4201 shl v17.16b, v2.16b, #1 4202 ext v1.16b, v2.16b, v2.16b,#15 4203 ushr v1.16b, v1.16b, #7 4204 mul v1.16b, v1.16b, v0.16b 4205 eor v17.16b, v17.16b, v1.16b 4206 mov v2.16b,v17.16b 4207 adrp x9, .Lxts_magic 4208 ldr q0, [x9, #:lo12:.Lxts_magic] 4209 shl v18.16b, v2.16b, #1 4210 ext v1.16b, v2.16b, v2.16b,#15 4211 ushr v1.16b, v1.16b, #7 4212 mul v1.16b, v1.16b, v0.16b 4213 eor v18.16b, v18.16b, v1.16b 4214 b .check_dec 4215 4216 4217 // This branch calculates the last two tweaks, 4218 // while the encryption/decryption length is equal to 32, who only need two tweaks 4219 .only_2blks_tweak: 4220 mov v17.16b,v16.16b 4221 #ifdef __AARCH64EB__ 4222 rev32 v17.16b,v17.16b 4223 #endif 4224 mov v2.16b,v17.16b 4225 adrp x9, .Lxts_magic 4226 ldr q0, [x9, #:lo12:.Lxts_magic] 4227 shl v18.16b, v2.16b, #1 4228 ext v1.16b, v2.16b, v2.16b,#15 4229 ushr v1.16b, v1.16b, #7 4230 mul v1.16b, v1.16b, v0.16b 4231 eor v18.16b, v18.16b, v1.16b 4232 b .check_dec 4233 4234 4235 // Determine whether encryption or decryption is required. 4236 // The last two tweaks need to be swapped for decryption. 4237 .check_dec: 4238 // encryption:1 decryption:0 4239 cmp w28,1 4240 b.eq .process_last_2blks 4241 mov v0.16B,v17.16b 4242 mov v17.16B,v18.16b 4243 mov v18.16B,v0.16b 4244 4245 .process_last_2blks: 4246 #ifdef __AARCH64EB__ 4247 rev32 v17.16b,v17.16b 4248 #endif 4249 #ifdef __AARCH64EB__ 4250 rev32 v18.16b,v18.16b 4251 #endif 4252 ld1 {v4.4s},[x0],#16 4253 eor v4.16b, v4.16b, v17.16b 4254 #ifndef __AARCH64EB__ 4255 rev32 v4.16b,v4.16b 4256 #endif 4257 mov x10,x3 4258 mov w11,#8 4259 mov w12,v4.s[0] 4260 mov w13,v4.s[1] 4261 mov w14,v4.s[2] 4262 mov w15,v4.s[3] 4263 10: 4264 ldp w7,w8,[x10],8 4265 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 4266 eor w6,w14,w15 4267 eor w9,w7,w13 4268 eor w6,w6,w9 4269 mov v3.s[0],w6 4270 // optimize sbox using AESE instruction 4271 tbl v0.16b, {v3.16b}, v26.16b 4272 ushr v2.16b, v0.16b, 4 4273 and v0.16b, v0.16b, v31.16b 4274 tbl v0.16b, {v28.16b}, v0.16b 4275 tbl v2.16b, {v27.16b}, v2.16b 4276 eor v0.16b, v0.16b, v2.16b 4277 eor v1.16b, v1.16b, v1.16b 4278 aese v0.16b,v1.16b 4279 ushr v2.16b, v0.16b, 4 4280 and v0.16b, v0.16b, v31.16b 4281 tbl v0.16b, {v30.16b}, v0.16b 4282 tbl v2.16b, {v29.16b}, v2.16b 4283 eor v0.16b, v0.16b, v2.16b 4284 4285 mov w7,v0.s[0] 4286 eor w6,w7,w7,ror #32-2 4287 eor w6,w6,w7,ror #32-10 4288 eor w6,w6,w7,ror #32-18 4289 eor w6,w6,w7,ror #32-24 4290 eor w12,w12,w6 4291 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4292 eor w6,w14,w15 4293 eor w9,w12,w8 4294 eor w6,w6,w9 4295 mov v3.s[0],w6 4296 // optimize sbox using AESE instruction 4297 tbl v0.16b, {v3.16b}, v26.16b 4298 ushr v2.16b, v0.16b, 4 4299 and v0.16b, v0.16b, v31.16b 4300 tbl v0.16b, {v28.16b}, v0.16b 4301 tbl v2.16b, {v27.16b}, v2.16b 4302 eor v0.16b, v0.16b, v2.16b 4303 eor v1.16b, v1.16b, v1.16b 4304 aese v0.16b,v1.16b 4305 ushr v2.16b, v0.16b, 4 4306 and v0.16b, v0.16b, v31.16b 4307 tbl v0.16b, {v30.16b}, v0.16b 4308 tbl v2.16b, {v29.16b}, v2.16b 4309 eor v0.16b, v0.16b, v2.16b 4310 4311 mov w7,v0.s[0] 4312 eor w6,w7,w7,ror #32-2 4313 eor w6,w6,w7,ror #32-10 4314 eor w6,w6,w7,ror #32-18 4315 eor w6,w6,w7,ror #32-24 4316 ldp w7,w8,[x10],8 4317 eor w13,w13,w6 4318 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4319 eor w6,w12,w13 4320 eor w9,w7,w15 4321 eor w6,w6,w9 4322 mov v3.s[0],w6 4323 // optimize sbox using AESE instruction 4324 tbl v0.16b, {v3.16b}, v26.16b 4325 ushr v2.16b, v0.16b, 4 4326 and v0.16b, v0.16b, v31.16b 4327 tbl v0.16b, {v28.16b}, v0.16b 4328 tbl v2.16b, {v27.16b}, v2.16b 4329 eor v0.16b, v0.16b, v2.16b 4330 eor v1.16b, v1.16b, v1.16b 4331 aese v0.16b,v1.16b 4332 ushr v2.16b, v0.16b, 4 4333 and v0.16b, v0.16b, v31.16b 4334 tbl v0.16b, {v30.16b}, v0.16b 4335 tbl v2.16b, {v29.16b}, v2.16b 4336 eor v0.16b, v0.16b, v2.16b 4337 4338 mov w7,v0.s[0] 4339 eor w6,w7,w7,ror #32-2 4340 eor w6,w6,w7,ror #32-10 4341 eor w6,w6,w7,ror #32-18 4342 eor w6,w6,w7,ror #32-24 4343 eor w14,w14,w6 4344 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4345 eor w6,w12,w13 4346 eor w9,w14,w8 4347 eor w6,w6,w9 4348 mov v3.s[0],w6 4349 // optimize sbox using AESE instruction 4350 tbl v0.16b, {v3.16b}, v26.16b 4351 ushr v2.16b, v0.16b, 4 4352 and v0.16b, v0.16b, v31.16b 4353 tbl v0.16b, {v28.16b}, v0.16b 4354 tbl v2.16b, {v27.16b}, v2.16b 4355 eor v0.16b, v0.16b, v2.16b 4356 eor v1.16b, v1.16b, v1.16b 4357 aese v0.16b,v1.16b 4358 ushr v2.16b, v0.16b, 4 4359 and v0.16b, v0.16b, v31.16b 4360 tbl v0.16b, {v30.16b}, v0.16b 4361 tbl v2.16b, {v29.16b}, v2.16b 4362 eor v0.16b, v0.16b, v2.16b 4363 4364 mov w7,v0.s[0] 4365 eor w6,w7,w7,ror #32-2 4366 eor w6,w6,w7,ror #32-10 4367 eor w6,w6,w7,ror #32-18 4368 eor w6,w6,w7,ror #32-24 4369 eor w15,w15,w6 4370 subs w11,w11,#1 4371 b.ne 10b 4372 mov v4.s[0],w15 4373 mov v4.s[1],w14 4374 mov v4.s[2],w13 4375 mov v4.s[3],w12 4376 #ifndef __AARCH64EB__ 4377 rev32 v4.16b,v4.16b 4378 #endif 4379 eor v4.16b, v4.16b, v17.16b 4380 st1 {v4.4s},[x1],#16 4381 4382 sub x26,x1,16 4383 .loop: 4384 subs x29,x29,1 4385 ldrb w7,[x26,x29] 4386 ldrb w8,[x0,x29] 4387 strb w8,[x26,x29] 4388 strb w7,[x1,x29] 4389 b.gt .loop 4390 ld1 {v4.4s}, [x26] 4391 eor v4.16b, v4.16b, v18.16b 4392 #ifndef __AARCH64EB__ 4393 rev32 v4.16b,v4.16b 4394 #endif 4395 mov x10,x3 4396 mov w11,#8 4397 mov w12,v4.s[0] 4398 mov w13,v4.s[1] 4399 mov w14,v4.s[2] 4400 mov w15,v4.s[3] 4401 10: 4402 ldp w7,w8,[x10],8 4403 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 4404 eor w6,w14,w15 4405 eor w9,w7,w13 4406 eor w6,w6,w9 4407 mov v3.s[0],w6 4408 // optimize sbox using AESE instruction 4409 tbl v0.16b, {v3.16b}, v26.16b 4410 ushr v2.16b, v0.16b, 4 4411 and v0.16b, v0.16b, v31.16b 4412 tbl v0.16b, {v28.16b}, v0.16b 4413 tbl v2.16b, {v27.16b}, v2.16b 4414 eor v0.16b, v0.16b, v2.16b 4415 eor v1.16b, v1.16b, v1.16b 4416 aese v0.16b,v1.16b 4417 ushr v2.16b, v0.16b, 4 4418 and v0.16b, v0.16b, v31.16b 4419 tbl v0.16b, {v30.16b}, v0.16b 4420 tbl v2.16b, {v29.16b}, v2.16b 4421 eor v0.16b, v0.16b, v2.16b 4422 4423 mov w7,v0.s[0] 4424 eor w6,w7,w7,ror #32-2 4425 eor w6,w6,w7,ror #32-10 4426 eor w6,w6,w7,ror #32-18 4427 eor w6,w6,w7,ror #32-24 4428 eor w12,w12,w6 4429 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4430 eor w6,w14,w15 4431 eor w9,w12,w8 4432 eor w6,w6,w9 4433 mov v3.s[0],w6 4434 // optimize sbox using AESE instruction 4435 tbl v0.16b, {v3.16b}, v26.16b 4436 ushr v2.16b, v0.16b, 4 4437 and v0.16b, v0.16b, v31.16b 4438 tbl v0.16b, {v28.16b}, v0.16b 4439 tbl v2.16b, {v27.16b}, v2.16b 4440 eor v0.16b, v0.16b, v2.16b 4441 eor v1.16b, v1.16b, v1.16b 4442 aese v0.16b,v1.16b 4443 ushr v2.16b, v0.16b, 4 4444 and v0.16b, v0.16b, v31.16b 4445 tbl v0.16b, {v30.16b}, v0.16b 4446 tbl v2.16b, {v29.16b}, v2.16b 4447 eor v0.16b, v0.16b, v2.16b 4448 4449 mov w7,v0.s[0] 4450 eor w6,w7,w7,ror #32-2 4451 eor w6,w6,w7,ror #32-10 4452 eor w6,w6,w7,ror #32-18 4453 eor w6,w6,w7,ror #32-24 4454 ldp w7,w8,[x10],8 4455 eor w13,w13,w6 4456 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4457 eor w6,w12,w13 4458 eor w9,w7,w15 4459 eor w6,w6,w9 4460 mov v3.s[0],w6 4461 // optimize sbox using AESE instruction 4462 tbl v0.16b, {v3.16b}, v26.16b 4463 ushr v2.16b, v0.16b, 4 4464 and v0.16b, v0.16b, v31.16b 4465 tbl v0.16b, {v28.16b}, v0.16b 4466 tbl v2.16b, {v27.16b}, v2.16b 4467 eor v0.16b, v0.16b, v2.16b 4468 eor v1.16b, v1.16b, v1.16b 4469 aese v0.16b,v1.16b 4470 ushr v2.16b, v0.16b, 4 4471 and v0.16b, v0.16b, v31.16b 4472 tbl v0.16b, {v30.16b}, v0.16b 4473 tbl v2.16b, {v29.16b}, v2.16b 4474 eor v0.16b, v0.16b, v2.16b 4475 4476 mov w7,v0.s[0] 4477 eor w6,w7,w7,ror #32-2 4478 eor w6,w6,w7,ror #32-10 4479 eor w6,w6,w7,ror #32-18 4480 eor w6,w6,w7,ror #32-24 4481 eor w14,w14,w6 4482 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4483 eor w6,w12,w13 4484 eor w9,w14,w8 4485 eor w6,w6,w9 4486 mov v3.s[0],w6 4487 // optimize sbox using AESE instruction 4488 tbl v0.16b, {v3.16b}, v26.16b 4489 ushr v2.16b, v0.16b, 4 4490 and v0.16b, v0.16b, v31.16b 4491 tbl v0.16b, {v28.16b}, v0.16b 4492 tbl v2.16b, {v27.16b}, v2.16b 4493 eor v0.16b, v0.16b, v2.16b 4494 eor v1.16b, v1.16b, v1.16b 4495 aese v0.16b,v1.16b 4496 ushr v2.16b, v0.16b, 4 4497 and v0.16b, v0.16b, v31.16b 4498 tbl v0.16b, {v30.16b}, v0.16b 4499 tbl v2.16b, {v29.16b}, v2.16b 4500 eor v0.16b, v0.16b, v2.16b 4501 4502 mov w7,v0.s[0] 4503 eor w6,w7,w7,ror #32-2 4504 eor w6,w6,w7,ror #32-10 4505 eor w6,w6,w7,ror #32-18 4506 eor w6,w6,w7,ror #32-24 4507 eor w15,w15,w6 4508 subs w11,w11,#1 4509 b.ne 10b 4510 mov v4.s[0],w15 4511 mov v4.s[1],w14 4512 mov v4.s[2],w13 4513 mov v4.s[3],w12 4514 #ifndef __AARCH64EB__ 4515 rev32 v4.16b,v4.16b 4516 #endif 4517 eor v4.16b, v4.16b, v18.16b 4518 st1 {v4.4s}, [x26] 4519 .return: 4520 ldp d14, d15, [sp], #0x10 4521 ldp d12, d13, [sp], #0x10 4522 ldp d10, d11, [sp], #0x10 4523 ldp d8, d9, [sp], #0x10 4524 ldp x29, x30, [sp], #0x10 4525 ldp x27, x28, [sp], #0x10 4526 ldp x25, x26, [sp], #0x10 4527 ldp x23, x24, [sp], #0x10 4528 ldp x21, x22, [sp], #0x10 4529 ldp x19, x20, [sp], #0x10 4530 ldp x17, x18, [sp], #0x10 4531 ldp x15, x16, [sp], #0x10 4532 AARCH64_VALIDATE_LINK_REGISTER 4533 ret 4534 .size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt 4535