// Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD on aarch64
//
// Feb 2022
//

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
.arch	armv8-a
.text

// ---------------------------------------------------------------------
// Read-only constants shared by every routine in this module.
// All code below loads the 256-byte S-box into v16-v31 (four 64-byte
// groups) and performs S-box lookups with tbl/tbx on 64-entry slices.
// ---------------------------------------------------------------------
.section .rodata
.type	_vpsm4_consts,%object
.align	7
_vpsm4_consts:
.Lsbox:
	// SM4 S-box, 256 bytes.
	.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
	.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
	.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
	.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
	.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
	.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
	.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
	.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
	.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
	.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
	.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
	.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
	.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
	.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
	.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
	.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
	// CK round constants consumed one word per round by _vpsm4_set_key.
	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
	// FK whitening constants XORed into the user key before scheduling.
	.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
	// tbl byte pattern that word-rotates the key-state vector each round.
	.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
	// XTS tweak constant; not referenced in this portion of the file.
	.quad 0x0101010101010187,0x0101010101010101

.size	_vpsm4_consts,.-_vpsm4_consts

.previous

// ---------------------------------------------------------------------
// _vpsm4_set_key: expand a 128-bit user key into 32 round-key words.
//   x0 = user key (16 bytes)
//   x1 = round-key output array
//   w2 = direction flag: non-zero -> store keys forward (encryption);
//        zero -> start at the last slot and store backwards (decryption).
// Clobbers x5-x8, v0-v1, v4-v7, v16-v31; not an external call target
// beyond the module's own wrappers.
// ---------------------------------------------------------------------
.type	_vpsm4_set_key,%function
.align	4
_vpsm4_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{v5.4s},[x0]			// v5 = user key words
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b			// big-endian word load on LE hosts
#endif
	adrp	x5,.Lshuffles
	add	x5,x5,#:lo12:.Lshuffles
	ld1	{v7.2d},[x5]			// v7 = per-round rotate pattern
	adrp	x5,.Lfk
	add	x5,x5,#:lo12:.Lfk
	ld1	{v6.2d},[x5]
	eor	v5.16b,v5.16b,v6.16b		// key ^= FK
	mov	x6,#32				// 32 rounds
	adrp	x5,.Lck
	add	x5,x5,#:lo12:.Lck
	movi	v0.16b,#64			// S-box slice offset for tbx
	cbnz	w2,1f
	add	x1,x1,124			// decrypt: point at last round-key slot
1:
	mov	w7,v5.s[1]
	ldr	w8,[x5],#4			// w8 = CK[i]
	eor	w8,w8,w7
	mov	w7,v5.s[2]
	eor	w8,w8,w7
	mov	w7,v5.s[3]
	eor	w8,w8,w7			// w8 = k1^k2^k3^CK[i]
	// sbox lookup
	mov	v4.s[0],w8
	tbl	v1.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v4.16b
	sub	v4.16b,v4.16b,v0.16b
	tbx	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v4.16b
	sub	v4.16b,v4.16b,v0.16b
	tbx	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v4.16b
	sub	v4.16b,v4.16b,v0.16b
	tbx	v1.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v4.16b
	mov	w7,v1.s[0]
	// key-schedule linear transform L': x ^ (x <<< 13) ^ (x <<< 23)
	eor	w8,w7,w7,ror #19
	eor	w8,w8,w7,ror #9
	mov	w7,v5.s[0]
	eor	w8,w8,w7			// new round key
	mov	v5.s[0],w8
	cbz	w2,2f
	str	w8,[x1],#4			// encrypt: store forwards
	b	3f
2:
	str	w8,[x1],#-4			// decrypt: store backwards
3:
	tbl	v5.16b,{v5.16b},v7.16b		// rotate key state one word
	subs	x6,x6,#1
	b.ne	1b
	ret
.size	_vpsm4_set_key,.-_vpsm4_set_key

// ---------------------------------------------------------------------
// _vpsm4_enc_4blks: run 32 SM4 rounds on four blocks held column-wise in
// v4-v7 (v4 = word 0 of all four blocks, etc.).
//   x3 = round-key array; results land byte-swapped in v0-v3.
// Uses v12/v13 for broadcast round keys, v14 as a shared XOR term, and
// v16-v31 as the resident S-box.
// ---------------------------------------------------------------------
.type	_vpsm4_enc_4blks,%function
.align	4
_vpsm4_enc_4blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3				// x10 walks the round keys
	mov	w11,#8				// 8 iterations x 4 rounds = 32
10:
	ldp	w7,w8,[x10],8			// RK0, RK1
	dup	v12.4s,w7
	dup	v13.4s,w8

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	v14.16b,v6.16b,v7.16b
	eor	v12.16b,v5.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	// split each byte into four 64-entry S-box slices for tbl
	movi	v0.16b,#64
	movi	v1.16b,#128
	movi	v2.16b,#192
	sub	v0.16b,v12.16b,v0.16b
	sub	v1.16b,v12.16b,v1.16b
	sub	v2.16b,v12.16b,v2.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	// exactly one slice hit per byte, so summing merges the lookups
	add	v0.2d,v0.2d,v1.2d
	add	v2.2d,v2.2d,v12.2d
	add	v12.2d,v0.2d,v2.2d

	// linear transform L: x ^ (x<<<2) ^ (x<<<10) ^ (x<<<18) ^ (x<<<24)
	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v12.4s,32-10
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v12.4s,10
	eor	v1.16b,v2.16b,v1.16b
	ushr	v0.4s,v12.4s,32-18
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v12.4s,32-24
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v12.4s,24
	eor	v12.16b,v2.16b,v1.16b
	eor	v4.16b,v4.16b,v12.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)   (v14 still holds B2^B3)
	eor	v14.16b,v14.16b,v4.16b
	eor	v13.16b,v14.16b,v13.16b
	movi	v0.16b,#64
	movi	v1.16b,#128
	movi	v2.16b,#192
	sub	v0.16b,v13.16b,v0.16b
	sub	v1.16b,v13.16b,v1.16b
	sub	v2.16b,v13.16b,v2.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v0.2d,v0.2d,v1.2d
	add	v2.2d,v2.2d,v13.2d
	add	v13.2d,v0.2d,v2.2d

	ushr	v0.4s,v13.4s,32-2
	sli	v0.4s,v13.4s,2
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v13.16b
	sli	v2.4s,v13.4s,10
	eor	v1.16b,v2.16b,v1.16b
	ushr	v0.4s,v13.4s,32-18
	sli	v0.4s,v13.4s,18
	ushr	v2.4s,v13.4s,32-24
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v1.16b
	ldp	w7,w8,[x10],8			// RK2, RK3 (prefetched early)
	eor	v5.16b,v5.16b,v13.16b

	dup	v12.4s,w7
	dup	v13.4s,w8

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	v14.16b,v4.16b,v5.16b
	eor	v12.16b,v7.16b,v12.16b
	eor	v12.16b,v14.16b,v12.16b
	movi	v0.16b,#64
	movi	v1.16b,#128
	movi	v2.16b,#192
	sub	v0.16b,v12.16b,v0.16b
	sub	v1.16b,v12.16b,v1.16b
	sub	v2.16b,v12.16b,v2.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v0.2d,v0.2d,v1.2d
	add	v2.2d,v2.2d,v12.2d
	add	v12.2d,v0.2d,v2.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v12.4s,32-10
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v12.4s,10
	eor	v1.16b,v2.16b,v1.16b
	ushr	v0.4s,v12.4s,32-18
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v12.4s,32-24
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v12.4s,24
	eor	v12.16b,v2.16b,v1.16b
	eor	v6.16b,v6.16b,v12.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)   (v14 still holds B0^B1)
	eor	v14.16b,v14.16b,v6.16b
	eor	v13.16b,v14.16b,v13.16b
	movi	v0.16b,#64
	movi	v1.16b,#128
	movi	v2.16b,#192
	sub	v0.16b,v13.16b,v0.16b
	sub	v1.16b,v13.16b,v1.16b
	sub	v2.16b,v13.16b,v2.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v0.2d,v0.2d,v1.2d
	add	v2.2d,v2.2d,v13.2d
	add	v13.2d,v0.2d,v2.2d

	ushr	v0.4s,v13.4s,32-2
	sli	v0.4s,v13.4s,2
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v13.16b
	sli	v2.4s,v13.4s,10
	eor	v1.16b,v2.16b,v1.16b
	ushr	v0.4s,v13.4s,32-18
	sli	v0.4s,v13.4s,18
	ushr	v2.4s,v13.4s,32-24
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v1.16b
	eor	v7.16b,v7.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
	// Output with reversed word order (SM4 final swap) into v0-v3,
	// byte-swapping back on little-endian hosts.
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
#else
	mov	v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v2.16b,v5.16b
#else
	mov	v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v1.16b,v6.16b
#else
	mov	v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v0.16b,v7.16b
#else
	mov	v0.16b,v7.16b
#endif
	ret
.size	_vpsm4_enc_4blks,.-_vpsm4_enc_4blks

// ---------------------------------------------------------------------
// _vpsm4_enc_8blks: as _vpsm4_enc_4blks but for eight blocks; the low
// four blocks live column-wise in v4-v7, the high four in v8-v11.
//   x3 = round-key array; results land byte-swapped in v0-v3 / v4-v7.
// NOTE: this function continues past this block boundary (the file was
// re-chunked); the remainder follows immediately below.
// ---------------------------------------------------------------------
.type	_vpsm4_enc_8blks,%function
.align	4
_vpsm4_enc_8blks:
	AARCH64_VALID_CALL_TARGET
	mov	x10,x3
	mov	w11,#8
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	v12.4s,w7
	eor	v14.16b,v6.16b,v7.16b		// low half:  B2^B3
	eor	v15.16b,v10.16b,v11.16b		// high half: B2^B3
	eor	v0.16b,v5.16b,v12.16b
	eor	v1.16b,v9.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b		// low S-box input
	eor	v13.16b,v15.16b,v1.16b		// high S-box input
	movi	v3.16b,#64
	sub	v0.16b,v12.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v12.2d,v2.2d,v12.2d
	add	v12.2d,v1.2d,v12.2d

	sub	v0.16b,v13.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
// (continuation of _vpsm4_enc_8blks: finish the RK0 S-box merge for the
//  high four blocks, then rounds RK1..RK3, loop control and output swap)
	add	v13.2d,v2.2d,v13.2d
	add	v13.2d,v1.2d,v13.2d

	// linear transform L applied to both halves (v12 low, v13 high)
	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v13.4s,32-2
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v13.4s,2

	ushr	v0.4s,v12.4s,32-10
	eor	v3.16b,v2.16b,v13.16b
	sli	v0.4s,v12.4s,10
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,10

	ushr	v0.4s,v12.4s,32-18
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v13.4s,32-18
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,18

	ushr	v0.4s,v12.4s,32-24
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,24
	ushr	v2.4s,v13.4s,32-24
	eor	v12.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v3.16b
	eor	v4.16b,v4.16b,v12.16b
	eor	v8.16b,v8.16b,v13.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)   (v14/v15 carry B2^B3)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v4.16b
	eor	v15.16b,v15.16b,v8.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	movi	v3.16b,#64
	sub	v0.16b,v12.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v12.2d,v2.2d,v12.2d
	add	v12.2d,v1.2d,v12.2d

	sub	v0.16b,v13.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v13.2d,v2.2d,v13.2d
	add	v13.2d,v1.2d,v13.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v13.4s,32-2
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v13.4s,2

	ushr	v0.4s,v12.4s,32-10
	eor	v3.16b,v2.16b,v13.16b
	sli	v0.4s,v12.4s,10
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,10

	ushr	v0.4s,v12.4s,32-18
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v13.4s,32-18
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,18

	ushr	v0.4s,v12.4s,32-24
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,24
	ushr	v2.4s,v13.4s,32-24
	eor	v12.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v3.16b
	ldp	w7,w8,[x10],8			// RK2, RK3
	eor	v5.16b,v5.16b,v12.16b
	eor	v9.16b,v9.16b,v13.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	v12.4s,w7
	eor	v14.16b,v4.16b,v5.16b		// low:  B0^B1
	eor	v15.16b,v8.16b,v9.16b		// high: B0^B1
	eor	v0.16b,v7.16b,v12.16b
	eor	v1.16b,v11.16b,v12.16b
	eor	v12.16b,v14.16b,v0.16b
	eor	v13.16b,v15.16b,v1.16b
	movi	v3.16b,#64
	sub	v0.16b,v12.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v12.2d,v2.2d,v12.2d
	add	v12.2d,v1.2d,v12.2d

	sub	v0.16b,v13.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v13.2d,v2.2d,v13.2d
	add	v13.2d,v1.2d,v13.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v13.4s,32-2
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v13.4s,2

	ushr	v0.4s,v12.4s,32-10
	eor	v3.16b,v2.16b,v13.16b
	sli	v0.4s,v12.4s,10
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,10

	ushr	v0.4s,v12.4s,32-18
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v13.4s,32-18
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,18

	ushr	v0.4s,v12.4s,32-24
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,24
	ushr	v2.4s,v13.4s,32-24
	eor	v12.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v3.16b
	eor	v6.16b,v6.16b,v12.16b
	eor	v10.16b,v10.16b,v13.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)   (v14/v15 carry B0^B1)
	dup	v13.4s,w8
	eor	v14.16b,v14.16b,v6.16b
	eor	v15.16b,v15.16b,v10.16b
	eor	v12.16b,v14.16b,v13.16b
	eor	v13.16b,v15.16b,v13.16b
	movi	v3.16b,#64
	sub	v0.16b,v12.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v12.2d,v2.2d,v12.2d
	add	v12.2d,v1.2d,v12.2d

	sub	v0.16b,v13.16b,v3.16b
	sub	v1.16b,v0.16b,v3.16b
	sub	v2.16b,v1.16b,v3.16b
	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add	v1.2d,v0.2d,v1.2d
	add	v13.2d,v2.2d,v13.2d
	add	v13.2d,v1.2d,v13.2d

	ushr	v0.4s,v12.4s,32-2
	sli	v0.4s,v12.4s,2
	ushr	v2.4s,v13.4s,32-2
	eor	v1.16b,v0.16b,v12.16b
	sli	v2.4s,v13.4s,2

	ushr	v0.4s,v12.4s,32-10
	eor	v3.16b,v2.16b,v13.16b
	sli	v0.4s,v12.4s,10
	ushr	v2.4s,v13.4s,32-10
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,10

	ushr	v0.4s,v12.4s,32-18
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,18
	ushr	v2.4s,v13.4s,32-18
	eor	v1.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,18

	ushr	v0.4s,v12.4s,32-24
	eor	v3.16b,v2.16b,v3.16b
	sli	v0.4s,v12.4s,24
	ushr	v2.4s,v13.4s,32-24
	eor	v12.16b,v0.16b,v1.16b
	sli	v2.4s,v13.4s,24
	eor	v13.16b,v2.16b,v3.16b
	eor	v7.16b,v7.16b,v12.16b
	eor	v11.16b,v11.16b,v13.16b
	subs	w11,w11,#1
	b.ne	10b
	// Final word swap + byte swap on LE: low half to v0-v3, high half
	// (v8-v11) back into v4-v7.
#ifndef __AARCH64EB__
	rev32	v3.16b,v4.16b
#else
	mov	v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v2.16b,v5.16b
#else
	mov	v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v1.16b,v6.16b
#else
	mov	v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v0.16b,v7.16b
#else
	mov	v0.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v8.16b
#else
	mov	v7.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v9.16b
#else
	mov	v6.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v10.16b
#else
	mov	v5.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v4.16b,v11.16b
#else
	mov	v4.16b,v11.16b
#endif
	ret
.size	_vpsm4_enc_8blks,.-_vpsm4_enc_8blks

// ---------------------------------------------------------------------
// vpsm4_set_encrypt_key(const u8 *userKey, SM4_KEY *key)
//   Thin wrapper: w2=1 selects forward round-key order in _vpsm4_set_key.
// ---------------------------------------------------------------------
.globl	vpsm4_set_encrypt_key
.type	vpsm4_set_encrypt_key,%function
.align	5
vpsm4_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_vpsm4_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_set_encrypt_key,.-vpsm4_set_encrypt_key

// ---------------------------------------------------------------------
// vpsm4_set_decrypt_key(const u8 *userKey, SM4_KEY *key)
//   Thin wrapper: w2=0 makes _vpsm4_set_key store keys in reverse order.
//   (Body continues in the next chunk of this re-assembled file.)
// ---------------------------------------------------------------------
.globl	vpsm4_set_decrypt_key
.type	vpsm4_set_decrypt_key,%function
.align	5
vpsm4_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
// (continuation of vpsm4_set_decrypt_key)
	mov	w2,0				// 0 -> reversed round-key order
	bl	_vpsm4_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_set_decrypt_key,.-vpsm4_set_decrypt_key

// ---------------------------------------------------------------------
// vpsm4_encrypt: encrypt a single 16-byte block.
//   x0 = input block, x1 = output block, x2 = round-key array
// The four state words are kept in w12-w15; each round does one scalar
// S-box lookup through the vector tables (one word in lane 0 of v0-v3).
// ---------------------------------------------------------------------
.globl	vpsm4_encrypt
.type	vpsm4_encrypt,%function
.align	5
vpsm4_encrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x3,x2				// x3 = key schedule
	mov	x10,x3				// x10 walks the round keys
	mov	w11,#8				// 8 x 4 rounds
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8			// RK0, RK1
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	// index each of the four 64-byte S-box slices
	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	// merge the slice results (only one is non-zero per byte)
	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	// linear transform L via rotate-right by (32 - k)
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8			// RK2, RK3
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// final reverse of the four words, then store
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_encrypt,.-vpsm4_encrypt

// ---------------------------------------------------------------------
// vpsm4_decrypt: decrypt a single 16-byte block.
//   x0 = input block, x1 = output block, x2 = round-key array
// Body is identical to vpsm4_encrypt; decryption direction comes purely
// from the reversed round-key order written by vpsm4_set_decrypt_key.
// ---------------------------------------------------------------------
.globl	vpsm4_decrypt
.type	vpsm4_decrypt,%function
.align	5
vpsm4_decrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{v4.4s},[x0]
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x3,x2
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	ret
.size	vpsm4_decrypt,.-vpsm4_decrypt
// ---------------------------------------------------------------------
// vpsm4_ecb_encrypt: ECB mode over a multiple of 16 bytes.
//   x0 = in, x1 = out, x2 = length in bytes, x3 = round-key array
// Processes 8 blocks, then 4, then 1-3 leftover blocks.  d8-d15 are
// saved because _vpsm4_enc_8blks uses v8-v15 (low halves callee-saved).
// ---------------------------------------------------------------------
.globl	vpsm4_ecb_encrypt
.type	vpsm4_ecb_encrypt,%function
.align	5
vpsm4_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
.Lecb_8_blocks_process:
	cmp	w2,#8
	b.lt	.Lecb_4_blocks_process
	// ld4 de-interleaves so each vector holds one word column of 4 blocks
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	bl	_vpsm4_enc_8blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	w2,w2,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	w2,#4
	b.lt	1f
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_enc_4blks
	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	w2,w2,#4
1:
	// process last block
	cmp	w2,#1
	b.lt	100f
	b.gt	1f
	// exactly one block left: inline scalar single-block encrypt
	ld1	{v4.4s},[x0]
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	st1	{v4.4s},[x1]
	b	100f
1:	// process last 2 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
	cmp	w2,#2
	b.gt	1f
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1]
	b	100f
1:	// process last 3 blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	bl	_vpsm4_enc_4blks
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ecb_encrypt,.-vpsm4_ecb_encrypt

// ---------------------------------------------------------------------
// vpsm4_cbc_encrypt: CBC mode.
//   x0 = in, x1 = out, x2 = length in bytes, x3 = round-key array,
//   x4 = IV, w5 = enc flag (zero branches to .Ldec, defined beyond this
//   chunk).  The IV is kept in v3 only until the first block is chained;
//   the round loop then reuses v3 as scratch.
// NOTE: this function is truncated at the end of this chunk (mid-
// instruction); the remainder continues in the next chunk of the file.
// ---------------------------------------------------------------------
.globl	vpsm4_cbc_encrypt
.type	vpsm4_cbc_encrypt,%function
.align	5
vpsm4_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	x2,x2,4				// length -> block count
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
	cbz	w5,.Ldec
	ld1	{v3.4s},[x4]			// v3 = IV
.Lcbc_4_blocks_enc:
	cmp	w2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor	v4.16b,v4.16b,v3.16b		// chain IV into first block
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	// encrypt block 0 (scalar rounds, state in w12-w15)
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
	// chain ciphertext 0 into block 1 and encrypt it
	eor	v5.16b,v5.16b,v4.16b
	mov	x10,x3
	mov	w11,#8
	mov	w12,v5.s[0]
	mov	w13,v5.s[1]
	mov	w14,v5.s[2]
	mov	w15,v5.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	// (truncated at chunk boundary; instruction continues in next chunk)
	eor
w12,w12,w6 1407 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1408 eor w6,w14,w15 1409 eor w9,w12,w8 1410 eor w6,w6,w9 1411 movi v1.16b,#64 1412 movi v2.16b,#128 1413 movi v3.16b,#192 1414 mov v0.s[0],w6 1415 1416 sub v1.16b,v0.16b,v1.16b 1417 sub v2.16b,v0.16b,v2.16b 1418 sub v3.16b,v0.16b,v3.16b 1419 1420 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1421 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1422 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1423 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1424 1425 mov w6,v0.s[0] 1426 mov w7,v1.s[0] 1427 mov w9,v2.s[0] 1428 add w7,w6,w7 1429 mov w6,v3.s[0] 1430 add w7,w7,w9 1431 add w7,w7,w6 1432 1433 eor w6,w7,w7,ror #32-2 1434 eor w6,w6,w7,ror #32-10 1435 eor w6,w6,w7,ror #32-18 1436 eor w6,w6,w7,ror #32-24 1437 ldp w7,w8,[x10],8 1438 eor w13,w13,w6 1439 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1440 eor w6,w12,w13 1441 eor w9,w7,w15 1442 eor w6,w6,w9 1443 movi v1.16b,#64 1444 movi v2.16b,#128 1445 movi v3.16b,#192 1446 mov v0.s[0],w6 1447 1448 sub v1.16b,v0.16b,v1.16b 1449 sub v2.16b,v0.16b,v2.16b 1450 sub v3.16b,v0.16b,v3.16b 1451 1452 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1453 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1454 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1455 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1456 1457 mov w6,v0.s[0] 1458 mov w7,v1.s[0] 1459 mov w9,v2.s[0] 1460 add w7,w6,w7 1461 mov w6,v3.s[0] 1462 add w7,w7,w9 1463 add w7,w7,w6 1464 1465 eor w6,w7,w7,ror #32-2 1466 eor w6,w6,w7,ror #32-10 1467 eor w6,w6,w7,ror #32-18 1468 eor w6,w6,w7,ror #32-24 1469 eor w14,w14,w6 1470 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1471 eor w6,w12,w13 1472 eor w9,w14,w8 1473 eor w6,w6,w9 1474 movi v1.16b,#64 1475 movi v2.16b,#128 1476 movi v3.16b,#192 1477 mov v0.s[0],w6 1478 1479 sub v1.16b,v0.16b,v1.16b 1480 sub v2.16b,v0.16b,v2.16b 1481 sub v3.16b,v0.16b,v3.16b 1482 1483 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1484 tbl 
v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1485 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1486 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1487 1488 mov w6,v0.s[0] 1489 mov w7,v1.s[0] 1490 mov w9,v2.s[0] 1491 add w7,w6,w7 1492 mov w6,v3.s[0] 1493 add w7,w7,w9 1494 add w7,w7,w6 1495 1496 eor w6,w7,w7,ror #32-2 1497 eor w6,w6,w7,ror #32-10 1498 eor w6,w6,w7,ror #32-18 1499 eor w6,w6,w7,ror #32-24 1500 eor w15,w15,w6 1501 subs w11,w11,#1 1502 b.ne 10b 1503 mov v5.s[0],w15 1504 mov v5.s[1],w14 1505 mov v5.s[2],w13 1506 mov v5.s[3],w12 1507 #ifndef __AARCH64EB__ 1508 rev32 v4.16b,v4.16b 1509 #endif 1510 eor v6.16b,v6.16b,v5.16b 1511 mov x10,x3 1512 mov w11,#8 1513 mov w12,v6.s[0] 1514 mov w13,v6.s[1] 1515 mov w14,v6.s[2] 1516 mov w15,v6.s[3] 1517 10: 1518 ldp w7,w8,[x10],8 1519 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1520 eor w6,w14,w15 1521 eor w9,w7,w13 1522 eor w6,w6,w9 1523 movi v1.16b,#64 1524 movi v2.16b,#128 1525 movi v3.16b,#192 1526 mov v0.s[0],w6 1527 1528 sub v1.16b,v0.16b,v1.16b 1529 sub v2.16b,v0.16b,v2.16b 1530 sub v3.16b,v0.16b,v3.16b 1531 1532 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1533 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1534 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1535 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1536 1537 mov w6,v0.s[0] 1538 mov w7,v1.s[0] 1539 mov w9,v2.s[0] 1540 add w7,w6,w7 1541 mov w6,v3.s[0] 1542 add w7,w7,w9 1543 add w7,w7,w6 1544 1545 eor w6,w7,w7,ror #32-2 1546 eor w6,w6,w7,ror #32-10 1547 eor w6,w6,w7,ror #32-18 1548 eor w6,w6,w7,ror #32-24 1549 eor w12,w12,w6 1550 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1551 eor w6,w14,w15 1552 eor w9,w12,w8 1553 eor w6,w6,w9 1554 movi v1.16b,#64 1555 movi v2.16b,#128 1556 movi v3.16b,#192 1557 mov v0.s[0],w6 1558 1559 sub v1.16b,v0.16b,v1.16b 1560 sub v2.16b,v0.16b,v2.16b 1561 sub v3.16b,v0.16b,v3.16b 1562 1563 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1564 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1565 tbl 
v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1566 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1567 1568 mov w6,v0.s[0] 1569 mov w7,v1.s[0] 1570 mov w9,v2.s[0] 1571 add w7,w6,w7 1572 mov w6,v3.s[0] 1573 add w7,w7,w9 1574 add w7,w7,w6 1575 1576 eor w6,w7,w7,ror #32-2 1577 eor w6,w6,w7,ror #32-10 1578 eor w6,w6,w7,ror #32-18 1579 eor w6,w6,w7,ror #32-24 1580 ldp w7,w8,[x10],8 1581 eor w13,w13,w6 1582 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1583 eor w6,w12,w13 1584 eor w9,w7,w15 1585 eor w6,w6,w9 1586 movi v1.16b,#64 1587 movi v2.16b,#128 1588 movi v3.16b,#192 1589 mov v0.s[0],w6 1590 1591 sub v1.16b,v0.16b,v1.16b 1592 sub v2.16b,v0.16b,v2.16b 1593 sub v3.16b,v0.16b,v3.16b 1594 1595 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1596 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1597 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1598 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1599 1600 mov w6,v0.s[0] 1601 mov w7,v1.s[0] 1602 mov w9,v2.s[0] 1603 add w7,w6,w7 1604 mov w6,v3.s[0] 1605 add w7,w7,w9 1606 add w7,w7,w6 1607 1608 eor w6,w7,w7,ror #32-2 1609 eor w6,w6,w7,ror #32-10 1610 eor w6,w6,w7,ror #32-18 1611 eor w6,w6,w7,ror #32-24 1612 eor w14,w14,w6 1613 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1614 eor w6,w12,w13 1615 eor w9,w14,w8 1616 eor w6,w6,w9 1617 movi v1.16b,#64 1618 movi v2.16b,#128 1619 movi v3.16b,#192 1620 mov v0.s[0],w6 1621 1622 sub v1.16b,v0.16b,v1.16b 1623 sub v2.16b,v0.16b,v2.16b 1624 sub v3.16b,v0.16b,v3.16b 1625 1626 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1627 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1628 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1629 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1630 1631 mov w6,v0.s[0] 1632 mov w7,v1.s[0] 1633 mov w9,v2.s[0] 1634 add w7,w6,w7 1635 mov w6,v3.s[0] 1636 add w7,w7,w9 1637 add w7,w7,w6 1638 1639 eor w6,w7,w7,ror #32-2 1640 eor w6,w6,w7,ror #32-10 1641 eor w6,w6,w7,ror #32-18 1642 eor w6,w6,w7,ror #32-24 1643 eor w15,w15,w6 1644 subs 
w11,w11,#1 1645 b.ne 10b 1646 mov v6.s[0],w15 1647 mov v6.s[1],w14 1648 mov v6.s[2],w13 1649 mov v6.s[3],w12 1650 #ifndef __AARCH64EB__ 1651 rev32 v5.16b,v5.16b 1652 #endif 1653 eor v7.16b,v7.16b,v6.16b 1654 mov x10,x3 1655 mov w11,#8 1656 mov w12,v7.s[0] 1657 mov w13,v7.s[1] 1658 mov w14,v7.s[2] 1659 mov w15,v7.s[3] 1660 10: 1661 ldp w7,w8,[x10],8 1662 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1663 eor w6,w14,w15 1664 eor w9,w7,w13 1665 eor w6,w6,w9 1666 movi v1.16b,#64 1667 movi v2.16b,#128 1668 movi v3.16b,#192 1669 mov v0.s[0],w6 1670 1671 sub v1.16b,v0.16b,v1.16b 1672 sub v2.16b,v0.16b,v2.16b 1673 sub v3.16b,v0.16b,v3.16b 1674 1675 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1676 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1677 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1678 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1679 1680 mov w6,v0.s[0] 1681 mov w7,v1.s[0] 1682 mov w9,v2.s[0] 1683 add w7,w6,w7 1684 mov w6,v3.s[0] 1685 add w7,w7,w9 1686 add w7,w7,w6 1687 1688 eor w6,w7,w7,ror #32-2 1689 eor w6,w6,w7,ror #32-10 1690 eor w6,w6,w7,ror #32-18 1691 eor w6,w6,w7,ror #32-24 1692 eor w12,w12,w6 1693 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1694 eor w6,w14,w15 1695 eor w9,w12,w8 1696 eor w6,w6,w9 1697 movi v1.16b,#64 1698 movi v2.16b,#128 1699 movi v3.16b,#192 1700 mov v0.s[0],w6 1701 1702 sub v1.16b,v0.16b,v1.16b 1703 sub v2.16b,v0.16b,v2.16b 1704 sub v3.16b,v0.16b,v3.16b 1705 1706 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1707 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1708 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1709 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1710 1711 mov w6,v0.s[0] 1712 mov w7,v1.s[0] 1713 mov w9,v2.s[0] 1714 add w7,w6,w7 1715 mov w6,v3.s[0] 1716 add w7,w7,w9 1717 add w7,w7,w6 1718 1719 eor w6,w7,w7,ror #32-2 1720 eor w6,w6,w7,ror #32-10 1721 eor w6,w6,w7,ror #32-18 1722 eor w6,w6,w7,ror #32-24 1723 ldp w7,w8,[x10],8 1724 eor w13,w13,w6 1725 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1726 eor 
w6,w12,w13 1727 eor w9,w7,w15 1728 eor w6,w6,w9 1729 movi v1.16b,#64 1730 movi v2.16b,#128 1731 movi v3.16b,#192 1732 mov v0.s[0],w6 1733 1734 sub v1.16b,v0.16b,v1.16b 1735 sub v2.16b,v0.16b,v2.16b 1736 sub v3.16b,v0.16b,v3.16b 1737 1738 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1739 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1740 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1741 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1742 1743 mov w6,v0.s[0] 1744 mov w7,v1.s[0] 1745 mov w9,v2.s[0] 1746 add w7,w6,w7 1747 mov w6,v3.s[0] 1748 add w7,w7,w9 1749 add w7,w7,w6 1750 1751 eor w6,w7,w7,ror #32-2 1752 eor w6,w6,w7,ror #32-10 1753 eor w6,w6,w7,ror #32-18 1754 eor w6,w6,w7,ror #32-24 1755 eor w14,w14,w6 1756 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1757 eor w6,w12,w13 1758 eor w9,w14,w8 1759 eor w6,w6,w9 1760 movi v1.16b,#64 1761 movi v2.16b,#128 1762 movi v3.16b,#192 1763 mov v0.s[0],w6 1764 1765 sub v1.16b,v0.16b,v1.16b 1766 sub v2.16b,v0.16b,v2.16b 1767 sub v3.16b,v0.16b,v3.16b 1768 1769 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1770 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1771 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1772 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1773 1774 mov w6,v0.s[0] 1775 mov w7,v1.s[0] 1776 mov w9,v2.s[0] 1777 add w7,w6,w7 1778 mov w6,v3.s[0] 1779 add w7,w7,w9 1780 add w7,w7,w6 1781 1782 eor w6,w7,w7,ror #32-2 1783 eor w6,w6,w7,ror #32-10 1784 eor w6,w6,w7,ror #32-18 1785 eor w6,w6,w7,ror #32-24 1786 eor w15,w15,w6 1787 subs w11,w11,#1 1788 b.ne 10b 1789 mov v7.s[0],w15 1790 mov v7.s[1],w14 1791 mov v7.s[2],w13 1792 mov v7.s[3],w12 1793 #ifndef __AARCH64EB__ 1794 rev32 v6.16b,v6.16b 1795 #endif 1796 #ifndef __AARCH64EB__ 1797 rev32 v7.16b,v7.16b 1798 #endif 1799 orr v3.16b,v7.16b,v7.16b 1800 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 1801 subs w2,w2,#4 1802 b.ne .Lcbc_4_blocks_enc 1803 b 2f 1804 1: 1805 subs w2,w2,#1 1806 b.lt 2f 1807 ld1 {v4.4s},[x0],#16 1808 eor 
v3.16b,v3.16b,v4.16b 1809 #ifndef __AARCH64EB__ 1810 rev32 v3.16b,v3.16b 1811 #endif 1812 mov x10,x3 1813 mov w11,#8 1814 mov w12,v3.s[0] 1815 mov w13,v3.s[1] 1816 mov w14,v3.s[2] 1817 mov w15,v3.s[3] 1818 10: 1819 ldp w7,w8,[x10],8 1820 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1821 eor w6,w14,w15 1822 eor w9,w7,w13 1823 eor w6,w6,w9 1824 movi v1.16b,#64 1825 movi v2.16b,#128 1826 movi v3.16b,#192 1827 mov v0.s[0],w6 1828 1829 sub v1.16b,v0.16b,v1.16b 1830 sub v2.16b,v0.16b,v2.16b 1831 sub v3.16b,v0.16b,v3.16b 1832 1833 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1834 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1835 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1836 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1837 1838 mov w6,v0.s[0] 1839 mov w7,v1.s[0] 1840 mov w9,v2.s[0] 1841 add w7,w6,w7 1842 mov w6,v3.s[0] 1843 add w7,w7,w9 1844 add w7,w7,w6 1845 1846 eor w6,w7,w7,ror #32-2 1847 eor w6,w6,w7,ror #32-10 1848 eor w6,w6,w7,ror #32-18 1849 eor w6,w6,w7,ror #32-24 1850 eor w12,w12,w6 1851 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1852 eor w6,w14,w15 1853 eor w9,w12,w8 1854 eor w6,w6,w9 1855 movi v1.16b,#64 1856 movi v2.16b,#128 1857 movi v3.16b,#192 1858 mov v0.s[0],w6 1859 1860 sub v1.16b,v0.16b,v1.16b 1861 sub v2.16b,v0.16b,v2.16b 1862 sub v3.16b,v0.16b,v3.16b 1863 1864 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1865 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1866 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1867 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1868 1869 mov w6,v0.s[0] 1870 mov w7,v1.s[0] 1871 mov w9,v2.s[0] 1872 add w7,w6,w7 1873 mov w6,v3.s[0] 1874 add w7,w7,w9 1875 add w7,w7,w6 1876 1877 eor w6,w7,w7,ror #32-2 1878 eor w6,w6,w7,ror #32-10 1879 eor w6,w6,w7,ror #32-18 1880 eor w6,w6,w7,ror #32-24 1881 ldp w7,w8,[x10],8 1882 eor w13,w13,w6 1883 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1884 eor w6,w12,w13 1885 eor w9,w7,w15 1886 eor w6,w6,w9 1887 movi v1.16b,#64 1888 movi v2.16b,#128 1889 movi v3.16b,#192 1890 
mov v0.s[0],w6 1891 1892 sub v1.16b,v0.16b,v1.16b 1893 sub v2.16b,v0.16b,v2.16b 1894 sub v3.16b,v0.16b,v3.16b 1895 1896 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1897 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1898 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1899 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1900 1901 mov w6,v0.s[0] 1902 mov w7,v1.s[0] 1903 mov w9,v2.s[0] 1904 add w7,w6,w7 1905 mov w6,v3.s[0] 1906 add w7,w7,w9 1907 add w7,w7,w6 1908 1909 eor w6,w7,w7,ror #32-2 1910 eor w6,w6,w7,ror #32-10 1911 eor w6,w6,w7,ror #32-18 1912 eor w6,w6,w7,ror #32-24 1913 eor w14,w14,w6 1914 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1915 eor w6,w12,w13 1916 eor w9,w14,w8 1917 eor w6,w6,w9 1918 movi v1.16b,#64 1919 movi v2.16b,#128 1920 movi v3.16b,#192 1921 mov v0.s[0],w6 1922 1923 sub v1.16b,v0.16b,v1.16b 1924 sub v2.16b,v0.16b,v2.16b 1925 sub v3.16b,v0.16b,v3.16b 1926 1927 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 1928 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 1929 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 1930 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 1931 1932 mov w6,v0.s[0] 1933 mov w7,v1.s[0] 1934 mov w9,v2.s[0] 1935 add w7,w6,w7 1936 mov w6,v3.s[0] 1937 add w7,w7,w9 1938 add w7,w7,w6 1939 1940 eor w6,w7,w7,ror #32-2 1941 eor w6,w6,w7,ror #32-10 1942 eor w6,w6,w7,ror #32-18 1943 eor w6,w6,w7,ror #32-24 1944 eor w15,w15,w6 1945 subs w11,w11,#1 1946 b.ne 10b 1947 mov v3.s[0],w15 1948 mov v3.s[1],w14 1949 mov v3.s[2],w13 1950 mov v3.s[3],w12 1951 #ifndef __AARCH64EB__ 1952 rev32 v3.16b,v3.16b 1953 #endif 1954 st1 {v3.4s},[x1],#16 1955 b 1b 1956 2: 1957 // save back IV 1958 st1 {v3.4s},[x4] 1959 ret 1960 1961 .Ldec: 1962 // decryption mode starts 1963 AARCH64_SIGN_LINK_REGISTER 1964 stp d8,d9,[sp,#-80]! 
1965 stp d10,d11,[sp,#16] 1966 stp d12,d13,[sp,#32] 1967 stp d14,d15,[sp,#48] 1968 stp x29,x30,[sp,#64] 1969 .Lcbc_8_blocks_dec: 1970 cmp w2,#8 1971 b.lt 1f 1972 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] 1973 add x10,x0,#64 1974 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10] 1975 #ifndef __AARCH64EB__ 1976 rev32 v4.16b,v4.16b 1977 #endif 1978 #ifndef __AARCH64EB__ 1979 rev32 v5.16b,v5.16b 1980 #endif 1981 #ifndef __AARCH64EB__ 1982 rev32 v6.16b,v6.16b 1983 #endif 1984 #ifndef __AARCH64EB__ 1985 rev32 v7.16b,v7.16b 1986 #endif 1987 #ifndef __AARCH64EB__ 1988 rev32 v8.16b,v8.16b 1989 #endif 1990 #ifndef __AARCH64EB__ 1991 rev32 v9.16b,v9.16b 1992 #endif 1993 #ifndef __AARCH64EB__ 1994 rev32 v10.16b,v10.16b 1995 #endif 1996 #ifndef __AARCH64EB__ 1997 rev32 v11.16b,v11.16b 1998 #endif 1999 bl _vpsm4_enc_8blks 2000 zip1 v8.4s,v0.4s,v1.4s 2001 zip2 v9.4s,v0.4s,v1.4s 2002 zip1 v10.4s,v2.4s,v3.4s 2003 zip2 v11.4s,v2.4s,v3.4s 2004 zip1 v0.2d,v8.2d,v10.2d 2005 zip2 v1.2d,v8.2d,v10.2d 2006 zip1 v2.2d,v9.2d,v11.2d 2007 zip2 v3.2d,v9.2d,v11.2d 2008 zip1 v8.4s,v4.4s,v5.4s 2009 zip2 v9.4s,v4.4s,v5.4s 2010 zip1 v10.4s,v6.4s,v7.4s 2011 zip2 v11.4s,v6.4s,v7.4s 2012 zip1 v4.2d,v8.2d,v10.2d 2013 zip2 v5.2d,v8.2d,v10.2d 2014 zip1 v6.2d,v9.2d,v11.2d 2015 zip2 v7.2d,v9.2d,v11.2d 2016 ld1 {v15.4s},[x4] 2017 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 2018 // note ivec1 and vtmpx[3] are reusing the same register 2019 // care needs to be taken to avoid conflict 2020 eor v0.16b,v0.16b,v15.16b 2021 ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2022 eor v1.16b,v1.16b,v8.16b 2023 eor v2.16b,v2.16b,v9.16b 2024 eor v3.16b,v3.16b,v10.16b 2025 // save back IV 2026 st1 {v15.4s}, [x4] 2027 eor v4.16b,v4.16b,v11.16b 2028 eor v5.16b,v5.16b,v12.16b 2029 eor v6.16b,v6.16b,v13.16b 2030 eor v7.16b,v7.16b,v14.16b 2031 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2032 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 2033 subs w2,w2,#8 2034 b.gt .Lcbc_8_blocks_dec 2035 b.eq 100f 2036 1: 2037 ld1 {v15.4s},[x4] 2038 .Lcbc_4_blocks_dec: 2039 cmp 
w2,#4 2040 b.lt 1f 2041 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] 2042 #ifndef __AARCH64EB__ 2043 rev32 v4.16b,v4.16b 2044 #endif 2045 #ifndef __AARCH64EB__ 2046 rev32 v5.16b,v5.16b 2047 #endif 2048 #ifndef __AARCH64EB__ 2049 rev32 v6.16b,v6.16b 2050 #endif 2051 #ifndef __AARCH64EB__ 2052 rev32 v7.16b,v7.16b 2053 #endif 2054 bl _vpsm4_enc_4blks 2055 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 2056 zip1 v8.4s,v0.4s,v1.4s 2057 zip2 v9.4s,v0.4s,v1.4s 2058 zip1 v10.4s,v2.4s,v3.4s 2059 zip2 v11.4s,v2.4s,v3.4s 2060 zip1 v0.2d,v8.2d,v10.2d 2061 zip2 v1.2d,v8.2d,v10.2d 2062 zip1 v2.2d,v9.2d,v11.2d 2063 zip2 v3.2d,v9.2d,v11.2d 2064 eor v0.16b,v0.16b,v15.16b 2065 eor v1.16b,v1.16b,v4.16b 2066 orr v15.16b,v7.16b,v7.16b 2067 eor v2.16b,v2.16b,v5.16b 2068 eor v3.16b,v3.16b,v6.16b 2069 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2070 subs w2,w2,#4 2071 b.gt .Lcbc_4_blocks_dec 2072 // save back IV 2073 st1 {v7.4s}, [x4] 2074 b 100f 2075 1: // last block 2076 subs w2,w2,#1 2077 b.lt 100f 2078 b.gt 1f 2079 ld1 {v4.4s},[x0],#16 2080 // save back IV 2081 st1 {v4.4s}, [x4] 2082 #ifndef __AARCH64EB__ 2083 rev32 v8.16b,v4.16b 2084 #else 2085 mov v8.16b,v4.16b 2086 #endif 2087 mov x10,x3 2088 mov w11,#8 2089 mov w12,v8.s[0] 2090 mov w13,v8.s[1] 2091 mov w14,v8.s[2] 2092 mov w15,v8.s[3] 2093 10: 2094 ldp w7,w8,[x10],8 2095 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2096 eor w6,w14,w15 2097 eor w9,w7,w13 2098 eor w6,w6,w9 2099 movi v1.16b,#64 2100 movi v2.16b,#128 2101 movi v3.16b,#192 2102 mov v0.s[0],w6 2103 2104 sub v1.16b,v0.16b,v1.16b 2105 sub v2.16b,v0.16b,v2.16b 2106 sub v3.16b,v0.16b,v3.16b 2107 2108 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2109 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2110 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2111 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2112 2113 mov w6,v0.s[0] 2114 mov w7,v1.s[0] 2115 mov w9,v2.s[0] 2116 add w7,w6,w7 2117 mov w6,v3.s[0] 2118 add w7,w7,w9 2119 add w7,w7,w6 2120 2121 eor w6,w7,w7,ror #32-2 2122 eor 
w6,w6,w7,ror #32-10 2123 eor w6,w6,w7,ror #32-18 2124 eor w6,w6,w7,ror #32-24 2125 eor w12,w12,w6 2126 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2127 eor w6,w14,w15 2128 eor w9,w12,w8 2129 eor w6,w6,w9 2130 movi v1.16b,#64 2131 movi v2.16b,#128 2132 movi v3.16b,#192 2133 mov v0.s[0],w6 2134 2135 sub v1.16b,v0.16b,v1.16b 2136 sub v2.16b,v0.16b,v2.16b 2137 sub v3.16b,v0.16b,v3.16b 2138 2139 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2140 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2141 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2142 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2143 2144 mov w6,v0.s[0] 2145 mov w7,v1.s[0] 2146 mov w9,v2.s[0] 2147 add w7,w6,w7 2148 mov w6,v3.s[0] 2149 add w7,w7,w9 2150 add w7,w7,w6 2151 2152 eor w6,w7,w7,ror #32-2 2153 eor w6,w6,w7,ror #32-10 2154 eor w6,w6,w7,ror #32-18 2155 eor w6,w6,w7,ror #32-24 2156 ldp w7,w8,[x10],8 2157 eor w13,w13,w6 2158 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2159 eor w6,w12,w13 2160 eor w9,w7,w15 2161 eor w6,w6,w9 2162 movi v1.16b,#64 2163 movi v2.16b,#128 2164 movi v3.16b,#192 2165 mov v0.s[0],w6 2166 2167 sub v1.16b,v0.16b,v1.16b 2168 sub v2.16b,v0.16b,v2.16b 2169 sub v3.16b,v0.16b,v3.16b 2170 2171 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2172 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2173 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2174 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2175 2176 mov w6,v0.s[0] 2177 mov w7,v1.s[0] 2178 mov w9,v2.s[0] 2179 add w7,w6,w7 2180 mov w6,v3.s[0] 2181 add w7,w7,w9 2182 add w7,w7,w6 2183 2184 eor w6,w7,w7,ror #32-2 2185 eor w6,w6,w7,ror #32-10 2186 eor w6,w6,w7,ror #32-18 2187 eor w6,w6,w7,ror #32-24 2188 eor w14,w14,w6 2189 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2190 eor w6,w12,w13 2191 eor w9,w14,w8 2192 eor w6,w6,w9 2193 movi v1.16b,#64 2194 movi v2.16b,#128 2195 movi v3.16b,#192 2196 mov v0.s[0],w6 2197 2198 sub v1.16b,v0.16b,v1.16b 2199 sub v2.16b,v0.16b,v2.16b 2200 sub v3.16b,v0.16b,v3.16b 2201 2202 tbl 
v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2203 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2204 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2205 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2206 2207 mov w6,v0.s[0] 2208 mov w7,v1.s[0] 2209 mov w9,v2.s[0] 2210 add w7,w6,w7 2211 mov w6,v3.s[0] 2212 add w7,w7,w9 2213 add w7,w7,w6 2214 2215 eor w6,w7,w7,ror #32-2 2216 eor w6,w6,w7,ror #32-10 2217 eor w6,w6,w7,ror #32-18 2218 eor w6,w6,w7,ror #32-24 2219 eor w15,w15,w6 2220 subs w11,w11,#1 2221 b.ne 10b 2222 mov v8.s[0],w15 2223 mov v8.s[1],w14 2224 mov v8.s[2],w13 2225 mov v8.s[3],w12 2226 #ifndef __AARCH64EB__ 2227 rev32 v8.16b,v8.16b 2228 #endif 2229 eor v8.16b,v8.16b,v15.16b 2230 st1 {v8.4s},[x1],#16 2231 b 100f 2232 1: // last two blocks 2233 ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0] 2234 add x10,x0,#16 2235 ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16 2236 subs w2,w2,1 2237 b.gt 1f 2238 #ifndef __AARCH64EB__ 2239 rev32 v4.16b,v4.16b 2240 #endif 2241 #ifndef __AARCH64EB__ 2242 rev32 v5.16b,v5.16b 2243 #endif 2244 #ifndef __AARCH64EB__ 2245 rev32 v6.16b,v6.16b 2246 #endif 2247 #ifndef __AARCH64EB__ 2248 rev32 v7.16b,v7.16b 2249 #endif 2250 bl _vpsm4_enc_4blks 2251 ld1 {v4.4s,v5.4s},[x0],#32 2252 zip1 v8.4s,v0.4s,v1.4s 2253 zip2 v9.4s,v0.4s,v1.4s 2254 zip1 v10.4s,v2.4s,v3.4s 2255 zip2 v11.4s,v2.4s,v3.4s 2256 zip1 v0.2d,v8.2d,v10.2d 2257 zip2 v1.2d,v8.2d,v10.2d 2258 zip1 v2.2d,v9.2d,v11.2d 2259 zip2 v3.2d,v9.2d,v11.2d 2260 eor v0.16b,v0.16b,v15.16b 2261 eor v1.16b,v1.16b,v4.16b 2262 st1 {v0.4s,v1.4s},[x1],#32 2263 // save back IV 2264 st1 {v5.4s}, [x4] 2265 b 100f 2266 1: // last 3 blocks 2267 ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10] 2268 #ifndef __AARCH64EB__ 2269 rev32 v4.16b,v4.16b 2270 #endif 2271 #ifndef __AARCH64EB__ 2272 rev32 v5.16b,v5.16b 2273 #endif 2274 #ifndef __AARCH64EB__ 2275 rev32 v6.16b,v6.16b 2276 #endif 2277 #ifndef __AARCH64EB__ 2278 rev32 v7.16b,v7.16b 2279 #endif 2280 bl _vpsm4_enc_4blks 2281 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 2282 zip1 
v8.4s,v0.4s,v1.4s 2283 zip2 v9.4s,v0.4s,v1.4s 2284 zip1 v10.4s,v2.4s,v3.4s 2285 zip2 v11.4s,v2.4s,v3.4s 2286 zip1 v0.2d,v8.2d,v10.2d 2287 zip2 v1.2d,v8.2d,v10.2d 2288 zip1 v2.2d,v9.2d,v11.2d 2289 zip2 v3.2d,v9.2d,v11.2d 2290 eor v0.16b,v0.16b,v15.16b 2291 eor v1.16b,v1.16b,v4.16b 2292 eor v2.16b,v2.16b,v5.16b 2293 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 2294 // save back IV 2295 st1 {v6.4s}, [x4] 2296 100: 2297 ldp d10,d11,[sp,#16] 2298 ldp d12,d13,[sp,#32] 2299 ldp d14,d15,[sp,#48] 2300 ldp x29,x30,[sp,#64] 2301 ldp d8,d9,[sp],#80 2302 AARCH64_VALIDATE_LINK_REGISTER 2303 ret 2304 .size vpsm4_cbc_encrypt,.-vpsm4_cbc_encrypt 2305 .globl vpsm4_ctr32_encrypt_blocks 2306 .type vpsm4_ctr32_encrypt_blocks,%function 2307 .align 5 2308 vpsm4_ctr32_encrypt_blocks: 2309 AARCH64_VALID_CALL_TARGET 2310 ld1 {v3.4s},[x4] 2311 #ifndef __AARCH64EB__ 2312 rev32 v3.16b,v3.16b 2313 #endif 2314 adrp x10,.Lsbox 2315 add x10,x10,#:lo12:.Lsbox 2316 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 2317 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 2318 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 2319 ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] 2320 cmp w2,#1 2321 b.ne 1f 2322 // fast processing for one single block without 2323 // context saving overhead 2324 mov x10,x3 2325 mov w11,#8 2326 mov w12,v3.s[0] 2327 mov w13,v3.s[1] 2328 mov w14,v3.s[2] 2329 mov w15,v3.s[3] 2330 10: 2331 ldp w7,w8,[x10],8 2332 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2333 eor w6,w14,w15 2334 eor w9,w7,w13 2335 eor w6,w6,w9 2336 movi v1.16b,#64 2337 movi v2.16b,#128 2338 movi v3.16b,#192 2339 mov v0.s[0],w6 2340 2341 sub v1.16b,v0.16b,v1.16b 2342 sub v2.16b,v0.16b,v2.16b 2343 sub v3.16b,v0.16b,v3.16b 2344 2345 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2346 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2347 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2348 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2349 2350 mov w6,v0.s[0] 2351 mov w7,v1.s[0] 2352 mov w9,v2.s[0] 2353 add w7,w6,w7 2354 mov 
w6,v3.s[0] 2355 add w7,w7,w9 2356 add w7,w7,w6 2357 2358 eor w6,w7,w7,ror #32-2 2359 eor w6,w6,w7,ror #32-10 2360 eor w6,w6,w7,ror #32-18 2361 eor w6,w6,w7,ror #32-24 2362 eor w12,w12,w6 2363 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2364 eor w6,w14,w15 2365 eor w9,w12,w8 2366 eor w6,w6,w9 2367 movi v1.16b,#64 2368 movi v2.16b,#128 2369 movi v3.16b,#192 2370 mov v0.s[0],w6 2371 2372 sub v1.16b,v0.16b,v1.16b 2373 sub v2.16b,v0.16b,v2.16b 2374 sub v3.16b,v0.16b,v3.16b 2375 2376 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2377 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2378 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2379 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2380 2381 mov w6,v0.s[0] 2382 mov w7,v1.s[0] 2383 mov w9,v2.s[0] 2384 add w7,w6,w7 2385 mov w6,v3.s[0] 2386 add w7,w7,w9 2387 add w7,w7,w6 2388 2389 eor w6,w7,w7,ror #32-2 2390 eor w6,w6,w7,ror #32-10 2391 eor w6,w6,w7,ror #32-18 2392 eor w6,w6,w7,ror #32-24 2393 ldp w7,w8,[x10],8 2394 eor w13,w13,w6 2395 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2396 eor w6,w12,w13 2397 eor w9,w7,w15 2398 eor w6,w6,w9 2399 movi v1.16b,#64 2400 movi v2.16b,#128 2401 movi v3.16b,#192 2402 mov v0.s[0],w6 2403 2404 sub v1.16b,v0.16b,v1.16b 2405 sub v2.16b,v0.16b,v2.16b 2406 sub v3.16b,v0.16b,v3.16b 2407 2408 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2409 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2410 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2411 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2412 2413 mov w6,v0.s[0] 2414 mov w7,v1.s[0] 2415 mov w9,v2.s[0] 2416 add w7,w6,w7 2417 mov w6,v3.s[0] 2418 add w7,w7,w9 2419 add w7,w7,w6 2420 2421 eor w6,w7,w7,ror #32-2 2422 eor w6,w6,w7,ror #32-10 2423 eor w6,w6,w7,ror #32-18 2424 eor w6,w6,w7,ror #32-24 2425 eor w14,w14,w6 2426 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2427 eor w6,w12,w13 2428 eor w9,w14,w8 2429 eor w6,w6,w9 2430 movi v1.16b,#64 2431 movi v2.16b,#128 2432 movi v3.16b,#192 2433 mov v0.s[0],w6 2434 2435 sub 
v1.16b,v0.16b,v1.16b 2436 sub v2.16b,v0.16b,v2.16b 2437 sub v3.16b,v0.16b,v3.16b 2438 2439 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2440 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2441 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2442 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2443 2444 mov w6,v0.s[0] 2445 mov w7,v1.s[0] 2446 mov w9,v2.s[0] 2447 add w7,w6,w7 2448 mov w6,v3.s[0] 2449 add w7,w7,w9 2450 add w7,w7,w6 2451 2452 eor w6,w7,w7,ror #32-2 2453 eor w6,w6,w7,ror #32-10 2454 eor w6,w6,w7,ror #32-18 2455 eor w6,w6,w7,ror #32-24 2456 eor w15,w15,w6 2457 subs w11,w11,#1 2458 b.ne 10b 2459 mov v3.s[0],w15 2460 mov v3.s[1],w14 2461 mov v3.s[2],w13 2462 mov v3.s[3],w12 2463 #ifndef __AARCH64EB__ 2464 rev32 v3.16b,v3.16b 2465 #endif 2466 ld1 {v4.4s},[x0] 2467 eor v4.16b,v4.16b,v3.16b 2468 st1 {v4.4s},[x1] 2469 ret 2470 1: 2471 AARCH64_SIGN_LINK_REGISTER 2472 stp d8,d9,[sp,#-80]! 2473 stp d10,d11,[sp,#16] 2474 stp d12,d13,[sp,#32] 2475 stp d14,d15,[sp,#48] 2476 stp x29,x30,[sp,#64] 2477 mov w12,v3.s[0] 2478 mov w13,v3.s[1] 2479 mov w14,v3.s[2] 2480 mov w5,v3.s[3] 2481 .Lctr32_4_blocks_process: 2482 cmp w2,#4 2483 b.lt 1f 2484 dup v4.4s,w12 2485 dup v5.4s,w13 2486 dup v6.4s,w14 2487 mov v7.s[0],w5 2488 add w5,w5,#1 2489 mov v7.s[1],w5 2490 add w5,w5,#1 2491 mov v7.s[2],w5 2492 add w5,w5,#1 2493 mov v7.s[3],w5 2494 add w5,w5,#1 2495 cmp w2,#8 2496 b.ge .Lctr32_8_blocks_process 2497 bl _vpsm4_enc_4blks 2498 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2499 eor v0.16b,v0.16b,v12.16b 2500 eor v1.16b,v1.16b,v13.16b 2501 eor v2.16b,v2.16b,v14.16b 2502 eor v3.16b,v3.16b,v15.16b 2503 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2504 subs w2,w2,#4 2505 b.ne .Lctr32_4_blocks_process 2506 b 100f 2507 .Lctr32_8_blocks_process: 2508 dup v8.4s,w12 2509 dup v9.4s,w13 2510 dup v10.4s,w14 2511 mov v11.s[0],w5 2512 add w5,w5,#1 2513 mov v11.s[1],w5 2514 add w5,w5,#1 2515 mov v11.s[2],w5 2516 add w5,w5,#1 2517 mov v11.s[3],w5 2518 add w5,w5,#1 2519 bl 
_vpsm4_enc_8blks 2520 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2521 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 2522 eor v0.16b,v0.16b,v12.16b 2523 eor v1.16b,v1.16b,v13.16b 2524 eor v2.16b,v2.16b,v14.16b 2525 eor v3.16b,v3.16b,v15.16b 2526 eor v4.16b,v4.16b,v8.16b 2527 eor v5.16b,v5.16b,v9.16b 2528 eor v6.16b,v6.16b,v10.16b 2529 eor v7.16b,v7.16b,v11.16b 2530 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2531 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 2532 subs w2,w2,#8 2533 b.ne .Lctr32_4_blocks_process 2534 b 100f 2535 1: // last block processing 2536 subs w2,w2,#1 2537 b.lt 100f 2538 b.gt 1f 2539 mov v3.s[0],w12 2540 mov v3.s[1],w13 2541 mov v3.s[2],w14 2542 mov v3.s[3],w5 2543 mov x10,x3 2544 mov w11,#8 2545 mov w12,v3.s[0] 2546 mov w13,v3.s[1] 2547 mov w14,v3.s[2] 2548 mov w15,v3.s[3] 2549 10: 2550 ldp w7,w8,[x10],8 2551 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2552 eor w6,w14,w15 2553 eor w9,w7,w13 2554 eor w6,w6,w9 2555 movi v1.16b,#64 2556 movi v2.16b,#128 2557 movi v3.16b,#192 2558 mov v0.s[0],w6 2559 2560 sub v1.16b,v0.16b,v1.16b 2561 sub v2.16b,v0.16b,v2.16b 2562 sub v3.16b,v0.16b,v3.16b 2563 2564 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2565 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2566 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2567 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2568 2569 mov w6,v0.s[0] 2570 mov w7,v1.s[0] 2571 mov w9,v2.s[0] 2572 add w7,w6,w7 2573 mov w6,v3.s[0] 2574 add w7,w7,w9 2575 add w7,w7,w6 2576 2577 eor w6,w7,w7,ror #32-2 2578 eor w6,w6,w7,ror #32-10 2579 eor w6,w6,w7,ror #32-18 2580 eor w6,w6,w7,ror #32-24 2581 eor w12,w12,w6 2582 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2583 eor w6,w14,w15 2584 eor w9,w12,w8 2585 eor w6,w6,w9 2586 movi v1.16b,#64 2587 movi v2.16b,#128 2588 movi v3.16b,#192 2589 mov v0.s[0],w6 2590 2591 sub v1.16b,v0.16b,v1.16b 2592 sub v2.16b,v0.16b,v2.16b 2593 sub v3.16b,v0.16b,v3.16b 2594 2595 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2596 tbl 
v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2597 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2598 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2599 2600 mov w6,v0.s[0] 2601 mov w7,v1.s[0] 2602 mov w9,v2.s[0] 2603 add w7,w6,w7 2604 mov w6,v3.s[0] 2605 add w7,w7,w9 2606 add w7,w7,w6 2607 2608 eor w6,w7,w7,ror #32-2 2609 eor w6,w6,w7,ror #32-10 2610 eor w6,w6,w7,ror #32-18 2611 eor w6,w6,w7,ror #32-24 2612 ldp w7,w8,[x10],8 2613 eor w13,w13,w6 2614 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2615 eor w6,w12,w13 2616 eor w9,w7,w15 2617 eor w6,w6,w9 2618 movi v1.16b,#64 2619 movi v2.16b,#128 2620 movi v3.16b,#192 2621 mov v0.s[0],w6 2622 2623 sub v1.16b,v0.16b,v1.16b 2624 sub v2.16b,v0.16b,v2.16b 2625 sub v3.16b,v0.16b,v3.16b 2626 2627 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2628 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2629 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2630 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2631 2632 mov w6,v0.s[0] 2633 mov w7,v1.s[0] 2634 mov w9,v2.s[0] 2635 add w7,w6,w7 2636 mov w6,v3.s[0] 2637 add w7,w7,w9 2638 add w7,w7,w6 2639 2640 eor w6,w7,w7,ror #32-2 2641 eor w6,w6,w7,ror #32-10 2642 eor w6,w6,w7,ror #32-18 2643 eor w6,w6,w7,ror #32-24 2644 eor w14,w14,w6 2645 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2646 eor w6,w12,w13 2647 eor w9,w14,w8 2648 eor w6,w6,w9 2649 movi v1.16b,#64 2650 movi v2.16b,#128 2651 movi v3.16b,#192 2652 mov v0.s[0],w6 2653 2654 sub v1.16b,v0.16b,v1.16b 2655 sub v2.16b,v0.16b,v2.16b 2656 sub v3.16b,v0.16b,v3.16b 2657 2658 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 2659 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 2660 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 2661 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 2662 2663 mov w6,v0.s[0] 2664 mov w7,v1.s[0] 2665 mov w9,v2.s[0] 2666 add w7,w6,w7 2667 mov w6,v3.s[0] 2668 add w7,w7,w9 2669 add w7,w7,w6 2670 2671 eor w6,w7,w7,ror #32-2 2672 eor w6,w6,w7,ror #32-10 2673 eor w6,w6,w7,ror #32-18 2674 eor 
w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// --- tail of vpsm4_ctr32_encrypt_blocks (body starts earlier in file) ---
	// repack the encrypted counter block and XOR with the single input block
	mov	v3.s[0],w15
	mov	v3.s[1],w14
	mov	v3.s[2],w13
	mov	v3.s[3],w12
#ifndef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	ld1	{v4.4s},[x0]
	eor	v4.16b,v4.16b,v3.16b
	st1	{v4.4s},[x1]
	b	100f
1:	// last 2 blocks processing
	dup	v4.4s,w12
	dup	v5.4s,w13
	dup	v6.4s,w14
	mov	v7.s[0],w5
	add	w5,w5,#1
	mov	v7.s[1],w5
	subs	w2,w2,#1
	b.ne	1f
	bl	_vpsm4_enc_4blks
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	b	100f
1:	// last 3 blocks processing
	add	w5,w5,#1
	mov	v7.s[2],w5
	bl	_vpsm4_enc_4blks
	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	ld4	{v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
	eor	v0.16b,v0.16b,v12.16b
	eor	v1.16b,v1.16b,v13.16b
	eor	v2.16b,v2.16b,v14.16b
	eor	v3.16b,v3.16b,v15.16b
	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_ctr32_encrypt_blocks,.-vpsm4_ctr32_encrypt_blocks

//------------------------------------------------------------------------
// vpsm4_xts_encrypt_gb
//
// SM4-XTS, "GB" variant: the tweak is multiplied by alpha in GF(2^128)
// on bit-reversed values (note the rbit instructions throughout), unlike
// the IEEE P1619 variant.  Register roles as used in this body:
//   x0  = in,  x1 = out,  x2 = length in bytes
//   x3  = round keys used for the data blocks (saved in x26)
//   x4  = round keys used to encrypt the IV into the initial tweak (via x27)
//   x5  = IV / tweak buffer (read at entry; last tweak stored back)
//   w6  = direction flag, encryption:1 decryption:0 (held in w28)
// NOTE(review): argument roles inferred from register usage in this chunk
// only — confirm against the C prototype.
// NOTE(review): x18 (platform register) and x29 (frame pointer, reused as
// the tail-byte counter) are clobbered as scratch; both are saved/restored
// below, but x18 use is discouraged by AAPCS64.
// Low 4 bits of x2 select ciphertext stealing for a ragged tail.
//------------------------------------------------------------------------
.globl	vpsm4_xts_encrypt_gb
.type	vpsm4_xts_encrypt_gb,%function
.align	5
vpsm4_xts_encrypt_gb:
	AARCH64_SIGN_LINK_REGISTER
	// save all callee-saved GPRs and the low halves of v8-v15
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	mov	x26,x3
	mov	x27,x4
	mov	w28,w6
	ld1	{v8.4s}, [x5]
	mov	x3,x27
	// load the 256-byte SM4 sbox into v16-v31 for tbl lookups
	adrp	x10,.Lsbox
	add	x10,x10,#:lo12:.Lsbox
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	// encrypt the IV with the second key schedule (x3 = rk2) to form the
	// initial tweak: 8 loop iterations x 4 Feistel steps = 32 SM4 rounds;
	// each step does a 4-byte sbox lookup via tbl over four 64-byte table
	// quarters, then the linear transform L(x)=x^rol2^rol10^rol18^rol24
	mov	x10,x3
	mov	w11,#8
	mov	w12,v8.s[0]
	mov	w13,v8.s[1]
	mov	w14,v8.s[2]
	mov	w15,v8.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	// v8 = E_rk2(IV) = initial tweak (words come out reversed)
	mov	v8.s[0],w15
	mov	v8.s[1],w14
	mov	v8.s[2],w13
	mov	v8.s[3],w12
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	x3,x26
	// x29 = tail bytes (len mod 16) -> ciphertext stealing if non-zero
	and	x29,x2,#0x0F
	// convert length into blocks
	lsr	x2,x2,4
	cmp	x2,#1
	b.lt	.return_gb

	cmp	x29,0
	// If the encryption/decryption Length is N times of 16,
	// the all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
	b.eq	.xts_encrypt_blocks_gb

	// If the encryption/decryption length is not N times of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb
	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
	subs	x2,x2,#1
	b.eq	.only_2blks_tweak_gb
.xts_encrypt_blocks_gb:
	// keep the tweak bit-reversed in x12:x13 so that multiplying by
	// alpha in GF(2^128) is a plain lsl#1 with conditional 0x87 reduction;
	// derive 8 consecutive tweaks into x12..x27 (one register pair each)
	rbit	v8.16b,v8.16b
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	x12,v8.d[0]
	mov	x13,v8.d[1]
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
.Lxts_8_blocks_process_gb:
	cmp	x2,#8
	b.lt	.Lxts_4_blocks_process_gb
	// move the 8 pre-computed tweaks into v0-v3/v12-v15
	mov	v0.d[0],x12
	mov	v0.d[1],x13
#ifdef __AARCH64EB__
	rev32	v0.16b,v0.16b
#endif
	mov	v1.d[0],x14
	mov	v1.d[1],x15
#ifdef __AARCH64EB__
	rev32	v1.16b,v1.16b
#endif
	mov	v2.d[0],x16
	mov	v2.d[1],x17
#ifdef __AARCH64EB__
	rev32	v2.16b,v2.16b
#endif
	mov	v3.d[0],x18
	mov	v3.d[1],x19
#ifdef __AARCH64EB__
	rev32	v3.16b,v3.16b
#endif
	mov	v12.d[0],x20
	mov	v12.d[1],x21
#ifdef __AARCH64EB__
	rev32	v12.16b,v12.16b
#endif
	mov	v13.d[0],x22
	mov	v13.d[1],x23
#ifdef __AARCH64EB__
	rev32	v13.16b,v13.16b
#endif
	mov	v14.d[0],x24
	mov	v14.d[1],x25
#ifdef __AARCH64EB__
	rev32	v14.16b,v14.16b
#endif
	mov	v15.d[0],x26
	mov	v15.d[1],x27
#ifdef __AARCH64EB__
	rev32	v15.16b,v15.16b
#endif
	// input XOR: rbit converts tweaks out of the multiplication domain
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v0.16b,v0.16b
	rbit	v1.16b,v1.16b
	rbit	v2.16b,v2.16b
	rbit	v3.16b,v3.16b
	eor	v4.16b, v4.16b, v0.16b
	eor	v5.16b, v5.16b, v1.16b
	eor	v6.16b, v6.16b, v2.16b
	eor	v7.16b, v7.16b, v3.16b
	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	rbit	v12.16b,v12.16b
	rbit	v13.16b,v13.16b
	rbit	v14.16b,v14.16b
	rbit	v15.16b,v15.16b
	eor	v8.16b, v8.16b, v12.16b
	eor	v9.16b, v9.16b, v13.16b
	eor	v10.16b, v10.16b, v14.16b
	eor	v11.16b, v11.16b, v15.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	// 4x4 word transpose so each vector holds one word-lane of 4 blocks
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	zip1	v0.4s,v8.4s,v9.4s
	zip2	v1.4s,v8.4s,v9.4s
	zip1	v2.4s,v10.4s,v11.4s
	zip2	v3.4s,v10.4s,v11.4s
	zip1	v8.2d,v0.2d,v2.2d
	zip2	v9.2d,v0.2d,v2.2d
	zip1	v10.2d,v1.2d,v3.2d
	zip2	v11.2d,v1.2d,v3.2d
	bl	_vpsm4_enc_8blks
	// transpose the 8 ciphertext blocks back to block order
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	zip1	v8.4s,v4.4s,v5.4s
	zip2	v9.4s,v4.4s,v5.4s
	zip1	v10.4s,v6.4s,v7.4s
	zip2	v11.4s,v6.4s,v7.4s
	zip1	v4.2d,v8.2d,v10.2d
	zip2	v5.2d,v8.2d,v10.2d
	zip1	v6.2d,v9.2d,v11.2d
	zip2	v7.2d,v9.2d,v11.2d
	// reload the 8 current tweaks into v12-v15/v8-v11, interleaved with
	// advancing x12..x27 to the next 8 tweaks (tweak*alpha per step)
	// NOTE(review): the output XOR below uses the reloaded x-register
	// values directly, while the input XOR above applied rbit first —
	// verify the domains really match (possible transcription loss in
	// this mangled copy).
	mov	v12.d[0],x12
	mov	v12.d[1],x13
#ifdef __AARCH64EB__
	rev32	v12.16b,v12.16b
#endif
	mov	w7,0x87
	extr	x9,x27,x27,#32
	extr	x13,x27,x26,#63
	and	w8,w7,w9,asr#31
	eor	x12,x8,x26,lsl#1
	mov	v13.d[0],x14
	mov	v13.d[1],x15
#ifdef __AARCH64EB__
	rev32	v13.16b,v13.16b
#endif
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	v14.d[0],x16
	mov	v14.d[1],x17
#ifdef __AARCH64EB__
	rev32	v14.16b,v14.16b
#endif
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	v15.d[0],x18
	mov	v15.d[1],x19
#ifdef __AARCH64EB__
	rev32	v15.16b,v15.16b
#endif
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	mov	v8.d[0],x20
	mov	v8.d[1],x21
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x21,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x20,x8,x18,lsl#1
	mov	v9.d[0],x22
	mov	v9.d[1],x23
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	mov	w7,0x87
	extr	x9,x21,x21,#32
	extr	x23,x21,x20,#63
	and	w8,w7,w9,asr#31
	eor	x22,x8,x20,lsl#1
	mov	v10.d[0],x24
	mov	v10.d[1],x25
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	mov	w7,0x87
	extr	x9,x23,x23,#32
	extr	x25,x23,x22,#63
	and	w8,w7,w9,asr#31
	eor	x24,x8,x22,lsl#1
	mov	v11.d[0],x26
	mov	v11.d[1],x27
#ifdef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	mov	w7,0x87
	extr	x9,x25,x25,#32
	extr	x27,x25,x24,#63
	and	w8,w7,w9,asr#31
	eor	x26,x8,x24,lsl#1
	// output XOR with the same 8 tweaks
	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v13.16b
	eor	v2.16b, v2.16b, v14.16b
	eor	v3.16b, v3.16b, v15.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
	eor	v6.16b, v6.16b, v10.16b
	eor	v7.16b, v7.16b, v11.16b

	// save the last tweak
	st1	{v11.4s},[x5]
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	x2,x2,#8
	b.gt	.Lxts_8_blocks_process_gb
	b	100f
.Lxts_4_blocks_process_gb:
	// <8 blocks remain; stage up to 4 tweaks in v8-v11
	mov	v8.d[0],x12
	mov	v8.d[1],x13
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	v9.d[0],x14
	mov	v9.d[1],x15
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	mov	v10.d[0],x16
	mov	v10.d[1],x17
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	mov	v11.d[0],x18
	mov	v11.d[1],x19
#ifdef __AARCH64EB__
	rev32	v11.16b,v11.16b
#endif
	cmp	x2,#4
	b.lt	1f
	// full 4-block batch: tweaks converted in place (rbit) and reused
	// for both the input and the output XOR
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v8.16b,v8.16b
	rbit	v9.16b,v9.16b
	rbit	v10.16b,v10.16b
	rbit	v11.16b,v11.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
	eor	v6.16b, v6.16b, v10.16b
	eor	v7.16b, v7.16b, v11.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32	v7.16b,v7.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	x2,x2,#4
	// stage tweaks 5..7 for the <=3 remaining blocks
	mov	v8.d[0],x20
	mov	v8.d[1],x21
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	v9.d[0],x22
	mov	v9.d[1],x23
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	mov	v10.d[0],x24
	mov	v10.d[1],x25
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	// save the last tweak
	st1	{v11.4s},[x5]
1:
	// process last block
	cmp	x2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0],#16
	rbit	v8.16b,v8.16b
	eor	v4.16b, v4.16b, v8.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	// single block: scalar 32-round SM4 with rk1 (x3)
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v8.16b
	st1	{v4.4s},[x1],#16
	// save the last tweak
	st1	{v8.4s},[x5]
	b	100f
1:	// process last 2 blocks
	cmp	x2,#2
	b.gt	1f
	ld1	{v4.4s,v5.4s},[x0],#32
	rbit	v8.16b,v8.16b
	rbit	v9.16b,v9.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save the last tweak
	st1	{v9.4s},[x5]
	b	100f
1:	// process last 3 blocks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	rbit	v8.16b,v8.16b
	rbit	v9.16b,v9.16b
	rbit	v10.16b,v10.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v5.16b, v5.16b, v9.16b
	eor	v6.16b, v6.16b, v10.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32	v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32	v6.16b,v6.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	_vpsm4_enc_4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save the last tweak
	st1	{v10.4s},[x5]
100:
	cmp	x29,0
	b.eq	.return_gb

	// This branch calculates the last two tweaks,
	// while the encryption/decryption length is larger than 32
.last_2blks_tweak_gb:
	ld1	{v8.4s},[x5]
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	// tweak*alpha twice in SIMD: rbit into the multiplication domain,
	// shl#1 + carry bytes folded with .Lxts_magic (0x87), rbit back
	rbit	v2.16b,v8.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v9.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v9.16b, v9.16b, v1.16b
	rbit	v9.16b,v9.16b
	rbit	v2.16b,v9.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v10.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	rbit	v10.16b,v10.16b
	b	.check_dec_gb


	// This branch calculates the last two tweaks,
	// while the encryption/decryption length is equal to 32, who only need two tweaks
.only_2blks_tweak_gb:
	mov	v9.16b,v8.16b
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	rbit	v2.16b,v9.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v10.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	rbit	v10.16b,v10.16b
	b	.check_dec_gb


	// Determine whether encryption or decryption is required.
	// The last two tweaks need to be swapped for decryption.
.check_dec_gb:
	// encryption:1 decryption:0
	cmp	w28,1
	b.eq	.process_last_2blks_gb
	mov	v0.16B,v9.16b
	mov	v9.16B,v10.16b
	mov	v10.16B,v0.16b

.process_last_2blks_gb:
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	// penultimate block: XOR with tweak v9, scalar SM4, XOR again
	ld1	{v4.4s},[x0],#16
	eor	v4.16b, v4.16b, v9.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v9.16b
	st1	{v4.4s},[x1],#16

	// ciphertext stealing: swap the x29 tail bytes between the block just
	// written (at x26 = x1-16) and the remaining input tail, then
	// re-encrypt that mixed block in place with tweak v10
	sub	x26,x1,16
.loop_gb:
	subs	x29,x29,1
	ldrb	w7,[x26,x29]
	ldrb	w8,[x0,x29]
	strb	w8,[x26,x29]
	strb	w7,[x1,x29]
	b.gt	.loop_gb
	ld1	{v4.4s}, [x26]
	eor	v4.16b, v4.16b, v10.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v10.16b
	st1	{v4.4s}, [x26]
.return_gb:
	// restore saved registers in reverse order of the prologue
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
	ldp	x17, x18, [sp], #0x10
	ldp	x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_xts_encrypt_gb,.-vpsm4_xts_encrypt_gb
// --- head of vpsm4_xts_encrypt (body continues past this chunk) ---
.globl	vpsm4_xts_encrypt
.type	vpsm4_xts_encrypt,%function
.align	5
vpsm4_xts_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
3904 mov x26,x3 3905 mov x27,x4 3906 mov w28,w6 3907 ld1 {v8.4s}, [x5] 3908 mov x3,x27 3909 adrp x10,.Lsbox 3910 add x10,x10,#:lo12:.Lsbox 3911 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64 3912 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64 3913 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64 3914 ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10] 3915 #ifndef __AARCH64EB__ 3916 rev32 v8.16b,v8.16b 3917 #endif 3918 mov x10,x3 3919 mov w11,#8 3920 mov w12,v8.s[0] 3921 mov w13,v8.s[1] 3922 mov w14,v8.s[2] 3923 mov w15,v8.s[3] 3924 10: 3925 ldp w7,w8,[x10],8 3926 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3927 eor w6,w14,w15 3928 eor w9,w7,w13 3929 eor w6,w6,w9 3930 movi v1.16b,#64 3931 movi v2.16b,#128 3932 movi v3.16b,#192 3933 mov v0.s[0],w6 3934 3935 sub v1.16b,v0.16b,v1.16b 3936 sub v2.16b,v0.16b,v2.16b 3937 sub v3.16b,v0.16b,v3.16b 3938 3939 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 3940 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 3941 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 3942 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 3943 3944 mov w6,v0.s[0] 3945 mov w7,v1.s[0] 3946 mov w9,v2.s[0] 3947 add w7,w6,w7 3948 mov w6,v3.s[0] 3949 add w7,w7,w9 3950 add w7,w7,w6 3951 3952 eor w6,w7,w7,ror #32-2 3953 eor w6,w6,w7,ror #32-10 3954 eor w6,w6,w7,ror #32-18 3955 eor w6,w6,w7,ror #32-24 3956 eor w12,w12,w6 3957 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3958 eor w6,w14,w15 3959 eor w9,w12,w8 3960 eor w6,w6,w9 3961 movi v1.16b,#64 3962 movi v2.16b,#128 3963 movi v3.16b,#192 3964 mov v0.s[0],w6 3965 3966 sub v1.16b,v0.16b,v1.16b 3967 sub v2.16b,v0.16b,v2.16b 3968 sub v3.16b,v0.16b,v3.16b 3969 3970 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 3971 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 3972 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 3973 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 3974 3975 mov w6,v0.s[0] 3976 mov w7,v1.s[0] 3977 mov w9,v2.s[0] 3978 add w7,w6,w7 3979 mov w6,v3.s[0] 3980 add w7,w7,w9 3981 add w7,w7,w6 3982 
3983 eor w6,w7,w7,ror #32-2 3984 eor w6,w6,w7,ror #32-10 3985 eor w6,w6,w7,ror #32-18 3986 eor w6,w6,w7,ror #32-24 3987 ldp w7,w8,[x10],8 3988 eor w13,w13,w6 3989 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3990 eor w6,w12,w13 3991 eor w9,w7,w15 3992 eor w6,w6,w9 3993 movi v1.16b,#64 3994 movi v2.16b,#128 3995 movi v3.16b,#192 3996 mov v0.s[0],w6 3997 3998 sub v1.16b,v0.16b,v1.16b 3999 sub v2.16b,v0.16b,v2.16b 4000 sub v3.16b,v0.16b,v3.16b 4001 4002 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4003 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4004 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4005 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4006 4007 mov w6,v0.s[0] 4008 mov w7,v1.s[0] 4009 mov w9,v2.s[0] 4010 add w7,w6,w7 4011 mov w6,v3.s[0] 4012 add w7,w7,w9 4013 add w7,w7,w6 4014 4015 eor w6,w7,w7,ror #32-2 4016 eor w6,w6,w7,ror #32-10 4017 eor w6,w6,w7,ror #32-18 4018 eor w6,w6,w7,ror #32-24 4019 eor w14,w14,w6 4020 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4021 eor w6,w12,w13 4022 eor w9,w14,w8 4023 eor w6,w6,w9 4024 movi v1.16b,#64 4025 movi v2.16b,#128 4026 movi v3.16b,#192 4027 mov v0.s[0],w6 4028 4029 sub v1.16b,v0.16b,v1.16b 4030 sub v2.16b,v0.16b,v2.16b 4031 sub v3.16b,v0.16b,v3.16b 4032 4033 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4034 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4035 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4036 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4037 4038 mov w6,v0.s[0] 4039 mov w7,v1.s[0] 4040 mov w9,v2.s[0] 4041 add w7,w6,w7 4042 mov w6,v3.s[0] 4043 add w7,w7,w9 4044 add w7,w7,w6 4045 4046 eor w6,w7,w7,ror #32-2 4047 eor w6,w6,w7,ror #32-10 4048 eor w6,w6,w7,ror #32-18 4049 eor w6,w6,w7,ror #32-24 4050 eor w15,w15,w6 4051 subs w11,w11,#1 4052 b.ne 10b 4053 mov v8.s[0],w15 4054 mov v8.s[1],w14 4055 mov v8.s[2],w13 4056 mov v8.s[3],w12 4057 #ifndef __AARCH64EB__ 4058 rev32 v8.16b,v8.16b 4059 #endif 4060 mov x3,x26 4061 and x29,x2,#0x0F 4062 // convert length into blocks 4063 lsr 
x2,x2,4 4064 cmp x2,#1 4065 b.lt .return 4066 4067 cmp x29,0 4068 // If the encryption/decryption Length is N times of 16, 4069 // the all blocks are encrypted/decrypted in .xts_encrypt_blocks 4070 b.eq .xts_encrypt_blocks 4071 4072 // If the encryption/decryption length is not N times of 16, 4073 // the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak 4074 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks 4075 subs x2,x2,#1 4076 b.eq .only_2blks_tweak 4077 .xts_encrypt_blocks: 4078 #ifdef __AARCH64EB__ 4079 rev32 v8.16b,v8.16b 4080 #endif 4081 mov x12,v8.d[0] 4082 mov x13,v8.d[1] 4083 mov w7,0x87 4084 extr x9,x13,x13,#32 4085 extr x15,x13,x12,#63 4086 and w8,w7,w9,asr#31 4087 eor x14,x8,x12,lsl#1 4088 mov w7,0x87 4089 extr x9,x15,x15,#32 4090 extr x17,x15,x14,#63 4091 and w8,w7,w9,asr#31 4092 eor x16,x8,x14,lsl#1 4093 mov w7,0x87 4094 extr x9,x17,x17,#32 4095 extr x19,x17,x16,#63 4096 and w8,w7,w9,asr#31 4097 eor x18,x8,x16,lsl#1 4098 mov w7,0x87 4099 extr x9,x19,x19,#32 4100 extr x21,x19,x18,#63 4101 and w8,w7,w9,asr#31 4102 eor x20,x8,x18,lsl#1 4103 mov w7,0x87 4104 extr x9,x21,x21,#32 4105 extr x23,x21,x20,#63 4106 and w8,w7,w9,asr#31 4107 eor x22,x8,x20,lsl#1 4108 mov w7,0x87 4109 extr x9,x23,x23,#32 4110 extr x25,x23,x22,#63 4111 and w8,w7,w9,asr#31 4112 eor x24,x8,x22,lsl#1 4113 mov w7,0x87 4114 extr x9,x25,x25,#32 4115 extr x27,x25,x24,#63 4116 and w8,w7,w9,asr#31 4117 eor x26,x8,x24,lsl#1 4118 .Lxts_8_blocks_process: 4119 cmp x2,#8 4120 b.lt .Lxts_4_blocks_process 4121 mov v0.d[0],x12 4122 mov v0.d[1],x13 4123 #ifdef __AARCH64EB__ 4124 rev32 v0.16b,v0.16b 4125 #endif 4126 mov v1.d[0],x14 4127 mov v1.d[1],x15 4128 #ifdef __AARCH64EB__ 4129 rev32 v1.16b,v1.16b 4130 #endif 4131 mov v2.d[0],x16 4132 mov v2.d[1],x17 4133 #ifdef __AARCH64EB__ 4134 rev32 v2.16b,v2.16b 4135 #endif 4136 mov v3.d[0],x18 4137 mov v3.d[1],x19 4138 #ifdef __AARCH64EB__ 4139 rev32 v3.16b,v3.16b 4140 #endif 4141 mov v12.d[0],x20 4142 mov 
v12.d[1],x21 4143 #ifdef __AARCH64EB__ 4144 rev32 v12.16b,v12.16b 4145 #endif 4146 mov v13.d[0],x22 4147 mov v13.d[1],x23 4148 #ifdef __AARCH64EB__ 4149 rev32 v13.16b,v13.16b 4150 #endif 4151 mov v14.d[0],x24 4152 mov v14.d[1],x25 4153 #ifdef __AARCH64EB__ 4154 rev32 v14.16b,v14.16b 4155 #endif 4156 mov v15.d[0],x26 4157 mov v15.d[1],x27 4158 #ifdef __AARCH64EB__ 4159 rev32 v15.16b,v15.16b 4160 #endif 4161 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 4162 eor v4.16b, v4.16b, v0.16b 4163 eor v5.16b, v5.16b, v1.16b 4164 eor v6.16b, v6.16b, v2.16b 4165 eor v7.16b, v7.16b, v3.16b 4166 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 4167 eor v8.16b, v8.16b, v12.16b 4168 eor v9.16b, v9.16b, v13.16b 4169 eor v10.16b, v10.16b, v14.16b 4170 eor v11.16b, v11.16b, v15.16b 4171 #ifndef __AARCH64EB__ 4172 rev32 v4.16b,v4.16b 4173 #endif 4174 #ifndef __AARCH64EB__ 4175 rev32 v5.16b,v5.16b 4176 #endif 4177 #ifndef __AARCH64EB__ 4178 rev32 v6.16b,v6.16b 4179 #endif 4180 #ifndef __AARCH64EB__ 4181 rev32 v7.16b,v7.16b 4182 #endif 4183 #ifndef __AARCH64EB__ 4184 rev32 v8.16b,v8.16b 4185 #endif 4186 #ifndef __AARCH64EB__ 4187 rev32 v9.16b,v9.16b 4188 #endif 4189 #ifndef __AARCH64EB__ 4190 rev32 v10.16b,v10.16b 4191 #endif 4192 #ifndef __AARCH64EB__ 4193 rev32 v11.16b,v11.16b 4194 #endif 4195 zip1 v0.4s,v4.4s,v5.4s 4196 zip2 v1.4s,v4.4s,v5.4s 4197 zip1 v2.4s,v6.4s,v7.4s 4198 zip2 v3.4s,v6.4s,v7.4s 4199 zip1 v4.2d,v0.2d,v2.2d 4200 zip2 v5.2d,v0.2d,v2.2d 4201 zip1 v6.2d,v1.2d,v3.2d 4202 zip2 v7.2d,v1.2d,v3.2d 4203 zip1 v0.4s,v8.4s,v9.4s 4204 zip2 v1.4s,v8.4s,v9.4s 4205 zip1 v2.4s,v10.4s,v11.4s 4206 zip2 v3.4s,v10.4s,v11.4s 4207 zip1 v8.2d,v0.2d,v2.2d 4208 zip2 v9.2d,v0.2d,v2.2d 4209 zip1 v10.2d,v1.2d,v3.2d 4210 zip2 v11.2d,v1.2d,v3.2d 4211 bl _vpsm4_enc_8blks 4212 zip1 v8.4s,v0.4s,v1.4s 4213 zip2 v9.4s,v0.4s,v1.4s 4214 zip1 v10.4s,v2.4s,v3.4s 4215 zip2 v11.4s,v2.4s,v3.4s 4216 zip1 v0.2d,v8.2d,v10.2d 4217 zip2 v1.2d,v8.2d,v10.2d 4218 zip1 v2.2d,v9.2d,v11.2d 4219 zip2 v3.2d,v9.2d,v11.2d 4220 zip1 
v8.4s,v4.4s,v5.4s 4221 zip2 v9.4s,v4.4s,v5.4s 4222 zip1 v10.4s,v6.4s,v7.4s 4223 zip2 v11.4s,v6.4s,v7.4s 4224 zip1 v4.2d,v8.2d,v10.2d 4225 zip2 v5.2d,v8.2d,v10.2d 4226 zip1 v6.2d,v9.2d,v11.2d 4227 zip2 v7.2d,v9.2d,v11.2d 4228 mov v12.d[0],x12 4229 mov v12.d[1],x13 4230 #ifdef __AARCH64EB__ 4231 rev32 v12.16b,v12.16b 4232 #endif 4233 mov w7,0x87 4234 extr x9,x27,x27,#32 4235 extr x13,x27,x26,#63 4236 and w8,w7,w9,asr#31 4237 eor x12,x8,x26,lsl#1 4238 mov v13.d[0],x14 4239 mov v13.d[1],x15 4240 #ifdef __AARCH64EB__ 4241 rev32 v13.16b,v13.16b 4242 #endif 4243 mov w7,0x87 4244 extr x9,x13,x13,#32 4245 extr x15,x13,x12,#63 4246 and w8,w7,w9,asr#31 4247 eor x14,x8,x12,lsl#1 4248 mov v14.d[0],x16 4249 mov v14.d[1],x17 4250 #ifdef __AARCH64EB__ 4251 rev32 v14.16b,v14.16b 4252 #endif 4253 mov w7,0x87 4254 extr x9,x15,x15,#32 4255 extr x17,x15,x14,#63 4256 and w8,w7,w9,asr#31 4257 eor x16,x8,x14,lsl#1 4258 mov v15.d[0],x18 4259 mov v15.d[1],x19 4260 #ifdef __AARCH64EB__ 4261 rev32 v15.16b,v15.16b 4262 #endif 4263 mov w7,0x87 4264 extr x9,x17,x17,#32 4265 extr x19,x17,x16,#63 4266 and w8,w7,w9,asr#31 4267 eor x18,x8,x16,lsl#1 4268 mov v8.d[0],x20 4269 mov v8.d[1],x21 4270 #ifdef __AARCH64EB__ 4271 rev32 v8.16b,v8.16b 4272 #endif 4273 mov w7,0x87 4274 extr x9,x19,x19,#32 4275 extr x21,x19,x18,#63 4276 and w8,w7,w9,asr#31 4277 eor x20,x8,x18,lsl#1 4278 mov v9.d[0],x22 4279 mov v9.d[1],x23 4280 #ifdef __AARCH64EB__ 4281 rev32 v9.16b,v9.16b 4282 #endif 4283 mov w7,0x87 4284 extr x9,x21,x21,#32 4285 extr x23,x21,x20,#63 4286 and w8,w7,w9,asr#31 4287 eor x22,x8,x20,lsl#1 4288 mov v10.d[0],x24 4289 mov v10.d[1],x25 4290 #ifdef __AARCH64EB__ 4291 rev32 v10.16b,v10.16b 4292 #endif 4293 mov w7,0x87 4294 extr x9,x23,x23,#32 4295 extr x25,x23,x22,#63 4296 and w8,w7,w9,asr#31 4297 eor x24,x8,x22,lsl#1 4298 mov v11.d[0],x26 4299 mov v11.d[1],x27 4300 #ifdef __AARCH64EB__ 4301 rev32 v11.16b,v11.16b 4302 #endif 4303 mov w7,0x87 4304 extr x9,x25,x25,#32 4305 extr x27,x25,x24,#63 4306 and 
w8,w7,w9,asr#31 4307 eor x26,x8,x24,lsl#1 4308 eor v0.16b, v0.16b, v12.16b 4309 eor v1.16b, v1.16b, v13.16b 4310 eor v2.16b, v2.16b, v14.16b 4311 eor v3.16b, v3.16b, v15.16b 4312 eor v4.16b, v4.16b, v8.16b 4313 eor v5.16b, v5.16b, v9.16b 4314 eor v6.16b, v6.16b, v10.16b 4315 eor v7.16b, v7.16b, v11.16b 4316 4317 // save the last tweak 4318 st1 {v11.4s},[x5] 4319 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 4320 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 4321 subs x2,x2,#8 4322 b.gt .Lxts_8_blocks_process 4323 b 100f 4324 .Lxts_4_blocks_process: 4325 mov v8.d[0],x12 4326 mov v8.d[1],x13 4327 #ifdef __AARCH64EB__ 4328 rev32 v8.16b,v8.16b 4329 #endif 4330 mov v9.d[0],x14 4331 mov v9.d[1],x15 4332 #ifdef __AARCH64EB__ 4333 rev32 v9.16b,v9.16b 4334 #endif 4335 mov v10.d[0],x16 4336 mov v10.d[1],x17 4337 #ifdef __AARCH64EB__ 4338 rev32 v10.16b,v10.16b 4339 #endif 4340 mov v11.d[0],x18 4341 mov v11.d[1],x19 4342 #ifdef __AARCH64EB__ 4343 rev32 v11.16b,v11.16b 4344 #endif 4345 cmp x2,#4 4346 b.lt 1f 4347 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 4348 eor v4.16b, v4.16b, v8.16b 4349 eor v5.16b, v5.16b, v9.16b 4350 eor v6.16b, v6.16b, v10.16b 4351 eor v7.16b, v7.16b, v11.16b 4352 #ifndef __AARCH64EB__ 4353 rev32 v4.16b,v4.16b 4354 #endif 4355 #ifndef __AARCH64EB__ 4356 rev32 v5.16b,v5.16b 4357 #endif 4358 #ifndef __AARCH64EB__ 4359 rev32 v6.16b,v6.16b 4360 #endif 4361 #ifndef __AARCH64EB__ 4362 rev32 v7.16b,v7.16b 4363 #endif 4364 zip1 v0.4s,v4.4s,v5.4s 4365 zip2 v1.4s,v4.4s,v5.4s 4366 zip1 v2.4s,v6.4s,v7.4s 4367 zip2 v3.4s,v6.4s,v7.4s 4368 zip1 v4.2d,v0.2d,v2.2d 4369 zip2 v5.2d,v0.2d,v2.2d 4370 zip1 v6.2d,v1.2d,v3.2d 4371 zip2 v7.2d,v1.2d,v3.2d 4372 bl _vpsm4_enc_4blks 4373 zip1 v4.4s,v0.4s,v1.4s 4374 zip2 v5.4s,v0.4s,v1.4s 4375 zip1 v6.4s,v2.4s,v3.4s 4376 zip2 v7.4s,v2.4s,v3.4s 4377 zip1 v0.2d,v4.2d,v6.2d 4378 zip2 v1.2d,v4.2d,v6.2d 4379 zip1 v2.2d,v5.2d,v7.2d 4380 zip2 v3.2d,v5.2d,v7.2d 4381 eor v0.16b, v0.16b, v8.16b 4382 eor v1.16b, v1.16b, v9.16b 4383 eor v2.16b, v2.16b, v10.16b 
4384 eor v3.16b, v3.16b, v11.16b 4385 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 4386 sub x2,x2,#4 4387 mov v8.d[0],x20 4388 mov v8.d[1],x21 4389 #ifdef __AARCH64EB__ 4390 rev32 v8.16b,v8.16b 4391 #endif 4392 mov v9.d[0],x22 4393 mov v9.d[1],x23 4394 #ifdef __AARCH64EB__ 4395 rev32 v9.16b,v9.16b 4396 #endif 4397 mov v10.d[0],x24 4398 mov v10.d[1],x25 4399 #ifdef __AARCH64EB__ 4400 rev32 v10.16b,v10.16b 4401 #endif 4402 // save the last tweak 4403 st1 {v11.4s},[x5] 4404 1: 4405 // process last block 4406 cmp x2,#1 4407 b.lt 100f 4408 b.gt 1f 4409 ld1 {v4.4s},[x0],#16 4410 eor v4.16b, v4.16b, v8.16b 4411 #ifndef __AARCH64EB__ 4412 rev32 v4.16b,v4.16b 4413 #endif 4414 mov x10,x3 4415 mov w11,#8 4416 mov w12,v4.s[0] 4417 mov w13,v4.s[1] 4418 mov w14,v4.s[2] 4419 mov w15,v4.s[3] 4420 10: 4421 ldp w7,w8,[x10],8 4422 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 4423 eor w6,w14,w15 4424 eor w9,w7,w13 4425 eor w6,w6,w9 4426 movi v1.16b,#64 4427 movi v2.16b,#128 4428 movi v3.16b,#192 4429 mov v0.s[0],w6 4430 4431 sub v1.16b,v0.16b,v1.16b 4432 sub v2.16b,v0.16b,v2.16b 4433 sub v3.16b,v0.16b,v3.16b 4434 4435 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4436 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4437 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4438 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4439 4440 mov w6,v0.s[0] 4441 mov w7,v1.s[0] 4442 mov w9,v2.s[0] 4443 add w7,w6,w7 4444 mov w6,v3.s[0] 4445 add w7,w7,w9 4446 add w7,w7,w6 4447 4448 eor w6,w7,w7,ror #32-2 4449 eor w6,w6,w7,ror #32-10 4450 eor w6,w6,w7,ror #32-18 4451 eor w6,w6,w7,ror #32-24 4452 eor w12,w12,w6 4453 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4454 eor w6,w14,w15 4455 eor w9,w12,w8 4456 eor w6,w6,w9 4457 movi v1.16b,#64 4458 movi v2.16b,#128 4459 movi v3.16b,#192 4460 mov v0.s[0],w6 4461 4462 sub v1.16b,v0.16b,v1.16b 4463 sub v2.16b,v0.16b,v2.16b 4464 sub v3.16b,v0.16b,v3.16b 4465 4466 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4467 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 
4468 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4469 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4470 4471 mov w6,v0.s[0] 4472 mov w7,v1.s[0] 4473 mov w9,v2.s[0] 4474 add w7,w6,w7 4475 mov w6,v3.s[0] 4476 add w7,w7,w9 4477 add w7,w7,w6 4478 4479 eor w6,w7,w7,ror #32-2 4480 eor w6,w6,w7,ror #32-10 4481 eor w6,w6,w7,ror #32-18 4482 eor w6,w6,w7,ror #32-24 4483 ldp w7,w8,[x10],8 4484 eor w13,w13,w6 4485 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4486 eor w6,w12,w13 4487 eor w9,w7,w15 4488 eor w6,w6,w9 4489 movi v1.16b,#64 4490 movi v2.16b,#128 4491 movi v3.16b,#192 4492 mov v0.s[0],w6 4493 4494 sub v1.16b,v0.16b,v1.16b 4495 sub v2.16b,v0.16b,v2.16b 4496 sub v3.16b,v0.16b,v3.16b 4497 4498 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4499 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4500 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4501 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4502 4503 mov w6,v0.s[0] 4504 mov w7,v1.s[0] 4505 mov w9,v2.s[0] 4506 add w7,w6,w7 4507 mov w6,v3.s[0] 4508 add w7,w7,w9 4509 add w7,w7,w6 4510 4511 eor w6,w7,w7,ror #32-2 4512 eor w6,w6,w7,ror #32-10 4513 eor w6,w6,w7,ror #32-18 4514 eor w6,w6,w7,ror #32-24 4515 eor w14,w14,w6 4516 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4517 eor w6,w12,w13 4518 eor w9,w14,w8 4519 eor w6,w6,w9 4520 movi v1.16b,#64 4521 movi v2.16b,#128 4522 movi v3.16b,#192 4523 mov v0.s[0],w6 4524 4525 sub v1.16b,v0.16b,v1.16b 4526 sub v2.16b,v0.16b,v2.16b 4527 sub v3.16b,v0.16b,v3.16b 4528 4529 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4530 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4531 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4532 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4533 4534 mov w6,v0.s[0] 4535 mov w7,v1.s[0] 4536 mov w9,v2.s[0] 4537 add w7,w6,w7 4538 mov w6,v3.s[0] 4539 add w7,w7,w9 4540 add w7,w7,w6 4541 4542 eor w6,w7,w7,ror #32-2 4543 eor w6,w6,w7,ror #32-10 4544 eor w6,w6,w7,ror #32-18 4545 eor w6,w6,w7,ror #32-24 4546 eor w15,w15,w6 4547 
subs w11,w11,#1 4548 b.ne 10b 4549 mov v4.s[0],w15 4550 mov v4.s[1],w14 4551 mov v4.s[2],w13 4552 mov v4.s[3],w12 4553 #ifndef __AARCH64EB__ 4554 rev32 v4.16b,v4.16b 4555 #endif 4556 eor v4.16b, v4.16b, v8.16b 4557 st1 {v4.4s},[x1],#16 4558 // save the last tweak 4559 st1 {v8.4s},[x5] 4560 b 100f 4561 1: // process last 2 blocks 4562 cmp x2,#2 4563 b.gt 1f 4564 ld1 {v4.4s,v5.4s},[x0],#32 4565 eor v4.16b, v4.16b, v8.16b 4566 eor v5.16b, v5.16b, v9.16b 4567 #ifndef __AARCH64EB__ 4568 rev32 v4.16b,v4.16b 4569 #endif 4570 #ifndef __AARCH64EB__ 4571 rev32 v5.16b,v5.16b 4572 #endif 4573 zip1 v0.4s,v4.4s,v5.4s 4574 zip2 v1.4s,v4.4s,v5.4s 4575 zip1 v2.4s,v6.4s,v7.4s 4576 zip2 v3.4s,v6.4s,v7.4s 4577 zip1 v4.2d,v0.2d,v2.2d 4578 zip2 v5.2d,v0.2d,v2.2d 4579 zip1 v6.2d,v1.2d,v3.2d 4580 zip2 v7.2d,v1.2d,v3.2d 4581 bl _vpsm4_enc_4blks 4582 zip1 v4.4s,v0.4s,v1.4s 4583 zip2 v5.4s,v0.4s,v1.4s 4584 zip1 v6.4s,v2.4s,v3.4s 4585 zip2 v7.4s,v2.4s,v3.4s 4586 zip1 v0.2d,v4.2d,v6.2d 4587 zip2 v1.2d,v4.2d,v6.2d 4588 zip1 v2.2d,v5.2d,v7.2d 4589 zip2 v3.2d,v5.2d,v7.2d 4590 eor v0.16b, v0.16b, v8.16b 4591 eor v1.16b, v1.16b, v9.16b 4592 st1 {v0.4s,v1.4s},[x1],#32 4593 // save the last tweak 4594 st1 {v9.4s},[x5] 4595 b 100f 4596 1: // process last 3 blocks 4597 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 4598 eor v4.16b, v4.16b, v8.16b 4599 eor v5.16b, v5.16b, v9.16b 4600 eor v6.16b, v6.16b, v10.16b 4601 #ifndef __AARCH64EB__ 4602 rev32 v4.16b,v4.16b 4603 #endif 4604 #ifndef __AARCH64EB__ 4605 rev32 v5.16b,v5.16b 4606 #endif 4607 #ifndef __AARCH64EB__ 4608 rev32 v6.16b,v6.16b 4609 #endif 4610 zip1 v0.4s,v4.4s,v5.4s 4611 zip2 v1.4s,v4.4s,v5.4s 4612 zip1 v2.4s,v6.4s,v7.4s 4613 zip2 v3.4s,v6.4s,v7.4s 4614 zip1 v4.2d,v0.2d,v2.2d 4615 zip2 v5.2d,v0.2d,v2.2d 4616 zip1 v6.2d,v1.2d,v3.2d 4617 zip2 v7.2d,v1.2d,v3.2d 4618 bl _vpsm4_enc_4blks 4619 zip1 v4.4s,v0.4s,v1.4s 4620 zip2 v5.4s,v0.4s,v1.4s 4621 zip1 v6.4s,v2.4s,v3.4s 4622 zip2 v7.4s,v2.4s,v3.4s 4623 zip1 v0.2d,v4.2d,v6.2d 4624 zip2 v1.2d,v4.2d,v6.2d 4625 
zip1 v2.2d,v5.2d,v7.2d 4626 zip2 v3.2d,v5.2d,v7.2d 4627 eor v0.16b, v0.16b, v8.16b 4628 eor v1.16b, v1.16b, v9.16b 4629 eor v2.16b, v2.16b, v10.16b 4630 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 4631 // save the last tweak 4632 st1 {v10.4s},[x5] 4633 100: 4634 cmp x29,0 4635 b.eq .return 4636 4637 // This branch calculates the last two tweaks, 4638 // while the encryption/decryption length is larger than 32 4639 .last_2blks_tweak: 4640 ld1 {v8.4s},[x5] 4641 #ifdef __AARCH64EB__ 4642 rev32 v8.16b,v8.16b 4643 #endif 4644 mov v2.16b,v8.16b 4645 adrp x10,.Lxts_magic 4646 ldr q0, [x10, #:lo12:.Lxts_magic] 4647 shl v9.16b, v2.16b, #1 4648 ext v1.16b, v2.16b, v2.16b,#15 4649 ushr v1.16b, v1.16b, #7 4650 mul v1.16b, v1.16b, v0.16b 4651 eor v9.16b, v9.16b, v1.16b 4652 mov v2.16b,v9.16b 4653 adrp x10,.Lxts_magic 4654 ldr q0, [x10, #:lo12:.Lxts_magic] 4655 shl v10.16b, v2.16b, #1 4656 ext v1.16b, v2.16b, v2.16b,#15 4657 ushr v1.16b, v1.16b, #7 4658 mul v1.16b, v1.16b, v0.16b 4659 eor v10.16b, v10.16b, v1.16b 4660 b .check_dec 4661 4662 4663 // This branch calculates the last two tweaks, 4664 // while the encryption/decryption length is equal to 32, who only need two tweaks 4665 .only_2blks_tweak: 4666 mov v9.16b,v8.16b 4667 #ifdef __AARCH64EB__ 4668 rev32 v9.16b,v9.16b 4669 #endif 4670 mov v2.16b,v9.16b 4671 adrp x10,.Lxts_magic 4672 ldr q0, [x10, #:lo12:.Lxts_magic] 4673 shl v10.16b, v2.16b, #1 4674 ext v1.16b, v2.16b, v2.16b,#15 4675 ushr v1.16b, v1.16b, #7 4676 mul v1.16b, v1.16b, v0.16b 4677 eor v10.16b, v10.16b, v1.16b 4678 b .check_dec 4679 4680 4681 // Determine whether encryption or decryption is required. 4682 // The last two tweaks need to be swapped for decryption. 
4683 .check_dec: 4684 // encryption:1 decryption:0 4685 cmp w28,1 4686 b.eq .process_last_2blks 4687 mov v0.16B,v9.16b 4688 mov v9.16B,v10.16b 4689 mov v10.16B,v0.16b 4690 4691 .process_last_2blks: 4692 #ifdef __AARCH64EB__ 4693 rev32 v9.16b,v9.16b 4694 #endif 4695 #ifdef __AARCH64EB__ 4696 rev32 v10.16b,v10.16b 4697 #endif 4698 ld1 {v4.4s},[x0],#16 4699 eor v4.16b, v4.16b, v9.16b 4700 #ifndef __AARCH64EB__ 4701 rev32 v4.16b,v4.16b 4702 #endif 4703 mov x10,x3 4704 mov w11,#8 4705 mov w12,v4.s[0] 4706 mov w13,v4.s[1] 4707 mov w14,v4.s[2] 4708 mov w15,v4.s[3] 4709 10: 4710 ldp w7,w8,[x10],8 4711 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 4712 eor w6,w14,w15 4713 eor w9,w7,w13 4714 eor w6,w6,w9 4715 movi v1.16b,#64 4716 movi v2.16b,#128 4717 movi v3.16b,#192 4718 mov v0.s[0],w6 4719 4720 sub v1.16b,v0.16b,v1.16b 4721 sub v2.16b,v0.16b,v2.16b 4722 sub v3.16b,v0.16b,v3.16b 4723 4724 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4725 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4726 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4727 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4728 4729 mov w6,v0.s[0] 4730 mov w7,v1.s[0] 4731 mov w9,v2.s[0] 4732 add w7,w6,w7 4733 mov w6,v3.s[0] 4734 add w7,w7,w9 4735 add w7,w7,w6 4736 4737 eor w6,w7,w7,ror #32-2 4738 eor w6,w6,w7,ror #32-10 4739 eor w6,w6,w7,ror #32-18 4740 eor w6,w6,w7,ror #32-24 4741 eor w12,w12,w6 4742 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4743 eor w6,w14,w15 4744 eor w9,w12,w8 4745 eor w6,w6,w9 4746 movi v1.16b,#64 4747 movi v2.16b,#128 4748 movi v3.16b,#192 4749 mov v0.s[0],w6 4750 4751 sub v1.16b,v0.16b,v1.16b 4752 sub v2.16b,v0.16b,v2.16b 4753 sub v3.16b,v0.16b,v3.16b 4754 4755 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4756 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4757 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4758 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4759 4760 mov w6,v0.s[0] 4761 mov w7,v1.s[0] 4762 mov w9,v2.s[0] 4763 add w7,w6,w7 4764 mov w6,v3.s[0] 
4765 add w7,w7,w9 4766 add w7,w7,w6 4767 4768 eor w6,w7,w7,ror #32-2 4769 eor w6,w6,w7,ror #32-10 4770 eor w6,w6,w7,ror #32-18 4771 eor w6,w6,w7,ror #32-24 4772 ldp w7,w8,[x10],8 4773 eor w13,w13,w6 4774 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4775 eor w6,w12,w13 4776 eor w9,w7,w15 4777 eor w6,w6,w9 4778 movi v1.16b,#64 4779 movi v2.16b,#128 4780 movi v3.16b,#192 4781 mov v0.s[0],w6 4782 4783 sub v1.16b,v0.16b,v1.16b 4784 sub v2.16b,v0.16b,v2.16b 4785 sub v3.16b,v0.16b,v3.16b 4786 4787 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4788 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4789 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4790 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4791 4792 mov w6,v0.s[0] 4793 mov w7,v1.s[0] 4794 mov w9,v2.s[0] 4795 add w7,w6,w7 4796 mov w6,v3.s[0] 4797 add w7,w7,w9 4798 add w7,w7,w6 4799 4800 eor w6,w7,w7,ror #32-2 4801 eor w6,w6,w7,ror #32-10 4802 eor w6,w6,w7,ror #32-18 4803 eor w6,w6,w7,ror #32-24 4804 eor w14,w14,w6 4805 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4806 eor w6,w12,w13 4807 eor w9,w14,w8 4808 eor w6,w6,w9 4809 movi v1.16b,#64 4810 movi v2.16b,#128 4811 movi v3.16b,#192 4812 mov v0.s[0],w6 4813 4814 sub v1.16b,v0.16b,v1.16b 4815 sub v2.16b,v0.16b,v2.16b 4816 sub v3.16b,v0.16b,v3.16b 4817 4818 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4819 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4820 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4821 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4822 4823 mov w6,v0.s[0] 4824 mov w7,v1.s[0] 4825 mov w9,v2.s[0] 4826 add w7,w6,w7 4827 mov w6,v3.s[0] 4828 add w7,w7,w9 4829 add w7,w7,w6 4830 4831 eor w6,w7,w7,ror #32-2 4832 eor w6,w6,w7,ror #32-10 4833 eor w6,w6,w7,ror #32-18 4834 eor w6,w6,w7,ror #32-24 4835 eor w15,w15,w6 4836 subs w11,w11,#1 4837 b.ne 10b 4838 mov v4.s[0],w15 4839 mov v4.s[1],w14 4840 mov v4.s[2],w13 4841 mov v4.s[3],w12 4842 #ifndef __AARCH64EB__ 4843 rev32 v4.16b,v4.16b 4844 #endif 4845 eor v4.16b, v4.16b, v9.16b 4846 st1 
{v4.4s},[x1],#16 4847 4848 sub x26,x1,16 4849 .loop: 4850 subs x29,x29,1 4851 ldrb w7,[x26,x29] 4852 ldrb w8,[x0,x29] 4853 strb w8,[x26,x29] 4854 strb w7,[x1,x29] 4855 b.gt .loop 4856 ld1 {v4.4s}, [x26] 4857 eor v4.16b, v4.16b, v10.16b 4858 #ifndef __AARCH64EB__ 4859 rev32 v4.16b,v4.16b 4860 #endif 4861 mov x10,x3 4862 mov w11,#8 4863 mov w12,v4.s[0] 4864 mov w13,v4.s[1] 4865 mov w14,v4.s[2] 4866 mov w15,v4.s[3] 4867 10: 4868 ldp w7,w8,[x10],8 4869 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 4870 eor w6,w14,w15 4871 eor w9,w7,w13 4872 eor w6,w6,w9 4873 movi v1.16b,#64 4874 movi v2.16b,#128 4875 movi v3.16b,#192 4876 mov v0.s[0],w6 4877 4878 sub v1.16b,v0.16b,v1.16b 4879 sub v2.16b,v0.16b,v2.16b 4880 sub v3.16b,v0.16b,v3.16b 4881 4882 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4883 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4884 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4885 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4886 4887 mov w6,v0.s[0] 4888 mov w7,v1.s[0] 4889 mov w9,v2.s[0] 4890 add w7,w6,w7 4891 mov w6,v3.s[0] 4892 add w7,w7,w9 4893 add w7,w7,w6 4894 4895 eor w6,w7,w7,ror #32-2 4896 eor w6,w6,w7,ror #32-10 4897 eor w6,w6,w7,ror #32-18 4898 eor w6,w6,w7,ror #32-24 4899 eor w12,w12,w6 4900 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4901 eor w6,w14,w15 4902 eor w9,w12,w8 4903 eor w6,w6,w9 4904 movi v1.16b,#64 4905 movi v2.16b,#128 4906 movi v3.16b,#192 4907 mov v0.s[0],w6 4908 4909 sub v1.16b,v0.16b,v1.16b 4910 sub v2.16b,v0.16b,v2.16b 4911 sub v3.16b,v0.16b,v3.16b 4912 4913 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4914 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4915 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4916 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4917 4918 mov w6,v0.s[0] 4919 mov w7,v1.s[0] 4920 mov w9,v2.s[0] 4921 add w7,w6,w7 4922 mov w6,v3.s[0] 4923 add w7,w7,w9 4924 add w7,w7,w6 4925 4926 eor w6,w7,w7,ror #32-2 4927 eor w6,w6,w7,ror #32-10 4928 eor w6,w6,w7,ror #32-18 4929 eor w6,w6,w7,ror 
#32-24 4930 ldp w7,w8,[x10],8 4931 eor w13,w13,w6 4932 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4933 eor w6,w12,w13 4934 eor w9,w7,w15 4935 eor w6,w6,w9 4936 movi v1.16b,#64 4937 movi v2.16b,#128 4938 movi v3.16b,#192 4939 mov v0.s[0],w6 4940 4941 sub v1.16b,v0.16b,v1.16b 4942 sub v2.16b,v0.16b,v2.16b 4943 sub v3.16b,v0.16b,v3.16b 4944 4945 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4946 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4947 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4948 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4949 4950 mov w6,v0.s[0] 4951 mov w7,v1.s[0] 4952 mov w9,v2.s[0] 4953 add w7,w6,w7 4954 mov w6,v3.s[0] 4955 add w7,w7,w9 4956 add w7,w7,w6 4957 4958 eor w6,w7,w7,ror #32-2 4959 eor w6,w6,w7,ror #32-10 4960 eor w6,w6,w7,ror #32-18 4961 eor w6,w6,w7,ror #32-24 4962 eor w14,w14,w6 4963 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4964 eor w6,w12,w13 4965 eor w9,w14,w8 4966 eor w6,w6,w9 4967 movi v1.16b,#64 4968 movi v2.16b,#128 4969 movi v3.16b,#192 4970 mov v0.s[0],w6 4971 4972 sub v1.16b,v0.16b,v1.16b 4973 sub v2.16b,v0.16b,v2.16b 4974 sub v3.16b,v0.16b,v3.16b 4975 4976 tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b 4977 tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b 4978 tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b 4979 tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b 4980 4981 mov w6,v0.s[0] 4982 mov w7,v1.s[0] 4983 mov w9,v2.s[0] 4984 add w7,w6,w7 4985 mov w6,v3.s[0] 4986 add w7,w7,w9 4987 add w7,w7,w6 4988 4989 eor w6,w7,w7,ror #32-2 4990 eor w6,w6,w7,ror #32-10 4991 eor w6,w6,w7,ror #32-18 4992 eor w6,w6,w7,ror #32-24 4993 eor w15,w15,w6 4994 subs w11,w11,#1 4995 b.ne 10b 4996 mov v4.s[0],w15 4997 mov v4.s[1],w14 4998 mov v4.s[2],w13 4999 mov v4.s[3],w12 5000 #ifndef __AARCH64EB__ 5001 rev32 v4.16b,v4.16b 5002 #endif 5003 eor v4.16b, v4.16b, v10.16b 5004 st1 {v4.4s}, [x26] 5005 .return: 5006 ldp d14, d15, [sp], #0x10 5007 ldp d12, d13, [sp], #0x10 5008 ldp d10, d11, [sp], #0x10 5009 ldp d8, d9, [sp], 
#0x10 5010 ldp x29, x30, [sp], #0x10 5011 ldp x27, x28, [sp], #0x10 5012 ldp x25, x26, [sp], #0x10 5013 ldp x23, x24, [sp], #0x10 5014 ldp x21, x22, [sp], #0x10 5015 ldp x19, x20, [sp], #0x10 5016 ldp x17, x18, [sp], #0x10 5017 ldp x15, x16, [sp], #0x10 5018 AARCH64_VALIDATE_LINK_REGISTER 5019 ret 5020 .size vpsm4_xts_encrypt,.-vpsm4_xts_encrypt 5021