#include "arm_arch.h"

.section .rodata

// Constant tables for vector-permutation AES (vpaes).  Every 16-byte row
// is a TBL (vpshufb-equivalent) lookup table or permutation; the original
// x86-64 register comments are retained throughout the file.  The tables
// are reached PC-relatively via adrp/:lo12: pairs, so this code is PIC.
.type	_vpaes_consts,%object
.align	7	// totally strategic alignment (2^7 = 128-byte boundary)
_vpaes_consts:
.Lk_mc_forward:	// mc_forward: MixColumns rotate-forward permutations, one per round mod 4
.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
.quad	0x080B0A0904070605, 0x000302010C0F0E0D
.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	// mc_backward: MixColumns rotate-backward permutations
.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
.quad	0x020100030E0D0C0F, 0x0A09080B06050407
.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:	// sr: ShiftRows permutations, indexed by round mod 4
.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad	0x030E09040F0A0500, 0x0B06010C07020D08
.quad	0x0F060D040B020900, 0x070E050C030A0108
.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv:	// inv, inva
.quad	0x0E05060F0D080180, 0x040703090A0B0C02
.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
//  Decryption stuff
//
.Lk_dipt:	// decryption input transform
.quad	0x0F505B040B545F00, 0x154A411E114E451A
.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
.quad	0xD022649296B44200, 0x602646F6B0F2D404
.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
//  Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	// rcon
.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

// "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)\0"
.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2
.size	_vpaes_consts,.-_vpaes_consts
.align	6

.text

//
//  _aes_preheat
//
//  Fills register %r10 -> .aes_consts (so you can -fPIC)
//  and %xmm9-%xmm15 as specified below.
//
//  AArch64 layout loaded below: v17 = 0x0f nibble mask, v18/v19 = .Lk_inv,
//  v20-v23 = .Lk_ipt/.Lk_sbo, v24-v27 = .Lk_sb1/.Lk_sb2.  These registers
//  are expected live by _vpaes_encrypt_core / _vpaes_encrypt_2x.
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adrp	x10, .Lk_inv			// PC-relative page of the table base
	add	x10, x10, #:lo12:.Lk_inv
	movi	v17.16b, #0x0f			// low-nibble mask used by every round
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

//
//  _aes_encrypt_core
//
//  AES-encrypt %xmm0.
//
//  Inputs:
//     %xmm0 = input
//     %xmm9-%xmm15 as in _vpaes_preheat
//    (%rdx) = scheduled keys
//
//  Output in %xmm0
//  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
//  Preserves %xmm6 - %xmm8 so you get some local vectors
//
//  AArch64 mapping (from the code below): input/output block in v7 -> v0,
//  round keys read from [x2] (rounds count at [x2,#240]), tables in
//  v17-v27 as set up by _vpaes_encrypt_preheat.
//
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16		// x11 walks the mc_forward rows (round mod 4)
	add	x11, x11, #:lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Lenc_entry

.align 4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40			// x10 -> matching mc_backward row
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10),	%xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and	$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80			// x10 -> .Lk_sr row for final ShiftRows
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

// void vpaes_encrypt(const u8 *in [x0], u8 *out [x1], const AES_KEY *key [x2])
// One-block wrapper: load block into v7, run preheat + core, store v0.
.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!		// non-leaf: save FP/LR, keep sp 16-aligned
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt

// Two-block variant of _vpaes_encrypt_core: blocks in v14/v15, outputs in
// v0/v1; the second block's state is carried in v8-v13, interleaved
// instruction-for-instruction with the single-block sequence.
.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, #:lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	tbl	v9.16b, {v20.16b}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Lenc_2x_entry

.align 4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	tbl	v8.16b, {v24.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10),	%xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)		// and	$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1			// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	tbl	v8.16b, {v23.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b		// vpshufb	%xmm1,	%xmm0,	%xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

// Decryption table preheat: v17 = 0x0f mask, v18/v19 = .Lk_inv,
// v20-v31 = .Lk_dipt/.Lk_dsbo/.Lk_dsb9/.Lk_dsbd/.Lk_dsbb/.Lk_dsbe,
// as expected by _vpaes_decrypt_core.
.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, #:lo12:.Lk_inv
	movi	v17.16b, #0x0f
	adrp	x11, .Lk_dipt
	add	x11, x11, #:lo12:.Lk_dipt
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

//
//  Decryption core
//
//  Same API as encryption core.
//
//  Block in v7, output in v0; round keys at [x2], rounds at [x2,#240];
//  tables in v17-v31 as loaded by _vpaes_decrypt_preheat.
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor		$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, #:lo12:.Lk_sr
	and	x11, x11, #0x30			// and		$0x30,	%r11
	add	x11, x11, x10			// x11 -> .Lk_sr row selected by rounds
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, #:lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Ldec_entry

.align 4
.Ldec_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5	(rotate MC permutation)
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	sub	w8, w8, #1			// sub	$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
AARCH64_SIGN_LINK_REGISTER 444 1.1 christos stp x29,x30,[sp,#-16]! 445 1.1 christos add x29,sp,#0 446 1.1 christos 447 1.1 christos ld1 {v7.16b}, [x0] 448 1.1 christos bl _vpaes_decrypt_preheat 449 1.1 christos bl _vpaes_decrypt_core 450 1.1 christos st1 {v0.16b}, [x1] 451 1.1 christos 452 1.1 christos ldp x29,x30,[sp],#16 453 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 454 1.1 christos ret 455 1.1 christos .size vpaes_decrypt,.-vpaes_decrypt 456 1.1 christos 457 1.1 christos // v14-v15 input, v0-v1 output 458 1.1 christos .type _vpaes_decrypt_2x,%function 459 1.1 christos .align 4 460 1.1 christos _vpaes_decrypt_2x: 461 1.1 christos mov x9, x2 462 1.1 christos ldr w8, [x2,#240] // pull rounds 463 1.1 christos 464 1.1 christos // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 465 1.1 christos lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 466 1.1 christos eor x11, x11, #0x30 // xor $0x30, %r11 467 1.2 christos adrp x10, .Lk_sr 468 1.2 christos add x10, x10, #:lo12:.Lk_sr 469 1.1 christos and x11, x11, #0x30 // and $0x30, %r11 470 1.1 christos add x11, x11, x10 471 1.2 christos adrp x10, .Lk_mc_forward+48 472 1.2 christos add x10, x10, #:lo12:.Lk_mc_forward+48 473 1.1 christos 474 1.1 christos ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 475 1.1 christos and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 476 1.1 christos ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 477 1.1 christos and v9.16b, v15.16b, v17.16b 478 1.1 christos ushr v8.16b, v15.16b, #4 479 1.1 christos tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 480 1.1 christos tbl v10.16b, {v20.16b},v9.16b 481 1.1 christos ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 482 1.1 christos // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 483 1.1 christos tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 484 1.1 christos tbl v8.16b, {v21.16b},v8.16b 485 1.1 christos eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 486 1.1 christos eor v10.16b, v10.16b, 
v16.16b 487 1.1 christos eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 488 1.1 christos eor v8.16b, v8.16b, v10.16b 489 1.1 christos b .Ldec_2x_entry 490 1.1 christos 491 1.1 christos .align 4 492 1.1 christos .Ldec_2x_loop: 493 1.1 christos // 494 1.1 christos // Inverse mix columns 495 1.1 christos // 496 1.1 christos // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 497 1.1 christos // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 498 1.1 christos tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 499 1.1 christos tbl v12.16b, {v24.16b}, v10.16b 500 1.1 christos tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 501 1.1 christos tbl v9.16b, {v25.16b}, v11.16b 502 1.1 christos eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 503 1.1 christos eor v8.16b, v12.16b, v16.16b 504 1.1 christos // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 505 1.1 christos eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 506 1.1 christos eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 507 1.1 christos // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 508 1.1 christos 509 1.1 christos tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 510 1.1 christos tbl v12.16b, {v26.16b}, v10.16b 511 1.1 christos tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 512 1.1 christos tbl v8.16b, {v8.16b},v5.16b 513 1.1 christos tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 514 1.1 christos tbl v9.16b, {v27.16b}, v11.16b 515 1.1 christos eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 516 1.1 christos eor v8.16b, v8.16b, v12.16b 517 1.1 christos // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 518 1.1 christos eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 519 1.1 christos eor v8.16b, v8.16b, v9.16b 520 1.1 christos // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 521 1.1 christos 522 1.1 christos tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 523 
1.1 christos tbl v12.16b, {v28.16b}, v10.16b 524 1.1 christos tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 525 1.1 christos tbl v8.16b, {v8.16b},v5.16b 526 1.1 christos tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 527 1.1 christos tbl v9.16b, {v29.16b}, v11.16b 528 1.1 christos eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 529 1.1 christos eor v8.16b, v8.16b, v12.16b 530 1.1 christos // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 531 1.1 christos eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 532 1.1 christos eor v8.16b, v8.16b, v9.16b 533 1.1 christos // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 534 1.1 christos 535 1.1 christos tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 536 1.1 christos tbl v12.16b, {v30.16b}, v10.16b 537 1.1 christos tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 538 1.1 christos tbl v8.16b, {v8.16b},v5.16b 539 1.1 christos tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 540 1.1 christos tbl v9.16b, {v31.16b}, v11.16b 541 1.1 christos eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 542 1.1 christos eor v8.16b, v8.16b, v12.16b 543 1.1 christos ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 544 1.1 christos eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 545 1.1 christos eor v8.16b, v8.16b, v9.16b 546 1.1 christos sub w8, w8, #1 // sub $1,%rax # nr-- 547 1.1 christos 548 1.1 christos .Ldec_2x_entry: 549 1.1 christos // top of round 550 1.1 christos and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 551 1.1 christos ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 552 1.1 christos and v9.16b, v8.16b, v17.16b 553 1.1 christos ushr v8.16b, v8.16b, #4 554 1.1 christos tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 555 1.1 christos tbl v10.16b, {v19.16b},v9.16b 556 1.1 christos eor v1.16b, v1.16b, v0.16b 
// vpxor %xmm0, %xmm1, %xmm1 # 0 = j 557 1.1 christos eor v9.16b, v9.16b, v8.16b 558 1.1 christos tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 559 1.1 christos tbl v11.16b, {v18.16b},v8.16b 560 1.1 christos tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 561 1.1 christos tbl v12.16b, {v18.16b},v9.16b 562 1.1 christos eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 563 1.1 christos eor v11.16b, v11.16b, v10.16b 564 1.1 christos eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 565 1.1 christos eor v12.16b, v12.16b, v10.16b 566 1.1 christos tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 567 1.1 christos tbl v10.16b, {v18.16b},v11.16b 568 1.1 christos tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 569 1.1 christos tbl v11.16b, {v18.16b},v12.16b 570 1.1 christos eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 571 1.1 christos eor v10.16b, v10.16b, v9.16b 572 1.1 christos eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 573 1.1 christos eor v11.16b, v11.16b, v8.16b 574 1.1 christos ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 575 1.1 christos cbnz w8, .Ldec_2x_loop 576 1.1 christos 577 1.1 christos // middle of last round 578 1.1 christos // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 579 1.1 christos tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 580 1.1 christos tbl v12.16b, {v22.16b}, v10.16b 581 1.1 christos // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 582 1.1 christos tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 583 1.1 christos tbl v9.16b, {v23.16b}, v11.16b 584 1.1 christos ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 585 1.1 christos eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 586 1.1 christos eor v12.16b, v12.16b, v16.16b 587 1.1 christos eor v0.16b, v1.16b, v4.16b // vpxor 
%xmm4, %xmm1, %xmm0 # 0 = A 588 1.1 christos eor v8.16b, v9.16b, v12.16b 589 1.1 christos tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 590 1.1 christos tbl v1.16b, {v8.16b},v2.16b 591 1.1 christos ret 592 1.1 christos .size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x 593 1.1 christos //////////////////////////////////////////////////////// 594 1.1 christos // // 595 1.1 christos // AES key schedule // 596 1.1 christos // // 597 1.1 christos //////////////////////////////////////////////////////// 598 1.1 christos .type _vpaes_key_preheat,%function 599 1.1 christos .align 4 600 1.1 christos _vpaes_key_preheat: 601 1.2 christos adrp x10, .Lk_inv 602 1.2 christos add x10, x10, #:lo12:.Lk_inv 603 1.1 christos movi v16.16b, #0x5b // .Lk_s63 604 1.2 christos adrp x11, .Lk_sb1 605 1.2 christos add x11, x11, #:lo12:.Lk_sb1 606 1.1 christos movi v17.16b, #0x0f // .Lk_s0F 607 1.1 christos ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt 608 1.2 christos adrp x10, .Lk_dksd 609 1.2 christos add x10, x10, #:lo12:.Lk_dksd 610 1.1 christos ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 611 1.2 christos adrp x11, .Lk_mc_forward 612 1.2 christos add x11, x11, #:lo12:.Lk_mc_forward 613 1.1 christos ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb 614 1.1 christos ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 615 1.1 christos ld1 {v8.2d}, [x10] // .Lk_rcon 616 1.1 christos ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] 617 1.1 christos ret 618 1.1 christos .size _vpaes_key_preheat,.-_vpaes_key_preheat 619 1.1 christos 620 1.1 christos .type _vpaes_schedule_core,%function 621 1.1 christos .align 4 622 1.1 christos _vpaes_schedule_core: 623 1.2 christos AARCH64_SIGN_LINK_REGISTER 624 1.1 christos stp x29, x30, [sp,#-16]! 
625 1.1 christos add x29,sp,#0 626 1.1 christos 627 1.1 christos bl _vpaes_key_preheat // load the tables 628 1.1 christos 629 1.1 christos ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) 630 1.1 christos 631 1.1 christos // input transform 632 1.1 christos mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 633 1.1 christos bl _vpaes_schedule_transform 634 1.1 christos mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 635 1.1 christos 636 1.2 christos adrp x10, .Lk_sr 637 1.2 christos add x10, x10, #:lo12:.Lk_sr 638 1.1 christos add x8, x8, x10 639 1.1 christos cbnz w3, .Lschedule_am_decrypting 640 1.1 christos 641 1.1 christos // encrypting, output zeroth round key after transform 642 1.1 christos st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) 643 1.1 christos b .Lschedule_go 644 1.1 christos 645 1.1 christos .Lschedule_am_decrypting: 646 1.1 christos // decrypting, output zeroth round key after shiftrows 647 1.1 christos ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 648 1.1 christos tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 649 1.1 christos st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) 650 1.1 christos eor x8, x8, #0x30 // xor $0x30, %r8 651 1.1 christos 652 1.1 christos .Lschedule_go: 653 1.1 christos cmp w1, #192 // cmp $192, %esi 654 1.1 christos b.hi .Lschedule_256 655 1.1 christos b.eq .Lschedule_192 656 1.1 christos // 128: fall though 657 1.1 christos 658 1.1 christos // 659 1.1 christos // .schedule_128 660 1.1 christos // 661 1.1 christos // 128-bit specific part of key schedule. 662 1.1 christos // 663 1.1 christos // This schedule is really simple, because all its parts 664 1.1 christos // are accomplished by the subroutines. 
665 1.1 christos // 666 1.1 christos .Lschedule_128: 667 1.1 christos mov x0, #10 // mov $10, %esi 668 1.1 christos 669 1.1 christos .Loop_schedule_128: 670 1.1 christos sub x0, x0, #1 // dec %esi 671 1.1 christos bl _vpaes_schedule_round 672 1.1 christos cbz x0, .Lschedule_mangle_last 673 1.1 christos bl _vpaes_schedule_mangle // write output 674 1.1 christos b .Loop_schedule_128 675 1.1 christos 676 1.1 christos // 677 1.1 christos // .aes_schedule_192 678 1.1 christos // 679 1.1 christos // 192-bit specific part of key schedule. 680 1.1 christos // 681 1.1 christos // The main body of this schedule is the same as the 128-bit 682 1.1 christos // schedule, but with more smearing. The long, high side is 683 1.1 christos // stored in %xmm7 as before, and the short, low side is in 684 1.1 christos // the high bits of %xmm6. 685 1.1 christos // 686 1.1 christos // This schedule is somewhat nastier, however, because each 687 1.1 christos // round produces 192 bits of key material, or 1.5 round keys. 688 1.1 christos // Therefore, on each cycle we do 2 rounds and produce 3 round 689 1.1 christos // keys. 
690 1.1 christos // 691 1.1 christos .align 4 692 1.1 christos .Lschedule_192: 693 1.1 christos sub x0, x0, #8 694 1.1 christos ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) 695 1.1 christos bl _vpaes_schedule_transform // input transform 696 1.1 christos mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part 697 1.1 christos eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 698 1.1 christos ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros 699 1.1 christos mov x0, #4 // mov $4, %esi 700 1.1 christos 701 1.1 christos .Loop_schedule_192: 702 1.1 christos sub x0, x0, #1 // dec %esi 703 1.1 christos bl _vpaes_schedule_round 704 1.1 christos ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 705 1.1 christos bl _vpaes_schedule_mangle // save key n 706 1.1 christos bl _vpaes_schedule_192_smear 707 1.1 christos bl _vpaes_schedule_mangle // save key n+1 708 1.1 christos bl _vpaes_schedule_round 709 1.1 christos cbz x0, .Lschedule_mangle_last 710 1.1 christos bl _vpaes_schedule_mangle // save key n+2 711 1.1 christos bl _vpaes_schedule_192_smear 712 1.1 christos b .Loop_schedule_192 713 1.1 christos 714 1.1 christos // 715 1.1 christos // .aes_schedule_256 716 1.1 christos // 717 1.1 christos // 256-bit specific part of key schedule. 718 1.1 christos // 719 1.1 christos // The structure here is very similar to the 128-bit 720 1.1 christos // schedule, but with an additional "low side" in 721 1.1 christos // %xmm6. The low side's rounds are the same as the 722 1.1 christos // high side's, except no rcon and no rotation. 
723 1.1 christos // 724 1.1 christos .align 4 725 1.1 christos .Lschedule_256: 726 1.1 christos ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) 727 1.1 christos bl _vpaes_schedule_transform // input transform 728 1.1 christos mov x0, #7 // mov $7, %esi 729 1.1 christos 730 1.1 christos .Loop_schedule_256: 731 1.1 christos sub x0, x0, #1 // dec %esi 732 1.1 christos bl _vpaes_schedule_mangle // output low result 733 1.1 christos mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 734 1.1 christos 735 1.1 christos // high round 736 1.1 christos bl _vpaes_schedule_round 737 1.1 christos cbz x0, .Lschedule_mangle_last 738 1.1 christos bl _vpaes_schedule_mangle 739 1.1 christos 740 1.1 christos // low round. swap xmm7 and xmm6 741 1.1 christos dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 742 1.1 christos movi v4.16b, #0 743 1.1 christos mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 744 1.1 christos mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 745 1.1 christos bl _vpaes_schedule_low_round 746 1.1 christos mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 747 1.1 christos 748 1.1 christos b .Loop_schedule_256 749 1.1 christos 750 1.1 christos // 751 1.1 christos // .aes_schedule_mangle_last 752 1.1 christos // 753 1.1 christos // Mangler for last round of key schedule 754 1.1 christos // Mangles %xmm0 755 1.1 christos // when encrypting, outputs out(%xmm0) ^ 63 756 1.1 christos // when decrypting, outputs unskew(%xmm0) 757 1.1 christos // 758 1.1 christos // Always called right before return... 
jumps to cleanup and exits 759 1.1 christos // 760 1.1 christos .align 4 761 1.1 christos .Lschedule_mangle_last: 762 1.1 christos // schedule last round key from xmm0 763 1.2 christos adrp x11, .Lk_deskew 764 1.2 christos add x11, x11, #:lo12:.Lk_deskew 765 1.1 christos cbnz w3, .Lschedule_mangle_last_dec 766 1.1 christos 767 1.1 christos // encrypting 768 1.1 christos ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 769 1.2 christos adrp x11, .Lk_opt 770 1.2 christos add x11, x11, #:lo12:.Lk_opt 771 1.1 christos add x2, x2, #32 // add $32, %rdx 772 1.1 christos tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute 773 1.1 christos 774 1.1 christos .Lschedule_mangle_last_dec: 775 1.1 christos ld1 {v20.2d,v21.2d}, [x11] // reload constants 776 1.1 christos sub x2, x2, #16 // add $-16, %rdx 777 1.1 christos eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 778 1.1 christos bl _vpaes_schedule_transform // output transform 779 1.1 christos st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key 780 1.1 christos 781 1.1 christos // cleanup 782 1.1 christos eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 783 1.1 christos eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 784 1.1 christos eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 785 1.1 christos eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 786 1.1 christos eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 787 1.1 christos eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 788 1.1 christos eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 789 1.1 christos eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 790 1.1 christos ldp x29, x30, [sp],#16 791 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 792 1.1 christos ret 793 1.1 christos .size _vpaes_schedule_core,.-_vpaes_schedule_core 794 1.1 christos 795 1.1 christos // 796 1.1 christos // .aes_schedule_192_smear 797 1.1 christos // 798 1.1 christos // Smear the short, low side in 
the 192-bit key schedule. 799 1.1 christos // 800 1.1 christos // Inputs: 801 1.1 christos // %xmm7: high side, b a x y 802 1.1 christos // %xmm6: low side, d c 0 0 803 1.1 christos // %xmm13: 0 804 1.1 christos // 805 1.1 christos // Outputs: 806 1.1 christos // %xmm6: b+c+d b+c 0 0 807 1.1 christos // %xmm0: b+c+d b+c b a 808 1.1 christos // 809 1.1 christos .type _vpaes_schedule_192_smear,%function 810 1.1 christos .align 4 811 1.1 christos _vpaes_schedule_192_smear: 812 1.1 christos movi v1.16b, #0 813 1.1 christos dup v0.4s, v7.s[3] 814 1.1 christos ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 815 1.1 christos ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a 816 1.1 christos eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 817 1.1 christos eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 818 1.1 christos eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a 819 1.1 christos mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 820 1.1 christos ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros 821 1.1 christos ret 822 1.1 christos .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear 823 1.1 christos 824 1.1 christos // 825 1.1 christos // .aes_schedule_round 826 1.1 christos // 827 1.1 christos // Runs one main round of the key schedule on %xmm0, %xmm7 828 1.1 christos // 829 1.1 christos // Specifically, runs subbytes on the high dword of %xmm0 830 1.1 christos // then rotates it by one byte and xors into the low dword of 831 1.1 christos // %xmm7. 832 1.1 christos // 833 1.1 christos // Adds rcon from low byte of %xmm8, then rotates %xmm8 for 834 1.1 christos // next rcon. 835 1.1 christos // 836 1.1 christos // Smears the dwords of %xmm7 by xoring the low into the 837 1.1 christos // second low, result into third, result into highest. 838 1.1 christos // 839 1.1 christos // Returns results in %xmm7 = %xmm0. 
840 1.1 christos // Clobbers %xmm1-%xmm4, %r11. 841 1.1 christos // 842 1.1 christos .type _vpaes_schedule_round,%function 843 1.1 christos .align 4 844 1.1 christos _vpaes_schedule_round: 845 1.1 christos // extract rcon from xmm8 846 1.1 christos movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 847 1.1 christos ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 848 1.1 christos ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 849 1.1 christos eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 850 1.1 christos 851 1.1 christos // rotate 852 1.1 christos dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 853 1.1 christos ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 854 1.1 christos 855 1.1 christos // fall through... 856 1.1 christos 857 1.1 christos // low round: same as high round, but no rotation and no rcon. 858 1.1 christos _vpaes_schedule_low_round: 859 1.1 christos // smear xmm7 860 1.1 christos ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 861 1.1 christos eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 862 1.1 christos ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 863 1.1 christos 864 1.1 christos // subbytes 865 1.1 christos and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 866 1.1 christos ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 867 1.1 christos eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 868 1.1 christos tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 869 1.1 christos eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 870 1.1 christos tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 871 1.1 christos eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 872 1.1 christos tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 873 1.1 christos eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 874 1.1 
christos tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak 875 1.1 christos eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 876 1.1 christos tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak 877 1.1 christos eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io 878 1.1 christos eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo 879 1.1 christos tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou 880 1.1 christos tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t 881 1.1 christos eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output 882 1.1 christos 883 1.1 christos // add in smeared stuff 884 1.1 christos eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 885 1.1 christos eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 886 1.1 christos ret 887 1.1 christos .size _vpaes_schedule_round,.-_vpaes_schedule_round 888 1.1 christos 889 1.1 christos // 890 1.1 christos // .aes_schedule_transform 891 1.1 christos // 892 1.1 christos // Linear-transform %xmm0 according to tables at (%r11) 893 1.1 christos // 894 1.1 christos // Requires that %xmm9 = 0x0F0F... 
as in preheat 895 1.1 christos // Output in %xmm0 896 1.1 christos // Clobbers %xmm1, %xmm2 897 1.1 christos // 898 1.1 christos .type _vpaes_schedule_transform,%function 899 1.1 christos .align 4 900 1.1 christos _vpaes_schedule_transform: 901 1.1 christos and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 902 1.1 christos ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 903 1.1 christos // vmovdqa (%r11), %xmm2 # lo 904 1.1 christos tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 905 1.1 christos // vmovdqa 16(%r11), %xmm1 # hi 906 1.1 christos tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 907 1.1 christos eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 908 1.1 christos ret 909 1.1 christos .size _vpaes_schedule_transform,.-_vpaes_schedule_transform 910 1.1 christos 911 1.1 christos // 912 1.1 christos // .aes_schedule_mangle 913 1.1 christos // 914 1.1 christos // Mangle xmm0 from (basis-transformed) standard version 915 1.1 christos // to our version. 
916 1.1 christos // 917 1.1 christos // On encrypt, 918 1.1 christos // xor with 0x63 919 1.1 christos // multiply by circulant 0,1,1,1 920 1.1 christos // apply shiftrows transform 921 1.1 christos // 922 1.1 christos // On decrypt, 923 1.1 christos // xor with 0x63 924 1.1 christos // multiply by "inverse mixcolumns" circulant E,B,D,9 925 1.1 christos // deskew 926 1.1 christos // apply shiftrows transform 927 1.1 christos // 928 1.1 christos // 929 1.1 christos // Writes out to (%rdx), and increments or decrements it 930 1.1 christos // Keeps track of round number mod 4 in %r8 931 1.1 christos // Preserves xmm0 932 1.1 christos // Clobbers xmm1-xmm5 933 1.1 christos // 934 1.1 christos .type _vpaes_schedule_mangle,%function 935 1.1 christos .align 4 936 1.1 christos _vpaes_schedule_mangle: 937 1.1 christos mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later 938 1.1 christos // vmovdqa .Lk_mc_forward(%rip),%xmm5 939 1.1 christos cbnz w3, .Lschedule_mangle_dec 940 1.1 christos 941 1.1 christos // encrypting 942 1.1 christos eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 943 1.1 christos add x2, x2, #16 // add $16, %rdx 944 1.1 christos tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 945 1.1 christos tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 946 1.1 christos tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 947 1.1 christos eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 948 1.1 christos ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 949 1.1 christos eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 950 1.1 christos 951 1.1 christos b .Lschedule_mangle_both 952 1.1 christos .align 4 953 1.1 christos .Lschedule_mangle_dec: 954 1.1 christos // inverse mix columns 955 1.1 christos // lea .Lk_dksd(%rip),%r11 956 1.1 christos ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi 957 1.1 christos and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo 958 1.1 christos 959 
1.1 christos // vmovdqa 0x00(%r11), %xmm2 960 1.1 christos tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 961 1.1 christos // vmovdqa 0x10(%r11), %xmm3 962 1.1 christos tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 963 1.1 christos eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 964 1.1 christos tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 965 1.1 christos 966 1.1 christos // vmovdqa 0x20(%r11), %xmm2 967 1.1 christos tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 968 1.1 christos eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 969 1.1 christos // vmovdqa 0x30(%r11), %xmm3 970 1.1 christos tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 971 1.1 christos eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 972 1.1 christos tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 973 1.1 christos 974 1.1 christos // vmovdqa 0x40(%r11), %xmm2 975 1.1 christos tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 976 1.1 christos eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 977 1.1 christos // vmovdqa 0x50(%r11), %xmm3 978 1.1 christos tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 979 1.1 christos eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 980 1.1 christos 981 1.1 christos // vmovdqa 0x60(%r11), %xmm2 982 1.1 christos tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 983 1.1 christos tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 984 1.1 christos // vmovdqa 0x70(%r11), %xmm4 985 1.1 christos tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 986 1.1 christos ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 987 1.1 christos eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 988 1.1 christos eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 989 1.1 christos 990 1.1 christos sub x2, x2, #16 // add $-16, %rdx 991 1.1 christos 992 1.1 christos .Lschedule_mangle_both: 993 1.1 christos tbl v3.16b, 
{v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 994 1.1 christos add x8, x8, #64-16 // add $-16, %r8 995 1.1 christos and x8, x8, #~(1<<6) // and $0x30, %r8 996 1.1 christos st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) 997 1.1 christos ret 998 1.1 christos .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle 999 1.1 christos 1000 1.1 christos .globl vpaes_set_encrypt_key 1001 1.1 christos .type vpaes_set_encrypt_key,%function 1002 1.1 christos .align 4 1003 1.1 christos vpaes_set_encrypt_key: 1004 1.2 christos AARCH64_SIGN_LINK_REGISTER 1005 1.1 christos stp x29,x30,[sp,#-16]! 1006 1.1 christos add x29,sp,#0 1007 1.1 christos stp d8,d9,[sp,#-16]! // ABI spec says so 1008 1.1 christos 1009 1.1 christos lsr w9, w1, #5 // shr $5,%eax 1010 1.1 christos add w9, w9, #5 // $5,%eax 1011 1.1 christos str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1012 1.1 christos 1013 1.1 christos mov w3, #0 // mov $0,%ecx 1014 1.1 christos mov x8, #0x30 // mov $0x30,%r8d 1015 1.1 christos bl _vpaes_schedule_core 1016 1.1 christos eor x0, x0, x0 1017 1.1 christos 1018 1.1 christos ldp d8,d9,[sp],#16 1019 1.1 christos ldp x29,x30,[sp],#16 1020 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 1021 1.1 christos ret 1022 1.1 christos .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key 1023 1.1 christos 1024 1.1 christos .globl vpaes_set_decrypt_key 1025 1.1 christos .type vpaes_set_decrypt_key,%function 1026 1.1 christos .align 4 1027 1.1 christos vpaes_set_decrypt_key: 1028 1.2 christos AARCH64_SIGN_LINK_REGISTER 1029 1.1 christos stp x29,x30,[sp,#-16]! 1030 1.1 christos add x29,sp,#0 1031 1.1 christos stp d8,d9,[sp,#-16]! 
// ABI spec says so 1032 1.1 christos 1033 1.1 christos lsr w9, w1, #5 // shr $5,%eax 1034 1.1 christos add w9, w9, #5 // $5,%eax 1035 1.1 christos str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1036 1.1 christos lsl w9, w9, #4 // shl $4,%eax 1037 1.1 christos add x2, x2, #16 // lea 16(%rdx,%rax),%rdx 1038 1.1 christos add x2, x2, x9 1039 1.1 christos 1040 1.1 christos mov w3, #1 // mov $1,%ecx 1041 1.1 christos lsr w8, w1, #1 // shr $1,%r8d 1042 1.1 christos and x8, x8, #32 // and $32,%r8d 1043 1.1 christos eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 1044 1.1 christos bl _vpaes_schedule_core 1045 1.1 christos 1046 1.1 christos ldp d8,d9,[sp],#16 1047 1.1 christos ldp x29,x30,[sp],#16 1048 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 1049 1.1 christos ret 1050 1.1 christos .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key 1051 1.1 christos .globl vpaes_cbc_encrypt 1052 1.1 christos .type vpaes_cbc_encrypt,%function 1053 1.1 christos .align 4 1054 1.1 christos vpaes_cbc_encrypt: 1055 1.2 christos AARCH64_SIGN_LINK_REGISTER 1056 1.1 christos cbz x2, .Lcbc_abort 1057 1.1 christos cmp w5, #0 // check direction 1058 1.1 christos b.eq vpaes_cbc_decrypt 1059 1.1 christos 1060 1.1 christos stp x29,x30,[sp,#-16]! 
1061 1.1 christos add x29,sp,#0 1062 1.1 christos 1063 1.1 christos mov x17, x2 // reassign 1064 1.1 christos mov x2, x3 // reassign 1065 1.1 christos 1066 1.1 christos ld1 {v0.16b}, [x4] // load ivec 1067 1.1 christos bl _vpaes_encrypt_preheat 1068 1.1 christos b .Lcbc_enc_loop 1069 1.1 christos 1070 1.1 christos .align 4 1071 1.1 christos .Lcbc_enc_loop: 1072 1.1 christos ld1 {v7.16b}, [x0],#16 // load input 1073 1.1 christos eor v7.16b, v7.16b, v0.16b // xor with ivec 1074 1.1 christos bl _vpaes_encrypt_core 1075 1.1 christos st1 {v0.16b}, [x1],#16 // save output 1076 1.1 christos subs x17, x17, #16 1077 1.1 christos b.hi .Lcbc_enc_loop 1078 1.1 christos 1079 1.1 christos st1 {v0.16b}, [x4] // write ivec 1080 1.1 christos 1081 1.1 christos ldp x29,x30,[sp],#16 1082 1.1 christos .Lcbc_abort: 1083 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 1084 1.1 christos ret 1085 1.1 christos .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt 1086 1.1 christos 1087 1.1 christos .type vpaes_cbc_decrypt,%function 1088 1.1 christos .align 4 1089 1.1 christos vpaes_cbc_decrypt: 1090 1.2 christos // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to 1091 1.2 christos // only from vpaes_cbc_encrypt which has already signed the return address. 1092 1.1 christos stp x29,x30,[sp,#-16]! 1093 1.1 christos add x29,sp,#0 1094 1.1 christos stp d8,d9,[sp,#-16]! // ABI spec says so 1095 1.1 christos stp d10,d11,[sp,#-16]! 1096 1.1 christos stp d12,d13,[sp,#-16]! 1097 1.1 christos stp d14,d15,[sp,#-16]! 
1098 1.1 christos 1099 1.1 christos mov x17, x2 // reassign 1100 1.1 christos mov x2, x3 // reassign 1101 1.1 christos ld1 {v6.16b}, [x4] // load ivec 1102 1.1 christos bl _vpaes_decrypt_preheat 1103 1.1 christos tst x17, #16 1104 1.1 christos b.eq .Lcbc_dec_loop2x 1105 1.1 christos 1106 1.1 christos ld1 {v7.16b}, [x0], #16 // load input 1107 1.1 christos bl _vpaes_decrypt_core 1108 1.1 christos eor v0.16b, v0.16b, v6.16b // xor with ivec 1109 1.1 christos orr v6.16b, v7.16b, v7.16b // next ivec value 1110 1.1 christos st1 {v0.16b}, [x1], #16 1111 1.1 christos subs x17, x17, #16 1112 1.1 christos b.ls .Lcbc_dec_done 1113 1.1 christos 1114 1.1 christos .align 4 1115 1.1 christos .Lcbc_dec_loop2x: 1116 1.1 christos ld1 {v14.16b,v15.16b}, [x0], #32 1117 1.1 christos bl _vpaes_decrypt_2x 1118 1.1 christos eor v0.16b, v0.16b, v6.16b // xor with ivec 1119 1.1 christos eor v1.16b, v1.16b, v14.16b 1120 1.1 christos orr v6.16b, v15.16b, v15.16b 1121 1.1 christos st1 {v0.16b,v1.16b}, [x1], #32 1122 1.1 christos subs x17, x17, #32 1123 1.1 christos b.hi .Lcbc_dec_loop2x 1124 1.1 christos 1125 1.1 christos .Lcbc_dec_done: 1126 1.1 christos st1 {v6.16b}, [x4] 1127 1.1 christos 1128 1.1 christos ldp d14,d15,[sp],#16 1129 1.1 christos ldp d12,d13,[sp],#16 1130 1.1 christos ldp d10,d11,[sp],#16 1131 1.1 christos ldp d8,d9,[sp],#16 1132 1.1 christos ldp x29,x30,[sp],#16 1133 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 1134 1.1 christos ret 1135 1.1 christos .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt 1136 1.1 christos .globl vpaes_ecb_encrypt 1137 1.1 christos .type vpaes_ecb_encrypt,%function 1138 1.1 christos .align 4 1139 1.1 christos vpaes_ecb_encrypt: 1140 1.2 christos AARCH64_SIGN_LINK_REGISTER 1141 1.1 christos stp x29,x30,[sp,#-16]! 1142 1.1 christos add x29,sp,#0 1143 1.1 christos stp d8,d9,[sp,#-16]! // ABI spec says so 1144 1.1 christos stp d10,d11,[sp,#-16]! 1145 1.1 christos stp d12,d13,[sp,#-16]! 1146 1.1 christos stp d14,d15,[sp,#-16]! 
1147 1.1 christos 1148 1.1 christos mov x17, x2 1149 1.1 christos mov x2, x3 1150 1.1 christos bl _vpaes_encrypt_preheat 1151 1.1 christos tst x17, #16 1152 1.1 christos b.eq .Lecb_enc_loop 1153 1.1 christos 1154 1.1 christos ld1 {v7.16b}, [x0],#16 1155 1.1 christos bl _vpaes_encrypt_core 1156 1.1 christos st1 {v0.16b}, [x1],#16 1157 1.1 christos subs x17, x17, #16 1158 1.1 christos b.ls .Lecb_enc_done 1159 1.1 christos 1160 1.1 christos .align 4 1161 1.1 christos .Lecb_enc_loop: 1162 1.1 christos ld1 {v14.16b,v15.16b}, [x0], #32 1163 1.1 christos bl _vpaes_encrypt_2x 1164 1.1 christos st1 {v0.16b,v1.16b}, [x1], #32 1165 1.1 christos subs x17, x17, #32 1166 1.1 christos b.hi .Lecb_enc_loop 1167 1.1 christos 1168 1.1 christos .Lecb_enc_done: 1169 1.1 christos ldp d14,d15,[sp],#16 1170 1.1 christos ldp d12,d13,[sp],#16 1171 1.1 christos ldp d10,d11,[sp],#16 1172 1.1 christos ldp d8,d9,[sp],#16 1173 1.1 christos ldp x29,x30,[sp],#16 1174 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 1175 1.1 christos ret 1176 1.1 christos .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt 1177 1.1 christos 1178 1.1 christos .globl vpaes_ecb_decrypt 1179 1.1 christos .type vpaes_ecb_decrypt,%function 1180 1.1 christos .align 4 1181 1.1 christos vpaes_ecb_decrypt: 1182 1.2 christos AARCH64_SIGN_LINK_REGISTER 1183 1.1 christos stp x29,x30,[sp,#-16]! 1184 1.1 christos add x29,sp,#0 1185 1.1 christos stp d8,d9,[sp,#-16]! // ABI spec says so 1186 1.1 christos stp d10,d11,[sp,#-16]! 1187 1.1 christos stp d12,d13,[sp,#-16]! 1188 1.1 christos stp d14,d15,[sp,#-16]! 
1189 1.1 christos 1190 1.1 christos mov x17, x2 1191 1.1 christos mov x2, x3 1192 1.1 christos bl _vpaes_decrypt_preheat 1193 1.1 christos tst x17, #16 1194 1.1 christos b.eq .Lecb_dec_loop 1195 1.1 christos 1196 1.1 christos ld1 {v7.16b}, [x0],#16 1197 1.1 christos bl _vpaes_encrypt_core 1198 1.1 christos st1 {v0.16b}, [x1],#16 1199 1.1 christos subs x17, x17, #16 1200 1.1 christos b.ls .Lecb_dec_done 1201 1.1 christos 1202 1.1 christos .align 4 1203 1.1 christos .Lecb_dec_loop: 1204 1.1 christos ld1 {v14.16b,v15.16b}, [x0], #32 1205 1.1 christos bl _vpaes_decrypt_2x 1206 1.1 christos st1 {v0.16b,v1.16b}, [x1], #32 1207 1.1 christos subs x17, x17, #32 1208 1.1 christos b.hi .Lecb_dec_loop 1209 1.1 christos 1210 1.1 christos .Lecb_dec_done: 1211 1.1 christos ldp d14,d15,[sp],#16 1212 1.1 christos ldp d12,d13,[sp],#16 1213 1.1 christos ldp d10,d11,[sp],#16 1214 1.1 christos ldp d8,d9,[sp],#16 1215 1.1 christos ldp x29,x30,[sp],#16 1216 1.2 christos AARCH64_VALIDATE_LINK_REGISTER 1217 1.1 christos ret 1218 1.1 christos .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt 1219