#include "arm_arch.h"

.section .rodata

.type _vpaes_consts,%object
.align 7 // totally strategic alignment
_vpaes_consts:
.Lk_mc_forward: // mc_forward
    .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
    .quad 0x080B0A0904070605, 0x000302010C0F0E0D
    .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
    .quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward: // mc_backward
    .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
    .quad 0x020100030E0D0C0F, 0x0A09080B06050407
    .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
    .quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr: // sr
    .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
    .quad 0x030E09040F0A0500, 0x0B06010C07020D08
    .quad 0x0F060D040B020900, 0x070E050C030A0108
    .quad 0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv: // inv, inva
    .quad 0x0E05060F0D080180, 0x040703090A0B0C02
    .quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt: // input transform (lo, hi)
    .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
    .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo: // sbou, sbot
    .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
    .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1: // sb1u, sb1t
    .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
    .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2: // sb2u, sb2t
    .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
    .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Decryption stuff
//
.Lk_dipt: // decryption input transform
    .quad 0x0F505B040B545F00, 0x154A411E114E451A
    .quad 0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo: // decryption sbox final output
    .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
    .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9: // decryption sbox output *9*u, *9*t
    .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
    .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd: // decryption sbox output *D*u, *D*t
    .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
    .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb: // decryption sbox output *B*u, *B*t
    .quad 0xD022649296B44200, 0x602646F6B0F2D404
    .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe: // decryption sbox output *E*u, *E*t
    .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
    .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
// Key schedule constants
//
.Lk_dksd: // decryption key schedule: invskew x*D
    .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
    .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb: // decryption key schedule: invskew x*B
    .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
    .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
    .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
    .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9: // decryption key schedule: invskew x*9
    .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
    .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon: // rcon
    .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt: // output transform
    .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
    .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew: // deskew tables: inverts the sbox's "skew"
    .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
    .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

    .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 2
.size _vpaes_consts,.-_vpaes_consts
.align 6

.text

//
// _aes_preheat
//
// Fills register %r10 -> .aes_consts (so you can -fPIC)
// and %xmm9-%xmm15 as specified below.
//
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
    adrp x10, .Lk_inv
    add x10, x10, #:lo12:.Lk_inv
    movi v17.16b, #0x0f
    ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
    ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
    ret
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

//
// _aes_encrypt_core
//
// AES-encrypt %xmm0.
//
// Inputs:
// %xmm0 = input
// %xmm9-%xmm15 as in _vpaes_preheat
// (%rdx) = scheduled keys
//
// Output in %xmm0
// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
// Preserves %xmm6 - %xmm8 so you get some local vectors
//
//
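// In this AArch64 translation (the comments above and below keep the
// original x86 register names): the input block arrives in v7, the
// result is returned in v0, the key schedule pointer is x2, and
// v1-v5, v16, x9-x11 and w8 are clobbered; v6 and v8 are untouched.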
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
    mov x9, x2
    ldr w8, [x2,#240] // pull rounds
    adrp x11, .Lk_mc_forward+16
    add x11, x11, #:lo12:.Lk_mc_forward+16
    // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
    ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
    and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
    // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
    tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
    eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    b .Lenc_entry

.align 4
.Lenc_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
    ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
    ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
    eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
    tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
    tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
    eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
    and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
    sub w8, w8, #1 // nr--

.Lenc_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
    cbnz w8, .Lenc_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
    ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core

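// Single-block encrypt: in = x0, out = x1, key schedule = x2.
// Presumably the usual vpaes C prototype (an assumption, not stated
// anywhere in this file):
//   void vpaes_encrypt(const unsigned char *in, unsigned char *out,
//                      const AES_KEY *key);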
.globl vpaes_encrypt
.type vpaes_encrypt,%function
.align 4
vpaes_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ld1 {v7.16b}, [x0]
    bl _vpaes_encrypt_preheat
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1]

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_encrypt,.-vpaes_encrypt

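// Two-block variant of the encrypt core: v14-v15 input, v0-v1 output,
// same key and clobber conventions otherwise.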
.type _vpaes_encrypt_2x,%function
.align 4
_vpaes_encrypt_2x:
    mov x9, x2
    ldr w8, [x2,#240] // pull rounds
    adrp x11, .Lk_mc_forward+16
    add x11, x11, #:lo12:.Lk_mc_forward+16
    // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
    ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
    and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    and v9.16b, v15.16b, v17.16b
    ushr v8.16b, v15.16b, #4
    tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
    tbl v9.16b, {v20.16b}, v9.16b
    // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
    tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
    tbl v10.16b, {v21.16b}, v8.16b
    eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
    eor v8.16b, v9.16b, v16.16b
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    eor v8.16b, v8.16b, v10.16b
    b .Lenc_2x_entry

.align 4
.Lenc_2x_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
    tbl v12.16b, {v25.16b}, v10.16b
    ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
    tbl v8.16b, {v24.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
    tbl v13.16b, {v27.16b}, v10.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
    tbl v10.16b, {v26.16b}, v11.16b
    ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
    tbl v11.16b, {v8.16b}, v1.16b
    eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
    eor v10.16b, v10.16b, v13.16b
    tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
    tbl v8.16b, {v8.16b}, v4.16b
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
    eor v11.16b, v11.16b, v10.16b
    tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
    tbl v12.16b, {v11.16b},v1.16b
    eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
    eor v8.16b, v8.16b, v11.16b
    and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
    eor v8.16b, v8.16b, v12.16b
    sub w8, w8, #1 // nr--

.Lenc_2x_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    and v9.16b, v8.16b, v17.16b
    ushr v8.16b, v8.16b, #4
    tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
    tbl v13.16b, {v19.16b},v9.16b
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    eor v9.16b, v9.16b, v8.16b
    tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v11.16b, {v18.16b},v8.16b
    tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    tbl v12.16b, {v18.16b},v9.16b
    eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v11.16b, v11.16b, v13.16b
    eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    eor v12.16b, v12.16b, v13.16b
    tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v10.16b, {v18.16b},v11.16b
    tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    tbl v11.16b, {v18.16b},v12.16b
    eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v10.16b, v10.16b, v9.16b
    eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    eor v11.16b, v11.16b, v8.16b
    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
    cbnz w8, .Lenc_2x_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    tbl v12.16b, {v22.16b}, v10.16b
    ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
    tbl v8.16b, {v23.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
    tbl v1.16b, {v8.16b},v1.16b
    ret
.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type _vpaes_decrypt_preheat,%function
.align 4
_vpaes_decrypt_preheat:
    adrp x10, .Lk_inv
    add x10, x10, #:lo12:.Lk_inv
    movi v17.16b, #0x0f
    adrp x11, .Lk_dipt
    add x11, x11, #:lo12:.Lk_dipt
    ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
    ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
    ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
    ret
.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

//
// Decryption core
//
// Same API as encryption core.
//
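// As with encryption: input in v7, output in v0, keys at x2. The
// preamble below also points x11 at the .Lk_sr row selected by the
// round count, which supplies the final output permutation.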
.type _vpaes_decrypt_core,%function
.align 4
_vpaes_decrypt_core:
    mov x9, x2
    ldr w8, [x2,#240] // pull rounds

    // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
    lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
    eor x11, x11, #0x30 // xor $0x30, %r11
    adrp x10, .Lk_sr
    add x10, x10, #:lo12:.Lk_sr
    and x11, x11, #0x30 // and $0x30, %r11
    add x11, x11, x10
    adrp x10, .Lk_mc_forward+48
    add x10, x10, #:lo12:.Lk_mc_forward+48

    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
    and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
    ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
    // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
    tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
    eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    b .Ldec_entry

.align 4
.Ldec_loop:
    //
    // Inverse mix columns
    //
    // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
    // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
    tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
    tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
    eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
    // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

    tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
    tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

    tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
    tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

    tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
    tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    sub w8, w8, #1 // sub $1,%rax # nr--

.Ldec_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
    cbnz w8, .Ldec_loop

    // middle of last round
    // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
    tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
    ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
    tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
    eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
    tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
    ret
.size _vpaes_decrypt_core,.-_vpaes_decrypt_core

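// Single-block decrypt: same register convention as vpaes_encrypt
// (in = x0, out = x1, key schedule = x2).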
.globl vpaes_decrypt
.type vpaes_decrypt,%function
.align 4
vpaes_decrypt:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ld1 {v7.16b}, [x0]
    bl _vpaes_decrypt_preheat
    bl _vpaes_decrypt_core
    st1 {v0.16b}, [x1]

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type _vpaes_decrypt_2x,%function
.align 4
_vpaes_decrypt_2x:
    mov x9, x2
    ldr w8, [x2,#240] // pull rounds

    // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
    lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
    eor x11, x11, #0x30 // xor $0x30, %r11
    adrp x10, .Lk_sr
    add x10, x10, #:lo12:.Lk_sr
    and x11, x11, #0x30 // and $0x30, %r11
    add x11, x11, x10
    adrp x10, .Lk_mc_forward+48
    add x10, x10, #:lo12:.Lk_mc_forward+48

    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
    and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    and v9.16b, v15.16b, v17.16b
    ushr v8.16b, v15.16b, #4
    tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
    tbl v10.16b, {v20.16b},v9.16b
    ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
    // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
    tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
    tbl v8.16b, {v21.16b},v8.16b
    eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
    eor v10.16b, v10.16b, v16.16b
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    eor v8.16b, v8.16b, v10.16b
    b .Ldec_2x_entry

.align 4
.Ldec_2x_loop:
    //
    // Inverse mix columns
    //
    // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
    // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
    tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
    tbl v12.16b, {v24.16b}, v10.16b
    tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
    tbl v9.16b, {v25.16b}, v11.16b
    eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
    eor v8.16b, v12.16b, v16.16b
    // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

    tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
    tbl v12.16b, {v26.16b}, v10.16b
    tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v8.16b, {v8.16b},v5.16b
    tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
    tbl v9.16b, {v27.16b}, v11.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    eor v8.16b, v8.16b, v12.16b
    // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    eor v8.16b, v8.16b, v9.16b
    // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

    tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
    tbl v12.16b, {v28.16b}, v10.16b
    tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v8.16b, {v8.16b},v5.16b
    tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
    tbl v9.16b, {v29.16b}, v11.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    eor v8.16b, v8.16b, v12.16b
    // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    eor v8.16b, v8.16b, v9.16b
    // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

    tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
    tbl v12.16b, {v30.16b}, v10.16b
    tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v8.16b, {v8.16b},v5.16b
    tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
    tbl v9.16b, {v31.16b}, v11.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    eor v8.16b, v8.16b, v12.16b
    ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    eor v8.16b, v8.16b, v9.16b
    sub w8, w8, #1 // sub $1,%rax # nr--

.Ldec_2x_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    and v9.16b, v8.16b, v17.16b
    ushr v8.16b, v8.16b, #4
    tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
    tbl v10.16b, {v19.16b},v9.16b
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    eor v9.16b, v9.16b, v8.16b
    tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v11.16b, {v18.16b},v8.16b
    tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    tbl v12.16b, {v18.16b},v9.16b
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v11.16b, v11.16b, v10.16b
    eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    eor v12.16b, v12.16b, v10.16b
    tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v10.16b, {v18.16b},v11.16b
    tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    tbl v11.16b, {v18.16b},v12.16b
    eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v10.16b, v10.16b, v9.16b
    eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    eor v11.16b, v11.16b, v8.16b
    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
    cbnz w8, .Ldec_2x_loop

    // middle of last round
    // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
    tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    tbl v12.16b, {v22.16b}, v10.16b
    // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
    tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
    tbl v9.16b, {v23.16b}, v11.16b
    ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
    eor v8.16b, v9.16b, v12.16b
    tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
    tbl v1.16b, {v8.16b},v2.16b
    ret
.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
////////////////////////////////////////////////////////
//                                                    //
//                  AES key schedule                  //
//                                                    //
////////////////////////////////////////////////////////
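// Conventions for the schedule code below: v0 carries the key material
// in flight, v7 (plus v6 for 192/256) holds the saved halves, w3 is the
// encrypt(0)/decrypt(1) flag, x2 the output pointer, and x8 the
// rotating .Lk_sr offset consumed by the manglers.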
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
    adrp x10, .Lk_inv
    add x10, x10, #:lo12:.Lk_inv
    movi v16.16b, #0x5b // .Lk_s63
    adrp x11, .Lk_sb1
    add x11, x11, #:lo12:.Lk_sb1
    movi v17.16b, #0x0f // .Lk_s0F
    ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt
    adrp x10, .Lk_dksd
    add x10, x10, #:lo12:.Lk_dksd
    ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1
    adrp x11, .Lk_mc_forward
    add x11, x11, #:lo12:.Lk_mc_forward
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
    ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
    ld1 {v8.2d}, [x10] // .Lk_rcon
    ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
    ret
.size _vpaes_key_preheat,.-_vpaes_key_preheat

.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
    AARCH64_SIGN_LINK_REGISTER
    stp x29, x30, [sp,#-16]!
    add x29,sp,#0

    bl _vpaes_key_preheat // load the tables

    ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)

    // input transform
    mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
    bl _vpaes_schedule_transform
    mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7

    adrp x10, .Lk_sr
    add x10, x10, #:lo12:.Lk_sr
    add x8, x8, x10
    cbnz w3, .Lschedule_am_decrypting

    // encrypting, output zeroth round key after transform
    st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
    b .Lschedule_go

.Lschedule_am_decrypting:
    // decrypting, output zeroth round key after shiftrows
    ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
    tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
    eor x8, x8, #0x30 // xor $0x30, %r8

.Lschedule_go:
    cmp w1, #192 // cmp $192, %esi
    b.hi .Lschedule_256
    b.eq .Lschedule_192
    // 128: fall through

//
// .schedule_128
//
// 128-bit specific part of key schedule.
//
// This schedule is really simple, because all its parts
// are accomplished by the subroutines.
//
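// Bookkeeping: ten trips around .Loop_schedule_128 emit keys 1-10 (the
// tenth via .Lschedule_mangle_last); with the round-0 key already
// written above, that is the full 11 round keys of AES-128.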
.Lschedule_128:
    mov x0, #10 // mov $10, %esi

.Loop_schedule_128:
    sub x0, x0, #1 // dec %esi
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle // write output
    b .Loop_schedule_128

//
// .aes_schedule_192
//
// 192-bit specific part of key schedule.
//
// The main body of this schedule is the same as the 128-bit
// schedule, but with more smearing. The long, high side is
// stored in %xmm7 as before, and the short, low side is in
// the high bits of %xmm6.
//
// This schedule is somewhat nastier, however, because each
// round produces 192 bits of key material, or 1.5 round keys.
// Therefore, on each cycle we do 2 rounds and produce 3 round
// keys.
//
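// Bookkeeping: four trips around .Loop_schedule_192 at three keys per
// trip (the twelfth emitted via .Lschedule_mangle_last), plus the
// round-0 key written on entry, give the 13 round keys AES-192 needs.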
.align 4
.Lschedule_192:
    sub x0, x0, #8
    ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
    bl _vpaes_schedule_transform // input transform
    mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
    eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
    ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
    mov x0, #4 // mov $4, %esi

.Loop_schedule_192:
    sub x0, x0, #1 // dec %esi
    bl _vpaes_schedule_round
    ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
    bl _vpaes_schedule_mangle // save key n
    bl _vpaes_schedule_192_smear
    bl _vpaes_schedule_mangle // save key n+1
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle // save key n+2
    bl _vpaes_schedule_192_smear
    b .Loop_schedule_192

//
// .aes_schedule_256
//
// 256-bit specific part of key schedule.
//
// The structure here is very similar to the 128-bit
// schedule, but with an additional "low side" in
// %xmm6. The low side's rounds are the same as the
// high side's, except no rcon and no rotation.
//
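// Bookkeeping: seven trips around .Loop_schedule_256 at two keys per
// trip (one low, one high; the last via .Lschedule_mangle_last), plus
// the round-0 key, give the 15 round keys AES-256 needs.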
.align 4
.Lschedule_256:
    ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
    bl _vpaes_schedule_transform // input transform
    mov x0, #7 // mov $7, %esi

.Loop_schedule_256:
    sub x0, x0, #1 // dec %esi
    bl _vpaes_schedule_mangle // output low result
    mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

    // high round
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle

    // low round. swap xmm7 and xmm6
    dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
    movi v4.16b, #0
    mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
    mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
    bl _vpaes_schedule_low_round
    mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7

    b .Loop_schedule_256

//
// .aes_schedule_mangle_last
//
// Mangler for last round of key schedule
// Mangles %xmm0
// when encrypting, outputs out(%xmm0) ^ 63
// when decrypting, outputs unskew(%xmm0)
//
// Always called right before return... jumps to cleanup and exits
//
.align 4
.Lschedule_mangle_last:
    // schedule last round key from xmm0
    adrp x11, .Lk_deskew
    add x11, x11, #:lo12:.Lk_deskew
    cbnz w3, .Lschedule_mangle_last_dec

    // encrypting
    ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
    adrp x11, .Lk_opt
    add x11, x11, #:lo12:.Lk_opt
    add x2, x2, #32 // add $32, %rdx
    tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute

.Lschedule_mangle_last_dec:
    ld1 {v20.2d,v21.2d}, [x11] // reload constants
    sub x2, x2, #16 // add $-16, %rdx
    eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
    bl _vpaes_schedule_transform // output transform
    st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key

    // cleanup
    eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
    eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
    eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
    eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
    eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
    eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
    eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
    eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
    ldp x29, x30, [sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core

//
// .aes_schedule_192_smear
//
// Smear the short, low side in the 192-bit key schedule.
//
// Inputs:
// %xmm7: high side, b a x y
// %xmm6: low side, d c 0 0
// %xmm13: 0
//
// Outputs:
// %xmm6: b+c+d b+c 0 0
// %xmm0: b+c+d b+c b a
//
.type _vpaes_schedule_192_smear,%function
.align 4
_vpaes_schedule_192_smear:
    movi v1.16b, #0
    dup v0.4s, v7.s[3]
    ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
    ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
    eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
    eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
    eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
    mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
    ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
    ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

//
// .aes_schedule_round
//
// Runs one main round of the key schedule on %xmm0, %xmm7
//
// Specifically, runs subbytes on the high dword of %xmm0
// then rotates it by one byte and xors into the low dword of
// %xmm7.
//
// Adds rcon from low byte of %xmm8, then rotates %xmm8 for
// next rcon.
//
// Smears the dwords of %xmm7 by xoring the low into the
// second low, result into third, result into highest.
//
// Returns results in %xmm7 = %xmm0.
// Clobbers %xmm1-%xmm4, %r11.
//
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
    // extract rcon from xmm8
    movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
    ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
    ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
    eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7

    // rotate
    dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
    ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0

    // fall through...

    // low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
    // smear xmm7
    ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
    eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
    ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4

    // subbytes
    and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
    tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
    tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
    eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
    eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
    eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
    tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
    tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
    eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

    // add in smeared stuff
    eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
    eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
    ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round

//
// .aes_schedule_transform
//
// Linear-transform %xmm0 according to tables at (%r11)
//
// Requires that %xmm9 = 0x0F0F... as in preheat
// Output in %xmm0
// Clobbers %xmm1, %xmm2
//
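// Per input byte x this computes lo_table[x & 0x0F] ^ hi_table[x >> 4],
// with the lo/hi tables preloaded here in v20/v21.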
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
    and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    // vmovdqa (%r11), %xmm2 # lo
    tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
    // vmovdqa 16(%r11), %xmm1 # hi
    tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    ret
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform

//
// .aes_schedule_mangle
//
// Mangle xmm0 from (basis-transformed) standard version
// to our version.
//
// On encrypt,
// xor with 0x63
// multiply by circulant 0,1,1,1
// apply shiftrows transform
//
// On decrypt,
// xor with 0x63
// multiply by "inverse mixcolumns" circulant E,B,D,9
// deskew
// apply shiftrows transform
//
//
// Writes out to (%rdx), and increments or decrements it
// Keeps track of round number mod 4 in %r8
// Preserves xmm0
// Clobbers xmm1-xmm5
//
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
    mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
    // vmovdqa .Lk_mc_forward(%rip),%xmm5
    cbnz w3, .Lschedule_mangle_dec

    // encrypting
    eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
    add x2, x2, #16 // add $16, %rdx
    tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
    tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
    tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
    eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
    ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
    eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3

    b .Lschedule_mangle_both
.align 4
.Lschedule_mangle_dec:
    // inverse mix columns
    // lea .Lk_dksd(%rip),%r11
    ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
    and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo

    // vmovdqa 0x00(%r11), %xmm2
    tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
    // vmovdqa 0x10(%r11), %xmm3
    tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
    tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3

    // vmovdqa 0x20(%r11), %xmm2
    tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
    eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
    // vmovdqa 0x30(%r11), %xmm3
    tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
    tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3

    // vmovdqa 0x40(%r11), %xmm2
    tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
    eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
    // vmovdqa 0x50(%r11), %xmm3
    tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3

    // vmovdqa 0x60(%r11), %xmm2
    tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
    tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
    // vmovdqa 0x70(%r11), %xmm4
    tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
    ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
    eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
    eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3

    sub x2, x2, #16 // add $-16, %rdx

.Lschedule_mangle_both:
    tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    add x8, x8, #64-16 // add $-16, %r8
    and x8, x8, #~(1<<6) // and $0x30, %r8
    st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
    ret
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle

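// Key setup: userKey = x0, bits = w1, AES_KEY = x2; returns 0 in x0.
// Note the rounds field is stored as nbits/32+5 (9/11/13): the number
// of middle rounds the cores execute, one less than the textbook
// 10/12/14 round count.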
.globl vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,%function
.align 4
vpaes_set_encrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]! // ABI spec says so

    lsr w9, w1, #5 // shr $5,%eax
    add w9, w9, #5 // $5,%eax
    str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

    mov w3, #0 // mov $0,%ecx
    mov x8, #0x30 // mov $0x30,%r8d
    bl _vpaes_schedule_core
    eor x0, x0, x0

    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

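// Decrypt key setup writes the schedule back to front: x2 is first
// advanced to the final 16-byte slot (16 + 16*rounds bytes in, using
// the stored rounds field), then _vpaes_schedule_core runs with the
// decrypt flag set (w3 = 1), storing each mangled key at a
// descending offset.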
.globl vpaes_set_decrypt_key
.type vpaes_set_decrypt_key,%function
.align 4
vpaes_set_decrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]! // ABI spec says so

    lsr w9, w1, #5 // shr $5,%eax
    add w9, w9, #5 // $5,%eax
    str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
    lsl w9, w9, #4 // shl $4,%eax
    add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
    add x2, x2, x9

    mov w3, #1 // mov $1,%ecx
    lsr w8, w1, #1 // shr $1,%r8d
    and x8, x8, #32 // and $32,%r8d
    eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
    bl _vpaes_schedule_core

    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
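// CBC: in = x0, out = x1, length = x2 (bytes, assumed a multiple of
// 16), key = x3, ivec = x4, enc = w5 (nonzero = encrypt, zero =
// decrypt; zero-length calls return immediately).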
.globl vpaes_cbc_encrypt
.type vpaes_cbc_encrypt,%function
.align 4
vpaes_cbc_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    cbz x2, .Lcbc_abort
    cmp w5, #0 // check direction
    b.eq vpaes_cbc_decrypt

    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    mov x17, x2 // reassign
    mov x2, x3 // reassign

    ld1 {v0.16b}, [x4] // load ivec
    bl _vpaes_encrypt_preheat
    b .Lcbc_enc_loop

.align 4
.Lcbc_enc_loop:
    ld1 {v7.16b}, [x0],#16 // load input
    eor v7.16b, v7.16b, v0.16b // xor with ivec
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1],#16 // save output
    subs x17, x17, #16
    b.hi .Lcbc_enc_loop

    st1 {v0.16b}, [x4] // write ivec

    ldp x29,x30,[sp],#16
.Lcbc_abort:
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

.type vpaes_cbc_decrypt,%function
.align 4
vpaes_cbc_decrypt:
    // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
    // only from vpaes_cbc_encrypt which has already signed the return address.
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]! // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    mov x17, x2 // reassign
    mov x2, x3 // reassign
    ld1 {v6.16b}, [x4] // load ivec
    bl _vpaes_decrypt_preheat
    tst x17, #16
    b.eq .Lcbc_dec_loop2x

    ld1 {v7.16b}, [x0], #16 // load input
    bl _vpaes_decrypt_core
    eor v0.16b, v0.16b, v6.16b // xor with ivec
    orr v6.16b, v7.16b, v7.16b // next ivec value
    st1 {v0.16b}, [x1], #16
    subs x17, x17, #16
    b.ls .Lcbc_dec_done

.align 4
.Lcbc_dec_loop2x:
    ld1 {v14.16b,v15.16b}, [x0], #32
    bl _vpaes_decrypt_2x
    eor v0.16b, v0.16b, v6.16b // xor with ivec
    eor v1.16b, v1.16b, v14.16b
    orr v6.16b, v15.16b, v15.16b
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #32
    b.hi .Lcbc_dec_loop2x

.Lcbc_dec_done:
    st1 {v6.16b}, [x4]

    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
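// ECB: in = x0, out = x1, length = x2 (bytes, assumed a multiple of
// 16), key = x3. Blocks go through the _2x cores two at a time, with
// one single-block pass first whenever length/16 is odd.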
.globl vpaes_ecb_encrypt
.type vpaes_ecb_encrypt,%function
.align 4
vpaes_ecb_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]! // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    mov x17, x2
    mov x2, x3
    bl _vpaes_encrypt_preheat
    tst x17, #16
    b.eq .Lecb_enc_loop

    ld1 {v7.16b}, [x0],#16
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1],#16
    subs x17, x17, #16
    b.ls .Lecb_enc_done

.align 4
.Lecb_enc_loop:
    ld1 {v14.16b,v15.16b}, [x0], #32
    bl _vpaes_encrypt_2x
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #32
    b.hi .Lecb_enc_loop

.Lecb_enc_done:
    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt

.globl vpaes_ecb_decrypt
.type vpaes_ecb_decrypt,%function
.align 4
vpaes_ecb_decrypt:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]! // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    mov x17, x2
    mov x2, x3
    bl _vpaes_decrypt_preheat
    tst x17, #16
    b.eq .Lecb_dec_loop

    ld1 {v7.16b}, [x0],#16
    bl _vpaes_decrypt_core
    st1 {v0.16b}, [x1],#16
    subs x17, x17, #16
    b.ls .Lecb_dec_done

.align 4
.Lecb_dec_loop:
    ld1 {v14.16b,v15.16b}, [x0], #32
    bl _vpaes_decrypt_2x
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #32
    b.hi .Lecb_dec_loop

.Lecb_dec_done:
    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt