/*	$NetBSD: aes_ni_64.S,v 1.4 2020/07/25 22:29:06 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * MOVDQA/MOVDQU are Move Double Quadword (Aligned/Unaligned), defined
 * to operate on integers; MOVAPS/MOVUPS are Move (Aligned/Unaligned)
 * Packed Single, defined to operate on binary32 floats. They have
 * exactly the same architectural effects (move a 128-bit quantity from
 * memory into an xmm register).
 *
 * In principle, they might have different microarchitectural effects
 * so that MOVAPS/MOVUPS might incur a penalty when the register is
 * later used for integer paths, but in practice they don't. So we use
 * the one whose instruction encoding is shorter -- MOVAPS/MOVUPS.
 */
#define	movdqa	movaps
#define	movdqu	movups

/*
 * aesni_setenckey128(struct aesenc *enckey@rdi, const uint8_t key[16] @rsi)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey128)
	movdqu	(%rsi),%xmm0	/* load master key into %xmm0 */
	movdqa	%xmm0,(%rdi)	/* store master key as the first round key */
	lea	0x10(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x40,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x1b,%xmm0,%xmm2
	call	aesni_expand128
	aeskeygenassist $0x36,%xmm0,%xmm2
	call	aesni_expand128
	ret
END(aesni_setenckey128)
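
/*
 * Reference sketch (C, not part of the build): the AES-128 key schedule
 * that the AESKEYGENASSIST/aesni_expand128 pairs above implement, written
 * with 32-bit words in the FIPS-197 (big-endian word) convention.  The
 * byte layout of the in-memory round keys is glossed over here, and
 * subword() (S-box on each byte) and rotword() (rotate left by one byte)
 * are hypothetical helpers assumed to be defined elsewhere.
 *
 *	#include <sys/endian.h>
 *	#include <stdint.h>
 *
 *	static const uint8_t rcon[10] = {
 *		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
 *	};
 *
 *	static void
 *	aes128_keysched_ref(uint32_t w[44], const uint8_t key[16])
 *	{
 *		uint32_t t;
 *		unsigned i;
 *
 *		for (i = 0; i < 4; i++)		// round key 0 = master key
 *			w[i] = be32dec(key + 4*i);
 *		for (i = 4; i < 44; i++) {
 *			t = w[i - 1];
 *			if (i % 4 == 0)		// once per round key
 *				t = subword(rotword(t)) ^
 *				    ((uint32_t)rcon[i/4 - 1] << 24);
 *			w[i] = w[i - 4] ^ t;	// running prefix xor
 *		}
 *	}
 */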

/*
 * aesni_setenckey192(struct aesenc *enckey@rdi, const uint8_t key[24] @rsi)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey192)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movq	0x10(%rsi),%xmm1	/* load master key [128:192) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	lea	0x10(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand192b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand192a
	aeskeygenassist $0x80,%xmm0,%xmm2
	call	aesni_expand192b
	ret
END(aesni_setenckey192)

/*
 * aesni_setenckey256(struct aesenc *enckey@rdi, const uint8_t key[32] @rsi)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_setenckey256)
	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
	movdqu	0x10(%rsi),%xmm1	/* load master key [128:256) into %xmm1 */
	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
	movdqa	%xmm1,0x10(%rdi)	/* store master key [128:256) as round key */
	lea	0x20(%rdi),%rdi	/* advance %rdi to next round key */
	aeskeygenassist $0x1,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x1,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x2,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x2,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x4,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x4,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x8,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x8,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x10,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x10,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x20,%xmm1,%xmm2
	call	aesni_expand256a
	aeskeygenassist $0x20,%xmm0,%xmm2
	call	aesni_expand256b
	aeskeygenassist $0x40,%xmm1,%xmm2
	call	aesni_expand256a
	ret
END(aesni_setenckey256)

/*
 * aesni_expand128(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint128_t keygenassist@xmm2)
 *
 *	1. Compute the AES-128 round key using the previous round key.
 *	2. Store it at *rkp.
 *	3. Set %xmm0 to it.
 *	4. Advance %rdi to point at the next round key.
 *
 *	Internal ABI. On entry:
 *
 *	%rdi = rkp, pointer to round key to compute
 *	%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *	%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *	%rdi = &rkp[1], rkp advanced by one round key
 *	%xmm0 = rk, the round key we just computed
 *	%xmm2 = garbage
 *	%xmm4 = garbage
 *	%xmm5 = garbage
 *	%xmm6 = garbage
 *
 *	Note: %xmm1 is preserved (as are %xmm3 and %xmm7 through %xmm15,
 *	and all other registers).
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand128,@function
aesni_expand128:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, prk[0], prk[1], prk[2])
	 * %xmm5 := (0, 0, prk[0], prk[1])
	 * %xmm6 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm0 := (rk[0] = t ^ prk[0],
	 *     rk[1] = t ^ prk[0] ^ prk[1],
	 *     rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2],
	 *     rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3])
	 */
	pxor	%xmm2,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm0
	pxor	%xmm6,%xmm0

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand128)

/*
 * aesni_expand192a(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint64_t rklo@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-192 round key.
 *
 *	Internal ABI. On entry:
 *
 *	%rdi = rkp, pointer to two round keys to compute
 *	%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *	%xmm1 = (rklo[0], rklo[1], xxx, xxx)
 *	%xmm2 = (xxx, t = Rot(SubWord(rklo[1])) ^ RCON, xxx, xxx)
 *
 *	On exit:
 *
 *	%rdi = &rkp[2], rkp advanced by two round keys
 *	%xmm0 = nrk, second round key we just computed
 *	%xmm1 = rk, first round key we just computed
 *	%xmm2 = garbage
 *	%xmm4 = garbage
 *	%xmm5 = garbage
 *	%xmm6 = garbage
 *	%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192a,@function
aesni_expand192a:
	/*
	 * %xmm2 := (%xmm2[1], %xmm2[1], %xmm2[1], %xmm2[1]),
	 * i.e., set each word of %xmm2 to t := Rot(SubWord(rklo[1])) ^ RCON.
	 */
	pshufd	$0b01010101,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 *	rk[0] := rklo[0]
	 *	rk[1] := rklo[1]
	 *	rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 *	rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 *	nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 *	nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 *	nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *	nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *	    ^ rklo[1]
	 */

	/*
	 * %xmm4 := (prk[0], prk[1], prk[2], prk[3])
	 * %xmm5 := (0, prk[0], prk[1], prk[2])
	 * %xmm6 := (0, 0, prk[0], prk[1])
	 * %xmm7 := (0, 0, 0, prk[0])
	 */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	movdqa	%xmm0,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm4 := (rk[2], rk[3], nrk[0], nrk[1]) */
	pxor	%xmm2,%xmm4
	pxor	%xmm5,%xmm4
	pxor	%xmm6,%xmm4
	pxor	%xmm7,%xmm4

	/*
	 * At this point, rk is split across %xmm1 (rk[0],rk[1],...) and
	 * %xmm4 (rk[2],rk[3],...); nrk is in %xmm4 (...,nrk[0],nrk[1]);
	 * and we have yet to compute nrk[2] or nrk[3], which requires
	 * rklo[0] and rklo[1] in %xmm1 (rklo[0], rklo[1], ...). We need
	 * nrk to end up in %xmm0 at the end, so gather rk into %xmm1 and
	 * nrk into %xmm0.
	 */

	/* %xmm0 := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	pshufd	$0b11111110,%xmm4,%xmm0

	/*
	 * %xmm6 := (0, 0, rklo[0], rklo[1])
	 * %xmm7 := (0, 0, 0, rklo[0])
	 */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7

	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/*
	 * %xmm0 := (nrk[0],
	 *     nrk[1],
	 *     nrk[2] = nrk[1] ^ rklo[0],
	 *     nrk[3] = nrk[1] ^ rklo[0] ^ rklo[1])
	 */
	pxor	%xmm6,%xmm0
	pxor	%xmm7,%xmm0

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	shufps	$0b01000100,%xmm4,%xmm1

	movdqa	%xmm1,(%rdi)	/* store round key */
	movdqa	%xmm0,0x10(%rdi)	/* store next round key */
	lea	0x20(%rdi),%rdi	/* advance two round keys */
	ret
END(aesni_expand192a)

/*
 * aesni_expand192b(uint128_t *roundkey@rdi, uint128_t prk@xmm0,
 *     uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-192 round key.
 *
 *	Internal ABI. On entry:
 *
 *	%rdi = rkp, pointer to round key to compute
 *	%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *	%xmm1 = (xxx, xxx, pprk[2], pprk[3])
 *	%xmm2 = (xxx, xxx, xxx, t = Rot(Sub(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *	%rdi = &rkp[1], rkp advanced by one round key
 *	%xmm0 = rk, the round key we just computed
 *	%xmm1 = (nrk[0], nrk[1], xxx, xxx), half of next round key
 *	%xmm2 = garbage
 *	%xmm4 = garbage
 *	%xmm5 = garbage
 *	%xmm6 = garbage
 *	%xmm7 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand192b,@function
aesni_expand192b:
	/*
	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
	 * i.e., set each word of %xmm2 to t := Rot(Sub(prk[3])) ^ RCON.
	 */
	pshufd	$0b11111111,%xmm2,%xmm2

	/*
	 * We need to compute:
	 *
	 *	rk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2]
	 *	rk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3]
	 *	rk[2] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *	rk[3] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *	    ^ prk[1]
	 *	nrk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *	    ^ prk[1] ^ prk[2]
	 *	nrk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
	 *	    ^ prk[1] ^ prk[2] ^ prk[3]
	 */

	/* %xmm1 := (pprk[2], pprk[3], prk[0], prk[1]) */
	shufps	$0b01001110,%xmm0,%xmm1

	/*
	 * %xmm5 := (0, pprk[2], pprk[3], prk[0])
	 * %xmm6 := (0, 0, pprk[2], pprk[3])
	 * %xmm7 := (0, 0, 0, pprk[2])
	 */
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	movdqa	%xmm1,%xmm7
	pslldq	$4,%xmm5
	pslldq	$8,%xmm6
	pslldq	$12,%xmm7

	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
	pxor	%xmm2,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1
	pxor	%xmm7,%xmm1

	/* %xmm4 := (prk[2], prk[3], xxx, xxx) */
	pshufd	$0b00001110,%xmm0,%xmm4

	/* %xmm5 := (0, prk[2], xxx, xxx) */
	movdqa	%xmm4,%xmm5
	pslldq	$4,%xmm5

	/* %xmm0 := (rk[0], rk[1], rk[2], rk[3]) */
	movdqa	%xmm1,%xmm0

	/* %xmm1 := (rk[3], rk[3], xxx, xxx) */
	shufps	$0b00001111,%xmm1,%xmm1

	/*
	 * %xmm1 := (nrk[0] = rk[3] ^ prk[2],
	 *     nrk[1] = rk[3] ^ prk[2] ^ prk[3],
	 *     xxx,
	 *     xxx)
	 */
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1

	movdqa	%xmm0,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand192b)

/*
 * aesni_expand256a(uint128_t *rkp@rdi, uint128_t pprk@xmm0,
 *     uint128_t prk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set even-numbered AES-256 round key.
 *
 *	Internal ABI. On entry:
 *
 *	%rdi = rkp, pointer to round key to compute
 *	%xmm0 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *	%xmm1 = (prk[0], prk[1], prk[2], prk[3])
 *	%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
 *
 *	On exit:
 *
 *	%rdi = &rkp[1], rkp advanced by one round key
 *	%xmm0 = rk, the round key we just computed
 *	%xmm1 = prk, previous round key, preserved from entry
 *	%xmm2 = garbage
 *	%xmm4 = garbage
 *	%xmm5 = garbage
 *	%xmm6 = garbage
 *
 *	The computation turns out to be the same as for AES-128; the
 *	previous round key does not figure into it, only the
 *	previous-previous round key.
 */
aesni_expand256a = aesni_expand128

/*
 * aesni_expand256b(uint128_t *rkp@rdi, uint128_t prk@xmm0,
 *     uint128_t pprk@xmm1, uint128_t keygenassist@xmm2)
 *
 *	Set odd-numbered AES-256 round key.
 *
 *	Internal ABI. On entry:
 *
 *	%rdi = rkp, pointer to round key to compute
 *	%xmm0 = (prk[0], prk[1], prk[2], prk[3])
 *	%xmm1 = (pprk[0], pprk[1], pprk[2], pprk[3])
 *	%xmm2 = (xxx, xxx, t = Sub(prk[3]), xxx)
 *
 *	On exit:
 *
 *	%rdi = &rkp[1], rkp advanced by one round key
 *	%xmm0 = prk, previous round key, preserved from entry
 *	%xmm1 = rk, the round key we just computed
 *	%xmm2 = garbage
 *	%xmm4 = garbage
 *	%xmm5 = garbage
 *	%xmm6 = garbage
 */
	.text
	_ALIGN_TEXT
	.type	aesni_expand256b,@function
aesni_expand256b:
	/*
	 * %xmm2 := (%xmm2[2], %xmm2[2], %xmm2[2], %xmm2[2]),
	 * i.e., set each word of %xmm2 to t := Sub(prk[3]).
	 */
	pshufd	$0b10101010,%xmm2,%xmm2

	/*
	 * %xmm4 := (0, pprk[0], pprk[1], pprk[2])
	 * %xmm5 := (0, 0, pprk[0], pprk[1])
	 * %xmm6 := (0, 0, 0, pprk[0])
	 */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm5
	movdqa	%xmm1,%xmm6
	pslldq	$4,%xmm4
	pslldq	$8,%xmm5
	pslldq	$12,%xmm6

	/*
	 * %xmm1 := (rk[0] = t ^ pprk[0],
	 *     rk[1] = t ^ pprk[0] ^ pprk[1],
	 *     rk[2] = t ^ pprk[0] ^ pprk[1] ^ pprk[2],
	 *     rk[3] = t ^ pprk[0] ^ pprk[1] ^ pprk[2] ^ pprk[3])
	 */
	pxor	%xmm2,%xmm1
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1
	pxor	%xmm6,%xmm1

	movdqa	%xmm1,(%rdi)	/* store round key */
	lea	0x10(%rdi),%rdi	/* advance to next round key address */
	ret
END(aesni_expand256b)

/*
 * aesni_enctodec(const struct aesenc *enckey@rdi, struct aesdec *deckey@rsi,
 *     uint32_t nrounds@rdx)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enctodec)
	shl	$4,%edx		/* rdx := byte offset of last round key */
	movdqa	(%rdi,%rdx),%xmm0	/* load last round key */
	movdqa	%xmm0,(%rsi)	/* store last round key verbatim */
	jmp	2f
1:	movdqa	(%rdi,%rdx),%xmm0	/* load round key */
	aesimc	%xmm0,%xmm0	/* convert encryption to decryption */
	movdqa	%xmm0,(%rsi)	/* store round key */
2:	sub	$0x10,%rdx	/* advance to next round key */
	lea	0x10(%rsi),%rsi
	jnz	1b		/* repeat if more rounds */
	movdqa	(%rdi),%xmm0	/* load first round key */
	movdqa	%xmm0,(%rsi)	/* store first round key verbatim */
	ret
END(aesni_enctodec)
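
/*
 * Reference sketch (C, not part of the build) of the conversion above:
 * the decryption key schedule for the equivalent inverse cipher is the
 * encryption schedule in reverse order, with InvMixColumns (what AESIMC
 * computes) applied to every round key except the first and last.
 * aes_inv_mix_columns() is a hypothetical helper standing in for AESIMC.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void aes_inv_mix_columns(uint8_t out[16], const uint8_t in[16]);
 *
 *	static void
 *	aes_enctodec_ref(uint8_t dec[][16], const uint8_t enc[][16],
 *	    unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		memcpy(dec[0], enc[nrounds], 16);	// last key verbatim
 *		for (i = 1; i < nrounds; i++)		// middle keys
 *			aes_inv_mix_columns(dec[i], enc[nrounds - i]);
 *		memcpy(dec[nrounds], enc[0], 16);	// first key verbatim
 *	}
 */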

/*
 * aesni_enc(const struct aesenc *enckey@rdi, const uint8_t in[16] @rsi,
 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_enc)
	movdqu	(%rsi),%xmm0
	call	aesni_enc1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_enc)

/*
 * aesni_dec(const struct aesdec *deckey@rdi, const uint8_t in[16] @rsi,
 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_dec)
	movdqu	(%rsi),%xmm0
	call	aesni_dec1
	movdqu	%xmm0,(%rdx)
	ret
END(aesni_dec)

/*
 * aesni_cbc_enc(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_enc)
	cmp	$0,%rcx
	jz	2f
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm0	/* xmm0 := chaining value */
1:	movdqu	(%rsi),%xmm1	/* xmm1 := plaintext block */
	lea	0x10(%rsi),%rsi
	pxor	%xmm1,%xmm0	/* xmm0 := cv ^ ptxt */
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_enc1	/* xmm0 := ciphertext block */
	movdqu	%xmm0,(%rdx)
	lea	0x10(%rdx),%rdx
	sub	$0x10,%r10
	jnz	1b		/* repeat if r10 is nonzero */
	movdqu	%xmm0,(%r8)	/* store chaining value */
2:	ret
END(aesni_cbc_enc)
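
/*
 * Reference sketch (C, not part of the build) of the loop above: CBC
 * encryption is inherently serial -- each block's input is the previous
 * ciphertext block (initially the IV) xored with the plaintext.
 * aes_enc_block() is a hypothetical single-block encryption helper.
 *
 *	static void
 *	cbc_enc_ref(const struct aesenc *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t iv[16], uint32_t nrounds)
 *	{
 *		uint8_t cv[16];
 *		unsigned i;
 *
 *		if (nbytes == 0)
 *			return;
 *		memcpy(cv, iv, 16);
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 0; i < 16; i++)	// cv ^= plaintext
 *				cv[i] ^= in[i];
 *			aes_enc_block(key, cv, cv, nrounds); // cv := ctxt
 *			memcpy(out, cv, 16);
 *		}
 *		memcpy(iv, cv, 16);			// new chaining value
 *	}
 */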

/*
 * aesni_cbc_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16. This routine
 *	is not vectorized; use aesni_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec1)
	push	%rbp		/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8	/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)	/* save iv */
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm0	/* xmm0 := last ciphertext block */
	movdqu	%xmm0,(%r8)	/* update iv */
	jmp	2f
1:	movdqu	-0x10(%rsi,%r10),%xmm8	/* xmm8 := chaining value */
	pxor	%xmm8,%xmm0	/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
	movdqa	%xmm8,%xmm0	/* move cv = ciphertext block */
2:	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_dec1	/* xmm0 := cv ^ ptxt */
	sub	$0x10,%r10
	jnz	1b		/* repeat if more blocks */
	pxor	(%rsp),%xmm0	/* xmm0 := ptxt */
	movdqu	%xmm0,(%rdx)	/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec1)

/*
 * aesni_cbc_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
 *     uint32_t nrounds@r9)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbc_dec8)
	push	%rbp		/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	movdqu	(%r8),%xmm8	/* xmm8 := iv */
	movdqa	%xmm8,(%rsp)	/* save iv */
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := ciphertext block[n-1] */
	movdqu	%xmm7,(%r8)	/* update iv */
	jmp	2f
1:	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := cv[0] */
	pxor	%xmm7,%xmm0	/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
2:	movdqu	-0x20(%rsi,%r10),%xmm6	/* xmm6 := ciphertext block[n-2] */
	movdqu	-0x30(%rsi,%r10),%xmm5	/* xmm5 := ciphertext block[n-3] */
	movdqu	-0x40(%rsi,%r10),%xmm4	/* xmm4 := ciphertext block[n-4] */
	movdqu	-0x50(%rsi,%r10),%xmm3	/* xmm3 := ciphertext block[n-5] */
	movdqu	-0x60(%rsi,%r10),%xmm2	/* xmm2 := ciphertext block[n-6] */
	movdqu	-0x70(%rsi,%r10),%xmm1	/* xmm1 := ciphertext block[n-7] */
	movdqu	-0x80(%rsi,%r10),%xmm0	/* xmm0 := ciphertext block[n-8] */
	movdqa	%xmm6,%xmm15	/* xmm[8+i] := cv[i], 0<i<8 */
	movdqa	%xmm5,%xmm14
	movdqa	%xmm4,%xmm13
	movdqa	%xmm3,%xmm12
	movdqa	%xmm2,%xmm11
	movdqa	%xmm1,%xmm10
	movdqa	%xmm0,%xmm9
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_dec8	/* xmm[i] := cv[i] ^ ptxt[i], 0<=i<8 */
	pxor	%xmm15,%xmm7	/* xmm[i] := ptxt[i], 0<i<8 */
	pxor	%xmm14,%xmm6
	pxor	%xmm13,%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm11,%xmm3
	pxor	%xmm10,%xmm2
	pxor	%xmm9,%xmm1
	movdqu	%xmm7,-0x10(%rdx,%r10)	/* store plaintext blocks */
	movdqu	%xmm6,-0x20(%rdx,%r10)
	movdqu	%xmm5,-0x30(%rdx,%r10)
	movdqu	%xmm4,-0x40(%rdx,%r10)
	movdqu	%xmm3,-0x50(%rdx,%r10)
	movdqu	%xmm2,-0x60(%rdx,%r10)
	movdqu	%xmm1,-0x70(%rdx,%r10)
	sub	$0x80,%r10
	jnz	1b		/* repeat if more blocks */
	pxor	(%rsp),%xmm0	/* xmm0 := ptxt[0] */
	movdqu	%xmm0,(%rdx)	/* store first plaintext block */
	leave
	ret
END(aesni_cbc_dec8)
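
/*
 * Reference sketch (C, not part of the build) of why eight blocks can be
 * decrypted in flight at once: unlike CBC encryption, each plaintext
 * block depends only on two ciphertext blocks,
 * ptxt[i] = Dec(ctxt[i]) ^ ctxt[i-1] (with ctxt[-1] = iv), so the AES
 * inversions are independent and only the final xor consumes the
 * neighbouring block.  aes_dec_block() is a hypothetical single-block
 * decryption helper; the sketch assumes in and out do not overlap,
 * which the assembly above does handle by working from the end.
 *
 *	static void
 *	cbc_dec_ref(const struct aesdec *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t iv[16], uint32_t nrounds)
 *	{
 *		const uint8_t *cv;
 *		uint8_t tmp[16];
 *		size_t off;
 *		unsigned i;
 *
 *		for (off = 0; off < nbytes; off += 16) {
 *			aes_dec_block(key, in + off, tmp, nrounds);
 *			cv = (off == 0 ? iv : in + off - 16);
 *			for (i = 0; i < 16; i++)
 *				out[off + i] = tmp[i] ^ cv[i];
 *		}
 *		if (nbytes)
 *			memcpy(iv, in + nbytes - 16, 16);
 *	}
 */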

/*
 * aesni_xts_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16. This routine
 *	is not vectorized; use aesni_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc1)
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm15	/* xmm15 := tweak */
1:	movdqu	(%rsi),%xmm0	/* xmm0 := ptxt */
	lea	0x10(%rsi),%rsi	/* advance rsi to next block */
	pxor	%xmm15,%xmm0	/* xmm0 := ptxt ^ tweak */
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_enc1	/* xmm0 := AES(ptxt ^ tweak) */
	pxor	%xmm15,%xmm0	/* xmm0 := AES(ptxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)	/* store ciphertext block */
	lea	0x10(%rdx),%rdx	/* advance rdx to next block */
	call	aesni_xts_mulx	/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b		/* repeat if more blocks */
	movdqu	%xmm15,(%r8)	/* update tweak */
	ret
END(aesni_xts_enc1)
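
/*
 * Reference sketch (C, not part of the build) of one XTS block as done
 * above: ctxt = AES(ptxt ^ tweak) ^ tweak, then tweak := tweak * x in
 * GF(2^128) for the next block.  aes_enc_block() and xts_mulx() are
 * hypothetical helpers; xts_mulx() does what aesni_xts_mulx below does.
 *
 *	static void
 *	xts_enc_ref(const struct aesenc *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t tweak[16], uint32_t nrounds)
 *	{
 *		uint8_t b[16];
 *		unsigned i;
 *
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 0; i < 16; i++)
 *				b[i] = in[i] ^ tweak[i];
 *			aes_enc_block(key, b, b, nrounds);
 *			for (i = 0; i < 16; i++)
 *				out[i] = b[i] ^ tweak[i];
 *			xts_mulx(tweak);		// next tweak
 *		}
 *	}
 */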

/*
 * aesni_xts_enc8(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_enc8)
	push	%rbp		/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm15	/* xmm15 := tweak[0] */
1:	movdqa	%xmm15,%xmm8	/* xmm8 := tweak[0] */
	call	aesni_xts_mulx	/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9	/* xmm9 := tweak[1] */
	call	aesni_xts_mulx	/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10	/* xmm10 := tweak[2] */
	call	aesni_xts_mulx	/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11	/* xmm11 := tweak[3] */
	call	aesni_xts_mulx	/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12	/* xmm12 := tweak[4] */
	call	aesni_xts_mulx	/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13	/* xmm13 := tweak[5] */
	call	aesni_xts_mulx	/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14	/* xmm14 := tweak[6] */
	call	aesni_xts_mulx	/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0	/* xmm[i] := ptxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi	/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)	/* save tweak[0] */
	pxor	%xmm8,%xmm0	/* xmm[i] := ptxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_enc8	/* xmm[i] := AES(ptxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0	/* xmm[i] := AES(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)	/* store ciphertext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx	/* advance rdx to next block group */
	call	aesni_xts_mulx	/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b		/* repeat if more block groups */
	movdqu	%xmm15,(%r8)	/* update tweak */
	leave
	ret
END(aesni_xts_enc8)

/*
 * aesni_xts_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16. This routine
 *	is not vectorized; use aesni_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec1)
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm15	/* xmm15 := tweak */
1:	movdqu	(%rsi),%xmm0	/* xmm0 := ctxt */
	lea	0x10(%rsi),%rsi	/* advance rsi to next block */
	pxor	%xmm15,%xmm0	/* xmm0 := ctxt ^ tweak */
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_dec1	/* xmm0 := AES^-1(ctxt ^ tweak) */
	pxor	%xmm15,%xmm0	/* xmm0 := AES^-1(ctxt ^ tweak) ^ tweak */
	movdqu	%xmm0,(%rdx)	/* store plaintext block */
	lea	0x10(%rdx),%rdx	/* advance rdx to next block */
	call	aesni_xts_mulx	/* xmm15 *= x; trash xmm0 */
	sub	$0x10,%r10
	jnz	1b		/* repeat if more blocks */
	movdqu	%xmm15,(%r8)	/* update tweak */
	ret
END(aesni_xts_dec1)

/*
 * aesni_xts_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
 *     uint32_t nrounds@r9d)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_dec8)
	push	%rbp		/* create stack frame uint128[1] */
	mov	%rsp,%rbp
	sub	$0x10,%rsp
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	(%r8),%xmm15	/* xmm15 := tweak[0] */
1:	movdqa	%xmm15,%xmm8	/* xmm8 := tweak[0] */
	call	aesni_xts_mulx	/* xmm15 := tweak[1] */
	movdqa	%xmm15,%xmm9	/* xmm9 := tweak[1] */
	call	aesni_xts_mulx	/* xmm15 := tweak[2] */
	movdqa	%xmm15,%xmm10	/* xmm10 := tweak[2] */
	call	aesni_xts_mulx	/* xmm15 := tweak[3] */
	movdqa	%xmm15,%xmm11	/* xmm11 := tweak[3] */
	call	aesni_xts_mulx	/* xmm15 := tweak[4] */
	movdqa	%xmm15,%xmm12	/* xmm12 := tweak[4] */
	call	aesni_xts_mulx	/* xmm15 := tweak[5] */
	movdqa	%xmm15,%xmm13	/* xmm13 := tweak[5] */
	call	aesni_xts_mulx	/* xmm15 := tweak[6] */
	movdqa	%xmm15,%xmm14	/* xmm14 := tweak[6] */
	call	aesni_xts_mulx	/* xmm15 := tweak[7] */
	movdqu	(%rsi),%xmm0	/* xmm[i] := ctxt[i] */
	movdqu	0x10(%rsi),%xmm1
	movdqu	0x20(%rsi),%xmm2
	movdqu	0x30(%rsi),%xmm3
	movdqu	0x40(%rsi),%xmm4
	movdqu	0x50(%rsi),%xmm5
	movdqu	0x60(%rsi),%xmm6
	movdqu	0x70(%rsi),%xmm7
	lea	0x80(%rsi),%rsi	/* advance rsi to next block group */
	movdqa	%xmm8,(%rsp)	/* save tweak[0] */
	pxor	%xmm8,%xmm0	/* xmm[i] := ctxt[i] ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_dec8	/* xmm[i] := AES^-1(ctxt[i] ^ tweak[i]) */
	pxor	(%rsp),%xmm0	/* xmm[i] := AES^-1(...) ^ tweak[i] */
	pxor	%xmm9,%xmm1
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
	pxor	%xmm12,%xmm4
	pxor	%xmm13,%xmm5
	pxor	%xmm14,%xmm6
	pxor	%xmm15,%xmm7
	movdqu	%xmm0,(%rdx)	/* store plaintext blocks */
	movdqu	%xmm1,0x10(%rdx)
	movdqu	%xmm2,0x20(%rdx)
	movdqu	%xmm3,0x30(%rdx)
	movdqu	%xmm4,0x40(%rdx)
	movdqu	%xmm5,0x50(%rdx)
	movdqu	%xmm6,0x60(%rdx)
	movdqu	%xmm7,0x70(%rdx)
	lea	0x80(%rdx),%rdx	/* advance rdx to next block group */
	call	aesni_xts_mulx	/* xmm15 := tweak[8] */
	sub	$0x80,%r10
	jnz	1b		/* repeat if more block groups */
	movdqu	%xmm15,(%r8)	/* update tweak */
	leave
	ret
END(aesni_xts_dec8)

/*
 * aesni_xts_mulx(tweak@xmm15)
 *
 *	Multiply xmm15 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses %xmm0 as temporary.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_xts_mulx,@function
aesni_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low quadword must be
	 *     shifted into the low bit of the high quadword, and
	 * (b) whether the high bit of the high quadword must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	pxor	%xmm0,%xmm0	/* xmm0 := 0 */
	pcmpgtq	%xmm15,%xmm0	/* xmm0[i] := -1 if 0 > xmm15[i] else 0 */
	pshufd	$0b01001110,%xmm0,%xmm0	/* swap halves of xmm0 */
	pand	xtscarry(%rip),%xmm0	/* copy xtscarry according to mask */
	psllq	$1,%xmm15	/* shift */
	pxor	%xmm0,%xmm15	/* incorporate (a) and (b) */
	ret
END(aesni_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
END(xtscarry)
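
/*
 * Reference sketch (C, not part of the build) of aesni_xts_mulx: shift
 * the 128-bit tweak left by one bit, quadword by quadword, folding the
 * bit shifted out of the low quadword into bit 0 of the high quadword
 * and the bit shifted out of the high quadword back in as
 * x^7 + x^2 + x + 1 = 0x87.  The pcmpgtq/pshufd/pand sequence above
 * selects exactly these two carry terms (the xtscarry constant) without
 * branching.  Little-endian quadwords assumed.
 *
 *	#include <sys/endian.h>
 *	#include <stdint.h>
 *
 *	static void
 *	xts_mulx_ref(uint8_t tweak[16])
 *	{
 *		uint64_t lo = le64dec(tweak), hi = le64dec(tweak + 8);
 *		uint64_t carry_hi = lo >> 63;	// into bit 64
 *		uint64_t carry_lo = hi >> 63;	// into x^128 = x^7+x^2+x+1
 *
 *		le64enc(tweak, (lo << 1) ^ (carry_lo ? 0x87 : 0));
 *		le64enc(tweak + 8, (hi << 1) | carry_hi);
 *	}
 */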

/*
 * aesni_xts_update(const uint8_t in[16] @rdi, uint8_t out[16] @rsi)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_xts_update)
	movdqu	(%rdi),%xmm15
	call	aesni_xts_mulx
	movdqu	%xmm15,(%rsi)
	ret
END(aesni_xts_update)

/*
 * aesni_cbcmac_update1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     size_t nbytes@rdx, uint8_t auth[16] @rcx, uint32_t nrounds@r8d)
 *
 *	Update CBC-MAC.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_cbcmac_update1)
	movdqu	(%rcx),%xmm0	/* xmm0 := auth */
	mov	%rdx,%r10	/* r10 := nbytes */
	mov	%rcx,%rdx	/* rdx := &auth */
1:	pxor	(%rsi),%xmm0	/* xmm0 ^= plaintext block */
	lea	0x10(%rsi),%rsi
	mov	%r8d,%ecx	/* ecx := nrounds */
	call	aesni_enc1	/* xmm0 := auth'; trash rax,rcx,xmm8 */
	sub	$0x10,%r10
	jnz	1b
	movdqu	%xmm0,(%rdx)	/* store auth' */
	ret
END(aesni_cbcmac_update1)

/*
 * aesni_ccm_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx,
 *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
 *
 *	Update CCM encryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_ccm_enc1)
	mov	%rcx,%r10	/* r10 := nbytes */
	movdqu	0x10(%r8),%xmm2	/* xmm2 := ctr (be) */
	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
	movdqu	(%r8),%xmm0	/* xmm0 := auth */
	pshufb	%xmm4,%xmm2	/* xmm2 := ctr (le) */
1:	movdqu	(%rsi),%xmm3	/* xmm3 := plaintext block */
	paddd	%xmm5,%xmm2	/* increment ctr (32-bit) */
	lea	0x10(%rsi),%rsi
	movdqa	%xmm2,%xmm1	/* xmm1 := ctr (le) */
	mov	%r9d,%ecx	/* ecx := nrounds */
	pshufb	%xmm4,%xmm1	/* xmm1 := ctr (be) */
	pxor	%xmm3,%xmm0	/* xmm0 := auth ^ ptxt */
	call	aesni_enc2	/* xmm0 := auth', xmm1 := pad;
				 * trash rax/rcx/xmm8 */
	pxor	%xmm1,%xmm3	/* xmm3 := ciphertext block */
	sub	$0x10,%r10	/* count down bytes */
	movdqu	%xmm3,(%rdx)	/* store ciphertext block */
	lea	0x10(%rdx),%rdx
	jnz	1b		/* repeat if more blocks */
	pshufb	%xmm4,%xmm2	/* xmm2 := ctr (be) */
	movdqu	%xmm0,(%r8)	/* store updated auth */
	movdqu	%xmm2,0x10(%r8)	/* store updated ctr */
	ret
END(aesni_ccm_enc1)
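
/*
 * Reference sketch (C, not part of the build) of one CCM step as done
 * above: per 16-byte block, one CBC-MAC encryption (auth := AES(auth ^
 * ptxt)) and one CTR encryption (ctxt := ptxt ^ AES(ctr)); aesni_enc2
 * runs those two AES calls in parallel.  authctr[0..15] is the CBC-MAC
 * state and authctr[16..31] the counter block, whose last 32 bits are a
 * big-endian block counter (matching bswap32/ctr32_inc above).
 * aes_enc_block() is a hypothetical helper.
 *
 *	static void
 *	ccm_enc_ref(const struct aesenc *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t authctr[32], uint32_t nrounds)
 *	{
 *		uint8_t *auth = authctr, *ctr = authctr + 16;
 *		uint8_t pad[16];
 *		unsigned i;
 *
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			be32enc(ctr + 12, be32dec(ctr + 12) + 1);
 *			for (i = 0; i < 16; i++)
 *				auth[i] ^= in[i];
 *			aes_enc_block(key, auth, auth, nrounds); // CBC-MAC
 *			aes_enc_block(key, ctr, pad, nrounds);	 // CTR pad
 *			for (i = 0; i < 16; i++)
 *				out[i] = in[i] ^ pad[i];
 *		}
 *	}
 */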

/*
 * aesni_ccm_dec1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
 *     uint8_t *out@rdx, size_t nbytes@rcx,
 *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
 *
 *	Update CCM decryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesni_ccm_dec1)
	movdqu	0x10(%r8),%xmm2	/* xmm2 := ctr (be) */
	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
	movdqu	(%r8),%xmm1	/* xmm1 := auth */
	pshufb	%xmm4,%xmm2	/* xmm2 := ctr (le) */
	mov	%rcx,%r10	/* r10 := nbytes */

	/* Decrypt the first block. */
	paddd	%xmm5,%xmm2	/* increment ctr (32-bit) */
	mov	%r9d,%ecx	/* ecx := nrounds */
	movdqa	%xmm2,%xmm0	/* xmm0 := ctr (le) */
	movdqu	(%rsi),%xmm3	/* xmm3 := ctxt */
	pshufb	%xmm4,%xmm0	/* xmm0 := ctr (be) */
	lea	0x10(%rsi),%rsi
	call	aesni_enc1	/* xmm0 := pad; trash rax/rcx/xmm8 */
	jmp	2f

1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 *	xmm1 = auth ^ ptxt[-1]
	 *	xmm2 = ctr[-1] (le)
	 */
	paddd	%xmm5,%xmm2	/* increment ctr (32-bit) */
	mov	%r9d,%ecx	/* ecx := nrounds */
	movdqa	%xmm2,%xmm0	/* xmm0 := ctr (le) */
	movdqu	(%rsi),%xmm3	/* xmm3 := ctxt */
	pshufb	%xmm4,%xmm0	/* xmm0 := ctr (be) */
	lea	0x10(%rsi),%rsi
	call	aesni_enc2	/* xmm0 := pad, xmm1 := auth';
				 * trash rax/rcx/xmm8 */
2:	pxor	%xmm0,%xmm3	/* xmm3 := ptxt */
	sub	$0x10,%r10
	movdqu	%xmm3,(%rdx)	/* store plaintext */
	lea	0x10(%rdx),%rdx
	pxor	%xmm3,%xmm1	/* xmm1 := auth ^ ptxt */
	jnz	1b

	/* Authenticate the last block. */
	movdqa	%xmm1,%xmm0	/* xmm0 := auth ^ ptxt */
	mov	%r9d,%ecx	/* ecx := nrounds */
	call	aesni_enc1	/* xmm0 := auth' */
	pshufb	%xmm4,%xmm2	/* xmm2 := ctr (be) */
	movdqu	%xmm0,(%r8)	/* store updated auth */
	movdqu	%xmm2,0x10(%r8)	/* store updated ctr */
	ret
END(aesni_ccm_dec1)

	.section .rodata
	.p2align 4
	.type	bswap32,@object
bswap32:
	.byte	3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
END(bswap32)

	.section .rodata
	.p2align 4
	.type	ctr32_inc,@object
ctr32_inc:
	.byte	0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0
END(ctr32_inc)

/*
 * aesni_enc1(const struct aesenc *enckey@rdi, uint128_t block@xmm0,
 *     uint32_t nrounds@ecx)
 *
 *	Encrypt a single AES block in %xmm0.
 *
 *	Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc1,@function
aesni_enc1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
1:	aesenc	%xmm8,%xmm0
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	ret
END(aesni_enc1)
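
/*
 * Reference sketch (C, not part of the build) of the round loop above.
 * The assembly indexes round keys from the end of the schedule so that
 * the running byte offset in %rcx doubles as the loop-termination test;
 * logically it is just the usual structure below.  aesenc_round() and
 * aesenc_last() are hypothetical stand-ins for AESENC and AESENCLAST.
 *
 *	static void
 *	aes_enc_block_ref(const uint8_t rk[][16],	// nrounds+1 round keys
 *	    const uint8_t in[16], uint8_t out[16], uint32_t nrounds)
 *	{
 *		uint8_t b[16];
 *		unsigned i, r;
 *
 *		for (i = 0; i < 16; i++)
 *			b[i] = in[i] ^ rk[0][i];	// first round key
 *		for (r = 1; r < nrounds; r++)
 *			aesenc_round(b, rk[r]);		// middle rounds
 *		aesenc_last(b, rk[nrounds]);		// final round
 *		memcpy(out, b, 16);
 *	}
 */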

/*
 * aesni_enc2(const struct aesenc *enckey@rdi, uint128_t block0@xmm0,
 *     uint128_t block1@xmm1, uint32_t nrounds@ecx)
 *
 *	Encrypt two AES blocks in %xmm0 and %xmm1.
 *
 *	Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc2,@function
aesni_enc2:
	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	pxor	%xmm8,%xmm0	/* xor in first round key */
	pxor	%xmm8,%xmm1
	jmp	2f
1:	aesenc	%xmm8,%xmm0
	aesenc	%xmm8,%xmm1
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if there's more */
	aesenclast %xmm8,%xmm0
	aesenclast %xmm8,%xmm1
	ret
END(aesni_enc2)

/*
 * aesni_enc8(const struct aesenc *enckey@rdi, uint128_t block0@xmm0, ...,
 *     block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Encrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_enc8,@function
aesni_enc8:
	movdqa	(%rdi),%xmm8	/* xor in first round key */
	pxor	%xmm8,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := total byte size of round keys */
	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
1:	aesenc	%xmm8,%xmm0
	aesenc	%xmm8,%xmm1
	aesenc	%xmm8,%xmm2
	aesenc	%xmm8,%xmm3
	aesenc	%xmm8,%xmm4
	aesenc	%xmm8,%xmm5
	aesenc	%xmm8,%xmm6
	aesenc	%xmm8,%xmm7
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesenclast %xmm8,%xmm0
	aesenclast %xmm8,%xmm1
	aesenclast %xmm8,%xmm2
	aesenclast %xmm8,%xmm3
	aesenclast %xmm8,%xmm4
	aesenclast %xmm8,%xmm5
	aesenclast %xmm8,%xmm6
	aesenclast %xmm8,%xmm7
	ret
END(aesni_enc8)

/*
 * aesni_dec1(const struct aesdec *deckey@rdi, uint128_t block@xmm0,
 *     uint32_t nrounds@ecx)
 *
 *	Decrypt a single AES block in %xmm0.
 *
 *	Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec1,@function
aesni_dec1:
	pxor	(%rdi),%xmm0	/* xor in first round key */
	shl	$4,%ecx		/* ecx := byte offset of round key */
	lea	0x10(%rdi,%rcx),%rax	/* rax := pointer to round key */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
1:	aesdec	%xmm8,%xmm0
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesdeclast %xmm8,%xmm0
	ret
END(aesni_dec1)

/*
 * aesni_dec8(const struct aesdec *deckey@rdi, uint128_t block0@xmm0, ...,
 *     block7@xmm7, uint32_t nrounds@ecx)
 *
 *	Decrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
 *
 *	Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
 */
	.text
	_ALIGN_TEXT
	.type	aesni_dec8,@function
aesni_dec8:
	movdqa	(%rdi),%xmm8	/* xor in first round key */
	pxor	%xmm8,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm8,%xmm2
	pxor	%xmm8,%xmm3
	pxor	%xmm8,%xmm4
	pxor	%xmm8,%xmm5
	pxor	%xmm8,%xmm6
	pxor	%xmm8,%xmm7
	shl	$4,%ecx		/* ecx := byte offset of round key */
	lea	0x10(%rdi,%rcx),%rax	/* rax := pointer to round key */
	neg	%rcx		/* rcx := byte offset of round key from end */
	jmp	2f
1:	aesdec	%xmm8,%xmm0
	aesdec	%xmm8,%xmm1
	aesdec	%xmm8,%xmm2
	aesdec	%xmm8,%xmm3
	aesdec	%xmm8,%xmm4
	aesdec	%xmm8,%xmm5
	aesdec	%xmm8,%xmm6
	aesdec	%xmm8,%xmm7
2:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
	add	$0x10,%rcx
	jnz	1b		/* repeat if more rounds */
	aesdeclast %xmm8,%xmm0
	aesdeclast %xmm8,%xmm1
	aesdeclast %xmm8,%xmm2
	aesdeclast %xmm8,%xmm3
	aesdeclast %xmm8,%xmm4
	aesdeclast %xmm8,%xmm5
	aesdeclast %xmm8,%xmm6
	aesdeclast %xmm8,%xmm7
	ret
END(aesni_dec8)
