1 /* $NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include <machine/asm.h>
30
31 RCSID("$NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $")
32
33 /*
34 * MOVDQA/MOVDQU are Move Double Quadword (Aligned/Unaligned), defined
35 * to operate on integers; MOVAPS/MOVUPS are Move (Aligned/Unaligned)
36 * Packed Single, defined to operate on binary32 floats. They have
37 * exactly the same architectural effects (move a 128-bit quantity from
38 * memory into an xmm register).
39 *
40 * In principle, they might have different microarchitectural effects
41 * so that MOVAPS/MOVUPS might incur a penalty when the register is
42 * later used for integer paths, but in practice they don't. So we use
43 * the one whose instruction encoding is shorter -- MOVAPS/MOVUPS.
44 */
45 #define movdqa movaps
46 #define movdqu movups
47
48 /*
49 * aesni_setenckey128(struct aesenc *enckey@rdi, const uint8_t key[16] @rsi)
50 *
51 * Expand a 16-byte AES-128 key into 10 round keys.
52 *
53 * Standard ABI calling convention.
54 */
55 ENTRY(aesni_setenckey128)
56 movdqu (%rsi),%xmm0 /* load master key into %xmm0 */
57 movdqa %xmm0,(%rdi) /* store master key as the first round key */
58 lea 0x10(%rdi),%rdi /* advance %rdi to next round key */
59 aeskeygenassist $0x1,%xmm0,%xmm2
60 call aesni_expand128
61 aeskeygenassist $0x2,%xmm0,%xmm2
62 call aesni_expand128
63 aeskeygenassist $0x4,%xmm0,%xmm2
64 call aesni_expand128
65 aeskeygenassist $0x8,%xmm0,%xmm2
66 call aesni_expand128
67 aeskeygenassist $0x10,%xmm0,%xmm2
68 call aesni_expand128
69 aeskeygenassist $0x20,%xmm0,%xmm2
70 call aesni_expand128
71 aeskeygenassist $0x40,%xmm0,%xmm2
72 call aesni_expand128
73 aeskeygenassist $0x80,%xmm0,%xmm2
74 call aesni_expand128
75 aeskeygenassist $0x1b,%xmm0,%xmm2
76 call aesni_expand128
77 aeskeygenassist $0x36,%xmm0,%xmm2
78 call aesni_expand128
79 ret
80 END(aesni_setenckey128)
81
82 /*
83 * aesni_setenckey192(struct aesenc *enckey@rdi, const uint8_t key[24] @rsi)
84 *
85 * Expand a 24-byte AES-192 key into 12 round keys.
86 *
87 * Standard ABI calling convention.
88 */
89 ENTRY(aesni_setenckey192)
90 movdqu (%rsi),%xmm0 /* load master key [0:128) into %xmm0 */
91 movq 0x10(%rsi),%xmm1 /* load master key [128:192) into %xmm1 */
92 movdqa %xmm0,(%rdi) /* store master key [0:128) as round key */
93 lea 0x10(%rdi),%rdi /* advance %rdi to next round key */
94 aeskeygenassist $0x1,%xmm1,%xmm2
95 call aesni_expand192a
96 aeskeygenassist $0x2,%xmm0,%xmm2
97 call aesni_expand192b
98 aeskeygenassist $0x4,%xmm1,%xmm2
99 call aesni_expand192a
100 aeskeygenassist $0x8,%xmm0,%xmm2
101 call aesni_expand192b
102 aeskeygenassist $0x10,%xmm1,%xmm2
103 call aesni_expand192a
104 aeskeygenassist $0x20,%xmm0,%xmm2
105 call aesni_expand192b
106 aeskeygenassist $0x40,%xmm1,%xmm2
107 call aesni_expand192a
108 aeskeygenassist $0x80,%xmm0,%xmm2
109 call aesni_expand192b
110 ret
111 END(aesni_setenckey192)
112
113 /*
114 * aesni_setenckey256(struct aesenc *enckey@rdi, const uint8_t key[32] @rsi)
115 *
116 * Expand a 32-byte AES-256 key into 14 round keys.
117 *
118 * Standard ABI calling convention.
119 */
120 ENTRY(aesni_setenckey256)
121 movdqu (%rsi),%xmm0 /* load master key [0:128) into %xmm0 */
122 movdqu 0x10(%rsi),%xmm1 /* load master key [128:256) into %xmm1 */
123 movdqa %xmm0,(%rdi) /* store master key [0:128) as round key */
124 movdqa %xmm1,0x10(%rdi) /* store master key [128:256) as round key */
125 lea 0x20(%rdi),%rdi /* advance %rdi to next round key */
126 aeskeygenassist $0x1,%xmm1,%xmm2
127 call aesni_expand256a
128 aeskeygenassist $0x1,%xmm0,%xmm2
129 call aesni_expand256b
130 aeskeygenassist $0x2,%xmm1,%xmm2
131 call aesni_expand256a
132 aeskeygenassist $0x2,%xmm0,%xmm2
133 call aesni_expand256b
134 aeskeygenassist $0x4,%xmm1,%xmm2
135 call aesni_expand256a
136 aeskeygenassist $0x4,%xmm0,%xmm2
137 call aesni_expand256b
138 aeskeygenassist $0x8,%xmm1,%xmm2
139 call aesni_expand256a
140 aeskeygenassist $0x8,%xmm0,%xmm2
141 call aesni_expand256b
142 aeskeygenassist $0x10,%xmm1,%xmm2
143 call aesni_expand256a
144 aeskeygenassist $0x10,%xmm0,%xmm2
145 call aesni_expand256b
146 aeskeygenassist $0x20,%xmm1,%xmm2
147 call aesni_expand256a
148 aeskeygenassist $0x20,%xmm0,%xmm2
149 call aesni_expand256b
150 aeskeygenassist $0x40,%xmm1,%xmm2
151 call aesni_expand256a
152 ret
153 END(aesni_setenckey256)
154
155 /*
156 * aesni_expand128(uint128_t *rkp@rdi, uint128_t prk@xmm0,
157 * uint128_t keygenassist@xmm2)
158 *
159 * 1. Compute the AES-128 round key using the previous round key.
160 * 2. Store it at *rkp.
161 * 3. Set %xmm0 to it.
162 * 4. Advance %rdi to point at the next round key.
163 *
164 * Internal ABI. On entry:
165 *
166 * %rdi = rkp, pointer to round key to compute
167 * %xmm0 = (prk[0], prk[1], prk[2], prk[3])
168 * %xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
169 *
170 * On exit:
171 *
172 * %rdi = &rkp[1], rkp advanced by one round key
173 * %xmm0 = rk, the round key we just computed
174 * %xmm2 = garbage
175 * %xmm4 = garbage
176 * %xmm5 = garbage
177 * %xmm6 = garbage
178 *
179 * Note: %xmm1 is preserved (as are %xmm3 and %xmm7 through %xmm15,
180 * and all other registers).
181 */
182 .text
183 _ALIGN_TEXT
184 .type aesni_expand128,@function
185 aesni_expand128:
186 /*
187 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
188 * i.e., set each word of %xmm2 to t := Rot(SubWord(prk[3])) ^ RCON.
189 */
190 pshufd $0b11111111,%xmm2,%xmm2
191
192 /*
193 * %xmm4 := (0, prk[0], prk[1], prk[2])
194 * %xmm5 := (0, 0, prk[0], prk[1])
195 * %xmm6 := (0, 0, 0, prk[0])
196 */
197 movdqa %xmm0,%xmm4
198 movdqa %xmm0,%xmm5
199 movdqa %xmm0,%xmm6
200 pslldq $4,%xmm4
201 pslldq $8,%xmm5
202 pslldq $12,%xmm6
203
204 /*
205 * %xmm0 := (rk[0] = t ^ prk[0],
206 * rk[1] = t ^ prk[0] ^ prk[1],
207 * rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2],
208 * rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3])
209 */
210 pxor %xmm2,%xmm0
211 pxor %xmm4,%xmm0
212 pxor %xmm5,%xmm0
213 pxor %xmm6,%xmm0
214
215 movdqa %xmm0,(%rdi) /* store round key */
216 lea 0x10(%rdi),%rdi /* advance to next round key address */
217 ret
218 END(aesni_expand128)
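
/*
 * Illustrative C sketch of the computation above (not part of the
 * build; types from <stdint.h>, and t is the Rot(Sub(prk[3])) ^ RCON
 * word produced by AESKEYGENASSIST).  Each AES-128 round-key word is
 * a prefix XOR of t with the previous round key's words:
 *
 *	static void
 *	expand128_step(uint32_t rk[4], const uint32_t prk[4], uint32_t t)
 *	{
 *
 *		rk[0] = t ^ prk[0];
 *		rk[1] = rk[0] ^ prk[1];
 *		rk[2] = rk[1] ^ prk[2];
 *		rk[3] = rk[2] ^ prk[3];
 *	}
 *
 * The assembly computes the same prefix XORs branch-free by XORing
 * together the broadcast t and copies of prk shifted left by one, two,
 * and three words (the PSLLDQ $4/$8/$12 above).
 */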
219
220 /*
221 * aesni_expand192a(uint128_t *rkp@rdi, uint128_t prk@xmm0,
222 * uint64_t rklo@xmm1, uint128_t keygenassist@xmm2)
223 *
224 * Set even-numbered AES-192 round key.
225 *
226 * Internal ABI. On entry:
227 *
228 * %rdi = rkp, pointer to two round keys to compute
229 * %xmm0 = (prk[0], prk[1], prk[2], prk[3])
230 * %xmm1 = (rklo[0], rklo[1], xxx, xxx)
231 * %xmm2 = (xxx, t = Rot(SubWord(rklo[1])) ^ RCON, xxx, xxx)
232 *
233 * On exit:
234 *
235 * %rdi = &rkp[2], rkp advanced by two round keys
236 * %xmm0 = nrk, second round key we just computed
237 * %xmm1 = rk, first round key we just computed
238 * %xmm2 = garbage
239 * %xmm4 = garbage
240 * %xmm5 = garbage
241 * %xmm6 = garbage
242 * %xmm7 = garbage
243 */
244 .text
245 _ALIGN_TEXT
246 .type aesni_expand192a,@function
247 aesni_expand192a:
248 /*
249 * %xmm2 := (%xmm2[1], %xmm2[1], %xmm2[1], %xmm2[1]),
250 * i.e., set each word of %xmm2 to t := Rot(SubWord(rklo[1])) ^ RCON.
251 */
252 pshufd $0b01010101,%xmm2,%xmm2
253
254 /*
255 * We need to compute:
256 *
257 * rk[0] := rklo[0]
258 * rk[1] := rklo[1]
259 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
260 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
261 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
262 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
263 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
264 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
265 * ^ rklo[1]
266 */
267
268 /*
269 * %xmm4 := (prk[0], prk[1], prk[2], prk[3])
270 * %xmm5 := (0, prk[0], prk[1], prk[2])
271 * %xmm6 := (0, 0, prk[0], prk[1])
272 * %xmm7 := (0, 0, 0, prk[0])
273 */
274 movdqa %xmm0,%xmm4
275 movdqa %xmm0,%xmm5
276 movdqa %xmm0,%xmm6
277 movdqa %xmm0,%xmm7
278 pslldq $4,%xmm5
279 pslldq $8,%xmm6
280 pslldq $12,%xmm7
281
282 /* %xmm4 := (rk[2], rk[3], nrk[0], nrk[1]) */
283 pxor %xmm2,%xmm4
284 pxor %xmm5,%xmm4
285 pxor %xmm6,%xmm4
286 pxor %xmm7,%xmm4
287
288 /*
289 * At this point, rk is split across %xmm1 (rk[0],rk[1],...) and
290 * %xmm4 (rk[2],rk[3],...); nrk is in %xmm4 (...,nrk[0],nrk[1]);
291 * and we have yet to compute nrk[2] or nrk[3], which requires
292 * rklo[0] and rklo[1] in %xmm1 (rklo[0], rklo[1], ...). We need
293 * nrk to end up in %xmm0 at the end, so gather rk into %xmm1 and
294 * nrk into %xmm0.
295 */
296
297 /* %xmm0 := (nrk[0], nrk[1], nrk[1], nrk[1]) */
298 pshufd $0b11111110,%xmm4,%xmm0
299
300 /*
301 * %xmm6 := (0, 0, rklo[0], rklo[1])
302 * %xmm7 := (0, 0, 0, rklo[0])
303 */
304 movdqa %xmm1,%xmm6
305 movdqa %xmm1,%xmm7
306
307 pslldq $8,%xmm6
308 pslldq $12,%xmm7
309
310 /*
311 * %xmm0 := (nrk[0],
312 * nrk[1],
313 * nrk[2] = nrk[1] ^ rklo[0],
314 * nrk[3] = nrk[1] ^ rklo[0] ^ rklo[1])
315 */
316 pxor %xmm6,%xmm0
317 pxor %xmm7,%xmm0
318
319 /* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
320 shufps $0b01000100,%xmm4,%xmm1
321
322 movdqa %xmm1,(%rdi) /* store round key */
323 movdqa %xmm0,0x10(%rdi) /* store next round key */
324 lea 0x20(%rdi),%rdi /* advance two round keys */
325 ret
326 END(aesni_expand192a)
327
328 /*
329 * aesni_expand192b(uint128_t *roundkey@rdi, uint128_t prk@xmm0,
330 * uint128_t keygenassist@xmm2)
331 *
332 * Set odd-numbered AES-192 round key.
333 *
334 * Internal ABI. On entry:
335 *
336 * %rdi = rkp, pointer to round key to compute
337 * %xmm0 = (prk[0], prk[1], prk[2], prk[3])
338 * %xmm1 = (xxx, xxx, pprk[2], pprk[3])
339 * %xmm2 = (xxx, xxx, xxx, t = Rot(Sub(prk[3])) ^ RCON)
340 *
341 * On exit:
342 *
343 * %rdi = &rkp[1], rkp advanced by one round key
344 * %xmm0 = rk, the round key we just computed
345 * %xmm1 = (nrk[0], nrk[1], xxx, xxx), half of next round key
346 * %xmm2 = garbage
347 * %xmm4 = garbage
348 * %xmm5 = garbage
349 * %xmm6 = garbage
350 * %xmm7 = garbage
351 */
352 .text
353 _ALIGN_TEXT
354 .type aesni_expand192b,@function
355 aesni_expand192b:
356 /*
357 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
358 * i.e., set each word of %xmm2 to t := Rot(Sub(prk[3])) ^ RCON.
359 */
360 pshufd $0b11111111,%xmm2,%xmm2
361
362 /*
363 * We need to compute:
364 *
365 * rk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2]
366 * rk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3]
367 * rk[2] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
368 * rk[3] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
369 * ^ prk[1]
370 * nrk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
371 * ^ prk[1] ^ prk[2]
372 * nrk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
373 * ^ prk[1] ^ prk[2] ^ prk[3]
374 */
375
376 /* %xmm1 := (pprk[2], pprk[3], prk[0], prk[1]) */
377 shufps $0b01001110,%xmm0,%xmm1
378
379 /*
380 * %xmm5 := (0, pprk[2], pprk[3], prk[0])
381 * %xmm6 := (0, 0, pprk[2], pprk[3])
382 * %xmm7 := (0, 0, 0, pprk[2])
383 */
384 movdqa %xmm1,%xmm5
385 movdqa %xmm1,%xmm6
386 movdqa %xmm1,%xmm7
387 pslldq $4,%xmm5
388 pslldq $8,%xmm6
389 pslldq $12,%xmm7
390
391 /* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
392 pxor %xmm2,%xmm1
393 pxor %xmm5,%xmm1
394 pxor %xmm6,%xmm1
395 pxor %xmm7,%xmm1
396
397 /* %xmm4 := (prk[2], prk[3], xxx, xxx) */
398 pshufd $0b00001110,%xmm0,%xmm4
399
400 /* %xmm5 := (0, prk[2], xxx, xxx) */
401 movdqa %xmm4,%xmm5
402 pslldq $4,%xmm5
403
404 /* %xmm0 := (rk[0], rk[1], rk[2], rk[3]) */
405 movdqa %xmm1,%xmm0
406
407 /* %xmm1 := (rk[3], rk[3], xxx, xxx) */
408 shufps $0b00001111,%xmm1,%xmm1
409
410 /*
411 * %xmm1 := (nrk[0] = rk[3] ^ prk[2],
412 * nrk[1] = rk[3] ^ prk[2] ^ prk[3],
413 * xxx,
414 * xxx)
415 */
416 pxor %xmm4,%xmm1
417 pxor %xmm5,%xmm1
418
419 movdqa %xmm0,(%rdi) /* store round key */
420 lea 0x10(%rdi),%rdi /* advance to next round key address */
421 ret
422 END(aesni_expand192b)
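
/*
 * Illustrative C sketch of the AES-192 schedule step implemented by
 * the expand192a/expand192b pair above (not part of the build; types
 * from <stdint.h>, and t is the Rot(Sub(w[5])) ^ RCON word from
 * AESKEYGENASSIST).  Each step extends the schedule by six 32-bit
 * words, i.e. one and a half 128-bit round keys, which is why the two
 * helpers alternate:
 *
 *	static void
 *	expand192_step(uint32_t w[12], uint32_t t)
 *	{
 *		unsigned i;
 *
 *		w[6] = t ^ w[0];
 *		for (i = 7; i < 12; i++)
 *			w[i] = w[i - 1] ^ w[i - 6];
 *	}
 *
 * Here w[0..5] is the previous six-word group and w[6..11] the next;
 * the assembly keeps those groups split across %xmm0/%xmm1 and
 * reassembles whole 128-bit round keys with SHUFPS.
 */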
423
424 /*
425 * aesni_expand256a(uint128_t *rkp@rdi, uint128_t pprk@xmm0,
426 * uint128_t prk@xmm1, uint128_t keygenassist@xmm2)
427 *
428 * Set even-numbered AES-256 round key.
429 *
430 * Internal ABI. On entry:
431 *
432 * %rdi = rkp, pointer to round key to compute
433 * %xmm0 = (pprk[0], pprk[1], pprk[2], pprk[3])
434 * %xmm1 = (prk[0], prk[1], prk[2], prk[3])
435 * %xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
436 *
437 * On exit:
438 *
439 * %rdi = &rkp[1], rkp advanced by one round key
440 * %xmm0 = rk, the round key we just computed
441 * %xmm1 = prk, previous round key, preserved from entry
442 * %xmm2 = garbage
443 * %xmm4 = garbage
444 * %xmm5 = garbage
445 * %xmm6 = garbage
446 *
447 * The computation turns out to be the same as for AES-128; the
448 * previous round key does not figure into it, only the
449 * previous-previous round key.
450 */
451 aesni_expand256a = aesni_expand128
452
453 /*
454 * aesni_expand256b(uint128_t *rkp@rdi, uint128_t prk@xmm0,
455 * uint128_t pprk@xmm1, uint128_t keygenassist@xmm2)
456 *
457 * Set odd-numbered AES-256 round key.
458 *
459 * Internal ABI. On entry:
460 *
461 * %rdi = rkp, pointer to round key to compute
462 * %xmm0 = (prk[0], prk[1], prk[2], prk[3])
463 * %xmm1 = (pprk[0], pprk[1], pprk[2], pprk[3])
464 * %xmm2 = (xxx, xxx, t = Sub(prk[3]), xxx)
465 *
466 * On exit:
467 *
468 * %rdi = &rkp[1], rkp advanced by one round key
469 * %xmm0 = prk, previous round key, preserved from entry
470 * %xmm1 = rk, the round key we just computed
471 * %xmm2 = garbage
472 * %xmm4 = garbage
473 * %xmm5 = garbage
474 * %xmm6 = garbage
475 */
476 .text
477 _ALIGN_TEXT
478 .type aesni_expand256b,@function
479 aesni_expand256b:
480 /*
481 * %xmm2 := (%xmm2[2], %xmm2[2], %xmm2[2], %xmm2[2]),
482 * i.e., set each word of %xmm2 to t := Sub(prk[3]).
483 */
484 pshufd $0b10101010,%xmm2,%xmm2
485
486 /*
487 * %xmm4 := (0, pprk[0], pprk[1], pprk[2])
488 * %xmm5 := (0, 0, pprk[0], pprk[1])
489 * %xmm6 := (0, 0, 0, pprk[0])
490 */
491 movdqa %xmm1,%xmm4
492 movdqa %xmm1,%xmm5
493 movdqa %xmm1,%xmm6
494 pslldq $4,%xmm4
495 pslldq $8,%xmm5
496 pslldq $12,%xmm6
497
498 /*
499 * %xmm1 := (rk[0] = t ^ pprk[0],
500 * rk[1] = t ^ pprk[0] ^ pprk[1],
501 * rk[2] = t ^ pprk[0] ^ pprk[1] ^ pprk[2],
502 * rk[3] = t ^ pprk[0] ^ pprk[1] ^ pprk[2] ^ pprk[3])
503 */
504 pxor %xmm2,%xmm1
505 pxor %xmm4,%xmm1
506 pxor %xmm5,%xmm1
507 pxor %xmm6,%xmm1
508
509 movdqa %xmm1,(%rdi) /* store round key */
510 lea 0x10(%rdi),%rdi /* advance to next round key address */
511 ret
512 END(aesni_expand256b)
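
/*
 * Illustrative C sketch of the AES-256 schedule steps above (not part
 * of the build; types from <stdint.h>).  Even-numbered round keys
 * (expand256a) use t = Rot(Sub(prk[3])) ^ RCON; odd-numbered ones
 * (expand256b) use t = Sub(prk[3]) with no rotation and no RCON.  The
 * prefix-XOR structure is the same in both cases:
 *
 *	static void
 *	expand256_step(uint32_t rk[4], const uint32_t pprk[4], uint32_t t)
 *	{
 *
 *		rk[0] = t ^ pprk[0];
 *		rk[1] = rk[0] ^ pprk[1];
 *		rk[2] = rk[1] ^ pprk[2];
 *		rk[3] = rk[2] ^ pprk[3];
 *	}
 */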
513
514 /*
515 * aesni_enctodec(const struct aesenc *enckey@rdi, struct aesdec *deckey@rsi,
516 * uint32_t nrounds@rdx)
517 *
518 * Convert AES encryption round keys to AES decryption round keys.
519 * `nrounds' must be 10, 12, or 14.
520 *
521 * Standard ABI calling convention.
522 */
523 ENTRY(aesni_enctodec)
524 shl $4,%edx /* rdx := byte offset of last round key */
525 movdqa (%rdi,%rdx),%xmm0 /* load last round key */
526 movdqa %xmm0,(%rsi) /* store last round key verbatim */
527 jmp 2f
528 _ALIGN_TEXT
529 1: movdqa (%rdi,%rdx),%xmm0 /* load round key */
530 aesimc %xmm0,%xmm0 /* convert encryption to decryption */
531 movdqa %xmm0,(%rsi) /* store round key */
532 2: sub $0x10,%rdx /* advance to next round key */
533 lea 0x10(%rsi),%rsi
534 jnz 1b /* repeat if more rounds */
535 movdqa (%rdi),%xmm0 /* load first round key */
536 movdqa %xmm0,(%rsi) /* store first round key verbatim */
537 ret
538 END(aesni_enctodec)
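
/*
 * Illustrative C sketch of the conversion above (not part of the
 * build; aes_invmixcolumns() is a hypothetical stand-in for AESIMC,
 * memcpy from <string.h>).  The decryption schedule is the encryption
 * schedule in reverse order, with InvMixColumns applied to every round
 * key except the first and last:
 *
 *	static void
 *	enctodec(const uint8_t enc[][16], uint8_t dec[][16],
 *	    unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		memcpy(dec[0], enc[nrounds], 16);
 *		for (i = 1; i < nrounds; i++)
 *			aes_invmixcolumns(enc[nrounds - i], dec[i]);
 *		memcpy(dec[nrounds], enc[0], 16);
 *	}
 */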
539
540 /*
541 * aesni_enc(const struct aesenc *enckey@rdi, const uint8_t in[16] @rsi,
542 * uint8_t out[16] @rdx, uint32_t nrounds@ecx)
543 *
544 * Encrypt a single block.
545 *
546 * Standard ABI calling convention.
547 */
548 ENTRY(aesni_enc)
549 movdqu (%rsi),%xmm0
550 call aesni_enc1
551 movdqu %xmm0,(%rdx)
552 ret
553 END(aesni_enc)
554
555 /*
556 * aesni_dec(const struct aesdec *deckey@rdi, const uint8_t in[16] @rsi,
557 * uint8_t out[16] @rdx, uint32_t nrounds@ecx)
558 *
559 * Decrypt a single block.
560 *
561 * Standard ABI calling convention.
562 */
563 ENTRY(aesni_dec)
564 movdqu (%rsi),%xmm0
565 call aesni_dec1
566 movdqu %xmm0,(%rdx)
567 ret
568 END(aesni_dec)
569
570 /*
571 * aesni_cbc_enc(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
572 * uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
573 * uint32_t nrounds@r9d)
574 *
575 * Encrypt a contiguous sequence of blocks with AES-CBC.
576 *
577 * nbytes must be an integral multiple of 16.
578 *
579 * Standard ABI calling convention.
580 */
581 ENTRY(aesni_cbc_enc)
582 cmp $0,%rcx
583 jz 2f
584 mov %rcx,%r10 /* r10 := nbytes */
585 movdqu (%r8),%xmm0 /* xmm0 := chaining value */
586 _ALIGN_TEXT
587 1: movdqu (%rsi),%xmm1 /* xmm1 := plaintext block */
588 lea 0x10(%rsi),%rsi
589 pxor %xmm1,%xmm0 /* xmm0 := cv ^ ptxt */
590 mov %r9d,%ecx /* ecx := nrounds */
591 call aesni_enc1 /* xmm0 := ciphertext block */
592 movdqu %xmm0,(%rdx)
593 lea 0x10(%rdx),%rdx
594 sub $0x10,%r10
595 jnz 1b /* repeat if r10 is nonzero */
596 movdqu %xmm0,(%r8) /* store chaining value */
597 2: ret
598 END(aesni_cbc_enc)
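
/*
 * Illustrative C sketch of the loop above (not part of the build;
 * aes_enc_block() is a hypothetical single-block primitive standing in
 * for aesni_enc1).  Each plaintext block is XORed into the running
 * chaining value before encryption, and the final ciphertext block is
 * written back as the new chaining value:
 *
 *	static void
 *	cbc_enc(const struct aesenc *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t iv[16], unsigned nrounds)
 *	{
 *		uint8_t cv[16];
 *		unsigned i;
 *
 *		memcpy(cv, iv, 16);
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 0; i < 16; i++)
 *				cv[i] ^= in[i];
 *			aes_enc_block(key, cv, cv, nrounds);
 *			memcpy(out, cv, 16);
 *		}
 *		memcpy(iv, cv, 16);
 *	}
 *
 * CBC encryption is inherently serial -- each block's input depends on
 * the previous block's output -- which is why there is no eight-block
 * variant of it here, unlike CBC decryption below.
 */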
599
600 /*
601 * aesni_cbc_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
602 * uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
603 * uint32_t nrounds@r9)
604 *
605 * Decrypt a contiguous sequence of blocks with AES-CBC.
606 *
607 * nbytes must be a positive integral multiple of 16. This routine
608 * is not vectorized; use aesni_cbc_dec8 for >=8 blocks at once.
609 *
610 * Standard ABI calling convention.
611 */
612 ENTRY(aesni_cbc_dec1)
613 push %rbp /* create stack frame uint128[1] */
614 mov %rsp,%rbp
615 sub $0x10,%rsp
616 movdqu (%r8),%xmm8 /* xmm8 := iv */
617 movdqa %xmm8,(%rsp) /* save iv */
618 mov %rcx,%r10 /* r10 := nbytes */
619 movdqu -0x10(%rsi,%r10),%xmm0 /* xmm0 := last ciphertext block */
620 movdqu %xmm0,(%r8) /* update iv */
621 jmp 2f
622 _ALIGN_TEXT
623 1: movdqu -0x10(%rsi,%r10),%xmm8 /* xmm8 := chaining value */
624 pxor %xmm8,%xmm0 /* xmm0 := ptxt */
625 movdqu %xmm0,(%rdx,%r10) /* store plaintext block */
626 movdqa %xmm8,%xmm0 /* move cv = ciphertext block */
627 2: mov %r9d,%ecx /* ecx := nrounds */
628 call aesni_dec1 /* xmm0 := cv ^ ptxt */
629 sub $0x10,%r10
630 jnz 1b /* repeat if more blocks */
631 pxor (%rsp),%xmm0 /* xmm0 := ptxt */
632 movdqu %xmm0,(%rdx) /* store first plaintext block */
633 leave
634 ret
635 END(aesni_cbc_dec1)
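
/*
 * Illustrative C sketch of the loop above (not part of the build;
 * aes_dec_block() is a hypothetical stand-in for aesni_dec1).  The
 * loop runs back to front so the new IV (the last ciphertext block)
 * can be captured before a possibly in-place output overwrites it,
 * and the original IV, which the assembly saves on the stack, is
 * applied to the first block at the very end:
 *
 *	static void
 *	cbc_dec(const struct aesdec *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t iv[16], unsigned nrounds)
 *	{
 *		uint8_t iv0[16], tmp[16];
 *		unsigned i;
 *
 *		memcpy(iv0, iv, 16);
 *		memcpy(iv, in + nbytes - 16, 16);
 *		while (nbytes > 16) {
 *			nbytes -= 16;
 *			aes_dec_block(key, in + nbytes, tmp, nrounds);
 *			for (i = 0; i < 16; i++)
 *				out[nbytes + i] = tmp[i] ^ in[nbytes - 16 + i];
 *		}
 *		aes_dec_block(key, in, tmp, nrounds);
 *		for (i = 0; i < 16; i++)
 *			out[i] = tmp[i] ^ iv0[i];
 *	}
 */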
636
637 /*
638 * aesni_cbc_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
639 * uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
640 * uint32_t nrounds@r9)
641 *
642 * Decrypt a contiguous sequence of 8-block units with AES-CBC.
643 *
644 * nbytes must be a positive integral multiple of 128.
645 *
646 * Standard ABI calling convention.
647 */
648 ENTRY(aesni_cbc_dec8)
649 push %rbp /* create stack frame uint128[1] */
650 mov %rsp,%rbp
651 sub $0x10,%rsp
652 movdqu (%r8),%xmm8 /* xmm8 := iv */
653 movdqa %xmm8,(%rsp) /* save iv */
654 mov %rcx,%r10 /* r10 := nbytes */
655 movdqu -0x10(%rsi,%r10),%xmm7 /* xmm7 := ciphertext block[n-1] */
656 movdqu %xmm7,(%r8) /* update iv */
657 jmp 2f
658 _ALIGN_TEXT
659 1: movdqu -0x10(%rsi,%r10),%xmm7 /* xmm7 := cv[0] */
660 pxor %xmm7,%xmm0 /* xmm0 := ptxt[0] */
661 movdqu %xmm0,(%rdx,%r10) /* store plaintext block */
662 2: movdqu -0x20(%rsi,%r10),%xmm6 /* xmm6 := ciphertext block[n-2] */
663 movdqu -0x30(%rsi,%r10),%xmm5 /* xmm5 := ciphertext block[n-3] */
664 movdqu -0x40(%rsi,%r10),%xmm4 /* xmm4 := ciphertext block[n-4] */
665 movdqu -0x50(%rsi,%r10),%xmm3 /* xmm3 := ciphertext block[n-5] */
666 movdqu -0x60(%rsi,%r10),%xmm2 /* xmm2 := ciphertext block[n-6] */
667 movdqu -0x70(%rsi,%r10),%xmm1 /* xmm1 := ciphertext block[n-7] */
668 movdqu -0x80(%rsi,%r10),%xmm0 /* xmm0 := ciphertext block[n-8] */
669 movdqa %xmm6,%xmm15 /* xmm[8+i] := cv[i], 0<i<8 */
670 movdqa %xmm5,%xmm14
671 movdqa %xmm4,%xmm13
672 movdqa %xmm3,%xmm12
673 movdqa %xmm2,%xmm11
674 movdqa %xmm1,%xmm10
675 movdqa %xmm0,%xmm9
676 mov %r9d,%ecx /* ecx := nrounds */
677 call aesni_dec8 /* xmm[i] := cv[i] ^ ptxt[i], 0<=i<8 */
678 pxor %xmm15,%xmm7 /* xmm[i] := ptxt[i], 0<i<8 */
679 pxor %xmm14,%xmm6
680 pxor %xmm13,%xmm5
681 pxor %xmm12,%xmm4
682 pxor %xmm11,%xmm3
683 pxor %xmm10,%xmm2
684 pxor %xmm9,%xmm1
685 movdqu %xmm7,-0x10(%rdx,%r10) /* store plaintext blocks */
686 movdqu %xmm6,-0x20(%rdx,%r10)
687 movdqu %xmm5,-0x30(%rdx,%r10)
688 movdqu %xmm4,-0x40(%rdx,%r10)
689 movdqu %xmm3,-0x50(%rdx,%r10)
690 movdqu %xmm2,-0x60(%rdx,%r10)
691 movdqu %xmm1,-0x70(%rdx,%r10)
692 sub $0x80,%r10
693 jnz 1b /* repeat if more blocks */
694 pxor (%rsp),%xmm0 /* xmm0 := ptxt[0] */
695 movdqu %xmm0,(%rdx) /* store first plaintext block */
696 leave
697 ret
698 END(aesni_cbc_dec8)
699
700 /*
701 * aesni_xts_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
702 * uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
703 * uint32_t nrounds@r9d)
704 *
705 * Encrypt a contiguous sequence of blocks with AES-XTS.
706 *
707 * nbytes must be a positive integral multiple of 16. This routine
708 * is not vectorized; use aesni_xts_enc8 for >=8 blocks at once.
709 *
710 * Standard ABI calling convention.
711 */
712 ENTRY(aesni_xts_enc1)
713 mov %rcx,%r10 /* r10 := nbytes */
714 movdqu (%r8),%xmm15 /* xmm15 := tweak */
715 _ALIGN_TEXT
716 1: movdqu (%rsi),%xmm0 /* xmm0 := ptxt */
717 lea 0x10(%rsi),%rsi /* advance rsi to next block */
718 pxor %xmm15,%xmm0 /* xmm0 := ptxt ^ tweak */
719 mov %r9d,%ecx /* ecx := nrounds */
720 call aesni_enc1 /* xmm0 := AES(ptxt ^ tweak) */
721 pxor %xmm15,%xmm0 /* xmm0 := AES(ptxt ^ tweak) ^ tweak */
722 movdqu %xmm0,(%rdx) /* store ciphertext block */
723 lea 0x10(%rdx),%rdx /* advance rdx to next block */
724 call aesni_xts_mulx /* xmm15 *= x; trash xmm0 */
725 sub $0x10,%r10
726 jnz 1b /* repeat if more blocks */
727 movdqu %xmm15,(%r8) /* update tweak */
728 ret
729 END(aesni_xts_enc1)
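
/*
 * Illustrative C sketch of the loop above (not part of the build;
 * aes_enc_block() and xts_mulx() are hypothetical stand-ins for
 * aesni_enc1 and aesni_xts_mulx).  Each block is masked with the
 * current tweak before and after the block cipher, and the tweak is
 * multiplied by x in GF(2^128) between blocks:
 *
 *	static void
 *	xts_enc(const struct aesenc *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t tweak[16], unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 0; i < 16; i++)
 *				out[i] = in[i] ^ tweak[i];
 *			aes_enc_block(key, out, out, nrounds);
 *			for (i = 0; i < 16; i++)
 *				out[i] ^= tweak[i];
 *			xts_mulx(tweak);
 *		}
 *	}
 *
 * Unlike CBC encryption, the blocks are independent once their tweaks
 * are known, which is what the eight-block variant below exploits.
 */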
730
731 /*
732 * aesni_xts_enc8(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
733 * uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
734 * uint32_t nrounds@r9d)
735 *
736 * Encrypt a contiguous sequence of blocks with AES-XTS.
737 *
738 * nbytes must be a positive integral multiple of 128.
739 *
740 * Standard ABI calling convention.
741 */
742 ENTRY(aesni_xts_enc8)
743 push %rbp /* create stack frame uint128[1] */
744 mov %rsp,%rbp
745 sub $0x10,%rsp
746 mov %rcx,%r10 /* r10 := nbytes */
747 movdqu (%r8),%xmm15 /* xmm15 := tweak[0] */
748 _ALIGN_TEXT
749 1: movdqa %xmm15,%xmm8 /* xmm8 := tweak[0] */
750 call aesni_xts_mulx /* xmm15 := tweak[1] */
751 movdqa %xmm15,%xmm9 /* xmm9 := tweak[1] */
752 call aesni_xts_mulx /* xmm15 := tweak[2] */
753 movdqa %xmm15,%xmm10 /* xmm10 := tweak[2] */
754 call aesni_xts_mulx /* xmm15 := tweak[3] */
755 movdqa %xmm15,%xmm11 /* xmm11 := tweak[3] */
756 call aesni_xts_mulx /* xmm15 := tweak[4] */
757 movdqa %xmm15,%xmm12 /* xmm12 := tweak[4] */
758 call aesni_xts_mulx /* xmm15 := tweak[5] */
759 movdqa %xmm15,%xmm13 /* xmm13 := tweak[5] */
760 call aesni_xts_mulx /* xmm15 := tweak[6] */
761 movdqa %xmm15,%xmm14 /* xmm14 := tweak[6] */
762 call aesni_xts_mulx /* xmm15 := tweak[7] */
763 movdqu (%rsi),%xmm0 /* xmm[i] := ptxt[i] */
764 movdqu 0x10(%rsi),%xmm1
765 movdqu 0x20(%rsi),%xmm2
766 movdqu 0x30(%rsi),%xmm3
767 movdqu 0x40(%rsi),%xmm4
768 movdqu 0x50(%rsi),%xmm5
769 movdqu 0x60(%rsi),%xmm6
770 movdqu 0x70(%rsi),%xmm7
771 lea 0x80(%rsi),%rsi /* advance rsi to next block group */
772 movdqa %xmm8,(%rsp) /* save tweak[0] */
773 pxor %xmm8,%xmm0 /* xmm[i] := ptxt[i] ^ tweak[i] */
774 pxor %xmm9,%xmm1
775 pxor %xmm10,%xmm2
776 pxor %xmm11,%xmm3
777 pxor %xmm12,%xmm4
778 pxor %xmm13,%xmm5
779 pxor %xmm14,%xmm6
780 pxor %xmm15,%xmm7
781 mov %r9d,%ecx /* ecx := nrounds */
782 call aesni_enc8 /* xmm[i] := AES(ptxt[i] ^ tweak[i]) */
783 pxor (%rsp),%xmm0 /* xmm[i] := AES(...) ^ tweak[i] */
784 pxor %xmm9,%xmm1
785 pxor %xmm10,%xmm2
786 pxor %xmm11,%xmm3
787 pxor %xmm12,%xmm4
788 pxor %xmm13,%xmm5
789 pxor %xmm14,%xmm6
790 pxor %xmm15,%xmm7
791 movdqu %xmm0,(%rdx) /* store ciphertext blocks */
792 movdqu %xmm1,0x10(%rdx)
793 movdqu %xmm2,0x20(%rdx)
794 movdqu %xmm3,0x30(%rdx)
795 movdqu %xmm4,0x40(%rdx)
796 movdqu %xmm5,0x50(%rdx)
797 movdqu %xmm6,0x60(%rdx)
798 movdqu %xmm7,0x70(%rdx)
799 lea 0x80(%rdx),%rdx /* advance rdx to next block group */
800 call aesni_xts_mulx /* xmm15 := tweak[8] */
801 sub $0x80,%r10
802 jnz 1b /* repeat if more block groups */
803 movdqu %xmm15,(%r8) /* update tweak */
804 leave
805 ret
806 END(aesni_xts_enc8)
807
808 /*
809 * aesni_xts_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
810 * uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
811 * uint32_t nrounds@r9d)
812 *
813 * Decrypt a contiguous sequence of blocks with AES-XTS.
814 *
815 * nbytes must be a positive integral multiple of 16. This routine
816 * is not vectorized; use aesni_xts_dec8 for >=8 blocks at once.
817 *
818 * Standard ABI calling convention.
819 */
820 ENTRY(aesni_xts_dec1)
821 mov %rcx,%r10 /* r10 := nbytes */
822 movdqu (%r8),%xmm15 /* xmm15 := tweak */
823 _ALIGN_TEXT
824 1: movdqu (%rsi),%xmm0 /* xmm0 := ctxt */
825 lea 0x10(%rsi),%rsi /* advance rsi to next block */
826 pxor %xmm15,%xmm0 /* xmm0 := ctxt ^ tweak */
827 mov %r9d,%ecx /* ecx := nrounds */
828 call aesni_dec1 /* xmm0 := AES^-1(ctxt ^ tweak) */
829 pxor %xmm15,%xmm0 /* xmm0 := AES^-1(ctxt ^ tweak) ^ tweak */
830 movdqu %xmm0,(%rdx) /* store plaintext block */
831 lea 0x10(%rdx),%rdx /* advance rdx to next block */
832 call aesni_xts_mulx /* xmm15 *= x; trash xmm0 */
833 sub $0x10,%r10
834 jnz 1b /* repeat if more blocks */
835 movdqu %xmm15,(%r8) /* update tweak */
836 ret
837 END(aesni_xts_dec1)
838
839 /*
840 * aesni_xts_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
841 * uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
842 * uint32_t nrounds@r9d)
843 *
844 * Decrypt a contiguous sequence of blocks with AES-XTS.
845 *
846 * nbytes must be a positive integral multiple of 128.
847 *
848 * Standard ABI calling convention.
849 */
850 ENTRY(aesni_xts_dec8)
851 push %rbp /* create stack frame uint128[1] */
852 mov %rsp,%rbp
853 sub $0x10,%rsp
854 mov %rcx,%r10 /* r10 := nbytes */
855 movdqu (%r8),%xmm15 /* xmm15 := tweak[0] */
856 _ALIGN_TEXT
857 1: movdqa %xmm15,%xmm8 /* xmm8 := tweak[0] */
858 call aesni_xts_mulx /* xmm15 := tweak[1] */
859 movdqa %xmm15,%xmm9 /* xmm9 := tweak[1] */
860 call aesni_xts_mulx /* xmm15 := tweak[2] */
861 movdqa %xmm15,%xmm10 /* xmm10 := tweak[2] */
862 call aesni_xts_mulx /* xmm15 := tweak[3] */
863 movdqa %xmm15,%xmm11 /* xmm11 := tweak[3] */
864 call aesni_xts_mulx /* xmm15 := tweak[4] */
865 movdqa %xmm15,%xmm12 /* xmm12 := tweak[4] */
866 call aesni_xts_mulx /* xmm15 := tweak[5] */
867 movdqa %xmm15,%xmm13 /* xmm13 := tweak[5] */
868 call aesni_xts_mulx /* xmm15 := tweak[6] */
869 movdqa %xmm15,%xmm14 /* xmm14 := tweak[6] */
870 call aesni_xts_mulx /* xmm15 := tweak[7] */
871 movdqu (%rsi),%xmm0 /* xmm[i] := ctxt[i] */
872 movdqu 0x10(%rsi),%xmm1
873 movdqu 0x20(%rsi),%xmm2
874 movdqu 0x30(%rsi),%xmm3
875 movdqu 0x40(%rsi),%xmm4
876 movdqu 0x50(%rsi),%xmm5
877 movdqu 0x60(%rsi),%xmm6
878 movdqu 0x70(%rsi),%xmm7
879 lea 0x80(%rsi),%rsi /* advance rsi to next block group */
880 movdqa %xmm8,(%rsp) /* save tweak[0] */
881 pxor %xmm8,%xmm0 /* xmm[i] := ctxt[i] ^ tweak[i] */
882 pxor %xmm9,%xmm1
883 pxor %xmm10,%xmm2
884 pxor %xmm11,%xmm3
885 pxor %xmm12,%xmm4
886 pxor %xmm13,%xmm5
887 pxor %xmm14,%xmm6
888 pxor %xmm15,%xmm7
889 mov %r9d,%ecx /* ecx := nrounds */
890 call aesni_dec8 /* xmm[i] := AES^-1(ctxt[i] ^ tweak[i]) */
891 pxor (%rsp),%xmm0 /* xmm[i] := AES^-1(...) ^ tweak[i] */
892 pxor %xmm9,%xmm1
893 pxor %xmm10,%xmm2
894 pxor %xmm11,%xmm3
895 pxor %xmm12,%xmm4
896 pxor %xmm13,%xmm5
897 pxor %xmm14,%xmm6
898 pxor %xmm15,%xmm7
899 movdqu %xmm0,(%rdx) /* store plaintext blocks */
900 movdqu %xmm1,0x10(%rdx)
901 movdqu %xmm2,0x20(%rdx)
902 movdqu %xmm3,0x30(%rdx)
903 movdqu %xmm4,0x40(%rdx)
904 movdqu %xmm5,0x50(%rdx)
905 movdqu %xmm6,0x60(%rdx)
906 movdqu %xmm7,0x70(%rdx)
907 lea 0x80(%rdx),%rdx /* advance rdx to next block group */
908 call aesni_xts_mulx /* xmm15 := tweak[8] */
909 sub $0x80,%r10
910 jnz 1b /* repeat if more block groups */
911 movdqu %xmm15,(%r8) /* update tweak */
912 leave
913 ret
914 END(aesni_xts_dec8)
915
916 /*
917 * aesni_xts_mulx(tweak@xmm15)
918 *
919 * Multiply xmm15 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
920 * Uses %xmm0 as temporary.
921 */
922 .text
923 _ALIGN_TEXT
924 .type aesni_xts_mulx,@function
925 aesni_xts_mulx:
926 /*
927 * Simultaneously determine
928 * (a) whether the high bit of the low quadword must be
929 * shifted into the low bit of the high quadword, and
930 * (b) whether the high bit of the high quadword must be
931 * carried into x^128 = x^7 + x^2 + x + 1.
932 */
933 pxor %xmm0,%xmm0 /* xmm0 := 0 */
934 pcmpgtq %xmm15,%xmm0 /* xmm0[i] := -1 if 0 > xmm15[i] else 0 */
935 pshufd $0b01001110,%xmm0,%xmm0 /* swap halves of xmm0 */
936 pand xtscarry(%rip),%xmm0 /* copy xtscarry according to mask */
937 psllq $1,%xmm15 /* shift */
938 pxor %xmm0,%xmm15 /* incorporate (a) and (b) */
939 ret
940 END(aesni_xts_mulx)
941
942 .section .rodata
943 .p2align 4
944 .type xtscarry,@object
945 xtscarry:
946 .byte 0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
947 END(xtscarry)
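
/*
 * Illustrative C sketch of the tweak update (not part of the build;
 * types from <stdint.h>, little-endian host as on x86).  Shift the
 * 128-bit tweak left by one bit; the bit shifted out of the low
 * quadword feeds bit 0 of the high quadword, and if the bit shifted
 * out of the high quadword was set, reduce by x^128 = x^7 + x^2 + x
 * + 1, i.e. XOR 0x87 into the low byte -- exactly the two lanes of
 * the xtscarry constant:
 *
 *	static void
 *	xts_mulx(uint8_t tweak[16])
 *	{
 *		uint64_t lo, hi, clo, chi;
 *
 *		memcpy(&lo, tweak, 8);
 *		memcpy(&hi, tweak + 8, 8);
 *		clo = lo >> 63;
 *		chi = hi >> 63;
 *		lo = (lo << 1) ^ (chi ? 0x87 : 0);
 *		hi = (hi << 1) ^ clo;
 *		memcpy(tweak, &lo, 8);
 *		memcpy(tweak + 8, &hi, 8);
 *	}
 */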
948
949 /*
950 * aesni_xts_update(const uint8_t in[16] @rdi, uint8_t out[16] @rsi)
951 *
952 * Update an AES-XTS tweak.
953 *
954 * Standard ABI calling convention.
955 */
956 ENTRY(aesni_xts_update)
957 movdqu (%rdi),%xmm15
958 call aesni_xts_mulx
959 movdqu %xmm15,(%rsi)
960 ret
961 END(aesni_xts_update)
962
963 /*
964 * aesni_cbcmac_update1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
965 * size_t nbytes@rdx, uint8_t auth[16] @rcx, uint32_t nrounds@r8d)
966 *
967 * Update CBC-MAC.
968 *
969 * nbytes must be a positive integral multiple of 16.
970 *
971 * Standard ABI calling convention.
972 */
973 ENTRY(aesni_cbcmac_update1)
974 movdqu (%rcx),%xmm0 /* xmm0 := auth */
975 mov %rdx,%r10 /* r10 := nbytes */
976 mov %rcx,%rdx /* rdx := &auth */
977 _ALIGN_TEXT
978 1: pxor (%rsi),%xmm0 /* xmm0 ^= plaintext block */
979 lea 0x10(%rsi),%rsi
980 mov %r8d,%ecx /* ecx := nrounds */
981 call aesni_enc1 /* xmm0 := auth'; trash rax,rcx,xmm8 */
982 sub $0x10,%r10
983 jnz 1b
984 movdqu %xmm0,(%rdx) /* store auth' */
985 ret
986 END(aesni_cbcmac_update1)
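
/*
 * Illustrative C sketch of the update above (not part of the build;
 * aes_enc_block() is a hypothetical stand-in for aesni_enc1).  CBC-MAC
 * is CBC encryption that keeps only the final block:
 *
 *	static void
 *	cbcmac_update(const struct aesenc *key, const uint8_t *in,
 *	    size_t nbytes, uint8_t auth[16], unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		for (; nbytes; nbytes -= 16, in += 16) {
 *			for (i = 0; i < 16; i++)
 *				auth[i] ^= in[i];
 *			aes_enc_block(key, auth, auth, nrounds);
 *		}
 *	}
 */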
987
988 /*
989 * aesni_ccm_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
990 * uint8_t *out@rdx, size_t nbytes@rcx,
991 * uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
992 *
993 * Update CCM encryption.
994 *
995 * nbytes must be a positive integral multiple of 16.
996 *
997 * Standard ABI calling convention.
998 */
999 ENTRY(aesni_ccm_enc1)
1000 mov %rcx,%r10 /* r10 := nbytes */
1001 movdqu 0x10(%r8),%xmm2 /* xmm2 := ctr (be) */
1002 movdqa bswap32(%rip),%xmm4 /* xmm4 := bswap32 table */
1003 movdqa ctr32_inc(%rip),%xmm5 /* xmm5 := (0,0,0,1) (le) */
1004 movdqu (%r8),%xmm0 /* xmm0 := auth */
1005 pshufb %xmm4,%xmm2 /* xmm2 := ctr (le) */
1006 _ALIGN_TEXT
1007 1: movdqu (%rsi),%xmm3 /* xmm3 := plaintext block */
1008 paddd %xmm5,%xmm2 /* increment ctr (32-bit) */
1009 lea 0x10(%rsi),%rsi
1010 movdqa %xmm2,%xmm1 /* xmm1 := ctr (le) */
1011 mov %r9d,%ecx /* ecx := nrounds */
1012 pshufb %xmm4,%xmm1 /* xmm1 := ctr (be) */
1013 pxor %xmm3,%xmm0 /* xmm0 := auth ^ ptxt */
1014 call aesni_enc2 /* trash rax/rcx/xmm8 */
1015 pxor %xmm1,%xmm3 /* xmm3 := ciphertext block */
1016 sub $0x10,%r10 /* count down bytes */
1017 movdqu %xmm3,(%rdx) /* store ciphertext block */
1018 lea 0x10(%rdx),%rdx
1019 jnz 1b /* repeat if more blocks */
1020 pshufb %xmm4,%xmm2 /* xmm2 := ctr (be) */
1021 movdqu %xmm0,(%r8) /* store updated auth */
1022 movdqu %xmm2,0x10(%r8) /* store updated ctr */
1023 ret
1024 END(aesni_ccm_enc1)
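
/*
 * Illustrative C sketch of the CCM step above (not part of the build;
 * aes_enc_block() and ccm_inc32() are hypothetical helpers -- the
 * latter increments the low 32 bits of the big-endian counter block).
 * authctr[0..15] holds the CBC-MAC state and authctr[16..31] the CTR
 * block; each plaintext block is folded into the MAC and then masked
 * with the next counter pad:
 *
 *	static void
 *	ccm_enc(const struct aesenc *key, const uint8_t *in, uint8_t *out,
 *	    size_t nbytes, uint8_t authctr[32], unsigned nrounds)
 *	{
 *		uint8_t pad[16];
 *		unsigned i;
 *
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 0; i < 16; i++)
 *				authctr[i] ^= in[i];
 *			aes_enc_block(key, authctr, authctr, nrounds);
 *			ccm_inc32(authctr + 16);
 *			aes_enc_block(key, authctr + 16, pad, nrounds);
 *			for (i = 0; i < 16; i++)
 *				out[i] = in[i] ^ pad[i];
 *		}
 *	}
 *
 * The assembly performs the two AES invocations of each iteration with
 * a single call to aesni_enc2 so they pipeline in parallel, and keeps
 * the counter byte-swapped in a register so the increment is one PADDD.
 */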
1025
1026 /*
1027 * aesni_ccm_dec1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
1028 * uint8_t *out@rdx, size_t nbytes@rcx,
1029 * uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
1030 *
1031 * Update CCM decryption.
1032 *
1033 * nbytes must be a positive integral multiple of 16.
1034 *
1035 * Standard ABI calling convention.
1036 */
1037 ENTRY(aesni_ccm_dec1)
1038 movdqu 0x10(%r8),%xmm2 /* xmm2 := ctr (be) */
1039 movdqa bswap32(%rip),%xmm4 /* xmm4 := bswap32 table */
1040 movdqa ctr32_inc(%rip),%xmm5 /* xmm5 := (0,0,0,1) (le) */
1041 movdqu (%r8),%xmm1 /* xmm1 := auth */
1042 pshufb %xmm4,%xmm2 /* xmm2 := ctr (le) */
1043 mov %rcx,%r10 /* r10 := nbytes */
1044
1045 /* Decrypt the first block. */
1046 paddd %xmm5,%xmm2 /* increment ctr (32-bit) */
1047 mov %r9d,%ecx /* ecx := nrounds */
1048 movdqa %xmm2,%xmm0 /* xmm0 := ctr (le) */
1049 movdqu (%rsi),%xmm3 /* xmm3 := ctxt */
1050 pshufb %xmm4,%xmm0 /* xmm0 := ctr (be) */
1051 lea 0x10(%rsi),%rsi
1052 call aesni_enc1 /* xmm0 := pad; trash rax/rcx/xmm8 */
1053 jmp 2f
1054
1055 _ALIGN_TEXT
1056 1: /*
1057 * Authenticate the last block and decrypt the next block
1058 * simultaneously.
1059 *
1060 * xmm1 = auth ^ ptxt[-1]
1061 * xmm2 = ctr[-1] (le)
1062 */
1063 paddd %xmm5,%xmm2 /* increment ctr (32-bit) */
1064 mov %r9d,%ecx /* ecx := nrounds */
1065 movdqa %xmm2,%xmm0 /* xmm0 := ctr (le) */
1066 movdqu (%rsi),%xmm3 /* xmm3 := ctxt */
1067 pshufb %xmm4,%xmm0 /* xmm0 := ctr (be) */
1068 lea 0x10(%rsi),%rsi
1069 call aesni_enc2 /* xmm0 := pad, xmm1 := auth';
1070 * trash rax/rcx/xmm8 */
1071 2: pxor %xmm0,%xmm3 /* xmm3 := ptxt */
1072 sub $0x10,%r10
1073 movdqu %xmm3,(%rdx) /* store plaintext */
1074 lea 0x10(%rdx),%rdx
1075 pxor %xmm3,%xmm1 /* xmm1 := auth ^ ptxt */
1076 jnz 1b
1077
1078 /* Authenticate the last block. */
1079 movdqa %xmm1,%xmm0 /* xmm0 := auth ^ ptxt */
1080 mov %r9d,%ecx /* ecx := nrounds */
1081 call aesni_enc1 /* xmm0 := auth' */
1082 pshufb %xmm4,%xmm2 /* xmm2 := ctr (be) */
1083 movdqu %xmm0,(%r8) /* store updated auth */
1084 movdqu %xmm2,0x10(%r8) /* store updated ctr */
1085 ret
1086 END(aesni_ccm_dec1)
1087
1088 .section .rodata
1089 .p2align 4
1090 .type bswap32,@object
1091 bswap32:
1092 .byte 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
1093 END(bswap32)
1094
1095 .section .rodata
1096 .p2align 4
1097 .type ctr32_inc,@object
1098 ctr32_inc:
1099 .byte 0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0
1100 END(ctr32_inc)
1101
1102 /*
1103 * aesni_enc1(const struct aesenc *enckey@rdi, uint128_t block@xmm0,
1104 * uint32_t nrounds@ecx)
1105 *
1106 * Encrypt a single AES block in %xmm0.
1107 *
1108 * Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
1109 */
1110 .text
1111 _ALIGN_TEXT
1112 .type aesni_enc1,@function
1113 aesni_enc1:
1114 pxor (%rdi),%xmm0 /* xor in first round key */
1115 shl $4,%ecx /* ecx := total byte size of round keys */
1116 lea 0x10(%rdi,%rcx),%rax /* rax := end of round key array */
1117 neg %rcx /* rcx := byte offset of round key from end */
1118 jmp 2f
1119 _ALIGN_TEXT
1120 1: aesenc %xmm8,%xmm0
1121 2: movdqa (%rax,%rcx),%xmm8 /* load round key */
1122 add $0x10,%rcx
1123 jnz 1b /* repeat if more rounds */
1124 aesenclast %xmm8,%xmm0
1125 ret
1126 END(aesni_enc1)
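
/*
 * Illustrative C sketch of the round loop above (not part of the
 * build; xor16(), aesenc(), and aesenclast() are hypothetical helpers
 * corresponding to PXOR, AESENC, and AESENCLAST, and rk[][16] is the
 * expanded key schedule of nrounds+1 round keys):
 *
 *	static void
 *	enc1(const uint8_t rk[][16], uint8_t block[16], unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		xor16(block, rk[0]);
 *		for (i = 1; i < nrounds; i++)
 *			aesenc(block, rk[i]);
 *		aesenclast(block, rk[nrounds]);
 *	}
 *
 * The assembly walks the schedule with a negative byte offset in %rcx
 * from one-past-the-end of the round-key array in %rax, so the loop
 * condition is simply whether the ADD brought the offset to zero; the
 * same pattern is used by all the enc/dec helpers below.
 */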
1127
1128 /*
1129 * aesni_enc2(const struct aesenc *enckey@rdi, uint128_t block0@xmm0,
1130 * uint128_t block1@xmm1, uint32_t nrounds@ecx)
1131 *
1132 * Encrypt two AES blocks in %xmm0 and %xmm1.
1133 *
1134 * Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
1135 */
1136 .text
1137 _ALIGN_TEXT
1138 .type aesni_enc2,@function
1139 aesni_enc2:
1140 movdqa (%rdi),%xmm8 /* xmm8 := first round key */
1141 shl $4,%ecx /* ecx := total byte size of round keys */
1142 lea 0x10(%rdi,%rcx),%rax /* rax := end of round key array */
1143 neg %rcx /* rcx := byte offset of round key from end */
1144 pxor %xmm8,%xmm0 /* xor in first round key */
1145 pxor %xmm8,%xmm1
1146 jmp 2f
1147 _ALIGN_TEXT
1148 1: aesenc %xmm8,%xmm0
1149 aesenc %xmm8,%xmm1
1150 2: movdqa (%rax,%rcx),%xmm8 /* load round key */
1151 add $0x10,%rcx
1152 jnz 1b /* repeat if there's more */
1153 aesenclast %xmm8,%xmm0
1154 aesenclast %xmm8,%xmm1
1155 ret
1156 END(aesni_enc2)
1157
1158 /*
1159 * aesni_enc8(const struct aesenc *enckey@rdi, uint128_t block0@xmm0, ...,
1160 * block7@xmm7, uint32_t nrounds@ecx)
1161 *
1162 * Encrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
1163 *
1164 * Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
1165 */
1166 .text
1167 _ALIGN_TEXT
1168 .type aesni_enc8,@function
1169 aesni_enc8:
1170 movdqa (%rdi),%xmm8 /* xor in first round key */
1171 pxor %xmm8,%xmm0
1172 pxor %xmm8,%xmm1
1173 pxor %xmm8,%xmm2
1174 pxor %xmm8,%xmm3
1175 pxor %xmm8,%xmm4
1176 pxor %xmm8,%xmm5
1177 pxor %xmm8,%xmm6
1178 pxor %xmm8,%xmm7
1179 shl $4,%ecx /* ecx := total byte size of round keys */
1180 lea 0x10(%rdi,%rcx),%rax /* rax := end of round key array */
1181 neg %rcx /* rcx := byte offset of round key from end */
1182 jmp 2f
1183 _ALIGN_TEXT
1184 1: aesenc %xmm8,%xmm0
1185 aesenc %xmm8,%xmm1
1186 aesenc %xmm8,%xmm2
1187 aesenc %xmm8,%xmm3
1188 aesenc %xmm8,%xmm4
1189 aesenc %xmm8,%xmm5
1190 aesenc %xmm8,%xmm6
1191 aesenc %xmm8,%xmm7
1192 2: movdqa (%rax,%rcx),%xmm8 /* load round key */
1193 add $0x10,%rcx
1194 jnz 1b /* repeat if more rounds */
1195 aesenclast %xmm8,%xmm0
1196 aesenclast %xmm8,%xmm1
1197 aesenclast %xmm8,%xmm2
1198 aesenclast %xmm8,%xmm3
1199 aesenclast %xmm8,%xmm4
1200 aesenclast %xmm8,%xmm5
1201 aesenclast %xmm8,%xmm6
1202 aesenclast %xmm8,%xmm7
1203 ret
1204 END(aesni_enc8)
1205
1206 /*
1207 * aesni_dec1(const struct aesdec *deckey@rdi, uint128_t block@xmm0,
1208 * uint32_t nrounds@ecx)
1209 *
1210 * Decrypt a single AES block in %xmm0.
1211 *
1212 * Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
1213 */
1214 .text
1215 _ALIGN_TEXT
1216 .type aesni_dec1,@function
1217 aesni_dec1:
1218 pxor (%rdi),%xmm0 /* xor in first round key */
1219 shl $4,%ecx /* ecx := total byte size of round keys */
1220 lea 0x10(%rdi,%rcx),%rax /* rax := end of round key array */
1221 neg %rcx /* rcx := byte offset of round key from end */
1222 jmp 2f
1223 _ALIGN_TEXT
1224 1: aesdec %xmm8,%xmm0
1225 2: movdqa (%rax,%rcx),%xmm8 /* load round key */
1226 add $0x10,%rcx
1227 jnz 1b /* repeat if more rounds */
1228 aesdeclast %xmm8,%xmm0
1229 ret
1230 END(aesni_dec1)
1231
1232 /*
1233 * aesni_dec8(const struct aesdec *deckey@rdi, uint128_t block0@xmm0, ...,
1234 * block7@xmm7, uint32_t nrounds@ecx)
1235 *
1236 * Decrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
1237 *
1238 * Internal ABI. Uses %rax and %xmm8 as temporaries. Destroys %ecx.
1239 */
1240 .text
1241 _ALIGN_TEXT
1242 .type aesni_dec8,@function
1243 aesni_dec8:
1244 movdqa (%rdi),%xmm8 /* xor in first round key */
1245 pxor %xmm8,%xmm0
1246 pxor %xmm8,%xmm1
1247 pxor %xmm8,%xmm2
1248 pxor %xmm8,%xmm3
1249 pxor %xmm8,%xmm4
1250 pxor %xmm8,%xmm5
1251 pxor %xmm8,%xmm6
1252 pxor %xmm8,%xmm7
1253 shl $4,%ecx /* ecx := total byte size of round keys */
1254 lea 0x10(%rdi,%rcx),%rax /* rax := end of round key array */
1255 neg %rcx /* rcx := byte offset of round key from end */
1256 jmp 2f
1257 _ALIGN_TEXT
1258 1: aesdec %xmm8,%xmm0
1259 aesdec %xmm8,%xmm1
1260 aesdec %xmm8,%xmm2
1261 aesdec %xmm8,%xmm3
1262 aesdec %xmm8,%xmm4
1263 aesdec %xmm8,%xmm5
1264 aesdec %xmm8,%xmm6
1265 aesdec %xmm8,%xmm7
1266 2: movdqa (%rax,%rcx),%xmm8 /* load round key */
1267 add $0x10,%rcx
1268 jnz 1b /* repeat if more rounds */
1269 aesdeclast %xmm8,%xmm0
1270 aesdeclast %xmm8,%xmm1
1271 aesdeclast %xmm8,%xmm2
1272 aesdeclast %xmm8,%xmm3
1273 aesdeclast %xmm8,%xmm4
1274 aesdeclast %xmm8,%xmm5
1275 aesdeclast %xmm8,%xmm6
1276 aesdeclast %xmm8,%xmm7
1277 ret
1278 END(aesni_dec8)
1279