/*	$NetBSD: aes_neon_32.S,v 1.3 2020/07/27 20:53:22 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <arm/asm.h>

	.fpu	neon

	.text
	.p2align 2
.Lconstants_addr:
	.long	.Lconstants - .
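
/*
 * The ldr/adr pairs in aes_neon_enc1 and aes_neon_dec1 below recover
 * the address of .Lconstants position-independently:
 *
 *	r12 = *&.Lconstants_addr  (= .Lconstants - .Lconstants_addr)
 *	r11 = &.Lconstants_addr   (via adr)
 *	r12 += r11                (= &.Lconstants)
 *
 * so no absolute address is embedded in the text.
 */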

	.section .rodata
	.p2align 4
.Lconstants:

	.type	inv,_ASM_TYPE_OBJECT
inv:
	.byte	0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
	.byte	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
END(inv)

	.type	inva,_ASM_TYPE_OBJECT
inva:
	.byte	0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
	.byte	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
END(inva)
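
/*
 * inv and inva are, in the vector-permute AES ("vpaes") construction
 * this file follows, 16-entry GF(2^4) inversion tables used to build
 * the S-box out of vtbl lookups (see SubBytes below).  Entry 0 is
 * 0x80 so that when a looked-up value is itself reused as a vtbl
 * index later, the out-of-range index yields zero.
 */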

	.type	mc_forward,_ASM_TYPE_OBJECT
mc_forward:
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 */
	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C

	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 */
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00

	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 */
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04

.Lmc_forward_3:
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 */
	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
END(mc_forward)

	.type	mc_backward,_ASM_TYPE_OBJECT
mc_backward:
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 */
	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E

	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 */
	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A

	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 */
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06

	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 */
	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
END(mc_backward)
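
/*
 * The mc_forward[i]/mc_backward[i] permutations rotate the bytes of
 * each 32-bit column one position forward/backward, with entry i
 * additionally rotated by 4*i bytes; the rounds index them by round
 * number mod 4 so that ShiftRows can be deferred to one final sr[]
 * lookup.
 */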

	.type	sr,_ASM_TYPE_OBJECT
sr:
	.byte	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07	/* 0 */
	.byte	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F

	.byte	0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03	/* 1 */
	.byte	0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B

	.byte	0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F	/* 2 */
	.byte	0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07

	.byte	0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B	/* 3 */
	.byte	0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
END(sr)
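
/*
 * sr[i] combines ShiftRows with the whole-register rotation left
 * over from the mc_forward/mc_backward schedule, so the epilogue's
 * single vtbl through the appropriate sr[] entry finishes both at
 * once.
 */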

	.type	iptlo,_ASM_TYPE_OBJECT
iptlo:
	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
	.byte	0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
END(iptlo)

	.type	ipthi,_ASM_TYPE_OBJECT
ipthi:
	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
	.byte	0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
END(ipthi)

	.type	sb1_0,_ASM_TYPE_OBJECT
sb1_0:
	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
	.byte	0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
END(sb1_0)

	.type	sb1_1,_ASM_TYPE_OBJECT
sb1_1:
	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
	.byte	0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
END(sb1_1)

	.type	sb2_0,_ASM_TYPE_OBJECT
sb2_0:
	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
	.byte	0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
END(sb2_0)

	.type	sb2_1,_ASM_TYPE_OBJECT
sb2_1:
	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
	.byte	0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
END(sb2_1)

	.type	sbo_0,_ASM_TYPE_OBJECT
sbo_0:
	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
	.byte	0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
END(sbo_0)

	.type	sbo_1,_ASM_TYPE_OBJECT
sbo_1:
	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
	.byte	0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
END(sbo_1)

	.type	diptlo,_ASM_TYPE_OBJECT
diptlo:
	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
	.byte	0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
END(diptlo)

	.type	dipthi,_ASM_TYPE_OBJECT
dipthi:
	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
	.byte	0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
END(dipthi)

	.type	dsb9_0,_ASM_TYPE_OBJECT
dsb9_0:
	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
	.byte	0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
END(dsb9_0)

	.type	dsb9_1,_ASM_TYPE_OBJECT
dsb9_1:
	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
	.byte	0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
END(dsb9_1)

	.type	dsbd_0,_ASM_TYPE_OBJECT
dsbd_0:
	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D
	.byte	0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
END(dsbd_0)

	.type	dsbd_1,_ASM_TYPE_OBJECT
dsbd_1:
	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C
	.byte	0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
END(dsbd_1)

	.type	dsbb_0,_ASM_TYPE_OBJECT
dsbb_0:
	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0
	.byte	0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
END(dsbb_0)

	.type	dsbb_1,_ASM_TYPE_OBJECT
dsbb_1:
	.byte	0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1
	.byte	0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
END(dsbb_1)

	.type	dsbe_0,_ASM_TYPE_OBJECT
dsbe_0:
	.byte	0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46
	.byte	0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
END(dsbe_0)

	.type	dsbe_1,_ASM_TYPE_OBJECT
dsbe_1:
	.byte	0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C
	.byte	0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
END(dsbe_1)

	.type	dsbo_0,_ASM_TYPE_OBJECT
dsbo_0:
	.byte	0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13
	.byte	0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
END(dsbo_0)

	.type	dsbo_1,_ASM_TYPE_OBJECT
dsbo_1:
	.byte	0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12
	.byte	0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
END(dsbo_1)

/*
 * aes_neon_enc1(enc, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 *	uint8x16_t@q0
 *	aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
 *	    unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (here spelled `#ifdef _KERNEL'):
 *
 *	uint8x16_t@(r0,r1,r2,r3)
 *	aes_neon_enc1(const struct aesenc *enc@r0,
 *	    uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
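
/*
 * Illustrative caller sketch; the C prototype is an assumption about
 * how callers declare this routine, not something defined here.
 * With -mfloat-abi=hard the compiler passes and returns the vector
 * in q0 directly:
 *
 *	uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t,
 *	    unsigned);
 *
 *	uint8x16_t
 *	encrypt1(const struct aesenc *enc, uint8x16_t ptext,
 *	    unsigned nrounds)
 *	{
 *		return aes_neon_enc1(enc, ptext, nrounds);
 *	}
 *
 * Under the soft-float kernel ABI the same call instead passes x in
 * r2/r3 and sp[0]/sp[4], which the _KERNEL prologue below marshals
 * into q0 before the common body runs.
 */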
ENTRY(aes_neon_enc1)
#ifdef _KERNEL
	vmov	d0, r2, r3		/* d0 := x lo */
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r7, r8, r10, r11, lr}
	vpush	{d8-d15}

	/*
	 * r3: rmod4
	 * r4: mc_forward
	 * r5: mc_backward
	 * r6,r7,r8,r10,r11,r12: temporaries
	 * q0={d0-d1}: x/ak/A
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: iptlo
	 * q5={d10-d11}: ipthi
	 * q6={d12-d13}: sb1[0]/sbo[0]
	 * q7={d14-d15}: sb1[1]/sbo[1]
	 * q8={d16-d17}: sb2[0]
	 * q9={d18-d19}: sb2[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_forward[rmod4]
	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_backward[rmod4]
	 * q14={d28-d29}: rk/A2/A2_B_D
	 * q15={d30-d31}: A2_B/sr[rmod4]
	 */

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr	r12, .Lconstants_addr
	adr	r11, .Lconstants_addr

	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
	movw	r3, #0
	vmov.i8	q1, #0x0f

	/* r12 := .Lconstants */
	add	r12, r12, r11

	/* (q4, q5) := (iptlo, ipthi) */
	add	r6, r12, #(iptlo - .Lconstants)
	add	r7, r12, #(ipthi - .Lconstants)
	vld1.64	{d8-d9}, [r6 :128]
	vld1.64	{d10-d11}, [r7 :128]

	/* load the rest of the constants */
	add	r4, r12, #(sb1_0 - .Lconstants)
	add	r5, r12, #(sb1_1 - .Lconstants)
	add	r6, r12, #(sb2_0 - .Lconstants)
	add	r7, r12, #(sb2_1 - .Lconstants)
	add	r8, r12, #(inv - .Lconstants)
	add	r10, r12, #(inva - .Lconstants)
	vld1.64	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
	vld1.64	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
	vld1.64	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
	vld1.64	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
	vld1.64	{d20-d21}, [r8 :128]	/* q10 = inv */
	vld1.64	{d22-d23}, [r10 :128]	/* q11 = inva */

	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
	add	r4, r12, #(mc_forward - .Lconstants)
	add	r5, r12, #(mc_backward - .Lconstants)

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (iptlo(lo), ipthi(hi)) */
	vtbl.8	d4, {d8-d9}, d4
	vtbl.8	d5, {d8-d9}, d5
	vtbl.8	d6, {d10-d11}, d6
	vtbl.8	d7, {d10-d11}, d7

	/* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f
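
	/*
	 * Round loop.  The branch above enters at 2: so the first
	 * pass runs only SubBytes; each subsequent iteration of 1:
	 * folds in the next round key and MixColumns for the previous
	 * round's output, then recomputes SubBytes, until r1 reaches
	 * zero and control falls through to the final round below.
	 */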

	_ALIGN_TEXT
1:	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
	vtbl.8	d24, {d12-d13}, d4
	vtbl.8	d25, {d12-d13}, d5
	vtbl.8	d26, {d14-d15}, d6
	vtbl.8	d27, {d14-d15}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13
	/* q14 := A2 = sb2_0(io) + sb2_1(jo) */
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q14, q12, q13

	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
	add	r6, r4, r3, lsl #4
	add	r7, r5, r3, lsl #4
	vld1.64	{d24-d25}, [r6]
	vld1.64	{d26-d27}, [r7]

	/* q15 := A2_B = A2 + A(mcf) */
	vtbl.8	d30, {d0-d1}, d24
	vtbl.8	d31, {d0-d1}, d25
	veor	q15, q15, q14

	/* q14 := A2_B_D = A2_B + A(mcb) */
	vtbl.8	d28, {d0-d1}, d26
	vtbl.8	d29, {d0-d1}, d27
	veor	q14, q14, q15

	/* q0 := x = A2_B_D + A2_B(mcf) */
	vtbl.8	d0, {d30-d31}, d24
	vtbl.8	d1, {d30-d31}, d25
	veor	q0, q0, q14
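
	/*
	 * Net effect of the three lookups above, writing r for the
	 * mc_forward rotation and r^-1 for mc_backward:
	 *
	 *	x = A2_B_D + r(A2_B)
	 *	  = A2 + r(A) + r^-1(A) + r(A2 + r(A))
	 *
	 * which is the vpaes formulation of MixColumns applied to the
	 * round's S-box outputs (round key already folded into A).
	 */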

2:	/*
	 * SubBytes
	 */
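
	/*
	 * This is the vpaes tower-field S-box: with k = x & 0xf and
	 * i = x >> 4, the GF(2^8) inversion underlying SubBytes is
	 * evaluated entirely with 16-byte GF(2^4) lookups (inv, inva)
	 * via
	 *
	 *	j = i + k
	 *	io = j + 1/(1/i + a/k)
	 *	jo = i + 1/(1/j + a/k)
	 *
	 * and the S-box's final affine layer is folded into the
	 * sb1/sb2/sbo output tables.
	 */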

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {d22-d23}, d4
	vtbl.8	d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {d20-d21}, d6
	vtbl.8	d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {d20-d21}, d4
	vtbl.8	d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {d20-d21}, d24
	vtbl.8	d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {d20-d21}, d26
	vtbl.8	d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	add	r3, r3, #1
	subs	r1, r1, #1
	and	r3, r3, #3
	bne	1b

	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
	add	r8, r12, #(sr - .Lconstants)
	add	r6, r12, #(sbo_0 - .Lconstants)
	add	r7, r12, #(sbo_1 - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.64	{d12-d13}, [r6 :128]
	vld1.64	{d14-d15}, [r7 :128]
	vld1.64	{d30-d31}, [r8 :128]

	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
	vtbl.8	d4, {d12-d13}, d4
	vtbl.8	d5, {d12-d13}, d5
	vtbl.8	d6, {d14-d15}, d6
	vtbl.8	d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[rmod4]) */
	vtbl.8	d0, {d4-d5}, d30
	vtbl.8	d1, {d4-d5}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef _KERNEL
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
	bx	lr
END(aes_neon_enc1)

/*
 * aes_neon_dec1(dec, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 *	uint8x16_t@q0
 *	aes_neon_dec1(const struct aesdec *dec@r0, uint8x16_t x@q0,
 *	    unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (here spelled `#ifdef _KERNEL'):
 *
 *	uint8x16_t@(r0,r1,r2,r3)
 *	aes_neon_dec1(const struct aesdec *dec@r0,
 *	    uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
ENTRY(aes_neon_dec1)
#ifdef _KERNEL
	vmov	d0, r2, r3		/* d0 := x lo */
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r7, r8, r10, r11, lr}
	vpush	{d8-d15}

	/*
	 * r3: 3 & ~(nrounds - 1)
	 * q0={d0-d1}: x/ak
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: diptlo/dsb9[0]
	 * q5={d10-d11}: dipthi/dsb9[1]
	 * q6={d12-d13}: dsbb[0]/dsbo[0]
	 * q7={d14-d15}: dsbb[1]/dsbo[1]
	 * q8={d16-d17}: dsbd[0]/dsbe[0]
	 * q9={d18-d19}: dsbd[1]/dsbe[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/dsbX_0(io)
	 * q13={d26-d27}: jr/jak/jakr/dsbX_1(jo)
	 * q14={d28-d29}: rk/xmc
	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
	 */

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr	r12, .Lconstants_addr
	adr	r11, .Lconstants_addr

	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
	rsb	r3, r1, #0		/* r3 := ~(nrounds - 1) = -nrounds */
	vmov.i8	q1, #0x0f
	and	r3, r3, #3		/* r3 := 3 & ~(nrounds - 1) */
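
	/*
	 * E.g., nrounds = 10 or 14 gives r3 = 2, and nrounds = 12
	 * gives r3 = 0, selecting the sr[] entry that matches the
	 * rotation left over after the decryption rounds.
	 */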

	/* r12 := .Lconstants */
	add	r12, r12, r11

	/* (q4, q5) := (diptlo, dipthi) */
	add	r6, r12, #(diptlo - .Lconstants)
	add	r7, r12, #(dipthi - .Lconstants)
	vld1.64	{d8-d9}, [r6 :128]
	vld1.64	{d10-d11}, [r7 :128]

	/* load the rest of the constants */
	add	r4, r12, #(dsbb_0 - .Lconstants)
	add	r5, r12, #(dsbb_1 - .Lconstants)
	add	r6, r12, #(inv - .Lconstants)
	add	r7, r12, #(inva - .Lconstants)
	add	r8, r12, #(.Lmc_forward_3 - .Lconstants)
	vld1.64	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
	vld1.64	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
	vld1.64	{d20-d21}, [r6 :128]	/* q10 := inv */
	vld1.64	{d22-d23}, [r7 :128]	/* q11 := inva */
	vld1.64	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (diptlo(lo), dipthi(hi)) */
	vtbl.8	d4, {d8-d9}, d4
	vtbl.8	d5, {d8-d9}, d5
	vtbl.8	d6, {d10-d11}, d6
	vtbl.8	d7, {d10-d11}, d7

	/* load dsb9 */
	add	r4, r12, #(dsb9_0 - .Lconstants)
	add	r5, r12, #(dsb9_1 - .Lconstants)
	vld1.64	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
	vld1.64	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */

	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	/* load dsbd */
	add	r4, r12, #(dsbd_0 - .Lconstants)
	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */

	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
	vtbl.8	d24, {d8-d9}, d4
	vtbl.8	d25, {d8-d9}, d5
	vtbl.8	d26, {d10-d11}, d6
	vtbl.8	d27, {d10-d11}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := x(mc) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31

	/* q0 := x(mc) + dsbd_0(io) + dsbd_1(jo) */
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* load dsbe */
	add	r4, r12, #(dsbe_0 - .Lconstants)
	vld1.64	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
	vld1.64	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */

	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31
	vtbl.8	d24, {d12-d13}, d4
	vtbl.8	d25, {d12-d13}, d5
	vtbl.8	d26, {d14-d15}, d6
	vtbl.8	d27, {d14-d15}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q0 := x(mc) + dsbe_0(io) + dsbe_1(jo) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q15 := mc := mc <<< 12*8 */
	vext.8	q15, q15, q15, #12
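
	/*
	 * Rotating mc left by 12 bytes maps mc_forward[k] to
	 * mc_forward[(k + 3) mod 4], so starting from mc_forward[3]
	 * the rounds cycle through entries 3, 2, 1, 0, 3, ...
	 */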

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {d22-d23}, d4
	vtbl.8	d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {d20-d21}, d6
	vtbl.8	d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {d20-d21}, d4
	vtbl.8	d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {d20-d21}, d24
	vtbl.8	d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {d20-d21}, d26
	vtbl.8	d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	subs	r1, r1, #1
	bne	1b

	/* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[i]) */
	add	r8, r12, #(sr - .Lconstants)
	add	r6, r12, #(dsbo_0 - .Lconstants)
	add	r7, r12, #(dsbo_1 - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.64	{d12-d13}, [r6 :128]
	vld1.64	{d14-d15}, [r7 :128]
	vld1.64	{d30-d31}, [r8 :128]

	vld1.64	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
	vtbl.8	d4, {d12-d13}, d4
	vtbl.8	d5, {d12-d13}, d5
	vtbl.8	d6, {d14-d15}, d6
	vtbl.8	d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + dsbo_0(io) + dsbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[i]) */
	vtbl.8	d0, {d4-d5}, d30
	vtbl.8	d1, {d4-d5}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef _KERNEL
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
	bx	lr
END(aes_neon_dec1)