/*	$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <arm/asm.h>

RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $")

	.fpu neon

	.text
	.p2align 2
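/*
 * Position-independent reference to .Lconstants, which lives in
 * .rodata: the word below holds the offset of .Lconstants from the
 * literal itself, and each function reconstructs the absolute address
 * at run time with an ldr/adr/add sequence.
 */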
.Lconstants_addr:
	.long .Lconstants - .

	.section .rodata
	.p2align 4
.Lconstants:

	.type inv,_ASM_TYPE_OBJECT
inv:
	.byte 0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
	.byte 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
END(inv)

	.type inva,_ASM_TYPE_OBJECT
inva:
	.byte 0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
	.byte 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
END(inva)

	.type mc_forward,_ASM_TYPE_OBJECT
mc_forward:
	.byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 */
	.byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C

	.byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 */
	.byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00

	.byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 */
	.byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04

.Lmc_forward_3:
	.byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 */
	.byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
END(mc_forward)

	.type mc_backward,_ASM_TYPE_OBJECT
mc_backward:
	.byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 */
	.byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E

	.byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 */
	.byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A

	.byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 */
	.byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06

	.byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 */
	.byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
END(mc_backward)

	.type sr,_ASM_TYPE_OBJECT
sr:
	.byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07	/* 0 */
	.byte 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F

	.byte 0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03	/* 1 */
	.byte 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B

	.byte 0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F	/* 2 */
	.byte 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07

	.byte 0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B	/* 3 */
	.byte 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
END(sr)

	.type iptlo,_ASM_TYPE_OBJECT
iptlo:
	.byte 0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
	.byte 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
END(iptlo)

	.type ipthi,_ASM_TYPE_OBJECT
ipthi:
	.byte 0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
	.byte 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
END(ipthi)

	.type sb1_0,_ASM_TYPE_OBJECT
sb1_0:
	.byte 0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
	.byte 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
END(sb1_0)

	.type sb1_1,_ASM_TYPE_OBJECT
sb1_1:
	.byte 0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
	.byte 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
END(sb1_1)

	.type sb2_0,_ASM_TYPE_OBJECT
sb2_0:
	.byte 0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
	.byte 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
END(sb2_0)

	.type sb2_1,_ASM_TYPE_OBJECT
sb2_1:
	.byte 0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
	.byte 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
END(sb2_1)

	.type sbo_0,_ASM_TYPE_OBJECT
sbo_0:
	.byte 0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
	.byte 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
END(sbo_0)

	.type sbo_1,_ASM_TYPE_OBJECT
sbo_1:
	.byte 0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
	.byte 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
END(sbo_1)

	.type diptlo,_ASM_TYPE_OBJECT
diptlo:
	.byte 0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
	.byte 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
END(diptlo)

	.type dipthi,_ASM_TYPE_OBJECT
dipthi:
	.byte 0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
	.byte 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
END(dipthi)

	.type dsb9_0,_ASM_TYPE_OBJECT
dsb9_0:
	.byte 0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
	.byte 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
END(dsb9_0)

	.type dsb9_1,_ASM_TYPE_OBJECT
dsb9_1:
	.byte 0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
	.byte 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
END(dsb9_1)

	.type dsbd_0,_ASM_TYPE_OBJECT
dsbd_0:
	.byte 0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D
	.byte 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
END(dsbd_0)

	.type dsbd_1,_ASM_TYPE_OBJECT
dsbd_1:
	.byte 0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C
	.byte 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
END(dsbd_1)

	.type dsbb_0,_ASM_TYPE_OBJECT
dsbb_0:
	.byte 0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0
	.byte 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
END(dsbb_0)

	.type dsbb_1,_ASM_TYPE_OBJECT
dsbb_1:
	.byte 0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1
	.byte 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
END(dsbb_1)

	.type dsbe_0,_ASM_TYPE_OBJECT
dsbe_0:
	.byte 0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46
	.byte 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
END(dsbe_0)

	.type dsbe_1,_ASM_TYPE_OBJECT
dsbe_1:
	.byte 0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C
	.byte 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
END(dsbe_1)

	.type dsbo_0,_ASM_TYPE_OBJECT
dsbo_0:
	.byte 0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13
	.byte 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
END(dsbo_0)

	.type dsbo_1,_ASM_TYPE_OBJECT
dsbo_1:
	.byte 0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12
	.byte 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
END(dsbo_1)

/*
 * aes_neon_enc1(enc, x, nrounds)
 *
 * With -mfloat-abi=hard:
 *
 *	uint8x16_t@q0
 *	aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
 *	    unsigned nrounds@r1)
 *
 * With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 *	uint8x16_t@(r0,r1,r2,r3)
 *	aes_neon_enc1(const struct aesenc *enc@r0,
 *	    uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
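/*
 * Illustrative sketch (not part of the original source): under the
 * hard-float ABI described above, the C-level prototype and a
 * one-block wrapper might look as follows.  The wrapper name
 * enc1_block and the use of the generic NEON load/store intrinsics
 * are assumptions for illustration only.
 *
 *	#include <stdint.h>
 *	#include <arm_neon.h>
 *
 *	struct aesenc;
 *	uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t,
 *	    unsigned);
 *
 *	static void
 *	enc1_block(const struct aesenc *enc, const uint8_t in[16],
 *	    uint8_t out[16], unsigned nrounds)
 *	{
 *		// load one block, encrypt it, store the result
 *		vst1q_u8(out, aes_neon_enc1(enc, vld1q_u8(in), nrounds));
 *	}
 */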
ENTRY(aes_neon_enc1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov d0, r3, r2		/* d0 := x lo */
#else
	vmov d0, r2, r3		/* d0 := x lo */
#endif
	vldr d1, [sp]		/* d1 := x hi */
	ldr r1, [sp, #8]	/* r1 := nrounds */
#endif
	push {r4, r5, r6, r7, r8, r10, r11, lr}
	vpush {d8-d15}

	/*
	 * r3: rmod4
	 * r4: mc_forward
	 * r5: mc_backward
	 * r6,r7,r8,r10,r11,r12: temporaries
	 * q0={d0-d1}: x/ak/A
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: iptlo
	 * q5={d10-d11}: ipthi
	 * q6={d12-d13}: sb1[0]/sbo[0]
	 * q7={d14-d15}: sb1[1]/sbo[1]
	 * q8={d16-d17}: sb2[0]
	 * q9={d18-d19}: sb2[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_forward[rmod4]
	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_backward[rmod4]
	 * q14={d28-d29}: rk/A2/A2_B_D
	 * q15={d30-d31}: A2_B/sr[rmod4]
	 */

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr r12, .Lconstants_addr
	adr r11, .Lconstants_addr

	vld1.8 {d28-d29}, [r0 :128]!	/* q14 = *rk++ */
	movw r3, #0
	vmov.i8 q1, #0x0f

	/* r12 := .Lconstants */
	add r12, r12, r11

	/* (q4, q5) := (iptlo, ipthi) */
	add r6, r12, #(iptlo - .Lconstants)
	add r7, r12, #(ipthi - .Lconstants)
	vld1.8 {d8-d9}, [r6 :128]
	vld1.8 {d10-d11}, [r7 :128]

	/* load the rest of the constants */
	add r4, r12, #(sb1_0 - .Lconstants)
	add r5, r12, #(sb1_1 - .Lconstants)
	add r6, r12, #(sb2_0 - .Lconstants)
	add r7, r12, #(sb2_1 - .Lconstants)
	add r8, r12, #(inv - .Lconstants)
	add r10, r12, #(inva - .Lconstants)
	vld1.8 {d12-d13}, [r4 :128]	/* q6 = sb1[0] */
	vld1.8 {d14-d15}, [r5 :128]	/* q7 = sb1[1] */
	vld1.8 {d16-d17}, [r6 :128]	/* q8 = sb2[0] */
	vld1.8 {d18-d19}, [r7 :128]	/* q9 = sb2[1] */
	vld1.8 {d20-d21}, [r8 :128]	/* q10 = inv */
	vld1.8 {d22-d23}, [r10 :128]	/* q11 = inva */

	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
	add r4, r12, #(mc_forward - .Lconstants)
	add r5, r12, #(mc_backward - .Lconstants)

	/* (q2, q3) := (lo, hi) */
	vshr.u8 q3, q0, #4
	vand q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (iptlo(lo), ipthi(hi)) */
	vtbl.8 d4, {d8-d9}, d4
	vtbl.8 d5, {d8-d9}, d5
	vtbl.8 d6, {d10-d11}, d6
	vtbl.8 d7, {d10-d11}, d7

	/* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
	veor q0, q14, q2
	veor q0, q0, q3

	b 2f
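
	/*
	 * The round loop is entered at 2f, so the first pass performs
	 * only the SubBytes decomposition of the transformed input.
	 * Each pass through 1b then XORs in the next round key with
	 * the sb1 lookups and combines the sb2 lookups via the
	 * mc_forward/mc_backward shuffles; that body runs nrounds - 1
	 * times, and the last round's output transform (sbo) and the
	 * sr[rmod4] permutation are applied after the loop.
	 */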

	_ALIGN_TEXT
1:	vld1.8 {d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
	vtbl.8 d24, {d12-d13}, d4
	vtbl.8 d25, {d12-d13}, d5
	vtbl.8 d26, {d14-d15}, d6
	vtbl.8 d27, {d14-d15}, d7
	veor q0, q14, q12
	veor q0, q0, q13

	/* q14 := A2 = sb2_0(io) + sb2_1(jo) */
	vtbl.8 d24, {d16-d17}, d4
	vtbl.8 d25, {d16-d17}, d5
	vtbl.8 d26, {d18-d19}, d6
	vtbl.8 d27, {d18-d19}, d7
	veor q14, q12, q13

	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
	add r6, r4, r3, lsl #4
	add r7, r5, r3, lsl #4
	vld1.8 {d24-d25}, [r6]
	vld1.8 {d26-d27}, [r7]

	/* q15 := A2_B = A2 + A(mcf) */
	vtbl.8 d30, {d0-d1}, d24
	vtbl.8 d31, {d0-d1}, d25
	veor q15, q15, q14

	/* q14 := A2_B_D = A2_B + A(mcb) */
	vtbl.8 d28, {d0-d1}, d26
	vtbl.8 d29, {d0-d1}, d27
	veor q14, q14, q15

	/* q0 := x = A2_B_D + A2_B(mcf) */
	vtbl.8 d0, {d30-d31}, d24
	vtbl.8 d1, {d30-d31}, d25
	veor q0, q0, q14

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8 q3, q0, #4
	vand q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8 d0, {d22-d23}, d4
	vtbl.8 d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8 d24, {d20-d21}, d6
	vtbl.8 d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8 d26, {d20-d21}, d4
	vtbl.8 d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8 d24, {d20-d21}, d24
	vtbl.8 d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8 d26, {d20-d21}, d26
	vtbl.8 d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor q3, q3, q13

	/* advance round */
	add r3, r3, #1
	subs r1, r1, #1
	and r3, r3, #3
	bne 1b

	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
	add r8, r12, #(sr - .Lconstants)
	add r6, r12, #(sbo_0 - .Lconstants)
	add r7, r12, #(sbo_1 - .Lconstants)
	add r8, r8, r3, lsl #4
	vld1.8 {d12-d13}, [r6 :128]
	vld1.8 {d14-d15}, [r7 :128]
	vld1.8 {d30-d31}, [r8 :128]

	vld1.8 {d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
	vtbl.8 d4, {d12-d13}, d4
	vtbl.8 d5, {d12-d13}, d5
	vtbl.8 d6, {d14-d15}, d6
	vtbl.8 d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
	veor q2, q2, q14
	veor q2, q2, q3

	/* q0 := x(sr[rmod4]) */
	vtbl.8 d0, {d4-d5}, d30
	vtbl.8 d1, {d4-d5}, d31

	vpop {d8-d15}
	pop {r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov r1, r0, d0
	vmov r3, r2, d1
#else
	vmov r0, r1, d0
	vmov r2, r3, d1
#endif
#endif
	bx lr
END(aes_neon_enc1)

/*
 * aes_neon_dec1(dec, x, nrounds)
 *
 * With -mfloat-abi=hard:
 *
 *	uint8x16_t@q0
 *	aes_neon_dec1(const struct aesdec *dec@r0, uint8x16_t x@q0,
 *	    unsigned nrounds@r1)
 *
 * With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 *	uint8x16_t@(r0,r1,r2,r3)
 *	aes_neon_dec1(const struct aesdec *dec@r0,
 *	    uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
ENTRY(aes_neon_dec1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov d0, r3, r2		/* d0 := x lo */
#else
	vmov d0, r2, r3		/* d0 := x lo */
#endif
	vldr d1, [sp]		/* d1 := x hi */
	ldr r1, [sp, #8]	/* r1 := nrounds */
#endif
	push {r4, r5, r6, r7, r8, r10, r11, lr}
	vpush {d8-d15}

	/*
	 * r3: 3 & ~(nrounds - 1)
	 * q0={d0-d1}: x/ak
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: diptlo/dsb9[0]
	 * q5={d10-d11}: dipthi/dsb9[1]
	 * q6={d12-d13}: dsbb[0]/dsbo[0]
	 * q7={d14-d15}: dsbb[1]/dsbo[1]
	 * q8={d16-d17}: dsbd[0]/dsbe[0]
	 * q9={d18-d19}: dsbd[1]/dsbe[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/dsbX_0(io)
	 * q13={d26-d27}: jr/jak/jakr/dsbX_1(jo)
	 * q14={d28-d29}: rk/xmc
	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
	 */

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr r12, .Lconstants_addr
	adr r11, .Lconstants_addr

	vld1.8 {d28-d29}, [r0 :128]!	/* q14 = *rk++ */
	rsb r3, r1, #0		/* r3 := -nrounds = ~(nrounds - 1) */
	vmov.i8 q1, #0x0f
	and r3, r3, #3		/* r3 := 3 & ~(nrounds - 1) */

	/* r12 := .Lconstants */
	add r12, r12, r11

	/* (q4, q5) := (diptlo, dipthi) */
	add r6, r12, #(diptlo - .Lconstants)
	add r7, r12, #(dipthi - .Lconstants)
	vld1.8 {d8-d9}, [r6 :128]
	vld1.8 {d10-d11}, [r7 :128]

	/* load the rest of the constants */
	add r4, r12, #(dsbb_0 - .Lconstants)
	add r5, r12, #(dsbb_1 - .Lconstants)
	add r6, r12, #(inv - .Lconstants)
	add r7, r12, #(inva - .Lconstants)
	add r8, r12, #(.Lmc_forward_3 - .Lconstants)
	vld1.8 {d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
	vld1.8 {d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
	vld1.8 {d20-d21}, [r6 :128]	/* q10 := inv */
	vld1.8 {d22-d23}, [r7 :128]	/* q11 := inva */
	vld1.8 {d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */

	/* (q2, q3) := (lo, hi) */
	vshr.u8 q3, q0, #4
	vand q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (diptlo(lo), dipthi(hi)) */
	vtbl.8 d4, {d8-d9}, d4
	vtbl.8 d5, {d8-d9}, d5
	vtbl.8 d6, {d10-d11}, d6
	vtbl.8 d7, {d10-d11}, d7

	/* load dsb9 */
	add r4, r12, #(dsb9_0 - .Lconstants)
	add r5, r12, #(dsb9_1 - .Lconstants)
	vld1.8 {d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
	vld1.8 {d10-d11}, [r5 :128]	/* q5 := dsb9[1] */

	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
	veor q0, q14, q2
	veor q0, q0, q3

	b 2f
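
	/*
	 * As in aes_neon_enc1, the round loop is entered at 2f so the
	 * first pass performs only the SubBytes decomposition.  Each
	 * pass through 1b applies the next round key with the dsb9
	 * tables and then folds in the dsbd, dsbb, and dsbe tables,
	 * rotating the mc shuffle vector in q15 by 12 bytes per round;
	 * the final output transform (dsbo) and sr permutation follow
	 * the loop.
	 */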

	_ALIGN_TEXT
1:	/* load dsbd */
	add r4, r12, #(dsbd_0 - .Lconstants)
	vld1.8 {d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
	vld1.8 {d18-d19}, [r4 :128]	/* q9 := dsbd[1] */

	vld1.8 {d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
	vtbl.8 d24, {d8-d9}, d4
	vtbl.8 d25, {d8-d9}, d5
	vtbl.8 d26, {d10-d11}, d6
	vtbl.8 d27, {d10-d11}, d7
	veor q0, q14, q12
	veor q0, q0, q13

	/* q14 := x(mc) */
	vtbl.8 d28, {d0-d1}, d30
	vtbl.8 d29, {d0-d1}, d31

	/* q0 := x(mc) + dsbd_0(io) + dsbd_1(jo) */
	vtbl.8 d24, {d16-d17}, d4
	vtbl.8 d25, {d16-d17}, d5
	vtbl.8 d26, {d18-d19}, d6
	vtbl.8 d27, {d18-d19}, d7
	veor q0, q14, q12
	veor q0, q0, q13

	/* load dsbe */
	add r4, r12, #(dsbe_0 - .Lconstants)
	vld1.8 {d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
	vld1.8 {d18-d19}, [r4 :128]	/* q9 := dsbe[1] */

	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
	vtbl.8 d28, {d0-d1}, d30
	vtbl.8 d29, {d0-d1}, d31
	vtbl.8 d24, {d12-d13}, d4
	vtbl.8 d25, {d12-d13}, d5
	vtbl.8 d26, {d14-d15}, d6
	vtbl.8 d27, {d14-d15}, d7
	veor q0, q14, q12
	veor q0, q0, q13

	/* q0 := x(mc) + dsbe_0(io) + dsbe_1(jo) */
	vtbl.8 d28, {d0-d1}, d30
	vtbl.8 d29, {d0-d1}, d31
	vtbl.8 d24, {d16-d17}, d4
	vtbl.8 d25, {d16-d17}, d5
	vtbl.8 d26, {d18-d19}, d6
	vtbl.8 d27, {d18-d19}, d7
	veor q0, q14, q12
	veor q0, q0, q13

	/* q15 := mc := mc <<< 12*8 */
	vext.8 q15, q15, q15, #12

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8 q3, q0, #4
	vand q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8 d0, {d22-d23}, d4
	vtbl.8 d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8 d24, {d20-d21}, d6
	vtbl.8 d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8 d26, {d20-d21}, d4
	vtbl.8 d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8 d24, {d20-d21}, d24
	vtbl.8 d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8 d26, {d20-d21}, d26
	vtbl.8 d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor q3, q3, q13

	/* advance round */
	subs r1, r1, #1
	bne 1b

	/* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[i]) */
	add r8, r12, #(sr - .Lconstants)
	add r6, r12, #(dsbo_0 - .Lconstants)
	add r7, r12, #(dsbo_1 - .Lconstants)
	add r8, r8, r3, lsl #4
	vld1.8 {d12-d13}, [r6 :128]
	vld1.8 {d14-d15}, [r7 :128]
	vld1.8 {d30-d31}, [r8 :128]

	vld1.8 {d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
	vtbl.8 d4, {d12-d13}, d4
	vtbl.8 d5, {d12-d13}, d5
	vtbl.8 d6, {d14-d15}, d6
	vtbl.8 d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + dsbo_0(io) + dsbo_1(jo) */
	veor q2, q2, q14
	veor q2, q2, q3

	/* q0 := x(sr[i]) */
	vtbl.8 d0, {d4-d5}, d30
	vtbl.8 d1, {d4-d5}, d31

	vpop {d8-d15}
	pop {r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov r1, r0, d0
	vmov r3, r2, d1
#else
	vmov r0, r1, d0
	vmov r2, r3, d1
#endif
#endif
	bx lr
END(aes_neon_dec1)