/*	$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <arm/asm.h>

RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $")

	.fpu	neon

	.text
	.p2align 2
.Lconstants_addr:
	.long	.Lconstants - .

	.section .rodata
	.p2align 5
.Lconstants:

.Linv_inva:	/* inv and inva must be consecutive */
	.type	inv,_ASM_TYPE_OBJECT
inv:
	.byte	0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
	.byte	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
END(inv)

	.type	inva,_ASM_TYPE_OBJECT
inva:
	.byte	0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
	.byte	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
END(inva)

	.type	mc_forward,_ASM_TYPE_OBJECT
mc_forward:
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 */
	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C

	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 */
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00

	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 */
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04

.Lmc_forward_3:
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 */
	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
END(mc_forward)

	.type	mc_backward,_ASM_TYPE_OBJECT
mc_backward:
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 */
	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E

	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 */
	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A

	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 */
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06

	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 */
	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
END(mc_backward)
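
/*
 * The mc_forward/mc_backward rows above are the byte shuffles used for
 * MixColumns.  Row 0 of mc_forward rotates each 4-byte column by one
 * byte in one direction and row 0 of mc_backward by one byte in the
 * other; rows 1-3 are the same permutations offset by one column per
 * row to account for the round number mod 4.  Roughly, for row 0 (an
 * illustrative sketch only, not code used here):
 *
 *	for (i = 0; i < 16; i++)
 *		fwd[i] = x[(i & ~3) | ((i + 1) & 3)];
 *	for (i = 0; i < 16; i++)
 *		bwd[i] = x[(i & ~3) | ((i - 1) & 3)];
 */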

	.type	sr,_ASM_TYPE_OBJECT
sr:
	.byte	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07	/* 0 */
	.byte	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F

	.byte	0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03	/* 1 */
	.byte	0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B

	.byte	0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F	/* 2 */
	.byte	0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07

	.byte	0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B	/* 3 */
	.byte	0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
END(sr)

	.type	ipt,_ASM_TYPE_OBJECT
ipt:
	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2	/* lo */
	.byte	0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C	/* hi */
	.byte	0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
END(ipt)

	.type	sb1,_ASM_TYPE_OBJECT
sb1:
	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1	/* 0 */
	.byte	0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36	/* 1 */
	.byte	0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
END(sb1)

	.type	sb2,_ASM_TYPE_OBJECT
sb2:
	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2	/* 0 */
	.byte	0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69	/* 1 */
	.byte	0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
END(sb2)

	.type	sbo,_ASM_TYPE_OBJECT
sbo:
	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0	/* 0 */
	.byte	0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF	/* 1 */
	.byte	0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
END(sbo)

	.type	dipt,_ASM_TYPE_OBJECT
dipt:
	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F	/* lo */
	.byte	0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86	/* hi */
	.byte	0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
END(dipt)

	.type	dsb9,_ASM_TYPE_OBJECT
dsb9:
	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85	/* 0 */
	.byte	0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0	/* 1 */
	.byte	0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
END(dsb9)

	.type	dsbd,_ASM_TYPE_OBJECT
dsbd:
	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D	/* 0 */
	.byte	0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C	/* 1 */
	.byte	0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
END(dsbd)

	.type	dsbb,_ASM_TYPE_OBJECT
dsbb:
	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0	/* 0 */
	.byte	0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
	.byte	0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1	/* 1 */
	.byte	0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
END(dsbb)

	.type	dsbe,_ASM_TYPE_OBJECT
dsbe:
	.byte	0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46	/* 0 */
	.byte	0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
	.byte	0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C	/* 1 */
	.byte	0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
END(dsbe)

	.type	dsbo,_ASM_TYPE_OBJECT
dsbo:
	.byte	0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13	/* 0 */
	.byte	0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
	.byte	0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12	/* 1 */
	.byte	0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
END(dsbo)

/*
 * aes_neon_enc1(enc, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 *	uint8x16_t@q0
 *	aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
 *	    unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 *	uint8x16_t@(r0,r1,r2,r3)
 *	aes_neon_enc1(const struct aesenc *enc@r0,
 *	    uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
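
/*
 * Illustrative C-level usage (a sketch, assuming the prototype declared
 * elsewhere, e.g. aes_neon.h; the names here are for illustration and
 * are not defined by this file):
 *
 *	#include <arm_neon.h>
 *
 *	uint8x16_t y = aes_neon_enc1(enc, x, nrounds);
 *
 * With -mfloat-abi=hard the compiler passes x and returns y in q0; with
 * the soft-float ABI it splits them across r0-r3 and the stack as
 * described above, which is what the __SOFTFP__ prologue and epilogue
 * below marshal.
 */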
ENTRY(aes_neon_enc1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r7, r8, r10, r11, lr}
	vpush	{d8-d15}

	/*
	 * r3: rmod4
	 * r4: mc_forward
	 * r5: mc_backward
	 * r6,r7,r8,r10,r11,r12: temporaries
	 * q0={d0-d1}: x/ak/A
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: iptlo
	 * q5={d10-d11}: ipthi
	 * q6={d12-d13}: sb1[0]/sbo[0]
	 * q7={d14-d15}: sb1[1]/sbo[1]
	 * q8={d16-d17}: sb2[0]
	 * q9={d18-d19}: sb2[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_backward[rmod4]
	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_forward[rmod4]
	 * q14={d28-d29}: rk/A2/A2_B_D
	 * q15={d30-d31}: A2_B/sr[rmod4]
	 */

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr	r12, .Lconstants_addr
	adr	r11, .Lconstants_addr

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
	movw	r3, #0			/* r3 := rmod4 = 0 (round counter mod 4) */
	vmov.i8	q1, #0x0f

	/* r12 := .Lconstants */
	add	r12, r12, r11

	/* (q4, q5) := (iptlo, ipthi) */
	add	r6, r12, #(ipt - .Lconstants)
	vld1.8	{q4-q5}, [r6 :256]

	/* load the rest of the constants */
	add	r4, r12, #(sb1 - .Lconstants)
	add	r6, r12, #(sb2 - .Lconstants)
	add	r8, r12, #(.Linv_inva - .Lconstants)
	vld1.8	{q6-q7}, [r4 :256]	/* q6 = sb1[0], q7 = sb1[1] */
	vld1.8	{q8-q9}, [r6 :256]	/* q8 = sb2[0], q9 = sb2[1] */
	vld1.8	{q10-q11}, [r8 :256]	/* q10 = inv, q11 = inva */

	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
	add	r4, r12, #(mc_forward - .Lconstants)
	add	r5, r12, #(mc_backward - .Lconstants)

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1	/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1	/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (iptlo(lo), ipthi(hi)) */
	vtbl.8	d4, {d8-d9}, d4
	vtbl.8	d5, {d8-d9}, d5
	vtbl.8	d6, {d10-d11}, d6
	vtbl.8	d7, {d10-d11}, d7
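
	/*
	 * Each vtbl.8 pair above and below is one 16-byte table lookup
	 * applied to a whole q register, half at a time; index bytes
	 * outside the range 0-15 yield 0x00.  A rough C-intrinsics
	 * equivalent (illustration only; tbl16 is a hypothetical
	 * helper, not part of this code):
	 *
	 *	static uint8x16_t
	 *	tbl16(uint8x16_t tab, uint8x16_t idx)
	 *	{
	 *		uint8x8x2_t t = { { vget_low_u8(tab), vget_high_u8(tab) } };
	 *
	 *		return vcombine_u8(vtbl2_u8(t, vget_low_u8(idx)),
	 *		    vtbl2_u8(t, vget_high_u8(idx)));
	 *	}
	 *
	 *	q2 = tbl16(q4, q2);	// q2 := iptlo(lo)
	 *	q3 = tbl16(q5, q3);	// q3 := ipthi(hi)
	 */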

	/* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
	vtbl.8	d24, {d12-d13}, d4
	vtbl.8	d25, {d12-d13}, d5
	vtbl.8	d26, {d14-d15}, d6
	vtbl.8	d27, {d14-d15}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := A2 = sb2_0[io] + sb2_1[jo] */
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q14, q12, q13

	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
	add	r6, r4, r3, lsl #4
	add	r7, r5, r3, lsl #4
	vld1.8	{q12}, [r6 :128]
	vld1.8	{q13}, [r7 :128]

	/* q15 := A2_B = A2 + A(mcf) */
	vtbl.8	d30, {d0-d1}, d24
	vtbl.8	d31, {d0-d1}, d25
	veor	q15, q15, q14

	/* q14 := A2_B_D = A2_B + A(mcb) */
	vtbl.8	d28, {d0-d1}, d26
	vtbl.8	d29, {d0-d1}, d27
	veor	q14, q14, q15

	/* q0 := x = A2_B_D + A2_B(mcf) */
	vtbl.8	d0, {d30-d31}, d24
	vtbl.8	d1, {d30-d31}, d25
	veor	q0, q0, q14

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1	/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1	/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {d22-d23}, d4
	vtbl.8	d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {d20-d21}, d6
	vtbl.8	d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {d20-d21}, d4
	vtbl.8	d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {d20-d21}, d24
	vtbl.8	d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {d20-d21}, d26
	vtbl.8	d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	add	r3, r3, #1
	subs	r1, r1, #1
	and	r3, r3, #3
	bne	1b

	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
	add	r8, r12, #(sr - .Lconstants)
	add	r6, r12, #(sbo - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{q6-q7}, [r6 :256]
	vld1.8	{q15}, [r8 :128]

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
	vtbl.8	d4, {d12-d13}, d4
	vtbl.8	d5, {d12-d13}, d5
	vtbl.8	d6, {d14-d15}, d6
	vtbl.8	d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[rmod4]) */
	vtbl.8	d0, {d4-d5}, d30
	vtbl.8	d1, {d4-d5}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_enc1)

/*
 * aes_neon_dec1(dec, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 *	uint8x16_t@q0
 *	aes_neon_dec1(const struct aesdec *dec@r0, uint8x16_t x@q0,
 *	    unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 *	uint8x16_t@(r0,r1,r2,r3)
 *	aes_neon_dec1(const struct aesdec *dec@r0,
 *	    uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
ENTRY(aes_neon_dec1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r7, r8, r10, r11, lr}
	vpush	{d8-d15}

	/*
	 * r3: 3 & ~(nrounds - 1)
	 * q0={d0-d1}: x/ak
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: diptlo/dsb9[0]
	 * q5={d10-d11}: dipthi/dsb9[1]
	 * q6={d12-d13}: dsbb[0]/dsbo[0]
	 * q7={d14-d15}: dsbb[1]/dsbo[1]
	 * q8={d16-d17}: dsbd[0]/dsbe[0]
	 * q9={d18-d19}: dsbd[1]/dsbe[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/dsbX_0(io)
	 * q13={d26-d27}: jr/jak/jakr/dsbX_1(jo)
	 * q14={d28-d29}: rk/xmc
	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
	 */
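
	/*
	 * Unlike the encryption path, the round loop below keeps no
	 * per-round counter; r3 is computed once, up front, as
	 * 3 & -nrounds (= 3 & ~(nrounds - 1)) and used only to pick the
	 * final sr[] row.  For the standard AES round counts (worked
	 * out here for illustration):
	 *
	 *	nrounds = 10 (AES-128):  3 & -10 = 2
	 *	nrounds = 12 (AES-192):  3 & -12 = 0
	 *	nrounds = 14 (AES-256):  3 & -14 = 2
	 */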

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr	r12, .Lconstants_addr
	adr	r11, .Lconstants_addr

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
	rsb	r3, r1, #0		/* r3 := ~(x - 1) = -x */
	vmov.i8	q1, #0x0f
	and	r3, r3, #3		/* r3 := 3 & ~(x - 1) */

	/* r12 := .Lconstants */
	add	r12, r12, r11

	/* (q4, q5) := (diptlo, dipthi) */
	add	r6, r12, #(dipt - .Lconstants)
	vld1.8	{q4-q5}, [r6 :256]

	/* load the rest of the constants */
	add	r4, r12, #(dsbb - .Lconstants)
	add	r6, r12, #(.Linv_inva - .Lconstants)
	add	r8, r12, #(.Lmc_forward_3 - .Lconstants)
	vld1.8	{q6-q7}, [r4 :256]	/* q6 := dsbb[0], q7 := dsbb[1] */
	vld1.8	{q10-q11}, [r6 :256]	/* q10 := inv, q11 := inva */
	vld1.8	{q15}, [r8 :128]	/* q15 := mc_forward[3] */

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1	/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1	/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (diptlo(lo), dipthi(hi)) */
	vtbl.8	d4, {d8-d9}, d4
	vtbl.8	d5, {d8-d9}, d5
	vtbl.8	d6, {d10-d11}, d6
	vtbl.8	d7, {d10-d11}, d7

	/* load dsb9 */
	add	r4, r12, #(dsb9 - .Lconstants)
	vld1.8	{q4-q5}, [r4 :256]	/* q4 := dsb9[0], q5 := dsb9[1] */

	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	/* load dsbd */
	add	r4, r12, #(dsbd - .Lconstants)
	vld1.8	{q8-q9}, [r4 :256]	/* q8 := dsbd[0], q9 := dsbd[1] */

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
	vtbl.8	d24, {d8-d9}, d4
	vtbl.8	d25, {d8-d9}, d5
	vtbl.8	d26, {d10-d11}, d6
	vtbl.8	d27, {d10-d11}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := x(mc) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31

	/* q0 := x(mc) + dsbd_0(io) + dsbd_1(jo) */
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* load dsbe */
	add	r4, r12, #(dsbe - .Lconstants)
	vld1.8	{q8-q9}, [r4 :256]!	/* q8 := dsbe[0], q9 := dsbe[1] */

	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31
	vtbl.8	d24, {d12-d13}, d4
	vtbl.8	d25, {d12-d13}, d5
	vtbl.8	d26, {d14-d15}, d6
	vtbl.8	d27, {d14-d15}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q0 := x(mc) + dsbe_0(io) + dsbe_1(jo) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q15 := mc := mc <<< 12*8 */
	vext.8	q15, q15, q15, #12

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1	/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1	/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {d22-d23}, d4
	vtbl.8	d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {d20-d21}, d6
	vtbl.8	d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {d20-d21}, d4
	vtbl.8	d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {d20-d21}, d24
	vtbl.8	d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {d20-d21}, d26
	vtbl.8	d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	subs	r1, r1, #1
	bne	1b

	/* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[i]) */
	add	r8, r12, #(sr - .Lconstants)
	add	r6, r12, #(dsbo - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{q6-q7}, [r6 :256]
	vld1.8	{q15}, [r8 :128]

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
	vtbl.8	d4, {d12-d13}, d4
	vtbl.8	d5, {d12-d13}, d5
	vtbl.8	d6, {d14-d15}, d6
	vtbl.8	d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + dsbo_0(io) + dsbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[i]) */
	vtbl.8	d0, {d4-d5}, d30
	vtbl.8	d1, {d4-d5}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_dec1)