aes_neon_32.S revision 1.6 1 1.6 riastrad /* $NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $ */
2 1.1 riastrad
3 1.1 riastrad /*-
4 1.1 riastrad * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 1.1 riastrad * All rights reserved.
6 1.1 riastrad *
7 1.1 riastrad * Redistribution and use in source and binary forms, with or without
8 1.1 riastrad * modification, are permitted provided that the following conditions
9 1.1 riastrad * are met:
10 1.1 riastrad * 1. Redistributions of source code must retain the above copyright
11 1.1 riastrad * notice, this list of conditions and the following disclaimer.
12 1.1 riastrad * 2. Redistributions in binary form must reproduce the above copyright
13 1.1 riastrad * notice, this list of conditions and the following disclaimer in the
14 1.1 riastrad * documentation and/or other materials provided with the distribution.
15 1.1 riastrad *
16 1.1 riastrad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.1 riastrad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.1 riastrad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.1 riastrad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.1 riastrad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.1 riastrad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.1 riastrad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.1 riastrad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.1 riastrad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.1 riastrad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.1 riastrad * POSSIBILITY OF SUCH DAMAGE.
27 1.1 riastrad */
28 1.1 riastrad
29 1.1 riastrad #include <arm/asm.h>
30 1.1 riastrad
31 1.6 riastrad RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $")
32 1.4 riastrad
33 1.1 riastrad .fpu neon
34 1.1 riastrad
35 1.2 riastrad .text
36 1.2 riastrad .p2align 2
37 1.2 riastrad .Lconstants_addr: /* read pc-relatively (ldr/adr) by the functions below */
38 1.2 riastrad .long .Lconstants - . /* offset from this word to .Lconstants; adding this word's own address yields .Lconstants */
39 1.2 riastrad
40 1.1 riastrad .section .rodata
41 1.1 riastrad .p2align 4 /* 16-byte alignment: the code loads these tables with :128 hints */
42 1.2 riastrad .Lconstants: /* base of the tables; code addresses them as r12 + (table - .Lconstants) */
43 1.1 riastrad
44 1.1 riastrad .type inv,_ASM_TYPE_OBJECT
45 1.1 riastrad inv: /* loaded into q10 by aes_neon_enc1 and aes_neon_dec1 */
46 1.1 riastrad .byte 0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
47 1.1 riastrad .byte 0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
48 1.1 riastrad END(inv)
49 1.1 riastrad
50 1.1 riastrad .type inva,_ASM_TYPE_OBJECT
51 1.1 riastrad inva: /* loaded into q11 by aes_neon_enc1 and aes_neon_dec1 */
52 1.1 riastrad .byte 0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
53 1.1 riastrad .byte 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
54 1.1 riastrad END(inva)
55 1.1 riastrad
56 1.1 riastrad .type mc_forward,_ASM_TYPE_OBJECT
57 1.1 riastrad mc_forward: /* four 16-byte vtbl index vectors; aes_neon_enc1 selects entry rmod4 */
58 1.1 riastrad .byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04 /* 0 */
59 1.1 riastrad .byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C
60 1.1 riastrad
61 1.1 riastrad .byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08 /* 1 */
62 1.1 riastrad .byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00
63 1.1 riastrad
64 1.1 riastrad .byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C /* 2 */
65 1.1 riastrad .byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04
66 1.1 riastrad
67 1.1 riastrad .Lmc_forward_3: /* = mc_forward[3]; initial value of mc (q15) in aes_neon_dec1 */
68 1.1 riastrad .byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00 /* 3 */
69 1.1 riastrad .byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
70 1.1 riastrad END(mc_forward)
71 1.1 riastrad
72 1.1 riastrad .type mc_backward,_ASM_TYPE_OBJECT
73 1.1 riastrad mc_backward: /* four 16-byte vtbl index vectors; aes_neon_enc1 selects entry rmod4 */
74 1.1 riastrad .byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06 /* 0 */
75 1.1 riastrad .byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E
76 1.1 riastrad
77 1.1 riastrad .byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02 /* 1 */
78 1.1 riastrad .byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A
79 1.1 riastrad
80 1.1 riastrad .byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E /* 2 */
81 1.1 riastrad .byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06
82 1.1 riastrad
83 1.1 riastrad .byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A /* 3 */
84 1.1 riastrad .byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
85 1.1 riastrad END(mc_backward)
86 1.1 riastrad
87 1.1 riastrad .type sr,_ASM_TYPE_OBJECT
88 1.1 riastrad sr: /* four 16-byte vtbl index vectors; one entry permutes the final state in each function */
89 1.1 riastrad .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 /* 0 */
90 1.1 riastrad .byte 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F
91 1.1 riastrad
92 1.1 riastrad .byte 0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03 /* 1 */
93 1.1 riastrad .byte 0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B
94 1.1 riastrad
95 1.1 riastrad .byte 0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F /* 2 */
96 1.1 riastrad .byte 0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07
97 1.1 riastrad
98 1.1 riastrad .byte 0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B /* 3 */
99 1.1 riastrad .byte 0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
100 1.1 riastrad END(sr)
101 1.1 riastrad
102 1.1 riastrad .type iptlo,_ASM_TYPE_OBJECT
103 1.1 riastrad iptlo: /* aes_neon_enc1 q4: looked up with the low nibbles of the input x */
104 1.1 riastrad .byte 0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
105 1.1 riastrad .byte 0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
106 1.1 riastrad END(iptlo)
107 1.1 riastrad
108 1.1 riastrad .type ipthi,_ASM_TYPE_OBJECT
109 1.1 riastrad ipthi: /* aes_neon_enc1 q5: looked up with the high nibbles of the input x */
110 1.1 riastrad .byte 0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
111 1.1 riastrad .byte 0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
112 1.1 riastrad END(ipthi)
113 1.1 riastrad
114 1.1 riastrad .type sb1_0,_ASM_TYPE_OBJECT
115 1.1 riastrad sb1_0: /* aes_neon_enc1 q6 inside the round loop: sb1_0(io) */
116 1.1 riastrad .byte 0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
117 1.1 riastrad .byte 0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
118 1.1 riastrad END(sb1_0)
119 1.1 riastrad
120 1.1 riastrad .type sb1_1,_ASM_TYPE_OBJECT
121 1.1 riastrad sb1_1: /* aes_neon_enc1 q7 inside the round loop: sb1_1(jo) */
122 1.1 riastrad .byte 0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
123 1.1 riastrad .byte 0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
124 1.1 riastrad END(sb1_1)
125 1.1 riastrad
126 1.1 riastrad .type sb2_0,_ASM_TYPE_OBJECT
127 1.1 riastrad sb2_0: /* aes_neon_enc1 q8: sb2_0[io] */
128 1.1 riastrad .byte 0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
129 1.1 riastrad .byte 0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
130 1.1 riastrad END(sb2_0)
131 1.1 riastrad
132 1.1 riastrad .type sb2_1,_ASM_TYPE_OBJECT
133 1.1 riastrad sb2_1: /* aes_neon_enc1 q9: sb2_1[jo] */
134 1.1 riastrad .byte 0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
135 1.1 riastrad .byte 0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
136 1.1 riastrad END(sb2_1)
137 1.1 riastrad
138 1.1 riastrad .type sbo_0,_ASM_TYPE_OBJECT
139 1.1 riastrad sbo_0: /* aes_neon_enc1 q6 after the round loop: sbo_0(io) */
140 1.1 riastrad .byte 0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
141 1.1 riastrad .byte 0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
142 1.1 riastrad END(sbo_0)
143 1.1 riastrad
144 1.1 riastrad .type sbo_1,_ASM_TYPE_OBJECT
145 1.1 riastrad sbo_1: /* aes_neon_enc1 q7 after the round loop: sbo_1(jo) */
146 1.1 riastrad .byte 0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
147 1.1 riastrad .byte 0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
148 1.1 riastrad END(sbo_1)
149 1.1 riastrad
150 1.1 riastrad .type diptlo,_ASM_TYPE_OBJECT
151 1.1 riastrad diptlo: /* aes_neon_dec1 q4 at entry: looked up with the low nibbles of the input x */
152 1.1 riastrad .byte 0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
153 1.1 riastrad .byte 0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
154 1.1 riastrad END(diptlo)
155 1.1 riastrad
156 1.1 riastrad .type dipthi,_ASM_TYPE_OBJECT
157 1.1 riastrad dipthi: /* aes_neon_dec1 q5 at entry: looked up with the high nibbles of the input x */
158 1.1 riastrad .byte 0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
159 1.1 riastrad .byte 0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
160 1.1 riastrad END(dipthi)
161 1.1 riastrad
162 1.1 riastrad .type dsb9_0,_ASM_TYPE_OBJECT
163 1.1 riastrad dsb9_0: /* aes_neon_dec1 q4 inside the round loop: dsb9_0(io) */
164 1.1 riastrad .byte 0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
165 1.1 riastrad .byte 0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
166 1.1 riastrad END(dsb9_0)
167 1.1 riastrad
168 1.1 riastrad .type dsb9_1,_ASM_TYPE_OBJECT
169 1.1 riastrad dsb9_1: /* aes_neon_dec1 q5 inside the round loop: dsb9_1(jo) */
170 1.1 riastrad .byte 0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
171 1.1 riastrad .byte 0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
172 1.1 riastrad END(dsb9_1)
173 1.1 riastrad
174 1.1 riastrad .type dsbd_0,_ASM_TYPE_OBJECT
175 1.1 riastrad dsbd_0: /* aes_neon_dec1 q8, reloaded every round; dsbd_1 must directly follow (read via one post-incremented pointer) */
176 1.1 riastrad .byte 0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D
177 1.1 riastrad .byte 0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
178 1.1 riastrad END(dsbd_0)
179 1.1 riastrad
180 1.1 riastrad .type dsbd_1,_ASM_TYPE_OBJECT
181 1.1 riastrad dsbd_1: /* aes_neon_dec1 q9; loaded from the address dsbd_0 + 16 */
182 1.1 riastrad .byte 0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C
183 1.1 riastrad .byte 0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
184 1.1 riastrad END(dsbd_1)
185 1.1 riastrad
186 1.1 riastrad .type dsbb_0,_ASM_TYPE_OBJECT
187 1.1 riastrad dsbb_0: /* aes_neon_dec1 q6 inside the round loop: dsbb_0(io) */
188 1.1 riastrad .byte 0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0
189 1.1 riastrad .byte 0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
190 1.1 riastrad END(dsbb_0)
191 1.1 riastrad
192 1.1 riastrad .type dsbb_1,_ASM_TYPE_OBJECT
193 1.1 riastrad dsbb_1: /* aes_neon_dec1 q7 inside the round loop: dsbb_1(jo) */
194 1.1 riastrad .byte 0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1
195 1.1 riastrad .byte 0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
196 1.1 riastrad END(dsbb_1)
197 1.1 riastrad
198 1.1 riastrad .type dsbe_0,_ASM_TYPE_OBJECT
199 1.1 riastrad dsbe_0: /* aes_neon_dec1 q8, reloaded every round; dsbe_1 must directly follow (read via one post-incremented pointer) */
200 1.1 riastrad .byte 0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46
201 1.1 riastrad .byte 0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
202 1.1 riastrad END(dsbe_0)
203 1.1 riastrad
204 1.1 riastrad .type dsbe_1,_ASM_TYPE_OBJECT
205 1.1 riastrad dsbe_1: /* aes_neon_dec1 q9; loaded from the address dsbe_0 + 16 */
206 1.1 riastrad .byte 0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C
207 1.1 riastrad .byte 0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
208 1.1 riastrad END(dsbe_1)
209 1.1 riastrad
210 1.1 riastrad .type dsbo_0,_ASM_TYPE_OBJECT
211 1.1 riastrad dsbo_0: /* aes_neon_dec1 q6 after the round loop: dsbo_0(io) */
212 1.1 riastrad .byte 0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13
213 1.1 riastrad .byte 0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
214 1.1 riastrad END(dsbo_0)
215 1.1 riastrad
216 1.1 riastrad .type dsbo_1,_ASM_TYPE_OBJECT
217 1.1 riastrad dsbo_1: /* aes_neon_dec1 q7 after the round loop: dsbo_1(jo) */
218 1.1 riastrad .byte 0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12
219 1.1 riastrad .byte 0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
220 1.1 riastrad END(dsbo_1)
221 1.1 riastrad
222 1.1 riastrad /*
223 1.1 riastrad * aes_neon_enc1(enc, x, nrounds)
224 1.1 riastrad * Encrypts the 16-byte block x, reading nrounds+1 16-byte round keys from enc; assumes nrounds >= 1.
225 1.1 riastrad * With -mfloat-abi=hard:
226 1.1 riastrad *
227 1.1 riastrad * uint8x16_t@q0
228 1.1 riastrad * aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
229 1.1 riastrad * unsigned nrounds@r1)
230 1.1 riastrad *
231 1.6 riastrad * With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
232 1.1 riastrad *
233 1.1 riastrad * uint8x16_t@(r0,r1,r2,r3)
234 1.1 riastrad * aes_neon_enc1(const struct aesenc *enc@r0,
235 1.1 riastrad * uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
236 1.1 riastrad */
237 1.1 riastrad ENTRY(aes_neon_enc1)
238 1.6 riastrad #ifdef __SOFTFP__
239 1.6 riastrad #ifdef __ARM_BIG_ENDIAN
240 1.6 riastrad vmov d0, r3, r2 /* d0 := x lo */
241 1.6 riastrad #else
242 1.1 riastrad vmov d0, r2, r3 /* d0 := x lo */
243 1.6 riastrad #endif
244 1.1 riastrad vldr d1, [sp] /* d1 := x hi */
245 1.1 riastrad ldr r1, [sp, #8] /* r1 := nrounds */
246 1.1 riastrad #endif
247 1.1 riastrad push {r4, r5, r6, r7, r8, r10, r11, lr}
248 1.1 riastrad vpush {d8-d15}
249 1.1 riastrad
250 1.1 riastrad /*
251 1.1 riastrad * r3: rmod4
252 1.1 riastrad * r4: mc_forward
253 1.1 riastrad * r5: mc_backward
254 1.2 riastrad * r6,r7,r8,r10,r11: temporaries; r12: .Lconstants
255 1.1 riastrad * q0={d0-d1}: x/ak/A
256 1.1 riastrad * q1={d2-d3}: 0x0f0f...
257 1.1 riastrad * q2={d4-d5}: lo/k/j/io
258 1.1 riastrad * q3={d6-d7}: hi/i/jo
259 1.1 riastrad * q4={d8-d9}: iptlo
260 1.1 riastrad * q5={d10-d11}: ipthi
261 1.1 riastrad * q6={d12-d13}: sb1[0]/sbo[0]
262 1.1 riastrad * q7={d14-d15}: sb1[1]/sbo[1]
263 1.1 riastrad * q8={d16-d17}: sb2[0]
264 1.1 riastrad * q9={d18-d19}: sb2[1]
265 1.1 riastrad * q10={d20-d21}: inv
266 1.1 riastrad * q11={d22-d23}: inva
267 1.1 riastrad * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_forward[rmod4]
268 1.1 riastrad * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_backward[rmod4]
269 1.1 riastrad * q14={d28-d29}: rk/A2/A2_B_D
270 1.1 riastrad * q15={d30-d31}: A2_B/sr[rmod4]
271 1.1 riastrad */
272 1.1 riastrad
273 1.2 riastrad /* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
274 1.2 riastrad ldr r12, .Lconstants_addr
275 1.2 riastrad adr r11, .Lconstants_addr
276 1.2 riastrad
277 1.5 riastrad vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
278 1.1 riastrad movw r3, #0 /* rmod4 := 0 */
279 1.1 riastrad vmov.i8 q1, #0x0f
280 1.1 riastrad
281 1.2 riastrad /* r12 := .Lconstants */
282 1.2 riastrad add r12, r12, r11
283 1.2 riastrad
284 1.1 riastrad /* (q4, q5) := (iptlo, ipthi) */
285 1.2 riastrad add r6, r12, #(iptlo - .Lconstants)
286 1.2 riastrad add r7, r12, #(ipthi - .Lconstants)
287 1.5 riastrad vld1.8 {d8-d9}, [r6 :128]
288 1.5 riastrad vld1.8 {d10-d11}, [r7 :128]
289 1.1 riastrad
290 1.1 riastrad /* load the rest of the constants */
291 1.2 riastrad add r4, r12, #(sb1_0 - .Lconstants)
292 1.2 riastrad add r5, r12, #(sb1_1 - .Lconstants)
293 1.2 riastrad add r6, r12, #(sb2_0 - .Lconstants)
294 1.2 riastrad add r7, r12, #(sb2_1 - .Lconstants)
295 1.2 riastrad add r8, r12, #(inv - .Lconstants)
296 1.2 riastrad add r10, r12, #(inva - .Lconstants)
297 1.5 riastrad vld1.8 {d12-d13}, [r4 :128] /* q6 = sb1[0] */
298 1.5 riastrad vld1.8 {d14-d15}, [r5 :128] /* q7 = sb1[1] */
299 1.5 riastrad vld1.8 {d16-d17}, [r6 :128] /* q8 = sb2[0] */
300 1.5 riastrad vld1.8 {d18-d19}, [r7 :128] /* q9 = sb2[1] */
301 1.5 riastrad vld1.8 {d20-d21}, [r8 :128] /* q10 = inv */
302 1.5 riastrad vld1.8 {d22-d23}, [r10 :128] /* q11 = inva */
303 1.1 riastrad
304 1.1 riastrad /* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
305 1.2 riastrad add r4, r12, #(mc_forward - .Lconstants)
306 1.2 riastrad add r5, r12, #(mc_backward - .Lconstants)
307 1.1 riastrad
308 1.1 riastrad /* (q2, q3) := (lo, hi) */
309 1.1 riastrad vshr.u8 q3, q0, #4
310 1.1 riastrad vand q2, q0, q1 /* q2 := x & 0x0f0f... */
311 1.1 riastrad vand q3, q3, q1 /* q3 := (x >> 4) & 0x0f0f... */
312 1.1 riastrad
313 1.1 riastrad /* (q2, q3) := (iptlo(lo), ipthi(hi)) */
314 1.1 riastrad vtbl.8 d4, {d8-d9}, d4
315 1.1 riastrad vtbl.8 d5, {d8-d9}, d5
316 1.1 riastrad vtbl.8 d6, {d10-d11}, d6
317 1.1 riastrad vtbl.8 d7, {d10-d11}, d7
318 1.1 riastrad
319 1.1 riastrad /* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
320 1.1 riastrad veor q0, q14, q2
321 1.1 riastrad veor q0, q0, q3
322 1.1 riastrad
323 1.1 riastrad b 2f /* the first pass enters the loop at SubBytes */
324 1.1 riastrad
325 1.3 riastrad _ALIGN_TEXT
326 1.5 riastrad 1: vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
327 1.1 riastrad
328 1.1 riastrad /* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
329 1.1 riastrad vtbl.8 d24, {d12-d13}, d4
330 1.1 riastrad vtbl.8 d25, {d12-d13}, d5
331 1.1 riastrad vtbl.8 d26, {d14-d15}, d6
332 1.1 riastrad vtbl.8 d27, {d14-d15}, d7
333 1.1 riastrad veor q0, q14, q12
334 1.1 riastrad veor q0, q0, q13
335 1.1 riastrad
336 1.1 riastrad /* q14 := A2 = sb2_0[io] + sb2_1[jo] */
337 1.1 riastrad vtbl.8 d24, {d16-d17}, d4
338 1.1 riastrad vtbl.8 d25, {d16-d17}, d5
339 1.1 riastrad vtbl.8 d26, {d18-d19}, d6
340 1.1 riastrad vtbl.8 d27, {d18-d19}, d7
341 1.1 riastrad veor q14, q12, q13
342 1.1 riastrad
343 1.1 riastrad /* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
344 1.1 riastrad add r6, r4, r3, lsl #4
345 1.1 riastrad add r7, r5, r3, lsl #4
346 1.5 riastrad vld1.8 {d24-d25}, [r6]
347 1.5 riastrad vld1.8 {d26-d27}, [r7]
348 1.1 riastrad
349 1.1 riastrad /* q15 := A2_B = A2 + A(mcf) */
350 1.1 riastrad vtbl.8 d30, {d0-d1}, d24
351 1.1 riastrad vtbl.8 d31, {d0-d1}, d25
352 1.1 riastrad veor q15, q15, q14
353 1.1 riastrad
354 1.1 riastrad /* q14 := A2_B_D = A2_B + A(mcb) */
355 1.1 riastrad vtbl.8 d28, {d0-d1}, d26
356 1.1 riastrad vtbl.8 d29, {d0-d1}, d27
357 1.1 riastrad veor q14, q14, q15
358 1.1 riastrad
359 1.1 riastrad /* q0 := x = A2_B_D + A2_B(mcf) */
360 1.1 riastrad vtbl.8 d0, {d30-d31}, d24
361 1.1 riastrad vtbl.8 d1, {d30-d31}, d25
362 1.1 riastrad veor q0, q0, q14
363 1.1 riastrad
364 1.1 riastrad 2: /*
365 1.1 riastrad * SubBytes
366 1.1 riastrad */
367 1.1 riastrad
368 1.1 riastrad /* (q2, q3) := (k, i) */
369 1.1 riastrad vshr.u8 q3, q0, #4
370 1.1 riastrad vand q2, q0, q1 /* q2 := x & 0x0f0f... */
371 1.1 riastrad vand q3, q3, q1 /* q3 := (x >> 4) & 0x0f0f... */
372 1.1 riastrad
373 1.1 riastrad /* q0 := a/k */
374 1.1 riastrad vtbl.8 d0, {d22-d23}, d4
375 1.1 riastrad vtbl.8 d1, {d22-d23}, d5
376 1.1 riastrad
377 1.1 riastrad /* q2 := j = i + k */
378 1.1 riastrad veor q2, q3, q2
379 1.1 riastrad
380 1.1 riastrad /* q12 := ir = 1/i */
381 1.1 riastrad vtbl.8 d24, {d20-d21}, d6
382 1.1 riastrad vtbl.8 d25, {d20-d21}, d7
383 1.1 riastrad
384 1.1 riastrad /* q13 := jr = 1/j */
385 1.1 riastrad vtbl.8 d26, {d20-d21}, d4
386 1.1 riastrad vtbl.8 d27, {d20-d21}, d5
387 1.1 riastrad
388 1.1 riastrad /* q12 := iak = 1/i + a/k */
389 1.1 riastrad veor q12, q12, q0
390 1.1 riastrad
391 1.1 riastrad /* q13 := jak = 1/j + a/k */
392 1.1 riastrad veor q13, q13, q0
393 1.1 riastrad
394 1.1 riastrad /* q12 := iakr = 1/(1/i + a/k) */
395 1.1 riastrad vtbl.8 d24, {d20-d21}, d24
396 1.1 riastrad vtbl.8 d25, {d20-d21}, d25
397 1.1 riastrad
398 1.1 riastrad /* q13 := jakr = 1/(1/j + a/k) */
399 1.1 riastrad vtbl.8 d26, {d20-d21}, d26
400 1.1 riastrad vtbl.8 d27, {d20-d21}, d27
401 1.1 riastrad
402 1.1 riastrad /* q2 := io = j + 1/(1/i + a/k) */
403 1.1 riastrad veor q2, q2, q12
404 1.1 riastrad
405 1.1 riastrad /* q3 := jo = i + 1/(1/j + a/k) */
406 1.1 riastrad veor q3, q3, q13
407 1.1 riastrad
408 1.1 riastrad /* advance round */
409 1.1 riastrad add r3, r3, #1
410 1.1 riastrad subs r1, r1, #1 /* Z is set when r1 reaches 0 */
411 1.1 riastrad and r3, r3, #3 /* rmod4 := (rmod4 + 1) % 4; non-flag-setting, so bne still tests the subs */
412 1.1 riastrad bne 1b
413 1.1 riastrad
414 1.1 riastrad /* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
415 1.2 riastrad add r8, r12, #(sr - .Lconstants)
416 1.2 riastrad add r6, r12, #(sbo_0 - .Lconstants)
417 1.2 riastrad add r7, r12, #(sbo_1 - .Lconstants)
418 1.1 riastrad add r8, r8, r3, lsl #4 /* r8 := &sr[rmod4], 16 bytes per entry */
419 1.5 riastrad vld1.8 {d12-d13}, [r6 :128]
420 1.5 riastrad vld1.8 {d14-d15}, [r7 :128]
421 1.5 riastrad vld1.8 {d30-d31}, [r8 :128]
422 1.1 riastrad
423 1.5 riastrad vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
424 1.1 riastrad
425 1.1 riastrad /* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
426 1.1 riastrad vtbl.8 d4, {d12-d13}, d4
427 1.1 riastrad vtbl.8 d5, {d12-d13}, d5
428 1.1 riastrad vtbl.8 d6, {d14-d15}, d6
429 1.1 riastrad vtbl.8 d7, {d14-d15}, d7
430 1.1 riastrad
431 1.1 riastrad /* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
432 1.1 riastrad veor q2, q2, q14
433 1.1 riastrad veor q2, q2, q3
434 1.1 riastrad
435 1.1 riastrad /* q0 := x(sr[rmod4]) */
436 1.1 riastrad vtbl.8 d0, {d4-d5}, d30
437 1.1 riastrad vtbl.8 d1, {d4-d5}, d31
438 1.1 riastrad
439 1.1 riastrad vpop {d8-d15}
440 1.1 riastrad pop {r4, r5, r6, r7, r8, r10, r11, lr}
441 1.6 riastrad #ifdef __SOFTFP__
442 1.6 riastrad #ifdef __ARM_BIG_ENDIAN
443 1.6 riastrad vmov r1, r0, d0 /* (r0,r1) := result lo */
444 1.6 riastrad vmov r3, r2, d1 /* (r2,r3) := result hi */
445 1.6 riastrad #else
446 1.1 riastrad vmov r0, r1, d0 /* (r0,r1) := result lo */
447 1.1 riastrad vmov r2, r3, d1 /* (r2,r3) := result hi */
448 1.1 riastrad #endif
449 1.6 riastrad #endif
450 1.1 riastrad bx lr
451 1.1 riastrad END(aes_neon_enc1)
452 1.1 riastrad
453 1.1 riastrad /*
454 1.1 riastrad * aes_neon_dec1(dec, x, nrounds)
455 1.1 riastrad * Decrypts the 16-byte block x, reading nrounds+1 16-byte round keys from dec; assumes nrounds >= 1.
456 1.1 riastrad * With -mfloat-abi=hard:
457 1.1 riastrad *
458 1.1 riastrad * uint8x16_t@q0
459 1.1 riastrad * aes_neon_dec1(const struct aesdec *dec@r0, uint8x16_t x@q0,
460 1.1 riastrad * unsigned nrounds@r1)
461 1.1 riastrad *
462 1.1 riastrad * With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
463 1.1 riastrad *
464 1.1 riastrad * uint8x16_t@(r0,r1,r2,r3)
465 1.1 riastrad * aes_neon_dec1(const struct aesdec *dec@r0,
466 1.1 riastrad * uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
467 1.1 riastrad */
468 1.1 riastrad ENTRY(aes_neon_dec1)
469 1.6 riastrad #ifdef __SOFTFP__
470 1.6 riastrad #ifdef __ARM_BIG_ENDIAN
471 1.6 riastrad vmov d0, r3, r2 /* d0 := x lo */
472 1.6 riastrad #else
473 1.1 riastrad vmov d0, r2, r3 /* d0 := x lo */
474 1.6 riastrad #endif
475 1.1 riastrad vldr d1, [sp] /* d1 := x hi */
476 1.1 riastrad ldr r1, [sp, #8] /* r1 := nrounds */
477 1.1 riastrad #endif
478 1.1 riastrad push {r4, r5, r6, r7, r8, r10, r11, lr}
479 1.1 riastrad vpush {d8-d15}
480 1.1 riastrad
481 1.1 riastrad /*
482 1.1 riastrad * r3: 3 & ~(nrounds - 1)
483 1.1 riastrad * q0={d0-d1}: x/ak
484 1.1 riastrad * q1={d2-d3}: 0x0f0f...
485 1.1 riastrad * q2={d4-d5}: lo/k/j/io
486 1.1 riastrad * q3={d6-d7}: hi/i/jo
487 1.1 riastrad * q4={d8-d9}: diptlo/dsb9[0]
488 1.1 riastrad * q5={d10-d11}: dipthi/dsb9[1]
489 1.1 riastrad * q6={d12-d13}: dsbb[0]/dsbo[0]
490 1.1 riastrad * q7={d14-d15}: dsbb[1]/dsbo[1]
491 1.1 riastrad * q8={d16-d17}: dsbd[0]/dsbe[0]
492 1.1 riastrad * q9={d18-d19}: dsbd[1]/dsbe[1]
493 1.1 riastrad * q10={d20-d21}: inv
494 1.1 riastrad * q11={d22-d23}: inva
495 1.1 riastrad * q12={d24-d25}: ir/iak/iakr/dsbX_0(io)
496 1.1 riastrad * q13={d26-d27}: jr/jak/jakr/dsbX_1(jo)
497 1.1 riastrad * q14={d28-d29}: rk/xmc
498 1.1 riastrad * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
499 1.1 riastrad */
500 1.1 riastrad
501 1.2 riastrad /* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
502 1.2 riastrad ldr r12, .Lconstants_addr
503 1.2 riastrad adr r11, .Lconstants_addr
504 1.2 riastrad
505 1.5 riastrad vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
506 1.1 riastrad rsb r3, r1, #0 /* r3 := ~(x - 1) = -x */
507 1.1 riastrad vmov.i8 q1, #0x0f
508 1.1 riastrad and r3, r3, #3 /* r3 := 3 & ~(x - 1) */
509 1.1 riastrad
510 1.2 riastrad /* r12 := .Lconstants */
511 1.2 riastrad add r12, r12, r11
512 1.2 riastrad
513 1.1 riastrad /* (q4, q5) := (diptlo, dipthi) */
514 1.2 riastrad add r6, r12, #(diptlo - .Lconstants)
515 1.2 riastrad add r7, r12, #(dipthi - .Lconstants)
516 1.5 riastrad vld1.8 {d8-d9}, [r6 :128]
517 1.5 riastrad vld1.8 {d10-d11}, [r7 :128]
518 1.1 riastrad
519 1.1 riastrad /* load the rest of the constants */
520 1.2 riastrad add r4, r12, #(dsbb_0 - .Lconstants)
521 1.2 riastrad add r5, r12, #(dsbb_1 - .Lconstants)
522 1.2 riastrad add r6, r12, #(inv - .Lconstants)
523 1.2 riastrad add r7, r12, #(inva - .Lconstants)
524 1.2 riastrad add r8, r12, #(.Lmc_forward_3 - .Lconstants)
525 1.5 riastrad vld1.8 {d12-d13}, [r4 :128] /* q6 := dsbb[0] */
526 1.5 riastrad vld1.8 {d14-d15}, [r5 :128] /* q7 := dsbb[1] */
527 1.5 riastrad vld1.8 {d20-d21}, [r6 :128] /* q10 := inv */
528 1.5 riastrad vld1.8 {d22-d23}, [r7 :128] /* q11 := inva */
529 1.5 riastrad vld1.8 {d30-d31}, [r8 :128] /* q15 := mc_forward[3] */
530 1.1 riastrad
531 1.1 riastrad /* (q2, q3) := (lo, hi) */
532 1.1 riastrad vshr.u8 q3, q0, #4
533 1.1 riastrad vand q2, q0, q1 /* q2 := x & 0x0f0f... */
534 1.1 riastrad vand q3, q3, q1 /* q3 := (x >> 4) & 0x0f0f... */
535 1.1 riastrad
536 1.1 riastrad /* (q2, q3) := (diptlo(lo), dipthi(hi)) */
537 1.1 riastrad vtbl.8 d4, {d8-d9}, d4
538 1.1 riastrad vtbl.8 d5, {d8-d9}, d5
539 1.1 riastrad vtbl.8 d6, {d10-d11}, d6
540 1.1 riastrad vtbl.8 d7, {d10-d11}, d7
541 1.1 riastrad
542 1.1 riastrad /* load dsb9 */
543 1.2 riastrad add r4, r12, #(dsb9_0 - .Lconstants)
544 1.2 riastrad add r5, r12, #(dsb9_1 - .Lconstants)
545 1.5 riastrad vld1.8 {d8-d9}, [r4 :128] /* q4 := dsb9[0] */
546 1.5 riastrad vld1.8 {d10-d11}, [r5 :128] /* q5 := dsb9[1] */
547 1.1 riastrad
548 1.1 riastrad /* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
549 1.1 riastrad veor q0, q14, q2
550 1.1 riastrad veor q0, q0, q3
551 1.1 riastrad
552 1.1 riastrad b 2f /* the first pass enters the loop at SubBytes */
553 1.1 riastrad
554 1.3 riastrad _ALIGN_TEXT
555 1.1 riastrad 1: /* load dsbd; relies on dsbd_1 directly following dsbd_0 in memory */
556 1.2 riastrad add r4, r12, #(dsbd_0 - .Lconstants)
557 1.5 riastrad vld1.8 {d16-d17}, [r4 :128]! /* q8 := dsbd[0] */
558 1.5 riastrad vld1.8 {d18-d19}, [r4 :128] /* q9 := dsbd[1] */
559 1.1 riastrad
560 1.5 riastrad vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
561 1.1 riastrad
562 1.1 riastrad /* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
563 1.1 riastrad vtbl.8 d24, {d8-d9}, d4
564 1.1 riastrad vtbl.8 d25, {d8-d9}, d5
565 1.1 riastrad vtbl.8 d26, {d10-d11}, d6
566 1.1 riastrad vtbl.8 d27, {d10-d11}, d7
567 1.1 riastrad veor q0, q14, q12
568 1.1 riastrad veor q0, q0, q13
569 1.1 riastrad
570 1.1 riastrad /* q14 := x(mc) */
571 1.1 riastrad vtbl.8 d28, {d0-d1}, d30
572 1.1 riastrad vtbl.8 d29, {d0-d1}, d31
573 1.1 riastrad
574 1.1 riastrad /* q0 := x(mc) + dsbd_0(io) + dsbd_1(jo) */
575 1.1 riastrad vtbl.8 d24, {d16-d17}, d4
576 1.1 riastrad vtbl.8 d25, {d16-d17}, d5
577 1.1 riastrad vtbl.8 d26, {d18-d19}, d6
578 1.1 riastrad vtbl.8 d27, {d18-d19}, d7
579 1.1 riastrad veor q0, q14, q12
580 1.1 riastrad veor q0, q0, q13
581 1.1 riastrad
582 1.1 riastrad /* load dsbe; relies on dsbe_1 directly following dsbe_0 in memory */
583 1.2 riastrad add r4, r12, #(dsbe_0 - .Lconstants)
584 1.5 riastrad vld1.8 {d16-d17}, [r4 :128]! /* q8 := dsbe[0] */
585 1.5 riastrad vld1.8 {d18-d19}, [r4 :128] /* q9 := dsbe[1] */
586 1.1 riastrad
587 1.1 riastrad /* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
588 1.1 riastrad vtbl.8 d28, {d0-d1}, d30
589 1.1 riastrad vtbl.8 d29, {d0-d1}, d31
590 1.1 riastrad vtbl.8 d24, {d12-d13}, d4
591 1.1 riastrad vtbl.8 d25, {d12-d13}, d5
592 1.1 riastrad vtbl.8 d26, {d14-d15}, d6
593 1.1 riastrad vtbl.8 d27, {d14-d15}, d7
594 1.1 riastrad veor q0, q14, q12
595 1.1 riastrad veor q0, q0, q13
596 1.1 riastrad
597 1.1 riastrad /* q0 := x(mc) + dsbe_0(io) + dsbe_1(jo) */
598 1.1 riastrad vtbl.8 d28, {d0-d1}, d30
599 1.1 riastrad vtbl.8 d29, {d0-d1}, d31
600 1.1 riastrad vtbl.8 d24, {d16-d17}, d4
601 1.1 riastrad vtbl.8 d25, {d16-d17}, d5
602 1.1 riastrad vtbl.8 d26, {d18-d19}, d6
603 1.1 riastrad vtbl.8 d27, {d18-d19}, d7
604 1.1 riastrad veor q0, q14, q12
605 1.1 riastrad veor q0, q0, q13
606 1.1 riastrad
607 1.1 riastrad /* q15 := mc := mc <<< 12*8 */
608 1.1 riastrad vext.8 q15, q15, q15, #12
609 1.1 riastrad
610 1.1 riastrad 2: /*
611 1.1 riastrad * SubBytes
612 1.1 riastrad */
613 1.1 riastrad
614 1.1 riastrad /* (q2, q3) := (k, i) */
615 1.1 riastrad vshr.u8 q3, q0, #4
616 1.1 riastrad vand q2, q0, q1 /* q2 := x & 0x0f0f... */
617 1.1 riastrad vand q3, q3, q1 /* q3 := (x >> 4) & 0x0f0f... */
618 1.1 riastrad
619 1.1 riastrad /* q0 := a/k */
620 1.1 riastrad vtbl.8 d0, {d22-d23}, d4
621 1.1 riastrad vtbl.8 d1, {d22-d23}, d5
622 1.1 riastrad
623 1.1 riastrad /* q2 := j = i + k */
624 1.1 riastrad veor q2, q3, q2
625 1.1 riastrad
626 1.1 riastrad /* q12 := ir = 1/i */
627 1.1 riastrad vtbl.8 d24, {d20-d21}, d6
628 1.1 riastrad vtbl.8 d25, {d20-d21}, d7
629 1.1 riastrad
630 1.1 riastrad /* q13 := jr = 1/j */
631 1.1 riastrad vtbl.8 d26, {d20-d21}, d4
632 1.1 riastrad vtbl.8 d27, {d20-d21}, d5
633 1.1 riastrad
634 1.1 riastrad /* q12 := iak = 1/i + a/k */
635 1.1 riastrad veor q12, q12, q0
636 1.1 riastrad
637 1.1 riastrad /* q13 := jak = 1/j + a/k */
638 1.1 riastrad veor q13, q13, q0
639 1.1 riastrad
640 1.1 riastrad /* q12 := iakr = 1/(1/i + a/k) */
641 1.1 riastrad vtbl.8 d24, {d20-d21}, d24
642 1.1 riastrad vtbl.8 d25, {d20-d21}, d25
643 1.1 riastrad
644 1.1 riastrad /* q13 := jakr = 1/(1/j + a/k) */
645 1.1 riastrad vtbl.8 d26, {d20-d21}, d26
646 1.1 riastrad vtbl.8 d27, {d20-d21}, d27
647 1.1 riastrad
648 1.1 riastrad /* q2 := io = j + 1/(1/i + a/k) */
649 1.1 riastrad veor q2, q2, q12
650 1.1 riastrad
651 1.1 riastrad /* q3 := jo = i + 1/(1/j + a/k) */
652 1.1 riastrad veor q3, q3, q13
653 1.1 riastrad
654 1.1 riastrad /* advance round */
655 1.1 riastrad subs r1, r1, #1
656 1.1 riastrad bne 1b
657 1.1 riastrad
658 1.1 riastrad /* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[i]) */
659 1.2 riastrad add r8, r12, #(sr - .Lconstants)
660 1.2 riastrad add r6, r12, #(dsbo_0 - .Lconstants)
661 1.2 riastrad add r7, r12, #(dsbo_1 - .Lconstants)
662 1.1 riastrad add r8, r8, r3, lsl #4 /* r8 := &sr[r3], r3 = 3 & -nrounds computed at entry */
663 1.5 riastrad vld1.8 {d12-d13}, [r6 :128]
664 1.5 riastrad vld1.8 {d14-d15}, [r7 :128]
665 1.5 riastrad vld1.8 {d30-d31}, [r8 :128]
666 1.1 riastrad
667 1.5 riastrad vld1.8 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
668 1.1 riastrad
669 1.1 riastrad /* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
670 1.1 riastrad vtbl.8 d4, {d12-d13}, d4
671 1.1 riastrad vtbl.8 d5, {d12-d13}, d5
672 1.1 riastrad vtbl.8 d6, {d14-d15}, d6
673 1.1 riastrad vtbl.8 d7, {d14-d15}, d7
674 1.1 riastrad
675 1.1 riastrad /* q2 := x = rk[nr] + dsbo_0(io) + dsbo_1(jo) */
676 1.1 riastrad veor q2, q2, q14
677 1.1 riastrad veor q2, q2, q3
678 1.1 riastrad
679 1.1 riastrad /* q0 := x(sr[i]) */
680 1.1 riastrad vtbl.8 d0, {d4-d5}, d30
681 1.1 riastrad vtbl.8 d1, {d4-d5}, d31
682 1.1 riastrad
683 1.1 riastrad vpop {d8-d15}
684 1.1 riastrad pop {r4, r5, r6, r7, r8, r10, r11, lr}
685 1.6 riastrad #ifdef __SOFTFP__
686 1.6 riastrad #ifdef __ARM_BIG_ENDIAN
687 1.6 riastrad vmov r1, r0, d0 /* (r0,r1) := result lo */
688 1.6 riastrad vmov r3, r2, d1 /* (r2,r3) := result hi */
689 1.6 riastrad #else
690 1.1 riastrad vmov r0, r1, d0 /* (r0,r1) := result lo */
691 1.1 riastrad vmov r2, r3, d1 /* (r2,r3) := result hi */
692 1.1 riastrad #endif
693 1.6 riastrad #endif
694 1.1 riastrad bx lr
695 1.1 riastrad END(aes_neon_dec1)
696