/*	$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <arm/asm.h>

RCSID("$NetBSD: aes_neon_32.S,v 1.7 2020/09/10 11:29:02 riastradh Exp $")

	.fpu	neon

	.text
	.p2align 2
.Lconstants_addr:
	.long	.Lconstants - .

	.section .rodata
	.p2align 5
.Lconstants:

.Linv_inva:	/* inv and inva must be consecutive */
	.type	inv,_ASM_TYPE_OBJECT
inv:
	.byte	0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
	.byte	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
END(inv)

	.type	inva,_ASM_TYPE_OBJECT
inva:
	.byte	0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
	.byte	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
END(inva)

	.type	mc_forward,_ASM_TYPE_OBJECT
mc_forward:
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 */
	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C

	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 */
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00

	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 */
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04

.Lmc_forward_3:
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 */
	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
END(mc_forward)

	.type	mc_backward,_ASM_TYPE_OBJECT
mc_backward:
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 */
	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E

	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 */
	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A

	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 */
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06

	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 */
	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
END(mc_backward)

	.type	sr,_ASM_TYPE_OBJECT
sr:
	.byte	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07	/* 0 */
	.byte	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F

	.byte	0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03	/* 1 */
	.byte	0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B

	.byte	0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F	/* 2 */
	.byte	0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07

	.byte	0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B	/* 3 */
	.byte	0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
END(sr)

	.type	ipt,_ASM_TYPE_OBJECT
ipt:
	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2	/* lo */
	.byte	0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C /* hi */
	.byte	0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
END(ipt)

	.type	sb1,_ASM_TYPE_OBJECT
sb1:
	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1 /* 0 */
	.byte	0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36 /* 1 */
	.byte	0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
END(sb1)

	.type	sb2,_ASM_TYPE_OBJECT
sb2:
	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2 /* 0 */
	.byte	0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69 /* 1 */
	.byte	0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
END(sb2)

	.type	sbo,_ASM_TYPE_OBJECT
sbo:
	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0 /* 0 */
	.byte	0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF /* 1 */
	.byte	0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
END(sbo)

	.type	dipt,_ASM_TYPE_OBJECT
dipt:
	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F	/* lo */
	.byte	0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86	/* hi */
	.byte	0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
END(dipt)

	.type	dsb9,_ASM_TYPE_OBJECT
dsb9:
	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85	/* 0 */
	.byte	0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0	/* 1 */
	.byte	0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
END(dsb9)

	.type	dsbd,_ASM_TYPE_OBJECT
dsbd:
	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D	/* 0 */
	.byte	0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C	/* 1 */
	.byte	0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
END(dsbd)

	.type	dsbb,_ASM_TYPE_OBJECT
dsbb:
	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0	/* 0 */
	.byte	0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
	.byte	0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1	/* 1 */
	.byte	0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
END(dsbb)

	.type	dsbe,_ASM_TYPE_OBJECT
dsbe:
	.byte	0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46	/* 0 */
	.byte	0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
	.byte	0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C	/* 1 */
	.byte	0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
END(dsbe)

	.type	dsbo,_ASM_TYPE_OBJECT
dsbo:
	.byte	0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13	/* 0 */
	.byte	0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
	.byte	0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12	/* 1 */
	.byte	0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
END(dsbo)

/*
 * aes_neon_enc1(enc, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 * uint8x16_t@q0
 * aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
 *     unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 * uint8x16_t@(r0,r1,r2,r3)
 * aes_neon_enc1(const struct aesenc *enc@r0,
 *     uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
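/*
 * For illustration, a minimal C-level sketch of the hard-float calling
 * convention described above, with a hypothetical caller; the wrapper
 * name encrypt_block and the choice of 10 rounds (AES-128) are
 * examples only, not part of this file's interface:
 *
 *	uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t, unsigned);
 *
 *	uint8x16_t
 *	encrypt_block(const struct aesenc *enc, uint8x16_t x)
 *	{
 *		return aes_neon_enc1(enc, x, 10);
 *	}
 */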
ENTRY(aes_neon_enc1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r7, r8, r10, r11, lr}
	vpush	{d8-d15}

	/*
	 * r3: rmod4
	 * r4: mc_forward
	 * r5: mc_backward
	 * r6,r7,r8,r10,r11,r12: temporaries
	 * q0={d0-d1}: x/ak/A
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: iptlo
	 * q5={d10-d11}: ipthi
	 * q6={d12-d13}: sb1[0]/sbo[0]
	 * q7={d14-d15}: sb1[1]/sbo[1]
	 * q8={d16-d17}: sb2[0]
	 * q9={d18-d19}: sb2[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_forward[rmod4]
	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_backward[rmod4]
	 * q14={d28-d29}: rk/A2/A2_B_D
	 * q15={d30-d31}: A2_B/sr[rmod4]
	 */
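	/*
	 * Rough sketch of the flow below, in C-like pseudocode; lookups
	 * and shuffles are per-byte (vtbl), subbytes_nibbles stands for
	 * the shared code at 2: that produces the (io, jo) indices, and
	 * shuffle is only a stand-in name for a vtbl permutation:
	 *
	 *	x = rk[0] ^ iptlo(x & 0x0f) ^ ipthi(x >> 4);
	 *	for (i = 1; i < nrounds; i++) {
	 *		(io, jo) = subbytes_nibbles(x);
	 *		A = rk[i] ^ sb1_0(io) ^ sb1_1(jo);
	 *		A2 = sb2_0(io) ^ sb2_1(jo);
	 *		A2_B = A2 ^ shuffle(A, mc_forward[i % 4]);
	 *		A2_B_D = A2_B ^ shuffle(A, mc_backward[i % 4]);
	 *		x = A2_B_D ^ shuffle(A2_B, mc_forward[i % 4]);
	 *	}
	 *	(io, jo) = subbytes_nibbles(x);
	 *	x = rk[nrounds] ^ sbo_0(io) ^ sbo_1(jo);
	 *	return shuffle(x, sr[nrounds % 4]);
	 */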

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr	r12, .Lconstants_addr
	adr	r11, .Lconstants_addr

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
	movw	r3, #0
	vmov.i8	q1, #0x0f

	/* r12 := .Lconstants */
	add	r12, r12, r11

	/* (q4, q5) := (iptlo, ipthi) */
	add	r6, r12, #(ipt - .Lconstants)
	vld1.8	{q4-q5}, [r6 :256]

	/* load the rest of the constants */
	add	r4, r12, #(sb1 - .Lconstants)
	add	r6, r12, #(sb2 - .Lconstants)
	add	r8, r12, #(.Linv_inva - .Lconstants)
	vld1.8	{q6-q7}, [r4 :256]	/* q6 = sb1[0], q7 = sb1[1] */
	vld1.8	{q8-q9}, [r6 :256]	/* q8 = sb2[0], q9 = sb2[1] */
	vld1.8	{q10-q11}, [r8 :256]	/* q10 = inv, q11 = inva */

	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
	add	r4, r12, #(mc_forward - .Lconstants)
	add	r5, r12, #(mc_backward - .Lconstants)

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (iptlo(lo), ipthi(hi)) */
	vtbl.8	d4, {d8-d9}, d4
	vtbl.8	d5, {d8-d9}, d5
	vtbl.8	d6, {d10-d11}, d6
	vtbl.8	d7, {d10-d11}, d7

	/* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
	vtbl.8	d24, {d12-d13}, d4
	vtbl.8	d25, {d12-d13}, d5
	vtbl.8	d26, {d14-d15}, d6
	vtbl.8	d27, {d14-d15}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := A2 = sb2_0[io] + sb2_1[jo] */
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q14, q12, q13

	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
	add	r6, r4, r3, lsl #4
	add	r7, r5, r3, lsl #4
	vld1.8	{q12}, [r6 :128]
	vld1.8	{q13}, [r7 :128]

	/* q15 := A2_B = A2 + A(mcf) */
	vtbl.8	d30, {d0-d1}, d24
	vtbl.8	d31, {d0-d1}, d25
	veor	q15, q15, q14

	/* q14 := A2_B_D = A2_B + A(mcb) */
	vtbl.8	d28, {d0-d1}, d26
	vtbl.8	d29, {d0-d1}, d27
	veor	q14, q14, q15

	/* q0 := x = A2_B_D + A2_B(mcf) */
	vtbl.8	d0, {d30-d31}, d24
	vtbl.8	d1, {d30-d31}, d25
	veor	q0, q0, q14

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {d22-d23}, d4
	vtbl.8	d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {d20-d21}, d6
	vtbl.8	d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {d20-d21}, d4
	vtbl.8	d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {d20-d21}, d24
	vtbl.8	d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {d20-d21}, d26
	vtbl.8	d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	add	r3, r3, #1
	subs	r1, r1, #1
	and	r3, r3, #3
	bne	1b

	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
	add	r8, r12, #(sr - .Lconstants)
	add	r6, r12, #(sbo - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{q6-q7}, [r6 :256]
	vld1.8	{q15}, [r8 :128]

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
	vtbl.8	d4, {d12-d13}, d4
	vtbl.8	d5, {d12-d13}, d5
	vtbl.8	d6, {d14-d15}, d6
	vtbl.8	d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[rmod4]) */
	vtbl.8	d0, {d4-d5}, d30
	vtbl.8	d1, {d4-d5}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_enc1)

/*
 * aes_neon_dec1(dec, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 * uint8x16_t@q0
 * aes_neon_dec1(const struct aesdec *dec@r0, uint8x16_t x@q0,
 *     unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 * uint8x16_t@(r0,r1,r2,r3)
 * aes_neon_dec1(const struct aesdec *dec@r0,
 *     uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
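/*
 * As with aes_neon_enc1 above, an illustrative C-level sketch of the
 * hard-float calling convention; the wrapper name decrypt_block and
 * the choice of 14 rounds (AES-256) are examples only:
 *
 *	uint8x16_t aes_neon_dec1(const struct aesdec *, uint8x16_t, unsigned);
 *
 *	uint8x16_t
 *	decrypt_block(const struct aesdec *dec, uint8x16_t x)
 *	{
 *		return aes_neon_dec1(dec, x, 14);
 *	}
 */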
ENTRY(aes_neon_dec1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r7, r8, r10, r11, lr}
	vpush	{d8-d15}

	/*
	 * r3: 3 & ~(nrounds - 1)
	 * q0={d0-d1}: x/ak
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: diptlo/dsb9[0]
	 * q5={d10-d11}: dipthi/dsb9[1]
	 * q6={d12-d13}: dsbb[0]/dsbo[0]
	 * q7={d14-d15}: dsbb[1]/dsbo[1]
	 * q8={d16-d17}: dsbd[0]/dsbe[0]
	 * q9={d18-d19}: dsbd[1]/dsbe[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/dsbX_0(io)
	 * q13={d26-d27}: jr/jak/jakr/dsbX_1(jo)
	 * q14={d28-d29}: rk/xmc
	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
	 */
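	/*
	 * Rough sketch of the flow below, in C-like pseudocode; lookups
	 * and shuffles are per-byte, mc starts as mc_forward[3],
	 * subbytes_nibbles stands for the shared code at 2:, and
	 * rotate_bytes stands for the vext.8 #12 step:
	 *
	 *	x = rk[0] ^ diptlo(x & 0x0f) ^ dipthi(x >> 4);
	 *	for (i = 1; i < nrounds; i++) {
	 *		(io, jo) = subbytes_nibbles(x);
	 *		x = rk[i] ^ dsb9_0(io) ^ dsb9_1(jo);
	 *		x = shuffle(x, mc) ^ dsbd_0(io) ^ dsbd_1(jo);
	 *		x = shuffle(x, mc) ^ dsbb_0(io) ^ dsbb_1(jo);
	 *		x = shuffle(x, mc) ^ dsbe_0(io) ^ dsbe_1(jo);
	 *		mc = rotate_bytes(mc, 12);
	 *	}
	 *	(io, jo) = subbytes_nibbles(x);
	 *	x = rk[nrounds] ^ dsbo_0(io) ^ dsbo_1(jo);
	 *	return shuffle(x, sr[3 & ~(nrounds - 1)]);
	 */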

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr	r12, .Lconstants_addr
	adr	r11, .Lconstants_addr

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */
	rsb	r3, r1, #0		/* r3 := ~(x - 1) = -x */
	vmov.i8	q1, #0x0f
	and	r3, r3, #3		/* r3 := 3 & ~(x - 1) */

	/* r12 := .Lconstants */
	add	r12, r12, r11

	/* (q4, q5) := (diptlo, dipthi) */
	add	r6, r12, #(dipt - .Lconstants)
	vld1.8	{q4-q5}, [r6 :256]

	/* load the rest of the constants */
	add	r4, r12, #(dsbb - .Lconstants)
	add	r6, r12, #(.Linv_inva - .Lconstants)
	add	r8, r12, #(.Lmc_forward_3 - .Lconstants)
	vld1.8	{q6-q7}, [r4 :256]	/* q6 := dsbb[0], q7 := dsbb[1] */
	vld1.8	{q10-q11}, [r6 :256]	/* q10 := inv, q11 := inva */
	vld1.8	{q15}, [r8 :128]	/* q15 := mc_forward[3] */

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (diptlo(lo), dipthi(hi)) */
	vtbl.8	d4, {d8-d9}, d4
	vtbl.8	d5, {d8-d9}, d5
	vtbl.8	d6, {d10-d11}, d6
	vtbl.8	d7, {d10-d11}, d7

	/* load dsb9 */
	add	r4, r12, #(dsb9 - .Lconstants)
	vld1.8	{q4-q5}, [r4 :256]	/* q4 := dsb9[0], q5 := dsb9[1] */

	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	/* load dsbd */
	add	r4, r12, #(dsbd - .Lconstants)
	vld1.8	{q8-q9}, [r4 :256]	/* q8 := dsbd[0], q9 := dsbd[1] */

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
	vtbl.8	d24, {d8-d9}, d4
	vtbl.8	d25, {d8-d9}, d5
	vtbl.8	d26, {d10-d11}, d6
	vtbl.8	d27, {d10-d11}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := x(mc) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31

	/* q0 := x(mc) + dsbd_0(io) + dsbd_1(jo) */
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* load dsbe */
	add	r4, r12, #(dsbe - .Lconstants)
	vld1.8	{q8-q9}, [r4 :256]!	/* q8 := dsbe[0], q9 := dsbe[1] */

	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31
	vtbl.8	d24, {d12-d13}, d4
	vtbl.8	d25, {d12-d13}, d5
	vtbl.8	d26, {d14-d15}, d6
	vtbl.8	d27, {d14-d15}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q0 := x(mc) + dsbe_0(io) + dsbe_1(jo) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q15 := mc := mc <<< 12*8 */
	vext.8	q15, q15, q15, #12

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {d22-d23}, d4
	vtbl.8	d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {d20-d21}, d6
	vtbl.8	d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {d20-d21}, d4
	vtbl.8	d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {d20-d21}, d24
	vtbl.8	d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {d20-d21}, d26
	vtbl.8	d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	subs	r1, r1, #1
	bne	1b

	/* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[i]) */
	add	r8, r12, #(sr - .Lconstants)
	add	r6, r12, #(dsbo - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{q6-q7}, [r6 :256]
	vld1.8	{q15}, [r8 :128]

	vld1.8	{q14}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
	vtbl.8	d4, {d12-d13}, d4
	vtbl.8	d5, {d12-d13}, d5
	vtbl.8	d6, {d14-d15}, d6
	vtbl.8	d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + dsbo_0(io) + dsbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[i]) */
	vtbl.8	d0, {d4-d5}, d30
	vtbl.8	d1, {d4-d5}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_dec1)