/*	$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <arm/asm.h>

RCSID("$NetBSD: aes_neon_32.S,v 1.6 2020/08/16 18:02:03 riastradh Exp $")

	.fpu	neon

	.text
	.p2align 2
.Lconstants_addr:
	.long	.Lconstants - .

	.section .rodata
	.p2align 4
.Lconstants:

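/*
 * Constant tables for the vector-permute implementation below:
 * inv/inva drive the nibble-wise inversion in SubBytes; iptlo/ipthi
 * and diptlo/dipthi are the input transforms for encryption and
 * decryption; the sb and dsb pairs are the split output S-boxes;
 * mc_forward/mc_backward encode the MixColumns byte rotations; and
 * sr holds the iterated ShiftRows permutations.
 */
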
	.type	inv,_ASM_TYPE_OBJECT
inv:
	.byte	0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E
	.byte	0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04
END(inv)

	.type	inva,_ASM_TYPE_OBJECT
inva:
	.byte	0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01
	.byte	0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
END(inva)

	.type	mc_forward,_ASM_TYPE_OBJECT
mc_forward:
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04	/* 0 */
	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C

	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08	/* 1 */
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00

	.byte	0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C	/* 2 */
	.byte	0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04

.Lmc_forward_3:
	.byte	0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00	/* 3 */
	.byte	0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
END(mc_forward)

	.type	mc_backward,_ASM_TYPE_OBJECT
mc_backward:
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06	/* 0 */
	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E

	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02	/* 1 */
	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A

	.byte	0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E	/* 2 */
	.byte	0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06

	.byte	0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A	/* 3 */
	.byte	0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
END(mc_backward)

	.type	sr,_ASM_TYPE_OBJECT
sr:
	.byte	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07	/* 0 */
	.byte	0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F

	.byte	0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03	/* 1 */
	.byte	0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B

	.byte	0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F	/* 2 */
	.byte	0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07

	.byte	0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B	/* 3 */
	.byte	0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03
END(sr)

	.type	iptlo,_ASM_TYPE_OBJECT
iptlo:
	.byte	0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2
	.byte	0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA
END(iptlo)

	.type	ipthi,_ASM_TYPE_OBJECT
ipthi:
	.byte	0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C
	.byte	0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD
END(ipthi)

	.type	sb1_0,_ASM_TYPE_OBJECT
sb1_0:
	.byte	0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1
	.byte	0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5
END(sb1_0)

	.type	sb1_1,_ASM_TYPE_OBJECT
sb1_1:
	.byte	0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36
	.byte	0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B
END(sb1_1)

	.type	sb2_0,_ASM_TYPE_OBJECT
sb2_0:
	.byte	0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2
	.byte	0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E
END(sb2_0)

	.type	sb2_1,_ASM_TYPE_OBJECT
sb2_1:
	.byte	0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69
	.byte	0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2
END(sb2_1)

	.type	sbo_0,_ASM_TYPE_OBJECT
sbo_0:
	.byte	0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0
	.byte	0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15
END(sbo_0)

	.type	sbo_1,_ASM_TYPE_OBJECT
sbo_1:
	.byte	0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF
	.byte	0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E
END(sbo_1)

	.type	diptlo,_ASM_TYPE_OBJECT
diptlo:
	.byte	0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F
	.byte	0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15
END(diptlo)

	.type	dipthi,_ASM_TYPE_OBJECT
dipthi:
	.byte	0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86
	.byte	0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12
END(dipthi)

	.type	dsb9_0,_ASM_TYPE_OBJECT
dsb9_0:
	.byte	0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85
	.byte	0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA
END(dsb9_0)

	.type	dsb9_1,_ASM_TYPE_OBJECT
dsb9_1:
	.byte	0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0
	.byte	0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72
END(dsb9_1)

	.type	dsbd_0,_ASM_TYPE_OBJECT
dsbd_0:
	.byte	0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D
	.byte	0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5
END(dsbd_0)

	.type	dsbd_1,_ASM_TYPE_OBJECT
dsbd_1:
	.byte	0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C
	.byte	0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29
END(dsbd_1)

	.type	dsbb_0,_ASM_TYPE_OBJECT
dsbb_0:
	.byte	0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0
	.byte	0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60
END(dsbb_0)

	.type	dsbb_1,_ASM_TYPE_OBJECT
dsbb_1:
	.byte	0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1
	.byte	0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3
END(dsbb_1)

	.type	dsbe_0,_ASM_TYPE_OBJECT
dsbe_0:
	.byte	0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46
	.byte	0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22
END(dsbe_0)

	.type	dsbe_1,_ASM_TYPE_OBJECT
dsbe_1:
	.byte	0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C
	.byte	0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94
END(dsbe_1)

	.type	dsbo_0,_ASM_TYPE_OBJECT
dsbo_0:
	.byte	0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13
	.byte	0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7
END(dsbo_0)

	.type	dsbo_1,_ASM_TYPE_OBJECT
dsbo_1:
	.byte	0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12
	.byte	0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA
END(dsbo_1)

/*
 * aes_neon_enc1(enc, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 * uint8x16_t@q0
 * aes_neon_enc1(const struct aesenc *enc@r0, uint8x16_t x@q0,
 *     unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 * uint8x16_t@(r0,r1,r2,r3)
 * aes_neon_enc1(const struct aesenc *enc@r0,
 *     uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
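/*
 * For illustration, a minimal hard-float C caller matching the
 * convention above (a sketch only: encrypt_block is hypothetical,
 * and struct aesenc is taken from the prototype; nrounds is 10 for
 * AES-128, 12 for AES-192, 14 for AES-256):
 *
 *	#include <arm_neon.h>
 *
 *	struct aesenc;
 *	uint8x16_t aes_neon_enc1(const struct aesenc *, uint8x16_t,
 *	    unsigned);
 *
 *	uint8x16_t
 *	encrypt_block(const struct aesenc *enc, const uint8_t in[16])
 *	{
 *		return aes_neon_enc1(enc, vld1q_u8(in), 10);
 *	}
 */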
ENTRY(aes_neon_enc1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r7, r8, r10, r11, lr}
	vpush	{d8-d15}

	/*
	 * r3: rmod4
	 * r4: mc_forward
	 * r5: mc_backward
	 * r6,r7,r8,r10,r11,r12: temporaries
	 * q0={d0-d1}: x/ak/A
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: iptlo
	 * q5={d10-d11}: ipthi
	 * q6={d12-d13}: sb1[0]/sbo[0]
	 * q7={d14-d15}: sb1[1]/sbo[1]
	 * q8={d16-d17}: sb2[0]
	 * q9={d18-d19}: sb2[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_forward[rmod4]
	 * q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_backward[rmod4]
	 * q14={d28-d29}: rk/A2/A2_B_D
	 * q15={d30-d31}: A2_B/sr[rmod4]
	 */

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr	r12, .Lconstants_addr
	adr	r11, .Lconstants_addr

	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
	movw	r3, #0
	vmov.i8	q1, #0x0f

	/* r12 := .Lconstants */
	add	r12, r12, r11
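	/*
	 * The literal at .Lconstants_addr holds the link-time offset
	 * .Lconstants - .Lconstants_addr; adding the run-time address
	 * of the literal (obtained with adr) yields the run-time
	 * address of .Lconstants, with no dynamic relocation needed.
	 */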
	/* (q4, q5) := (iptlo, ipthi) */
	add	r6, r12, #(iptlo - .Lconstants)
	add	r7, r12, #(ipthi - .Lconstants)
	vld1.8	{d8-d9}, [r6 :128]
	vld1.8	{d10-d11}, [r7 :128]

	/* load the rest of the constants */
	add	r4, r12, #(sb1_0 - .Lconstants)
	add	r5, r12, #(sb1_1 - .Lconstants)
	add	r6, r12, #(sb2_0 - .Lconstants)
	add	r7, r12, #(sb2_1 - .Lconstants)
	add	r8, r12, #(inv - .Lconstants)
	add	r10, r12, #(inva - .Lconstants)
	vld1.8	{d12-d13}, [r4 :128]	/* q6 = sb1[0] */
	vld1.8	{d14-d15}, [r5 :128]	/* q7 = sb1[1] */
	vld1.8	{d16-d17}, [r6 :128]	/* q8 = sb2[0] */
	vld1.8	{d18-d19}, [r7 :128]	/* q9 = sb2[1] */
	vld1.8	{d20-d21}, [r8 :128]	/* q10 = inv */
	vld1.8	{d22-d23}, [r10 :128]	/* q11 = inva */

	/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
	add	r4, r12, #(mc_forward - .Lconstants)
	add	r5, r12, #(mc_backward - .Lconstants)

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (iptlo(lo), ipthi(hi)) */
	vtbl.8	d4, {d8-d9}, d4
	vtbl.8	d5, {d8-d9}, d5
	vtbl.8	d6, {d10-d11}, d6
	vtbl.8	d7, {d10-d11}, d7

	/* q0 := rk[0] + iptlo(lo) + ipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
	vtbl.8	d24, {d12-d13}, d4
	vtbl.8	d25, {d12-d13}, d5
	vtbl.8	d26, {d14-d15}, d6
	vtbl.8	d27, {d14-d15}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := A2 = sb2_0(io) + sb2_1(jo) */
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q14, q12, q13

	/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
	add	r6, r4, r3, lsl #4
	add	r7, r5, r3, lsl #4
	vld1.8	{d24-d25}, [r6]
	vld1.8	{d26-d27}, [r7]

	/* q15 := A2_B = A2 + A(mcf) */
	vtbl.8	d30, {d0-d1}, d24
	vtbl.8	d31, {d0-d1}, d25
	veor	q15, q15, q14

	/* q14 := A2_B_D = A2_B + A(mcb) */
	vtbl.8	d28, {d0-d1}, d26
	vtbl.8	d29, {d0-d1}, d27
	veor	q14, q14, q15

	/* q0 := x = A2_B_D + A2_B(mcf) */
	vtbl.8	d0, {d30-d31}, d24
	vtbl.8	d1, {d30-d31}, d25
	veor	q0, q0, q14

2:	/*
	 * SubBytes
	 */
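	/*
	 * Nibble algebra used below (names as in the step comments):
	 * with k = x & 0xf, i = x >> 4, and j = i + k,
	 *
	 *	io = j + 1/(1/i + a/k)
	 *	jo = i + 1/(1/j + a/k)
	 *
	 * where 1/t and a/t are the inv and inva table lookups and +
	 * is XOR; io and jo then index the split S-boxes (sb1/sb2 in
	 * the loop, sbo at the end).
	 */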
	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {d22-d23}, d4
	vtbl.8	d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {d20-d21}, d6
	vtbl.8	d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {d20-d21}, d4
	vtbl.8	d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {d20-d21}, d24
	vtbl.8	d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {d20-d21}, d26
	vtbl.8	d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	add	r3, r3, #1
	subs	r1, r1, #1
	and	r3, r3, #3
	bne	1b

	/* (q6, q7, q15) := (sbo[0], sbo[1], sr[rmod4]) */
	add	r8, r12, #(sr - .Lconstants)
	add	r6, r12, #(sbo_0 - .Lconstants)
	add	r7, r12, #(sbo_1 - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{d12-d13}, [r6 :128]
	vld1.8	{d14-d15}, [r7 :128]
	vld1.8	{d30-d31}, [r8 :128]

	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (sbo_0(io), sbo_1(jo)) */
	vtbl.8	d4, {d12-d13}, d4
	vtbl.8	d5, {d12-d13}, d5
	vtbl.8	d6, {d14-d15}, d6
	vtbl.8	d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + sbo_0(io) + sbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[rmod4]) */
	vtbl.8	d0, {d4-d5}, d30
	vtbl.8	d1, {d4-d5}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_enc1)

/*
 * aes_neon_dec1(dec, x, nrounds)
 *
 *	With -mfloat-abi=hard:
 *
 * uint8x16_t@q0
 * aes_neon_dec1(const struct aesdec *dec@r0, uint8x16_t x@q0,
 *     unsigned nrounds@r1)
 *
 *	With -mfloat-abi=soft(fp) (i.e., __SOFTFP__):
 *
 * uint8x16_t@(r0,r1,r2,r3)
 * aes_neon_dec1(const struct aesdec *dec@r0,
 *     uint8x16_t x@(r2,r3,sp[0],sp[4]), nrounds@sp[8])
 */
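/*
 * As with aes_neon_enc1 above, a hypothetical hard-float caller
 * (sketch only; decrypt_block is illustrative, struct aesdec is
 * taken from the prototype):
 *
 *	uint8x16_t
 *	decrypt_block(const struct aesdec *dec, const uint8_t in[16])
 *	{
 *		return aes_neon_dec1(dec, vld1q_u8(in), 10);
 *	}
 */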
ENTRY(aes_neon_dec1)
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	d0, r3, r2		/* d0 := x lo */
#else
	vmov	d0, r2, r3		/* d0 := x lo */
#endif
	vldr	d1, [sp]		/* d1 := x hi */
	ldr	r1, [sp, #8]		/* r1 := nrounds */
#endif
	push	{r4, r5, r6, r7, r8, r10, r11, lr}
	vpush	{d8-d15}

	/*
	 * r3: 3 & ~(nrounds - 1)
	 * q0={d0-d1}: x/ak
	 * q1={d2-d3}: 0x0f0f...
	 * q2={d4-d5}: lo/k/j/io
	 * q3={d6-d7}: hi/i/jo
	 * q4={d8-d9}: diptlo/dsb9[0]
	 * q5={d10-d11}: dipthi/dsb9[1]
	 * q6={d12-d13}: dsbb[0]/dsbo[0]
	 * q7={d14-d15}: dsbb[1]/dsbo[1]
	 * q8={d16-d17}: dsbd[0]/dsbe[0]
	 * q9={d18-d19}: dsbd[1]/dsbe[1]
	 * q10={d20-d21}: inv
	 * q11={d22-d23}: inva
	 * q12={d24-d25}: ir/iak/iakr/dsbX_0(io)
	 * q13={d26-d27}: jr/jak/jakr/dsbX_1(jo)
	 * q14={d28-d29}: rk/xmc
	 * q15={d30-d31}: mc/sr[3 & ~(nrounds - 1)]
	 */

	/* r12 := .Lconstants - .Lconstants_addr, r11 := .Lconstants_addr */
	ldr	r12, .Lconstants_addr
	adr	r11, .Lconstants_addr

	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */
	rsb	r3, r1, #0		/* r3 := ~(nrounds - 1) = -nrounds */
	vmov.i8	q1, #0x0f
	and	r3, r3, #3		/* r3 := 3 & ~(nrounds - 1) */
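	/*
	 * E.g., AES-128 has nrounds = 10, so r3 = (-10) & 3 = 2 and
	 * the final output permutation below uses sr[2].
	 */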

	/* r12 := .Lconstants */
	add	r12, r12, r11

	/* (q4, q5) := (diptlo, dipthi) */
	add	r6, r12, #(diptlo - .Lconstants)
	add	r7, r12, #(dipthi - .Lconstants)
	vld1.8	{d8-d9}, [r6 :128]
	vld1.8	{d10-d11}, [r7 :128]

	/* load the rest of the constants */
	add	r4, r12, #(dsbb_0 - .Lconstants)
	add	r5, r12, #(dsbb_1 - .Lconstants)
	add	r6, r12, #(inv - .Lconstants)
	add	r7, r12, #(inva - .Lconstants)
	add	r8, r12, #(.Lmc_forward_3 - .Lconstants)
	vld1.8	{d12-d13}, [r4 :128]	/* q6 := dsbb[0] */
	vld1.8	{d14-d15}, [r5 :128]	/* q7 := dsbb[1] */
	vld1.8	{d20-d21}, [r6 :128]	/* q10 := inv */
	vld1.8	{d22-d23}, [r7 :128]	/* q11 := inva */
	vld1.8	{d30-d31}, [r8 :128]	/* q15 := mc_forward[3] */

	/* (q2, q3) := (lo, hi) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* (q2, q3) := (diptlo(lo), dipthi(hi)) */
	vtbl.8	d4, {d8-d9}, d4
	vtbl.8	d5, {d8-d9}, d5
	vtbl.8	d6, {d10-d11}, d6
	vtbl.8	d7, {d10-d11}, d7

	/* load dsb9 */
	add	r4, r12, #(dsb9_0 - .Lconstants)
	add	r5, r12, #(dsb9_1 - .Lconstants)
	vld1.8	{d8-d9}, [r4 :128]	/* q4 := dsb9[0] */
	vld1.8	{d10-d11}, [r5 :128]	/* q5 := dsb9[1] */

	/* q0 := rk[0] + diptlo(lo) + dipthi(hi) */
	veor	q0, q14, q2
	veor	q0, q0, q3

	b	2f

	_ALIGN_TEXT
1:	/* load dsbd */
	add	r4, r12, #(dsbd_0 - .Lconstants)
	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbd[0] */
	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbd[1] */

	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* q0 := rk[i] + dsb9_0(io) + dsb9_1(jo) */
	vtbl.8	d24, {d8-d9}, d4
	vtbl.8	d25, {d8-d9}, d5
	vtbl.8	d26, {d10-d11}, d6
	vtbl.8	d27, {d10-d11}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q14 := x(mc) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31

	/* q0 := x(mc) + dsbd_0(io) + dsbd_1(jo) */
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* load dsbe */
	add	r4, r12, #(dsbe_0 - .Lconstants)
	vld1.8	{d16-d17}, [r4 :128]!	/* q8 := dsbe[0] */
	vld1.8	{d18-d19}, [r4 :128]	/* q9 := dsbe[1] */

	/* q0 := x(mc) + dsbb_0(io) + dsbb_1(jo) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31
	vtbl.8	d24, {d12-d13}, d4
	vtbl.8	d25, {d12-d13}, d5
	vtbl.8	d26, {d14-d15}, d6
	vtbl.8	d27, {d14-d15}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q0 := x(mc) + dsbe_0(io) + dsbe_1(jo) */
	vtbl.8	d28, {d0-d1}, d30
	vtbl.8	d29, {d0-d1}, d31
	vtbl.8	d24, {d16-d17}, d4
	vtbl.8	d25, {d16-d17}, d5
	vtbl.8	d26, {d18-d19}, d6
	vtbl.8	d27, {d18-d19}, d7
	veor	q0, q14, q12
	veor	q0, q0, q13

	/* q15 := mc := mc <<< 12*8 */
	vext.8	q15, q15, q15, #12

2:	/*
	 * SubBytes
	 */

	/* (q2, q3) := (k, i) */
	vshr.u8	q3, q0, #4
	vand	q2, q0, q1		/* q2 := x & 0x0f0f... */
	vand	q3, q3, q1		/* q3 := (x >> 4) & 0x0f0f... */

	/* q0 := a/k */
	vtbl.8	d0, {d22-d23}, d4
	vtbl.8	d1, {d22-d23}, d5

	/* q2 := j = i + k */
	veor	q2, q3, q2

	/* q12 := ir = 1/i */
	vtbl.8	d24, {d20-d21}, d6
	vtbl.8	d25, {d20-d21}, d7

	/* q13 := jr = 1/j */
	vtbl.8	d26, {d20-d21}, d4
	vtbl.8	d27, {d20-d21}, d5

	/* q12 := iak = 1/i + a/k */
	veor	q12, q12, q0

	/* q13 := jak = 1/j + a/k */
	veor	q13, q13, q0

	/* q12 := iakr = 1/(1/i + a/k) */
	vtbl.8	d24, {d20-d21}, d24
	vtbl.8	d25, {d20-d21}, d25

	/* q13 := jakr = 1/(1/j + a/k) */
	vtbl.8	d26, {d20-d21}, d26
	vtbl.8	d27, {d20-d21}, d27

	/* q2 := io = j + 1/(1/i + a/k) */
	veor	q2, q2, q12

	/* q3 := jo = i + 1/(1/j + a/k) */
	veor	q3, q3, q13

	/* advance round */
	subs	r1, r1, #1
	bne	1b

	/* (q6, q7, q15) := (dsbo[0], dsbo[1], sr[3 & ~(nrounds - 1)]) */
	add	r8, r12, #(sr - .Lconstants)
	add	r6, r12, #(dsbo_0 - .Lconstants)
	add	r7, r12, #(dsbo_1 - .Lconstants)
	add	r8, r8, r3, lsl #4
	vld1.8	{d12-d13}, [r6 :128]
	vld1.8	{d14-d15}, [r7 :128]
	vld1.8	{d30-d31}, [r8 :128]

	vld1.8	{d28-d29}, [r0 :128]!	/* q14 = *rk++ */

	/* (q2, q3) := (dsbo_0(io), dsbo_1(jo)) */
	vtbl.8	d4, {d12-d13}, d4
	vtbl.8	d5, {d12-d13}, d5
	vtbl.8	d6, {d14-d15}, d6
	vtbl.8	d7, {d14-d15}, d7

	/* q2 := x = rk[nr] + dsbo_0(io) + dsbo_1(jo) */
	veor	q2, q2, q14
	veor	q2, q2, q3

	/* q0 := x(sr[3 & ~(nrounds - 1)]) */
	vtbl.8	d0, {d4-d5}, d30
	vtbl.8	d1, {d4-d5}, d31

	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r10, r11, lr}
#ifdef __SOFTFP__
#ifdef __ARM_BIG_ENDIAN
	vmov	r1, r0, d0
	vmov	r3, r2, d1
#else
	vmov	r0, r1, d0
	vmov	r2, r3, d1
#endif
#endif
	bx	lr
END(aes_neon_dec1)